xref: /illumos-gate/usr/src/uts/common/fs/zfs/arc.c (revision 67d74cc3e7c9d9461311136a0b2069813a3fd927)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23  * Copyright (c) 2018, Joyent, Inc.
24  * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25  * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26  * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
27  */
28 
29 /*
30  * DVA-based Adjustable Replacement Cache
31  *
32  * While much of the theory of operation used here is
33  * based on the self-tuning, low overhead replacement cache
34  * presented by Megiddo and Modha at FAST 2003, there are some
35  * significant differences:
36  *
37  * 1. The Megiddo and Modha model assumes any page is evictable.
38  * Pages in its cache cannot be "locked" into memory.  This makes
39  * the eviction algorithm simple: evict the last page in the list.
40  * This also makes the performance characteristics easy to reason
41  * about.  Our cache is not so simple.  At any given moment, some
42  * subset of the blocks in the cache are un-evictable because we
43  * have handed out a reference to them.  Blocks are only evictable
44  * when there are no external references active.  This makes
45  * eviction far more problematic:  we choose to evict the evictable
46  * blocks that are the "lowest" in the list.
47  *
48  * There are times when it is not possible to evict the requested
49  * space.  In these circumstances we are unable to adjust the cache
50  * size.  To prevent the cache growing unbounded at these times we
51  * implement a "cache throttle" that slows the flow of new data
52  * into the cache until we can make space available.
53  *
54  * 2. The Megiddo and Modha model assumes a fixed cache size.
55  * Pages are evicted when the cache is full and there is a cache
56  * miss.  Our model has a variable sized cache.  It grows with
57  * high use, but also tries to react to memory pressure from the
58  * operating system: decreasing its size when system memory is
59  * tight.
60  *
61  * 3. The Megiddo and Modha model assumes a fixed page size. All
62  * elements of the cache are therefore exactly the same size.  So
63  * when adjusting the cache size following a cache miss, it's simply
64  * a matter of choosing a single page to evict.  In our model, we
65  * have variable sized cache blocks (ranging from 512 bytes to
66  * 128K bytes).  We therefore choose a set of blocks to evict to make
67  * space for a cache miss that approximates as closely as possible
68  * the space used by the new block.
69  *
70  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71  * by N. Megiddo & D. Modha, FAST 2003
72  */
73 
74 /*
75  * The locking model:
76  *
77  * A new reference to a cache buffer can be obtained in two
78  * ways: 1) via a hash table lookup using the DVA as a key,
79  * or 2) via one of the ARC lists.  The arc_read() interface
80  * uses method 1, while the internal ARC algorithms for
81  * adjusting the cache use method 2.  We therefore provide two
82  * types of locks: 1) the hash table lock array, and 2) the
83  * ARC list locks.
84  *
85  * Buffers do not have their own mutexes, rather they rely on the
86  * hash table mutexes for the bulk of their protection (i.e. most
87  * fields in the arc_buf_hdr_t are protected by these mutexes).
88  *
89  * buf_hash_find() returns the appropriate mutex (held) when it
90  * locates the requested buffer in the hash table.  It returns
91  * NULL for the mutex if the buffer was not in the table.
92  *
93  * buf_hash_remove() expects the appropriate hash mutex to be
94  * already held before it is invoked.
95  *
96  * Each ARC state also has a mutex which is used to protect the
97  * buffer list associated with the state.  When attempting to
98  * obtain a hash table lock while holding an ARC list lock you
99  * must use: mutex_tryenter() to avoid deadlock.  Also note that
100  * the active state mutex must be held before the ghost state mutex.
101  *
102  * Note that the majority of the performance stats are manipulated
103  * with atomic operations.
104  *
105  * The L2ARC uses the l2ad_mtx on each vdev for the following:
106  *
107  *	- L2ARC buflist creation
108  *	- L2ARC buflist eviction
109  *	- L2ARC write completion, which walks L2ARC buflists
110  *	- ARC header destruction, as it removes from L2ARC buflists
111  *	- ARC header release, as it removes from L2ARC buflists
112  */
113 
114 /*
115  * ARC operation:
116  *
117  * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
118  * This structure can point either to a block that is still in the cache or to
119  * one that is only accessible in an L2 ARC device, or it can provide
120  * information about a block that was recently evicted. If a block is
121  * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
122  * information to retrieve it from the L2ARC device. This information is
123  * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
124  * that is in this state cannot access the data directly.
125  *
126  * Blocks that are actively being referenced or have not been evicted
127  * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
128  * the arc_buf_hdr_t that will point to the data block in memory. A block can
129  * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
130  * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
131  * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
132  *
133  * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
134  * ability to store the physical data (b_pabd) associated with the DVA of the
135  * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
136  * it will match its on-disk compression characteristics. This behavior can be
137  * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
138  * compressed ARC functionality is disabled, the b_pabd will point to an
139  * uncompressed version of the on-disk data.
140  *
141  * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
142  * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
143  * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
144  * consumer. The ARC will provide references to this data and will keep it
145  * cached until it is no longer in use. The ARC caches only the L1ARC's physical
146  * data block and will evict any arc_buf_t that is no longer referenced. The
147  * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
148  * "overhead_size" kstat.
149  *
150  * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
151  * compressed form. The typical case is that consumers will want uncompressed
152  * data, and when that happens a new data buffer is allocated where the data is
153  * decompressed for them to use. Currently the only consumer who wants
154  * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
155  * exists on disk. When this happens, the arc_buf_t's data buffer is shared
156  * with the arc_buf_hdr_t.
157  *
158  * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
159  * first one is owned by a compressed send consumer (and therefore references
160  * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
161  * used by any other consumer (and has its own uncompressed copy of the data
162  * buffer).
163  *
164  *   arc_buf_hdr_t
165  *   +-----------+
166  *   | fields    |
167  *   | common to |
168  *   | L1- and   |
169  *   | L2ARC     |
170  *   +-----------+
171  *   | l2arc_buf_hdr_t
172  *   |           |
173  *   +-----------+
174  *   | l1arc_buf_hdr_t
175  *   |           |              arc_buf_t
176  *   | b_buf     +------------>+-----------+      arc_buf_t
177  *   | b_pabd    +-+           |b_next     +---->+-----------+
178  *   +-----------+ |           |-----------|     |b_next     +-->NULL
179  *                 |           |b_comp = T |     +-----------+
180  *                 |           |b_data     +-+   |b_comp = F |
181  *                 |           +-----------+ |   |b_data     +-+
182  *                 +->+------+               |   +-----------+ |
183  *        compressed  |      |               |                 |
184  *           data     |      |<--------------+                 | uncompressed
185  *                    +------+          compressed,            |     data
186  *                                        shared               +-->+------+
187  *                                         data                    |      |
188  *                                                                 |      |
189  *                                                                 +------+
190  *
191  * When a consumer reads a block, the ARC must first look to see if the
192  * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
193  * arc_buf_t and either copies uncompressed data into a new data buffer from an
194  * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
195  * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
196  * hdr is compressed and the desired compression characteristics of the
197  * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
198  * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
199  * the last buffer in the hdr's b_buf list, however a shared compressed buf can
200  * be anywhere in the hdr's list.
201  *
202  * The diagram below shows an example of an uncompressed ARC hdr that is
203  * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
204  * the last element in the buf list):
205  *
206  *                arc_buf_hdr_t
207  *                +-----------+
208  *                |           |
209  *                |           |
210  *                |           |
211  *                +-----------+
212  * l2arc_buf_hdr_t|           |
213  *                |           |
214  *                +-----------+
215  * l1arc_buf_hdr_t|           |
216  *                |           |                 arc_buf_t    (shared)
217  *                |    b_buf  +------------>+---------+      arc_buf_t
218  *                |           |             |b_next   +---->+---------+
219  *                |  b_pabd   +-+           |---------|     |b_next   +-->NULL
220  *                +-----------+ |           |         |     +---------+
221  *                              |           |b_data   +-+   |         |
222  *                              |           +---------+ |   |b_data   +-+
223  *                              +->+------+             |   +---------+ |
224  *                                 |      |             |               |
225  *                   uncompressed  |      |             |               |
226  *                        data     +------+             |               |
227  *                                    ^                 +->+------+     |
228  *                                    |       uncompressed |      |     |
229  *                                    |           data     |      |     |
230  *                                    |                    +------+     |
231  *                                    +---------------------------------+
232  *
233  * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
234  * since the physical block is about to be rewritten. The new data contents
235  * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
236  * it may compress the data before writing it to disk. The ARC will be called
237  * with the transformed data and will bcopy the transformed on-disk block into
238  * a newly allocated b_pabd. Writes are always done into buffers which have
239  * either been loaned (and hence are new and don't have other readers) or
240  * buffers which have been released (and hence have their own hdr, if there
241  * were originally other readers of the buf's original hdr). This ensures that
242  * the ARC only needs to update a single buf and its hdr after a write occurs.
243  *
244  * When the L2ARC is in use, it will also take advantage of the b_pabd. The
245  * L2ARC will always write the contents of b_pabd to the L2ARC. This means
246  * that when compressed ARC is enabled that the L2ARC blocks are identical
247  * to the on-disk block in the main data pool. This provides a significant
248  * advantage since the ARC can leverage the bp's checksum when reading from the
249  * L2ARC to determine if the contents are valid. However, if the compressed
250  * ARC is disabled, then the L2ARC's block must be transformed to look
251  * like the physical block in the main data pool before comparing the
252  * checksum and determining its validity.
253  *
254  * The L1ARC has a slightly different system for storing encrypted data.
255  * Raw (encrypted + possibly compressed) data has a few subtle differences from
256  * data that is just compressed. The biggest difference is that it is not
257  * possible to decrypt encrypted data (or vice versa) if the keys aren't loaded.
258  * The other difference is that encryption cannot be treated as a suggestion.
259  * If a caller would prefer compressed data, but they actually wind up with
260  * uncompressed data the worst thing that could happen is there might be a
261  * performance hit. If the caller requests encrypted data, however, we must be
262  * sure they actually get it or else secret information could be leaked. Raw
263  * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
264  * may have both an encrypted version and a decrypted version of its data at
265  * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
266  * copied out of this header. To avoid complications with b_pabd, raw buffers
267  * cannot be shared.
268  */
269 
270 #include <sys/spa.h>
271 #include <sys/zio.h>
272 #include <sys/spa_impl.h>
273 #include <sys/zio_compress.h>
274 #include <sys/zio_checksum.h>
275 #include <sys/zfs_context.h>
276 #include <sys/arc.h>
277 #include <sys/refcount.h>
278 #include <sys/vdev.h>
279 #include <sys/vdev_impl.h>
280 #include <sys/dsl_pool.h>
281 #include <sys/zio_checksum.h>
282 #include <sys/multilist.h>
283 #include <sys/abd.h>
284 #include <sys/zil.h>
285 #include <sys/fm/fs/zfs.h>
286 #ifdef _KERNEL
287 #include <sys/vmsystm.h>
288 #include <vm/anon.h>
289 #include <sys/fs/swapnode.h>
290 #include <sys/dnlc.h>
291 #endif
292 #include <sys/callb.h>
293 #include <sys/kstat.h>
294 #include <sys/zthr.h>
295 #include <zfs_fletcher.h>
296 #include <sys/aggsum.h>
297 #include <sys/cityhash.h>
298 
299 #ifndef _KERNEL
300 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
301 boolean_t arc_watch = B_FALSE;
302 int arc_procfd;
303 #endif
304 
305 /*
306  * This thread's job is to keep enough free memory in the system, by
307  * calling arc_kmem_reap_now() plus arc_shrink(), which improves
308  * arc_available_memory().
309  */
310 static zthr_t		*arc_reap_zthr;
311 
312 /*
313  * This thread's job is to keep arc_size under arc_c, by calling
314  * arc_adjust(), which improves arc_is_overflowing().
315  */
316 static zthr_t		*arc_adjust_zthr;
317 
318 static kmutex_t		arc_adjust_lock;
319 static kcondvar_t	arc_adjust_waiters_cv;
320 static boolean_t	arc_adjust_needed = B_FALSE;
321 
322 uint_t arc_reduce_dnlc_percent = 3;
323 
324 /*
325  * The number of headers to evict in arc_evict_state_impl() before
326  * dropping the sublist lock and evicting from another sublist. A lower
327  * value means we're more likely to evict the "correct" header (i.e. the
328  * oldest header in the arc state), but comes with higher overhead
329  * (i.e. more invocations of arc_evict_state_impl()).
330  */
331 int zfs_arc_evict_batch_limit = 10;
332 
333 /* number of seconds before growing cache again */
334 int arc_grow_retry = 60;
335 
336 /*
337  * Minimum time between calls to arc_kmem_reap_soon().  Note that this will
338  * be converted to ticks, so with the default hz=100, a setting of 15 ms
339  * will actually wait 2 ticks, or 20ms.
340  */
341 int arc_kmem_cache_reap_retry_ms = 1000;
342 
343 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */
344 int zfs_arc_overflow_shift = 8;
345 
346 /* shift of arc_c for calculating both min and max arc_p */
347 int arc_p_min_shift = 4;
348 
349 /* log2(fraction of arc to reclaim) */
350 int arc_shrink_shift = 7;
351 
352 /*
353  * log2(fraction of ARC which must be free to allow growing).
354  * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
355  * when reading a new block into the ARC, we will evict an equal-sized block
356  * from the ARC.
357  *
358  * This must be less than arc_shrink_shift, so that when we shrink the ARC,
359  * we will still not allow it to grow.
360  */
361 int			arc_no_grow_shift = 5;
362 
363 
364 /*
365  * minimum lifespan of a prefetch block in clock ticks (initialized in
366  * arc_init()).  NOTE(review): the "_ms" names suggest milliseconds.
367  */
368 static int		zfs_arc_min_prefetch_ms = 1;
369 static int		zfs_arc_min_prescient_prefetch_ms = 6;
370 
371 /*
372  * If this percent of memory is free, don't throttle.
373  */
374 int arc_lotsfree_percent = 10;
375 
376 static boolean_t arc_initialized;
377 
378 /*
379  * The arc has filled available memory and has now warmed up.
380  */
381 static boolean_t arc_warm;
382 
383 /*
384  * log2 fraction of the zio arena to keep free.
385  */
386 int arc_zio_arena_free_shift = 2;
387 
388 /*
389  * These tunables are for performance analysis.
390  */
391 uint64_t zfs_arc_max;
392 uint64_t zfs_arc_min;
393 uint64_t zfs_arc_meta_limit = 0;
394 uint64_t zfs_arc_meta_min = 0;
395 int zfs_arc_grow_retry = 0;
396 int zfs_arc_shrink_shift = 0;
397 int zfs_arc_p_min_shift = 0;
398 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
399 
400 /*
401  * ARC dirty data constraints for arc_tempreserve_space() throttle
402  */
403 uint_t zfs_arc_dirty_limit_percent = 50;	/* total dirty data limit */
404 uint_t zfs_arc_anon_limit_percent = 25;		/* anon block dirty limit */
405 uint_t zfs_arc_pool_dirty_percent = 20;		/* each pool's anon allowance */
406 
407 boolean_t zfs_compressed_arc_enabled = B_TRUE; /* B_FALSE: uncompressed b_pabd */
408 
409 /*
410  * Note that buffers can be in one of 6 states:
411  *	ARC_anon	- anonymous (discussed below)
412  *	ARC_mru		- recently used, currently cached
413  *	ARC_mru_ghost	- recently used, no longer in cache
414  *	ARC_mfu		- frequently used, currently cached
415  *	ARC_mfu_ghost	- frequently used, no longer in cache
416  *	ARC_l2c_only	- exists in L2ARC but not other states
417  * When there are no active references to the buffer, they are
418  * linked onto a list in one of these arc states.  These are
419  * the only buffers that can be evicted or deleted.  Within each
420  * state there are multiple lists, one for meta-data and one for
421  * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
422  * etc.) is tracked separately so that it can be managed more
423  * explicitly: favored over data, limited explicitly.
424  *
425  * Anonymous buffers are buffers that are not associated with
426  * a DVA.  These are buffers that hold dirty block copies
427  * before they are written to stable storage.  By definition,
428  * they are "ref'd" and are considered part of arc_mru
429  * that cannot be freed.  Generally, they will acquire a DVA
430  * as they are written and migrate onto the arc_mru list.
431  *
432  * The ARC_l2c_only state is for buffers that are in the second
433  * level ARC but no longer in any of the ARC_m* lists.  The second
434  * level ARC itself may also contain buffers that are in any of
435  * the ARC_m* states - meaning that a buffer can exist in two
436  * places.  The reason for the ARC_l2c_only state is to keep the
437  * buffer header in the hash table, so that reads that hit the
438  * second level ARC benefit from these fast lookups.
439  */
440 
441 typedef struct arc_state {	/* bookkeeping for one ARC state */
442 	/*
443 	 * lists of evictable buffers in this state, one per buffer type
444 	 */
445 	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
446 	/*
447 	 * total amount of evictable data in this state, per buffer type
448 	 */
449 	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
450 	/*
451 	 * total amount of data in this state; this includes: evictable,
452 	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
453 	 */
454 	zfs_refcount_t arcs_size;
455 } arc_state_t;
456 
457 /* The 6 states: */
458 static arc_state_t ARC_anon;		/* anonymous, no DVA */
459 static arc_state_t ARC_mru;		/* recently used, cached */
460 static arc_state_t ARC_mru_ghost;	/* recently used, no longer cached */
461 static arc_state_t ARC_mfu;		/* frequently used, cached */
462 static arc_state_t ARC_mfu_ghost;	/* frequently used, no longer cached */
463 static arc_state_t ARC_l2c_only;	/* in L2ARC but no other state */
464 
465 typedef struct arc_stats {	/* ARC statistics, one kstat_named_t each */
466 	kstat_named_t arcstat_hits;
467 	kstat_named_t arcstat_misses;
468 	kstat_named_t arcstat_demand_data_hits;
469 	kstat_named_t arcstat_demand_data_misses;
470 	kstat_named_t arcstat_demand_metadata_hits;
471 	kstat_named_t arcstat_demand_metadata_misses;
472 	kstat_named_t arcstat_prefetch_data_hits;
473 	kstat_named_t arcstat_prefetch_data_misses;
474 	kstat_named_t arcstat_prefetch_metadata_hits;
475 	kstat_named_t arcstat_prefetch_metadata_misses;
476 	kstat_named_t arcstat_mru_hits;
477 	kstat_named_t arcstat_mru_ghost_hits;
478 	kstat_named_t arcstat_mfu_hits;
479 	kstat_named_t arcstat_mfu_ghost_hits;
480 	kstat_named_t arcstat_deleted;
481 	/*
482 	 * Number of buffers that could not be evicted because the hash lock
483 	 * was held by another thread.  The lock may not necessarily be held
484 	 * by something using the same buffer, since hash locks are shared
485 	 * by multiple buffers.
486 	 */
487 	kstat_named_t arcstat_mutex_miss;
488 	/*
489 	 * Number of buffers skipped when updating the access state due to the
490 	 * header having already been released after acquiring the hash lock.
491 	 */
492 	kstat_named_t arcstat_access_skip;
493 	/*
494 	 * Number of buffers skipped because they have I/O in progress, are
495 	 * indirect prefetch buffers that have not lived long enough, or are
496 	 * not from the spa we're trying to evict from.
497 	 */
498 	kstat_named_t arcstat_evict_skip;
499 	/*
500 	 * Number of times arc_evict_state() was unable to evict enough
501 	 * buffers to reach its target amount.
502 	 */
503 	kstat_named_t arcstat_evict_not_enough;
504 	kstat_named_t arcstat_evict_l2_cached;
505 	kstat_named_t arcstat_evict_l2_eligible;
506 	kstat_named_t arcstat_evict_l2_ineligible;
507 	kstat_named_t arcstat_evict_l2_skip;
508 	kstat_named_t arcstat_hash_elements;
509 	kstat_named_t arcstat_hash_elements_max;
510 	kstat_named_t arcstat_hash_collisions;
511 	kstat_named_t arcstat_hash_chains;
512 	kstat_named_t arcstat_hash_chain_max;
513 	kstat_named_t arcstat_p;
514 	kstat_named_t arcstat_c;
515 	kstat_named_t arcstat_c_min;
516 	kstat_named_t arcstat_c_max;
517 	/* Not updated directly; only synced in arc_kstat_update. */
518 	kstat_named_t arcstat_size;
519 	/*
520 	 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd.
521 	 * Note that the compressed bytes may match the uncompressed bytes
522 	 * if the block is either not compressed or compressed arc is disabled.
523 	 */
524 	kstat_named_t arcstat_compressed_size;
525 	/*
526 	 * Uncompressed size of the data stored in b_pabd. If compressed
527 	 * arc is disabled then this value will be identical to the stat
528 	 * above.
529 	 */
530 	kstat_named_t arcstat_uncompressed_size;
531 	/*
532 	 * Number of bytes stored in all the arc_buf_t's. This is classified
533 	 * as "overhead" since this data is typically short-lived and will
534 	 * be evicted from the arc when it becomes unreferenced unless the
535 	 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level
536 	 * values have been set (see comment in dbuf.c for more information).
537 	 */
538 	kstat_named_t arcstat_overhead_size;
539 	/*
540 	 * Number of bytes consumed by internal ARC structures necessary
541 	 * for tracking purposes; these structures are not actually
542 	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
543 	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
544 	 * caches), and arc_buf_t structures (allocated via arc_buf_t
545 	 * cache).
546 	 * Not updated directly; only synced in arc_kstat_update.
547 	 */
548 	kstat_named_t arcstat_hdr_size;
549 	/*
550 	 * Number of bytes consumed by ARC buffers of type equal to
551 	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
552 	 * on disk user data (e.g. plain file contents).
553 	 * Not updated directly; only synced in arc_kstat_update.
554 	 */
555 	kstat_named_t arcstat_data_size;
556 	/*
557 	 * Number of bytes consumed by ARC buffers of type equal to
558 	 * ARC_BUFC_METADATA. This is generally consumed by buffers
559 	 * backing on disk data that is used for internal ZFS
560 	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
561 	 * Not updated directly; only synced in arc_kstat_update.
562 	 */
563 	kstat_named_t arcstat_metadata_size;
564 	/*
565 	 * Number of bytes consumed by various buffers and structures
566 	 * not actually backed with ARC buffers. This includes bonus
567 	 * buffers (allocated directly via zio_buf_* functions),
568 	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
569 	 * cache), and dnode_t structures (allocated via dnode_t cache).
570 	 * Not updated directly; only synced in arc_kstat_update.
571 	 */
572 	kstat_named_t arcstat_other_size;
573 	/*
574 	 * Total number of bytes consumed by ARC buffers residing in the
575 	 * arc_anon state. This includes *all* buffers in the arc_anon
576 	 * state; e.g. data, metadata, evictable, and unevictable buffers
577 	 * are all included in this value.
578 	 * Not updated directly; only synced in arc_kstat_update.
579 	 */
580 	kstat_named_t arcstat_anon_size;
581 	/*
582 	 * Number of bytes consumed by ARC buffers that meet the
583 	 * following criteria: backing buffers of type ARC_BUFC_DATA,
584 	 * residing in the arc_anon state, and are eligible for eviction
585 	 * (e.g. have no outstanding holds on the buffer).
586 	 * Not updated directly; only synced in arc_kstat_update.
587 	 */
588 	kstat_named_t arcstat_anon_evictable_data;
589 	/*
590 	 * Number of bytes consumed by ARC buffers that meet the
591 	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
592 	 * residing in the arc_anon state, and are eligible for eviction
593 	 * (e.g. have no outstanding holds on the buffer).
594 	 * Not updated directly; only synced in arc_kstat_update.
595 	 */
596 	kstat_named_t arcstat_anon_evictable_metadata;
597 	/*
598 	 * Total number of bytes consumed by ARC buffers residing in the
599 	 * arc_mru state. This includes *all* buffers in the arc_mru
600 	 * state; e.g. data, metadata, evictable, and unevictable buffers
601 	 * are all included in this value.
602 	 * Not updated directly; only synced in arc_kstat_update.
603 	 */
604 	kstat_named_t arcstat_mru_size;
605 	/*
606 	 * Number of bytes consumed by ARC buffers that meet the
607 	 * following criteria: backing buffers of type ARC_BUFC_DATA,
608 	 * residing in the arc_mru state, and are eligible for eviction
609 	 * (e.g. have no outstanding holds on the buffer).
610 	 * Not updated directly; only synced in arc_kstat_update.
611 	 */
612 	kstat_named_t arcstat_mru_evictable_data;
613 	/*
614 	 * Number of bytes consumed by ARC buffers that meet the
615 	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
616 	 * residing in the arc_mru state, and are eligible for eviction
617 	 * (e.g. have no outstanding holds on the buffer).
618 	 * Not updated directly; only synced in arc_kstat_update.
619 	 */
620 	kstat_named_t arcstat_mru_evictable_metadata;
621 	/*
622 	 * Total number of bytes that *would have been* consumed by ARC
623 	 * buffers in the arc_mru_ghost state. The key thing to note
624 	 * here, is the fact that this size doesn't actually indicate
625 	 * RAM consumption. The ghost lists only consist of headers and
626 	 * don't actually have ARC buffers linked off of these headers.
627 	 * Thus, *if* the headers had associated ARC buffers, these
628 	 * buffers *would have* consumed this number of bytes.
629 	 * Not updated directly; only synced in arc_kstat_update.
630 	 */
631 	kstat_named_t arcstat_mru_ghost_size;
632 	/*
633 	 * Number of bytes that *would have been* consumed by ARC
634 	 * buffers that are eligible for eviction, of type
635 	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
636 	 * Not updated directly; only synced in arc_kstat_update.
637 	 */
638 	kstat_named_t arcstat_mru_ghost_evictable_data;
639 	/*
640 	 * Number of bytes that *would have been* consumed by ARC
641 	 * buffers that are eligible for eviction, of type
642 	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
643 	 * Not updated directly; only synced in arc_kstat_update.
644 	 */
645 	kstat_named_t arcstat_mru_ghost_evictable_metadata;
646 	/*
647 	 * Total number of bytes consumed by ARC buffers residing in the
648 	 * arc_mfu state. This includes *all* buffers in the arc_mfu
649 	 * state; e.g. data, metadata, evictable, and unevictable buffers
650 	 * are all included in this value.
651 	 * Not updated directly; only synced in arc_kstat_update.
652 	 */
653 	kstat_named_t arcstat_mfu_size;
654 	/*
655 	 * Number of bytes consumed by ARC buffers that are eligible for
656 	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
657 	 * state.
658 	 * Not updated directly; only synced in arc_kstat_update.
659 	 */
660 	kstat_named_t arcstat_mfu_evictable_data;
661 	/*
662 	 * Number of bytes consumed by ARC buffers that are eligible for
663 	 * eviction, of type ARC_BUFC_METADATA, and reside in the
664 	 * arc_mfu state.
665 	 * Not updated directly; only synced in arc_kstat_update.
666 	 */
667 	kstat_named_t arcstat_mfu_evictable_metadata;
668 	/*
669 	 * Total number of bytes that *would have been* consumed by ARC
670 	 * buffers in the arc_mfu_ghost state. See the comment above
671 	 * arcstat_mru_ghost_size for more details.
672 	 * Not updated directly; only synced in arc_kstat_update.
673 	 */
674 	kstat_named_t arcstat_mfu_ghost_size;
675 	/*
676 	 * Number of bytes that *would have been* consumed by ARC
677 	 * buffers that are eligible for eviction, of type
678 	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
679 	 * Not updated directly; only synced in arc_kstat_update.
680 	 */
681 	kstat_named_t arcstat_mfu_ghost_evictable_data;
682 	/*
683 	 * Number of bytes that *would have been* consumed by ARC
684 	 * buffers that are eligible for eviction, of type
685 	 * ARC_BUFC_METADATA, and linked off the arc_mfu_ghost state.
686 	 * Not updated directly; only synced in arc_kstat_update.
687 	 */
688 	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
689 	kstat_named_t arcstat_l2_hits;
690 	kstat_named_t arcstat_l2_misses;
691 	kstat_named_t arcstat_l2_feeds;
692 	kstat_named_t arcstat_l2_rw_clash;
693 	kstat_named_t arcstat_l2_read_bytes;
694 	kstat_named_t arcstat_l2_write_bytes;
695 	kstat_named_t arcstat_l2_writes_sent;
696 	kstat_named_t arcstat_l2_writes_done;
697 	kstat_named_t arcstat_l2_writes_error;
698 	kstat_named_t arcstat_l2_writes_lock_retry;
699 	kstat_named_t arcstat_l2_evict_lock_retry;
700 	kstat_named_t arcstat_l2_evict_reading;
701 	kstat_named_t arcstat_l2_evict_l1cached;
702 	kstat_named_t arcstat_l2_free_on_write;
703 	kstat_named_t arcstat_l2_abort_lowmem;
704 	kstat_named_t arcstat_l2_cksum_bad;
705 	kstat_named_t arcstat_l2_io_error;
706 	kstat_named_t arcstat_l2_lsize;
707 	kstat_named_t arcstat_l2_psize;
708 	/* Not updated directly; only synced in arc_kstat_update. */
709 	kstat_named_t arcstat_l2_hdr_size;
710 	kstat_named_t arcstat_memory_throttle_count;
711 	/* Not updated directly; only synced in arc_kstat_update. */
712 	kstat_named_t arcstat_meta_used;
713 	kstat_named_t arcstat_meta_limit;
714 	kstat_named_t arcstat_meta_max;
715 	kstat_named_t arcstat_meta_min;
716 	kstat_named_t arcstat_async_upgrade_sync;
717 	kstat_named_t arcstat_demand_hit_predictive_prefetch;
718 	kstat_named_t arcstat_demand_hit_prescient_prefetch;
719 } arc_stats_t;
720 
/*
 * Initial contents of the ARC statistics table.  Every entry is a
 * { name, KSTAT_DATA_UINT64 } pair.  The initializer is positional, so the
 * entries must stay in the same order as the members of arc_stats_t.
 * Note that the members arcstat_l2_lsize and arcstat_l2_psize are
 * initialized with the names "l2_size" and "l2_asize" respectively.
 */
721 static arc_stats_t arc_stats = {
722 	{ "hits",			KSTAT_DATA_UINT64 },
723 	{ "misses",			KSTAT_DATA_UINT64 },
724 	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
725 	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
726 	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
727 	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
728 	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
729 	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
730 	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
731 	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
732 	{ "mru_hits",			KSTAT_DATA_UINT64 },
733 	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
734 	{ "mfu_hits",			KSTAT_DATA_UINT64 },
735 	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
736 	{ "deleted",			KSTAT_DATA_UINT64 },
737 	{ "mutex_miss",			KSTAT_DATA_UINT64 },
738 	{ "access_skip",		KSTAT_DATA_UINT64 },
739 	{ "evict_skip",			KSTAT_DATA_UINT64 },
740 	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
741 	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
742 	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
743 	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
744 	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
745 	{ "hash_elements",		KSTAT_DATA_UINT64 },
746 	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
747 	{ "hash_collisions",		KSTAT_DATA_UINT64 },
748 	{ "hash_chains",		KSTAT_DATA_UINT64 },
749 	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
750 	{ "p",				KSTAT_DATA_UINT64 },
751 	{ "c",				KSTAT_DATA_UINT64 },
752 	{ "c_min",			KSTAT_DATA_UINT64 },
753 	{ "c_max",			KSTAT_DATA_UINT64 },
754 	{ "size",			KSTAT_DATA_UINT64 },
755 	{ "compressed_size",		KSTAT_DATA_UINT64 },
756 	{ "uncompressed_size",		KSTAT_DATA_UINT64 },
757 	{ "overhead_size",		KSTAT_DATA_UINT64 },
758 	{ "hdr_size",			KSTAT_DATA_UINT64 },
759 	{ "data_size",			KSTAT_DATA_UINT64 },
760 	{ "metadata_size",		KSTAT_DATA_UINT64 },
761 	{ "other_size",			KSTAT_DATA_UINT64 },
762 	{ "anon_size",			KSTAT_DATA_UINT64 },
763 	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
764 	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
765 	{ "mru_size",			KSTAT_DATA_UINT64 },
766 	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
767 	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
768 	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
769 	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
770 	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
771 	{ "mfu_size",			KSTAT_DATA_UINT64 },
772 	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
773 	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
774 	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
775 	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
776 	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
777 	{ "l2_hits",			KSTAT_DATA_UINT64 },
778 	{ "l2_misses",			KSTAT_DATA_UINT64 },
779 	{ "l2_feeds",			KSTAT_DATA_UINT64 },
780 	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
781 	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
782 	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
783 	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
784 	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
785 	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
786 	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
787 	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
788 	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
789 	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
790 	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
791 	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
792 	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
793 	{ "l2_io_error",		KSTAT_DATA_UINT64 },
794 	{ "l2_size",			KSTAT_DATA_UINT64 },
795 	{ "l2_asize",			KSTAT_DATA_UINT64 },
796 	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
797 	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
798 	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
799 	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
800 	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
801 	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
802 	{ "async_upgrade_sync",		KSTAT_DATA_UINT64 },
803 	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
804 	{ "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
805 };
806 
/*
 * Accessors for the values stored in arc_stats.
 *   ARCSTAT(stat)		the 64-bit value of the named member (an lvalue)
 *   ARCSTAT_INCR(stat, val)	add val to the member with atomic_add_64()
 *   ARCSTAT_BUMP(stat)		ARCSTAT_INCR by 1
 *   ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR by -1
 */
807 #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
808 
809 #define	ARCSTAT_INCR(stat, val) \
810 	atomic_add_64(&arc_stats.stat.value.ui64, (val))
811 
812 #define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
813 #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
814 
/*
 * ARCSTAT_MAX(stat, val): raise the named statistic to val when val is
 * larger than the value currently stored.  The compare-and-swap is retried
 * until either the stored value is no longer smaller than val or the swap
 * succeeds.
 *
 * val is evaluated more than once, so it must be free of side effects, and
 * it must not refer to a caller variable named "m" (the macro's scratch
 * local).  The body is wrapped in do { } while (0) so that the macro
 * behaves as a single statement, e.g. inside an unbraced if/else.
 */
#define	ARCSTAT_MAX(stat, val) do {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
} while (0)

/*
 * ARCSTAT_MAXSTAT(stat): fold the current value of "stat" into the
 * companion statistic "stat_max" using ARCSTAT_MAX.
 */
#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
824 
/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 *
 * Exactly one counter is bumped per invocation.  Its name is pasted
 * together as arcstat_<A>_<B>_<stat>, where <A> is stat1 when cond1 is
 * true and notstat1 otherwise, and <B> is stat2 when cond2 is true and
 * notstat2 otherwise.
 *
 * The body is wrapped in do { } while (0) so the macro expands to a single
 * statement and cannot capture or orphan an "else" at the call site.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
do {									\
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}								\
} while (0)
844 
/*
 * arc_ksp is the (non-static) kstat pointer; the remaining variables point
 * at the individual ARC states.  GHOST_STATE() treats arc_mru_ghost,
 * arc_mfu_ghost and arc_l2c_only as ghost states.
 */
845 kstat_t			*arc_ksp;
846 static arc_state_t	*arc_anon;
847 static arc_state_t	*arc_mru;
848 static arc_state_t	*arc_mru_ghost;
849 static arc_state_t	*arc_mfu;
850 static arc_state_t	*arc_mfu_ghost;
851 static arc_state_t	*arc_l2c_only;
852 
853 /*
854  * There are several ARC variables that are critical to export as kstats --
855  * but we don't want to have to grovel around in the kstat whenever we wish to
856  * manipulate them.  For these variables, we therefore define them to be in
857  * terms of the statistic variable.  This assures that we are not introducing
858  * the possibility of inconsistency by having shadow copies of the variables,
859  * while still allowing the code to be readable.
860  */
/*
 * Each name below expands to ARCSTAT(...), i.e. directly to the ui64 value
 * inside arc_stats, so ordinary reads and assignments through these names
 * operate on the kstat value itself.
 */
861 #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
862 #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
863 #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
864 #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
865 #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
866 #define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
/* NOTE(review): presumably a high-water mark, not a limit -- confirm */
867 #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
868 
869 /* compressed size of entire arc */
870 #define	arc_compressed_size	ARCSTAT(arcstat_compressed_size)
871 /* uncompressed size of entire arc */
872 #define	arc_uncompressed_size	ARCSTAT(arcstat_uncompressed_size)
873 /* number of bytes in the arc from arc_buf_t's */
874 #define	arc_overhead_size	ARCSTAT(arcstat_overhead_size)
875 
876 /*
877  * There are also some ARC variables that we want to export, but that are
878  * updated so often that having the canonical representation be the statistic
879  * variable causes a performance bottleneck. We want to use aggsum_t's for these
880  * instead, but still be able to export the kstat in the same way as before.
881  * The solution is to always use the aggsum version, except in the kstat update
882  * callback.
883  */
/*
 * aggsum-backed counters.  Their names mirror kstat entries ("size",
 * "arc_meta_used", "data_size", "metadata_size", "hdr_size", "other_size",
 * "l2_hdr_size"); e.g. arcstat_meta_used and arcstat_l2_hdr_size are
 * documented as only being synced in arc_kstat_update.
 */
884 aggsum_t arc_size;
885 aggsum_t arc_meta_used;
886 aggsum_t astat_data_size;
887 aggsum_t astat_metadata_size;
888 aggsum_t astat_hdr_size;
889 aggsum_t astat_other_size;
890 aggsum_t astat_l2_hdr_size;
891 
/*
 * File-local ARC bookkeeping.
 * NOTE(review): arc_growtime, arc_tempreserve and arc_loaned_bytes are not
 * described in this part of the file; see their users for exact semantics.
 */
892 static int		arc_no_grow;	/* Don't try to grow cache size */
893 static hrtime_t		arc_growtime;
894 static uint64_t		arc_tempreserve;
895 static uint64_t		arc_loaned_bytes;
896 
/*
 * Callback record attached to a header through l1arc_buf_hdr_t's b_acb
 * pointer.  Records can be linked to one another through acb_next.
 * NOTE(review): field meanings below are inferred from names and types;
 * confirm against arc_read() and its completion path.
 */
897 typedef struct arc_callback arc_callback_t;
898 
899 struct arc_callback {
900 	void			*acb_private;	/* opaque caller argument */
901 	arc_read_done_func_t	*acb_done;	/* read completion callback */
902 	arc_buf_t		*acb_buf;
903 	boolean_t		acb_encrypted;
904 	boolean_t		acb_compressed;
905 	boolean_t		acb_noauth;
906 	zbookmark_phys_t	acb_zb;
907 	zio_t			*acb_zio_dummy;
908 	zio_t			*acb_zio_head;
909 	arc_callback_t		*acb_next;	/* next record in the list */
910 };
911 
/*
 * State carried for an ARC write: an opaque private pointer, four
 * arc_write_done_func_t callbacks (ready, children_ready, physdone, done)
 * and the buffer being written.
 * NOTE(review): when each callback fires is not visible here -- presumably
 * at the like-named zio stages; confirm in arc_write().
 */
912 typedef struct arc_write_callback arc_write_callback_t;
913 
914 struct arc_write_callback {
915 	void			*awcb_private;
916 	arc_write_done_func_t	*awcb_ready;
917 	arc_write_done_func_t	*awcb_children_ready;
918 	arc_write_done_func_t	*awcb_physdone;
919 	arc_write_done_func_t	*awcb_done;
920 	arc_buf_t		*awcb_buf;
921 };
922 
923 /*
924  * ARC buffers are separated into multiple structs as a memory saving measure:
925  *   - Common fields struct, always defined, and embedded within it:
926  *       - L2-only fields, always allocated but undefined when not in L2ARC
927  *       - L1-only fields, only allocated when in L1ARC
928  *
929  *           Buffer in L1                     Buffer only in L2
930  *    +------------------------+          +------------------------+
931  *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
932  *    |                        |          |                        |
933  *    |                        |          |                        |
934  *    |                        |          |                        |
935  *    +------------------------+          +------------------------+
936  *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
937  *    | (undefined if L1-only) |          |                        |
938  *    +------------------------+          +------------------------+
939  *    | l1arc_buf_hdr_t        |
940  *    |                        |
941  *    |                        |
942  *    |                        |
943  *    |                        |
944  *    +------------------------+
945  *
946  * Because it's possible for the L2ARC to become extremely large, we can wind
947  * up eating a lot of memory in L2ARC buffer headers, so the size of a header
948  * is minimized by only allocating the fields necessary for an L1-cached buffer
949  * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
950  * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
951  * words in pointers. arc_hdr_realloc() is used to switch a header between
952  * these two allocation states.
953  */
/*
 * L1-only portion of a header, embedded in arc_buf_hdr_t as b_l1hdr.
 * hdr_full_cons() initializes b_byteswap, b_cv, b_refcnt, b_freeze_lock and
 * b_arc_node; hdr_full_dest() destroys b_cv, b_refcnt and b_freeze_lock.
 */
954 typedef struct l1arc_buf_hdr {
955 	kmutex_t		b_freeze_lock;
956 	zio_cksum_t		*b_freeze_cksum;
957 #ifdef ZFS_DEBUG
958 	/*
959 	 * Used for debugging with kmem_flags - by allocating and freeing
960 	 * b_thawed when the buffer is thawed, we get a record of the stack
961 	 * trace that thawed it.
962 	 */
963 	void			*b_thawed;
964 #endif
965 
966 	arc_buf_t		*b_buf;
967 	uint32_t		b_bufcnt;
968 	/* for waiting on writes to complete */
969 	kcondvar_t		b_cv;
	/* set to DMU_BSWAP_NUMFUNCS by the constructor */
970 	uint8_t			b_byteswap;
971 
972 	/* protected by arc state mutex */
973 	arc_state_t		*b_state;
974 	multilist_node_t	b_arc_node;
975 
976 	/* updated atomically */
977 	clock_t			b_arc_access;
978 
979 	/* self protecting */
980 	zfs_refcount_t		b_refcnt;
981 
982 	arc_callback_t		*b_acb;
983 	abd_t			*b_pabd;
984 } l1arc_buf_hdr_t;
985 
986 /*
987  * Encrypted blocks will need to be stored encrypted on the L2ARC
988  * disk as they appear in the main pool. In order for this to work we
989  * need to pass around the encryption parameters so they can be used
990  * to write data to the L2ARC. This struct is only defined in the
991  * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
992  * flag set.
993  */
/*
 * Encryption-related portion of a header, embedded as the last member
 * (b_crypt_hdr) of arc_buf_hdr_t.
 */
994 typedef struct arc_buf_hdr_crypt {
995 	abd_t		*b_rabd;		/* raw encrypted data */
996 	dmu_object_type_t	b_ot;		/* object type */
997 	uint32_t		b_ebufcnt;	/* number of encrypted buffers */
998 
999 	/* dsobj for looking up encryption key for l2arc encryption */
1000 	uint64_t		b_dsobj;	/* for looking up key */
1001 
1002 	/* encryption parameters */
1003 	uint8_t		b_salt[ZIO_DATA_SALT_LEN];
1004 	uint8_t		b_iv[ZIO_DATA_IV_LEN];
1005 
1006 	/*
1007 	 * Technically this could be removed since we will always be able to
1008 	 * get the mac from the bp when we need it. However, it is inconvenient
1009 	 * for callers of arc code to have to pass a bp in all the time. This
1010 	 * also allows us to assert that L2ARC data is properly encrypted to
1011 	 * match the data in the main storage pool.
1012 	 */
1013 	uint8_t		b_mac[ZIO_DATA_MAC_LEN];
1014 } arc_buf_hdr_crypt_t;
1015 
/*
 * L2ARC portion of a header, embedded in arc_buf_hdr_t as b_l2hdr.
 * l2arc_dev_t is forward-declared here; struct l2arc_dev is defined later
 * in the file.
 */
1016 typedef struct l2arc_dev l2arc_dev_t;
1017 
1018 typedef struct l2arc_buf_hdr {
1019 	/* protected by arc_buf_hdr mutex */
1020 	l2arc_dev_t		*b_dev;		/* L2ARC device */
1021 	uint64_t		b_daddr;	/* disk address, offset byte */
1022 
	/* NOTE(review): presumably links into l2arc_dev's l2ad_buflist */
1023 	list_node_t		b_l2node;
1024 } l2arc_buf_hdr_t;
1025 
/*
 * The ARC buffer header.  Member order is significant: HDR_L2ONLY_SIZE is
 * offsetof(b_l1hdr) and HDR_FULL_SIZE is offsetof(b_crypt_hdr), so the
 * common and L2 fields must precede b_l1hdr, and b_crypt_hdr must remain
 * the last member.
 */
1026 struct arc_buf_hdr {
1027 	/* protected by hash lock */
1028 	dva_t			b_dva;
1029 	uint64_t		b_birth;
1030 
1031 	arc_buf_contents_t	b_type;
	/* next header in the same hash bucket (see buf_hash_insert) */
1032 	arc_buf_hdr_t		*b_hash_next;
1033 	arc_flags_t		b_flags;
1034 
1035 	/*
1036 	 * This field stores the size of the data buffer after
1037 	 * compression, and is set in the arc's zio completion handlers.
1038 	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
1039 	 *
1040 	 * While the block pointers can store up to 32MB in their psize
1041 	 * field, we can only store up to 32MB minus 512B. This is due
1042 	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
1043 	 * a field of zeros represents 512B in the bp). We can't use a
1044 	 * bias of 1 since we need to reserve a psize of zero, here, to
1045 	 * represent holes and embedded blocks.
1046 	 *
1047 	 * This isn't a problem in practice, since the maximum size of a
1048 	 * buffer is limited to 16MB, so we never need to store 32MB in
1049 	 * this field. Even in the upstream illumos code base, the
1050 	 * maximum size of a buffer is limited to 16MB.
1051 	 */
1052 	uint16_t		b_psize;
1053 
1054 	/*
1055 	 * This field stores the size of the data buffer before
1056 	 * compression, and cannot change once set. It is in units
1057 	 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes)
1058 	 */
1059 	uint16_t		b_lsize;	/* immutable */
1060 	uint64_t		b_spa;		/* immutable */
1061 
1062 	/* L2ARC fields. Undefined when not in L2ARC. */
1063 	l2arc_buf_hdr_t		b_l2hdr;
1064 	/* L1ARC fields. Undefined when in l2arc_only state */
1065 	l1arc_buf_hdr_t		b_l1hdr;
1066 	/*
1067 	 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED
1068 	 * is set and the L1 header exists.
1069 	 */
1070 	arc_buf_hdr_crypt_t b_crypt_hdr;
1071 };
1072 
/*
 * State and header-flag predicates.
 *
 * GHOST_STATE() is true for arc_mru_ghost, arc_mfu_ghost and arc_l2c_only.
 * The simple HDR_*() tests mask hdr->b_flags with a single ARC_FLAG_* bit,
 * so they yield the masked bit value (zero or nonzero), not a normalized
 * 0/1.  The compound tests (HDR_L2_READING, HDR_HAS_RABD, HDR_ENCRYPTED,
 * HDR_AUTHENTICATED) and HDR_ISTYPE_DATA yield 0 or 1.
 *
 * HDR_HAS_RABD, HDR_ENCRYPTED and HDR_AUTHENTICATED only look at
 * b_crypt_hdr after HDR_PROTECTED(hdr) has been found true.
 */
1073 #define	GHOST_STATE(state)	\
1074 	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
1075 	(state) == arc_l2c_only)
1076 
1077 #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
1078 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
1079 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
1080 #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
1081 #define	HDR_PRESCIENT_PREFETCH(hdr)	\
1082 	((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
1083 #define	HDR_COMPRESSION_ENABLED(hdr)	\
1084 	((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
1085 
1086 #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
/* I/O in progress on a header that has an L2 header */
1087 #define	HDR_L2_READING(hdr)	\
1088 	(((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
1089 	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
1090 #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
1091 #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
1092 #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
1093 #define	HDR_PROTECTED(hdr)	((hdr)->b_flags & ARC_FLAG_PROTECTED)
1094 #define	HDR_NOAUTH(hdr)		((hdr)->b_flags & ARC_FLAG_NOAUTH)
1095 #define	HDR_SHARED_DATA(hdr)	((hdr)->b_flags & ARC_FLAG_SHARED_DATA)
1096 
/* a header is "data" exactly when the metadata flag is clear */
1097 #define	HDR_ISTYPE_METADATA(hdr)	\
1098 	((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
1099 #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
1100 
1101 #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
1102 #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
1103 #define	HDR_HAS_RABD(hdr)	\
1104 	(HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) &&	\
1105 	(hdr)->b_crypt_hdr.b_rabd != NULL)
/* protected headers split into encrypted vs. authenticated by object type */
1106 #define	HDR_ENCRYPTED(hdr)	\
1107 	(HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
1108 #define	HDR_AUTHENTICATED(hdr)	\
1109 	(HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot))
1110 
/*
 * For storing compression mode in b_flags.
 *
 * The mode occupies SPA_COMPRESSBITS bits of b_flags starting at
 * HDR_COMPRESS_OFFSET, which is derived from the position of
 * ARC_FLAG_COMPRESS_0.  HDR_GET_COMPRESS() extracts the field as an
 * enum zio_compress; HDR_SET_COMPRESS() stores cmp into it.
 *
 * HDR_SET_COMPRESS() deliberately carries no trailing semicolon: the call
 * site supplies it, so the macro behaves as exactly one statement.
 */
#define	HDR_COMPRESS_OFFSET	(highbit64(ARC_FLAG_COMPRESS_0) - 1)

#define	HDR_GET_COMPRESS(hdr)	((enum zio_compress)BF32_GET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS))
#define	HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \
	HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp))
1118 
/*
 * Per-arc_buf_t predicates.  ARC_BUF_LAST() is true when the buf has no
 * successor (b_next == NULL); the others mask buf->b_flags with a single
 * ARC_BUF_FLAG_* bit and yield the masked value.
 */
1119 #define	ARC_BUF_LAST(buf)	((buf)->b_next == NULL)
1120 #define	ARC_BUF_SHARED(buf)	((buf)->b_flags & ARC_BUF_FLAG_SHARED)
1121 #define	ARC_BUF_COMPRESSED(buf)	((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED)
1122 #define	ARC_BUF_ENCRYPTED(buf)	((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED)
1123 
1124 /*
1125  * Other sizes
1126  */
1127 
/*
 * Sizes of the three header variants, as signed 64-bit values:
 *   HDR_FULL_CRYPT_SIZE	the whole arc_buf_hdr_t
 *   HDR_FULL_SIZE		everything before b_crypt_hdr
 *   HDR_L2ONLY_SIZE		everything before b_l1hdr
 * buf_init() uses these as the object sizes of the corresponding kmem
 * caches.
 */
1128 #define	HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
1129 #define	HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr))
1130 #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
1131 
1132 /*
1133  * Hash table routines
1134  */
1135 
/*
 * One hash-table lock.  In _KERNEL builds the structure is padded out to
 * HT_LOCK_PAD (64) bytes; the pad size assumes sizeof (kmutex_t) does not
 * exceed HT_LOCK_PAD.
 * NOTE(review): the padding is presumably there to keep adjacent locks on
 * separate cache lines -- confirm.
 */
1136 #define	HT_LOCK_PAD	64
1137 
1138 struct ht_lock {
1139 	kmutex_t	ht_lock;
1140 #ifdef _KERNEL
1141 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
1142 #endif
1143 };
1144 
/*
 * The header hash table.  ht_table is an array of bucket heads chained
 * through b_hash_next, and ht_mask is (number of buckets - 1) as set up by
 * buf_init().  The buckets share BUF_LOCKS (256) locks; a bucket index is
 * mapped to its lock by masking with (BUF_LOCKS - 1), so BUF_LOCKS must be
 * a power of two.
 */
1145 #define	BUF_LOCKS 256
1146 typedef struct buf_hash_table {
1147 	uint64_t ht_mask;
1148 	arc_buf_hdr_t **ht_table;
1149 	struct ht_lock ht_locks[BUF_LOCKS];
1150 } buf_hash_table_t;
1151 
1152 static buf_hash_table_t buf_hash_table;
1153 
/*
 * Hash table addressing helpers.
 *   BUF_HASH_INDEX(spa, dva, birth)	bucket index for an identity:
 *					buf_hash() masked with ht_mask
 *   BUF_HASH_LOCK_NTRY(idx)		the struct ht_lock covering bucket idx
 *   BUF_HASH_LOCK(idx)			pointer to that entry's kmutex_t
 *   HDR_LOCK(hdr)			the lock covering hdr's bucket
 *
 * Every macro argument is parenthesized so that expressions such as
 * "a | b" or "p + 1" can be passed without operator-precedence surprises.
 */
#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) \
	(buf_hash_table.ht_locks[(idx) & (BUF_LOCKS - 1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(hdr) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX((hdr)->b_spa, &(hdr)->b_dva, \
	(hdr)->b_birth)))
1160 
/* 256-entry CRC-64 lookup table; filled in by buf_init() from ZFS_CRC64_POLY */
1161 uint64_t zfs_crc64_table[256];
1162 
1163 /*
1164  * Level 2 ARC
1165  */
1166 
/*
 * Compile-time defaults (L2ARC_*) followed by the run-time tunables that
 * are initialized from them.  l2arc_writes_sent and l2arc_writes_done are
 * aliases for the corresponding arc_stats values.
 */
1167 #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
1168 #define	L2ARC_HEADROOM		2			/* num of writes */
1169 /*
1170  * If we discover during ARC scan any buffers to be compressed, we boost
1171  * our headroom for the next scanning cycle by this percentage multiple.
1172  */
1173 #define	L2ARC_HEADROOM_BOOST	200
1174 #define	L2ARC_FEED_SECS		1		/* caching interval secs */
1175 #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
1176 
1177 #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
1178 #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
1179 
1180 /* L2ARC Performance Tunables */
1181 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
1182 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
1183 uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
/* percentage multiple applied to the headroom; see L2ARC_HEADROOM_BOOST */
1184 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
1185 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
1186 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
1187 boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
1188 boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
1189 boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
1190 
1191 /*
1192  * L2ARC Internals
1193  */
/*
 * Per-device L2ARC state; this is the type behind l2arc_dev_t, which
 * l2arc_buf_hdr_t refers to through b_dev.
 */
1194 struct l2arc_dev {
1195 	vdev_t			*l2ad_vdev;	/* vdev */
1196 	spa_t			*l2ad_spa;	/* spa */
1197 	uint64_t		l2ad_hand;	/* next write location */
1198 	uint64_t		l2ad_start;	/* first addr on device */
1199 	uint64_t		l2ad_end;	/* last addr on device */
1200 	boolean_t		l2ad_first;	/* first sweep through */
1201 	boolean_t		l2ad_writing;	/* currently writing */
1202 	kmutex_t		l2ad_mtx;	/* lock for buffer list */
1203 	list_t			l2ad_buflist;	/* buffer list */
1204 	list_node_t		l2ad_node;	/* device list node */
1205 	zfs_refcount_t		l2ad_alloc;	/* allocated bytes */
1206 };
1207 
/*
 * Global L2ARC bookkeeping.  The upper-case list_t objects provide the
 * storage for the two lists.
 * NOTE(review): the lower-case pointers presumably get aimed at those
 * objects during L2ARC initialization -- confirm in the init code.
 */
1208 static list_t L2ARC_dev_list;			/* device list */
1209 static list_t *l2arc_dev_list;			/* device list pointer */
1210 static kmutex_t l2arc_dev_mtx;			/* device list mutex */
1211 static l2arc_dev_t *l2arc_dev_last;		/* last device used */
1212 static list_t L2ARC_free_on_write;		/* free after write buf list */
1213 static list_t *l2arc_free_on_write;		/* free after write list ptr */
1214 static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
1215 static uint64_t l2arc_ndev;			/* number of devices */
1216 
/*
 * Context saved for an L2ARC read: the header being read plus copies of
 * the original block pointer, bookmark and flags, and a temporary abd.
 * NOTE(review): presumably consumed by l2arc_read_done() -- confirm.
 */
1217 typedef struct l2arc_read_callback {
1218 	arc_buf_hdr_t		*l2rcb_hdr;		/* read header */
1219 	blkptr_t		l2rcb_bp;		/* original blkptr */
1220 	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
1221 	int			l2rcb_flags;		/* original flags */
1222 	abd_t			*l2rcb_abd;		/* temporary buffer */
1223 } l2arc_read_callback_t;
1224 
/*
 * Context saved for an L2ARC write: the target device and the header that
 * marks the head of the write buffer list.
 */
1225 typedef struct l2arc_write_callback {
1226 	l2arc_dev_t	*l2wcb_dev;		/* device info */
1227 	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
1228 } l2arc_write_callback_t;
1229 
/*
 * Record describing an abd (with its size and buffer type) to be freed
 * later.
 * NOTE(review): l2df_list_node presumably links the record into the
 * l2arc_free_on_write list -- confirm.
 */
1230 typedef struct l2arc_data_free {
1231 	/* protected by l2arc_free_on_write_mtx */
1232 	abd_t		*l2df_abd;
1233 	size_t		l2df_size;
1234 	arc_buf_contents_t l2df_type;
1235 	list_node_t	l2df_list_node;
1236 } l2arc_data_free_t;
1237 
/*
 * Synchronization objects for the L2ARC feed thread.
 * NOTE(review): l2arc_thread_exit is presumably the flag used to ask the
 * feed thread to exit -- confirm in the feed thread code.
 */
1238 static kmutex_t l2arc_feed_thr_lock;
1239 static kcondvar_t l2arc_feed_thr_cv;
1240 static uint8_t l2arc_thread_exit;
1241 
/* Forward declaration; the definition appears later in the file. */
1242 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *);
/*
 * Bit flags (each a distinct power of two, so they can be OR-ed together)
 * describing how a buffer should be filled.
 */
1243 typedef enum arc_fill_flags {
1244 	ARC_FILL_LOCKED		= 1 << 0, /* hdr lock is held */
1245 	ARC_FILL_COMPRESSED	= 1 << 1, /* fill with compressed data */
1246 	ARC_FILL_ENCRYPTED	= 1 << 2, /* fill with encrypted data */
1247 	ARC_FILL_NOAUTH		= 1 << 3, /* don't attempt to authenticate */
1248 	ARC_FILL_IN_PLACE	= 1 << 4  /* fill in place (special case) */
1249 } arc_fill_flags_t;
1250 
1251 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *);
1252 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *);
1253 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *);
1254 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *);
1255 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag);
1256 static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t);
1257 static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t);
1258 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
1259 static boolean_t arc_is_overflowing();
1260 static void arc_buf_watch(arc_buf_t *);
1261 
1262 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
1263 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
1264 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1265 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags);
1266 
1267 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *);
1268 static void l2arc_read_done(zio_t *);
1269 
1270 
1271 /*
1272  * We use Cityhash for this. It's fast, and has good hash properties without
1273  * requiring any large static buffers.
1274  */
1275 static uint64_t
1276 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1277 {
1278 	return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth));
1279 }
1280 
/*
 * HDR_EMPTY(hdr): true when both words of the header's DVA are zero, i.e.
 * the header carries no identity (see buf_discard_identity()).
 */
#define	HDR_EMPTY(hdr)						\
	((hdr)->b_dva.dva_word[0] == 0 &&			\
	(hdr)->b_dva.dva_word[1] == 0)

/*
 * HDR_EQUAL(spa, dva, birth, hdr): true when hdr has exactly the identity
 * (spa, dva, birth).  The expansion is fully parenthesized, including the
 * spa and birth arguments, so the macro can be negated or combined with
 * other operators and can be handed arbitrary expressions.
 */
#define	HDR_EQUAL(spa, dva, birth, hdr)				\
	(((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((hdr)->b_birth == (birth)) && ((hdr)->b_spa == (spa)))
1289 
1290 static void
1291 buf_discard_identity(arc_buf_hdr_t *hdr)
1292 {
1293 	hdr->b_dva.dva_word[0] = 0;
1294 	hdr->b_dva.dva_word[1] = 0;
1295 	hdr->b_birth = 0;
1296 }
1297 
1298 static arc_buf_hdr_t *
1299 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1300 {
1301 	const dva_t *dva = BP_IDENTITY(bp);
1302 	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1303 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1304 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1305 	arc_buf_hdr_t *hdr;
1306 
1307 	mutex_enter(hash_lock);
1308 	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1309 	    hdr = hdr->b_hash_next) {
1310 		if (HDR_EQUAL(spa, dva, birth, hdr)) {
1311 			*lockp = hash_lock;
1312 			return (hdr);
1313 		}
1314 	}
1315 	mutex_exit(hash_lock);
1316 	*lockp = NULL;
1317 	return (NULL);
1318 }
1319 
1320 /*
1321  * Insert an entry into the hash table.  If there is already an element
1322  * equal to elem in the hash table, then the already existing element
1323  * will be returned and the new element will not be inserted.
1324  * Otherwise returns NULL.
1325  * If lockp == NULL, the caller is assumed to already hold the hash lock.
1326  */
1327 static arc_buf_hdr_t *
1328 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1329 {
1330 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1331 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1332 	arc_buf_hdr_t *fhdr;
1333 	uint32_t i;
1334 
1335 	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1336 	ASSERT(hdr->b_birth != 0);
1337 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1338 
1339 	if (lockp != NULL) {
1340 		*lockp = hash_lock;
1341 		mutex_enter(hash_lock);
1342 	} else {
1343 		ASSERT(MUTEX_HELD(hash_lock));
1344 	}
1345 
1346 	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1347 	    fhdr = fhdr->b_hash_next, i++) {
1348 		if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1349 			return (fhdr);
1350 	}
1351 
1352 	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1353 	buf_hash_table.ht_table[idx] = hdr;
1354 	arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1355 
1356 	/* collect some hash table performance data */
1357 	if (i > 0) {
1358 		ARCSTAT_BUMP(arcstat_hash_collisions);
1359 		if (i == 1)
1360 			ARCSTAT_BUMP(arcstat_hash_chains);
1361 
1362 		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1363 	}
1364 
1365 	ARCSTAT_BUMP(arcstat_hash_elements);
1366 	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1367 
1368 	return (NULL);
1369 }
1370 
1371 static void
1372 buf_hash_remove(arc_buf_hdr_t *hdr)
1373 {
1374 	arc_buf_hdr_t *fhdr, **hdrp;
1375 	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1376 
1377 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1378 	ASSERT(HDR_IN_HASH_TABLE(hdr));
1379 
1380 	hdrp = &buf_hash_table.ht_table[idx];
1381 	while ((fhdr = *hdrp) != hdr) {
1382 		ASSERT3P(fhdr, !=, NULL);
1383 		hdrp = &fhdr->b_hash_next;
1384 	}
1385 	*hdrp = hdr->b_hash_next;
1386 	hdr->b_hash_next = NULL;
1387 	arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
1388 
1389 	/* collect some hash table performance data */
1390 	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1391 
1392 	if (buf_hash_table.ht_table[idx] &&
1393 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1394 		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1395 }
1396 
1397 /*
1398  * Global data structures and functions for the buf kmem cache.
1399  */
1400 
/*
 * kmem caches created in buf_init() and destroyed in buf_fini():
 * hdr_full_cache (HDR_FULL_SIZE objects), hdr_full_crypt_cache
 * (HDR_FULL_CRYPT_SIZE), hdr_l2only_cache (HDR_L2ONLY_SIZE) and buf_cache
 * (arc_buf_t).
 */
1401 static kmem_cache_t *hdr_full_cache;
1402 static kmem_cache_t *hdr_full_crypt_cache;
1403 static kmem_cache_t *hdr_l2only_cache;
1404 static kmem_cache_t *buf_cache;
1405 
1406 static void
1407 buf_fini(void)
1408 {
1409 	int i;
1410 
1411 	kmem_free(buf_hash_table.ht_table,
1412 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1413 	for (i = 0; i < BUF_LOCKS; i++)
1414 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1415 	kmem_cache_destroy(hdr_full_cache);
1416 	kmem_cache_destroy(hdr_full_crypt_cache);
1417 	kmem_cache_destroy(hdr_l2only_cache);
1418 	kmem_cache_destroy(buf_cache);
1419 }
1420 
1421 /*
1422  * Constructor callback - called when the cache is empty
1423  * and a new buf is requested.
1424  */
1425 /* ARGSUSED */
1426 static int
1427 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1428 {
1429 	arc_buf_hdr_t *hdr = vbuf;
1430 
1431 	bzero(hdr, HDR_FULL_SIZE);
1432 	hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
1433 	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1434 	zfs_refcount_create(&hdr->b_l1hdr.b_refcnt);
1435 	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1436 	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1437 	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1438 
1439 	return (0);
1440 }
1441 
1442 /* ARGSUSED */
1443 static int
1444 hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag)
1445 {
1446 	arc_buf_hdr_t *hdr = vbuf;
1447 
1448 	(void) hdr_full_cons(vbuf, unused, kmflag);
1449 	bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr));
1450 	arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1451 
1452 	return (0);
1453 }
1454 
1455 /* ARGSUSED */
1456 static int
1457 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1458 {
1459 	arc_buf_hdr_t *hdr = vbuf;
1460 
1461 	bzero(hdr, HDR_L2ONLY_SIZE);
1462 	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1463 
1464 	return (0);
1465 }
1466 
1467 /* ARGSUSED */
1468 static int
1469 buf_cons(void *vbuf, void *unused, int kmflag)
1470 {
1471 	arc_buf_t *buf = vbuf;
1472 
1473 	bzero(buf, sizeof (arc_buf_t));
1474 	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1475 	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1476 
1477 	return (0);
1478 }
1479 
1480 /*
1481  * Destructor callback - called when a cached buf is
1482  * no longer required.
1483  */
1484 /* ARGSUSED */
1485 static void
1486 hdr_full_dest(void *vbuf, void *unused)
1487 {
1488 	arc_buf_hdr_t *hdr = vbuf;
1489 
1490 	ASSERT(HDR_EMPTY(hdr));
1491 	cv_destroy(&hdr->b_l1hdr.b_cv);
1492 	zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1493 	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1494 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1495 	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1496 }
1497 
1498 /* ARGSUSED */
1499 static void
1500 hdr_full_crypt_dest(void *vbuf, void *unused)
1501 {
1502 	arc_buf_hdr_t *hdr = vbuf;
1503 
1504 	hdr_full_dest(hdr, unused);
1505 	arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS);
1506 }
1507 
1508 /* ARGSUSED */
1509 static void
1510 hdr_l2only_dest(void *vbuf, void *unused)
1511 {
1512 	arc_buf_hdr_t *hdr = vbuf;
1513 
1514 	ASSERT(HDR_EMPTY(hdr));
1515 	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1516 }
1517 
1518 /* ARGSUSED */
1519 static void
1520 buf_dest(void *vbuf, void *unused)
1521 {
1522 	arc_buf_t *buf = vbuf;
1523 
1524 	mutex_destroy(&buf->b_evict_lock);
1525 	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1526 }
1527 
1528 /*
1529  * Reclaim callback -- invoked when memory is low.
1530  */
1531 /* ARGSUSED */
1532 static void
1533 hdr_recl(void *unused)
1534 {
1535 	dprintf("hdr_recl called\n");
1536 	/*
1537 	 * umem calls the reclaim func when we destroy the buf cache,
1538 	 * which is after we do arc_fini().
1539 	 */
1540 	if (arc_initialized)
1541 		zthr_wakeup(arc_reap_zthr);
1542 }
1543 
1544 static void
1545 buf_init(void)
1546 {
1547 	uint64_t *ct;
1548 	uint64_t hsize = 1ULL << 12;
1549 	int i, j;
1550 
1551 	/*
1552 	 * The hash table is big enough to fill all of physical memory
1553 	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1554 	 * By default, the table will take up
1555 	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1556 	 */
1557 	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1558 		hsize <<= 1;
1559 retry:
1560 	buf_hash_table.ht_mask = hsize - 1;
1561 	buf_hash_table.ht_table =
1562 	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1563 	if (buf_hash_table.ht_table == NULL) {
1564 		ASSERT(hsize > (1ULL << 8));
1565 		hsize >>= 1;
1566 		goto retry;
1567 	}
1568 
1569 	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1570 	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1571 	hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt",
1572 	    HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest,
1573 	    hdr_recl, NULL, NULL, 0);
1574 	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1575 	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1576 	    NULL, NULL, 0);
1577 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1578 	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1579 
1580 	for (i = 0; i < 256; i++)
1581 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1582 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1583 
1584 	for (i = 0; i < BUF_LOCKS; i++) {
1585 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1586 		    NULL, MUTEX_DEFAULT, NULL);
1587 	}
1588 }
1589 
1590 /*
1591  * This is the size that the buf occupies in memory. If the buf is compressed,
1592  * it will correspond to the compressed size. You should use this method of
1593  * getting the buf size unless you explicitly need the logical size.
1594  */
1595 int32_t
1596 arc_buf_size(arc_buf_t *buf)
1597 {
1598 	return (ARC_BUF_COMPRESSED(buf) ?
1599 	    HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr));
1600 }
1601 
1602 int32_t
1603 arc_buf_lsize(arc_buf_t *buf)
1604 {
1605 	return (HDR_GET_LSIZE(buf->b_hdr));
1606 }
1607 
1608 /*
1609  * This function will return B_TRUE if the buffer is encrypted in memory.
1610  * This buffer can be decrypted by calling arc_untransform().
1611  */
1612 boolean_t
1613 arc_is_encrypted(arc_buf_t *buf)
1614 {
1615 	return (ARC_BUF_ENCRYPTED(buf) != 0);
1616 }
1617 
1618 /*
1619  * Returns B_TRUE if the buffer represents data that has not had its MAC
1620  * verified yet.
1621  */
1622 boolean_t
1623 arc_is_unauthenticated(arc_buf_t *buf)
1624 {
1625 	return (HDR_NOAUTH(buf->b_hdr) != 0);
1626 }
1627 
1628 void
1629 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt,
1630     uint8_t *iv, uint8_t *mac)
1631 {
1632 	arc_buf_hdr_t *hdr = buf->b_hdr;
1633 
1634 	ASSERT(HDR_PROTECTED(hdr));
1635 
1636 	bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN);
1637 	bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN);
1638 	bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN);
1639 	*byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
1640 	    /* CONSTCOND */
1641 	    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
1642 }
1643 
1644 /*
1645  * Indicates how this buffer is compressed in memory. If it is not compressed
1646  * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with
1647  * arc_untransform() as long as it is also unencrypted.
1648  */
1649 enum zio_compress
1650 arc_get_compression(arc_buf_t *buf)
1651 {
1652 	return (ARC_BUF_COMPRESSED(buf) ?
1653 	    HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF);
1654 }
1655 
#define	ARC_MINTIME	(hz >> 4)	/* hz / 16, i.e. about 62 ms */
1657 
1658 /*
1659  * Return the compression algorithm used to store this data in the ARC. If ARC
1660  * compression is enabled or this is an encrypted block, this will be the same
1661  * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF.
1662  */
1663 static inline enum zio_compress
1664 arc_hdr_get_compress(arc_buf_hdr_t *hdr)
1665 {
1666 	return (HDR_COMPRESSION_ENABLED(hdr) ?
1667 	    HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF);
1668 }
1669 
1670 static inline boolean_t
1671 arc_buf_is_shared(arc_buf_t *buf)
1672 {
1673 	boolean_t shared = (buf->b_data != NULL &&
1674 	    buf->b_hdr->b_l1hdr.b_pabd != NULL &&
1675 	    abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
1676 	    buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
1677 	IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
1678 	IMPLY(shared, ARC_BUF_SHARED(buf));
1679 	IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
1680 
1681 	/*
1682 	 * It would be nice to assert arc_can_share() too, but the "hdr isn't
1683 	 * already being shared" requirement prevents us from doing that.
1684 	 */
1685 
1686 	return (shared);
1687 }
1688 
1689 /*
1690  * Free the checksum associated with this header. If there is no checksum, this
1691  * is a no-op.
1692  */
1693 static inline void
1694 arc_cksum_free(arc_buf_hdr_t *hdr)
1695 {
1696 	ASSERT(HDR_HAS_L1HDR(hdr));
1697 
1698 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1699 	if (hdr->b_l1hdr.b_freeze_cksum != NULL) {
1700 		kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t));
1701 		hdr->b_l1hdr.b_freeze_cksum = NULL;
1702 	}
1703 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1704 }
1705 
1706 /*
1707  * Return true iff at least one of the bufs on hdr is not compressed.
1708  * Encrypted buffers count as compressed.
1709  */
1710 static boolean_t
1711 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr)
1712 {
1713 	ASSERT(hdr->b_l1hdr.b_state == arc_anon ||
1714 	    MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
1715 
1716 	for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) {
1717 		if (!ARC_BUF_COMPRESSED(b)) {
1718 			return (B_TRUE);
1719 		}
1720 	}
1721 	return (B_FALSE);
1722 }
1723 
1724 /*
1725  * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data
1726  * matches the checksum that is stored in the hdr. If there is no checksum,
1727  * or if the buf is compressed, this is a no-op.
1728  */
1729 static void
1730 arc_cksum_verify(arc_buf_t *buf)
1731 {
1732 	arc_buf_hdr_t *hdr = buf->b_hdr;
1733 	zio_cksum_t zc;
1734 
1735 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1736 		return;
1737 
1738 	if (ARC_BUF_COMPRESSED(buf))
1739 		return;
1740 
1741 	ASSERT(HDR_HAS_L1HDR(hdr));
1742 
1743 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1744 
1745 	if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) {
1746 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1747 		return;
1748 	}
1749 
1750 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc);
1751 	if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc))
1752 		panic("buffer modified while frozen!");
1753 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1754 }
1755 
1756 /*
1757  * This function makes the assumption that data stored in the L2ARC
1758  * will be transformed exactly as it is in the main pool. Because of
1759  * this we can verify the checksum against the reading process's bp.
1760  */
1761 static boolean_t
1762 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio)
1763 {
1764 	enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp);
1765 	boolean_t valid_cksum;
1766 
1767 	ASSERT(!BP_IS_EMBEDDED(zio->io_bp));
1768 	VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr));
1769 
1770 	/*
1771 	 * We rely on the blkptr's checksum to determine if the block
1772 	 * is valid or not. When compressed arc is enabled, the l2arc
1773 	 * writes the block to the l2arc just as it appears in the pool.
1774 	 * This allows us to use the blkptr's checksum to validate the
1775 	 * data that we just read off of the l2arc without having to store
1776 	 * a separate checksum in the arc_buf_hdr_t. However, if compressed
1777 	 * arc is disabled, then the data written to the l2arc is always
1778 	 * uncompressed and won't match the block as it exists in the main
1779 	 * pool. When this is the case, we must first compress it if it is
1780 	 * compressed on the main pool before we can validate the checksum.
1781 	 */
1782 	if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) {
1783 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
1784 		uint64_t lsize = HDR_GET_LSIZE(hdr);
1785 		uint64_t csize;
1786 
1787 		abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE);
1788 		csize = zio_compress_data(compress, zio->io_abd,
1789 		    abd_to_buf(cdata), lsize);
1790 
1791 		ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr));
1792 		if (csize < HDR_GET_PSIZE(hdr)) {
1793 			/*
1794 			 * Compressed blocks are always a multiple of the
1795 			 * smallest ashift in the pool. Ideally, we would
1796 			 * like to round up the csize to the next
1797 			 * spa_min_ashift but that value may have changed
1798 			 * since the block was last written. Instead,
1799 			 * we rely on the fact that the hdr's psize
1800 			 * was set to the psize of the block when it was
1801 			 * last written. We set the csize to that value
1802 			 * and zero out any part that should not contain
1803 			 * data.
1804 			 */
1805 			abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize);
1806 			csize = HDR_GET_PSIZE(hdr);
1807 		}
1808 		zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL);
1809 	}
1810 
1811 	/*
1812 	 * Block pointers always store the checksum for the logical data.
1813 	 * If the block pointer has the gang bit set, then the checksum
1814 	 * it represents is for the reconstituted data and not for an
1815 	 * individual gang member. The zio pipeline, however, must be able to
1816 	 * determine the checksum of each of the gang constituents so it
1817 	 * treats the checksum comparison differently than what we need
1818 	 * for l2arc blocks. This prevents us from using the
1819 	 * zio_checksum_error() interface directly. Instead we must call the
1820 	 * zio_checksum_error_impl() so that we can ensure the checksum is
1821 	 * generated using the correct checksum algorithm and accounts for the
1822 	 * logical I/O size and not just a gang fragment.
1823 	 */
1824 	valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp,
1825 	    BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size,
1826 	    zio->io_offset, NULL) == 0);
1827 	zio_pop_transforms(zio);
1828 	return (valid_cksum);
1829 }
1830 
1831 /*
1832  * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a
1833  * checksum and attaches it to the buf's hdr so that we can ensure that the buf
1834  * isn't modified later on. If buf is compressed or there is already a checksum
1835  * on the hdr, this is a no-op (we only checksum uncompressed bufs).
1836  */
1837 static void
1838 arc_cksum_compute(arc_buf_t *buf)
1839 {
1840 	arc_buf_hdr_t *hdr = buf->b_hdr;
1841 
1842 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1843 		return;
1844 
1845 	ASSERT(HDR_HAS_L1HDR(hdr));
1846 
1847 	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1848 	if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) {
1849 		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1850 		return;
1851 	}
1852 
1853 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
1854 	ASSERT(!ARC_BUF_COMPRESSED(buf));
1855 	hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t),
1856 	    KM_SLEEP);
1857 	fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL,
1858 	    hdr->b_l1hdr.b_freeze_cksum);
1859 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1860 	arc_buf_watch(buf);
1861 }
1862 
1863 #ifndef _KERNEL
1864 typedef struct procctl {
1865 	long cmd;
1866 	prwatch_t prwatch;
1867 } procctl_t;
1868 #endif
1869 
1870 /* ARGSUSED */
1871 static void
1872 arc_buf_unwatch(arc_buf_t *buf)
1873 {
1874 #ifndef _KERNEL
1875 	if (arc_watch) {
1876 		int result;
1877 		procctl_t ctl;
1878 		ctl.cmd = PCWATCH;
1879 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1880 		ctl.prwatch.pr_size = 0;
1881 		ctl.prwatch.pr_wflags = 0;
1882 		result = write(arc_procfd, &ctl, sizeof (ctl));
1883 		ASSERT3U(result, ==, sizeof (ctl));
1884 	}
1885 #endif
1886 }
1887 
1888 /* ARGSUSED */
1889 static void
1890 arc_buf_watch(arc_buf_t *buf)
1891 {
1892 #ifndef _KERNEL
1893 	if (arc_watch) {
1894 		int result;
1895 		procctl_t ctl;
1896 		ctl.cmd = PCWATCH;
1897 		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1898 		ctl.prwatch.pr_size = arc_buf_size(buf);
1899 		ctl.prwatch.pr_wflags = WA_WRITE;
1900 		result = write(arc_procfd, &ctl, sizeof (ctl));
1901 		ASSERT3U(result, ==, sizeof (ctl));
1902 	}
1903 #endif
1904 }
1905 
1906 static arc_buf_contents_t
1907 arc_buf_type(arc_buf_hdr_t *hdr)
1908 {
1909 	arc_buf_contents_t type;
1910 	if (HDR_ISTYPE_METADATA(hdr)) {
1911 		type = ARC_BUFC_METADATA;
1912 	} else {
1913 		type = ARC_BUFC_DATA;
1914 	}
1915 	VERIFY3U(hdr->b_type, ==, type);
1916 	return (type);
1917 }
1918 
1919 boolean_t
1920 arc_is_metadata(arc_buf_t *buf)
1921 {
1922 	return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0);
1923 }
1924 
1925 static uint32_t
1926 arc_bufc_to_flags(arc_buf_contents_t type)
1927 {
1928 	switch (type) {
1929 	case ARC_BUFC_DATA:
1930 		/* metadata field is 0 if buffer contains normal data */
1931 		return (0);
1932 	case ARC_BUFC_METADATA:
1933 		return (ARC_FLAG_BUFC_METADATA);
1934 	default:
1935 		break;
1936 	}
1937 	panic("undefined ARC buffer type!");
1938 	return ((uint32_t)-1);
1939 }
1940 
1941 void
1942 arc_buf_thaw(arc_buf_t *buf)
1943 {
1944 	arc_buf_hdr_t *hdr = buf->b_hdr;
1945 
1946 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
1947 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1948 
1949 	arc_cksum_verify(buf);
1950 
1951 	/*
1952 	 * Compressed buffers do not manipulate the b_freeze_cksum.
1953 	 */
1954 	if (ARC_BUF_COMPRESSED(buf))
1955 		return;
1956 
1957 	ASSERT(HDR_HAS_L1HDR(hdr));
1958 	arc_cksum_free(hdr);
1959 
1960 	mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
1961 #ifdef ZFS_DEBUG
1962 	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1963 		if (hdr->b_l1hdr.b_thawed != NULL)
1964 			kmem_free(hdr->b_l1hdr.b_thawed, 1);
1965 		hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1966 	}
1967 #endif
1968 
1969 	mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
1970 
1971 	arc_buf_unwatch(buf);
1972 }
1973 
1974 void
1975 arc_buf_freeze(arc_buf_t *buf)
1976 {
1977 	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1978 		return;
1979 
1980 	if (ARC_BUF_COMPRESSED(buf))
1981 		return;
1982 
1983 	ASSERT(HDR_HAS_L1HDR(buf->b_hdr));
1984 	arc_cksum_compute(buf);
1985 }
1986 
1987 /*
1988  * The arc_buf_hdr_t's b_flags should never be modified directly. Instead,
1989  * the following functions should be used to ensure that the flags are
1990  * updated in a thread-safe way. When manipulating the flags either
1991  * the hash_lock must be held or the hdr must be undiscoverable. This
1992  * ensures that we're not racing with any other threads when updating
1993  * the flags.
1994  */
1995 static inline void
1996 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
1997 {
1998 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
1999 	hdr->b_flags |= flags;
2000 }
2001 
2002 static inline void
2003 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags)
2004 {
2005 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2006 	hdr->b_flags &= ~flags;
2007 }
2008 
2009 /*
2010  * Setting the compression bits in the arc_buf_hdr_t's b_flags is
2011  * done in a special way since we have to clear and set bits
2012  * at the same time. Consumers that wish to set the compression bits
2013  * must use this function to ensure that the flags are updated in
2014  * thread-safe manner.
2015  */
2016 static void
2017 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp)
2018 {
2019 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2020 
2021 	/*
2022 	 * Holes and embedded blocks will always have a psize = 0 so
2023 	 * we ignore the compression of the blkptr and set the
2024 	 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF.
2025 	 * Holes and embedded blocks remain anonymous so we don't
2026 	 * want to uncompress them. Mark them as uncompressed.
2027 	 */
2028 	if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) {
2029 		arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2030 		ASSERT(!HDR_COMPRESSION_ENABLED(hdr));
2031 	} else {
2032 		arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC);
2033 		ASSERT(HDR_COMPRESSION_ENABLED(hdr));
2034 	}
2035 
2036 	HDR_SET_COMPRESS(hdr, cmp);
2037 	ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp);
2038 }
2039 
2040 /*
2041  * Looks for another buf on the same hdr which has the data decompressed, copies
2042  * from it, and returns true. If no such buf exists, returns false.
2043  */
2044 static boolean_t
2045 arc_buf_try_copy_decompressed_data(arc_buf_t *buf)
2046 {
2047 	arc_buf_hdr_t *hdr = buf->b_hdr;
2048 	boolean_t copied = B_FALSE;
2049 
2050 	ASSERT(HDR_HAS_L1HDR(hdr));
2051 	ASSERT3P(buf->b_data, !=, NULL);
2052 	ASSERT(!ARC_BUF_COMPRESSED(buf));
2053 
2054 	for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL;
2055 	    from = from->b_next) {
2056 		/* can't use our own data buffer */
2057 		if (from == buf) {
2058 			continue;
2059 		}
2060 
2061 		if (!ARC_BUF_COMPRESSED(from)) {
2062 			bcopy(from->b_data, buf->b_data, arc_buf_size(buf));
2063 			copied = B_TRUE;
2064 			break;
2065 		}
2066 	}
2067 
2068 	/*
2069 	 * Note: With encryption support, the following assertion is no longer
2070 	 * necessarily valid. If we receive two back to back raw snapshots
2071 	 * (send -w), the second receive can use a hdr with a cksum already
2072 	 * calculated. This happens via:
2073 	 *    dmu_recv_stream() -> receive_read_record() -> arc_loan_raw_buf()
2074 	 * The rsend/send_mixed_raw test case exercises this code path.
2075 	 *
2076 	 * There were no decompressed bufs, so there should not be a
2077 	 * checksum on the hdr either.
2078 	 * EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL);
2079 	 */
2080 
2081 	return (copied);
2082 }
2083 
2084 /*
2085  * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t.
2086  */
2087 static uint64_t
2088 arc_hdr_size(arc_buf_hdr_t *hdr)
2089 {
2090 	uint64_t size;
2091 
2092 	if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
2093 	    HDR_GET_PSIZE(hdr) > 0) {
2094 		size = HDR_GET_PSIZE(hdr);
2095 	} else {
2096 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0);
2097 		size = HDR_GET_LSIZE(hdr);
2098 	}
2099 	return (size);
2100 }
2101 
2102 static int
2103 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj)
2104 {
2105 	int ret;
2106 	uint64_t csize;
2107 	uint64_t lsize = HDR_GET_LSIZE(hdr);
2108 	uint64_t psize = HDR_GET_PSIZE(hdr);
2109 	void *tmpbuf = NULL;
2110 	abd_t *abd = hdr->b_l1hdr.b_pabd;
2111 
2112 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2113 	ASSERT(HDR_AUTHENTICATED(hdr));
2114 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2115 
2116 	/*
2117 	 * The MAC is calculated on the compressed data that is stored on disk.
2118 	 * However, if compressed arc is disabled we will only have the
2119 	 * decompressed data available to us now. Compress it into a temporary
2120 	 * abd so we can verify the MAC. The performance overhead of this will
2121 	 * be relatively low, since most objects in an encrypted objset will
2122 	 * be encrypted (instead of authenticated) anyway.
2123 	 */
2124 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
2125 	    !HDR_COMPRESSION_ENABLED(hdr)) {
2126 		tmpbuf = zio_buf_alloc(lsize);
2127 		abd = abd_get_from_buf(tmpbuf, lsize);
2128 		abd_take_ownership_of_buf(abd, B_TRUE);
2129 
2130 		csize = zio_compress_data(HDR_GET_COMPRESS(hdr),
2131 		    hdr->b_l1hdr.b_pabd, tmpbuf, lsize);
2132 		ASSERT3U(csize, <=, psize);
2133 		abd_zero_off(abd, csize, psize - csize);
2134 	}
2135 
2136 	/*
2137 	 * Authentication is best effort. We authenticate whenever the key is
2138 	 * available. If we succeed we clear ARC_FLAG_NOAUTH.
2139 	 */
2140 	if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) {
2141 		ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF);
2142 		ASSERT3U(lsize, ==, psize);
2143 		ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd,
2144 		    psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
2145 	} else {
2146 		ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize,
2147 		    hdr->b_crypt_hdr.b_mac);
2148 	}
2149 
2150 	if (ret == 0)
2151 		arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH);
2152 	else if (ret != ENOENT)
2153 		goto error;
2154 
2155 	if (tmpbuf != NULL)
2156 		abd_free(abd);
2157 
2158 	return (0);
2159 
2160 error:
2161 	if (tmpbuf != NULL)
2162 		abd_free(abd);
2163 
2164 	return (ret);
2165 }
2166 
2167 /*
2168  * This function will take a header that only has raw encrypted data in
2169  * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in
2170  * b_l1hdr.b_pabd. If designated in the header flags, this function will
2171  * also decompress the data.
2172  */
2173 static int
2174 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb)
2175 {
2176 	int ret;
2177 	abd_t *cabd = NULL;
2178 	void *tmp = NULL;
2179 	boolean_t no_crypt = B_FALSE;
2180 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
2181 
2182 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2183 	ASSERT(HDR_ENCRYPTED(hdr));
2184 
2185 	arc_hdr_alloc_pabd(hdr, B_FALSE);
2186 
2187 	ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot,
2188 	    B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv,
2189 	    hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd,
2190 	    hdr->b_crypt_hdr.b_rabd, &no_crypt);
2191 	if (ret != 0)
2192 		goto error;
2193 
2194 	if (no_crypt) {
2195 		abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd,
2196 		    HDR_GET_PSIZE(hdr));
2197 	}
2198 
2199 	/*
2200 	 * If this header has disabled arc compression but the b_pabd is
2201 	 * compressed after decrypting it, we need to decompress the newly
2202 	 * decrypted data.
2203 	 */
2204 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
2205 	    !HDR_COMPRESSION_ENABLED(hdr)) {
2206 		/*
2207 		 * We want to make sure that we are correctly honoring the
2208 		 * zfs_abd_scatter_enabled setting, so we allocate an abd here
2209 		 * and then loan a buffer from it, rather than allocating a
2210 		 * linear buffer and wrapping it in an abd later.
2211 		 */
2212 		cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
2213 		tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
2214 
2215 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2216 		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
2217 		    HDR_GET_LSIZE(hdr));
2218 		if (ret != 0) {
2219 			abd_return_buf(cabd, tmp, arc_hdr_size(hdr));
2220 			goto error;
2221 		}
2222 
2223 		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
2224 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
2225 		    arc_hdr_size(hdr), hdr);
2226 		hdr->b_l1hdr.b_pabd = cabd;
2227 	}
2228 
2229 	return (0);
2230 
2231 error:
2232 	arc_hdr_free_pabd(hdr, B_FALSE);
2233 	if (cabd != NULL)
2234 		arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr);
2235 
2236 	return (ret);
2237 }
2238 
2239 /*
2240  * This function is called during arc_buf_fill() to prepare the header's
2241  * abd plaintext pointer for use. This involves authenticated protected
2242  * data and decrypting encrypted data into the plaintext abd.
2243  */
2244 static int
2245 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa,
2246     const zbookmark_phys_t *zb, boolean_t noauth)
2247 {
2248 	int ret;
2249 
2250 	ASSERT(HDR_PROTECTED(hdr));
2251 
2252 	if (hash_lock != NULL)
2253 		mutex_enter(hash_lock);
2254 
2255 	if (HDR_NOAUTH(hdr) && !noauth) {
2256 		/*
2257 		 * The caller requested authenticated data but our data has
2258 		 * not been authenticated yet. Verify the MAC now if we can.
2259 		 */
2260 		ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset);
2261 		if (ret != 0)
2262 			goto error;
2263 	} else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) {
2264 		/*
2265 		 * If we only have the encrypted version of the data, but the
2266 		 * unencrypted version was requested we take this opportunity
2267 		 * to store the decrypted version in the header for future use.
2268 		 */
2269 		ret = arc_hdr_decrypt(hdr, spa, zb);
2270 		if (ret != 0)
2271 			goto error;
2272 	}
2273 
2274 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2275 
2276 	if (hash_lock != NULL)
2277 		mutex_exit(hash_lock);
2278 
2279 	return (0);
2280 
2281 error:
2282 	if (hash_lock != NULL)
2283 		mutex_exit(hash_lock);
2284 
2285 	return (ret);
2286 }
2287 
2288 /*
2289  * This function is used by the dbuf code to decrypt bonus buffers in place.
2290  * The dbuf code itself doesn't have any locking for decrypting a shared dnode
2291  * block, so we use the hash lock here to protect against concurrent calls to
2292  * arc_buf_fill().
2293  */
2294 /* ARGSUSED */
2295 static void
2296 arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock)
2297 {
2298 	arc_buf_hdr_t *hdr = buf->b_hdr;
2299 
2300 	ASSERT(HDR_ENCRYPTED(hdr));
2301 	ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
2302 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
2303 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
2304 
2305 	zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data,
2306 	    arc_buf_size(buf));
2307 	buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
2308 	buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2309 	hdr->b_crypt_hdr.b_ebufcnt -= 1;
2310 }
2311 
2312 /*
2313  * Given a buf that has a data buffer attached to it, this function will
2314  * efficiently fill the buf with data of the specified compression setting from
2315  * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr
2316  * are already sharing a data buf, no copy is performed.
2317  *
2318  * If the buf is marked as compressed but uncompressed data was requested, this
2319  * will allocate a new data buffer for the buf, remove that flag, and fill the
2320  * buf with uncompressed data. You can't request a compressed buf on a hdr with
2321  * uncompressed data, and (since we haven't added support for it yet) if you
2322  * want compressed data your buf must already be marked as compressed and have
2323  * the correct-sized data buffer.
2324  */
2325 static int
2326 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2327     arc_fill_flags_t flags)
2328 {
2329 	int error = 0;
2330 	arc_buf_hdr_t *hdr = buf->b_hdr;
2331 	boolean_t hdr_compressed =
2332 	    (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
2333 	boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0;
2334 	boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0;
2335 	dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap;
2336 	kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr);
2337 
2338 	ASSERT3P(buf->b_data, !=, NULL);
2339 	IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf));
2340 	IMPLY(compressed, ARC_BUF_COMPRESSED(buf));
2341 	IMPLY(encrypted, HDR_ENCRYPTED(hdr));
2342 	IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
2343 	IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
2344 	IMPLY(encrypted, !ARC_BUF_SHARED(buf));
2345 
2346 	/*
2347 	 * If the caller wanted encrypted data we just need to copy it from
2348 	 * b_rabd and potentially byteswap it. We won't be able to do any
2349 	 * further transforms on it.
2350 	 */
2351 	if (encrypted) {
2352 		ASSERT(HDR_HAS_RABD(hdr));
2353 		abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd,
2354 		    HDR_GET_PSIZE(hdr));
2355 		goto byteswap;
2356 	}
2357 
2358 	/*
2359 	 * Adjust encrypted and authenticated headers to accomodate
2360 	 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are
2361 	 * allowed to fail decryption due to keys not being loaded
2362 	 * without being marked as an IO error.
2363 	 */
2364 	if (HDR_PROTECTED(hdr)) {
2365 		error = arc_fill_hdr_crypt(hdr, hash_lock, spa,
2366 		    zb, !!(flags & ARC_FILL_NOAUTH));
2367 		if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) {
2368 			return (error);
2369 		} else if (error != 0) {
2370 			if (hash_lock != NULL)
2371 				mutex_enter(hash_lock);
2372 			arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
2373 			if (hash_lock != NULL)
2374 				mutex_exit(hash_lock);
2375 			return (error);
2376 		}
2377 	}
2378 
2379 	/*
2380 	 * There is a special case here for dnode blocks which are
2381 	 * decrypting their bonus buffers. These blocks may request to
2382 	 * be decrypted in-place. This is necessary because there may
2383 	 * be many dnodes pointing into this buffer and there is
2384 	 * currently no method to synchronize replacing the backing
2385 	 * b_data buffer and updating all of the pointers. Here we use
2386 	 * the hash lock to ensure there are no races. If the need
2387 	 * arises for other types to be decrypted in-place, they must
2388 	 * add handling here as well.
2389 	 */
2390 	if ((flags & ARC_FILL_IN_PLACE) != 0) {
2391 		ASSERT(!hdr_compressed);
2392 		ASSERT(!compressed);
2393 		ASSERT(!encrypted);
2394 
2395 		if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) {
2396 			ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE);
2397 
2398 			if (hash_lock != NULL)
2399 				mutex_enter(hash_lock);
2400 			arc_buf_untransform_in_place(buf, hash_lock);
2401 			if (hash_lock != NULL)
2402 				mutex_exit(hash_lock);
2403 
2404 			/* Compute the hdr's checksum if necessary */
2405 			arc_cksum_compute(buf);
2406 		}
2407 
2408 		return (0);
2409 	}
2410 
2411 	if (hdr_compressed == compressed) {
2412 		if (!arc_buf_is_shared(buf)) {
2413 			abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
2414 			    arc_buf_size(buf));
2415 		}
2416 	} else {
2417 		ASSERT(hdr_compressed);
2418 		ASSERT(!compressed);
2419 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr));
2420 
2421 		/*
2422 		 * If the buf is sharing its data with the hdr, unlink it and
2423 		 * allocate a new data buffer for the buf.
2424 		 */
2425 		if (arc_buf_is_shared(buf)) {
2426 			ASSERT(ARC_BUF_COMPRESSED(buf));
2427 
2428 			/* We need to give the buf its own b_data */
2429 			buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
2430 			buf->b_data =
2431 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2432 			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
2433 
2434 			/* Previously overhead was 0; just add new overhead */
2435 			ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
2436 		} else if (ARC_BUF_COMPRESSED(buf)) {
2437 			/* We need to reallocate the buf's b_data */
2438 			arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
2439 			    buf);
2440 			buf->b_data =
2441 			    arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf);
2442 
2443 			/* We increased the size of b_data; update overhead */
2444 			ARCSTAT_INCR(arcstat_overhead_size,
2445 			    HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr));
2446 		}
2447 
2448 		/*
2449 		 * Regardless of the buf's previous compression settings, it
2450 		 * should not be compressed at the end of this function.
2451 		 */
2452 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
2453 
2454 		/*
2455 		 * Try copying the data from another buf which already has a
2456 		 * decompressed version. If that's not possible, it's time to
2457 		 * bite the bullet and decompress the data from the hdr.
2458 		 */
2459 		if (arc_buf_try_copy_decompressed_data(buf)) {
2460 			/* Skip byteswapping and checksumming (already done) */
2461 			ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL);
2462 			return (0);
2463 		} else {
2464 			error = zio_decompress_data(HDR_GET_COMPRESS(hdr),
2465 			    hdr->b_l1hdr.b_pabd, buf->b_data,
2466 			    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2467 
2468 			/*
2469 			 * Absent hardware errors or software bugs, this should
2470 			 * be impossible, but log it anyway so we can debug it.
2471 			 */
2472 			if (error != 0) {
2473 				zfs_dbgmsg(
2474 				    "hdr %p, compress %d, psize %d, lsize %d",
2475 				    hdr, arc_hdr_get_compress(hdr),
2476 				    HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr));
2477 				if (hash_lock != NULL)
2478 					mutex_enter(hash_lock);
2479 				arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
2480 				if (hash_lock != NULL)
2481 					mutex_exit(hash_lock);
2482 				return (SET_ERROR(EIO));
2483 			}
2484 		}
2485 	}
2486 
2487 byteswap:
2488 	/* Byteswap the buf's data if necessary */
2489 	if (bswap != DMU_BSWAP_NUMFUNCS) {
2490 		ASSERT(!HDR_SHARED_DATA(hdr));
2491 		ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS);
2492 		dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr));
2493 	}
2494 
2495 	/* Compute the hdr's checksum if necessary */
2496 	arc_cksum_compute(buf);
2497 
2498 	return (0);
2499 }
2500 
2501 /*
2502  * If this function is being called to decrypt an encrypted buffer or verify an
2503  * authenticated one, the key must be loaded and a mapping must be made
2504  * available in the keystore via spa_keystore_create_mapping() or one of its
2505  * callers.
2506  */
2507 int
2508 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
2509     boolean_t in_place)
2510 {
2511 	int ret;
2512 	arc_fill_flags_t flags = 0;
2513 
2514 	if (in_place)
2515 		flags |= ARC_FILL_IN_PLACE;
2516 
2517 	ret = arc_buf_fill(buf, spa, zb, flags);
2518 	if (ret == ECKSUM) {
2519 		/*
2520 		 * Convert authentication and decryption errors to EIO
2521 		 * (and generate an ereport) before leaving the ARC.
2522 		 */
2523 		ret = SET_ERROR(EIO);
2524 		spa_log_error(spa, zb);
2525 		zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
2526 		    spa, NULL, zb, NULL, 0, 0);
2527 	}
2528 
2529 	return (ret);
2530 }
2531 
2532 /*
2533  * Increment the amount of evictable space in the arc_state_t's refcount.
2534  * We account for the space used by the hdr and the arc buf individually
2535  * so that we can add and remove them from the refcount individually.
2536  */
2537 static void
2538 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
2539 {
2540 	arc_buf_contents_t type = arc_buf_type(hdr);
2541 
2542 	ASSERT(HDR_HAS_L1HDR(hdr));
2543 
2544 	if (GHOST_STATE(state)) {
2545 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
2546 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2547 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2548 		ASSERT(!HDR_HAS_RABD(hdr));
2549 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2550 		    HDR_GET_LSIZE(hdr), hdr);
2551 		return;
2552 	}
2553 
2554 	ASSERT(!GHOST_STATE(state));
2555 	if (hdr->b_l1hdr.b_pabd != NULL) {
2556 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2557 		    arc_hdr_size(hdr), hdr);
2558 	}
2559 	if (HDR_HAS_RABD(hdr)) {
2560 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2561 		    HDR_GET_PSIZE(hdr), hdr);
2562 	}
2563 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2564 	    buf = buf->b_next) {
2565 		if (arc_buf_is_shared(buf))
2566 			continue;
2567 		(void) zfs_refcount_add_many(&state->arcs_esize[type],
2568 		    arc_buf_size(buf), buf);
2569 	}
2570 }
2571 
2572 /*
2573  * Decrement the amount of evictable space in the arc_state_t's refcount.
2574  * We account for the space used by the hdr and the arc buf individually
2575  * so that we can add and remove them from the refcount individually.
2576  */
2577 static void
2578 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
2579 {
2580 	arc_buf_contents_t type = arc_buf_type(hdr);
2581 
2582 	ASSERT(HDR_HAS_L1HDR(hdr));
2583 
2584 	if (GHOST_STATE(state)) {
2585 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
2586 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2587 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
2588 		ASSERT(!HDR_HAS_RABD(hdr));
2589 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2590 		    HDR_GET_LSIZE(hdr), hdr);
2591 		return;
2592 	}
2593 
2594 	ASSERT(!GHOST_STATE(state));
2595 	if (hdr->b_l1hdr.b_pabd != NULL) {
2596 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2597 		    arc_hdr_size(hdr), hdr);
2598 	}
2599 	if (HDR_HAS_RABD(hdr)) {
2600 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2601 		    HDR_GET_PSIZE(hdr), hdr);
2602 	}
2603 	for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2604 	    buf = buf->b_next) {
2605 		if (arc_buf_is_shared(buf))
2606 			continue;
2607 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
2608 		    arc_buf_size(buf), buf);
2609 	}
2610 }
2611 
2612 /*
2613  * Add a reference to this hdr indicating that someone is actively
2614  * referencing that memory. When the refcount transitions from 0 to 1,
2615  * we remove it from the respective arc_state_t list to indicate that
2616  * it is not evictable.
2617  */
2618 static void
2619 add_reference(arc_buf_hdr_t *hdr, void *tag)
2620 {
2621 	ASSERT(HDR_HAS_L1HDR(hdr));
2622 	if (!MUTEX_HELD(HDR_LOCK(hdr))) {
2623 		ASSERT(hdr->b_l1hdr.b_state == arc_anon);
2624 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2625 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
2626 	}
2627 
2628 	arc_state_t *state = hdr->b_l1hdr.b_state;
2629 
2630 	if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
2631 	    (state != arc_anon)) {
2632 		/* We don't use the L2-only state list. */
2633 		if (state != arc_l2c_only) {
2634 			multilist_remove(state->arcs_list[arc_buf_type(hdr)],
2635 			    hdr);
2636 			arc_evictable_space_decrement(hdr, state);
2637 		}
2638 		/* remove the prefetch flag if we get a reference */
2639 		arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
2640 	}
2641 }
2642 
2643 /*
2644  * Remove a reference from this hdr. When the reference transitions from
2645  * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's
2646  * list making it eligible for eviction.
2647  */
2648 static int
2649 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
2650 {
2651 	int cnt;
2652 	arc_state_t *state = hdr->b_l1hdr.b_state;
2653 
2654 	ASSERT(HDR_HAS_L1HDR(hdr));
2655 	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
2656 	ASSERT(!GHOST_STATE(state));
2657 
2658 	/*
2659 	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
2660 	 * check to prevent usage of the arc_l2c_only list.
2661 	 */
2662 	if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
2663 	    (state != arc_anon)) {
2664 		multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr);
2665 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
2666 		arc_evictable_space_increment(hdr, state);
2667 	}
2668 	return (cnt);
2669 }
2670 
/*
 * Move the supplied buffer to the indicated state. The hash lock
 * for the buffer must be held by the caller.
 *
 * This performs, in order:
 *  - if the hdr has no holds, the move between the old and new state's
 *    evictable lists, together with the matching arcs_esize accounting;
 *  - removal from the hash table when the new state is arc_anon;
 *  - the arcs_size accounting for the new state and then the old state;
 *  - the update of b_state itself (only when an L1 hdr exists).
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
    kmutex_t *hash_lock)
{
	arc_state_t *old_state;
	int64_t refcnt;
	uint32_t bufcnt;
	/* Whether arcs_size of the old / new state needs to be adjusted. */
	boolean_t update_old, update_new;
	arc_buf_contents_t buftype = arc_buf_type(hdr);

	/*
	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
	 * L1 hdr doesn't always exist when we change state to arc_anon before
	 * destroying a header, in which case reallocating to add the L1 hdr is
	 * pointless.
	 */
	if (HDR_HAS_L1HDR(hdr)) {
		old_state = hdr->b_l1hdr.b_state;
		refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt);
		bufcnt = hdr->b_l1hdr.b_bufcnt;

		/* Only a hdr that holds bufs or data has a size to account. */
		update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL ||
		    HDR_HAS_RABD(hdr));
	} else {
		/* Without an L1 hdr, the hdr is treated as L2-only. */
		old_state = arc_l2c_only;
		refcnt = 0;
		bufcnt = 0;
		update_old = B_FALSE;
	}
	update_new = update_old;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT3P(new_state, !=, old_state);
	ASSERT(!GHOST_STATE(new_state) || bufcnt == 0);
	ASSERT(old_state != arc_anon || bufcnt <= 1);

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon && old_state != arc_l2c_only) {
			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_remove(old_state->arcs_list[buftype], hdr);

			/*
			 * A ghost hdr has no bufs or data, so update_old was
			 * computed as false above; force the size accounting
			 * for the ghost state to happen anyway.
			 */
			if (GHOST_STATE(old_state)) {
				ASSERT0(bufcnt);
				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
				update_old = B_TRUE;
			}
			arc_evictable_space_decrement(hdr, old_state);
		}
		if (new_state != arc_anon && new_state != arc_l2c_only) {

			/*
			 * An L1 header always exists here, since if we're
			 * moving to some L1-cached state (i.e. not l2c_only or
			 * anonymous), we realloc the header to add an L1hdr
			 * beforehand.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));
			multilist_insert(new_state->arcs_list[buftype], hdr);

			/* As above: ghost states are always size-accounted. */
			if (GHOST_STATE(new_state)) {
				ASSERT0(bufcnt);
				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
				update_new = B_TRUE;
			}
			arc_evictable_space_increment(hdr, new_state);
		}
	}

	ASSERT(!HDR_EMPTY(hdr));
	/* An anonymous hdr must not be left in the hash table. */
	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
		buf_hash_remove(hdr);

	/* adjust state sizes (ignore arc_l2c_only) */

	if (update_new && new_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(new_state)) {
			ASSERT0(bufcnt);

			/*
			 * When moving a header to a ghost state, we first
			 * remove all arc buffers. Thus, we'll have a
			 * bufcnt of zero, and no arc buffer to use for
			 * the reference. As a result, we use the arc
			 * header pointer for the reference.
			 */
			(void) zfs_refcount_add_many(&new_state->arcs_size,
			    HDR_GET_LSIZE(hdr), hdr);
			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
			ASSERT(!HDR_HAS_RABD(hdr));
		} else {
			uint32_t buffers = 0;

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must add each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {
				ASSERT3U(bufcnt, !=, 0);
				buffers++;

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * add to the refcount if the arc_buf_t is
				 * not shared.
				 */
				if (arc_buf_is_shared(buf))
					continue;

				(void) zfs_refcount_add_many(
				    &new_state->arcs_size,
				    arc_buf_size(buf), buf);
			}
			ASSERT3U(bufcnt, ==, buffers);

			/* The hdr's own data is referenced by the hdr. */
			if (hdr->b_l1hdr.b_pabd != NULL) {
				(void) zfs_refcount_add_many(
				    &new_state->arcs_size,
				    arc_hdr_size(hdr), hdr);
			}

			/* So is the hdr's raw data, if it has any. */
			if (HDR_HAS_RABD(hdr)) {
				(void) zfs_refcount_add_many(
				    &new_state->arcs_size,
				    HDR_GET_PSIZE(hdr), hdr);
			}
		}
	}

	if (update_old && old_state != arc_l2c_only) {
		ASSERT(HDR_HAS_L1HDR(hdr));
		if (GHOST_STATE(old_state)) {
			ASSERT0(bufcnt);
			ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
			ASSERT(!HDR_HAS_RABD(hdr));

			/*
			 * When moving a header off of a ghost state,
			 * the header will not contain any arc buffers.
			 * We use the arc header pointer for the reference
			 * which is exactly what we did when we put the
			 * header on the ghost state.
			 */

			(void) zfs_refcount_remove_many(&old_state->arcs_size,
			    HDR_GET_LSIZE(hdr), hdr);
		} else {
			uint32_t buffers = 0;

			/*
			 * Each individual buffer holds a unique reference,
			 * thus we must remove each of these references one
			 * at a time.
			 */
			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
			    buf = buf->b_next) {
				ASSERT3U(bufcnt, !=, 0);
				buffers++;

				/*
				 * When the arc_buf_t is sharing the data
				 * block with the hdr, the owner of the
				 * reference belongs to the hdr. Only
				 * remove from the refcount if the arc_buf_t
				 * is not shared.
				 */
				if (arc_buf_is_shared(buf))
					continue;

				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size, arc_buf_size(buf),
				    buf);
			}
			ASSERT3U(bufcnt, ==, buffers);
			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
			    HDR_HAS_RABD(hdr));

			/* Drop the hdr's reference for its own data. */
			if (hdr->b_l1hdr.b_pabd != NULL) {
				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size, arc_hdr_size(hdr),
				    hdr);
			}

			/* Drop the hdr's reference for its raw data. */
			if (HDR_HAS_RABD(hdr)) {
				(void) zfs_refcount_remove_many(
				    &old_state->arcs_size, HDR_GET_PSIZE(hdr),
				    hdr);
			}
		}
	}

	/* b_state lives in the L1 hdr; there is nothing to record otherwise. */
	if (HDR_HAS_L1HDR(hdr))
		hdr->b_l1hdr.b_state = new_state;

	/*
	 * L2 headers should never be on the L2 state list since they don't
	 * have L1 headers allocated.
	 */
	ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
	    multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
}
2885 
2886 void
2887 arc_space_consume(uint64_t space, arc_space_type_t type)
2888 {
2889 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2890 
2891 	switch (type) {
2892 	case ARC_SPACE_DATA:
2893 		aggsum_add(&astat_data_size, space);
2894 		break;
2895 	case ARC_SPACE_META:
2896 		aggsum_add(&astat_metadata_size, space);
2897 		break;
2898 	case ARC_SPACE_OTHER:
2899 		aggsum_add(&astat_other_size, space);
2900 		break;
2901 	case ARC_SPACE_HDRS:
2902 		aggsum_add(&astat_hdr_size, space);
2903 		break;
2904 	case ARC_SPACE_L2HDRS:
2905 		aggsum_add(&astat_l2_hdr_size, space);
2906 		break;
2907 	}
2908 
2909 	if (type != ARC_SPACE_DATA)
2910 		aggsum_add(&arc_meta_used, space);
2911 
2912 	aggsum_add(&arc_size, space);
2913 }
2914 
2915 void
2916 arc_space_return(uint64_t space, arc_space_type_t type)
2917 {
2918 	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2919 
2920 	switch (type) {
2921 	case ARC_SPACE_DATA:
2922 		aggsum_add(&astat_data_size, -space);
2923 		break;
2924 	case ARC_SPACE_META:
2925 		aggsum_add(&astat_metadata_size, -space);
2926 		break;
2927 	case ARC_SPACE_OTHER:
2928 		aggsum_add(&astat_other_size, -space);
2929 		break;
2930 	case ARC_SPACE_HDRS:
2931 		aggsum_add(&astat_hdr_size, -space);
2932 		break;
2933 	case ARC_SPACE_L2HDRS:
2934 		aggsum_add(&astat_l2_hdr_size, -space);
2935 		break;
2936 	}
2937 
2938 	if (type != ARC_SPACE_DATA) {
2939 		ASSERT(aggsum_compare(&arc_meta_used, space) >= 0);
2940 		/*
2941 		 * We use the upper bound here rather than the precise value
2942 		 * because the arc_meta_max value doesn't need to be
2943 		 * precise. It's only consumed by humans via arcstats.
2944 		 */
2945 		if (arc_meta_max < aggsum_upper_bound(&arc_meta_used))
2946 			arc_meta_max = aggsum_upper_bound(&arc_meta_used);
2947 		aggsum_add(&arc_meta_used, -space);
2948 	}
2949 
2950 	ASSERT(aggsum_compare(&arc_size, space) >= 0);
2951 	aggsum_add(&arc_size, -space);
2952 }
2953 
2954 /*
2955  * Given a hdr and a buf, returns whether that buf can share its b_data buffer
2956  * with the hdr's b_pabd.
2957  */
2958 static boolean_t
2959 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf)
2960 {
2961 	/*
2962 	 * The criteria for sharing a hdr's data are:
2963 	 * 1. the buffer is not encrypted
2964 	 * 2. the hdr's compression matches the buf's compression
2965 	 * 3. the hdr doesn't need to be byteswapped
2966 	 * 4. the hdr isn't already being shared
2967 	 * 5. the buf is either compressed or it is the last buf in the hdr list
2968 	 *
2969 	 * Criterion #5 maintains the invariant that shared uncompressed
2970 	 * bufs must be the final buf in the hdr's b_buf list. Reading this, you
2971 	 * might ask, "if a compressed buf is allocated first, won't that be the
2972 	 * last thing in the list?", but in that case it's impossible to create
2973 	 * a shared uncompressed buf anyway (because the hdr must be compressed
2974 	 * to have the compressed buf). You might also think that #3 is
2975 	 * sufficient to make this guarantee, however it's possible
2976 	 * (specifically in the rare L2ARC write race mentioned in
2977 	 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that
2978 	 * is sharable, but wasn't at the time of its allocation. Rather than
2979 	 * allow a new shared uncompressed buf to be created and then shuffle
2980 	 * the list around to make it the last element, this simply disallows
2981 	 * sharing if the new buf isn't the first to be added.
2982 	 */
2983 	ASSERT3P(buf->b_hdr, ==, hdr);
2984 	boolean_t hdr_compressed = arc_hdr_get_compress(hdr) !=
2985 	    ZIO_COMPRESS_OFF;
2986 	boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0;
2987 	return (!ARC_BUF_ENCRYPTED(buf) &&
2988 	    buf_compressed == hdr_compressed &&
2989 	    hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS &&
2990 	    !HDR_SHARED_DATA(hdr) &&
2991 	    (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf)));
2992 }
2993 
/*
 * Allocate a buf for this hdr. If you care about the data that's in the hdr,
 * or if you want a compressed buffer, pass those flags in.
 *
 * The new buf is linked at the head of the hdr's buf list, a reference is
 * taken on the hdr with 'tag', and the buf is stored in *ret (which must
 * point to NULL on entry). 'encrypted' implies 'compressed', and 'noauth'
 * must not be combined with 'encrypted'.
 *
 * Returns 0 when 'fill' is not set; otherwise returns the result of
 * arc_buf_fill(), i.e. 0 if the copy was made successfully or an error code
 * otherwise. Note that *ret has already been set, and the buf already linked
 * to the hdr, by the time arc_buf_fill() is called.
 */
static int
arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
    void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth,
    boolean_t fill, arc_buf_t **ret)
{
	arc_buf_t *buf;
	arc_fill_flags_t flags = ARC_FILL_LOCKED;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
	VERIFY(hdr->b_type == ARC_BUFC_DATA ||
	    hdr->b_type == ARC_BUFC_METADATA);
	ASSERT3P(ret, !=, NULL);
	ASSERT3P(*ret, ==, NULL);
	IMPLY(encrypted, compressed);

	/*
	 * b_next already points at the current head of the hdr's list, but
	 * the hdr itself is not updated to point at this buf until b_data
	 * has been set up below.
	 */
	buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_next = hdr->b_l1hdr.b_buf;
	buf->b_flags = 0;

	add_reference(hdr, tag);

	/*
	 * We're about to change the hdr's b_flags. We must either
	 * hold the hash_lock or be undiscoverable.
	 */
	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

	/*
	 * Only honor requests for compressed bufs if the hdr is actually
	 * compressed. This must be overridden if the buffer is encrypted since
	 * encrypted buffers cannot be decompressed.
	 */
	if (encrypted) {
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
		buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED;
		flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED;
	} else if (compressed &&
	    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
		buf->b_flags |= ARC_BUF_FLAG_COMPRESSED;
		flags |= ARC_FILL_COMPRESSED;
	}

	if (noauth) {
		ASSERT0(encrypted);
		flags |= ARC_FILL_NOAUTH;
	}

	/*
	 * If the hdr's data can be shared then we share the data buffer and
	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
	 * sharing its b_pabd with the arc_buf_t. Otherwise, we
	 * allocate a new buffer to store the buf's data.
	 *
	 * There are two additional restrictions here because we're sharing
	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
	 * actively involved in an L2ARC write, because if this buf is used by
	 * an arc_write() then the hdr's data buffer will be released when the
	 * write completes, even though the L2ARC write might still be using it.
	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
	 * need to be ABD-aware.
	 */
	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
	    hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);

	/* Set up b_data and sharing */
	if (can_share) {
		buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
		buf->b_flags |= ARC_BUF_FLAG_SHARED;
		arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
	} else {
		/* A private data buffer is counted as overhead. */
		buf->b_data =
		    arc_get_data_buf(hdr, arc_buf_size(buf), buf);
		ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
	}
	VERIFY3P(buf->b_data, !=, NULL);

	/* Publish the buf at the head of the hdr's list and count it. */
	hdr->b_l1hdr.b_buf = buf;
	hdr->b_l1hdr.b_bufcnt += 1;
	if (encrypted)
		hdr->b_crypt_hdr.b_ebufcnt += 1;

	/*
	 * If the user wants the data from the hdr, we need to either copy or
	 * decompress the data.
	 */
	if (fill) {
		ASSERT3P(zb, !=, NULL);
		return (arc_buf_fill(buf, spa, zb, flags));
	}

	return (0);
}
3093 
/* Refcount tag used on a hdr for as long as its buf is out on loan. */
static char *arc_onloan_tag = "onloan";

/*
 * Adjust the global arc_loaned_bytes counter by 'delta': positive when a
 * buf is loaned out, negative when it is returned.
 */
static inline void
arc_loaned_bytes_update(int64_t delta)
{
	atomic_add_64(&arc_loaned_bytes, delta);

	/*
	 * assert that it did not wrap around; adding 0 with the _nv variant
	 * simply reads back the current value
	 */
	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
}
3104 
3105 /*
3106  * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
3107  * flight data by arc_tempreserve_space() until they are "returned". Loaned
3108  * buffers must be returned to the arc before they can be used by the DMU or
3109  * freed.
3110  */
3111 arc_buf_t *
3112 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
3113 {
3114 	arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
3115 	    is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
3116 
3117 	arc_loaned_bytes_update(arc_buf_size(buf));
3118 
3119 	return (buf);
3120 }
3121 
3122 arc_buf_t *
3123 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
3124     enum zio_compress compression_type)
3125 {
3126 	arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
3127 	    psize, lsize, compression_type);
3128 
3129 	arc_loaned_bytes_update(arc_buf_size(buf));
3130 
3131 	return (buf);
3132 }
3133 
3134 arc_buf_t *
3135 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
3136     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
3137     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
3138     enum zio_compress compression_type)
3139 {
3140 	arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
3141 	    byteorder, salt, iv, mac, ot, psize, lsize, compression_type);
3142 
3143 	atomic_add_64(&arc_loaned_bytes, psize);
3144 	return (buf);
3145 }
3146 
3147 
3148 /*
3149  * Return a loaned arc buffer to the arc.
3150  */
3151 void
3152 arc_return_buf(arc_buf_t *buf, void *tag)
3153 {
3154 	arc_buf_hdr_t *hdr = buf->b_hdr;
3155 
3156 	ASSERT3P(buf->b_data, !=, NULL);
3157 	ASSERT(HDR_HAS_L1HDR(hdr));
3158 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
3159 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
3160 
3161 	arc_loaned_bytes_update(-arc_buf_size(buf));
3162 }
3163 
3164 /* Detach an arc_buf from a dbuf (tag) */
3165 void
3166 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
3167 {
3168 	arc_buf_hdr_t *hdr = buf->b_hdr;
3169 
3170 	ASSERT3P(buf->b_data, !=, NULL);
3171 	ASSERT(HDR_HAS_L1HDR(hdr));
3172 	(void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
3173 	(void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
3174 
3175 	arc_loaned_bytes_update(arc_buf_size(buf));
3176 }
3177 
3178 static void
3179 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
3180 {
3181 	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
3182 
3183 	df->l2df_abd = abd;
3184 	df->l2df_size = size;
3185 	df->l2df_type = type;
3186 	mutex_enter(&l2arc_free_on_write_mtx);
3187 	list_insert_head(l2arc_free_on_write, df);
3188 	mutex_exit(&l2arc_free_on_write_mtx);
3189 }
3190 
3191 static void
3192 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
3193 {
3194 	arc_state_t *state = hdr->b_l1hdr.b_state;
3195 	arc_buf_contents_t type = arc_buf_type(hdr);
3196 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3197 
3198 	/* protected by hash lock, if in the hash table */
3199 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
3200 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3201 		ASSERT(state != arc_anon && state != arc_l2c_only);
3202 
3203 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
3204 		    size, hdr);
3205 	}
3206 	(void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
3207 	if (type == ARC_BUFC_METADATA) {
3208 		arc_space_return(size, ARC_SPACE_META);
3209 	} else {
3210 		ASSERT(type == ARC_BUFC_DATA);
3211 		arc_space_return(size, ARC_SPACE_DATA);
3212 	}
3213 
3214 	if (free_rdata) {
3215 		l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
3216 	} else {
3217 		l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
3218 	}
3219 }
3220 
3221 /*
3222  * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
3223  * data buffer, we transfer the refcount ownership to the hdr and update
3224  * the appropriate kstats.
3225  */
3226 static void
3227 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3228 {
3229 	/* LINTED */
3230 	arc_state_t *state = hdr->b_l1hdr.b_state;
3231 
3232 	ASSERT(arc_can_share(hdr, buf));
3233 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3234 	ASSERT(!ARC_BUF_ENCRYPTED(buf));
3235 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3236 
3237 	/*
3238 	 * Start sharing the data buffer. We transfer the
3239 	 * refcount ownership to the hdr since it always owns
3240 	 * the refcount whenever an arc_buf_t is shared.
3241 	 */
3242 	zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
3243 	    arc_hdr_size(hdr), buf, hdr);
3244 	hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
3245 	abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
3246 	    HDR_ISTYPE_METADATA(hdr));
3247 	arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
3248 	buf->b_flags |= ARC_BUF_FLAG_SHARED;
3249 
3250 	/*
3251 	 * Since we've transferred ownership to the hdr we need
3252 	 * to increment its compressed and uncompressed kstats and
3253 	 * decrement the overhead size.
3254 	 */
3255 	ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr));
3256 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3257 	ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf));
3258 }
3259 
3260 static void
3261 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3262 {
3263 	/* LINTED */
3264 	arc_state_t *state = hdr->b_l1hdr.b_state;
3265 
3266 	ASSERT(arc_buf_is_shared(buf));
3267 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3268 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3269 
3270 	/*
3271 	 * We are no longer sharing this buffer so we need
3272 	 * to transfer its ownership to the rightful owner.
3273 	 */
3274 	zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
3275 	    arc_hdr_size(hdr), hdr, buf);
3276 	arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
3277 	abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd);
3278 	abd_put(hdr->b_l1hdr.b_pabd);
3279 	hdr->b_l1hdr.b_pabd = NULL;
3280 	buf->b_flags &= ~ARC_BUF_FLAG_SHARED;
3281 
3282 	/*
3283 	 * Since the buffer is no longer shared between
3284 	 * the arc buf and the hdr, count it as overhead.
3285 	 */
3286 	ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr));
3287 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3288 	ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
3289 }
3290 
3291 /*
3292  * Remove an arc_buf_t from the hdr's buf list and return the last
3293  * arc_buf_t on the list. If no buffers remain on the list then return
3294  * NULL.
3295  */
3296 static arc_buf_t *
3297 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf)
3298 {
3299 	arc_buf_t **bufp = &hdr->b_l1hdr.b_buf;
3300 	arc_buf_t *lastbuf = NULL;
3301 
3302 	ASSERT(HDR_HAS_L1HDR(hdr));
3303 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));
3304 
3305 	/*
3306 	 * Remove the buf from the hdr list and locate the last
3307 	 * remaining buffer on the list.
3308 	 */
3309 	while (*bufp != NULL) {
3310 		if (*bufp == buf)
3311 			*bufp = buf->b_next;
3312 
3313 		/*
3314 		 * If we've removed a buffer in the middle of
3315 		 * the list then update the lastbuf and update
3316 		 * bufp.
3317 		 */
3318 		if (*bufp != NULL) {
3319 			lastbuf = *bufp;
3320 			bufp = &(*bufp)->b_next;
3321 		}
3322 	}
3323 	buf->b_next = NULL;
3324 	ASSERT3P(lastbuf, !=, buf);
3325 	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL);
3326 	IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL);
3327 	IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf));
3328 
3329 	return (lastbuf);
3330 }
3331 
/*
 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's
 * list and free it.
 *
 * If the buf being destroyed was an uncompressed buf sharing its data with
 * the hdr, the hdr's b_pabd is freed and the hdr is set up to share with
 * the buf that is now last on the list (provided one exists and it is not
 * encrypted).
 */
static void
arc_buf_destroy_impl(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * Free up the data associated with the buf but only if we're not
	 * sharing this with the hdr. If we are sharing it with the hdr, the
	 * hdr is responsible for doing the free.
	 */
	if (buf->b_data != NULL) {
		/*
		 * We're about to change the hdr's b_flags. We must either
		 * hold the hash_lock or be undiscoverable.
		 */
		ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr));

		arc_cksum_verify(buf);
		arc_buf_unwatch(buf);

		if (arc_buf_is_shared(buf)) {
			/*
			 * Only the hdr's flag is cleared here; the buf's own
			 * SHARED flag is still consulted further below.
			 */
			arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
		} else {
			uint64_t size = arc_buf_size(buf);
			arc_free_data_buf(hdr, buf->b_data, size, buf);
			ARCSTAT_INCR(arcstat_overhead_size, -size);
		}
		buf->b_data = NULL;

		ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
		hdr->b_l1hdr.b_bufcnt -= 1;

		if (ARC_BUF_ENCRYPTED(buf)) {
			hdr->b_crypt_hdr.b_ebufcnt -= 1;

			/*
			 * If we have no more encrypted buffers and we've
			 * already gotten a copy of the decrypted data we can
			 * free b_rabd to save some space.
			 */
			if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
			    HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
			    !HDR_IO_IN_PROGRESS(hdr)) {
				arc_hdr_free_pabd(hdr, B_TRUE);
			}
		}
	}

	/* Unlink the buf; lastbuf is the new tail of the list (or NULL). */
	arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);

	if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
		/*
		 * If the current arc_buf_t is sharing its data buffer with the
		 * hdr, then reassign the hdr's b_pabd to share it with the new
		 * buffer at the end of the list. The shared buffer is always
		 * the last one on the hdr's buffer list.
		 *
		 * There is an equivalent case for compressed bufs, but since
		 * they aren't guaranteed to be the last buf in the list and
		 * that is an exceedingly rare case, we just allow that space be
		 * wasted temporarily. We must also be careful not to share
		 * encrypted buffers, since they cannot be shared.
		 */
		if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
			/* Only one buf can be shared at once */
			VERIFY(!arc_buf_is_shared(lastbuf));
			/* hdr is uncompressed so can't have compressed buf */
			VERIFY(!ARC_BUF_COMPRESSED(lastbuf));

			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
			arc_hdr_free_pabd(hdr, B_FALSE);

			/*
			 * We must setup a new shared block between the
			 * last buffer and the hdr. The data would have
			 * been allocated by the arc buf so we need to transfer
			 * ownership to the hdr since it's now being shared.
			 */
			arc_share_buf(hdr, lastbuf);
		}
	} else if (HDR_SHARED_DATA(hdr)) {
		/*
		 * Uncompressed shared buffers are always at the end
		 * of the list. Compressed buffers don't have the
		 * same requirements. This makes it hard to
		 * simply assert that the lastbuf is shared so
		 * we rely on the hdr's compression flags to determine
		 * if we have a compressed, shared buffer.
		 */
		ASSERT3P(lastbuf, !=, NULL);
		ASSERT(arc_buf_is_shared(lastbuf) ||
		    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
	}

	/*
	 * Free the checksum if we're removing the last uncompressed buf from
	 * this hdr.
	 */
	if (!arc_hdr_has_uncompressed_buf(hdr)) {
		arc_cksum_free(hdr);
	}

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}
3442 
3443 static void
3444 arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata)
3445 {
3446 	uint64_t size;
3447 
3448 	ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
3449 	ASSERT(HDR_HAS_L1HDR(hdr));
3450 	ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
3451 	IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
3452 
3453 	if (alloc_rdata) {
3454 		size = HDR_GET_PSIZE(hdr);
3455 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
3456 		hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr);
3457 		ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
3458 	} else {
3459 		size = arc_hdr_size(hdr);
3460 		ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3461 		hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr);
3462 		ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
3463 	}
3464 
3465 	ARCSTAT_INCR(arcstat_compressed_size, size);
3466 	ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
3467 }
3468 
3469 static void
3470 arc_hdr_free_pabd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
3471 {
3472 	uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
3473 
3474 	ASSERT(HDR_HAS_L1HDR(hdr));
3475 	ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
3476 	IMPLY(free_rdata, HDR_HAS_RABD(hdr));
3477 
3478 
3479 	/*
3480 	 * If the hdr is currently being written to the l2arc then
3481 	 * we defer freeing the data by adding it to the l2arc_free_on_write
3482 	 * list. The l2arc will free the data once it's finished
3483 	 * writing it to the l2arc device.
3484 	 */
3485 	if (HDR_L2_WRITING(hdr)) {
3486 		arc_hdr_free_on_write(hdr, free_rdata);
3487 		ARCSTAT_BUMP(arcstat_l2_free_on_write);
3488 	} else if (free_rdata) {
3489 		arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr);
3490 	} else {
3491 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
3492 		    size, hdr);
3493 	}
3494 
3495 	if (free_rdata) {
3496 		hdr->b_crypt_hdr.b_rabd = NULL;
3497 	} else {
3498 		hdr->b_l1hdr.b_pabd = NULL;
3499 	}
3500 
3501 	if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr))
3502 		hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
3503 
3504 	ARCSTAT_INCR(arcstat_compressed_size, -size);
3505 	ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr));
3506 }
3507 
3508 static arc_buf_hdr_t *
3509 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize,
3510     boolean_t protected, enum zio_compress compression_type,
3511     arc_buf_contents_t type, boolean_t alloc_rdata)
3512 {
3513 	arc_buf_hdr_t *hdr;
3514 
3515 	VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA);
3516 	if (protected) {
3517 		hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE);
3518 	} else {
3519 		hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
3520 	}
3521 	ASSERT(HDR_EMPTY(hdr));
3522 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3523 	ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL);
3524 	HDR_SET_PSIZE(hdr, psize);
3525 	HDR_SET_LSIZE(hdr, lsize);
3526 	hdr->b_spa = spa;
3527 	hdr->b_type = type;
3528 	hdr->b_flags = 0;
3529 	arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR);
3530 	arc_hdr_set_compress(hdr, compression_type);
3531 	if (protected)
3532 		arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED);
3533 
3534 	hdr->b_l1hdr.b_state = arc_anon;
3535 	hdr->b_l1hdr.b_arc_access = 0;
3536 	hdr->b_l1hdr.b_bufcnt = 0;
3537 	hdr->b_l1hdr.b_buf = NULL;
3538 
3539 	/*
3540 	 * Allocate the hdr's buffer. This will contain either
3541 	 * the compressed or uncompressed data depending on the block
3542 	 * it references and compressed arc enablement.
3543 	 */
3544 	arc_hdr_alloc_pabd(hdr, alloc_rdata);
3545 	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3546 
3547 	return (hdr);
3548 }
3549 
3550 /*
3551  * Transition between the two allocation states for the arc_buf_hdr struct.
3552  * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
3553  * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
3554  * version is used when a cache buffer is only in the L2ARC in order to reduce
3555  * memory usage.
3556  */
3557 static arc_buf_hdr_t *
3558 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
3559 {
3560 	ASSERT(HDR_HAS_L2HDR(hdr));
3561 
3562 	arc_buf_hdr_t *nhdr;
3563 	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3564 
3565 	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
3566 	    (old == hdr_l2only_cache && new == hdr_full_cache));
3567 
3568 	/*
3569 	 * if the caller wanted a new full header and the header is to be
3570 	 * encrypted we will actually allocate the header from the full crypt
3571 	 * cache instead. The same applies to freeing from the old cache.
3572 	 */
3573 	if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
3574 		new = hdr_full_crypt_cache;
3575 	if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
3576 		old = hdr_full_crypt_cache;
3577 
3578 	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
3579 
3580 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
3581 	buf_hash_remove(hdr);
3582 
3583 	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
3584 
3585 	if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
3586 		arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3587 		/*
3588 		 * arc_access and arc_change_state need to be aware that a
3589 		 * header has just come out of L2ARC, so we set its state to
3590 		 * l2c_only even though it's about to change.
3591 		 */
3592 		nhdr->b_l1hdr.b_state = arc_l2c_only;
3593 
3594 		/* Verify previous threads set to NULL before freeing */
3595 		ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
3596 		ASSERT(!HDR_HAS_RABD(hdr));
3597 	} else {
3598 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
3599 		ASSERT0(hdr->b_l1hdr.b_bufcnt);
3600 		ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3601 
3602 		/*
3603 		 * If we've reached here, We must have been called from
3604 		 * arc_evict_hdr(), as such we should have already been
3605 		 * removed from any ghost list we were previously on
3606 		 * (which protects us from racing with arc_evict_state),
3607 		 * thus no locking is needed during this check.
3608 		 */
3609 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3610 
3611 		/*
3612 		 * A buffer must not be moved into the arc_l2c_only
3613 		 * state if it's not finished being written out to the
3614 		 * l2arc device. Otherwise, the b_l1hdr.b_pabd field
3615 		 * might try to be accessed, even though it was removed.
3616 		 */
3617 		VERIFY(!HDR_L2_WRITING(hdr));
3618 		VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
3619 		ASSERT(!HDR_HAS_RABD(hdr));
3620 
3621 #ifdef ZFS_DEBUG
3622 		if (hdr->b_l1hdr.b_thawed != NULL) {
3623 			kmem_free(hdr->b_l1hdr.b_thawed, 1);
3624 			hdr->b_l1hdr.b_thawed = NULL;
3625 		}
3626 #endif
3627 
3628 		arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
3629 	}
3630 	/*
3631 	 * The header has been reallocated so we need to re-insert it into any
3632 	 * lists it was on.
3633 	 */
3634 	(void) buf_hash_insert(nhdr, NULL);
3635 
3636 	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
3637 
3638 	mutex_enter(&dev->l2ad_mtx);
3639 
3640 	/*
3641 	 * We must place the realloc'ed header back into the list at
3642 	 * the same spot. Otherwise, if it's placed earlier in the list,
3643 	 * l2arc_write_buffers() could find it during the function's
3644 	 * write phase, and try to write it out to the l2arc.
3645 	 */
3646 	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
3647 	list_remove(&dev->l2ad_buflist, hdr);
3648 
3649 	mutex_exit(&dev->l2ad_mtx);
3650 
3651 	/*
3652 	 * Since we're using the pointer address as the tag when
3653 	 * incrementing and decrementing the l2ad_alloc refcount, we
3654 	 * must remove the old pointer (that we're about to destroy) and
3655 	 * add the new pointer to the refcount. Otherwise we'd remove
3656 	 * the wrong pointer address when calling arc_hdr_destroy() later.
3657 	 */
3658 
3659 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3660 	    hdr);
3661 	(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr),
3662 	    nhdr);
3663 
3664 	buf_discard_identity(hdr);
3665 	kmem_cache_free(old, hdr);
3666 
3667 	return (nhdr);
3668 }
3669 
3670 /*
3671  * This function allows an L1 header to be reallocated as a crypt
3672  * header and vice versa. If we are going to a crypt header, the
3673  * new fields will be zeroed out.
3674  */
3675 static arc_buf_hdr_t *
3676 arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
3677 {
3678 	arc_buf_hdr_t *nhdr;
3679 	arc_buf_t *buf;
3680 	kmem_cache_t *ncache, *ocache;
3681 
3682 	ASSERT(HDR_HAS_L1HDR(hdr));
3683 	ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt);
3684 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3685 	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
3686 	ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node));
3687 	ASSERT3P(hdr->b_hash_next, ==, NULL);
3688 
3689 	if (need_crypt) {
3690 		ncache = hdr_full_crypt_cache;
3691 		ocache = hdr_full_cache;
3692 	} else {
3693 		ncache = hdr_full_cache;
3694 		ocache = hdr_full_crypt_cache;
3695 	}
3696 
3697 	nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE);
3698 
3699 	/*
3700 	 * Copy all members that aren't locks or condvars to the new header.
3701 	 * No lists are pointing to us (as we asserted above), so we don't
3702 	 * need to worry about the list nodes.
3703 	 */
3704 	nhdr->b_dva = hdr->b_dva;
3705 	nhdr->b_birth = hdr->b_birth;
3706 	nhdr->b_type = hdr->b_type;
3707 	nhdr->b_flags = hdr->b_flags;
3708 	nhdr->b_psize = hdr->b_psize;
3709 	nhdr->b_lsize = hdr->b_lsize;
3710 	nhdr->b_spa = hdr->b_spa;
3711 	nhdr->b_l2hdr.b_dev = hdr->b_l2hdr.b_dev;
3712 	nhdr->b_l2hdr.b_daddr = hdr->b_l2hdr.b_daddr;
3713 	nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum;
3714 	nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt;
3715 	nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap;
3716 	nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state;
3717 	nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access;
3718 	nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb;
3719 	nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd;
3720 #ifdef ZFS_DEBUG
3721 	if (hdr->b_l1hdr.b_thawed != NULL) {
3722 		nhdr->b_l1hdr.b_thawed = hdr->b_l1hdr.b_thawed;
3723 		hdr->b_l1hdr.b_thawed = NULL;
3724 	}
3725 #endif
3726 
3727 	/*
3728 	 * This refcount_add() exists only to ensure that the individual
3729 	 * arc buffers always point to a header that is referenced, avoiding
3730 	 * a small race condition that could trigger ASSERTs.
3731 	 */
3732 	(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG);
3733 	nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf;
3734 	for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) {
3735 		mutex_enter(&buf->b_evict_lock);
3736 		buf->b_hdr = nhdr;
3737 		mutex_exit(&buf->b_evict_lock);
3738 	}
3739 	zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt);
3740 	(void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG);
3741 	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
3742 
3743 	if (need_crypt) {
3744 		arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED);
3745 	} else {
3746 		arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED);
3747 	}
3748 
3749 	/* unset all members of the original hdr */
3750 	bzero(&hdr->b_dva, sizeof (dva_t));
3751 	hdr->b_birth = 0;
3752 	hdr->b_type = ARC_BUFC_INVALID;
3753 	hdr->b_flags = 0;
3754 	hdr->b_psize = 0;
3755 	hdr->b_lsize = 0;
3756 	hdr->b_spa = 0;
3757 	hdr->b_l2hdr.b_dev = NULL;
3758 	hdr->b_l2hdr.b_daddr = 0;
3759 	hdr->b_l1hdr.b_freeze_cksum = NULL;
3760 	hdr->b_l1hdr.b_buf = NULL;
3761 	hdr->b_l1hdr.b_bufcnt = 0;
3762 	hdr->b_l1hdr.b_byteswap = 0;
3763 	hdr->b_l1hdr.b_state = NULL;
3764 	hdr->b_l1hdr.b_arc_access = 0;
3765 	hdr->b_l1hdr.b_acb = NULL;
3766 	hdr->b_l1hdr.b_pabd = NULL;
3767 
3768 	if (ocache == hdr_full_crypt_cache) {
3769 		ASSERT(!HDR_HAS_RABD(hdr));
3770 		hdr->b_crypt_hdr.b_ot = DMU_OT_NONE;
3771 		hdr->b_crypt_hdr.b_ebufcnt = 0;
3772 		hdr->b_crypt_hdr.b_dsobj = 0;
3773 		bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3774 		bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3775 		bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3776 	}
3777 
3778 	buf_discard_identity(hdr);
3779 	kmem_cache_free(ocache, hdr);
3780 
3781 	return (nhdr);
3782 }
3783 
3784 /*
3785  * This function is used by the send / receive code to convert a newly
3786  * allocated arc_buf_t to one that is suitable for a raw encrypted write. It
3787  * is also used to allow the root objset block to be uupdated without altering
3788  * its embedded MACs. Both block types will always be uncompressed so we do not
3789  * have to worry about compression type or psize.
3790  */
3791 void
3792 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder,
3793     dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv,
3794     const uint8_t *mac)
3795 {
3796 	arc_buf_hdr_t *hdr = buf->b_hdr;
3797 
3798 	ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET);
3799 	ASSERT(HDR_HAS_L1HDR(hdr));
3800 	ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3801 
3802 	buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED);
3803 	if (!HDR_PROTECTED(hdr))
3804 		hdr = arc_hdr_realloc_crypt(hdr, B_TRUE);
3805 	hdr->b_crypt_hdr.b_dsobj = dsobj;
3806 	hdr->b_crypt_hdr.b_ot = ot;
3807 	hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3808 	    DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3809 	if (!arc_hdr_has_uncompressed_buf(hdr))
3810 		arc_cksum_free(hdr);
3811 
3812 	if (salt != NULL)
3813 		bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3814 	if (iv != NULL)
3815 		bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3816 	if (mac != NULL)
3817 		bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3818 }
3819 
3820 /*
3821  * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller.
3822  * The buf is returned thawed since we expect the consumer to modify it.
3823  */
3824 arc_buf_t *
3825 arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size)
3826 {
3827 	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size,
3828 	    B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE);
3829 	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3830 
3831 	arc_buf_t *buf = NULL;
3832 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE,
3833 	    B_FALSE, B_FALSE, &buf));
3834 	arc_buf_thaw(buf);
3835 
3836 	return (buf);
3837 }
3838 
3839 /*
3840  * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this
3841  * for bufs containing metadata.
3842  */
3843 arc_buf_t *
3844 arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
3845     enum zio_compress compression_type)
3846 {
3847 	ASSERT3U(lsize, >, 0);
3848 	ASSERT3U(lsize, >=, psize);
3849 	ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF);
3850 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3851 
3852 	arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
3853 	    B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE);
3854 	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3855 
3856 	arc_buf_t *buf = NULL;
3857 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE,
3858 	    B_TRUE, B_FALSE, B_FALSE, &buf));
3859 	arc_buf_thaw(buf);
3860 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3861 
3862 	if (!arc_buf_is_shared(buf)) {
3863 		/*
3864 		 * To ensure that the hdr has the correct data in it if we call
3865 		 * arc_untransform() on this buf before it's been written to
3866 		 * disk, it's easiest if we just set up sharing between the
3867 		 * buf and the hdr.
3868 		 */
3869 		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
3870 		arc_hdr_free_pabd(hdr, B_FALSE);
3871 		arc_share_buf(hdr, buf);
3872 	}
3873 
3874 	return (buf);
3875 }
3876 
3877 arc_buf_t *
3878 arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder,
3879     const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
3880     dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
3881     enum zio_compress compression_type)
3882 {
3883 	arc_buf_hdr_t *hdr;
3884 	arc_buf_t *buf;
3885 	arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ?
3886 	    ARC_BUFC_METADATA : ARC_BUFC_DATA;
3887 
3888 	ASSERT3U(lsize, >, 0);
3889 	ASSERT3U(lsize, >=, psize);
3890 	ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF);
3891 	ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS);
3892 
3893 	hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE,
3894 	    compression_type, type, B_TRUE);
3895 	ASSERT(!MUTEX_HELD(HDR_LOCK(hdr)));
3896 
3897 	hdr->b_crypt_hdr.b_dsobj = dsobj;
3898 	hdr->b_crypt_hdr.b_ot = ot;
3899 	hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ?
3900 	    DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot);
3901 	bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN);
3902 	bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN);
3903 	bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN);
3904 
3905 	/*
3906 	 * This buffer will be considered encrypted even if the ot is not an
3907 	 * encrypted type. It will become authenticated instead in
3908 	 * arc_write_ready().
3909 	 */
3910 	buf = NULL;
3911 	VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
3912 	    B_FALSE, B_FALSE, &buf));
3913 	arc_buf_thaw(buf);
3914 	ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
3915 
3916 	return (buf);
3917 }
3918 
3919 static void
3920 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
3921 {
3922 	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
3923 	l2arc_dev_t *dev = l2hdr->b_dev;
3924 	uint64_t psize = HDR_GET_PSIZE(hdr);
3925 	uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
3926 
3927 	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
3928 	ASSERT(HDR_HAS_L2HDR(hdr));
3929 
3930 	list_remove(&dev->l2ad_buflist, hdr);
3931 
3932 	ARCSTAT_INCR(arcstat_l2_psize, -psize);
3933 	ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
3934 
3935 	vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
3936 
3937 	(void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
3938 	    hdr);
3939 	arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
3940 }
3941 
3942 static void
3943 arc_hdr_destroy(arc_buf_hdr_t *hdr)
3944 {
3945 	if (HDR_HAS_L1HDR(hdr)) {
3946 		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
3947 		    hdr->b_l1hdr.b_bufcnt > 0);
3948 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
3949 		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
3950 	}
3951 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
3952 	ASSERT(!HDR_IN_HASH_TABLE(hdr));
3953 
3954 	if (!HDR_EMPTY(hdr))
3955 		buf_discard_identity(hdr);
3956 
3957 	if (HDR_HAS_L2HDR(hdr)) {
3958 		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
3959 		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
3960 
3961 		if (!buflist_held)
3962 			mutex_enter(&dev->l2ad_mtx);
3963 
3964 		/*
3965 		 * Even though we checked this conditional above, we
3966 		 * need to check this again now that we have the
3967 		 * l2ad_mtx. This is because we could be racing with
3968 		 * another thread calling l2arc_evict() which might have
3969 		 * destroyed this header's L2 portion as we were waiting
3970 		 * to acquire the l2ad_mtx. If that happens, we don't
3971 		 * want to re-destroy the header's L2 portion.
3972 		 */
3973 		if (HDR_HAS_L2HDR(hdr))
3974 			arc_hdr_l2hdr_destroy(hdr);
3975 
3976 		if (!buflist_held)
3977 			mutex_exit(&dev->l2ad_mtx);
3978 	}
3979 
3980 	if (HDR_HAS_L1HDR(hdr)) {
3981 		arc_cksum_free(hdr);
3982 
3983 		while (hdr->b_l1hdr.b_buf != NULL)
3984 			arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
3985 
3986 #ifdef ZFS_DEBUG
3987 		if (hdr->b_l1hdr.b_thawed != NULL) {
3988 			kmem_free(hdr->b_l1hdr.b_thawed, 1);
3989 			hdr->b_l1hdr.b_thawed = NULL;
3990 		}
3991 #endif
3992 
3993 		if (hdr->b_l1hdr.b_pabd != NULL) {
3994 			arc_hdr_free_pabd(hdr, B_FALSE);
3995 		}
3996 
3997 		if (HDR_HAS_RABD(hdr))
3998 			arc_hdr_free_pabd(hdr, B_TRUE);
3999 	}
4000 
4001 	ASSERT3P(hdr->b_hash_next, ==, NULL);
4002 	if (HDR_HAS_L1HDR(hdr)) {
4003 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4004 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
4005 
4006 		if (!HDR_PROTECTED(hdr)) {
4007 			kmem_cache_free(hdr_full_cache, hdr);
4008 		} else {
4009 			kmem_cache_free(hdr_full_crypt_cache, hdr);
4010 		}
4011 	} else {
4012 		kmem_cache_free(hdr_l2only_cache, hdr);
4013 	}
4014 }
4015 
4016 void
4017 arc_buf_destroy(arc_buf_t *buf, void* tag)
4018 {
4019 	arc_buf_hdr_t *hdr = buf->b_hdr;
4020 	kmutex_t *hash_lock = HDR_LOCK(hdr);
4021 
4022 	if (hdr->b_l1hdr.b_state == arc_anon) {
4023 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
4024 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4025 		VERIFY0(remove_reference(hdr, NULL, tag));
4026 		arc_hdr_destroy(hdr);
4027 		return;
4028 	}
4029 
4030 	mutex_enter(hash_lock);
4031 	ASSERT3P(hdr, ==, buf->b_hdr);
4032 	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
4033 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4034 	ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
4035 	ASSERT3P(buf->b_data, !=, NULL);
4036 
4037 	(void) remove_reference(hdr, hash_lock, tag);
4038 	arc_buf_destroy_impl(buf);
4039 	mutex_exit(hash_lock);
4040 }
4041 
4042 /*
4043  * Evict the arc_buf_hdr that is provided as a parameter. The resultant
4044  * state of the header is dependent on its state prior to entering this
4045  * function. The following transitions are possible:
4046  *
4047  *    - arc_mru -> arc_mru_ghost
4048  *    - arc_mfu -> arc_mfu_ghost
4049  *    - arc_mru_ghost -> arc_l2c_only
4050  *    - arc_mru_ghost -> deleted
4051  *    - arc_mfu_ghost -> arc_l2c_only
4052  *    - arc_mfu_ghost -> deleted
4053  */
4054 static int64_t
4055 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
4056 {
4057 	arc_state_t *evicted_state, *state;
4058 	int64_t bytes_evicted = 0;
4059 	int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
4060 	    zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms;
4061 
4062 	ASSERT(MUTEX_HELD(hash_lock));
4063 	ASSERT(HDR_HAS_L1HDR(hdr));
4064 
4065 	state = hdr->b_l1hdr.b_state;
4066 	if (GHOST_STATE(state)) {
4067 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4068 		ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
4069 
4070 		/*
4071 		 * l2arc_write_buffers() relies on a header's L1 portion
4072 		 * (i.e. its b_pabd field) during its write phase.
4073 		 * Thus, we cannot push a header onto the arc_l2c_only
4074 		 * state (removing its L1 piece) until the header is
4075 		 * done being written to the l2arc.
4076 		 */
4077 		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
4078 			ARCSTAT_BUMP(arcstat_evict_l2_skip);
4079 			return (bytes_evicted);
4080 		}
4081 
4082 		ARCSTAT_BUMP(arcstat_deleted);
4083 		bytes_evicted += HDR_GET_LSIZE(hdr);
4084 
4085 		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
4086 
4087 		if (HDR_HAS_L2HDR(hdr)) {
4088 			ASSERT(hdr->b_l1hdr.b_pabd == NULL);
4089 			ASSERT(!HDR_HAS_RABD(hdr));
4090 			/*
4091 			 * This buffer is cached on the 2nd Level ARC;
4092 			 * don't destroy the header.
4093 			 */
4094 			arc_change_state(arc_l2c_only, hdr, hash_lock);
4095 			/*
4096 			 * dropping from L1+L2 cached to L2-only,
4097 			 * realloc to remove the L1 header.
4098 			 */
4099 			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
4100 			    hdr_l2only_cache);
4101 		} else {
4102 			arc_change_state(arc_anon, hdr, hash_lock);
4103 			arc_hdr_destroy(hdr);
4104 		}
4105 		return (bytes_evicted);
4106 	}
4107 
4108 	ASSERT(state == arc_mru || state == arc_mfu);
4109 	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
4110 
4111 	/* prefetch buffers have a minimum lifespan */
4112 	if (HDR_IO_IN_PROGRESS(hdr) ||
4113 	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
4114 	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
4115 		ARCSTAT_BUMP(arcstat_evict_skip);
4116 		return (bytes_evicted);
4117 	}
4118 
4119 	ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
4120 	while (hdr->b_l1hdr.b_buf) {
4121 		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4122 		if (!mutex_tryenter(&buf->b_evict_lock)) {
4123 			ARCSTAT_BUMP(arcstat_mutex_miss);
4124 			break;
4125 		}
4126 		if (buf->b_data != NULL)
4127 			bytes_evicted += HDR_GET_LSIZE(hdr);
4128 		mutex_exit(&buf->b_evict_lock);
4129 		arc_buf_destroy_impl(buf);
4130 	}
4131 
4132 	if (HDR_HAS_L2HDR(hdr)) {
4133 		ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
4134 	} else {
4135 		if (l2arc_write_eligible(hdr->b_spa, hdr)) {
4136 			ARCSTAT_INCR(arcstat_evict_l2_eligible,
4137 			    HDR_GET_LSIZE(hdr));
4138 		} else {
4139 			ARCSTAT_INCR(arcstat_evict_l2_ineligible,
4140 			    HDR_GET_LSIZE(hdr));
4141 		}
4142 	}
4143 
4144 	if (hdr->b_l1hdr.b_bufcnt == 0) {
4145 		arc_cksum_free(hdr);
4146 
4147 		bytes_evicted += arc_hdr_size(hdr);
4148 
4149 		/*
4150 		 * If this hdr is being evicted and has a compressed
4151 		 * buffer then we discard it here before we change states.
4152 		 * This ensures that the accounting is updated correctly
4153 		 * in arc_free_data_impl().
4154 		 */
4155 		if (hdr->b_l1hdr.b_pabd != NULL)
4156 			arc_hdr_free_pabd(hdr, B_FALSE);
4157 
4158 		if (HDR_HAS_RABD(hdr))
4159 			arc_hdr_free_pabd(hdr, B_TRUE);
4160 
4161 		arc_change_state(evicted_state, hdr, hash_lock);
4162 		ASSERT(HDR_IN_HASH_TABLE(hdr));
4163 		arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE);
4164 		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
4165 	}
4166 
4167 	return (bytes_evicted);
4168 }
4169 
4170 static uint64_t
4171 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
4172     uint64_t spa, int64_t bytes)
4173 {
4174 	multilist_sublist_t *mls;
4175 	uint64_t bytes_evicted = 0;
4176 	arc_buf_hdr_t *hdr;
4177 	kmutex_t *hash_lock;
4178 	int evict_count = 0;
4179 
4180 	ASSERT3P(marker, !=, NULL);
4181 	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
4182 
4183 	mls = multilist_sublist_lock(ml, idx);
4184 
4185 	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
4186 	    hdr = multilist_sublist_prev(mls, marker)) {
4187 		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
4188 		    (evict_count >= zfs_arc_evict_batch_limit))
4189 			break;
4190 
4191 		/*
4192 		 * To keep our iteration location, move the marker
4193 		 * forward. Since we're not holding hdr's hash lock, we
4194 		 * must be very careful and not remove 'hdr' from the
4195 		 * sublist. Otherwise, other consumers might mistake the
4196 		 * 'hdr' as not being on a sublist when they call the
4197 		 * multilist_link_active() function (they all rely on
4198 		 * the hash lock protecting concurrent insertions and
4199 		 * removals). multilist_sublist_move_forward() was
4200 		 * specifically implemented to ensure this is the case
4201 		 * (only 'marker' will be removed and re-inserted).
4202 		 */
4203 		multilist_sublist_move_forward(mls, marker);
4204 
4205 		/*
4206 		 * The only case where the b_spa field should ever be
4207 		 * zero, is the marker headers inserted by
4208 		 * arc_evict_state(). It's possible for multiple threads
4209 		 * to be calling arc_evict_state() concurrently (e.g.
4210 		 * dsl_pool_close() and zio_inject_fault()), so we must
4211 		 * skip any markers we see from these other threads.
4212 		 */
4213 		if (hdr->b_spa == 0)
4214 			continue;
4215 
4216 		/* we're only interested in evicting buffers of a certain spa */
4217 		if (spa != 0 && hdr->b_spa != spa) {
4218 			ARCSTAT_BUMP(arcstat_evict_skip);
4219 			continue;
4220 		}
4221 
4222 		hash_lock = HDR_LOCK(hdr);
4223 
4224 		/*
4225 		 * We aren't calling this function from any code path
4226 		 * that would already be holding a hash lock, so we're
4227 		 * asserting on this assumption to be defensive in case
4228 		 * this ever changes. Without this check, it would be
4229 		 * possible to incorrectly increment arcstat_mutex_miss
4230 		 * below (e.g. if the code changed such that we called
4231 		 * this function with a hash lock held).
4232 		 */
4233 		ASSERT(!MUTEX_HELD(hash_lock));
4234 
4235 		if (mutex_tryenter(hash_lock)) {
4236 			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
4237 			mutex_exit(hash_lock);
4238 
4239 			bytes_evicted += evicted;
4240 
4241 			/*
4242 			 * If evicted is zero, arc_evict_hdr() must have
4243 			 * decided to skip this header, don't increment
4244 			 * evict_count in this case.
4245 			 */
4246 			if (evicted != 0)
4247 				evict_count++;
4248 
4249 			/*
4250 			 * If arc_size isn't overflowing, signal any
4251 			 * threads that might happen to be waiting.
4252 			 *
4253 			 * For each header evicted, we wake up a single
4254 			 * thread. If we used cv_broadcast, we could
4255 			 * wake up "too many" threads causing arc_size
4256 			 * to significantly overflow arc_c; since
4257 			 * arc_get_data_impl() doesn't check for overflow
4258 			 * when it's woken up (it doesn't because it's
4259 			 * possible for the ARC to be overflowing while
4260 			 * full of un-evictable buffers, and the
4261 			 * function should proceed in this case).
4262 			 *
4263 			 * If threads are left sleeping, due to not
4264 			 * using cv_broadcast here, they will be woken
4265 			 * up via cv_broadcast in arc_adjust_cb() just
4266 			 * before arc_adjust_zthr sleeps.
4267 			 */
4268 			mutex_enter(&arc_adjust_lock);
4269 			if (!arc_is_overflowing())
4270 				cv_signal(&arc_adjust_waiters_cv);
4271 			mutex_exit(&arc_adjust_lock);
4272 		} else {
4273 			ARCSTAT_BUMP(arcstat_mutex_miss);
4274 		}
4275 	}
4276 
4277 	multilist_sublist_unlock(mls);
4278 
4279 	return (bytes_evicted);
4280 }
4281 
4282 /*
4283  * Evict buffers from the given arc state, until we've removed the
4284  * specified number of bytes. Move the removed buffers to the
4285  * appropriate evict state.
4286  *
4287  * This function makes a "best effort". It skips over any buffers
4288  * it can't get a hash_lock on, and so, may not catch all candidates.
4289  * It may also return without evicting as much space as requested.
4290  *
4291  * If bytes is specified using the special value ARC_EVICT_ALL, this
4292  * will evict all available (i.e. unlocked and evictable) buffers from
4293  * the given arc state; which is used by arc_flush().
4294  */
4295 static uint64_t
4296 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
4297     arc_buf_contents_t type)
4298 {
4299 	uint64_t total_evicted = 0;
4300 	multilist_t *ml = state->arcs_list[type];
4301 	int num_sublists;
4302 	arc_buf_hdr_t **markers;
4303 
4304 	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
4305 
4306 	num_sublists = multilist_get_num_sublists(ml);
4307 
4308 	/*
4309 	 * If we've tried to evict from each sublist, made some
4310 	 * progress, but still have not hit the target number of bytes
4311 	 * to evict, we want to keep trying. The markers allow us to
4312 	 * pick up where we left off for each individual sublist, rather
4313 	 * than starting from the tail each time.
4314 	 */
4315 	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
4316 	for (int i = 0; i < num_sublists; i++) {
4317 		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
4318 
4319 		/*
4320 		 * A b_spa of 0 is used to indicate that this header is
4321 		 * a marker. This fact is used in arc_adjust_type() and
4322 		 * arc_evict_state_impl().
4323 		 */
4324 		markers[i]->b_spa = 0;
4325 
4326 		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4327 		multilist_sublist_insert_tail(mls, markers[i]);
4328 		multilist_sublist_unlock(mls);
4329 	}
4330 
4331 	/*
4332 	 * While we haven't hit our target number of bytes to evict, or
4333 	 * we're evicting all available buffers.
4334 	 */
4335 	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
4336 		/*
4337 		 * Start eviction using a randomly selected sublist,
4338 		 * this is to try and evenly balance eviction across all
4339 		 * sublists. Always starting at the same sublist
4340 		 * (e.g. index 0) would cause evictions to favor certain
4341 		 * sublists over others.
4342 		 */
4343 		int sublist_idx = multilist_get_random_index(ml);
4344 		uint64_t scan_evicted = 0;
4345 
4346 		for (int i = 0; i < num_sublists; i++) {
4347 			uint64_t bytes_remaining;
4348 			uint64_t bytes_evicted;
4349 
4350 			if (bytes == ARC_EVICT_ALL)
4351 				bytes_remaining = ARC_EVICT_ALL;
4352 			else if (total_evicted < bytes)
4353 				bytes_remaining = bytes - total_evicted;
4354 			else
4355 				break;
4356 
4357 			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
4358 			    markers[sublist_idx], spa, bytes_remaining);
4359 
4360 			scan_evicted += bytes_evicted;
4361 			total_evicted += bytes_evicted;
4362 
4363 			/* we've reached the end, wrap to the beginning */
4364 			if (++sublist_idx >= num_sublists)
4365 				sublist_idx = 0;
4366 		}
4367 
4368 		/*
4369 		 * If we didn't evict anything during this scan, we have
4370 		 * no reason to believe we'll evict more during another
4371 		 * scan, so break the loop.
4372 		 */
4373 		if (scan_evicted == 0) {
4374 			/* This isn't possible, let's make that obvious */
4375 			ASSERT3S(bytes, !=, 0);
4376 
4377 			/*
4378 			 * When bytes is ARC_EVICT_ALL, the only way to
4379 			 * break the loop is when scan_evicted is zero.
4380 			 * In that case, we actually have evicted enough,
4381 			 * so we don't want to increment the kstat.
4382 			 */
4383 			if (bytes != ARC_EVICT_ALL) {
4384 				ASSERT3S(total_evicted, <, bytes);
4385 				ARCSTAT_BUMP(arcstat_evict_not_enough);
4386 			}
4387 
4388 			break;
4389 		}
4390 	}
4391 
4392 	for (int i = 0; i < num_sublists; i++) {
4393 		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
4394 		multilist_sublist_remove(mls, markers[i]);
4395 		multilist_sublist_unlock(mls);
4396 
4397 		kmem_cache_free(hdr_full_cache, markers[i]);
4398 	}
4399 	kmem_free(markers, sizeof (*markers) * num_sublists);
4400 
4401 	return (total_evicted);
4402 }
4403 
4404 /*
4405  * Flush all "evictable" data of the given type from the arc state
4406  * specified. This will not evict any "active" buffers (i.e. referenced).
4407  *
4408  * When 'retry' is set to B_FALSE, the function will make a single pass
4409  * over the state and evict any buffers that it can. Since it doesn't
4410  * continually retry the eviction, it might end up leaving some buffers
4411  * in the ARC due to lock misses.
4412  *
4413  * When 'retry' is set to B_TRUE, the function will continually retry the
4414  * eviction until *all* evictable buffers have been removed from the
4415  * state. As a result, if concurrent insertions into the state are
4416  * allowed (e.g. if the ARC isn't shutting down), this function might
4417  * wind up in an infinite loop, continually trying to evict buffers.
4418  */
4419 static uint64_t
4420 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
4421     boolean_t retry)
4422 {
4423 	uint64_t evicted = 0;
4424 
4425 	while (zfs_refcount_count(&state->arcs_esize[type]) != 0) {
4426 		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
4427 
4428 		if (!retry)
4429 			break;
4430 	}
4431 
4432 	return (evicted);
4433 }
4434 
4435 /*
4436  * Evict the specified number of bytes from the state specified,
4437  * restricting eviction to the spa and type given. This function
4438  * prevents us from trying to evict more from a state's list than
4439  * is "evictable", and to skip evicting altogether when passed a
4440  * negative value for "bytes". In contrast, arc_evict_state() will
4441  * evict everything it can, when passed a negative value for "bytes".
4442  */
4443 static uint64_t
4444 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
4445     arc_buf_contents_t type)
4446 {
4447 	int64_t delta;
4448 
4449 	if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
4450 		delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
4451 		    bytes);
4452 		return (arc_evict_state(state, spa, delta, type));
4453 	}
4454 
4455 	return (0);
4456 }
4457 
4458 /*
4459  * Evict metadata buffers from the cache, such that arc_meta_used is
4460  * capped by the arc_meta_limit tunable.
4461  */
4462 static uint64_t
4463 arc_adjust_meta(uint64_t meta_used)
4464 {
4465 	uint64_t total_evicted = 0;
4466 	int64_t target;
4467 
4468 	/*
4469 	 * If we're over the meta limit, we want to evict enough
4470 	 * metadata to get back under the meta limit. We don't want to
4471 	 * evict so much that we drop the MRU below arc_p, though. If
4472 	 * we're over the meta limit more than we're over arc_p, we
4473 	 * evict some from the MRU here, and some from the MFU below.
4474 	 */
4475 	target = MIN((int64_t)(meta_used - arc_meta_limit),
4476 	    (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
4477 	    zfs_refcount_count(&arc_mru->arcs_size) - arc_p));
4478 
4479 	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4480 
4481 	/*
4482 	 * Similar to the above, we want to evict enough bytes to get us
4483 	 * below the meta limit, but not so much as to drop us below the
4484 	 * space allotted to the MFU (which is defined as arc_c - arc_p).
4485 	 */
4486 	target = MIN((int64_t)(meta_used - arc_meta_limit),
4487 	    (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) -
4488 	    (arc_c - arc_p)));
4489 
4490 	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4491 
4492 	return (total_evicted);
4493 }
4494 
4495 /*
4496  * Return the type of the oldest buffer in the given arc state
4497  *
4498  * This function will select a random sublist of type ARC_BUFC_DATA and
4499  * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
4500  * is compared, and the type which contains the "older" buffer will be
4501  * returned.
4502  */
4503 static arc_buf_contents_t
4504 arc_adjust_type(arc_state_t *state)
4505 {
4506 	multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
4507 	multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
4508 	int data_idx = multilist_get_random_index(data_ml);
4509 	int meta_idx = multilist_get_random_index(meta_ml);
4510 	multilist_sublist_t *data_mls;
4511 	multilist_sublist_t *meta_mls;
4512 	arc_buf_contents_t type;
4513 	arc_buf_hdr_t *data_hdr;
4514 	arc_buf_hdr_t *meta_hdr;
4515 
4516 	/*
4517 	 * We keep the sublist lock until we're finished, to prevent
4518 	 * the headers from being destroyed via arc_evict_state().
4519 	 */
4520 	data_mls = multilist_sublist_lock(data_ml, data_idx);
4521 	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
4522 
4523 	/*
4524 	 * These two loops are to ensure we skip any markers that
4525 	 * might be at the tail of the lists due to arc_evict_state().
4526 	 */
4527 
4528 	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
4529 	    data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
4530 		if (data_hdr->b_spa != 0)
4531 			break;
4532 	}
4533 
4534 	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
4535 	    meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
4536 		if (meta_hdr->b_spa != 0)
4537 			break;
4538 	}
4539 
4540 	if (data_hdr == NULL && meta_hdr == NULL) {
4541 		type = ARC_BUFC_DATA;
4542 	} else if (data_hdr == NULL) {
4543 		ASSERT3P(meta_hdr, !=, NULL);
4544 		type = ARC_BUFC_METADATA;
4545 	} else if (meta_hdr == NULL) {
4546 		ASSERT3P(data_hdr, !=, NULL);
4547 		type = ARC_BUFC_DATA;
4548 	} else {
4549 		ASSERT3P(data_hdr, !=, NULL);
4550 		ASSERT3P(meta_hdr, !=, NULL);
4551 
4552 		/* The headers can't be on the sublist without an L1 header */
4553 		ASSERT(HDR_HAS_L1HDR(data_hdr));
4554 		ASSERT(HDR_HAS_L1HDR(meta_hdr));
4555 
4556 		if (data_hdr->b_l1hdr.b_arc_access <
4557 		    meta_hdr->b_l1hdr.b_arc_access) {
4558 			type = ARC_BUFC_DATA;
4559 		} else {
4560 			type = ARC_BUFC_METADATA;
4561 		}
4562 	}
4563 
4564 	multilist_sublist_unlock(meta_mls);
4565 	multilist_sublist_unlock(data_mls);
4566 
4567 	return (type);
4568 }
4569 
4570 /*
4571  * Evict buffers from the cache, such that arc_size is capped by arc_c.
4572  */
4573 static uint64_t
4574 arc_adjust(void)
4575 {
4576 	uint64_t total_evicted = 0;
4577 	uint64_t bytes;
4578 	int64_t target;
4579 	uint64_t asize = aggsum_value(&arc_size);
4580 	uint64_t ameta = aggsum_value(&arc_meta_used);
4581 
4582 	/*
4583 	 * If we're over arc_meta_limit, we want to correct that before
4584 	 * potentially evicting data buffers below.
4585 	 */
4586 	total_evicted += arc_adjust_meta(ameta);
4587 
4588 	/*
4589 	 * Adjust MRU size
4590 	 *
4591 	 * If we're over the target cache size, we want to evict enough
4592 	 * from the list to get back to our target size. We don't want
4593 	 * to evict too much from the MRU, such that it drops below
4594 	 * arc_p. So, if we're over our target cache size more than
4595 	 * the MRU is over arc_p, we'll evict enough to get back to
4596 	 * arc_p here, and then evict more from the MFU below.
4597 	 */
4598 	target = MIN((int64_t)(asize - arc_c),
4599 	    (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
4600 	    zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
4601 
4602 	/*
4603 	 * If we're below arc_meta_min, always prefer to evict data.
4604 	 * Otherwise, try to satisfy the requested number of bytes to
4605 	 * evict from the type which contains older buffers; in an
4606 	 * effort to keep newer buffers in the cache regardless of their
4607 	 * type. If we cannot satisfy the number of bytes from this
4608 	 * type, spill over into the next type.
4609 	 */
4610 	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
4611 	    ameta > arc_meta_min) {
4612 		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4613 		total_evicted += bytes;
4614 
4615 		/*
4616 		 * If we couldn't evict our target number of bytes from
4617 		 * metadata, we try to get the rest from data.
4618 		 */
4619 		target -= bytes;
4620 
4621 		total_evicted +=
4622 		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4623 	} else {
4624 		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
4625 		total_evicted += bytes;
4626 
4627 		/*
4628 		 * If we couldn't evict our target number of bytes from
4629 		 * data, we try to get the rest from metadata.
4630 		 */
4631 		target -= bytes;
4632 
4633 		total_evicted +=
4634 		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
4635 	}
4636 
4637 	/*
4638 	 * Adjust MFU size
4639 	 *
4640 	 * Now that we've tried to evict enough from the MRU to get its
4641 	 * size back to arc_p, if we're still above the target cache
4642 	 * size, we evict the rest from the MFU.
4643 	 */
4644 	target = asize - arc_c;
4645 
4646 	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
4647 	    ameta > arc_meta_min) {
4648 		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4649 		total_evicted += bytes;
4650 
4651 		/*
4652 		 * If we couldn't evict our target number of bytes from
4653 		 * metadata, we try to get the rest from data.
4654 		 */
4655 		target -= bytes;
4656 
4657 		total_evicted +=
4658 		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4659 	} else {
4660 		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
4661 		total_evicted += bytes;
4662 
4663 		/*
4664 		 * If we couldn't evict our target number of bytes from
4665 		 * data, we try to get the rest from data.
4666 		 */
4667 		target -= bytes;
4668 
4669 		total_evicted +=
4670 		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
4671 	}
4672 
4673 	/*
4674 	 * Adjust ghost lists
4675 	 *
4676 	 * In addition to the above, the ARC also defines target values
4677 	 * for the ghost lists. The sum of the mru list and mru ghost
4678 	 * list should never exceed the target size of the cache, and
4679 	 * the sum of the mru list, mfu list, mru ghost list, and mfu
4680 	 * ghost list should never exceed twice the target size of the
4681 	 * cache. The following logic enforces these limits on the ghost
4682 	 * caches, and evicts from them as needed.
4683 	 */
4684 	target = zfs_refcount_count(&arc_mru->arcs_size) +
4685 	    zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
4686 
4687 	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
4688 	total_evicted += bytes;
4689 
4690 	target -= bytes;
4691 
4692 	total_evicted +=
4693 	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
4694 
4695 	/*
4696 	 * We assume the sum of the mru list and mfu list is less than
4697 	 * or equal to arc_c (we enforced this above), which means we
4698 	 * can use the simpler of the two equations below:
4699 	 *
4700 	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
4701 	 *		    mru ghost + mfu ghost <= arc_c
4702 	 */
4703 	target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
4704 	    zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
4705 
4706 	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
4707 	total_evicted += bytes;
4708 
4709 	target -= bytes;
4710 
4711 	total_evicted +=
4712 	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
4713 
4714 	return (total_evicted);
4715 }
4716 
4717 void
4718 arc_flush(spa_t *spa, boolean_t retry)
4719 {
4720 	uint64_t guid = 0;
4721 
4722 	/*
4723 	 * If retry is B_TRUE, a spa must not be specified since we have
4724 	 * no good way to determine if all of a spa's buffers have been
4725 	 * evicted from an arc state.
4726 	 */
4727 	ASSERT(!retry || spa == 0);
4728 
4729 	if (spa != NULL)
4730 		guid = spa_load_guid(spa);
4731 
4732 	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
4733 	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
4734 
4735 	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
4736 	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
4737 
4738 	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
4739 	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
4740 
4741 	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
4742 	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
4743 }
4744 
4745 static void
4746 arc_reduce_target_size(int64_t to_free)
4747 {
4748 	uint64_t asize = aggsum_value(&arc_size);
4749 	if (arc_c > arc_c_min) {
4750 
4751 		if (arc_c > arc_c_min + to_free)
4752 			atomic_add_64(&arc_c, -to_free);
4753 		else
4754 			arc_c = arc_c_min;
4755 
4756 		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
4757 		if (asize < arc_c)
4758 			arc_c = MAX(asize, arc_c_min);
4759 		if (arc_p > arc_c)
4760 			arc_p = (arc_c >> 1);
4761 		ASSERT(arc_c >= arc_c_min);
4762 		ASSERT((int64_t)arc_p >= 0);
4763 	}
4764 
4765 	if (asize > arc_c) {
4766 		/* See comment in arc_adjust_cb_check() on why lock+flag */
4767 		mutex_enter(&arc_adjust_lock);
4768 		arc_adjust_needed = B_TRUE;
4769 		mutex_exit(&arc_adjust_lock);
4770 		zthr_wakeup(arc_adjust_zthr);
4771 	}
4772 }
4773 
/*
 * Identifies which check in arc_available_memory() produced the lowest
 * (most constraining) free memory figure.
 */
typedef enum free_memory_reason_t {
	FMR_UNKNOWN = 0,		/* no check recorded a reason */
	FMR_NEEDFREE = 1,		/* needfree pages are outstanding */
	FMR_LOTSFREE = 2,		/* freemem vs. lotsfree check */
	FMR_SWAPFS_MINFREE = 3,		/* availrmem vs. swapfs check */
	FMR_PAGES_PP_MAXIMUM = 4,	/* availrmem vs. pages_pp_maximum */
	FMR_HEAP_ARENA = 5,		/* kernel heap arena check */
	FMR_ZIO_ARENA = 6,		/* zio arena check */
} free_memory_reason_t;

/*
 * Result and reason recorded by the most recent call to
 * arc_available_memory().
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Extra pages held back, on top of pages_pp_maximum, in the
 * page-locking check.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Extra pages held back for swapfs in the swapfs check.
 */
int64_t arc_swapfs_reserve = 64;
4796 
4797 /*
4798  * Return the amount of memory that can be consumed before reclaim will be
4799  * needed.  Positive if there is sufficient free memory, negative indicates
4800  * the amount of memory that needs to be freed up.
4801  */
4802 static int64_t
4803 arc_available_memory(void)
4804 {
4805 	int64_t lowest = INT64_MAX;
4806 	int64_t n;
4807 	free_memory_reason_t r = FMR_UNKNOWN;
4808 
4809 #ifdef _KERNEL
4810 	if (needfree > 0) {
4811 		n = PAGESIZE * (-needfree);
4812 		if (n < lowest) {
4813 			lowest = n;
4814 			r = FMR_NEEDFREE;
4815 		}
4816 	}
4817 
4818 	/*
4819 	 * check that we're out of range of the pageout scanner.  It starts to
4820 	 * schedule paging if freemem is less than lotsfree and needfree.
4821 	 * lotsfree is the high-water mark for pageout, and needfree is the
4822 	 * number of needed free pages.  We add extra pages here to make sure
4823 	 * the scanner doesn't start up while we're freeing memory.
4824 	 */
4825 	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
4826 	if (n < lowest) {
4827 		lowest = n;
4828 		r = FMR_LOTSFREE;
4829 	}
4830 
4831 	/*
4832 	 * check to make sure that swapfs has enough space so that anon
4833 	 * reservations can still succeed. anon_resvmem() checks that the
4834 	 * availrmem is greater than swapfs_minfree, and the number of reserved
4835 	 * swap pages.  We also add a bit of extra here just to prevent
4836 	 * circumstances from getting really dire.
4837 	 */
4838 	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
4839 	    desfree - arc_swapfs_reserve);
4840 	if (n < lowest) {
4841 		lowest = n;
4842 		r = FMR_SWAPFS_MINFREE;
4843 	}
4844 
4845 
4846 	/*
4847 	 * Check that we have enough availrmem that memory locking (e.g., via
4848 	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
4849 	 * stores the number of pages that cannot be locked; when availrmem
4850 	 * drops below pages_pp_maximum, page locking mechanisms such as
4851 	 * page_pp_lock() will fail.)
4852 	 */
4853 	n = PAGESIZE * (availrmem - pages_pp_maximum -
4854 	    arc_pages_pp_reserve);
4855 	if (n < lowest) {
4856 		lowest = n;
4857 		r = FMR_PAGES_PP_MAXIMUM;
4858 	}
4859 
4860 #if defined(__i386)
4861 	/*
4862 	 * If we're on an i386 platform, it's possible that we'll exhaust the
4863 	 * kernel heap space before we ever run out of available physical
4864 	 * memory.  Most checks of the size of the heap_area compare against
4865 	 * tune.t_minarmem, which is the minimum available real memory that we
4866 	 * can have in the system.  However, this is generally fixed at 25 pages
4867 	 * which is so low that it's useless.  In this comparison, we seek to
4868 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
4869 	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
4870 	 * free)
4871 	 */
4872 	n = (int64_t)vmem_size(heap_arena, VMEM_FREE) -
4873 	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
4874 	if (n < lowest) {
4875 		lowest = n;
4876 		r = FMR_HEAP_ARENA;
4877 	}
4878 #endif
4879 
4880 	/*
4881 	 * If zio data pages are being allocated out of a separate heap segment,
4882 	 * then enforce that the size of available vmem for this arena remains
4883 	 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free.
4884 	 *
4885 	 * Note that reducing the arc_zio_arena_free_shift keeps more virtual
4886 	 * memory (in the zio_arena) free, which can avoid memory
4887 	 * fragmentation issues.
4888 	 */
4889 	if (zio_arena != NULL) {
4890 		n = (int64_t)vmem_size(zio_arena, VMEM_FREE) -
4891 		    (vmem_size(zio_arena, VMEM_ALLOC) >>
4892 		    arc_zio_arena_free_shift);
4893 		if (n < lowest) {
4894 			lowest = n;
4895 			r = FMR_ZIO_ARENA;
4896 		}
4897 	}
4898 #else
4899 	/* Every 100 calls, free a small amount */
4900 	if (spa_get_random(100) == 0)
4901 		lowest = -1024;
4902 #endif
4903 
4904 	last_free_memory = lowest;
4905 	last_free_reason = r;
4906 
4907 	return (lowest);
4908 }
4909 
4910 
4911 /*
4912  * Determine if the system is under memory pressure and is asking
4913  * to reclaim memory. A return value of B_TRUE indicates that the system
4914  * is under memory pressure and that the arc should adjust accordingly.
4915  */
4916 static boolean_t
4917 arc_reclaim_needed(void)
4918 {
4919 	return (arc_available_memory() < 0);
4920 }
4921 
4922 static void
4923 arc_kmem_reap_soon(void)
4924 {
4925 	size_t			i;
4926 	kmem_cache_t		*prev_cache = NULL;
4927 	kmem_cache_t		*prev_data_cache = NULL;
4928 	extern kmem_cache_t	*zio_buf_cache[];
4929 	extern kmem_cache_t	*zio_data_buf_cache[];
4930 	extern kmem_cache_t	*range_seg_cache;
4931 	extern kmem_cache_t	*abd_chunk_cache;
4932 
4933 #ifdef _KERNEL
4934 	if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) {
4935 		/*
4936 		 * We are exceeding our meta-data cache limit.
4937 		 * Purge some DNLC entries to release holds on meta-data.
4938 		 */
4939 		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
4940 	}
4941 #if defined(__i386)
4942 	/*
4943 	 * Reclaim unused memory from all kmem caches.
4944 	 */
4945 	kmem_reap();
4946 #endif
4947 #endif
4948 
4949 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
4950 		if (zio_buf_cache[i] != prev_cache) {
4951 			prev_cache = zio_buf_cache[i];
4952 			kmem_cache_reap_soon(zio_buf_cache[i]);
4953 		}
4954 		if (zio_data_buf_cache[i] != prev_data_cache) {
4955 			prev_data_cache = zio_data_buf_cache[i];
4956 			kmem_cache_reap_soon(zio_data_buf_cache[i]);
4957 		}
4958 	}
4959 	kmem_cache_reap_soon(abd_chunk_cache);
4960 	kmem_cache_reap_soon(buf_cache);
4961 	kmem_cache_reap_soon(hdr_full_cache);
4962 	kmem_cache_reap_soon(hdr_l2only_cache);
4963 	kmem_cache_reap_soon(range_seg_cache);
4964 
4965 	if (zio_arena != NULL) {
4966 		/*
4967 		 * Ask the vmem arena to reclaim unused memory from its
4968 		 * quantum caches.
4969 		 */
4970 		vmem_qcache_reap(zio_arena);
4971 	}
4972 }
4973 
4974 /* ARGSUSED */
4975 static boolean_t
4976 arc_adjust_cb_check(void *arg, zthr_t *zthr)
4977 {
4978 	/*
4979 	 * This is necessary in order for the mdb ::arc dcmd to
4980 	 * show up to date information. Since the ::arc command
4981 	 * does not call the kstat's update function, without
4982 	 * this call, the command may show stale stats for the
4983 	 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
4984 	 * with this change, the data might be up to 1 second
4985 	 * out of date(the arc_adjust_zthr has a maximum sleep
4986 	 * time of 1 second); but that should suffice.  The
4987 	 * arc_state_t structures can be queried directly if more
4988 	 * accurate information is needed.
4989 	 */
4990 	if (arc_ksp != NULL)
4991 		arc_ksp->ks_update(arc_ksp, KSTAT_READ);
4992 
4993 	/*
4994 	 * We have to rely on arc_get_data_impl() to tell us when to adjust,
4995 	 * rather than checking if we are overflowing here, so that we are
4996 	 * sure to not leave arc_get_data_impl() waiting on
4997 	 * arc_adjust_waiters_cv.  If we have become "not overflowing" since
4998 	 * arc_get_data_impl() checked, we need to wake it up.  We could
4999 	 * broadcast the CV here, but arc_get_data_impl() may have not yet
5000 	 * gone to sleep.  We would need to use a mutex to ensure that this
5001 	 * function doesn't broadcast until arc_get_data_impl() has gone to
5002 	 * sleep (e.g. the arc_adjust_lock).  However, the lock ordering of
5003 	 * such a lock would necessarily be incorrect with respect to the
5004 	 * zthr_lock, which is held before this function is called, and is
5005 	 * held by arc_get_data_impl() when it calls zthr_wakeup().
5006 	 */
5007 	return (arc_adjust_needed);
5008 }
5009 
5010 /*
5011  * Keep arc_size under arc_c by running arc_adjust which evicts data
5012  * from the ARC.
5013  */
5014 /* ARGSUSED */
5015 static void
5016 arc_adjust_cb(void *arg, zthr_t *zthr)
5017 {
5018 	uint64_t evicted = 0;
5019 
5020 	/* Evict from cache */
5021 	evicted = arc_adjust();
5022 
5023 	/*
5024 	 * If evicted is zero, we couldn't evict anything
5025 	 * via arc_adjust(). This could be due to hash lock
5026 	 * collisions, but more likely due to the majority of
5027 	 * arc buffers being unevictable. Therefore, even if
5028 	 * arc_size is above arc_c, another pass is unlikely to
5029 	 * be helpful and could potentially cause us to enter an
5030 	 * infinite loop.  Additionally, zthr_iscancelled() is
5031 	 * checked here so that if the arc is shutting down, the
5032 	 * broadcast will wake any remaining arc adjust waiters.
5033 	 */
5034 	mutex_enter(&arc_adjust_lock);
5035 	arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) &&
5036 	    evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
5037 	if (!arc_adjust_needed) {
5038 		/*
5039 		 * We're either no longer overflowing, or we
5040 		 * can't evict anything more, so we should wake
5041 		 * up any waiters.
5042 		 */
5043 		cv_broadcast(&arc_adjust_waiters_cv);
5044 	}
5045 	mutex_exit(&arc_adjust_lock);
5046 }
5047 
5048 /* ARGSUSED */
5049 static boolean_t
5050 arc_reap_cb_check(void *arg, zthr_t *zthr)
5051 {
5052 	int64_t free_memory = arc_available_memory();
5053 
5054 	/*
5055 	 * If a kmem reap is already active, don't schedule more.  We must
5056 	 * check for this because kmem_cache_reap_soon() won't actually
5057 	 * block on the cache being reaped (this is to prevent callers from
5058 	 * becoming implicitly blocked by a system-wide kmem reap -- which,
5059 	 * on a system with many, many full magazines, can take minutes).
5060 	 */
5061 	if (!kmem_cache_reap_active() &&
5062 	    free_memory < 0) {
5063 		arc_no_grow = B_TRUE;
5064 		arc_warm = B_TRUE;
5065 		/*
5066 		 * Wait at least zfs_grow_retry (default 60) seconds
5067 		 * before considering growing.
5068 		 */
5069 		arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
5070 		return (B_TRUE);
5071 	} else if (free_memory < arc_c >> arc_no_grow_shift) {
5072 		arc_no_grow = B_TRUE;
5073 	} else if (gethrtime() >= arc_growtime) {
5074 		arc_no_grow = B_FALSE;
5075 	}
5076 
5077 	return (B_FALSE);
5078 }
5079 
5080 /*
5081  * Keep enough free memory in the system by reaping the ARC's kmem
5082  * caches.  To cause more slabs to be reapable, we may reduce the
5083  * target size of the cache (arc_c), causing the arc_adjust_cb()
5084  * to free more buffers.
5085  */
5086 /* ARGSUSED */
5087 static void
5088 arc_reap_cb(void *arg, zthr_t *zthr)
5089 {
5090 	int64_t free_memory;
5091 
5092 	/*
5093 	 * Kick off asynchronous kmem_reap()'s of all our caches.
5094 	 */
5095 	arc_kmem_reap_soon();
5096 
5097 	/*
5098 	 * Wait at least arc_kmem_cache_reap_retry_ms between
5099 	 * arc_kmem_reap_soon() calls. Without this check it is possible to
5100 	 * end up in a situation where we spend lots of time reaping
5101 	 * caches, while we're near arc_c_min.  Waiting here also gives the
5102 	 * subsequent free memory check a chance of finding that the
5103 	 * asynchronous reap has already freed enough memory, and we don't
5104 	 * need to call arc_reduce_target_size().
5105 	 */
5106 	delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
5107 
5108 	/*
5109 	 * Reduce the target size as needed to maintain the amount of free
5110 	 * memory in the system at a fraction of the arc_size (1/128th by
5111 	 * default).  If oversubscribed (free_memory < 0) then reduce the
5112 	 * target arc_size by the deficit amount plus the fractional
5113 	 * amount.  If free memory is positive but less then the fractional
5114 	 * amount, reduce by what is needed to hit the fractional amount.
5115 	 */
5116 	free_memory = arc_available_memory();
5117 
5118 	int64_t to_free =
5119 	    (arc_c >> arc_shrink_shift) - free_memory;
5120 	if (to_free > 0) {
5121 #ifdef _KERNEL
5122 		to_free = MAX(to_free, ptob(needfree));
5123 #endif
5124 		arc_reduce_target_size(to_free);
5125 	}
5126 }
5127 
5128 /*
5129  * Adapt arc info given the number of bytes we are trying to add and
5130  * the state that we are coming from.  This function is only called
5131  * when we are adding new content to the cache.
5132  */
5133 static void
5134 arc_adapt(int bytes, arc_state_t *state)
5135 {
5136 	int mult;
5137 	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
5138 	int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size);
5139 	int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size);
5140 
5141 	if (state == arc_l2c_only)
5142 		return;
5143 
5144 	ASSERT(bytes > 0);
5145 	/*
5146 	 * Adapt the target size of the MRU list:
5147 	 *	- if we just hit in the MRU ghost list, then increase
5148 	 *	  the target size of the MRU list.
5149 	 *	- if we just hit in the MFU ghost list, then increase
5150 	 *	  the target size of the MFU list by decreasing the
5151 	 *	  target size of the MRU list.
5152 	 */
5153 	if (state == arc_mru_ghost) {
5154 		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
5155 		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
5156 
5157 		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
5158 	} else if (state == arc_mfu_ghost) {
5159 		uint64_t delta;
5160 
5161 		mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
5162 		mult = MIN(mult, 10);
5163 
5164 		delta = MIN(bytes * mult, arc_p);
5165 		arc_p = MAX(arc_p_min, arc_p - delta);
5166 	}
5167 	ASSERT((int64_t)arc_p >= 0);
5168 
5169 	/*
5170 	 * Wake reap thread if we do not have any available memory
5171 	 */
5172 	if (arc_reclaim_needed()) {
5173 		zthr_wakeup(arc_reap_zthr);
5174 		return;
5175 	}
5176 
5177 
5178 	if (arc_no_grow)
5179 		return;
5180 
5181 	if (arc_c >= arc_c_max)
5182 		return;
5183 
5184 	/*
5185 	 * If we're within (2 * maxblocksize) bytes of the target
5186 	 * cache size, increment the target cache size
5187 	 */
5188 	if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) >
5189 	    0) {
5190 		atomic_add_64(&arc_c, (int64_t)bytes);
5191 		if (arc_c > arc_c_max)
5192 			arc_c = arc_c_max;
5193 		else if (state == arc_anon)
5194 			atomic_add_64(&arc_p, (int64_t)bytes);
5195 		if (arc_p > arc_c)
5196 			arc_p = arc_c;
5197 	}
5198 	ASSERT((int64_t)arc_p >= 0);
5199 }
5200 
5201 /*
5202  * Check if arc_size has grown past our upper threshold, determined by
5203  * zfs_arc_overflow_shift.
5204  */
5205 static boolean_t
5206 arc_is_overflowing(void)
5207 {
5208 	/* Always allow at least one block of overflow */
5209 	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
5210 	    arc_c >> zfs_arc_overflow_shift);
5211 
5212 	/*
5213 	 * We just compare the lower bound here for performance reasons. Our
5214 	 * primary goals are to make sure that the arc never grows without
5215 	 * bound, and that it can reach its maximum size. This check
5216 	 * accomplishes both goals. The maximum amount we could run over by is
5217 	 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block
5218 	 * in the ARC. In practice, that's in the tens of MB, which is low
5219 	 * enough to be safe.
5220 	 */
5221 	return (aggsum_lower_bound(&arc_size) >= arc_c + overflow);
5222 }
5223 
5224 static abd_t *
5225 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5226 {
5227 	arc_buf_contents_t type = arc_buf_type(hdr);
5228 
5229 	arc_get_data_impl(hdr, size, tag);
5230 	if (type == ARC_BUFC_METADATA) {
5231 		return (abd_alloc(size, B_TRUE));
5232 	} else {
5233 		ASSERT(type == ARC_BUFC_DATA);
5234 		return (abd_alloc(size, B_FALSE));
5235 	}
5236 }
5237 
5238 static void *
5239 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5240 {
5241 	arc_buf_contents_t type = arc_buf_type(hdr);
5242 
5243 	arc_get_data_impl(hdr, size, tag);
5244 	if (type == ARC_BUFC_METADATA) {
5245 		return (zio_buf_alloc(size));
5246 	} else {
5247 		ASSERT(type == ARC_BUFC_DATA);
5248 		return (zio_data_buf_alloc(size));
5249 	}
5250 }
5251 
/*
 * Charge `size' bytes to the ARC on behalf of hdr.  No memory is allocated
 * or returned here; arc_get_data_abd() and arc_get_data_buf() call this
 * first and then perform the actual allocation themselves.
 *
 * If we are hitting the hard limit for the cache size, we must sleep,
 * waiting for the eviction thread to catch up. If we're past the target
 * size but below the hard limit, we'll only signal the reclaim thread and
 * continue on.
 *
 * In order, this function:
 *   1. lets arc_adapt() react to the request,
 *   2. waits (at most once) on arc_adjust_waiters_cv if the ARC is
 *      overflowing,
 *   3. consumes the space as ARC_SPACE_META or ARC_SPACE_DATA,
 *   4. for non-ghost states, adds `size' to the state's arcs_size (and to
 *      arcs_esize[type] if the header is linked on a multilist), and
 *   5. possibly grows arc_p for anonymous data.
 *
 * `tag' is the holder passed to the zfs_refcount_add_many() calls.
 */
static void
arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
{
	/* Snapshot of the header's state and type taken on entry. */
	arc_state_t *state = hdr->b_l1hdr.b_state;
	arc_buf_contents_t type = arc_buf_type(hdr);

	arc_adapt(size, state);

	/*
	 * If arc_size is currently overflowing, and has grown past our
	 * upper limit, we must be adding data faster than the evict
	 * thread can evict. Thus, to ensure we don't compound the
	 * problem by adding more data and forcing arc_size to grow even
	 * further past its target size, we halt and wait for the
	 * eviction thread to catch up.
	 *
	 * It's also possible that the reclaim thread is unable to evict
	 * enough buffers to get arc_size below the overflow limit (e.g.
	 * due to buffers being un-evictable, or hash lock collisions).
	 * In this case, we want to proceed regardless if we're
	 * overflowing; thus we don't use a while loop here.
	 */
	if (arc_is_overflowing()) {
		mutex_enter(&arc_adjust_lock);

		/*
		 * Now that we've acquired the lock, we may no longer be
		 * over the overflow limit, lets check.
		 *
		 * We're ignoring the case of spurious wake ups. If that
		 * were to happen, it'd let this thread consume an ARC
		 * buffer before it should have (i.e. before we're under
		 * the overflow limit and were signalled by the reclaim
		 * thread). As long as that is a rare occurrence, it
		 * shouldn't cause any harm.
		 */
		if (arc_is_overflowing()) {
			/* Flag the need, wake the adjust zthr, then sleep. */
			arc_adjust_needed = B_TRUE;
			zthr_wakeup(arc_adjust_zthr);
			(void) cv_wait(&arc_adjust_waiters_cv,
			    &arc_adjust_lock);
		}
		mutex_exit(&arc_adjust_lock);
	}

	/* The header's recorded type must match what arc_buf_type() said. */
	VERIFY3U(hdr->b_type, ==, type);
	if (type == ARC_BUFC_METADATA) {
		arc_space_consume(size, ARC_SPACE_META);
	} else {
		arc_space_consume(size, ARC_SPACE_DATA);
	}

	/*
	 * Update the state size.  Note that ghost states have a
	 * "ghost size" and so don't need to be updated.
	 */
	if (!GHOST_STATE(state)) {

		(void) zfs_refcount_add_many(&state->arcs_size, size, tag);

		/*
		 * If this is reached via arc_read, the link is
		 * protected by the hash lock. If reached via
		 * arc_buf_alloc, the header should not be accessed by
		 * any other thread. And, if reached via arc_read_done,
		 * the hash lock will protect it if it's found in the
		 * hash table; otherwise no other thread should be
		 * trying to [add|remove]_reference it.
		 */
		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
			/*
			 * A header that is linked on a list has no holds,
			 * so the new bytes also count as evictable.
			 */
			ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			(void) zfs_refcount_add_many(&state->arcs_esize[type],
			    size, tag);
		}

		/*
		 * If we are growing the cache, and we are adding anonymous
		 * data, and we have outgrown arc_p, update arc_p
		 * (never letting it exceed arc_c).
		 */
		if (aggsum_compare(&arc_size, arc_c) < 0 &&
		    hdr->b_l1hdr.b_state == arc_anon &&
		    (zfs_refcount_count(&arc_anon->arcs_size) +
		    zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
			arc_p = MIN(arc_c, arc_p + size);
	}
}
5344 
/*
 * Release an ABD that was charged to hdr: first undo the `size'-byte
 * accounting via arc_free_data_impl(), then free the ABD itself.
 * `tag' is passed through to arc_free_data_impl().
 */
static void
arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
{
	arc_free_data_impl(hdr, size, tag);
	abd_free(abd);
}
5351 
5352 static void
5353 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
5354 {
5355 	arc_buf_contents_t type = arc_buf_type(hdr);
5356 
5357 	arc_free_data_impl(hdr, size, tag);
5358 	if (type == ARC_BUFC_METADATA) {
5359 		zio_buf_free(buf, size);
5360 	} else {
5361 		ASSERT(type == ARC_BUFC_DATA);
5362 		zio_data_buf_free(buf, size);
5363 	}
5364 }
5365 
5366 /*
5367  * Free the arc data buffer.
5368  */
5369 static void
5370 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
5371 {
5372 	arc_state_t *state = hdr->b_l1hdr.b_state;
5373 	arc_buf_contents_t type = arc_buf_type(hdr);
5374 
5375 	/* protected by hash lock, if in the hash table */
5376 	if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
5377 		ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5378 		ASSERT(state != arc_anon && state != arc_l2c_only);
5379 
5380 		(void) zfs_refcount_remove_many(&state->arcs_esize[type],
5381 		    size, tag);
5382 	}
5383 	(void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
5384 
5385 	VERIFY3U(hdr->b_type, ==, type);
5386 	if (type == ARC_BUFC_METADATA) {
5387 		arc_space_return(size, ARC_SPACE_META);
5388 	} else {
5389 		ASSERT(type == ARC_BUFC_DATA);
5390 		arc_space_return(size, ARC_SPACE_DATA);
5391 	}
5392 }
5393 
/*
 * This routine is called whenever a buffer is accessed.  It stamps the
 * header with the current lbolt and moves it between ARC states:
 *
 *   arc_anon       -> arc_mru
 *   arc_mru        -> stays in arc_mru if a prefetch flag is set, or if
 *                     the previous access was no more than ARC_MINTIME
 *                     ticks ago; otherwise -> arc_mfu
 *   arc_mru_ghost  -> arc_mru if a prefetch flag is set, else arc_mfu
 *   arc_mfu        -> stays in arc_mfu
 *   arc_mfu_ghost  -> arc_mru if a prefetch flag is set, else arc_mfu
 *   arc_l2c_only   -> arc_mfu
 *
 * The caller must hold hash_lock and hdr must have an L1 header (both are
 * asserted).
 *
 * NOTE(review): this comment used to say "the hash lock is dropped in this
 * function", but nothing in this function releases it, and the callers
 * visible in this file (arc_buf_access(), arc_read()) call
 * mutex_exit(hash_lock) themselves after arc_access() returns.  Confirm
 * that arc_change_state() does not drop it either.
 */
static void
arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
	clock_t now;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(HDR_HAS_L1HDR(hdr));

	if (hdr->b_l1hdr.b_state == arc_anon) {
		/*
		 * This buffer is not in the cache, and does not
		 * appear in our "ghost" list.  Add the new buffer
		 * to the MRU state.
		 */

		ASSERT0(hdr->b_l1hdr.b_arc_access);
		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
		arc_change_state(arc_mru, hdr, hash_lock);

	} else if (hdr->b_l1hdr.b_state == arc_mru) {
		now = ddi_get_lbolt();

		/*
		 * If this buffer is here because of a prefetch, then either:
		 * - clear the flag if this is a "referencing" read
		 *   (any subsequent access will bump this into the MFU state).
		 * or
		 * - move the buffer to the head of the list if this is
		 *   another prefetch (to make it less likely to be evicted).
		 *
		 * Either way the access time is refreshed and we return
		 * without considering a promotion to MFU.  Note that the
		 * MRU hit is only counted on the "referencing" path.
		 */
		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
			if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
				/* link protected by hash lock */
				ASSERT(multilist_link_active(
				    &hdr->b_l1hdr.b_arc_node));
			} else {
				arc_hdr_clear_flags(hdr,
				    ARC_FLAG_PREFETCH |
				    ARC_FLAG_PRESCIENT_PREFETCH);
				ARCSTAT_BUMP(arcstat_mru_hits);
			}
			hdr->b_l1hdr.b_arc_access = now;
			return;
		}

		/*
		 * This buffer has been "accessed" only once so far,
		 * but it is still in the cache. Move it to the MFU
		 * state.
		 */
		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
			/*
			 * More than ARC_MINTIME ticks have passed since
			 * we instantiated this buffer.  Move it to the
			 * most frequently used state.
			 *
			 * NOTE(review): this comment historically said
			 * "125ms"; ARC_MINTIME's definition is not visible
			 * here, so confirm the actual interval.
			 */
			hdr->b_l1hdr.b_arc_access = now;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
			arc_change_state(arc_mfu, hdr, hash_lock);
		}
		/* Counted as an MRU hit whether or not it was promoted. */
		ARCSTAT_BUMP(arcstat_mru_hits);
	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
		arc_state_t	*new_state;
		/*
		 * This buffer has been "accessed" recently, but
		 * was evicted from the cache.  Move it to the
		 * MFU state (or back to MRU if it is a prefetch).
		 */

		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
			new_state = arc_mru;
			/* The prefetch flags are cleared only if holds exist. */
			if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
				arc_hdr_clear_flags(hdr,
				    ARC_FLAG_PREFETCH |
				    ARC_FLAG_PRESCIENT_PREFETCH);
			}
			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
		} else {
			new_state = arc_mfu;
			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		}

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		arc_change_state(new_state, hdr, hash_lock);

		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
		/*
		 * This buffer has been accessed more than once and is
		 * still in the cache.  Keep it in the MFU state.
		 *
		 * NOTE: an add_reference() that occurred when we did
		 * the arc_read() will have kicked this off the list.
		 * If it was a prefetch, we will explicitly move it to
		 * the head of the list now.
		 *
		 * NOTE(review): no list manipulation is visible in this
		 * branch; only the hit counter and access time are updated.
		 */
		ARCSTAT_BUMP(arcstat_mfu_hits);
		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
		arc_state_t	*new_state = arc_mfu;
		/*
		 * This buffer has been accessed more than once but has
		 * been evicted from the cache.  Move it back to the
		 * MFU state.
		 */

		if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
			/*
			 * This is a prefetch access...
			 * move this block back to the MRU state.
			 */
			new_state = arc_mru;
		}

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		/*
		 * Note: the probe below is named new_state__mfu even when
		 * new_state was switched to arc_mru above.
		 */
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		arc_change_state(new_state, hdr, hash_lock);

		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
		/*
		 * This buffer is on the 2nd Level ARC.
		 * It is brought back directly into the MFU state.
		 */

		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
		arc_change_state(arc_mfu, hdr, hash_lock);
	} else {
		ASSERT(!"invalid arc state");
	}
}
5530 
/*
 * This routine is called by dbuf_hold() to update the arc_access() state
 * which otherwise would be skipped for entries in the dbuf cache.
 *
 * Buffers whose header is anonymous or empty are ignored.  Otherwise the
 * header is run through arc_access() under its hash lock and the access
 * is counted as an ARC hit.
 */
void
arc_buf_access(arc_buf_t *buf)
{
	/* b_evict_lock is held while buf->b_hdr is read and inspected. */
	mutex_enter(&buf->b_evict_lock);
	arc_buf_hdr_t *hdr = buf->b_hdr;

	/*
	 * Avoid taking the hash_lock when possible as an optimization.
	 * The header must be checked again under the hash_lock in order
	 * to handle the case where it is concurrently being released.
	 */
	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
		mutex_exit(&buf->b_evict_lock);
		return;
	}

	kmutex_t *hash_lock = HDR_LOCK(hdr);
	mutex_enter(hash_lock);

	/*
	 * Re-check now that the hash lock is held.  Only this second,
	 * locked bail-out is counted in arcstat_access_skip.
	 */
	if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) {
		mutex_exit(hash_lock);
		mutex_exit(&buf->b_evict_lock);
		ARCSTAT_BUMP(arcstat_access_skip);
		return;
	}

	/* From here on only the hash lock is held. */
	mutex_exit(&buf->b_evict_lock);

	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
	    hdr->b_l1hdr.b_state == arc_mfu);

	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);

	/* Hit statistics, split by demand/prefetch and data/metadata. */
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
	    demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits);
}
5574 
5575 /* a generic arc_read_done_func_t which you can use */
5576 /* ARGSUSED */
5577 void
5578 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5579     arc_buf_t *buf, void *arg)
5580 {
5581 	if (buf == NULL)
5582 		return;
5583 
5584 	bcopy(buf->b_data, arg, arc_buf_size(buf));
5585 	arc_buf_destroy(buf, arg);
5586 }
5587 
5588 /* a generic arc_read_done_func_t */
5589 void
5590 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
5591     arc_buf_t *buf, void *arg)
5592 {
5593 	arc_buf_t **bufp = arg;
5594 
5595 	if (buf == NULL) {
5596 		ASSERT(zio == NULL || zio->io_error != 0);
5597 		*bufp = NULL;
5598 	} else {
5599 		ASSERT(zio == NULL || zio->io_error == 0);
5600 		*bufp = buf;
5601 		ASSERT(buf->b_data != NULL);
5602 	}
5603 }
5604 
5605 static void
5606 arc_hdr_verify(arc_buf_hdr_t *hdr, const blkptr_t *bp)
5607 {
5608 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
5609 		ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0);
5610 		ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF);
5611 	} else {
5612 		if (HDR_COMPRESSION_ENABLED(hdr)) {
5613 			ASSERT3U(arc_hdr_get_compress(hdr), ==,
5614 			    BP_GET_COMPRESS(bp));
5615 		}
5616 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
5617 		ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp));
5618 		ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp));
5619 	}
5620 }
5621 
5622 /*
5623  * XXX this should be changed to return an error, and callers
5624  * re-read from disk on failure (on nondebug bits).
5625  */
5626 static void
5627 arc_hdr_verify_checksum(spa_t *spa, arc_buf_hdr_t *hdr, const blkptr_t *bp)
5628 {
5629 	arc_hdr_verify(hdr, bp);
5630 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
5631 		return;
5632 	int err = 0;
5633 	abd_t *abd = NULL;
5634 	if (BP_IS_ENCRYPTED(bp)) {
5635 		if (HDR_HAS_RABD(hdr)) {
5636 			abd = hdr->b_crypt_hdr.b_rabd;
5637 		}
5638 	} else if (HDR_COMPRESSION_ENABLED(hdr)) {
5639 		abd = hdr->b_l1hdr.b_pabd;
5640 	}
5641 	if (abd != NULL) {
5642 		/*
5643 		 * The offset is only used for labels, which are not
5644 		 * cached in the ARC, so it doesn't matter what we
5645 		 * pass for the offset parameter.
5646 		 */
5647 		int psize = HDR_GET_PSIZE(hdr);
5648 		err = zio_checksum_error_impl(spa, bp,
5649 		    BP_GET_CHECKSUM(bp), abd, psize, 0, NULL);
5650 		if (err != 0) {
5651 			/*
5652 			 * Use abd_copy_to_buf() rather than
5653 			 * abd_borrow_buf_copy() so that we are sure to
5654 			 * include the buf in crash dumps.
5655 			 */
5656 			void *buf = kmem_alloc(psize, KM_SLEEP);
5657 			abd_copy_to_buf(buf, abd, psize);
5658 			panic("checksum of cached data doesn't match BP "
5659 			    "err=%u hdr=%p bp=%p abd=%p buf=%p",
5660 			    err, (void *)hdr, (void *)bp, (void *)abd, buf);
5661 		}
5662 	}
5663 }
5664 
/*
 * Completion handler for a read zio whose io_private is the
 * arc_buf_hdr_t being filled.  (The zio is presumably issued by
 * arc_read(); the registration is outside this excerpt -- TODO confirm.)
 *
 * In outline:
 *   1. If the header is still in the hash table, look it up again to
 *      obtain its hash lock.
 *   2. For protected BPs, record the encryption parameters (object type,
 *      objset, salt, IV, MAC) in the header.
 *   3. On success, record the byteswap function the data requires.
 *   4. Build an arc_buf_t for every registered callback that has a done
 *      function; any failure there is recorded in zio->io_error.
 *   5. Clear the in-progress state.  On error, flag the header, move it
 *      to arc_anon and remove it from the hash table.
 *   6. Wake waiters, drop the hash lock, run and free the callbacks, and
 *      finally destroy the header if it ended up unreferenced.
 */
static void
arc_read_done(zio_t *zio)
{
	blkptr_t	*bp = zio->io_bp;
	arc_buf_hdr_t	*hdr = zio->io_private;
	kmutex_t	*hash_lock = NULL;
	arc_callback_t	*callback_list;
	arc_callback_t	*acb;
	boolean_t	freeable = B_FALSE;

	/*
	 * The hdr was inserted into hash-table and removed from lists
	 * prior to starting I/O.  We should find this header, since
	 * it's in the hash table, and it should be legit since it's
	 * not possible to evict it during the I/O.  The only possible
	 * reason for it not to be found is if we were freed during the
	 * read.
	 */
	if (HDR_IN_HASH_TABLE(hdr)) {
		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
		ASSERT3U(hdr->b_dva.dva_word[0], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
		ASSERT3U(hdr->b_dva.dva_word[1], ==,
		    BP_IDENTITY(zio->io_bp)->dva_word[1]);

		/* The lookup also hands back the hash lock via hash_lock. */
		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
		    &hash_lock);

		ASSERT((found == hdr &&
		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
		    (found == hdr && HDR_L2_READING(hdr)));
		ASSERT3P(hash_lock, !=, NULL);
	}

	/* Record the encryption parameters carried by a protected BP. */
	if (BP_IS_PROTECTED(bp)) {
		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
		    hdr->b_crypt_hdr.b_iv);

		if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) {
			void *tmpbuf;

			/*
			 * For intent log blocks the MAC is decoded from the
			 * leading zil_chain_t of the data, not from the BP.
			 */
			tmpbuf = abd_borrow_buf_copy(zio->io_abd,
			    sizeof (zil_chain_t));
			zio_crypt_decode_mac_zil(tmpbuf,
			    hdr->b_crypt_hdr.b_mac);
			abd_return_buf(zio->io_abd, tmpbuf,
			    sizeof (zil_chain_t));
		} else {
			zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
		}
	}

	if (zio->io_error == 0) {
		/* byteswap if necessary */
		if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
			if (BP_GET_LEVEL(zio->io_bp) > 0) {
				/* Blocks above level 0 swap as uint64s. */
				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
			} else {
				hdr->b_l1hdr.b_byteswap =
				    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
			}
		} else {
			/* DMU_BSWAP_NUMFUNCS marks "no byteswap needed". */
			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
		}
	}

	arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED);
	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
		arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE);

	callback_list = hdr->b_l1hdr.b_acb;
	ASSERT3P(callback_list, !=, NULL);

	if (hash_lock && zio->io_error == 0 &&
	    hdr->b_l1hdr.b_state == arc_anon) {
		/*
		 * Only call arc_access on anonymous buffers.  This is because
		 * if we've issued an I/O for an evicted buffer, we've already
		 * called arc_access (to prevent any simultaneous readers from
		 * getting confused).
		 */
		arc_access(hdr, hash_lock);
	}

	/*
	 * If a read request has a callback (i.e. acb_done is not NULL), then we
	 * make a buf containing the data according to the parameters which were
	 * passed in. The implementation of arc_buf_alloc_impl() ensures that we
	 * aren't needlessly decompressing the data multiple times.
	 */
	int callback_cnt = 0;
	for (acb = callback_list; acb != NULL; acb = acb->acb_next) {
		if (!acb->acb_done)
			continue;

		callback_cnt++;

		/*
		 * Once the zio has failed (including a failure recorded by
		 * an earlier iteration), no further bufs are created.
		 */
		if (zio->io_error != 0)
			continue;

		int error = arc_buf_alloc_impl(hdr, zio->io_spa,
		    &acb->acb_zb, acb->acb_private, acb->acb_encrypted,
		    acb->acb_compressed, acb->acb_noauth, B_TRUE,
		    &acb->acb_buf);

		/*
		 * Assert non-speculative zios didn't fail because an
		 * encryption key wasn't loaded
		 */
		ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) ||
		    error != EACCES);

		/*
		 * If we failed to decrypt, report an error now (as the zio
		 * layer would have done if it had done the transforms).
		 */
		if (error == ECKSUM) {
			ASSERT(BP_IS_PROTECTED(bp));
			error = SET_ERROR(EIO);
			if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) {
				spa_log_error(zio->io_spa, &acb->acb_zb);
				zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION,
				    zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0);
			}
		}

		if (error != 0) {
			/*
			 * Decompression failed.  Set io_error
			 * so that when we call acb_done (below),
			 * we will indicate that the read failed.
			 * Note that in the unusual case where one
			 * callback is compressed and another
			 * uncompressed, we will mark all of them
			 * as failed, even though the uncompressed
			 * one can't actually fail.  In this case,
			 * the hdr will not be anonymous, because
			 * if there are multiple callbacks, it's
			 * because multiple threads found the same
			 * arc buf in the hash table.
			 */
			zio->io_error = error;
		}
	}

	/*
	 * If there are multiple callbacks, we must have the hash lock,
	 * because the only way for multiple threads to find this hdr is
	 * in the hash table.  This ensures that if there are multiple
	 * callbacks, the hdr is not anonymous.  If it were anonymous,
	 * we couldn't use arc_buf_destroy() in the error case below.
	 */
	ASSERT(callback_cnt < 2 || hash_lock != NULL);

	/*
	 * Detach the callback list from the header (we still hold it in
	 * callback_list) and mark the I/O as finished.
	 */
	hdr->b_l1hdr.b_acb = NULL;
	arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
	if (callback_cnt == 0)
		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));

	ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
	    callback_list != NULL);

	if (zio->io_error == 0) {
		arc_hdr_verify(hdr, zio->io_bp);
	} else {
		/*
		 * The read failed: flag the header, make it anonymous and
		 * take it out of the hash table.  It can be destroyed
		 * below if nothing holds a reference to it.
		 */
		arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
		if (hdr->b_l1hdr.b_state != arc_anon)
			arc_change_state(arc_anon, hdr, hash_lock);
		if (HDR_IN_HASH_TABLE(hdr))
			buf_hash_remove(hdr);
		freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/*
	 * Broadcast before we drop the hash_lock to avoid the possibility
	 * that the hdr (and hence the cv) might be freed before we get to
	 * the cv_broadcast().
	 */
	cv_broadcast(&hdr->b_l1hdr.b_cv);

	if (hash_lock != NULL) {
		mutex_exit(hash_lock);
	} else {
		/*
		 * This block was freed while we waited for the read to
		 * complete.  It has been removed from the hash table and
		 * moved to the anonymous state (so that it won't show up
		 * in the cache).
		 */
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
		freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
	}

	/* execute each callback and free its structure */
	while ((acb = callback_list) != NULL) {

		if (acb->acb_done != NULL) {
			if (zio->io_error != 0 && acb->acb_buf != NULL) {
				/*
				 * If arc_buf_alloc_impl() fails during
				 * decompression, the buf will still be
				 * allocated, and needs to be freed here.
				 */
				arc_buf_destroy(acb->acb_buf, acb->acb_private);
				acb->acb_buf = NULL;
			}
			/* On failure the callback sees a NULL buf. */
			acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
			    acb->acb_buf, acb->acb_private);
		}

		if (acb->acb_zio_dummy != NULL) {
			/* Propagate the result to the dummy zio and issue it. */
			acb->acb_zio_dummy->io_error = zio->io_error;
			zio_nowait(acb->acb_zio_dummy);
		}

		/* Advance before freeing the entry we just processed. */
		callback_list = acb->acb_next;
		kmem_free(acb, sizeof (arc_callback_t));
	}

	if (freeable)
		arc_hdr_destroy(hdr);
}
5889 
5890 /*
5891  * "Read" the block at the specified DVA (in bp) via the
5892  * cache.  If the block is found in the cache, invoke the provided
5893  * callback immediately and return.  Note that the `zio' parameter
5894  * in the callback will be NULL in this case, since no IO was
5895  * required.  If the block is not in the cache pass the read request
5896  * on to the spa with a substitute callback function, so that the
5897  * requested block will be added to the cache.
5898  *
5899  * If a read request arrives for a block that has a read in-progress,
5900  * either wait for the in-progress read to complete (and return the
5901  * results); or, if this is a read with a "done" func, add a record
5902  * to the read to invoke the "done" func when the read completes,
5903  * and return; or just return.
5904  *
5905  * arc_read_done() will invoke all the requested "done" functions
5906  * for readers of this block.
5907  */
5908 int
5909 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done,
5910     void *private, zio_priority_t priority, int zio_flags,
5911     arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
5912 {
5913 	arc_buf_hdr_t *hdr = NULL;
5914 	kmutex_t *hash_lock = NULL;
5915 	zio_t *rzio;
5916 	uint64_t guid = spa_load_guid(spa);
5917 	boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
5918 	boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
5919 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5920 	boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
5921 	    (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
5922 	int rc = 0;
5923 
5924 	ASSERT(!BP_IS_EMBEDDED(bp) ||
5925 	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
5926 
5927 top:
5928 	if (!BP_IS_EMBEDDED(bp)) {
5929 		/*
5930 		 * Embedded BP's have no DVA and require no I/O to "read".
5931 		 * Create an anonymous arc buf to back it.
5932 		 */
5933 		hdr = buf_hash_find(guid, bp, &hash_lock);
5934 	}
5935 
5936 	/*
5937 	 * Determine if we have an L1 cache hit or a cache miss. For simplicity
	 * we maintain encrypted data separately from compressed / uncompressed
5939 	 * data. If the user is requesting raw encrypted data and we don't have
5940 	 * that in the header we will read from disk to guarantee that we can
5941 	 * get it even if the encryption keys aren't loaded.
5942 	 */
5943 	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
5944 	    (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
5945 		arc_buf_t *buf = NULL;
5946 		*arc_flags |= ARC_FLAG_CACHED;
5947 
5948 		if (HDR_IO_IN_PROGRESS(hdr)) {
5949 			zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
5950 
5951 			ASSERT3P(head_zio, !=, NULL);
5952 			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
5953 			    priority == ZIO_PRIORITY_SYNC_READ) {
5954 				/*
5955 				 * This is a sync read that needs to wait for
5956 				 * an in-flight async read. Request that the
5957 				 * zio have its priority upgraded.
5958 				 */
5959 				zio_change_priority(head_zio, priority);
5960 				DTRACE_PROBE1(arc__async__upgrade__sync,
5961 				    arc_buf_hdr_t *, hdr);
5962 				ARCSTAT_BUMP(arcstat_async_upgrade_sync);
5963 			}
5964 			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
5965 				arc_hdr_clear_flags(hdr,
5966 				    ARC_FLAG_PREDICTIVE_PREFETCH);
5967 			}
5968 
5969 			if (*arc_flags & ARC_FLAG_WAIT) {
5970 				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
5971 				mutex_exit(hash_lock);
5972 				goto top;
5973 			}
5974 			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
5975 
5976 			if (done) {
5977 				arc_callback_t *acb = NULL;
5978 
5979 				acb = kmem_zalloc(sizeof (arc_callback_t),
5980 				    KM_SLEEP);
5981 				acb->acb_done = done;
5982 				acb->acb_private = private;
5983 				acb->acb_compressed = compressed_read;
5984 				acb->acb_encrypted = encrypted_read;
5985 				acb->acb_noauth = noauth_read;
5986 				acb->acb_zb = *zb;
5987 				if (pio != NULL)
5988 					acb->acb_zio_dummy = zio_null(pio,
5989 					    spa, NULL, NULL, NULL, zio_flags);
5990 
5991 				ASSERT3P(acb->acb_done, !=, NULL);
5992 				acb->acb_zio_head = head_zio;
5993 				acb->acb_next = hdr->b_l1hdr.b_acb;
5994 				hdr->b_l1hdr.b_acb = acb;
5995 				mutex_exit(hash_lock);
5996 				return (0);
5997 			}
5998 			mutex_exit(hash_lock);
5999 			return (0);
6000 		}
6001 
6002 		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
6003 		    hdr->b_l1hdr.b_state == arc_mfu);
6004 
6005 		if (done) {
6006 			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
6007 				/*
6008 				 * This is a demand read which does not have to
6009 				 * wait for i/o because we did a predictive
6010 				 * prefetch i/o for it, which has completed.
6011 				 */
6012 				DTRACE_PROBE1(
6013 				    arc__demand__hit__predictive__prefetch,
6014 				    arc_buf_hdr_t *, hdr);
6015 				ARCSTAT_BUMP(
6016 				    arcstat_demand_hit_predictive_prefetch);
6017 				arc_hdr_clear_flags(hdr,
6018 				    ARC_FLAG_PREDICTIVE_PREFETCH);
6019 			}
6020 
6021 			if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
6022 				ARCSTAT_BUMP(
6023 				    arcstat_demand_hit_prescient_prefetch);
6024 				arc_hdr_clear_flags(hdr,
6025 				    ARC_FLAG_PRESCIENT_PREFETCH);
6026 			}
6027 
6028 			ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
6029 
6030 			arc_hdr_verify_checksum(spa, hdr, bp);
6031 
6032 			/* Get a buf with the desired data in it. */
6033 			rc = arc_buf_alloc_impl(hdr, spa, zb, private,
6034 			    encrypted_read, compressed_read, noauth_read,
6035 			    B_TRUE, &buf);
6036 			if (rc == ECKSUM) {
6037 				/*
6038 				 * Convert authentication and decryption errors
6039 				 * to EIO (and generate an ereport if needed)
6040 				 * before leaving the ARC.
6041 				 */
6042 				rc = SET_ERROR(EIO);
6043 				if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) {
6044 					spa_log_error(spa, zb);
6045 					zfs_ereport_post(
6046 					    FM_EREPORT_ZFS_AUTHENTICATION,
6047 					    spa, NULL, zb, NULL, 0, 0);
6048 				}
6049 			}
6050 			if (rc != 0) {
6051 				(void) remove_reference(hdr, hash_lock,
6052 				    private);
6053 				arc_buf_destroy_impl(buf);
6054 				buf = NULL;
6055 			}
6056 			/* assert any errors weren't due to unloaded keys */
6057 			ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) ||
6058 			    rc != EACCES);
6059 		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
6060 		    zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
6061 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
6062 		}
6063 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
6064 		arc_access(hdr, hash_lock);
6065 		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
6066 			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
6067 		if (*arc_flags & ARC_FLAG_L2CACHE)
6068 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6069 		mutex_exit(hash_lock);
6070 		ARCSTAT_BUMP(arcstat_hits);
6071 		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
6072 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
6073 		    data, metadata, hits);
6074 
6075 		if (done)
6076 			done(NULL, zb, bp, buf, private);
6077 	} else {
6078 		uint64_t lsize = BP_GET_LSIZE(bp);
6079 		uint64_t psize = BP_GET_PSIZE(bp);
6080 		arc_callback_t *acb;
6081 		vdev_t *vd = NULL;
6082 		uint64_t addr = 0;
6083 		boolean_t devw = B_FALSE;
6084 		uint64_t size;
6085 		abd_t *hdr_abd;
6086 
6087 		if (hdr == NULL) {
6088 			/* this block is not in the cache */
6089 			arc_buf_hdr_t *exists = NULL;
6090 			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
6091 			hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize,
6092 			    BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type,
6093 			    encrypted_read);
6094 
6095 			if (!BP_IS_EMBEDDED(bp)) {
6096 				hdr->b_dva = *BP_IDENTITY(bp);
6097 				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
6098 				exists = buf_hash_insert(hdr, &hash_lock);
6099 			}
6100 			if (exists != NULL) {
6101 				/* somebody beat us to the hash insert */
6102 				mutex_exit(hash_lock);
6103 				buf_discard_identity(hdr);
6104 				arc_hdr_destroy(hdr);
6105 				goto top; /* restart the IO request */
6106 			}
6107 		} else {
6108 			/*
6109 			 * This block is in the ghost cache or encrypted data
6110 			 * was requested and we didn't have it. If it was
6111 			 * L2-only (and thus didn't have an L1 hdr),
6112 			 * we realloc the header to add an L1 hdr.
6113 			 */
6114 			if (!HDR_HAS_L1HDR(hdr)) {
6115 				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
6116 				    hdr_full_cache);
6117 			}
6118 
6119 			if (GHOST_STATE(hdr->b_l1hdr.b_state)) {
6120 				ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6121 				ASSERT(!HDR_HAS_RABD(hdr));
6122 				ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6123 				ASSERT0(zfs_refcount_count(
6124 				    &hdr->b_l1hdr.b_refcnt));
6125 				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
6126 				ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
6127 			} else if (HDR_IO_IN_PROGRESS(hdr)) {
6128 				/*
6129 				 * If this header already had an IO in progress
6130 				 * and we are performing another IO to fetch
6131 				 * encrypted data we must wait until the first
6132 				 * IO completes so as not to confuse
6133 				 * arc_read_done(). This should be very rare
6134 				 * and so the performance impact shouldn't
6135 				 * matter.
6136 				 */
6137 				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
6138 				mutex_exit(hash_lock);
6139 				goto top;
6140 			}
6141 
6142 			/*
6143 			 * This is a delicate dance that we play here.
6144 			 * This hdr might be in the ghost list so we access
6145 			 * it to move it out of the ghost list before we
6146 			 * initiate the read. If it's a prefetch then
6147 			 * it won't have a callback so we'll remove the
6148 			 * reference that arc_buf_alloc_impl() created. We
6149 			 * do this after we've called arc_access() to
6150 			 * avoid hitting an assert in remove_reference().
6151 			 */
6152 			arc_access(hdr, hash_lock);
6153 			arc_hdr_alloc_pabd(hdr, encrypted_read);
6154 		}
6155 
6156 		if (encrypted_read) {
6157 			ASSERT(HDR_HAS_RABD(hdr));
6158 			size = HDR_GET_PSIZE(hdr);
6159 			hdr_abd = hdr->b_crypt_hdr.b_rabd;
6160 			zio_flags |= ZIO_FLAG_RAW;
6161 		} else {
6162 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
6163 			size = arc_hdr_size(hdr);
6164 			hdr_abd = hdr->b_l1hdr.b_pabd;
6165 
6166 			if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) {
6167 				zio_flags |= ZIO_FLAG_RAW_COMPRESS;
6168 			}
6169 
6170 			/*
6171 			 * For authenticated bp's, we do not ask the ZIO layer
6172 			 * to authenticate them since this will cause the entire
6173 			 * IO to fail if the key isn't loaded. Instead, we
6174 			 * defer authentication until arc_buf_fill(), which will
6175 			 * verify the data when the key is available.
6176 			 */
6177 			if (BP_IS_AUTHENTICATED(bp))
6178 				zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
6179 		}
6180 
6181 		if (*arc_flags & ARC_FLAG_PREFETCH &&
6182 		    zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
6183 			arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
6184 		if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
6185 			arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
6186 
6187 		if (*arc_flags & ARC_FLAG_L2CACHE)
6188 			arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6189 		if (BP_IS_AUTHENTICATED(bp))
6190 			arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6191 		if (BP_GET_LEVEL(bp) > 0)
6192 			arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
6193 		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
6194 			arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
6195 		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
6196 
6197 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
6198 		acb->acb_done = done;
6199 		acb->acb_private = private;
6200 		acb->acb_compressed = compressed_read;
6201 		acb->acb_encrypted = encrypted_read;
6202 		acb->acb_noauth = noauth_read;
6203 		acb->acb_zb = *zb;
6204 
6205 		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6206 		hdr->b_l1hdr.b_acb = acb;
6207 		arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6208 
6209 		if (HDR_HAS_L2HDR(hdr) &&
6210 		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
6211 			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
6212 			addr = hdr->b_l2hdr.b_daddr;
6213 			/*
6214 			 * Lock out L2ARC device removal.
6215 			 */
6216 			if (vdev_is_dead(vd) ||
6217 			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
6218 				vd = NULL;
6219 		}
6220 
6221 		/*
6222 		 * We count both async reads and scrub IOs as asynchronous so
6223 		 * that both can be upgraded in the event of a cache hit while
6224 		 * the read IO is still in-flight.
6225 		 */
6226 		if (priority == ZIO_PRIORITY_ASYNC_READ ||
6227 		    priority == ZIO_PRIORITY_SCRUB)
6228 			arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6229 		else
6230 			arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
6231 
6232 		/*
6233 		 * At this point, we have a level 1 cache miss.  Try again in
6234 		 * L2ARC if possible.
6235 		 */
6236 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
6237 
6238 		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
6239 		    uint64_t, lsize, zbookmark_phys_t *, zb);
6240 		ARCSTAT_BUMP(arcstat_misses);
6241 		ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
6242 		    demand, prefetch, !HDR_ISTYPE_METADATA(hdr),
6243 		    data, metadata, misses);
6244 
6245 		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
6246 			/*
6247 			 * Read from the L2ARC if the following are true:
6248 			 * 1. The L2ARC vdev was previously cached.
6249 			 * 2. This buffer still has L2ARC metadata.
6250 			 * 3. This buffer isn't currently writing to the L2ARC.
6251 			 * 4. The L2ARC entry wasn't evicted, which may
6252 			 *    also have invalidated the vdev.
6253 			 * 5. This isn't a prefetch, or l2arc_noprefetch is not set.
6254 			 */
6255 			if (HDR_HAS_L2HDR(hdr) &&
6256 			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
6257 			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
6258 				l2arc_read_callback_t *cb;
6259 				abd_t *abd;
6260 				uint64_t asize;
6261 
6262 				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
6263 				ARCSTAT_BUMP(arcstat_l2_hits);
6264 
6265 				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
6266 				    KM_SLEEP);
6267 				cb->l2rcb_hdr = hdr;
6268 				cb->l2rcb_bp = *bp;
6269 				cb->l2rcb_zb = *zb;
6270 				cb->l2rcb_flags = zio_flags;
6271 
6272 				asize = vdev_psize_to_asize(vd, size);
6273 				if (asize != size) {
6274 					abd = abd_alloc_for_io(asize,
6275 					    HDR_ISTYPE_METADATA(hdr));
6276 					cb->l2rcb_abd = abd;
6277 				} else {
6278 					abd = hdr_abd;
6279 				}
6280 
6281 				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
6282 				    addr + asize <= vd->vdev_psize -
6283 				    VDEV_LABEL_END_SIZE);
6284 
6285 				/*
6286 				 * l2arc read.  The SCL_L2ARC lock will be
6287 				 * released by l2arc_read_done().
6288 				 * Issue a null zio if the underlying buffer
6289 				 * was squashed to zero size by compression.
6290 				 */
6291 				ASSERT3U(arc_hdr_get_compress(hdr), !=,
6292 				    ZIO_COMPRESS_EMPTY);
6293 				rzio = zio_read_phys(pio, vd, addr,
6294 				    asize, abd,
6295 				    ZIO_CHECKSUM_OFF,
6296 				    l2arc_read_done, cb, priority,
6297 				    zio_flags | ZIO_FLAG_DONT_CACHE |
6298 				    ZIO_FLAG_CANFAIL |
6299 				    ZIO_FLAG_DONT_PROPAGATE |
6300 				    ZIO_FLAG_DONT_RETRY, B_FALSE);
6301 				acb->acb_zio_head = rzio;
6302 
6303 				if (hash_lock != NULL)
6304 					mutex_exit(hash_lock);
6305 
6306 				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
6307 				    zio_t *, rzio);
6308 				ARCSTAT_INCR(arcstat_l2_read_bytes,
6309 				    HDR_GET_PSIZE(hdr));
6310 
6311 				if (*arc_flags & ARC_FLAG_NOWAIT) {
6312 					zio_nowait(rzio);
6313 					return (0);
6314 				}
6315 
6316 				ASSERT(*arc_flags & ARC_FLAG_WAIT);
6317 				if (zio_wait(rzio) == 0)
6318 					return (0);
6319 
6320 				/* l2arc read error; goto zio_read() */
6321 				if (hash_lock != NULL)
6322 					mutex_enter(hash_lock);
6323 			} else {
6324 				DTRACE_PROBE1(l2arc__miss,
6325 				    arc_buf_hdr_t *, hdr);
6326 				ARCSTAT_BUMP(arcstat_l2_misses);
6327 				if (HDR_L2_WRITING(hdr))
6328 					ARCSTAT_BUMP(arcstat_l2_rw_clash);
6329 				spa_config_exit(spa, SCL_L2ARC, vd);
6330 			}
6331 		} else {
6332 			if (vd != NULL)
6333 				spa_config_exit(spa, SCL_L2ARC, vd);
6334 			if (l2arc_ndev != 0) {
6335 				DTRACE_PROBE1(l2arc__miss,
6336 				    arc_buf_hdr_t *, hdr);
6337 				ARCSTAT_BUMP(arcstat_l2_misses);
6338 			}
6339 		}
6340 
6341 		rzio = zio_read(pio, spa, bp, hdr_abd, size,
6342 		    arc_read_done, hdr, priority, zio_flags, zb);
6343 		acb->acb_zio_head = rzio;
6344 
6345 		if (hash_lock != NULL)
6346 			mutex_exit(hash_lock);
6347 
6348 		if (*arc_flags & ARC_FLAG_WAIT)
6349 			return (zio_wait(rzio));
6350 
6351 		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
6352 		zio_nowait(rzio);
6353 	}
6354 	return (rc);
6355 }
6356 
6357 /*
6358  * Notify the arc that a block was freed, and thus will never be used again.
6359  */
6360 void
6361 arc_freed(spa_t *spa, const blkptr_t *bp)
6362 {
6363 	arc_buf_hdr_t *hdr;
6364 	kmutex_t *hash_lock;
6365 	uint64_t guid = spa_load_guid(spa);
6366 
6367 	ASSERT(!BP_IS_EMBEDDED(bp));
6368 
6369 	hdr = buf_hash_find(guid, bp, &hash_lock);
6370 	if (hdr == NULL)
6371 		return;
6372 
6373 	/*
6374 	 * We might be trying to free a block that is still doing I/O
6375 	 * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
6376 	 * dmu_sync-ed block). If this block is being prefetched, then it
6377 	 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
6378 	 * until the I/O completes. A block may also have a reference if it is
6379 	 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
6380 	 * have written the new block to its final resting place on disk but
6381 	 * without the dedup flag set. This would have left the hdr in the MRU
6382 	 * state and discoverable. When the txg finally syncs it detects that
6383 	 * the block was overridden in open context and issues an override I/O.
6384 	 * Since this is a dedup block, the override I/O will determine if the
6385 	 * block is already in the DDT. If so, then it will replace the io_bp
6386 	 * with the bp from the DDT and allow the I/O to finish. When the I/O
6387 	 * reaches the done callback, dbuf_write_override_done, it will
6388 	 * check to see if the io_bp and io_bp_override are identical.
6389 	 * If they are not, then it indicates that the bp was replaced with
6390 	 * the bp in the DDT and the override bp is freed. This allows
6391 	 * us to arrive here with a reference on a block that is being
6392 	 * freed. So if we have an I/O in progress, or a reference to
6393 	 * this hdr, then we don't destroy the hdr.
6394 	 */
6395 	if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
6396 	    zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
6397 		arc_change_state(arc_anon, hdr, hash_lock);
6398 		arc_hdr_destroy(hdr);
6399 		mutex_exit(hash_lock);
6400 	} else {
6401 		mutex_exit(hash_lock);
6402 	}
6403 
6404 }
6405 
6406 /*
6407  * Release this buffer from the cache, making it an anonymous buffer.  This
6408  * must be done after a read and prior to modifying the buffer contents.
6409  * If the buffer has more than one reference, we must make
6410  * a new hdr for the buffer.
6411  */
6412 void
6413 arc_release(arc_buf_t *buf, void *tag)
6414 {
6415 	arc_buf_hdr_t *hdr = buf->b_hdr;
6416 
6417 	/*
6418 	 * It would be nice to assert that if its DMU metadata (level >
6419 	 * 0 || it's the dnode file), then it must be syncing context.
6420 	 * But we don't know that information at this level.
6421 	 */
6422 
6423 	mutex_enter(&buf->b_evict_lock);
6424 
6425 	ASSERT(HDR_HAS_L1HDR(hdr));
6426 
6427 	/*
6428 	 * We don't grab the hash lock prior to this check, because if
6429 	 * the buffer's header is in the arc_anon state, it won't be
6430 	 * linked into the hash table.
6431 	 */
6432 	if (hdr->b_l1hdr.b_state == arc_anon) {
6433 		mutex_exit(&buf->b_evict_lock);
6434 		/*
6435 		 * If we are called from dmu_convert_mdn_block_to_raw(),
6436 		 * a write might be in progress.  This is OK because
6437 		 * the caller won't change the content of this buffer,
6438 		 * only the flags (via arc_convert_to_raw()).
6439 		 */
6440 		/* ASSERT(!HDR_IO_IN_PROGRESS(hdr)); */
6441 		ASSERT(!HDR_IN_HASH_TABLE(hdr));
6442 		ASSERT(!HDR_HAS_L2HDR(hdr));
6443 		ASSERT(HDR_EMPTY(hdr));
6444 
6445 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6446 		ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
6447 		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
6448 
6449 		hdr->b_l1hdr.b_arc_access = 0;
6450 
6451 		/*
6452 		 * If the buf is being overridden then it may already
6453 		 * have a hdr that is not empty.
6454 		 */
6455 		buf_discard_identity(hdr);
6456 		arc_buf_thaw(buf);
6457 
6458 		return;
6459 	}
6460 
6461 	kmutex_t *hash_lock = HDR_LOCK(hdr);
6462 	mutex_enter(hash_lock);
6463 
6464 	/*
6465 	 * This assignment is only valid as long as the hash_lock is
6466 	 * held, we must be careful not to reference state or the
6467 	 * b_state field after dropping the lock.
6468 	 */
6469 	arc_state_t *state = hdr->b_l1hdr.b_state;
6470 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6471 	ASSERT3P(state, !=, arc_anon);
6472 
6473 	/* this buffer is not on any list */
6474 	ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0);
6475 
6476 	if (HDR_HAS_L2HDR(hdr)) {
6477 		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6478 
6479 		/*
6480 		 * We have to recheck this conditional again now that
6481 		 * we're holding the l2ad_mtx to prevent a race with
6482 		 * another thread which might be concurrently calling
6483 		 * l2arc_evict(). In that case, l2arc_evict() might have
6484 		 * destroyed the header's L2 portion as we were waiting
6485 		 * to acquire the l2ad_mtx.
6486 		 */
6487 		if (HDR_HAS_L2HDR(hdr))
6488 			arc_hdr_l2hdr_destroy(hdr);
6489 
6490 		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
6491 	}
6492 
6493 	/*
6494 	 * Do we have more than one buf?
6495 	 */
6496 	if (hdr->b_l1hdr.b_bufcnt > 1) {
6497 		arc_buf_hdr_t *nhdr;
6498 		uint64_t spa = hdr->b_spa;
6499 		uint64_t psize = HDR_GET_PSIZE(hdr);
6500 		uint64_t lsize = HDR_GET_LSIZE(hdr);
6501 		boolean_t protected = HDR_PROTECTED(hdr);
6502 		enum zio_compress compress = arc_hdr_get_compress(hdr);
6503 		arc_buf_contents_t type = arc_buf_type(hdr);
6504 		VERIFY3U(hdr->b_type, ==, type);
6505 
6506 		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
6507 		(void) remove_reference(hdr, hash_lock, tag);
6508 
6509 		if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
6510 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6511 			ASSERT(ARC_BUF_LAST(buf));
6512 		}
6513 
6514 		/*
6515 		 * Pull the data off of this hdr and attach it to
6516 		 * a new anonymous hdr. Also find the last buffer
6517 		 * in the hdr's buffer list.
6518 		 */
6519 		arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
6520 		ASSERT3P(lastbuf, !=, NULL);
6521 
6522 		/*
6523 		 * If the current arc_buf_t and the hdr are sharing their data
6524 		 * buffer, then we must stop sharing that block.
6525 		 */
6526 		if (arc_buf_is_shared(buf)) {
6527 			ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
6528 			VERIFY(!arc_buf_is_shared(lastbuf));
6529 
6530 			/*
6531 			 * First, sever the block sharing relationship between
6532 			 * buf and the arc_buf_hdr_t.
6533 			 */
6534 			arc_unshare_buf(hdr, buf);
6535 
6536 			/*
6537 			 * Now we need to recreate the hdr's b_pabd. Since we
6538 			 * have lastbuf handy, we try to share with it, but if
6539 			 * we can't then we allocate a new b_pabd and copy the
6540 			 * data from buf into it.
6541 			 */
6542 			if (arc_can_share(hdr, lastbuf)) {
6543 				arc_share_buf(hdr, lastbuf);
6544 			} else {
6545 				arc_hdr_alloc_pabd(hdr, B_FALSE);
6546 				abd_copy_from_buf(hdr->b_l1hdr.b_pabd,
6547 				    buf->b_data, psize);
6548 			}
6549 			VERIFY3P(lastbuf->b_data, !=, NULL);
6550 		} else if (HDR_SHARED_DATA(hdr)) {
6551 			/*
6552 			 * Uncompressed shared buffers are always at the end
6553 			 * of the list. Compressed buffers don't have the
6554 			 * same requirements. This makes it hard to
6555 			 * simply assert that the lastbuf is shared so
6556 			 * we rely on the hdr's compression flags to determine
6557 			 * if we have a compressed, shared buffer.
6558 			 */
6559 			ASSERT(arc_buf_is_shared(lastbuf) ||
6560 			    arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
6561 			ASSERT(!ARC_BUF_SHARED(buf));
6562 		}
6563 		ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
6564 		ASSERT3P(state, !=, arc_l2c_only);
6565 
6566 		(void) zfs_refcount_remove_many(&state->arcs_size,
6567 		    arc_buf_size(buf), buf);
6568 
6569 		if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
6570 			ASSERT3P(state, !=, arc_l2c_only);
6571 			(void) zfs_refcount_remove_many(
6572 			    &state->arcs_esize[type],
6573 			    arc_buf_size(buf), buf);
6574 		}
6575 
6576 		hdr->b_l1hdr.b_bufcnt -= 1;
6577 		if (ARC_BUF_ENCRYPTED(buf))
6578 			hdr->b_crypt_hdr.b_ebufcnt -= 1;
6579 
6580 		arc_cksum_verify(buf);
6581 		arc_buf_unwatch(buf);
6582 
6583 		/* if this is the last uncompressed buf free the checksum */
6584 		if (!arc_hdr_has_uncompressed_buf(hdr))
6585 			arc_cksum_free(hdr);
6586 
6587 		mutex_exit(hash_lock);
6588 
6589 		/*
6590 		 * Allocate a new hdr. The new hdr will contain a b_pabd
6591 		 * buffer which will be freed in arc_write().
6592 		 */
6593 		nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
6594 		    compress, type, HDR_HAS_RABD(hdr));
6595 		ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
6596 		ASSERT0(nhdr->b_l1hdr.b_bufcnt);
6597 		ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
6598 		VERIFY3U(nhdr->b_type, ==, type);
6599 		ASSERT(!HDR_SHARED_DATA(nhdr));
6600 
6601 		nhdr->b_l1hdr.b_buf = buf;
6602 		nhdr->b_l1hdr.b_bufcnt = 1;
6603 		if (ARC_BUF_ENCRYPTED(buf))
6604 			nhdr->b_crypt_hdr.b_ebufcnt = 1;
6605 		(void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
6606 		buf->b_hdr = nhdr;
6607 
6608 		mutex_exit(&buf->b_evict_lock);
6609 		(void) zfs_refcount_add_many(&arc_anon->arcs_size,
6610 		    arc_buf_size(buf), buf);
6611 	} else {
6612 		mutex_exit(&buf->b_evict_lock);
6613 		ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
6614 		/* protected by hash lock, or hdr is on arc_anon */
6615 		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
6616 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6617 		arc_change_state(arc_anon, hdr, hash_lock);
6618 		hdr->b_l1hdr.b_arc_access = 0;
6619 
6620 		mutex_exit(hash_lock);
6621 		buf_discard_identity(hdr);
6622 		arc_buf_thaw(buf);
6623 	}
6624 }
6625 
6626 int
6627 arc_released(arc_buf_t *buf)
6628 {
6629 	int released;
6630 
6631 	mutex_enter(&buf->b_evict_lock);
6632 	released = (buf->b_data != NULL &&
6633 	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
6634 	mutex_exit(&buf->b_evict_lock);
6635 	return (released);
6636 }
6637 
#ifdef ZFS_DEBUG
/*
 * Return the reference count of buf's hdr, sampled while holding buf's
 * b_evict_lock.  Only built when ZFS_DEBUG is defined.
 */
int
arc_referenced(arc_buf_t *buf)
{
	int nrefs;

	mutex_enter(&buf->b_evict_lock);
	nrefs = zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt);
	mutex_exit(&buf->b_evict_lock);

	return (nrefs);
}
#endif
6650 
6651 static void
6652 arc_write_ready(zio_t *zio)
6653 {
6654 	arc_write_callback_t *callback = zio->io_private;
6655 	arc_buf_t *buf = callback->awcb_buf;
6656 	arc_buf_hdr_t *hdr = buf->b_hdr;
6657 	blkptr_t *bp = zio->io_bp;
6658 	uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
6659 
6660 	ASSERT(HDR_HAS_L1HDR(hdr));
6661 	ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
6662 	ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
6663 
6664 	/*
6665 	 * If we're reexecuting this zio because the pool suspended, then
6666 	 * cleanup any state that was previously set the first time the
6667 	 * callback was invoked.
6668 	 */
6669 	if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
6670 		arc_cksum_free(hdr);
6671 		arc_buf_unwatch(buf);
6672 		if (hdr->b_l1hdr.b_pabd != NULL) {
6673 			if (arc_buf_is_shared(buf)) {
6674 				arc_unshare_buf(hdr, buf);
6675 			} else {
6676 				arc_hdr_free_pabd(hdr, B_FALSE);
6677 			}
6678 		}
6679 
6680 		if (HDR_HAS_RABD(hdr))
6681 			arc_hdr_free_pabd(hdr, B_TRUE);
6682 	}
6683 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6684 	ASSERT(!HDR_HAS_RABD(hdr));
6685 	ASSERT(!HDR_SHARED_DATA(hdr));
6686 	ASSERT(!arc_buf_is_shared(buf));
6687 
6688 	callback->awcb_ready(zio, buf, callback->awcb_private);
6689 
6690 	if (HDR_IO_IN_PROGRESS(hdr))
6691 		ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
6692 
6693 	arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6694 
6695 	if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
6696 		hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
6697 
6698 	if (BP_IS_PROTECTED(bp)) {
6699 		/* ZIL blocks are written through zio_rewrite */
6700 		ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
6701 		ASSERT(HDR_PROTECTED(hdr));
6702 
6703 		if (BP_SHOULD_BYTESWAP(bp)) {
6704 			if (BP_GET_LEVEL(bp) > 0) {
6705 				hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
6706 			} else {
6707 				hdr->b_l1hdr.b_byteswap =
6708 				    DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
6709 			}
6710 		} else {
6711 			hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
6712 		}
6713 
6714 		hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
6715 		hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
6716 		zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
6717 		    hdr->b_crypt_hdr.b_iv);
6718 		zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
6719 	}
6720 
6721 	/*
6722 	 * If this block was written for raw encryption but the zio layer
6723 	 * ended up only authenticating it, adjust the buffer flags now.
6724 	 */
6725 	if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
6726 		arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
6727 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6728 		if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
6729 			buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
6730 	} else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
6731 		buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
6732 		buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
6733 	}
6734 
6735 	/* this must be done after the buffer flags are adjusted */
6736 	arc_cksum_compute(buf);
6737 
6738 	enum zio_compress compress;
6739 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
6740 		compress = ZIO_COMPRESS_OFF;
6741 	} else {
6742 		ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
6743 		compress = BP_GET_COMPRESS(bp);
6744 	}
6745 	HDR_SET_PSIZE(hdr, psize);
6746 	arc_hdr_set_compress(hdr, compress);
6747 
6748 	if (zio->io_error != 0 || psize == 0)
6749 		goto out;
6750 
6751 	/*
6752 	 * Fill the hdr with data. If the buffer is encrypted we have no choice
6753 	 * but to copy the data into b_rabd. If the hdr is compressed, the data
6754 	 * we want is available from the zio, otherwise we can take it from
6755 	 * the buf.
6756 	 *
6757 	 * We might be able to share the buf's data with the hdr here. However,
6758 	 * doing so would cause the ARC to be full of linear ABDs if we write a
6759 	 * lot of shareable data. As a compromise, we check whether scattered
6760 	 * ABDs are allowed, and assume that if they are then the user wants
6761 	 * the ARC to be primarily filled with them regardless of the data being
6762 	 * written. Therefore, if they're allowed then we allocate one and copy
6763 	 * the data into it; otherwise, we share the data directly if we can.
6764 	 */
6765 	if (ARC_BUF_ENCRYPTED(buf)) {
6766 		ASSERT3U(psize, >, 0);
6767 		ASSERT(ARC_BUF_COMPRESSED(buf));
6768 		arc_hdr_alloc_pabd(hdr, B_TRUE);
6769 		abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6770 	} else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) {
6771 		/*
6772 		 * Ideally, we would always copy the io_abd into b_pabd, but the
6773 		 * user may have disabled compressed ARC, thus we must check the
6774 		 * hdr's compression setting rather than the io_bp's.
6775 		 */
6776 		if (BP_IS_ENCRYPTED(bp)) {
6777 			ASSERT3U(psize, >, 0);
6778 			arc_hdr_alloc_pabd(hdr, B_TRUE);
6779 			abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize);
6780 		} else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF &&
6781 		    !ARC_BUF_COMPRESSED(buf)) {
6782 			ASSERT3U(psize, >, 0);
6783 			arc_hdr_alloc_pabd(hdr, B_FALSE);
6784 			abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize);
6785 		} else {
6786 			ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr));
6787 			arc_hdr_alloc_pabd(hdr, B_FALSE);
6788 			abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data,
6789 			    arc_buf_size(buf));
6790 		}
6791 	} else {
6792 		ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd));
6793 		ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf));
6794 		ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
6795 		arc_share_buf(hdr, buf);
6796 	}
6797 
6798 out:
6799 	arc_hdr_verify(hdr, bp);
6800 }
6801 
6802 static void
6803 arc_write_children_ready(zio_t *zio)
6804 {
6805 	arc_write_callback_t *callback = zio->io_private;
6806 	arc_buf_t *buf = callback->awcb_buf;
6807 
6808 	callback->awcb_children_ready(zio, buf, callback->awcb_private);
6809 }
6810 
6811 /*
6812  * The SPA calls this callback for each physical write that happens on behalf
6813  * of a logical write.  See the comment in dbuf_write_physdone() for details.
6814  */
6815 static void
6816 arc_write_physdone(zio_t *zio)
6817 {
6818 	arc_write_callback_t *cb = zio->io_private;
6819 	if (cb->awcb_physdone != NULL)
6820 		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
6821 }
6822 
6823 static void
6824 arc_write_done(zio_t *zio)
6825 {
6826 	arc_write_callback_t *callback = zio->io_private;
6827 	arc_buf_t *buf = callback->awcb_buf;
6828 	arc_buf_hdr_t *hdr = buf->b_hdr;
6829 
6830 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6831 
6832 	if (zio->io_error == 0) {
6833 		arc_hdr_verify(hdr, zio->io_bp);
6834 
6835 		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
6836 			buf_discard_identity(hdr);
6837 		} else {
6838 			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
6839 			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
6840 		}
6841 	} else {
6842 		ASSERT(HDR_EMPTY(hdr));
6843 	}
6844 
6845 	/*
6846 	 * If the block to be written was all-zero or compressed enough to be
6847 	 * embedded in the BP, no write was performed so there will be no
6848 	 * dva/birth/checksum.  The buffer must therefore remain anonymous
6849 	 * (and uncached).
6850 	 */
6851 	if (!HDR_EMPTY(hdr)) {
6852 		arc_buf_hdr_t *exists;
6853 		kmutex_t *hash_lock;
6854 
6855 		ASSERT3U(zio->io_error, ==, 0);
6856 
6857 		arc_cksum_verify(buf);
6858 
6859 		exists = buf_hash_insert(hdr, &hash_lock);
6860 		if (exists != NULL) {
6861 			/*
6862 			 * This can only happen if we overwrite for
6863 			 * sync-to-convergence, because we remove
6864 			 * buffers from the hash table when we arc_free().
6865 			 */
6866 			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
6867 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
6868 					panic("bad overwrite, hdr=%p exists=%p",
6869 					    (void *)hdr, (void *)exists);
6870 				ASSERT(zfs_refcount_is_zero(
6871 				    &exists->b_l1hdr.b_refcnt));
6872 				arc_change_state(arc_anon, exists, hash_lock);
6873 				mutex_exit(hash_lock);
6874 				arc_hdr_destroy(exists);
6875 				exists = buf_hash_insert(hdr, &hash_lock);
6876 				ASSERT3P(exists, ==, NULL);
6877 			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
6878 				/* nopwrite */
6879 				ASSERT(zio->io_prop.zp_nopwrite);
6880 				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
6881 					panic("bad nopwrite, hdr=%p exists=%p",
6882 					    (void *)hdr, (void *)exists);
6883 			} else {
6884 				/* Dedup */
6885 				ASSERT(hdr->b_l1hdr.b_bufcnt == 1);
6886 				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
6887 				ASSERT(BP_GET_DEDUP(zio->io_bp));
6888 				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
6889 			}
6890 		}
6891 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6892 		/* if it's not anon, we are doing a scrub */
6893 		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
6894 			arc_access(hdr, hash_lock);
6895 		mutex_exit(hash_lock);
6896 	} else {
6897 		arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
6898 	}
6899 
6900 	ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
6901 	callback->awcb_done(zio, buf, callback->awcb_private);
6902 
6903 	abd_put(zio->io_abd);
6904 	kmem_free(callback, sizeof (arc_write_callback_t));
6905 }
6906 
6907 zio_t *
6908 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
6909     boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready,
6910     arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone,
6911     arc_write_done_func_t *done, void *private, zio_priority_t priority,
6912     int zio_flags, const zbookmark_phys_t *zb)
6913 {
6914 	arc_buf_hdr_t *hdr = buf->b_hdr;
6915 	arc_write_callback_t *callback;
6916 	zio_t *zio;
6917 	zio_prop_t localprop = *zp;
6918 
6919 	ASSERT3P(ready, !=, NULL);
6920 	ASSERT3P(done, !=, NULL);
6921 	ASSERT(!HDR_IO_ERROR(hdr));
6922 	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
6923 	ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
6924 	ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0);
6925 	if (l2arc)
6926 		arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
6927 
6928 	if (ARC_BUF_ENCRYPTED(buf)) {
6929 		ASSERT(ARC_BUF_COMPRESSED(buf));
6930 		localprop.zp_encrypt = B_TRUE;
6931 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
6932 		/* CONSTCOND */
6933 		localprop.zp_byteorder =
6934 		    (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ?
6935 		    ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER;
6936 		bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt,
6937 		    ZIO_DATA_SALT_LEN);
6938 		bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv,
6939 		    ZIO_DATA_IV_LEN);
6940 		bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac,
6941 		    ZIO_DATA_MAC_LEN);
6942 		if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) {
6943 			localprop.zp_nopwrite = B_FALSE;
6944 			localprop.zp_copies =
6945 			    MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1);
6946 		}
6947 		zio_flags |= ZIO_FLAG_RAW;
6948 	} else if (ARC_BUF_COMPRESSED(buf)) {
6949 		ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf));
6950 		localprop.zp_compress = HDR_GET_COMPRESS(hdr);
6951 		zio_flags |= ZIO_FLAG_RAW_COMPRESS;
6952 	}
6953 
6954 	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
6955 	callback->awcb_ready = ready;
6956 	callback->awcb_children_ready = children_ready;
6957 	callback->awcb_physdone = physdone;
6958 	callback->awcb_done = done;
6959 	callback->awcb_private = private;
6960 	callback->awcb_buf = buf;
6961 
6962 	/*
6963 	 * The hdr's b_pabd is now stale, free it now. A new data block
6964 	 * will be allocated when the zio pipeline calls arc_write_ready().
6965 	 */
6966 	if (hdr->b_l1hdr.b_pabd != NULL) {
6967 		/*
6968 		 * If the buf is currently sharing the data block with
6969 		 * the hdr then we need to break that relationship here.
6970 		 * The hdr will remain with a NULL data pointer and the
6971 		 * buf will take sole ownership of the block.
6972 		 */
6973 		if (arc_buf_is_shared(buf)) {
6974 			arc_unshare_buf(hdr, buf);
6975 		} else {
6976 			arc_hdr_free_pabd(hdr, B_FALSE);
6977 		}
6978 		VERIFY3P(buf->b_data, !=, NULL);
6979 	}
6980 
6981 	if (HDR_HAS_RABD(hdr))
6982 		arc_hdr_free_pabd(hdr, B_TRUE);
6983 
6984 	if (!(zio_flags & ZIO_FLAG_RAW))
6985 		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
6986 
6987 	ASSERT(!arc_buf_is_shared(buf));
6988 	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
6989 
6990 	zio = zio_write(pio, spa, txg, bp,
6991 	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
6992 	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
6993 	    (children_ready != NULL) ? arc_write_children_ready : NULL,
6994 	    arc_write_physdone, arc_write_done, callback,
6995 	    priority, zio_flags, zb);
6996 
6997 	return (zio);
6998 }
6999 
7000 static int
7001 arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
7002 {
7003 #ifdef _KERNEL
7004 	uint64_t available_memory = ptob(freemem);
7005 
7006 #if defined(__i386)
7007 	available_memory =
7008 	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
7009 #endif
7010 
7011 	if (freemem > physmem * arc_lotsfree_percent / 100)
7012 		return (0);
7013 
7014 	if (txg > spa->spa_lowmem_last_txg) {
7015 		spa->spa_lowmem_last_txg = txg;
7016 		spa->spa_lowmem_page_load = 0;
7017 	}
7018 	/*
7019 	 * If we are in pageout, we know that memory is already tight,
7020 	 * the arc is already going to be evicting, so we just want to
7021 	 * continue to let page writes occur as quickly as possible.
7022 	 */
7023 	if (curproc == proc_pageout) {
7024 		if (spa->spa_lowmem_page_load >
7025 		    MAX(ptob(minfree), available_memory) / 4)
7026 			return (SET_ERROR(ERESTART));
7027 		/* Note: reserve is inflated, so we deflate */
7028 		atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8);
7029 		return (0);
7030 	} else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) {
7031 		/* memory is low, delay before restarting */
7032 		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
7033 		return (SET_ERROR(EAGAIN));
7034 	}
7035 	spa->spa_lowmem_page_load = 0;
7036 #endif /* _KERNEL */
7037 	return (0);
7038 }
7039 
7040 void
7041 arc_tempreserve_clear(uint64_t reserve)
7042 {
7043 	atomic_add_64(&arc_tempreserve, -reserve);
7044 	ASSERT((int64_t)arc_tempreserve >= 0);
7045 }
7046 
7047 int
7048 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
7049 {
7050 	int error;
7051 	uint64_t anon_size;
7052 
7053 	if (reserve > arc_c/4 && !arc_no_grow)
7054 		arc_c = MIN(arc_c_max, reserve * 4);
7055 	if (reserve > arc_c)
7056 		return (SET_ERROR(ENOMEM));
7057 
7058 	/*
7059 	 * Don't count loaned bufs as in flight dirty data to prevent long
7060 	 * network delays from blocking transactions that are ready to be
7061 	 * assigned to a txg.
7062 	 */
7063 
7064 	/* assert that it has not wrapped around */
7065 	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
7066 
7067 	anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
7068 	    arc_loaned_bytes), 0);
7069 
7070 	/*
7071 	 * Writes will, almost always, require additional memory allocations
7072 	 * in order to compress/encrypt/etc the data.  We therefore need to
7073 	 * make sure that there is sufficient available memory for this.
7074 	 */
7075 	error = arc_memory_throttle(spa, reserve, txg);
7076 	if (error != 0)
7077 		return (error);
7078 
7079 	/*
7080 	 * Throttle writes when the amount of dirty data in the cache
7081 	 * gets too large.  We try to keep the cache less than half full
7082 	 * of dirty blocks so that our sync times don't grow too large.
7083 	 *
7084 	 * In the case of one pool being built on another pool, we want
7085 	 * to make sure we don't end up throttling the lower (backing)
7086 	 * pool when the upper pool is the majority contributor to dirty
7087 	 * data. To insure we make forward progress during throttling, we
7088 	 * also check the current pool's net dirty data and only throttle
7089 	 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
7090 	 * data in the cache.
7091 	 *
7092 	 * Note: if two requests come in concurrently, we might let them
7093 	 * both succeed, when one of them should fail.  Not a huge deal.
7094 	 */
7095 	uint64_t total_dirty = reserve + arc_tempreserve + anon_size;
7096 	uint64_t spa_dirty_anon = spa_dirty_data(spa);
7097 
7098 	if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 &&
7099 	    anon_size > arc_c * zfs_arc_anon_limit_percent / 100 &&
7100 	    spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) {
7101 		uint64_t meta_esize =
7102 		    zfs_refcount_count(
7103 		    &arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7104 		uint64_t data_esize =
7105 		    zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7106 		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
7107 		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
7108 		    arc_tempreserve >> 10, meta_esize >> 10,
7109 		    data_esize >> 10, reserve >> 10, arc_c >> 10);
7110 		return (SET_ERROR(ERESTART));
7111 	}
7112 	atomic_add_64(&arc_tempreserve, reserve);
7113 	return (0);
7114 }
7115 
7116 static void
7117 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
7118     kstat_named_t *evict_data, kstat_named_t *evict_metadata)
7119 {
7120 	size->value.ui64 = zfs_refcount_count(&state->arcs_size);
7121 	evict_data->value.ui64 =
7122 	    zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]);
7123 	evict_metadata->value.ui64 =
7124 	    zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]);
7125 }
7126 
7127 static int
7128 arc_kstat_update(kstat_t *ksp, int rw)
7129 {
7130 	arc_stats_t *as = ksp->ks_data;
7131 
7132 	if (rw == KSTAT_WRITE) {
7133 		return (EACCES);
7134 	} else {
7135 		arc_kstat_update_state(arc_anon,
7136 		    &as->arcstat_anon_size,
7137 		    &as->arcstat_anon_evictable_data,
7138 		    &as->arcstat_anon_evictable_metadata);
7139 		arc_kstat_update_state(arc_mru,
7140 		    &as->arcstat_mru_size,
7141 		    &as->arcstat_mru_evictable_data,
7142 		    &as->arcstat_mru_evictable_metadata);
7143 		arc_kstat_update_state(arc_mru_ghost,
7144 		    &as->arcstat_mru_ghost_size,
7145 		    &as->arcstat_mru_ghost_evictable_data,
7146 		    &as->arcstat_mru_ghost_evictable_metadata);
7147 		arc_kstat_update_state(arc_mfu,
7148 		    &as->arcstat_mfu_size,
7149 		    &as->arcstat_mfu_evictable_data,
7150 		    &as->arcstat_mfu_evictable_metadata);
7151 		arc_kstat_update_state(arc_mfu_ghost,
7152 		    &as->arcstat_mfu_ghost_size,
7153 		    &as->arcstat_mfu_ghost_evictable_data,
7154 		    &as->arcstat_mfu_ghost_evictable_metadata);
7155 
7156 		ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
7157 		ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used);
7158 		ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size);
7159 		ARCSTAT(arcstat_metadata_size) =
7160 		    aggsum_value(&astat_metadata_size);
7161 		ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size);
7162 		ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size);
7163 		ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size);
7164 	}
7165 
7166 	return (0);
7167 }
7168 
7169 /*
7170  * This function *must* return indices evenly distributed between all
7171  * sublists of the multilist. This is needed due to how the ARC eviction
7172  * code is laid out; arc_evict_state() assumes ARC buffers are evenly
7173  * distributed between all sublists and uses this assumption when
7174  * deciding which sublist to evict from and how much to evict from it.
7175  */
7176 unsigned int
7177 arc_state_multilist_index_func(multilist_t *ml, void *obj)
7178 {
7179 	arc_buf_hdr_t *hdr = obj;
7180 
7181 	/*
7182 	 * We rely on b_dva to generate evenly distributed index
7183 	 * numbers using buf_hash below. So, as an added precaution,
7184 	 * let's make sure we never add empty buffers to the arc lists.
7185 	 */
7186 	ASSERT(!HDR_EMPTY(hdr));
7187 
7188 	/*
7189 	 * The assumption here, is the hash value for a given
7190 	 * arc_buf_hdr_t will remain constant throughout its lifetime
7191 	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
7192 	 * Thus, we don't need to store the header's sublist index
7193 	 * on insertion, as this index can be recalculated on removal.
7194 	 *
7195 	 * Also, the low order bits of the hash value are thought to be
7196 	 * distributed evenly. Otherwise, in the case that the multilist
7197 	 * has a power of two number of sublists, each sublists' usage
7198 	 * would not be evenly distributed.
7199 	 */
7200 	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
7201 	    multilist_get_num_sublists(ml));
7202 }
7203 
7204 static void
7205 arc_state_init(void)
7206 {
7207 	arc_anon = &ARC_anon;
7208 	arc_mru = &ARC_mru;
7209 	arc_mru_ghost = &ARC_mru_ghost;
7210 	arc_mfu = &ARC_mfu;
7211 	arc_mfu_ghost = &ARC_mfu_ghost;
7212 	arc_l2c_only = &ARC_l2c_only;
7213 
7214 	arc_mru->arcs_list[ARC_BUFC_METADATA] =
7215 	    multilist_create(sizeof (arc_buf_hdr_t),
7216 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7217 	    arc_state_multilist_index_func);
7218 	arc_mru->arcs_list[ARC_BUFC_DATA] =
7219 	    multilist_create(sizeof (arc_buf_hdr_t),
7220 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7221 	    arc_state_multilist_index_func);
7222 	arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
7223 	    multilist_create(sizeof (arc_buf_hdr_t),
7224 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7225 	    arc_state_multilist_index_func);
7226 	arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
7227 	    multilist_create(sizeof (arc_buf_hdr_t),
7228 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7229 	    arc_state_multilist_index_func);
7230 	arc_mfu->arcs_list[ARC_BUFC_METADATA] =
7231 	    multilist_create(sizeof (arc_buf_hdr_t),
7232 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7233 	    arc_state_multilist_index_func);
7234 	arc_mfu->arcs_list[ARC_BUFC_DATA] =
7235 	    multilist_create(sizeof (arc_buf_hdr_t),
7236 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7237 	    arc_state_multilist_index_func);
7238 	arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
7239 	    multilist_create(sizeof (arc_buf_hdr_t),
7240 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7241 	    arc_state_multilist_index_func);
7242 	arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
7243 	    multilist_create(sizeof (arc_buf_hdr_t),
7244 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7245 	    arc_state_multilist_index_func);
7246 	arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
7247 	    multilist_create(sizeof (arc_buf_hdr_t),
7248 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7249 	    arc_state_multilist_index_func);
7250 	arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
7251 	    multilist_create(sizeof (arc_buf_hdr_t),
7252 	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
7253 	    arc_state_multilist_index_func);
7254 
7255 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7256 	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7257 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7258 	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7259 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7260 	zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7261 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7262 	zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7263 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7264 	zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7265 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7266 	zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7267 
7268 	zfs_refcount_create(&arc_anon->arcs_size);
7269 	zfs_refcount_create(&arc_mru->arcs_size);
7270 	zfs_refcount_create(&arc_mru_ghost->arcs_size);
7271 	zfs_refcount_create(&arc_mfu->arcs_size);
7272 	zfs_refcount_create(&arc_mfu_ghost->arcs_size);
7273 	zfs_refcount_create(&arc_l2c_only->arcs_size);
7274 
7275 	aggsum_init(&arc_meta_used, 0);
7276 	aggsum_init(&arc_size, 0);
7277 	aggsum_init(&astat_data_size, 0);
7278 	aggsum_init(&astat_metadata_size, 0);
7279 	aggsum_init(&astat_hdr_size, 0);
7280 	aggsum_init(&astat_other_size, 0);
7281 	aggsum_init(&astat_l2_hdr_size, 0);
7282 }
7283 
7284 static void
7285 arc_state_fini(void)
7286 {
7287 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
7288 	zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
7289 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
7290 	zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
7291 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]);
7292 	zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]);
7293 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]);
7294 	zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]);
7295 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]);
7296 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]);
7297 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]);
7298 	zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]);
7299 
7300 	zfs_refcount_destroy(&arc_anon->arcs_size);
7301 	zfs_refcount_destroy(&arc_mru->arcs_size);
7302 	zfs_refcount_destroy(&arc_mru_ghost->arcs_size);
7303 	zfs_refcount_destroy(&arc_mfu->arcs_size);
7304 	zfs_refcount_destroy(&arc_mfu_ghost->arcs_size);
7305 	zfs_refcount_destroy(&arc_l2c_only->arcs_size);
7306 
7307 	multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]);
7308 	multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
7309 	multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]);
7310 	multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
7311 	multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]);
7312 	multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
7313 	multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]);
7314 	multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
7315 	multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
7316 	multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
7317 
7318 	aggsum_fini(&arc_meta_used);
7319 	aggsum_fini(&arc_size);
7320 	aggsum_fini(&astat_data_size);
7321 	aggsum_fini(&astat_metadata_size);
7322 	aggsum_fini(&astat_hdr_size);
7323 	aggsum_fini(&astat_other_size);
7324 	aggsum_fini(&astat_l2_hdr_size);
7325 
7326 }
7327 
7328 uint64_t
7329 arc_max_bytes(void)
7330 {
7331 	return (arc_c_max);
7332 }
7333 
7334 void
7335 arc_init(void)
7336 {
7337 	/*
7338 	 * allmem is "all memory that we could possibly use".
7339 	 */
7340 #ifdef _KERNEL
7341 	uint64_t allmem = ptob(physmem - swapfs_minfree);
7342 #else
7343 	uint64_t allmem = (physmem * PAGESIZE) / 2;
7344 #endif
7345 	mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL);
7346 	cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL);
7347 
7348 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
7349 	arc_c_min = MAX(allmem / 32, 64 << 20);
7350 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
7351 	if (allmem >= 1 << 30)
7352 		arc_c_max = allmem - (1 << 30);
7353 	else
7354 		arc_c_max = arc_c_min;
7355 	arc_c_max = MAX(allmem * 3 / 4, arc_c_max);
7356 
7357 	/*
7358 	 * In userland, there's only the memory pressure that we artificially
7359 	 * create (see arc_available_memory()).  Don't let arc_c get too
7360 	 * small, because it can cause transactions to be larger than
7361 	 * arc_c, causing arc_tempreserve_space() to fail.
7362 	 */
7363 #ifndef _KERNEL
7364 	arc_c_min = arc_c_max / 2;
7365 #endif
7366 
7367 	/*
7368 	 * Allow the tunables to override our calculations if they are
7369 	 * reasonable (ie. over 64MB)
7370 	 */
7371 	if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem) {
7372 		arc_c_max = zfs_arc_max;
7373 		arc_c_min = MIN(arc_c_min, arc_c_max);
7374 	}
7375 	if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max)
7376 		arc_c_min = zfs_arc_min;
7377 
7378 	arc_c = arc_c_max;
7379 	arc_p = (arc_c >> 1);
7380 
7381 	/* limit meta-data to 1/4 of the arc capacity */
7382 	arc_meta_limit = arc_c_max / 4;
7383 
7384 #ifdef _KERNEL
7385 	/*
7386 	 * Metadata is stored in the kernel's heap.  Don't let us
7387 	 * use more than half the heap for the ARC.
7388 	 */
7389 	arc_meta_limit = MIN(arc_meta_limit,
7390 	    vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2);
7391 #endif
7392 
7393 	/* Allow the tunable to override if it is reasonable */
7394 	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
7395 		arc_meta_limit = zfs_arc_meta_limit;
7396 
7397 	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
7398 		arc_c_min = arc_meta_limit / 2;
7399 
7400 	if (zfs_arc_meta_min > 0) {
7401 		arc_meta_min = zfs_arc_meta_min;
7402 	} else {
7403 		arc_meta_min = arc_c_min / 2;
7404 	}
7405 
7406 	if (zfs_arc_grow_retry > 0)
7407 		arc_grow_retry = zfs_arc_grow_retry;
7408 
7409 	if (zfs_arc_shrink_shift > 0)
7410 		arc_shrink_shift = zfs_arc_shrink_shift;
7411 
7412 	/*
7413 	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
7414 	 */
7415 	if (arc_no_grow_shift >= arc_shrink_shift)
7416 		arc_no_grow_shift = arc_shrink_shift - 1;
7417 
7418 	if (zfs_arc_p_min_shift > 0)
7419 		arc_p_min_shift = zfs_arc_p_min_shift;
7420 
7421 	/* if kmem_flags are set, lets try to use less memory */
7422 	if (kmem_debugging())
7423 		arc_c = arc_c / 2;
7424 	if (arc_c < arc_c_min)
7425 		arc_c = arc_c_min;
7426 
7427 	arc_state_init();
7428 
7429 	/*
7430 	 * The arc must be "uninitialized", so that hdr_recl() (which is
7431 	 * registered by buf_init()) will not access arc_reap_zthr before
7432 	 * it is created.
7433 	 */
7434 	ASSERT(!arc_initialized);
7435 	buf_init();
7436 
7437 	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
7438 	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
7439 
7440 	if (arc_ksp != NULL) {
7441 		arc_ksp->ks_data = &arc_stats;
7442 		arc_ksp->ks_update = arc_kstat_update;
7443 		kstat_install(arc_ksp);
7444 	}
7445 
7446 	arc_adjust_zthr = zthr_create(arc_adjust_cb_check,
7447 	    arc_adjust_cb, NULL);
7448 	arc_reap_zthr = zthr_create_timer(arc_reap_cb_check,
7449 	    arc_reap_cb, NULL, SEC2NSEC(1));
7450 
7451 	arc_initialized = B_TRUE;
7452 	arc_warm = B_FALSE;
7453 
7454 	/*
7455 	 * Calculate maximum amount of dirty data per pool.
7456 	 *
7457 	 * If it has been set by /etc/system, take that.
7458 	 * Otherwise, use a percentage of physical memory defined by
7459 	 * zfs_dirty_data_max_percent (default 10%) with a cap at
7460 	 * zfs_dirty_data_max_max (default 4GB).
7461 	 */
7462 	if (zfs_dirty_data_max == 0) {
7463 		zfs_dirty_data_max = physmem * PAGESIZE *
7464 		    zfs_dirty_data_max_percent / 100;
7465 		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
7466 		    zfs_dirty_data_max_max);
7467 	}
7468 }
7469 
7470 void
7471 arc_fini(void)
7472 {
7473 	/* Use B_TRUE to ensure *all* buffers are evicted */
7474 	arc_flush(NULL, B_TRUE);
7475 
7476 	arc_initialized = B_FALSE;
7477 
7478 	if (arc_ksp != NULL) {
7479 		kstat_delete(arc_ksp);
7480 		arc_ksp = NULL;
7481 	}
7482 
7483 	(void) zthr_cancel(arc_adjust_zthr);
7484 	zthr_destroy(arc_adjust_zthr);
7485 
7486 	(void) zthr_cancel(arc_reap_zthr);
7487 	zthr_destroy(arc_reap_zthr);
7488 
7489 	mutex_destroy(&arc_adjust_lock);
7490 	cv_destroy(&arc_adjust_waiters_cv);
7491 
7492 	/*
7493 	 * buf_fini() must proceed arc_state_fini() because buf_fin() may
7494 	 * trigger the release of kmem magazines, which can callback to
7495 	 * arc_space_return() which accesses aggsums freed in act_state_fini().
7496 	 */
7497 	buf_fini();
7498 	arc_state_fini();
7499 
7500 	ASSERT0(arc_loaned_bytes);
7501 }
7502 
7503 /*
7504  * Level 2 ARC
7505  *
7506  * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
7507  * It uses dedicated storage devices to hold cached data, which are populated
7508  * using large infrequent writes.  The main role of this cache is to boost
7509  * the performance of random read workloads.  The intended L2ARC devices
7510  * include short-stroked disks, solid state disks, and other media with
7511  * substantially faster read latency than disk.
7512  *
7513  *                 +-----------------------+
7514  *                 |         ARC           |
7515  *                 +-----------------------+
7516  *                    |         ^     ^
7517  *                    |         |     |
7518  *      l2arc_feed_thread()    arc_read()
7519  *                    |         |     |
7520  *                    |  l2arc read   |
7521  *                    V         |     |
7522  *               +---------------+    |
7523  *               |     L2ARC     |    |
7524  *               +---------------+    |
7525  *                   |    ^           |
7526  *          l2arc_write() |           |
7527  *                   |    |           |
7528  *                   V    |           |
7529  *                 +-------+      +-------+
7530  *                 | vdev  |      | vdev  |
7531  *                 | cache |      | cache |
7532  *                 +-------+      +-------+
7533  *                 +=========+     .-----.
7534  *                 :  L2ARC  :    |-_____-|
7535  *                 : devices :    | Disks |
7536  *                 +=========+    `-_____-'
7537  *
7538  * Read requests are satisfied from the following sources, in order:
7539  *
7540  *	1) ARC
7541  *	2) vdev cache of L2ARC devices
7542  *	3) L2ARC devices
7543  *	4) vdev cache of disks
7544  *	5) disks
7545  *
7546  * Some L2ARC device types exhibit extremely slow write performance.
 * To accommodate this, there are some significant differences between
7548  * the L2ARC and traditional cache design:
7549  *
7550  * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
7551  * the ARC behave as usual, freeing buffers and placing headers on ghost
7552  * lists.  The ARC does not send buffers to the L2ARC during eviction as
7553  * this would add inflated write latencies for all ARC memory pressure.
7554  *
7555  * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
7556  * It does this by periodically scanning buffers from the eviction-end of
7557  * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
7558  * not already there. It scans until a headroom of buffers is satisfied,
7559  * which itself is a buffer for ARC eviction. If a compressible buffer is
7560  * found during scanning and selected for writing to an L2ARC device, we
7561  * temporarily boost scanning headroom during the next scan cycle to make
7562  * sure we adapt to compression effects (which might significantly reduce
7563  * the data volume we write to L2ARC). The thread that does this is
7564  * l2arc_feed_thread(), illustrated below; example sizes are included to
7565  * provide a better sense of ratio than this diagram:
7566  *
7567  *	       head -->                        tail
7568  *	        +---------------------+----------+
7569  *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
7570  *	        +---------------------+----------+   |   o L2ARC eligible
7571  *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
7572  *	        +---------------------+----------+   |
7573  *	             15.9 Gbytes      ^ 32 Mbytes    |
7574  *	                           headroom          |
7575  *	                                      l2arc_feed_thread()
7576  *	                                             |
7577  *	                 l2arc write hand <--[oooo]--'
7578  *	                         |           8 Mbyte
7579  *	                         |          write max
7580  *	                         V
7581  *		  +==============================+
7582  *	L2ARC dev |####|#|###|###|    |####| ... |
7583  *	          +==============================+
7584  *	                     32 Gbytes
7585  *
7586  * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
7587  * evicted, then the L2ARC has cached a buffer much sooner than it probably
7588  * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
7589  * safe to say that this is an uncommon case, since buffers at the end of
7590  * the ARC lists have moved there due to inactivity.
7591  *
7592  * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
7593  * then the L2ARC simply misses copying some buffers.  This serves as a
7594  * pressure valve to prevent heavy read workloads from both stalling the ARC
7595  * with waits and clogging the L2ARC with writes.  This also helps prevent
7596  * the potential for the L2ARC to churn if it attempts to cache content too
7597  * quickly, such as during backups of the entire pool.
7598  *
7599  * 5. After system boot and before the ARC has filled main memory, there are
7600  * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
7601  * lists can remain mostly static.  Instead of searching from tail of these
7602  * lists as pictured, the l2arc_feed_thread() will search from the list heads
7603  * for eligible buffers, greatly increasing its chance of finding them.
7604  *
7605  * The L2ARC device write speed is also boosted during this time so that
7606  * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
7607  * there are no L2ARC reads, and no fear of degrading read performance
7608  * through increased writes.
7609  *
7610  * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
7611  * the vdev queue can aggregate them into larger and fewer writes.  Each
7612  * device is written to in a rotor fashion, sweeping writes through
7613  * available space then repeating.
7614  *
7615  * 7. The L2ARC does not store dirty content.  It never needs to flush
7616  * write buffers back to disk based storage.
7617  *
7618  * 8. If an ARC buffer is written (and dirtied) which also exists in the
7619  * L2ARC, the now stale L2ARC buffer is immediately dropped.
7620  *
7621  * The performance of the L2ARC can be tweaked by a number of tunables, which
7622  * may be necessary for different workloads:
7623  *
7624  *	l2arc_write_max		max write bytes per interval
7625  *	l2arc_write_boost	extra write bytes during device warmup
7626  *	l2arc_noprefetch	skip caching prefetched buffers
7627  *	l2arc_headroom		number of max device writes to precache
7628  *	l2arc_headroom_boost	when we find compressed buffers during ARC
7629  *				scanning, we multiply headroom by this
7630  *				percentage factor for the next scan cycle,
7631  *				since more compressed buffers are likely to
7632  *				be present
7633  *	l2arc_feed_secs		seconds between L2ARC writing
7634  *
7635  * Tunables may be removed or added as future performance improvements are
7636  * integrated, and also may become zpool properties.
7637  *
7638  * There are three key functions that control how the L2ARC warms up:
7639  *
7640  *	l2arc_write_eligible()	check if a buffer is eligible to cache
7641  *	l2arc_write_size()	calculate how much to write
7642  *	l2arc_write_interval()	calculate sleep delay between writes
7643  *
7644  * These three functions determine what to write, how much, and how quickly
7645  * to send writes.
7646  */
7647 
7648 static boolean_t
7649 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
7650 {
7651 	/*
7652 	 * A buffer is *not* eligible for the L2ARC if it:
7653 	 * 1. belongs to a different spa.
7654 	 * 2. is already cached on the L2ARC.
7655 	 * 3. has an I/O in progress (it may be an incomplete read).
7656 	 * 4. is flagged not eligible (zfs property).
7657 	 */
7658 	if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
7659 	    HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
7660 		return (B_FALSE);
7661 
7662 	return (B_TRUE);
7663 }
7664 
7665 static uint64_t
7666 l2arc_write_size(void)
7667 {
7668 	uint64_t size;
7669 
7670 	/*
7671 	 * Make sure our globals have meaningful values in case the user
7672 	 * altered them.
7673 	 */
7674 	size = l2arc_write_max;
7675 	if (size == 0) {
7676 		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
7677 		    "be greater than zero, resetting it to the default (%d)",
7678 		    L2ARC_WRITE_SIZE);
7679 		size = l2arc_write_max = L2ARC_WRITE_SIZE;
7680 	}
7681 
7682 	if (arc_warm == B_FALSE)
7683 		size += l2arc_write_boost;
7684 
7685 	return (size);
7686 
7687 }
7688 
7689 static clock_t
7690 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
7691 {
7692 	clock_t interval, next, now;
7693 
7694 	/*
7695 	 * If the ARC lists are busy, increase our write rate; if the
7696 	 * lists are stale, idle back.  This is achieved by checking
7697 	 * how much we previously wrote - if it was more than half of
7698 	 * what we wanted, schedule the next write much sooner.
7699 	 */
7700 	if (l2arc_feed_again && wrote > (wanted / 2))
7701 		interval = (hz * l2arc_feed_min_ms) / 1000;
7702 	else
7703 		interval = hz * l2arc_feed_secs;
7704 
7705 	now = ddi_get_lbolt();
7706 	next = MAX(now, MIN(now + interval, began + interval));
7707 
7708 	return (next);
7709 }
7710 
7711 /*
7712  * Cycle through L2ARC devices.  This is how L2ARC load balances.
7713  * If a device is returned, this also returns holding the spa config lock.
7714  */
7715 static l2arc_dev_t *
7716 l2arc_dev_get_next(void)
7717 {
7718 	l2arc_dev_t *first, *next = NULL;
7719 
7720 	/*
7721 	 * Lock out the removal of spas (spa_namespace_lock), then removal
7722 	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
7723 	 * both locks will be dropped and a spa config lock held instead.
7724 	 */
7725 	mutex_enter(&spa_namespace_lock);
7726 	mutex_enter(&l2arc_dev_mtx);
7727 
7728 	/* if there are no vdevs, there is nothing to do */
7729 	if (l2arc_ndev == 0)
7730 		goto out;
7731 
7732 	first = NULL;
7733 	next = l2arc_dev_last;
7734 	do {
7735 		/* loop around the list looking for a non-faulted vdev */
7736 		if (next == NULL) {
7737 			next = list_head(l2arc_dev_list);
7738 		} else {
7739 			next = list_next(l2arc_dev_list, next);
7740 			if (next == NULL)
7741 				next = list_head(l2arc_dev_list);
7742 		}
7743 
7744 		/* if we have come back to the start, bail out */
7745 		if (first == NULL)
7746 			first = next;
7747 		else if (next == first)
7748 			break;
7749 
7750 	} while (vdev_is_dead(next->l2ad_vdev));
7751 
7752 	/* if we were unable to find any usable vdevs, return NULL */
7753 	if (vdev_is_dead(next->l2ad_vdev))
7754 		next = NULL;
7755 
7756 	l2arc_dev_last = next;
7757 
7758 out:
7759 	mutex_exit(&l2arc_dev_mtx);
7760 
7761 	/*
7762 	 * Grab the config lock to prevent the 'next' device from being
7763 	 * removed while we are writing to it.
7764 	 */
7765 	if (next != NULL)
7766 		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
7767 	mutex_exit(&spa_namespace_lock);
7768 
7769 	return (next);
7770 }
7771 
7772 /*
7773  * Free buffers that were tagged for destruction.
7774  */
7775 static void
7776 l2arc_do_free_on_write()
7777 {
7778 	list_t *buflist;
7779 	l2arc_data_free_t *df, *df_prev;
7780 
7781 	mutex_enter(&l2arc_free_on_write_mtx);
7782 	buflist = l2arc_free_on_write;
7783 
7784 	for (df = list_tail(buflist); df; df = df_prev) {
7785 		df_prev = list_prev(buflist, df);
7786 		ASSERT3P(df->l2df_abd, !=, NULL);
7787 		abd_free(df->l2df_abd);
7788 		list_remove(buflist, df);
7789 		kmem_free(df, sizeof (l2arc_data_free_t));
7790 	}
7791 
7792 	mutex_exit(&l2arc_free_on_write_mtx);
7793 }
7794 
7795 /*
7796  * A write to a cache device has completed.  Update all headers to allow
7797  * reads from these buffers to begin.
7798  */
7799 static void
7800 l2arc_write_done(zio_t *zio)
7801 {
7802 	l2arc_write_callback_t *cb;
7803 	l2arc_dev_t *dev;
7804 	list_t *buflist;
7805 	arc_buf_hdr_t *head, *hdr, *hdr_prev;
7806 	kmutex_t *hash_lock;
7807 	int64_t bytes_dropped = 0;
7808 
7809 	cb = zio->io_private;
7810 	ASSERT3P(cb, !=, NULL);
7811 	dev = cb->l2wcb_dev;
7812 	ASSERT3P(dev, !=, NULL);
7813 	head = cb->l2wcb_head;
7814 	ASSERT3P(head, !=, NULL);
7815 	buflist = &dev->l2ad_buflist;
7816 	ASSERT3P(buflist, !=, NULL);
7817 	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
7818 	    l2arc_write_callback_t *, cb);
7819 
7820 	if (zio->io_error != 0)
7821 		ARCSTAT_BUMP(arcstat_l2_writes_error);
7822 
7823 	/*
7824 	 * All writes completed, or an error was hit.
7825 	 */
7826 top:
7827 	mutex_enter(&dev->l2ad_mtx);
7828 	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
7829 		hdr_prev = list_prev(buflist, hdr);
7830 
7831 		hash_lock = HDR_LOCK(hdr);
7832 
7833 		/*
7834 		 * We cannot use mutex_enter or else we can deadlock
7835 		 * with l2arc_write_buffers (due to swapping the order
7836 		 * the hash lock and l2ad_mtx are taken).
7837 		 */
7838 		if (!mutex_tryenter(hash_lock)) {
7839 			/*
7840 			 * Missed the hash lock. We must retry so we
7841 			 * don't leave the ARC_FLAG_L2_WRITING bit set.
7842 			 */
7843 			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
7844 
7845 			/*
7846 			 * We don't want to rescan the headers we've
7847 			 * already marked as having been written out, so
7848 			 * we reinsert the head node so we can pick up
7849 			 * where we left off.
7850 			 */
7851 			list_remove(buflist, head);
7852 			list_insert_after(buflist, hdr, head);
7853 
7854 			mutex_exit(&dev->l2ad_mtx);
7855 
7856 			/*
7857 			 * We wait for the hash lock to become available
7858 			 * to try and prevent busy waiting, and increase
7859 			 * the chance we'll be able to acquire the lock
7860 			 * the next time around.
7861 			 */
7862 			mutex_enter(hash_lock);
7863 			mutex_exit(hash_lock);
7864 			goto top;
7865 		}
7866 
7867 		/*
7868 		 * We could not have been moved into the arc_l2c_only
7869 		 * state while in-flight due to our ARC_FLAG_L2_WRITING
7870 		 * bit being set. Let's just ensure that's being enforced.
7871 		 */
7872 		ASSERT(HDR_HAS_L1HDR(hdr));
7873 
7874 		if (zio->io_error != 0) {
7875 			/*
7876 			 * Error - drop L2ARC entry.
7877 			 */
7878 			list_remove(buflist, hdr);
7879 			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
7880 
7881 			uint64_t psize = HDR_GET_PSIZE(hdr);
7882 			ARCSTAT_INCR(arcstat_l2_psize, -psize);
7883 			ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
7884 
7885 			bytes_dropped +=
7886 			    vdev_psize_to_asize(dev->l2ad_vdev, psize);
7887 			(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
7888 			    arc_hdr_size(hdr), hdr);
7889 		}
7890 
7891 		/*
7892 		 * Allow ARC to begin reads and ghost list evictions to
7893 		 * this L2ARC entry.
7894 		 */
7895 		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
7896 
7897 		mutex_exit(hash_lock);
7898 	}
7899 
7900 	atomic_inc_64(&l2arc_writes_done);
7901 	list_remove(buflist, head);
7902 	ASSERT(!HDR_HAS_L1HDR(head));
7903 	kmem_cache_free(hdr_l2only_cache, head);
7904 	mutex_exit(&dev->l2ad_mtx);
7905 
7906 	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
7907 
7908 	l2arc_do_free_on_write();
7909 
7910 	kmem_free(cb, sizeof (l2arc_write_callback_t));
7911 }
7912 
7913 static int
7914 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
7915 {
7916 	int ret;
7917 	spa_t *spa = zio->io_spa;
7918 	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
7919 	blkptr_t *bp = zio->io_bp;
7920 	uint8_t salt[ZIO_DATA_SALT_LEN];
7921 	uint8_t iv[ZIO_DATA_IV_LEN];
7922 	uint8_t mac[ZIO_DATA_MAC_LEN];
7923 	boolean_t no_crypt = B_FALSE;
7924 
7925 	/*
7926 	 * ZIL data is never be written to the L2ARC, so we don't need
7927 	 * special handling for its unique MAC storage.
7928 	 */
7929 	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
7930 	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
7931 	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
7932 
7933 	/*
7934 	 * If the data was encrypted, decrypt it now. Note that
7935 	 * we must check the bp here and not the hdr, since the
7936 	 * hdr does not have its encryption parameters updated
7937 	 * until arc_read_done().
7938 	 */
7939 	if (BP_IS_ENCRYPTED(bp)) {
7940 		abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
7941 
7942 		zio_crypt_decode_params_bp(bp, salt, iv);
7943 		zio_crypt_decode_mac_bp(bp, mac);
7944 
7945 		ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
7946 		    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
7947 		    salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
7948 		    hdr->b_l1hdr.b_pabd, &no_crypt);
7949 		if (ret != 0) {
7950 			arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
7951 			goto error;
7952 		}
7953 
7954 		/*
7955 		 * If we actually performed decryption, replace b_pabd
7956 		 * with the decrypted data. Otherwise we can just throw
7957 		 * our decryption buffer away.
7958 		 */
7959 		if (!no_crypt) {
7960 			arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
7961 			    arc_hdr_size(hdr), hdr);
7962 			hdr->b_l1hdr.b_pabd = eabd;
7963 			zio->io_abd = eabd;
7964 		} else {
7965 			arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
7966 		}
7967 	}
7968 
7969 	/*
7970 	 * If the L2ARC block was compressed, but ARC compression
7971 	 * is disabled we decompress the data into a new buffer and
7972 	 * replace the existing data.
7973 	 */
7974 	if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
7975 	    !HDR_COMPRESSION_ENABLED(hdr)) {
7976 		abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr);
7977 		void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr));
7978 
7979 		ret = zio_decompress_data(HDR_GET_COMPRESS(hdr),
7980 		    hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr),
7981 		    HDR_GET_LSIZE(hdr));
7982 		if (ret != 0) {
7983 			abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
7984 			arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr);
7985 			goto error;
7986 		}
7987 
7988 		abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr));
7989 		arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd,
7990 		    arc_hdr_size(hdr), hdr);
7991 		hdr->b_l1hdr.b_pabd = cabd;
7992 		zio->io_abd = cabd;
7993 		zio->io_size = HDR_GET_LSIZE(hdr);
7994 	}
7995 
7996 	return (0);
7997 
7998 error:
7999 	return (ret);
8000 }
8001 
8002 
8003 /*
8004  * A read to a cache device completed.  Validate buffer contents before
8005  * handing over to the regular ARC routines.
8006  */
8007 static void
8008 l2arc_read_done(zio_t *zio)
8009 {
8010 	int tfm_error = 0;
8011 	l2arc_read_callback_t *cb = zio->io_private;
8012 	arc_buf_hdr_t *hdr;
8013 	kmutex_t *hash_lock;
8014 	boolean_t valid_cksum;
8015 	boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) &&
8016 	    (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT));
8017 
8018 	ASSERT3P(zio->io_vd, !=, NULL);
8019 	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
8020 
8021 	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
8022 
8023 	ASSERT3P(cb, !=, NULL);
8024 	hdr = cb->l2rcb_hdr;
8025 	ASSERT3P(hdr, !=, NULL);
8026 
8027 	hash_lock = HDR_LOCK(hdr);
8028 	mutex_enter(hash_lock);
8029 	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
8030 
8031 	/*
8032 	 * If the data was read into a temporary buffer,
8033 	 * move it and free the buffer.
8034 	 */
8035 	if (cb->l2rcb_abd != NULL) {
8036 		ASSERT3U(arc_hdr_size(hdr), <, zio->io_size);
8037 		if (zio->io_error == 0) {
8038 			if (using_rdata) {
8039 				abd_copy(hdr->b_crypt_hdr.b_rabd,
8040 				    cb->l2rcb_abd, arc_hdr_size(hdr));
8041 			} else {
8042 				abd_copy(hdr->b_l1hdr.b_pabd,
8043 				    cb->l2rcb_abd, arc_hdr_size(hdr));
8044 			}
8045 		}
8046 
8047 		/*
8048 		 * The following must be done regardless of whether
8049 		 * there was an error:
8050 		 * - free the temporary buffer
8051 		 * - point zio to the real ARC buffer
8052 		 * - set zio size accordingly
8053 		 * These are required because zio is either re-used for
8054 		 * an I/O of the block in the case of the error
8055 		 * or the zio is passed to arc_read_done() and it
8056 		 * needs real data.
8057 		 */
8058 		abd_free(cb->l2rcb_abd);
8059 		zio->io_size = zio->io_orig_size = arc_hdr_size(hdr);
8060 
8061 		if (using_rdata) {
8062 			ASSERT(HDR_HAS_RABD(hdr));
8063 			zio->io_abd = zio->io_orig_abd =
8064 			    hdr->b_crypt_hdr.b_rabd;
8065 		} else {
8066 			ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
8067 			zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd;
8068 		}
8069 	}
8070 
8071 	ASSERT3P(zio->io_abd, !=, NULL);
8072 
8073 	/*
8074 	 * Check this survived the L2ARC journey.
8075 	 */
8076 	ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd ||
8077 	    (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd));
8078 	zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
8079 	zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
8080 
8081 	valid_cksum = arc_cksum_is_equal(hdr, zio);
8082 
8083 	/*
8084 	 * b_rabd will always match the data as it exists on disk if it is
8085 	 * being used. Therefore if we are reading into b_rabd we do not
8086 	 * attempt to untransform the data.
8087 	 */
8088 	if (valid_cksum && !using_rdata)
8089 		tfm_error = l2arc_untransform(zio, cb);
8090 
8091 	if (valid_cksum && tfm_error == 0 && zio->io_error == 0 &&
8092 	    !HDR_L2_EVICTED(hdr)) {
8093 		mutex_exit(hash_lock);
8094 		zio->io_private = hdr;
8095 		arc_read_done(zio);
8096 	} else {
8097 		mutex_exit(hash_lock);
8098 		/*
8099 		 * Buffer didn't survive caching.  Increment stats and
8100 		 * reissue to the original storage device.
8101 		 */
8102 		if (zio->io_error != 0) {
8103 			ARCSTAT_BUMP(arcstat_l2_io_error);
8104 		} else {
8105 			zio->io_error = SET_ERROR(EIO);
8106 		}
8107 		if (!valid_cksum || tfm_error != 0)
8108 			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
8109 
8110 		/*
8111 		 * If there's no waiter, issue an async i/o to the primary
8112 		 * storage now.  If there *is* a waiter, the caller must
8113 		 * issue the i/o in a context where it's OK to block.
8114 		 */
8115 		if (zio->io_waiter == NULL) {
8116 			zio_t *pio = zio_unique_parent(zio);
8117 			void *abd = (using_rdata) ?
8118 			    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;
8119 
8120 			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
8121 
8122 			zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp,
8123 			    abd, zio->io_size, arc_read_done,
8124 			    hdr, zio->io_priority, cb->l2rcb_flags,
8125 			    &cb->l2rcb_zb));
8126 		}
8127 	}
8128 
8129 	kmem_free(cb, sizeof (l2arc_read_callback_t));
8130 }
8131 
8132 /*
8133  * This is the list priority from which the L2ARC will search for pages to
8134  * cache.  This is used within loops (0..3) to cycle through lists in the
8135  * desired order.  This order can have a significant effect on cache
8136  * performance.
8137  *
8138  * Currently the metadata lists are hit first, MFU then MRU, followed by
8139  * the data lists.  This function returns a locked list, and also returns
8140  * the lock pointer.
8141  */
8142 static multilist_sublist_t *
8143 l2arc_sublist_lock(int list_num)
8144 {
8145 	multilist_t *ml = NULL;
8146 	unsigned int idx;
8147 
8148 	ASSERT(list_num >= 0 && list_num <= 3);
8149 
8150 	switch (list_num) {
8151 	case 0:
8152 		ml = arc_mfu->arcs_list[ARC_BUFC_METADATA];
8153 		break;
8154 	case 1:
8155 		ml = arc_mru->arcs_list[ARC_BUFC_METADATA];
8156 		break;
8157 	case 2:
8158 		ml = arc_mfu->arcs_list[ARC_BUFC_DATA];
8159 		break;
8160 	case 3:
8161 		ml = arc_mru->arcs_list[ARC_BUFC_DATA];
8162 		break;
8163 	}
8164 
8165 	/*
8166 	 * Return a randomly-selected sublist. This is acceptable
8167 	 * because the caller feeds only a little bit of data for each
8168 	 * call (8MB). Subsequent calls will result in different
8169 	 * sublists being selected.
8170 	 */
8171 	idx = multilist_get_random_index(ml);
8172 	return (multilist_sublist_lock(ml, idx));
8173 }
8174 
8175 /*
8176  * Evict buffers from the device write hand to the distance specified in
8177  * bytes.  This distance may span populated buffers, it may span nothing.
8178  * This is clearing a region on the L2ARC device ready for writing.
8179  * If the 'all' boolean is set, every buffer is evicted.
8180  */
8181 static void
8182 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
8183 {
8184 	list_t *buflist;
8185 	arc_buf_hdr_t *hdr, *hdr_prev;
8186 	kmutex_t *hash_lock;
8187 	uint64_t taddr;
8188 
8189 	buflist = &dev->l2ad_buflist;
8190 
8191 	if (!all && dev->l2ad_first) {
8192 		/*
8193 		 * This is the first sweep through the device.  There is
8194 		 * nothing to evict.
8195 		 */
8196 		return;
8197 	}
8198 
8199 	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
8200 		/*
8201 		 * When nearing the end of the device, evict to the end
8202 		 * before the device write hand jumps to the start.
8203 		 */
8204 		taddr = dev->l2ad_end;
8205 	} else {
8206 		taddr = dev->l2ad_hand + distance;
8207 	}
8208 	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
8209 	    uint64_t, taddr, boolean_t, all);
8210 
8211 top:
8212 	mutex_enter(&dev->l2ad_mtx);
8213 	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
8214 		hdr_prev = list_prev(buflist, hdr);
8215 
8216 		hash_lock = HDR_LOCK(hdr);
8217 
8218 		/*
8219 		 * We cannot use mutex_enter or else we can deadlock
8220 		 * with l2arc_write_buffers (due to swapping the order
8221 		 * the hash lock and l2ad_mtx are taken).
8222 		 */
8223 		if (!mutex_tryenter(hash_lock)) {
8224 			/*
8225 			 * Missed the hash lock.  Retry.
8226 			 */
8227 			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
8228 			mutex_exit(&dev->l2ad_mtx);
8229 			mutex_enter(hash_lock);
8230 			mutex_exit(hash_lock);
8231 			goto top;
8232 		}
8233 
8234 		/*
8235 		 * A header can't be on this list if it doesn't have L2 header.
8236 		 */
8237 		ASSERT(HDR_HAS_L2HDR(hdr));
8238 
8239 		/* Ensure this header has finished being written. */
8240 		ASSERT(!HDR_L2_WRITING(hdr));
8241 		ASSERT(!HDR_L2_WRITE_HEAD(hdr));
8242 
8243 		if (!all && (hdr->b_l2hdr.b_daddr >= taddr ||
8244 		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
8245 			/*
8246 			 * We've evicted to the target address,
8247 			 * or the end of the device.
8248 			 */
8249 			mutex_exit(hash_lock);
8250 			break;
8251 		}
8252 
8253 		if (!HDR_HAS_L1HDR(hdr)) {
8254 			ASSERT(!HDR_L2_READING(hdr));
8255 			/*
8256 			 * This doesn't exist in the ARC.  Destroy.
8257 			 * arc_hdr_destroy() will call list_remove()
8258 			 * and decrement arcstat_l2_lsize.
8259 			 */
8260 			arc_change_state(arc_anon, hdr, hash_lock);
8261 			arc_hdr_destroy(hdr);
8262 		} else {
8263 			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
8264 			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
8265 			/*
8266 			 * Invalidate issued or about to be issued
8267 			 * reads, since we may be about to write
8268 			 * over this location.
8269 			 */
8270 			if (HDR_L2_READING(hdr)) {
8271 				ARCSTAT_BUMP(arcstat_l2_evict_reading);
8272 				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
8273 			}
8274 
8275 			arc_hdr_l2hdr_destroy(hdr);
8276 		}
8277 		mutex_exit(hash_lock);
8278 	}
8279 	mutex_exit(&dev->l2ad_mtx);
8280 }
8281 
8282 /*
8283  * Handle any abd transforms that might be required for writing to the L2ARC.
8284  * If successful, this function will always return an abd with the data
8285  * transformed as it is on disk in a new abd of asize bytes.
8286  */
8287 static int
8288 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize,
8289     abd_t **abd_out)
8290 {
8291 	int ret;
8292 	void *tmp = NULL;
8293 	abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd;
8294 	enum zio_compress compress = HDR_GET_COMPRESS(hdr);
8295 	uint64_t psize = HDR_GET_PSIZE(hdr);
8296 	uint64_t size = arc_hdr_size(hdr);
8297 	boolean_t ismd = HDR_ISTYPE_METADATA(hdr);
8298 	boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS);
8299 	dsl_crypto_key_t *dck = NULL;
8300 	uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 };
8301 	boolean_t no_crypt = B_FALSE;
8302 
8303 	ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF &&
8304 	    !HDR_COMPRESSION_ENABLED(hdr)) ||
8305 	    HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize);
8306 	ASSERT3U(psize, <=, asize);
8307 
8308 	/*
8309 	 * If this data simply needs its own buffer, we simply allocate it
8310 	 * and copy the data. This may be done to eliminate a dependency on a
8311 	 * shared buffer or to reallocate the buffer to match asize.
8312 	 */
8313 	if (HDR_HAS_RABD(hdr) && asize != psize) {
8314 		ASSERT3U(asize, >=, psize);
8315 		to_write = abd_alloc_for_io(asize, ismd);
8316 		abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize);
8317 		if (psize != asize)
8318 			abd_zero_off(to_write, psize, asize - psize);
8319 		goto out;
8320 	}
8321 
8322 	if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) &&
8323 	    !HDR_ENCRYPTED(hdr)) {
8324 		ASSERT3U(size, ==, psize);
8325 		to_write = abd_alloc_for_io(asize, ismd);
8326 		abd_copy(to_write, hdr->b_l1hdr.b_pabd, size);
8327 		if (size != asize)
8328 			abd_zero_off(to_write, size, asize - size);
8329 		goto out;
8330 	}
8331 
8332 	if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) {
8333 		cabd = abd_alloc_for_io(asize, ismd);
8334 		tmp = abd_borrow_buf(cabd, asize);
8335 
8336 		psize = zio_compress_data(compress, to_write, tmp, size);
8337 		ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr));
8338 		if (psize < asize)
8339 			bzero((char *)tmp + psize, asize - psize);
8340 		psize = HDR_GET_PSIZE(hdr);
8341 		abd_return_buf_copy(cabd, tmp, asize);
8342 		to_write = cabd;
8343 	}
8344 
8345 	if (HDR_ENCRYPTED(hdr)) {
8346 		eabd = abd_alloc_for_io(asize, ismd);
8347 
8348 		/*
8349 		 * If the dataset was disowned before the buffer
8350 		 * made it to this point, the key to re-encrypt
8351 		 * it won't be available. In this case we simply
8352 		 * won't write the buffer to the L2ARC.
8353 		 */
8354 		ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj,
8355 		    FTAG, &dck);
8356 		if (ret != 0)
8357 			goto error;
8358 
8359 		ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key,
8360 		    hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt,
8361 		    hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd,
8362 		    &no_crypt);
8363 		if (ret != 0)
8364 			goto error;
8365 
8366 		if (no_crypt)
8367 			abd_copy(eabd, to_write, psize);
8368 
8369 		if (psize != asize)
8370 			abd_zero_off(eabd, psize, asize - psize);
8371 
8372 		/* assert that the MAC we got here matches the one we saved */
8373 		ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN));
8374 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
8375 
8376 		if (to_write == cabd)
8377 			abd_free(cabd);
8378 
8379 		to_write = eabd;
8380 	}
8381 
8382 out:
8383 	ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd);
8384 	*abd_out = to_write;
8385 	return (0);
8386 
8387 error:
8388 	if (dck != NULL)
8389 		spa_keystore_dsl_key_rele(spa, dck, FTAG);
8390 	if (cabd != NULL)
8391 		abd_free(cabd);
8392 	if (eabd != NULL)
8393 		abd_free(eabd);
8394 
8395 	*abd_out = NULL;
8396 	return (ret);
8397 }
8398 
8399 /*
8400  * Find and write ARC buffers to the L2ARC device.
8401  *
8402  * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
8403  * for reading until they have completed writing.
8404  * The headroom_boost is an in-out parameter used to maintain headroom boost
8405  * state between calls to this function.
8406  *
8407  * Returns the number of bytes actually written (which may be smaller than
8408  * the delta by which the device hand has changed due to alignment).
8409  */
8410 static uint64_t
8411 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz)
8412 {
8413 	arc_buf_hdr_t *hdr, *hdr_prev, *head;
8414 	uint64_t write_asize, write_psize, write_lsize, headroom;
8415 	boolean_t full;
8416 	l2arc_write_callback_t *cb;
8417 	zio_t *pio, *wzio;
8418 	uint64_t guid = spa_load_guid(spa);
8419 
8420 	ASSERT3P(dev->l2ad_vdev, !=, NULL);
8421 
8422 	pio = NULL;
8423 	write_lsize = write_asize = write_psize = 0;
8424 	full = B_FALSE;
8425 	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
8426 	arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR);
8427 
8428 	/*
8429 	 * Copy buffers for L2ARC writing.
8430 	 */
8431 	for (int try = 0; try <= 3; try++) {
8432 		multilist_sublist_t *mls = l2arc_sublist_lock(try);
8433 		uint64_t passed_sz = 0;
8434 
8435 		VERIFY3P(mls, !=, NULL);
8436 
8437 		/*
8438 		 * L2ARC fast warmup.
8439 		 *
8440 		 * Until the ARC is warm and starts to evict, read from the
8441 		 * head of the ARC lists rather than the tail.
8442 		 */
8443 		if (arc_warm == B_FALSE)
8444 			hdr = multilist_sublist_head(mls);
8445 		else
8446 			hdr = multilist_sublist_tail(mls);
8447 
8448 		headroom = target_sz * l2arc_headroom;
8449 		if (zfs_compressed_arc_enabled)
8450 			headroom = (headroom * l2arc_headroom_boost) / 100;
8451 
8452 		for (; hdr; hdr = hdr_prev) {
8453 			kmutex_t *hash_lock;
8454 			abd_t *to_write = NULL;
8455 
8456 			if (arc_warm == B_FALSE)
8457 				hdr_prev = multilist_sublist_next(mls, hdr);
8458 			else
8459 				hdr_prev = multilist_sublist_prev(mls, hdr);
8460 
8461 			hash_lock = HDR_LOCK(hdr);
8462 			if (!mutex_tryenter(hash_lock)) {
8463 				/*
8464 				 * Skip this buffer rather than waiting.
8465 				 */
8466 				continue;
8467 			}
8468 
8469 			passed_sz += HDR_GET_LSIZE(hdr);
8470 			if (passed_sz > headroom) {
8471 				/*
8472 				 * Searched too far.
8473 				 */
8474 				mutex_exit(hash_lock);
8475 				break;
8476 			}
8477 
8478 			if (!l2arc_write_eligible(guid, hdr)) {
8479 				mutex_exit(hash_lock);
8480 				continue;
8481 			}
8482 
8483 			/*
8484 			 * We rely on the L1 portion of the header below, so
8485 			 * it's invalid for this header to have been evicted out
8486 			 * of the ghost cache, prior to being written out. The
8487 			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
8488 			 */
8489 			ASSERT(HDR_HAS_L1HDR(hdr));
8490 
8491 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
8492 			ASSERT3U(arc_hdr_size(hdr), >, 0);
8493 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
8494 			    HDR_HAS_RABD(hdr));
8495 			uint64_t psize = HDR_GET_PSIZE(hdr);
8496 			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
8497 			    psize);
8498 
8499 			if ((write_asize + asize) > target_sz) {
8500 				full = B_TRUE;
8501 				mutex_exit(hash_lock);
8502 				break;
8503 			}
8504 
8505 			/*
8506 			 * We rely on the L1 portion of the header below, so
8507 			 * it's invalid for this header to have been evicted out
8508 			 * of the ghost cache, prior to being written out. The
8509 			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
8510 			 */
8511 			arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
8512 			ASSERT(HDR_HAS_L1HDR(hdr));
8513 
8514 			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
8515 			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
8516 			    HDR_HAS_RABD(hdr));
8517 			ASSERT3U(arc_hdr_size(hdr), >, 0);
8518 
8519 			/*
8520 			 * If this header has b_rabd, we can use this since it
8521 			 * must always match the data exactly as it exists on
8522 			 * disk. Otherwise, the L2ARC can normally use the
8523 			 * hdr's data, but if we're sharing data between the
8524 			 * hdr and one of its bufs, L2ARC needs its own copy of
8525 			 * the data so that the ZIO below can't race with the
8526 			 * buf consumer. To ensure that this copy will be
8527 			 * available for the lifetime of the ZIO and be cleaned
8528 			 * up afterwards, we add it to the l2arc_free_on_write
8529 			 * queue. If we need to apply any transforms to the
8530 			 * data (compression, encryption) we will also need the
8531 			 * extra buffer.
8532 			 */
8533 			if (HDR_HAS_RABD(hdr) && psize == asize) {
8534 				to_write = hdr->b_crypt_hdr.b_rabd;
8535 			} else if ((HDR_COMPRESSION_ENABLED(hdr) ||
8536 			    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
8537 			    !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
8538 			    psize == asize) {
8539 				to_write = hdr->b_l1hdr.b_pabd;
8540 			} else {
8541 				int ret;
8542 				arc_buf_contents_t type = arc_buf_type(hdr);
8543 
8544 				ret = l2arc_apply_transforms(spa, hdr, asize,
8545 				    &to_write);
8546 				if (ret != 0) {
8547 					arc_hdr_clear_flags(hdr,
8548 					    ARC_FLAG_L2_WRITING);
8549 					mutex_exit(hash_lock);
8550 					continue;
8551 				}
8552 
8553 				l2arc_free_abd_on_write(to_write, asize, type);
8554 			}
8555 
8556 			if (pio == NULL) {
8557 				/*
8558 				 * Insert a dummy header on the buflist so
8559 				 * l2arc_write_done() can find where the
8560 				 * write buffers begin without searching.
8561 				 */
8562 				mutex_enter(&dev->l2ad_mtx);
8563 				list_insert_head(&dev->l2ad_buflist, head);
8564 				mutex_exit(&dev->l2ad_mtx);
8565 
8566 				cb = kmem_alloc(
8567 				    sizeof (l2arc_write_callback_t), KM_SLEEP);
8568 				cb->l2wcb_dev = dev;
8569 				cb->l2wcb_head = head;
8570 				pio = zio_root(spa, l2arc_write_done, cb,
8571 				    ZIO_FLAG_CANFAIL);
8572 			}
8573 
8574 			hdr->b_l2hdr.b_dev = dev;
8575 			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
8576 			arc_hdr_set_flags(hdr,
8577 			    ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);
8578 
8579 			mutex_enter(&dev->l2ad_mtx);
8580 			list_insert_head(&dev->l2ad_buflist, hdr);
8581 			mutex_exit(&dev->l2ad_mtx);
8582 
8583 			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
8584 			    arc_hdr_size(hdr), hdr);
8585 
8586 			wzio = zio_write_phys(pio, dev->l2ad_vdev,
8587 			    hdr->b_l2hdr.b_daddr, asize, to_write,
8588 			    ZIO_CHECKSUM_OFF, NULL, hdr,
8589 			    ZIO_PRIORITY_ASYNC_WRITE,
8590 			    ZIO_FLAG_CANFAIL, B_FALSE);
8591 
8592 			write_lsize += HDR_GET_LSIZE(hdr);
8593 			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
8594 			    zio_t *, wzio);
8595 
8596 			write_psize += psize;
8597 			write_asize += asize;
8598 			dev->l2ad_hand += asize;
8599 			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
8600 
8601 			mutex_exit(hash_lock);
8602 
8603 			(void) zio_nowait(wzio);
8604 		}
8605 
8606 		multilist_sublist_unlock(mls);
8607 
8608 		if (full == B_TRUE)
8609 			break;
8610 	}
8611 
8612 	/* No buffers selected for writing? */
8613 	if (pio == NULL) {
8614 		ASSERT0(write_lsize);
8615 		ASSERT(!HDR_HAS_L1HDR(head));
8616 		kmem_cache_free(hdr_l2only_cache, head);
8617 		return (0);
8618 	}
8619 
8620 	ASSERT3U(write_asize, <=, target_sz);
8621 	ARCSTAT_BUMP(arcstat_l2_writes_sent);
8622 	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
8623 	ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
8624 	ARCSTAT_INCR(arcstat_l2_psize, write_psize);
8625 
8626 	/*
8627 	 * Bump device hand to the device start if it is approaching the end.
8628 	 * l2arc_evict() will already have evicted ahead for this case.
8629 	 */
8630 	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
8631 		dev->l2ad_hand = dev->l2ad_start;
8632 		dev->l2ad_first = B_FALSE;
8633 	}
8634 
8635 	dev->l2ad_writing = B_TRUE;
8636 	(void) zio_wait(pio);
8637 	dev->l2ad_writing = B_FALSE;
8638 
8639 	return (write_asize);
8640 }
8641 
8642 /*
8643  * This thread feeds the L2ARC at regular intervals.  This is the beating
8644  * heart of the L2ARC.
8645  */
8646 /* ARGSUSED */
8647 static void
8648 l2arc_feed_thread(void *unused)
8649 {
8650 	callb_cpr_t cpr;
8651 	l2arc_dev_t *dev;
8652 	spa_t *spa;
8653 	uint64_t size, wrote;
8654 	clock_t begin, next = ddi_get_lbolt();
8655 
8656 	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
8657 
8658 	mutex_enter(&l2arc_feed_thr_lock);
8659 
8660 	while (l2arc_thread_exit == 0) {
8661 		CALLB_CPR_SAFE_BEGIN(&cpr);
8662 		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
8663 		    next);
8664 		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
8665 		next = ddi_get_lbolt() + hz;
8666 
8667 		/*
8668 		 * Quick check for L2ARC devices.
8669 		 */
8670 		mutex_enter(&l2arc_dev_mtx);
8671 		if (l2arc_ndev == 0) {
8672 			mutex_exit(&l2arc_dev_mtx);
8673 			continue;
8674 		}
8675 		mutex_exit(&l2arc_dev_mtx);
8676 		begin = ddi_get_lbolt();
8677 
8678 		/*
8679 		 * This selects the next l2arc device to write to, and in
8680 		 * doing so the next spa to feed from: dev->l2ad_spa.   This
8681 		 * will return NULL if there are now no l2arc devices or if
8682 		 * they are all faulted.
8683 		 *
8684 		 * If a device is returned, its spa's config lock is also
8685 		 * held to prevent device removal.  l2arc_dev_get_next()
8686 		 * will grab and release l2arc_dev_mtx.
8687 		 */
8688 		if ((dev = l2arc_dev_get_next()) == NULL)
8689 			continue;
8690 
8691 		spa = dev->l2ad_spa;
8692 		ASSERT3P(spa, !=, NULL);
8693 
8694 		/*
8695 		 * If the pool is read-only then force the feed thread to
8696 		 * sleep a little longer.
8697 		 */
8698 		if (!spa_writeable(spa)) {
8699 			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
8700 			spa_config_exit(spa, SCL_L2ARC, dev);
8701 			continue;
8702 		}
8703 
8704 		/*
8705 		 * Avoid contributing to memory pressure.
8706 		 */
8707 		if (arc_reclaim_needed()) {
8708 			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
8709 			spa_config_exit(spa, SCL_L2ARC, dev);
8710 			continue;
8711 		}
8712 
8713 		ARCSTAT_BUMP(arcstat_l2_feeds);
8714 
8715 		size = l2arc_write_size();
8716 
8717 		/*
8718 		 * Evict L2ARC buffers that will be overwritten.
8719 		 */
8720 		l2arc_evict(dev, size, B_FALSE);
8721 
8722 		/*
8723 		 * Write ARC buffers.
8724 		 */
8725 		wrote = l2arc_write_buffers(spa, dev, size);
8726 
8727 		/*
8728 		 * Calculate interval between writes.
8729 		 */
8730 		next = l2arc_write_interval(begin, size, wrote);
8731 		spa_config_exit(spa, SCL_L2ARC, dev);
8732 	}
8733 
8734 	l2arc_thread_exit = 0;
8735 	cv_broadcast(&l2arc_feed_thr_cv);
8736 	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
8737 	thread_exit();
8738 }
8739 
8740 boolean_t
8741 l2arc_vdev_present(vdev_t *vd)
8742 {
8743 	l2arc_dev_t *dev;
8744 
8745 	mutex_enter(&l2arc_dev_mtx);
8746 	for (dev = list_head(l2arc_dev_list); dev != NULL;
8747 	    dev = list_next(l2arc_dev_list, dev)) {
8748 		if (dev->l2ad_vdev == vd)
8749 			break;
8750 	}
8751 	mutex_exit(&l2arc_dev_mtx);
8752 
8753 	return (dev != NULL);
8754 }
8755 
8756 /*
8757  * Add a vdev for use by the L2ARC.  By this point the spa has already
8758  * validated the vdev and opened it.
8759  */
8760 void
8761 l2arc_add_vdev(spa_t *spa, vdev_t *vd)
8762 {
8763 	l2arc_dev_t *adddev;
8764 
8765 	ASSERT(!l2arc_vdev_present(vd));
8766 
8767 	/*
8768 	 * Create a new l2arc device entry.
8769 	 */
8770 	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
8771 	adddev->l2ad_spa = spa;
8772 	adddev->l2ad_vdev = vd;
8773 	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
8774 	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
8775 	adddev->l2ad_hand = adddev->l2ad_start;
8776 	adddev->l2ad_first = B_TRUE;
8777 	adddev->l2ad_writing = B_FALSE;
8778 
8779 	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
8780 	/*
8781 	 * This is a list of all ARC buffers that are still valid on the
8782 	 * device.
8783 	 */
8784 	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
8785 	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
8786 
8787 	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
8788 	zfs_refcount_create(&adddev->l2ad_alloc);
8789 
8790 	/*
8791 	 * Add device to global list
8792 	 */
8793 	mutex_enter(&l2arc_dev_mtx);
8794 	list_insert_head(l2arc_dev_list, adddev);
8795 	atomic_inc_64(&l2arc_ndev);
8796 	mutex_exit(&l2arc_dev_mtx);
8797 }
8798 
8799 /*
8800  * Remove a vdev from the L2ARC.
8801  */
8802 void
8803 l2arc_remove_vdev(vdev_t *vd)
8804 {
8805 	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
8806 
8807 	/*
8808 	 * Find the device by vdev
8809 	 */
8810 	mutex_enter(&l2arc_dev_mtx);
8811 	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
8812 		nextdev = list_next(l2arc_dev_list, dev);
8813 		if (vd == dev->l2ad_vdev) {
8814 			remdev = dev;
8815 			break;
8816 		}
8817 	}
8818 	ASSERT3P(remdev, !=, NULL);
8819 
8820 	/*
8821 	 * Remove device from global list
8822 	 */
8823 	list_remove(l2arc_dev_list, remdev);
8824 	l2arc_dev_last = NULL;		/* may have been invalidated */
8825 	atomic_dec_64(&l2arc_ndev);
8826 	mutex_exit(&l2arc_dev_mtx);
8827 
8828 	/*
8829 	 * Clear all buflists and ARC references.  L2ARC device flush.
8830 	 */
8831 	l2arc_evict(remdev, 0, B_TRUE);
8832 	list_destroy(&remdev->l2ad_buflist);
8833 	mutex_destroy(&remdev->l2ad_mtx);
8834 	zfs_refcount_destroy(&remdev->l2ad_alloc);
8835 	kmem_free(remdev, sizeof (l2arc_dev_t));
8836 }
8837 
8838 void
8839 l2arc_init(void)
8840 {
8841 	l2arc_thread_exit = 0;
8842 	l2arc_ndev = 0;
8843 	l2arc_writes_sent = 0;
8844 	l2arc_writes_done = 0;
8845 
8846 	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
8847 	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
8848 	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
8849 	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
8850 
8851 	l2arc_dev_list = &L2ARC_dev_list;
8852 	l2arc_free_on_write = &L2ARC_free_on_write;
8853 	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
8854 	    offsetof(l2arc_dev_t, l2ad_node));
8855 	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
8856 	    offsetof(l2arc_data_free_t, l2df_list_node));
8857 }
8858 
8859 void
8860 l2arc_fini(void)
8861 {
8862 	/*
8863 	 * This is called from dmu_fini(), which is called from spa_fini();
8864 	 * Because of this, we can assume that all l2arc devices have
8865 	 * already been removed when the pools themselves were removed.
8866 	 */
8867 
8868 	l2arc_do_free_on_write();
8869 
8870 	mutex_destroy(&l2arc_feed_thr_lock);
8871 	cv_destroy(&l2arc_feed_thr_cv);
8872 	mutex_destroy(&l2arc_dev_mtx);
8873 	mutex_destroy(&l2arc_free_on_write_mtx);
8874 
8875 	list_destroy(l2arc_dev_list);
8876 	list_destroy(l2arc_free_on_write);
8877 }
8878 
8879 void
8880 l2arc_start(void)
8881 {
8882 	if (!(spa_mode_global & FWRITE))
8883 		return;
8884 
8885 	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
8886 	    TS_RUN, minclsyspri);
8887 }
8888 
8889 void
8890 l2arc_stop(void)
8891 {
8892 	if (!(spa_mode_global & FWRITE))
8893 		return;
8894 
8895 	mutex_enter(&l2arc_feed_thr_lock);
8896 	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
8897 	l2arc_thread_exit = 1;
8898 	while (l2arc_thread_exit != 0)
8899 		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
8900 	mutex_exit(&l2arc_feed_thr_lock);
8901 }
8902