xref: /titanic_50/usr/src/uts/common/fs/zfs/arc.c (revision f5ca7025dd17c71560135c6dca33cac4bec399b5)
1  /*
2   * CDDL HEADER START
3   *
4   * The contents of this file are subject to the terms of the
5   * Common Development and Distribution License (the "License").
6   * You may not use this file except in compliance with the License.
7   *
8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9   * or http://www.opensolaris.org/os/licensing.
10   * See the License for the specific language governing permissions
11   * and limitations under the License.
12   *
13   * When distributing Covered Code, include this CDDL HEADER in each
14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15   * If applicable, add the following below this CDDL HEADER, with the
16   * fields enclosed by brackets "[]" replaced with your own identifying
17   * information: Portions Copyright [yyyy] [name of copyright owner]
18   *
19   * CDDL HEADER END
20   */
21  /*
22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24   * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25   * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26   * Copyright 2015 Nexenta Systems, Inc.  All rights reserved.
27   */
28  
29  /*
30   * DVA-based Adjustable Replacement Cache
31   *
32   * While much of the theory of operation used here is
33   * based on the self-tuning, low overhead replacement cache
34   * presented by Megiddo and Modha at FAST 2003, there are some
35   * significant differences:
36   *
37   * 1. The Megiddo and Modha model assumes any page is evictable.
38   * Pages in its cache cannot be "locked" into memory.  This makes
39   * the eviction algorithm simple: evict the last page in the list.
40   * This also make the performance characteristics easy to reason
41   * about.  Our cache is not so simple.  At any given moment, some
42   * subset of the blocks in the cache are un-evictable because we
43   * have handed out a reference to them.  Blocks are only evictable
44   * when there are no external references active.  This makes
45   * eviction far more problematic:  we choose to evict the evictable
46   * blocks that are the "lowest" in the list.
47   *
48   * There are times when it is not possible to evict the requested
49   * space.  In these circumstances we are unable to adjust the cache
50   * size.  To prevent the cache growing unbounded at these times we
51   * implement a "cache throttle" that slows the flow of new data
52   * into the cache until we can make space available.
53   *
54   * 2. The Megiddo and Modha model assumes a fixed cache size.
55   * Pages are evicted when the cache is full and there is a cache
56   * miss.  Our model has a variable sized cache.  It grows with
57   * high use, but also tries to react to memory pressure from the
58   * operating system: decreasing its size when system memory is
59   * tight.
60   *
61   * 3. The Megiddo and Modha model assumes a fixed page size. All
62   * elements of the cache are therefore exactly the same size.  So
63   * when adjusting the cache size following a cache miss, its simply
64   * a matter of choosing a single page to evict.  In our model, we
65   * have variable sized cache blocks (rangeing from 512 bytes to
66   * 128K bytes).  We therefore choose a set of blocks to evict to make
67   * space for a cache miss that approximates as closely as possible
68   * the space used by the new block.
69   *
70   * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71   * by N. Megiddo & D. Modha, FAST 2003
72   */
73  
74  /*
75   * The locking model:
76   *
77   * A new reference to a cache buffer can be obtained in two
78   * ways: 1) via a hash table lookup using the DVA as a key,
79   * or 2) via one of the ARC lists.  The arc_read() interface
80   * uses method 1, while the internal arc algorithms for
81   * adjusting the cache use method 2.  We therefore provide two
82   * types of locks: 1) the hash table lock array, and 2) the
83   * arc list locks.
84   *
85   * Buffers do not have their own mutexes, rather they rely on the
86   * hash table mutexes for the bulk of their protection (i.e. most
87   * fields in the arc_buf_hdr_t are protected by these mutexes).
88   *
89   * buf_hash_find() returns the appropriate mutex (held) when it
90   * locates the requested buffer in the hash table.  It returns
91   * NULL for the mutex if the buffer was not in the table.
92   *
93   * buf_hash_remove() expects the appropriate hash mutex to be
94   * already held before it is invoked.
95   *
96   * Each arc state also has a mutex which is used to protect the
97   * buffer list associated with the state.  When attempting to
98   * obtain a hash table lock while holding an arc list lock you
99   * must use: mutex_tryenter() to avoid deadlock.  Also note that
100   * the active state mutex must be held before the ghost state mutex.
101   *
102   * Arc buffers may have an associated eviction callback function.
103   * This function will be invoked prior to removing the buffer (e.g.
104   * in arc_do_user_evicts()).  Note however that the data associated
105   * with the buffer may be evicted prior to the callback.  The callback
106   * must be made with *no locks held* (to prevent deadlock).  Additionally,
107   * the users of callbacks must ensure that their private data is
108   * protected from simultaneous callbacks from arc_clear_callback()
109   * and arc_do_user_evicts().
110   *
111   * Note that the majority of the performance stats are manipulated
112   * with atomic operations.
113   *
114   * The L2ARC uses the l2ad_mtx on each vdev for the following:
115   *
116   *	- L2ARC buflist creation
117   *	- L2ARC buflist eviction
118   *	- L2ARC write completion, which walks L2ARC buflists
119   *	- ARC header destruction, as it removes from L2ARC buflists
120   *	- ARC header release, as it removes from L2ARC buflists
121   */
122  
123  #include <sys/spa.h>
124  #include <sys/zio.h>
125  #include <sys/zio_compress.h>
126  #include <sys/zfs_context.h>
127  #include <sys/arc.h>
128  #include <sys/refcount.h>
129  #include <sys/vdev.h>
130  #include <sys/vdev_impl.h>
131  #include <sys/dsl_pool.h>
132  #include <sys/multilist.h>
133  #ifdef _KERNEL
134  #include <sys/vmsystm.h>
135  #include <vm/anon.h>
136  #include <sys/fs/swapnode.h>
137  #include <sys/dnlc.h>
138  #endif
139  #include <sys/callb.h>
140  #include <sys/kstat.h>
141  #include <zfs_fletcher.h>
142  #include <sys/byteorder.h>
143  #include <sys/spa_impl.h>
144  #include <sys/zfs_ioctl.h>
145  
146  #ifndef _KERNEL
147  /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
148  boolean_t arc_watch = B_FALSE;
149  int arc_procfd;
150  #endif
151  
152  static kmutex_t		arc_reclaim_lock;
153  static kcondvar_t	arc_reclaim_thread_cv;
154  static boolean_t	arc_reclaim_thread_exit;
155  static kcondvar_t	arc_reclaim_waiters_cv;
156  
157  static kmutex_t		arc_user_evicts_lock;
158  static kcondvar_t	arc_user_evicts_cv;
159  static boolean_t	arc_user_evicts_thread_exit;
160  
161  uint_t arc_reduce_dnlc_percent = 3;
162  
163  /*
164   * The number of headers to evict in arc_evict_state_impl() before
165   * dropping the sublist lock and evicting from another sublist. A lower
166   * value means we're more likely to evict the "correct" header (i.e. the
167   * oldest header in the arc state), but comes with higher overhead
168   * (i.e. more invocations of arc_evict_state_impl()).
169   */
170  int zfs_arc_evict_batch_limit = 10;
171  
172  /*
173   * The number of sublists used for each of the arc state lists. If this
174   * is not set to a suitable value by the user, it will be configured to
175   * the number of CPUs on the system in arc_init().
176   */
177  int zfs_arc_num_sublists_per_state = 0;
178  
179  /* number of seconds before growing cache again */
180  static int		arc_grow_retry = 60;
181  
182  /* shift of arc_c for calculating overflow limit in arc_get_data_buf */
183  int		zfs_arc_overflow_shift = 8;
184  
185  /* shift of arc_c for calculating both min and max arc_p */
186  static int		arc_p_min_shift = 4;
187  
188  /* log2(fraction of arc to reclaim) */
189  static int		arc_shrink_shift = 7;
190  
191  /*
192   * log2(fraction of ARC which must be free to allow growing).
193   * I.e. If there is less than arc_c >> arc_no_grow_shift free memory,
194   * when reading a new block into the ARC, we will evict an equal-sized block
195   * from the ARC.
196   *
197   * This must be less than arc_shrink_shift, so that when we shrink the ARC,
198   * we will still not allow it to grow.
199   */
200  int			arc_no_grow_shift = 5;
201  
202  
203  /*
204   * minimum lifespan of a prefetch block in clock ticks
205   * (initialized in arc_init())
206   */
207  static int		arc_min_prefetch_lifespan;
208  
209  /*
210   * If this percent of memory is free, don't throttle.
211   */
212  int arc_lotsfree_percent = 10;
213  
214  static int arc_dead;
215  
216  /*
217   * The arc has filled available memory and has now warmed up.
218   */
219  static boolean_t arc_warm;
220  
221  /*
222   * These tunables are for performance analysis.
223   */
224  uint64_t zfs_arc_max;
225  uint64_t zfs_arc_min;
226  uint64_t zfs_arc_meta_limit = 0;
227  uint64_t zfs_arc_meta_min = 0;
228  int zfs_arc_grow_retry = 0;
229  int zfs_arc_shrink_shift = 0;
230  int zfs_arc_p_min_shift = 0;
231  int zfs_disable_dup_eviction = 0;
232  int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
233  
234  /*
235   * Note that buffers can be in one of 6 states:
236   *	ARC_anon	- anonymous (discussed below)
237   *	ARC_mru		- recently used, currently cached
238   *	ARC_mru_ghost	- recentely used, no longer in cache
239   *	ARC_mfu		- frequently used, currently cached
240   *	ARC_mfu_ghost	- frequently used, no longer in cache
241   *	ARC_l2c_only	- exists in L2ARC but not other states
242   * When there are no active references to the buffer, they are
243   * are linked onto a list in one of these arc states.  These are
244   * the only buffers that can be evicted or deleted.  Within each
245   * state there are multiple lists, one for meta-data and one for
246   * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
247   * etc.) is tracked separately so that it can be managed more
248   * explicitly: favored over data, limited explicitly.
249   *
250   * Anonymous buffers are buffers that are not associated with
251   * a DVA.  These are buffers that hold dirty block copies
252   * before they are written to stable storage.  By definition,
253   * they are "ref'd" and are considered part of arc_mru
254   * that cannot be freed.  Generally, they will aquire a DVA
255   * as they are written and migrate onto the arc_mru list.
256   *
257   * The ARC_l2c_only state is for buffers that are in the second
258   * level ARC but no longer in any of the ARC_m* lists.  The second
259   * level ARC itself may also contain buffers that are in any of
260   * the ARC_m* states - meaning that a buffer can exist in two
261   * places.  The reason for the ARC_l2c_only state is to keep the
262   * buffer header in the hash table, so that reads that hit the
263   * second level ARC benefit from these fast lookups.
264   */
265  
266  typedef struct arc_state {
267  	/*
268  	 * list of evictable buffers
269  	 */
270  	multilist_t arcs_list[ARC_BUFC_NUMTYPES];
271  	/*
272  	 * total amount of evictable data in this state
273  	 */
274  	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
275  	/*
276  	 * total amount of data in this state; this includes: evictable,
277  	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
278  	 */
279  	refcount_t arcs_size;
280  } arc_state_t;
281  
282  /* The 6 states: */
283  static arc_state_t ARC_anon;
284  static arc_state_t ARC_mru;
285  static arc_state_t ARC_mru_ghost;
286  static arc_state_t ARC_mfu;
287  static arc_state_t ARC_mfu_ghost;
288  static arc_state_t ARC_l2c_only;
289  
290  typedef struct arc_stats {
291  	kstat_named_t arcstat_hits;
292  	kstat_named_t arcstat_misses;
293  	kstat_named_t arcstat_demand_hits_data;
294  	kstat_named_t arcstat_demand_misses_data;
295  	kstat_named_t arcstat_demand_hits_metadata;
296  	kstat_named_t arcstat_demand_misses_metadata;
297  	kstat_named_t arcstat_prefetch_hits_data;
298  	kstat_named_t arcstat_prefetch_misses_data;
299  	kstat_named_t arcstat_prefetch_hits_metadata;
300  	kstat_named_t arcstat_prefetch_misses_metadata;
301  	kstat_named_t arcstat_mru_hits;
302  	kstat_named_t arcstat_mru_ghost_hits;
303  	kstat_named_t arcstat_mfu_hits;
304  	kstat_named_t arcstat_mfu_ghost_hits;
305  	kstat_named_t arcstat_deleted;
306  	/*
307  	 * Number of buffers that could not be evicted because the hash lock
308  	 * was held by another thread.  The lock may not necessarily be held
309  	 * by something using the same buffer, since hash locks are shared
310  	 * by multiple buffers.
311  	 */
312  	kstat_named_t arcstat_mutex_miss;
313  	/*
314  	 * Number of buffers skipped because they have I/O in progress, are
315  	 * indrect prefetch buffers that have not lived long enough, or are
316  	 * not from the spa we're trying to evict from.
317  	 */
318  	kstat_named_t arcstat_evict_skip;
319  	/*
320  	 * Number of times arc_evict_state() was unable to evict enough
321  	 * buffers to reach it's target amount.
322  	 */
323  	kstat_named_t arcstat_evict_not_enough;
324  	kstat_named_t arcstat_evict_l2_cached;
325  	kstat_named_t arcstat_evict_l2_eligible;
326  	kstat_named_t arcstat_evict_l2_ineligible;
327  	kstat_named_t arcstat_evict_l2_skip;
328  	kstat_named_t arcstat_hash_elements;
329  	kstat_named_t arcstat_hash_elements_max;
330  	kstat_named_t arcstat_hash_collisions;
331  	kstat_named_t arcstat_hash_chains;
332  	kstat_named_t arcstat_hash_chain_max;
333  	kstat_named_t arcstat_p;
334  	kstat_named_t arcstat_c;
335  	kstat_named_t arcstat_c_min;
336  	kstat_named_t arcstat_c_max;
337  	kstat_named_t arcstat_size;
338  	/*
339  	 * Number of bytes consumed by internal ARC structures necessary
340  	 * for tracking purposes; these structures are not actually
341  	 * backed by ARC buffers. This includes arc_buf_hdr_t structures
342  	 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
343  	 * caches), and arc_buf_t structures (allocated via arc_buf_t
344  	 * cache).
345  	 */
346  	kstat_named_t arcstat_hdr_size;
347  	/*
348  	 * Number of bytes consumed by ARC buffers of type equal to
349  	 * ARC_BUFC_DATA. This is generally consumed by buffers backing
350  	 * on disk user data (e.g. plain file contents).
351  	 */
352  	kstat_named_t arcstat_data_size;
353  	/*
354  	 * Number of bytes consumed by ARC buffers of type equal to
355  	 * ARC_BUFC_METADATA. This is generally consumed by buffers
356  	 * backing on disk data that is used for internal ZFS
357  	 * structures (e.g. ZAP, dnode, indirect blocks, etc).
358  	 */
359  	kstat_named_t arcstat_metadata_size;
360  	/*
361  	 * Number of bytes consumed by various buffers and structures
362  	 * not actually backed with ARC buffers. This includes bonus
363  	 * buffers (allocated directly via zio_buf_* functions),
364  	 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
365  	 * cache), and dnode_t structures (allocated via dnode_t cache).
366  	 */
367  	kstat_named_t arcstat_other_size;
368  	/*
369  	 * Total number of bytes consumed by ARC buffers residing in the
370  	 * arc_anon state. This includes *all* buffers in the arc_anon
371  	 * state; e.g. data, metadata, evictable, and unevictable buffers
372  	 * are all included in this value.
373  	 */
374  	kstat_named_t arcstat_anon_size;
375  	/*
376  	 * Number of bytes consumed by ARC buffers that meet the
377  	 * following criteria: backing buffers of type ARC_BUFC_DATA,
378  	 * residing in the arc_anon state, and are eligible for eviction
379  	 * (e.g. have no outstanding holds on the buffer).
380  	 */
381  	kstat_named_t arcstat_anon_evictable_data;
382  	/*
383  	 * Number of bytes consumed by ARC buffers that meet the
384  	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
385  	 * residing in the arc_anon state, and are eligible for eviction
386  	 * (e.g. have no outstanding holds on the buffer).
387  	 */
388  	kstat_named_t arcstat_anon_evictable_metadata;
389  	/*
390  	 * Total number of bytes consumed by ARC buffers residing in the
391  	 * arc_mru state. This includes *all* buffers in the arc_mru
392  	 * state; e.g. data, metadata, evictable, and unevictable buffers
393  	 * are all included in this value.
394  	 */
395  	kstat_named_t arcstat_mru_size;
396  	/*
397  	 * Number of bytes consumed by ARC buffers that meet the
398  	 * following criteria: backing buffers of type ARC_BUFC_DATA,
399  	 * residing in the arc_mru state, and are eligible for eviction
400  	 * (e.g. have no outstanding holds on the buffer).
401  	 */
402  	kstat_named_t arcstat_mru_evictable_data;
403  	/*
404  	 * Number of bytes consumed by ARC buffers that meet the
405  	 * following criteria: backing buffers of type ARC_BUFC_METADATA,
406  	 * residing in the arc_mru state, and are eligible for eviction
407  	 * (e.g. have no outstanding holds on the buffer).
408  	 */
409  	kstat_named_t arcstat_mru_evictable_metadata;
410  	/*
411  	 * Total number of bytes that *would have been* consumed by ARC
412  	 * buffers in the arc_mru_ghost state. The key thing to note
413  	 * here, is the fact that this size doesn't actually indicate
414  	 * RAM consumption. The ghost lists only consist of headers and
415  	 * don't actually have ARC buffers linked off of these headers.
416  	 * Thus, *if* the headers had associated ARC buffers, these
417  	 * buffers *would have* consumed this number of bytes.
418  	 */
419  	kstat_named_t arcstat_mru_ghost_size;
420  	/*
421  	 * Number of bytes that *would have been* consumed by ARC
422  	 * buffers that are eligible for eviction, of type
423  	 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
424  	 */
425  	kstat_named_t arcstat_mru_ghost_evictable_data;
426  	/*
427  	 * Number of bytes that *would have been* consumed by ARC
428  	 * buffers that are eligible for eviction, of type
429  	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
430  	 */
431  	kstat_named_t arcstat_mru_ghost_evictable_metadata;
432  	/*
433  	 * Total number of bytes consumed by ARC buffers residing in the
434  	 * arc_mfu state. This includes *all* buffers in the arc_mfu
435  	 * state; e.g. data, metadata, evictable, and unevictable buffers
436  	 * are all included in this value.
437  	 */
438  	kstat_named_t arcstat_mfu_size;
439  	/*
440  	 * Number of bytes consumed by ARC buffers that are eligible for
441  	 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
442  	 * state.
443  	 */
444  	kstat_named_t arcstat_mfu_evictable_data;
445  	/*
446  	 * Number of bytes consumed by ARC buffers that are eligible for
447  	 * eviction, of type ARC_BUFC_METADATA, and reside in the
448  	 * arc_mfu state.
449  	 */
450  	kstat_named_t arcstat_mfu_evictable_metadata;
451  	/*
452  	 * Total number of bytes that *would have been* consumed by ARC
453  	 * buffers in the arc_mfu_ghost state. See the comment above
454  	 * arcstat_mru_ghost_size for more details.
455  	 */
456  	kstat_named_t arcstat_mfu_ghost_size;
457  	/*
458  	 * Number of bytes that *would have been* consumed by ARC
459  	 * buffers that are eligible for eviction, of type
460  	 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
461  	 */
462  	kstat_named_t arcstat_mfu_ghost_evictable_data;
463  	/*
464  	 * Number of bytes that *would have been* consumed by ARC
465  	 * buffers that are eligible for eviction, of type
466  	 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
467  	 */
468  	kstat_named_t arcstat_mfu_ghost_evictable_metadata;
469  	kstat_named_t arcstat_l2_hits;
470  	kstat_named_t arcstat_l2_misses;
471  	kstat_named_t arcstat_l2_feeds;
472  	kstat_named_t arcstat_l2_rw_clash;
473  	kstat_named_t arcstat_l2_read_bytes;
474  	kstat_named_t arcstat_l2_write_bytes;
475  	kstat_named_t arcstat_l2_writes_sent;
476  	kstat_named_t arcstat_l2_writes_done;
477  	kstat_named_t arcstat_l2_writes_error;
478  	kstat_named_t arcstat_l2_writes_lock_retry;
479  	kstat_named_t arcstat_l2_evict_lock_retry;
480  	kstat_named_t arcstat_l2_evict_reading;
481  	kstat_named_t arcstat_l2_evict_l1cached;
482  	kstat_named_t arcstat_l2_free_on_write;
483  	kstat_named_t arcstat_l2_cdata_free_on_write;
484  	kstat_named_t arcstat_l2_abort_lowmem;
485  	kstat_named_t arcstat_l2_cksum_bad;
486  	kstat_named_t arcstat_l2_io_error;
487  	kstat_named_t arcstat_l2_size;
488  	kstat_named_t arcstat_l2_asize;
489  	kstat_named_t arcstat_l2_hdr_size;
490  	kstat_named_t arcstat_l2_compress_successes;
491  	kstat_named_t arcstat_l2_compress_zeros;
492  	kstat_named_t arcstat_l2_compress_failures;
493  	kstat_named_t arcstat_l2_log_blk_writes;
494  	kstat_named_t arcstat_l2_log_blk_avg_size;
495  	kstat_named_t arcstat_l2_data_to_meta_ratio;
496  	kstat_named_t arcstat_l2_rebuild_successes;
497  	kstat_named_t arcstat_l2_rebuild_abort_unsupported;
498  	kstat_named_t arcstat_l2_rebuild_abort_io_errors;
499  	kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
500  	kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
501  	kstat_named_t arcstat_l2_rebuild_abort_lowmem;
502  	kstat_named_t arcstat_l2_rebuild_size;
503  	kstat_named_t arcstat_l2_rebuild_bufs;
504  	kstat_named_t arcstat_l2_rebuild_bufs_precached;
505  	kstat_named_t arcstat_l2_rebuild_psize;
506  	kstat_named_t arcstat_l2_rebuild_log_blks;
507  	kstat_named_t arcstat_memory_throttle_count;
508  	kstat_named_t arcstat_duplicate_buffers;
509  	kstat_named_t arcstat_duplicate_buffers_size;
510  	kstat_named_t arcstat_duplicate_reads;
511  	kstat_named_t arcstat_meta_used;
512  	kstat_named_t arcstat_meta_limit;
513  	kstat_named_t arcstat_meta_max;
514  	kstat_named_t arcstat_meta_min;
515  	kstat_named_t arcstat_sync_wait_for_async;
516  	kstat_named_t arcstat_demand_hit_predictive_prefetch;
517  } arc_stats_t;
518  
519  static arc_stats_t arc_stats = {
520  	{ "hits",			KSTAT_DATA_UINT64 },
521  	{ "misses",			KSTAT_DATA_UINT64 },
522  	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
523  	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
524  	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
525  	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
526  	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
527  	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
528  	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
529  	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
530  	{ "mru_hits",			KSTAT_DATA_UINT64 },
531  	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
532  	{ "mfu_hits",			KSTAT_DATA_UINT64 },
533  	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
534  	{ "deleted",			KSTAT_DATA_UINT64 },
535  	{ "mutex_miss",			KSTAT_DATA_UINT64 },
536  	{ "evict_skip",			KSTAT_DATA_UINT64 },
537  	{ "evict_not_enough",		KSTAT_DATA_UINT64 },
538  	{ "evict_l2_cached",		KSTAT_DATA_UINT64 },
539  	{ "evict_l2_eligible",		KSTAT_DATA_UINT64 },
540  	{ "evict_l2_ineligible",	KSTAT_DATA_UINT64 },
541  	{ "evict_l2_skip",		KSTAT_DATA_UINT64 },
542  	{ "hash_elements",		KSTAT_DATA_UINT64 },
543  	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
544  	{ "hash_collisions",		KSTAT_DATA_UINT64 },
545  	{ "hash_chains",		KSTAT_DATA_UINT64 },
546  	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
547  	{ "p",				KSTAT_DATA_UINT64 },
548  	{ "c",				KSTAT_DATA_UINT64 },
549  	{ "c_min",			KSTAT_DATA_UINT64 },
550  	{ "c_max",			KSTAT_DATA_UINT64 },
551  	{ "size",			KSTAT_DATA_UINT64 },
552  	{ "hdr_size",			KSTAT_DATA_UINT64 },
553  	{ "data_size",			KSTAT_DATA_UINT64 },
554  	{ "metadata_size",		KSTAT_DATA_UINT64 },
555  	{ "other_size",			KSTAT_DATA_UINT64 },
556  	{ "anon_size",			KSTAT_DATA_UINT64 },
557  	{ "anon_evictable_data",	KSTAT_DATA_UINT64 },
558  	{ "anon_evictable_metadata",	KSTAT_DATA_UINT64 },
559  	{ "mru_size",			KSTAT_DATA_UINT64 },
560  	{ "mru_evictable_data",		KSTAT_DATA_UINT64 },
561  	{ "mru_evictable_metadata",	KSTAT_DATA_UINT64 },
562  	{ "mru_ghost_size",		KSTAT_DATA_UINT64 },
563  	{ "mru_ghost_evictable_data",	KSTAT_DATA_UINT64 },
564  	{ "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
565  	{ "mfu_size",			KSTAT_DATA_UINT64 },
566  	{ "mfu_evictable_data",		KSTAT_DATA_UINT64 },
567  	{ "mfu_evictable_metadata",	KSTAT_DATA_UINT64 },
568  	{ "mfu_ghost_size",		KSTAT_DATA_UINT64 },
569  	{ "mfu_ghost_evictable_data",	KSTAT_DATA_UINT64 },
570  	{ "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
571  	{ "l2_hits",			KSTAT_DATA_UINT64 },
572  	{ "l2_misses",			KSTAT_DATA_UINT64 },
573  	{ "l2_feeds",			KSTAT_DATA_UINT64 },
574  	{ "l2_rw_clash",		KSTAT_DATA_UINT64 },
575  	{ "l2_read_bytes",		KSTAT_DATA_UINT64 },
576  	{ "l2_write_bytes",		KSTAT_DATA_UINT64 },
577  	{ "l2_writes_sent",		KSTAT_DATA_UINT64 },
578  	{ "l2_writes_done",		KSTAT_DATA_UINT64 },
579  	{ "l2_writes_error",		KSTAT_DATA_UINT64 },
580  	{ "l2_writes_lock_retry",	KSTAT_DATA_UINT64 },
581  	{ "l2_evict_lock_retry",	KSTAT_DATA_UINT64 },
582  	{ "l2_evict_reading",		KSTAT_DATA_UINT64 },
583  	{ "l2_evict_l1cached",		KSTAT_DATA_UINT64 },
584  	{ "l2_free_on_write",		KSTAT_DATA_UINT64 },
585  	{ "l2_cdata_free_on_write",	KSTAT_DATA_UINT64 },
586  	{ "l2_abort_lowmem",		KSTAT_DATA_UINT64 },
587  	{ "l2_cksum_bad",		KSTAT_DATA_UINT64 },
588  	{ "l2_io_error",		KSTAT_DATA_UINT64 },
589  	{ "l2_size",			KSTAT_DATA_UINT64 },
590  	{ "l2_asize",			KSTAT_DATA_UINT64 },
591  	{ "l2_hdr_size",		KSTAT_DATA_UINT64 },
592  	{ "l2_compress_successes",	KSTAT_DATA_UINT64 },
593  	{ "l2_compress_zeros",		KSTAT_DATA_UINT64 },
594  	{ "l2_compress_failures",	KSTAT_DATA_UINT64 },
595  	{ "l2_log_blk_writes",		KSTAT_DATA_UINT64 },
596  	{ "l2_log_blk_avg_size",	KSTAT_DATA_UINT64 },
597  	{ "l2_data_to_meta_ratio",	KSTAT_DATA_UINT64 },
598  	{ "l2_rebuild_successes",	KSTAT_DATA_UINT64 },
599  	{ "l2_rebuild_unsupported",	KSTAT_DATA_UINT64 },
600  	{ "l2_rebuild_io_errors",	KSTAT_DATA_UINT64 },
601  	{ "l2_rebuild_cksum_errors",	KSTAT_DATA_UINT64 },
602  	{ "l2_rebuild_loop_errors",	KSTAT_DATA_UINT64 },
603  	{ "l2_rebuild_lowmem",		KSTAT_DATA_UINT64 },
604  	{ "l2_rebuild_size",		KSTAT_DATA_UINT64 },
605  	{ "l2_rebuild_bufs",		KSTAT_DATA_UINT64 },
606  	{ "l2_rebuild_bufs_precached",	KSTAT_DATA_UINT64 },
607  	{ "l2_rebuild_psize",		KSTAT_DATA_UINT64 },
608  	{ "l2_rebuild_log_blks",	KSTAT_DATA_UINT64 },
609  	{ "memory_throttle_count",	KSTAT_DATA_UINT64 },
610  	{ "duplicate_buffers",		KSTAT_DATA_UINT64 },
611  	{ "duplicate_buffers_size",	KSTAT_DATA_UINT64 },
612  	{ "duplicate_reads",		KSTAT_DATA_UINT64 },
613  	{ "arc_meta_used",		KSTAT_DATA_UINT64 },
614  	{ "arc_meta_limit",		KSTAT_DATA_UINT64 },
615  	{ "arc_meta_max",		KSTAT_DATA_UINT64 },
616  	{ "arc_meta_min",		KSTAT_DATA_UINT64 },
617  	{ "sync_wait_for_async",	KSTAT_DATA_UINT64 },
618  	{ "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
619  };
620  
621  #define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)
622  
623  #define	ARCSTAT_INCR(stat, val) \
624  	atomic_add_64(&arc_stats.stat.value.ui64, (val))
625  
626  #define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
627  #define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)
628  
629  #define	ARCSTAT_MAX(stat, val) {					\
630  	uint64_t m;							\
631  	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
632  	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
633  		continue;						\
634  }
635  
636  #define	ARCSTAT_MAXSTAT(stat) \
637  	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
638  
639  /*
640   * We define a macro to allow ARC hits/misses to be easily broken down by
641   * two separate conditions, giving a total of four different subtypes for
642   * each of hits and misses (so eight statistics total).
643   */
644  #define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
645  	if (cond1) {							\
646  		if (cond2) {						\
647  			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
648  		} else {						\
649  			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
650  		}							\
651  	} else {							\
652  		if (cond2) {						\
653  			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
654  		} else {						\
655  			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
656  		}							\
657  	}
658  
659  /*
660   * This macro allows us to use kstats as floating averages. Each time we
661   * update this kstat, we first factor it and the update value by
662   * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
663   * average. This macro assumes that integer loads and stores are atomic, but
664   * is not safe for multiple writers updating the kstat in parallel (only the
665   * last writer's update will remain).
666   */
667  #define	ARCSTAT_F_AVG_FACTOR	3
668  #define	ARCSTAT_F_AVG(stat, value) \
669  	do { \
670  		uint64_t x = ARCSTAT(stat); \
671  		x = x - x / ARCSTAT_F_AVG_FACTOR + \
672  		    (value) / ARCSTAT_F_AVG_FACTOR; \
673  		ARCSTAT(stat) = x; \
674  		_NOTE(CONSTCOND) \
675  	} while (0)
676  
677  kstat_t			*arc_ksp;
678  static arc_state_t	*arc_anon;
679  static arc_state_t	*arc_mru;
680  static arc_state_t	*arc_mru_ghost;
681  static arc_state_t	*arc_mfu;
682  static arc_state_t	*arc_mfu_ghost;
683  static arc_state_t	*arc_l2c_only;
684  
685  /*
686   * There are several ARC variables that are critical to export as kstats --
687   * but we don't want to have to grovel around in the kstat whenever we wish to
688   * manipulate them.  For these variables, we therefore define them to be in
689   * terms of the statistic variable.  This assures that we are not introducing
690   * the possibility of inconsistency by having shadow copies of the variables,
691   * while still allowing the code to be readable.
692   */
693  #define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
694  #define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
695  #define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
696  #define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
697  #define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */
698  #define	arc_meta_limit	ARCSTAT(arcstat_meta_limit) /* max size for metadata */
699  #define	arc_meta_min	ARCSTAT(arcstat_meta_min) /* min size for metadata */
700  #define	arc_meta_used	ARCSTAT(arcstat_meta_used) /* size of metadata */
701  #define	arc_meta_max	ARCSTAT(arcstat_meta_max) /* max size of metadata */
702  
703  #define	L2ARC_IS_VALID_COMPRESS(_c_) \
704  	((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
705  
706  static int		arc_no_grow;	/* Don't try to grow cache size */
707  static uint64_t		arc_tempreserve;
708  static uint64_t		arc_loaned_bytes;
709  
710  typedef struct arc_callback arc_callback_t;
711  
712  struct arc_callback {
713  	void			*acb_private;
714  	arc_done_func_t		*acb_done;
715  	arc_buf_t		*acb_buf;
716  	zio_t			*acb_zio_dummy;
717  	arc_callback_t		*acb_next;
718  };
719  
720  typedef struct arc_write_callback arc_write_callback_t;
721  
722  struct arc_write_callback {
723  	void		*awcb_private;
724  	arc_done_func_t	*awcb_ready;
725  	arc_done_func_t	*awcb_physdone;
726  	arc_done_func_t	*awcb_done;
727  	arc_buf_t	*awcb_buf;
728  };
729  
730  /*
731   * ARC buffers are separated into multiple structs as a memory saving measure:
732   *   - Common fields struct, always defined, and embedded within it:
733   *       - L2-only fields, always allocated but undefined when not in L2ARC
734   *       - L1-only fields, only allocated when in L1ARC
735   *
736   *           Buffer in L1                     Buffer only in L2
737   *    +------------------------+          +------------------------+
738   *    | arc_buf_hdr_t          |          | arc_buf_hdr_t          |
739   *    |                        |          |                        |
740   *    |                        |          |                        |
741   *    |                        |          |                        |
742   *    +------------------------+          +------------------------+
743   *    | l2arc_buf_hdr_t        |          | l2arc_buf_hdr_t        |
744   *    | (undefined if L1-only) |          |                        |
745   *    +------------------------+          +------------------------+
746   *    | l1arc_buf_hdr_t        |
747   *    |                        |
748   *    |                        |
749   *    |                        |
750   *    |                        |
751   *    +------------------------+
752   *
753   * Because it's possible for the L2ARC to become extremely large, we can wind
754   * up eating a lot of memory in L2ARC buffer headers, so the size of a header
755   * is minimized by only allocating the fields necessary for an L1-cached buffer
756   * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
757   * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
758   * words in pointers. arc_hdr_realloc() is used to switch a header between
759   * these two allocation states.
760   */
761  typedef struct l1arc_buf_hdr {
762  	kmutex_t		b_freeze_lock;
763  #ifdef ZFS_DEBUG
764  	/*
765  	 * used for debugging wtih kmem_flags - by allocating and freeing
766  	 * b_thawed when the buffer is thawed, we get a record of the stack
767  	 * trace that thawed it.
768  	 */
769  	void			*b_thawed;
770  #endif
771  
772  	arc_buf_t		*b_buf;
773  	uint32_t		b_datacnt;
774  	/* for waiting on writes to complete */
775  	kcondvar_t		b_cv;
776  
777  	/* protected by arc state mutex */
778  	arc_state_t		*b_state;
779  	multilist_node_t	b_arc_node;
780  
781  	/* updated atomically */
782  	clock_t			b_arc_access;
783  
784  	/* self protecting */
785  	refcount_t		b_refcnt;
786  
787  	arc_callback_t		*b_acb;
788  	/* temporary buffer holder for in-flight compressed data */
789  	void			*b_tmp_cdata;
790  } l1arc_buf_hdr_t;
791  
792  typedef struct l2arc_dev l2arc_dev_t;
793  
794  typedef struct l2arc_buf_hdr {
795  	/* protected by arc_buf_hdr mutex */
796  	l2arc_dev_t		*b_dev;		/* L2ARC device */
797  	uint64_t		b_daddr;	/* disk address, offset byte */
798  	/* real alloc'd buffer size depending on b_compress applied */
799  	int32_t			b_asize;
800  	uint8_t			b_compress;
801  
802  	list_node_t		b_l2node;
803  } l2arc_buf_hdr_t;
804  
805  struct arc_buf_hdr {
806  	/* protected by hash lock */
807  	dva_t			b_dva;
808  	uint64_t		b_birth;
809  	/*
810  	 * Even though this checksum is only set/verified when a buffer is in
811  	 * the L1 cache, it needs to be in the set of common fields because it
812  	 * must be preserved from the time before a buffer is written out to
813  	 * L2ARC until after it is read back in.
814  	 */
815  	zio_cksum_t		*b_freeze_cksum;
816  
817  	arc_buf_hdr_t		*b_hash_next;
818  	arc_flags_t		b_flags;
819  
820  	/* immutable */
821  	int32_t			b_size;
822  	uint64_t		b_spa;
823  
824  	/* L2ARC fields. Undefined when not in L2ARC. */
825  	l2arc_buf_hdr_t		b_l2hdr;
826  	/* L1ARC fields. Undefined when in l2arc_only state */
827  	l1arc_buf_hdr_t		b_l1hdr;
828  };
829  
830  static arc_buf_t *arc_eviction_list;
831  static arc_buf_hdr_t arc_eviction_hdr;
832  
833  #define	GHOST_STATE(state)	\
834  	((state) == arc_mru_ghost || (state) == arc_mfu_ghost ||	\
835  	(state) == arc_l2c_only)
836  
837  #define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
838  #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
839  #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_FLAG_IO_ERROR)
840  #define	HDR_PREFETCH(hdr)	((hdr)->b_flags & ARC_FLAG_PREFETCH)
841  #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
842  #define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
843  
844  #define	HDR_L2CACHE(hdr)	((hdr)->b_flags & ARC_FLAG_L2CACHE)
845  #define	HDR_L2COMPRESS(hdr)	((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
846  #define	HDR_L2_READING(hdr)	\
847  	    (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) &&	\
848  	    ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
849  #define	HDR_L2_WRITING(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITING)
850  #define	HDR_L2_EVICTED(hdr)	((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
851  #define	HDR_L2_WRITE_HEAD(hdr)	((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
852  
853  #define	HDR_ISTYPE_METADATA(hdr)	\
854  	    ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
855  #define	HDR_ISTYPE_DATA(hdr)	(!HDR_ISTYPE_METADATA(hdr))
856  
857  #define	HDR_HAS_L1HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
858  #define	HDR_HAS_L2HDR(hdr)	((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
859  
860  /*
861   * Other sizes
862   */
863  
864  #define	HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
865  #define	HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
866  
867  /*
868   * Hash table routines
869   */
870  
871  #define	HT_LOCK_PAD	64
872  
873  struct ht_lock {
874  	kmutex_t	ht_lock;
875  #ifdef _KERNEL
876  	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
877  #endif
878  };
879  
880  #define	BUF_LOCKS 256
881  typedef struct buf_hash_table {
882  	uint64_t ht_mask;
883  	arc_buf_hdr_t **ht_table;
884  	struct ht_lock ht_locks[BUF_LOCKS];
885  } buf_hash_table_t;
886  
887  static buf_hash_table_t buf_hash_table;
888  
889  #define	BUF_HASH_INDEX(spa, dva, birth) \
890  	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
891  #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
892  #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
893  #define	HDR_LOCK(hdr) \
894  	(BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
895  
896  uint64_t zfs_crc64_table[256];
897  
898  /*
899   * Level 2 ARC
900   */
901  
902  #define	L2ARC_WRITE_SIZE	(8 * 1024 * 1024)	/* initial write max */
903  #define	L2ARC_HEADROOM		2			/* num of writes */
904  /*
905   * If we discover during ARC scan any buffers to be compressed, we boost
906   * our headroom for the next scanning cycle by this percentage multiple.
907   */
908  #define	L2ARC_HEADROOM_BOOST	200
909  #define	L2ARC_FEED_SECS		1		/* caching interval secs */
910  #define	L2ARC_FEED_MIN_MS	200		/* min caching interval ms */
911  
912  /*
913   * Used to distinguish headers that are being process by
914   * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk
915   * address. This can happen when the header is added to the l2arc's list
916   * of buffers to write in the first stage of l2arc_write_buffers(), but
917   * has not yet been written out which happens in the second stage of
918   * l2arc_write_buffers().
919   */
920  #define	L2ARC_ADDR_UNSET	((uint64_t)(-1))
921  
922  #define	l2arc_writes_sent	ARCSTAT(arcstat_l2_writes_sent)
923  #define	l2arc_writes_done	ARCSTAT(arcstat_l2_writes_done)
924  
925  /* L2ARC Performance Tunables */
926  uint64_t l2arc_write_max = L2ARC_WRITE_SIZE;	/* default max write size */
927  uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE;	/* extra write during warmup */
928  uint64_t l2arc_headroom = L2ARC_HEADROOM;	/* number of dev writes */
929  uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
930  uint64_t l2arc_feed_secs = L2ARC_FEED_SECS;	/* interval seconds */
931  uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS;	/* min interval milliseconds */
932  boolean_t l2arc_noprefetch = B_TRUE;		/* don't cache prefetch bufs */
933  boolean_t l2arc_feed_again = B_TRUE;		/* turbo warmup */
934  boolean_t l2arc_norw = B_TRUE;			/* no reads during writes */
935  
936  static list_t L2ARC_dev_list;			/* device list */
937  static list_t *l2arc_dev_list;			/* device list pointer */
938  static kmutex_t l2arc_dev_mtx;			/* device list mutex */
939  static l2arc_dev_t *l2arc_dev_last;		/* last device used */
940  static list_t L2ARC_free_on_write;		/* free after write buf list */
941  static list_t *l2arc_free_on_write;		/* free after write list ptr */
942  static kmutex_t l2arc_free_on_write_mtx;	/* mutex for list */
943  static uint64_t l2arc_ndev;			/* number of devices */
944  
945  typedef struct l2arc_read_callback {
946  	arc_buf_t		*l2rcb_buf;		/* read buffer */
947  	spa_t			*l2rcb_spa;		/* spa */
948  	blkptr_t		l2rcb_bp;		/* original blkptr */
949  	zbookmark_phys_t	l2rcb_zb;		/* original bookmark */
950  	int			l2rcb_flags;		/* original flags */
951  	enum zio_compress	l2rcb_compress;		/* applied compress */
952  } l2arc_read_callback_t;
953  
954  typedef struct l2arc_write_callback {
955  	l2arc_dev_t	*l2wcb_dev;		/* device info */
956  	arc_buf_hdr_t	*l2wcb_head;		/* head of write buflist */
957  	list_t		l2wcb_log_blk_buflist;	/* in-flight log blocks */
958  } l2arc_write_callback_t;
959  
960  typedef struct l2arc_data_free {
961  	/* protected by l2arc_free_on_write_mtx */
962  	void		*l2df_data;
963  	size_t		l2df_size;
964  	void		(*l2df_func)(void *, size_t);
965  	list_node_t	l2df_list_node;
966  } l2arc_data_free_t;
967  
968  static kmutex_t l2arc_feed_thr_lock;
969  static kcondvar_t l2arc_feed_thr_cv;
970  static uint8_t l2arc_thread_exit;
971  
972  static void arc_get_data_buf(arc_buf_t *);
973  static void arc_access(arc_buf_hdr_t *, kmutex_t *);
974  static boolean_t arc_is_overflowing();
975  static void arc_buf_watch(arc_buf_t *);
976  static void l2arc_read_done(zio_t *zio);
977  static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
978  
979  static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
980  static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
981  static arc_buf_contents_t arc_flags_to_bufc(uint32_t);
982  
983  static boolean_t l2arc_write_eligible(uint64_t, uint64_t, arc_buf_hdr_t *);
984  static void l2arc_read_done(zio_t *);
985  
986  static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
987  static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
988  static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
989  
990  static void
arc_update_hit_stat(arc_buf_hdr_t * hdr,boolean_t hit)991  arc_update_hit_stat(arc_buf_hdr_t *hdr, boolean_t hit)
992  {
993  	boolean_t pf = !HDR_PREFETCH(hdr);
994  	switch (arc_buf_type(hdr)) {
995  	case ARC_BUFC_DATA:
996  		ARCSTAT_CONDSTAT(pf, demand, prefetch, hit, hits, misses, data);
997  		break;
998  	case ARC_BUFC_METADATA:
999  		ARCSTAT_CONDSTAT(pf, demand, prefetch, hit, hits, misses,
1000  		    metadata);
1001  		break;
1002  	default:
1003  		break;
1004  	}
1005  }
1006  
1007  enum {
1008  	L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0)	/* mirror of l2ad_first */
1009  };
1010  
1011  /*
1012   * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
1013   */
1014  typedef struct l2arc_log_blkptr {
1015  	uint64_t	lbp_daddr;	/* device address of log */
1016  	/*
1017  	 * lbp_prop is the same format as the blk_prop in blkptr_t:
1018  	 *	* logical size (in sectors)
1019  	 *	* physical (compressed) size (in sectors)
1020  	 *	* compression algorithm (we always LZ4-compress l2arc logs)
1021  	 *	* checksum algorithm (used for lbp_cksum)
1022  	 *	* object type & level (unused for now)
1023  	 */
1024  	uint64_t	lbp_prop;
1025  	zio_cksum_t	lbp_cksum;	/* fletcher4 of log */
1026  } l2arc_log_blkptr_t;
1027  
1028  /*
1029   * The persistent L2ARC device header.
1030   * Byte order of magic determines whether 64-bit bswap of fields is necessary.
1031   */
1032  typedef struct l2arc_dev_hdr_phys {
1033  	uint64_t	dh_magic;	/* L2ARC_DEV_HDR_MAGIC */
1034  	zio_cksum_t	dh_self_cksum;	/* fletcher4 of fields below */
1035  
1036  	/*
1037  	 * Global L2ARC device state and metadata.
1038  	 */
1039  	uint64_t	dh_spa_guid;
1040  	uint64_t	dh_alloc_space;		/* vdev space alloc status */
1041  	uint64_t	dh_flags;		/* l2arc_dev_hdr_flags_t */
1042  
1043  	/*
1044  	 * Start of log block chain. [0] -> newest log, [1] -> one older (used
1045  	 * for initiating prefetch).
1046  	 */
1047  	l2arc_log_blkptr_t	dh_start_lbps[2];
1048  
1049  	const uint64_t	dh_pad[44];		/* pad to 512 bytes */
1050  } l2arc_dev_hdr_phys_t;
1051  CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
1052  
1053  /*
1054   * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
1055   */
1056  typedef struct l2arc_log_ent_phys {
1057  	dva_t			le_dva;	/* dva of buffer */
1058  	uint64_t		le_birth;	/* birth txg of buffer */
1059  	zio_cksum_t		le_freeze_cksum;
1060  	/*
1061  	 * le_prop is the same format as the blk_prop in blkptr_t:
1062  	 *	* logical size (in sectors)
1063  	 *	* physical (compressed) size (in sectors)
1064  	 *	* compression algorithm
1065  	 *	* checksum algorithm (used for b_freeze_cksum)
1066  	 *	* object type & level (used to restore arc_buf_contents_t)
1067  	 */
1068  	uint64_t		le_prop;
1069  	uint64_t		le_daddr;	/* buf location on l2dev */
1070  	const uint64_t		le_pad[7];	/* resv'd for future use */
1071  } l2arc_log_ent_phys_t;
1072  
1073  /*
1074   * These design limits give us the following metadata overhead (before
1075   * compression):
1076   *	avg_blk_sz	overhead
1077   *	1k		12.51 %
1078   *	2k		 6.26 %
1079   *	4k		 3.13 %
1080   *	8k		 1.56 %
1081   *	16k		 0.78 %
1082   *	32k		 0.39 %
1083   *	64k		 0.20 %
1084   *	128k		 0.10 %
1085   * Compression should be able to sequeeze these down by about a factor of 2x.
1086   */
1087  #define	L2ARC_LOG_BLK_SIZE			(128 * 1024)	/* 128k */
1088  #define	L2ARC_LOG_BLK_HEADER_LEN		(128)
1089  #define	L2ARC_LOG_BLK_ENTRIES			/* 1023 entries */	\
1090  	((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) /		\
1091  	sizeof (l2arc_log_ent_phys_t))
1092  /*
1093   * Maximum amount of data in an l2arc log block (used to terminate rebuilding
1094   * before we hit the write head and restore potentially corrupted blocks).
1095   */
1096  #define	L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE	\
1097  	(SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
1098  /*
1099   * For the persistency and rebuild algorithms to operate reliably we need
1100   * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
1101   * excessive log block looping might confuse the log chain end detection).
1102   * Under normal circumstances this is not a problem, since this is somewhere
1103   * around only 400 MB.
1104   */
1105  #define	L2ARC_PERSIST_MIN_SIZE	(3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
1106  
1107  /*
1108   * A log block of up to 1023 ARC buffer log entries, chained into the
1109   * persistent L2ARC metadata linked list. Byte order of magic determines
1110   * whether 64-bit bswap of fields is necessary.
1111   */
1112  typedef struct l2arc_log_blk_phys {
1113  	/* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
1114  	uint64_t		lb_magic;	/* L2ARC_LOG_BLK_MAGIC */
1115  	l2arc_log_blkptr_t	lb_back2_lbp;	/* back 2 steps in chain */
1116  	uint64_t		lb_pad[9];	/* resv'd for future use */
1117  	/* Payload */
1118  	l2arc_log_ent_phys_t	lb_entries[L2ARC_LOG_BLK_ENTRIES];
1119  } l2arc_log_blk_phys_t;
1120  
1121  CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
1122  CTASSERT(offsetof(l2arc_log_blk_phys_t, lb_entries) -
1123      offsetof(l2arc_log_blk_phys_t, lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
1124  
1125  /*
1126   * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
1127   * written to the L2ARC device. They may be compressed, hence the uint8_t[].
1128   */
1129  typedef struct l2arc_log_blk_buf {
1130  	uint8_t		lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
1131  	list_node_t	lbb_node;
1132  } l2arc_log_blk_buf_t;
1133  
1134  /* Macros for the manipulation fields in the blk_prop format of blkptr_t */
1135  #define	BLKPROP_GET_LSIZE(_obj, _field)		\
1136  	BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
1137  #define	BLKPROP_SET_LSIZE(_obj, _field, x)	\
1138  	BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
1139  #define	BLKPROP_GET_PSIZE(_obj, _field)		\
1140  	BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
1141  #define	BLKPROP_SET_PSIZE(_obj, _field, x)	\
1142  	BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
1143  #define	BLKPROP_GET_COMPRESS(_obj, _field)	\
1144  	BF64_GET((_obj)->_field, 32, 8)
1145  #define	BLKPROP_SET_COMPRESS(_obj, _field, x)	\
1146  	BF64_SET((_obj)->_field, 32, 8, x)
1147  #define	BLKPROP_GET_CHECKSUM(_obj, _field)	\
1148  	BF64_GET((_obj)->_field, 40, 8)
1149  #define	BLKPROP_SET_CHECKSUM(_obj, _field, x)	\
1150  	BF64_SET((_obj)->_field, 40, 8, x)
1151  #define	BLKPROP_GET_TYPE(_obj, _field)		\
1152  	BF64_GET((_obj)->_field, 48, 8)
1153  #define	BLKPROP_SET_TYPE(_obj, _field, x)	\
1154  	BF64_SET((_obj)->_field, 48, 8, x)
1155  
1156  /* Macros for manipulating a l2arc_log_blkptr_t->lbp_prop field */
1157  #define	LBP_GET_LSIZE(_add)		BLKPROP_GET_LSIZE(_add, lbp_prop)
1158  #define	LBP_SET_LSIZE(_add, x)		BLKPROP_SET_LSIZE(_add, lbp_prop, x)
1159  #define	LBP_GET_PSIZE(_add)		BLKPROP_GET_PSIZE(_add, lbp_prop)
1160  #define	LBP_SET_PSIZE(_add, x)		BLKPROP_SET_PSIZE(_add, lbp_prop, x)
1161  #define	LBP_GET_COMPRESS(_add)		BLKPROP_GET_COMPRESS(_add, lbp_prop)
1162  #define	LBP_SET_COMPRESS(_add, x)	BLKPROP_SET_COMPRESS(_add, lbp_prop, \
1163      x)
1164  #define	LBP_GET_CHECKSUM(_add)		BLKPROP_GET_CHECKSUM(_add, lbp_prop)
1165  #define	LBP_SET_CHECKSUM(_add, x)	BLKPROP_SET_CHECKSUM(_add, lbp_prop, \
1166      x)
1167  #define	LBP_GET_TYPE(_add)		BLKPROP_GET_TYPE(_add, lbp_prop)
1168  #define	LBP_SET_TYPE(_add, x)		BLKPROP_SET_TYPE(_add, lbp_prop, x)
1169  
1170  /* Macros for manipulating a l2arc_log_ent_phys_t->le_prop field */
1171  #define	LE_GET_LSIZE(_le)	BLKPROP_GET_LSIZE(_le, le_prop)
1172  #define	LE_SET_LSIZE(_le, x)	BLKPROP_SET_LSIZE(_le, le_prop, x)
1173  #define	LE_GET_PSIZE(_le)	BLKPROP_GET_PSIZE(_le, le_prop)
1174  #define	LE_SET_PSIZE(_le, x)	BLKPROP_SET_PSIZE(_le, le_prop, x)
1175  #define	LE_GET_COMPRESS(_le)	BLKPROP_GET_COMPRESS(_le, le_prop)
1176  #define	LE_SET_COMPRESS(_le, x)	BLKPROP_SET_COMPRESS(_le, le_prop, x)
1177  #define	LE_GET_CHECKSUM(_le)	BLKPROP_GET_CHECKSUM(_le, le_prop)
1178  #define	LE_SET_CHECKSUM(_le, x)	BLKPROP_SET_CHECKSUM(_le, le_prop, x)
1179  #define	LE_GET_TYPE(_le)	BLKPROP_GET_TYPE(_le, le_prop)
1180  #define	LE_SET_TYPE(_le, x)	BLKPROP_SET_TYPE(_le, le_prop, x)
1181  
1182  #define	PTR_SWAP(x, y)		\
1183  	do {			\
1184  		void *tmp = (x);\
1185  		x = y;		\
1186  		y = tmp;	\
1187  		_NOTE(CONSTCOND)\
1188  	} while (0)
1189  
1190  #define	L2ARC_DEV_HDR_MAGIC	0x5a46534341434845LLU	/* ASCII: "ZFSCACHE" */
1191  #define	L2ARC_LOG_BLK_MAGIC	0x4c4f47424c4b4844LLU	/* ASCII: "LOGBLKHD" */
1192  
1193  /*
1194   * Performance tuning of L2ARC persistency:
1195   *
1196   * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at
1197   *		pool import or when adding one manually later) will attempt
1198   *		to rebuild L2ARC buffer contents. In special circumstances,
1199   *		the administrator may want to set this to B_FALSE, if they
1200   *		are having trouble importing a pool or attaching an L2ARC
1201   *		device (e.g. the L2ARC device is slow to read in stored log
1202   *		metadata, or the metadata has become somehow
1203   *		fragmented/unusable).
1204   */
1205  boolean_t l2arc_rebuild_enabled = B_TRUE;
1206  
1207  /* L2ARC persistency rebuild control routines. */
1208  static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
1209  static int l2arc_rebuild(l2arc_dev_t *dev);
1210  
1211  /* L2ARC persistency read I/O routines. */
1212  static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
1213  static int l2arc_log_blk_read(l2arc_dev_t *dev,
1214      const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
1215      l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
1216      uint8_t *this_lb_buf, uint8_t *next_lb_buf,
1217      zio_t *this_io, zio_t **next_io);
1218  static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
1219      const l2arc_log_blkptr_t *lp, uint8_t *lb_buf);
1220  static void l2arc_log_blk_prefetch_abort(zio_t *zio);
1221  
1222  /* L2ARC persistency block restoration routines. */
1223  static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
1224      const l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
1225  static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
1226      l2arc_dev_t *dev, uint64_t guid);
1227  
1228  /* L2ARC persistency write I/O routines. */
1229  static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
1230  static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1231      l2arc_write_callback_t *cb);
1232  
1233  /* L2ARC persistency auxilliary routines. */
1234  static boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
1235      const l2arc_log_blkptr_t *lp);
1236  static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
1237      zio_cksum_t *cksum);
1238  static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1239      const arc_buf_hdr_t *ab);
1240  static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
1241      uint64_t top, uint64_t check);
1242  
1243  /*
1244   * L2ARC Internals
1245   */
1246  struct l2arc_dev {
1247  	vdev_t			*l2ad_vdev;	/* vdev */
1248  	spa_t			*l2ad_spa;	/* spa */
1249  	uint64_t		l2ad_hand;	/* next write location */
1250  	uint64_t		l2ad_start;	/* first addr on device */
1251  	uint64_t		l2ad_end;	/* last addr on device */
1252  	boolean_t		l2ad_first;	/* first sweep through */
1253  	boolean_t		l2ad_writing;	/* currently writing */
1254  	kmutex_t		l2ad_mtx;	/* lock for buffer list */
1255  	list_t			l2ad_buflist;	/* buffer list */
1256  	list_node_t		l2ad_node;	/* device list node */
1257  	refcount_t		l2ad_alloc;	/* allocated bytes */
1258  	l2arc_dev_hdr_phys_t	*l2ad_dev_hdr;	/* persistent device header */
1259  	uint64_t		l2ad_dev_hdr_asize; /* aligned hdr size */
1260  	l2arc_log_blk_phys_t	l2ad_log_blk;	/* currently open log block */
1261  	int			l2ad_log_ent_idx; /* index into cur log blk */
1262  	/* number of bytes in current log block's payload */
1263  	uint64_t		l2ad_log_blk_payload_asize;
1264  	/* flag indicating whether a rebuild is scheduled or is going on */
1265  	boolean_t		l2ad_rebuild;
1266  	boolean_t		l2ad_rebuild_cancel;
1267  	kt_did_t		l2ad_rebuild_did;
1268  };
1269  
1270  static inline uint64_t
buf_hash(uint64_t spa,const dva_t * dva,uint64_t birth)1271  buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1272  {
1273  	uint8_t *vdva = (uint8_t *)dva;
1274  	uint64_t crc = -1ULL;
1275  	int i;
1276  
1277  	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1278  
1279  	for (i = 0; i < sizeof (dva_t); i++)
1280  		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1281  
1282  	crc ^= (spa>>8) ^ birth;
1283  
1284  	return (crc);
1285  }
1286  
1287  #define	BUF_EMPTY(buf)						\
1288  	((buf)->b_dva.dva_word[0] == 0 &&			\
1289  	(buf)->b_dva.dva_word[1] == 0)
1290  
1291  #define	BUF_EQUAL(spa, dva, birth, buf)				\
1292  	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
1293  	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
1294  	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
1295  
1296  static void
buf_discard_identity(arc_buf_hdr_t * hdr)1297  buf_discard_identity(arc_buf_hdr_t *hdr)
1298  {
1299  	hdr->b_dva.dva_word[0] = 0;
1300  	hdr->b_dva.dva_word[1] = 0;
1301  	hdr->b_birth = 0;
1302  }
1303  
1304  static arc_buf_hdr_t *
buf_hash_find(uint64_t spa,const blkptr_t * bp,kmutex_t ** lockp)1305  buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1306  {
1307  	const dva_t *dva = BP_IDENTITY(bp);
1308  	uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1309  	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1310  	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1311  	arc_buf_hdr_t *hdr;
1312  
1313  	mutex_enter(hash_lock);
1314  	for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1315  	    hdr = hdr->b_hash_next) {
1316  		if (BUF_EQUAL(spa, dva, birth, hdr)) {
1317  			*lockp = hash_lock;
1318  			return (hdr);
1319  		}
1320  	}
1321  	mutex_exit(hash_lock);
1322  	*lockp = NULL;
1323  	return (NULL);
1324  }
1325  
1326  /*
1327   * Insert an entry into the hash table.  If there is already an element
1328   * equal to elem in the hash table, then the already existing element
1329   * will be returned and the new element will not be inserted.
1330   * Otherwise returns NULL.
1331   * If lockp == NULL, the caller is assumed to already hold the hash lock.
1332   */
1333  static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t * hdr,kmutex_t ** lockp)1334  buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1335  {
1336  	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1337  	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1338  	arc_buf_hdr_t *fhdr;
1339  	uint32_t i;
1340  
1341  	ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1342  	ASSERT(hdr->b_birth != 0);
1343  	ASSERT(!HDR_IN_HASH_TABLE(hdr));
1344  
1345  	if (lockp != NULL) {
1346  		*lockp = hash_lock;
1347  		mutex_enter(hash_lock);
1348  	} else {
1349  		ASSERT(MUTEX_HELD(hash_lock));
1350  	}
1351  
1352  	for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1353  	    fhdr = fhdr->b_hash_next, i++) {
1354  		if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1355  			return (fhdr);
1356  	}
1357  
1358  	hdr->b_hash_next = buf_hash_table.ht_table[idx];
1359  	buf_hash_table.ht_table[idx] = hdr;
1360  	hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
1361  
1362  	/* collect some hash table performance data */
1363  	if (i > 0) {
1364  		ARCSTAT_BUMP(arcstat_hash_collisions);
1365  		if (i == 1)
1366  			ARCSTAT_BUMP(arcstat_hash_chains);
1367  
1368  		ARCSTAT_MAX(arcstat_hash_chain_max, i);
1369  	}
1370  
1371  	ARCSTAT_BUMP(arcstat_hash_elements);
1372  	ARCSTAT_MAXSTAT(arcstat_hash_elements);
1373  
1374  	return (NULL);
1375  }
1376  
1377  static void
buf_hash_remove(arc_buf_hdr_t * hdr)1378  buf_hash_remove(arc_buf_hdr_t *hdr)
1379  {
1380  	arc_buf_hdr_t *fhdr, **hdrp;
1381  	uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1382  
1383  	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1384  	ASSERT(HDR_IN_HASH_TABLE(hdr));
1385  
1386  	hdrp = &buf_hash_table.ht_table[idx];
1387  	while ((fhdr = *hdrp) != hdr) {
1388  		ASSERT(fhdr != NULL);
1389  		hdrp = &fhdr->b_hash_next;
1390  	}
1391  	*hdrp = hdr->b_hash_next;
1392  	hdr->b_hash_next = NULL;
1393  	hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1394  
1395  	/* collect some hash table performance data */
1396  	ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1397  
1398  	if (buf_hash_table.ht_table[idx] &&
1399  	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1400  		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1401  }
1402  
1403  /*
1404   * Global data structures and functions for the buf kmem cache.
1405   */
1406  static kmem_cache_t *hdr_full_cache;
1407  static kmem_cache_t *hdr_l2only_cache;
1408  static kmem_cache_t *buf_cache;
1409  
1410  static void
buf_fini(void)1411  buf_fini(void)
1412  {
1413  	int i;
1414  
1415  	kmem_free(buf_hash_table.ht_table,
1416  	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
1417  	for (i = 0; i < BUF_LOCKS; i++)
1418  		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1419  	kmem_cache_destroy(hdr_full_cache);
1420  	kmem_cache_destroy(hdr_l2only_cache);
1421  	kmem_cache_destroy(buf_cache);
1422  }
1423  
1424  /*
1425   * Constructor callback - called when the cache is empty
1426   * and a new buf is requested.
1427   */
1428  /* ARGSUSED */
1429  static int
hdr_full_cons(void * vbuf,void * unused,int kmflag)1430  hdr_full_cons(void *vbuf, void *unused, int kmflag)
1431  {
1432  	arc_buf_hdr_t *hdr = vbuf;
1433  
1434  	bzero(hdr, HDR_FULL_SIZE);
1435  	cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1436  	refcount_create(&hdr->b_l1hdr.b_refcnt);
1437  	mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1438  	multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1439  	arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1440  
1441  	return (0);
1442  }
1443  
1444  /* ARGSUSED */
1445  static int
hdr_l2only_cons(void * vbuf,void * unused,int kmflag)1446  hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1447  {
1448  	arc_buf_hdr_t *hdr = vbuf;
1449  
1450  	bzero(hdr, HDR_L2ONLY_SIZE);
1451  	arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1452  
1453  	return (0);
1454  }
1455  
1456  /* ARGSUSED */
1457  static int
buf_cons(void * vbuf,void * unused,int kmflag)1458  buf_cons(void *vbuf, void *unused, int kmflag)
1459  {
1460  	arc_buf_t *buf = vbuf;
1461  
1462  	bzero(buf, sizeof (arc_buf_t));
1463  	mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1464  	arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1465  
1466  	return (0);
1467  }
1468  
1469  /*
1470   * Destructor callback - called when a cached buf is
1471   * no longer required.
1472   */
1473  /* ARGSUSED */
1474  static void
hdr_full_dest(void * vbuf,void * unused)1475  hdr_full_dest(void *vbuf, void *unused)
1476  {
1477  	arc_buf_hdr_t *hdr = vbuf;
1478  
1479  	ASSERT(BUF_EMPTY(hdr));
1480  	cv_destroy(&hdr->b_l1hdr.b_cv);
1481  	refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1482  	mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1483  	ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1484  	arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1485  }
1486  
1487  /* ARGSUSED */
1488  static void
hdr_l2only_dest(void * vbuf,void * unused)1489  hdr_l2only_dest(void *vbuf, void *unused)
1490  {
1491  	arc_buf_hdr_t *hdr = vbuf;
1492  
1493  	ASSERT(BUF_EMPTY(hdr));
1494  	arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1495  }
1496  
1497  /* ARGSUSED */
1498  static void
buf_dest(void * vbuf,void * unused)1499  buf_dest(void *vbuf, void *unused)
1500  {
1501  	arc_buf_t *buf = vbuf;
1502  
1503  	mutex_destroy(&buf->b_evict_lock);
1504  	arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1505  }
1506  
1507  /*
1508   * Reclaim callback -- invoked when memory is low.
1509   */
1510  /* ARGSUSED */
1511  static void
hdr_recl(void * unused)1512  hdr_recl(void *unused)
1513  {
1514  	dprintf("hdr_recl called\n");
1515  	/*
1516  	 * umem calls the reclaim func when we destroy the buf cache,
1517  	 * which is after we do arc_fini().
1518  	 */
1519  	if (!arc_dead)
1520  		cv_signal(&arc_reclaim_thread_cv);
1521  }
1522  
1523  static void
buf_init(void)1524  buf_init(void)
1525  {
1526  	uint64_t *ct;
1527  	uint64_t hsize = 1ULL << 12;
1528  	int i, j;
1529  
1530  	/*
1531  	 * The hash table is big enough to fill all of physical memory
1532  	 * with an average block size of zfs_arc_average_blocksize (default 8K).
1533  	 * By default, the table will take up
1534  	 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1535  	 */
1536  	while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1537  		hsize <<= 1;
1538  retry:
1539  	buf_hash_table.ht_mask = hsize - 1;
1540  	buf_hash_table.ht_table =
1541  	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1542  	if (buf_hash_table.ht_table == NULL) {
1543  		ASSERT(hsize > (1ULL << 8));
1544  		hsize >>= 1;
1545  		goto retry;
1546  	}
1547  
1548  	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1549  	    0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1550  	hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1551  	    HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1552  	    NULL, NULL, 0);
1553  	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1554  	    0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1555  
1556  	for (i = 0; i < 256; i++)
1557  		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1558  			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1559  
1560  	for (i = 0; i < BUF_LOCKS; i++) {
1561  		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1562  		    NULL, MUTEX_DEFAULT, NULL);
1563  	}
1564  }
1565  
1566  /*
1567   * Transition between the two allocation states for the arc_buf_hdr struct.
1568   * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1569   * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1570   * version is used when a cache buffer is only in the L2ARC in order to reduce
1571   * memory usage.
1572   */
1573  static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t * hdr,kmem_cache_t * old,kmem_cache_t * new)1574  arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1575  {
1576  	ASSERT(HDR_HAS_L2HDR(hdr));
1577  
1578  	arc_buf_hdr_t *nhdr;
1579  	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1580  
1581  	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1582  	    (old == hdr_l2only_cache && new == hdr_full_cache));
1583  
1584  	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1585  
1586  	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1587  	buf_hash_remove(hdr);
1588  
1589  	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1590  
1591  	if (new == hdr_full_cache) {
1592  		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1593  		/*
1594  		 * arc_access and arc_change_state need to be aware that a
1595  		 * header has just come out of L2ARC, so we set its state to
1596  		 * l2c_only even though it's about to change.
1597  		 */
1598  		nhdr->b_l1hdr.b_state = arc_l2c_only;
1599  
1600  		/* Verify previous threads set to NULL before freeing */
1601  		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1602  	} else {
1603  		ASSERT(hdr->b_l1hdr.b_buf == NULL);
1604  		ASSERT0(hdr->b_l1hdr.b_datacnt);
1605  
1606  		/*
1607  		 * If we've reached here, We must have been called from
1608  		 * arc_evict_hdr(), as such we should have already been
1609  		 * removed from any ghost list we were previously on
1610  		 * (which protects us from racing with arc_evict_state),
1611  		 * thus no locking is needed during this check.
1612  		 */
1613  		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1614  
1615  		/*
1616  		 * A buffer must not be moved into the arc_l2c_only
1617  		 * state if it's not finished being written out to the
1618  		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1619  		 * might try to be accessed, even though it was removed.
1620  		 */
1621  		VERIFY(!HDR_L2_WRITING(hdr));
1622  		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1623  
1624  #ifdef ZFS_DEBUG
1625  		if (hdr->b_l1hdr.b_thawed != NULL) {
1626  			kmem_free(hdr->b_l1hdr.b_thawed, 1);
1627  			hdr->b_l1hdr.b_thawed = NULL;
1628  		}
1629  #endif
1630  
1631  		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1632  	}
1633  	/*
1634  	 * The header has been reallocated so we need to re-insert it into any
1635  	 * lists it was on.
1636  	 */
1637  	(void) buf_hash_insert(nhdr, NULL);
1638  
1639  	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1640  
1641  	mutex_enter(&dev->l2ad_mtx);
1642  
1643  	/*
1644  	 * We must place the realloc'ed header back into the list at
1645  	 * the same spot. Otherwise, if it's placed earlier in the list,
1646  	 * l2arc_write_buffers() could find it during the function's
1647  	 * write phase, and try to write it out to the l2arc.
1648  	 */
1649  	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1650  	list_remove(&dev->l2ad_buflist, hdr);
1651  
1652  	mutex_exit(&dev->l2ad_mtx);
1653  
1654  	/*
1655  	 * Since we're using the pointer address as the tag when
1656  	 * incrementing and decrementing the l2ad_alloc refcount, we
1657  	 * must remove the old pointer (that we're about to destroy) and
1658  	 * add the new pointer to the refcount. Otherwise we'd remove
1659  	 * the wrong pointer address when calling arc_hdr_destroy() later.
1660  	 */
1661  
1662  	(void) refcount_remove_many(&dev->l2ad_alloc,
1663  	    hdr->b_l2hdr.b_asize, hdr);
1664  
1665  	(void) refcount_add_many(&dev->l2ad_alloc,
1666  	    nhdr->b_l2hdr.b_asize, nhdr);
1667  
1668  	buf_discard_identity(hdr);
1669  	hdr->b_freeze_cksum = NULL;
1670  	kmem_cache_free(old, hdr);
1671  
1672  	return (nhdr);
1673  }
1674  
1675  
1676  #define	ARC_MINTIME	(hz>>4) /* 62 ms */
1677  
1678  static void
arc_cksum_verify(arc_buf_t * buf)1679  arc_cksum_verify(arc_buf_t *buf)
1680  {
1681  	zio_cksum_t zc;
1682  
1683  	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1684  		return;
1685  
1686  	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1687  	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1688  		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1689  		return;
1690  	}
1691  	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
1692  	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1693  		panic("buffer modified while frozen!");
1694  	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1695  }
1696  
1697  static int
arc_cksum_equal(arc_buf_t * buf)1698  arc_cksum_equal(arc_buf_t *buf)
1699  {
1700  	zio_cksum_t zc;
1701  	int equal;
1702  
1703  	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1704  	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
1705  	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1706  	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1707  
1708  	return (equal);
1709  }
1710  
1711  static void
arc_cksum_compute(arc_buf_t * buf,boolean_t force)1712  arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1713  {
1714  	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1715  		return;
1716  
1717  	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1718  	if (buf->b_hdr->b_freeze_cksum != NULL) {
1719  		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1720  		return;
1721  	}
1722  	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1723  	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1724  	    NULL, buf->b_hdr->b_freeze_cksum);
1725  	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1726  	arc_buf_watch(buf);
1727  }
1728  
1729  #ifndef _KERNEL
1730  typedef struct procctl {
1731  	long cmd;
1732  	prwatch_t prwatch;
1733  } procctl_t;
1734  #endif
1735  
1736  /* ARGSUSED */
1737  static void
arc_buf_unwatch(arc_buf_t * buf)1738  arc_buf_unwatch(arc_buf_t *buf)
1739  {
1740  #ifndef _KERNEL
1741  	if (arc_watch) {
1742  		int result;
1743  		procctl_t ctl;
1744  		ctl.cmd = PCWATCH;
1745  		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1746  		ctl.prwatch.pr_size = 0;
1747  		ctl.prwatch.pr_wflags = 0;
1748  		result = write(arc_procfd, &ctl, sizeof (ctl));
1749  		ASSERT3U(result, ==, sizeof (ctl));
1750  	}
1751  #endif
1752  }
1753  
1754  /* ARGSUSED */
1755  static void
arc_buf_watch(arc_buf_t * buf)1756  arc_buf_watch(arc_buf_t *buf)
1757  {
1758  #ifndef _KERNEL
1759  	if (arc_watch) {
1760  		int result;
1761  		procctl_t ctl;
1762  		ctl.cmd = PCWATCH;
1763  		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1764  		ctl.prwatch.pr_size = buf->b_hdr->b_size;
1765  		ctl.prwatch.pr_wflags = WA_WRITE;
1766  		result = write(arc_procfd, &ctl, sizeof (ctl));
1767  		ASSERT3U(result, ==, sizeof (ctl));
1768  	}
1769  #endif
1770  }
1771  
1772  static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t * hdr)1773  arc_buf_type(arc_buf_hdr_t *hdr)
1774  {
1775  	if (HDR_ISTYPE_METADATA(hdr)) {
1776  		return (ARC_BUFC_METADATA);
1777  	} else {
1778  		return (ARC_BUFC_DATA);
1779  	}
1780  }
1781  
1782  static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)1783  arc_bufc_to_flags(arc_buf_contents_t type)
1784  {
1785  	switch (type) {
1786  	case ARC_BUFC_DATA:
1787  		/* metadata field is 0 if buffer contains normal data */
1788  		return (0);
1789  	case ARC_BUFC_METADATA:
1790  		return (ARC_FLAG_BUFC_METADATA);
1791  	default:
1792  		break;
1793  	}
1794  	panic("undefined ARC buffer type!");
1795  	return ((uint32_t)-1);
1796  }
1797  
1798  static arc_buf_contents_t
arc_flags_to_bufc(uint32_t flags)1799  arc_flags_to_bufc(uint32_t flags)
1800  {
1801  	if (flags & ARC_FLAG_BUFC_METADATA)
1802  		return (ARC_BUFC_METADATA);
1803  	return (ARC_BUFC_DATA);
1804  }
1805  
1806  void
arc_buf_thaw(arc_buf_t * buf)1807  arc_buf_thaw(arc_buf_t *buf)
1808  {
1809  	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1810  		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1811  			panic("modifying non-anon buffer!");
1812  		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1813  			panic("modifying buffer while i/o in progress!");
1814  		arc_cksum_verify(buf);
1815  	}
1816  
1817  	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1818  	if (buf->b_hdr->b_freeze_cksum != NULL) {
1819  		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1820  		buf->b_hdr->b_freeze_cksum = NULL;
1821  	}
1822  
1823  #ifdef ZFS_DEBUG
1824  	if (zfs_flags & ZFS_DEBUG_MODIFY) {
1825  		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
1826  			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
1827  		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1828  	}
1829  #endif
1830  
1831  	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1832  
1833  	arc_buf_unwatch(buf);
1834  }
1835  
1836  void
arc_buf_freeze(arc_buf_t * buf)1837  arc_buf_freeze(arc_buf_t *buf)
1838  {
1839  	kmutex_t *hash_lock;
1840  
1841  	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1842  		return;
1843  
1844  	hash_lock = HDR_LOCK(buf->b_hdr);
1845  	mutex_enter(hash_lock);
1846  
1847  	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1848  	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
1849  	arc_cksum_compute(buf, B_FALSE);
1850  	mutex_exit(hash_lock);
1851  
1852  }
1853  
1854  static void
add_reference(arc_buf_hdr_t * hdr,kmutex_t * hash_lock,void * tag)1855  add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1856  {
1857  	ASSERT(HDR_HAS_L1HDR(hdr));
1858  	ASSERT(MUTEX_HELD(hash_lock));
1859  	arc_state_t *state = hdr->b_l1hdr.b_state;
1860  
1861  	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1862  	    (state != arc_anon)) {
1863  		/* We don't use the L2-only state list. */
1864  		if (state != arc_l2c_only) {
1865  			arc_buf_contents_t type = arc_buf_type(hdr);
1866  			uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1867  			multilist_t *list = &state->arcs_list[type];
1868  			uint64_t *size = &state->arcs_lsize[type];
1869  
1870  			multilist_remove(list, hdr);
1871  
1872  			if (GHOST_STATE(state)) {
1873  				ASSERT0(hdr->b_l1hdr.b_datacnt);
1874  				ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1875  				delta = hdr->b_size;
1876  			}
1877  			ASSERT(delta > 0);
1878  			ASSERT3U(*size, >=, delta);
1879  			atomic_add_64(size, -delta);
1880  		}
1881  		/* remove the prefetch flag if we get a reference */
1882  		hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1883  	}
1884  }
1885  
1886  static int
remove_reference(arc_buf_hdr_t * hdr,kmutex_t * hash_lock,void * tag)1887  remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1888  {
1889  	int cnt;
1890  	arc_state_t *state = hdr->b_l1hdr.b_state;
1891  
1892  	ASSERT(HDR_HAS_L1HDR(hdr));
1893  	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1894  	ASSERT(!GHOST_STATE(state));
1895  
1896  	/*
1897  	 * arc_l2c_only counts as a ghost state so we don't need to explicitly
1898  	 * check to prevent usage of the arc_l2c_only list.
1899  	 */
1900  	if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1901  	    (state != arc_anon)) {
1902  		arc_buf_contents_t type = arc_buf_type(hdr);
1903  		multilist_t *list = &state->arcs_list[type];
1904  		uint64_t *size = &state->arcs_lsize[type];
1905  
1906  		multilist_insert(list, hdr);
1907  
1908  		ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1909  		atomic_add_64(size, hdr->b_size *
1910  		    hdr->b_l1hdr.b_datacnt);
1911  	}
1912  	return (cnt);
1913  }
1914  
1915  /*
1916   * Move the supplied buffer to the indicated state. The hash lock
1917   * for the buffer must be held by the caller.
1918   */
1919  static void
arc_change_state(arc_state_t * new_state,arc_buf_hdr_t * hdr,kmutex_t * hash_lock)1920  arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1921      kmutex_t *hash_lock)
1922  {
1923  	arc_state_t *old_state;
1924  	int64_t refcnt;
1925  	uint32_t datacnt;
1926  	uint64_t from_delta, to_delta;
1927  	arc_buf_contents_t buftype = arc_buf_type(hdr);
1928  
1929  	/*
1930  	 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1931  	 * in arc_read() when bringing a buffer out of the L2ARC.  However, the
1932  	 * L1 hdr doesn't always exist when we change state to arc_anon before
1933  	 * destroying a header, in which case reallocating to add the L1 hdr is
1934  	 * pointless.
1935  	 */
1936  	if (HDR_HAS_L1HDR(hdr)) {
1937  		old_state = hdr->b_l1hdr.b_state;
1938  		refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1939  		datacnt = hdr->b_l1hdr.b_datacnt;
1940  	} else {
1941  		old_state = arc_l2c_only;
1942  		refcnt = 0;
1943  		datacnt = 0;
1944  	}
1945  
1946  	ASSERT(MUTEX_HELD(hash_lock));
1947  	ASSERT3P(new_state, !=, old_state);
1948  	ASSERT(refcnt == 0 || datacnt > 0);
1949  	ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1950  	ASSERT(old_state != arc_anon || datacnt <= 1);
1951  
1952  	from_delta = to_delta = datacnt * hdr->b_size;
1953  
1954  	/*
1955  	 * If this buffer is evictable, transfer it from the
1956  	 * old state list to the new state list.
1957  	 */
1958  	if (refcnt == 0) {
1959  		if (old_state != arc_anon && old_state != arc_l2c_only) {
1960  			uint64_t *size = &old_state->arcs_lsize[buftype];
1961  
1962  			ASSERT(HDR_HAS_L1HDR(hdr));
1963  			multilist_remove(&old_state->arcs_list[buftype], hdr);
1964  
1965  			/*
1966  			 * If prefetching out of the ghost cache,
1967  			 * we will have a non-zero datacnt.
1968  			 */
1969  			if (GHOST_STATE(old_state) && datacnt == 0) {
1970  				/* ghost elements have a ghost size */
1971  				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1972  				from_delta = hdr->b_size;
1973  			}
1974  			ASSERT3U(*size, >=, from_delta);
1975  			atomic_add_64(size, -from_delta);
1976  		}
1977  		if (new_state != arc_anon && new_state != arc_l2c_only) {
1978  			uint64_t *size = &new_state->arcs_lsize[buftype];
1979  
1980  			/*
1981  			 * An L1 header always exists here, since if we're
1982  			 * moving to some L1-cached state (i.e. not l2c_only or
1983  			 * anonymous), we realloc the header to add an L1hdr
1984  			 * beforehand.
1985  			 */
1986  			ASSERT(HDR_HAS_L1HDR(hdr));
1987  			multilist_insert(&new_state->arcs_list[buftype], hdr);
1988  
1989  			/* ghost elements have a ghost size */
1990  			if (GHOST_STATE(new_state)) {
1991  				ASSERT0(datacnt);
1992  				ASSERT(hdr->b_l1hdr.b_buf == NULL);
1993  				to_delta = hdr->b_size;
1994  			}
1995  			atomic_add_64(size, to_delta);
1996  		}
1997  	}
1998  
1999  	ASSERT(!BUF_EMPTY(hdr));
2000  	if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
2001  		buf_hash_remove(hdr);
2002  
2003  	/* adjust state sizes (ignore arc_l2c_only) */
2004  
2005  	if (to_delta && new_state != arc_l2c_only) {
2006  		ASSERT(HDR_HAS_L1HDR(hdr));
2007  		if (GHOST_STATE(new_state)) {
2008  			ASSERT0(datacnt);
2009  
2010  			/*
2011  			 * We moving a header to a ghost state, we first
2012  			 * remove all arc buffers. Thus, we'll have a
2013  			 * datacnt of zero, and no arc buffer to use for
2014  			 * the reference. As a result, we use the arc
2015  			 * header pointer for the reference.
2016  			 */
2017  			(void) refcount_add_many(&new_state->arcs_size,
2018  			    hdr->b_size, hdr);
2019  		} else {
2020  			ASSERT3U(datacnt, !=, 0);
2021  
2022  			/*
2023  			 * Each individual buffer holds a unique reference,
2024  			 * thus we must remove each of these references one
2025  			 * at a time.
2026  			 */
2027  			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2028  			    buf = buf->b_next) {
2029  				(void) refcount_add_many(&new_state->arcs_size,
2030  				    hdr->b_size, buf);
2031  			}
2032  		}
2033  	}
2034  
2035  	if (from_delta && old_state != arc_l2c_only) {
2036  		ASSERT(HDR_HAS_L1HDR(hdr));
2037  		if (GHOST_STATE(old_state)) {
2038  			/*
2039  			 * When moving a header off of a ghost state,
2040  			 * there's the possibility for datacnt to be
2041  			 * non-zero. This is because we first add the
2042  			 * arc buffer to the header prior to changing
2043  			 * the header's state. Since we used the header
2044  			 * for the reference when putting the header on
2045  			 * the ghost state, we must balance that and use
2046  			 * the header when removing off the ghost state
2047  			 * (even though datacnt is non zero).
2048  			 */
2049  
2050  			IMPLY(datacnt == 0, new_state == arc_anon ||
2051  			    new_state == arc_l2c_only);
2052  
2053  			(void) refcount_remove_many(&old_state->arcs_size,
2054  			    hdr->b_size, hdr);
2055  		} else {
2056  			ASSERT3P(datacnt, !=, 0);
2057  
2058  			/*
2059  			 * Each individual buffer holds a unique reference,
2060  			 * thus we must remove each of these references one
2061  			 * at a time.
2062  			 */
2063  			for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2064  			    buf = buf->b_next) {
2065  				(void) refcount_remove_many(
2066  				    &old_state->arcs_size, hdr->b_size, buf);
2067  			}
2068  		}
2069  	}
2070  
2071  	if (HDR_HAS_L1HDR(hdr))
2072  		hdr->b_l1hdr.b_state = new_state;
2073  
2074  	/*
2075  	 * L2 headers should never be on the L2 state list since they don't
2076  	 * have L1 headers allocated.
2077  	 */
2078  	ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
2079  	    multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
2080  }
2081  
2082  void
arc_space_consume(uint64_t space,arc_space_type_t type)2083  arc_space_consume(uint64_t space, arc_space_type_t type)
2084  {
2085  	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2086  
2087  	switch (type) {
2088  	case ARC_SPACE_DATA:
2089  		ARCSTAT_INCR(arcstat_data_size, space);
2090  		break;
2091  	case ARC_SPACE_META:
2092  		ARCSTAT_INCR(arcstat_metadata_size, space);
2093  		break;
2094  	case ARC_SPACE_OTHER:
2095  		ARCSTAT_INCR(arcstat_other_size, space);
2096  		break;
2097  	case ARC_SPACE_HDRS:
2098  		ARCSTAT_INCR(arcstat_hdr_size, space);
2099  		break;
2100  	case ARC_SPACE_L2HDRS:
2101  		ARCSTAT_INCR(arcstat_l2_hdr_size, space);
2102  		break;
2103  	}
2104  
2105  	if (type != ARC_SPACE_DATA)
2106  		ARCSTAT_INCR(arcstat_meta_used, space);
2107  
2108  	atomic_add_64(&arc_size, space);
2109  }
2110  
2111  void
arc_space_return(uint64_t space,arc_space_type_t type)2112  arc_space_return(uint64_t space, arc_space_type_t type)
2113  {
2114  	ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2115  
2116  	switch (type) {
2117  	case ARC_SPACE_DATA:
2118  		ARCSTAT_INCR(arcstat_data_size, -space);
2119  		break;
2120  	case ARC_SPACE_META:
2121  		ARCSTAT_INCR(arcstat_metadata_size, -space);
2122  		break;
2123  	case ARC_SPACE_OTHER:
2124  		ARCSTAT_INCR(arcstat_other_size, -space);
2125  		break;
2126  	case ARC_SPACE_HDRS:
2127  		ARCSTAT_INCR(arcstat_hdr_size, -space);
2128  		break;
2129  	case ARC_SPACE_L2HDRS:
2130  		ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
2131  		break;
2132  	}
2133  
2134  	if (type != ARC_SPACE_DATA) {
2135  		ASSERT(arc_meta_used >= space);
2136  		if (arc_meta_max < arc_meta_used)
2137  			arc_meta_max = arc_meta_used;
2138  		ARCSTAT_INCR(arcstat_meta_used, -space);
2139  	}
2140  
2141  	ASSERT(arc_size >= space);
2142  	atomic_add_64(&arc_size, -space);
2143  }
2144  
2145  arc_buf_t *
arc_buf_alloc(spa_t * spa,int32_t size,void * tag,arc_buf_contents_t type)2146  arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
2147  {
2148  	arc_buf_hdr_t *hdr;
2149  	arc_buf_t *buf;
2150  
2151  	ASSERT3U(size, >, 0);
2152  	hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
2153  	ASSERT(BUF_EMPTY(hdr));
2154  	ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
2155  	hdr->b_size = size;
2156  	hdr->b_spa = spa_load_guid(spa);
2157  
2158  	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2159  	buf->b_hdr = hdr;
2160  	buf->b_data = NULL;
2161  	buf->b_efunc = NULL;
2162  	buf->b_private = NULL;
2163  	buf->b_next = NULL;
2164  
2165  	hdr->b_flags = arc_bufc_to_flags(type);
2166  	hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
2167  
2168  	hdr->b_l1hdr.b_buf = buf;
2169  	hdr->b_l1hdr.b_state = arc_anon;
2170  	hdr->b_l1hdr.b_arc_access = 0;
2171  	hdr->b_l1hdr.b_datacnt = 1;
2172  	hdr->b_l1hdr.b_tmp_cdata = NULL;
2173  
2174  	arc_get_data_buf(buf);
2175  	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2176  	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2177  
2178  	return (buf);
2179  }
2180  
2181  /*
2182   * Allocates an ARC buf header that's in an evicted & L2-cached state.
2183   * This is used during l2arc reconstruction to make empty ARC buffers
2184   * which circumvent the regular disk->arc->l2arc path and instead come
2185   * into being in the reverse order, i.e. l2arc->arc.
2186   */
2187  arc_buf_hdr_t *
arc_buf_alloc_l2only(uint64_t load_guid,int size,arc_buf_contents_t type,l2arc_dev_t * dev,dva_t dva,uint64_t daddr,int32_t asize,uint64_t birth,zio_cksum_t cksum,enum zio_compress compress)2188  arc_buf_alloc_l2only(uint64_t load_guid, int size, arc_buf_contents_t type,
2189      l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t asize, uint64_t birth,
2190      zio_cksum_t cksum, enum zio_compress compress)
2191  {
2192  	arc_buf_hdr_t *hdr;
2193  
2194  	ASSERT3U(size, >, 0);
2195  	hdr = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
2196  	ASSERT(BUF_EMPTY(hdr));
2197  	ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
2198  	hdr->b_dva = dva;
2199  	hdr->b_birth = birth;
2200  	hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
2201  	bcopy(&cksum, hdr->b_freeze_cksum, sizeof (cksum));
2202  	hdr->b_flags = arc_bufc_to_flags(type);
2203  	hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
2204  	hdr->b_size = size;
2205  	hdr->b_spa = load_guid;
2206  
2207  	hdr->b_l2hdr.b_compress = compress;
2208  	hdr->b_l2hdr.b_dev = dev;
2209  	hdr->b_l2hdr.b_daddr = daddr;
2210  	hdr->b_l2hdr.b_asize = asize;
2211  
2212  	return (hdr);
2213  }
2214  
2215  static char *arc_onloan_tag = "onloan";
2216  
2217  /*
2218   * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2219   * flight data by arc_tempreserve_space() until they are "returned". Loaned
2220   * buffers must be returned to the arc before they can be used by the DMU or
2221   * freed.
2222   */
2223  arc_buf_t *
arc_loan_buf(spa_t * spa,int size)2224  arc_loan_buf(spa_t *spa, int size)
2225  {
2226  	arc_buf_t *buf;
2227  
2228  	buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
2229  
2230  	atomic_add_64(&arc_loaned_bytes, size);
2231  	return (buf);
2232  }
2233  
2234  /*
2235   * Return a loaned arc buffer to the arc.
2236   */
2237  void
arc_return_buf(arc_buf_t * buf,void * tag)2238  arc_return_buf(arc_buf_t *buf, void *tag)
2239  {
2240  	arc_buf_hdr_t *hdr = buf->b_hdr;
2241  
2242  	ASSERT(buf->b_data != NULL);
2243  	ASSERT(HDR_HAS_L1HDR(hdr));
2244  	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2245  	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2246  
2247  	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
2248  }
2249  
2250  /* Detach an arc_buf from a dbuf (tag) */
2251  void
arc_loan_inuse_buf(arc_buf_t * buf,void * tag)2252  arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2253  {
2254  	arc_buf_hdr_t *hdr = buf->b_hdr;
2255  
2256  	ASSERT(buf->b_data != NULL);
2257  	ASSERT(HDR_HAS_L1HDR(hdr));
2258  	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2259  	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2260  	buf->b_efunc = NULL;
2261  	buf->b_private = NULL;
2262  
2263  	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
2264  }
2265  
2266  static arc_buf_t *
arc_buf_clone(arc_buf_t * from)2267  arc_buf_clone(arc_buf_t *from)
2268  {
2269  	arc_buf_t *buf;
2270  	arc_buf_hdr_t *hdr = from->b_hdr;
2271  	uint64_t size = hdr->b_size;
2272  
2273  	ASSERT(HDR_HAS_L1HDR(hdr));
2274  	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2275  
2276  	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2277  	buf->b_hdr = hdr;
2278  	buf->b_data = NULL;
2279  	buf->b_efunc = NULL;
2280  	buf->b_private = NULL;
2281  	buf->b_next = hdr->b_l1hdr.b_buf;
2282  	hdr->b_l1hdr.b_buf = buf;
2283  	arc_get_data_buf(buf);
2284  	bcopy(from->b_data, buf->b_data, size);
2285  
2286  	/*
2287  	 * This buffer already exists in the arc so create a duplicate
2288  	 * copy for the caller.  If the buffer is associated with user data
2289  	 * then track the size and number of duplicates.  These stats will be
2290  	 * updated as duplicate buffers are created and destroyed.
2291  	 */
2292  	if (HDR_ISTYPE_DATA(hdr)) {
2293  		ARCSTAT_BUMP(arcstat_duplicate_buffers);
2294  		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
2295  	}
2296  	hdr->b_l1hdr.b_datacnt += 1;
2297  	return (buf);
2298  }
2299  
2300  void
arc_buf_add_ref(arc_buf_t * buf,void * tag)2301  arc_buf_add_ref(arc_buf_t *buf, void* tag)
2302  {
2303  	arc_buf_hdr_t *hdr;
2304  	kmutex_t *hash_lock;
2305  
2306  	/*
2307  	 * Check to see if this buffer is evicted.  Callers
2308  	 * must verify b_data != NULL to know if the add_ref
2309  	 * was successful.
2310  	 */
2311  	mutex_enter(&buf->b_evict_lock);
2312  	if (buf->b_data == NULL) {
2313  		mutex_exit(&buf->b_evict_lock);
2314  		return;
2315  	}
2316  	hash_lock = HDR_LOCK(buf->b_hdr);
2317  	mutex_enter(hash_lock);
2318  	hdr = buf->b_hdr;
2319  	ASSERT(HDR_HAS_L1HDR(hdr));
2320  	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2321  	mutex_exit(&buf->b_evict_lock);
2322  
2323  	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
2324  	    hdr->b_l1hdr.b_state == arc_mfu);
2325  
2326  	add_reference(hdr, hash_lock, tag);
2327  	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2328  	arc_access(hdr, hash_lock);
2329  	mutex_exit(hash_lock);
2330  	ARCSTAT_BUMP(arcstat_hits);
2331  	arc_update_hit_stat(hdr, B_TRUE);
2332  }
2333  
2334  static void
arc_buf_free_on_write(void * data,size_t size,void (* free_func)(void *,size_t))2335  arc_buf_free_on_write(void *data, size_t size,
2336      void (*free_func)(void *, size_t))
2337  {
2338  	l2arc_data_free_t *df;
2339  
2340  	df = kmem_alloc(sizeof (*df), KM_SLEEP);
2341  	df->l2df_data = data;
2342  	df->l2df_size = size;
2343  	df->l2df_func = free_func;
2344  	mutex_enter(&l2arc_free_on_write_mtx);
2345  	list_insert_head(l2arc_free_on_write, df);
2346  	mutex_exit(&l2arc_free_on_write_mtx);
2347  }
2348  
2349  /*
2350   * Free the arc data buffer.  If it is an l2arc write in progress,
2351   * the buffer is placed on l2arc_free_on_write to be freed later.
2352   */
2353  static void
arc_buf_data_free(arc_buf_t * buf,void (* free_func)(void *,size_t))2354  arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
2355  {
2356  	arc_buf_hdr_t *hdr = buf->b_hdr;
2357  
2358  	if (HDR_L2_WRITING(hdr)) {
2359  		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
2360  		ARCSTAT_BUMP(arcstat_l2_free_on_write);
2361  	} else {
2362  		free_func(buf->b_data, hdr->b_size);
2363  	}
2364  }
2365  
2366  static void
arc_buf_l2_cdata_free(arc_buf_hdr_t * hdr)2367  arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
2368  {
2369  	ASSERT(HDR_HAS_L2HDR(hdr));
2370  	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
2371  
2372  	/*
2373  	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
2374  	 * that doesn't exist, the header is in the arc_l2c_only state,
2375  	 * and there isn't anything to free (it's already been freed).
2376  	 */
2377  	if (!HDR_HAS_L1HDR(hdr))
2378  		return;
2379  
2380  	/*
2381  	 * The header isn't being written to the l2arc device, thus it
2382  	 * shouldn't have a b_tmp_cdata to free.
2383  	 */
2384  	if (!HDR_L2_WRITING(hdr)) {
2385  		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2386  		return;
2387  	}
2388  
2389  	/*
2390  	 * The header does not have compression enabled. This can be due
2391  	 * to the buffer not being compressible, or because we're
2392  	 * freeing the buffer before the second phase of
2393  	 * l2arc_write_buffer() has started (which does the compression
2394  	 * step). In either case, b_tmp_cdata does not point to a
2395  	 * separately compressed buffer, so there's nothing to free (it
2396  	 * points to the same buffer as the arc_buf_t's b_data field).
2397  	 */
2398  	if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
2399  		hdr->b_l1hdr.b_tmp_cdata = NULL;
2400  		return;
2401  	}
2402  
2403  	/*
2404  	 * There's nothing to free since the buffer was all zero's and
2405  	 * compressed to a zero length buffer.
2406  	 */
2407  	if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
2408  		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2409  		return;
2410  	}
2411  
2412  	ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
2413  
2414  	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
2415  	    hdr->b_size, zio_data_buf_free);
2416  
2417  	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2418  	hdr->b_l1hdr.b_tmp_cdata = NULL;
2419  }
2420  
2421  /*
2422   * Free up buf->b_data and if 'remove' is set, then pull the
2423   * arc_buf_t off of the the arc_buf_hdr_t's list and free it.
2424   */
2425  static void
arc_buf_destroy(arc_buf_t * buf,boolean_t remove)2426  arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
2427  {
2428  	arc_buf_t **bufp;
2429  
2430  	/* free up data associated with the buf */
2431  	if (buf->b_data != NULL) {
2432  		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
2433  		uint64_t size = buf->b_hdr->b_size;
2434  		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2435  
2436  		arc_cksum_verify(buf);
2437  		arc_buf_unwatch(buf);
2438  
2439  		if (type == ARC_BUFC_METADATA) {
2440  			arc_buf_data_free(buf, zio_buf_free);
2441  			arc_space_return(size, ARC_SPACE_META);
2442  		} else {
2443  			ASSERT(type == ARC_BUFC_DATA);
2444  			arc_buf_data_free(buf, zio_data_buf_free);
2445  			arc_space_return(size, ARC_SPACE_DATA);
2446  		}
2447  
2448  		/* protected by hash lock, if in the hash table */
2449  		if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2450  			uint64_t *cnt = &state->arcs_lsize[type];
2451  
2452  			ASSERT(refcount_is_zero(
2453  			    &buf->b_hdr->b_l1hdr.b_refcnt));
2454  			ASSERT(state != arc_anon && state != arc_l2c_only);
2455  
2456  			ASSERT3U(*cnt, >=, size);
2457  			atomic_add_64(cnt, -size);
2458  		}
2459  
2460  		(void) refcount_remove_many(&state->arcs_size, size, buf);
2461  		buf->b_data = NULL;
2462  
2463  		/*
2464  		 * If we're destroying a duplicate buffer make sure
2465  		 * that the appropriate statistics are updated.
2466  		 */
2467  		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2468  		    HDR_ISTYPE_DATA(buf->b_hdr)) {
2469  			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2470  			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2471  		}
2472  		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2473  		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2474  	}
2475  
2476  	/* only remove the buf if requested */
2477  	if (!remove)
2478  		return;
2479  
2480  	/* remove the buf from the hdr list */
2481  	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2482  	    bufp = &(*bufp)->b_next)
2483  		continue;
2484  	*bufp = buf->b_next;
2485  	buf->b_next = NULL;
2486  
2487  	ASSERT(buf->b_efunc == NULL);
2488  
2489  	/* clean up the buf */
2490  	buf->b_hdr = NULL;
2491  	kmem_cache_free(buf_cache, buf);
2492  }
2493  
2494  static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t * hdr)2495  arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2496  {
2497  	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2498  	l2arc_dev_t *dev = l2hdr->b_dev;
2499  
2500  	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2501  	ASSERT(HDR_HAS_L2HDR(hdr));
2502  
2503  	list_remove(&dev->l2ad_buflist, hdr);
2504  
2505  	/*
2506  	 * We don't want to leak the b_tmp_cdata buffer that was
2507  	 * allocated in l2arc_write_buffers()
2508  	 */
2509  	arc_buf_l2_cdata_free(hdr);
2510  
2511  	/*
2512  	 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2513  	 * this header is being processed by l2arc_write_buffers() (i.e.
2514  	 * it's in the first stage of l2arc_write_buffers()).
2515  	 * Re-affirming that truth here, just to serve as a reminder. If
2516  	 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2517  	 * may not have its HDR_L2_WRITING flag set. (the write may have
2518  	 * completed, in which case HDR_L2_WRITING will be false and the
2519  	 * b_daddr field will point to the address of the buffer on disk).
2520  	 */
2521  	IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2522  
2523  	/*
2524  	 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2525  	 * l2arc_write_buffers(). Since we've just removed this header
2526  	 * from the l2arc buffer list, this header will never reach the
2527  	 * second stage of l2arc_write_buffers(), which increments the
2528  	 * accounting stats for this header. Thus, we must be careful
2529  	 * not to decrement them for this header either.
2530  	 */
2531  	if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2532  		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2533  		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2534  
2535  		vdev_space_update(dev->l2ad_vdev,
2536  		    -l2hdr->b_asize, 0, 0);
2537  
2538  		(void) refcount_remove_many(&dev->l2ad_alloc,
2539  		    l2hdr->b_asize, hdr);
2540  	}
2541  
2542  	hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2543  }
2544  
2545  static void
arc_hdr_destroy(arc_buf_hdr_t * hdr)2546  arc_hdr_destroy(arc_buf_hdr_t *hdr)
2547  {
2548  	if (HDR_HAS_L1HDR(hdr)) {
2549  		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2550  		    hdr->b_l1hdr.b_datacnt > 0);
2551  		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2552  		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2553  	}
2554  	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2555  	ASSERT(!HDR_IN_HASH_TABLE(hdr));
2556  
2557  	if (HDR_HAS_L2HDR(hdr)) {
2558  		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2559  		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
2560  
2561  		if (!buflist_held)
2562  			mutex_enter(&dev->l2ad_mtx);
2563  
2564  		/*
2565  		 * Even though we checked this conditional above, we
2566  		 * need to check this again now that we have the
2567  		 * l2ad_mtx. This is because we could be racing with
2568  		 * another thread calling l2arc_evict() which might have
2569  		 * destroyed this header's L2 portion as we were waiting
2570  		 * to acquire the l2ad_mtx. If that happens, we don't
2571  		 * want to re-destroy the header's L2 portion.
2572  		 */
2573  		if (HDR_HAS_L2HDR(hdr))
2574  			arc_hdr_l2hdr_destroy(hdr);
2575  
2576  		if (!buflist_held)
2577  			mutex_exit(&dev->l2ad_mtx);
2578  	}
2579  
2580  	if (!BUF_EMPTY(hdr))
2581  		buf_discard_identity(hdr);
2582  
2583  	if (hdr->b_freeze_cksum != NULL) {
2584  		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2585  		hdr->b_freeze_cksum = NULL;
2586  	}
2587  
2588  	if (HDR_HAS_L1HDR(hdr)) {
2589  		while (hdr->b_l1hdr.b_buf) {
2590  			arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2591  
2592  			if (buf->b_efunc != NULL) {
2593  				mutex_enter(&arc_user_evicts_lock);
2594  				mutex_enter(&buf->b_evict_lock);
2595  				ASSERT(buf->b_hdr != NULL);
2596  				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
2597  				hdr->b_l1hdr.b_buf = buf->b_next;
2598  				buf->b_hdr = &arc_eviction_hdr;
2599  				buf->b_next = arc_eviction_list;
2600  				arc_eviction_list = buf;
2601  				mutex_exit(&buf->b_evict_lock);
2602  				cv_signal(&arc_user_evicts_cv);
2603  				mutex_exit(&arc_user_evicts_lock);
2604  			} else {
2605  				arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
2606  			}
2607  		}
2608  #ifdef ZFS_DEBUG
2609  		if (hdr->b_l1hdr.b_thawed != NULL) {
2610  			kmem_free(hdr->b_l1hdr.b_thawed, 1);
2611  			hdr->b_l1hdr.b_thawed = NULL;
2612  		}
2613  #endif
2614  	}
2615  
2616  	ASSERT3P(hdr->b_hash_next, ==, NULL);
2617  	if (HDR_HAS_L1HDR(hdr)) {
2618  		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
2619  		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2620  		kmem_cache_free(hdr_full_cache, hdr);
2621  	} else {
2622  		kmem_cache_free(hdr_l2only_cache, hdr);
2623  	}
2624  }
2625  
2626  void
arc_buf_free(arc_buf_t * buf,void * tag)2627  arc_buf_free(arc_buf_t *buf, void *tag)
2628  {
2629  	arc_buf_hdr_t *hdr = buf->b_hdr;
2630  	int hashed = hdr->b_l1hdr.b_state != arc_anon;
2631  
2632  	ASSERT(buf->b_efunc == NULL);
2633  	ASSERT(buf->b_data != NULL);
2634  
2635  	if (hashed) {
2636  		kmutex_t *hash_lock = HDR_LOCK(hdr);
2637  
2638  		mutex_enter(hash_lock);
2639  		hdr = buf->b_hdr;
2640  		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2641  
2642  		(void) remove_reference(hdr, hash_lock, tag);
2643  		if (hdr->b_l1hdr.b_datacnt > 1) {
2644  			arc_buf_destroy(buf, TRUE);
2645  		} else {
2646  			ASSERT(buf == hdr->b_l1hdr.b_buf);
2647  			ASSERT(buf->b_efunc == NULL);
2648  			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2649  		}
2650  		mutex_exit(hash_lock);
2651  	} else if (HDR_IO_IN_PROGRESS(hdr)) {
2652  		int destroy_hdr;
2653  		/*
2654  		 * We are in the middle of an async write.  Don't destroy
2655  		 * this buffer unless the write completes before we finish
2656  		 * decrementing the reference count.
2657  		 */
2658  		mutex_enter(&arc_user_evicts_lock);
2659  		(void) remove_reference(hdr, NULL, tag);
2660  		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2661  		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2662  		mutex_exit(&arc_user_evicts_lock);
2663  		if (destroy_hdr)
2664  			arc_hdr_destroy(hdr);
2665  	} else {
2666  		if (remove_reference(hdr, NULL, tag) > 0)
2667  			arc_buf_destroy(buf, TRUE);
2668  		else
2669  			arc_hdr_destroy(hdr);
2670  	}
2671  }
2672  
2673  boolean_t
arc_buf_remove_ref(arc_buf_t * buf,void * tag)2674  arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2675  {
2676  	arc_buf_hdr_t *hdr = buf->b_hdr;
2677  	kmutex_t *hash_lock = HDR_LOCK(hdr);
2678  	boolean_t no_callback = (buf->b_efunc == NULL);
2679  
2680  	if (hdr->b_l1hdr.b_state == arc_anon) {
2681  		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2682  		arc_buf_free(buf, tag);
2683  		return (no_callback);
2684  	}
2685  
2686  	mutex_enter(hash_lock);
2687  	hdr = buf->b_hdr;
2688  	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2689  	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2690  	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2691  	ASSERT(buf->b_data != NULL);
2692  
2693  	(void) remove_reference(hdr, hash_lock, tag);
2694  	if (hdr->b_l1hdr.b_datacnt > 1) {
2695  		if (no_callback)
2696  			arc_buf_destroy(buf, TRUE);
2697  	} else if (no_callback) {
2698  		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2699  		ASSERT(buf->b_efunc == NULL);
2700  		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2701  	}
2702  	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2703  	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2704  	mutex_exit(hash_lock);
2705  	return (no_callback);
2706  }
2707  
2708  int32_t
arc_buf_size(arc_buf_t * buf)2709  arc_buf_size(arc_buf_t *buf)
2710  {
2711  	return (buf->b_hdr->b_size);
2712  }
2713  
2714  /*
2715   * Called from the DMU to determine if the current buffer should be
2716   * evicted. In order to ensure proper locking, the eviction must be initiated
2717   * from the DMU. Return true if the buffer is associated with user data and
2718   * duplicate buffers still exist.
2719   */
2720  boolean_t
arc_buf_eviction_needed(arc_buf_t * buf)2721  arc_buf_eviction_needed(arc_buf_t *buf)
2722  {
2723  	arc_buf_hdr_t *hdr;
2724  	boolean_t evict_needed = B_FALSE;
2725  
2726  	if (zfs_disable_dup_eviction)
2727  		return (B_FALSE);
2728  
2729  	mutex_enter(&buf->b_evict_lock);
2730  	hdr = buf->b_hdr;
2731  	if (hdr == NULL) {
2732  		/*
2733  		 * We are in arc_do_user_evicts(); let that function
2734  		 * perform the eviction.
2735  		 */
2736  		ASSERT(buf->b_data == NULL);
2737  		mutex_exit(&buf->b_evict_lock);
2738  		return (B_FALSE);
2739  	} else if (buf->b_data == NULL) {
2740  		/*
2741  		 * We have already been added to the arc eviction list;
2742  		 * recommend eviction.
2743  		 */
2744  		ASSERT3P(hdr, ==, &arc_eviction_hdr);
2745  		mutex_exit(&buf->b_evict_lock);
2746  		return (B_TRUE);
2747  	}
2748  
2749  	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2750  		evict_needed = B_TRUE;
2751  
2752  	mutex_exit(&buf->b_evict_lock);
2753  	return (evict_needed);
2754  }
2755  
2756  /*
2757   * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2758   * state of the header is dependent on it's state prior to entering this
2759   * function. The following transitions are possible:
2760   *
2761   *    - arc_mru -> arc_mru_ghost
2762   *    - arc_mfu -> arc_mfu_ghost
2763   *    - arc_mru_ghost -> arc_l2c_only
2764   *    - arc_mru_ghost -> deleted
2765   *    - arc_mfu_ghost -> arc_l2c_only
2766   *    - arc_mfu_ghost -> deleted
2767   */
2768  static int64_t
arc_evict_hdr(arc_buf_hdr_t * hdr,kmutex_t * hash_lock)2769  arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2770  {
2771  	arc_state_t *evicted_state, *state;
2772  	int64_t bytes_evicted = 0;
2773  
2774  	ASSERT(MUTEX_HELD(hash_lock));
2775  	ASSERT(HDR_HAS_L1HDR(hdr));
2776  
2777  	state = hdr->b_l1hdr.b_state;
2778  	if (GHOST_STATE(state)) {
2779  		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2780  		ASSERT(hdr->b_l1hdr.b_buf == NULL);
2781  
2782  		/*
2783  		 * l2arc_write_buffers() relies on a header's L1 portion
2784  		 * (i.e. it's b_tmp_cdata field) during it's write phase.
2785  		 * Thus, we cannot push a header onto the arc_l2c_only
2786  		 * state (removing it's L1 piece) until the header is
2787  		 * done being written to the l2arc.
2788  		 */
2789  		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2790  			ARCSTAT_BUMP(arcstat_evict_l2_skip);
2791  			return (bytes_evicted);
2792  		}
2793  
2794  		ARCSTAT_BUMP(arcstat_deleted);
2795  		bytes_evicted += hdr->b_size;
2796  
2797  		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2798  
2799  		if (HDR_HAS_L2HDR(hdr)) {
2800  			/*
2801  			 * This buffer is cached on the 2nd Level ARC;
2802  			 * don't destroy the header.
2803  			 */
2804  			arc_change_state(arc_l2c_only, hdr, hash_lock);
2805  			/*
2806  			 * dropping from L1+L2 cached to L2-only,
2807  			 * realloc to remove the L1 header.
2808  			 */
2809  			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2810  			    hdr_l2only_cache);
2811  		} else {
2812  			arc_change_state(arc_anon, hdr, hash_lock);
2813  			arc_hdr_destroy(hdr);
2814  		}
2815  		return (bytes_evicted);
2816  	}
2817  
2818  	ASSERT(state == arc_mru || state == arc_mfu);
2819  	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2820  
2821  	/* prefetch buffers have a minimum lifespan */
2822  	if (HDR_IO_IN_PROGRESS(hdr) ||
2823  	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2824  	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2825  	    arc_min_prefetch_lifespan)) {
2826  		ARCSTAT_BUMP(arcstat_evict_skip);
2827  		return (bytes_evicted);
2828  	}
2829  
2830  	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2831  	ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2832  	while (hdr->b_l1hdr.b_buf) {
2833  		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2834  		if (!mutex_tryenter(&buf->b_evict_lock)) {
2835  			ARCSTAT_BUMP(arcstat_mutex_miss);
2836  			break;
2837  		}
2838  		if (buf->b_data != NULL)
2839  			bytes_evicted += hdr->b_size;
2840  		if (buf->b_efunc != NULL) {
2841  			mutex_enter(&arc_user_evicts_lock);
2842  			arc_buf_destroy(buf, FALSE);
2843  			hdr->b_l1hdr.b_buf = buf->b_next;
2844  			buf->b_hdr = &arc_eviction_hdr;
2845  			buf->b_next = arc_eviction_list;
2846  			arc_eviction_list = buf;
2847  			cv_signal(&arc_user_evicts_cv);
2848  			mutex_exit(&arc_user_evicts_lock);
2849  			mutex_exit(&buf->b_evict_lock);
2850  		} else {
2851  			mutex_exit(&buf->b_evict_lock);
2852  			arc_buf_destroy(buf, TRUE);
2853  		}
2854  	}
2855  
2856  	if (HDR_HAS_L2HDR(hdr)) {
2857  		ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
2858  	} else {
2859  		if (l2arc_write_eligible(hdr->b_spa, UINT64_MAX, hdr))
2860  			ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
2861  		else
2862  			ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
2863  	}
2864  
2865  	if (hdr->b_l1hdr.b_datacnt == 0) {
2866  		arc_change_state(evicted_state, hdr, hash_lock);
2867  		ASSERT(HDR_IN_HASH_TABLE(hdr));
2868  		hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2869  		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2870  		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2871  	}
2872  
2873  	return (bytes_evicted);
2874  }
2875  
2876  static uint64_t
arc_evict_state_impl(multilist_t * ml,int idx,arc_buf_hdr_t * marker,uint64_t spa,int64_t bytes)2877  arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
2878      uint64_t spa, int64_t bytes)
2879  {
2880  	multilist_sublist_t *mls;
2881  	uint64_t bytes_evicted = 0;
2882  	arc_buf_hdr_t *hdr;
2883  	kmutex_t *hash_lock;
2884  	int evict_count = 0;
2885  
2886  	ASSERT3P(marker, !=, NULL);
2887  	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
2888  
2889  	mls = multilist_sublist_lock(ml, idx);
2890  
2891  	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
2892  	    hdr = multilist_sublist_prev(mls, marker)) {
2893  		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
2894  		    (evict_count >= zfs_arc_evict_batch_limit))
2895  			break;
2896  
2897  		/*
2898  		 * To keep our iteration location, move the marker
2899  		 * forward. Since we're not holding hdr's hash lock, we
2900  		 * must be very careful and not remove 'hdr' from the
2901  		 * sublist. Otherwise, other consumers might mistake the
2902  		 * 'hdr' as not being on a sublist when they call the
2903  		 * multilist_link_active() function (they all rely on
2904  		 * the hash lock protecting concurrent insertions and
2905  		 * removals). multilist_sublist_move_forward() was
2906  		 * specifically implemented to ensure this is the case
2907  		 * (only 'marker' will be removed and re-inserted).
2908  		 */
2909  		multilist_sublist_move_forward(mls, marker);
2910  
2911  		/*
2912  		 * The only case where the b_spa field should ever be
2913  		 * zero, is the marker headers inserted by
2914  		 * arc_evict_state(). It's possible for multiple threads
2915  		 * to be calling arc_evict_state() concurrently (e.g.
2916  		 * dsl_pool_close() and zio_inject_fault()), so we must
2917  		 * skip any markers we see from these other threads.
2918  		 */
2919  		if (hdr->b_spa == 0)
2920  			continue;
2921  
2922  		/* we're only interested in evicting buffers of a certain spa */
2923  		if (spa != 0 && hdr->b_spa != spa) {
2924  			ARCSTAT_BUMP(arcstat_evict_skip);
2925  			continue;
2926  		}
2927  
2928  		hash_lock = HDR_LOCK(hdr);
2929  
2930  		/*
2931  		 * We aren't calling this function from any code path
2932  		 * that would already be holding a hash lock, so we're
2933  		 * asserting on this assumption to be defensive in case
2934  		 * this ever changes. Without this check, it would be
2935  		 * possible to incorrectly increment arcstat_mutex_miss
2936  		 * below (e.g. if the code changed such that we called
2937  		 * this function with a hash lock held).
2938  		 */
2939  		ASSERT(!MUTEX_HELD(hash_lock));
2940  
2941  		if (mutex_tryenter(hash_lock)) {
2942  			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
2943  			mutex_exit(hash_lock);
2944  
2945  			bytes_evicted += evicted;
2946  
2947  			/*
2948  			 * If evicted is zero, arc_evict_hdr() must have
2949  			 * decided to skip this header, don't increment
2950  			 * evict_count in this case.
2951  			 */
2952  			if (evicted != 0)
2953  				evict_count++;
2954  
2955  			/*
2956  			 * If arc_size isn't overflowing, signal any
2957  			 * threads that might happen to be waiting.
2958  			 *
2959  			 * For each header evicted, we wake up a single
2960  			 * thread. If we used cv_broadcast, we could
2961  			 * wake up "too many" threads causing arc_size
2962  			 * to significantly overflow arc_c; since
2963  			 * arc_get_data_buf() doesn't check for overflow
2964  			 * when it's woken up (it doesn't because it's
2965  			 * possible for the ARC to be overflowing while
2966  			 * full of un-evictable buffers, and the
2967  			 * function should proceed in this case).
2968  			 *
2969  			 * If threads are left sleeping, due to not
2970  			 * using cv_broadcast, they will be woken up
2971  			 * just before arc_reclaim_thread() sleeps.
2972  			 */
2973  			mutex_enter(&arc_reclaim_lock);
2974  			if (!arc_is_overflowing())
2975  				cv_signal(&arc_reclaim_waiters_cv);
2976  			mutex_exit(&arc_reclaim_lock);
2977  		} else {
2978  			ARCSTAT_BUMP(arcstat_mutex_miss);
2979  		}
2980  	}
2981  
2982  	multilist_sublist_unlock(mls);
2983  
2984  	return (bytes_evicted);
2985  }
2986  
2987  /*
2988   * Evict buffers from the given arc state, until we've removed the
2989   * specified number of bytes. Move the removed buffers to the
2990   * appropriate evict state.
2991   *
2992   * This function makes a "best effort". It skips over any buffers
2993   * it can't get a hash_lock on, and so, may not catch all candidates.
2994   * It may also return without evicting as much space as requested.
2995   *
2996   * If bytes is specified using the special value ARC_EVICT_ALL, this
2997   * will evict all available (i.e. unlocked and evictable) buffers from
2998   * the given arc state; which is used by arc_flush().
2999   */
3000  static uint64_t
arc_evict_state(arc_state_t * state,uint64_t spa,int64_t bytes,arc_buf_contents_t type)3001  arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
3002      arc_buf_contents_t type)
3003  {
3004  	uint64_t total_evicted = 0;
3005  	multilist_t *ml = &state->arcs_list[type];
3006  	int num_sublists;
3007  	arc_buf_hdr_t **markers;
3008  
3009  	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
3010  
3011  	num_sublists = multilist_get_num_sublists(ml);
3012  
3013  	/*
3014  	 * If we've tried to evict from each sublist, made some
3015  	 * progress, but still have not hit the target number of bytes
3016  	 * to evict, we want to keep trying. The markers allow us to
3017  	 * pick up where we left off for each individual sublist, rather
3018  	 * than starting from the tail each time.
3019  	 */
3020  	markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
3021  	for (int i = 0; i < num_sublists; i++) {
3022  		markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
3023  
3024  		/*
3025  		 * A b_spa of 0 is used to indicate that this header is
3026  		 * a marker. This fact is used in arc_adjust_type() and
3027  		 * arc_evict_state_impl().
3028  		 */
3029  		markers[i]->b_spa = 0;
3030  
3031  		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3032  		multilist_sublist_insert_tail(mls, markers[i]);
3033  		multilist_sublist_unlock(mls);
3034  	}
3035  
3036  	/*
3037  	 * While we haven't hit our target number of bytes to evict, or
3038  	 * we're evicting all available buffers.
3039  	 */
3040  	while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
3041  		/*
3042  		 * Start eviction using a randomly selected sublist,
3043  		 * this is to try and evenly balance eviction across all
3044  		 * sublists. Always starting at the same sublist
3045  		 * (e.g. index 0) would cause evictions to favor certain
3046  		 * sublists over others.
3047  		 */
3048  		int sublist_idx = multilist_get_random_index(ml);
3049  		uint64_t scan_evicted = 0;
3050  
3051  		for (int i = 0; i < num_sublists; i++) {
3052  			uint64_t bytes_remaining;
3053  			uint64_t bytes_evicted;
3054  
3055  			if (bytes == ARC_EVICT_ALL)
3056  				bytes_remaining = ARC_EVICT_ALL;
3057  			else if (total_evicted < bytes)
3058  				bytes_remaining = bytes - total_evicted;
3059  			else
3060  				break;
3061  
3062  			bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
3063  			    markers[sublist_idx], spa, bytes_remaining);
3064  
3065  			scan_evicted += bytes_evicted;
3066  			total_evicted += bytes_evicted;
3067  
3068  			/* we've reached the end, wrap to the beginning */
3069  			if (++sublist_idx >= num_sublists)
3070  				sublist_idx = 0;
3071  		}
3072  
3073  		/*
3074  		 * If we didn't evict anything during this scan, we have
3075  		 * no reason to believe we'll evict more during another
3076  		 * scan, so break the loop.
3077  		 */
3078  		if (scan_evicted == 0) {
3079  			/* This isn't possible, let's make that obvious */
3080  			ASSERT3S(bytes, !=, 0);
3081  
3082  			/*
3083  			 * When bytes is ARC_EVICT_ALL, the only way to
3084  			 * break the loop is when scan_evicted is zero.
3085  			 * In that case, we actually have evicted enough,
3086  			 * so we don't want to increment the kstat.
3087  			 */
3088  			if (bytes != ARC_EVICT_ALL) {
3089  				ASSERT3S(total_evicted, <, bytes);
3090  				ARCSTAT_BUMP(arcstat_evict_not_enough);
3091  			}
3092  
3093  			break;
3094  		}
3095  	}
3096  
3097  	for (int i = 0; i < num_sublists; i++) {
3098  		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3099  		multilist_sublist_remove(mls, markers[i]);
3100  		multilist_sublist_unlock(mls);
3101  
3102  		kmem_cache_free(hdr_full_cache, markers[i]);
3103  	}
3104  	kmem_free(markers, sizeof (*markers) * num_sublists);
3105  
3106  	return (total_evicted);
3107  }
3108  
3109  /*
3110   * Flush all "evictable" data of the given type from the arc state
3111   * specified. This will not evict any "active" buffers (i.e. referenced).
3112   *
3113   * When 'retry' is set to FALSE, the function will make a single pass
3114   * over the state and evict any buffers that it can. Since it doesn't
3115   * continually retry the eviction, it might end up leaving some buffers
3116   * in the ARC due to lock misses.
3117   *
3118   * When 'retry' is set to TRUE, the function will continually retry the
3119   * eviction until *all* evictable buffers have been removed from the
3120   * state. As a result, if concurrent insertions into the state are
3121   * allowed (e.g. if the ARC isn't shutting down), this function might
3122   * wind up in an infinite loop, continually trying to evict buffers.
3123   */
3124  static uint64_t
arc_flush_state(arc_state_t * state,uint64_t spa,arc_buf_contents_t type,boolean_t retry)3125  arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
3126      boolean_t retry)
3127  {
3128  	uint64_t evicted = 0;
3129  
3130  	while (state->arcs_lsize[type] != 0) {
3131  		evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
3132  
3133  		if (!retry)
3134  			break;
3135  	}
3136  
3137  	return (evicted);
3138  }
3139  
3140  /*
3141   * Evict the specified number of bytes from the state specified,
3142   * restricting eviction to the spa and type given. This function
3143   * prevents us from trying to evict more from a state's list than
3144   * is "evictable", and to skip evicting altogether when passed a
3145   * negative value for "bytes". In contrast, arc_evict_state() will
3146   * evict everything it can, when passed a negative value for "bytes".
3147   */
3148  static uint64_t
arc_adjust_impl(arc_state_t * state,uint64_t spa,int64_t bytes,arc_buf_contents_t type)3149  arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
3150      arc_buf_contents_t type)
3151  {
3152  	int64_t delta;
3153  
3154  	if (bytes > 0 && state->arcs_lsize[type] > 0) {
3155  		delta = MIN(state->arcs_lsize[type], bytes);
3156  		return (arc_evict_state(state, spa, delta, type));
3157  	}
3158  
3159  	return (0);
3160  }
3161  
3162  /*
3163   * Evict metadata buffers from the cache, such that arc_meta_used is
3164   * capped by the arc_meta_limit tunable.
3165   */
3166  static uint64_t
arc_adjust_meta(void)3167  arc_adjust_meta(void)
3168  {
3169  	uint64_t total_evicted = 0;
3170  	int64_t target;
3171  
3172  	/*
3173  	 * If we're over the meta limit, we want to evict enough
3174  	 * metadata to get back under the meta limit. We don't want to
3175  	 * evict so much that we drop the MRU below arc_p, though. If
3176  	 * we're over the meta limit more than we're over arc_p, we
3177  	 * evict some from the MRU here, and some from the MFU below.
3178  	 */
3179  	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3180  	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
3181  	    refcount_count(&arc_mru->arcs_size) - arc_p));
3182  
3183  	total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3184  
3185  	/*
3186  	 * Similar to the above, we want to evict enough bytes to get us
3187  	 * below the meta limit, but not so much as to drop us below the
3188  	 * space alloted to the MFU (which is defined as arc_c - arc_p).
3189  	 */
3190  	target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3191  	    (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
3192  
3193  	total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3194  
3195  	return (total_evicted);
3196  }
3197  
3198  /*
3199   * Return the type of the oldest buffer in the given arc state
3200   *
3201   * This function will select a random sublist of type ARC_BUFC_DATA and
3202   * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
3203   * is compared, and the type which contains the "older" buffer will be
3204   * returned.
3205   */
3206  static arc_buf_contents_t
arc_adjust_type(arc_state_t * state)3207  arc_adjust_type(arc_state_t *state)
3208  {
3209  	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
3210  	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
3211  	int data_idx = multilist_get_random_index(data_ml);
3212  	int meta_idx = multilist_get_random_index(meta_ml);
3213  	multilist_sublist_t *data_mls;
3214  	multilist_sublist_t *meta_mls;
3215  	arc_buf_contents_t type;
3216  	arc_buf_hdr_t *data_hdr;
3217  	arc_buf_hdr_t *meta_hdr;
3218  
3219  	/*
3220  	 * We keep the sublist lock until we're finished, to prevent
3221  	 * the headers from being destroyed via arc_evict_state().
3222  	 */
3223  	data_mls = multilist_sublist_lock(data_ml, data_idx);
3224  	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
3225  
3226  	/*
3227  	 * These two loops are to ensure we skip any markers that
3228  	 * might be at the tail of the lists due to arc_evict_state().
3229  	 */
3230  
3231  	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
3232  	    data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
3233  		if (data_hdr->b_spa != 0)
3234  			break;
3235  	}
3236  
3237  	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
3238  	    meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
3239  		if (meta_hdr->b_spa != 0)
3240  			break;
3241  	}
3242  
3243  	if (data_hdr == NULL && meta_hdr == NULL) {
3244  		type = ARC_BUFC_DATA;
3245  	} else if (data_hdr == NULL) {
3246  		ASSERT3P(meta_hdr, !=, NULL);
3247  		type = ARC_BUFC_METADATA;
3248  	} else if (meta_hdr == NULL) {
3249  		ASSERT3P(data_hdr, !=, NULL);
3250  		type = ARC_BUFC_DATA;
3251  	} else {
3252  		ASSERT3P(data_hdr, !=, NULL);
3253  		ASSERT3P(meta_hdr, !=, NULL);
3254  
3255  		/* The headers can't be on the sublist without an L1 header */
3256  		ASSERT(HDR_HAS_L1HDR(data_hdr));
3257  		ASSERT(HDR_HAS_L1HDR(meta_hdr));
3258  
3259  		if (data_hdr->b_l1hdr.b_arc_access <
3260  		    meta_hdr->b_l1hdr.b_arc_access) {
3261  			type = ARC_BUFC_DATA;
3262  		} else {
3263  			type = ARC_BUFC_METADATA;
3264  		}
3265  	}
3266  
3267  	multilist_sublist_unlock(meta_mls);
3268  	multilist_sublist_unlock(data_mls);
3269  
3270  	return (type);
3271  }
3272  
3273  /*
3274   * Evict buffers from the cache, such that arc_size is capped by arc_c.
3275   */
3276  static uint64_t
arc_adjust(void)3277  arc_adjust(void)
3278  {
3279  	uint64_t total_evicted = 0;
3280  	uint64_t bytes;
3281  	int64_t target;
3282  
3283  	/*
3284  	 * If we're over arc_meta_limit, we want to correct that before
3285  	 * potentially evicting data buffers below.
3286  	 */
3287  	total_evicted += arc_adjust_meta();
3288  
3289  	/*
3290  	 * Adjust MRU size
3291  	 *
3292  	 * If we're over the target cache size, we want to evict enough
3293  	 * from the list to get back to our target size. We don't want
3294  	 * to evict too much from the MRU, such that it drops below
3295  	 * arc_p. So, if we're over our target cache size more than
3296  	 * the MRU is over arc_p, we'll evict enough to get back to
3297  	 * arc_p here, and then evict more from the MFU below.
3298  	 */
3299  	target = MIN((int64_t)(arc_size - arc_c),
3300  	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
3301  	    refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
3302  
3303  	/*
3304  	 * If we're below arc_meta_min, always prefer to evict data.
3305  	 * Otherwise, try to satisfy the requested number of bytes to
3306  	 * evict from the type which contains older buffers; in an
3307  	 * effort to keep newer buffers in the cache regardless of their
3308  	 * type. If we cannot satisfy the number of bytes from this
3309  	 * type, spill over into the next type.
3310  	 */
3311  	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
3312  	    arc_meta_used > arc_meta_min) {
3313  		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3314  		total_evicted += bytes;
3315  
3316  		/*
3317  		 * If we couldn't evict our target number of bytes from
3318  		 * metadata, we try to get the rest from data.
3319  		 */
3320  		target -= bytes;
3321  
3322  		total_evicted +=
3323  		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3324  	} else {
3325  		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3326  		total_evicted += bytes;
3327  
3328  		/*
3329  		 * If we couldn't evict our target number of bytes from
3330  		 * data, we try to get the rest from metadata.
3331  		 */
3332  		target -= bytes;
3333  
3334  		total_evicted +=
3335  		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3336  	}
3337  
3338  	/*
3339  	 * Adjust MFU size
3340  	 *
3341  	 * Now that we've tried to evict enough from the MRU to get its
3342  	 * size back to arc_p, if we're still above the target cache
3343  	 * size, we evict the rest from the MFU.
3344  	 */
3345  	target = arc_size - arc_c;
3346  
3347  	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
3348  	    arc_meta_used > arc_meta_min) {
3349  		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3350  		total_evicted += bytes;
3351  
3352  		/*
3353  		 * If we couldn't evict our target number of bytes from
3354  		 * metadata, we try to get the rest from data.
3355  		 */
3356  		target -= bytes;
3357  
3358  		total_evicted +=
3359  		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3360  	} else {
3361  		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3362  		total_evicted += bytes;
3363  
3364  		/*
3365  		 * If we couldn't evict our target number of bytes from
3366  		 * data, we try to get the rest from data.
3367  		 */
3368  		target -= bytes;
3369  
3370  		total_evicted +=
3371  		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3372  	}
3373  
3374  	/*
3375  	 * Adjust ghost lists
3376  	 *
3377  	 * In addition to the above, the ARC also defines target values
3378  	 * for the ghost lists. The sum of the mru list and mru ghost
3379  	 * list should never exceed the target size of the cache, and
3380  	 * the sum of the mru list, mfu list, mru ghost list, and mfu
3381  	 * ghost list should never exceed twice the target size of the
3382  	 * cache. The following logic enforces these limits on the ghost
3383  	 * caches, and evicts from them as needed.
3384  	 */
3385  	target = refcount_count(&arc_mru->arcs_size) +
3386  	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3387  
3388  	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3389  	total_evicted += bytes;
3390  
3391  	target -= bytes;
3392  
3393  	total_evicted +=
3394  	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3395  
3396  	/*
3397  	 * We assume the sum of the mru list and mfu list is less than
3398  	 * or equal to arc_c (we enforced this above), which means we
3399  	 * can use the simpler of the two equations below:
3400  	 *
3401  	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3402  	 *		    mru ghost + mfu ghost <= arc_c
3403  	 */
3404  	target = refcount_count(&arc_mru_ghost->arcs_size) +
3405  	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3406  
3407  	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3408  	total_evicted += bytes;
3409  
3410  	target -= bytes;
3411  
3412  	total_evicted +=
3413  	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3414  
3415  	return (total_evicted);
3416  }
3417  
3418  static void
arc_do_user_evicts(void)3419  arc_do_user_evicts(void)
3420  {
3421  	mutex_enter(&arc_user_evicts_lock);
3422  	while (arc_eviction_list != NULL) {
3423  		arc_buf_t *buf = arc_eviction_list;
3424  		arc_eviction_list = buf->b_next;
3425  		mutex_enter(&buf->b_evict_lock);
3426  		buf->b_hdr = NULL;
3427  		mutex_exit(&buf->b_evict_lock);
3428  		mutex_exit(&arc_user_evicts_lock);
3429  
3430  		if (buf->b_efunc != NULL)
3431  			VERIFY0(buf->b_efunc(buf->b_private));
3432  
3433  		buf->b_efunc = NULL;
3434  		buf->b_private = NULL;
3435  		kmem_cache_free(buf_cache, buf);
3436  		mutex_enter(&arc_user_evicts_lock);
3437  	}
3438  	mutex_exit(&arc_user_evicts_lock);
3439  }
3440  
3441  void
arc_flush(spa_t * spa,boolean_t retry)3442  arc_flush(spa_t *spa, boolean_t retry)
3443  {
3444  	uint64_t guid = 0;
3445  
3446  	/*
3447  	 * If retry is TRUE, a spa must not be specified since we have
3448  	 * no good way to determine if all of a spa's buffers have been
3449  	 * evicted from an arc state.
3450  	 */
3451  	ASSERT(!retry || spa == 0);
3452  
3453  	if (spa != NULL)
3454  		guid = spa_load_guid(spa);
3455  
3456  	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3457  	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3458  
3459  	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3460  	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3461  
3462  	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3463  	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
3464  
3465  	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3466  	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
3467  
3468  	arc_do_user_evicts();
3469  	ASSERT(spa || arc_eviction_list == NULL);
3470  }
3471  
3472  void
arc_shrink(int64_t to_free)3473  arc_shrink(int64_t to_free)
3474  {
3475  	if (arc_c > arc_c_min) {
3476  
3477  		if (arc_c > arc_c_min + to_free)
3478  			atomic_add_64(&arc_c, -to_free);
3479  		else
3480  			arc_c = arc_c_min;
3481  
3482  		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3483  		if (arc_c > arc_size)
3484  			arc_c = MAX(arc_size, arc_c_min);
3485  		if (arc_p > arc_c)
3486  			arc_p = (arc_c >> 1);
3487  		ASSERT(arc_c >= arc_c_min);
3488  		ASSERT((int64_t)arc_p >= 0);
3489  	}
3490  
3491  	if (arc_size > arc_c)
3492  		(void) arc_adjust();
3493  }
3494  
3495  typedef enum free_memory_reason_t {
3496  	FMR_UNKNOWN,
3497  	FMR_NEEDFREE,
3498  	FMR_LOTSFREE,
3499  	FMR_SWAPFS_MINFREE,
3500  	FMR_PAGES_PP_MAXIMUM,
3501  	FMR_HEAP_ARENA,
3502  	FMR_ZIO_ARENA,
3503  } free_memory_reason_t;
3504  
3505  int64_t last_free_memory;
3506  free_memory_reason_t last_free_reason;
3507  
3508  /*
3509   * Additional reserve of pages for pp_reserve.
3510   */
3511  int64_t arc_pages_pp_reserve = 64;
3512  
3513  /*
3514   * Additional reserve of pages for swapfs.
3515   */
3516  int64_t arc_swapfs_reserve = 64;
3517  
3518  /*
3519   * Return the amount of memory that can be consumed before reclaim will be
3520   * needed.  Positive if there is sufficient free memory, negative indicates
3521   * the amount of memory that needs to be freed up.
3522   */
3523  static int64_t
arc_available_memory(void)3524  arc_available_memory(void)
3525  {
3526  	int64_t lowest = INT64_MAX;
3527  	int64_t n;
3528  	free_memory_reason_t r = FMR_UNKNOWN;
3529  
3530  #ifdef _KERNEL
3531  	if (needfree > 0) {
3532  		n = PAGESIZE * (-needfree);
3533  		if (n < lowest) {
3534  			lowest = n;
3535  			r = FMR_NEEDFREE;
3536  		}
3537  	}
3538  
3539  	/*
3540  	 * check that we're out of range of the pageout scanner.  It starts to
3541  	 * schedule paging if freemem is less than lotsfree and needfree.
3542  	 * lotsfree is the high-water mark for pageout, and needfree is the
3543  	 * number of needed free pages.  We add extra pages here to make sure
3544  	 * the scanner doesn't start up while we're freeing memory.
3545  	 */
3546  	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3547  	if (n < lowest) {
3548  		lowest = n;
3549  		r = FMR_LOTSFREE;
3550  	}
3551  
3552  	/*
3553  	 * check to make sure that swapfs has enough space so that anon
3554  	 * reservations can still succeed. anon_resvmem() checks that the
3555  	 * availrmem is greater than swapfs_minfree, and the number of reserved
3556  	 * swap pages.  We also add a bit of extra here just to prevent
3557  	 * circumstances from getting really dire.
3558  	 */
3559  	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3560  	    desfree - arc_swapfs_reserve);
3561  	if (n < lowest) {
3562  		lowest = n;
3563  		r = FMR_SWAPFS_MINFREE;
3564  	}
3565  
3566  
3567  	/*
3568  	 * Check that we have enough availrmem that memory locking (e.g., via
3569  	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
3570  	 * stores the number of pages that cannot be locked; when availrmem
3571  	 * drops below pages_pp_maximum, page locking mechanisms such as
3572  	 * page_pp_lock() will fail.)
3573  	 */
3574  	n = PAGESIZE * (availrmem - pages_pp_maximum -
3575  	    arc_pages_pp_reserve);
3576  	if (n < lowest) {
3577  		lowest = n;
3578  		r = FMR_PAGES_PP_MAXIMUM;
3579  	}
3580  
3581  #if defined(__i386)
3582  	/*
3583  	 * If we're on an i386 platform, it's possible that we'll exhaust the
3584  	 * kernel heap space before we ever run out of available physical
3585  	 * memory.  Most checks of the size of the heap_area compare against
3586  	 * tune.t_minarmem, which is the minimum available real memory that we
3587  	 * can have in the system.  However, this is generally fixed at 25 pages
3588  	 * which is so low that it's useless.  In this comparison, we seek to
3589  	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3590  	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
3591  	 * free)
3592  	 */
3593  	n = vmem_size(heap_arena, VMEM_FREE) -
3594  	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3595  	if (n < lowest) {
3596  		lowest = n;
3597  		r = FMR_HEAP_ARENA;
3598  	}
3599  #endif
3600  
3601  	/*
3602  	 * If zio data pages are being allocated out of a separate heap segment,
3603  	 * then enforce that the size of available vmem for this arena remains
3604  	 * above about 1/16th free.
3605  	 *
3606  	 * Note: The 1/16th arena free requirement was put in place
3607  	 * to aggressively evict memory from the arc in order to avoid
3608  	 * memory fragmentation issues.
3609  	 */
3610  	if (zio_arena != NULL) {
3611  		n = vmem_size(zio_arena, VMEM_FREE) -
3612  		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3613  		if (n < lowest) {
3614  			lowest = n;
3615  			r = FMR_ZIO_ARENA;
3616  		}
3617  	}
3618  #else
3619  	/* Every 100 calls, free a small amount */
3620  	if (spa_get_random(100) == 0)
3621  		lowest = -1024;
3622  #endif
3623  
3624  	last_free_memory = lowest;
3625  	last_free_reason = r;
3626  
3627  	return (lowest);
3628  }
3629  
3630  
3631  /*
3632   * Determine if the system is under memory pressure and is asking
3633   * to reclaim memory. A return value of TRUE indicates that the system
3634   * is under memory pressure and that the arc should adjust accordingly.
3635   */
3636  static boolean_t
arc_reclaim_needed(void)3637  arc_reclaim_needed(void)
3638  {
3639  	return (arc_available_memory() < 0);
3640  }
3641  
3642  static void
arc_kmem_reap_now(void)3643  arc_kmem_reap_now(void)
3644  {
3645  	size_t			i;
3646  	kmem_cache_t		*prev_cache = NULL;
3647  	kmem_cache_t		*prev_data_cache = NULL;
3648  	extern kmem_cache_t	*zio_buf_cache[];
3649  	extern kmem_cache_t	*zio_data_buf_cache[];
3650  	extern kmem_cache_t	*range_seg_cache;
3651  
3652  #ifdef _KERNEL
3653  	if (arc_meta_used >= arc_meta_limit) {
3654  		/*
3655  		 * We are exceeding our meta-data cache limit.
3656  		 * Purge some DNLC entries to release holds on meta-data.
3657  		 */
3658  		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
3659  	}
3660  #if defined(__i386)
3661  	/*
3662  	 * Reclaim unused memory from all kmem caches.
3663  	 */
3664  	kmem_reap();
3665  #endif
3666  #endif
3667  
3668  	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3669  		if (zio_buf_cache[i] != prev_cache) {
3670  			prev_cache = zio_buf_cache[i];
3671  			kmem_cache_reap_now(zio_buf_cache[i]);
3672  		}
3673  		if (zio_data_buf_cache[i] != prev_data_cache) {
3674  			prev_data_cache = zio_data_buf_cache[i];
3675  			kmem_cache_reap_now(zio_data_buf_cache[i]);
3676  		}
3677  	}
3678  	kmem_cache_reap_now(buf_cache);
3679  	kmem_cache_reap_now(hdr_full_cache);
3680  	kmem_cache_reap_now(hdr_l2only_cache);
3681  	kmem_cache_reap_now(range_seg_cache);
3682  
3683  	if (zio_arena != NULL) {
3684  		/*
3685  		 * Ask the vmem arena to reclaim unused memory from its
3686  		 * quantum caches.
3687  		 */
3688  		vmem_qcache_reap(zio_arena);
3689  	}
3690  }
3691  
3692  /*
3693   * Threads can block in arc_get_data_buf() waiting for this thread to evict
3694   * enough data and signal them to proceed. When this happens, the threads in
3695   * arc_get_data_buf() are sleeping while holding the hash lock for their
3696   * particular arc header. Thus, we must be careful to never sleep on a
3697   * hash lock in this thread. This is to prevent the following deadlock:
3698   *
3699   *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3700   *    waiting for the reclaim thread to signal it.
3701   *
3702   *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3703   *    fails, and goes to sleep forever.
3704   *
3705   * This possible deadlock is avoided by always acquiring a hash lock
3706   * using mutex_tryenter() from arc_reclaim_thread().
3707   */
3708  static void
arc_reclaim_thread(void)3709  arc_reclaim_thread(void)
3710  {
3711  	hrtime_t		growtime = 0;
3712  	callb_cpr_t		cpr;
3713  
3714  	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
3715  
3716  	mutex_enter(&arc_reclaim_lock);
3717  	while (!arc_reclaim_thread_exit) {
3718  		int64_t free_memory = arc_available_memory();
3719  		uint64_t evicted = 0;
3720  
3721  		mutex_exit(&arc_reclaim_lock);
3722  
3723  		if (free_memory < 0) {
3724  
3725  			arc_no_grow = B_TRUE;
3726  			arc_warm = B_TRUE;
3727  
3728  			/*
3729  			 * Wait at least zfs_grow_retry (default 60) seconds
3730  			 * before considering growing.
3731  			 */
3732  			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
3733  
3734  			arc_kmem_reap_now();
3735  
3736  			/*
3737  			 * If we are still low on memory, shrink the ARC
3738  			 * so that we have arc_shrink_min free space.
3739  			 */
3740  			free_memory = arc_available_memory();
3741  
3742  			int64_t to_free =
3743  			    (arc_c >> arc_shrink_shift) - free_memory;
3744  			if (to_free > 0) {
3745  #ifdef _KERNEL
3746  				to_free = MAX(to_free, ptob(needfree));
3747  #endif
3748  				arc_shrink(to_free);
3749  			}
3750  		} else if (free_memory < arc_c >> arc_no_grow_shift) {
3751  			arc_no_grow = B_TRUE;
3752  		} else if (gethrtime() >= growtime) {
3753  			arc_no_grow = B_FALSE;
3754  		}
3755  
3756  		evicted = arc_adjust();
3757  
3758  		mutex_enter(&arc_reclaim_lock);
3759  
3760  		/*
3761  		 * If evicted is zero, we couldn't evict anything via
3762  		 * arc_adjust(). This could be due to hash lock
3763  		 * collisions, but more likely due to the majority of
3764  		 * arc buffers being unevictable. Therefore, even if
3765  		 * arc_size is above arc_c, another pass is unlikely to
3766  		 * be helpful and could potentially cause us to enter an
3767  		 * infinite loop.
3768  		 */
3769  		if (arc_size <= arc_c || evicted == 0) {
3770  			/*
3771  			 * We're either no longer overflowing, or we
3772  			 * can't evict anything more, so we should wake
3773  			 * up any threads before we go to sleep.
3774  			 */
3775  			cv_broadcast(&arc_reclaim_waiters_cv);
3776  
3777  			/*
3778  			 * Block until signaled, or after one second (we
3779  			 * might need to perform arc_kmem_reap_now()
3780  			 * even if we aren't being signalled)
3781  			 */
3782  			CALLB_CPR_SAFE_BEGIN(&cpr);
3783  			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
3784  			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
3785  			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3786  		}
3787  	}
3788  
3789  	arc_reclaim_thread_exit = FALSE;
3790  	cv_broadcast(&arc_reclaim_thread_cv);
3791  	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
3792  	thread_exit();
3793  }
3794  
3795  static void
arc_user_evicts_thread(void)3796  arc_user_evicts_thread(void)
3797  {
3798  	callb_cpr_t cpr;
3799  
3800  	CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
3801  
3802  	mutex_enter(&arc_user_evicts_lock);
3803  	while (!arc_user_evicts_thread_exit) {
3804  		mutex_exit(&arc_user_evicts_lock);
3805  
3806  		arc_do_user_evicts();
3807  
3808  		/*
3809  		 * This is necessary in order for the mdb ::arc dcmd to
3810  		 * show up to date information. Since the ::arc command
3811  		 * does not call the kstat's update function, without
3812  		 * this call, the command may show stale stats for the
3813  		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3814  		 * with this change, the data might be up to 1 second
3815  		 * out of date; but that should suffice. The arc_state_t
3816  		 * structures can be queried directly if more accurate
3817  		 * information is needed.
3818  		 */
3819  		if (arc_ksp != NULL)
3820  			arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3821  
3822  		mutex_enter(&arc_user_evicts_lock);
3823  
3824  		/*
3825  		 * Block until signaled, or after one second (we need to
3826  		 * call the arc's kstat update function regularly).
3827  		 */
3828  		CALLB_CPR_SAFE_BEGIN(&cpr);
3829  		(void) cv_timedwait(&arc_user_evicts_cv,
3830  		    &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3831  		CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3832  	}
3833  
3834  	arc_user_evicts_thread_exit = FALSE;
3835  	cv_broadcast(&arc_user_evicts_cv);
3836  	CALLB_CPR_EXIT(&cpr);		/* drops arc_user_evicts_lock */
3837  	thread_exit();
3838  }
3839  
3840  /*
3841   * Adapt arc info given the number of bytes we are trying to add and
3842   * the state that we are comming from.  This function is only called
3843   * when we are adding new content to the cache.
3844   */
3845  static void
arc_adapt(int bytes,arc_state_t * state)3846  arc_adapt(int bytes, arc_state_t *state)
3847  {
3848  	int mult;
3849  	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3850  	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
3851  	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
3852  
3853  	if (state == arc_l2c_only)
3854  		return;
3855  
3856  	ASSERT(bytes > 0);
3857  	/*
3858  	 * Adapt the target size of the MRU list:
3859  	 *	- if we just hit in the MRU ghost list, then increase
3860  	 *	  the target size of the MRU list.
3861  	 *	- if we just hit in the MFU ghost list, then increase
3862  	 *	  the target size of the MFU list by decreasing the
3863  	 *	  target size of the MRU list.
3864  	 */
3865  	if (state == arc_mru_ghost) {
3866  		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
3867  		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3868  
3869  		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3870  	} else if (state == arc_mfu_ghost) {
3871  		uint64_t delta;
3872  
3873  		mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
3874  		mult = MIN(mult, 10);
3875  
3876  		delta = MIN(bytes * mult, arc_p);
3877  		arc_p = MAX(arc_p_min, arc_p - delta);
3878  	}
3879  	ASSERT((int64_t)arc_p >= 0);
3880  
3881  	if (arc_reclaim_needed()) {
3882  		cv_signal(&arc_reclaim_thread_cv);
3883  		return;
3884  	}
3885  
3886  	if (arc_no_grow)
3887  		return;
3888  
3889  	if (arc_c >= arc_c_max)
3890  		return;
3891  
3892  	/*
3893  	 * If we're within (2 * maxblocksize) bytes of the target
3894  	 * cache size, increment the target cache size
3895  	 */
3896  	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3897  		atomic_add_64(&arc_c, (int64_t)bytes);
3898  		if (arc_c > arc_c_max)
3899  			arc_c = arc_c_max;
3900  		else if (state == arc_anon)
3901  			atomic_add_64(&arc_p, (int64_t)bytes);
3902  		if (arc_p > arc_c)
3903  			arc_p = arc_c;
3904  	}
3905  	ASSERT((int64_t)arc_p >= 0);
3906  }
3907  
3908  /*
3909   * Check if arc_size has grown past our upper threshold, determined by
3910   * zfs_arc_overflow_shift.
3911   */
3912  static boolean_t
arc_is_overflowing(void)3913  arc_is_overflowing(void)
3914  {
3915  	/* Always allow at least one block of overflow */
3916  	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3917  	    arc_c >> zfs_arc_overflow_shift);
3918  
3919  	return (arc_size >= arc_c + overflow);
3920  }
3921  
3922  /*
3923   * The buffer, supplied as the first argument, needs a data block. If we
3924   * are hitting the hard limit for the cache size, we must sleep, waiting
3925   * for the eviction thread to catch up. If we're past the target size
3926   * but below the hard limit, we'll only signal the reclaim thread and
3927   * continue on.
3928   */
3929  static void
arc_get_data_buf(arc_buf_t * buf)3930  arc_get_data_buf(arc_buf_t *buf)
3931  {
3932  	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
3933  	uint64_t		size = buf->b_hdr->b_size;
3934  	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);
3935  
3936  	arc_adapt(size, state);
3937  
3938  	/*
3939  	 * If arc_size is currently overflowing, and has grown past our
3940  	 * upper limit, we must be adding data faster than the evict
3941  	 * thread can evict. Thus, to ensure we don't compound the
3942  	 * problem by adding more data and forcing arc_size to grow even
3943  	 * further past it's target size, we halt and wait for the
3944  	 * eviction thread to catch up.
3945  	 *
3946  	 * It's also possible that the reclaim thread is unable to evict
3947  	 * enough buffers to get arc_size below the overflow limit (e.g.
3948  	 * due to buffers being un-evictable, or hash lock collisions).
3949  	 * In this case, we want to proceed regardless if we're
3950  	 * overflowing; thus we don't use a while loop here.
3951  	 */
3952  	if (arc_is_overflowing()) {
3953  		mutex_enter(&arc_reclaim_lock);
3954  
3955  		/*
3956  		 * Now that we've acquired the lock, we may no longer be
3957  		 * over the overflow limit, lets check.
3958  		 *
3959  		 * We're ignoring the case of spurious wake ups. If that
3960  		 * were to happen, it'd let this thread consume an ARC
3961  		 * buffer before it should have (i.e. before we're under
3962  		 * the overflow limit and were signalled by the reclaim
3963  		 * thread). As long as that is a rare occurrence, it
3964  		 * shouldn't cause any harm.
3965  		 */
3966  		if (arc_is_overflowing()) {
3967  			cv_signal(&arc_reclaim_thread_cv);
3968  			cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
3969  		}
3970  
3971  		mutex_exit(&arc_reclaim_lock);
3972  	}
3973  
3974  	if (type == ARC_BUFC_METADATA) {
3975  		buf->b_data = zio_buf_alloc(size);
3976  		arc_space_consume(size, ARC_SPACE_META);
3977  	} else {
3978  		ASSERT(type == ARC_BUFC_DATA);
3979  		buf->b_data = zio_data_buf_alloc(size);
3980  		arc_space_consume(size, ARC_SPACE_DATA);
3981  	}
3982  
3983  	/*
3984  	 * Update the state size.  Note that ghost states have a
3985  	 * "ghost size" and so don't need to be updated.
3986  	 */
3987  	if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3988  		arc_buf_hdr_t *hdr = buf->b_hdr;
3989  		arc_state_t *state = hdr->b_l1hdr.b_state;
3990  
3991  		(void) refcount_add_many(&state->arcs_size, size, buf);
3992  
3993  		/*
3994  		 * If this is reached via arc_read, the link is
3995  		 * protected by the hash lock. If reached via
3996  		 * arc_buf_alloc, the header should not be accessed by
3997  		 * any other thread. And, if reached via arc_read_done,
3998  		 * the hash lock will protect it if it's found in the
3999  		 * hash table; otherwise no other thread should be
4000  		 * trying to [add|remove]_reference it.
4001  		 */
4002  		if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
4003  			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4004  			atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
4005  			    size);
4006  		}
4007  		/*
4008  		 * If we are growing the cache, and we are adding anonymous
4009  		 * data, and we have outgrown arc_p, update arc_p
4010  		 */
4011  		if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
4012  		    (refcount_count(&arc_anon->arcs_size) +
4013  		    refcount_count(&arc_mru->arcs_size) > arc_p))
4014  			arc_p = MIN(arc_c, arc_p + size);
4015  	}
4016  }
4017  
4018  /*
4019   * This routine is called whenever a buffer is accessed.
4020   * NOTE: the hash lock is dropped in this function.
4021   */
4022  static void
arc_access(arc_buf_hdr_t * hdr,kmutex_t * hash_lock)4023  arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
4024  {
4025  	clock_t now;
4026  
4027  	ASSERT(MUTEX_HELD(hash_lock));
4028  	ASSERT(HDR_HAS_L1HDR(hdr));
4029  
4030  	if (hdr->b_l1hdr.b_state == arc_anon) {
4031  		/*
4032  		 * This buffer is not in the cache, and does not
4033  		 * appear in our "ghost" list.  Add the new buffer
4034  		 * to the MRU state.
4035  		 */
4036  
4037  		ASSERT0(hdr->b_l1hdr.b_arc_access);
4038  		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4039  		DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
4040  		arc_change_state(arc_mru, hdr, hash_lock);
4041  
4042  	} else if (hdr->b_l1hdr.b_state == arc_mru) {
4043  		now = ddi_get_lbolt();
4044  
4045  		/*
4046  		 * If this buffer is here because of a prefetch, then either:
4047  		 * - clear the flag if this is a "referencing" read
4048  		 *   (any subsequent access will bump this into the MFU state).
4049  		 * or
4050  		 * - move the buffer to the head of the list if this is
4051  		 *   another prefetch (to make it less likely to be evicted).
4052  		 */
4053  		if (HDR_PREFETCH(hdr)) {
4054  			if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4055  				/* link protected by hash lock */
4056  				ASSERT(multilist_link_active(
4057  				    &hdr->b_l1hdr.b_arc_node));
4058  			} else {
4059  				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
4060  				ARCSTAT_BUMP(arcstat_mru_hits);
4061  			}
4062  			hdr->b_l1hdr.b_arc_access = now;
4063  			return;
4064  		}
4065  
4066  		/*
4067  		 * This buffer has been "accessed" only once so far,
4068  		 * but it is still in the cache. Move it to the MFU
4069  		 * state.
4070  		 */
4071  		if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
4072  			/*
4073  			 * More than 125ms have passed since we
4074  			 * instantiated this buffer.  Move it to the
4075  			 * most frequently used state.
4076  			 */
4077  			hdr->b_l1hdr.b_arc_access = now;
4078  			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4079  			arc_change_state(arc_mfu, hdr, hash_lock);
4080  		}
4081  		ARCSTAT_BUMP(arcstat_mru_hits);
4082  	} else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
4083  		arc_state_t	*new_state;
4084  		/*
4085  		 * This buffer has been "accessed" recently, but
4086  		 * was evicted from the cache.  Move it to the
4087  		 * MFU state.
4088  		 */
4089  
4090  		if (HDR_PREFETCH(hdr)) {
4091  			new_state = arc_mru;
4092  			if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
4093  				hdr->b_flags &= ~ARC_FLAG_PREFETCH;
4094  			DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
4095  		} else {
4096  			new_state = arc_mfu;
4097  			DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4098  		}
4099  
4100  		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4101  		arc_change_state(new_state, hdr, hash_lock);
4102  
4103  		ARCSTAT_BUMP(arcstat_mru_ghost_hits);
4104  	} else if (hdr->b_l1hdr.b_state == arc_mfu) {
4105  		/*
4106  		 * This buffer has been accessed more than once and is
4107  		 * still in the cache.  Keep it in the MFU state.
4108  		 *
4109  		 * NOTE: an add_reference() that occurred when we did
4110  		 * the arc_read() will have kicked this off the list.
4111  		 * If it was a prefetch, we will explicitly move it to
4112  		 * the head of the list now.
4113  		 */
4114  		if ((HDR_PREFETCH(hdr)) != 0) {
4115  			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4116  			/* link protected by hash_lock */
4117  			ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4118  		}
4119  		ARCSTAT_BUMP(arcstat_mfu_hits);
4120  		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4121  	} else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
4122  		arc_state_t	*new_state = arc_mfu;
4123  		/*
4124  		 * This buffer has been accessed more than once but has
4125  		 * been evicted from the cache.  Move it back to the
4126  		 * MFU state.
4127  		 */
4128  
4129  		if (HDR_PREFETCH(hdr)) {
4130  			/*
4131  			 * This is a prefetch access...
4132  			 * move this block back to the MRU state.
4133  			 */
4134  			ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
4135  			new_state = arc_mru;
4136  		}
4137  
4138  		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4139  		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4140  		arc_change_state(new_state, hdr, hash_lock);
4141  
4142  		ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
4143  	} else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
4144  		/*
4145  		 * This buffer is on the 2nd Level ARC.
4146  		 */
4147  
4148  		hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4149  		DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4150  		arc_change_state(arc_mfu, hdr, hash_lock);
4151  	} else {
4152  		ASSERT(!"invalid arc state");
4153  	}
4154  }
4155  
4156  /* a generic arc_done_func_t which you can use */
4157  /* ARGSUSED */
4158  void
arc_bcopy_func(zio_t * zio,arc_buf_t * buf,void * arg)4159  arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
4160  {
4161  	if (zio == NULL || zio->io_error == 0)
4162  		bcopy(buf->b_data, arg, buf->b_hdr->b_size);
4163  	VERIFY(arc_buf_remove_ref(buf, arg));
4164  }
4165  
4166  /* a generic arc_done_func_t */
4167  void
arc_getbuf_func(zio_t * zio,arc_buf_t * buf,void * arg)4168  arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
4169  {
4170  	arc_buf_t **bufp = arg;
4171  	if (zio && zio->io_error) {
4172  		VERIFY(arc_buf_remove_ref(buf, arg));
4173  		*bufp = NULL;
4174  	} else {
4175  		*bufp = buf;
4176  		ASSERT(buf->b_data);
4177  	}
4178  }
4179  
4180  static void
arc_read_done(zio_t * zio)4181  arc_read_done(zio_t *zio)
4182  {
4183  	arc_buf_hdr_t	*hdr;
4184  	arc_buf_t	*buf;
4185  	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
4186  	kmutex_t	*hash_lock = NULL;
4187  	arc_callback_t	*callback_list, *acb;
4188  	int		freeable = FALSE;
4189  
4190  	buf = zio->io_private;
4191  	hdr = buf->b_hdr;
4192  
4193  	/*
4194  	 * The hdr was inserted into hash-table and removed from lists
4195  	 * prior to starting I/O.  We should find this header, since
4196  	 * it's in the hash table, and it should be legit since it's
4197  	 * not possible to evict it during the I/O.  The only possible
4198  	 * reason for it not to be found is if we were freed during the
4199  	 * read.
4200  	 */
4201  	if (HDR_IN_HASH_TABLE(hdr)) {
4202  		ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
4203  		ASSERT3U(hdr->b_dva.dva_word[0], ==,
4204  		    BP_IDENTITY(zio->io_bp)->dva_word[0]);
4205  		ASSERT3U(hdr->b_dva.dva_word[1], ==,
4206  		    BP_IDENTITY(zio->io_bp)->dva_word[1]);
4207  
4208  		arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
4209  		    &hash_lock);
4210  
4211  		ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
4212  		    hash_lock == NULL) ||
4213  		    (found == hdr &&
4214  		    DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
4215  		    (found == hdr && HDR_L2_READING(hdr)));
4216  	}
4217  
4218  	hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
4219  	if (l2arc_noprefetch && HDR_PREFETCH(hdr))
4220  		hdr->b_flags &= ~ARC_FLAG_L2CACHE;
4221  
4222  	/* byteswap if necessary */
4223  	callback_list = hdr->b_l1hdr.b_acb;
4224  	ASSERT(callback_list != NULL);
4225  	if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
4226  		dmu_object_byteswap_t bswap =
4227  		    DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
4228  		arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
4229  		    byteswap_uint64_array :
4230  		    dmu_ot_byteswap[bswap].ob_func;
4231  		func(buf->b_data, hdr->b_size);
4232  	}
4233  
4234  	arc_cksum_compute(buf, B_FALSE);
4235  	arc_buf_watch(buf);
4236  
4237  	if (hash_lock && zio->io_error == 0 &&
4238  	    hdr->b_l1hdr.b_state == arc_anon) {
4239  		/*
4240  		 * Only call arc_access on anonymous buffers.  This is because
4241  		 * if we've issued an I/O for an evicted buffer, we've already
4242  		 * called arc_access (to prevent any simultaneous readers from
4243  		 * getting confused).
4244  		 */
4245  		arc_access(hdr, hash_lock);
4246  	}
4247  
4248  	/* create copies of the data buffer for the callers */
4249  	abuf = buf;
4250  	for (acb = callback_list; acb; acb = acb->acb_next) {
4251  		if (acb->acb_done) {
4252  			if (abuf == NULL) {
4253  				ARCSTAT_BUMP(arcstat_duplicate_reads);
4254  				abuf = arc_buf_clone(buf);
4255  			}
4256  			acb->acb_buf = abuf;
4257  			abuf = NULL;
4258  		}
4259  	}
4260  	hdr->b_l1hdr.b_acb = NULL;
4261  	hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4262  	ASSERT(!HDR_BUF_AVAILABLE(hdr));
4263  	if (abuf == buf) {
4264  		ASSERT(buf->b_efunc == NULL);
4265  		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4266  		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4267  	}
4268  
4269  	ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
4270  	    callback_list != NULL);
4271  
4272  	if (zio->io_error != 0) {
4273  		hdr->b_flags |= ARC_FLAG_IO_ERROR;
4274  		if (hdr->b_l1hdr.b_state != arc_anon)
4275  			arc_change_state(arc_anon, hdr, hash_lock);
4276  		if (HDR_IN_HASH_TABLE(hdr))
4277  			buf_hash_remove(hdr);
4278  		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4279  	}
4280  
4281  	/*
4282  	 * Broadcast before we drop the hash_lock to avoid the possibility
4283  	 * that the hdr (and hence the cv) might be freed before we get to
4284  	 * the cv_broadcast().
4285  	 */
4286  	cv_broadcast(&hdr->b_l1hdr.b_cv);
4287  
4288  	if (hash_lock != NULL) {
4289  		mutex_exit(hash_lock);
4290  	} else {
4291  		/*
4292  		 * This block was freed while we waited for the read to
4293  		 * complete.  It has been removed from the hash table and
4294  		 * moved to the anonymous state (so that it won't show up
4295  		 * in the cache).
4296  		 */
4297  		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
4298  		freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4299  	}
4300  
4301  	/* execute each callback and free its structure */
4302  	while ((acb = callback_list) != NULL) {
4303  		if (acb->acb_done)
4304  			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
4305  
4306  		if (acb->acb_zio_dummy != NULL) {
4307  			acb->acb_zio_dummy->io_error = zio->io_error;
4308  			zio_nowait(acb->acb_zio_dummy);
4309  		}
4310  
4311  		callback_list = acb->acb_next;
4312  		kmem_free(acb, sizeof (arc_callback_t));
4313  	}
4314  
4315  	if (freeable)
4316  		arc_hdr_destroy(hdr);
4317  }
4318  
4319  /*
4320   * "Read" the block at the specified DVA (in bp) via the
4321   * cache.  If the block is found in the cache, invoke the provided
4322   * callback immediately and return.  Note that the `zio' parameter
4323   * in the callback will be NULL in this case, since no IO was
4324   * required.  If the block is not in the cache pass the read request
4325   * on to the spa with a substitute callback function, so that the
4326   * requested block will be added to the cache.
4327   *
4328   * If a read request arrives for a block that has a read in-progress,
4329   * either wait for the in-progress read to complete (and return the
4330   * results); or, if this is a read with a "done" func, add a record
4331   * to the read to invoke the "done" func when the read completes,
4332   * and return; or just return.
4333   *
4334   * arc_read_done() will invoke all the requested "done" functions
4335   * for readers of this block.
4336   */
4337  int
arc_read(zio_t * pio,spa_t * spa,const blkptr_t * bp,arc_done_func_t * done,void * private,zio_priority_t priority,int zio_flags,arc_flags_t * arc_flags,const zbookmark_phys_t * zb)4338  arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
4339      void *private, zio_priority_t priority, int zio_flags,
4340      arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
4341  {
4342  	arc_buf_hdr_t *hdr = NULL;
4343  	arc_buf_t *buf = NULL;
4344  	kmutex_t *hash_lock = NULL;
4345  	zio_t *rzio;
4346  	uint64_t guid = spa_load_guid(spa);
4347  
4348  	ASSERT(!BP_IS_EMBEDDED(bp) ||
4349  	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
4350  
4351  top:
4352  	if (!BP_IS_EMBEDDED(bp)) {
4353  		/*
4354  		 * Embedded BP's have no DVA and require no I/O to "read".
4355  		 * Create an anonymous arc buf to back it.
4356  		 */
4357  		hdr = buf_hash_find(guid, bp, &hash_lock);
4358  	}
4359  
4360  	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
4361  
4362  		*arc_flags |= ARC_FLAG_CACHED;
4363  
4364  		if (HDR_IO_IN_PROGRESS(hdr)) {
4365  
4366  			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
4367  			    priority == ZIO_PRIORITY_SYNC_READ) {
4368  				/*
4369  				 * This sync read must wait for an
4370  				 * in-progress async read (e.g. a predictive
4371  				 * prefetch).  Async reads are queued
4372  				 * separately at the vdev_queue layer, so
4373  				 * this is a form of priority inversion.
4374  				 * Ideally, we would "inherit" the demand
4375  				 * i/o's priority by moving the i/o from
4376  				 * the async queue to the synchronous queue,
4377  				 * but there is currently no mechanism to do
4378  				 * so.  Track this so that we can evaluate
4379  				 * the magnitude of this potential performance
4380  				 * problem.
4381  				 *
4382  				 * Note that if the prefetch i/o is already
4383  				 * active (has been issued to the device),
4384  				 * the prefetch improved performance, because
4385  				 * we issued it sooner than we would have
4386  				 * without the prefetch.
4387  				 */
4388  				DTRACE_PROBE1(arc__sync__wait__for__async,
4389  				    arc_buf_hdr_t *, hdr);
4390  				ARCSTAT_BUMP(arcstat_sync_wait_for_async);
4391  			}
4392  			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4393  				hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
4394  			}
4395  
4396  			if (*arc_flags & ARC_FLAG_WAIT) {
4397  				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
4398  				mutex_exit(hash_lock);
4399  				goto top;
4400  			}
4401  			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4402  
4403  			if (done) {
4404  				arc_callback_t *acb = NULL;
4405  
4406  				acb = kmem_zalloc(sizeof (arc_callback_t),
4407  				    KM_SLEEP);
4408  				acb->acb_done = done;
4409  				acb->acb_private = private;
4410  				if (pio != NULL)
4411  					acb->acb_zio_dummy = zio_null(pio,
4412  					    spa, NULL, NULL, NULL, zio_flags);
4413  
4414  				ASSERT(acb->acb_done != NULL);
4415  				acb->acb_next = hdr->b_l1hdr.b_acb;
4416  				hdr->b_l1hdr.b_acb = acb;
4417  				add_reference(hdr, hash_lock, private);
4418  				mutex_exit(hash_lock);
4419  				return (0);
4420  			}
4421  			mutex_exit(hash_lock);
4422  			return (0);
4423  		}
4424  
4425  		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4426  		    hdr->b_l1hdr.b_state == arc_mfu);
4427  
4428  		if (done) {
4429  			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4430  				/*
4431  				 * This is a demand read which does not have to
4432  				 * wait for i/o because we did a predictive
4433  				 * prefetch i/o for it, which has completed.
4434  				 */
4435  				DTRACE_PROBE1(
4436  				    arc__demand__hit__predictive__prefetch,
4437  				    arc_buf_hdr_t *, hdr);
4438  				ARCSTAT_BUMP(
4439  				    arcstat_demand_hit_predictive_prefetch);
4440  				hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
4441  			}
4442  			add_reference(hdr, hash_lock, private);
4443  			/*
4444  			 * If this block is already in use, create a new
4445  			 * copy of the data so that we will be guaranteed
4446  			 * that arc_release() will always succeed.
4447  			 */
4448  			buf = hdr->b_l1hdr.b_buf;
4449  			ASSERT(buf);
4450  			ASSERT(buf->b_data);
4451  			if (HDR_BUF_AVAILABLE(hdr)) {
4452  				ASSERT(buf->b_efunc == NULL);
4453  				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4454  			} else {
4455  				buf = arc_buf_clone(buf);
4456  			}
4457  
4458  		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
4459  		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4460  			hdr->b_flags |= ARC_FLAG_PREFETCH;
4461  		}
4462  		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4463  		arc_access(hdr, hash_lock);
4464  		if (*arc_flags & ARC_FLAG_L2CACHE)
4465  			hdr->b_flags |= ARC_FLAG_L2CACHE;
4466  		if (*arc_flags & ARC_FLAG_L2COMPRESS)
4467  			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4468  		mutex_exit(hash_lock);
4469  		ARCSTAT_BUMP(arcstat_hits);
4470  		arc_update_hit_stat(hdr, B_TRUE);
4471  
4472  		if (done)
4473  			done(NULL, buf, private);
4474  	} else {
4475  		uint64_t size = BP_GET_LSIZE(bp);
4476  		arc_callback_t *acb;
4477  		vdev_t *vd = NULL;
4478  		uint64_t addr = 0;
4479  		boolean_t devw = B_FALSE;
4480  		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
4481  		int32_t b_asize = 0;
4482  
4483  		if (hdr == NULL) {
4484  			/* this block is not in the cache */
4485  			arc_buf_hdr_t *exists = NULL;
4486  			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4487  			buf = arc_buf_alloc(spa, size, private, type);
4488  			hdr = buf->b_hdr;
4489  			if (!BP_IS_EMBEDDED(bp)) {
4490  				hdr->b_dva = *BP_IDENTITY(bp);
4491  				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
4492  				exists = buf_hash_insert(hdr, &hash_lock);
4493  			}
4494  			if (exists != NULL) {
4495  				/* somebody beat us to the hash insert */
4496  				mutex_exit(hash_lock);
4497  				buf_discard_identity(hdr);
4498  				(void) arc_buf_remove_ref(buf, private);
4499  				goto top; /* restart the IO request */
4500  			}
4501  
4502  			/*
4503  			 * If there is a callback, we pass our reference to
4504  			 * it; otherwise we remove our reference.
4505  			 */
4506  			if (done == NULL) {
4507  				(void) remove_reference(hdr, hash_lock,
4508  				    private);
4509  			}
4510  			if (*arc_flags & ARC_FLAG_PREFETCH)
4511  				hdr->b_flags |= ARC_FLAG_PREFETCH;
4512  			if (*arc_flags & ARC_FLAG_L2CACHE)
4513  				hdr->b_flags |= ARC_FLAG_L2CACHE;
4514  			if (*arc_flags & ARC_FLAG_L2COMPRESS)
4515  				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4516  			if (BP_GET_LEVEL(bp) > 0)
4517  				hdr->b_flags |= ARC_FLAG_INDIRECT;
4518  		} else {
4519  			/*
4520  			 * This block is in the ghost cache. If it was L2-only
4521  			 * (and thus didn't have an L1 hdr), we realloc the
4522  			 * header to add an L1 hdr.
4523  			 */
4524  			if (!HDR_HAS_L1HDR(hdr)) {
4525  				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4526  				    hdr_full_cache);
4527  			}
4528  
4529  			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4530  			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4531  			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4532  			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
4533  
4534  			/*
4535  			 * If there is a callback, we pass a reference to it.
4536  			 */
4537  			if (done != NULL)
4538  				add_reference(hdr, hash_lock, private);
4539  			if (*arc_flags & ARC_FLAG_PREFETCH)
4540  				hdr->b_flags |= ARC_FLAG_PREFETCH;
4541  			if (*arc_flags & ARC_FLAG_L2CACHE)
4542  				hdr->b_flags |= ARC_FLAG_L2CACHE;
4543  			if (*arc_flags & ARC_FLAG_L2COMPRESS)
4544  				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4545  			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4546  			buf->b_hdr = hdr;
4547  			buf->b_data = NULL;
4548  			buf->b_efunc = NULL;
4549  			buf->b_private = NULL;
4550  			buf->b_next = NULL;
4551  			hdr->b_l1hdr.b_buf = buf;
4552  			ASSERT0(hdr->b_l1hdr.b_datacnt);
4553  			hdr->b_l1hdr.b_datacnt = 1;
4554  			arc_get_data_buf(buf);
4555  			arc_access(hdr, hash_lock);
4556  		}
4557  
4558  		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
4559  			hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
4560  		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4561  
4562  		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4563  		acb->acb_done = done;
4564  		acb->acb_private = private;
4565  
4566  		ASSERT(hdr->b_l1hdr.b_acb == NULL);
4567  		hdr->b_l1hdr.b_acb = acb;
4568  		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4569  
4570  		if (HDR_HAS_L2HDR(hdr) &&
4571  		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4572  			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4573  			addr = hdr->b_l2hdr.b_daddr;
4574  			b_compress = hdr->b_l2hdr.b_compress;
4575  			b_asize = hdr->b_l2hdr.b_asize;
4576  			/*
4577  			 * Lock out device removal.
4578  			 */
4579  			if (vdev_is_dead(vd) ||
4580  			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4581  				vd = NULL;
4582  		}
4583  
4584  		if (hash_lock != NULL)
4585  			mutex_exit(hash_lock);
4586  
4587  		/*
4588  		 * At this point, we have a level 1 cache miss.  Try again in
4589  		 * L2ARC if possible.
4590  		 */
4591  		ASSERT3U(hdr->b_size, ==, size);
4592  		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4593  		    uint64_t, size, zbookmark_phys_t *, zb);
4594  		ARCSTAT_BUMP(arcstat_misses);
4595  		arc_update_hit_stat(hdr, B_FALSE);
4596  
4597  		if (priority == ZIO_PRIORITY_ASYNC_READ)
4598  			hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
4599  		else
4600  			hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
4601  
4602  		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4603  			/*
4604  			 * Read from the L2ARC if the following are true:
4605  			 * 1. The L2ARC vdev was previously cached.
4606  			 * 2. This buffer still has L2ARC metadata.
4607  			 * 3. This buffer isn't currently writing to the L2ARC.
4608  			 * 4. The L2ARC entry wasn't evicted, which may
4609  			 *    also have invalidated the vdev.
4610  			 * 5. This isn't prefetch and l2arc_noprefetch is set.
4611  			 */
4612  			if (HDR_HAS_L2HDR(hdr) &&
4613  			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4614  			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4615  				l2arc_read_callback_t *cb;
4616  
4617  				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4618  				ARCSTAT_BUMP(arcstat_l2_hits);
4619  
4620  				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4621  				    KM_SLEEP);
4622  				cb->l2rcb_buf = buf;
4623  				cb->l2rcb_spa = spa;
4624  				cb->l2rcb_bp = *bp;
4625  				cb->l2rcb_zb = *zb;
4626  				cb->l2rcb_flags = zio_flags;
4627  				cb->l2rcb_compress = b_compress;
4628  
4629  				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4630  				    addr + size < vd->vdev_psize -
4631  				    VDEV_LABEL_END_SIZE);
4632  
4633  				/*
4634  				 * l2arc read.  The SCL_L2ARC lock will be
4635  				 * released by l2arc_read_done().
4636  				 * Issue a null zio if the underlying buffer
4637  				 * was squashed to zero size by compression.
4638  				 */
4639  				if (b_compress == ZIO_COMPRESS_EMPTY) {
4640  					rzio = zio_null(pio, spa, vd,
4641  					    l2arc_read_done, cb,
4642  					    zio_flags | ZIO_FLAG_DONT_CACHE |
4643  					    ZIO_FLAG_CANFAIL |
4644  					    ZIO_FLAG_DONT_PROPAGATE |
4645  					    ZIO_FLAG_DONT_RETRY);
4646  				} else {
4647  					rzio = zio_read_phys(pio, vd, addr,
4648  					    b_asize, buf->b_data,
4649  					    ZIO_CHECKSUM_OFF,
4650  					    l2arc_read_done, cb, priority,
4651  					    zio_flags | ZIO_FLAG_DONT_CACHE |
4652  					    ZIO_FLAG_CANFAIL |
4653  					    ZIO_FLAG_DONT_PROPAGATE |
4654  					    ZIO_FLAG_DONT_RETRY, B_FALSE);
4655  				}
4656  				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4657  				    zio_t *, rzio);
4658  				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4659  
4660  				if (*arc_flags & ARC_FLAG_NOWAIT) {
4661  					zio_nowait(rzio);
4662  					return (0);
4663  				}
4664  
4665  				ASSERT(*arc_flags & ARC_FLAG_WAIT);
4666  				if (zio_wait(rzio) == 0)
4667  					return (0);
4668  
4669  				/* l2arc read error; goto zio_read() */
4670  			} else {
4671  				DTRACE_PROBE1(l2arc__miss,
4672  				    arc_buf_hdr_t *, hdr);
4673  				ARCSTAT_BUMP(arcstat_l2_misses);
4674  				if (HDR_L2_WRITING(hdr))
4675  					ARCSTAT_BUMP(arcstat_l2_rw_clash);
4676  				spa_config_exit(spa, SCL_L2ARC, vd);
4677  			}
4678  		} else {
4679  			if (vd != NULL)
4680  				spa_config_exit(spa, SCL_L2ARC, vd);
4681  			if (l2arc_ndev != 0) {
4682  				DTRACE_PROBE1(l2arc__miss,
4683  				    arc_buf_hdr_t *, hdr);
4684  				ARCSTAT_BUMP(arcstat_l2_misses);
4685  			}
4686  		}
4687  
4688  		rzio = zio_read(pio, spa, bp, buf->b_data, size,
4689  		    arc_read_done, buf, priority, zio_flags, zb);
4690  
4691  		if (*arc_flags & ARC_FLAG_WAIT)
4692  			return (zio_wait(rzio));
4693  
4694  		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4695  		zio_nowait(rzio);
4696  	}
4697  	return (0);
4698  }
4699  
4700  void
arc_set_callback(arc_buf_t * buf,arc_evict_func_t * func,void * private)4701  arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4702  {
4703  	ASSERT(buf->b_hdr != NULL);
4704  	ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4705  	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4706  	    func == NULL);
4707  	ASSERT(buf->b_efunc == NULL);
4708  	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4709  
4710  	buf->b_efunc = func;
4711  	buf->b_private = private;
4712  }
4713  
4714  /*
4715   * Notify the arc that a block was freed, and thus will never be used again.
4716   */
4717  void
arc_freed(spa_t * spa,const blkptr_t * bp)4718  arc_freed(spa_t *spa, const blkptr_t *bp)
4719  {
4720  	arc_buf_hdr_t *hdr;
4721  	kmutex_t *hash_lock;
4722  	uint64_t guid = spa_load_guid(spa);
4723  
4724  	ASSERT(!BP_IS_EMBEDDED(bp));
4725  
4726  	hdr = buf_hash_find(guid, bp, &hash_lock);
4727  	if (hdr == NULL)
4728  		return;
4729  	if (HDR_BUF_AVAILABLE(hdr)) {
4730  		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4731  		add_reference(hdr, hash_lock, FTAG);
4732  		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4733  		mutex_exit(hash_lock);
4734  
4735  		arc_release(buf, FTAG);
4736  		(void) arc_buf_remove_ref(buf, FTAG);
4737  	} else {
4738  		mutex_exit(hash_lock);
4739  	}
4740  
4741  }
4742  
4743  /*
4744   * Clear the user eviction callback set by arc_set_callback(), first calling
4745   * it if it exists.  Because the presence of a callback keeps an arc_buf cached
4746   * clearing the callback may result in the arc_buf being destroyed.  However,
4747   * it will not result in the *last* arc_buf being destroyed, hence the data
4748   * will remain cached in the ARC. We make a copy of the arc buffer here so
4749   * that we can process the callback without holding any locks.
4750   *
4751   * It's possible that the callback is already in the process of being cleared
4752   * by another thread.  In this case we can not clear the callback.
4753   *
4754   * Returns B_TRUE if the callback was successfully called and cleared.
4755   */
4756  boolean_t
arc_clear_callback(arc_buf_t * buf)4757  arc_clear_callback(arc_buf_t *buf)
4758  {
4759  	arc_buf_hdr_t *hdr;
4760  	kmutex_t *hash_lock;
4761  	arc_evict_func_t *efunc = buf->b_efunc;
4762  	void *private = buf->b_private;
4763  
4764  	mutex_enter(&buf->b_evict_lock);
4765  	hdr = buf->b_hdr;
4766  	if (hdr == NULL) {
4767  		/*
4768  		 * We are in arc_do_user_evicts().
4769  		 */
4770  		ASSERT(buf->b_data == NULL);
4771  		mutex_exit(&buf->b_evict_lock);
4772  		return (B_FALSE);
4773  	} else if (buf->b_data == NULL) {
4774  		/*
4775  		 * We are on the eviction list; process this buffer now
4776  		 * but let arc_do_user_evicts() do the reaping.
4777  		 */
4778  		buf->b_efunc = NULL;
4779  		mutex_exit(&buf->b_evict_lock);
4780  		VERIFY0(efunc(private));
4781  		return (B_TRUE);
4782  	}
4783  	hash_lock = HDR_LOCK(hdr);
4784  	mutex_enter(hash_lock);
4785  	hdr = buf->b_hdr;
4786  	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4787  
4788  	ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4789  	    hdr->b_l1hdr.b_datacnt);
4790  	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4791  	    hdr->b_l1hdr.b_state == arc_mfu);
4792  
4793  	buf->b_efunc = NULL;
4794  	buf->b_private = NULL;
4795  
4796  	if (hdr->b_l1hdr.b_datacnt > 1) {
4797  		mutex_exit(&buf->b_evict_lock);
4798  		arc_buf_destroy(buf, TRUE);
4799  	} else {
4800  		ASSERT(buf == hdr->b_l1hdr.b_buf);
4801  		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4802  		mutex_exit(&buf->b_evict_lock);
4803  	}
4804  
4805  	mutex_exit(hash_lock);
4806  	VERIFY0(efunc(private));
4807  	return (B_TRUE);
4808  }
4809  
4810  /*
4811   * Release this buffer from the cache, making it an anonymous buffer.  This
4812   * must be done after a read and prior to modifying the buffer contents.
4813   * If the buffer has more than one reference, we must make
4814   * a new hdr for the buffer.
4815   */
4816  void
arc_release(arc_buf_t * buf,void * tag)4817  arc_release(arc_buf_t *buf, void *tag)
4818  {
4819  	arc_buf_hdr_t *hdr = buf->b_hdr;
4820  
4821  	/*
4822  	 * It would be nice to assert that if it's DMU metadata (level >
4823  	 * 0 || it's the dnode file), then it must be syncing context.
4824  	 * But we don't know that information at this level.
4825  	 */
4826  
4827  	mutex_enter(&buf->b_evict_lock);
4828  
4829  	ASSERT(HDR_HAS_L1HDR(hdr));
4830  
4831  	/*
4832  	 * We don't grab the hash lock prior to this check, because if
4833  	 * the buffer's header is in the arc_anon state, it won't be
4834  	 * linked into the hash table.
4835  	 */
4836  	if (hdr->b_l1hdr.b_state == arc_anon) {
4837  		mutex_exit(&buf->b_evict_lock);
4838  		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4839  		ASSERT(!HDR_IN_HASH_TABLE(hdr));
4840  		ASSERT(!HDR_HAS_L2HDR(hdr));
4841  		ASSERT(BUF_EMPTY(hdr));
4842  
4843  		ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4844  		ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4845  		ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4846  
4847  		ASSERT3P(buf->b_efunc, ==, NULL);
4848  		ASSERT3P(buf->b_private, ==, NULL);
4849  
4850  		hdr->b_l1hdr.b_arc_access = 0;
4851  		arc_buf_thaw(buf);
4852  
4853  		return;
4854  	}
4855  
4856  	kmutex_t *hash_lock = HDR_LOCK(hdr);
4857  	mutex_enter(hash_lock);
4858  
4859  	/*
4860  	 * This assignment is only valid as long as the hash_lock is
4861  	 * held, we must be careful not to reference state or the
4862  	 * b_state field after dropping the lock.
4863  	 */
4864  	arc_state_t *state = hdr->b_l1hdr.b_state;
4865  	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4866  	ASSERT3P(state, !=, arc_anon);
4867  
4868  	/* this buffer is not on any list */
4869  	ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4870  
4871  	if (HDR_HAS_L2HDR(hdr)) {
4872  		mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4873  
4874  		/*
4875  		 * We have to recheck this conditional again now that
4876  		 * we're holding the l2ad_mtx to prevent a race with
4877  		 * another thread which might be concurrently calling
4878  		 * l2arc_evict(). In that case, l2arc_evict() might have
4879  		 * destroyed the header's L2 portion as we were waiting
4880  		 * to acquire the l2ad_mtx.
4881  		 */
4882  		if (HDR_HAS_L2HDR(hdr))
4883  			arc_hdr_l2hdr_destroy(hdr);
4884  
4885  		mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4886  	}
4887  
4888  	/*
4889  	 * Do we have more than one buf?
4890  	 */
4891  	if (hdr->b_l1hdr.b_datacnt > 1) {
4892  		arc_buf_hdr_t *nhdr;
4893  		arc_buf_t **bufp;
4894  		uint64_t blksz = hdr->b_size;
4895  		uint64_t spa = hdr->b_spa;
4896  		arc_buf_contents_t type = arc_buf_type(hdr);
4897  		uint32_t flags = hdr->b_flags;
4898  
4899  		ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4900  		/*
4901  		 * Pull the data off of this hdr and attach it to
4902  		 * a new anonymous hdr.
4903  		 */
4904  		(void) remove_reference(hdr, hash_lock, tag);
4905  		bufp = &hdr->b_l1hdr.b_buf;
4906  		while (*bufp != buf)
4907  			bufp = &(*bufp)->b_next;
4908  		*bufp = buf->b_next;
4909  		buf->b_next = NULL;
4910  
4911  		ASSERT3P(state, !=, arc_l2c_only);
4912  
4913  		(void) refcount_remove_many(
4914  		    &state->arcs_size, hdr->b_size, buf);
4915  
4916  		if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4917  			ASSERT3P(state, !=, arc_l2c_only);
4918  			uint64_t *size = &state->arcs_lsize[type];
4919  			ASSERT3U(*size, >=, hdr->b_size);
4920  			atomic_add_64(size, -hdr->b_size);
4921  		}
4922  
4923  		/*
4924  		 * We're releasing a duplicate user data buffer, update
4925  		 * our statistics accordingly.
4926  		 */
4927  		if (HDR_ISTYPE_DATA(hdr)) {
4928  			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4929  			ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4930  			    -hdr->b_size);
4931  		}
4932  		hdr->b_l1hdr.b_datacnt -= 1;
4933  		arc_cksum_verify(buf);
4934  		arc_buf_unwatch(buf);
4935  
4936  		mutex_exit(hash_lock);
4937  
4938  		nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4939  		nhdr->b_size = blksz;
4940  		nhdr->b_spa = spa;
4941  
4942  		nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4943  		nhdr->b_flags |= arc_bufc_to_flags(type);
4944  		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4945  
4946  		nhdr->b_l1hdr.b_buf = buf;
4947  		nhdr->b_l1hdr.b_datacnt = 1;
4948  		nhdr->b_l1hdr.b_state = arc_anon;
4949  		nhdr->b_l1hdr.b_arc_access = 0;
4950  		nhdr->b_l1hdr.b_tmp_cdata = NULL;
4951  		nhdr->b_freeze_cksum = NULL;
4952  
4953  		(void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4954  		buf->b_hdr = nhdr;
4955  		mutex_exit(&buf->b_evict_lock);
4956  		(void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
4957  	} else {
4958  		mutex_exit(&buf->b_evict_lock);
4959  		ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4960  		/* protected by hash lock, or hdr is on arc_anon */
4961  		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4962  		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4963  		arc_change_state(arc_anon, hdr, hash_lock);
4964  		hdr->b_l1hdr.b_arc_access = 0;
4965  		mutex_exit(hash_lock);
4966  
4967  		buf_discard_identity(hdr);
4968  		arc_buf_thaw(buf);
4969  	}
4970  	buf->b_efunc = NULL;
4971  	buf->b_private = NULL;
4972  }
4973  
4974  int
arc_released(arc_buf_t * buf)4975  arc_released(arc_buf_t *buf)
4976  {
4977  	int released;
4978  
4979  	mutex_enter(&buf->b_evict_lock);
4980  	released = (buf->b_data != NULL &&
4981  	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
4982  	mutex_exit(&buf->b_evict_lock);
4983  	return (released);
4984  }
4985  
4986  #ifdef ZFS_DEBUG
4987  int
arc_referenced(arc_buf_t * buf)4988  arc_referenced(arc_buf_t *buf)
4989  {
4990  	int referenced;
4991  
4992  	mutex_enter(&buf->b_evict_lock);
4993  	referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
4994  	mutex_exit(&buf->b_evict_lock);
4995  	return (referenced);
4996  }
4997  #endif
4998  
4999  static void
arc_write_ready(zio_t * zio)5000  arc_write_ready(zio_t *zio)
5001  {
5002  	arc_write_callback_t *callback = zio->io_private;
5003  	arc_buf_t *buf = callback->awcb_buf;
5004  	arc_buf_hdr_t *hdr = buf->b_hdr;
5005  
5006  	ASSERT(HDR_HAS_L1HDR(hdr));
5007  	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
5008  	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
5009  	callback->awcb_ready(zio, buf, callback->awcb_private);
5010  
5011  	/*
5012  	 * If the IO is already in progress, then this is a re-write
5013  	 * attempt, so we need to thaw and re-compute the cksum.
5014  	 * It is the responsibility of the callback to handle the
5015  	 * accounting for any re-write attempt.
5016  	 */
5017  	if (HDR_IO_IN_PROGRESS(hdr)) {
5018  		mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
5019  		if (hdr->b_freeze_cksum != NULL) {
5020  			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
5021  			hdr->b_freeze_cksum = NULL;
5022  		}
5023  		mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
5024  	}
5025  	arc_cksum_compute(buf, B_FALSE);
5026  	hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
5027  }
5028  
5029  /*
5030   * The SPA calls this callback for each physical write that happens on behalf
5031   * of a logical write.  See the comment in dbuf_write_physdone() for details.
5032   */
5033  static void
arc_write_physdone(zio_t * zio)5034  arc_write_physdone(zio_t *zio)
5035  {
5036  	arc_write_callback_t *cb = zio->io_private;
5037  	if (cb->awcb_physdone != NULL)
5038  		cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
5039  }
5040  
5041  static void
arc_write_done(zio_t * zio)5042  arc_write_done(zio_t *zio)
5043  {
5044  	arc_write_callback_t *callback = zio->io_private;
5045  	arc_buf_t *buf = callback->awcb_buf;
5046  	arc_buf_hdr_t *hdr = buf->b_hdr;
5047  
5048  	ASSERT(hdr->b_l1hdr.b_acb == NULL);
5049  
5050  	if (zio->io_error == 0) {
5051  		if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
5052  			buf_discard_identity(hdr);
5053  		} else {
5054  			hdr->b_dva = *BP_IDENTITY(zio->io_bp);
5055  			hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
5056  		}
5057  	} else {
5058  		ASSERT(BUF_EMPTY(hdr));
5059  	}
5060  
5061  	/*
5062  	 * If the block to be written was all-zero or compressed enough to be
5063  	 * embedded in the BP, no write was performed so there will be no
5064  	 * dva/birth/checksum.  The buffer must therefore remain anonymous
5065  	 * (and uncached).
5066  	 */
5067  	if (!BUF_EMPTY(hdr)) {
5068  		arc_buf_hdr_t *exists;
5069  		kmutex_t *hash_lock;
5070  
5071  		ASSERT(zio->io_error == 0);
5072  
5073  		arc_cksum_verify(buf);
5074  
5075  		exists = buf_hash_insert(hdr, &hash_lock);
5076  		if (exists != NULL) {
5077  			/*
5078  			 * This can only happen if we overwrite for
5079  			 * sync-to-convergence, because we remove
5080  			 * buffers from the hash table when we arc_free().
5081  			 */
5082  			if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
5083  				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5084  					panic("bad overwrite, hdr=%p exists=%p",
5085  					    (void *)hdr, (void *)exists);
5086  				ASSERT(refcount_is_zero(
5087  				    &exists->b_l1hdr.b_refcnt));
5088  				arc_change_state(arc_anon, exists, hash_lock);
5089  				mutex_exit(hash_lock);
5090  				arc_hdr_destroy(exists);
5091  				exists = buf_hash_insert(hdr, &hash_lock);
5092  				ASSERT3P(exists, ==, NULL);
5093  			} else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
5094  				/* nopwrite */
5095  				ASSERT(zio->io_prop.zp_nopwrite);
5096  				if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5097  					panic("bad nopwrite, hdr=%p exists=%p",
5098  					    (void *)hdr, (void *)exists);
5099  			} else {
5100  				/* Dedup */
5101  				ASSERT(hdr->b_l1hdr.b_datacnt == 1);
5102  				ASSERT(hdr->b_l1hdr.b_state == arc_anon);
5103  				ASSERT(BP_GET_DEDUP(zio->io_bp));
5104  				ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
5105  			}
5106  		}
5107  		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5108  		/* if it's not anon, we are doing a scrub */
5109  		if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
5110  			arc_access(hdr, hash_lock);
5111  		mutex_exit(hash_lock);
5112  	} else {
5113  		hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5114  	}
5115  
5116  	ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5117  	callback->awcb_done(zio, buf, callback->awcb_private);
5118  
5119  	kmem_free(callback, sizeof (arc_write_callback_t));
5120  }
5121  
5122  zio_t *
arc_write(zio_t * pio,spa_t * spa,uint64_t txg,blkptr_t * bp,arc_buf_t * buf,boolean_t l2arc,boolean_t l2arc_compress,const zio_prop_t * zp,arc_done_func_t * ready,arc_done_func_t * physdone,arc_done_func_t * done,void * private,zio_priority_t priority,int zio_flags,const zbookmark_phys_t * zb)5123  arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
5124      blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
5125      const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
5126      arc_done_func_t *done, void *private, zio_priority_t priority,
5127      int zio_flags, const zbookmark_phys_t *zb)
5128  {
5129  	arc_buf_hdr_t *hdr = buf->b_hdr;
5130  	arc_write_callback_t *callback;
5131  	zio_t *zio;
5132  
5133  	ASSERT(ready != NULL);
5134  	ASSERT(done != NULL);
5135  	ASSERT(!HDR_IO_ERROR(hdr));
5136  	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5137  	ASSERT(hdr->b_l1hdr.b_acb == NULL);
5138  	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
5139  	if (l2arc)
5140  		hdr->b_flags |= ARC_FLAG_L2CACHE;
5141  	if (l2arc_compress)
5142  		hdr->b_flags |= ARC_FLAG_L2COMPRESS;
5143  	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
5144  	callback->awcb_ready = ready;
5145  	callback->awcb_physdone = physdone;
5146  	callback->awcb_done = done;
5147  	callback->awcb_private = private;
5148  	callback->awcb_buf = buf;
5149  
5150  	zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
5151  	    arc_write_ready, arc_write_physdone, arc_write_done, callback,
5152  	    priority, zio_flags, zb);
5153  
5154  	return (zio);
5155  }
5156  
5157  static int
arc_memory_throttle(uint64_t reserve,uint64_t txg)5158  arc_memory_throttle(uint64_t reserve, uint64_t txg)
5159  {
5160  #ifdef _KERNEL
5161  	uint64_t available_memory = ptob(freemem);
5162  	static uint64_t page_load = 0;
5163  	static uint64_t last_txg = 0;
5164  
5165  #if defined(__i386)
5166  	available_memory =
5167  	    MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
5168  #endif
5169  
5170  	if (freemem > physmem * arc_lotsfree_percent / 100)
5171  		return (0);
5172  
5173  	if (txg > last_txg) {
5174  		last_txg = txg;
5175  		page_load = 0;
5176  	}
5177  	/*
5178  	 * If we are in pageout, we know that memory is already tight,
5179  	 * the arc is already going to be evicting, so we just want to
5180  	 * continue to let page writes occur as quickly as possible.
5181  	 */
5182  	if (curproc == proc_pageout) {
5183  		if (page_load > MAX(ptob(minfree), available_memory) / 4)
5184  			return (SET_ERROR(ERESTART));
5185  		/* Note: reserve is inflated, so we deflate */
5186  		page_load += reserve / 8;
5187  		return (0);
5188  	} else if (page_load > 0 && arc_reclaim_needed()) {
5189  		/* memory is low, delay before restarting */
5190  		ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
5191  		return (SET_ERROR(EAGAIN));
5192  	}
5193  	page_load = 0;
5194  #endif
5195  	return (0);
5196  }
5197  
5198  void
arc_tempreserve_clear(uint64_t reserve)5199  arc_tempreserve_clear(uint64_t reserve)
5200  {
5201  	atomic_add_64(&arc_tempreserve, -reserve);
5202  	ASSERT((int64_t)arc_tempreserve >= 0);
5203  }
5204  
5205  int
arc_tempreserve_space(uint64_t reserve,uint64_t txg)5206  arc_tempreserve_space(uint64_t reserve, uint64_t txg)
5207  {
5208  	int error;
5209  	uint64_t anon_size;
5210  
5211  	if (reserve > arc_c/4 && !arc_no_grow)
5212  		arc_c = MIN(arc_c_max, reserve * 4);
5213  	if (reserve > arc_c)
5214  		return (SET_ERROR(ENOMEM));
5215  
5216  	/*
5217  	 * Don't count loaned bufs as in flight dirty data to prevent long
5218  	 * network delays from blocking transactions that are ready to be
5219  	 * assigned to a txg.
5220  	 */
5221  	anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
5222  	    arc_loaned_bytes), 0);
5223  
5224  	/*
5225  	 * Writes will, almost always, require additional memory allocations
5226  	 * in order to compress/encrypt/etc the data.  We therefore need to
5227  	 * make sure that there is sufficient available memory for this.
5228  	 */
5229  	error = arc_memory_throttle(reserve, txg);
5230  	if (error != 0)
5231  		return (error);
5232  
5233  	/*
5234  	 * Throttle writes when the amount of dirty data in the cache
5235  	 * gets too large.  We try to keep the cache less than half full
5236  	 * of dirty blocks so that our sync times don't grow too large.
5237  	 * Note: if two requests come in concurrently, we might let them
5238  	 * both succeed, when one of them should fail.  Not a huge deal.
5239  	 */
5240  
5241  	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
5242  	    anon_size > arc_c / 4) {
5243  		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
5244  		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
5245  		    arc_tempreserve>>10,
5246  		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
5247  		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
5248  		    reserve>>10, arc_c>>10);
5249  		return (SET_ERROR(ERESTART));
5250  	}
5251  	atomic_add_64(&arc_tempreserve, reserve);
5252  	return (0);
5253  }
5254  
5255  static void
arc_kstat_update_state(arc_state_t * state,kstat_named_t * size,kstat_named_t * evict_data,kstat_named_t * evict_metadata)5256  arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
5257      kstat_named_t *evict_data, kstat_named_t *evict_metadata)
5258  {
5259  	size->value.ui64 = refcount_count(&state->arcs_size);
5260  	evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
5261  	evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
5262  }
5263  
5264  static int
arc_kstat_update(kstat_t * ksp,int rw)5265  arc_kstat_update(kstat_t *ksp, int rw)
5266  {
5267  	arc_stats_t *as = ksp->ks_data;
5268  
5269  	if (rw == KSTAT_WRITE) {
5270  		return (EACCES);
5271  	} else {
5272  		arc_kstat_update_state(arc_anon,
5273  		    &as->arcstat_anon_size,
5274  		    &as->arcstat_anon_evictable_data,
5275  		    &as->arcstat_anon_evictable_metadata);
5276  		arc_kstat_update_state(arc_mru,
5277  		    &as->arcstat_mru_size,
5278  		    &as->arcstat_mru_evictable_data,
5279  		    &as->arcstat_mru_evictable_metadata);
5280  		arc_kstat_update_state(arc_mru_ghost,
5281  		    &as->arcstat_mru_ghost_size,
5282  		    &as->arcstat_mru_ghost_evictable_data,
5283  		    &as->arcstat_mru_ghost_evictable_metadata);
5284  		arc_kstat_update_state(arc_mfu,
5285  		    &as->arcstat_mfu_size,
5286  		    &as->arcstat_mfu_evictable_data,
5287  		    &as->arcstat_mfu_evictable_metadata);
5288  		arc_kstat_update_state(arc_mfu_ghost,
5289  		    &as->arcstat_mfu_ghost_size,
5290  		    &as->arcstat_mfu_ghost_evictable_data,
5291  		    &as->arcstat_mfu_ghost_evictable_metadata);
5292  	}
5293  
5294  	return (0);
5295  }
5296  
5297  /*
5298   * This function *must* return indices evenly distributed between all
5299   * sublists of the multilist. This is needed due to how the ARC eviction
5300   * code is laid out; arc_evict_state() assumes ARC buffers are evenly
5301   * distributed between all sublists and uses this assumption when
5302   * deciding which sublist to evict from and how much to evict from it.
5303   */
5304  unsigned int
arc_state_multilist_index_func(multilist_t * ml,void * obj)5305  arc_state_multilist_index_func(multilist_t *ml, void *obj)
5306  {
5307  	arc_buf_hdr_t *hdr = obj;
5308  
5309  	/*
5310  	 * We rely on b_dva to generate evenly distributed index
5311  	 * numbers using buf_hash below. So, as an added precaution,
5312  	 * let's make sure we never add empty buffers to the arc lists.
5313  	 */
5314  	ASSERT(!BUF_EMPTY(hdr));
5315  
5316  	/*
5317  	 * The assumption here, is the hash value for a given
5318  	 * arc_buf_hdr_t will remain constant throughout it's lifetime
5319  	 * (i.e. it's b_spa, b_dva, and b_birth fields don't change).
5320  	 * Thus, we don't need to store the header's sublist index
5321  	 * on insertion, as this index can be recalculated on removal.
5322  	 *
5323  	 * Also, the low order bits of the hash value are thought to be
5324  	 * distributed evenly. Otherwise, in the case that the multilist
5325  	 * has a power of two number of sublists, each sublists' usage
5326  	 * would not be evenly distributed.
5327  	 */
5328  	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5329  	    multilist_get_num_sublists(ml));
5330  }
5331  
5332  void
arc_init(void)5333  arc_init(void)
5334  {
5335  	/*
5336  	 * allmem is "all memory that we could possibly use".
5337  	 */
5338  #ifdef _KERNEL
5339  	uint64_t allmem = ptob(physmem - swapfs_minfree);
5340  #else
5341  	uint64_t allmem = (physmem * PAGESIZE) / 2;
5342  #endif
5343  
5344  	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
5345  	cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
5346  	cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
5347  
5348  	mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5349  	cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
5350  
5351  	/* Convert seconds to clock ticks */
5352  	arc_min_prefetch_lifespan = 1 * hz;
5353  
5354  	/* Start out with 1/8 of all memory */
5355  	arc_c = allmem / 8;
5356  
5357  #ifdef _KERNEL
5358  	/*
5359  	 * On architectures where the physical memory can be larger
5360  	 * than the addressable space (intel in 32-bit mode), we may
5361  	 * need to limit the cache to 1/8 of VM size.
5362  	 */
5363  	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
5364  #endif
5365  
5366  	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
5367  	arc_c_min = MAX(allmem / 32, 64 << 20);
5368  	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
5369  	if (allmem >= 1 << 30)
5370  		arc_c_max = allmem - (1 << 30);
5371  	else
5372  		arc_c_max = arc_c_min;
5373  	arc_c_max = MAX(allmem * 3 / 4, arc_c_max);
5374  
5375  	/*
5376  	 * In userland, there's only the memory pressure that we artificially
5377  	 * create (see arc_available_memory()).  Don't let arc_c get too
5378  	 * small, because it can cause transactions to be larger than
5379  	 * arc_c, causing arc_tempreserve_space() to fail.
5380  	 */
5381  #ifndef _KERNEL
5382  	arc_c_min = arc_c_max / 2;
5383  #endif
5384  
5385  	/*
5386  	 * Allow the tunables to override our calculations if they are
5387  	 * reasonable (ie. over 64MB)
5388  	 */
5389  	if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem)
5390  		arc_c_max = zfs_arc_max;
5391  	if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max)
5392  		arc_c_min = zfs_arc_min;
5393  
5394  	arc_c = arc_c_max;
5395  	arc_p = (arc_c >> 1);
5396  
5397  	/* limit meta-data to 1/4 of the arc capacity */
5398  	arc_meta_limit = arc_c_max / 4;
5399  
5400  	/* Allow the tunable to override if it is reasonable */
5401  	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
5402  		arc_meta_limit = zfs_arc_meta_limit;
5403  
5404  	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
5405  		arc_c_min = arc_meta_limit / 2;
5406  
5407  	if (zfs_arc_meta_min > 0) {
5408  		arc_meta_min = zfs_arc_meta_min;
5409  	} else {
5410  		arc_meta_min = arc_c_min / 2;
5411  	}
5412  
5413  	if (zfs_arc_grow_retry > 0)
5414  		arc_grow_retry = zfs_arc_grow_retry;
5415  
5416  	if (zfs_arc_shrink_shift > 0)
5417  		arc_shrink_shift = zfs_arc_shrink_shift;
5418  
5419  	/*
5420  	 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
5421  	 */
5422  	if (arc_no_grow_shift >= arc_shrink_shift)
5423  		arc_no_grow_shift = arc_shrink_shift - 1;
5424  
5425  	if (zfs_arc_p_min_shift > 0)
5426  		arc_p_min_shift = zfs_arc_p_min_shift;
5427  
5428  	if (zfs_arc_num_sublists_per_state < 1)
5429  		zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
5430  
5431  	/* if kmem_flags are set, lets try to use less memory */
5432  	if (kmem_debugging())
5433  		arc_c = arc_c / 2;
5434  	if (arc_c < arc_c_min)
5435  		arc_c = arc_c_min;
5436  
5437  	arc_anon = &ARC_anon;
5438  	arc_mru = &ARC_mru;
5439  	arc_mru_ghost = &ARC_mru_ghost;
5440  	arc_mfu = &ARC_mfu;
5441  	arc_mfu_ghost = &ARC_mfu_ghost;
5442  	arc_l2c_only = &ARC_l2c_only;
5443  	arc_size = 0;
5444  
5445  	multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
5446  	    sizeof (arc_buf_hdr_t),
5447  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5448  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5449  	multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
5450  	    sizeof (arc_buf_hdr_t),
5451  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5452  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5453  	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
5454  	    sizeof (arc_buf_hdr_t),
5455  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5456  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5457  	multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
5458  	    sizeof (arc_buf_hdr_t),
5459  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5460  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5461  	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
5462  	    sizeof (arc_buf_hdr_t),
5463  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5464  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5465  	multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
5466  	    sizeof (arc_buf_hdr_t),
5467  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5468  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5469  	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
5470  	    sizeof (arc_buf_hdr_t),
5471  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5472  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5473  	multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
5474  	    sizeof (arc_buf_hdr_t),
5475  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5476  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5477  	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
5478  	    sizeof (arc_buf_hdr_t),
5479  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5480  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5481  	multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
5482  	    sizeof (arc_buf_hdr_t),
5483  	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5484  	    zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5485  
5486  	refcount_create(&arc_anon->arcs_size);
5487  	refcount_create(&arc_mru->arcs_size);
5488  	refcount_create(&arc_mru_ghost->arcs_size);
5489  	refcount_create(&arc_mfu->arcs_size);
5490  	refcount_create(&arc_mfu_ghost->arcs_size);
5491  	refcount_create(&arc_l2c_only->arcs_size);
5492  
5493  	buf_init();
5494  
5495  	arc_reclaim_thread_exit = FALSE;
5496  	arc_user_evicts_thread_exit = FALSE;
5497  	arc_eviction_list = NULL;
5498  	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
5499  
5500  	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
5501  	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
5502  
5503  	if (arc_ksp != NULL) {
5504  		arc_ksp->ks_data = &arc_stats;
5505  		arc_ksp->ks_update = arc_kstat_update;
5506  		kstat_install(arc_ksp);
5507  	}
5508  
5509  	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
5510  	    TS_RUN, minclsyspri);
5511  
5512  	(void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
5513  	    TS_RUN, minclsyspri);
5514  
5515  	arc_dead = FALSE;
5516  	arc_warm = B_FALSE;
5517  
5518  	/*
5519  	 * Calculate maximum amount of dirty data per pool.
5520  	 *
5521  	 * If it has been set by /etc/system, take that.
5522  	 * Otherwise, use a percentage of physical memory defined by
5523  	 * zfs_dirty_data_max_percent (default 10%) with a cap at
5524  	 * zfs_dirty_data_max_max (default 4GB).
5525  	 */
5526  	if (zfs_dirty_data_max == 0) {
5527  		zfs_dirty_data_max = physmem * PAGESIZE *
5528  		    zfs_dirty_data_max_percent / 100;
5529  		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
5530  		    zfs_dirty_data_max_max);
5531  	}
5532  }
5533  
5534  void
arc_fini(void)5535  arc_fini(void)
5536  {
5537  	mutex_enter(&arc_reclaim_lock);
5538  	arc_reclaim_thread_exit = TRUE;
5539  	/*
5540  	 * The reclaim thread will set arc_reclaim_thread_exit back to
5541  	 * FALSE when it is finished exiting; we're waiting for that.
5542  	 */
5543  	while (arc_reclaim_thread_exit) {
5544  		cv_signal(&arc_reclaim_thread_cv);
5545  		cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5546  	}
5547  	mutex_exit(&arc_reclaim_lock);
5548  
5549  	mutex_enter(&arc_user_evicts_lock);
5550  	arc_user_evicts_thread_exit = TRUE;
5551  	/*
5552  	 * The user evicts thread will set arc_user_evicts_thread_exit
5553  	 * to FALSE when it is finished exiting; we're waiting for that.
5554  	 */
5555  	while (arc_user_evicts_thread_exit) {
5556  		cv_signal(&arc_user_evicts_cv);
5557  		cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5558  	}
5559  	mutex_exit(&arc_user_evicts_lock);
5560  
5561  	/* Use TRUE to ensure *all* buffers are evicted */
5562  	arc_flush(NULL, TRUE);
5563  
5564  	arc_dead = TRUE;
5565  
5566  	if (arc_ksp != NULL) {
5567  		kstat_delete(arc_ksp);
5568  		arc_ksp = NULL;
5569  	}
5570  
5571  	mutex_destroy(&arc_reclaim_lock);
5572  	cv_destroy(&arc_reclaim_thread_cv);
5573  	cv_destroy(&arc_reclaim_waiters_cv);
5574  
5575  	mutex_destroy(&arc_user_evicts_lock);
5576  	cv_destroy(&arc_user_evicts_cv);
5577  
5578  	refcount_destroy(&arc_anon->arcs_size);
5579  	refcount_destroy(&arc_mru->arcs_size);
5580  	refcount_destroy(&arc_mru_ghost->arcs_size);
5581  	refcount_destroy(&arc_mfu->arcs_size);
5582  	refcount_destroy(&arc_mfu_ghost->arcs_size);
5583  	refcount_destroy(&arc_l2c_only->arcs_size);
5584  
5585  	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
5586  	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
5587  	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
5588  	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
5589  	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
5590  	multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
5591  	multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
5592  	multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
5593  	multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
5594  	multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
5595  
5596  	buf_fini();
5597  
5598  	ASSERT0(arc_loaned_bytes);
5599  }
5600  
5601  /*
5602   * Level 2 ARC
5603   *
5604   * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5605   * It uses dedicated storage devices to hold cached data, which are populated
5606   * using large infrequent writes.  The main role of this cache is to boost
5607   * the performance of random read workloads.  The intended L2ARC devices
5608   * include short-stroked disks, solid state disks, and other media with
5609   * substantially faster read latency than disk.
5610   *
5611   *                 +-----------------------+
5612   *                 |         ARC           |
5613   *                 +-----------------------+
5614   *                    |         ^     ^
5615   *                    |         |     |
5616   *      l2arc_feed_thread()    arc_read()
5617   *                    |         |     |
5618   *                    |  l2arc read   |
5619   *                    V         |     |
5620   *               +---------------+    |
5621   *               |     L2ARC     |    |
5622   *               +---------------+    |
5623   *                   |    ^           |
5624   *          l2arc_write() |           |
5625   *                   |    |           |
5626   *                   V    |           |
5627   *                 +-------+      +-------+
5628   *                 | vdev  |      | vdev  |
5629   *                 | cache |      | cache |
5630   *                 +-------+      +-------+
5631   *                 +=========+     .-----.
5632   *                 :  L2ARC  :    |-_____-|
5633   *                 : devices :    | Disks |
5634   *                 +=========+    `-_____-'
5635   *
5636   * Read requests are satisfied from the following sources, in order:
5637   *
5638   *	1) ARC
5639   *	2) vdev cache of L2ARC devices
5640   *	3) L2ARC devices
5641   *	4) vdev cache of disks
5642   *	5) disks
5643   *
5644   * Some L2ARC device types exhibit extremely slow write performance.
5645   * To accommodate for this there are some significant differences between
5646   * the L2ARC and traditional cache design:
5647   *
5648   * 1. There is no eviction path from the ARC to the L2ARC.  Evictions from
5649   * the ARC behave as usual, freeing buffers and placing headers on ghost
5650   * lists.  The ARC does not send buffers to the L2ARC during eviction as
5651   * this would add inflated write latencies for all ARC memory pressure.
5652   *
5653   * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5654   * It does this by periodically scanning buffers from the eviction-end of
5655   * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5656   * not already there. It scans until a headroom of buffers is satisfied,
5657   * which itself is a buffer for ARC eviction. If a compressible buffer is
5658   * found during scanning and selected for writing to an L2ARC device, we
5659   * temporarily boost scanning headroom during the next scan cycle to make
5660   * sure we adapt to compression effects (which might significantly reduce
5661   * the data volume we write to L2ARC). The thread that does this is
5662   * l2arc_feed_thread(), illustrated below; example sizes are included to
5663   * provide a better sense of ratio than this diagram:
5664   *
5665   *	       head -->                        tail
5666   *	        +---------------------+----------+
5667   *	ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.   # already on L2ARC
5668   *	        +---------------------+----------+   |   o L2ARC eligible
5669   *	ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
5670   *	        +---------------------+----------+   |
5671   *	             15.9 Gbytes      ^ 32 Mbytes    |
5672   *	                           headroom          |
5673   *	                                      l2arc_feed_thread()
5674   *	                                             |
5675   *	                 l2arc write hand <--[oooo]--'
5676   *	                         |           8 Mbyte
5677   *	                         |          write max
5678   *	                         V
5679   *		  +==============================+
5680   *	L2ARC dev |####|#|###|###|    |####| ... |
5681   *	          +==============================+
5682   *	                     32 Gbytes
5683   *
5684   * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5685   * evicted, then the L2ARC has cached a buffer much sooner than it probably
5686   * needed to, potentially wasting L2ARC device bandwidth and storage.  It is
5687   * safe to say that this is an uncommon case, since buffers at the end of
5688   * the ARC lists have moved there due to inactivity.
5689   *
5690   * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5691   * then the L2ARC simply misses copying some buffers.  This serves as a
5692   * pressure valve to prevent heavy read workloads from both stalling the ARC
5693   * with waits and clogging the L2ARC with writes.  This also helps prevent
5694   * the potential for the L2ARC to churn if it attempts to cache content too
5695   * quickly, such as during backups of the entire pool.
5696   *
5697   * 5. After system boot and before the ARC has filled main memory, there are
5698   * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5699   * lists can remain mostly static.  Instead of searching from tail of these
5700   * lists as pictured, the l2arc_feed_thread() will search from the list heads
5701   * for eligible buffers, greatly increasing its chance of finding them.
5702   *
5703   * The L2ARC device write speed is also boosted during this time so that
5704   * the L2ARC warms up faster.  Since there have been no ARC evictions yet,
5705   * there are no L2ARC reads, and no fear of degrading read performance
5706   * through increased writes.
5707   *
5708   * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5709   * the vdev queue can aggregate them into larger and fewer writes.  Each
5710   * device is written to in a rotor fashion, sweeping writes through
5711   * available space then repeating.
5712   *
5713   * 7. The L2ARC does not store dirty content.  It never needs to flush
5714   * write buffers back to disk based storage.
5715   *
5716   * 8. If an ARC buffer is written (and dirtied) which also exists in the
5717   * L2ARC, the now stale L2ARC buffer is immediately dropped.
5718   *
5719   * The performance of the L2ARC can be tweaked by a number of tunables, which
5720   * may be necessary for different workloads:
5721   *
5722   *	l2arc_write_max		max write bytes per interval
5723   *	l2arc_write_boost	extra write bytes during device warmup
5724   *	l2arc_noprefetch	skip caching prefetched buffers
5725   *	l2arc_headroom		number of max device writes to precache
5726   *	l2arc_headroom_boost	when we find compressed buffers during ARC
5727   *				scanning, we multiply headroom by this
5728   *				percentage factor for the next scan cycle,
5729   *				since more compressed buffers are likely to
5730   *				be present
5731   *	l2arc_feed_secs		seconds between L2ARC writing
5732   *
5733   * Tunables may be removed or added as future performance improvements are
5734   * integrated, and also may become zpool properties.
5735   *
5736   * There are three key functions that control how the L2ARC warms up:
5737   *
5738   *	l2arc_write_eligible()	check if a buffer is eligible to cache
5739   *	l2arc_write_size()	calculate how much to write
5740   *	l2arc_write_interval()	calculate sleep delay between writes
5741   *
5742   * These three functions determine what to write, how much, and how quickly
5743   * to send writes.
5744   *
5745   * L2ARC persistency:
5746   *
5747   * When writing buffers to L2ARC, we periodically add some metadata to
5748   * make sure we can pick them up after reboot, thus dramatically reducing
5749   * the impact that any downtime has on the performance of storage systems
5750   * with large caches.
5751   *
5752   * The implementation works fairly simply by integrating the following two
5753   * modifications:
5754   *
5755   * *) Every now and then we mix in a piece of metadata (called a log block)
5756   *    into the L2ARC write. This allows us to understand what's been written,
5757   *    so that we can rebuild the arc_buf_hdr_t structures of the main ARC
5758   *    buffers. The log block also includes a "2-back-reference" pointer to
5759   *    he second-to-previous block, forming a back-linked list of blocks on
5760   *    the L2ARC device.
5761   *
5762   * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
5763   *    for our header bookkeeping purposes. This contains a device header,
5764   *    which contains our top-level reference structures. We update it each
5765   *    time we write a new log block, so that we're able to locate it in the
5766   *    L2ARC device. If this write results in an inconsistent device header
5767   *    (e.g. due to power failure), we detect this by verifying the header's
5768   *    checksum and simply drop the entries from L2ARC.
5769   *
5770   * Implementation diagram:
5771   *
5772   * +=== L2ARC device (not to scale) ======================================+
5773   * |       ___two newest log block pointers__.__________                  |
5774   * |      /                                   \1 back   \latest           |
5775   * |.____/_.                                   V         V                |
5776   * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
5777   * ||   hdr|      ^         /^       /^        /         /                |
5778   * |+------+  ...--\-------/  \-----/--\------/         /                 |
5779   * |                \--------------/    \--------------/                  |
5780   * +======================================================================+
5781   *
5782   * As can be seen on the diagram, rather than using a simple linked list,
5783   * we use a pair of linked lists with alternating elements. This is a
5784   * performance enhancement due to the fact that we only find out of the
5785   * address of the next log block access once the current block has been
5786   * completely read in. Obviously, this hurts performance, because we'd be
5787   * keeping the device's I/O queue at only a 1 operation deep, thus
5788   * incurring a large amount of I/O round-trip latency. Having two lists
5789   * allows us to "prefetch" two log blocks ahead of where we are currently
5790   * rebuilding L2ARC buffers.
5791   *
5792   * On-device data structures:
5793   *
5794   * L2ARC device header:	l2arc_dev_hdr_phys_t
5795   * L2ARC log block:	l2arc_log_blk_phys_t
5796   *
5797   * L2ARC reconstruction:
5798   *
5799   * When writing data, we simply write in the standard rotary fashion,
5800   * evicting buffers as we go and simply writing new data over them (writing
5801   * a new log block every now and then). This obviously means that once we
5802   * loop around the end of the device, we will start cutting into an already
5803   * committed log block (and its referenced data buffers), like so:
5804   *
5805   *    current write head__       __old tail
5806   *                        \     /
5807   *                        V    V
5808   * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
5809   *                         ^    ^^^^^^^^^___________________________________
5810   *                         |                                                \
5811   *                   <<nextwrite>> may overwrite this blk and/or its bufs --'
5812   *
5813   * When importing the pool, we detect this situation and use it to stop
5814   * our scanning process (see l2arc_rebuild).
5815   *
5816   * There is one significant caveat to consider when rebuilding ARC contents
5817   * from an L2ARC device: what about invalidated buffers? Given the above
5818   * construction, we cannot update blocks which we've already written to amend
5819   * them to remove buffers which were invalidated. Thus, during reconstruction,
5820   * we might be populating the cache with buffers for data that's not on the
5821   * main pool anymore, or may have been overwritten!
5822   *
5823   * As it turns out, this isn't a problem. Every arc_read request includes
5824   * both the DVA and, crucially, the birth TXG of the BP the caller is
5825   * looking for. So even if the cache were populated by completely rotten
5826   * blocks for data that had been long deleted and/or overwritten, we'll
5827   * never actually return bad data from the cache, since the DVA with the
5828   * birth TXG uniquely identify a block in space and time - once created,
5829   * a block is immutable on disk. The worst thing we have done is wasted
5830   * some time and memory at l2arc rebuild to reconstruct outdated ARC
5831   * entries that will get dropped from the l2arc as it is being updated
5832   * with new blocks.
5833   */
5834  
5835  static boolean_t
l2arc_write_eligible(uint64_t spa_guid,uint64_t sync_txg,arc_buf_hdr_t * hdr)5836  l2arc_write_eligible(uint64_t spa_guid, uint64_t sync_txg, arc_buf_hdr_t *hdr)
5837  {
5838  	/*
5839  	 * A buffer is *not* eligible for the L2ARC if it:
5840  	 * 1. belongs to a different spa.
5841  	 * 2. is already cached on the L2ARC.
5842  	 * 3. has an I/O in progress (it may be an incomplete read).
5843  	 * 4. is flagged not eligible (zfs property).
5844  	 * 5. is part of the syncing txg (and thus subject to change).
5845  	 */
5846  	if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
5847  	    HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr) ||
5848  	    hdr->b_birth >= sync_txg)
5849  		return (B_FALSE);
5850  
5851  	return (B_TRUE);
5852  }
5853  
5854  static uint64_t
l2arc_write_size(void)5855  l2arc_write_size(void)
5856  {
5857  	uint64_t size;
5858  
5859  	/*
5860  	 * Make sure our globals have meaningful values in case the user
5861  	 * altered them.
5862  	 */
5863  	size = l2arc_write_max;
5864  	if (size == 0) {
5865  		cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5866  		    "be greater than zero, resetting it to the default (%d)",
5867  		    L2ARC_WRITE_SIZE);
5868  		size = l2arc_write_max = L2ARC_WRITE_SIZE;
5869  	}
5870  
5871  	if (arc_warm == B_FALSE)
5872  		size += l2arc_write_boost;
5873  
5874  	return (size);
5875  
5876  }
5877  
5878  static clock_t
l2arc_write_interval(clock_t began,uint64_t wanted,uint64_t wrote)5879  l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5880  {
5881  	clock_t interval, next, now;
5882  
5883  	/*
5884  	 * If the ARC lists are busy, increase our write rate; if the
5885  	 * lists are stale, idle back.  This is achieved by checking
5886  	 * how much we previously wrote - if it was more than half of
5887  	 * what we wanted, schedule the next write much sooner.
5888  	 */
5889  	if (l2arc_feed_again && wrote > (wanted / 2))
5890  		interval = (hz * l2arc_feed_min_ms) / 1000;
5891  	else
5892  		interval = hz * l2arc_feed_secs;
5893  
5894  	now = ddi_get_lbolt();
5895  	next = MAX(now, MIN(now + interval, began + interval));
5896  
5897  	return (next);
5898  }
5899  
5900  /*
5901   * Cycle through L2ARC devices.  This is how L2ARC load balances.
5902   * If a device is returned, this also returns holding the spa config lock.
5903   */
5904  static l2arc_dev_t *
l2arc_dev_get_next(void)5905  l2arc_dev_get_next(void)
5906  {
5907  	l2arc_dev_t *first, *next = NULL;
5908  
5909  	/*
5910  	 * Lock out the removal of spas (spa_namespace_lock), then removal
5911  	 * of cache devices (l2arc_dev_mtx).  Once a device has been selected,
5912  	 * both locks will be dropped and a spa config lock held instead.
5913  	 */
5914  	mutex_enter(&spa_namespace_lock);
5915  	mutex_enter(&l2arc_dev_mtx);
5916  
5917  	/* if there are no vdevs, there is nothing to do */
5918  	if (l2arc_ndev == 0)
5919  		goto out;
5920  
5921  	first = NULL;
5922  	next = l2arc_dev_last;
5923  	do {
5924  		/* loop around the list looking for a non-faulted vdev */
5925  		if (next == NULL) {
5926  			next = list_head(l2arc_dev_list);
5927  		} else {
5928  			next = list_next(l2arc_dev_list, next);
5929  			if (next == NULL)
5930  				next = list_head(l2arc_dev_list);
5931  		}
5932  
5933  		/* if we have come back to the start, bail out */
5934  		if (first == NULL)
5935  			first = next;
5936  		else if (next == first)
5937  			break;
5938  
5939  	} while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
5940  
5941  	/* if we were unable to find any usable vdevs, return NULL */
5942  	if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
5943  		next = NULL;
5944  
5945  	l2arc_dev_last = next;
5946  
5947  out:
5948  	mutex_exit(&l2arc_dev_mtx);
5949  
5950  	/*
5951  	 * Grab the config lock to prevent the 'next' device from being
5952  	 * removed while we are writing to it.
5953  	 */
5954  	if (next != NULL)
5955  		spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5956  	mutex_exit(&spa_namespace_lock);
5957  
5958  	return (next);
5959  }
5960  
5961  /*
5962   * Free buffers that were tagged for destruction.
5963   */
5964  static void
l2arc_do_free_on_write()5965  l2arc_do_free_on_write()
5966  {
5967  	list_t *buflist;
5968  	l2arc_data_free_t *df, *df_prev;
5969  
5970  	mutex_enter(&l2arc_free_on_write_mtx);
5971  	buflist = l2arc_free_on_write;
5972  
5973  	for (df = list_tail(buflist); df; df = df_prev) {
5974  		df_prev = list_prev(buflist, df);
5975  		ASSERT(df->l2df_data != NULL);
5976  		ASSERT(df->l2df_func != NULL);
5977  		df->l2df_func(df->l2df_data, df->l2df_size);
5978  		list_remove(buflist, df);
5979  		kmem_free(df, sizeof (l2arc_data_free_t));
5980  	}
5981  
5982  	mutex_exit(&l2arc_free_on_write_mtx);
5983  }
5984  
5985  /*
5986   * A write to a cache device has completed.  Update all headers to allow
5987   * reads from these buffers to begin.
5988   */
5989  static void
l2arc_write_done(zio_t * zio)5990  l2arc_write_done(zio_t *zio)
5991  {
5992  	l2arc_write_callback_t *cb;
5993  	l2arc_dev_t *dev;
5994  	list_t *buflist;
5995  	arc_buf_hdr_t *head, *hdr, *hdr_prev;
5996  	kmutex_t *hash_lock;
5997  	int64_t bytes_dropped = 0;
5998  	l2arc_log_blk_buf_t *lb_buf;
5999  
6000  	cb = zio->io_private;
6001  	ASSERT(cb != NULL);
6002  	dev = cb->l2wcb_dev;
6003  	ASSERT(dev != NULL);
6004  	head = cb->l2wcb_head;
6005  	ASSERT(head != NULL);
6006  	buflist = &dev->l2ad_buflist;
6007  	ASSERT(buflist != NULL);
6008  	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
6009  	    l2arc_write_callback_t *, cb);
6010  
6011  	if (zio->io_error != 0)
6012  		ARCSTAT_BUMP(arcstat_l2_writes_error);
6013  
6014  	/*
6015  	 * All writes completed, or an error was hit.
6016  	 */
6017  top:
6018  	mutex_enter(&dev->l2ad_mtx);
6019  	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
6020  		hdr_prev = list_prev(buflist, hdr);
6021  
6022  		hash_lock = HDR_LOCK(hdr);
6023  
6024  		/*
6025  		 * We cannot use mutex_enter or else we can deadlock
6026  		 * with l2arc_write_buffers (due to swapping the order
6027  		 * the hash lock and l2ad_mtx are taken).
6028  		 */
6029  		if (!mutex_tryenter(hash_lock)) {
6030  			/*
6031  			 * Missed the hash lock. We must retry so we
6032  			 * don't leave the ARC_FLAG_L2_WRITING bit set.
6033  			 */
6034  			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
6035  
6036  			/*
6037  			 * We don't want to rescan the headers we've
6038  			 * already marked as having been written out, so
6039  			 * we reinsert the head node so we can pick up
6040  			 * where we left off.
6041  			 */
6042  			list_remove(buflist, head);
6043  			list_insert_after(buflist, hdr, head);
6044  
6045  			mutex_exit(&dev->l2ad_mtx);
6046  
6047  			/*
6048  			 * We wait for the hash lock to become available
6049  			 * to try and prevent busy waiting, and increase
6050  			 * the chance we'll be able to acquire the lock
6051  			 * the next time around.
6052  			 */
6053  			mutex_enter(hash_lock);
6054  			mutex_exit(hash_lock);
6055  			goto top;
6056  		}
6057  
6058  		/*
6059  		 * We could not have been moved into the arc_l2c_only
6060  		 * state while in-flight due to our ARC_FLAG_L2_WRITING
6061  		 * bit being set. Let's just ensure that's being enforced.
6062  		 */
6063  		ASSERT(HDR_HAS_L1HDR(hdr));
6064  
6065  		/*
6066  		 * We may have allocated a buffer for L2ARC compression,
6067  		 * we must release it to avoid leaking this data.
6068  		 */
6069  		l2arc_release_cdata_buf(hdr);
6070  
6071  		if (zio->io_error != 0) {
6072  			/*
6073  			 * Error - drop L2ARC entry.
6074  			 */
6075  			list_remove(buflist, hdr);
6076  			hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
6077  
6078  			ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
6079  			ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
6080  
6081  			bytes_dropped += hdr->b_l2hdr.b_asize;
6082  			(void) refcount_remove_many(&dev->l2ad_alloc,
6083  			    hdr->b_l2hdr.b_asize, hdr);
6084  		}
6085  
6086  		/*
6087  		 * Allow ARC to begin reads and ghost list evictions to
6088  		 * this L2ARC entry.
6089  		 */
6090  		hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
6091  
6092  		mutex_exit(hash_lock);
6093  	}
6094  
6095  	atomic_inc_64(&l2arc_writes_done);
6096  	list_remove(buflist, head);
6097  	ASSERT(!HDR_HAS_L1HDR(head));
6098  	kmem_cache_free(hdr_l2only_cache, head);
6099  	mutex_exit(&dev->l2ad_mtx);
6100  
6101  	ASSERT(dev->l2ad_vdev != NULL);
6102  	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
6103  
6104  	l2arc_do_free_on_write();
6105  
6106  	while ((lb_buf = list_remove_tail(&cb->l2wcb_log_blk_buflist)) != NULL)
6107  		kmem_free(lb_buf, sizeof (*lb_buf));
6108  	list_destroy(&cb->l2wcb_log_blk_buflist);
6109  	kmem_free(cb, sizeof (l2arc_write_callback_t));
6110  }
6111  
6112  /*
6113   * A read to a cache device completed.  Validate buffer contents before
6114   * handing over to the regular ARC routines.
6115   */
6116  static void
l2arc_read_done(zio_t * zio)6117  l2arc_read_done(zio_t *zio)
6118  {
6119  	l2arc_read_callback_t *cb;
6120  	arc_buf_hdr_t *hdr;
6121  	arc_buf_t *buf;
6122  	kmutex_t *hash_lock;
6123  	int equal;
6124  
6125  	ASSERT(zio->io_vd != NULL);
6126  	ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
6127  
6128  	spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
6129  
6130  	cb = zio->io_private;
6131  	ASSERT(cb != NULL);
6132  	buf = cb->l2rcb_buf;
6133  	ASSERT(buf != NULL);
6134  
6135  	hash_lock = HDR_LOCK(buf->b_hdr);
6136  	mutex_enter(hash_lock);
6137  	hdr = buf->b_hdr;
6138  	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6139  
6140  	/*
6141  	 * If the buffer was compressed, decompress it first.
6142  	 */
6143  	if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
6144  		l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
6145  	ASSERT(zio->io_data != NULL);
6146  	ASSERT3U(zio->io_size, ==, hdr->b_size);
6147  	ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);
6148  
6149  	/*
6150  	 * Check this survived the L2ARC journey.
6151  	 */
6152  	equal = arc_cksum_equal(buf);
6153  	if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
6154  		mutex_exit(hash_lock);
6155  		zio->io_private = buf;
6156  		zio->io_bp_copy = cb->l2rcb_bp;	/* XXX fix in L2ARC 2.0	*/
6157  		zio->io_bp = &zio->io_bp_copy;	/* XXX fix in L2ARC 2.0	*/
6158  		arc_read_done(zio);
6159  	} else {
6160  		mutex_exit(hash_lock);
6161  		/*
6162  		 * Buffer didn't survive caching.  Increment stats and
6163  		 * reissue to the original storage device.
6164  		 */
6165  		if (zio->io_error != 0) {
6166  			ARCSTAT_BUMP(arcstat_l2_io_error);
6167  		} else {
6168  			zio->io_error = SET_ERROR(EIO);
6169  		}
6170  		if (!equal)
6171  			ARCSTAT_BUMP(arcstat_l2_cksum_bad);
6172  
6173  		/*
6174  		 * If there's no waiter, issue an async i/o to the primary
6175  		 * storage now.  If there *is* a waiter, the caller must
6176  		 * issue the i/o in a context where it's OK to block.
6177  		 */
6178  		if (zio->io_waiter == NULL) {
6179  			zio_t *pio = zio_unique_parent(zio);
6180  
6181  			ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
6182  
6183  			zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
6184  			    buf->b_data, hdr->b_size, arc_read_done, buf,
6185  			    zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
6186  		}
6187  	}
6188  
6189  	kmem_free(cb, sizeof (l2arc_read_callback_t));
6190  }
6191  
6192  /*
6193   * This is the list priority from which the L2ARC will search for pages to
6194   * cache.  This is used within loops (0..3) to cycle through lists in the
6195   * desired order.  This order can have a significant effect on cache
6196   * performance.
6197   *
6198   * Currently the metadata lists are hit first, MFU then MRU, followed by
6199   * the data lists.  This function returns a locked list, and also returns
6200   * the lock pointer.
6201   */
6202  static multilist_sublist_t *
l2arc_sublist_lock(int list_num)6203  l2arc_sublist_lock(int list_num)
6204  {
6205  	multilist_t *ml = NULL;
6206  	unsigned int idx;
6207  
6208  	ASSERT(list_num >= 0 && list_num <= 3);
6209  
6210  	switch (list_num) {
6211  	case 0:
6212  		ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
6213  		break;
6214  	case 1:
6215  		ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
6216  		break;
6217  	case 2:
6218  		ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
6219  		break;
6220  	case 3:
6221  		ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
6222  		break;
6223  	}
6224  
6225  	/*
6226  	 * Return a randomly-selected sublist. This is acceptable
6227  	 * because the caller feeds only a little bit of data for each
6228  	 * call (8MB). Subsequent calls will result in different
6229  	 * sublists being selected.
6230  	 */
6231  	idx = multilist_get_random_index(ml);
6232  	return (multilist_sublist_lock(ml, idx));
6233  }
6234  
6235  /*
6236   * Calculates the maximum overhead of L2ARC metadata log blocks for a given
6237   * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
6238   * overhead in processing to make sure there is enough headroom available
6239   * when writing buffers.
6240   */
6241  static inline uint64_t
l2arc_log_blk_overhead(uint64_t write_sz)6242  l2arc_log_blk_overhead(uint64_t write_sz)
6243  {
6244  	return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
6245  	    L2ARC_LOG_BLK_SIZE;
6246  }
6247  
6248  /*
6249   * Evict buffers from the device write hand to the distance specified in
6250   * bytes.  This distance may span populated buffers, it may span nothing.
6251   * This is clearing a region on the L2ARC device ready for writing.
6252   * If the 'all' boolean is set, every buffer is evicted.
6253   */
6254  static void
l2arc_evict(l2arc_dev_t * dev,uint64_t distance,boolean_t all)6255  l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
6256  {
6257  	list_t *buflist;
6258  	arc_buf_hdr_t *hdr, *hdr_prev;
6259  	kmutex_t *hash_lock;
6260  	uint64_t taddr;
6261  
6262  	buflist = &dev->l2ad_buflist;
6263  
6264  	if (!all && dev->l2ad_first) {
6265  		/*
6266  		 * This is the first sweep through the device.  There is
6267  		 * nothing to evict.
6268  		 */
6269  		return;
6270  	}
6271  
6272  	/*
6273  	 * We need to add in the worst case scenario of log block overhead.
6274  	 */
6275  	distance += l2arc_log_blk_overhead(distance);
6276  	if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
6277  		/*
6278  		 * When nearing the end of the device, evict to the end
6279  		 * before the device write hand jumps to the start.
6280  		 */
6281  		taddr = dev->l2ad_end;
6282  	} else {
6283  		taddr = dev->l2ad_hand + distance;
6284  	}
6285  	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
6286  	    uint64_t, taddr, boolean_t, all);
6287  
6288  top:
6289  	mutex_enter(&dev->l2ad_mtx);
6290  	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
6291  		hdr_prev = list_prev(buflist, hdr);
6292  
6293  		hash_lock = HDR_LOCK(hdr);
6294  
6295  		/*
6296  		 * We cannot use mutex_enter or else we can deadlock
6297  		 * with l2arc_write_buffers (due to swapping the order
6298  		 * the hash lock and l2ad_mtx are taken).
6299  		 */
6300  		if (!mutex_tryenter(hash_lock)) {
6301  			/*
6302  			 * Missed the hash lock.  Retry.
6303  			 */
6304  			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
6305  			mutex_exit(&dev->l2ad_mtx);
6306  			mutex_enter(hash_lock);
6307  			mutex_exit(hash_lock);
6308  			goto top;
6309  		}
6310  
6311  		if (HDR_L2_WRITE_HEAD(hdr)) {
6312  			/*
6313  			 * We hit a write head node.  Leave it for
6314  			 * l2arc_write_done().
6315  			 */
6316  			list_remove(buflist, hdr);
6317  			mutex_exit(hash_lock);
6318  			continue;
6319  		}
6320  
6321  		if (!all && HDR_HAS_L2HDR(hdr) &&
6322  		    (hdr->b_l2hdr.b_daddr > taddr ||
6323  		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
6324  			/*
6325  			 * We've evicted to the target address,
6326  			 * or the end of the device.
6327  			 */
6328  			mutex_exit(hash_lock);
6329  			break;
6330  		}
6331  
6332  		ASSERT(HDR_HAS_L2HDR(hdr));
6333  		if (!HDR_HAS_L1HDR(hdr)) {
6334  			ASSERT(!HDR_L2_READING(hdr));
6335  			/*
6336  			 * This doesn't exist in the ARC.  Destroy.
6337  			 * arc_hdr_destroy() will call list_remove()
6338  			 * and decrement arcstat_l2_size.
6339  			 */
6340  			arc_change_state(arc_anon, hdr, hash_lock);
6341  			arc_hdr_destroy(hdr);
6342  		} else {
6343  			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
6344  			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
6345  			/*
6346  			 * Invalidate issued or about to be issued
6347  			 * reads, since we may be about to write
6348  			 * over this location.
6349  			 */
6350  			if (HDR_L2_READING(hdr)) {
6351  				ARCSTAT_BUMP(arcstat_l2_evict_reading);
6352  				hdr->b_flags |= ARC_FLAG_L2_EVICTED;
6353  			}
6354  
6355  			/* Ensure this header has finished being written */
6356  			ASSERT(!HDR_L2_WRITING(hdr));
6357  			ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6358  
6359  			arc_hdr_l2hdr_destroy(hdr);
6360  		}
6361  		mutex_exit(hash_lock);
6362  	}
6363  	mutex_exit(&dev->l2ad_mtx);
6364  }
6365  
6366  /*
6367   * Find and write ARC buffers to the L2ARC device.
6368   *
6369   * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
6370   * for reading until they have completed writing.
6371   * The headroom_boost is an in-out parameter used to maintain headroom boost
6372   * state between calls to this function.
6373   *
6374   * Returns the number of bytes actually written (which may be smaller than
6375   * the delta by which the device hand has changed due to alignment).
6376   */
6377  static uint64_t
l2arc_write_buffers(spa_t * spa,l2arc_dev_t * dev,uint64_t target_sz,boolean_t * headroom_boost)6378  l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
6379      boolean_t *headroom_boost)
6380  {
6381  	arc_buf_hdr_t *hdr, *hdr_prev, *head;
6382  	uint64_t write_asize, write_sz, headroom,
6383  	    buf_compress_minsz;
6384  	void *buf_data;
6385  	boolean_t full;
6386  	l2arc_write_callback_t *cb;
6387  	zio_t *pio, *wzio;
6388  	uint64_t guid = spa_load_guid(spa);
6389  	uint64_t sync_txg = spa_syncing_txg(spa);
6390  	const boolean_t do_headroom_boost = *headroom_boost;
6391  	boolean_t dev_hdr_update = B_FALSE;
6392  
6393  	ASSERT(dev->l2ad_vdev != NULL);
6394  
6395  	/* Lower the flag now, we might want to raise it again later. */
6396  	*headroom_boost = B_FALSE;
6397  
6398  	pio = NULL;
6399  	cb = NULL;
6400  	write_sz = write_asize = 0;
6401  	full = B_FALSE;
6402  	head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
6403  	head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
6404  	head->b_flags |= ARC_FLAG_HAS_L2HDR;
6405  
6406  	/*
6407  	 * We will want to try to compress buffers that are at least 2x the
6408  	 * device sector size.
6409  	 */
6410  	buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
6411  
6412  	/*
6413  	 * Copy buffers for L2ARC writing.
6414  	 */
6415  	for (int try = 0; try <= 3; try++) {
6416  		multilist_sublist_t *mls = l2arc_sublist_lock(try);
6417  		uint64_t passed_sz = 0;
6418  
6419  		/*
6420  		 * L2ARC fast warmup.
6421  		 *
6422  		 * Until the ARC is warm and starts to evict, read from the
6423  		 * head of the ARC lists rather than the tail.
6424  		 */
6425  		if (arc_warm == B_FALSE)
6426  			hdr = multilist_sublist_head(mls);
6427  		else
6428  			hdr = multilist_sublist_tail(mls);
6429  
6430  		headroom = target_sz * l2arc_headroom;
6431  		if (do_headroom_boost)
6432  			headroom = (headroom * l2arc_headroom_boost) / 100;
6433  
6434  		for (; hdr; hdr = hdr_prev) {
6435  			kmutex_t *hash_lock;
6436  			uint64_t buf_sz;
6437  			uint64_t buf_a_sz;
6438  
6439  			if (arc_warm == B_FALSE)
6440  				hdr_prev = multilist_sublist_next(mls, hdr);
6441  			else
6442  				hdr_prev = multilist_sublist_prev(mls, hdr);
6443  
6444  			hash_lock = HDR_LOCK(hdr);
6445  			if (!mutex_tryenter(hash_lock)) {
6446  				/*
6447  				 * Skip this buffer rather than waiting.
6448  				 */
6449  				continue;
6450  			}
6451  
6452  			passed_sz += hdr->b_size;
6453  			if (passed_sz > headroom) {
6454  				/*
6455  				 * Searched too far.
6456  				 */
6457  				mutex_exit(hash_lock);
6458  				break;
6459  			}
6460  
6461  			if (!l2arc_write_eligible(guid, sync_txg, hdr)) {
6462  				mutex_exit(hash_lock);
6463  				continue;
6464  			}
6465  
6466  			/*
6467  			 * Assume that the buffer is not going to be compressed
6468  			 * and could take more space on disk because of a larger
6469  			 * disk block size.
6470  			 */
6471  			buf_sz = hdr->b_size;
6472  			buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6473  
6474  			if ((write_asize + buf_a_sz) > target_sz) {
6475  				full = B_TRUE;
6476  				mutex_exit(hash_lock);
6477  				break;
6478  			}
6479  
6480  			if (pio == NULL) {
6481  				/*
6482  				 * Insert a dummy header on the buflist so
6483  				 * l2arc_write_done() can find where the
6484  				 * write buffers begin without searching.
6485  				 */
6486  				mutex_enter(&dev->l2ad_mtx);
6487  				list_insert_head(&dev->l2ad_buflist, head);
6488  				mutex_exit(&dev->l2ad_mtx);
6489  
6490  				cb = kmem_zalloc(
6491  				    sizeof (l2arc_write_callback_t), KM_SLEEP);
6492  				cb->l2wcb_dev = dev;
6493  				cb->l2wcb_head = head;
6494  				list_create(&cb->l2wcb_log_blk_buflist,
6495  				    sizeof (l2arc_log_blk_buf_t),
6496  				    offsetof(l2arc_log_blk_buf_t, lbb_node));
6497  				pio = zio_root(spa, l2arc_write_done, cb,
6498  				    ZIO_FLAG_CANFAIL);
6499  			}
6500  
6501  			/*
6502  			 * Create and add a new L2ARC header.
6503  			 */
6504  			hdr->b_l2hdr.b_dev = dev;
6505  			hdr->b_flags |= ARC_FLAG_L2_WRITING;
6506  			/*
6507  			 * Temporarily stash the data buffer in b_tmp_cdata.
6508  			 * The subsequent write step will pick it up from
6509  			 * there. This is because can't access b_l1hdr.b_buf
6510  			 * without holding the hash_lock, which we in turn
6511  			 * can't access without holding the ARC list locks
6512  			 * (which we want to avoid during compression/writing).
6513  			 */
6514  			hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
6515  			hdr->b_l2hdr.b_asize = hdr->b_size;
6516  			hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
6517  
6518  			/*
6519  			 * Explicitly set the b_daddr field to a known
6520  			 * value which means "invalid address". This
6521  			 * enables us to differentiate which stage of
6522  			 * l2arc_write_buffers() the particular header
6523  			 * is in (e.g. this loop, or the one below).
6524  			 * ARC_FLAG_L2_WRITING is not enough to make
6525  			 * this distinction, and we need to know in
6526  			 * order to do proper l2arc vdev accounting in
6527  			 * arc_release() and arc_hdr_destroy().
6528  			 *
6529  			 * Note, we can't use a new flag to distinguish
6530  			 * the two stages because we don't hold the
6531  			 * header's hash_lock below, in the second stage
6532  			 * of this function. Thus, we can't simply
6533  			 * change the b_flags field to denote that the
6534  			 * IO has been sent. We can change the b_daddr
6535  			 * field of the L2 portion, though, since we'll
6536  			 * be holding the l2ad_mtx; which is why we're
6537  			 * using it to denote the header's state change.
6538  			 */
6539  			hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
6540  
6541  			hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
6542  
6543  			mutex_enter(&dev->l2ad_mtx);
6544  			list_insert_head(&dev->l2ad_buflist, hdr);
6545  			mutex_exit(&dev->l2ad_mtx);
6546  
6547  			/*
6548  			 * Compute and store the buffer cksum before
6549  			 * writing.  On debug the cksum is verified first.
6550  			 */
6551  			arc_cksum_verify(hdr->b_l1hdr.b_buf);
6552  			arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
6553  
6554  			mutex_exit(hash_lock);
6555  
6556  			write_sz += buf_sz;
6557  			write_asize += buf_a_sz;
6558  		}
6559  
6560  		multilist_sublist_unlock(mls);
6561  
6562  		if (full == B_TRUE)
6563  			break;
6564  	}
6565  
6566  	/* No buffers selected for writing? */
6567  	if (pio == NULL) {
6568  		ASSERT0(write_sz);
6569  		ASSERT(!HDR_HAS_L1HDR(head));
6570  		kmem_cache_free(hdr_l2only_cache, head);
6571  		return (0);
6572  	}
6573  
6574  	mutex_enter(&dev->l2ad_mtx);
6575  
6576  	/*
6577  	 * Note that elsewhere in this file arcstat_l2_asize
6578  	 * and the used space on l2ad_vdev are updated using b_asize,
6579  	 * which is not necessarily rounded up to the device block size.
6580  	 * Too keep accounting consistent we do the same here as well:
6581  	 * stats_size accumulates the sum of b_asize of the written buffers,
6582  	 * while write_asize accumulates the sum of b_asize rounded up
6583  	 * to the device block size.
6584  	 * The latter sum is used only to validate the corectness of the code.
6585  	 */
6586  	uint64_t stats_size = 0;
6587  	write_asize = 0;
6588  
6589  	/*
6590  	 * Now start writing the buffers. We're starting at the write head
6591  	 * and work backwards, retracing the course of the buffer selector
6592  	 * loop above.
6593  	 */
6594  	for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6595  	    hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6596  		uint64_t buf_sz;
6597  
6598  		/*
6599  		 * We rely on the L1 portion of the header below, so
6600  		 * it's invalid for this header to have been evicted out
6601  		 * of the ghost cache, prior to being written out. The
6602  		 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6603  		 */
6604  		ASSERT(HDR_HAS_L1HDR(hdr));
6605  
6606  		/*
6607  		 * We shouldn't need to lock the buffer here, since we flagged
6608  		 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6609  		 * take care to only access its L2 cache parameters. In
6610  		 * particular, hdr->l1hdr.b_buf may be invalid by now due to
6611  		 * ARC eviction.
6612  		 */
6613  		hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
6614  
6615  		if ((HDR_L2COMPRESS(hdr)) &&
6616  		    hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
6617  			if (l2arc_compress_buf(hdr)) {
6618  				/*
6619  				 * If compression succeeded, enable headroom
6620  				 * boost on the next scan cycle.
6621  				 */
6622  				*headroom_boost = B_TRUE;
6623  			}
6624  		}
6625  
6626  		/*
6627  		 * Pick up the buffer data we had previously stashed away
6628  		 * (and now potentially also compressed).
6629  		 */
6630  		buf_data = hdr->b_l1hdr.b_tmp_cdata;
6631  		buf_sz = hdr->b_l2hdr.b_asize;
6632  
6633  		/*
6634  		 * We need to do this regardless if buf_sz is zero or
6635  		 * not, otherwise, when this l2hdr is evicted we'll
6636  		 * remove a reference that was never added.
6637  		 */
6638  		(void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6639  
6640  		/* Compression may have squashed the buffer to zero length. */
6641  		if (buf_sz != 0) {
6642  			uint64_t buf_a_sz;
6643  
6644  			wzio = zio_write_phys(pio, dev->l2ad_vdev,
6645  			    dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6646  			    NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6647  			    ZIO_FLAG_CANFAIL, B_FALSE);
6648  
6649  			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6650  			    zio_t *, wzio);
6651  			(void) zio_nowait(wzio);
6652  
6653  			stats_size += buf_sz;
6654  
6655  			/*
6656  			 * Keep the clock hand suitably device-aligned.
6657  			 */
6658  			buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6659  			write_asize += buf_a_sz;
6660  			dev->l2ad_hand += buf_a_sz;
6661  		}
6662  
6663  		/*
6664  		 * Append buf info to current log and commit if full.
6665  		 * arcstat_l2_{size,asize} kstats are updated internally.
6666  		 */
6667  		if (l2arc_log_blk_insert(dev, hdr)) {
6668  			l2arc_log_blk_commit(dev, pio, cb);
6669  			dev_hdr_update = B_TRUE;
6670  		}
6671  	}
6672  
6673  	mutex_exit(&dev->l2ad_mtx);
6674  
6675  	/*
6676  	 * If we wrote any logs as part of this write, update dev hdr
6677  	 * to point to it.
6678  	 */
6679  	if (dev_hdr_update)
6680  		l2arc_dev_hdr_update(dev, pio);
6681  
6682  	VERIFY3U(write_asize, <=, target_sz);
6683  	ARCSTAT_BUMP(arcstat_l2_writes_sent);
6684  	ARCSTAT_INCR(arcstat_l2_write_bytes, stats_size);
6685  	ARCSTAT_INCR(arcstat_l2_size, write_sz);
6686  	ARCSTAT_INCR(arcstat_l2_asize, stats_size);
6687  	vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
6688  
6689  	/*
6690  	 * Bump device hand to the device start if it is approaching the end.
6691  	 * l2arc_evict() will already have evicted ahead for this case.
6692  	 */
6693  	if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
6694  	    dev->l2ad_end) {
6695  		dev->l2ad_hand = dev->l2ad_start;
6696  		dev->l2ad_first = B_FALSE;
6697  	}
6698  
6699  	dev->l2ad_writing = B_TRUE;
6700  	(void) zio_wait(pio);
6701  	dev->l2ad_writing = B_FALSE;
6702  
6703  	return (stats_size);
6704  }
6705  
6706  /*
6707   * Compresses an L2ARC buffer.
6708   * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
6709   * size in l2hdr->b_asize. This routine tries to compress the data and
6710   * depending on the compression result there are three possible outcomes:
6711   * *) The buffer was incompressible. The original l2hdr contents were left
6712   *    untouched and are ready for writing to an L2 device.
6713   * *) The buffer was all-zeros, so there is no need to write it to an L2
6714   *    device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6715   *    set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6716   * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6717   *    data buffer which holds the compressed data to be written, and b_asize
6718   *    tells us how much data there is. b_compress is set to the appropriate
6719   *    compression algorithm. Once writing is done, invoke
6720   *    l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6721   *
6722   * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6723   * buffer was incompressible).
6724   */
6725  static boolean_t
l2arc_compress_buf(arc_buf_hdr_t * hdr)6726  l2arc_compress_buf(arc_buf_hdr_t *hdr)
6727  {
6728  	void *cdata;
6729  	size_t csize, len, rounded;
6730  	ASSERT(HDR_HAS_L2HDR(hdr));
6731  	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
6732  
6733  	ASSERT(HDR_HAS_L1HDR(hdr));
6734  	ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
6735  	ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6736  
6737  	len = l2hdr->b_asize;
6738  	cdata = zio_data_buf_alloc(len);
6739  	ASSERT3P(cdata, !=, NULL);
6740  	csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6741  	    cdata, l2hdr->b_asize);
6742  
6743  	rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6744  	if (rounded > csize) {
6745  		bzero((char *)cdata + csize, rounded - csize);
6746  		csize = rounded;
6747  	}
6748  
6749  	if (csize == 0) {
6750  		/* zero block, indicate that there's nothing to write */
6751  		zio_data_buf_free(cdata, len);
6752  		l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
6753  		l2hdr->b_asize = 0;
6754  		hdr->b_l1hdr.b_tmp_cdata = NULL;
6755  		ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6756  		return (B_TRUE);
6757  	} else if (csize > 0 && csize < len) {
6758  		/*
6759  		 * Compression succeeded, we'll keep the cdata around for
6760  		 * writing and release it afterwards.
6761  		 */
6762  		l2hdr->b_compress = ZIO_COMPRESS_LZ4;
6763  		l2hdr->b_asize = csize;
6764  		hdr->b_l1hdr.b_tmp_cdata = cdata;
6765  		ARCSTAT_BUMP(arcstat_l2_compress_successes);
6766  		return (B_TRUE);
6767  	} else {
6768  		/*
6769  		 * Compression failed, release the compressed buffer.
6770  		 * l2hdr will be left unmodified.
6771  		 */
6772  		zio_data_buf_free(cdata, len);
6773  		ARCSTAT_BUMP(arcstat_l2_compress_failures);
6774  		return (B_FALSE);
6775  	}
6776  }
6777  
6778  /*
6779   * Decompresses a zio read back from an l2arc device. On success, the
6780   * underlying zio's io_data buffer is overwritten by the uncompressed
6781   * version. On decompression error (corrupt compressed stream), the
6782   * zio->io_error value is set to signal an I/O error.
6783   *
6784   * Please note that the compressed data stream is not checksummed, so
6785   * if the underlying device is experiencing data corruption, we may feed
6786   * corrupt data to the decompressor, so the decompressor needs to be
6787   * able to handle this situation (LZ4 does).
6788   */
6789  static void
l2arc_decompress_zio(zio_t * zio,arc_buf_hdr_t * hdr,enum zio_compress c)6790  l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6791  {
6792  	ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6793  
6794  	if (zio->io_error != 0) {
6795  		/*
6796  		 * An io error has occured, just restore the original io
6797  		 * size in preparation for a main pool read.
6798  		 */
6799  		zio->io_orig_size = zio->io_size = hdr->b_size;
6800  		return;
6801  	}
6802  
6803  	if (c == ZIO_COMPRESS_EMPTY) {
6804  		/*
6805  		 * An empty buffer results in a null zio, which means we
6806  		 * need to fill its io_data after we're done restoring the
6807  		 * buffer's contents.
6808  		 */
6809  		ASSERT(hdr->b_l1hdr.b_buf != NULL);
6810  		bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6811  		zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6812  	} else {
6813  		ASSERT(zio->io_data != NULL);
6814  		/*
6815  		 * We copy the compressed data from the start of the arc buffer
6816  		 * (the zio_read will have pulled in only what we need, the
6817  		 * rest is garbage which we will overwrite at decompression)
6818  		 * and then decompress back to the ARC data buffer. This way we
6819  		 * can minimize copying by simply decompressing back over the
6820  		 * original compressed data (rather than decompressing to an
6821  		 * aux buffer and then copying back the uncompressed buffer,
6822  		 * which is likely to be much larger).
6823  		 */
6824  		uint64_t csize;
6825  		void *cdata;
6826  
6827  		csize = zio->io_size;
6828  		cdata = zio_data_buf_alloc(csize);
6829  		bcopy(zio->io_data, cdata, csize);
6830  		if (zio_decompress_data(c, cdata, zio->io_data, csize,
6831  		    hdr->b_size) != 0)
6832  			zio->io_error = EIO;
6833  		zio_data_buf_free(cdata, csize);
6834  	}
6835  
6836  	/* Restore the expected uncompressed IO size. */
6837  	zio->io_orig_size = zio->io_size = hdr->b_size;
6838  }
6839  
6840  /*
6841   * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6842   * This buffer serves as a temporary holder of compressed data while
6843   * the buffer entry is being written to an l2arc device. Once that is
6844   * done, we can dispose of it.
6845   */
6846  static void
l2arc_release_cdata_buf(arc_buf_hdr_t * hdr)6847  l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6848  {
6849  	ASSERT(HDR_HAS_L2HDR(hdr));
6850  	enum zio_compress comp = hdr->b_l2hdr.b_compress;
6851  
6852  	ASSERT(HDR_HAS_L1HDR(hdr));
6853  	ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6854  
6855  	if (comp == ZIO_COMPRESS_OFF) {
6856  		/*
6857  		 * In this case, b_tmp_cdata points to the same buffer
6858  		 * as the arc_buf_t's b_data field. We don't want to
6859  		 * free it, since the arc_buf_t will handle that.
6860  		 */
6861  		hdr->b_l1hdr.b_tmp_cdata = NULL;
6862  	} else if (comp == ZIO_COMPRESS_EMPTY) {
6863  		/*
6864  		 * In this case, b_tmp_cdata was compressed to an empty
6865  		 * buffer, thus there's nothing to free and b_tmp_cdata
6866  		 * should have been set to NULL in l2arc_write_buffers().
6867  		 */
6868  		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6869  	} else {
6870  		/*
6871  		 * If the data was compressed, then we've allocated a
6872  		 * temporary buffer for it, so now we need to release it.
6873  		 */
6874  		ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6875  		zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6876  		    hdr->b_size);
6877  		hdr->b_l1hdr.b_tmp_cdata = NULL;
6878  	}
6879  
6880  }
6881  
6882  /*
6883   * This thread feeds the L2ARC at regular intervals.  This is the beating
6884   * heart of the L2ARC.
6885   */
6886  static void
l2arc_feed_thread(void)6887  l2arc_feed_thread(void)
6888  {
6889  	callb_cpr_t cpr;
6890  	l2arc_dev_t *dev;
6891  	spa_t *spa;
6892  	uint64_t size, wrote;
6893  	clock_t begin, next = ddi_get_lbolt();
6894  	boolean_t headroom_boost = B_FALSE;
6895  
6896  	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6897  
6898  	mutex_enter(&l2arc_feed_thr_lock);
6899  
6900  	while (l2arc_thread_exit == 0) {
6901  		CALLB_CPR_SAFE_BEGIN(&cpr);
6902  		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
6903  		    next);
6904  		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6905  		next = ddi_get_lbolt() + hz;
6906  
6907  		/*
6908  		 * Quick check for L2ARC devices.
6909  		 */
6910  		mutex_enter(&l2arc_dev_mtx);
6911  		if (l2arc_ndev == 0) {
6912  			mutex_exit(&l2arc_dev_mtx);
6913  			continue;
6914  		}
6915  		mutex_exit(&l2arc_dev_mtx);
6916  		begin = ddi_get_lbolt();
6917  
6918  		/*
6919  		 * This selects the next l2arc device to write to, and in
6920  		 * doing so the next spa to feed from: dev->l2ad_spa.   This
6921  		 * will return NULL if there are now no l2arc devices or if
6922  		 * they are all faulted.
6923  		 *
6924  		 * If a device is returned, its spa's config lock is also
6925  		 * held to prevent device removal.  l2arc_dev_get_next()
6926  		 * will grab and release l2arc_dev_mtx.
6927  		 */
6928  		if ((dev = l2arc_dev_get_next()) == NULL)
6929  			continue;
6930  
6931  		spa = dev->l2ad_spa;
6932  		ASSERT(spa != NULL);
6933  
6934  		/*
6935  		 * If the pool is read-only then force the feed thread to
6936  		 * sleep a little longer.
6937  		 */
6938  		if (!spa_writeable(spa)) {
6939  			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6940  			spa_config_exit(spa, SCL_L2ARC, dev);
6941  			continue;
6942  		}
6943  
6944  		/*
6945  		 * Avoid contributing to memory pressure.
6946  		 */
6947  		if (arc_reclaim_needed()) {
6948  			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6949  			spa_config_exit(spa, SCL_L2ARC, dev);
6950  			continue;
6951  		}
6952  
6953  		ARCSTAT_BUMP(arcstat_l2_feeds);
6954  
6955  		size = l2arc_write_size();
6956  
6957  		/*
6958  		 * Evict L2ARC buffers that will be overwritten.
6959  		 */
6960  		l2arc_evict(dev, size, B_FALSE);
6961  
6962  		/*
6963  		 * Write ARC buffers.
6964  		 */
6965  		wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6966  
6967  		/*
6968  		 * Calculate interval between writes.
6969  		 */
6970  		next = l2arc_write_interval(begin, size, wrote);
6971  		spa_config_exit(spa, SCL_L2ARC, dev);
6972  	}
6973  
6974  	l2arc_thread_exit = 0;
6975  	cv_broadcast(&l2arc_feed_thr_cv);
6976  	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
6977  	thread_exit();
6978  }
6979  
6980  boolean_t
l2arc_vdev_present(vdev_t * vd)6981  l2arc_vdev_present(vdev_t *vd)
6982  {
6983  	return (l2arc_vdev_get(vd) != NULL);
6984  }
6985  
6986  /*
6987   * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
6988   * the vdev_t isn't an L2ARC device.
6989   */
6990  static l2arc_dev_t *
l2arc_vdev_get(vdev_t * vd)6991  l2arc_vdev_get(vdev_t *vd)
6992  {
6993  	l2arc_dev_t	*dev;
6994  	boolean_t	held = MUTEX_HELD(&l2arc_dev_mtx);
6995  
6996  	if (!held)
6997  		mutex_enter(&l2arc_dev_mtx);
6998  	for (dev = list_head(l2arc_dev_list); dev != NULL;
6999  	    dev = list_next(l2arc_dev_list, dev)) {
7000  		if (dev->l2ad_vdev == vd)
7001  			break;
7002  	}
7003  	if (!held)
7004  		mutex_exit(&l2arc_dev_mtx);
7005  
7006  	return (dev);
7007  }
7008  
7009  /*
7010   * Add a vdev for use by the L2ARC.  By this point the spa has already
7011   * validated the vdev and opened it. The `rebuild' flag indicates whether
7012   * we should attempt an L2ARC persistency rebuild.
7013   */
7014  void
l2arc_add_vdev(spa_t * spa,vdev_t * vd,boolean_t rebuild)7015  l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
7016  {
7017  	l2arc_dev_t *adddev;
7018  
7019  	ASSERT(!l2arc_vdev_present(vd));
7020  
7021  	/*
7022  	 * Create a new l2arc device entry.
7023  	 */
7024  	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
7025  	adddev->l2ad_spa = spa;
7026  	adddev->l2ad_vdev = vd;
7027  	/* leave extra size for an l2arc device header */
7028  	adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr),
7029  	    1 << vd->vdev_ashift);
7030  	adddev->l2ad_start = VDEV_LABEL_START_SIZE + adddev->l2ad_dev_hdr_asize;
7031  	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
7032  	ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
7033  	adddev->l2ad_hand = adddev->l2ad_start;
7034  	adddev->l2ad_first = B_TRUE;
7035  	adddev->l2ad_writing = B_FALSE;
7036  	adddev->l2ad_dev_hdr = kmem_zalloc(adddev->l2ad_dev_hdr_asize,
7037  	    KM_SLEEP);
7038  
7039  	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
7040  	/*
7041  	 * This is a list of all ARC buffers that are still valid on the
7042  	 * device.
7043  	 */
7044  	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
7045  	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
7046  
7047  	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
7048  	refcount_create(&adddev->l2ad_alloc);
7049  
7050  	/*
7051  	 * Add device to global list
7052  	 */
7053  	mutex_enter(&l2arc_dev_mtx);
7054  	list_insert_head(l2arc_dev_list, adddev);
7055  	atomic_inc_64(&l2arc_ndev);
7056  	if (rebuild && l2arc_rebuild_enabled &&
7057  	    adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
7058  		/*
7059  		 * Just mark the device as pending for a rebuild. We won't
7060  		 * be starting a rebuild in line here as it would block pool
7061  		 * import. Instead spa_load_impl will hand that off to an
7062  		 * async task which will call l2arc_spa_rebuild_start.
7063  		 */
7064  		adddev->l2ad_rebuild = B_TRUE;
7065  	}
7066  	mutex_exit(&l2arc_dev_mtx);
7067  }
7068  
7069  /*
7070   * Remove a vdev from the L2ARC.
7071   */
7072  void
l2arc_remove_vdev(vdev_t * vd)7073  l2arc_remove_vdev(vdev_t *vd)
7074  {
7075  	l2arc_dev_t *dev, *nextdev, *remdev = NULL;
7076  
7077  	/*
7078  	 * Find the device by vdev
7079  	 */
7080  	mutex_enter(&l2arc_dev_mtx);
7081  	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
7082  		nextdev = list_next(l2arc_dev_list, dev);
7083  		if (vd == dev->l2ad_vdev) {
7084  			remdev = dev;
7085  			break;
7086  		}
7087  	}
7088  	ASSERT(remdev != NULL);
7089  
7090  	/*
7091  	 * Cancel any ongoing or scheduled rebuild (race protection with
7092  	 * l2arc_spa_rebuild_start provided via l2arc_dev_mtx).
7093  	 */
7094  	remdev->l2ad_rebuild_cancel = B_TRUE;
7095  	if (remdev->l2ad_rebuild_did != 0) {
7096  		/*
7097  		 * N.B. it should be safe to thread_join with the rebuild
7098  		 * thread while holding l2arc_dev_mtx because it is not
7099  		 * accessed from anywhere in the l2arc rebuild code below
7100  		 * (except for l2arc_spa_rebuild_start, which is ok).
7101  		 */
7102  		thread_join(remdev->l2ad_rebuild_did);
7103  	}
7104  
7105  	/*
7106  	 * Remove device from global list
7107  	 */
7108  	list_remove(l2arc_dev_list, remdev);
7109  	l2arc_dev_last = NULL;		/* may have been invalidated */
7110  	atomic_dec_64(&l2arc_ndev);
7111  	mutex_exit(&l2arc_dev_mtx);
7112  
7113  	/*
7114  	 * Clear all buflists and ARC references.  L2ARC device flush.
7115  	 */
7116  	l2arc_evict(remdev, 0, B_TRUE);
7117  	list_destroy(&remdev->l2ad_buflist);
7118  	mutex_destroy(&remdev->l2ad_mtx);
7119  	refcount_destroy(&remdev->l2ad_alloc);
7120  	kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
7121  	kmem_free(remdev, sizeof (l2arc_dev_t));
7122  }
7123  
7124  void
l2arc_init(void)7125  l2arc_init(void)
7126  {
7127  	l2arc_thread_exit = 0;
7128  	l2arc_ndev = 0;
7129  	l2arc_writes_sent = 0;
7130  	l2arc_writes_done = 0;
7131  
7132  	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
7133  	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
7134  	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
7135  	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
7136  
7137  	l2arc_dev_list = &L2ARC_dev_list;
7138  	l2arc_free_on_write = &L2ARC_free_on_write;
7139  	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
7140  	    offsetof(l2arc_dev_t, l2ad_node));
7141  	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
7142  	    offsetof(l2arc_data_free_t, l2df_list_node));
7143  }
7144  
7145  void
l2arc_fini(void)7146  l2arc_fini(void)
7147  {
7148  	/*
7149  	 * This is called from dmu_fini(), which is called from spa_fini();
7150  	 * Because of this, we can assume that all l2arc devices have
7151  	 * already been removed when the pools themselves were removed.
7152  	 */
7153  
7154  	l2arc_do_free_on_write();
7155  
7156  	mutex_destroy(&l2arc_feed_thr_lock);
7157  	cv_destroy(&l2arc_feed_thr_cv);
7158  	mutex_destroy(&l2arc_dev_mtx);
7159  	mutex_destroy(&l2arc_free_on_write_mtx);
7160  
7161  	list_destroy(l2arc_dev_list);
7162  	list_destroy(l2arc_free_on_write);
7163  }
7164  
7165  void
l2arc_start(void)7166  l2arc_start(void)
7167  {
7168  	if (!(spa_mode_global & FWRITE))
7169  		return;
7170  
7171  	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
7172  	    TS_RUN, minclsyspri);
7173  }
7174  
7175  void
l2arc_stop(void)7176  l2arc_stop(void)
7177  {
7178  	if (!(spa_mode_global & FWRITE))
7179  		return;
7180  
7181  	mutex_enter(&l2arc_feed_thr_lock);
7182  	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
7183  	l2arc_thread_exit = 1;
7184  	while (l2arc_thread_exit != 0)
7185  		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
7186  	mutex_exit(&l2arc_feed_thr_lock);
7187  }
7188  
7189  /*
7190   * Punches out rebuild threads for the L2ARC devices in a spa. This should
7191   * be called after pool import from the spa async thread, since starting
7192   * these threads directly from spa_import() will make them part of the
7193   * "zpool import" context and delay process exit (and thus pool import).
7194   */
7195  void
l2arc_spa_rebuild_start(spa_t * spa)7196  l2arc_spa_rebuild_start(spa_t *spa)
7197  {
7198  	/*
7199  	 * Locate the spa's l2arc devices and kick off rebuild threads.
7200  	 */
7201  	mutex_enter(&l2arc_dev_mtx);
7202  	for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
7203  		l2arc_dev_t *dev =
7204  		    l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
7205  		ASSERT(dev != NULL);
7206  		if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
7207  			VERIFY3U(dev->l2ad_rebuild_did, ==, 0);
7208  #ifdef	_KERNEL
7209  			dev->l2ad_rebuild_did = thread_create(NULL, 0,
7210  			    l2arc_dev_rebuild_start, dev, 0, &p0, TS_RUN,
7211  			    minclsyspri)->t_did;
7212  #endif
7213  		}
7214  	}
7215  	mutex_exit(&l2arc_dev_mtx);
7216  }
7217  
7218  /*
7219   * Main entry point for L2ARC rebuilding.
7220   */
7221  static void
l2arc_dev_rebuild_start(l2arc_dev_t * dev)7222  l2arc_dev_rebuild_start(l2arc_dev_t *dev)
7223  {
7224  	if (!dev->l2ad_rebuild_cancel) {
7225  		VERIFY(dev->l2ad_rebuild);
7226  		(void) l2arc_rebuild(dev);
7227  		dev->l2ad_rebuild = B_FALSE;
7228  	}
7229  }
7230  
7231  /*
7232   * This function implements the actual L2ARC metadata rebuild. It:
7233   *
7234   * 1) reads the device's header
7235   * 2) if a good device header is found, starts reading the log block chain
7236   * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
7237   *
7238   * Operation stops under any of the following conditions:
7239   *
7240   * 1) We reach the end of the log blk chain (the back-reference in the blk is
7241   *    invalid or loops over our starting point).
7242   * 2) We encounter *any* error condition (cksum errors, io errors, looped
7243   *    blocks, etc.).
7244   */
7245  static int
l2arc_rebuild(l2arc_dev_t * dev)7246  l2arc_rebuild(l2arc_dev_t *dev)
7247  {
7248  	vdev_t			*vd = dev->l2ad_vdev;
7249  	spa_t			*spa = vd->vdev_spa;
7250  	int			err;
7251  	l2arc_log_blk_phys_t	*this_lb, *next_lb;
7252  	uint8_t			*this_lb_buf, *next_lb_buf;
7253  	zio_t			*this_io = NULL, *next_io = NULL;
7254  	l2arc_log_blkptr_t	lb_ptrs[2];
7255  	boolean_t		first_pass, lock_held;
7256  	uint64_t		load_guid;
7257  
7258  	this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
7259  	next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
7260  	this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
7261  	next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
7262  
7263  	/*
7264  	 * We prevent device removal while issuing reads to the device,
7265  	 * then during the rebuilding phases we drop this lock again so
7266  	 * that a spa_unload or device remove can be initiated - this is
7267  	 * safe, because the spa will signal us to stop before removing
7268  	 * our device and wait for us to stop.
7269  	 */
7270  	spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
7271  	lock_held = B_TRUE;
7272  
7273  	load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
7274  	/*
7275  	 * Device header processing phase.
7276  	 */
7277  	if ((err = l2arc_dev_hdr_read(dev)) != 0) {
7278  		/* device header corrupted, start a new one */
7279  		bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
7280  		goto out;
7281  	}
7282  
7283  	/* Retrieve the persistent L2ARC device state */
7284  	dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
7285  	    dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr +
7286  	    LBP_GET_PSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0]));
7287  	dev->l2ad_first = !!(dev->l2ad_dev_hdr->dh_flags &
7288  	    L2ARC_DEV_HDR_EVICT_FIRST);
7289  
7290  	/* Prepare the rebuild processing state */
7291  	bcopy(dev->l2ad_dev_hdr->dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
7292  	first_pass = B_TRUE;
7293  
7294  	/* Start the rebuild process */
7295  	for (;;) {
7296  		if (!l2arc_log_blkptr_valid(dev, &lb_ptrs[0]))
7297  			/* We hit an invalid block address, end the rebuild. */
7298  			break;
7299  
7300  		if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
7301  		    this_lb, next_lb, this_lb_buf, next_lb_buf,
7302  		    this_io, &next_io)) != 0)
7303  			break;
7304  
7305  		spa_config_exit(spa, SCL_L2ARC, vd);
7306  		lock_held = B_FALSE;
7307  
7308  		/* Protection against infinite loops of log blocks. */
7309  		if (l2arc_range_check_overlap(lb_ptrs[1].lbp_daddr,
7310  		    lb_ptrs[0].lbp_daddr,
7311  		    dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr) &&
7312  		    !first_pass) {
7313  			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
7314  			err = SET_ERROR(ELOOP);
7315  			break;
7316  		}
7317  
7318  		/*
7319  		 * Our memory pressure valve. If the system is running low
7320  		 * on memory, rather than swamping memory with new ARC buf
7321  		 * hdrs, we opt not to rebuild the L2ARC. At this point,
7322  		 * however, we have already set up our L2ARC dev to chain in
7323  		 * new metadata log blk, so the user may choose to re-add the
7324  		 * L2ARC dev at a later time to reconstruct it (when there's
7325  		 * less memory pressure).
7326  		 */
7327  		if (arc_reclaim_needed()) {
7328  			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
7329  			cmn_err(CE_NOTE, "System running low on memory, "
7330  			    "aborting L2ARC rebuild.");
7331  			err = SET_ERROR(ENOMEM);
7332  			break;
7333  		}
7334  
7335  		/*
7336  		 * Now that we know that the next_lb checks out alright, we
7337  		 * can start reconstruction from this lb - we can be sure
7338  		 * that the L2ARC write hand has not yet reached any of our
7339  		 * buffers.
7340  		 */
7341  		l2arc_log_blk_restore(dev, load_guid, this_lb,
7342  		    LBP_GET_PSIZE(&lb_ptrs[0]));
7343  
7344  		/*
7345  		 * End of list detection. We can look ahead two steps in the
7346  		 * blk chain and if the 2nd blk from this_lb dips below the
7347  		 * initial chain starting point, then we know two things:
7348  		 *	1) it can't be valid, and
7349  		 *	2) the next_lb's ARC entries might have already been
7350  		 *	partially overwritten and so we should stop before
7351  		 *	we restore it
7352  		 */
7353  		if (l2arc_range_check_overlap(
7354  		    this_lb->lb_back2_lbp.lbp_daddr, lb_ptrs[0].lbp_daddr,
7355  		    dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr) &&
7356  		    !first_pass)
7357  			break;
7358  
7359  		/* log blk restored, continue with next one in the list */
7360  		lb_ptrs[0] = lb_ptrs[1];
7361  		lb_ptrs[1] = this_lb->lb_back2_lbp;
7362  		PTR_SWAP(this_lb, next_lb);
7363  		PTR_SWAP(this_lb_buf, next_lb_buf);
7364  		this_io = next_io;
7365  		next_io = NULL;
7366  		first_pass = B_FALSE;
7367  
7368  		for (;;) {
7369  			if (dev->l2ad_rebuild_cancel) {
7370  				err = SET_ERROR(ECANCELED);
7371  				goto out;
7372  			}
7373  			if (spa_config_tryenter(spa, SCL_L2ARC, vd,
7374  			    RW_READER)) {
7375  				lock_held = B_TRUE;
7376  				break;
7377  			}
7378  			/*
7379  			 * L2ARC config lock held by somebody in writer,
7380  			 * possibly due to them trying to remove us. They'll
7381  			 * likely to want us to shut down, so after a little
7382  			 * delay, we check l2ad_rebuild_cancel and retry
7383  			 * the lock again.
7384  			 */
7385  			delay(1);
7386  		}
7387  	}
7388  out:
7389  	if (next_io != NULL)
7390  		l2arc_log_blk_prefetch_abort(next_io);
7391  	kmem_free(this_lb, sizeof (*this_lb));
7392  	kmem_free(next_lb, sizeof (*next_lb));
7393  	kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
7394  	kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
7395  	if (err == 0)
7396  		ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
7397  
7398  	if (lock_held)
7399  		spa_config_exit(spa, SCL_L2ARC, vd);
7400  
7401  	return (err);
7402  }
7403  
7404  /*
7405   * Attempts to read the device header on the provided L2ARC device and writes
7406   * it to `hdr'. On success, this function returns 0, otherwise the appropriate
7407   * error code is returned.
7408   */
7409  static int
l2arc_dev_hdr_read(l2arc_dev_t * dev)7410  l2arc_dev_hdr_read(l2arc_dev_t *dev)
7411  {
7412  	int			err;
7413  	uint64_t		guid;
7414  	zio_cksum_t		cksum;
7415  	l2arc_dev_hdr_phys_t	*hdr = dev->l2ad_dev_hdr;
7416  	const uint64_t		hdr_asize = dev->l2ad_dev_hdr_asize;
7417  
7418  	guid = spa_guid(dev->l2ad_vdev->vdev_spa);
7419  
7420  	if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
7421  	    VDEV_LABEL_START_SIZE, hdr_asize, hdr,
7422  	    ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
7423  	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
7424  	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
7425  		spa_config_exit(dev->l2ad_vdev->vdev_spa, SCL_L2ARC,
7426  		    dev->l2ad_vdev);
7427  		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
7428  		return (err);
7429  	}
7430  
7431  	if (hdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
7432  		byteswap_uint64_array(hdr, sizeof (*hdr));
7433  
7434  	if (hdr->dh_magic != L2ARC_DEV_HDR_MAGIC || hdr->dh_spa_guid != guid) {
7435  		/*
7436  		 * Attempt to rebuild a device containing no actual dev hdr
7437  		 * or containing a header from some other pool.
7438  		 */
7439  		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
7440  		return (SET_ERROR(ENOTSUP));
7441  	}
7442  
7443  	l2arc_dev_hdr_checksum(hdr, &cksum);
7444  	if (!ZIO_CHECKSUM_EQUAL(hdr->dh_self_cksum, cksum)) {
7445  		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
7446  		return (SET_ERROR(EINVAL));
7447  	}
7448  
7449  	return (0);
7450  }
7451  
7452  /*
7453   * Reads L2ARC log blocks from storage and validates their contents.
7454   *
7455   * This function implements a simple prefetcher to make sure that while
7456   * we're processing one buffer the L2ARC is already prefetching the next
7457   * one in the chain.
7458   *
7459   * The arguments this_lp and next_lp point to the current and next log blk
7460   * address in the block chain. Similarly, this_lb and next_lb hold the
7461   * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
7462   * and next_lb_buf must be buffers of appropriate to hold a raw
7463   * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
7464   * to buffer decompression).
7465   *
7466   * The `this_io' and `next_io' arguments are used for block prefetching.
7467   * When issuing the first blk IO during rebuild, you should pass NULL for
7468   * `this_io'. This function will then issue a sync IO to read the block and
7469   * also issue an async IO to fetch the next block in the block chain. The
7470   * prefetch IO is returned in `next_io'. On subsequent calls to this
7471   * function, pass the value returned in `next_io' from the previous call
7472   * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
7473   * Prior to the call, you should initialize your `next_io' pointer to be
7474   * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
7475   *
7476   * On success, this function returns 0, otherwise it returns an appropriate
7477   * error code. On error the prefetching IO is aborted and cleared before
7478   * returning from this function. Therefore, if we return `success', the
7479   * caller can assume that we have taken care of cleanup of prefetch IOs.
7480   */
7481  static int
l2arc_log_blk_read(l2arc_dev_t * dev,const l2arc_log_blkptr_t * this_lbp,const l2arc_log_blkptr_t * next_lbp,l2arc_log_blk_phys_t * this_lb,l2arc_log_blk_phys_t * next_lb,uint8_t * this_lb_buf,uint8_t * next_lb_buf,zio_t * this_io,zio_t ** next_io)7482  l2arc_log_blk_read(l2arc_dev_t *dev,
7483      const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
7484      l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
7485      uint8_t *this_lb_buf, uint8_t *next_lb_buf,
7486      zio_t *this_io, zio_t **next_io)
7487  {
7488  	int		err = 0;
7489  	zio_cksum_t	cksum;
7490  
7491  	ASSERT(this_lbp != NULL && next_lbp != NULL);
7492  	ASSERT(this_lb != NULL && next_lb != NULL);
7493  	ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
7494  	ASSERT(next_io != NULL && *next_io == NULL);
7495  	ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
7496  
7497  	/*
7498  	 * Check to see if we have issued the IO for this log blk in a
7499  	 * previous run. If not, this is the first call, so issue it now.
7500  	 */
7501  	if (this_io == NULL) {
7502  		this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
7503  		    this_lb_buf);
7504  	}
7505  
7506  	/*
7507  	 * Peek to see if we can start issuing the next IO immediately.
7508  	 */
7509  	if (l2arc_log_blkptr_valid(dev, next_lbp)) {
7510  		/*
7511  		 * Start issuing IO for the next log blk early - this
7512  		 * should help keep the L2ARC device busy while we
7513  		 * decompress and restore this log blk.
7514  		 */
7515  		*next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
7516  		    next_lb_buf);
7517  	}
7518  
7519  	/* Wait for the IO to read this log block to complete */
7520  	if ((err = zio_wait(this_io)) != 0) {
7521  		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
7522  		goto cleanup;
7523  	}
7524  
7525  	/* Make sure the buffer checks out */
7526  	fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), NULL, &cksum);
7527  	if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
7528  		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
7529  		err = SET_ERROR(EINVAL);
7530  		goto cleanup;
7531  	}
7532  
7533  	/* Now we can take our time decoding this buffer */
7534  	switch (LBP_GET_COMPRESS(this_lbp)) {
7535  	case ZIO_COMPRESS_OFF:
7536  		bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
7537  		break;
7538  	case ZIO_COMPRESS_LZ4:
7539  		if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
7540  		    this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
7541  		    sizeof (*this_lb))) != 0) {
7542  			err = SET_ERROR(EINVAL);
7543  			goto cleanup;
7544  		}
7545  		break;
7546  	default:
7547  		err = SET_ERROR(EINVAL);
7548  		goto cleanup;
7549  	}
7550  	if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
7551  		byteswap_uint64_array(this_lb, sizeof (*this_lb));
7552  	if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
7553  		err = SET_ERROR(EINVAL);
7554  		goto cleanup;
7555  	}
7556  cleanup:
7557  	/* Abort an in-flight prefetch I/O in case of error */
7558  	if (err != 0 && *next_io != NULL) {
7559  		l2arc_log_blk_prefetch_abort(*next_io);
7560  		*next_io = NULL;
7561  	}
7562  	return (err);
7563  }
7564  
7565  /*
7566   * Restores the payload of a log blk to ARC. This creates empty ARC hdr
7567   * entries which only contain an l2arc hdr, essentially restoring the
7568   * buffers to their L2ARC evicted state. This function also updates space
7569   * usage on the L2ARC vdev to make sure it tracks restored buffers.
7570   */
7571  static void
l2arc_log_blk_restore(l2arc_dev_t * dev,uint64_t load_guid,const l2arc_log_blk_phys_t * lb,uint64_t lb_psize)7572  l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
7573      const l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
7574  {
7575  	uint64_t	size = 0, psize = 0;
7576  
7577  	for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
7578  		/*
7579  		 * Restore goes in the reverse temporal direction to preserve
7580  		 * correct temporal ordering of buffers in the l2ad_buflist.
7581  		 * l2arc_hdr_restore also does a list_insert_tail instead of
7582  		 * list_insert_head on the l2ad_buflist:
7583  		 *
7584  		 *		LIST	l2ad_buflist		LIST
7585  		 *		HEAD  <------ (time) ------	TAIL
7586  		 * direction	+-----+-----+-----+-----+-----+    direction
7587  		 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
7588  		 * fill		+-----+-----+-----+-----+-----+
7589  		 *		^				^
7590  		 *		|				|
7591  		 *		|				|
7592  		 *	l2arc_fill_thread		l2arc_rebuild
7593  		 *	places new bufs here		restores bufs here
7594  		 *
7595  		 * This also works when the restored bufs get evicted at any
7596  		 * point during the rebuild.
7597  		 */
7598  		l2arc_hdr_restore(&lb->lb_entries[i], dev, load_guid);
7599  		size += LE_GET_LSIZE(&lb->lb_entries[i]);
7600  		psize += LE_GET_PSIZE(&lb->lb_entries[i]);
7601  	}
7602  
7603  	/*
7604  	 * Record rebuild stats:
7605  	 *	size		In-memory size of restored buffer data in ARC
7606  	 *	psize		Physical size of restored buffers in the L2ARC
7607  	 *	bufs		# of ARC buffer headers restored
7608  	 *	log_blks	# of L2ARC log entries processed during restore
7609  	 */
7610  	ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
7611  	ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
7612  	ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
7613  	ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
7614  	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
7615  	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
7616  	vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
7617  }
7618  
7619  /*
7620   * Restores a single ARC buf hdr from a log block. The ARC buffer is put
7621   * into a state indicating that it has been evicted to L2ARC.
7622   */
7623  static void
l2arc_hdr_restore(const l2arc_log_ent_phys_t * le,l2arc_dev_t * dev,uint64_t load_guid)7624  l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
7625      uint64_t load_guid)
7626  {
7627  	arc_buf_hdr_t		*hdr, *exists;
7628  	kmutex_t		*hash_lock;
7629  	arc_buf_contents_t	type = LE_GET_TYPE(le);
7630  
7631  	/*
7632  	 * Do all the allocation before grabbing any locks, this lets us
7633  	 * sleep if memory is full and we don't have to deal with failed
7634  	 * allocations.
7635  	 */
7636  	ASSERT(L2ARC_IS_VALID_COMPRESS(LE_GET_COMPRESS(le)) ||
7637  	    LE_GET_COMPRESS(le) == ZIO_COMPRESS_OFF);
7638  	hdr = arc_buf_alloc_l2only(load_guid, LE_GET_LSIZE(le), type,
7639  	    dev, le->le_dva, le->le_daddr, LE_GET_PSIZE(le), le->le_birth,
7640  	    le->le_freeze_cksum, LE_GET_COMPRESS(le));
7641  	if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) {
7642  		ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
7643  		ARCSTAT_INCR(arcstat_l2_asize, hdr->b_l2hdr.b_asize);
7644  	}
7645  
7646  	mutex_enter(&dev->l2ad_mtx);
7647  	/*
7648  	 * We connect the l2hdr to the hdr only after the hdr is in the hash
7649  	 * table, otherwise the rest of the arc hdr manipulation machinery
7650  	 * might get confused.
7651  	 */
7652  	list_insert_tail(&dev->l2ad_buflist, hdr);
7653  	(void) refcount_add_many(&dev->l2ad_alloc, hdr->b_l2hdr.b_asize, hdr);
7654  	mutex_exit(&dev->l2ad_mtx);
7655  
7656  	exists = buf_hash_insert(hdr, &hash_lock);
7657  	if (exists) {
7658  		/* Buffer was already cached, no need to restore it. */
7659  		mutex_exit(hash_lock);
7660  		arc_hdr_destroy(hdr);
7661  		ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
7662  		return;
7663  	}
7664  
7665  	mutex_exit(hash_lock);
7666  }
7667  
7668  /*
7669   * Starts an asynchronous read IO to read a log block. This is used in log
7670   * block reconstruction to start reading the next block before we are done
7671   * decoding and reconstructing the current block, to keep the l2arc device
7672   * nice and hot with read IO to process.
7673   * The returned zio will contain a newly allocated memory buffers for the IO
7674   * data which should then be freed by the caller once the zio is no longer
7675   * needed (i.e. due to it having completed). If you wish to abort this
7676   * zio, you should do so using l2arc_log_blk_prefetch_abort, which takes
7677   * care of disposing of the allocated buffers correctly.
7678   */
7679  static zio_t *
l2arc_log_blk_prefetch(vdev_t * vd,const l2arc_log_blkptr_t * lbp,uint8_t * lb_buf)7680  l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
7681      uint8_t *lb_buf)
7682  {
7683  	uint32_t	psize;
7684  	zio_t		*pio;
7685  
7686  	psize = LBP_GET_PSIZE(lbp);
7687  	ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
7688  	pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
7689  	    ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
7690  	    ZIO_FLAG_DONT_RETRY);
7691  	(void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize,
7692  	    lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
7693  	    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
7694  	    ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
7695  
7696  	return (pio);
7697  }
7698  
7699  /*
7700   * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
7701   * buffers allocated for it.
7702   */
7703  static void
l2arc_log_blk_prefetch_abort(zio_t * zio)7704  l2arc_log_blk_prefetch_abort(zio_t *zio)
7705  {
7706  	(void) zio_wait(zio);
7707  }
7708  
7709  /*
7710   * Creates a zio to update the device header on an l2arc device. The zio is
7711   * initiated as a child of `pio'.
7712   */
7713  static void
l2arc_dev_hdr_update(l2arc_dev_t * dev,zio_t * pio)7714  l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
7715  {
7716  	zio_t			*wzio;
7717  	l2arc_dev_hdr_phys_t	*hdr = dev->l2ad_dev_hdr;
7718  	const uint64_t		hdr_asize = dev->l2ad_dev_hdr_asize;
7719  
7720  	hdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
7721  	hdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
7722  	hdr->dh_alloc_space = refcount_count(&dev->l2ad_alloc);
7723  	hdr->dh_flags = 0;
7724  	if (dev->l2ad_first)
7725  		hdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
7726  
7727  	/* checksum operation goes last */
7728  	l2arc_dev_hdr_checksum(hdr, &hdr->dh_self_cksum);
7729  
7730  	wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
7731  	    hdr_asize, hdr, ZIO_CHECKSUM_OFF, NULL, NULL,
7732  	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
7733  	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
7734  	(void) zio_nowait(wzio);
7735  }
7736  
7737  /*
7738   * Commits a log block to the L2ARC device. This routine is invoked from
7739   * l2arc_write_buffers when the log block fills up.
7740   * This function allocates some memory to temporarily hold the serialized
7741   * buffer to be written. This is then released in l2arc_write_done.
7742   */
7743  static void
l2arc_log_blk_commit(l2arc_dev_t * dev,zio_t * pio,l2arc_write_callback_t * cb)7744  l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
7745      l2arc_write_callback_t *cb)
7746  {
7747  	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk;
7748  	uint64_t		psize, asize;
7749  	l2arc_log_blk_buf_t	*lb_buf;
7750  	zio_t			*wzio;
7751  
7752  	VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
7753  
7754  	/* link the buffer into the block chain */
7755  	lb->lb_back2_lbp = dev->l2ad_dev_hdr->dh_start_lbps[1];
7756  	lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
7757  
7758  	/* try to compress the buffer */
7759  	lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
7760  	list_insert_tail(&cb->l2wcb_log_blk_buflist, lb_buf);
7761  	psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb, lb_buf->lbb_log_blk,
7762  	    sizeof (*lb));
7763  	/* a log block is never entirely zero */
7764  	ASSERT(psize != 0);
7765  	asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
7766  	ASSERT(asize <= sizeof (lb_buf->lbb_log_blk));
7767  
7768  	/*
7769  	 * Update the start log blk pointer in the device header to point
7770  	 * to the log block we're about to write.
7771  	 */
7772  	dev->l2ad_dev_hdr->dh_start_lbps[1] =
7773  	    dev->l2ad_dev_hdr->dh_start_lbps[0];
7774  	dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
7775  	_NOTE(CONSTCOND)
7776  	LBP_SET_LSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0], sizeof (*lb));
7777  	LBP_SET_PSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0], asize);
7778  	LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr->dh_start_lbps[0],
7779  	    ZIO_CHECKSUM_FLETCHER_4);
7780  	LBP_SET_TYPE(&dev->l2ad_dev_hdr->dh_start_lbps[0], 0);
7781  	if (asize < sizeof (*lb)) {
7782  		/* compression succeeded */
7783  		bzero(lb_buf->lbb_log_blk + psize, asize - psize);
7784  		LBP_SET_COMPRESS(&dev->l2ad_dev_hdr->dh_start_lbps[0],
7785  		    ZIO_COMPRESS_LZ4);
7786  	} else {
7787  		/* compression failed */
7788  		bcopy(lb, lb_buf->lbb_log_blk, sizeof (*lb));
7789  		LBP_SET_COMPRESS(&dev->l2ad_dev_hdr->dh_start_lbps[0],
7790  		    ZIO_COMPRESS_OFF);
7791  	}
7792  	/* checksum what we're about to write */
7793  	fletcher_4_native(lb_buf->lbb_log_blk, asize, NULL,
7794  	    &dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_cksum);
7795  
7796  	/* perform the write itself */
7797  	CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
7798  	    L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
7799  	wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
7800  	    asize, lb_buf->lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
7801  	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
7802  	DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
7803  	(void) zio_nowait(wzio);
7804  
7805  	dev->l2ad_hand += asize;
7806  	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
7807  
7808  	/* bump the kstats */
7809  	ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
7810  	ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
7811  	ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
7812  	ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
7813  	    dev->l2ad_log_blk_payload_asize / asize);
7814  
7815  	/* start a new log block */
7816  	dev->l2ad_log_ent_idx = 0;
7817  	dev->l2ad_log_blk_payload_asize = 0;
7818  }
7819  
7820  /*
7821   * Validates an L2ARC log blk address to make sure that it can be read
7822   * from the provided L2ARC device. Returns B_TRUE if the address is
7823   * within the device's bounds, or B_FALSE if not.
7824   */
7825  static boolean_t
l2arc_log_blkptr_valid(l2arc_dev_t * dev,const l2arc_log_blkptr_t * lbp)7826  l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
7827  {
7828  	uint64_t psize = LBP_GET_PSIZE(lbp);
7829  	uint64_t end = lbp->lbp_daddr + psize;
7830  
7831  	/*
7832  	 * A log block is valid if all of the following conditions are true:
7833  	 * - it fits entirely between l2ad_start and l2ad_end
7834  	 * - it has a valid size
7835  	 */
7836  	return (lbp->lbp_daddr >= dev->l2ad_start && end <= dev->l2ad_end &&
7837  	    psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t));
7838  }
7839  
7840  /*
7841   * Computes the checksum of `hdr' and stores it in `cksum'.
7842   */
7843  static void
l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t * hdr,zio_cksum_t * cksum)7844  l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
7845  {
7846  	fletcher_4_native((uint8_t *)hdr +
7847  	    offsetof(l2arc_dev_hdr_phys_t, dh_spa_guid),
7848  	    sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, dh_spa_guid),
7849  	    NULL, cksum);
7850  }
7851  
7852  /*
7853   * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
7854   * The buffer being inserted must be present in L2ARC.
7855   * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
7856   * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
7857   */
7858  static boolean_t
l2arc_log_blk_insert(l2arc_dev_t * dev,const arc_buf_hdr_t * ab)7859  l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
7860  {
7861  	l2arc_log_blk_phys_t	*lb = &dev->l2ad_log_blk;
7862  	l2arc_log_ent_phys_t	*le;
7863  	int			index = dev->l2ad_log_ent_idx++;
7864  
7865  	ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
7866  
7867  	le = &lb->lb_entries[index];
7868  	bzero(le, sizeof (*le));
7869  	le->le_dva = ab->b_dva;
7870  	le->le_birth = ab->b_birth;
7871  	le->le_daddr = ab->b_l2hdr.b_daddr;
7872  	LE_SET_LSIZE(le, ab->b_size);
7873  	LE_SET_PSIZE(le, ab->b_l2hdr.b_asize);
7874  	LE_SET_COMPRESS(le, ab->b_l2hdr.b_compress);
7875  	if (ab->b_l2hdr.b_compress != ZIO_COMPRESS_OFF) {
7876  		ASSERT(L2ARC_IS_VALID_COMPRESS(ab->b_l2hdr.b_compress));
7877  		ASSERT(L2ARC_IS_VALID_COMPRESS(LE_GET_COMPRESS(le)));
7878  	}
7879  	le->le_freeze_cksum = *ab->b_freeze_cksum;
7880  	LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
7881  	LE_SET_TYPE(le, arc_flags_to_bufc(ab->b_flags));
7882  	dev->l2ad_log_blk_payload_asize += ab->b_l2hdr.b_asize;
7883  
7884  	return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
7885  }
7886  
7887  /*
7888   * Checks whether a given L2ARC device address sits in a time-sequential
7889   * range. The trick here is that the L2ARC is a rotary buffer, so we can't
7890   * just do a range comparison, we need to handle the situation in which the
7891   * range wraps around the end of the L2ARC device. Arguments:
7892   *	bottom	Lower end of the range to check (written to earlier).
7893   *	top	Upper end of the range to check (written to later).
7894   *	check	The address for which we want to determine if it sits in
7895   *		between the top and bottom.
7896   *
7897   * The 3-way conditional below represents the following cases:
7898   *
7899   *	bottom < top : Sequentially ordered case:
7900   *	  <check>--------+-------------------+
7901   *	                 |  (overlap here?)  |
7902   *	 L2ARC dev       V                   V
7903   *	 |---------------<bottom>============<top>--------------|
7904   *
7905   *	bottom > top: Looped-around case:
7906   *	                      <check>--------+------------------+
7907   *	                                     |  (overlap here?) |
7908   *	 L2ARC dev                           V                  V
7909   *	 |===============<top>---------------<bottom>===========|
7910   *	 ^               ^
7911   *	 |  (or here?)   |
7912   *	 +---------------+---------<check>
7913   *
7914   *	top == bottom : Just a single address comparison.
7915   */
7916  static inline boolean_t
l2arc_range_check_overlap(uint64_t bottom,uint64_t top,uint64_t check)7917  l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
7918  {
7919  	if (bottom < top)
7920  		return (bottom <= check && check <= top);
7921  	else if (bottom > top)
7922  		return (check <= top || bottom <= check);
7923  	else
7924  		return (check == top);
7925  }
7926  
7927  /*
7928   * dump arc cache to user mode for debugging purposes
7929   */
7930  static void
arc_dump_entry(arc_buf_hdr_t * entry,arc_info_t * outp)7931  arc_dump_entry(arc_buf_hdr_t *entry, arc_info_t *outp)
7932  {
7933  	outp->ai_dva = entry->b_dva;
7934  	outp->ai_birth = entry->b_birth;
7935  	outp->ai_flags = entry->b_flags;
7936  	outp->ai_spa = entry->b_spa;
7937  	outp->ai_size = entry->b_size;
7938  	if (HDR_HAS_L1HDR(entry)) {
7939  		arc_state_t *state = entry->b_l1hdr.b_state;
7940  		if (state == arc_anon)
7941  			outp->ai_state = AIS_ANON;
7942  		else if (state == arc_mru)
7943  			outp->ai_state = AIS_MRU;
7944  		else if (state == arc_mru_ghost)
7945  			outp->ai_state = AIS_MRU_GHOST;
7946  		else if (state == arc_mfu)
7947  			outp->ai_state = AIS_MFU;
7948  		else if (state == arc_mfu_ghost)
7949  			outp->ai_state = AIS_MFU_GHOST;
7950  		else if (state == arc_l2c_only)
7951  			outp->ai_state = AIS_L2C_ONLY;
7952  		else
7953  			outp->ai_state = AIS_UNKNOWN;
7954  	} else {
7955  		outp->ai_state = AIS_NO_L1HDR;
7956  	}
7957  }
7958  
7959  int
arc_dump(int start_bucket,void * buf,size_t bufsize,size_t * returned_bytes)7960  arc_dump(int start_bucket, void *buf, size_t bufsize, size_t *returned_bytes)
7961  {
7962  	int i;
7963  	arc_info_t *outp = buf + sizeof(arc_info_hdr_t);
7964  	arc_info_t *maxp = buf + bufsize;
7965  	arc_info_hdr_t *aih = buf;
7966  	size_t nbuckets = buf_hash_table.ht_mask + 1;
7967  	size_t bph = nbuckets / BUF_LOCKS;	/* buckets per hash */
7968  	kmutex_t *last_lock = NULL;
7969  
7970  	if (bufsize < sizeof(arc_info_hdr_t))
7971  		return (ENOMEM);
7972  
7973  	aih->aih_buckets = nbuckets;
7974  	aih->aih_buf_locks = BUF_LOCKS;
7975  
7976  	ASSERT(start_bucket >= 0);
7977  	ASSERT(start_bucket < nbuckets);
7978  
7979  	for (i = start_bucket; i < nbuckets; ++i) {
7980  		kmutex_t *hash_lock;
7981  		arc_buf_hdr_t *entry;
7982  		arc_info_t *dryrun = outp;
7983  		int bucket;
7984  
7985  		/*
7986  		 * transform index. We want to enumerate the buckets in an
7987  		 * order that allows us to keep the mutex as long as possible
7988  		 */
7989  		bucket = (i / bph) + (i % bph) * BUF_LOCKS;
7990  
7991  		hash_lock = BUF_HASH_LOCK(bucket);
7992  		if (hash_lock != last_lock) {
7993  			if (last_lock)
7994  				mutex_exit(last_lock);
7995  			mutex_enter(hash_lock);
7996  		}
7997  		last_lock = hash_lock;
7998  		/* count entries to see if they will fit */
7999  		entry = buf_hash_table.ht_table[bucket];
8000  		while (entry != NULL) {
8001  			++dryrun;
8002  			entry = entry->b_hash_next;
8003  		}
8004  		if (dryrun > maxp) {
8005  			break;
8006  		}
8007  		/* actually copy entries */
8008  		entry = buf_hash_table.ht_table[bucket];
8009  		while (entry != NULL) {
8010  			arc_dump_entry(entry, outp);
8011  			++outp;
8012  			entry = entry->b_hash_next;
8013  		}
8014  	}
8015  	if (last_lock)
8016  		mutex_exit(last_lock);
8017  
8018  	*returned_bytes = (void *)outp - buf;
8019  	aih->aih_entries = (*returned_bytes - sizeof(*aih)) / sizeof(*outp);
8020  
8021  	if (i <= buf_hash_table.ht_mask)
8022  		aih->aih_next = i;
8023  	else
8024  		aih->aih_next = 0;
8025  
8026  	return (0);
8027  }
8028