1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
24 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
26 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
27 */
28
29 /*
30 * DVA-based Adjustable Replacement Cache
31 *
32 * While much of the theory of operation used here is
33 * based on the self-tuning, low overhead replacement cache
34 * presented by Megiddo and Modha at FAST 2003, there are some
35 * significant differences:
36 *
37 * 1. The Megiddo and Modha model assumes any page is evictable.
38 * Pages in its cache cannot be "locked" into memory. This makes
39 * the eviction algorithm simple: evict the last page in the list.
40  *    This also makes the performance characteristics easy to reason
41 * about. Our cache is not so simple. At any given moment, some
42 * subset of the blocks in the cache are un-evictable because we
43 * have handed out a reference to them. Blocks are only evictable
44 * when there are no external references active. This makes
45 * eviction far more problematic: we choose to evict the evictable
46 * blocks that are the "lowest" in the list.
47 *
48 * There are times when it is not possible to evict the requested
49 * space. In these circumstances we are unable to adjust the cache
50  *    size.  To prevent the cache from growing unbounded at these times, we
51 * implement a "cache throttle" that slows the flow of new data
52 * into the cache until we can make space available.
53 *
54 * 2. The Megiddo and Modha model assumes a fixed cache size.
55 * Pages are evicted when the cache is full and there is a cache
56 * miss. Our model has a variable sized cache. It grows with
57 * high use, but also tries to react to memory pressure from the
58 * operating system: decreasing its size when system memory is
59 * tight.
60 *
61 * 3. The Megiddo and Modha model assumes a fixed page size. All
62 * elements of the cache are therefore exactly the same size. So
63  *    when adjusting the cache size following a cache miss, it's simply
64  *    a matter of choosing a single page to evict. In our model, we
65  *    have variable-sized cache blocks (ranging from 512 bytes to
66 * 128K bytes). We therefore choose a set of blocks to evict to make
67 * space for a cache miss that approximates as closely as possible
68 * the space used by the new block.
69 *
70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
71 * by N. Megiddo & D. Modha, FAST 2003
72 */
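/*
 * As a hedged illustration of point 3 above (this is not the actual
 * eviction code, which lives in arc_evict_state() later in this file),
 * a minimal sketch of the policy is simply to walk the evictable list
 * from its "lowest" end, accumulating blocks until the requested amount
 * of space is covered. The names below are hypothetical:
 *
 *	struct example_entry {
 *		struct example_entry	*ee_next;
 *		uint64_t		ee_size;
 *	};
 *
 *	static uint64_t
 *	example_evict(struct example_entry *lowest, uint64_t target)
 *	{
 *		struct example_entry *e;
 *		uint64_t freed = 0;
 *
 *		for (e = lowest; e != NULL && freed < target; e = e->ee_next)
 *			freed += e->ee_size;
 *		return (freed);
 *	}
 */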
73
74 /*
75 * The locking model:
76 *
77 * A new reference to a cache buffer can be obtained in two
78 * ways: 1) via a hash table lookup using the DVA as a key,
79 * or 2) via one of the ARC lists. The arc_read() interface
80 * uses method 1, while the internal arc algorithms for
81 * adjusting the cache use method 2. We therefore provide two
82 * types of locks: 1) the hash table lock array, and 2) the
83 * arc list locks.
84 *
85 * Buffers do not have their own mutexes, rather they rely on the
86 * hash table mutexes for the bulk of their protection (i.e. most
87 * fields in the arc_buf_hdr_t are protected by these mutexes).
88 *
89 * buf_hash_find() returns the appropriate mutex (held) when it
90 * locates the requested buffer in the hash table. It returns
91 * NULL for the mutex if the buffer was not in the table.
92 *
93 * buf_hash_remove() expects the appropriate hash mutex to be
94 * already held before it is invoked.
95 *
96 * Each arc state also has a mutex which is used to protect the
97 * buffer list associated with the state. When attempting to
98 * obtain a hash table lock while holding an arc list lock you
99  * must use mutex_tryenter() to avoid deadlock.  Also note that
100 * the active state mutex must be held before the ghost state mutex.
101 *
102 * Arc buffers may have an associated eviction callback function.
103 * This function will be invoked prior to removing the buffer (e.g.
104 * in arc_do_user_evicts()). Note however that the data associated
105 * with the buffer may be evicted prior to the callback. The callback
106 * must be made with *no locks held* (to prevent deadlock). Additionally,
107 * the users of callbacks must ensure that their private data is
108 * protected from simultaneous callbacks from arc_clear_callback()
109 * and arc_do_user_evicts().
110 *
111 * Note that the majority of the performance stats are manipulated
112 * with atomic operations.
113 *
114 * The L2ARC uses the l2ad_mtx on each vdev for the following:
115 *
116 * - L2ARC buflist creation
117 * - L2ARC buflist eviction
118 * - L2ARC write completion, which walks L2ARC buflists
119 * - ARC header destruction, as it removes from L2ARC buflists
120 * - ARC header release, as it removes from L2ARC buflists
121 */
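/*
 * To make the mutex_tryenter() rule above concrete, here is a minimal
 * sketch of the pattern (a hypothetical fragment; the real logic is in the
 * eviction paths below). While an arc list lock is held, a hash lock may
 * only be taken with mutex_tryenter(); on failure the buffer is skipped
 * instead of risking an AB/BA deadlock with a thread that holds the hash
 * lock and wants the list lock:
 *
 *	kmutex_t *hash_lock = HDR_LOCK(hdr);
 *	if (mutex_tryenter(hash_lock)) {
 *		... evict or move the header ...
 *		mutex_exit(hash_lock);
 *	} else {
 *		ARCSTAT_BUMP(arcstat_mutex_miss);
 *	}
 */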
122
123 #include <sys/spa.h>
124 #include <sys/zio.h>
125 #include <sys/zio_compress.h>
126 #include <sys/zfs_context.h>
127 #include <sys/arc.h>
128 #include <sys/refcount.h>
129 #include <sys/vdev.h>
130 #include <sys/vdev_impl.h>
131 #include <sys/dsl_pool.h>
132 #include <sys/multilist.h>
133 #ifdef _KERNEL
134 #include <sys/vmsystm.h>
135 #include <vm/anon.h>
136 #include <sys/fs/swapnode.h>
137 #include <sys/dnlc.h>
138 #endif
139 #include <sys/callb.h>
140 #include <sys/kstat.h>
141 #include <zfs_fletcher.h>
142 #include <sys/byteorder.h>
143 #include <sys/spa_impl.h>
144 #include <sys/zfs_ioctl.h>
145
146 #ifndef _KERNEL
147 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */
148 boolean_t arc_watch = B_FALSE;
149 int arc_procfd;
150 #endif
151
152 static kmutex_t arc_reclaim_lock;
153 static kcondvar_t arc_reclaim_thread_cv;
154 static boolean_t arc_reclaim_thread_exit;
155 static kcondvar_t arc_reclaim_waiters_cv;
156
157 static kmutex_t arc_user_evicts_lock;
158 static kcondvar_t arc_user_evicts_cv;
159 static boolean_t arc_user_evicts_thread_exit;
160
161 uint_t arc_reduce_dnlc_percent = 3;
162
163 /*
164 * The number of headers to evict in arc_evict_state_impl() before
165 * dropping the sublist lock and evicting from another sublist. A lower
166 * value means we're more likely to evict the "correct" header (i.e. the
167 * oldest header in the arc state), but comes with higher overhead
168 * (i.e. more invocations of arc_evict_state_impl()).
169 */
170 int zfs_arc_evict_batch_limit = 10;
171
172 /*
173 * The number of sublists used for each of the arc state lists. If this
174 * is not set to a suitable value by the user, it will be configured to
175 * the number of CPUs on the system in arc_init().
176 */
177 int zfs_arc_num_sublists_per_state = 0;
178
179 /* number of seconds before growing cache again */
180 static int arc_grow_retry = 60;
181
182 /* shift of arc_c for calculating overflow limit in arc_get_data_buf */
183 int zfs_arc_overflow_shift = 8;
184
185 /* shift of arc_c for calculating both min and max arc_p */
186 static int arc_p_min_shift = 4;
187
188 /* log2(fraction of arc to reclaim) */
189 static int arc_shrink_shift = 7;
190
191 /*
192 * log2(fraction of ARC which must be free to allow growing).
193  * I.e. if there is less than arc_c >> arc_no_grow_shift free memory,
194 * when reading a new block into the ARC, we will evict an equal-sized block
195 * from the ARC.
196 *
197 * This must be less than arc_shrink_shift, so that when we shrink the ARC,
198 * we will still not allow it to grow.
199 */
200 int arc_no_grow_shift = 5;
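/*
 * Worked example of the two shifts: with arc_no_grow_shift = 5 the ARC
 * stops growing once free memory drops below arc_c >> 5 (1/32nd, or about
 * 3%, of the target size), while a shrink with arc_shrink_shift = 7
 * reclaims only arc_c >> 7 (1/128th, or about 0.8%). Keeping 5 < 7 means
 * a single shrink frees less than the no-grow headroom, which is what the
 * comment above relies on to keep a freshly shrunk ARC from growing right
 * back.
 */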
201
202
203 /*
204 * minimum lifespan of a prefetch block in clock ticks
205 * (initialized in arc_init())
206 */
207 static int arc_min_prefetch_lifespan;
208
209 /*
210 * If this percent of memory is free, don't throttle.
211 */
212 int arc_lotsfree_percent = 10;
213
214 static int arc_dead;
215
216 /*
217 * The arc has filled available memory and has now warmed up.
218 */
219 static boolean_t arc_warm;
220
221 /*
222 * These tunables are for performance analysis.
223 */
224 uint64_t zfs_arc_max;
225 uint64_t zfs_arc_min;
226 uint64_t zfs_arc_meta_limit = 0;
227 uint64_t zfs_arc_meta_min = 0;
228 int zfs_arc_grow_retry = 0;
229 int zfs_arc_shrink_shift = 0;
230 int zfs_arc_p_min_shift = 0;
231 int zfs_disable_dup_eviction = 0;
232 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */
233
234 /*
235 * Note that buffers can be in one of 6 states:
236 * ARC_anon - anonymous (discussed below)
237 * ARC_mru - recently used, currently cached
238  *	ARC_mru_ghost	- recently used, no longer in cache
239 * ARC_mfu - frequently used, currently cached
240 * ARC_mfu_ghost - frequently used, no longer in cache
241 * ARC_l2c_only - exists in L2ARC but not other states
242 * When there are no active references to the buffer, they are
243  * linked onto a list in one of these arc states.  These are
244 * the only buffers that can be evicted or deleted. Within each
245 * state there are multiple lists, one for meta-data and one for
246 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
247 * etc.) is tracked separately so that it can be managed more
248 * explicitly: favored over data, limited explicitly.
249 *
250 * Anonymous buffers are buffers that are not associated with
251 * a DVA. These are buffers that hold dirty block copies
252 * before they are written to stable storage. By definition,
253 * they are "ref'd" and are considered part of arc_mru
254  * that cannot be freed. Generally, they will acquire a DVA
255 * as they are written and migrate onto the arc_mru list.
256 *
257 * The ARC_l2c_only state is for buffers that are in the second
258 * level ARC but no longer in any of the ARC_m* lists. The second
259 * level ARC itself may also contain buffers that are in any of
260 * the ARC_m* states - meaning that a buffer can exist in two
261 * places. The reason for the ARC_l2c_only state is to keep the
262 * buffer header in the hash table, so that reads that hit the
263 * second level ARC benefit from these fast lookups.
264 */
265
266 typedef struct arc_state {
267 /*
268 * list of evictable buffers
269 */
270 multilist_t arcs_list[ARC_BUFC_NUMTYPES];
271 /*
272 * total amount of evictable data in this state
273 */
274 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];
275 /*
276 * total amount of data in this state; this includes: evictable,
277 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
278 */
279 refcount_t arcs_size;
280 } arc_state_t;
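/*
 * A hedged sketch of how the accounting above can be read (the helper name
 * is hypothetical; the real code reads these fields directly): arcs_lsize[]
 * counts only evictable bytes, broken down by buffer type, while arcs_size
 * covers everything in the state.
 *
 *	static uint64_t
 *	example_state_evictable(arc_state_t *state)
 *	{
 *		return (state->arcs_lsize[ARC_BUFC_DATA] +
 *		    state->arcs_lsize[ARC_BUFC_METADATA]);
 *	}
 */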
281
282 /* The 6 states: */
283 static arc_state_t ARC_anon;
284 static arc_state_t ARC_mru;
285 static arc_state_t ARC_mru_ghost;
286 static arc_state_t ARC_mfu;
287 static arc_state_t ARC_mfu_ghost;
288 static arc_state_t ARC_l2c_only;
289
290 typedef struct arc_stats {
291 kstat_named_t arcstat_hits;
292 kstat_named_t arcstat_misses;
293 kstat_named_t arcstat_demand_hits_data;
294 kstat_named_t arcstat_demand_misses_data;
295 kstat_named_t arcstat_demand_hits_metadata;
296 kstat_named_t arcstat_demand_misses_metadata;
297 kstat_named_t arcstat_prefetch_hits_data;
298 kstat_named_t arcstat_prefetch_misses_data;
299 kstat_named_t arcstat_prefetch_hits_metadata;
300 kstat_named_t arcstat_prefetch_misses_metadata;
301 kstat_named_t arcstat_mru_hits;
302 kstat_named_t arcstat_mru_ghost_hits;
303 kstat_named_t arcstat_mfu_hits;
304 kstat_named_t arcstat_mfu_ghost_hits;
305 kstat_named_t arcstat_deleted;
306 /*
307 * Number of buffers that could not be evicted because the hash lock
308 * was held by another thread. The lock may not necessarily be held
309 * by something using the same buffer, since hash locks are shared
310 * by multiple buffers.
311 */
312 kstat_named_t arcstat_mutex_miss;
313 /*
314 * Number of buffers skipped because they have I/O in progress, are
315  * indirect prefetch buffers that have not lived long enough, or are
316 * not from the spa we're trying to evict from.
317 */
318 kstat_named_t arcstat_evict_skip;
319 /*
320 * Number of times arc_evict_state() was unable to evict enough
321  * buffers to reach its target amount.
322 */
323 kstat_named_t arcstat_evict_not_enough;
324 kstat_named_t arcstat_evict_l2_cached;
325 kstat_named_t arcstat_evict_l2_eligible;
326 kstat_named_t arcstat_evict_l2_ineligible;
327 kstat_named_t arcstat_evict_l2_skip;
328 kstat_named_t arcstat_hash_elements;
329 kstat_named_t arcstat_hash_elements_max;
330 kstat_named_t arcstat_hash_collisions;
331 kstat_named_t arcstat_hash_chains;
332 kstat_named_t arcstat_hash_chain_max;
333 kstat_named_t arcstat_p;
334 kstat_named_t arcstat_c;
335 kstat_named_t arcstat_c_min;
336 kstat_named_t arcstat_c_max;
337 kstat_named_t arcstat_size;
338 /*
339 * Number of bytes consumed by internal ARC structures necessary
340 * for tracking purposes; these structures are not actually
341 * backed by ARC buffers. This includes arc_buf_hdr_t structures
342 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only
343 * caches), and arc_buf_t structures (allocated via arc_buf_t
344 * cache).
345 */
346 kstat_named_t arcstat_hdr_size;
347 /*
348 * Number of bytes consumed by ARC buffers of type equal to
349 * ARC_BUFC_DATA. This is generally consumed by buffers backing
350 * on disk user data (e.g. plain file contents).
351 */
352 kstat_named_t arcstat_data_size;
353 /*
354 * Number of bytes consumed by ARC buffers of type equal to
355 * ARC_BUFC_METADATA. This is generally consumed by buffers
356 * backing on disk data that is used for internal ZFS
357 * structures (e.g. ZAP, dnode, indirect blocks, etc).
358 */
359 kstat_named_t arcstat_metadata_size;
360 /*
361 * Number of bytes consumed by various buffers and structures
362 * not actually backed with ARC buffers. This includes bonus
363 * buffers (allocated directly via zio_buf_* functions),
364 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t
365 * cache), and dnode_t structures (allocated via dnode_t cache).
366 */
367 kstat_named_t arcstat_other_size;
368 /*
369 * Total number of bytes consumed by ARC buffers residing in the
370 * arc_anon state. This includes *all* buffers in the arc_anon
371 * state; e.g. data, metadata, evictable, and unevictable buffers
372 * are all included in this value.
373 */
374 kstat_named_t arcstat_anon_size;
375 /*
376 * Number of bytes consumed by ARC buffers that meet the
377 * following criteria: backing buffers of type ARC_BUFC_DATA,
378 * residing in the arc_anon state, and are eligible for eviction
379 * (e.g. have no outstanding holds on the buffer).
380 */
381 kstat_named_t arcstat_anon_evictable_data;
382 /*
383 * Number of bytes consumed by ARC buffers that meet the
384 * following criteria: backing buffers of type ARC_BUFC_METADATA,
385 * residing in the arc_anon state, and are eligible for eviction
386 * (e.g. have no outstanding holds on the buffer).
387 */
388 kstat_named_t arcstat_anon_evictable_metadata;
389 /*
390 * Total number of bytes consumed by ARC buffers residing in the
391 * arc_mru state. This includes *all* buffers in the arc_mru
392 * state; e.g. data, metadata, evictable, and unevictable buffers
393 * are all included in this value.
394 */
395 kstat_named_t arcstat_mru_size;
396 /*
397 * Number of bytes consumed by ARC buffers that meet the
398 * following criteria: backing buffers of type ARC_BUFC_DATA,
399 * residing in the arc_mru state, and are eligible for eviction
400 * (e.g. have no outstanding holds on the buffer).
401 */
402 kstat_named_t arcstat_mru_evictable_data;
403 /*
404 * Number of bytes consumed by ARC buffers that meet the
405 * following criteria: backing buffers of type ARC_BUFC_METADATA,
406 * residing in the arc_mru state, and are eligible for eviction
407 * (e.g. have no outstanding holds on the buffer).
408 */
409 kstat_named_t arcstat_mru_evictable_metadata;
410 /*
411 * Total number of bytes that *would have been* consumed by ARC
412  * buffers in the arc_mru_ghost state. The key thing to note here is
413  * that this size doesn't actually indicate
414 * RAM consumption. The ghost lists only consist of headers and
415 * don't actually have ARC buffers linked off of these headers.
416 * Thus, *if* the headers had associated ARC buffers, these
417 * buffers *would have* consumed this number of bytes.
418 */
419 kstat_named_t arcstat_mru_ghost_size;
420 /*
421 * Number of bytes that *would have been* consumed by ARC
422 * buffers that are eligible for eviction, of type
423 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state.
424 */
425 kstat_named_t arcstat_mru_ghost_evictable_data;
426 /*
427 * Number of bytes that *would have been* consumed by ARC
428 * buffers that are eligible for eviction, of type
429 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
430 */
431 kstat_named_t arcstat_mru_ghost_evictable_metadata;
432 /*
433 * Total number of bytes consumed by ARC buffers residing in the
434 * arc_mfu state. This includes *all* buffers in the arc_mfu
435 * state; e.g. data, metadata, evictable, and unevictable buffers
436 * are all included in this value.
437 */
438 kstat_named_t arcstat_mfu_size;
439 /*
440 * Number of bytes consumed by ARC buffers that are eligible for
441 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu
442 * state.
443 */
444 kstat_named_t arcstat_mfu_evictable_data;
445 /*
446 * Number of bytes consumed by ARC buffers that are eligible for
447 * eviction, of type ARC_BUFC_METADATA, and reside in the
448 * arc_mfu state.
449 */
450 kstat_named_t arcstat_mfu_evictable_metadata;
451 /*
452 * Total number of bytes that *would have been* consumed by ARC
453 * buffers in the arc_mfu_ghost state. See the comment above
454 * arcstat_mru_ghost_size for more details.
455 */
456 kstat_named_t arcstat_mfu_ghost_size;
457 /*
458 * Number of bytes that *would have been* consumed by ARC
459 * buffers that are eligible for eviction, of type
460 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state.
461 */
462 kstat_named_t arcstat_mfu_ghost_evictable_data;
463 /*
464 * Number of bytes that *would have been* consumed by ARC
465 * buffers that are eligible for eviction, of type
466 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state.
467 */
468 kstat_named_t arcstat_mfu_ghost_evictable_metadata;
469 kstat_named_t arcstat_l2_hits;
470 kstat_named_t arcstat_l2_misses;
471 kstat_named_t arcstat_l2_feeds;
472 kstat_named_t arcstat_l2_rw_clash;
473 kstat_named_t arcstat_l2_read_bytes;
474 kstat_named_t arcstat_l2_write_bytes;
475 kstat_named_t arcstat_l2_writes_sent;
476 kstat_named_t arcstat_l2_writes_done;
477 kstat_named_t arcstat_l2_writes_error;
478 kstat_named_t arcstat_l2_writes_lock_retry;
479 kstat_named_t arcstat_l2_evict_lock_retry;
480 kstat_named_t arcstat_l2_evict_reading;
481 kstat_named_t arcstat_l2_evict_l1cached;
482 kstat_named_t arcstat_l2_free_on_write;
483 kstat_named_t arcstat_l2_cdata_free_on_write;
484 kstat_named_t arcstat_l2_abort_lowmem;
485 kstat_named_t arcstat_l2_cksum_bad;
486 kstat_named_t arcstat_l2_io_error;
487 kstat_named_t arcstat_l2_size;
488 kstat_named_t arcstat_l2_asize;
489 kstat_named_t arcstat_l2_hdr_size;
490 kstat_named_t arcstat_l2_compress_successes;
491 kstat_named_t arcstat_l2_compress_zeros;
492 kstat_named_t arcstat_l2_compress_failures;
493 kstat_named_t arcstat_l2_log_blk_writes;
494 kstat_named_t arcstat_l2_log_blk_avg_size;
495 kstat_named_t arcstat_l2_data_to_meta_ratio;
496 kstat_named_t arcstat_l2_rebuild_successes;
497 kstat_named_t arcstat_l2_rebuild_abort_unsupported;
498 kstat_named_t arcstat_l2_rebuild_abort_io_errors;
499 kstat_named_t arcstat_l2_rebuild_abort_cksum_errors;
500 kstat_named_t arcstat_l2_rebuild_abort_loop_errors;
501 kstat_named_t arcstat_l2_rebuild_abort_lowmem;
502 kstat_named_t arcstat_l2_rebuild_size;
503 kstat_named_t arcstat_l2_rebuild_bufs;
504 kstat_named_t arcstat_l2_rebuild_bufs_precached;
505 kstat_named_t arcstat_l2_rebuild_psize;
506 kstat_named_t arcstat_l2_rebuild_log_blks;
507 kstat_named_t arcstat_memory_throttle_count;
508 kstat_named_t arcstat_duplicate_buffers;
509 kstat_named_t arcstat_duplicate_buffers_size;
510 kstat_named_t arcstat_duplicate_reads;
511 kstat_named_t arcstat_meta_used;
512 kstat_named_t arcstat_meta_limit;
513 kstat_named_t arcstat_meta_max;
514 kstat_named_t arcstat_meta_min;
515 kstat_named_t arcstat_sync_wait_for_async;
516 kstat_named_t arcstat_demand_hit_predictive_prefetch;
517 } arc_stats_t;
518
519 static arc_stats_t arc_stats = {
520 { "hits", KSTAT_DATA_UINT64 },
521 { "misses", KSTAT_DATA_UINT64 },
522 { "demand_data_hits", KSTAT_DATA_UINT64 },
523 { "demand_data_misses", KSTAT_DATA_UINT64 },
524 { "demand_metadata_hits", KSTAT_DATA_UINT64 },
525 { "demand_metadata_misses", KSTAT_DATA_UINT64 },
526 { "prefetch_data_hits", KSTAT_DATA_UINT64 },
527 { "prefetch_data_misses", KSTAT_DATA_UINT64 },
528 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 },
529 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 },
530 { "mru_hits", KSTAT_DATA_UINT64 },
531 { "mru_ghost_hits", KSTAT_DATA_UINT64 },
532 { "mfu_hits", KSTAT_DATA_UINT64 },
533 { "mfu_ghost_hits", KSTAT_DATA_UINT64 },
534 { "deleted", KSTAT_DATA_UINT64 },
535 { "mutex_miss", KSTAT_DATA_UINT64 },
536 { "evict_skip", KSTAT_DATA_UINT64 },
537 { "evict_not_enough", KSTAT_DATA_UINT64 },
538 { "evict_l2_cached", KSTAT_DATA_UINT64 },
539 { "evict_l2_eligible", KSTAT_DATA_UINT64 },
540 { "evict_l2_ineligible", KSTAT_DATA_UINT64 },
541 { "evict_l2_skip", KSTAT_DATA_UINT64 },
542 { "hash_elements", KSTAT_DATA_UINT64 },
543 { "hash_elements_max", KSTAT_DATA_UINT64 },
544 { "hash_collisions", KSTAT_DATA_UINT64 },
545 { "hash_chains", KSTAT_DATA_UINT64 },
546 { "hash_chain_max", KSTAT_DATA_UINT64 },
547 { "p", KSTAT_DATA_UINT64 },
548 { "c", KSTAT_DATA_UINT64 },
549 { "c_min", KSTAT_DATA_UINT64 },
550 { "c_max", KSTAT_DATA_UINT64 },
551 { "size", KSTAT_DATA_UINT64 },
552 { "hdr_size", KSTAT_DATA_UINT64 },
553 { "data_size", KSTAT_DATA_UINT64 },
554 { "metadata_size", KSTAT_DATA_UINT64 },
555 { "other_size", KSTAT_DATA_UINT64 },
556 { "anon_size", KSTAT_DATA_UINT64 },
557 { "anon_evictable_data", KSTAT_DATA_UINT64 },
558 { "anon_evictable_metadata", KSTAT_DATA_UINT64 },
559 { "mru_size", KSTAT_DATA_UINT64 },
560 { "mru_evictable_data", KSTAT_DATA_UINT64 },
561 { "mru_evictable_metadata", KSTAT_DATA_UINT64 },
562 { "mru_ghost_size", KSTAT_DATA_UINT64 },
563 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 },
564 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
565 { "mfu_size", KSTAT_DATA_UINT64 },
566 { "mfu_evictable_data", KSTAT_DATA_UINT64 },
567 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 },
568 { "mfu_ghost_size", KSTAT_DATA_UINT64 },
569 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 },
570 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 },
571 { "l2_hits", KSTAT_DATA_UINT64 },
572 { "l2_misses", KSTAT_DATA_UINT64 },
573 { "l2_feeds", KSTAT_DATA_UINT64 },
574 { "l2_rw_clash", KSTAT_DATA_UINT64 },
575 { "l2_read_bytes", KSTAT_DATA_UINT64 },
576 { "l2_write_bytes", KSTAT_DATA_UINT64 },
577 { "l2_writes_sent", KSTAT_DATA_UINT64 },
578 { "l2_writes_done", KSTAT_DATA_UINT64 },
579 { "l2_writes_error", KSTAT_DATA_UINT64 },
580 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 },
581 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
582 { "l2_evict_reading", KSTAT_DATA_UINT64 },
583 { "l2_evict_l1cached", KSTAT_DATA_UINT64 },
584 { "l2_free_on_write", KSTAT_DATA_UINT64 },
585 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 },
586 { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
587 { "l2_cksum_bad", KSTAT_DATA_UINT64 },
588 { "l2_io_error", KSTAT_DATA_UINT64 },
589 { "l2_size", KSTAT_DATA_UINT64 },
590 { "l2_asize", KSTAT_DATA_UINT64 },
591 { "l2_hdr_size", KSTAT_DATA_UINT64 },
592 { "l2_compress_successes", KSTAT_DATA_UINT64 },
593 { "l2_compress_zeros", KSTAT_DATA_UINT64 },
594 { "l2_compress_failures", KSTAT_DATA_UINT64 },
595 { "l2_log_blk_writes", KSTAT_DATA_UINT64 },
596 { "l2_log_blk_avg_size", KSTAT_DATA_UINT64 },
597 { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 },
598 { "l2_rebuild_successes", KSTAT_DATA_UINT64 },
599 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 },
600 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 },
601 { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 },
602 { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 },
603 { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 },
604 { "l2_rebuild_size", KSTAT_DATA_UINT64 },
605 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 },
606 { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 },
607 { "l2_rebuild_psize", KSTAT_DATA_UINT64 },
608 { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 },
609 { "memory_throttle_count", KSTAT_DATA_UINT64 },
610 { "duplicate_buffers", KSTAT_DATA_UINT64 },
611 { "duplicate_buffers_size", KSTAT_DATA_UINT64 },
612 { "duplicate_reads", KSTAT_DATA_UINT64 },
613 { "arc_meta_used", KSTAT_DATA_UINT64 },
614 { "arc_meta_limit", KSTAT_DATA_UINT64 },
615 { "arc_meta_max", KSTAT_DATA_UINT64 },
616 { "arc_meta_min", KSTAT_DATA_UINT64 },
617 { "sync_wait_for_async", KSTAT_DATA_UINT64 },
618 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
619 };
620
621 #define ARCSTAT(stat) (arc_stats.stat.value.ui64)
622
623 #define ARCSTAT_INCR(stat, val) \
624 atomic_add_64(&arc_stats.stat.value.ui64, (val))
625
626 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
627 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
628
629 #define ARCSTAT_MAX(stat, val) { \
630 uint64_t m; \
631 while ((val) > (m = arc_stats.stat.value.ui64) && \
632 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \
633 continue; \
634 }
635
636 #define ARCSTAT_MAXSTAT(stat) \
637 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)
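/*
 * ARCSTAT_MAX() is a lock-free high-water mark: it re-reads the kstat and
 * retries the compare-and-swap until either another thread has already
 * published a value >= val or our value is installed. For example (see
 * buf_hash_insert() below for the real call sites):
 *
 *	ARCSTAT_MAX(arcstat_hash_chain_max, chain_length);
 *	ARCSTAT_MAXSTAT(arcstat_hash_elements);
 */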
638
639 /*
640 * We define a macro to allow ARC hits/misses to be easily broken down by
641 * two separate conditions, giving a total of four different subtypes for
642 * each of hits and misses (so eight statistics total).
643 */
644 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
645 if (cond1) { \
646 if (cond2) { \
647 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
648 } else { \
649 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
650 } \
651 } else { \
652 if (cond2) { \
653 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
654 } else { \
655 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
656 } \
657 }
658
659 /*
660 * This macro allows us to use kstats as floating averages. Each time we
661 * update this kstat, we first factor it and the update value by
662 * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall
663 * average. This macro assumes that integer loads and stores are atomic, but
664 * is not safe for multiple writers updating the kstat in parallel (only the
665 * last writer's update will remain).
666 */
667 #define ARCSTAT_F_AVG_FACTOR 3
668 #define ARCSTAT_F_AVG(stat, value) \
669 do { \
670 uint64_t x = ARCSTAT(stat); \
671 x = x - x / ARCSTAT_F_AVG_FACTOR + \
672 (value) / ARCSTAT_F_AVG_FACTOR; \
673 ARCSTAT(stat) = x; \
674 _NOTE(CONSTCOND) \
675 } while (0)
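/*
 * Put differently, ARCSTAT_F_AVG() keeps an exponential moving average with
 * a weight of 1/ARCSTAT_F_AVG_FACTOR on each new sample. With the factor of
 * 3 above, a stored value of 900 updated with a sample of 300 becomes
 * 900 - 900/3 + 300/3 = 700 (integer division).
 */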
676
677 kstat_t *arc_ksp;
678 static arc_state_t *arc_anon;
679 static arc_state_t *arc_mru;
680 static arc_state_t *arc_mru_ghost;
681 static arc_state_t *arc_mfu;
682 static arc_state_t *arc_mfu_ghost;
683 static arc_state_t *arc_l2c_only;
684
685 /*
686 * There are several ARC variables that are critical to export as kstats --
687 * but we don't want to have to grovel around in the kstat whenever we wish to
688 * manipulate them. For these variables, we therefore define them to be in
689 * terms of the statistic variable. This assures that we are not introducing
690 * the possibility of inconsistency by having shadow copies of the variables,
691 * while still allowing the code to be readable.
692 */
693 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */
694 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */
695 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */
696 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */
697 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */
698 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */
699 #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */
700 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */
701 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */
702
703 #define L2ARC_IS_VALID_COMPRESS(_c_) \
704 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY)
705
706 static int arc_no_grow; /* Don't try to grow cache size */
707 static uint64_t arc_tempreserve;
708 static uint64_t arc_loaned_bytes;
709
710 typedef struct arc_callback arc_callback_t;
711
712 struct arc_callback {
713 void *acb_private;
714 arc_done_func_t *acb_done;
715 arc_buf_t *acb_buf;
716 zio_t *acb_zio_dummy;
717 arc_callback_t *acb_next;
718 };
719
720 typedef struct arc_write_callback arc_write_callback_t;
721
722 struct arc_write_callback {
723 void *awcb_private;
724 arc_done_func_t *awcb_ready;
725 arc_done_func_t *awcb_physdone;
726 arc_done_func_t *awcb_done;
727 arc_buf_t *awcb_buf;
728 };
729
730 /*
731 * ARC buffers are separated into multiple structs as a memory saving measure:
732 * - Common fields struct, always defined, and embedded within it:
733 * - L2-only fields, always allocated but undefined when not in L2ARC
734 * - L1-only fields, only allocated when in L1ARC
735 *
736 * Buffer in L1 Buffer only in L2
737 * +------------------------+ +------------------------+
738 * | arc_buf_hdr_t | | arc_buf_hdr_t |
739 * | | | |
740 * | | | |
741 * | | | |
742 * +------------------------+ +------------------------+
743 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t |
744 * | (undefined if L1-only) | | |
745 * +------------------------+ +------------------------+
746 * | l1arc_buf_hdr_t |
747 * | |
748 * | |
749 * | |
750 * | |
751 * +------------------------+
752 *
753 * Because it's possible for the L2ARC to become extremely large, we can wind
754 * up eating a lot of memory in L2ARC buffer headers, so the size of a header
755 * is minimized by only allocating the fields necessary for an L1-cached buffer
756 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and
757 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple
758 * words in pointers. arc_hdr_realloc() is used to switch a header between
759 * these two allocation states.
760 */
761 typedef struct l1arc_buf_hdr {
762 kmutex_t b_freeze_lock;
763 #ifdef ZFS_DEBUG
764 /*
765 	 * used for debugging with kmem_flags - by allocating and freeing
766 * b_thawed when the buffer is thawed, we get a record of the stack
767 * trace that thawed it.
768 */
769 void *b_thawed;
770 #endif
771
772 arc_buf_t *b_buf;
773 uint32_t b_datacnt;
774 /* for waiting on writes to complete */
775 kcondvar_t b_cv;
776
777 /* protected by arc state mutex */
778 arc_state_t *b_state;
779 multilist_node_t b_arc_node;
780
781 /* updated atomically */
782 clock_t b_arc_access;
783
784 /* self protecting */
785 refcount_t b_refcnt;
786
787 arc_callback_t *b_acb;
788 /* temporary buffer holder for in-flight compressed data */
789 void *b_tmp_cdata;
790 } l1arc_buf_hdr_t;
791
792 typedef struct l2arc_dev l2arc_dev_t;
793
794 typedef struct l2arc_buf_hdr {
795 /* protected by arc_buf_hdr mutex */
796 l2arc_dev_t *b_dev; /* L2ARC device */
797 uint64_t b_daddr; /* disk address, offset byte */
798 /* real alloc'd buffer size depending on b_compress applied */
799 int32_t b_asize;
800 uint8_t b_compress;
801
802 list_node_t b_l2node;
803 } l2arc_buf_hdr_t;
804
805 struct arc_buf_hdr {
806 /* protected by hash lock */
807 dva_t b_dva;
808 uint64_t b_birth;
809 /*
810 * Even though this checksum is only set/verified when a buffer is in
811 * the L1 cache, it needs to be in the set of common fields because it
812 * must be preserved from the time before a buffer is written out to
813 * L2ARC until after it is read back in.
814 */
815 zio_cksum_t *b_freeze_cksum;
816
817 arc_buf_hdr_t *b_hash_next;
818 arc_flags_t b_flags;
819
820 /* immutable */
821 int32_t b_size;
822 uint64_t b_spa;
823
824 /* L2ARC fields. Undefined when not in L2ARC. */
825 l2arc_buf_hdr_t b_l2hdr;
826 /* L1ARC fields. Undefined when in l2arc_only state */
827 l1arc_buf_hdr_t b_l1hdr;
828 };
829
830 static arc_buf_t *arc_eviction_list;
831 static arc_buf_hdr_t arc_eviction_hdr;
832
833 #define GHOST_STATE(state) \
834 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
835 (state) == arc_l2c_only)
836
837 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE)
838 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
839 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
840 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH)
841 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ)
842 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE)
843
844 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE)
845 #define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS)
846 #define HDR_L2_READING(hdr) \
847 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \
848 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR))
849 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING)
850 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED)
851 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD)
852
853 #define HDR_ISTYPE_METADATA(hdr) \
854 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA)
855 #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr))
856
857 #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR)
858 #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)
859
860 /*
861 * Other sizes
862 */
863
864 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t))
865 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr))
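/*
 * Because b_l1hdr is the last member of arc_buf_hdr_t, HDR_L2ONLY_SIZE is
 * simply the offset of that member: an L2-only header is the same structure
 * truncated before its L1-only fields. buf_init() below creates a kmem
 * cache for each size (hdr_full_cache and hdr_l2only_cache), and
 * arc_hdr_realloc() copies headers between the two caches as they gain or
 * lose their L1 portion.
 */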
866
867 /*
868 * Hash table routines
869 */
870
871 #define HT_LOCK_PAD 64
872
873 struct ht_lock {
874 kmutex_t ht_lock;
875 #ifdef _KERNEL
876 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
877 #endif
878 };
879
880 #define BUF_LOCKS 256
881 typedef struct buf_hash_table {
882 uint64_t ht_mask;
883 arc_buf_hdr_t **ht_table;
884 struct ht_lock ht_locks[BUF_LOCKS];
885 } buf_hash_table_t;
886
887 static buf_hash_table_t buf_hash_table;
888
889 #define BUF_HASH_INDEX(spa, dva, birth) \
890 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
891 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
892 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
893 #define HDR_LOCK(hdr) \
894 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth)))
895
896 uint64_t zfs_crc64_table[256];
897
898 /*
899 * Level 2 ARC
900 */
901
902 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
903 #define L2ARC_HEADROOM 2 /* num of writes */
904 /*
905 * If we discover during ARC scan any buffers to be compressed, we boost
906 * our headroom for the next scanning cycle by this percentage multiple.
907 */
908 #define L2ARC_HEADROOM_BOOST 200
909 #define L2ARC_FEED_SECS 1 /* caching interval secs */
910 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */
911
912 /*
913  * Used to distinguish headers that are being processed by
914  * l2arc_write_buffers(), but have yet to be assigned to an l2arc disk
915  * address. This can happen when the header is added to the l2arc's list
916  * of buffers to write in the first stage of l2arc_write_buffers(), but
917  * has not yet been written out, which happens in the second stage of
918 * l2arc_write_buffers().
919 */
920 #define L2ARC_ADDR_UNSET ((uint64_t)(-1))
921
922 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
923 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
924
925 /* L2ARC Performance Tunables */
926 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
927 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
928 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
929 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST;
930 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
931 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */
932 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
933 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */
934 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */
935
936 static list_t L2ARC_dev_list; /* device list */
937 static list_t *l2arc_dev_list; /* device list pointer */
938 static kmutex_t l2arc_dev_mtx; /* device list mutex */
939 static l2arc_dev_t *l2arc_dev_last; /* last device used */
940 static list_t L2ARC_free_on_write; /* free after write buf list */
941 static list_t *l2arc_free_on_write; /* free after write list ptr */
942 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
943 static uint64_t l2arc_ndev; /* number of devices */
944
945 typedef struct l2arc_read_callback {
946 arc_buf_t *l2rcb_buf; /* read buffer */
947 spa_t *l2rcb_spa; /* spa */
948 blkptr_t l2rcb_bp; /* original blkptr */
949 zbookmark_phys_t l2rcb_zb; /* original bookmark */
950 int l2rcb_flags; /* original flags */
951 enum zio_compress l2rcb_compress; /* applied compress */
952 } l2arc_read_callback_t;
953
954 typedef struct l2arc_write_callback {
955 l2arc_dev_t *l2wcb_dev; /* device info */
956 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
957 list_t l2wcb_log_blk_buflist; /* in-flight log blocks */
958 } l2arc_write_callback_t;
959
960 typedef struct l2arc_data_free {
961 /* protected by l2arc_free_on_write_mtx */
962 void *l2df_data;
963 size_t l2df_size;
964 void (*l2df_func)(void *, size_t);
965 list_node_t l2df_list_node;
966 } l2arc_data_free_t;
967
968 static kmutex_t l2arc_feed_thr_lock;
969 static kcondvar_t l2arc_feed_thr_cv;
970 static uint8_t l2arc_thread_exit;
971
972 static void arc_get_data_buf(arc_buf_t *);
973 static void arc_access(arc_buf_hdr_t *, kmutex_t *);
974 static boolean_t arc_is_overflowing();
975 static void arc_buf_watch(arc_buf_t *);
976 static void l2arc_read_done(zio_t *zio);
977 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd);
978
979 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *);
980 static uint32_t arc_bufc_to_flags(arc_buf_contents_t);
981 static arc_buf_contents_t arc_flags_to_bufc(uint32_t);
982
983 static boolean_t l2arc_write_eligible(uint64_t, uint64_t, arc_buf_hdr_t *);
984 static void l2arc_read_done(zio_t *);
985
986 static boolean_t l2arc_compress_buf(arc_buf_hdr_t *);
987 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress);
988 static void l2arc_release_cdata_buf(arc_buf_hdr_t *);
989
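/*
 * arc_update_hit_stat() below is the primary consumer of ARCSTAT_CONDSTAT():
 * for a data buffer, the macro expands to a bump of exactly one of
 * arcstat_demand_hits_data, arcstat_demand_misses_data,
 * arcstat_prefetch_hits_data or arcstat_prefetch_misses_data, depending on
 * whether the header was prefetched and whether the lookup was a hit;
 * metadata buffers update the corresponding *_metadata counters.
 */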
990 static void
991 arc_update_hit_stat(arc_buf_hdr_t *hdr, boolean_t hit)
992 {
993 boolean_t pf = !HDR_PREFETCH(hdr);
994 switch (arc_buf_type(hdr)) {
995 case ARC_BUFC_DATA:
996 ARCSTAT_CONDSTAT(pf, demand, prefetch, hit, hits, misses, data);
997 break;
998 case ARC_BUFC_METADATA:
999 ARCSTAT_CONDSTAT(pf, demand, prefetch, hit, hits, misses,
1000 metadata);
1001 break;
1002 default:
1003 break;
1004 }
1005 }
1006
1007 enum {
1008 L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */
1009 };
1010
1011 /*
1012 * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers).
1013 */
1014 typedef struct l2arc_log_blkptr {
1015 uint64_t lbp_daddr; /* device address of log */
1016 /*
1017 * lbp_prop is the same format as the blk_prop in blkptr_t:
1018 * * logical size (in sectors)
1019 * * physical (compressed) size (in sectors)
1020 * * compression algorithm (we always LZ4-compress l2arc logs)
1021 * * checksum algorithm (used for lbp_cksum)
1022 * * object type & level (unused for now)
1023 */
1024 uint64_t lbp_prop;
1025 zio_cksum_t lbp_cksum; /* fletcher4 of log */
1026 } l2arc_log_blkptr_t;
1027
1028 /*
1029 * The persistent L2ARC device header.
1030 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
1031 */
1032 typedef struct l2arc_dev_hdr_phys {
1033 uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */
1034 zio_cksum_t dh_self_cksum; /* fletcher4 of fields below */
1035
1036 /*
1037 * Global L2ARC device state and metadata.
1038 */
1039 uint64_t dh_spa_guid;
1040 uint64_t dh_alloc_space; /* vdev space alloc status */
1041 uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */
1042
1043 /*
1044 * Start of log block chain. [0] -> newest log, [1] -> one older (used
1045 * for initiating prefetch).
1046 */
1047 l2arc_log_blkptr_t dh_start_lbps[2];
1048
1049 const uint64_t dh_pad[44]; /* pad to 512 bytes */
1050 } l2arc_dev_hdr_phys_t;
1051 CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);
1052
1053 /*
1054 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
1055 */
1056 typedef struct l2arc_log_ent_phys {
1057 dva_t le_dva; /* dva of buffer */
1058 uint64_t le_birth; /* birth txg of buffer */
1059 zio_cksum_t le_freeze_cksum;
1060 /*
1061 * le_prop is the same format as the blk_prop in blkptr_t:
1062 * * logical size (in sectors)
1063 * * physical (compressed) size (in sectors)
1064 * * compression algorithm
1065 * * checksum algorithm (used for b_freeze_cksum)
1066 * * object type & level (used to restore arc_buf_contents_t)
1067 */
1068 uint64_t le_prop;
1069 uint64_t le_daddr; /* buf location on l2dev */
1070 const uint64_t le_pad[7]; /* resv'd for future use */
1071 } l2arc_log_ent_phys_t;
1072
1073 /*
1074 * These design limits give us the following metadata overhead (before
1075 * compression):
1076 * avg_blk_sz overhead
1077 * 1k 12.51 %
1078 * 2k 6.26 %
1079 * 4k 3.13 %
1080 * 8k 1.56 %
1081 * 16k 0.78 %
1082 * 32k 0.39 %
1083 * 64k 0.20 %
1084 * 128k 0.10 %
1085  * Compression should be able to squeeze these down by about a factor of 2.
1086 */
1087 #define L2ARC_LOG_BLK_SIZE (128 * 1024) /* 128k */
1088 #define L2ARC_LOG_BLK_HEADER_LEN (128)
1089 #define L2ARC_LOG_BLK_ENTRIES /* 1023 entries */ \
1090 ((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) / \
1091 sizeof (l2arc_log_ent_phys_t))
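/*
 * Worked example of the arithmetic behind the table above: each
 * l2arc_log_ent_phys_t is 128 bytes, so a 128k log block with a 128-byte
 * header holds (131072 - 128) / 128 = 1023 entries. The metadata overhead
 * for a given average block size is then
 * L2ARC_LOG_BLK_SIZE / (L2ARC_LOG_BLK_ENTRIES * avg_blk_sz); for 1k blocks
 * that is 131072 / (1023 * 1024), i.e. the 12.51% in the first row.
 */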
1092 /*
1093 * Maximum amount of data in an l2arc log block (used to terminate rebuilding
1094 * before we hit the write head and restore potentially corrupted blocks).
1095 */
1096 #define L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE \
1097 (SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
1098 /*
1099 * For the persistency and rebuild algorithms to operate reliably we need
1100 * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
1101 * excessive log block looping might confuse the log chain end detection).
1102 * Under normal circumstances this is not a problem, since this is somewhere
1103 * around only 400 MB.
1104 */
1105 #define L2ARC_PERSIST_MIN_SIZE (3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)
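/*
 * For reference, the ~400 MB figure above assumes a 128k SPA_MAXBLOCKSIZE:
 * 3 * 1023 * 128k is roughly 402 MB. A larger maximum block size would
 * scale this minimum up proportionally.
 */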
1106
1107 /*
1108 * A log block of up to 1023 ARC buffer log entries, chained into the
1109 * persistent L2ARC metadata linked list. Byte order of magic determines
1110 * whether 64-bit bswap of fields is necessary.
1111 */
1112 typedef struct l2arc_log_blk_phys {
1113 /* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
1114 uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */
1115 l2arc_log_blkptr_t lb_back2_lbp; /* back 2 steps in chain */
1116 uint64_t lb_pad[9]; /* resv'd for future use */
1117 /* Payload */
1118 l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_ENTRIES];
1119 } l2arc_log_blk_phys_t;
1120
1121 CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
1122 CTASSERT(offsetof(l2arc_log_blk_phys_t, lb_entries) -
1123 offsetof(l2arc_log_blk_phys_t, lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);
1124
1125 /*
1126 * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
1127 * written to the L2ARC device. They may be compressed, hence the uint8_t[].
1128 */
1129 typedef struct l2arc_log_blk_buf {
1130 uint8_t lbb_log_blk[sizeof (l2arc_log_blk_phys_t)];
1131 list_node_t lbb_node;
1132 } l2arc_log_blk_buf_t;
1133
1134 /* Macros for manipulating fields in the blk_prop format of blkptr_t */
1135 #define BLKPROP_GET_LSIZE(_obj, _field) \
1136 BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1)
1137 #define BLKPROP_SET_LSIZE(_obj, _field, x) \
1138 BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x)
1139 #define BLKPROP_GET_PSIZE(_obj, _field) \
1140 BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1)
1141 #define BLKPROP_SET_PSIZE(_obj, _field, x) \
1142 BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x)
1143 #define BLKPROP_GET_COMPRESS(_obj, _field) \
1144 BF64_GET((_obj)->_field, 32, 8)
1145 #define BLKPROP_SET_COMPRESS(_obj, _field, x) \
1146 BF64_SET((_obj)->_field, 32, 8, x)
1147 #define BLKPROP_GET_CHECKSUM(_obj, _field) \
1148 BF64_GET((_obj)->_field, 40, 8)
1149 #define BLKPROP_SET_CHECKSUM(_obj, _field, x) \
1150 BF64_SET((_obj)->_field, 40, 8, x)
1151 #define BLKPROP_GET_TYPE(_obj, _field) \
1152 BF64_GET((_obj)->_field, 48, 8)
1153 #define BLKPROP_SET_TYPE(_obj, _field, x) \
1154 BF64_SET((_obj)->_field, 48, 8, x)
1155
1156 /* Macros for manipulating a l2arc_log_blkptr_t->lbp_prop field */
1157 #define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, lbp_prop)
1158 #define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, lbp_prop, x)
1159 #define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, lbp_prop)
1160 #define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, lbp_prop, x)
1161 #define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, lbp_prop)
1162 #define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, lbp_prop, \
1163 x)
1164 #define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, lbp_prop)
1165 #define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, lbp_prop, \
1166 x)
1167 #define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, lbp_prop)
1168 #define LBP_SET_TYPE(_add, x) BLKPROP_SET_TYPE(_add, lbp_prop, x)
1169
1170 /* Macros for manipulating a l2arc_log_ent_phys_t->le_prop field */
1171 #define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, le_prop)
1172 #define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, le_prop, x)
1173 #define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, le_prop)
1174 #define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, le_prop, x)
1175 #define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, le_prop)
1176 #define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, le_prop, x)
1177 #define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, le_prop)
1178 #define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, le_prop, x)
1179 #define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, le_prop)
1180 #define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, le_prop, x)
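/*
 * A minimal sketch (with hypothetical values) of how the accessors above
 * are used to fill in a log block pointer; the authoritative encoding is
 * done when log blocks are committed to the device:
 *
 *	l2arc_log_blkptr_t lbp = { 0 };
 *
 *	lbp.lbp_daddr = dev->l2ad_hand;
 *	LBP_SET_LSIZE(&lbp, sizeof (l2arc_log_blk_phys_t));
 *	LBP_SET_PSIZE(&lbp, compressed_size);
 *	LBP_SET_COMPRESS(&lbp, ZIO_COMPRESS_LZ4);
 *	LBP_SET_CHECKSUM(&lbp, ZIO_CHECKSUM_FLETCHER_4);
 *	LBP_SET_TYPE(&lbp, 0);
 */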
1181
1182 #define PTR_SWAP(x, y) \
1183 do { \
1184 void *tmp = (x);\
1185 x = y; \
1186 y = tmp; \
1187 _NOTE(CONSTCOND)\
1188 } while (0)
1189
1190 #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */
1191 #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */
1192
1193 /*
1194 * Performance tuning of L2ARC persistency:
1195 *
1196  * l2arc_rebuild_enabled : Controls whether adding an L2ARC device (either
1197  *		at pool import or manually later) will attempt to rebuild the
1198  *		L2ARC buffer contents. In special circumstances,
1199  *		the administrator may want to set this to B_FALSE if they
1200 * are having trouble importing a pool or attaching an L2ARC
1201 * device (e.g. the L2ARC device is slow to read in stored log
1202 * metadata, or the metadata has become somehow
1203 * fragmented/unusable).
1204 */
1205 boolean_t l2arc_rebuild_enabled = B_TRUE;
1206
1207 /* L2ARC persistency rebuild control routines. */
1208 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev);
1209 static int l2arc_rebuild(l2arc_dev_t *dev);
1210
1211 /* L2ARC persistency read I/O routines. */
1212 static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
1213 static int l2arc_log_blk_read(l2arc_dev_t *dev,
1214 const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
1215 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
1216 uint8_t *this_lb_buf, uint8_t *next_lb_buf,
1217 zio_t *this_io, zio_t **next_io);
1218 static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
1219 const l2arc_log_blkptr_t *lp, uint8_t *lb_buf);
1220 static void l2arc_log_blk_prefetch_abort(zio_t *zio);
1221
1222 /* L2ARC persistency block restoration routines. */
1223 static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
1224 const l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
1225 static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
1226 l2arc_dev_t *dev, uint64_t guid);
1227
1228 /* L2ARC persistency write I/O routines. */
1229 static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
1230 static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
1231 l2arc_write_callback_t *cb);
1232
1233 /* L2ARC persistency auxiliary routines. */
1234 static boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev,
1235 const l2arc_log_blkptr_t *lp);
1236 static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr,
1237 zio_cksum_t *cksum);
1238 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev,
1239 const arc_buf_hdr_t *ab);
1240 static inline boolean_t l2arc_range_check_overlap(uint64_t bottom,
1241 uint64_t top, uint64_t check);
1242
1243 /*
1244 * L2ARC Internals
1245 */
1246 struct l2arc_dev {
1247 vdev_t *l2ad_vdev; /* vdev */
1248 spa_t *l2ad_spa; /* spa */
1249 uint64_t l2ad_hand; /* next write location */
1250 uint64_t l2ad_start; /* first addr on device */
1251 uint64_t l2ad_end; /* last addr on device */
1252 boolean_t l2ad_first; /* first sweep through */
1253 boolean_t l2ad_writing; /* currently writing */
1254 kmutex_t l2ad_mtx; /* lock for buffer list */
1255 list_t l2ad_buflist; /* buffer list */
1256 list_node_t l2ad_node; /* device list node */
1257 refcount_t l2ad_alloc; /* allocated bytes */
1258 l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */
1259 uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */
1260 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */
1261 int l2ad_log_ent_idx; /* index into cur log blk */
1262 /* number of bytes in current log block's payload */
1263 uint64_t l2ad_log_blk_payload_asize;
1264 /* flag indicating whether a rebuild is scheduled or is going on */
1265 boolean_t l2ad_rebuild;
1266 boolean_t l2ad_rebuild_cancel;
1267 kt_did_t l2ad_rebuild_did;
1268 };
1269
1270 static inline uint64_t
1271 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth)
1272 {
1273 uint8_t *vdva = (uint8_t *)dva;
1274 uint64_t crc = -1ULL;
1275 int i;
1276
1277 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
1278
1279 for (i = 0; i < sizeof (dva_t); i++)
1280 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
1281
1282 crc ^= (spa>>8) ^ birth;
1283
1284 return (crc);
1285 }
1286
1287 #define BUF_EMPTY(buf) \
1288 ((buf)->b_dva.dva_word[0] == 0 && \
1289 (buf)->b_dva.dva_word[1] == 0)
1290
1291 #define BUF_EQUAL(spa, dva, birth, buf) \
1292 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \
1293 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \
1294 ((buf)->b_birth == birth) && ((buf)->b_spa == spa)
1295
1296 static void
1297 buf_discard_identity(arc_buf_hdr_t *hdr)
1298 {
1299 hdr->b_dva.dva_word[0] = 0;
1300 hdr->b_dva.dva_word[1] = 0;
1301 hdr->b_birth = 0;
1302 }
1303
1304 static arc_buf_hdr_t *
1305 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp)
1306 {
1307 const dva_t *dva = BP_IDENTITY(bp);
1308 uint64_t birth = BP_PHYSICAL_BIRTH(bp);
1309 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
1310 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1311 arc_buf_hdr_t *hdr;
1312
1313 mutex_enter(hash_lock);
1314 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL;
1315 hdr = hdr->b_hash_next) {
1316 if (BUF_EQUAL(spa, dva, birth, hdr)) {
1317 *lockp = hash_lock;
1318 return (hdr);
1319 }
1320 }
1321 mutex_exit(hash_lock);
1322 *lockp = NULL;
1323 return (NULL);
1324 }
1325
1326 /*
1327 * Insert an entry into the hash table. If there is already an element
1328  * equal to hdr in the hash table, then the already existing element
1329 * will be returned and the new element will not be inserted.
1330 * Otherwise returns NULL.
1331 * If lockp == NULL, the caller is assumed to already hold the hash lock.
1332 */
1333 static arc_buf_hdr_t *
1334 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp)
1335 {
1336 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1337 kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
1338 arc_buf_hdr_t *fhdr;
1339 uint32_t i;
1340
1341 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva));
1342 ASSERT(hdr->b_birth != 0);
1343 ASSERT(!HDR_IN_HASH_TABLE(hdr));
1344
1345 if (lockp != NULL) {
1346 *lockp = hash_lock;
1347 mutex_enter(hash_lock);
1348 } else {
1349 ASSERT(MUTEX_HELD(hash_lock));
1350 }
1351
1352 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL;
1353 fhdr = fhdr->b_hash_next, i++) {
1354 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr))
1355 return (fhdr);
1356 }
1357
1358 hdr->b_hash_next = buf_hash_table.ht_table[idx];
1359 buf_hash_table.ht_table[idx] = hdr;
1360 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
1361
1362 /* collect some hash table performance data */
1363 if (i > 0) {
1364 ARCSTAT_BUMP(arcstat_hash_collisions);
1365 if (i == 1)
1366 ARCSTAT_BUMP(arcstat_hash_chains);
1367
1368 ARCSTAT_MAX(arcstat_hash_chain_max, i);
1369 }
1370
1371 ARCSTAT_BUMP(arcstat_hash_elements);
1372 ARCSTAT_MAXSTAT(arcstat_hash_elements);
1373
1374 return (NULL);
1375 }
1376
1377 static void
1378 buf_hash_remove(arc_buf_hdr_t *hdr)
1379 {
1380 arc_buf_hdr_t *fhdr, **hdrp;
1381 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth);
1382
1383 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
1384 ASSERT(HDR_IN_HASH_TABLE(hdr));
1385
1386 hdrp = &buf_hash_table.ht_table[idx];
1387 while ((fhdr = *hdrp) != hdr) {
1388 ASSERT(fhdr != NULL);
1389 hdrp = &fhdr->b_hash_next;
1390 }
1391 *hdrp = hdr->b_hash_next;
1392 hdr->b_hash_next = NULL;
1393 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE;
1394
1395 /* collect some hash table performance data */
1396 ARCSTAT_BUMPDOWN(arcstat_hash_elements);
1397
1398 if (buf_hash_table.ht_table[idx] &&
1399 buf_hash_table.ht_table[idx]->b_hash_next == NULL)
1400 ARCSTAT_BUMPDOWN(arcstat_hash_chains);
1401 }
1402
1403 /*
1404 * Global data structures and functions for the buf kmem cache.
1405 */
1406 static kmem_cache_t *hdr_full_cache;
1407 static kmem_cache_t *hdr_l2only_cache;
1408 static kmem_cache_t *buf_cache;
1409
1410 static void
1411 buf_fini(void)
1412 {
1413 int i;
1414
1415 kmem_free(buf_hash_table.ht_table,
1416 (buf_hash_table.ht_mask + 1) * sizeof (void *));
1417 for (i = 0; i < BUF_LOCKS; i++)
1418 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
1419 kmem_cache_destroy(hdr_full_cache);
1420 kmem_cache_destroy(hdr_l2only_cache);
1421 kmem_cache_destroy(buf_cache);
1422 }
1423
1424 /*
1425 * Constructor callback - called when the cache is empty
1426 * and a new buf is requested.
1427 */
1428 /* ARGSUSED */
1429 static int
1430 hdr_full_cons(void *vbuf, void *unused, int kmflag)
1431 {
1432 arc_buf_hdr_t *hdr = vbuf;
1433
1434 bzero(hdr, HDR_FULL_SIZE);
1435 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL);
1436 refcount_create(&hdr->b_l1hdr.b_refcnt);
1437 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
1438 multilist_link_init(&hdr->b_l1hdr.b_arc_node);
1439 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1440
1441 return (0);
1442 }
1443
1444 /* ARGSUSED */
1445 static int
1446 hdr_l2only_cons(void *vbuf, void *unused, int kmflag)
1447 {
1448 arc_buf_hdr_t *hdr = vbuf;
1449
1450 bzero(hdr, HDR_L2ONLY_SIZE);
1451 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1452
1453 return (0);
1454 }
1455
1456 /* ARGSUSED */
1457 static int
1458 buf_cons(void *vbuf, void *unused, int kmflag)
1459 {
1460 arc_buf_t *buf = vbuf;
1461
1462 bzero(buf, sizeof (arc_buf_t));
1463 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL);
1464 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1465
1466 return (0);
1467 }
1468
1469 /*
1470 * Destructor callback - called when a cached buf is
1471 * no longer required.
1472 */
1473 /* ARGSUSED */
1474 static void
1475 hdr_full_dest(void *vbuf, void *unused)
1476 {
1477 arc_buf_hdr_t *hdr = vbuf;
1478
1479 ASSERT(BUF_EMPTY(hdr));
1480 cv_destroy(&hdr->b_l1hdr.b_cv);
1481 refcount_destroy(&hdr->b_l1hdr.b_refcnt);
1482 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock);
1483 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1484 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS);
1485 }
1486
1487 /* ARGSUSED */
1488 static void
1489 hdr_l2only_dest(void *vbuf, void *unused)
1490 {
1491 arc_buf_hdr_t *hdr = vbuf;
1492
1493 ASSERT(BUF_EMPTY(hdr));
1494 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS);
1495 }
1496
1497 /* ARGSUSED */
1498 static void
1499 buf_dest(void *vbuf, void *unused)
1500 {
1501 arc_buf_t *buf = vbuf;
1502
1503 mutex_destroy(&buf->b_evict_lock);
1504 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS);
1505 }
1506
1507 /*
1508 * Reclaim callback -- invoked when memory is low.
1509 */
1510 /* ARGSUSED */
1511 static void
1512 hdr_recl(void *unused)
1513 {
1514 dprintf("hdr_recl called\n");
1515 /*
1516 * umem calls the reclaim func when we destroy the buf cache,
1517 * which is after we do arc_fini().
1518 */
1519 if (!arc_dead)
1520 cv_signal(&arc_reclaim_thread_cv);
1521 }
1522
1523 static void
1524 buf_init(void)
1525 {
1526 uint64_t *ct;
1527 uint64_t hsize = 1ULL << 12;
1528 int i, j;
1529
1530 /*
1531 * The hash table is big enough to fill all of physical memory
1532 * with an average block size of zfs_arc_average_blocksize (default 8K).
1533 * By default, the table will take up
1534 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
1535 */
1536 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
1537 hsize <<= 1;
1538 retry:
1539 buf_hash_table.ht_mask = hsize - 1;
1540 buf_hash_table.ht_table =
1541 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
1542 if (buf_hash_table.ht_table == NULL) {
1543 ASSERT(hsize > (1ULL << 8));
1544 hsize >>= 1;
1545 goto retry;
1546 }
1547
1548 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE,
1549 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0);
1550 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only",
1551 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl,
1552 NULL, NULL, 0);
1553 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
1554 0, buf_cons, buf_dest, NULL, NULL, NULL, 0);
1555
1556 for (i = 0; i < 256; i++)
1557 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
1558 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
1559
1560 for (i = 0; i < BUF_LOCKS; i++) {
1561 mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
1562 NULL, MUTEX_DEFAULT, NULL);
1563 }
1564 }
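
/*
 * Hypothetical worked example of the sizing loop in buf_init() above,
 * assuming 16 GiB of physical memory and the default 8 KiB
 * zfs_arc_average_blocksize:
 *
 *	hsize doubles until hsize * 8 KiB >= 16 GiB, i.e. hsize = 2^21
 *	table memory = 2^21 buckets * sizeof (void *) = 16 MiB
 *
 * which matches the "roughly 1 MB of table per GB of RAM with 8-byte
 * pointers" rule of thumb in the comment above. If the allocation fails,
 * the retry path halves hsize until kmem_zalloc() succeeds.
 */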
1565
1566 /*
1567 * Transition between the two allocation states for the arc_buf_hdr struct.
1568 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without
1569 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller
1570 * version is used when a cache buffer is only in the L2ARC in order to reduce
1571 * memory usage.
1572 */
1573 static arc_buf_hdr_t *
1574 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
1575 {
1576 ASSERT(HDR_HAS_L2HDR(hdr));
1577
1578 arc_buf_hdr_t *nhdr;
1579 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
1580
1581 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
1582 (old == hdr_l2only_cache && new == hdr_full_cache));
1583
1584 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
1585
1586 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
1587 buf_hash_remove(hdr);
1588
1589 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
1590
1591 if (new == hdr_full_cache) {
1592 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
1593 /*
1594 * arc_access and arc_change_state need to be aware that a
1595 * header has just come out of L2ARC, so we set its state to
1596 * l2c_only even though it's about to change.
1597 */
1598 nhdr->b_l1hdr.b_state = arc_l2c_only;
1599
1600 /* Verify previous threads set to NULL before freeing */
1601 ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1602 } else {
1603 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1604 ASSERT0(hdr->b_l1hdr.b_datacnt);
1605
1606 /*
1607 * If we've reached here, we must have been called from
1608 * arc_evict_hdr(), as such we should have already been
1609 * removed from any ghost list we were previously on
1610 * (which protects us from racing with arc_evict_state),
1611 * thus no locking is needed during this check.
1612 */
1613 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
1614
1615 /*
1616 * A buffer must not be moved into the arc_l2c_only
1617 * state if it's not finished being written out to the
1618 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
1619 * might still be accessed, even though it was removed.
1620 */
1621 VERIFY(!HDR_L2_WRITING(hdr));
1622 VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
1623
1624 #ifdef ZFS_DEBUG
1625 if (hdr->b_l1hdr.b_thawed != NULL) {
1626 kmem_free(hdr->b_l1hdr.b_thawed, 1);
1627 hdr->b_l1hdr.b_thawed = NULL;
1628 }
1629 #endif
1630
1631 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
1632 }
1633 /*
1634 * The header has been reallocated so we need to re-insert it into any
1635 * lists it was on.
1636 */
1637 (void) buf_hash_insert(nhdr, NULL);
1638
1639 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
1640
1641 mutex_enter(&dev->l2ad_mtx);
1642
1643 /*
1644 * We must place the realloc'ed header back into the list at
1645 * the same spot. Otherwise, if it's placed earlier in the list,
1646 * l2arc_write_buffers() could find it during the function's
1647 * write phase, and try to write it out to the l2arc.
1648 */
1649 list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
1650 list_remove(&dev->l2ad_buflist, hdr);
1651
1652 mutex_exit(&dev->l2ad_mtx);
1653
1654 /*
1655 * Since we're using the pointer address as the tag when
1656 * incrementing and decrementing the l2ad_alloc refcount, we
1657 * must remove the old pointer (that we're about to destroy) and
1658 * add the new pointer to the refcount. Otherwise we'd remove
1659 * the wrong pointer address when calling arc_hdr_destroy() later.
1660 */
1661
1662 (void) refcount_remove_many(&dev->l2ad_alloc,
1663 hdr->b_l2hdr.b_asize, hdr);
1664
1665 (void) refcount_add_many(&dev->l2ad_alloc,
1666 nhdr->b_l2hdr.b_asize, nhdr);
1667
1668 buf_discard_identity(hdr);
1669 hdr->b_freeze_cksum = NULL;
1670 kmem_cache_free(old, hdr);
1671
1672 return (nhdr);
1673 }
1674
1675
1676 #define ARC_MINTIME (hz>>4) /* 62 ms */
1677
1678 static void
1679 arc_cksum_verify(arc_buf_t *buf)
1680 {
1681 zio_cksum_t zc;
1682
1683 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1684 return;
1685
1686 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1687 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
1688 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1689 return;
1690 }
1691 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
1692 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
1693 panic("buffer modified while frozen!");
1694 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1695 }
1696
1697 static int
1698 arc_cksum_equal(arc_buf_t *buf)
1699 {
1700 zio_cksum_t zc;
1701 int equal;
1702
1703 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1704 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
1705 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
1706 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1707
1708 return (equal);
1709 }
1710
1711 static void
1712 arc_cksum_compute(arc_buf_t *buf, boolean_t force)
1713 {
1714 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
1715 return;
1716
1717 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1718 if (buf->b_hdr->b_freeze_cksum != NULL) {
1719 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1720 return;
1721 }
1722 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
1723 fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
1724 NULL, buf->b_hdr->b_freeze_cksum);
1725 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1726 arc_buf_watch(buf);
1727 }
1728
1729 #ifndef _KERNEL
1730 typedef struct procctl {
1731 long cmd;
1732 prwatch_t prwatch;
1733 } procctl_t;
1734 #endif
1735
1736 /* ARGSUSED */
1737 static void
1738 arc_buf_unwatch(arc_buf_t *buf)
1739 {
1740 #ifndef _KERNEL
1741 if (arc_watch) {
1742 int result;
1743 procctl_t ctl;
1744 ctl.cmd = PCWATCH;
1745 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1746 ctl.prwatch.pr_size = 0;
1747 ctl.prwatch.pr_wflags = 0;
1748 result = write(arc_procfd, &ctl, sizeof (ctl));
1749 ASSERT3U(result, ==, sizeof (ctl));
1750 }
1751 #endif
1752 }
1753
1754 /* ARGSUSED */
1755 static void
1756 arc_buf_watch(arc_buf_t *buf)
1757 {
1758 #ifndef _KERNEL
1759 if (arc_watch) {
1760 int result;
1761 procctl_t ctl;
1762 ctl.cmd = PCWATCH;
1763 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
1764 ctl.prwatch.pr_size = buf->b_hdr->b_size;
1765 ctl.prwatch.pr_wflags = WA_WRITE;
1766 result = write(arc_procfd, &ctl, sizeof (ctl));
1767 ASSERT3U(result, ==, sizeof (ctl));
1768 }
1769 #endif
1770 }
1771
1772 static arc_buf_contents_t
1773 arc_buf_type(arc_buf_hdr_t *hdr)
1774 {
1775 if (HDR_ISTYPE_METADATA(hdr)) {
1776 return (ARC_BUFC_METADATA);
1777 } else {
1778 return (ARC_BUFC_DATA);
1779 }
1780 }
1781
1782 static uint32_t
1783 arc_bufc_to_flags(arc_buf_contents_t type)
1784 {
1785 switch (type) {
1786 case ARC_BUFC_DATA:
1787 /* metadata field is 0 if buffer contains normal data */
1788 return (0);
1789 case ARC_BUFC_METADATA:
1790 return (ARC_FLAG_BUFC_METADATA);
1791 default:
1792 break;
1793 }
1794 panic("undefined ARC buffer type!");
1795 return ((uint32_t)-1);
1796 }
1797
1798 static arc_buf_contents_t
1799 arc_flags_to_bufc(uint32_t flags)
1800 {
1801 if (flags & ARC_FLAG_BUFC_METADATA)
1802 return (ARC_BUFC_METADATA);
1803 return (ARC_BUFC_DATA);
1804 }
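
/*
 * A small illustrative check (not part of the build): arc_bufc_to_flags()
 * and arc_flags_to_bufc() above are inverses for the two valid buffer
 * types, which is what lets a header record its type as a single flag bit.
 */
#if 0
	ASSERT3U(arc_flags_to_bufc(arc_bufc_to_flags(ARC_BUFC_METADATA)), ==,
	    ARC_BUFC_METADATA);
	ASSERT3U(arc_flags_to_bufc(arc_bufc_to_flags(ARC_BUFC_DATA)), ==,
	    ARC_BUFC_DATA);
#endif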
1805
1806 void
1807 arc_buf_thaw(arc_buf_t *buf)
1808 {
1809 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1810 if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
1811 panic("modifying non-anon buffer!");
1812 if (HDR_IO_IN_PROGRESS(buf->b_hdr))
1813 panic("modifying buffer while i/o in progress!");
1814 arc_cksum_verify(buf);
1815 }
1816
1817 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1818 if (buf->b_hdr->b_freeze_cksum != NULL) {
1819 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
1820 buf->b_hdr->b_freeze_cksum = NULL;
1821 }
1822
1823 #ifdef ZFS_DEBUG
1824 if (zfs_flags & ZFS_DEBUG_MODIFY) {
1825 if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
1826 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
1827 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
1828 }
1829 #endif
1830
1831 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
1832
1833 arc_buf_unwatch(buf);
1834 }
1835
1836 void
1837 arc_buf_freeze(arc_buf_t *buf)
1838 {
1839 kmutex_t *hash_lock;
1840
1841 if (!(zfs_flags & ZFS_DEBUG_MODIFY))
1842 return;
1843
1844 hash_lock = HDR_LOCK(buf->b_hdr);
1845 mutex_enter(hash_lock);
1846
1847 ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
1848 buf->b_hdr->b_l1hdr.b_state == arc_anon);
1849 arc_cksum_compute(buf, B_FALSE);
1850 mutex_exit(hash_lock);
1851
1852 }
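
/*
 * A hedged sketch (not part of the build) of how the freeze/thaw/verify
 * routines above fit together when ZFS_DEBUG_MODIFY is set. The
 * fill_in_data() call is hypothetical and stands in for whatever producer
 * writes into the anonymous buffer; 'buf' is assumed from the caller.
 */
#if 0
	arc_buf_thaw(buf);		/* discard the old freeze checksum */
	fill_in_data(buf->b_data);	/* modify the (anonymous) buffer */
	arc_buf_freeze(buf);		/* record a cksum via arc_cksum_compute() */
	arc_cksum_verify(buf);		/* panics if b_data changed while frozen */
#endif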
1853
1854 static void
1855 add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1856 {
1857 ASSERT(HDR_HAS_L1HDR(hdr));
1858 ASSERT(MUTEX_HELD(hash_lock));
1859 arc_state_t *state = hdr->b_l1hdr.b_state;
1860
1861 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
1862 (state != arc_anon)) {
1863 /* We don't use the L2-only state list. */
1864 if (state != arc_l2c_only) {
1865 arc_buf_contents_t type = arc_buf_type(hdr);
1866 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt;
1867 multilist_t *list = &state->arcs_list[type];
1868 uint64_t *size = &state->arcs_lsize[type];
1869
1870 multilist_remove(list, hdr);
1871
1872 if (GHOST_STATE(state)) {
1873 ASSERT0(hdr->b_l1hdr.b_datacnt);
1874 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
1875 delta = hdr->b_size;
1876 }
1877 ASSERT(delta > 0);
1878 ASSERT3U(*size, >=, delta);
1879 atomic_add_64(size, -delta);
1880 }
1881 /* remove the prefetch flag if we get a reference */
1882 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
1883 }
1884 }
1885
1886 static int
1887 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
1888 {
1889 int cnt;
1890 arc_state_t *state = hdr->b_l1hdr.b_state;
1891
1892 ASSERT(HDR_HAS_L1HDR(hdr));
1893 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
1894 ASSERT(!GHOST_STATE(state));
1895
1896 /*
1897 * arc_l2c_only counts as a ghost state so we don't need to explicitly
1898 * check to prevent usage of the arc_l2c_only list.
1899 */
1900 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) &&
1901 (state != arc_anon)) {
1902 arc_buf_contents_t type = arc_buf_type(hdr);
1903 multilist_t *list = &state->arcs_list[type];
1904 uint64_t *size = &state->arcs_lsize[type];
1905
1906 multilist_insert(list, hdr);
1907
1908 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
1909 atomic_add_64(size, hdr->b_size *
1910 hdr->b_l1hdr.b_datacnt);
1911 }
1912 return (cnt);
1913 }
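
/*
 * Sketch of the pairing contract for add_reference()/remove_reference()
 * above (not part of the build; 'hdr', 'hash_lock' and 'tag' are assumed
 * from the surrounding caller): while any tag holds a reference, the
 * header is unlinked from its state's multilist and is un-evictable;
 * dropping the last reference re-links it.
 */
#if 0
	add_reference(hdr, hash_lock, tag);		/* 0 -> 1: off the list */
	/* ... caller reads or writes the buffer ... */
	(void) remove_reference(hdr, hash_lock, tag);	/* 1 -> 0: evictable again */
#endif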
1914
1915 /*
1916 * Move the supplied buffer to the indicated state. The hash lock
1917 * for the buffer must be held by the caller.
1918 */
1919 static void
1920 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr,
1921 kmutex_t *hash_lock)
1922 {
1923 arc_state_t *old_state;
1924 int64_t refcnt;
1925 uint32_t datacnt;
1926 uint64_t from_delta, to_delta;
1927 arc_buf_contents_t buftype = arc_buf_type(hdr);
1928
1929 /*
1930 * We almost always have an L1 hdr here, since we call arc_hdr_realloc()
1931 * in arc_read() when bringing a buffer out of the L2ARC. However, the
1932 * L1 hdr doesn't always exist when we change state to arc_anon before
1933 * destroying a header, in which case reallocating to add the L1 hdr is
1934 * pointless.
1935 */
1936 if (HDR_HAS_L1HDR(hdr)) {
1937 old_state = hdr->b_l1hdr.b_state;
1938 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt);
1939 datacnt = hdr->b_l1hdr.b_datacnt;
1940 } else {
1941 old_state = arc_l2c_only;
1942 refcnt = 0;
1943 datacnt = 0;
1944 }
1945
1946 ASSERT(MUTEX_HELD(hash_lock));
1947 ASSERT3P(new_state, !=, old_state);
1948 ASSERT(refcnt == 0 || datacnt > 0);
1949 ASSERT(!GHOST_STATE(new_state) || datacnt == 0);
1950 ASSERT(old_state != arc_anon || datacnt <= 1);
1951
1952 from_delta = to_delta = datacnt * hdr->b_size;
1953
1954 /*
1955 * If this buffer is evictable, transfer it from the
1956 * old state list to the new state list.
1957 */
1958 if (refcnt == 0) {
1959 if (old_state != arc_anon && old_state != arc_l2c_only) {
1960 uint64_t *size = &old_state->arcs_lsize[buftype];
1961
1962 ASSERT(HDR_HAS_L1HDR(hdr));
1963 multilist_remove(&old_state->arcs_list[buftype], hdr);
1964
1965 /*
1966 * If prefetching out of the ghost cache,
1967 * we will have a non-zero datacnt.
1968 */
1969 if (GHOST_STATE(old_state) && datacnt == 0) {
1970 /* ghost elements have a ghost size */
1971 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1972 from_delta = hdr->b_size;
1973 }
1974 ASSERT3U(*size, >=, from_delta);
1975 atomic_add_64(size, -from_delta);
1976 }
1977 if (new_state != arc_anon && new_state != arc_l2c_only) {
1978 uint64_t *size = &new_state->arcs_lsize[buftype];
1979
1980 /*
1981 * An L1 header always exists here, since if we're
1982 * moving to some L1-cached state (i.e. not l2c_only or
1983 * anonymous), we realloc the header to add an L1hdr
1984 * beforehand.
1985 */
1986 ASSERT(HDR_HAS_L1HDR(hdr));
1987 multilist_insert(&new_state->arcs_list[buftype], hdr);
1988
1989 /* ghost elements have a ghost size */
1990 if (GHOST_STATE(new_state)) {
1991 ASSERT0(datacnt);
1992 ASSERT(hdr->b_l1hdr.b_buf == NULL);
1993 to_delta = hdr->b_size;
1994 }
1995 atomic_add_64(size, to_delta);
1996 }
1997 }
1998
1999 ASSERT(!BUF_EMPTY(hdr));
2000 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr))
2001 buf_hash_remove(hdr);
2002
2003 /* adjust state sizes (ignore arc_l2c_only) */
2004
2005 if (to_delta && new_state != arc_l2c_only) {
2006 ASSERT(HDR_HAS_L1HDR(hdr));
2007 if (GHOST_STATE(new_state)) {
2008 ASSERT0(datacnt);
2009
2010 /*
2011 * When moving a header to a ghost state, we first
2012 * remove all arc buffers. Thus, we'll have a
2013 * datacnt of zero, and no arc buffer to use for
2014 * the reference. As a result, we use the arc
2015 * header pointer for the reference.
2016 */
2017 (void) refcount_add_many(&new_state->arcs_size,
2018 hdr->b_size, hdr);
2019 } else {
2020 ASSERT3U(datacnt, !=, 0);
2021
2022 /*
2023 * Each individual buffer holds a unique reference,
2024 * thus we must remove each of these references one
2025 * at a time.
2026 */
2027 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2028 buf = buf->b_next) {
2029 (void) refcount_add_many(&new_state->arcs_size,
2030 hdr->b_size, buf);
2031 }
2032 }
2033 }
2034
2035 if (from_delta && old_state != arc_l2c_only) {
2036 ASSERT(HDR_HAS_L1HDR(hdr));
2037 if (GHOST_STATE(old_state)) {
2038 /*
2039 * When moving a header off of a ghost state,
2040 * there's the possibility for datacnt to be
2041 * non-zero. This is because we first add the
2042 * arc buffer to the header prior to changing
2043 * the header's state. Since we used the header
2044 * for the reference when putting the header on
2045 * the ghost state, we must balance that and use
2046 * the header when removing it from the ghost state
2047 * (even though datacnt is non-zero).
2048 */
2049
2050 IMPLY(datacnt == 0, new_state == arc_anon ||
2051 new_state == arc_l2c_only);
2052
2053 (void) refcount_remove_many(&old_state->arcs_size,
2054 hdr->b_size, hdr);
2055 } else {
2056 ASSERT3P(datacnt, !=, 0);
2057
2058 /*
2059 * Each individual buffer holds a unique reference,
2060 * thus we must remove each of these references one
2061 * at a time.
2062 */
2063 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
2064 buf = buf->b_next) {
2065 (void) refcount_remove_many(
2066 &old_state->arcs_size, hdr->b_size, buf);
2067 }
2068 }
2069 }
2070
2071 if (HDR_HAS_L1HDR(hdr))
2072 hdr->b_l1hdr.b_state = new_state;
2073
2074 /*
2075 * L2 headers should never be on the L2 state list since they don't
2076 * have L1 headers allocated.
2077 */
2078 ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) &&
2079 multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]));
2080 }
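
/*
 * Summary (for illustration only) of the arcs_size tag convention used by
 * arc_change_state() above:
 *
 *	non-ghost target state:	one refcount_add_many() per arc_buf_t,
 *				tagged with the buf pointer
 *	ghost target state:	a single refcount_add_many(), tagged with
 *				the header pointer (there are no bufs)
 *
 * The removal side must mirror whichever tag was used when the header
 * entered its old state, which is why both branches appear twice above.
 */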
2081
2082 void
2083 arc_space_consume(uint64_t space, arc_space_type_t type)
2084 {
2085 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2086
2087 switch (type) {
2088 case ARC_SPACE_DATA:
2089 ARCSTAT_INCR(arcstat_data_size, space);
2090 break;
2091 case ARC_SPACE_META:
2092 ARCSTAT_INCR(arcstat_metadata_size, space);
2093 break;
2094 case ARC_SPACE_OTHER:
2095 ARCSTAT_INCR(arcstat_other_size, space);
2096 break;
2097 case ARC_SPACE_HDRS:
2098 ARCSTAT_INCR(arcstat_hdr_size, space);
2099 break;
2100 case ARC_SPACE_L2HDRS:
2101 ARCSTAT_INCR(arcstat_l2_hdr_size, space);
2102 break;
2103 }
2104
2105 if (type != ARC_SPACE_DATA)
2106 ARCSTAT_INCR(arcstat_meta_used, space);
2107
2108 atomic_add_64(&arc_size, space);
2109 }
2110
2111 void
2112 arc_space_return(uint64_t space, arc_space_type_t type)
2113 {
2114 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES);
2115
2116 switch (type) {
2117 case ARC_SPACE_DATA:
2118 ARCSTAT_INCR(arcstat_data_size, -space);
2119 break;
2120 case ARC_SPACE_META:
2121 ARCSTAT_INCR(arcstat_metadata_size, -space);
2122 break;
2123 case ARC_SPACE_OTHER:
2124 ARCSTAT_INCR(arcstat_other_size, -space);
2125 break;
2126 case ARC_SPACE_HDRS:
2127 ARCSTAT_INCR(arcstat_hdr_size, -space);
2128 break;
2129 case ARC_SPACE_L2HDRS:
2130 ARCSTAT_INCR(arcstat_l2_hdr_size, -space);
2131 break;
2132 }
2133
2134 if (type != ARC_SPACE_DATA) {
2135 ASSERT(arc_meta_used >= space);
2136 if (arc_meta_max < arc_meta_used)
2137 arc_meta_max = arc_meta_used;
2138 ARCSTAT_INCR(arcstat_meta_used, -space);
2139 }
2140
2141 ASSERT(arc_size >= space);
2142 atomic_add_64(&arc_size, -space);
2143 }
2144
2145 arc_buf_t *
2146 arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type)
2147 {
2148 arc_buf_hdr_t *hdr;
2149 arc_buf_t *buf;
2150
2151 ASSERT3U(size, >, 0);
2152 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
2153 ASSERT(BUF_EMPTY(hdr));
2154 ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
2155 hdr->b_size = size;
2156 hdr->b_spa = spa_load_guid(spa);
2157
2158 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2159 buf->b_hdr = hdr;
2160 buf->b_data = NULL;
2161 buf->b_efunc = NULL;
2162 buf->b_private = NULL;
2163 buf->b_next = NULL;
2164
2165 hdr->b_flags = arc_bufc_to_flags(type);
2166 hdr->b_flags |= ARC_FLAG_HAS_L1HDR;
2167
2168 hdr->b_l1hdr.b_buf = buf;
2169 hdr->b_l1hdr.b_state = arc_anon;
2170 hdr->b_l1hdr.b_arc_access = 0;
2171 hdr->b_l1hdr.b_datacnt = 1;
2172 hdr->b_l1hdr.b_tmp_cdata = NULL;
2173
2174 arc_get_data_buf(buf);
2175 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2176 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2177
2178 return (buf);
2179 }
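
/*
 * A minimal usage sketch for arc_buf_alloc() (a fragment, not part of the
 * build). The tag follows the same convention as arc_onloan_tag below;
 * "example" and the 4096-byte size are hypothetical.
 */
#if 0
	static char *example_tag = "example";
	arc_buf_t *buf;

	buf = arc_buf_alloc(spa, 4096, example_tag, ARC_BUFC_DATA);
	/* ... fill buf->b_data; the buffer is anonymous, held by the tag ... */
	arc_buf_free(buf, example_tag);		/* drop the tag and destroy it */
#endif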
2180
2181 /*
2182 * Allocates an ARC buf header that's in an evicted & L2-cached state.
2183 * This is used during l2arc reconstruction to make empty ARC buffers
2184 * which circumvent the regular disk->arc->l2arc path and instead come
2185 * into being in the reverse order, i.e. l2arc->arc.
2186 */
2187 arc_buf_hdr_t *
2188 arc_buf_alloc_l2only(uint64_t load_guid, int size, arc_buf_contents_t type,
2189 l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t asize, uint64_t birth,
2190 zio_cksum_t cksum, enum zio_compress compress)
2191 {
2192 arc_buf_hdr_t *hdr;
2193
2194 ASSERT3U(size, >, 0);
2195 hdr = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
2196 ASSERT(BUF_EMPTY(hdr));
2197 ASSERT3P(hdr->b_freeze_cksum, ==, NULL);
2198 hdr->b_dva = dva;
2199 hdr->b_birth = birth;
2200 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
2201 bcopy(&cksum, hdr->b_freeze_cksum, sizeof (cksum));
2202 hdr->b_flags = arc_bufc_to_flags(type);
2203 hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
2204 hdr->b_size = size;
2205 hdr->b_spa = load_guid;
2206
2207 hdr->b_l2hdr.b_compress = compress;
2208 hdr->b_l2hdr.b_dev = dev;
2209 hdr->b_l2hdr.b_daddr = daddr;
2210 hdr->b_l2hdr.b_asize = asize;
2211
2212 return (hdr);
2213 }
2214
2215 static char *arc_onloan_tag = "onloan";
2216
2217 /*
2218 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
2219 * flight data by arc_tempreserve_space() until they are "returned". Loaned
2220 * buffers must be returned to the arc before they can be used by the DMU or
2221 * freed.
2222 */
2223 arc_buf_t *
2224 arc_loan_buf(spa_t *spa, int size)
2225 {
2226 arc_buf_t *buf;
2227
2228 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA);
2229
2230 atomic_add_64(&arc_loaned_bytes, size);
2231 return (buf);
2232 }
2233
2234 /*
2235 * Return a loaned arc buffer to the arc.
2236 */
2237 void
2238 arc_return_buf(arc_buf_t *buf, void *tag)
2239 {
2240 arc_buf_hdr_t *hdr = buf->b_hdr;
2241
2242 ASSERT(buf->b_data != NULL);
2243 ASSERT(HDR_HAS_L1HDR(hdr));
2244 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
2245 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2246
2247 atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
2248 }
2249
2250 /* Detach an arc_buf from a dbuf (tag) */
2251 void
2252 arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
2253 {
2254 arc_buf_hdr_t *hdr = buf->b_hdr;
2255
2256 ASSERT(buf->b_data != NULL);
2257 ASSERT(HDR_HAS_L1HDR(hdr));
2258 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
2259 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
2260 buf->b_efunc = NULL;
2261 buf->b_private = NULL;
2262
2263 atomic_add_64(&arc_loaned_bytes, hdr->b_size);
2264 }
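
/*
 * Sketch of the loan lifecycle implemented by the three routines above
 * (a fragment, not part of the build): loaned buffers are excluded from
 * the in-flight accounting done by arc_tempreserve_space() until they are
 * returned. 'spa' and 'dbuf_tag' are hypothetical and stand in for the
 * pool and the dbuf taking ownership; arc_loan_inuse_buf() performs the
 * opposite tag swap when a dbuf detaches its buffer back onto loan.
 */
#if 0
	arc_buf_t *buf = arc_loan_buf(spa, 8192);	/* held by arc_onloan_tag */
	/* ... fill buf->b_data and hand it to the DMU ... */
	arc_return_buf(buf, dbuf_tag);			/* swap onloan tag for dbuf_tag */
#endif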
2265
2266 static arc_buf_t *
2267 arc_buf_clone(arc_buf_t *from)
2268 {
2269 arc_buf_t *buf;
2270 arc_buf_hdr_t *hdr = from->b_hdr;
2271 uint64_t size = hdr->b_size;
2272
2273 ASSERT(HDR_HAS_L1HDR(hdr));
2274 ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2275
2276 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
2277 buf->b_hdr = hdr;
2278 buf->b_data = NULL;
2279 buf->b_efunc = NULL;
2280 buf->b_private = NULL;
2281 buf->b_next = hdr->b_l1hdr.b_buf;
2282 hdr->b_l1hdr.b_buf = buf;
2283 arc_get_data_buf(buf);
2284 bcopy(from->b_data, buf->b_data, size);
2285
2286 /*
2287 * This buffer already exists in the arc so create a duplicate
2288 * copy for the caller. If the buffer is associated with user data
2289 * then track the size and number of duplicates. These stats will be
2290 * updated as duplicate buffers are created and destroyed.
2291 */
2292 if (HDR_ISTYPE_DATA(hdr)) {
2293 ARCSTAT_BUMP(arcstat_duplicate_buffers);
2294 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
2295 }
2296 hdr->b_l1hdr.b_datacnt += 1;
2297 return (buf);
2298 }
2299
2300 void
2301 arc_buf_add_ref(arc_buf_t *buf, void* tag)
2302 {
2303 arc_buf_hdr_t *hdr;
2304 kmutex_t *hash_lock;
2305
2306 /*
2307 * Check to see if this buffer is evicted. Callers
2308 * must verify b_data != NULL to know if the add_ref
2309 * was successful.
2310 */
2311 mutex_enter(&buf->b_evict_lock);
2312 if (buf->b_data == NULL) {
2313 mutex_exit(&buf->b_evict_lock);
2314 return;
2315 }
2316 hash_lock = HDR_LOCK(buf->b_hdr);
2317 mutex_enter(hash_lock);
2318 hdr = buf->b_hdr;
2319 ASSERT(HDR_HAS_L1HDR(hdr));
2320 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2321 mutex_exit(&buf->b_evict_lock);
2322
2323 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
2324 hdr->b_l1hdr.b_state == arc_mfu);
2325
2326 add_reference(hdr, hash_lock, tag);
2327 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
2328 arc_access(hdr, hash_lock);
2329 mutex_exit(hash_lock);
2330 ARCSTAT_BUMP(arcstat_hits);
2331 arc_update_hit_stat(hdr, B_TRUE);
2332 }
2333
2334 static void
2335 arc_buf_free_on_write(void *data, size_t size,
2336 void (*free_func)(void *, size_t))
2337 {
2338 l2arc_data_free_t *df;
2339
2340 df = kmem_alloc(sizeof (*df), KM_SLEEP);
2341 df->l2df_data = data;
2342 df->l2df_size = size;
2343 df->l2df_func = free_func;
2344 mutex_enter(&l2arc_free_on_write_mtx);
2345 list_insert_head(l2arc_free_on_write, df);
2346 mutex_exit(&l2arc_free_on_write_mtx);
2347 }
2348
2349 /*
2350 * Free the arc data buffer. If it is an l2arc write in progress,
2351 * the buffer is placed on l2arc_free_on_write to be freed later.
2352 */
2353 static void
2354 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
2355 {
2356 arc_buf_hdr_t *hdr = buf->b_hdr;
2357
2358 if (HDR_L2_WRITING(hdr)) {
2359 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
2360 ARCSTAT_BUMP(arcstat_l2_free_on_write);
2361 } else {
2362 free_func(buf->b_data, hdr->b_size);
2363 }
2364 }
2365
2366 static void
2367 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
2368 {
2369 ASSERT(HDR_HAS_L2HDR(hdr));
2370 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));
2371
2372 /*
2373 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
2374 * that doesn't exist, the header is in the arc_l2c_only state,
2375 * and there isn't anything to free (it's already been freed).
2376 */
2377 if (!HDR_HAS_L1HDR(hdr))
2378 return;
2379
2380 /*
2381 * The header isn't being written to the l2arc device, thus it
2382 * shouldn't have a b_tmp_cdata to free.
2383 */
2384 if (!HDR_L2_WRITING(hdr)) {
2385 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2386 return;
2387 }
2388
2389 /*
2390 * The header does not have compression enabled. This can be due
2391 * to the buffer not being compressible, or because we're
2392 * freeing the buffer before the second phase of
2393 * l2arc_write_buffers() has started (which does the compression
2394 * step). In either case, b_tmp_cdata does not point to a
2395 * separately compressed buffer, so there's nothing to free (it
2396 * points to the same buffer as the arc_buf_t's b_data field).
2397 */
2398 if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
2399 hdr->b_l1hdr.b_tmp_cdata = NULL;
2400 return;
2401 }
2402
2403 /*
2404 * There's nothing to free since the buffer was all zeros and
2405 * compressed to a zero length buffer.
2406 */
2407 if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
2408 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
2409 return;
2410 }
2411
2412 ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));
2413
2414 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
2415 hdr->b_size, zio_data_buf_free);
2416
2417 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
2418 hdr->b_l1hdr.b_tmp_cdata = NULL;
2419 }
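
/*
 * Recap (illustration only) of the b_tmp_cdata cases handled above:
 *
 *	no L1 header		-> nothing to free (arc_l2c_only)
 *	!HDR_L2_WRITING		-> must already be NULL
 *	ZIO_COMPRESS_OFF	-> aliases b_data, just clear the pointer
 *	ZIO_COMPRESS_EMPTY	-> zero-length, must already be NULL
 *	otherwise		-> free the separate compressed buffer,
 *				   possibly deferred via free-on-write
 */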
2420
2421 /*
2422 * Free up buf->b_data and if 'remove' is set, then pull the
2423 * arc_buf_t off of the arc_buf_hdr_t's list and free it.
2424 */
2425 static void
2426 arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
2427 {
2428 arc_buf_t **bufp;
2429
2430 /* free up data associated with the buf */
2431 if (buf->b_data != NULL) {
2432 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
2433 uint64_t size = buf->b_hdr->b_size;
2434 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
2435
2436 arc_cksum_verify(buf);
2437 arc_buf_unwatch(buf);
2438
2439 if (type == ARC_BUFC_METADATA) {
2440 arc_buf_data_free(buf, zio_buf_free);
2441 arc_space_return(size, ARC_SPACE_META);
2442 } else {
2443 ASSERT(type == ARC_BUFC_DATA);
2444 arc_buf_data_free(buf, zio_data_buf_free);
2445 arc_space_return(size, ARC_SPACE_DATA);
2446 }
2447
2448 /* protected by hash lock, if in the hash table */
2449 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
2450 uint64_t *cnt = &state->arcs_lsize[type];
2451
2452 ASSERT(refcount_is_zero(
2453 &buf->b_hdr->b_l1hdr.b_refcnt));
2454 ASSERT(state != arc_anon && state != arc_l2c_only);
2455
2456 ASSERT3U(*cnt, >=, size);
2457 atomic_add_64(cnt, -size);
2458 }
2459
2460 (void) refcount_remove_many(&state->arcs_size, size, buf);
2461 buf->b_data = NULL;
2462
2463 /*
2464 * If we're destroying a duplicate buffer make sure
2465 * that the appropriate statistics are updated.
2466 */
2467 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
2468 HDR_ISTYPE_DATA(buf->b_hdr)) {
2469 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
2470 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
2471 }
2472 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
2473 buf->b_hdr->b_l1hdr.b_datacnt -= 1;
2474 }
2475
2476 /* only remove the buf if requested */
2477 if (!remove)
2478 return;
2479
2480 /* remove the buf from the hdr list */
2481 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
2482 bufp = &(*bufp)->b_next)
2483 continue;
2484 *bufp = buf->b_next;
2485 buf->b_next = NULL;
2486
2487 ASSERT(buf->b_efunc == NULL);
2488
2489 /* clean up the buf */
2490 buf->b_hdr = NULL;
2491 kmem_cache_free(buf_cache, buf);
2492 }
2493
2494 static void
2495 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
2496 {
2497 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
2498 l2arc_dev_t *dev = l2hdr->b_dev;
2499
2500 ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
2501 ASSERT(HDR_HAS_L2HDR(hdr));
2502
2503 list_remove(&dev->l2ad_buflist, hdr);
2504
2505 /*
2506 * We don't want to leak the b_tmp_cdata buffer that was
2507 * allocated in l2arc_write_buffers()
2508 */
2509 arc_buf_l2_cdata_free(hdr);
2510
2511 /*
2512 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
2513 * this header is being processed by l2arc_write_buffers() (i.e.
2514 * it's in the first stage of l2arc_write_buffers()).
2515 * Re-affirming that truth here, just to serve as a reminder. If
2516 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
2517 * may not have its HDR_L2_WRITING flag set. (the write may have
2518 * completed, in which case HDR_L2_WRITING will be false and the
2519 * b_daddr field will point to the address of the buffer on disk).
2520 */
2521 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));
2522
2523 /*
2524 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
2525 * l2arc_write_buffers(). Since we've just removed this header
2526 * from the l2arc buffer list, this header will never reach the
2527 * second stage of l2arc_write_buffers(), which increments the
2528 * accounting stats for this header. Thus, we must be careful
2529 * not to decrement them for this header either.
2530 */
2531 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
2532 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
2533 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
2534
2535 vdev_space_update(dev->l2ad_vdev,
2536 -l2hdr->b_asize, 0, 0);
2537
2538 (void) refcount_remove_many(&dev->l2ad_alloc,
2539 l2hdr->b_asize, hdr);
2540 }
2541
2542 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
2543 }
2544
2545 static void
2546 arc_hdr_destroy(arc_buf_hdr_t *hdr)
2547 {
2548 if (HDR_HAS_L1HDR(hdr)) {
2549 ASSERT(hdr->b_l1hdr.b_buf == NULL ||
2550 hdr->b_l1hdr.b_datacnt > 0);
2551 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2552 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
2553 }
2554 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2555 ASSERT(!HDR_IN_HASH_TABLE(hdr));
2556
2557 if (HDR_HAS_L2HDR(hdr)) {
2558 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
2559 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
2560
2561 if (!buflist_held)
2562 mutex_enter(&dev->l2ad_mtx);
2563
2564 /*
2565 * Even though we checked this conditional above, we
2566 * need to check this again now that we have the
2567 * l2ad_mtx. This is because we could be racing with
2568 * another thread calling l2arc_evict() which might have
2569 * destroyed this header's L2 portion as we were waiting
2570 * to acquire the l2ad_mtx. If that happens, we don't
2571 * want to re-destroy the header's L2 portion.
2572 */
2573 if (HDR_HAS_L2HDR(hdr))
2574 arc_hdr_l2hdr_destroy(hdr);
2575
2576 if (!buflist_held)
2577 mutex_exit(&dev->l2ad_mtx);
2578 }
2579
2580 if (!BUF_EMPTY(hdr))
2581 buf_discard_identity(hdr);
2582
2583 if (hdr->b_freeze_cksum != NULL) {
2584 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
2585 hdr->b_freeze_cksum = NULL;
2586 }
2587
2588 if (HDR_HAS_L1HDR(hdr)) {
2589 while (hdr->b_l1hdr.b_buf) {
2590 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2591
2592 if (buf->b_efunc != NULL) {
2593 mutex_enter(&arc_user_evicts_lock);
2594 mutex_enter(&buf->b_evict_lock);
2595 ASSERT(buf->b_hdr != NULL);
2596 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
2597 hdr->b_l1hdr.b_buf = buf->b_next;
2598 buf->b_hdr = &arc_eviction_hdr;
2599 buf->b_next = arc_eviction_list;
2600 arc_eviction_list = buf;
2601 mutex_exit(&buf->b_evict_lock);
2602 cv_signal(&arc_user_evicts_cv);
2603 mutex_exit(&arc_user_evicts_lock);
2604 } else {
2605 arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
2606 }
2607 }
2608 #ifdef ZFS_DEBUG
2609 if (hdr->b_l1hdr.b_thawed != NULL) {
2610 kmem_free(hdr->b_l1hdr.b_thawed, 1);
2611 hdr->b_l1hdr.b_thawed = NULL;
2612 }
2613 #endif
2614 }
2615
2616 ASSERT3P(hdr->b_hash_next, ==, NULL);
2617 if (HDR_HAS_L1HDR(hdr)) {
2618 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
2619 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
2620 kmem_cache_free(hdr_full_cache, hdr);
2621 } else {
2622 kmem_cache_free(hdr_l2only_cache, hdr);
2623 }
2624 }
2625
2626 void
2627 arc_buf_free(arc_buf_t *buf, void *tag)
2628 {
2629 arc_buf_hdr_t *hdr = buf->b_hdr;
2630 int hashed = hdr->b_l1hdr.b_state != arc_anon;
2631
2632 ASSERT(buf->b_efunc == NULL);
2633 ASSERT(buf->b_data != NULL);
2634
2635 if (hashed) {
2636 kmutex_t *hash_lock = HDR_LOCK(hdr);
2637
2638 mutex_enter(hash_lock);
2639 hdr = buf->b_hdr;
2640 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2641
2642 (void) remove_reference(hdr, hash_lock, tag);
2643 if (hdr->b_l1hdr.b_datacnt > 1) {
2644 arc_buf_destroy(buf, TRUE);
2645 } else {
2646 ASSERT(buf == hdr->b_l1hdr.b_buf);
2647 ASSERT(buf->b_efunc == NULL);
2648 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2649 }
2650 mutex_exit(hash_lock);
2651 } else if (HDR_IO_IN_PROGRESS(hdr)) {
2652 int destroy_hdr;
2653 /*
2654 * We are in the middle of an async write. Don't destroy
2655 * this buffer unless the write completes before we finish
2656 * decrementing the reference count.
2657 */
2658 mutex_enter(&arc_user_evicts_lock);
2659 (void) remove_reference(hdr, NULL, tag);
2660 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2661 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
2662 mutex_exit(&arc_user_evicts_lock);
2663 if (destroy_hdr)
2664 arc_hdr_destroy(hdr);
2665 } else {
2666 if (remove_reference(hdr, NULL, tag) > 0)
2667 arc_buf_destroy(buf, TRUE);
2668 else
2669 arc_hdr_destroy(hdr);
2670 }
2671 }
2672
2673 boolean_t
2674 arc_buf_remove_ref(arc_buf_t *buf, void* tag)
2675 {
2676 arc_buf_hdr_t *hdr = buf->b_hdr;
2677 kmutex_t *hash_lock = HDR_LOCK(hdr);
2678 boolean_t no_callback = (buf->b_efunc == NULL);
2679
2680 if (hdr->b_l1hdr.b_state == arc_anon) {
2681 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
2682 arc_buf_free(buf, tag);
2683 return (no_callback);
2684 }
2685
2686 mutex_enter(hash_lock);
2687 hdr = buf->b_hdr;
2688 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
2689 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
2690 ASSERT(hdr->b_l1hdr.b_state != arc_anon);
2691 ASSERT(buf->b_data != NULL);
2692
2693 (void) remove_reference(hdr, hash_lock, tag);
2694 if (hdr->b_l1hdr.b_datacnt > 1) {
2695 if (no_callback)
2696 arc_buf_destroy(buf, TRUE);
2697 } else if (no_callback) {
2698 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
2699 ASSERT(buf->b_efunc == NULL);
2700 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
2701 }
2702 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
2703 refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
2704 mutex_exit(hash_lock);
2705 return (no_callback);
2706 }
2707
2708 int32_t
2709 arc_buf_size(arc_buf_t *buf)
2710 {
2711 return (buf->b_hdr->b_size);
2712 }
2713
2714 /*
2715 * Called from the DMU to determine if the current buffer should be
2716 * evicted. In order to ensure proper locking, the eviction must be initiated
2717 * from the DMU. Return true if the buffer is associated with user data and
2718 * duplicate buffers still exist.
2719 */
2720 boolean_t
2721 arc_buf_eviction_needed(arc_buf_t *buf)
2722 {
2723 arc_buf_hdr_t *hdr;
2724 boolean_t evict_needed = B_FALSE;
2725
2726 if (zfs_disable_dup_eviction)
2727 return (B_FALSE);
2728
2729 mutex_enter(&buf->b_evict_lock);
2730 hdr = buf->b_hdr;
2731 if (hdr == NULL) {
2732 /*
2733 * We are in arc_do_user_evicts(); let that function
2734 * perform the eviction.
2735 */
2736 ASSERT(buf->b_data == NULL);
2737 mutex_exit(&buf->b_evict_lock);
2738 return (B_FALSE);
2739 } else if (buf->b_data == NULL) {
2740 /*
2741 * We have already been added to the arc eviction list;
2742 * recommend eviction.
2743 */
2744 ASSERT3P(hdr, ==, &arc_eviction_hdr);
2745 mutex_exit(&buf->b_evict_lock);
2746 return (B_TRUE);
2747 }
2748
2749 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
2750 evict_needed = B_TRUE;
2751
2752 mutex_exit(&buf->b_evict_lock);
2753 return (evict_needed);
2754 }
2755
2756 /*
2757 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
2758 * state of the header is dependent on its state prior to entering this
2759 * function. The following transitions are possible:
2760 *
2761 * - arc_mru -> arc_mru_ghost
2762 * - arc_mfu -> arc_mfu_ghost
2763 * - arc_mru_ghost -> arc_l2c_only
2764 * - arc_mru_ghost -> deleted
2765 * - arc_mfu_ghost -> arc_l2c_only
2766 * - arc_mfu_ghost -> deleted
2767 */
2768 static int64_t
2769 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
2770 {
2771 arc_state_t *evicted_state, *state;
2772 int64_t bytes_evicted = 0;
2773
2774 ASSERT(MUTEX_HELD(hash_lock));
2775 ASSERT(HDR_HAS_L1HDR(hdr));
2776
2777 state = hdr->b_l1hdr.b_state;
2778 if (GHOST_STATE(state)) {
2779 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
2780 ASSERT(hdr->b_l1hdr.b_buf == NULL);
2781
2782 /*
2783 * l2arc_write_buffers() relies on a header's L1 portion
2784 * (i.e. its b_tmp_cdata field) during its write phase.
2785 * Thus, we cannot push a header onto the arc_l2c_only
2786 * state (removing its L1 piece) until the header is
2787 * done being written to the l2arc.
2788 */
2789 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
2790 ARCSTAT_BUMP(arcstat_evict_l2_skip);
2791 return (bytes_evicted);
2792 }
2793
2794 ARCSTAT_BUMP(arcstat_deleted);
2795 bytes_evicted += hdr->b_size;
2796
2797 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
2798
2799 if (HDR_HAS_L2HDR(hdr)) {
2800 /*
2801 * This buffer is cached on the 2nd Level ARC;
2802 * don't destroy the header.
2803 */
2804 arc_change_state(arc_l2c_only, hdr, hash_lock);
2805 /*
2806 * dropping from L1+L2 cached to L2-only,
2807 * realloc to remove the L1 header.
2808 */
2809 hdr = arc_hdr_realloc(hdr, hdr_full_cache,
2810 hdr_l2only_cache);
2811 } else {
2812 arc_change_state(arc_anon, hdr, hash_lock);
2813 arc_hdr_destroy(hdr);
2814 }
2815 return (bytes_evicted);
2816 }
2817
2818 ASSERT(state == arc_mru || state == arc_mfu);
2819 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2820
2821 /* prefetch buffers have a minimum lifespan */
2822 if (HDR_IO_IN_PROGRESS(hdr) ||
2823 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
2824 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
2825 arc_min_prefetch_lifespan)) {
2826 ARCSTAT_BUMP(arcstat_evict_skip);
2827 return (bytes_evicted);
2828 }
2829
2830 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
2831 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
2832 while (hdr->b_l1hdr.b_buf) {
2833 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
2834 if (!mutex_tryenter(&buf->b_evict_lock)) {
2835 ARCSTAT_BUMP(arcstat_mutex_miss);
2836 break;
2837 }
2838 if (buf->b_data != NULL)
2839 bytes_evicted += hdr->b_size;
2840 if (buf->b_efunc != NULL) {
2841 mutex_enter(&arc_user_evicts_lock);
2842 arc_buf_destroy(buf, FALSE);
2843 hdr->b_l1hdr.b_buf = buf->b_next;
2844 buf->b_hdr = &arc_eviction_hdr;
2845 buf->b_next = arc_eviction_list;
2846 arc_eviction_list = buf;
2847 cv_signal(&arc_user_evicts_cv);
2848 mutex_exit(&arc_user_evicts_lock);
2849 mutex_exit(&buf->b_evict_lock);
2850 } else {
2851 mutex_exit(&buf->b_evict_lock);
2852 arc_buf_destroy(buf, TRUE);
2853 }
2854 }
2855
2856 if (HDR_HAS_L2HDR(hdr)) {
2857 ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
2858 } else {
2859 if (l2arc_write_eligible(hdr->b_spa, UINT64_MAX, hdr))
2860 ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
2861 else
2862 ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
2863 }
2864
2865 if (hdr->b_l1hdr.b_datacnt == 0) {
2866 arc_change_state(evicted_state, hdr, hash_lock);
2867 ASSERT(HDR_IN_HASH_TABLE(hdr));
2868 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
2869 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
2870 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
2871 }
2872
2873 return (bytes_evicted);
2874 }
2875
2876 static uint64_t
2877 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
2878 uint64_t spa, int64_t bytes)
2879 {
2880 multilist_sublist_t *mls;
2881 uint64_t bytes_evicted = 0;
2882 arc_buf_hdr_t *hdr;
2883 kmutex_t *hash_lock;
2884 int evict_count = 0;
2885
2886 ASSERT3P(marker, !=, NULL);
2887 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
2888
2889 mls = multilist_sublist_lock(ml, idx);
2890
2891 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
2892 hdr = multilist_sublist_prev(mls, marker)) {
2893 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
2894 (evict_count >= zfs_arc_evict_batch_limit))
2895 break;
2896
2897 /*
2898 * To keep our iteration location, move the marker
2899 * forward. Since we're not holding hdr's hash lock, we
2900 * must be very careful and not remove 'hdr' from the
2901 * sublist. Otherwise, other consumers might mistake the
2902 * 'hdr' as not being on a sublist when they call the
2903 * multilist_link_active() function (they all rely on
2904 * the hash lock protecting concurrent insertions and
2905 * removals). multilist_sublist_move_forward() was
2906 * specifically implemented to ensure this is the case
2907 * (only 'marker' will be removed and re-inserted).
2908 */
2909 multilist_sublist_move_forward(mls, marker);
2910
2911 /*
2912 * The only case where the b_spa field should ever be
2913 * zero is for the marker headers inserted by
2914 * arc_evict_state(). It's possible for multiple threads
2915 * to be calling arc_evict_state() concurrently (e.g.
2916 * dsl_pool_close() and zio_inject_fault()), so we must
2917 * skip any markers we see from these other threads.
2918 */
2919 if (hdr->b_spa == 0)
2920 continue;
2921
2922 /* we're only interested in evicting buffers of a certain spa */
2923 if (spa != 0 && hdr->b_spa != spa) {
2924 ARCSTAT_BUMP(arcstat_evict_skip);
2925 continue;
2926 }
2927
2928 hash_lock = HDR_LOCK(hdr);
2929
2930 /*
2931 * We aren't calling this function from any code path
2932 * that would already be holding a hash lock, so we're
2933 * asserting on this assumption to be defensive in case
2934 * this ever changes. Without this check, it would be
2935 * possible to incorrectly increment arcstat_mutex_miss
2936 * below (e.g. if the code changed such that we called
2937 * this function with a hash lock held).
2938 */
2939 ASSERT(!MUTEX_HELD(hash_lock));
2940
2941 if (mutex_tryenter(hash_lock)) {
2942 uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
2943 mutex_exit(hash_lock);
2944
2945 bytes_evicted += evicted;
2946
2947 /*
2948 * If evicted is zero, arc_evict_hdr() must have
2949 * decided to skip this header, don't increment
2950 * evict_count in this case.
2951 */
2952 if (evicted != 0)
2953 evict_count++;
2954
2955 /*
2956 * If arc_size isn't overflowing, signal any
2957 * threads that might happen to be waiting.
2958 *
2959 * For each header evicted, we wake up a single
2960 * thread. If we used cv_broadcast, we could
2961 * wake up "too many" threads causing arc_size
2962 * to significantly overflow arc_c; since
2963 * arc_get_data_buf() doesn't check for overflow
2964 * when it's woken up (it doesn't because it's
2965 * possible for the ARC to be overflowing while
2966 * full of un-evictable buffers, and the
2967 * function should proceed in this case).
2968 *
2969 * If threads are left sleeping, due to not
2970 * using cv_broadcast, they will be woken up
2971 * just before arc_reclaim_thread() sleeps.
2972 */
2973 mutex_enter(&arc_reclaim_lock);
2974 if (!arc_is_overflowing())
2975 cv_signal(&arc_reclaim_waiters_cv);
2976 mutex_exit(&arc_reclaim_lock);
2977 } else {
2978 ARCSTAT_BUMP(arcstat_mutex_miss);
2979 }
2980 }
2981
2982 multilist_sublist_unlock(mls);
2983
2984 return (bytes_evicted);
2985 }
2986
2987 /*
2988 * Evict buffers from the given arc state, until we've removed the
2989 * specified number of bytes. Move the removed buffers to the
2990 * appropriate evict state.
2991 *
2992 * This function makes a "best effort". It skips over any buffers
2993 * it can't get a hash_lock on, and so, may not catch all candidates.
2994 * It may also return without evicting as much space as requested.
2995 *
2996 * If bytes is specified using the special value ARC_EVICT_ALL, this
2997 * will evict all available (i.e. unlocked and evictable) buffers from
2998 * the given arc state; which is used by arc_flush().
2999 */
3000 static uint64_t
3001 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes,
3002 arc_buf_contents_t type)
3003 {
3004 uint64_t total_evicted = 0;
3005 multilist_t *ml = &state->arcs_list[type];
3006 int num_sublists;
3007 arc_buf_hdr_t **markers;
3008
3009 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);
3010
3011 num_sublists = multilist_get_num_sublists(ml);
3012
3013 /*
3014 * If we've tried to evict from each sublist, made some
3015 * progress, but still have not hit the target number of bytes
3016 * to evict, we want to keep trying. The markers allow us to
3017 * pick up where we left off for each individual sublist, rather
3018 * than starting from the tail each time.
3019 */
3020 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP);
3021 for (int i = 0; i < num_sublists; i++) {
3022 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
3023
3024 /*
3025 * A b_spa of 0 is used to indicate that this header is
3026 * a marker. This fact is used in arc_adjust_type() and
3027 * arc_evict_state_impl().
3028 */
3029 markers[i]->b_spa = 0;
3030
3031 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3032 multilist_sublist_insert_tail(mls, markers[i]);
3033 multilist_sublist_unlock(mls);
3034 }
3035
3036 /*
3037 * While we haven't hit our target number of bytes to evict, or
3038 * we're evicting all available buffers.
3039 */
3040 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) {
3041 /*
3042 * Start eviction using a randomly selected sublist,
3043 * this is to try and evenly balance eviction across all
3044 * sublists. Always starting at the same sublist
3045 * (e.g. index 0) would cause evictions to favor certain
3046 * sublists over others.
3047 */
3048 int sublist_idx = multilist_get_random_index(ml);
3049 uint64_t scan_evicted = 0;
3050
3051 for (int i = 0; i < num_sublists; i++) {
3052 uint64_t bytes_remaining;
3053 uint64_t bytes_evicted;
3054
3055 if (bytes == ARC_EVICT_ALL)
3056 bytes_remaining = ARC_EVICT_ALL;
3057 else if (total_evicted < bytes)
3058 bytes_remaining = bytes - total_evicted;
3059 else
3060 break;
3061
3062 bytes_evicted = arc_evict_state_impl(ml, sublist_idx,
3063 markers[sublist_idx], spa, bytes_remaining);
3064
3065 scan_evicted += bytes_evicted;
3066 total_evicted += bytes_evicted;
3067
3068 /* we've reached the end, wrap to the beginning */
3069 if (++sublist_idx >= num_sublists)
3070 sublist_idx = 0;
3071 }
3072
3073 /*
3074 * If we didn't evict anything during this scan, we have
3075 * no reason to believe we'll evict more during another
3076 * scan, so break the loop.
3077 */
3078 if (scan_evicted == 0) {
3079 /* This isn't possible, let's make that obvious */
3080 ASSERT3S(bytes, !=, 0);
3081
3082 /*
3083 * When bytes is ARC_EVICT_ALL, the only way to
3084 * break the loop is when scan_evicted is zero.
3085 * In that case, we actually have evicted enough,
3086 * so we don't want to increment the kstat.
3087 */
3088 if (bytes != ARC_EVICT_ALL) {
3089 ASSERT3S(total_evicted, <, bytes);
3090 ARCSTAT_BUMP(arcstat_evict_not_enough);
3091 }
3092
3093 break;
3094 }
3095 }
3096
3097 for (int i = 0; i < num_sublists; i++) {
3098 multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
3099 multilist_sublist_remove(mls, markers[i]);
3100 multilist_sublist_unlock(mls);
3101
3102 kmem_cache_free(hdr_full_cache, markers[i]);
3103 }
3104 kmem_free(markers, sizeof (*markers) * num_sublists);
3105
3106 return (total_evicted);
3107 }
3108
3109 /*
3110 * Flush all "evictable" data of the given type from the arc state
3111 * specified. This will not evict any "active" buffers (i.e. referenced).
3112 *
3113 * When 'retry' is set to FALSE, the function will make a single pass
3114 * over the state and evict any buffers that it can. Since it doesn't
3115 * continually retry the eviction, it might end up leaving some buffers
3116 * in the ARC due to lock misses.
3117 *
3118 * When 'retry' is set to TRUE, the function will continually retry the
3119 * eviction until *all* evictable buffers have been removed from the
3120 * state. As a result, if concurrent insertions into the state are
3121 * allowed (e.g. if the ARC isn't shutting down), this function might
3122 * wind up in an infinite loop, continually trying to evict buffers.
3123 */
3124 static uint64_t
3125 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type,
3126 boolean_t retry)
3127 {
3128 uint64_t evicted = 0;
3129
3130 while (state->arcs_lsize[type] != 0) {
3131 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type);
3132
3133 if (!retry)
3134 break;
3135 }
3136
3137 return (evicted);
3138 }
3139
3140 /*
3141 * Evict the specified number of bytes from the state specified,
3142 * restricting eviction to the spa and type given. This function
3143 * prevents us from trying to evict more from a state's list than
3144 * is "evictable", and skips evicting altogether when passed a
3145 * negative value for "bytes". In contrast, arc_evict_state() will
3146 * evict everything it can, when passed a negative value for "bytes".
3147 */
3148 static uint64_t
3149 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
3150 arc_buf_contents_t type)
3151 {
3152 int64_t delta;
3153
3154 if (bytes > 0 && state->arcs_lsize[type] > 0) {
3155 delta = MIN(state->arcs_lsize[type], bytes);
3156 return (arc_evict_state(state, spa, delta, type));
3157 }
3158
3159 return (0);
3160 }
3161
3162 /*
3163 * Evict metadata buffers from the cache, such that arc_meta_used is
3164 * capped by the arc_meta_limit tunable.
3165 */
3166 static uint64_t
3167 arc_adjust_meta(void)
3168 {
3169 uint64_t total_evicted = 0;
3170 int64_t target;
3171
3172 /*
3173 * If we're over the meta limit, we want to evict enough
3174 * metadata to get back under the meta limit. We don't want to
3175 * evict so much that we drop the MRU below arc_p, though. If
3176 * we're over the meta limit more than we're over arc_p, we
3177 * evict some from the MRU here, and some from the MFU below.
3178 */
3179 target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3180 (int64_t)(refcount_count(&arc_anon->arcs_size) +
3181 refcount_count(&arc_mru->arcs_size) - arc_p));
3182
3183 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3184
3185 /*
3186 * Similar to the above, we want to evict enough bytes to get us
3187 * below the meta limit, but not so much as to drop us below the
3188 * space allotted to the MFU (which is defined as arc_c - arc_p).
3189 */
3190 target = MIN((int64_t)(arc_meta_used - arc_meta_limit),
3191 (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p)));
3192
3193 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3194
3195 return (total_evicted);
3196 }
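
/*
 * Hypothetical worked example of the two targets computed in
 * arc_adjust_meta() above: suppose arc_meta_used exceeds arc_meta_limit by
 * 300 MB while anon + MRU exceed arc_p by only 100 MB. The first target is
 * MIN(300 MB, 100 MB) = 100 MB of MRU metadata, so the MRU is not driven
 * below arc_p. Roughly the remaining 200 MB (recomputed from the updated
 * arc_meta_used) is then requested from MFU metadata, bounded so the MFU
 * is not driven below its allotment of arc_c - arc_p.
 */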
3197
3198 /*
3199 * Return the type of the oldest buffer in the given arc state
3200 *
3201 * This function will select a random sublist of type ARC_BUFC_DATA and
3202 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist
3203 * is compared, and the type which contains the "older" buffer will be
3204 * returned.
3205 */
3206 static arc_buf_contents_t
3207 arc_adjust_type(arc_state_t *state)
3208 {
3209 multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
3210 multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
3211 int data_idx = multilist_get_random_index(data_ml);
3212 int meta_idx = multilist_get_random_index(meta_ml);
3213 multilist_sublist_t *data_mls;
3214 multilist_sublist_t *meta_mls;
3215 arc_buf_contents_t type;
3216 arc_buf_hdr_t *data_hdr;
3217 arc_buf_hdr_t *meta_hdr;
3218
3219 /*
3220 * We keep the sublist lock until we're finished, to prevent
3221 * the headers from being destroyed via arc_evict_state().
3222 */
3223 data_mls = multilist_sublist_lock(data_ml, data_idx);
3224 meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
3225
3226 /*
3227 * These two loops are to ensure we skip any markers that
3228 * might be at the tail of the lists due to arc_evict_state().
3229 */
3230
3231 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
3232 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
3233 if (data_hdr->b_spa != 0)
3234 break;
3235 }
3236
3237 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
3238 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
3239 if (meta_hdr->b_spa != 0)
3240 break;
3241 }
3242
3243 if (data_hdr == NULL && meta_hdr == NULL) {
3244 type = ARC_BUFC_DATA;
3245 } else if (data_hdr == NULL) {
3246 ASSERT3P(meta_hdr, !=, NULL);
3247 type = ARC_BUFC_METADATA;
3248 } else if (meta_hdr == NULL) {
3249 ASSERT3P(data_hdr, !=, NULL);
3250 type = ARC_BUFC_DATA;
3251 } else {
3252 ASSERT3P(data_hdr, !=, NULL);
3253 ASSERT3P(meta_hdr, !=, NULL);
3254
3255 /* The headers can't be on the sublist without an L1 header */
3256 ASSERT(HDR_HAS_L1HDR(data_hdr));
3257 ASSERT(HDR_HAS_L1HDR(meta_hdr));
3258
3259 if (data_hdr->b_l1hdr.b_arc_access <
3260 meta_hdr->b_l1hdr.b_arc_access) {
3261 type = ARC_BUFC_DATA;
3262 } else {
3263 type = ARC_BUFC_METADATA;
3264 }
3265 }
3266
3267 multilist_sublist_unlock(meta_mls);
3268 multilist_sublist_unlock(data_mls);
3269
3270 return (type);
3271 }
3272
3273 /*
3274 * Evict buffers from the cache, such that arc_size is capped by arc_c.
3275 */
3276 static uint64_t
3277 arc_adjust(void)
3278 {
3279 uint64_t total_evicted = 0;
3280 uint64_t bytes;
3281 int64_t target;
3282
3283 /*
3284 * If we're over arc_meta_limit, we want to correct that before
3285 * potentially evicting data buffers below.
3286 */
3287 total_evicted += arc_adjust_meta();
3288
3289 /*
3290 * Adjust MRU size
3291 *
3292 * If we're over the target cache size, we want to evict enough
3293 * from the list to get back to our target size. We don't want
3294 * to evict too much from the MRU, such that it drops below
3295 * arc_p. So, if we're over our target cache size more than
3296 * the MRU is over arc_p, we'll evict enough to get back to
3297 * arc_p here, and then evict more from the MFU below.
3298 */
3299 target = MIN((int64_t)(arc_size - arc_c),
3300 (int64_t)(refcount_count(&arc_anon->arcs_size) +
3301 refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));
3302
3303 /*
3304 * If we're below arc_meta_min, always prefer to evict data.
3305 * Otherwise, try to satisfy the requested number of bytes to
3306 	 * evict from the type which contains older buffers, in an
3307 * effort to keep newer buffers in the cache regardless of their
3308 * type. If we cannot satisfy the number of bytes from this
3309 * type, spill over into the next type.
3310 */
3311 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
3312 arc_meta_used > arc_meta_min) {
3313 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3314 total_evicted += bytes;
3315
3316 /*
3317 * If we couldn't evict our target number of bytes from
3318 * metadata, we try to get the rest from data.
3319 */
3320 target -= bytes;
3321
3322 total_evicted +=
3323 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3324 } else {
3325 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
3326 total_evicted += bytes;
3327
3328 /*
3329 * If we couldn't evict our target number of bytes from
3330 * data, we try to get the rest from metadata.
3331 */
3332 target -= bytes;
3333
3334 total_evicted +=
3335 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
3336 }
3337
3338 /*
3339 * Adjust MFU size
3340 *
3341 * Now that we've tried to evict enough from the MRU to get its
3342 * size back to arc_p, if we're still above the target cache
3343 * size, we evict the rest from the MFU.
3344 */
3345 target = arc_size - arc_c;
3346
3347 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
3348 arc_meta_used > arc_meta_min) {
3349 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3350 total_evicted += bytes;
3351
3352 /*
3353 * If we couldn't evict our target number of bytes from
3354 * metadata, we try to get the rest from data.
3355 */
3356 target -= bytes;
3357
3358 total_evicted +=
3359 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3360 } else {
3361 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
3362 total_evicted += bytes;
3363
3364 /*
3365 * If we couldn't evict our target number of bytes from
3366 		 * data, we try to get the rest from metadata.
3367 */
3368 target -= bytes;
3369
3370 total_evicted +=
3371 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
3372 }
3373
3374 /*
3375 * Adjust ghost lists
3376 *
3377 * In addition to the above, the ARC also defines target values
3378 * for the ghost lists. The sum of the mru list and mru ghost
3379 * list should never exceed the target size of the cache, and
3380 * the sum of the mru list, mfu list, mru ghost list, and mfu
3381 * ghost list should never exceed twice the target size of the
3382 * cache. The following logic enforces these limits on the ghost
3383 * caches, and evicts from them as needed.
3384 */
3385 target = refcount_count(&arc_mru->arcs_size) +
3386 refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
3387
3388 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
3389 total_evicted += bytes;
3390
3391 target -= bytes;
3392
3393 total_evicted +=
3394 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
3395
3396 /*
3397 * We assume the sum of the mru list and mfu list is less than
3398 * or equal to arc_c (we enforced this above), which means we
3399 * can use the simpler of the two equations below:
3400 *
3401 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
3402 * mru ghost + mfu ghost <= arc_c
3403 */
3404 target = refcount_count(&arc_mru_ghost->arcs_size) +
3405 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
3406
3407 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
3408 total_evicted += bytes;
3409
3410 target -= bytes;
3411
3412 total_evicted +=
3413 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
3414
3415 return (total_evicted);
3416 }
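
/*
 * Worked example of the ghost-list invariants enforced above
 * (hypothetical sizes): with arc_c = 1024MB, mru = 700MB and
 * mru_ghost = 500MB, the first ghost target is 700 + 500 - 1024 = 176MB,
 * so roughly 176MB is evicted from the MRU ghost list to keep
 * mru + mru ghost <= arc_c.  The second pass similarly keeps
 * mru ghost + mfu ghost <= arc_c, which together bounds the combined
 * real and ghost footprint at 2 * arc_c.
 */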
3417
3418 static void
3419 arc_do_user_evicts(void)
3420 {
3421 mutex_enter(&arc_user_evicts_lock);
3422 while (arc_eviction_list != NULL) {
3423 arc_buf_t *buf = arc_eviction_list;
3424 arc_eviction_list = buf->b_next;
3425 mutex_enter(&buf->b_evict_lock);
3426 buf->b_hdr = NULL;
3427 mutex_exit(&buf->b_evict_lock);
3428 mutex_exit(&arc_user_evicts_lock);
3429
3430 if (buf->b_efunc != NULL)
3431 VERIFY0(buf->b_efunc(buf->b_private));
3432
3433 buf->b_efunc = NULL;
3434 buf->b_private = NULL;
3435 kmem_cache_free(buf_cache, buf);
3436 mutex_enter(&arc_user_evicts_lock);
3437 }
3438 mutex_exit(&arc_user_evicts_lock);
3439 }
3440
3441 void
3442 arc_flush(spa_t *spa, boolean_t retry)
3443 {
3444 uint64_t guid = 0;
3445
3446 /*
3447 * If retry is TRUE, a spa must not be specified since we have
3448 * no good way to determine if all of a spa's buffers have been
3449 * evicted from an arc state.
3450 */
3451 ASSERT(!retry || spa == 0);
3452
3453 if (spa != NULL)
3454 guid = spa_load_guid(spa);
3455
3456 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
3457 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
3458
3459 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
3460 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
3461
3462 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
3463 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
3464
3465 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
3466 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
3467
3468 arc_do_user_evicts();
3469 ASSERT(spa || arc_eviction_list == NULL);
3470 }
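
/*
 * Usage note (illustrative): arc_flush(spa, B_FALSE) makes a single
 * best-effort pass over each state for that pool, while
 * arc_flush(NULL, B_TRUE) keeps retrying until every evictable buffer
 * is gone; per the assertion above, the retrying form may only be used
 * when no specific spa is given.
 */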
3471
3472 void
3473 arc_shrink(int64_t to_free)
3474 {
3475 if (arc_c > arc_c_min) {
3476
3477 if (arc_c > arc_c_min + to_free)
3478 atomic_add_64(&arc_c, -to_free);
3479 else
3480 arc_c = arc_c_min;
3481
3482 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
3483 if (arc_c > arc_size)
3484 arc_c = MAX(arc_size, arc_c_min);
3485 if (arc_p > arc_c)
3486 arc_p = (arc_c >> 1);
3487 ASSERT(arc_c >= arc_c_min);
3488 ASSERT((int64_t)arc_p >= 0);
3489 }
3490
3491 if (arc_size > arc_c)
3492 (void) arc_adjust();
3493 }
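
/*
 * Example of the adjustment above (hypothetical values): with
 * arc_c = 4GB, arc_c_min = 1GB and to_free = 512MB, arc_c drops to
 * 3.5GB and arc_p is reduced by (arc_p >> arc_shrink_shift); if the new
 * arc_c still exceeds arc_size, it is clamped down to
 * MAX(arc_size, arc_c_min), and arc_adjust() then runs only if arc_size
 * still exceeds arc_c.
 */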
3494
3495 typedef enum free_memory_reason_t {
3496 FMR_UNKNOWN,
3497 FMR_NEEDFREE,
3498 FMR_LOTSFREE,
3499 FMR_SWAPFS_MINFREE,
3500 FMR_PAGES_PP_MAXIMUM,
3501 FMR_HEAP_ARENA,
3502 FMR_ZIO_ARENA,
3503 } free_memory_reason_t;
3504
3505 int64_t last_free_memory;
3506 free_memory_reason_t last_free_reason;
3507
3508 /*
3509 * Additional reserve of pages for pp_reserve.
3510 */
3511 int64_t arc_pages_pp_reserve = 64;
3512
3513 /*
3514 * Additional reserve of pages for swapfs.
3515 */
3516 int64_t arc_swapfs_reserve = 64;
3517
3518 /*
3519 * Return the amount of memory that can be consumed before reclaim will be
3520  * needed. A positive value means there is sufficient free memory; a
3521  * negative value indicates the amount of memory that needs to be freed up.
3522 */
3523 static int64_t
3524 arc_available_memory(void)
3525 {
3526 int64_t lowest = INT64_MAX;
3527 int64_t n;
3528 free_memory_reason_t r = FMR_UNKNOWN;
3529
3530 #ifdef _KERNEL
3531 if (needfree > 0) {
3532 n = PAGESIZE * (-needfree);
3533 if (n < lowest) {
3534 lowest = n;
3535 r = FMR_NEEDFREE;
3536 }
3537 }
3538
3539 /*
3540 * check that we're out of range of the pageout scanner. It starts to
3541 	 * schedule paging if freemem is less than lotsfree plus needfree.
3542 * lotsfree is the high-water mark for pageout, and needfree is the
3543 * number of needed free pages. We add extra pages here to make sure
3544 * the scanner doesn't start up while we're freeing memory.
3545 */
3546 n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
3547 if (n < lowest) {
3548 lowest = n;
3549 r = FMR_LOTSFREE;
3550 }
3551
3552 /*
3553 * check to make sure that swapfs has enough space so that anon
3554 * reservations can still succeed. anon_resvmem() checks that the
3555 * availrmem is greater than swapfs_minfree, and the number of reserved
3556 * swap pages. We also add a bit of extra here just to prevent
3557 * circumstances from getting really dire.
3558 */
3559 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
3560 desfree - arc_swapfs_reserve);
3561 if (n < lowest) {
3562 lowest = n;
3563 r = FMR_SWAPFS_MINFREE;
3564 }
3565
3566
3567 /*
3568 * Check that we have enough availrmem that memory locking (e.g., via
3569 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum
3570 * stores the number of pages that cannot be locked; when availrmem
3571 * drops below pages_pp_maximum, page locking mechanisms such as
3572 * page_pp_lock() will fail.)
3573 */
3574 n = PAGESIZE * (availrmem - pages_pp_maximum -
3575 arc_pages_pp_reserve);
3576 if (n < lowest) {
3577 lowest = n;
3578 r = FMR_PAGES_PP_MAXIMUM;
3579 }
3580
3581 #if defined(__i386)
3582 /*
3583 * If we're on an i386 platform, it's possible that we'll exhaust the
3584 * kernel heap space before we ever run out of available physical
3585 * memory. Most checks of the size of the heap_area compare against
3586 * tune.t_minarmem, which is the minimum available real memory that we
3587 * can have in the system. However, this is generally fixed at 25 pages
3588 * which is so low that it's useless. In this comparison, we seek to
3589 * calculate the total heap-size, and reclaim if more than 3/4ths of the
3590 * heap is allocated. (Or, in the calculation, if less than 1/4th is
3591 * free)
3592 */
3593 n = vmem_size(heap_arena, VMEM_FREE) -
3594 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
3595 if (n < lowest) {
3596 lowest = n;
3597 r = FMR_HEAP_ARENA;
3598 }
3599 #endif
3600
3601 /*
3602 * If zio data pages are being allocated out of a separate heap segment,
3603 * then enforce that the size of available vmem for this arena remains
3604 * above about 1/16th free.
3605 *
3606 * Note: The 1/16th arena free requirement was put in place
3607 * to aggressively evict memory from the arc in order to avoid
3608 * memory fragmentation issues.
3609 */
3610 if (zio_arena != NULL) {
3611 n = vmem_size(zio_arena, VMEM_FREE) -
3612 (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
3613 if (n < lowest) {
3614 lowest = n;
3615 r = FMR_ZIO_ARENA;
3616 }
3617 }
3618 #else
3619 /* Every 100 calls, free a small amount */
3620 if (spa_get_random(100) == 0)
3621 lowest = -1024;
3622 #endif
3623
3624 last_free_memory = lowest;
3625 last_free_reason = r;
3626
3627 return (lowest);
3628 }
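
/*
 * Example of how the checks above combine (hypothetical values): if the
 * pageout headroom works out to +10MB but the zio_arena check yields
 * -2MB, "lowest" ends up at -2MB with last_free_reason = FMR_ZIO_ARENA,
 * i.e. the caller sees the system as 2MB short of where it needs to be.
 */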
3629
3630
3631 /*
3632 * Determine if the system is under memory pressure and is asking
3633 * to reclaim memory. A return value of TRUE indicates that the system
3634 * is under memory pressure and that the arc should adjust accordingly.
3635 */
3636 static boolean_t
3637 arc_reclaim_needed(void)
3638 {
3639 return (arc_available_memory() < 0);
3640 }
3641
3642 static void
3643 arc_kmem_reap_now(void)
3644 {
3645 size_t i;
3646 kmem_cache_t *prev_cache = NULL;
3647 kmem_cache_t *prev_data_cache = NULL;
3648 extern kmem_cache_t *zio_buf_cache[];
3649 extern kmem_cache_t *zio_data_buf_cache[];
3650 extern kmem_cache_t *range_seg_cache;
3651
3652 #ifdef _KERNEL
3653 if (arc_meta_used >= arc_meta_limit) {
3654 /*
3655 * We are exceeding our meta-data cache limit.
3656 * Purge some DNLC entries to release holds on meta-data.
3657 */
3658 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
3659 }
3660 #if defined(__i386)
3661 /*
3662 * Reclaim unused memory from all kmem caches.
3663 */
3664 kmem_reap();
3665 #endif
3666 #endif
3667
3668 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
3669 if (zio_buf_cache[i] != prev_cache) {
3670 prev_cache = zio_buf_cache[i];
3671 kmem_cache_reap_now(zio_buf_cache[i]);
3672 }
3673 if (zio_data_buf_cache[i] != prev_data_cache) {
3674 prev_data_cache = zio_data_buf_cache[i];
3675 kmem_cache_reap_now(zio_data_buf_cache[i]);
3676 }
3677 }
3678 kmem_cache_reap_now(buf_cache);
3679 kmem_cache_reap_now(hdr_full_cache);
3680 kmem_cache_reap_now(hdr_l2only_cache);
3681 kmem_cache_reap_now(range_seg_cache);
3682
3683 if (zio_arena != NULL) {
3684 /*
3685 * Ask the vmem arena to reclaim unused memory from its
3686 * quantum caches.
3687 */
3688 vmem_qcache_reap(zio_arena);
3689 }
3690 }
3691
3692 /*
3693 * Threads can block in arc_get_data_buf() waiting for this thread to evict
3694 * enough data and signal them to proceed. When this happens, the threads in
3695 * arc_get_data_buf() are sleeping while holding the hash lock for their
3696 * particular arc header. Thus, we must be careful to never sleep on a
3697 * hash lock in this thread. This is to prevent the following deadlock:
3698 *
3699 * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
3700 * waiting for the reclaim thread to signal it.
3701 *
3702 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
3703 * fails, and goes to sleep forever.
3704 *
3705 * This possible deadlock is avoided by always acquiring a hash lock
3706 * using mutex_tryenter() from arc_reclaim_thread().
3707 */
3708 static void
3709 arc_reclaim_thread(void)
3710 {
3711 hrtime_t growtime = 0;
3712 callb_cpr_t cpr;
3713
3714 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);
3715
3716 mutex_enter(&arc_reclaim_lock);
3717 while (!arc_reclaim_thread_exit) {
3718 int64_t free_memory = arc_available_memory();
3719 uint64_t evicted = 0;
3720
3721 mutex_exit(&arc_reclaim_lock);
3722
3723 if (free_memory < 0) {
3724
3725 arc_no_grow = B_TRUE;
3726 arc_warm = B_TRUE;
3727
3728 /*
3729 			 * Wait at least arc_grow_retry (default 60) seconds
3730 * before considering growing.
3731 */
3732 growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
3733
3734 arc_kmem_reap_now();
3735
3736 /*
3737 			 * If we are still low on memory, shrink the ARC
3738 			 * so that (arc_c >> arc_shrink_shift) bytes are free.
3739 */
3740 free_memory = arc_available_memory();
3741
3742 int64_t to_free =
3743 (arc_c >> arc_shrink_shift) - free_memory;
3744 if (to_free > 0) {
3745 #ifdef _KERNEL
3746 to_free = MAX(to_free, ptob(needfree));
3747 #endif
3748 arc_shrink(to_free);
3749 }
3750 } else if (free_memory < arc_c >> arc_no_grow_shift) {
3751 arc_no_grow = B_TRUE;
3752 } else if (gethrtime() >= growtime) {
3753 arc_no_grow = B_FALSE;
3754 }
3755
3756 evicted = arc_adjust();
3757
3758 mutex_enter(&arc_reclaim_lock);
3759
3760 /*
3761 * If evicted is zero, we couldn't evict anything via
3762 * arc_adjust(). This could be due to hash lock
3763 * collisions, but more likely due to the majority of
3764 * arc buffers being unevictable. Therefore, even if
3765 * arc_size is above arc_c, another pass is unlikely to
3766 * be helpful and could potentially cause us to enter an
3767 * infinite loop.
3768 */
3769 if (arc_size <= arc_c || evicted == 0) {
3770 /*
3771 * We're either no longer overflowing, or we
3772 * can't evict anything more, so we should wake
3773 * up any threads before we go to sleep.
3774 */
3775 cv_broadcast(&arc_reclaim_waiters_cv);
3776
3777 /*
3778 * Block until signaled, or after one second (we
3779 * might need to perform arc_kmem_reap_now()
3780 * even if we aren't being signalled)
3781 */
3782 CALLB_CPR_SAFE_BEGIN(&cpr);
3783 (void) cv_timedwait_hires(&arc_reclaim_thread_cv,
3784 &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
3785 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
3786 }
3787 }
3788
3789 arc_reclaim_thread_exit = FALSE;
3790 cv_broadcast(&arc_reclaim_thread_cv);
3791 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */
3792 thread_exit();
3793 }
3794
3795 static void
3796 arc_user_evicts_thread(void)
3797 {
3798 callb_cpr_t cpr;
3799
3800 CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);
3801
3802 mutex_enter(&arc_user_evicts_lock);
3803 while (!arc_user_evicts_thread_exit) {
3804 mutex_exit(&arc_user_evicts_lock);
3805
3806 arc_do_user_evicts();
3807
3808 /*
3809 * This is necessary in order for the mdb ::arc dcmd to
3810 		 * show up-to-date information. Since the ::arc command
3811 * does not call the kstat's update function, without
3812 * this call, the command may show stale stats for the
3813 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
3814 * with this change, the data might be up to 1 second
3815 * out of date; but that should suffice. The arc_state_t
3816 * structures can be queried directly if more accurate
3817 * information is needed.
3818 */
3819 if (arc_ksp != NULL)
3820 arc_ksp->ks_update(arc_ksp, KSTAT_READ);
3821
3822 mutex_enter(&arc_user_evicts_lock);
3823
3824 /*
3825 * Block until signaled, or after one second (we need to
3826 * call the arc's kstat update function regularly).
3827 */
3828 CALLB_CPR_SAFE_BEGIN(&cpr);
3829 (void) cv_timedwait(&arc_user_evicts_cv,
3830 &arc_user_evicts_lock, ddi_get_lbolt() + hz);
3831 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
3832 }
3833
3834 arc_user_evicts_thread_exit = FALSE;
3835 cv_broadcast(&arc_user_evicts_cv);
3836 CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */
3837 thread_exit();
3838 }
3839
3840 /*
3841 * Adapt arc info given the number of bytes we are trying to add and
3842  * the state that we are coming from. This function is only called
3843 * when we are adding new content to the cache.
3844 */
3845 static void
3846 arc_adapt(int bytes, arc_state_t *state)
3847 {
3848 int mult;
3849 uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
3850 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
3851 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);
3852
3853 if (state == arc_l2c_only)
3854 return;
3855
3856 ASSERT(bytes > 0);
3857 /*
3858 * Adapt the target size of the MRU list:
3859 * - if we just hit in the MRU ghost list, then increase
3860 * the target size of the MRU list.
3861 * - if we just hit in the MFU ghost list, then increase
3862 * the target size of the MFU list by decreasing the
3863 * target size of the MRU list.
3864 */
3865 if (state == arc_mru_ghost) {
3866 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
3867 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */
3868
3869 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
3870 } else if (state == arc_mfu_ghost) {
3871 uint64_t delta;
3872
3873 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
3874 mult = MIN(mult, 10);
3875
3876 delta = MIN(bytes * mult, arc_p);
3877 arc_p = MAX(arc_p_min, arc_p - delta);
3878 }
3879 ASSERT((int64_t)arc_p >= 0);
3880
3881 if (arc_reclaim_needed()) {
3882 cv_signal(&arc_reclaim_thread_cv);
3883 return;
3884 }
3885
3886 if (arc_no_grow)
3887 return;
3888
3889 if (arc_c >= arc_c_max)
3890 return;
3891
3892 /*
3893 * If we're within (2 * maxblocksize) bytes of the target
3894 * cache size, increment the target cache size
3895 */
3896 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
3897 atomic_add_64(&arc_c, (int64_t)bytes);
3898 if (arc_c > arc_c_max)
3899 arc_c = arc_c_max;
3900 else if (state == arc_anon)
3901 atomic_add_64(&arc_p, (int64_t)bytes);
3902 if (arc_p > arc_c)
3903 arc_p = arc_c;
3904 }
3905 ASSERT((int64_t)arc_p >= 0);
3906 }
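
/*
 * Worked example of the ghost-hit adaptation above (hypothetical
 * values): on a hit in arc_mru_ghost with mrug_size = 100MB,
 * mfug_size = 300MB and bytes = 128K, mult = 3 and arc_p grows by
 * 384K (capped at arc_c - arc_p_min); a hit in arc_mfu_ghost applies
 * the symmetric adjustment and shrinks arc_p instead.
 */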
3907
3908 /*
3909 * Check if arc_size has grown past our upper threshold, determined by
3910 * zfs_arc_overflow_shift.
3911 */
3912 static boolean_t
3913 arc_is_overflowing(void)
3914 {
3915 /* Always allow at least one block of overflow */
3916 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
3917 arc_c >> zfs_arc_overflow_shift);
3918
3919 return (arc_size >= arc_c + overflow);
3920 }
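
/*
 * Example (hypothetical values): with arc_c = 1GB and
 * zfs_arc_overflow_shift = 8, the allowed overflow is
 * MAX(SPA_MAXBLOCKSIZE, 4MB), so arc_size must exceed arc_c by at
 * least that amount before callers of arc_get_data_buf() are throttled.
 */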
3921
3922 /*
3923 * The buffer, supplied as the first argument, needs a data block. If we
3924 * are hitting the hard limit for the cache size, we must sleep, waiting
3925 * for the eviction thread to catch up. If we're past the target size
3926 * but below the hard limit, we'll only signal the reclaim thread and
3927 * continue on.
3928 */
3929 static void
3930 arc_get_data_buf(arc_buf_t *buf)
3931 {
3932 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
3933 uint64_t size = buf->b_hdr->b_size;
3934 arc_buf_contents_t type = arc_buf_type(buf->b_hdr);
3935
3936 arc_adapt(size, state);
3937
3938 /*
3939 * If arc_size is currently overflowing, and has grown past our
3940 * upper limit, we must be adding data faster than the evict
3941 * thread can evict. Thus, to ensure we don't compound the
3942 * problem by adding more data and forcing arc_size to grow even
3943 	 * further past its target size, we halt and wait for the
3944 * eviction thread to catch up.
3945 *
3946 * It's also possible that the reclaim thread is unable to evict
3947 * enough buffers to get arc_size below the overflow limit (e.g.
3948 * due to buffers being un-evictable, or hash lock collisions).
3949 	 * In this case, we want to proceed regardless of whether we're
3950 * overflowing; thus we don't use a while loop here.
3951 */
3952 if (arc_is_overflowing()) {
3953 mutex_enter(&arc_reclaim_lock);
3954
3955 /*
3956 * Now that we've acquired the lock, we may no longer be
3957 		 * over the overflow limit; let's check.
3958 *
3959 * We're ignoring the case of spurious wake ups. If that
3960 * were to happen, it'd let this thread consume an ARC
3961 * buffer before it should have (i.e. before we're under
3962 * the overflow limit and were signalled by the reclaim
3963 * thread). As long as that is a rare occurrence, it
3964 * shouldn't cause any harm.
3965 */
3966 if (arc_is_overflowing()) {
3967 cv_signal(&arc_reclaim_thread_cv);
3968 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
3969 }
3970
3971 mutex_exit(&arc_reclaim_lock);
3972 }
3973
3974 if (type == ARC_BUFC_METADATA) {
3975 buf->b_data = zio_buf_alloc(size);
3976 arc_space_consume(size, ARC_SPACE_META);
3977 } else {
3978 ASSERT(type == ARC_BUFC_DATA);
3979 buf->b_data = zio_data_buf_alloc(size);
3980 arc_space_consume(size, ARC_SPACE_DATA);
3981 }
3982
3983 /*
3984 * Update the state size. Note that ghost states have a
3985 * "ghost size" and so don't need to be updated.
3986 */
3987 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) {
3988 arc_buf_hdr_t *hdr = buf->b_hdr;
3989 arc_state_t *state = hdr->b_l1hdr.b_state;
3990
3991 (void) refcount_add_many(&state->arcs_size, size, buf);
3992
3993 /*
3994 * If this is reached via arc_read, the link is
3995 * protected by the hash lock. If reached via
3996 * arc_buf_alloc, the header should not be accessed by
3997 * any other thread. And, if reached via arc_read_done,
3998 * the hash lock will protect it if it's found in the
3999 * hash table; otherwise no other thread should be
4000 * trying to [add|remove]_reference it.
4001 */
4002 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
4003 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4004 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type],
4005 size);
4006 }
4007 /*
4008 * If we are growing the cache, and we are adding anonymous
4009 * data, and we have outgrown arc_p, update arc_p
4010 */
4011 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon &&
4012 (refcount_count(&arc_anon->arcs_size) +
4013 refcount_count(&arc_mru->arcs_size) > arc_p))
4014 arc_p = MIN(arc_c, arc_p + size);
4015 }
4016 }
4017
4018 /*
4019 * This routine is called whenever a buffer is accessed.
4020 * NOTE: the hash lock is dropped in this function.
4021 */
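/*
 * Summary of the transitions implemented below, derived from the cases
 * in this function and provided only as a reading aid:
 *
 *	anon                -> mru
 *	mru (non-prefetch)  -> mfu, once ARC_MINTIME has elapsed
 *	mru ghost           -> mfu (or mru, if the access is a prefetch)
 *	mfu                 -> mfu (stays)
 *	mfu ghost           -> mfu (or mru, if the access is a prefetch)
 *	l2c_only            -> mfu
 */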
4022 static void
4023 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
4024 {
4025 clock_t now;
4026
4027 ASSERT(MUTEX_HELD(hash_lock));
4028 ASSERT(HDR_HAS_L1HDR(hdr));
4029
4030 if (hdr->b_l1hdr.b_state == arc_anon) {
4031 /*
4032 * This buffer is not in the cache, and does not
4033 * appear in our "ghost" list. Add the new buffer
4034 * to the MRU state.
4035 */
4036
4037 ASSERT0(hdr->b_l1hdr.b_arc_access);
4038 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4039 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
4040 arc_change_state(arc_mru, hdr, hash_lock);
4041
4042 } else if (hdr->b_l1hdr.b_state == arc_mru) {
4043 now = ddi_get_lbolt();
4044
4045 /*
4046 * If this buffer is here because of a prefetch, then either:
4047 * - clear the flag if this is a "referencing" read
4048 * (any subsequent access will bump this into the MFU state).
4049 * or
4050 * - move the buffer to the head of the list if this is
4051 * another prefetch (to make it less likely to be evicted).
4052 */
4053 if (HDR_PREFETCH(hdr)) {
4054 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4055 /* link protected by hash lock */
4056 ASSERT(multilist_link_active(
4057 &hdr->b_l1hdr.b_arc_node));
4058 } else {
4059 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
4060 ARCSTAT_BUMP(arcstat_mru_hits);
4061 }
4062 hdr->b_l1hdr.b_arc_access = now;
4063 return;
4064 }
4065
4066 /*
4067 * This buffer has been "accessed" only once so far,
4068 * but it is still in the cache. Move it to the MFU
4069 * state.
4070 */
4071 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) {
4072 /*
4073 			 * More than ARC_MINTIME has passed since we
4074 * instantiated this buffer. Move it to the
4075 * most frequently used state.
4076 */
4077 hdr->b_l1hdr.b_arc_access = now;
4078 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4079 arc_change_state(arc_mfu, hdr, hash_lock);
4080 }
4081 ARCSTAT_BUMP(arcstat_mru_hits);
4082 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
4083 arc_state_t *new_state;
4084 /*
4085 * This buffer has been "accessed" recently, but
4086 * was evicted from the cache. Move it to the
4087 * MFU state.
4088 */
4089
4090 if (HDR_PREFETCH(hdr)) {
4091 new_state = arc_mru;
4092 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
4093 hdr->b_flags &= ~ARC_FLAG_PREFETCH;
4094 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
4095 } else {
4096 new_state = arc_mfu;
4097 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4098 }
4099
4100 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4101 arc_change_state(new_state, hdr, hash_lock);
4102
4103 ARCSTAT_BUMP(arcstat_mru_ghost_hits);
4104 } else if (hdr->b_l1hdr.b_state == arc_mfu) {
4105 /*
4106 * This buffer has been accessed more than once and is
4107 * still in the cache. Keep it in the MFU state.
4108 *
4109 * NOTE: an add_reference() that occurred when we did
4110 * the arc_read() will have kicked this off the list.
4111 * If it was a prefetch, we will explicitly move it to
4112 * the head of the list now.
4113 */
4114 if ((HDR_PREFETCH(hdr)) != 0) {
4115 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4116 /* link protected by hash_lock */
4117 ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4118 }
4119 ARCSTAT_BUMP(arcstat_mfu_hits);
4120 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4121 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
4122 arc_state_t *new_state = arc_mfu;
4123 /*
4124 * This buffer has been accessed more than once but has
4125 * been evicted from the cache. Move it back to the
4126 * MFU state.
4127 */
4128
4129 if (HDR_PREFETCH(hdr)) {
4130 /*
4131 * This is a prefetch access...
4132 * move this block back to the MRU state.
4133 */
4134 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
4135 new_state = arc_mru;
4136 }
4137
4138 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4139 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4140 arc_change_state(new_state, hdr, hash_lock);
4141
4142 ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
4143 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
4144 /*
4145 * This buffer is on the 2nd Level ARC.
4146 */
4147
4148 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
4149 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
4150 arc_change_state(arc_mfu, hdr, hash_lock);
4151 } else {
4152 ASSERT(!"invalid arc state");
4153 }
4154 }
4155
4156 /* a generic arc_done_func_t which you can use */
4157 /* ARGSUSED */
4158 void
4159 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
4160 {
4161 if (zio == NULL || zio->io_error == 0)
4162 bcopy(buf->b_data, arg, buf->b_hdr->b_size);
4163 VERIFY(arc_buf_remove_ref(buf, arg));
4164 }
4165
4166 /* a generic arc_done_func_t */
4167 void
4168 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
4169 {
4170 arc_buf_t **bufp = arg;
4171 if (zio && zio->io_error) {
4172 VERIFY(arc_buf_remove_ref(buf, arg));
4173 *bufp = NULL;
4174 } else {
4175 *bufp = buf;
4176 ASSERT(buf->b_data);
4177 }
4178 }
4179
4180 static void
4181 arc_read_done(zio_t *zio)
4182 {
4183 arc_buf_hdr_t *hdr;
4184 arc_buf_t *buf;
4185 arc_buf_t *abuf; /* buffer we're assigning to callback */
4186 kmutex_t *hash_lock = NULL;
4187 arc_callback_t *callback_list, *acb;
4188 int freeable = FALSE;
4189
4190 buf = zio->io_private;
4191 hdr = buf->b_hdr;
4192
4193 /*
4194 * The hdr was inserted into hash-table and removed from lists
4195 * prior to starting I/O. We should find this header, since
4196 * it's in the hash table, and it should be legit since it's
4197 * not possible to evict it during the I/O. The only possible
4198 * reason for it not to be found is if we were freed during the
4199 * read.
4200 */
4201 if (HDR_IN_HASH_TABLE(hdr)) {
4202 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp));
4203 ASSERT3U(hdr->b_dva.dva_word[0], ==,
4204 BP_IDENTITY(zio->io_bp)->dva_word[0]);
4205 ASSERT3U(hdr->b_dva.dva_word[1], ==,
4206 BP_IDENTITY(zio->io_bp)->dva_word[1]);
4207
4208 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp,
4209 &hash_lock);
4210
4211 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) &&
4212 hash_lock == NULL) ||
4213 (found == hdr &&
4214 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
4215 (found == hdr && HDR_L2_READING(hdr)));
4216 }
4217
4218 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED;
4219 if (l2arc_noprefetch && HDR_PREFETCH(hdr))
4220 hdr->b_flags &= ~ARC_FLAG_L2CACHE;
4221
4222 /* byteswap if necessary */
4223 callback_list = hdr->b_l1hdr.b_acb;
4224 ASSERT(callback_list != NULL);
4225 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) {
4226 dmu_object_byteswap_t bswap =
4227 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp));
4228 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ?
4229 byteswap_uint64_array :
4230 dmu_ot_byteswap[bswap].ob_func;
4231 func(buf->b_data, hdr->b_size);
4232 }
4233
4234 arc_cksum_compute(buf, B_FALSE);
4235 arc_buf_watch(buf);
4236
4237 if (hash_lock && zio->io_error == 0 &&
4238 hdr->b_l1hdr.b_state == arc_anon) {
4239 /*
4240 * Only call arc_access on anonymous buffers. This is because
4241 * if we've issued an I/O for an evicted buffer, we've already
4242 * called arc_access (to prevent any simultaneous readers from
4243 * getting confused).
4244 */
4245 arc_access(hdr, hash_lock);
4246 }
4247
4248 /* create copies of the data buffer for the callers */
4249 abuf = buf;
4250 for (acb = callback_list; acb; acb = acb->acb_next) {
4251 if (acb->acb_done) {
4252 if (abuf == NULL) {
4253 ARCSTAT_BUMP(arcstat_duplicate_reads);
4254 abuf = arc_buf_clone(buf);
4255 }
4256 acb->acb_buf = abuf;
4257 abuf = NULL;
4258 }
4259 }
4260 hdr->b_l1hdr.b_acb = NULL;
4261 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
4262 ASSERT(!HDR_BUF_AVAILABLE(hdr));
4263 if (abuf == buf) {
4264 ASSERT(buf->b_efunc == NULL);
4265 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
4266 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4267 }
4268
4269 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
4270 callback_list != NULL);
4271
4272 if (zio->io_error != 0) {
4273 hdr->b_flags |= ARC_FLAG_IO_ERROR;
4274 if (hdr->b_l1hdr.b_state != arc_anon)
4275 arc_change_state(arc_anon, hdr, hash_lock);
4276 if (HDR_IN_HASH_TABLE(hdr))
4277 buf_hash_remove(hdr);
4278 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4279 }
4280
4281 /*
4282 * Broadcast before we drop the hash_lock to avoid the possibility
4283 * that the hdr (and hence the cv) might be freed before we get to
4284 * the cv_broadcast().
4285 */
4286 cv_broadcast(&hdr->b_l1hdr.b_cv);
4287
4288 if (hash_lock != NULL) {
4289 mutex_exit(hash_lock);
4290 } else {
4291 /*
4292 * This block was freed while we waited for the read to
4293 * complete. It has been removed from the hash table and
4294 * moved to the anonymous state (so that it won't show up
4295 * in the cache).
4296 */
4297 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
4298 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt);
4299 }
4300
4301 /* execute each callback and free its structure */
4302 while ((acb = callback_list) != NULL) {
4303 if (acb->acb_done)
4304 acb->acb_done(zio, acb->acb_buf, acb->acb_private);
4305
4306 if (acb->acb_zio_dummy != NULL) {
4307 acb->acb_zio_dummy->io_error = zio->io_error;
4308 zio_nowait(acb->acb_zio_dummy);
4309 }
4310
4311 callback_list = acb->acb_next;
4312 kmem_free(acb, sizeof (arc_callback_t));
4313 }
4314
4315 if (freeable)
4316 arc_hdr_destroy(hdr);
4317 }
4318
4319 /*
4320 * "Read" the block at the specified DVA (in bp) via the
4321 * cache. If the block is found in the cache, invoke the provided
4322 * callback immediately and return. Note that the `zio' parameter
4323 * in the callback will be NULL in this case, since no IO was
4324 * required. If the block is not in the cache pass the read request
4325 * on to the spa with a substitute callback function, so that the
4326 * requested block will be added to the cache.
4327 *
4328 * If a read request arrives for a block that has a read in-progress,
4329 * either wait for the in-progress read to complete (and return the
4330 * results); or, if this is a read with a "done" func, add a record
4331 * to the read to invoke the "done" func when the read completes,
4332 * and return; or just return.
4333 *
4334 * arc_read_done() will invoke all the requested "done" functions
4335 * for readers of this block.
4336 */
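/*
 * A minimal caller sketch (illustrative only; "my_done", "my_arg" and
 * the surrounding spa/bp/zb variables are hypothetical, not defined in
 * this file):
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	int err = arc_read(NULL, spa, bp, my_done, my_arg,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *
 * With ARC_FLAG_WAIT the call blocks until the data is available (or an
 * in-progress read completes); with ARC_FLAG_NOWAIT the "done" callback
 * fires asynchronously via arc_read_done().
 */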
4337 int
4338 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
4339 void *private, zio_priority_t priority, int zio_flags,
4340 arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
4341 {
4342 arc_buf_hdr_t *hdr = NULL;
4343 arc_buf_t *buf = NULL;
4344 kmutex_t *hash_lock = NULL;
4345 zio_t *rzio;
4346 uint64_t guid = spa_load_guid(spa);
4347
4348 ASSERT(!BP_IS_EMBEDDED(bp) ||
4349 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
4350
4351 top:
4352 if (!BP_IS_EMBEDDED(bp)) {
4353 /*
4354 * Embedded BP's have no DVA and require no I/O to "read".
4355 * Create an anonymous arc buf to back it.
4356 */
4357 hdr = buf_hash_find(guid, bp, &hash_lock);
4358 }
4359
4360 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {
4361
4362 *arc_flags |= ARC_FLAG_CACHED;
4363
4364 if (HDR_IO_IN_PROGRESS(hdr)) {
4365
4366 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
4367 priority == ZIO_PRIORITY_SYNC_READ) {
4368 /*
4369 * This sync read must wait for an
4370 * in-progress async read (e.g. a predictive
4371 * prefetch). Async reads are queued
4372 * separately at the vdev_queue layer, so
4373 * this is a form of priority inversion.
4374 * Ideally, we would "inherit" the demand
4375 * i/o's priority by moving the i/o from
4376 * the async queue to the synchronous queue,
4377 * but there is currently no mechanism to do
4378 * so. Track this so that we can evaluate
4379 * the magnitude of this potential performance
4380 * problem.
4381 *
4382 * Note that if the prefetch i/o is already
4383 * active (has been issued to the device),
4384 * the prefetch improved performance, because
4385 * we issued it sooner than we would have
4386 * without the prefetch.
4387 */
4388 DTRACE_PROBE1(arc__sync__wait__for__async,
4389 arc_buf_hdr_t *, hdr);
4390 ARCSTAT_BUMP(arcstat_sync_wait_for_async);
4391 }
4392 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4393 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
4394 }
4395
4396 if (*arc_flags & ARC_FLAG_WAIT) {
4397 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
4398 mutex_exit(hash_lock);
4399 goto top;
4400 }
4401 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4402
4403 if (done) {
4404 arc_callback_t *acb = NULL;
4405
4406 acb = kmem_zalloc(sizeof (arc_callback_t),
4407 KM_SLEEP);
4408 acb->acb_done = done;
4409 acb->acb_private = private;
4410 if (pio != NULL)
4411 acb->acb_zio_dummy = zio_null(pio,
4412 spa, NULL, NULL, NULL, zio_flags);
4413
4414 ASSERT(acb->acb_done != NULL);
4415 acb->acb_next = hdr->b_l1hdr.b_acb;
4416 hdr->b_l1hdr.b_acb = acb;
4417 add_reference(hdr, hash_lock, private);
4418 mutex_exit(hash_lock);
4419 return (0);
4420 }
4421 mutex_exit(hash_lock);
4422 return (0);
4423 }
4424
4425 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4426 hdr->b_l1hdr.b_state == arc_mfu);
4427
4428 if (done) {
4429 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
4430 /*
4431 * This is a demand read which does not have to
4432 * wait for i/o because we did a predictive
4433 * prefetch i/o for it, which has completed.
4434 */
4435 DTRACE_PROBE1(
4436 arc__demand__hit__predictive__prefetch,
4437 arc_buf_hdr_t *, hdr);
4438 ARCSTAT_BUMP(
4439 arcstat_demand_hit_predictive_prefetch);
4440 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
4441 }
4442 add_reference(hdr, hash_lock, private);
4443 /*
4444 * If this block is already in use, create a new
4445 * copy of the data so that we will be guaranteed
4446 * that arc_release() will always succeed.
4447 */
4448 buf = hdr->b_l1hdr.b_buf;
4449 ASSERT(buf);
4450 ASSERT(buf->b_data);
4451 if (HDR_BUF_AVAILABLE(hdr)) {
4452 ASSERT(buf->b_efunc == NULL);
4453 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4454 } else {
4455 buf = arc_buf_clone(buf);
4456 }
4457
4458 } else if (*arc_flags & ARC_FLAG_PREFETCH &&
4459 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
4460 hdr->b_flags |= ARC_FLAG_PREFETCH;
4461 }
4462 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
4463 arc_access(hdr, hash_lock);
4464 if (*arc_flags & ARC_FLAG_L2CACHE)
4465 hdr->b_flags |= ARC_FLAG_L2CACHE;
4466 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4467 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4468 mutex_exit(hash_lock);
4469 ARCSTAT_BUMP(arcstat_hits);
4470 arc_update_hit_stat(hdr, B_TRUE);
4471
4472 if (done)
4473 done(NULL, buf, private);
4474 } else {
4475 uint64_t size = BP_GET_LSIZE(bp);
4476 arc_callback_t *acb;
4477 vdev_t *vd = NULL;
4478 uint64_t addr = 0;
4479 boolean_t devw = B_FALSE;
4480 enum zio_compress b_compress = ZIO_COMPRESS_OFF;
4481 int32_t b_asize = 0;
4482
4483 if (hdr == NULL) {
4484 /* this block is not in the cache */
4485 arc_buf_hdr_t *exists = NULL;
4486 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
4487 buf = arc_buf_alloc(spa, size, private, type);
4488 hdr = buf->b_hdr;
4489 if (!BP_IS_EMBEDDED(bp)) {
4490 hdr->b_dva = *BP_IDENTITY(bp);
4491 hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
4492 exists = buf_hash_insert(hdr, &hash_lock);
4493 }
4494 if (exists != NULL) {
4495 /* somebody beat us to the hash insert */
4496 mutex_exit(hash_lock);
4497 buf_discard_identity(hdr);
4498 (void) arc_buf_remove_ref(buf, private);
4499 goto top; /* restart the IO request */
4500 }
4501
4502 /*
4503 * If there is a callback, we pass our reference to
4504 * it; otherwise we remove our reference.
4505 */
4506 if (done == NULL) {
4507 (void) remove_reference(hdr, hash_lock,
4508 private);
4509 }
4510 if (*arc_flags & ARC_FLAG_PREFETCH)
4511 hdr->b_flags |= ARC_FLAG_PREFETCH;
4512 if (*arc_flags & ARC_FLAG_L2CACHE)
4513 hdr->b_flags |= ARC_FLAG_L2CACHE;
4514 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4515 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4516 if (BP_GET_LEVEL(bp) > 0)
4517 hdr->b_flags |= ARC_FLAG_INDIRECT;
4518 } else {
4519 /*
4520 * This block is in the ghost cache. If it was L2-only
4521 * (and thus didn't have an L1 hdr), we realloc the
4522 * header to add an L1 hdr.
4523 */
4524 if (!HDR_HAS_L1HDR(hdr)) {
4525 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
4526 hdr_full_cache);
4527 }
4528
4529 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
4530 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4531 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
4532 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
4533
4534 /*
4535 * If there is a callback, we pass a reference to it.
4536 */
4537 if (done != NULL)
4538 add_reference(hdr, hash_lock, private);
4539 if (*arc_flags & ARC_FLAG_PREFETCH)
4540 hdr->b_flags |= ARC_FLAG_PREFETCH;
4541 if (*arc_flags & ARC_FLAG_L2CACHE)
4542 hdr->b_flags |= ARC_FLAG_L2CACHE;
4543 if (*arc_flags & ARC_FLAG_L2COMPRESS)
4544 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
4545 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
4546 buf->b_hdr = hdr;
4547 buf->b_data = NULL;
4548 buf->b_efunc = NULL;
4549 buf->b_private = NULL;
4550 buf->b_next = NULL;
4551 hdr->b_l1hdr.b_buf = buf;
4552 ASSERT0(hdr->b_l1hdr.b_datacnt);
4553 hdr->b_l1hdr.b_datacnt = 1;
4554 arc_get_data_buf(buf);
4555 arc_access(hdr, hash_lock);
4556 }
4557
4558 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
4559 hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
4560 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
4561
4562 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
4563 acb->acb_done = done;
4564 acb->acb_private = private;
4565
4566 ASSERT(hdr->b_l1hdr.b_acb == NULL);
4567 hdr->b_l1hdr.b_acb = acb;
4568 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
4569
4570 if (HDR_HAS_L2HDR(hdr) &&
4571 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
4572 devw = hdr->b_l2hdr.b_dev->l2ad_writing;
4573 addr = hdr->b_l2hdr.b_daddr;
4574 b_compress = hdr->b_l2hdr.b_compress;
4575 b_asize = hdr->b_l2hdr.b_asize;
4576 /*
4577 * Lock out device removal.
4578 */
4579 if (vdev_is_dead(vd) ||
4580 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
4581 vd = NULL;
4582 }
4583
4584 if (hash_lock != NULL)
4585 mutex_exit(hash_lock);
4586
4587 /*
4588 * At this point, we have a level 1 cache miss. Try again in
4589 * L2ARC if possible.
4590 */
4591 ASSERT3U(hdr->b_size, ==, size);
4592 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
4593 uint64_t, size, zbookmark_phys_t *, zb);
4594 ARCSTAT_BUMP(arcstat_misses);
4595 arc_update_hit_stat(hdr, B_FALSE);
4596
4597 if (priority == ZIO_PRIORITY_ASYNC_READ)
4598 hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
4599 else
4600 hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;
4601
4602 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
4603 /*
4604 * Read from the L2ARC if the following are true:
4605 * 1. The L2ARC vdev was previously cached.
4606 * 2. This buffer still has L2ARC metadata.
4607 * 3. This buffer isn't currently writing to the L2ARC.
4608 * 4. The L2ARC entry wasn't evicted, which may
4609 * also have invalidated the vdev.
4610 			 * 5. This isn't a prefetch with l2arc_noprefetch set.
4611 */
4612 if (HDR_HAS_L2HDR(hdr) &&
4613 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
4614 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
4615 l2arc_read_callback_t *cb;
4616
4617 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
4618 ARCSTAT_BUMP(arcstat_l2_hits);
4619
4620 cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
4621 KM_SLEEP);
4622 cb->l2rcb_buf = buf;
4623 cb->l2rcb_spa = spa;
4624 cb->l2rcb_bp = *bp;
4625 cb->l2rcb_zb = *zb;
4626 cb->l2rcb_flags = zio_flags;
4627 cb->l2rcb_compress = b_compress;
4628
4629 ASSERT(addr >= VDEV_LABEL_START_SIZE &&
4630 addr + size < vd->vdev_psize -
4631 VDEV_LABEL_END_SIZE);
4632
4633 /*
4634 * l2arc read. The SCL_L2ARC lock will be
4635 * released by l2arc_read_done().
4636 * Issue a null zio if the underlying buffer
4637 * was squashed to zero size by compression.
4638 */
4639 if (b_compress == ZIO_COMPRESS_EMPTY) {
4640 rzio = zio_null(pio, spa, vd,
4641 l2arc_read_done, cb,
4642 zio_flags | ZIO_FLAG_DONT_CACHE |
4643 ZIO_FLAG_CANFAIL |
4644 ZIO_FLAG_DONT_PROPAGATE |
4645 ZIO_FLAG_DONT_RETRY);
4646 } else {
4647 rzio = zio_read_phys(pio, vd, addr,
4648 b_asize, buf->b_data,
4649 ZIO_CHECKSUM_OFF,
4650 l2arc_read_done, cb, priority,
4651 zio_flags | ZIO_FLAG_DONT_CACHE |
4652 ZIO_FLAG_CANFAIL |
4653 ZIO_FLAG_DONT_PROPAGATE |
4654 ZIO_FLAG_DONT_RETRY, B_FALSE);
4655 }
4656 DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
4657 zio_t *, rzio);
4658 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);
4659
4660 if (*arc_flags & ARC_FLAG_NOWAIT) {
4661 zio_nowait(rzio);
4662 return (0);
4663 }
4664
4665 ASSERT(*arc_flags & ARC_FLAG_WAIT);
4666 if (zio_wait(rzio) == 0)
4667 return (0);
4668
4669 /* l2arc read error; goto zio_read() */
4670 } else {
4671 DTRACE_PROBE1(l2arc__miss,
4672 arc_buf_hdr_t *, hdr);
4673 ARCSTAT_BUMP(arcstat_l2_misses);
4674 if (HDR_L2_WRITING(hdr))
4675 ARCSTAT_BUMP(arcstat_l2_rw_clash);
4676 spa_config_exit(spa, SCL_L2ARC, vd);
4677 }
4678 } else {
4679 if (vd != NULL)
4680 spa_config_exit(spa, SCL_L2ARC, vd);
4681 if (l2arc_ndev != 0) {
4682 DTRACE_PROBE1(l2arc__miss,
4683 arc_buf_hdr_t *, hdr);
4684 ARCSTAT_BUMP(arcstat_l2_misses);
4685 }
4686 }
4687
4688 rzio = zio_read(pio, spa, bp, buf->b_data, size,
4689 arc_read_done, buf, priority, zio_flags, zb);
4690
4691 if (*arc_flags & ARC_FLAG_WAIT)
4692 return (zio_wait(rzio));
4693
4694 ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
4695 zio_nowait(rzio);
4696 }
4697 return (0);
4698 }
4699
4700 void
4701 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
4702 {
4703 ASSERT(buf->b_hdr != NULL);
4704 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
4705 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
4706 func == NULL);
4707 ASSERT(buf->b_efunc == NULL);
4708 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));
4709
4710 buf->b_efunc = func;
4711 buf->b_private = private;
4712 }
4713
4714 /*
4715 * Notify the arc that a block was freed, and thus will never be used again.
4716 */
4717 void
4718 arc_freed(spa_t *spa, const blkptr_t *bp)
4719 {
4720 arc_buf_hdr_t *hdr;
4721 kmutex_t *hash_lock;
4722 uint64_t guid = spa_load_guid(spa);
4723
4724 ASSERT(!BP_IS_EMBEDDED(bp));
4725
4726 hdr = buf_hash_find(guid, bp, &hash_lock);
4727 if (hdr == NULL)
4728 return;
4729 if (HDR_BUF_AVAILABLE(hdr)) {
4730 arc_buf_t *buf = hdr->b_l1hdr.b_buf;
4731 add_reference(hdr, hash_lock, FTAG);
4732 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
4733 mutex_exit(hash_lock);
4734
4735 arc_release(buf, FTAG);
4736 (void) arc_buf_remove_ref(buf, FTAG);
4737 } else {
4738 mutex_exit(hash_lock);
4739 }
4740
4741 }
4742
4743 /*
4744 * Clear the user eviction callback set by arc_set_callback(), first calling
4745  * it if it exists. Because the presence of a callback keeps an arc_buf cached,
4746 * clearing the callback may result in the arc_buf being destroyed. However,
4747 * it will not result in the *last* arc_buf being destroyed, hence the data
4748 * will remain cached in the ARC. We make a copy of the arc buffer here so
4749 * that we can process the callback without holding any locks.
4750 *
4751 * It's possible that the callback is already in the process of being cleared
4752 * by another thread. In this case we can not clear the callback.
4753 *
4754 * Returns B_TRUE if the callback was successfully called and cleared.
4755 */
4756 boolean_t
4757 arc_clear_callback(arc_buf_t *buf)
4758 {
4759 arc_buf_hdr_t *hdr;
4760 kmutex_t *hash_lock;
4761 arc_evict_func_t *efunc = buf->b_efunc;
4762 void *private = buf->b_private;
4763
4764 mutex_enter(&buf->b_evict_lock);
4765 hdr = buf->b_hdr;
4766 if (hdr == NULL) {
4767 /*
4768 * We are in arc_do_user_evicts().
4769 */
4770 ASSERT(buf->b_data == NULL);
4771 mutex_exit(&buf->b_evict_lock);
4772 return (B_FALSE);
4773 } else if (buf->b_data == NULL) {
4774 /*
4775 * We are on the eviction list; process this buffer now
4776 * but let arc_do_user_evicts() do the reaping.
4777 */
4778 buf->b_efunc = NULL;
4779 mutex_exit(&buf->b_evict_lock);
4780 VERIFY0(efunc(private));
4781 return (B_TRUE);
4782 }
4783 hash_lock = HDR_LOCK(hdr);
4784 mutex_enter(hash_lock);
4785 hdr = buf->b_hdr;
4786 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4787
4788 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <,
4789 hdr->b_l1hdr.b_datacnt);
4790 ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
4791 hdr->b_l1hdr.b_state == arc_mfu);
4792
4793 buf->b_efunc = NULL;
4794 buf->b_private = NULL;
4795
4796 if (hdr->b_l1hdr.b_datacnt > 1) {
4797 mutex_exit(&buf->b_evict_lock);
4798 arc_buf_destroy(buf, TRUE);
4799 } else {
4800 ASSERT(buf == hdr->b_l1hdr.b_buf);
4801 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
4802 mutex_exit(&buf->b_evict_lock);
4803 }
4804
4805 mutex_exit(hash_lock);
4806 VERIFY0(efunc(private));
4807 return (B_TRUE);
4808 }
4809
4810 /*
4811 * Release this buffer from the cache, making it an anonymous buffer. This
4812 * must be done after a read and prior to modifying the buffer contents.
4813 * If the buffer has more than one reference, we must make
4814 * a new hdr for the buffer.
4815 */
4816 void
4817 arc_release(arc_buf_t *buf, void *tag)
4818 {
4819 arc_buf_hdr_t *hdr = buf->b_hdr;
4820
4821 /*
4822 * It would be nice to assert that if it's DMU metadata (level >
4823 * 0 || it's the dnode file), then it must be syncing context.
4824 * But we don't know that information at this level.
4825 */
4826
4827 mutex_enter(&buf->b_evict_lock);
4828
4829 ASSERT(HDR_HAS_L1HDR(hdr));
4830
4831 /*
4832 * We don't grab the hash lock prior to this check, because if
4833 * the buffer's header is in the arc_anon state, it won't be
4834 * linked into the hash table.
4835 */
4836 if (hdr->b_l1hdr.b_state == arc_anon) {
4837 mutex_exit(&buf->b_evict_lock);
4838 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4839 ASSERT(!HDR_IN_HASH_TABLE(hdr));
4840 ASSERT(!HDR_HAS_L2HDR(hdr));
4841 ASSERT(BUF_EMPTY(hdr));
4842
4843 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1);
4844 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
4845 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
4846
4847 ASSERT3P(buf->b_efunc, ==, NULL);
4848 ASSERT3P(buf->b_private, ==, NULL);
4849
4850 hdr->b_l1hdr.b_arc_access = 0;
4851 arc_buf_thaw(buf);
4852
4853 return;
4854 }
4855
4856 kmutex_t *hash_lock = HDR_LOCK(hdr);
4857 mutex_enter(hash_lock);
4858
4859 /*
4860 * This assignment is only valid as long as the hash_lock is
4861 	 * held; we must be careful not to reference state or the
4862 * b_state field after dropping the lock.
4863 */
4864 arc_state_t *state = hdr->b_l1hdr.b_state;
4865 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
4866 ASSERT3P(state, !=, arc_anon);
4867
4868 /* this buffer is not on any list */
4869 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0);
4870
4871 if (HDR_HAS_L2HDR(hdr)) {
4872 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4873
4874 /*
4875 * We have to recheck this conditional again now that
4876 * we're holding the l2ad_mtx to prevent a race with
4877 * another thread which might be concurrently calling
4878 * l2arc_evict(). In that case, l2arc_evict() might have
4879 * destroyed the header's L2 portion as we were waiting
4880 * to acquire the l2ad_mtx.
4881 */
4882 if (HDR_HAS_L2HDR(hdr))
4883 arc_hdr_l2hdr_destroy(hdr);
4884
4885 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx);
4886 }
4887
4888 /*
4889 * Do we have more than one buf?
4890 */
4891 if (hdr->b_l1hdr.b_datacnt > 1) {
4892 arc_buf_hdr_t *nhdr;
4893 arc_buf_t **bufp;
4894 uint64_t blksz = hdr->b_size;
4895 uint64_t spa = hdr->b_spa;
4896 arc_buf_contents_t type = arc_buf_type(hdr);
4897 uint32_t flags = hdr->b_flags;
4898
4899 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
4900 /*
4901 * Pull the data off of this hdr and attach it to
4902 * a new anonymous hdr.
4903 */
4904 (void) remove_reference(hdr, hash_lock, tag);
4905 bufp = &hdr->b_l1hdr.b_buf;
4906 while (*bufp != buf)
4907 bufp = &(*bufp)->b_next;
4908 *bufp = buf->b_next;
4909 buf->b_next = NULL;
4910
4911 ASSERT3P(state, !=, arc_l2c_only);
4912
4913 (void) refcount_remove_many(
4914 &state->arcs_size, hdr->b_size, buf);
4915
4916 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
4917 ASSERT3P(state, !=, arc_l2c_only);
4918 uint64_t *size = &state->arcs_lsize[type];
4919 ASSERT3U(*size, >=, hdr->b_size);
4920 atomic_add_64(size, -hdr->b_size);
4921 }
4922
4923 /*
4924 * We're releasing a duplicate user data buffer, update
4925 * our statistics accordingly.
4926 */
4927 if (HDR_ISTYPE_DATA(hdr)) {
4928 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
4929 ARCSTAT_INCR(arcstat_duplicate_buffers_size,
4930 -hdr->b_size);
4931 }
4932 hdr->b_l1hdr.b_datacnt -= 1;
4933 arc_cksum_verify(buf);
4934 arc_buf_unwatch(buf);
4935
4936 mutex_exit(hash_lock);
4937
4938 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE);
4939 nhdr->b_size = blksz;
4940 nhdr->b_spa = spa;
4941
4942 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING;
4943 nhdr->b_flags |= arc_bufc_to_flags(type);
4944 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
4945
4946 nhdr->b_l1hdr.b_buf = buf;
4947 nhdr->b_l1hdr.b_datacnt = 1;
4948 nhdr->b_l1hdr.b_state = arc_anon;
4949 nhdr->b_l1hdr.b_arc_access = 0;
4950 nhdr->b_l1hdr.b_tmp_cdata = NULL;
4951 nhdr->b_freeze_cksum = NULL;
4952
4953 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
4954 buf->b_hdr = nhdr;
4955 mutex_exit(&buf->b_evict_lock);
4956 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf);
4957 } else {
4958 mutex_exit(&buf->b_evict_lock);
4959 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
4960 /* protected by hash lock, or hdr is on arc_anon */
4961 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
4962 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
4963 arc_change_state(arc_anon, hdr, hash_lock);
4964 hdr->b_l1hdr.b_arc_access = 0;
4965 mutex_exit(hash_lock);
4966
4967 buf_discard_identity(hdr);
4968 arc_buf_thaw(buf);
4969 }
4970 buf->b_efunc = NULL;
4971 buf->b_private = NULL;
4972 }
4973
4974 int
4975 arc_released(arc_buf_t *buf)
4976 {
4977 int released;
4978
4979 mutex_enter(&buf->b_evict_lock);
4980 released = (buf->b_data != NULL &&
4981 buf->b_hdr->b_l1hdr.b_state == arc_anon);
4982 mutex_exit(&buf->b_evict_lock);
4983 return (released);
4984 }
4985
4986 #ifdef ZFS_DEBUG
4987 int
4988 arc_referenced(arc_buf_t *buf)
4989 {
4990 int referenced;
4991
4992 mutex_enter(&buf->b_evict_lock);
4993 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
4994 mutex_exit(&buf->b_evict_lock);
4995 return (referenced);
4996 }
4997 #endif
4998
4999 static void
5000 arc_write_ready(zio_t *zio)
5001 {
5002 arc_write_callback_t *callback = zio->io_private;
5003 arc_buf_t *buf = callback->awcb_buf;
5004 arc_buf_hdr_t *hdr = buf->b_hdr;
5005
5006 ASSERT(HDR_HAS_L1HDR(hdr));
5007 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
5008 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
5009 callback->awcb_ready(zio, buf, callback->awcb_private);
5010
5011 /*
5012 * If the IO is already in progress, then this is a re-write
5013 * attempt, so we need to thaw and re-compute the cksum.
5014 * It is the responsibility of the callback to handle the
5015 * accounting for any re-write attempt.
5016 */
5017 if (HDR_IO_IN_PROGRESS(hdr)) {
5018 mutex_enter(&hdr->b_l1hdr.b_freeze_lock);
5019 if (hdr->b_freeze_cksum != NULL) {
5020 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
5021 hdr->b_freeze_cksum = NULL;
5022 }
5023 mutex_exit(&hdr->b_l1hdr.b_freeze_lock);
5024 }
5025 arc_cksum_compute(buf, B_FALSE);
5026 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;
5027 }
5028
5029 /*
5030 * The SPA calls this callback for each physical write that happens on behalf
5031 * of a logical write. See the comment in dbuf_write_physdone() for details.
5032 */
5033 static void
5034 arc_write_physdone(zio_t *zio)
5035 {
5036 arc_write_callback_t *cb = zio->io_private;
5037 if (cb->awcb_physdone != NULL)
5038 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private);
5039 }
5040
5041 static void
5042 arc_write_done(zio_t *zio)
5043 {
5044 arc_write_callback_t *callback = zio->io_private;
5045 arc_buf_t *buf = callback->awcb_buf;
5046 arc_buf_hdr_t *hdr = buf->b_hdr;
5047
5048 ASSERT(hdr->b_l1hdr.b_acb == NULL);
5049
5050 if (zio->io_error == 0) {
5051 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) {
5052 buf_discard_identity(hdr);
5053 } else {
5054 hdr->b_dva = *BP_IDENTITY(zio->io_bp);
5055 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp);
5056 }
5057 } else {
5058 ASSERT(BUF_EMPTY(hdr));
5059 }
5060
5061 /*
5062 * If the block to be written was all-zero or compressed enough to be
5063 * embedded in the BP, no write was performed so there will be no
5064 * dva/birth/checksum. The buffer must therefore remain anonymous
5065 * (and uncached).
5066 */
5067 if (!BUF_EMPTY(hdr)) {
5068 arc_buf_hdr_t *exists;
5069 kmutex_t *hash_lock;
5070
5071 ASSERT(zio->io_error == 0);
5072
5073 arc_cksum_verify(buf);
5074
5075 exists = buf_hash_insert(hdr, &hash_lock);
5076 if (exists != NULL) {
5077 /*
5078 * This can only happen if we overwrite for
5079 * sync-to-convergence, because we remove
5080 * buffers from the hash table when we arc_free().
5081 */
5082 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
5083 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5084 panic("bad overwrite, hdr=%p exists=%p",
5085 (void *)hdr, (void *)exists);
5086 ASSERT(refcount_is_zero(
5087 &exists->b_l1hdr.b_refcnt));
5088 arc_change_state(arc_anon, exists, hash_lock);
5089 mutex_exit(hash_lock);
5090 arc_hdr_destroy(exists);
5091 exists = buf_hash_insert(hdr, &hash_lock);
5092 ASSERT3P(exists, ==, NULL);
5093 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) {
5094 /* nopwrite */
5095 ASSERT(zio->io_prop.zp_nopwrite);
5096 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp))
5097 panic("bad nopwrite, hdr=%p exists=%p",
5098 (void *)hdr, (void *)exists);
5099 } else {
5100 /* Dedup */
5101 ASSERT(hdr->b_l1hdr.b_datacnt == 1);
5102 ASSERT(hdr->b_l1hdr.b_state == arc_anon);
5103 ASSERT(BP_GET_DEDUP(zio->io_bp));
5104 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0);
5105 }
5106 }
5107 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5108 /* if it's not anon, we are doing a scrub */
5109 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon)
5110 arc_access(hdr, hash_lock);
5111 mutex_exit(hash_lock);
5112 } else {
5113 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS;
5114 }
5115
5116 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
5117 callback->awcb_done(zio, buf, callback->awcb_private);
5118
5119 kmem_free(callback, sizeof (arc_write_callback_t));
5120 }
5121
5122 zio_t *
5123 arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
5124 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress,
5125 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone,
5126 arc_done_func_t *done, void *private, zio_priority_t priority,
5127 int zio_flags, const zbookmark_phys_t *zb)
5128 {
5129 arc_buf_hdr_t *hdr = buf->b_hdr;
5130 arc_write_callback_t *callback;
5131 zio_t *zio;
5132
5133 ASSERT(ready != NULL);
5134 ASSERT(done != NULL);
5135 ASSERT(!HDR_IO_ERROR(hdr));
5136 ASSERT(!HDR_IO_IN_PROGRESS(hdr));
5137 ASSERT(hdr->b_l1hdr.b_acb == NULL);
5138 ASSERT(hdr->b_l1hdr.b_datacnt > 0);
5139 if (l2arc)
5140 hdr->b_flags |= ARC_FLAG_L2CACHE;
5141 if (l2arc_compress)
5142 hdr->b_flags |= ARC_FLAG_L2COMPRESS;
5143 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
5144 callback->awcb_ready = ready;
5145 callback->awcb_physdone = physdone;
5146 callback->awcb_done = done;
5147 callback->awcb_private = private;
5148 callback->awcb_buf = buf;
5149
5150 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp,
5151 arc_write_ready, arc_write_physdone, arc_write_done, callback,
5152 priority, zio_flags, zb);
5153
5154 return (zio);
5155 }
5156
5157 static int
5158 arc_memory_throttle(uint64_t reserve, uint64_t txg)
5159 {
5160 #ifdef _KERNEL
5161 uint64_t available_memory = ptob(freemem);
5162 static uint64_t page_load = 0;
5163 static uint64_t last_txg = 0;
5164
5165 #if defined(__i386)
5166 available_memory =
5167 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE));
5168 #endif
5169
5170 if (freemem > physmem * arc_lotsfree_percent / 100)
5171 return (0);
5172
5173 if (txg > last_txg) {
5174 last_txg = txg;
5175 page_load = 0;
5176 }
5177 /*
5178 	 * If we are in pageout, we know that memory is already tight
5179 	 * and the ARC is already going to be evicting, so we just want to
5180 * continue to let page writes occur as quickly as possible.
5181 */
5182 if (curproc == proc_pageout) {
5183 if (page_load > MAX(ptob(minfree), available_memory) / 4)
5184 return (SET_ERROR(ERESTART));
5185 /* Note: reserve is inflated, so we deflate */
5186 page_load += reserve / 8;
5187 return (0);
5188 } else if (page_load > 0 && arc_reclaim_needed()) {
5189 /* memory is low, delay before restarting */
5190 ARCSTAT_INCR(arcstat_memory_throttle_count, 1);
5191 return (SET_ERROR(EAGAIN));
5192 }
5193 page_load = 0;
5194 #endif
5195 return (0);
5196 }
5197
5198 void
5199 arc_tempreserve_clear(uint64_t reserve)
5200 {
5201 atomic_add_64(&arc_tempreserve, -reserve);
5202 ASSERT((int64_t)arc_tempreserve >= 0);
5203 }
5204
5205 int
5206 arc_tempreserve_space(uint64_t reserve, uint64_t txg)
5207 {
5208 int error;
5209 uint64_t anon_size;
5210
5211 if (reserve > arc_c/4 && !arc_no_grow)
5212 arc_c = MIN(arc_c_max, reserve * 4);
5213 if (reserve > arc_c)
5214 return (SET_ERROR(ENOMEM));
5215
5216 /*
5217 * Don't count loaned bufs as in flight dirty data to prevent long
5218 * network delays from blocking transactions that are ready to be
5219 * assigned to a txg.
5220 */
5221 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) -
5222 arc_loaned_bytes), 0);
5223
5224 /*
5225 * Writes will, almost always, require additional memory allocations
5226 * in order to compress/encrypt/etc the data. We therefore need to
5227 * make sure that there is sufficient available memory for this.
5228 */
5229 error = arc_memory_throttle(reserve, txg);
5230 if (error != 0)
5231 return (error);
5232
5233 /*
5234 * Throttle writes when the amount of dirty data in the cache
5235 * gets too large. We try to keep the cache less than half full
5236 * of dirty blocks so that our sync times don't grow too large.
5237 * Note: if two requests come in concurrently, we might let them
5238 * both succeed, when one of them should fail. Not a huge deal.
5239 */
5240
5241 if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
5242 anon_size > arc_c / 4) {
5243 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
5244 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
5245 arc_tempreserve>>10,
5246 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
5247 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
5248 reserve>>10, arc_c>>10);
5249 return (SET_ERROR(ERESTART));
5250 }
5251 atomic_add_64(&arc_tempreserve, reserve);
5252 return (0);
5253 }
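
/*
 * A worked example of the throttle above (illustrative numbers only, not a
 * tuning recommendation): with arc_c = 1 GB, and assuming
 * arc_memory_throttle() did not already fail, a 64 MB reserve arriving while
 * 480 MB of anonymous (dirty) data and a 16 MB tempreserve are already
 * outstanding is rejected with ERESTART, since 64 + 16 + 480 = 560 MB exceeds
 * arc_c / 2 = 512 MB and the anonymous data alone exceeds arc_c / 4 = 256 MB.
 */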
5254
5255 static void
5256 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
5257 kstat_named_t *evict_data, kstat_named_t *evict_metadata)
5258 {
5259 size->value.ui64 = refcount_count(&state->arcs_size);
5260 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
5261 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
5262 }
5263
5264 static int
5265 arc_kstat_update(kstat_t *ksp, int rw)
5266 {
5267 arc_stats_t *as = ksp->ks_data;
5268
5269 if (rw == KSTAT_WRITE) {
5270 return (EACCES);
5271 } else {
5272 arc_kstat_update_state(arc_anon,
5273 &as->arcstat_anon_size,
5274 &as->arcstat_anon_evictable_data,
5275 &as->arcstat_anon_evictable_metadata);
5276 arc_kstat_update_state(arc_mru,
5277 &as->arcstat_mru_size,
5278 &as->arcstat_mru_evictable_data,
5279 &as->arcstat_mru_evictable_metadata);
5280 arc_kstat_update_state(arc_mru_ghost,
5281 &as->arcstat_mru_ghost_size,
5282 &as->arcstat_mru_ghost_evictable_data,
5283 &as->arcstat_mru_ghost_evictable_metadata);
5284 arc_kstat_update_state(arc_mfu,
5285 &as->arcstat_mfu_size,
5286 &as->arcstat_mfu_evictable_data,
5287 &as->arcstat_mfu_evictable_metadata);
5288 arc_kstat_update_state(arc_mfu_ghost,
5289 &as->arcstat_mfu_ghost_size,
5290 &as->arcstat_mfu_ghost_evictable_data,
5291 &as->arcstat_mfu_ghost_evictable_metadata);
5292 }
5293
5294 return (0);
5295 }
5296
5297 /*
5298 * This function *must* return indices evenly distributed between all
5299 * sublists of the multilist. This is needed due to how the ARC eviction
5300 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
5301 * distributed between all sublists and uses this assumption when
5302 * deciding which sublist to evict from and how much to evict from it.
5303 */
5304 unsigned int
5305 arc_state_multilist_index_func(multilist_t *ml, void *obj)
5306 {
5307 arc_buf_hdr_t *hdr = obj;
5308
5309 /*
5310 * We rely on b_dva to generate evenly distributed index
5311 * numbers using buf_hash below. So, as an added precaution,
5312 * let's make sure we never add empty buffers to the arc lists.
5313 */
5314 ASSERT(!BUF_EMPTY(hdr));
5315
5316 /*
5317 	 * The assumption here is that the hash value for a given
5318 	 * arc_buf_hdr_t will remain constant throughout its lifetime
5319 	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
5320 * Thus, we don't need to store the header's sublist index
5321 * on insertion, as this index can be recalculated on removal.
5322 *
5323 * Also, the low order bits of the hash value are thought to be
5324 * distributed evenly. Otherwise, in the case that the multilist
5325 	 * has a power of two number of sublists, each sublist's usage
5326 * would not be evenly distributed.
5327 */
5328 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
5329 multilist_get_num_sublists(ml));
5330 }
5331
5332 void
5333 arc_init(void)
5334 {
5335 /*
5336 * allmem is "all memory that we could possibly use".
5337 */
5338 #ifdef _KERNEL
5339 uint64_t allmem = ptob(physmem - swapfs_minfree);
5340 #else
5341 uint64_t allmem = (physmem * PAGESIZE) / 2;
5342 #endif
5343
5344 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
5345 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL);
5346 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
5347
5348 mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL);
5349 cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL);
5350
5351 /* Convert seconds to clock ticks */
5352 arc_min_prefetch_lifespan = 1 * hz;
5353
5354 /* Start out with 1/8 of all memory */
5355 arc_c = allmem / 8;
5356
5357 #ifdef _KERNEL
5358 /*
5359 * On architectures where the physical memory can be larger
5360 * than the addressable space (intel in 32-bit mode), we may
5361 * need to limit the cache to 1/8 of VM size.
5362 */
5363 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
5364 #endif
5365
5366 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */
5367 arc_c_min = MAX(allmem / 32, 64 << 20);
5368 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */
5369 if (allmem >= 1 << 30)
5370 arc_c_max = allmem - (1 << 30);
5371 else
5372 arc_c_max = arc_c_min;
5373 arc_c_max = MAX(allmem * 3 / 4, arc_c_max);
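	/*
	 * A worked example of the calculation above (hypothetical machine,
	 * illustrative numbers only): with allmem = 16 GB,
	 * arc_c_min = MAX(16 GB / 32, 64 MB) = 512 MB and
	 * arc_c_max = MAX(16 GB * 3 / 4, 16 GB - 1 GB) = MAX(12 GB, 15 GB)
	 * = 15 GB, before any of the tunable overrides below are applied.
	 */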
5374
5375 /*
5376 * In userland, there's only the memory pressure that we artificially
5377 * create (see arc_available_memory()). Don't let arc_c get too
5378 * small, because it can cause transactions to be larger than
5379 * arc_c, causing arc_tempreserve_space() to fail.
5380 */
5381 #ifndef _KERNEL
5382 arc_c_min = arc_c_max / 2;
5383 #endif
5384
5385 /*
5386 * Allow the tunables to override our calculations if they are
5387 	 * reasonable (i.e. over 64MB)
5388 */
5389 if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem)
5390 arc_c_max = zfs_arc_max;
5391 if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max)
5392 arc_c_min = zfs_arc_min;
5393
5394 arc_c = arc_c_max;
5395 arc_p = (arc_c >> 1);
5396
5397 /* limit meta-data to 1/4 of the arc capacity */
5398 arc_meta_limit = arc_c_max / 4;
5399
5400 /* Allow the tunable to override if it is reasonable */
5401 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
5402 arc_meta_limit = zfs_arc_meta_limit;
5403
5404 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
5405 arc_c_min = arc_meta_limit / 2;
5406
5407 if (zfs_arc_meta_min > 0) {
5408 arc_meta_min = zfs_arc_meta_min;
5409 } else {
5410 arc_meta_min = arc_c_min / 2;
5411 }
5412
5413 if (zfs_arc_grow_retry > 0)
5414 arc_grow_retry = zfs_arc_grow_retry;
5415
5416 if (zfs_arc_shrink_shift > 0)
5417 arc_shrink_shift = zfs_arc_shrink_shift;
5418
5419 /*
5420 * Ensure that arc_no_grow_shift is less than arc_shrink_shift.
5421 */
5422 if (arc_no_grow_shift >= arc_shrink_shift)
5423 arc_no_grow_shift = arc_shrink_shift - 1;
5424
5425 if (zfs_arc_p_min_shift > 0)
5426 arc_p_min_shift = zfs_arc_p_min_shift;
5427
5428 if (zfs_arc_num_sublists_per_state < 1)
5429 zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1);
5430
5431 	/* if kmem_flags are set, let's try to use less memory */
5432 if (kmem_debugging())
5433 arc_c = arc_c / 2;
5434 if (arc_c < arc_c_min)
5435 arc_c = arc_c_min;
5436
5437 arc_anon = &ARC_anon;
5438 arc_mru = &ARC_mru;
5439 arc_mru_ghost = &ARC_mru_ghost;
5440 arc_mfu = &ARC_mfu;
5441 arc_mfu_ghost = &ARC_mfu_ghost;
5442 arc_l2c_only = &ARC_l2c_only;
5443 arc_size = 0;
5444
5445 multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
5446 sizeof (arc_buf_hdr_t),
5447 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5448 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5449 multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
5450 sizeof (arc_buf_hdr_t),
5451 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5452 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5453 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
5454 sizeof (arc_buf_hdr_t),
5455 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5456 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5457 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
5458 sizeof (arc_buf_hdr_t),
5459 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5460 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5461 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
5462 sizeof (arc_buf_hdr_t),
5463 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5464 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5465 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
5466 sizeof (arc_buf_hdr_t),
5467 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5468 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5469 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
5470 sizeof (arc_buf_hdr_t),
5471 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5472 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5473 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
5474 sizeof (arc_buf_hdr_t),
5475 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5476 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5477 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
5478 sizeof (arc_buf_hdr_t),
5479 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5480 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5481 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
5482 sizeof (arc_buf_hdr_t),
5483 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
5484 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func);
5485
5486 refcount_create(&arc_anon->arcs_size);
5487 refcount_create(&arc_mru->arcs_size);
5488 refcount_create(&arc_mru_ghost->arcs_size);
5489 refcount_create(&arc_mfu->arcs_size);
5490 refcount_create(&arc_mfu_ghost->arcs_size);
5491 refcount_create(&arc_l2c_only->arcs_size);
5492
5493 buf_init();
5494
5495 arc_reclaim_thread_exit = FALSE;
5496 arc_user_evicts_thread_exit = FALSE;
5497 arc_eviction_list = NULL;
5498 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));
5499
5500 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
5501 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
5502
5503 if (arc_ksp != NULL) {
5504 arc_ksp->ks_data = &arc_stats;
5505 arc_ksp->ks_update = arc_kstat_update;
5506 kstat_install(arc_ksp);
5507 }
5508
5509 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
5510 TS_RUN, minclsyspri);
5511
5512 (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0,
5513 TS_RUN, minclsyspri);
5514
5515 arc_dead = FALSE;
5516 arc_warm = B_FALSE;
5517
5518 /*
5519 * Calculate maximum amount of dirty data per pool.
5520 *
5521 * If it has been set by /etc/system, take that.
5522 * Otherwise, use a percentage of physical memory defined by
5523 * zfs_dirty_data_max_percent (default 10%) with a cap at
5524 * zfs_dirty_data_max_max (default 4GB).
5525 */
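	/*
	 * A worked example (hypothetical machine, illustrative numbers only):
	 * with 16 GB of physical memory, 10% is about 1.6 GB, which is under
	 * the 4 GB cap, so zfs_dirty_data_max would default to roughly 1.6 GB.
	 */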
5526 if (zfs_dirty_data_max == 0) {
5527 zfs_dirty_data_max = physmem * PAGESIZE *
5528 zfs_dirty_data_max_percent / 100;
5529 zfs_dirty_data_max = MIN(zfs_dirty_data_max,
5530 zfs_dirty_data_max_max);
5531 }
5532 }
5533
5534 void
5535 arc_fini(void)
5536 {
5537 mutex_enter(&arc_reclaim_lock);
5538 arc_reclaim_thread_exit = TRUE;
5539 /*
5540 * The reclaim thread will set arc_reclaim_thread_exit back to
5541 * FALSE when it is finished exiting; we're waiting for that.
5542 */
5543 while (arc_reclaim_thread_exit) {
5544 cv_signal(&arc_reclaim_thread_cv);
5545 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock);
5546 }
5547 mutex_exit(&arc_reclaim_lock);
5548
5549 mutex_enter(&arc_user_evicts_lock);
5550 arc_user_evicts_thread_exit = TRUE;
5551 /*
5552 * The user evicts thread will set arc_user_evicts_thread_exit
5553 * to FALSE when it is finished exiting; we're waiting for that.
5554 */
5555 while (arc_user_evicts_thread_exit) {
5556 cv_signal(&arc_user_evicts_cv);
5557 cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock);
5558 }
5559 mutex_exit(&arc_user_evicts_lock);
5560
5561 /* Use TRUE to ensure *all* buffers are evicted */
5562 arc_flush(NULL, TRUE);
5563
5564 arc_dead = TRUE;
5565
5566 if (arc_ksp != NULL) {
5567 kstat_delete(arc_ksp);
5568 arc_ksp = NULL;
5569 }
5570
5571 mutex_destroy(&arc_reclaim_lock);
5572 cv_destroy(&arc_reclaim_thread_cv);
5573 cv_destroy(&arc_reclaim_waiters_cv);
5574
5575 mutex_destroy(&arc_user_evicts_lock);
5576 cv_destroy(&arc_user_evicts_cv);
5577
5578 refcount_destroy(&arc_anon->arcs_size);
5579 refcount_destroy(&arc_mru->arcs_size);
5580 refcount_destroy(&arc_mru_ghost->arcs_size);
5581 refcount_destroy(&arc_mfu->arcs_size);
5582 refcount_destroy(&arc_mfu_ghost->arcs_size);
5583 refcount_destroy(&arc_l2c_only->arcs_size);
5584
5585 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
5586 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
5587 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
5588 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
5589 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]);
5590 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
5591 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
5592 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
5593 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);
5594 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]);
5595
5596 buf_fini();
5597
5598 ASSERT0(arc_loaned_bytes);
5599 }
5600
5601 /*
5602 * Level 2 ARC
5603 *
5604 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
5605 * It uses dedicated storage devices to hold cached data, which are populated
5606 * using large infrequent writes. The main role of this cache is to boost
5607 * the performance of random read workloads. The intended L2ARC devices
5608 * include short-stroked disks, solid state disks, and other media with
5609 * substantially faster read latency than disk.
5610 *
5611 * +-----------------------+
5612 * | ARC |
5613 * +-----------------------+
5614 * | ^ ^
5615 * | | |
5616 * l2arc_feed_thread() arc_read()
5617 * | | |
5618 * | l2arc read |
5619 * V | |
5620 * +---------------+ |
5621 * | L2ARC | |
5622 * +---------------+ |
5623 * | ^ |
5624 * l2arc_write() | |
5625 * | | |
5626 * V | |
5627 * +-------+ +-------+
5628 * | vdev | | vdev |
5629 * | cache | | cache |
5630 * +-------+ +-------+
5631 * +=========+ .-----.
5632 * : L2ARC : |-_____-|
5633 * : devices : | Disks |
5634 * +=========+ `-_____-'
5635 *
5636 * Read requests are satisfied from the following sources, in order:
5637 *
5638 * 1) ARC
5639 * 2) vdev cache of L2ARC devices
5640 * 3) L2ARC devices
5641 * 4) vdev cache of disks
5642 * 5) disks
5643 *
5644 * Some L2ARC device types exhibit extremely slow write performance.
5645  * To accommodate this there are some significant differences between
5646 * the L2ARC and traditional cache design:
5647 *
5648 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
5649 * the ARC behave as usual, freeing buffers and placing headers on ghost
5650 * lists. The ARC does not send buffers to the L2ARC during eviction as
5651 * this would add inflated write latencies for all ARC memory pressure.
5652 *
5653 * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
5654 * It does this by periodically scanning buffers from the eviction-end of
5655 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
5656 * not already there. It scans until a headroom of buffers is satisfied,
5657 * which itself is a buffer for ARC eviction. If a compressible buffer is
5658 * found during scanning and selected for writing to an L2ARC device, we
5659 * temporarily boost scanning headroom during the next scan cycle to make
5660 * sure we adapt to compression effects (which might significantly reduce
5661 * the data volume we write to L2ARC). The thread that does this is
5662 * l2arc_feed_thread(), illustrated below; example sizes are included to
5663 * provide a better sense of ratio than this diagram:
5664 *
5665 * head --> tail
5666 * +---------------------+----------+
5667 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
5668 * +---------------------+----------+ | o L2ARC eligible
5669 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
5670 * +---------------------+----------+ |
5671 * 15.9 Gbytes ^ 32 Mbytes |
5672 * headroom |
5673 * l2arc_feed_thread()
5674 * |
5675 * l2arc write hand <--[oooo]--'
5676 * | 8 Mbyte
5677 * | write max
5678 * V
5679 * +==============================+
5680 * L2ARC dev |####|#|###|###| |####| ... |
5681 * +==============================+
5682 * 32 Gbytes
5683 *
5684 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
5685 * evicted, then the L2ARC has cached a buffer much sooner than it probably
5686 * needed to, potentially wasting L2ARC device bandwidth and storage. It is
5687 * safe to say that this is an uncommon case, since buffers at the end of
5688 * the ARC lists have moved there due to inactivity.
5689 *
5690 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
5691 * then the L2ARC simply misses copying some buffers. This serves as a
5692 * pressure valve to prevent heavy read workloads from both stalling the ARC
5693 * with waits and clogging the L2ARC with writes. This also helps prevent
5694 * the potential for the L2ARC to churn if it attempts to cache content too
5695 * quickly, such as during backups of the entire pool.
5696 *
5697 * 5. After system boot and before the ARC has filled main memory, there are
5698 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
5699  * lists can remain mostly static. Instead of searching from the tail of these
5700 * lists as pictured, the l2arc_feed_thread() will search from the list heads
5701 * for eligible buffers, greatly increasing its chance of finding them.
5702 *
5703 * The L2ARC device write speed is also boosted during this time so that
5704 * the L2ARC warms up faster. Since there have been no ARC evictions yet,
5705 * there are no L2ARC reads, and no fear of degrading read performance
5706 * through increased writes.
5707 *
5708 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
5709 * the vdev queue can aggregate them into larger and fewer writes. Each
5710 * device is written to in a rotor fashion, sweeping writes through
5711 * available space then repeating.
5712 *
5713 * 7. The L2ARC does not store dirty content. It never needs to flush
5714 * write buffers back to disk based storage.
5715 *
5716 * 8. If an ARC buffer is written (and dirtied) which also exists in the
5717 * L2ARC, the now stale L2ARC buffer is immediately dropped.
5718 *
5719 * The performance of the L2ARC can be tweaked by a number of tunables, which
5720 * may be necessary for different workloads:
5721 *
5722 * l2arc_write_max max write bytes per interval
5723 * l2arc_write_boost extra write bytes during device warmup
5724 * l2arc_noprefetch skip caching prefetched buffers
5725 * l2arc_headroom number of max device writes to precache
5726 * l2arc_headroom_boost when we find compressed buffers during ARC
5727 * scanning, we multiply headroom by this
5728 * percentage factor for the next scan cycle,
5729 * since more compressed buffers are likely to
5730 * be present
5731 * l2arc_feed_secs seconds between L2ARC writing
5732 *
5733 * Tunables may be removed or added as future performance improvements are
5734 * integrated, and also may become zpool properties.
5735 *
5736 * There are three key functions that control how the L2ARC warms up:
5737 *
5738 * l2arc_write_eligible() check if a buffer is eligible to cache
5739 * l2arc_write_size() calculate how much to write
5740 * l2arc_write_interval() calculate sleep delay between writes
5741 *
5742 * These three functions determine what to write, how much, and how quickly
5743 * to send writes.
5744 *
5745 * L2ARC persistency:
5746 *
5747 * When writing buffers to L2ARC, we periodically add some metadata to
5748 * make sure we can pick them up after reboot, thus dramatically reducing
5749 * the impact that any downtime has on the performance of storage systems
5750 * with large caches.
5751 *
5752 * The implementation works fairly simply by integrating the following two
5753 * modifications:
5754 *
5755 * *) Every now and then we mix in a piece of metadata (called a log block)
5756 * into the L2ARC write. This allows us to understand what's been written,
5757 * so that we can rebuild the arc_buf_hdr_t structures of the main ARC
5758 * buffers. The log block also includes a "2-back-reference" pointer to
5759  * the second-to-previous block, forming a back-linked list of blocks on
5760 * the L2ARC device.
5761 *
5762 * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
5763 * for our header bookkeeping purposes. This contains a device header,
5764 * which contains our top-level reference structures. We update it each
5765 * time we write a new log block, so that we're able to locate it in the
5766 * L2ARC device. If this write results in an inconsistent device header
5767 * (e.g. due to power failure), we detect this by verifying the header's
5768 * checksum and simply drop the entries from L2ARC.
5769 *
5770 * Implementation diagram:
5771 *
5772 * +=== L2ARC device (not to scale) ======================================+
5773 * | ___two newest log block pointers__.__________ |
5774 * | / \1 back \latest |
5775 * |.____/_. V V |
5776 * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
5777 * || hdr| ^ /^ /^ / / |
5778 * |+------+ ...--\-------/ \-----/--\------/ / |
5779 * | \--------------/ \--------------/ |
5780 * +======================================================================+
5781 *
5782  * As can be seen in the diagram, rather than using a simple linked
5783  * list, we use a pair of linked lists with alternating elements. This
5784  * is a performance enhancement: with a single list we would only learn
5785  * the address of the next log block once the current block had been
5786  * completely read in, which would keep the device's I/O queue at only
5787  * one operation deep and incur a large amount of I/O round-trip
5788  * latency. Having two lists allows us to "prefetch" two log blocks
5789  * ahead of the block whose L2ARC buffers we are currently
5790  * rebuilding.
5791 *
5792 * On-device data structures:
5793 *
5794 * L2ARC device header: l2arc_dev_hdr_phys_t
5795 * L2ARC log block: l2arc_log_blk_phys_t
5796 *
5797 * L2ARC reconstruction:
5798 *
5799 * When writing data, we simply write in the standard rotary fashion,
5800 * evicting buffers as we go and simply writing new data over them (writing
5801 * a new log block every now and then). This obviously means that once we
5802 * loop around the end of the device, we will start cutting into an already
5803 * committed log block (and its referenced data buffers), like so:
5804 *
5805 * current write head__ __old tail
5806 * \ /
5807 * V V
5808 * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |-->
5809 * ^ ^^^^^^^^^___________________________________
5810 * | \
5811 * <<nextwrite>> may overwrite this blk and/or its bufs --'
5812 *
5813 * When importing the pool, we detect this situation and use it to stop
5814 * our scanning process (see l2arc_rebuild).
5815 *
5816 * There is one significant caveat to consider when rebuilding ARC contents
5817 * from an L2ARC device: what about invalidated buffers? Given the above
5818 * construction, we cannot update blocks which we've already written to amend
5819 * them to remove buffers which were invalidated. Thus, during reconstruction,
5820 * we might be populating the cache with buffers for data that's not on the
5821 * main pool anymore, or may have been overwritten!
5822 *
5823 * As it turns out, this isn't a problem. Every arc_read request includes
5824 * both the DVA and, crucially, the birth TXG of the BP the caller is
5825 * looking for. So even if the cache were populated by completely rotten
5826 * blocks for data that had been long deleted and/or overwritten, we'll
5827  * never actually return bad data from the cache, since the DVA together
5828  * with the birth TXG uniquely identifies a block in space and time - once
5829  * created, a block is immutable on disk. The worst we will have done is
5830  * waste some time and memory at l2arc rebuild to reconstruct outdated ARC
5831 * entries that will get dropped from the l2arc as it is being updated
5832 * with new blocks.
5833 */
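
/*
 * Illustrative sketch of how the alternating log block lists described above
 * allow prefetching during reconstruction. This is not the actual
 * l2arc_rebuild() code; the names lbp[] and "2-back pointer" below are
 * hypothetical placeholders:
 *
 *	lbp[0] = newest log block pointer, lbp[1] = second newest
 *	    (both taken from the device header); issue async reads of both;
 *	while (lbp[0] is valid) {
 *		wait for lbp[0]'s read and restore its ARC headers;
 *		next = lbp[0]'s 2-back pointer;
 *		issue an async read of next;
 *		lbp[0] = lbp[1]; lbp[1] = next;
 *	}
 *
 * At any point one log block is being processed while the next is already
 * in flight, rather than stalling on a single read at a time.
 */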
5834
5835 static boolean_t
5836 l2arc_write_eligible(uint64_t spa_guid, uint64_t sync_txg, arc_buf_hdr_t *hdr)
5837 {
5838 /*
5839 * A buffer is *not* eligible for the L2ARC if it:
5840 * 1. belongs to a different spa.
5841 * 2. is already cached on the L2ARC.
5842 * 3. has an I/O in progress (it may be an incomplete read).
5843 * 4. is flagged not eligible (zfs property).
5844 * 5. is part of the syncing txg (and thus subject to change).
5845 */
5846 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
5847 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr) ||
5848 hdr->b_birth >= sync_txg)
5849 return (B_FALSE);
5850
5851 return (B_TRUE);
5852 }
5853
5854 static uint64_t
5855 l2arc_write_size(void)
5856 {
5857 uint64_t size;
5858
5859 /*
5860 * Make sure our globals have meaningful values in case the user
5861 * altered them.
5862 */
5863 size = l2arc_write_max;
5864 if (size == 0) {
5865 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must "
5866 "be greater than zero, resetting it to the default (%d)",
5867 L2ARC_WRITE_SIZE);
5868 size = l2arc_write_max = L2ARC_WRITE_SIZE;
5869 }
5870
5871 if (arc_warm == B_FALSE)
5872 size += l2arc_write_boost;
5873
5874 return (size);
5875
5876 }
5877
5878 static clock_t
5879 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote)
5880 {
5881 clock_t interval, next, now;
5882
5883 /*
5884 * If the ARC lists are busy, increase our write rate; if the
5885 * lists are stale, idle back. This is achieved by checking
5886 * how much we previously wrote - if it was more than half of
5887 * what we wanted, schedule the next write much sooner.
5888 */
5889 if (l2arc_feed_again && wrote > (wanted / 2))
5890 interval = (hz * l2arc_feed_min_ms) / 1000;
5891 else
5892 interval = hz * l2arc_feed_secs;
5893
5894 now = ddi_get_lbolt();
5895 next = MAX(now, MIN(now + interval, began + interval));
5896
5897 return (next);
5898 }
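
/*
 * For example, with l2arc_feed_again left enabled and the usual defaults of
 * l2arc_feed_secs = 1 and l2arc_feed_min_ms = 200 (illustrative values; they
 * may have been tuned differently), a feed cycle that wrote more than half of
 * what it wanted schedules the next write for about 200 ms after the current
 * cycle began (or immediately, if the write itself took longer than that),
 * while a cycle that found little to write waits a full second.
 */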
5899
5900 /*
5901 * Cycle through L2ARC devices. This is how L2ARC load balances.
5902 * If a device is returned, this also returns holding the spa config lock.
5903 */
5904 static l2arc_dev_t *
5905 l2arc_dev_get_next(void)
5906 {
5907 l2arc_dev_t *first, *next = NULL;
5908
5909 /*
5910 * Lock out the removal of spas (spa_namespace_lock), then removal
5911 * of cache devices (l2arc_dev_mtx). Once a device has been selected,
5912 * both locks will be dropped and a spa config lock held instead.
5913 */
5914 mutex_enter(&spa_namespace_lock);
5915 mutex_enter(&l2arc_dev_mtx);
5916
5917 /* if there are no vdevs, there is nothing to do */
5918 if (l2arc_ndev == 0)
5919 goto out;
5920
5921 first = NULL;
5922 next = l2arc_dev_last;
5923 do {
5924 /* loop around the list looking for a non-faulted vdev */
5925 if (next == NULL) {
5926 next = list_head(l2arc_dev_list);
5927 } else {
5928 next = list_next(l2arc_dev_list, next);
5929 if (next == NULL)
5930 next = list_head(l2arc_dev_list);
5931 }
5932
5933 /* if we have come back to the start, bail out */
5934 if (first == NULL)
5935 first = next;
5936 else if (next == first)
5937 break;
5938
5939 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild);
5940
5941 /* if we were unable to find any usable vdevs, return NULL */
5942 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild)
5943 next = NULL;
5944
5945 l2arc_dev_last = next;
5946
5947 out:
5948 mutex_exit(&l2arc_dev_mtx);
5949
5950 /*
5951 * Grab the config lock to prevent the 'next' device from being
5952 * removed while we are writing to it.
5953 */
5954 if (next != NULL)
5955 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER);
5956 mutex_exit(&spa_namespace_lock);
5957
5958 return (next);
5959 }
5960
5961 /*
5962 * Free buffers that were tagged for destruction.
5963 */
5964 static void
5965 l2arc_do_free_on_write()
5966 {
5967 list_t *buflist;
5968 l2arc_data_free_t *df, *df_prev;
5969
5970 mutex_enter(&l2arc_free_on_write_mtx);
5971 buflist = l2arc_free_on_write;
5972
5973 for (df = list_tail(buflist); df; df = df_prev) {
5974 df_prev = list_prev(buflist, df);
5975 ASSERT(df->l2df_data != NULL);
5976 ASSERT(df->l2df_func != NULL);
5977 df->l2df_func(df->l2df_data, df->l2df_size);
5978 list_remove(buflist, df);
5979 kmem_free(df, sizeof (l2arc_data_free_t));
5980 }
5981
5982 mutex_exit(&l2arc_free_on_write_mtx);
5983 }
5984
5985 /*
5986 * A write to a cache device has completed. Update all headers to allow
5987 * reads from these buffers to begin.
5988 */
5989 static void
5990 l2arc_write_done(zio_t *zio)
5991 {
5992 l2arc_write_callback_t *cb;
5993 l2arc_dev_t *dev;
5994 list_t *buflist;
5995 arc_buf_hdr_t *head, *hdr, *hdr_prev;
5996 kmutex_t *hash_lock;
5997 int64_t bytes_dropped = 0;
5998 l2arc_log_blk_buf_t *lb_buf;
5999
6000 cb = zio->io_private;
6001 ASSERT(cb != NULL);
6002 dev = cb->l2wcb_dev;
6003 ASSERT(dev != NULL);
6004 head = cb->l2wcb_head;
6005 ASSERT(head != NULL);
6006 buflist = &dev->l2ad_buflist;
6007 ASSERT(buflist != NULL);
6008 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
6009 l2arc_write_callback_t *, cb);
6010
6011 if (zio->io_error != 0)
6012 ARCSTAT_BUMP(arcstat_l2_writes_error);
6013
6014 /*
6015 * All writes completed, or an error was hit.
6016 */
6017 top:
6018 mutex_enter(&dev->l2ad_mtx);
6019 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
6020 hdr_prev = list_prev(buflist, hdr);
6021
6022 hash_lock = HDR_LOCK(hdr);
6023
6024 /*
6025 * We cannot use mutex_enter or else we can deadlock
6026 * with l2arc_write_buffers (due to swapping the order
6027 * the hash lock and l2ad_mtx are taken).
6028 */
6029 if (!mutex_tryenter(hash_lock)) {
6030 /*
6031 * Missed the hash lock. We must retry so we
6032 * don't leave the ARC_FLAG_L2_WRITING bit set.
6033 */
6034 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
6035
6036 /*
6037 * We don't want to rescan the headers we've
6038 * already marked as having been written out, so
6039 * we reinsert the head node so we can pick up
6040 * where we left off.
6041 */
6042 list_remove(buflist, head);
6043 list_insert_after(buflist, hdr, head);
6044
6045 mutex_exit(&dev->l2ad_mtx);
6046
6047 /*
6048 * We wait for the hash lock to become available
6049 		 * to try to prevent busy waiting, and increase
6050 * the chance we'll be able to acquire the lock
6051 * the next time around.
6052 */
6053 mutex_enter(hash_lock);
6054 mutex_exit(hash_lock);
6055 goto top;
6056 }
6057
6058 /*
6059 * We could not have been moved into the arc_l2c_only
6060 * state while in-flight due to our ARC_FLAG_L2_WRITING
6061 * bit being set. Let's just ensure that's being enforced.
6062 */
6063 ASSERT(HDR_HAS_L1HDR(hdr));
6064
6065 /*
6066 		 * We may have allocated a buffer for L2ARC compression;
6067 		 * if so, we must release it to avoid leaking this data.
6068 */
6069 l2arc_release_cdata_buf(hdr);
6070
6071 if (zio->io_error != 0) {
6072 /*
6073 * Error - drop L2ARC entry.
6074 */
6075 list_remove(buflist, hdr);
6076 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
6077
6078 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize);
6079 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
6080
6081 bytes_dropped += hdr->b_l2hdr.b_asize;
6082 (void) refcount_remove_many(&dev->l2ad_alloc,
6083 hdr->b_l2hdr.b_asize, hdr);
6084 }
6085
6086 /*
6087 * Allow ARC to begin reads and ghost list evictions to
6088 * this L2ARC entry.
6089 */
6090 hdr->b_flags &= ~ARC_FLAG_L2_WRITING;
6091
6092 mutex_exit(hash_lock);
6093 }
6094
6095 atomic_inc_64(&l2arc_writes_done);
6096 list_remove(buflist, head);
6097 ASSERT(!HDR_HAS_L1HDR(head));
6098 kmem_cache_free(hdr_l2only_cache, head);
6099 mutex_exit(&dev->l2ad_mtx);
6100
6101 ASSERT(dev->l2ad_vdev != NULL);
6102 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
6103
6104 l2arc_do_free_on_write();
6105
6106 while ((lb_buf = list_remove_tail(&cb->l2wcb_log_blk_buflist)) != NULL)
6107 kmem_free(lb_buf, sizeof (*lb_buf));
6108 list_destroy(&cb->l2wcb_log_blk_buflist);
6109 kmem_free(cb, sizeof (l2arc_write_callback_t));
6110 }
6111
6112 /*
6113 * A read to a cache device completed. Validate buffer contents before
6114 * handing over to the regular ARC routines.
6115 */
6116 static void
6117 l2arc_read_done(zio_t *zio)
6118 {
6119 l2arc_read_callback_t *cb;
6120 arc_buf_hdr_t *hdr;
6121 arc_buf_t *buf;
6122 kmutex_t *hash_lock;
6123 int equal;
6124
6125 ASSERT(zio->io_vd != NULL);
6126 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE);
6127
6128 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd);
6129
6130 cb = zio->io_private;
6131 ASSERT(cb != NULL);
6132 buf = cb->l2rcb_buf;
6133 ASSERT(buf != NULL);
6134
6135 hash_lock = HDR_LOCK(buf->b_hdr);
6136 mutex_enter(hash_lock);
6137 hdr = buf->b_hdr;
6138 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
6139
6140 /*
6141 * If the buffer was compressed, decompress it first.
6142 */
6143 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF)
6144 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress);
6145 ASSERT(zio->io_data != NULL);
6146 ASSERT3U(zio->io_size, ==, hdr->b_size);
6147 ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size);
6148
6149 /*
6150 * Check this survived the L2ARC journey.
6151 */
6152 equal = arc_cksum_equal(buf);
6153 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
6154 mutex_exit(hash_lock);
6155 zio->io_private = buf;
6156 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */
6157 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */
6158 arc_read_done(zio);
6159 } else {
6160 mutex_exit(hash_lock);
6161 /*
6162 * Buffer didn't survive caching. Increment stats and
6163 * reissue to the original storage device.
6164 */
6165 if (zio->io_error != 0) {
6166 ARCSTAT_BUMP(arcstat_l2_io_error);
6167 } else {
6168 zio->io_error = SET_ERROR(EIO);
6169 }
6170 if (!equal)
6171 ARCSTAT_BUMP(arcstat_l2_cksum_bad);
6172
6173 /*
6174 * If there's no waiter, issue an async i/o to the primary
6175 * storage now. If there *is* a waiter, the caller must
6176 * issue the i/o in a context where it's OK to block.
6177 */
6178 if (zio->io_waiter == NULL) {
6179 zio_t *pio = zio_unique_parent(zio);
6180
6181 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL);
6182
6183 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp,
6184 buf->b_data, hdr->b_size, arc_read_done, buf,
6185 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb));
6186 }
6187 }
6188
6189 kmem_free(cb, sizeof (l2arc_read_callback_t));
6190 }
6191
6192 /*
6193 * This is the list priority from which the L2ARC will search for pages to
6194 * cache. This is used within loops (0..3) to cycle through lists in the
6195 * desired order. This order can have a significant effect on cache
6196 * performance.
6197 *
6198 * Currently the metadata lists are hit first, MFU then MRU, followed by
6199 * the data lists. This function returns a locked list, and also returns
6200 * the lock pointer.
6201 */
6202 static multilist_sublist_t *
6203 l2arc_sublist_lock(int list_num)
6204 {
6205 multilist_t *ml = NULL;
6206 unsigned int idx;
6207
6208 ASSERT(list_num >= 0 && list_num <= 3);
6209
6210 switch (list_num) {
6211 case 0:
6212 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
6213 break;
6214 case 1:
6215 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA];
6216 break;
6217 case 2:
6218 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA];
6219 break;
6220 case 3:
6221 ml = &arc_mru->arcs_list[ARC_BUFC_DATA];
6222 break;
6223 }
6224
6225 /*
6226 * Return a randomly-selected sublist. This is acceptable
6227 * because the caller feeds only a little bit of data for each
6228 * call (8MB). Subsequent calls will result in different
6229 * sublists being selected.
6230 */
6231 idx = multilist_get_random_index(ml);
6232 return (multilist_sublist_lock(ml, idx));
6233 }
6234
6235 /*
6236 * Calculates the maximum overhead of L2ARC metadata log blocks for a given
6237 * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this
6238 * overhead in processing to make sure there is enough headroom available
6239 * when writing buffers.
6240 */
6241 static inline uint64_t
6242 l2arc_log_blk_overhead(uint64_t write_sz)
6243 {
6244 return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) *
6245 L2ARC_LOG_BLK_SIZE;
6246 }
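
/*
 * For example, an 8 MB write can span at most 8 MB / SPA_MINBLOCKSIZE
 * (16384 with 512-byte minimum blocks) buffers; budgeting one log block per
 * L2ARC_LOG_BLK_ENTRIES of those, plus one more for a partially filled log
 * block, and multiplying by L2ARC_LOG_BLK_SIZE gives the worst-case metadata
 * bytes that must be accounted for on top of the 8 MB of buffer data.
 */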
6247
6248 /*
6249 * Evict buffers from the device write hand to the distance specified in
6250 * bytes. This distance may span populated buffers, it may span nothing.
6251 * This is clearing a region on the L2ARC device ready for writing.
6252 * If the 'all' boolean is set, every buffer is evicted.
6253 */
6254 static void
6255 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
6256 {
6257 list_t *buflist;
6258 arc_buf_hdr_t *hdr, *hdr_prev;
6259 kmutex_t *hash_lock;
6260 uint64_t taddr;
6261
6262 buflist = &dev->l2ad_buflist;
6263
6264 if (!all && dev->l2ad_first) {
6265 /*
6266 * This is the first sweep through the device. There is
6267 * nothing to evict.
6268 */
6269 return;
6270 }
6271
6272 /*
6273 * We need to add in the worst case scenario of log block overhead.
6274 */
6275 distance += l2arc_log_blk_overhead(distance);
6276 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) {
6277 /*
6278 * When nearing the end of the device, evict to the end
6279 * before the device write hand jumps to the start.
6280 */
6281 taddr = dev->l2ad_end;
6282 } else {
6283 taddr = dev->l2ad_hand + distance;
6284 }
6285 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
6286 uint64_t, taddr, boolean_t, all);
6287
6288 top:
6289 mutex_enter(&dev->l2ad_mtx);
6290 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
6291 hdr_prev = list_prev(buflist, hdr);
6292
6293 hash_lock = HDR_LOCK(hdr);
6294
6295 /*
6296 * We cannot use mutex_enter or else we can deadlock
6297 * with l2arc_write_buffers (due to swapping the order
6298 * the hash lock and l2ad_mtx are taken).
6299 */
6300 if (!mutex_tryenter(hash_lock)) {
6301 /*
6302 * Missed the hash lock. Retry.
6303 */
6304 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
6305 mutex_exit(&dev->l2ad_mtx);
6306 mutex_enter(hash_lock);
6307 mutex_exit(hash_lock);
6308 goto top;
6309 }
6310
6311 if (HDR_L2_WRITE_HEAD(hdr)) {
6312 /*
6313 * We hit a write head node. Leave it for
6314 * l2arc_write_done().
6315 */
6316 list_remove(buflist, hdr);
6317 mutex_exit(hash_lock);
6318 continue;
6319 }
6320
6321 if (!all && HDR_HAS_L2HDR(hdr) &&
6322 (hdr->b_l2hdr.b_daddr > taddr ||
6323 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
6324 /*
6325 * We've evicted to the target address,
6326 * or the end of the device.
6327 */
6328 mutex_exit(hash_lock);
6329 break;
6330 }
6331
6332 ASSERT(HDR_HAS_L2HDR(hdr));
6333 if (!HDR_HAS_L1HDR(hdr)) {
6334 ASSERT(!HDR_L2_READING(hdr));
6335 /*
6336 * This doesn't exist in the ARC. Destroy.
6337 * arc_hdr_destroy() will call list_remove()
6338 * and decrement arcstat_l2_size.
6339 */
6340 arc_change_state(arc_anon, hdr, hash_lock);
6341 arc_hdr_destroy(hdr);
6342 } else {
6343 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
6344 ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
6345 /*
6346 * Invalidate issued or about to be issued
6347 * reads, since we may be about to write
6348 * over this location.
6349 */
6350 if (HDR_L2_READING(hdr)) {
6351 ARCSTAT_BUMP(arcstat_l2_evict_reading);
6352 hdr->b_flags |= ARC_FLAG_L2_EVICTED;
6353 }
6354
6355 /* Ensure this header has finished being written */
6356 ASSERT(!HDR_L2_WRITING(hdr));
6357 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6358
6359 arc_hdr_l2hdr_destroy(hdr);
6360 }
6361 mutex_exit(hash_lock);
6362 }
6363 mutex_exit(&dev->l2ad_mtx);
6364 }
6365
6366 /*
6367 * Find and write ARC buffers to the L2ARC device.
6368 *
6369 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid
6370 * for reading until they have completed writing.
6371 * The headroom_boost is an in-out parameter used to maintain headroom boost
6372 * state between calls to this function.
6373 *
6374 * Returns the number of bytes actually written (which may be smaller than
6375 * the delta by which the device hand has changed due to alignment).
6376 */
6377 static uint64_t
6378 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz,
6379 boolean_t *headroom_boost)
6380 {
6381 arc_buf_hdr_t *hdr, *hdr_prev, *head;
6382 uint64_t write_asize, write_sz, headroom,
6383 buf_compress_minsz;
6384 void *buf_data;
6385 boolean_t full;
6386 l2arc_write_callback_t *cb;
6387 zio_t *pio, *wzio;
6388 uint64_t guid = spa_load_guid(spa);
6389 uint64_t sync_txg = spa_syncing_txg(spa);
6390 const boolean_t do_headroom_boost = *headroom_boost;
6391 boolean_t dev_hdr_update = B_FALSE;
6392
6393 ASSERT(dev->l2ad_vdev != NULL);
6394
6395 /* Lower the flag now, we might want to raise it again later. */
6396 *headroom_boost = B_FALSE;
6397
6398 pio = NULL;
6399 cb = NULL;
6400 write_sz = write_asize = 0;
6401 full = B_FALSE;
6402 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE);
6403 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD;
6404 head->b_flags |= ARC_FLAG_HAS_L2HDR;
6405
6406 /*
6407 * We will want to try to compress buffers that are at least 2x the
6408 * device sector size.
6409 */
6410 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift;
6411
6412 /*
6413 * Copy buffers for L2ARC writing.
6414 */
6415 for (int try = 0; try <= 3; try++) {
6416 multilist_sublist_t *mls = l2arc_sublist_lock(try);
6417 uint64_t passed_sz = 0;
6418
6419 /*
6420 * L2ARC fast warmup.
6421 *
6422 * Until the ARC is warm and starts to evict, read from the
6423 * head of the ARC lists rather than the tail.
6424 */
6425 if (arc_warm == B_FALSE)
6426 hdr = multilist_sublist_head(mls);
6427 else
6428 hdr = multilist_sublist_tail(mls);
6429
6430 headroom = target_sz * l2arc_headroom;
6431 if (do_headroom_boost)
6432 headroom = (headroom * l2arc_headroom_boost) / 100;
6433
6434 for (; hdr; hdr = hdr_prev) {
6435 kmutex_t *hash_lock;
6436 uint64_t buf_sz;
6437 uint64_t buf_a_sz;
6438
6439 if (arc_warm == B_FALSE)
6440 hdr_prev = multilist_sublist_next(mls, hdr);
6441 else
6442 hdr_prev = multilist_sublist_prev(mls, hdr);
6443
6444 hash_lock = HDR_LOCK(hdr);
6445 if (!mutex_tryenter(hash_lock)) {
6446 /*
6447 * Skip this buffer rather than waiting.
6448 */
6449 continue;
6450 }
6451
6452 passed_sz += hdr->b_size;
6453 if (passed_sz > headroom) {
6454 /*
6455 * Searched too far.
6456 */
6457 mutex_exit(hash_lock);
6458 break;
6459 }
6460
6461 if (!l2arc_write_eligible(guid, sync_txg, hdr)) {
6462 mutex_exit(hash_lock);
6463 continue;
6464 }
6465
6466 /*
6467 * Assume that the buffer is not going to be compressed
6468 * and could take more space on disk because of a larger
6469 * disk block size.
6470 */
6471 buf_sz = hdr->b_size;
6472 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6473
6474 if ((write_asize + buf_a_sz) > target_sz) {
6475 full = B_TRUE;
6476 mutex_exit(hash_lock);
6477 break;
6478 }
6479
6480 if (pio == NULL) {
6481 /*
6482 * Insert a dummy header on the buflist so
6483 * l2arc_write_done() can find where the
6484 * write buffers begin without searching.
6485 */
6486 mutex_enter(&dev->l2ad_mtx);
6487 list_insert_head(&dev->l2ad_buflist, head);
6488 mutex_exit(&dev->l2ad_mtx);
6489
6490 cb = kmem_zalloc(
6491 sizeof (l2arc_write_callback_t), KM_SLEEP);
6492 cb->l2wcb_dev = dev;
6493 cb->l2wcb_head = head;
6494 list_create(&cb->l2wcb_log_blk_buflist,
6495 sizeof (l2arc_log_blk_buf_t),
6496 offsetof(l2arc_log_blk_buf_t, lbb_node));
6497 pio = zio_root(spa, l2arc_write_done, cb,
6498 ZIO_FLAG_CANFAIL);
6499 }
6500
6501 /*
6502 * Create and add a new L2ARC header.
6503 */
6504 hdr->b_l2hdr.b_dev = dev;
6505 hdr->b_flags |= ARC_FLAG_L2_WRITING;
6506 /*
6507 * Temporarily stash the data buffer in b_tmp_cdata.
6508 * The subsequent write step will pick it up from
6509 			 * there. This is because we can't access b_l1hdr.b_buf
6510 			 * without holding the hash_lock, which in turn we
6511 			 * can't take while holding the ARC list locks
6512 * (which we want to avoid during compression/writing).
6513 */
6514 hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF;
6515 hdr->b_l2hdr.b_asize = hdr->b_size;
6516 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data;
6517
6518 /*
6519 * Explicitly set the b_daddr field to a known
6520 * value which means "invalid address". This
6521 * enables us to differentiate which stage of
6522 * l2arc_write_buffers() the particular header
6523 * is in (e.g. this loop, or the one below).
6524 * ARC_FLAG_L2_WRITING is not enough to make
6525 * this distinction, and we need to know in
6526 * order to do proper l2arc vdev accounting in
6527 * arc_release() and arc_hdr_destroy().
6528 *
6529 * Note, we can't use a new flag to distinguish
6530 * the two stages because we don't hold the
6531 * header's hash_lock below, in the second stage
6532 * of this function. Thus, we can't simply
6533 * change the b_flags field to denote that the
6534 * IO has been sent. We can change the b_daddr
6535 * field of the L2 portion, though, since we'll
6536 * be holding the l2ad_mtx; which is why we're
6537 * using it to denote the header's state change.
6538 */
6539 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET;
6540
6541 hdr->b_flags |= ARC_FLAG_HAS_L2HDR;
6542
6543 mutex_enter(&dev->l2ad_mtx);
6544 list_insert_head(&dev->l2ad_buflist, hdr);
6545 mutex_exit(&dev->l2ad_mtx);
6546
6547 /*
6548 * Compute and store the buffer cksum before
6549 * writing. On debug the cksum is verified first.
6550 */
6551 arc_cksum_verify(hdr->b_l1hdr.b_buf);
6552 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE);
6553
6554 mutex_exit(hash_lock);
6555
6556 write_sz += buf_sz;
6557 write_asize += buf_a_sz;
6558 }
6559
6560 multilist_sublist_unlock(mls);
6561
6562 if (full == B_TRUE)
6563 break;
6564 }
6565
6566 /* No buffers selected for writing? */
6567 if (pio == NULL) {
6568 ASSERT0(write_sz);
6569 ASSERT(!HDR_HAS_L1HDR(head));
6570 kmem_cache_free(hdr_l2only_cache, head);
6571 return (0);
6572 }
6573
6574 mutex_enter(&dev->l2ad_mtx);
6575
6576 /*
6577 * Note that elsewhere in this file arcstat_l2_asize
6578 * and the used space on l2ad_vdev are updated using b_asize,
6579 * which is not necessarily rounded up to the device block size.
6580 	 * To keep accounting consistent we do the same here as well:
6581 * stats_size accumulates the sum of b_asize of the written buffers,
6582 * while write_asize accumulates the sum of b_asize rounded up
6583 * to the device block size.
6584 	 * The latter sum is used only to validate the correctness of the code.
6585 */
6586 uint64_t stats_size = 0;
6587 write_asize = 0;
6588
6589 /*
6590 	 * Now start writing the buffers. We start at the write head
6591 * and work backwards, retracing the course of the buffer selector
6592 * loop above.
6593 */
6594 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr;
6595 hdr = list_prev(&dev->l2ad_buflist, hdr)) {
6596 uint64_t buf_sz;
6597
6598 /*
6599 * We rely on the L1 portion of the header below, so
6600 * it's invalid for this header to have been evicted out
6601 * of the ghost cache, prior to being written out. The
6602 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
6603 */
6604 ASSERT(HDR_HAS_L1HDR(hdr));
6605
6606 /*
6607 * We shouldn't need to lock the buffer here, since we flagged
6608 * it as ARC_FLAG_L2_WRITING in the previous step, but we must
6609 * take care to only access its L2 cache parameters. In
6610 		 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to
6611 * ARC eviction.
6612 */
6613 hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
6614
6615 if ((HDR_L2COMPRESS(hdr)) &&
6616 hdr->b_l2hdr.b_asize >= buf_compress_minsz) {
6617 if (l2arc_compress_buf(hdr)) {
6618 /*
6619 * If compression succeeded, enable headroom
6620 * boost on the next scan cycle.
6621 */
6622 *headroom_boost = B_TRUE;
6623 }
6624 }
6625
6626 /*
6627 * Pick up the buffer data we had previously stashed away
6628 * (and now potentially also compressed).
6629 */
6630 buf_data = hdr->b_l1hdr.b_tmp_cdata;
6631 buf_sz = hdr->b_l2hdr.b_asize;
6632
6633 /*
6634 		 * We need to do this regardless of whether buf_sz is zero or
6635 		 * not; otherwise, when this l2hdr is evicted we'll
6636 * remove a reference that was never added.
6637 */
6638 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr);
6639
6640 /* Compression may have squashed the buffer to zero length. */
6641 if (buf_sz != 0) {
6642 uint64_t buf_a_sz;
6643
6644 wzio = zio_write_phys(pio, dev->l2ad_vdev,
6645 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
6646 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
6647 ZIO_FLAG_CANFAIL, B_FALSE);
6648
6649 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
6650 zio_t *, wzio);
6651 (void) zio_nowait(wzio);
6652
6653 stats_size += buf_sz;
6654
6655 /*
6656 * Keep the clock hand suitably device-aligned.
6657 */
6658 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz);
6659 write_asize += buf_a_sz;
6660 dev->l2ad_hand += buf_a_sz;
6661 }
6662
6663 /*
6664 * Append buf info to current log and commit if full.
6665 * arcstat_l2_{size,asize} kstats are updated internally.
6666 */
6667 if (l2arc_log_blk_insert(dev, hdr)) {
6668 l2arc_log_blk_commit(dev, pio, cb);
6669 dev_hdr_update = B_TRUE;
6670 }
6671 }
6672
6673 mutex_exit(&dev->l2ad_mtx);
6674
6675 /*
6676 * If we wrote any logs as part of this write, update dev hdr
6677 * to point to it.
6678 */
6679 if (dev_hdr_update)
6680 l2arc_dev_hdr_update(dev, pio);
6681
6682 VERIFY3U(write_asize, <=, target_sz);
6683 ARCSTAT_BUMP(arcstat_l2_writes_sent);
6684 ARCSTAT_INCR(arcstat_l2_write_bytes, stats_size);
6685 ARCSTAT_INCR(arcstat_l2_size, write_sz);
6686 ARCSTAT_INCR(arcstat_l2_asize, stats_size);
6687 vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0);
6688
6689 /*
6690 * Bump device hand to the device start if it is approaching the end.
6691 * l2arc_evict() will already have evicted ahead for this case.
6692 */
6693 if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >=
6694 dev->l2ad_end) {
6695 dev->l2ad_hand = dev->l2ad_start;
6696 dev->l2ad_first = B_FALSE;
6697 }
6698
6699 dev->l2ad_writing = B_TRUE;
6700 (void) zio_wait(pio);
6701 dev->l2ad_writing = B_FALSE;
6702
6703 return (stats_size);
6704 }
6705
6706 /*
6707 * Compresses an L2ARC buffer.
6708 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its
6709 * size in l2hdr->b_asize. This routine tries to compress the data and
6710 * depending on the compression result there are three possible outcomes:
6711 * *) The buffer was incompressible. The original l2hdr contents were left
6712 * untouched and are ready for writing to an L2 device.
6713 * *) The buffer was all-zeros, so there is no need to write it to an L2
6714 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is
6715 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY.
6716 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary
6717 * data buffer which holds the compressed data to be written, and b_asize
6718 * tells us how much data there is. b_compress is set to the appropriate
6719 * compression algorithm. Once writing is done, invoke
6720 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer.
6721 *
6722 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the
6723 * buffer was incompressible).
6724 */
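/*
 * A minimal sketch of the calling protocol, assuming the caller shown
 * above (l2arc_write_buffers); the temporary buffer is released once
 * the write completes, via l2arc_release_cdata_buf below:
 *
 *	if (HDR_L2COMPRESS(hdr) &&
 *	    hdr->b_l2hdr.b_asize >= buf_compress_minsz)
 *		(void) l2arc_compress_buf(hdr);
 *	if (hdr->b_l2hdr.b_asize != 0)
 *		write hdr->b_l1hdr.b_tmp_cdata (b_asize bytes) to the device;
 *	...
 *	l2arc_release_cdata_buf(hdr);
 */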
6725 static boolean_t
6726 l2arc_compress_buf(arc_buf_hdr_t *hdr)
6727 {
6728 void *cdata;
6729 size_t csize, len, rounded;
6730 ASSERT(HDR_HAS_L2HDR(hdr));
6731 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
6732
6733 ASSERT(HDR_HAS_L1HDR(hdr));
6734 ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF);
6735 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6736
6737 len = l2hdr->b_asize;
6738 cdata = zio_data_buf_alloc(len);
6739 ASSERT3P(cdata, !=, NULL);
6740 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata,
6741 cdata, l2hdr->b_asize);
6742
6743 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE);
6744 if (rounded > csize) {
6745 bzero((char *)cdata + csize, rounded - csize);
6746 csize = rounded;
6747 }
6748
6749 if (csize == 0) {
6750 /* zero block, indicate that there's nothing to write */
6751 zio_data_buf_free(cdata, len);
6752 l2hdr->b_compress = ZIO_COMPRESS_EMPTY;
6753 l2hdr->b_asize = 0;
6754 hdr->b_l1hdr.b_tmp_cdata = NULL;
6755 ARCSTAT_BUMP(arcstat_l2_compress_zeros);
6756 return (B_TRUE);
6757 } else if (csize > 0 && csize < len) {
6758 /*
6759 * Compression succeeded, we'll keep the cdata around for
6760 * writing and release it afterwards.
6761 */
6762 l2hdr->b_compress = ZIO_COMPRESS_LZ4;
6763 l2hdr->b_asize = csize;
6764 hdr->b_l1hdr.b_tmp_cdata = cdata;
6765 ARCSTAT_BUMP(arcstat_l2_compress_successes);
6766 return (B_TRUE);
6767 } else {
6768 /*
6769 * Compression failed, release the compressed buffer.
6770 * l2hdr will be left unmodified.
6771 */
6772 zio_data_buf_free(cdata, len);
6773 ARCSTAT_BUMP(arcstat_l2_compress_failures);
6774 return (B_FALSE);
6775 }
6776 }
6777
6778 /*
6779 * Decompresses a zio read back from an l2arc device. On success, the
6780 * underlying zio's io_data buffer is overwritten by the uncompressed
6781 * version. On decompression error (corrupt compressed stream), the
6782 * zio->io_error value is set to signal an I/O error.
6783 *
6784 * Please note that the compressed data stream is not checksummed, so
6785 * if the underlying device is experiencing data corruption, we may feed
6786 * corrupt data to the decompressor; the decompressor therefore needs to be
6787 * able to handle this situation (LZ4 does).
6788 */
6789 static void
6790 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c)
6791 {
6792 ASSERT(L2ARC_IS_VALID_COMPRESS(c));
6793
6794 if (zio->io_error != 0) {
6795 /*
6796 * An io error has occurred, just restore the original io
6797 * size in preparation for a main pool read.
6798 */
6799 zio->io_orig_size = zio->io_size = hdr->b_size;
6800 return;
6801 }
6802
6803 if (c == ZIO_COMPRESS_EMPTY) {
6804 /*
6805 * An empty buffer results in a null zio, which means we
6806 * need to fill its io_data after we're done restoring the
6807 * buffer's contents.
6808 */
6809 ASSERT(hdr->b_l1hdr.b_buf != NULL);
6810 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size);
6811 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data;
6812 } else {
6813 ASSERT(zio->io_data != NULL);
6814 /*
6815 * We copy the compressed data from the start of the arc buffer
6816 * (the zio_read will have pulled in only what we need, the
6817 * rest is garbage which we will overwrite at decompression)
6818 * and then decompress back to the ARC data buffer. This way we
6819 * can minimize copying by simply decompressing back over the
6820 * original compressed data (rather than decompressing to an
6821 * aux buffer and then copying back the uncompressed buffer,
6822 * which is likely to be much larger).
6823 */
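/*
 * Worked example (sizes illustrative): a 128K logical buffer stored
 * as a 40K LZ4 stream is read into the first 40K of the 128K ARC
 * buffer; those 40K are copied aside into cdata and then decompressed
 * back over the full 128K buffer.
 */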
6824 uint64_t csize;
6825 void *cdata;
6826
6827 csize = zio->io_size;
6828 cdata = zio_data_buf_alloc(csize);
6829 bcopy(zio->io_data, cdata, csize);
6830 if (zio_decompress_data(c, cdata, zio->io_data, csize,
6831 hdr->b_size) != 0)
6832 zio->io_error = EIO;
6833 zio_data_buf_free(cdata, csize);
6834 }
6835
6836 /* Restore the expected uncompressed IO size. */
6837 zio->io_orig_size = zio->io_size = hdr->b_size;
6838 }
6839
6840 /*
6841 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure.
6842 * This buffer serves as a temporary holder of compressed data while
6843 * the buffer entry is being written to an l2arc device. Once that is
6844 * done, we can dispose of it.
6845 */
6846 static void
6847 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr)
6848 {
6849 ASSERT(HDR_HAS_L2HDR(hdr));
6850 enum zio_compress comp = hdr->b_l2hdr.b_compress;
6851
6852 ASSERT(HDR_HAS_L1HDR(hdr));
6853 ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp));
6854
6855 if (comp == ZIO_COMPRESS_OFF) {
6856 /*
6857 * In this case, b_tmp_cdata points to the same buffer
6858 * as the arc_buf_t's b_data field. We don't want to
6859 * free it, since the arc_buf_t will handle that.
6860 */
6861 hdr->b_l1hdr.b_tmp_cdata = NULL;
6862 } else if (comp == ZIO_COMPRESS_EMPTY) {
6863 /*
6864 * In this case, b_tmp_cdata was compressed to an empty
6865 * buffer, thus there's nothing to free and b_tmp_cdata
6866 * should have been set to NULL in l2arc_write_buffers().
6867 */
6868 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
6869 } else {
6870 /*
6871 * If the data was compressed, then we've allocated a
6872 * temporary buffer for it, so now we need to release it.
6873 */
6874 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL);
6875 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata,
6876 hdr->b_size);
6877 hdr->b_l1hdr.b_tmp_cdata = NULL;
6878 }
6879
6880 }
6881
6882 /*
6883 * This thread feeds the L2ARC at regular intervals. This is the beating
6884 * heart of the L2ARC.
6885 */
6886 static void
6887 l2arc_feed_thread(void)
6888 {
6889 callb_cpr_t cpr;
6890 l2arc_dev_t *dev;
6891 spa_t *spa;
6892 uint64_t size, wrote;
6893 clock_t begin, next = ddi_get_lbolt();
6894 boolean_t headroom_boost = B_FALSE;
6895
6896 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
6897
6898 mutex_enter(&l2arc_feed_thr_lock);
6899
6900 while (l2arc_thread_exit == 0) {
6901 CALLB_CPR_SAFE_BEGIN(&cpr);
6902 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
6903 next);
6904 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
6905 next = ddi_get_lbolt() + hz;
6906
6907 /*
6908 * Quick check for L2ARC devices.
6909 */
6910 mutex_enter(&l2arc_dev_mtx);
6911 if (l2arc_ndev == 0) {
6912 mutex_exit(&l2arc_dev_mtx);
6913 continue;
6914 }
6915 mutex_exit(&l2arc_dev_mtx);
6916 begin = ddi_get_lbolt();
6917
6918 /*
6919 * This selects the next l2arc device to write to, and in
6920 * doing so the next spa to feed from: dev->l2ad_spa. This
6921 * will return NULL if there are now no l2arc devices or if
6922 * they are all faulted.
6923 *
6924 * If a device is returned, its spa's config lock is also
6925 * held to prevent device removal. l2arc_dev_get_next()
6926 * will grab and release l2arc_dev_mtx.
6927 */
6928 if ((dev = l2arc_dev_get_next()) == NULL)
6929 continue;
6930
6931 spa = dev->l2ad_spa;
6932 ASSERT(spa != NULL);
6933
6934 /*
6935 * If the pool is read-only then force the feed thread to
6936 * sleep a little longer.
6937 */
6938 if (!spa_writeable(spa)) {
6939 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
6940 spa_config_exit(spa, SCL_L2ARC, dev);
6941 continue;
6942 }
6943
6944 /*
6945 * Avoid contributing to memory pressure.
6946 */
6947 if (arc_reclaim_needed()) {
6948 ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
6949 spa_config_exit(spa, SCL_L2ARC, dev);
6950 continue;
6951 }
6952
6953 ARCSTAT_BUMP(arcstat_l2_feeds);
6954
6955 size = l2arc_write_size();
6956
6957 /*
6958 * Evict L2ARC buffers that will be overwritten.
6959 */
6960 l2arc_evict(dev, size, B_FALSE);
6961
6962 /*
6963 * Write ARC buffers.
6964 */
6965 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost);
6966
6967 /*
6968 * Calculate interval between writes.
6969 */
6970 next = l2arc_write_interval(begin, size, wrote);
6971 spa_config_exit(spa, SCL_L2ARC, dev);
6972 }
6973
6974 l2arc_thread_exit = 0;
6975 cv_broadcast(&l2arc_feed_thr_cv);
6976 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
6977 thread_exit();
6978 }
6979
6980 boolean_t
6981 l2arc_vdev_present(vdev_t *vd)
6982 {
6983 return (l2arc_vdev_get(vd) != NULL);
6984 }
6985
6986 /*
6987 * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if
6988 * the vdev_t isn't an L2ARC device.
6989 */
6990 static l2arc_dev_t *
6991 l2arc_vdev_get(vdev_t *vd)
6992 {
6993 l2arc_dev_t *dev;
6994 boolean_t held = MUTEX_HELD(&l2arc_dev_mtx);
6995
6996 if (!held)
6997 mutex_enter(&l2arc_dev_mtx);
6998 for (dev = list_head(l2arc_dev_list); dev != NULL;
6999 dev = list_next(l2arc_dev_list, dev)) {
7000 if (dev->l2ad_vdev == vd)
7001 break;
7002 }
7003 if (!held)
7004 mutex_exit(&l2arc_dev_mtx);
7005
7006 return (dev);
7007 }
7008
7009 /*
7010 * Add a vdev for use by the L2ARC. By this point the spa has already
7011 * validated the vdev and opened it. The `rebuild' flag indicates whether
7012 * we should attempt an L2ARC persistency rebuild.
7013 */
7014 void
7015 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild)
7016 {
7017 l2arc_dev_t *adddev;
7018
7019 ASSERT(!l2arc_vdev_present(vd));
7020
7021 /*
7022 * Create a new l2arc device entry.
7023 */
7024 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
7025 adddev->l2ad_spa = spa;
7026 adddev->l2ad_vdev = vd;
7027 /* leave extra size for an l2arc device header */
7028 adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr),
7029 1 << vd->vdev_ashift);
7030 adddev->l2ad_start = VDEV_LABEL_START_SIZE + adddev->l2ad_dev_hdr_asize;
7031 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
7032 ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end);
7033 adddev->l2ad_hand = adddev->l2ad_start;
7034 adddev->l2ad_first = B_TRUE;
7035 adddev->l2ad_writing = B_FALSE;
7036 adddev->l2ad_dev_hdr = kmem_zalloc(adddev->l2ad_dev_hdr_asize,
7037 KM_SLEEP);
7038
7039 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
7040 /*
7041 * This is a list of all ARC buffers that are still valid on the
7042 * device.
7043 */
7044 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
7045 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));
7046
7047 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
7048 refcount_create(&adddev->l2ad_alloc);
7049
7050 /*
7051 * Add device to global list
7052 */
7053 mutex_enter(&l2arc_dev_mtx);
7054 list_insert_head(l2arc_dev_list, adddev);
7055 atomic_inc_64(&l2arc_ndev);
7056 if (rebuild && l2arc_rebuild_enabled &&
7057 adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) {
7058 /*
7059 * Just mark the device as pending for a rebuild. We won't
7060 * be starting a rebuild inline here as it would block pool
7061 * import. Instead spa_load_impl will hand that off to an
7062 * async task which will call l2arc_spa_rebuild_start.
7063 */
7064 adddev->l2ad_rebuild = B_TRUE;
7065 }
7066 mutex_exit(&l2arc_dev_mtx);
7067 }
7068
7069 /*
7070 * Remove a vdev from the L2ARC.
7071 */
7072 void
7073 l2arc_remove_vdev(vdev_t *vd)
7074 {
7075 l2arc_dev_t *dev, *nextdev, *remdev = NULL;
7076
7077 /*
7078 * Find the device by vdev
7079 */
7080 mutex_enter(&l2arc_dev_mtx);
7081 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
7082 nextdev = list_next(l2arc_dev_list, dev);
7083 if (vd == dev->l2ad_vdev) {
7084 remdev = dev;
7085 break;
7086 }
7087 }
7088 ASSERT(remdev != NULL);
7089
7090 /*
7091 * Cancel any ongoing or scheduled rebuild (race protection with
7092 * l2arc_spa_rebuild_start provided via l2arc_dev_mtx).
7093 */
7094 remdev->l2ad_rebuild_cancel = B_TRUE;
7095 if (remdev->l2ad_rebuild_did != 0) {
7096 /*
7097 * N.B. it should be safe to thread_join with the rebuild
7098 * thread while holding l2arc_dev_mtx because the mutex is not
7099 * acquired anywhere in the l2arc rebuild code below
7100 * (except for l2arc_spa_rebuild_start, which is ok).
7101 */
7102 thread_join(remdev->l2ad_rebuild_did);
7103 }
7104
7105 /*
7106 * Remove device from global list
7107 */
7108 list_remove(l2arc_dev_list, remdev);
7109 l2arc_dev_last = NULL; /* may have been invalidated */
7110 atomic_dec_64(&l2arc_ndev);
7111 mutex_exit(&l2arc_dev_mtx);
7112
7113 /*
7114 * Clear all buflists and ARC references. L2ARC device flush.
7115 */
7116 l2arc_evict(remdev, 0, B_TRUE);
7117 list_destroy(&remdev->l2ad_buflist);
7118 mutex_destroy(&remdev->l2ad_mtx);
7119 refcount_destroy(&remdev->l2ad_alloc);
7120 kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize);
7121 kmem_free(remdev, sizeof (l2arc_dev_t));
7122 }
7123
7124 void
7125 l2arc_init(void)
7126 {
7127 l2arc_thread_exit = 0;
7128 l2arc_ndev = 0;
7129 l2arc_writes_sent = 0;
7130 l2arc_writes_done = 0;
7131
7132 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
7133 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
7134 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
7135 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
7136
7137 l2arc_dev_list = &L2ARC_dev_list;
7138 l2arc_free_on_write = &L2ARC_free_on_write;
7139 list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
7140 offsetof(l2arc_dev_t, l2ad_node));
7141 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
7142 offsetof(l2arc_data_free_t, l2df_list_node));
7143 }
7144
7145 void
7146 l2arc_fini(void)
7147 {
7148 /*
7149 * This is called from dmu_fini(), which is called from spa_fini();
7150 * Because of this, we can assume that all l2arc devices have
7151 * already been removed when the pools themselves were removed.
7152 */
7153
7154 l2arc_do_free_on_write();
7155
7156 mutex_destroy(&l2arc_feed_thr_lock);
7157 cv_destroy(&l2arc_feed_thr_cv);
7158 mutex_destroy(&l2arc_dev_mtx);
7159 mutex_destroy(&l2arc_free_on_write_mtx);
7160
7161 list_destroy(l2arc_dev_list);
7162 list_destroy(l2arc_free_on_write);
7163 }
7164
7165 void
7166 l2arc_start(void)
7167 {
7168 if (!(spa_mode_global & FWRITE))
7169 return;
7170
7171 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
7172 TS_RUN, minclsyspri);
7173 }
7174
7175 void
7176 l2arc_stop(void)
7177 {
7178 if (!(spa_mode_global & FWRITE))
7179 return;
7180
7181 mutex_enter(&l2arc_feed_thr_lock);
7182 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
7183 l2arc_thread_exit = 1;
7184 while (l2arc_thread_exit != 0)
7185 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
7186 mutex_exit(&l2arc_feed_thr_lock);
7187 }
7188
7189 /*
7190 * Punches out rebuild threads for the L2ARC devices in a spa. This should
7191 * be called after pool import from the spa async thread, since starting
7192 * these threads directly from spa_import() will make them part of the
7193 * "zpool import" context and delay process exit (and thus pool import).
7194 */
7195 void
7196 l2arc_spa_rebuild_start(spa_t *spa)
7197 {
7198 /*
7199 * Locate the spa's l2arc devices and kick off rebuild threads.
7200 */
7201 mutex_enter(&l2arc_dev_mtx);
7202 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
7203 l2arc_dev_t *dev =
7204 l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]);
7205 ASSERT(dev != NULL);
7206 if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) {
7207 VERIFY3U(dev->l2ad_rebuild_did, ==, 0);
7208 #ifdef _KERNEL
7209 dev->l2ad_rebuild_did = thread_create(NULL, 0,
7210 l2arc_dev_rebuild_start, dev, 0, &p0, TS_RUN,
7211 minclsyspri)->t_did;
7212 #endif
7213 }
7214 }
7215 mutex_exit(&l2arc_dev_mtx);
7216 }
7217
7218 /*
7219 * Main entry point for L2ARC rebuilding.
7220 */
7221 static void
7222 l2arc_dev_rebuild_start(l2arc_dev_t *dev)
7223 {
7224 if (!dev->l2ad_rebuild_cancel) {
7225 VERIFY(dev->l2ad_rebuild);
7226 (void) l2arc_rebuild(dev);
7227 dev->l2ad_rebuild = B_FALSE;
7228 }
7229 }
7230
7231 /*
7232 * This function implements the actual L2ARC metadata rebuild. It:
7233 *
7234 * 1) reads the device's header
7235 * 2) if a good device header is found, starts reading the log block chain
7236 * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's)
7237 *
7238 * Operation stops under any of the following conditions:
7239 *
7240 * 1) We reach the end of the log blk chain (the back-reference in the blk is
7241 * invalid or loops over our starting point).
7242 * 2) We encounter *any* error condition (cksum errors, io errors, looped
7243 * blocks, etc.).
7244 */
7245 static int
7246 l2arc_rebuild(l2arc_dev_t *dev)
7247 {
7248 vdev_t *vd = dev->l2ad_vdev;
7249 spa_t *spa = vd->vdev_spa;
7250 int err;
7251 l2arc_log_blk_phys_t *this_lb, *next_lb;
7252 uint8_t *this_lb_buf, *next_lb_buf;
7253 zio_t *this_io = NULL, *next_io = NULL;
7254 l2arc_log_blkptr_t lb_ptrs[2];
7255 boolean_t first_pass, lock_held;
7256 uint64_t load_guid;
7257
7258 this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP);
7259 next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP);
7260 this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
7261 next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP);
7262
7263 /*
7264 * We prevent device removal while issuing reads to the device,
7265 * then during the rebuilding phases we drop this lock again so
7266 * that a spa_unload or device remove can be initiated - this is
7267 * safe, because the spa will signal us to stop before removing
7268 * our device and wait for us to stop.
7269 */
7270 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER);
7271 lock_held = B_TRUE;
7272
7273 load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa);
7274 /*
7275 * Device header processing phase.
7276 */
7277 if ((err = l2arc_dev_hdr_read(dev)) != 0) {
7278 /* device header corrupted, start a new one */
7279 bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize);
7280 goto out;
7281 }
7282
7283 /* Retrieve the persistent L2ARC device state */
7284 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev,
7285 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr +
7286 LBP_GET_PSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0]));
7287 dev->l2ad_first = !!(dev->l2ad_dev_hdr->dh_flags &
7288 L2ARC_DEV_HDR_EVICT_FIRST);
7289
7290 /* Prepare the rebuild processing state */
7291 bcopy(dev->l2ad_dev_hdr->dh_start_lbps, lb_ptrs, sizeof (lb_ptrs));
7292 first_pass = B_TRUE;
7293
7294 /* Start the rebuild process */
7295 for (;;) {
7296 if (!l2arc_log_blkptr_valid(dev, &lb_ptrs[0]))
7297 /* We hit an invalid block address, end the rebuild. */
7298 break;
7299
7300 if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
7301 this_lb, next_lb, this_lb_buf, next_lb_buf,
7302 this_io, &next_io)) != 0)
7303 break;
7304
7305 spa_config_exit(spa, SCL_L2ARC, vd);
7306 lock_held = B_FALSE;
7307
7308 /* Protection against infinite loops of log blocks. */
7309 if (l2arc_range_check_overlap(lb_ptrs[1].lbp_daddr,
7310 lb_ptrs[0].lbp_daddr,
7311 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr) &&
7312 !first_pass) {
7313 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors);
7314 err = SET_ERROR(ELOOP);
7315 break;
7316 }
7317
7318 /*
7319 * Our memory pressure valve. If the system is running low
7320 * on memory, rather than swamping memory with new ARC buf
7321 * hdrs, we opt not to rebuild the L2ARC. At this point,
7322 * however, we have already set up our L2ARC dev to chain in
7323 * new metadata log blk, so the user may choose to re-add the
7324 * new metadata log blks, so the user may choose to re-add the
7325 * less memory pressure).
7326 */
7327 if (arc_reclaim_needed()) {
7328 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
7329 cmn_err(CE_NOTE, "System running low on memory, "
7330 "aborting L2ARC rebuild.");
7331 err = SET_ERROR(ENOMEM);
7332 break;
7333 }
7334
7335 /*
7336 * Now that we know that the next_lb checks out alright, we
7337 * can start reconstruction from this lb - we can be sure
7338 * that the L2ARC write hand has not yet reached any of our
7339 * buffers.
7340 */
7341 l2arc_log_blk_restore(dev, load_guid, this_lb,
7342 LBP_GET_PSIZE(&lb_ptrs[0]));
7343
7344 /*
7345 * End of list detection. We can look ahead two steps in the
7346 * blk chain and if the 2nd blk from this_lb dips below the
7347 * initial chain starting point, then we know two things:
7348 * 1) it can't be valid, and
7349 * 2) the next_lb's ARC entries might have already been
7350 * partially overwritten and so we should stop before
7351 * we restore it.
7352 */
7353 if (l2arc_range_check_overlap(
7354 this_lb->lb_back2_lbp.lbp_daddr, lb_ptrs[0].lbp_daddr,
7355 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr) &&
7356 !first_pass)
7357 break;
7358
7359 /* log blk restored, continue with next one in the list */
7360 lb_ptrs[0] = lb_ptrs[1];
7361 lb_ptrs[1] = this_lb->lb_back2_lbp;
7362 PTR_SWAP(this_lb, next_lb);
7363 PTR_SWAP(this_lb_buf, next_lb_buf);
7364 this_io = next_io;
7365 next_io = NULL;
7366 first_pass = B_FALSE;
7367
7368 for (;;) {
7369 if (dev->l2ad_rebuild_cancel) {
7370 err = SET_ERROR(ECANCELED);
7371 goto out;
7372 }
7373 if (spa_config_tryenter(spa, SCL_L2ARC, vd,
7374 RW_READER)) {
7375 lock_held = B_TRUE;
7376 break;
7377 }
7378 /*
7379 * L2ARC config lock held by somebody as writer,
7380 * possibly due to them trying to remove us. They'll
7381 * likely want us to shut down, so after a little
7382 * delay, we check l2ad_rebuild_cancel and retry
7383 * the lock.
7384 */
7385 delay(1);
7386 }
7387 }
7388 out:
7389 if (next_io != NULL)
7390 l2arc_log_blk_prefetch_abort(next_io);
7391 kmem_free(this_lb, sizeof (*this_lb));
7392 kmem_free(next_lb, sizeof (*next_lb));
7393 kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t));
7394 kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t));
7395 if (err == 0)
7396 ARCSTAT_BUMP(arcstat_l2_rebuild_successes);
7397
7398 if (lock_held)
7399 spa_config_exit(spa, SCL_L2ARC, vd);
7400
7401 return (err);
7402 }
7403
7404 /*
7405 * Attempts to read the device header on the provided L2ARC device and writes
7406 * it to `hdr'. On success, this function returns 0, otherwise the appropriate
7407 * error code is returned.
7408 */
7409 static int
7410 l2arc_dev_hdr_read(l2arc_dev_t *dev)
7411 {
7412 int err;
7413 uint64_t guid;
7414 zio_cksum_t cksum;
7415 l2arc_dev_hdr_phys_t *hdr = dev->l2ad_dev_hdr;
7416 const uint64_t hdr_asize = dev->l2ad_dev_hdr_asize;
7417
7418 guid = spa_guid(dev->l2ad_vdev->vdev_spa);
7419
7420 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev,
7421 VDEV_LABEL_START_SIZE, hdr_asize, hdr,
7422 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
7423 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
7424 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) {
7425 spa_config_exit(dev->l2ad_vdev->vdev_spa, SCL_L2ARC,
7426 dev->l2ad_vdev);
7427 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
7428 return (err);
7429 }
7430
7431 if (hdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC))
7432 byteswap_uint64_array(hdr, sizeof (*hdr));
7433
7434 if (hdr->dh_magic != L2ARC_DEV_HDR_MAGIC || hdr->dh_spa_guid != guid) {
7435 /*
7436 * Attempt to rebuild a device containing no actual dev hdr
7437 * or containing a header from some other pool.
7438 */
7439 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported);
7440 return (SET_ERROR(ENOTSUP));
7441 }
7442
7443 l2arc_dev_hdr_checksum(hdr, &cksum);
7444 if (!ZIO_CHECKSUM_EQUAL(hdr->dh_self_cksum, cksum)) {
7445 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
7446 return (SET_ERROR(EINVAL));
7447 }
7448
7449 return (0);
7450 }
7451
7452 /*
7453 * Reads L2ARC log blocks from storage and validates their contents.
7454 *
7455 * This function implements a simple prefetcher to make sure that while
7456 * we're processing one buffer the L2ARC is already prefetching the next
7457 * one in the chain.
7458 *
7459 * The arguments this_lbp and next_lbp point to the current and next log blk
7460 * address in the block chain. Similarly, this_lb and next_lb hold the
7461 * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf
7462 * and next_lb_buf must be buffers of appropriate size to hold a raw
7463 * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior
7464 * to buffer decompression).
7465 *
7466 * The `this_io' and `next_io' arguments are used for block prefetching.
7467 * When issuing the first blk IO during rebuild, you should pass NULL for
7468 * `this_io'. This function will then issue a sync IO to read the block and
7469 * also issue an async IO to fetch the next block in the block chain. The
7470 * prefetch IO is returned in `next_io'. On subsequent calls to this
7471 * function, pass the value returned in `next_io' from the previous call
7472 * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO.
7473 * Prior to the call, you should initialize your `next_io' pointer to be
7474 * NULL. If no prefetch IO was issued, the pointer is left set at NULL.
7475 *
7476 * On success, this function returns 0, otherwise it returns an appropriate
7477 * error code. On error the prefetching IO is aborted and cleared before
7478 * returning from this function. Therefore, if we return `success', the
7479 * caller can assume that we have taken care of cleanup of prefetch IOs.
7480 */
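/*
 * Sketch of the intended calling pattern; this mirrors the rebuild
 * loop in l2arc_rebuild above:
 *
 *	zio_t *this_io = NULL, *next_io = NULL;
 *	for (;;) {
 *		if (l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1],
 *		    this_lb, next_lb, this_lb_buf, next_lb_buf,
 *		    this_io, &next_io) != 0)
 *			break;
 *		... restore this_lb, advance lb_ptrs ...
 *		this_io = next_io;
 *		next_io = NULL;
 *	}
 *	if (next_io != NULL)
 *		l2arc_log_blk_prefetch_abort(next_io);
 */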
7481 static int
7482 l2arc_log_blk_read(l2arc_dev_t *dev,
7483 const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
7484 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
7485 uint8_t *this_lb_buf, uint8_t *next_lb_buf,
7486 zio_t *this_io, zio_t **next_io)
7487 {
7488 int err = 0;
7489 zio_cksum_t cksum;
7490
7491 ASSERT(this_lbp != NULL && next_lbp != NULL);
7492 ASSERT(this_lb != NULL && next_lb != NULL);
7493 ASSERT(this_lb_buf != NULL && next_lb_buf != NULL);
7494 ASSERT(next_io != NULL && *next_io == NULL);
7495 ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
7496
7497 /*
7498 * Check to see if we have issued the IO for this log blk in a
7499 * previous run. If not, this is the first call, so issue it now.
7500 */
7501 if (this_io == NULL) {
7502 this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp,
7503 this_lb_buf);
7504 }
7505
7506 /*
7507 * Peek to see if we can start issuing the next IO immediately.
7508 */
7509 if (l2arc_log_blkptr_valid(dev, next_lbp)) {
7510 /*
7511 * Start issuing IO for the next log blk early - this
7512 * should help keep the L2ARC device busy while we
7513 * decompress and restore this log blk.
7514 */
7515 *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp,
7516 next_lb_buf);
7517 }
7518
7519 /* Wait for the IO to read this log block to complete */
7520 if ((err = zio_wait(this_io)) != 0) {
7521 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
7522 goto cleanup;
7523 }
7524
7525 /* Make sure the buffer checks out */
7526 fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), NULL, &cksum);
7527 if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) {
7528 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors);
7529 err = SET_ERROR(EINVAL);
7530 goto cleanup;
7531 }
7532
7533 /* Now we can take our time decoding this buffer */
7534 switch (LBP_GET_COMPRESS(this_lbp)) {
7535 case ZIO_COMPRESS_OFF:
7536 bcopy(this_lb_buf, this_lb, sizeof (*this_lb));
7537 break;
7538 case ZIO_COMPRESS_LZ4:
7539 if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp),
7540 this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp),
7541 sizeof (*this_lb))) != 0) {
7542 err = SET_ERROR(EINVAL);
7543 goto cleanup;
7544 }
7545 break;
7546 default:
7547 err = SET_ERROR(EINVAL);
7548 goto cleanup;
7549 }
7550 if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC))
7551 byteswap_uint64_array(this_lb, sizeof (*this_lb));
7552 if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) {
7553 err = SET_ERROR(EINVAL);
7554 goto cleanup;
7555 }
7556 cleanup:
7557 /* Abort an in-flight prefetch I/O in case of error */
7558 if (err != 0 && *next_io != NULL) {
7559 l2arc_log_blk_prefetch_abort(*next_io);
7560 *next_io = NULL;
7561 }
7562 return (err);
7563 }
7564
7565 /*
7566 * Restores the payload of a log blk to ARC. This creates empty ARC hdr
7567 * entries which only contain an l2arc hdr, essentially restoring the
7568 * buffers to their L2ARC evicted state. This function also updates space
7569 * usage on the L2ARC vdev to make sure it tracks restored buffers.
7570 */
7571 static void
7572 l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
7573 const l2arc_log_blk_phys_t *lb, uint64_t lb_psize)
7574 {
7575 uint64_t size = 0, psize = 0;
7576
7577 for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) {
7578 /*
7579 * Restore goes in the reverse temporal direction to preserve
7580 * correct temporal ordering of buffers in the l2ad_buflist.
7581 * l2arc_hdr_restore also does a list_insert_tail instead of
7582 * list_insert_head on the l2ad_buflist:
7583 *
7584 * LIST l2ad_buflist LIST
7585 * HEAD <------ (time) ------ TAIL
7586 * direction +-----+-----+-----+-----+-----+ direction
7587 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild
7588 * fill +-----+-----+-----+-----+-----+
7589 * ^ ^
7590 * | |
7591 * | |
7592 * l2arc_fill_thread l2arc_rebuild
7593 * places new bufs here restores bufs here
7594 *
7595 * This also works when the restored bufs get evicted at any
7596 * point during the rebuild.
7597 */
7598 l2arc_hdr_restore(&lb->lb_entries[i], dev, load_guid);
7599 size += LE_GET_LSIZE(&lb->lb_entries[i]);
7600 psize += LE_GET_PSIZE(&lb->lb_entries[i]);
7601 }
7602
7603 /*
7604 * Record rebuild stats:
7605 * size In-memory size of restored buffer data in ARC
7606 * psize Physical size of restored buffers in the L2ARC
7607 * bufs # of ARC buffer headers restored
7608 * log_blks # of L2ARC log blks processed during restore
7609 */
7610 ARCSTAT_INCR(arcstat_l2_rebuild_size, size);
7611 ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize);
7612 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES);
7613 ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks);
7614 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize);
7615 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize);
7616 vdev_space_update(dev->l2ad_vdev, psize, 0, 0);
7617 }
7618
7619 /*
7620 * Restores a single ARC buf hdr from a log block. The ARC buffer is put
7621 * into a state indicating that it has been evicted to L2ARC.
7622 */
7623 static void
7624 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev,
7625 uint64_t load_guid)
7626 {
7627 arc_buf_hdr_t *hdr, *exists;
7628 kmutex_t *hash_lock;
7629 arc_buf_contents_t type = LE_GET_TYPE(le);
7630
7631 /*
7632 * Do all the allocation before grabbing any locks, this lets us
7633 * sleep if memory is full and we don't have to deal with failed
7634 * allocations.
7635 */
7636 ASSERT(L2ARC_IS_VALID_COMPRESS(LE_GET_COMPRESS(le)) ||
7637 LE_GET_COMPRESS(le) == ZIO_COMPRESS_OFF);
7638 hdr = arc_buf_alloc_l2only(load_guid, LE_GET_LSIZE(le), type,
7639 dev, le->le_dva, le->le_daddr, LE_GET_PSIZE(le), le->le_birth,
7640 le->le_freeze_cksum, LE_GET_COMPRESS(le));
7641 if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) {
7642 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size);
7643 ARCSTAT_INCR(arcstat_l2_asize, hdr->b_l2hdr.b_asize);
7644 }
7645
7646 mutex_enter(&dev->l2ad_mtx);
7647 /*
7648 * We connect the l2hdr to the hdr only after the hdr is in the hash
7649 * table, otherwise the rest of the arc hdr manipulation machinery
7650 * might get confused.
7651 */
7652 list_insert_tail(&dev->l2ad_buflist, hdr);
7653 (void) refcount_add_many(&dev->l2ad_alloc, hdr->b_l2hdr.b_asize, hdr);
7654 mutex_exit(&dev->l2ad_mtx);
7655
7656 exists = buf_hash_insert(hdr, &hash_lock);
7657 if (exists) {
7658 /* Buffer was already cached, no need to restore it. */
7659 mutex_exit(hash_lock);
7660 arc_hdr_destroy(hdr);
7661 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
7662 return;
7663 }
7664
7665 mutex_exit(hash_lock);
7666 }
7667
7668 /*
7669 * Starts an asynchronous read IO to read a log block. This is used in log
7670 * block reconstruction to start reading the next block before we are done
7671 * decoding and reconstructing the current block, to keep the l2arc device
7672 * nice and hot with read IO to process.
7673 * The returned zio will contain a newly allocated memory buffer for the IO
7674 * data which should then be freed by the caller once the zio is no longer
7675 * needed (i.e. due to it having completed). If you wish to abort this
7676 * zio, you should do so using l2arc_log_blk_prefetch_abort, which takes
7677 * care of disposing of the allocated buffers correctly.
7678 */
7679 static zio_t *
7680 l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp,
7681 uint8_t *lb_buf)
7682 {
7683 uint32_t psize;
7684 zio_t *pio;
7685
7686 psize = LBP_GET_PSIZE(lbp);
7687 ASSERT(psize <= sizeof (l2arc_log_blk_phys_t));
7688 pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE |
7689 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
7690 ZIO_FLAG_DONT_RETRY);
7691 (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize,
7692 lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
7693 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL |
7694 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE));
7695
7696 return (pio);
7697 }
7698
7699 /*
7700 * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data
7701 * buffers allocated for it.
7702 */
7703 static void
7704 l2arc_log_blk_prefetch_abort(zio_t *zio)
7705 {
7706 (void) zio_wait(zio);
7707 }
7708
7709 /*
7710 * Creates a zio to update the device header on an l2arc device. The zio is
7711 * initiated as a child of `pio'.
7712 */
7713 static void
7714 l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio)
7715 {
7716 zio_t *wzio;
7717 l2arc_dev_hdr_phys_t *hdr = dev->l2ad_dev_hdr;
7718 const uint64_t hdr_asize = dev->l2ad_dev_hdr_asize;
7719
7720 hdr->dh_magic = L2ARC_DEV_HDR_MAGIC;
7721 hdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa);
7722 hdr->dh_alloc_space = refcount_count(&dev->l2ad_alloc);
7723 hdr->dh_flags = 0;
7724 if (dev->l2ad_first)
7725 hdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST;
7726
7727 /* checksum operation goes last */
7728 l2arc_dev_hdr_checksum(hdr, &hdr->dh_self_cksum);
7729
7730 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE,
7731 hdr_asize, hdr, ZIO_CHECKSUM_OFF, NULL, NULL,
7732 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
7733 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
7734 (void) zio_nowait(wzio);
7735 }
7736
7737 /*
7738 * Commits a log block to the L2ARC device. This routine is invoked from
7739 * l2arc_write_buffers when the log block fills up.
7740 * This function allocates some memory to temporarily hold the serialized
7741 * buffer to be written. This is then released in l2arc_write_done.
7742 */
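/*
 * Chain layout, as implemented below: each committed log block's
 * lb_back2_lbp points two commits back, while dh_start_lbps[0] and [1]
 * in the device header track the two most recently committed blocks.
 * After committing blocks A, B and C in that order, dh_start_lbps is
 * {C, B} and C's lb_back2_lbp points to A, which is exactly the state
 * l2arc_rebuild starts from when walking the chain backwards.
 */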
7743 static void
7744 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
7745 l2arc_write_callback_t *cb)
7746 {
7747 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
7748 uint64_t psize, asize;
7749 l2arc_log_blk_buf_t *lb_buf;
7750 zio_t *wzio;
7751
7752 VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
7753
7754 /* link the buffer into the block chain */
7755 lb->lb_back2_lbp = dev->l2ad_dev_hdr->dh_start_lbps[1];
7756 lb->lb_magic = L2ARC_LOG_BLK_MAGIC;
7757
7758 /* try to compress the buffer */
7759 lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP);
7760 list_insert_tail(&cb->l2wcb_log_blk_buflist, lb_buf);
7761 psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb, lb_buf->lbb_log_blk,
7762 sizeof (*lb));
7763 /* a log block is never entirely zero */
7764 ASSERT(psize != 0);
7765 asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
7766 ASSERT(asize <= sizeof (lb_buf->lbb_log_blk));
7767
7768 /*
7769 * Update the start log blk pointer in the device header to point
7770 * to the log block we're about to write.
7771 */
7772 dev->l2ad_dev_hdr->dh_start_lbps[1] =
7773 dev->l2ad_dev_hdr->dh_start_lbps[0];
7774 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand;
7775 _NOTE(CONSTCOND)
7776 LBP_SET_LSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0], sizeof (*lb));
7777 LBP_SET_PSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0], asize);
7778 LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr->dh_start_lbps[0],
7779 ZIO_CHECKSUM_FLETCHER_4);
7780 LBP_SET_TYPE(&dev->l2ad_dev_hdr->dh_start_lbps[0], 0);
7781 if (asize < sizeof (*lb)) {
7782 /* compression succeeded */
7783 bzero(lb_buf->lbb_log_blk + psize, asize - psize);
7784 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr->dh_start_lbps[0],
7785 ZIO_COMPRESS_LZ4);
7786 } else {
7787 /* compression failed */
7788 bcopy(lb, lb_buf->lbb_log_blk, sizeof (*lb));
7789 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr->dh_start_lbps[0],
7790 ZIO_COMPRESS_OFF);
7791 }
7792 /* checksum what we're about to write */
7793 fletcher_4_native(lb_buf->lbb_log_blk, asize, NULL,
7794 &dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_cksum);
7795
7796 /* perform the write itself */
7797 CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE &&
7798 L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE);
7799 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand,
7800 asize, lb_buf->lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL,
7801 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE);
7802 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio);
7803 (void) zio_nowait(wzio);
7804
7805 dev->l2ad_hand += asize;
7806 vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
7807
7808 /* bump the kstats */
7809 ARCSTAT_INCR(arcstat_l2_write_bytes, asize);
7810 ARCSTAT_BUMP(arcstat_l2_log_blk_writes);
7811 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize);
7812 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio,
7813 dev->l2ad_log_blk_payload_asize / asize);
7814
7815 /* start a new log block */
7816 dev->l2ad_log_ent_idx = 0;
7817 dev->l2ad_log_blk_payload_asize = 0;
7818 }
7819
7820 /*
7821 * Validates an L2ARC log blk address to make sure that it can be read
7822 * from the provided L2ARC device. Returns B_TRUE if the address is
7823 * within the device's bounds, or B_FALSE if not.
7824 */
7825 static boolean_t
7826 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp)
7827 {
7828 uint64_t psize = LBP_GET_PSIZE(lbp);
7829 uint64_t end = lbp->lbp_daddr + psize;
7830
7831 /*
7832 * A log block is valid if all of the following conditions are true:
7833 * - it fits entirely between l2ad_start and l2ad_end
7834 * - it has a valid size
7835 */
7836 return (lbp->lbp_daddr >= dev->l2ad_start && end <= dev->l2ad_end &&
7837 psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t));
7838 }
7839
7840 /*
7841 * Computes the checksum of `hdr' and stores it in `cksum'.
7842 */
7843 static void
7844 l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum)
7845 {
7846 fletcher_4_native((uint8_t *)hdr +
7847 offsetof(l2arc_dev_hdr_phys_t, dh_spa_guid),
7848 sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, dh_spa_guid),
7849 NULL, cksum);
7850 }
7851
7852 /*
7853 * Inserts ARC buffer `ab' into the current L2ARC log blk on the device.
7854 * The buffer being inserted must be present in L2ARC.
7855 * Returns B_TRUE if the L2ARC log blk is full and needs to be committed
7856 * to L2ARC, or B_FALSE if it still has room for more ARC buffers.
7857 */
7858 static boolean_t
7859 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab)
7860 {
7861 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk;
7862 l2arc_log_ent_phys_t *le;
7863 int index = dev->l2ad_log_ent_idx++;
7864
7865 ASSERT(index < L2ARC_LOG_BLK_ENTRIES);
7866
7867 le = &lb->lb_entries[index];
7868 bzero(le, sizeof (*le));
7869 le->le_dva = ab->b_dva;
7870 le->le_birth = ab->b_birth;
7871 le->le_daddr = ab->b_l2hdr.b_daddr;
7872 LE_SET_LSIZE(le, ab->b_size);
7873 LE_SET_PSIZE(le, ab->b_l2hdr.b_asize);
7874 LE_SET_COMPRESS(le, ab->b_l2hdr.b_compress);
7875 if (ab->b_l2hdr.b_compress != ZIO_COMPRESS_OFF) {
7876 ASSERT(L2ARC_IS_VALID_COMPRESS(ab->b_l2hdr.b_compress));
7877 ASSERT(L2ARC_IS_VALID_COMPRESS(LE_GET_COMPRESS(le)));
7878 }
7879 le->le_freeze_cksum = *ab->b_freeze_cksum;
7880 LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2);
7881 LE_SET_TYPE(le, arc_flags_to_bufc(ab->b_flags));
7882 dev->l2ad_log_blk_payload_asize += ab->b_l2hdr.b_asize;
7883
7884 return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES);
7885 }
7886
7887 /*
7888 * Checks whether a given L2ARC device address sits in a time-sequential
7889 * range. The trick here is that the L2ARC is a rotary buffer, so we can't
7890 * just do a range comparison, we need to handle the situation in which the
7891 * range wraps around the end of the L2ARC device. Arguments:
7892 * bottom Lower end of the range to check (written to earlier).
7893 * top Upper end of the range to check (written to later).
7894 * check The address for which we want to determine if it sits in
7895 * between the top and bottom.
7896 *
7897 * The 3-way conditional below represents the following cases:
7898 *
7899 * bottom < top : Sequentially ordered case:
7900 * <check>--------+-------------------+
7901 * | (overlap here?) |
7902 * L2ARC dev V V
7903 * |---------------<bottom>============<top>--------------|
7904 *
7905 * bottom > top: Looped-around case:
7906 * <check>--------+------------------+
7907 * | (overlap here?) |
7908 * L2ARC dev V V
7909 * |===============<top>---------------<bottom>===========|
7910 * ^ ^
7911 * | (or here?) |
7912 * +---------------+---------<check>
7913 *
7914 * top == bottom : Just a single address comparison.
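 *
 * Worked examples: with bottom = 100 and top = 200, check = 150
 * overlaps and check = 250 does not; with bottom = 200 and top = 100
 * (the looped-around case), check = 50 and check = 250 both overlap,
 * while check = 150 does not.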
7915 */
7916 static inline boolean_t
7917 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check)
7918 {
7919 if (bottom < top)
7920 return (bottom <= check && check <= top);
7921 else if (bottom > top)
7922 return (check <= top || bottom <= check);
7923 else
7924 return (check == top);
7925 }
7926
7927 /*
7928 * dump arc cache to user mode for debugging purposes
7929 */
7930 static void
7931 arc_dump_entry(arc_buf_hdr_t *entry, arc_info_t *outp)
7932 {
7933 outp->ai_dva = entry->b_dva;
7934 outp->ai_birth = entry->b_birth;
7935 outp->ai_flags = entry->b_flags;
7936 outp->ai_spa = entry->b_spa;
7937 outp->ai_size = entry->b_size;
7938 if (HDR_HAS_L1HDR(entry)) {
7939 arc_state_t *state = entry->b_l1hdr.b_state;
7940 if (state == arc_anon)
7941 outp->ai_state = AIS_ANON;
7942 else if (state == arc_mru)
7943 outp->ai_state = AIS_MRU;
7944 else if (state == arc_mru_ghost)
7945 outp->ai_state = AIS_MRU_GHOST;
7946 else if (state == arc_mfu)
7947 outp->ai_state = AIS_MFU;
7948 else if (state == arc_mfu_ghost)
7949 outp->ai_state = AIS_MFU_GHOST;
7950 else if (state == arc_l2c_only)
7951 outp->ai_state = AIS_L2C_ONLY;
7952 else
7953 outp->ai_state = AIS_UNKNOWN;
7954 } else {
7955 outp->ai_state = AIS_NO_L1HDR;
7956 }
7957 }
7958
7959 int
7960 arc_dump(int start_bucket, void *buf, size_t bufsize, size_t *returned_bytes)
7961 {
7962 int i;
7963 arc_info_t *outp = buf + sizeof(arc_info_hdr_t);
7964 arc_info_t *maxp = buf + bufsize;
7965 arc_info_hdr_t *aih = buf;
7966 size_t nbuckets = buf_hash_table.ht_mask + 1;
7967 size_t bph = nbuckets / BUF_LOCKS; /* buckets per hash */
7968 kmutex_t *last_lock = NULL;
7969
7970 if (bufsize < sizeof(arc_info_hdr_t))
7971 return (ENOMEM);
7972
7973 aih->aih_buckets = nbuckets;
7974 aih->aih_buf_locks = BUF_LOCKS;
7975
7976 ASSERT(start_bucket >= 0);
7977 ASSERT(start_bucket < nbuckets);
7978
7979 for (i = start_bucket; i < nbuckets; ++i) {
7980 kmutex_t *hash_lock;
7981 arc_buf_hdr_t *entry;
7982 arc_info_t *dryrun = outp;
7983 int bucket;
7984
7985 /*
7986 * transform index. We want to enumerate the buckets in an
7987 * order that allows us to hold each hash mutex for as long as possible
7988 */
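/*
 * For example, with 16 buckets and 4 locks (bph == 4), i = 0..3 maps
 * to buckets 0, 4, 8, 12 and i = 4..7 to buckets 1, 5, 9, 13;
 * assuming BUF_HASH_LOCK() picks a lock by bucket modulo BUF_LOCKS,
 * each run of bph consecutive i values then shares one hash lock.
 */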
7989 bucket = (i / bph) + (i % bph) * BUF_LOCKS;
7990
7991 hash_lock = BUF_HASH_LOCK(bucket);
7992 if (hash_lock != last_lock) {
7993 if (last_lock)
7994 mutex_exit(last_lock);
7995 mutex_enter(hash_lock);
7996 }
7997 last_lock = hash_lock;
7998 /* count entries to see if they will fit */
7999 entry = buf_hash_table.ht_table[bucket];
8000 while (entry != NULL) {
8001 ++dryrun;
8002 entry = entry->b_hash_next;
8003 }
8004 if (dryrun > maxp) {
8005 break;
8006 }
8007 /* actually copy entries */
8008 entry = buf_hash_table.ht_table[bucket];
8009 while (entry != NULL) {
8010 arc_dump_entry(entry, outp);
8011 ++outp;
8012 entry = entry->b_hash_next;
8013 }
8014 }
8015 if (last_lock)
8016 mutex_exit(last_lock);
8017
8018 *returned_bytes = (void *)outp - buf;
8019 aih->aih_entries = (*returned_bytes - sizeof(*aih)) / sizeof(*outp);
8020
8021 if (i <= buf_hash_table.ht_mask)
8022 aih->aih_next = i;
8023 else
8024 aih->aih_next = 0;
8025
8026 return (0);
8027 }
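/*
 * Illustrative (hypothetical) consumer loop for the resumable interface
 * above; the plumbing that exposes arc_dump() outside the kernel is not
 * shown in this file:
 *
 *	int bucket = 0;
 *	size_t bytes;
 *	do {
 *		if (arc_dump(bucket, buf, bufsize, &bytes) != 0)
 *			break;
 *		process the arc_info_hdr_t at buf and the aih_entries
 *		arc_info_t records that follow it;
 *		bucket = ((arc_info_hdr_t *)buf)->aih_next;
 *	} while (bucket != 0);
 */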
8028