/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2015 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.
 * This makes eviction far more problematic: we choose to evict the
 * evictable blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes; rather, they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_clear_callback()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
113 * 114 * The L2ARC uses the l2ad_mtx on each vdev for the following: 115 * 116 * - L2ARC buflist creation 117 * - L2ARC buflist eviction 118 * - L2ARC write completion, which walks L2ARC buflists 119 * - ARC header destruction, as it removes from L2ARC buflists 120 * - ARC header release, as it removes from L2ARC buflists 121 */ 122 123 #include <sys/spa.h> 124 #include <sys/zio.h> 125 #include <sys/zio_compress.h> 126 #include <sys/zfs_context.h> 127 #include <sys/arc.h> 128 #include <sys/refcount.h> 129 #include <sys/vdev.h> 130 #include <sys/vdev_impl.h> 131 #include <sys/dsl_pool.h> 132 #include <sys/multilist.h> 133 #ifdef _KERNEL 134 #include <sys/vmsystm.h> 135 #include <vm/anon.h> 136 #include <sys/fs/swapnode.h> 137 #include <sys/dnlc.h> 138 #endif 139 #include <sys/callb.h> 140 #include <sys/kstat.h> 141 #include <zfs_fletcher.h> 142 #include <sys/byteorder.h> 143 #include <sys/spa_impl.h> 144 #include <sys/zfs_ioctl.h> 145 146 #ifndef _KERNEL 147 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 148 boolean_t arc_watch = B_FALSE; 149 int arc_procfd; 150 #endif 151 152 static kmutex_t arc_reclaim_lock; 153 static kcondvar_t arc_reclaim_thread_cv; 154 static boolean_t arc_reclaim_thread_exit; 155 static kcondvar_t arc_reclaim_waiters_cv; 156 157 static kmutex_t arc_user_evicts_lock; 158 static kcondvar_t arc_user_evicts_cv; 159 static boolean_t arc_user_evicts_thread_exit; 160 161 uint_t arc_reduce_dnlc_percent = 3; 162 163 /* 164 * The number of headers to evict in arc_evict_state_impl() before 165 * dropping the sublist lock and evicting from another sublist. A lower 166 * value means we're more likely to evict the "correct" header (i.e. the 167 * oldest header in the arc state), but comes with higher overhead 168 * (i.e. more invocations of arc_evict_state_impl()). 169 */ 170 int zfs_arc_evict_batch_limit = 10; 171 172 /* 173 * The number of sublists used for each of the arc state lists. 
If this 174 * is not set to a suitable value by the user, it will be configured to 175 * the number of CPUs on the system in arc_init(). 176 */ 177 int zfs_arc_num_sublists_per_state = 0; 178 179 /* number of seconds before growing cache again */ 180 static int arc_grow_retry = 60; 181 182 /* shift of arc_c for calculating overflow limit in arc_get_data_buf */ 183 int zfs_arc_overflow_shift = 8; 184 185 /* shift of arc_c for calculating both min and max arc_p */ 186 static int arc_p_min_shift = 4; 187 188 /* log2(fraction of arc to reclaim) */ 189 static int arc_shrink_shift = 7; 190 191 /* 192 * log2(fraction of ARC which must be free to allow growing). 193 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, 194 * when reading a new block into the ARC, we will evict an equal-sized block 195 * from the ARC. 196 * 197 * This must be less than arc_shrink_shift, so that when we shrink the ARC, 198 * we will still not allow it to grow. 199 */ 200 int arc_no_grow_shift = 5; 201 202 203 /* 204 * minimum lifespan of a prefetch block in clock ticks 205 * (initialized in arc_init()) 206 */ 207 static int arc_min_prefetch_lifespan; 208 209 /* 210 * If this percent of memory is free, don't throttle. 211 */ 212 int arc_lotsfree_percent = 10; 213 214 static int arc_dead; 215 216 /* 217 * The arc has filled available memory and has now warmed up. 218 */ 219 static boolean_t arc_warm; 220 221 /* 222 * These tunables are for performance analysis. 
223 */ 224 uint64_t zfs_arc_max; 225 uint64_t zfs_arc_min; 226 uint64_t zfs_arc_meta_limit = 0; 227 uint64_t zfs_arc_meta_min = 0; 228 int zfs_arc_grow_retry = 0; 229 int zfs_arc_shrink_shift = 0; 230 int zfs_arc_p_min_shift = 0; 231 int zfs_disable_dup_eviction = 0; 232 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 233 234 /* 235 * Note that buffers can be in one of 6 states: 236 * ARC_anon - anonymous (discussed below) 237 * ARC_mru - recently used, currently cached 238 * ARC_mru_ghost - recentely used, no longer in cache 239 * ARC_mfu - frequently used, currently cached 240 * ARC_mfu_ghost - frequently used, no longer in cache 241 * ARC_l2c_only - exists in L2ARC but not other states 242 * When there are no active references to the buffer, they are 243 * are linked onto a list in one of these arc states. These are 244 * the only buffers that can be evicted or deleted. Within each 245 * state there are multiple lists, one for meta-data and one for 246 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 247 * etc.) is tracked separately so that it can be managed more 248 * explicitly: favored over data, limited explicitly. 249 * 250 * Anonymous buffers are buffers that are not associated with 251 * a DVA. These are buffers that hold dirty block copies 252 * before they are written to stable storage. By definition, 253 * they are "ref'd" and are considered part of arc_mru 254 * that cannot be freed. Generally, they will aquire a DVA 255 * as they are written and migrate onto the arc_mru list. 256 * 257 * The ARC_l2c_only state is for buffers that are in the second 258 * level ARC but no longer in any of the ARC_m* lists. The second 259 * level ARC itself may also contain buffers that are in any of 260 * the ARC_m* states - meaning that a buffer can exist in two 261 * places. The reason for the ARC_l2c_only state is to keep the 262 * buffer header in the hash table, so that reads that hit the 263 * second level ARC benefit from these fast lookups. 
264 */ 265 266 typedef struct arc_state { 267 /* 268 * list of evictable buffers 269 */ 270 multilist_t arcs_list[ARC_BUFC_NUMTYPES]; 271 /* 272 * total amount of evictable data in this state 273 */ 274 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; 275 /* 276 * total amount of data in this state; this includes: evictable, 277 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 278 */ 279 refcount_t arcs_size; 280 } arc_state_t; 281 282 /* The 6 states: */ 283 static arc_state_t ARC_anon; 284 static arc_state_t ARC_mru; 285 static arc_state_t ARC_mru_ghost; 286 static arc_state_t ARC_mfu; 287 static arc_state_t ARC_mfu_ghost; 288 static arc_state_t ARC_l2c_only; 289 290 typedef struct arc_stats { 291 kstat_named_t arcstat_hits; 292 kstat_named_t arcstat_misses; 293 kstat_named_t arcstat_demand_hits_data; 294 kstat_named_t arcstat_demand_misses_data; 295 kstat_named_t arcstat_demand_hits_metadata; 296 kstat_named_t arcstat_demand_misses_metadata; 297 kstat_named_t arcstat_prefetch_hits_data; 298 kstat_named_t arcstat_prefetch_misses_data; 299 kstat_named_t arcstat_prefetch_hits_metadata; 300 kstat_named_t arcstat_prefetch_misses_metadata; 301 kstat_named_t arcstat_mru_hits; 302 kstat_named_t arcstat_mru_ghost_hits; 303 kstat_named_t arcstat_mfu_hits; 304 kstat_named_t arcstat_mfu_ghost_hits; 305 kstat_named_t arcstat_deleted; 306 /* 307 * Number of buffers that could not be evicted because the hash lock 308 * was held by another thread. The lock may not necessarily be held 309 * by something using the same buffer, since hash locks are shared 310 * by multiple buffers. 311 */ 312 kstat_named_t arcstat_mutex_miss; 313 /* 314 * Number of buffers skipped because they have I/O in progress, are 315 * indrect prefetch buffers that have not lived long enough, or are 316 * not from the spa we're trying to evict from. 317 */ 318 kstat_named_t arcstat_evict_skip; 319 /* 320 * Number of times arc_evict_state() was unable to evict enough 321 * buffers to reach it's target amount. 
322 */ 323 kstat_named_t arcstat_evict_not_enough; 324 kstat_named_t arcstat_evict_l2_cached; 325 kstat_named_t arcstat_evict_l2_eligible; 326 kstat_named_t arcstat_evict_l2_ineligible; 327 kstat_named_t arcstat_evict_l2_skip; 328 kstat_named_t arcstat_hash_elements; 329 kstat_named_t arcstat_hash_elements_max; 330 kstat_named_t arcstat_hash_collisions; 331 kstat_named_t arcstat_hash_chains; 332 kstat_named_t arcstat_hash_chain_max; 333 kstat_named_t arcstat_p; 334 kstat_named_t arcstat_c; 335 kstat_named_t arcstat_c_min; 336 kstat_named_t arcstat_c_max; 337 kstat_named_t arcstat_size; 338 /* 339 * Number of bytes consumed by internal ARC structures necessary 340 * for tracking purposes; these structures are not actually 341 * backed by ARC buffers. This includes arc_buf_hdr_t structures 342 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 343 * caches), and arc_buf_t structures (allocated via arc_buf_t 344 * cache). 345 */ 346 kstat_named_t arcstat_hdr_size; 347 /* 348 * Number of bytes consumed by ARC buffers of type equal to 349 * ARC_BUFC_DATA. This is generally consumed by buffers backing 350 * on disk user data (e.g. plain file contents). 351 */ 352 kstat_named_t arcstat_data_size; 353 /* 354 * Number of bytes consumed by ARC buffers of type equal to 355 * ARC_BUFC_METADATA. This is generally consumed by buffers 356 * backing on disk data that is used for internal ZFS 357 * structures (e.g. ZAP, dnode, indirect blocks, etc). 358 */ 359 kstat_named_t arcstat_metadata_size; 360 /* 361 * Number of bytes consumed by various buffers and structures 362 * not actually backed with ARC buffers. This includes bonus 363 * buffers (allocated directly via zio_buf_* functions), 364 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 365 * cache), and dnode_t structures (allocated via dnode_t cache). 366 */ 367 kstat_named_t arcstat_other_size; 368 /* 369 * Total number of bytes consumed by ARC buffers residing in the 370 * arc_anon state. 
This includes *all* buffers in the arc_anon 371 * state; e.g. data, metadata, evictable, and unevictable buffers 372 * are all included in this value. 373 */ 374 kstat_named_t arcstat_anon_size; 375 /* 376 * Number of bytes consumed by ARC buffers that meet the 377 * following criteria: backing buffers of type ARC_BUFC_DATA, 378 * residing in the arc_anon state, and are eligible for eviction 379 * (e.g. have no outstanding holds on the buffer). 380 */ 381 kstat_named_t arcstat_anon_evictable_data; 382 /* 383 * Number of bytes consumed by ARC buffers that meet the 384 * following criteria: backing buffers of type ARC_BUFC_METADATA, 385 * residing in the arc_anon state, and are eligible for eviction 386 * (e.g. have no outstanding holds on the buffer). 387 */ 388 kstat_named_t arcstat_anon_evictable_metadata; 389 /* 390 * Total number of bytes consumed by ARC buffers residing in the 391 * arc_mru state. This includes *all* buffers in the arc_mru 392 * state; e.g. data, metadata, evictable, and unevictable buffers 393 * are all included in this value. 394 */ 395 kstat_named_t arcstat_mru_size; 396 /* 397 * Number of bytes consumed by ARC buffers that meet the 398 * following criteria: backing buffers of type ARC_BUFC_DATA, 399 * residing in the arc_mru state, and are eligible for eviction 400 * (e.g. have no outstanding holds on the buffer). 401 */ 402 kstat_named_t arcstat_mru_evictable_data; 403 /* 404 * Number of bytes consumed by ARC buffers that meet the 405 * following criteria: backing buffers of type ARC_BUFC_METADATA, 406 * residing in the arc_mru state, and are eligible for eviction 407 * (e.g. have no outstanding holds on the buffer). 408 */ 409 kstat_named_t arcstat_mru_evictable_metadata; 410 /* 411 * Total number of bytes that *would have been* consumed by ARC 412 * buffers in the arc_mru_ghost state. The key thing to note 413 * here, is the fact that this size doesn't actually indicate 414 * RAM consumption. 
The ghost lists only consist of headers and 415 * don't actually have ARC buffers linked off of these headers. 416 * Thus, *if* the headers had associated ARC buffers, these 417 * buffers *would have* consumed this number of bytes. 418 */ 419 kstat_named_t arcstat_mru_ghost_size; 420 /* 421 * Number of bytes that *would have been* consumed by ARC 422 * buffers that are eligible for eviction, of type 423 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 424 */ 425 kstat_named_t arcstat_mru_ghost_evictable_data; 426 /* 427 * Number of bytes that *would have been* consumed by ARC 428 * buffers that are eligible for eviction, of type 429 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 430 */ 431 kstat_named_t arcstat_mru_ghost_evictable_metadata; 432 /* 433 * Total number of bytes consumed by ARC buffers residing in the 434 * arc_mfu state. This includes *all* buffers in the arc_mfu 435 * state; e.g. data, metadata, evictable, and unevictable buffers 436 * are all included in this value. 437 */ 438 kstat_named_t arcstat_mfu_size; 439 /* 440 * Number of bytes consumed by ARC buffers that are eligible for 441 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 442 * state. 443 */ 444 kstat_named_t arcstat_mfu_evictable_data; 445 /* 446 * Number of bytes consumed by ARC buffers that are eligible for 447 * eviction, of type ARC_BUFC_METADATA, and reside in the 448 * arc_mfu state. 449 */ 450 kstat_named_t arcstat_mfu_evictable_metadata; 451 /* 452 * Total number of bytes that *would have been* consumed by ARC 453 * buffers in the arc_mfu_ghost state. See the comment above 454 * arcstat_mru_ghost_size for more details. 455 */ 456 kstat_named_t arcstat_mfu_ghost_size; 457 /* 458 * Number of bytes that *would have been* consumed by ARC 459 * buffers that are eligible for eviction, of type 460 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 
461 */ 462 kstat_named_t arcstat_mfu_ghost_evictable_data; 463 /* 464 * Number of bytes that *would have been* consumed by ARC 465 * buffers that are eligible for eviction, of type 466 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 467 */ 468 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 469 kstat_named_t arcstat_l2_hits; 470 kstat_named_t arcstat_l2_misses; 471 kstat_named_t arcstat_l2_feeds; 472 kstat_named_t arcstat_l2_rw_clash; 473 kstat_named_t arcstat_l2_read_bytes; 474 kstat_named_t arcstat_l2_write_bytes; 475 kstat_named_t arcstat_l2_writes_sent; 476 kstat_named_t arcstat_l2_writes_done; 477 kstat_named_t arcstat_l2_writes_error; 478 kstat_named_t arcstat_l2_writes_lock_retry; 479 kstat_named_t arcstat_l2_evict_lock_retry; 480 kstat_named_t arcstat_l2_evict_reading; 481 kstat_named_t arcstat_l2_evict_l1cached; 482 kstat_named_t arcstat_l2_free_on_write; 483 kstat_named_t arcstat_l2_cdata_free_on_write; 484 kstat_named_t arcstat_l2_abort_lowmem; 485 kstat_named_t arcstat_l2_cksum_bad; 486 kstat_named_t arcstat_l2_io_error; 487 kstat_named_t arcstat_l2_size; 488 kstat_named_t arcstat_l2_asize; 489 kstat_named_t arcstat_l2_hdr_size; 490 kstat_named_t arcstat_l2_compress_successes; 491 kstat_named_t arcstat_l2_compress_zeros; 492 kstat_named_t arcstat_l2_compress_failures; 493 kstat_named_t arcstat_l2_log_blk_writes; 494 kstat_named_t arcstat_l2_log_blk_avg_size; 495 kstat_named_t arcstat_l2_data_to_meta_ratio; 496 kstat_named_t arcstat_l2_rebuild_successes; 497 kstat_named_t arcstat_l2_rebuild_abort_unsupported; 498 kstat_named_t arcstat_l2_rebuild_abort_io_errors; 499 kstat_named_t arcstat_l2_rebuild_abort_cksum_errors; 500 kstat_named_t arcstat_l2_rebuild_abort_loop_errors; 501 kstat_named_t arcstat_l2_rebuild_abort_lowmem; 502 kstat_named_t arcstat_l2_rebuild_size; 503 kstat_named_t arcstat_l2_rebuild_bufs; 504 kstat_named_t arcstat_l2_rebuild_bufs_precached; 505 kstat_named_t arcstat_l2_rebuild_psize; 506 kstat_named_t 
arcstat_l2_rebuild_log_blks; 507 kstat_named_t arcstat_memory_throttle_count; 508 kstat_named_t arcstat_duplicate_buffers; 509 kstat_named_t arcstat_duplicate_buffers_size; 510 kstat_named_t arcstat_duplicate_reads; 511 kstat_named_t arcstat_meta_used; 512 kstat_named_t arcstat_meta_limit; 513 kstat_named_t arcstat_meta_max; 514 kstat_named_t arcstat_meta_min; 515 kstat_named_t arcstat_sync_wait_for_async; 516 kstat_named_t arcstat_demand_hit_predictive_prefetch; 517 } arc_stats_t; 518 519 static arc_stats_t arc_stats = { 520 { "hits", KSTAT_DATA_UINT64 }, 521 { "misses", KSTAT_DATA_UINT64 }, 522 { "demand_data_hits", KSTAT_DATA_UINT64 }, 523 { "demand_data_misses", KSTAT_DATA_UINT64 }, 524 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 525 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 526 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 527 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 528 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 529 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 530 { "mru_hits", KSTAT_DATA_UINT64 }, 531 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 532 { "mfu_hits", KSTAT_DATA_UINT64 }, 533 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 534 { "deleted", KSTAT_DATA_UINT64 }, 535 { "mutex_miss", KSTAT_DATA_UINT64 }, 536 { "evict_skip", KSTAT_DATA_UINT64 }, 537 { "evict_not_enough", KSTAT_DATA_UINT64 }, 538 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 539 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 540 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 541 { "evict_l2_skip", KSTAT_DATA_UINT64 }, 542 { "hash_elements", KSTAT_DATA_UINT64 }, 543 { "hash_elements_max", KSTAT_DATA_UINT64 }, 544 { "hash_collisions", KSTAT_DATA_UINT64 }, 545 { "hash_chains", KSTAT_DATA_UINT64 }, 546 { "hash_chain_max", KSTAT_DATA_UINT64 }, 547 { "p", KSTAT_DATA_UINT64 }, 548 { "c", KSTAT_DATA_UINT64 }, 549 { "c_min", KSTAT_DATA_UINT64 }, 550 { "c_max", KSTAT_DATA_UINT64 }, 551 { "size", KSTAT_DATA_UINT64 }, 552 { "hdr_size", KSTAT_DATA_UINT64 }, 553 { "data_size", 
KSTAT_DATA_UINT64 }, 554 { "metadata_size", KSTAT_DATA_UINT64 }, 555 { "other_size", KSTAT_DATA_UINT64 }, 556 { "anon_size", KSTAT_DATA_UINT64 }, 557 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 558 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 559 { "mru_size", KSTAT_DATA_UINT64 }, 560 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 561 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 562 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 563 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 564 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 565 { "mfu_size", KSTAT_DATA_UINT64 }, 566 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 567 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 568 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 569 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 570 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 571 { "l2_hits", KSTAT_DATA_UINT64 }, 572 { "l2_misses", KSTAT_DATA_UINT64 }, 573 { "l2_feeds", KSTAT_DATA_UINT64 }, 574 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 575 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 576 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 577 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 578 { "l2_writes_done", KSTAT_DATA_UINT64 }, 579 { "l2_writes_error", KSTAT_DATA_UINT64 }, 580 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, 581 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 582 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 583 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 584 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 585 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 586 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 587 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 588 { "l2_io_error", KSTAT_DATA_UINT64 }, 589 { "l2_size", KSTAT_DATA_UINT64 }, 590 { "l2_asize", KSTAT_DATA_UINT64 }, 591 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 592 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 593 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 594 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 595 { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, 596 { 
"l2_log_blk_avg_size", KSTAT_DATA_UINT64 }, 597 { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, 598 { "l2_rebuild_successes", KSTAT_DATA_UINT64 }, 599 { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, 600 { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, 601 { "l2_rebuild_cksum_errors", KSTAT_DATA_UINT64 }, 602 { "l2_rebuild_loop_errors", KSTAT_DATA_UINT64 }, 603 { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, 604 { "l2_rebuild_size", KSTAT_DATA_UINT64 }, 605 { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, 606 { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, 607 { "l2_rebuild_psize", KSTAT_DATA_UINT64 }, 608 { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, 609 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 610 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 611 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 612 { "duplicate_reads", KSTAT_DATA_UINT64 }, 613 { "arc_meta_used", KSTAT_DATA_UINT64 }, 614 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 615 { "arc_meta_max", KSTAT_DATA_UINT64 }, 616 { "arc_meta_min", KSTAT_DATA_UINT64 }, 617 { "sync_wait_for_async", KSTAT_DATA_UINT64 }, 618 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, 619 }; 620 621 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 622 623 #define ARCSTAT_INCR(stat, val) \ 624 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 625 626 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 627 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 628 629 #define ARCSTAT_MAX(stat, val) { \ 630 uint64_t m; \ 631 while ((val) > (m = arc_stats.stat.value.ui64) && \ 632 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 633 continue; \ 634 } 635 636 #define ARCSTAT_MAXSTAT(stat) \ 637 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 638 639 /* 640 * We define a macro to allow ARC hits/misses to be easily broken down by 641 * two separate conditions, giving a total of four different subtypes for 642 * each of hits and misses (so eight statistics total). 
643 */ 644 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 645 if (cond1) { \ 646 if (cond2) { \ 647 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 648 } else { \ 649 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 650 } \ 651 } else { \ 652 if (cond2) { \ 653 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 654 } else { \ 655 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 656 } \ 657 } 658 659 /* 660 * This macro allows us to use kstats as floating averages. Each time we 661 * update this kstat, we first factor it and the update value by 662 * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall 663 * average. This macro assumes that integer loads and stores are atomic, but 664 * is not safe for multiple writers updating the kstat in parallel (only the 665 * last writer's update will remain). 666 */ 667 #define ARCSTAT_F_AVG_FACTOR 3 668 #define ARCSTAT_F_AVG(stat, value) \ 669 do { \ 670 uint64_t x = ARCSTAT(stat); \ 671 x = x - x / ARCSTAT_F_AVG_FACTOR + \ 672 (value) / ARCSTAT_F_AVG_FACTOR; \ 673 ARCSTAT(stat) = x; \ 674 _NOTE(CONSTCOND) \ 675 } while (0) 676 677 kstat_t *arc_ksp; 678 static arc_state_t *arc_anon; 679 static arc_state_t *arc_mru; 680 static arc_state_t *arc_mru_ghost; 681 static arc_state_t *arc_mfu; 682 static arc_state_t *arc_mfu_ghost; 683 static arc_state_t *arc_l2c_only; 684 685 /* 686 * There are several ARC variables that are critical to export as kstats -- 687 * but we don't want to have to grovel around in the kstat whenever we wish to 688 * manipulate them. For these variables, we therefore define them to be in 689 * terms of the statistic variable. This assures that we are not introducing 690 * the possibility of inconsistency by having shadow copies of the variables, 691 * while still allowing the code to be readable. 
692 */ 693 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 694 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 695 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 696 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 697 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 698 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 699 #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 700 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 701 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 702 703 #define L2ARC_IS_VALID_COMPRESS(_c_) \ 704 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 705 706 static int arc_no_grow; /* Don't try to grow cache size */ 707 static uint64_t arc_tempreserve; 708 static uint64_t arc_loaned_bytes; 709 710 typedef struct arc_callback arc_callback_t; 711 712 struct arc_callback { 713 void *acb_private; 714 arc_done_func_t *acb_done; 715 arc_buf_t *acb_buf; 716 zio_t *acb_zio_dummy; 717 arc_callback_t *acb_next; 718 }; 719 720 typedef struct arc_write_callback arc_write_callback_t; 721 722 struct arc_write_callback { 723 void *awcb_private; 724 arc_done_func_t *awcb_ready; 725 arc_done_func_t *awcb_physdone; 726 arc_done_func_t *awcb_done; 727 arc_buf_t *awcb_buf; 728 }; 729 730 /* 731 * ARC buffers are separated into multiple structs as a memory saving measure: 732 * - Common fields struct, always defined, and embedded within it: 733 * - L2-only fields, always allocated but undefined when not in L2ARC 734 * - L1-only fields, only allocated when in L1ARC 735 * 736 * Buffer in L1 Buffer only in L2 737 * +------------------------+ +------------------------+ 738 * | arc_buf_hdr_t | | arc_buf_hdr_t | 739 * | | | | 740 * | | | | 741 * | | | | 742 * +------------------------+ +------------------------+ 743 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 744 * | (undefined 
if L1-only) | | | 745 * +------------------------+ +------------------------+ 746 * | l1arc_buf_hdr_t | 747 * | | 748 * | | 749 * | | 750 * | | 751 * +------------------------+ 752 * 753 * Because it's possible for the L2ARC to become extremely large, we can wind 754 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 755 * is minimized by only allocating the fields necessary for an L1-cached buffer 756 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 757 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 758 * words in pointers. arc_hdr_realloc() is used to switch a header between 759 * these two allocation states. 760 */ 761 typedef struct l1arc_buf_hdr { 762 kmutex_t b_freeze_lock; 763 #ifdef ZFS_DEBUG 764 /* 765 * used for debugging wtih kmem_flags - by allocating and freeing 766 * b_thawed when the buffer is thawed, we get a record of the stack 767 * trace that thawed it. 768 */ 769 void *b_thawed; 770 #endif 771 772 arc_buf_t *b_buf; 773 uint32_t b_datacnt; 774 /* for waiting on writes to complete */ 775 kcondvar_t b_cv; 776 777 /* protected by arc state mutex */ 778 arc_state_t *b_state; 779 multilist_node_t b_arc_node; 780 781 /* updated atomically */ 782 clock_t b_arc_access; 783 784 /* self protecting */ 785 refcount_t b_refcnt; 786 787 arc_callback_t *b_acb; 788 /* temporary buffer holder for in-flight compressed data */ 789 void *b_tmp_cdata; 790 } l1arc_buf_hdr_t; 791 792 typedef struct l2arc_dev l2arc_dev_t; 793 794 typedef struct l2arc_buf_hdr { 795 /* protected by arc_buf_hdr mutex */ 796 l2arc_dev_t *b_dev; /* L2ARC device */ 797 uint64_t b_daddr; /* disk address, offset byte */ 798 /* real alloc'd buffer size depending on b_compress applied */ 799 int32_t b_asize; 800 uint8_t b_compress; 801 802 list_node_t b_l2node; 803 } l2arc_buf_hdr_t; 804 805 struct arc_buf_hdr { 806 /* protected by hash lock */ 807 dva_t b_dva; 808 uint64_t b_birth; 809 /* 810 * Even though 
this checksum is only set/verified when a buffer is in 811 * the L1 cache, it needs to be in the set of common fields because it 812 * must be preserved from the time before a buffer is written out to 813 * L2ARC until after it is read back in. 814 */ 815 zio_cksum_t *b_freeze_cksum; 816 817 arc_buf_hdr_t *b_hash_next; 818 arc_flags_t b_flags; 819 820 /* immutable */ 821 int32_t b_size; 822 uint64_t b_spa; 823 824 /* L2ARC fields. Undefined when not in L2ARC. */ 825 l2arc_buf_hdr_t b_l2hdr; 826 /* L1ARC fields. Undefined when in l2arc_only state */ 827 l1arc_buf_hdr_t b_l1hdr; 828 }; 829 830 static arc_buf_t *arc_eviction_list; 831 static arc_buf_hdr_t arc_eviction_hdr; 832 833 #define GHOST_STATE(state) \ 834 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 835 (state) == arc_l2c_only) 836 837 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 838 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 839 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 840 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 841 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 842 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 843 844 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 845 #define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 846 #define HDR_L2_READING(hdr) \ 847 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 848 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 849 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 850 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 851 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 852 853 #define HDR_ISTYPE_METADATA(hdr) \ 854 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 855 #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 856 857 #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 858 #define HDR_HAS_L2HDR(hdr) 
((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 859 860 /* 861 * Other sizes 862 */ 863 864 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 865 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 866 867 /* 868 * Hash table routines 869 */ 870 871 #define HT_LOCK_PAD 64 872 873 struct ht_lock { 874 kmutex_t ht_lock; 875 #ifdef _KERNEL 876 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 877 #endif 878 }; 879 880 #define BUF_LOCKS 256 881 typedef struct buf_hash_table { 882 uint64_t ht_mask; 883 arc_buf_hdr_t **ht_table; 884 struct ht_lock ht_locks[BUF_LOCKS]; 885 } buf_hash_table_t; 886 887 static buf_hash_table_t buf_hash_table; 888 889 #define BUF_HASH_INDEX(spa, dva, birth) \ 890 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 891 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 892 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 893 #define HDR_LOCK(hdr) \ 894 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 895 896 uint64_t zfs_crc64_table[256]; 897 898 /* 899 * Level 2 ARC 900 */ 901 902 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 903 #define L2ARC_HEADROOM 2 /* num of writes */ 904 /* 905 * If we discover during ARC scan any buffers to be compressed, we boost 906 * our headroom for the next scanning cycle by this percentage multiple. 907 */ 908 #define L2ARC_HEADROOM_BOOST 200 909 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 910 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 911 912 /* 913 * Used to distinguish headers that are being process by 914 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk 915 * address. This can happen when the header is added to the l2arc's list 916 * of buffers to write in the first stage of l2arc_write_buffers(), but 917 * has not yet been written out which happens in the second stage of 918 * l2arc_write_buffers(). 
919 */ 920 #define L2ARC_ADDR_UNSET ((uint64_t)(-1)) 921 922 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 923 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 924 925 /* L2ARC Performance Tunables */ 926 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 927 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 928 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 929 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 930 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 931 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 932 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 933 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 934 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 935 936 static list_t L2ARC_dev_list; /* device list */ 937 static list_t *l2arc_dev_list; /* device list pointer */ 938 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 939 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 940 static list_t L2ARC_free_on_write; /* free after write buf list */ 941 static list_t *l2arc_free_on_write; /* free after write list ptr */ 942 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 943 static uint64_t l2arc_ndev; /* number of devices */ 944 945 typedef struct l2arc_read_callback { 946 arc_buf_t *l2rcb_buf; /* read buffer */ 947 spa_t *l2rcb_spa; /* spa */ 948 blkptr_t l2rcb_bp; /* original blkptr */ 949 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 950 int l2rcb_flags; /* original flags */ 951 enum zio_compress l2rcb_compress; /* applied compress */ 952 } l2arc_read_callback_t; 953 954 typedef struct l2arc_write_callback { 955 l2arc_dev_t *l2wcb_dev; /* device info */ 956 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 957 list_t l2wcb_log_blk_buflist; /* in-flight log blocks */ 958 } l2arc_write_callback_t; 959 960 typedef struct 
l2arc_data_free { 961 /* protected by l2arc_free_on_write_mtx */ 962 void *l2df_data; 963 size_t l2df_size; 964 void (*l2df_func)(void *, size_t); 965 list_node_t l2df_list_node; 966 } l2arc_data_free_t; 967 968 static kmutex_t l2arc_feed_thr_lock; 969 static kcondvar_t l2arc_feed_thr_cv; 970 static uint8_t l2arc_thread_exit; 971 972 static void arc_get_data_buf(arc_buf_t *); 973 static void arc_access(arc_buf_hdr_t *, kmutex_t *); 974 static boolean_t arc_is_overflowing(); 975 static void arc_buf_watch(arc_buf_t *); 976 static void l2arc_read_done(zio_t *zio); 977 static l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); 978 979 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 980 static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 981 static arc_buf_contents_t arc_flags_to_bufc(uint32_t); 982 983 static boolean_t l2arc_write_eligible(uint64_t, uint64_t, arc_buf_hdr_t *); 984 static void l2arc_read_done(zio_t *); 985 986 static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 987 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 988 static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 989 990 static void 991 arc_update_hit_stat(arc_buf_hdr_t *hdr, boolean_t hit) 992 { 993 boolean_t pf = !HDR_PREFETCH(hdr); 994 switch (arc_buf_type(hdr)) { 995 case ARC_BUFC_DATA: 996 ARCSTAT_CONDSTAT(pf, demand, prefetch, hit, hits, misses, data); 997 break; 998 case ARC_BUFC_METADATA: 999 ARCSTAT_CONDSTAT(pf, demand, prefetch, hit, hits, misses, 1000 metadata); 1001 break; 1002 default: 1003 break; 1004 } 1005 } 1006 1007 enum { 1008 L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ 1009 }; 1010 1011 /* 1012 * Pointer used in persistent L2ARC (for pointing to log blocks & ARC buffers). 
*/
typedef struct l2arc_log_blkptr {
	uint64_t	lbp_daddr;	/* device address of log */
	/*
	 * lbp_prop is the same format as the blk_prop in blkptr_t:
	 *	* logical size (in sectors)
	 *	* physical (compressed) size (in sectors)
	 *	* compression algorithm (we always LZ4-compress l2arc logs)
	 *	* checksum algorithm (used for lbp_cksum)
	 *	* object type & level (unused for now)
	 */
	uint64_t	lbp_prop;
	zio_cksum_t	lbp_cksum;	/* fletcher4 of log */
} l2arc_log_blkptr_t;

/*
 * The persistent L2ARC device header.
 * Byte order of magic determines whether 64-bit bswap of fields is necessary.
 *
 * This is an on-disk format: the CTASSERT below pins the structure to exactly
 * SPA_MINBLOCKSIZE bytes, so any field added here must be taken out of dh_pad.
 */
typedef struct l2arc_dev_hdr_phys {
	uint64_t	dh_magic;		/* L2ARC_DEV_HDR_MAGIC */
	zio_cksum_t	dh_self_cksum;		/* fletcher4 of fields below */

	/*
	 * Global L2ARC device state and metadata.
	 */
	uint64_t	dh_spa_guid;
	uint64_t	dh_alloc_space;		/* vdev space alloc status */
	uint64_t	dh_flags;		/* l2arc_dev_hdr_flags_t */

	/*
	 * Start of log block chain. [0] -> newest log, [1] -> one older (used
	 * for initiating prefetch).
	 */
	l2arc_log_blkptr_t	dh_start_lbps[2];

	const uint64_t	dh_pad[44];		/* pad to 512 bytes */
} l2arc_dev_hdr_phys_t;
CTASSERT(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE);

/*
 * A single ARC buffer header entry in a l2arc_log_blk_phys_t.
*/
typedef struct l2arc_log_ent_phys {
	dva_t			le_dva;	/* dva of buffer */
	uint64_t		le_birth;	/* birth txg of buffer */
	zio_cksum_t		le_freeze_cksum;
	/*
	 * le_prop is the same format as the blk_prop in blkptr_t:
	 *	* logical size (in sectors)
	 *	* physical (compressed) size (in sectors)
	 *	* compression algorithm
	 *	* checksum algorithm (used for b_freeze_cksum)
	 *	* object type & level (used to restore arc_buf_contents_t)
	 */
	uint64_t		le_prop;
	uint64_t		le_daddr;	/* buf location on l2dev */
	const uint64_t		le_pad[7];	/* resv'd for future use */
} l2arc_log_ent_phys_t;

/*
 * These design limits give us the following metadata overhead (before
 * compression):
 *	avg_blk_sz	overhead
 *	1k		12.51 %
 *	2k		 6.26 %
 *	4k		 3.13 %
 *	8k		 1.56 %
 *	16k		 0.78 %
 *	32k		 0.39 %
 *	64k		 0.20 %
 *	128k		 0.10 %
 * Compression should be able to squeeze these down by about a factor of 2x.
 */
#define	L2ARC_LOG_BLK_SIZE			(128 * 1024)	/* 128k */
#define	L2ARC_LOG_BLK_HEADER_LEN		(128)
/* Number of entries that fit in a log block after its header. */
#define	L2ARC_LOG_BLK_ENTRIES			/* 1023 entries */	\
	((L2ARC_LOG_BLK_SIZE - L2ARC_LOG_BLK_HEADER_LEN) /	\
	sizeof (l2arc_log_ent_phys_t))
/*
 * Maximum amount of data in an l2arc log block (used to terminate rebuilding
 * before we hit the write head and restore potentially corrupted blocks).
 */
#define	L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE	\
	(SPA_MAXBLOCKSIZE * L2ARC_LOG_BLK_ENTRIES)
/*
 * For the persistency and rebuild algorithms to operate reliably we need
 * the L2ARC device to at least be able to hold 3 full log blocks (otherwise
 * excessive log block looping might confuse the log chain end detection).
 * Under normal circumstances this is not a problem, since this is somewhere
 * around only 400 MB.
*/
#define	L2ARC_PERSIST_MIN_SIZE	(3 * L2ARC_LOG_BLK_MAX_PAYLOAD_SIZE)

/*
 * A log block of up to 1023 ARC buffer log entries, chained into the
 * persistent L2ARC metadata linked list. Byte order of magic determines
 * whether 64-bit bswap of fields is necessary.
 *
 * This is an on-disk format; the CTASSERTs below pin both the total size
 * (L2ARC_LOG_BLK_SIZE) and the header length (L2ARC_LOG_BLK_HEADER_LEN).
 */
typedef struct l2arc_log_blk_phys {
	/* Header - see L2ARC_LOG_BLK_HEADER_LEN above */
	uint64_t		lb_magic;	/* L2ARC_LOG_BLK_MAGIC */
	l2arc_log_blkptr_t	lb_back2_lbp;	/* back 2 steps in chain */
	uint64_t		lb_pad[9];	/* resv'd for future use */
	/* Payload */
	l2arc_log_ent_phys_t	lb_entries[L2ARC_LOG_BLK_ENTRIES];
} l2arc_log_blk_phys_t;

CTASSERT(sizeof (l2arc_log_blk_phys_t) == L2ARC_LOG_BLK_SIZE);
CTASSERT(offsetof(l2arc_log_blk_phys_t, lb_entries) -
    offsetof(l2arc_log_blk_phys_t, lb_magic) == L2ARC_LOG_BLK_HEADER_LEN);

/*
 * These structures hold in-flight l2arc_log_blk_phys_t's as they're being
 * written to the L2ARC device. They may be compressed, hence the uint8_t[].
1128 */ 1129 typedef struct l2arc_log_blk_buf { 1130 uint8_t lbb_log_blk[sizeof (l2arc_log_blk_phys_t)]; 1131 list_node_t lbb_node; 1132 } l2arc_log_blk_buf_t; 1133 1134 /* Macros for the manipulation fields in the blk_prop format of blkptr_t */ 1135 #define BLKPROP_GET_LSIZE(_obj, _field) \ 1136 BF64_GET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1) 1137 #define BLKPROP_SET_LSIZE(_obj, _field, x) \ 1138 BF64_SET_SB((_obj)->_field, 0, 16, SPA_MINBLOCKSHIFT, 1, x) 1139 #define BLKPROP_GET_PSIZE(_obj, _field) \ 1140 BF64_GET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1) 1141 #define BLKPROP_SET_PSIZE(_obj, _field, x) \ 1142 BF64_SET_SB((_obj)->_field, 16, 16, SPA_MINBLOCKSHIFT, 1, x) 1143 #define BLKPROP_GET_COMPRESS(_obj, _field) \ 1144 BF64_GET((_obj)->_field, 32, 8) 1145 #define BLKPROP_SET_COMPRESS(_obj, _field, x) \ 1146 BF64_SET((_obj)->_field, 32, 8, x) 1147 #define BLKPROP_GET_CHECKSUM(_obj, _field) \ 1148 BF64_GET((_obj)->_field, 40, 8) 1149 #define BLKPROP_SET_CHECKSUM(_obj, _field, x) \ 1150 BF64_SET((_obj)->_field, 40, 8, x) 1151 #define BLKPROP_GET_TYPE(_obj, _field) \ 1152 BF64_GET((_obj)->_field, 48, 8) 1153 #define BLKPROP_SET_TYPE(_obj, _field, x) \ 1154 BF64_SET((_obj)->_field, 48, 8, x) 1155 1156 /* Macros for manipulating a l2arc_log_blkptr_t->lbp_prop field */ 1157 #define LBP_GET_LSIZE(_add) BLKPROP_GET_LSIZE(_add, lbp_prop) 1158 #define LBP_SET_LSIZE(_add, x) BLKPROP_SET_LSIZE(_add, lbp_prop, x) 1159 #define LBP_GET_PSIZE(_add) BLKPROP_GET_PSIZE(_add, lbp_prop) 1160 #define LBP_SET_PSIZE(_add, x) BLKPROP_SET_PSIZE(_add, lbp_prop, x) 1161 #define LBP_GET_COMPRESS(_add) BLKPROP_GET_COMPRESS(_add, lbp_prop) 1162 #define LBP_SET_COMPRESS(_add, x) BLKPROP_SET_COMPRESS(_add, lbp_prop, \ 1163 x) 1164 #define LBP_GET_CHECKSUM(_add) BLKPROP_GET_CHECKSUM(_add, lbp_prop) 1165 #define LBP_SET_CHECKSUM(_add, x) BLKPROP_SET_CHECKSUM(_add, lbp_prop, \ 1166 x) 1167 #define LBP_GET_TYPE(_add) BLKPROP_GET_TYPE(_add, lbp_prop) 1168 #define LBP_SET_TYPE(_add, 
x) BLKPROP_SET_TYPE(_add, lbp_prop, x) 1169 1170 /* Macros for manipulating a l2arc_log_ent_phys_t->le_prop field */ 1171 #define LE_GET_LSIZE(_le) BLKPROP_GET_LSIZE(_le, le_prop) 1172 #define LE_SET_LSIZE(_le, x) BLKPROP_SET_LSIZE(_le, le_prop, x) 1173 #define LE_GET_PSIZE(_le) BLKPROP_GET_PSIZE(_le, le_prop) 1174 #define LE_SET_PSIZE(_le, x) BLKPROP_SET_PSIZE(_le, le_prop, x) 1175 #define LE_GET_COMPRESS(_le) BLKPROP_GET_COMPRESS(_le, le_prop) 1176 #define LE_SET_COMPRESS(_le, x) BLKPROP_SET_COMPRESS(_le, le_prop, x) 1177 #define LE_GET_CHECKSUM(_le) BLKPROP_GET_CHECKSUM(_le, le_prop) 1178 #define LE_SET_CHECKSUM(_le, x) BLKPROP_SET_CHECKSUM(_le, le_prop, x) 1179 #define LE_GET_TYPE(_le) BLKPROP_GET_TYPE(_le, le_prop) 1180 #define LE_SET_TYPE(_le, x) BLKPROP_SET_TYPE(_le, le_prop, x) 1181 1182 #define PTR_SWAP(x, y) \ 1183 do { \ 1184 void *tmp = (x);\ 1185 x = y; \ 1186 y = tmp; \ 1187 _NOTE(CONSTCOND)\ 1188 } while (0) 1189 1190 #define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ 1191 #define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ 1192 1193 /* 1194 * Performance tuning of L2ARC persistency: 1195 * 1196 * l2arc_rebuild_enabled : Controls whether L2ARC device adds (either at 1197 * pool import or when adding one manually later) will attempt 1198 * to rebuild L2ARC buffer contents. In special circumstances, 1199 * the administrator may want to set this to B_FALSE, if they 1200 * are having trouble importing a pool or attaching an L2ARC 1201 * device (e.g. the L2ARC device is slow to read in stored log 1202 * metadata, or the metadata has become somehow 1203 * fragmented/unusable). 1204 */ 1205 boolean_t l2arc_rebuild_enabled = B_TRUE; 1206 1207 /* L2ARC persistency rebuild control routines. */ 1208 static void l2arc_dev_rebuild_start(l2arc_dev_t *dev); 1209 static int l2arc_rebuild(l2arc_dev_t *dev); 1210 1211 /* L2ARC persistency read I/O routines. 
*/
static int l2arc_dev_hdr_read(l2arc_dev_t *dev);
static int l2arc_log_blk_read(l2arc_dev_t *dev,
    const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp,
    l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
    uint8_t *this_lb_buf, uint8_t *next_lb_buf,
    zio_t *this_io, zio_t **next_io);
static zio_t *l2arc_log_blk_prefetch(vdev_t *vd,
    const l2arc_log_blkptr_t *lp, uint8_t *lb_buf);
static void l2arc_log_blk_prefetch_abort(zio_t *zio);

/* L2ARC persistency block restoration routines. */
static void l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid,
    const l2arc_log_blk_phys_t *lb, uint64_t lb_psize);
static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le,
    l2arc_dev_t *dev, uint64_t guid);

/* L2ARC persistency write I/O routines. */
static void l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio);
static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio,
    l2arc_write_callback_t *cb);

/* L2ARC persistency auxiliary routines.
*/ 1234 static boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, 1235 const l2arc_log_blkptr_t *lp); 1236 static void l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, 1237 zio_cksum_t *cksum); 1238 static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, 1239 const arc_buf_hdr_t *ab); 1240 static inline boolean_t l2arc_range_check_overlap(uint64_t bottom, 1241 uint64_t top, uint64_t check); 1242 1243 /* 1244 * L2ARC Internals 1245 */ 1246 struct l2arc_dev { 1247 vdev_t *l2ad_vdev; /* vdev */ 1248 spa_t *l2ad_spa; /* spa */ 1249 uint64_t l2ad_hand; /* next write location */ 1250 uint64_t l2ad_start; /* first addr on device */ 1251 uint64_t l2ad_end; /* last addr on device */ 1252 boolean_t l2ad_first; /* first sweep through */ 1253 boolean_t l2ad_writing; /* currently writing */ 1254 kmutex_t l2ad_mtx; /* lock for buffer list */ 1255 list_t l2ad_buflist; /* buffer list */ 1256 list_node_t l2ad_node; /* device list node */ 1257 refcount_t l2ad_alloc; /* allocated bytes */ 1258 l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ 1259 uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ 1260 l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ 1261 int l2ad_log_ent_idx; /* index into cur log blk */ 1262 /* number of bytes in current log block's payload */ 1263 uint64_t l2ad_log_blk_payload_asize; 1264 /* flag indicating whether a rebuild is scheduled or is going on */ 1265 boolean_t l2ad_rebuild; 1266 boolean_t l2ad_rebuild_cancel; 1267 kt_did_t l2ad_rebuild_did; 1268 }; 1269 1270 static inline uint64_t 1271 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1272 { 1273 uint8_t *vdva = (uint8_t *)dva; 1274 uint64_t crc = -1ULL; 1275 int i; 1276 1277 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 1278 1279 for (i = 0; i < sizeof (dva_t); i++) 1280 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 1281 1282 crc ^= (spa>>8) ^ birth; 1283 1284 return (crc); 1285 } 1286 1287 #define BUF_EMPTY(buf) \ 1288 
((buf)->b_dva.dva_word[0] == 0 && \ 1289 (buf)->b_dva.dva_word[1] == 0) 1290 1291 #define BUF_EQUAL(spa, dva, birth, buf) \ 1292 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1293 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1294 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 1295 1296 static void 1297 buf_discard_identity(arc_buf_hdr_t *hdr) 1298 { 1299 hdr->b_dva.dva_word[0] = 0; 1300 hdr->b_dva.dva_word[1] = 0; 1301 hdr->b_birth = 0; 1302 } 1303 1304 static arc_buf_hdr_t * 1305 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1306 { 1307 const dva_t *dva = BP_IDENTITY(bp); 1308 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1309 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1310 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1311 arc_buf_hdr_t *hdr; 1312 1313 mutex_enter(hash_lock); 1314 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1315 hdr = hdr->b_hash_next) { 1316 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1317 *lockp = hash_lock; 1318 return (hdr); 1319 } 1320 } 1321 mutex_exit(hash_lock); 1322 *lockp = NULL; 1323 return (NULL); 1324 } 1325 1326 /* 1327 * Insert an entry into the hash table. If there is already an element 1328 * equal to elem in the hash table, then the already existing element 1329 * will be returned and the new element will not be inserted. 1330 * Otherwise returns NULL. 1331 * If lockp == NULL, the caller is assumed to already hold the hash lock. 
1332 */ 1333 static arc_buf_hdr_t * 1334 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1335 { 1336 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1337 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1338 arc_buf_hdr_t *fhdr; 1339 uint32_t i; 1340 1341 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1342 ASSERT(hdr->b_birth != 0); 1343 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1344 1345 if (lockp != NULL) { 1346 *lockp = hash_lock; 1347 mutex_enter(hash_lock); 1348 } else { 1349 ASSERT(MUTEX_HELD(hash_lock)); 1350 } 1351 1352 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1353 fhdr = fhdr->b_hash_next, i++) { 1354 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1355 return (fhdr); 1356 } 1357 1358 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1359 buf_hash_table.ht_table[idx] = hdr; 1360 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1361 1362 /* collect some hash table performance data */ 1363 if (i > 0) { 1364 ARCSTAT_BUMP(arcstat_hash_collisions); 1365 if (i == 1) 1366 ARCSTAT_BUMP(arcstat_hash_chains); 1367 1368 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1369 } 1370 1371 ARCSTAT_BUMP(arcstat_hash_elements); 1372 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1373 1374 return (NULL); 1375 } 1376 1377 static void 1378 buf_hash_remove(arc_buf_hdr_t *hdr) 1379 { 1380 arc_buf_hdr_t *fhdr, **hdrp; 1381 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1382 1383 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1384 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1385 1386 hdrp = &buf_hash_table.ht_table[idx]; 1387 while ((fhdr = *hdrp) != hdr) { 1388 ASSERT(fhdr != NULL); 1389 hdrp = &fhdr->b_hash_next; 1390 } 1391 *hdrp = hdr->b_hash_next; 1392 hdr->b_hash_next = NULL; 1393 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1394 1395 /* collect some hash table performance data */ 1396 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1397 1398 if (buf_hash_table.ht_table[idx] && 1399 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1400 
ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1401 } 1402 1403 /* 1404 * Global data structures and functions for the buf kmem cache. 1405 */ 1406 static kmem_cache_t *hdr_full_cache; 1407 static kmem_cache_t *hdr_l2only_cache; 1408 static kmem_cache_t *buf_cache; 1409 1410 static void 1411 buf_fini(void) 1412 { 1413 int i; 1414 1415 kmem_free(buf_hash_table.ht_table, 1416 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1417 for (i = 0; i < BUF_LOCKS; i++) 1418 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1419 kmem_cache_destroy(hdr_full_cache); 1420 kmem_cache_destroy(hdr_l2only_cache); 1421 kmem_cache_destroy(buf_cache); 1422 } 1423 1424 /* 1425 * Constructor callback - called when the cache is empty 1426 * and a new buf is requested. 1427 */ 1428 /* ARGSUSED */ 1429 static int 1430 hdr_full_cons(void *vbuf, void *unused, int kmflag) 1431 { 1432 arc_buf_hdr_t *hdr = vbuf; 1433 1434 bzero(hdr, HDR_FULL_SIZE); 1435 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1436 refcount_create(&hdr->b_l1hdr.b_refcnt); 1437 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1438 multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1439 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1440 1441 return (0); 1442 } 1443 1444 /* ARGSUSED */ 1445 static int 1446 hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1447 { 1448 arc_buf_hdr_t *hdr = vbuf; 1449 1450 bzero(hdr, HDR_L2ONLY_SIZE); 1451 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1452 1453 return (0); 1454 } 1455 1456 /* ARGSUSED */ 1457 static int 1458 buf_cons(void *vbuf, void *unused, int kmflag) 1459 { 1460 arc_buf_t *buf = vbuf; 1461 1462 bzero(buf, sizeof (arc_buf_t)); 1463 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1464 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1465 1466 return (0); 1467 } 1468 1469 /* 1470 * Destructor callback - called when a cached buf is 1471 * no longer required. 
1472 */ 1473 /* ARGSUSED */ 1474 static void 1475 hdr_full_dest(void *vbuf, void *unused) 1476 { 1477 arc_buf_hdr_t *hdr = vbuf; 1478 1479 ASSERT(BUF_EMPTY(hdr)); 1480 cv_destroy(&hdr->b_l1hdr.b_cv); 1481 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1482 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1483 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1484 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1485 } 1486 1487 /* ARGSUSED */ 1488 static void 1489 hdr_l2only_dest(void *vbuf, void *unused) 1490 { 1491 arc_buf_hdr_t *hdr = vbuf; 1492 1493 ASSERT(BUF_EMPTY(hdr)); 1494 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1495 } 1496 1497 /* ARGSUSED */ 1498 static void 1499 buf_dest(void *vbuf, void *unused) 1500 { 1501 arc_buf_t *buf = vbuf; 1502 1503 mutex_destroy(&buf->b_evict_lock); 1504 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1505 } 1506 1507 /* 1508 * Reclaim callback -- invoked when memory is low. 1509 */ 1510 /* ARGSUSED */ 1511 static void 1512 hdr_recl(void *unused) 1513 { 1514 dprintf("hdr_recl called\n"); 1515 /* 1516 * umem calls the reclaim func when we destroy the buf cache, 1517 * which is after we do arc_fini(). 1518 */ 1519 if (!arc_dead) 1520 cv_signal(&arc_reclaim_thread_cv); 1521 } 1522 1523 static void 1524 buf_init(void) 1525 { 1526 uint64_t *ct; 1527 uint64_t hsize = 1ULL << 12; 1528 int i, j; 1529 1530 /* 1531 * The hash table is big enough to fill all of physical memory 1532 * with an average block size of zfs_arc_average_blocksize (default 8K). 1533 * By default, the table will take up 1534 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
1535 */ 1536 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) 1537 hsize <<= 1; 1538 retry: 1539 buf_hash_table.ht_mask = hsize - 1; 1540 buf_hash_table.ht_table = 1541 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1542 if (buf_hash_table.ht_table == NULL) { 1543 ASSERT(hsize > (1ULL << 8)); 1544 hsize >>= 1; 1545 goto retry; 1546 } 1547 1548 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1549 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1550 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1551 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1552 NULL, NULL, 0); 1553 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1554 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1555 1556 for (i = 0; i < 256; i++) 1557 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1558 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1559 1560 for (i = 0; i < BUF_LOCKS; i++) { 1561 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1562 NULL, MUTEX_DEFAULT, NULL); 1563 } 1564 } 1565 1566 /* 1567 * Transition between the two allocation states for the arc_buf_hdr struct. 1568 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1569 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1570 * version is used when a cache buffer is only in the L2ARC in order to reduce 1571 * memory usage. 
*/
static arc_buf_hdr_t *
arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new)
{
	ASSERT(HDR_HAS_L2HDR(hdr));

	arc_buf_hdr_t *nhdr;
	l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;

	/* Exactly one of the two transitions: full -> l2only or back. */
	ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) ||
	    (old == hdr_l2only_cache && new == hdr_full_cache));

	nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);

	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
	buf_hash_remove(hdr);

	/* Only the common fields and the L2 sub-header are carried over. */
	bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);

	if (new == hdr_full_cache) {
		nhdr->b_flags |= ARC_FLAG_HAS_L1HDR;
		/*
		 * arc_access and arc_change_state need to be aware that a
		 * header has just come out of L2ARC, so we set its state to
		 * l2c_only even though it's about to change.
		 */
		nhdr->b_l1hdr.b_state = arc_l2c_only;

		/* Verify previous threads set to NULL before freeing */
		ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL);
	} else {
		ASSERT(hdr->b_l1hdr.b_buf == NULL);
		ASSERT0(hdr->b_l1hdr.b_datacnt);

		/*
		 * If we've reached here, we must have been called from
		 * arc_evict_hdr(), as such we should have already been
		 * removed from any ghost list we were previously on
		 * (which protects us from racing with arc_evict_state),
		 * thus no locking is needed during this check.
		 */
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));

		/*
		 * A buffer must not be moved into the arc_l2c_only
		 * state if it's not finished being written out to the
		 * l2arc device. Otherwise, the b_l1hdr.b_tmp_cdata field
		 * might try to be accessed, even though it was removed.
		 */
		VERIFY(!HDR_L2_WRITING(hdr));
		VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);

#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif

		nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR;
	}
	/*
	 * The header has been reallocated so we need to re-insert it into any
	 * lists it was on.
	 */
	(void) buf_hash_insert(nhdr, NULL);

	ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));

	mutex_enter(&dev->l2ad_mtx);

	/*
	 * We must place the realloc'ed header back into the list at
	 * the same spot. Otherwise, if it's placed earlier in the list,
	 * l2arc_write_buffers() could find it during the function's
	 * write phase, and try to write it out to the l2arc.
	 */
	list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
	list_remove(&dev->l2ad_buflist, hdr);

	mutex_exit(&dev->l2ad_mtx);

	/*
	 * Since we're using the pointer address as the tag when
	 * incrementing and decrementing the l2ad_alloc refcount, we
	 * must remove the old pointer (that we're about to destroy) and
	 * add the new pointer to the refcount. Otherwise we'd remove
	 * the wrong pointer address when calling arc_hdr_destroy() later.
	 */

	(void) refcount_remove_many(&dev->l2ad_alloc,
	    hdr->b_l2hdr.b_asize, hdr);

	(void) refcount_add_many(&dev->l2ad_alloc,
	    nhdr->b_l2hdr.b_asize, nhdr);

	/*
	 * The old header's destructor asserts BUF_EMPTY(), and the checksum
	 * pointer now belongs to nhdr (it was copied by the bcopy above).
	 */
	buf_discard_identity(hdr);
	hdr->b_freeze_cksum = NULL;
	kmem_cache_free(old, hdr);

	return (nhdr);
}


#define	ARC_MINTIME	(hz>>4) /* 62 ms */

/*
 * With ZFS_DEBUG_MODIFY set, recompute the fletcher-2 checksum of the
 * buffer's data and panic if it no longer matches the stored freeze
 * checksum. Nothing is checked when the flag is clear, when the header has
 * no stored checksum, or when the header has an I/O error recorded.
 */
static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
}

/*
 * Return non-zero if the fletcher-2 checksum of the buffer's data equals
 * the header's stored freeze checksum.
 *
 * NOTE(review): b_freeze_cksum is dereferenced without a NULL check, so
 * callers presumably guarantee that a checksum is present - confirm.
 */
static int
arc_cksum_equal(arc_buf_t *buf)
{
	zio_cksum_t zc;
	int equal;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc);
	equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);

	return (equal);
}

/*
 * Compute and store the freeze checksum for the buffer's data, then arm
 * the write watchpoint via arc_buf_watch().
 *
 * This is a no-op unless "force" is set or ZFS_DEBUG_MODIFY is enabled,
 * and also when the header already has a checksum.
 */
static void
arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
	if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    NULL, buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	arc_buf_watch(buf);
}

#ifndef _KERNEL
/* Control message written to arc_procfd by the watch/unwatch helpers. */
typedef struct procctl {
	long cmd;
	prwatch_t prwatch;
} procctl_t;
#endif

/*
 * Userland only, and only when arc_watch is set: write a PCWATCH request
 * with size 0 and no flags for the buffer's data address. Compiles to
 * nothing in the kernel.
 */
/* ARGSUSED */
static void
arc_buf_unwatch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = 0;
		ctl.prwatch.pr_wflags = 0;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}

/*
 * Userland only, and only when arc_watch is set: write a PCWATCH request
 * with WA_WRITE covering the buffer's b_size bytes of data. Compiles to
 * nothing in the kernel.
 */
/* ARGSUSED */
static void
arc_buf_watch(arc_buf_t *buf)
{
#ifndef _KERNEL
	if (arc_watch) {
		int result;
		procctl_t ctl;
		ctl.cmd = PCWATCH;
		ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data;
		ctl.prwatch.pr_size = buf->b_hdr->b_size;
		ctl.prwatch.pr_wflags = WA_WRITE;
		result = write(arc_procfd, &ctl, sizeof (ctl));
		ASSERT3U(result, ==, sizeof (ctl));
	}
#endif
}

/* Return the contents type (metadata or data) recorded in hdr's flags. */
static arc_buf_contents_t
arc_buf_type(arc_buf_hdr_t *hdr)
{
	if (HDR_ISTYPE_METADATA(hdr)) {
		return (ARC_BUFC_METADATA);
	} else {
		return (ARC_BUFC_DATA);
	}
}

/*
 * Map a buffer contents type to the b_flags bit that encodes it.
 * Panics on any type other than data or metadata.
 */
static uint32_t
arc_bufc_to_flags(arc_buf_contents_t type)
{
	switch (type) {
	case ARC_BUFC_DATA:
		/* metadata field is 0 if buffer contains normal data */
		return (0);
	case ARC_BUFC_METADATA:
		return (ARC_FLAG_BUFC_METADATA);
	default:
		break;
	}
	panic("undefined ARC buffer type!");
	return ((uint32_t)-1);
}

/* Inverse of arc_bufc_to_flags(): derive the contents type from flags. */
static arc_buf_contents_t
arc_flags_to_bufc(uint32_t flags)
{
	if (flags & ARC_FLAG_BUFC_METADATA)
		return (ARC_BUFC_METADATA);
	return (ARC_BUFC_DATA);
}

/*
 * Thaw a buffer so that it may be modified: drop the stored freeze
 * checksum (if any) and remove the write watchpoint.
 *
 * With ZFS_DEBUG_MODIFY set, this first panics if the buffer is not in the
 * anonymous state or has I/O in progress, and verifies the existing
 * checksum.
 */
void
arc_buf_thaw(arc_buf_t *buf)
{
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		if (buf->b_hdr->b_l1hdr.b_state != arc_anon)
			panic("modifying non-anon buffer!");
		if (HDR_IO_IN_PROGRESS(buf->b_hdr))
			panic("modifying buffer while i/o in progress!");
		arc_cksum_verify(buf);
	}

	mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}

#ifdef ZFS_DEBUG
	if (zfs_flags & ZFS_DEBUG_MODIFY) {
		/* Re-allocate b_thawed so the allocation records this thaw. */
		if (buf->b_hdr->b_l1hdr.b_thawed != NULL)
			kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1);
		buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP);
	}
#endif

	mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock);

	arc_buf_unwatch(buf);
}

/*
 * Freeze a buffer: with ZFS_DEBUG_MODIFY set, compute its freeze checksum
 * under the header's hash lock. Does nothing when the flag is clear.
 */
void
arc_buf_freeze(arc_buf_t *buf)
{
	kmutex_t *hash_lock;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_l1hdr.b_state == arc_anon);
	arc_cksum_compute(buf, B_FALSE);
	mutex_exit(hash_lock);

}

static void
add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag)
{
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(MUTEX_HELD(hash_lock));
	arc_state_t *state = hdr->b_l1hdr.b_state;

	if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) &&
	    (state != arc_anon)) {
		/* We don't use the L2-only state list.
*/ 1864 if (state != arc_l2c_only) { 1865 arc_buf_contents_t type = arc_buf_type(hdr); 1866 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1867 multilist_t *list = &state->arcs_list[type]; 1868 uint64_t *size = &state->arcs_lsize[type]; 1869 1870 multilist_remove(list, hdr); 1871 1872 if (GHOST_STATE(state)) { 1873 ASSERT0(hdr->b_l1hdr.b_datacnt); 1874 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1875 delta = hdr->b_size; 1876 } 1877 ASSERT(delta > 0); 1878 ASSERT3U(*size, >=, delta); 1879 atomic_add_64(size, -delta); 1880 } 1881 /* remove the prefetch flag if we get a reference */ 1882 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1883 } 1884 } 1885 1886 static int 1887 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1888 { 1889 int cnt; 1890 arc_state_t *state = hdr->b_l1hdr.b_state; 1891 1892 ASSERT(HDR_HAS_L1HDR(hdr)); 1893 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1894 ASSERT(!GHOST_STATE(state)); 1895 1896 /* 1897 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1898 * check to prevent usage of the arc_l2c_only list. 1899 */ 1900 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1901 (state != arc_anon)) { 1902 arc_buf_contents_t type = arc_buf_type(hdr); 1903 multilist_t *list = &state->arcs_list[type]; 1904 uint64_t *size = &state->arcs_lsize[type]; 1905 1906 multilist_insert(list, hdr); 1907 1908 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1909 atomic_add_64(size, hdr->b_size * 1910 hdr->b_l1hdr.b_datacnt); 1911 } 1912 return (cnt); 1913 } 1914 1915 /* 1916 * Move the supplied buffer to the indicated state. The hash lock 1917 * for the buffer must be held by the caller. 
1918 */ 1919 static void 1920 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1921 kmutex_t *hash_lock) 1922 { 1923 arc_state_t *old_state; 1924 int64_t refcnt; 1925 uint32_t datacnt; 1926 uint64_t from_delta, to_delta; 1927 arc_buf_contents_t buftype = arc_buf_type(hdr); 1928 1929 /* 1930 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1931 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1932 * L1 hdr doesn't always exist when we change state to arc_anon before 1933 * destroying a header, in which case reallocating to add the L1 hdr is 1934 * pointless. 1935 */ 1936 if (HDR_HAS_L1HDR(hdr)) { 1937 old_state = hdr->b_l1hdr.b_state; 1938 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1939 datacnt = hdr->b_l1hdr.b_datacnt; 1940 } else { 1941 old_state = arc_l2c_only; 1942 refcnt = 0; 1943 datacnt = 0; 1944 } 1945 1946 ASSERT(MUTEX_HELD(hash_lock)); 1947 ASSERT3P(new_state, !=, old_state); 1948 ASSERT(refcnt == 0 || datacnt > 0); 1949 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1950 ASSERT(old_state != arc_anon || datacnt <= 1); 1951 1952 from_delta = to_delta = datacnt * hdr->b_size; 1953 1954 /* 1955 * If this buffer is evictable, transfer it from the 1956 * old state list to the new state list. 1957 */ 1958 if (refcnt == 0) { 1959 if (old_state != arc_anon && old_state != arc_l2c_only) { 1960 uint64_t *size = &old_state->arcs_lsize[buftype]; 1961 1962 ASSERT(HDR_HAS_L1HDR(hdr)); 1963 multilist_remove(&old_state->arcs_list[buftype], hdr); 1964 1965 /* 1966 * If prefetching out of the ghost cache, 1967 * we will have a non-zero datacnt. 
1968 */ 1969 if (GHOST_STATE(old_state) && datacnt == 0) { 1970 /* ghost elements have a ghost size */ 1971 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1972 from_delta = hdr->b_size; 1973 } 1974 ASSERT3U(*size, >=, from_delta); 1975 atomic_add_64(size, -from_delta); 1976 } 1977 if (new_state != arc_anon && new_state != arc_l2c_only) { 1978 uint64_t *size = &new_state->arcs_lsize[buftype]; 1979 1980 /* 1981 * An L1 header always exists here, since if we're 1982 * moving to some L1-cached state (i.e. not l2c_only or 1983 * anonymous), we realloc the header to add an L1hdr 1984 * beforehand. 1985 */ 1986 ASSERT(HDR_HAS_L1HDR(hdr)); 1987 multilist_insert(&new_state->arcs_list[buftype], hdr); 1988 1989 /* ghost elements have a ghost size */ 1990 if (GHOST_STATE(new_state)) { 1991 ASSERT0(datacnt); 1992 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1993 to_delta = hdr->b_size; 1994 } 1995 atomic_add_64(size, to_delta); 1996 } 1997 } 1998 1999 ASSERT(!BUF_EMPTY(hdr)); 2000 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 2001 buf_hash_remove(hdr); 2002 2003 /* adjust state sizes (ignore arc_l2c_only) */ 2004 2005 if (to_delta && new_state != arc_l2c_only) { 2006 ASSERT(HDR_HAS_L1HDR(hdr)); 2007 if (GHOST_STATE(new_state)) { 2008 ASSERT0(datacnt); 2009 2010 /* 2011 * We moving a header to a ghost state, we first 2012 * remove all arc buffers. Thus, we'll have a 2013 * datacnt of zero, and no arc buffer to use for 2014 * the reference. As a result, we use the arc 2015 * header pointer for the reference. 2016 */ 2017 (void) refcount_add_many(&new_state->arcs_size, 2018 hdr->b_size, hdr); 2019 } else { 2020 ASSERT3U(datacnt, !=, 0); 2021 2022 /* 2023 * Each individual buffer holds a unique reference, 2024 * thus we must remove each of these references one 2025 * at a time. 
2026 */ 2027 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2028 buf = buf->b_next) { 2029 (void) refcount_add_many(&new_state->arcs_size, 2030 hdr->b_size, buf); 2031 } 2032 } 2033 } 2034 2035 if (from_delta && old_state != arc_l2c_only) { 2036 ASSERT(HDR_HAS_L1HDR(hdr)); 2037 if (GHOST_STATE(old_state)) { 2038 /* 2039 * When moving a header off of a ghost state, 2040 * there's the possibility for datacnt to be 2041 * non-zero. This is because we first add the 2042 * arc buffer to the header prior to changing 2043 * the header's state. Since we used the header 2044 * for the reference when putting the header on 2045 * the ghost state, we must balance that and use 2046 * the header when removing off the ghost state 2047 * (even though datacnt is non zero). 2048 */ 2049 2050 IMPLY(datacnt == 0, new_state == arc_anon || 2051 new_state == arc_l2c_only); 2052 2053 (void) refcount_remove_many(&old_state->arcs_size, 2054 hdr->b_size, hdr); 2055 } else { 2056 ASSERT3P(datacnt, !=, 0); 2057 2058 /* 2059 * Each individual buffer holds a unique reference, 2060 * thus we must remove each of these references one 2061 * at a time. 2062 */ 2063 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2064 buf = buf->b_next) { 2065 (void) refcount_remove_many( 2066 &old_state->arcs_size, hdr->b_size, buf); 2067 } 2068 } 2069 } 2070 2071 if (HDR_HAS_L1HDR(hdr)) 2072 hdr->b_l1hdr.b_state = new_state; 2073 2074 /* 2075 * L2 headers should never be on the L2 state list since they don't 2076 * have L1 headers allocated. 
2077 */ 2078 ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2079 multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2080 } 2081 2082 void 2083 arc_space_consume(uint64_t space, arc_space_type_t type) 2084 { 2085 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2086 2087 switch (type) { 2088 case ARC_SPACE_DATA: 2089 ARCSTAT_INCR(arcstat_data_size, space); 2090 break; 2091 case ARC_SPACE_META: 2092 ARCSTAT_INCR(arcstat_metadata_size, space); 2093 break; 2094 case ARC_SPACE_OTHER: 2095 ARCSTAT_INCR(arcstat_other_size, space); 2096 break; 2097 case ARC_SPACE_HDRS: 2098 ARCSTAT_INCR(arcstat_hdr_size, space); 2099 break; 2100 case ARC_SPACE_L2HDRS: 2101 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 2102 break; 2103 } 2104 2105 if (type != ARC_SPACE_DATA) 2106 ARCSTAT_INCR(arcstat_meta_used, space); 2107 2108 atomic_add_64(&arc_size, space); 2109 } 2110 2111 void 2112 arc_space_return(uint64_t space, arc_space_type_t type) 2113 { 2114 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2115 2116 switch (type) { 2117 case ARC_SPACE_DATA: 2118 ARCSTAT_INCR(arcstat_data_size, -space); 2119 break; 2120 case ARC_SPACE_META: 2121 ARCSTAT_INCR(arcstat_metadata_size, -space); 2122 break; 2123 case ARC_SPACE_OTHER: 2124 ARCSTAT_INCR(arcstat_other_size, -space); 2125 break; 2126 case ARC_SPACE_HDRS: 2127 ARCSTAT_INCR(arcstat_hdr_size, -space); 2128 break; 2129 case ARC_SPACE_L2HDRS: 2130 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 2131 break; 2132 } 2133 2134 if (type != ARC_SPACE_DATA) { 2135 ASSERT(arc_meta_used >= space); 2136 if (arc_meta_max < arc_meta_used) 2137 arc_meta_max = arc_meta_used; 2138 ARCSTAT_INCR(arcstat_meta_used, -space); 2139 } 2140 2141 ASSERT(arc_size >= space); 2142 atomic_add_64(&arc_size, -space); 2143 } 2144 2145 arc_buf_t * 2146 arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 2147 { 2148 arc_buf_hdr_t *hdr; 2149 arc_buf_t *buf; 2150 2151 ASSERT3U(size, >, 0); 2152 hdr = 
kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 2153 ASSERT(BUF_EMPTY(hdr)); 2154 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 2155 hdr->b_size = size; 2156 hdr->b_spa = spa_load_guid(spa); 2157 2158 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2159 buf->b_hdr = hdr; 2160 buf->b_data = NULL; 2161 buf->b_efunc = NULL; 2162 buf->b_private = NULL; 2163 buf->b_next = NULL; 2164 2165 hdr->b_flags = arc_bufc_to_flags(type); 2166 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 2167 2168 hdr->b_l1hdr.b_buf = buf; 2169 hdr->b_l1hdr.b_state = arc_anon; 2170 hdr->b_l1hdr.b_arc_access = 0; 2171 hdr->b_l1hdr.b_datacnt = 1; 2172 hdr->b_l1hdr.b_tmp_cdata = NULL; 2173 2174 arc_get_data_buf(buf); 2175 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2176 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 2177 2178 return (buf); 2179 } 2180 2181 /* 2182 * Allocates an ARC buf header that's in an evicted & L2-cached state. 2183 * This is used during l2arc reconstruction to make empty ARC buffers 2184 * which circumvent the regular disk->arc->l2arc path and instead come 2185 * into being in the reverse order, i.e. l2arc->arc. 
2186 */ 2187 arc_buf_hdr_t * 2188 arc_buf_alloc_l2only(uint64_t load_guid, int size, arc_buf_contents_t type, 2189 l2arc_dev_t *dev, dva_t dva, uint64_t daddr, int32_t asize, uint64_t birth, 2190 zio_cksum_t cksum, enum zio_compress compress) 2191 { 2192 arc_buf_hdr_t *hdr; 2193 2194 ASSERT3U(size, >, 0); 2195 hdr = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 2196 ASSERT(BUF_EMPTY(hdr)); 2197 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 2198 hdr->b_dva = dva; 2199 hdr->b_birth = birth; 2200 hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 2201 bcopy(&cksum, hdr->b_freeze_cksum, sizeof (cksum)); 2202 hdr->b_flags = arc_bufc_to_flags(type); 2203 hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 2204 hdr->b_size = size; 2205 hdr->b_spa = load_guid; 2206 2207 hdr->b_l2hdr.b_compress = compress; 2208 hdr->b_l2hdr.b_dev = dev; 2209 hdr->b_l2hdr.b_daddr = daddr; 2210 hdr->b_l2hdr.b_asize = asize; 2211 2212 return (hdr); 2213 } 2214 2215 static char *arc_onloan_tag = "onloan"; 2216 2217 /* 2218 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 2219 * flight data by arc_tempreserve_space() until they are "returned". Loaned 2220 * buffers must be returned to the arc before they can be used by the DMU or 2221 * freed. 2222 */ 2223 arc_buf_t * 2224 arc_loan_buf(spa_t *spa, int size) 2225 { 2226 arc_buf_t *buf; 2227 2228 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 2229 2230 atomic_add_64(&arc_loaned_bytes, size); 2231 return (buf); 2232 } 2233 2234 /* 2235 * Return a loaned arc buffer to the arc. 
*/
void
arc_return_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(buf->b_data != NULL);
	ASSERT(HDR_HAS_L1HDR(hdr));
	/* Take the caller's reference before dropping the loan tag's. */
	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);

	atomic_add_64(&arc_loaned_bytes, -hdr->b_size);
}

/*
 * Detach an arc_buf from a dbuf (tag): the reference held by 'tag' is
 * replaced by one held by arc_onloan_tag, the buf's eviction callback and
 * private pointer are cleared, and b_size is added to arc_loaned_bytes.
 */
void
arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	ASSERT(buf->b_data != NULL);
	ASSERT(HDR_HAS_L1HDR(hdr));
	(void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
	(void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
	buf->b_efunc = NULL;
	buf->b_private = NULL;

	atomic_add_64(&arc_loaned_bytes, hdr->b_size);
}

/*
 * Create a new arc_buf_t on the same header as 'from', holding its own
 * copy of the data.  The new buf is linked at the head of the header's
 * buffer list and b_datacnt is incremented.  The header must have an L1
 * portion and must not be anonymous.
 */
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT(hdr->b_l1hdr.b_state != arc_anon);

	buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_l1hdr.b_buf;
	hdr->b_l1hdr.b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);

	/*
	 * This buffer already exists in the arc so create a duplicate
	 * copy for the caller.  If the buffer is associated with user data
	 * then track the size and number of duplicates.  These stats will be
	 * updated as duplicate buffers are created and destroyed.
	 */
	if (HDR_ISTYPE_DATA(hdr)) {
		ARCSTAT_BUMP(arcstat_duplicate_buffers);
		ARCSTAT_INCR(arcstat_duplicate_buffers_size, size);
	}
	hdr->b_l1hdr.b_datacnt += 1;
	return (buf);
}

/*
 * Add a reference held by 'tag' to the header of 'buf' and record the
 * access as a hit.  Nothing is done if the buffer has already been
 * evicted (b_data == NULL).
 */
void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is evicted.  Callers
	 * must verify b_data != NULL to know if the add_ref
	 * was successful.
	 */
	mutex_enter(&buf->b_evict_lock);
	if (buf->b_data == NULL) {
		mutex_exit(&buf->b_evict_lock);
		return;
	}
	/*
	 * The hash lock is taken while b_evict_lock is still held;
	 * b_evict_lock is dropped only after b_hdr has been re-read.
	 */
	hash_lock = HDR_LOCK(buf->b_hdr);
	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT(HDR_HAS_L1HDR(hdr));
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	mutex_exit(&buf->b_evict_lock);

	ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
	    hdr->b_l1hdr.b_state == arc_mfu);

	add_reference(hdr, hash_lock, tag);
	DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	arc_update_hit_stat(hdr, B_TRUE);
}

/*
 * Record a deferred free: allocate an l2arc_data_free_t describing
 * (data, size, free_func) and insert it at the head of
 * l2arc_free_on_write under l2arc_free_on_write_mtx.  The entry is
 * consumed elsewhere in the file (not shown in this block).
 */
static void
arc_buf_free_on_write(void *data, size_t size,
    void (*free_func)(void *, size_t))
{
	l2arc_data_free_t *df;

	df = kmem_alloc(sizeof (*df), KM_SLEEP);
	df->l2df_data = data;
	df->l2df_size = size;
	df->l2df_func = free_func;
	mutex_enter(&l2arc_free_on_write_mtx);
	list_insert_head(l2arc_free_on_write, df);
	mutex_exit(&l2arc_free_on_write_mtx);
}

/*
 * Free the arc data buffer.  If it is an l2arc write in progress,
 * the buffer is placed on l2arc_free_on_write to be freed later.
*/
static void
arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t))
{
	arc_buf_hdr_t *hdr = buf->b_hdr;

	if (HDR_L2_WRITING(hdr)) {
		arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func);
		ARCSTAT_BUMP(arcstat_l2_free_on_write);
	} else {
		free_func(buf->b_data, hdr->b_size);
	}
}

/*
 * Release the header's temporary compressed-data buffer (b_tmp_cdata), if
 * there is a separately allocated one, by queueing it on the
 * free-on-write list.  The caller must hold the l2ad_mtx of the header's
 * L2ARC device.  Each early return below covers a case in which there is
 * nothing to free.
 */
static void
arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr)
{
	ASSERT(HDR_HAS_L2HDR(hdr));
	ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx));

	/*
	 * The b_tmp_cdata field is linked off of the b_l1hdr, so if
	 * that doesn't exist, the header is in the arc_l2c_only state,
	 * and there isn't anything to free (it's already been freed).
	 */
	if (!HDR_HAS_L1HDR(hdr))
		return;

	/*
	 * The header isn't being written to the l2arc device, thus it
	 * shouldn't have a b_tmp_cdata to free.
	 */
	if (!HDR_L2_WRITING(hdr)) {
		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
		return;
	}

	/*
	 * The header does not have compression enabled. This can be due
	 * to the buffer not being compressible, or because we're
	 * freeing the buffer before the second phase of
	 * l2arc_write_buffer() has started (which does the compression
	 * step). In either case, b_tmp_cdata does not point to a
	 * separately compressed buffer, so there's nothing to free (it
	 * points to the same buffer as the arc_buf_t's b_data field).
	 */
	if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) {
		hdr->b_l1hdr.b_tmp_cdata = NULL;
		return;
	}

	/*
	 * There's nothing to free since the buffer was all zeros and
	 * compressed to a zero length buffer.
	 */
	if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) {
		ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL);
		return;
	}

	ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress));

	arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata,
	    hdr->b_size, zio_data_buf_free);

	ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write);
	hdr->b_l1hdr.b_tmp_cdata = NULL;
}

/*
 * Free up buf->b_data and if 'remove' is set, then pull the
 * arc_buf_t off of the arc_buf_hdr_t's list and free it.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t remove)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data != NULL) {
		arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = arc_buf_type(buf->b_hdr);

		arc_cksum_verify(buf);
		arc_buf_unwatch(buf);

		if (type == ARC_BUFC_METADATA) {
			arc_buf_data_free(buf, zio_buf_free);
			arc_space_return(size, ARC_SPACE_META);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			arc_buf_data_free(buf, zio_data_buf_free);
			arc_space_return(size, ARC_SPACE_DATA);
		}

		/* protected by hash lock, if in the hash table */
		if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(
			    &buf->b_hdr->b_l1hdr.b_refcnt));
			ASSERT(state != arc_anon && state != arc_l2c_only);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}

		/* Drop this buf's share of the state's total size. */
		(void) refcount_remove_many(&state->arcs_size, size, buf);
		buf->b_data = NULL;

		/*
		 * If we're destroying a duplicate buffer make sure
		 * that the appropriate statistics are updated.
		 */
		if (buf->b_hdr->b_l1hdr.b_datacnt > 1 &&
		    HDR_ISTYPE_DATA(buf->b_hdr)) {
			ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers);
			ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size);
		}
		ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0);
		buf->b_hdr->b_l1hdr.b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!remove)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf;
	    bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;
	buf->b_next = NULL;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

/*
 * Tear down the L2 portion of a header: unlink it from its device's
 * buffer list, release any temporary compressed data, undo the L2
 * accounting (only if the header had been assigned a device address) and
 * clear ARC_FLAG_HAS_L2HDR.  The caller must hold the device's l2ad_mtx.
 */
static void
arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
{
	l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
	l2arc_dev_t *dev = l2hdr->b_dev;

	ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
	ASSERT(HDR_HAS_L2HDR(hdr));

	list_remove(&dev->l2ad_buflist, hdr);

	/*
	 * We don't want to leak the b_tmp_cdata buffer that was
	 * allocated in l2arc_write_buffers()
	 */
	arc_buf_l2_cdata_free(hdr);

	/*
	 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then
	 * this header is being processed by l2arc_write_buffers() (i.e.
	 * it's in the first stage of l2arc_write_buffers()).
	 * Re-affirming that truth here, just to serve as a reminder. If
	 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or
	 * may not have its HDR_L2_WRITING flag set. (the write may have
	 * completed, in which case HDR_L2_WRITING will be false and the
	 * b_daddr field will point to the address of the buffer on disk).
	 */
	IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr));

	/*
	 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with
	 * l2arc_write_buffers(). Since we've just removed this header
	 * from the l2arc buffer list, this header will never reach the
	 * second stage of l2arc_write_buffers(), which increments the
	 * accounting stats for this header. Thus, we must be careful
	 * not to decrement them for this header either.
	 */
	if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) {
		ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize);
		ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);

		vdev_space_update(dev->l2ad_vdev,
		    -l2hdr->b_asize, 0, 0);

		(void) refcount_remove_many(&dev->l2ad_alloc,
		    l2hdr->b_asize, hdr);
	}

	hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR;
}

/*
 * Destroy a header and everything still attached to it: its L2 portion,
 * its identity, its freeze checksum and all of its buffers.  Buffers that
 * have an eviction callback are moved to arc_eviction_list instead of
 * being freed here.  The header must be anonymous (when it has an L1
 * portion), unreferenced, out of the hash table and have no I/O in
 * progress.
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(hdr->b_l1hdr.b_buf == NULL ||
		    hdr->b_l1hdr.b_datacnt > 0);
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
	}
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));
	ASSERT(!HDR_IN_HASH_TABLE(hdr));

	if (HDR_HAS_L2HDR(hdr)) {
		l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
		/* Only take l2ad_mtx if this thread does not hold it. */
		boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);

		if (!buflist_held)
			mutex_enter(&dev->l2ad_mtx);

		/*
		 * Even though we checked this conditional above, we
		 * need to check this again now that we have the
		 * l2ad_mtx. This is because we could be racing with
		 * another thread calling l2arc_evict() which might have
		 * destroyed this header's L2 portion as we were waiting
		 * to acquire the l2ad_mtx. If that happens, we don't
		 * want to re-destroy the header's L2 portion.
		 */
		if (HDR_HAS_L2HDR(hdr))
			arc_hdr_l2hdr_destroy(hdr);

		if (!buflist_held)
			mutex_exit(&dev->l2ad_mtx);
	}

	if (!BUF_EMPTY(hdr))
		buf_discard_identity(hdr);

	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}

	if (HDR_HAS_L1HDR(hdr)) {
		while (hdr->b_l1hdr.b_buf) {
			arc_buf_t *buf = hdr->b_l1hdr.b_buf;

			if (buf->b_efunc != NULL) {
				/*
				 * Free the data but keep the arc_buf_t:
				 * it is handed to arc_eviction_list so
				 * its callback can be processed later.
				 */
				mutex_enter(&arc_user_evicts_lock);
				mutex_enter(&buf->b_evict_lock);
				ASSERT(buf->b_hdr != NULL);
				arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE);
				hdr->b_l1hdr.b_buf = buf->b_next;
				buf->b_hdr = &arc_eviction_hdr;
				buf->b_next = arc_eviction_list;
				arc_eviction_list = buf;
				mutex_exit(&buf->b_evict_lock);
				cv_signal(&arc_user_evicts_cv);
				mutex_exit(&arc_user_evicts_lock);
			} else {
				arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE);
			}
		}
#ifdef ZFS_DEBUG
		if (hdr->b_l1hdr.b_thawed != NULL) {
			kmem_free(hdr->b_l1hdr.b_thawed, 1);
			hdr->b_l1hdr.b_thawed = NULL;
		}
#endif
	}

	ASSERT3P(hdr->b_hash_next, ==, NULL);
	/* Return the header to the cache matching its layout. */
	if (HDR_HAS_L1HDR(hdr)) {
		ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
		ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
		kmem_cache_free(hdr_full_cache, hdr);
	} else {
		kmem_cache_free(hdr_l2only_cache, hdr);
	}
}

/*
 * Release the reference that 'tag' holds on 'buf'.  The buf must have
 * data and no eviction callback.  Three cases are handled:
 *
 *  - header not anonymous: under the hash lock, the buf is destroyed if
 *    the header has other buffers, otherwise it is kept and the header is
 *    flagged ARC_FLAG_BUF_AVAILABLE;
 *  - anonymous header with I/O in progress: the header is destroyed only
 *    if the I/O is found to have completed once the reference is dropped;
 *  - otherwise: the buf alone is destroyed if references remain, else the
 *    whole header is destroyed.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_l1hdr.b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		hdr = buf->b_hdr;
		ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));

		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_l1hdr.b_datacnt > 1) {
			arc_buf_destroy(buf, TRUE);
		} else {
			ASSERT(buf == hdr->b_l1hdr.b_buf);
			ASSERT(buf->b_efunc == NULL);
			hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
		}
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_user_evicts_lock);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_user_evicts_lock);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0)
			arc_buf_destroy(buf, TRUE);
		else
			arc_hdr_destroy(hdr);
	}
}

/*
 * Release the reference that 'tag' holds on 'buf'.  Returns B_TRUE-like
 * non-zero when the buf had no eviction callback (b_efunc == NULL) at
 * entry, and zero otherwise.  Anonymous buffers are passed to
 * arc_buf_free(); for all others the buf itself is only destroyed, or
 * marked available, when it has no callback.
 */
boolean_t
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	boolean_t no_callback = (buf->b_efunc == NULL);

	if (hdr->b_l1hdr.b_state == arc_anon) {
		ASSERT(hdr->b_l1hdr.b_datacnt == 1);
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	hdr = buf->b_hdr;
	ASSERT(hdr->b_l1hdr.b_datacnt > 0);
	ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
	ASSERT(hdr->b_l1hdr.b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_l1hdr.b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL);
		ASSERT(buf->b_efunc == NULL);
		hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

/*
 * Return the size recorded in the buffer's header (b_size).
 */
int32_t
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Called from the DMU to determine if the current buffer should be
 * evicted. In order to ensure proper locking, the eviction must be initiated
 * from the DMU. Return true if the buffer is associated with user data and
 * duplicate buffers still exist.
 *
 * Always returns B_FALSE when zfs_disable_dup_eviction is set.
 */
boolean_t
arc_buf_eviction_needed(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	boolean_t evict_needed = B_FALSE;

	if (zfs_disable_dup_eviction)
		return (B_FALSE);

	mutex_enter(&buf->b_evict_lock);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		/*
		 * We are in arc_do_user_evicts(); let that function
		 * perform the eviction.
		 */
		ASSERT(buf->b_data == NULL);
		mutex_exit(&buf->b_evict_lock);
		return (B_FALSE);
	} else if (buf->b_data == NULL) {
		/*
		 * We have already been added to the arc eviction list;
		 * recommend eviction.
		 */
		ASSERT3P(hdr, ==, &arc_eviction_hdr);
		mutex_exit(&buf->b_evict_lock);
		return (B_TRUE);
	}

	if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr))
		evict_needed = B_TRUE;

	mutex_exit(&buf->b_evict_lock);
	return (evict_needed);
}

/*
 * Evict the arc_buf_hdr that is provided as a parameter. The resultant
 * state of the header is dependent on its state prior to entering this
 * function.
The following transitions are possible:
 *
 *    - arc_mru -> arc_mru_ghost
 *    - arc_mfu -> arc_mfu_ghost
 *    - arc_mru_ghost -> arc_l2c_only
 *    - arc_mru_ghost -> deleted
 *    - arc_mfu_ghost -> arc_l2c_only
 *    - arc_mfu_ghost -> deleted
 *
 * The caller must hold 'hash_lock', and 'hdr' must have an L1 header;
 * both conditions are asserted. The return value is the number of bytes
 * evicted. It is zero when the header was skipped (an L2 write or other
 * I/O is in progress, or it is a prefetch/indirect header younger than
 * arc_min_prefetch_lifespan) or when no buffer lock could be taken.
 *
 * When a ghost header is evicted it is either destroyed or reallocated
 * as an L2-only header, so the caller's 'hdr' pointer must be treated
 * as stale once this function has returned a non-zero value for it.
 */
static int64_t
arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
{
	arc_state_t *evicted_state, *state;
	int64_t bytes_evicted = 0;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(HDR_HAS_L1HDR(hdr));

	state = hdr->b_l1hdr.b_state;
	if (GHOST_STATE(state)) {
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		ASSERT(hdr->b_l1hdr.b_buf == NULL);

		/*
		 * l2arc_write_buffers() relies on a header's L1 portion
		 * (i.e. its b_tmp_cdata field) during its write phase.
		 * Thus, we cannot push a header onto the arc_l2c_only
		 * state (removing its L1 piece) until the header is
		 * done being written to the l2arc.
		 */
		if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
			ARCSTAT_BUMP(arcstat_evict_l2_skip);
			return (bytes_evicted);
		}

		ARCSTAT_BUMP(arcstat_deleted);
		bytes_evicted += hdr->b_size;

		DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);

		if (HDR_HAS_L2HDR(hdr)) {
			/*
			 * This buffer is cached on the 2nd Level ARC;
			 * don't destroy the header.
			 */
			arc_change_state(arc_l2c_only, hdr, hash_lock);
			/*
			 * dropping from L1+L2 cached to L2-only,
			 * realloc to remove the L1 header.
			 */
			hdr = arc_hdr_realloc(hdr, hdr_full_cache,
			    hdr_l2only_cache);
		} else {
			arc_change_state(arc_anon, hdr, hash_lock);
			arc_hdr_destroy(hdr);
		}
		return (bytes_evicted);
	}

	ASSERT(state == arc_mru || state == arc_mfu);
	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	/*
	 * Skip headers with I/O in progress. Prefetch and indirect
	 * buffers additionally have a minimum lifespan, measured from
	 * their last access time.
	 */
	if (HDR_IO_IN_PROGRESS(hdr) ||
	    ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
	    ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
	    arc_min_prefetch_lifespan)) {
		ARCSTAT_BUMP(arcstat_evict_skip);
		return (bytes_evicted);
	}

	ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
	ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0);
	/*
	 * Tear down every buffer hanging off the header. We only try
	 * each buffer's evict lock; on a miss the remaining buffers are
	 * left in place and the header stays in its current state.
	 */
	while (hdr->b_l1hdr.b_buf) {
		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
		if (!mutex_tryenter(&buf->b_evict_lock)) {
			ARCSTAT_BUMP(arcstat_mutex_miss);
			break;
		}
		if (buf->b_data != NULL)
			bytes_evicted += hdr->b_size;
		if (buf->b_efunc != NULL) {
			/*
			 * The buffer has an eviction callback: unlink it
			 * from the header, park it on arc_eviction_list
			 * (owned by &arc_eviction_hdr) and wake the user
			 * evicts thread, which will run the callback.
			 */
			mutex_enter(&arc_user_evicts_lock);
			arc_buf_destroy(buf, FALSE);
			hdr->b_l1hdr.b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			cv_signal(&arc_user_evicts_cv);
			mutex_exit(&arc_user_evicts_lock);
			mutex_exit(&buf->b_evict_lock);
		} else {
			mutex_exit(&buf->b_evict_lock);
			arc_buf_destroy(buf, TRUE);
		}
	}

	/* Account for whether the evicted data is, or could be, in L2ARC. */
	if (HDR_HAS_L2HDR(hdr)) {
		ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size);
	} else {
		if (l2arc_write_eligible(hdr->b_spa, UINT64_MAX, hdr))
			ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size);
		else
			ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size);
	}

	/* Only move to the ghost state once every buffer is gone. */
	if (hdr->b_l1hdr.b_datacnt == 0) {
		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE;
		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
		DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr);
	}

	return (bytes_evicted);
}

/*
 * Evict headers from one sublist ('idx') of the multilist 'ml'.
 *
 * The sublist is walked with multilist_sublist_prev() starting at
 * 'marker', and the marker is advanced past each header examined so a
 * later call resumes where this one stopped. The walk ends when the
 * sublist is exhausted, when at least 'bytes' bytes have been evicted
 * (unless 'bytes' is ARC_EVICT_ALL), or when zfs_arc_evict_batch_limit
 * headers have been evicted.
 *
 * If 'spa' is non-zero, only headers whose b_spa matches are evicted.
 * Hash locks are only ever tried, never waited on; a miss bumps
 * arcstat_mutex_miss and the header is skipped.
 *
 * Returns the number of bytes evicted from this sublist.
 */
static uint64_t
arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker,
    uint64_t spa, int64_t bytes)
{
	multilist_sublist_t *mls;
	uint64_t bytes_evicted = 0;
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	int evict_count = 0;

	ASSERT3P(marker, !=, NULL);
	IMPLY(bytes < 0, bytes == ARC_EVICT_ALL);

	mls = multilist_sublist_lock(ml, idx);

	for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL;
	    hdr = multilist_sublist_prev(mls, marker)) {
		if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) ||
		    (evict_count >= zfs_arc_evict_batch_limit))
			break;

		/*
		 * To keep our iteration location, move the marker
		 * forward. Since we're not holding hdr's hash lock, we
		 * must be very careful and not remove 'hdr' from the
		 * sublist. Otherwise, other consumers might mistake the
		 * 'hdr' as not being on a sublist when they call the
		 * multilist_link_active() function (they all rely on
		 * the hash lock protecting concurrent insertions and
		 * removals). multilist_sublist_move_forward() was
		 * specifically implemented to ensure this is the case
		 * (only 'marker' will be removed and re-inserted).
		 */
		multilist_sublist_move_forward(mls, marker);

		/*
		 * The only case where the b_spa field should ever be
		 * zero, is the marker headers inserted by
		 * arc_evict_state(). It's possible for multiple threads
		 * to be calling arc_evict_state() concurrently (e.g.
		 * dsl_pool_close() and zio_inject_fault()), so we must
		 * skip any markers we see from these other threads.
		 */
		if (hdr->b_spa == 0)
			continue;

		/* we're only interested in evicting buffers of a certain spa */
		if (spa != 0 && hdr->b_spa != spa) {
			ARCSTAT_BUMP(arcstat_evict_skip);
			continue;
		}

		hash_lock = HDR_LOCK(hdr);

		/*
		 * We aren't calling this function from any code path
		 * that would already be holding a hash lock, so we're
		 * asserting on this assumption to be defensive in case
		 * this ever changes. Without this check, it would be
		 * possible to incorrectly increment arcstat_mutex_miss
		 * below (e.g. if the code changed such that we called
		 * this function with a hash lock held).
		 */
		ASSERT(!MUTEX_HELD(hash_lock));

		if (mutex_tryenter(hash_lock)) {
			uint64_t evicted = arc_evict_hdr(hdr, hash_lock);
			mutex_exit(hash_lock);

			bytes_evicted += evicted;

			/*
			 * If evicted is zero, arc_evict_hdr() must have
			 * decided to skip this header, don't increment
			 * evict_count in this case.
			 */
			if (evicted != 0)
				evict_count++;

			/*
			 * If arc_size isn't overflowing, signal any
			 * threads that might happen to be waiting.
			 *
			 * For each header evicted, we wake up a single
			 * thread. If we used cv_broadcast, we could
			 * wake up "too many" threads causing arc_size
			 * to significantly overflow arc_c; since
			 * arc_get_data_buf() doesn't check for overflow
			 * when it's woken up (it doesn't because it's
			 * possible for the ARC to be overflowing while
			 * full of un-evictable buffers, and the
			 * function should proceed in this case).
			 *
			 * If threads are left sleeping, due to not
			 * using cv_broadcast, they will be woken up
			 * just before arc_reclaim_thread() sleeps.
			 */
			mutex_enter(&arc_reclaim_lock);
			if (!arc_is_overflowing())
				cv_signal(&arc_reclaim_waiters_cv);
			mutex_exit(&arc_reclaim_lock);
		} else {
			ARCSTAT_BUMP(arcstat_mutex_miss);
		}
	}

	multilist_sublist_unlock(mls);

	return (bytes_evicted);
}

/*
 * Evict buffers from the given arc state, until we've removed the
 * specified number of bytes. Move the removed buffers to the
 * appropriate evict state.
 *
 * This function makes a "best effort". It skips over any buffers
 * it can't get a hash_lock on, and so, may not catch all candidates.
2994 * It may also return without evicting as much space as requested. 2995 * 2996 * If bytes is specified using the special value ARC_EVICT_ALL, this 2997 * will evict all available (i.e. unlocked and evictable) buffers from 2998 * the given arc state; which is used by arc_flush(). 2999 */ 3000 static uint64_t 3001 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 3002 arc_buf_contents_t type) 3003 { 3004 uint64_t total_evicted = 0; 3005 multilist_t *ml = &state->arcs_list[type]; 3006 int num_sublists; 3007 arc_buf_hdr_t **markers; 3008 3009 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 3010 3011 num_sublists = multilist_get_num_sublists(ml); 3012 3013 /* 3014 * If we've tried to evict from each sublist, made some 3015 * progress, but still have not hit the target number of bytes 3016 * to evict, we want to keep trying. The markers allow us to 3017 * pick up where we left off for each individual sublist, rather 3018 * than starting from the tail each time. 3019 */ 3020 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 3021 for (int i = 0; i < num_sublists; i++) { 3022 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 3023 3024 /* 3025 * A b_spa of 0 is used to indicate that this header is 3026 * a marker. This fact is used in arc_adjust_type() and 3027 * arc_evict_state_impl(). 3028 */ 3029 markers[i]->b_spa = 0; 3030 3031 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3032 multilist_sublist_insert_tail(mls, markers[i]); 3033 multilist_sublist_unlock(mls); 3034 } 3035 3036 /* 3037 * While we haven't hit our target number of bytes to evict, or 3038 * we're evicting all available buffers. 3039 */ 3040 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 3041 /* 3042 * Start eviction using a randomly selected sublist, 3043 * this is to try and evenly balance eviction across all 3044 * sublists. Always starting at the same sublist 3045 * (e.g. 
index 0) would cause evictions to favor certain 3046 * sublists over others. 3047 */ 3048 int sublist_idx = multilist_get_random_index(ml); 3049 uint64_t scan_evicted = 0; 3050 3051 for (int i = 0; i < num_sublists; i++) { 3052 uint64_t bytes_remaining; 3053 uint64_t bytes_evicted; 3054 3055 if (bytes == ARC_EVICT_ALL) 3056 bytes_remaining = ARC_EVICT_ALL; 3057 else if (total_evicted < bytes) 3058 bytes_remaining = bytes - total_evicted; 3059 else 3060 break; 3061 3062 bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 3063 markers[sublist_idx], spa, bytes_remaining); 3064 3065 scan_evicted += bytes_evicted; 3066 total_evicted += bytes_evicted; 3067 3068 /* we've reached the end, wrap to the beginning */ 3069 if (++sublist_idx >= num_sublists) 3070 sublist_idx = 0; 3071 } 3072 3073 /* 3074 * If we didn't evict anything during this scan, we have 3075 * no reason to believe we'll evict more during another 3076 * scan, so break the loop. 3077 */ 3078 if (scan_evicted == 0) { 3079 /* This isn't possible, let's make that obvious */ 3080 ASSERT3S(bytes, !=, 0); 3081 3082 /* 3083 * When bytes is ARC_EVICT_ALL, the only way to 3084 * break the loop is when scan_evicted is zero. 3085 * In that case, we actually have evicted enough, 3086 * so we don't want to increment the kstat. 3087 */ 3088 if (bytes != ARC_EVICT_ALL) { 3089 ASSERT3S(total_evicted, <, bytes); 3090 ARCSTAT_BUMP(arcstat_evict_not_enough); 3091 } 3092 3093 break; 3094 } 3095 } 3096 3097 for (int i = 0; i < num_sublists; i++) { 3098 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 3099 multilist_sublist_remove(mls, markers[i]); 3100 multilist_sublist_unlock(mls); 3101 3102 kmem_cache_free(hdr_full_cache, markers[i]); 3103 } 3104 kmem_free(markers, sizeof (*markers) * num_sublists); 3105 3106 return (total_evicted); 3107 } 3108 3109 /* 3110 * Flush all "evictable" data of the given type from the arc state 3111 * specified. This will not evict any "active" buffers (i.e. referenced). 
3112 * 3113 * When 'retry' is set to FALSE, the function will make a single pass 3114 * over the state and evict any buffers that it can. Since it doesn't 3115 * continually retry the eviction, it might end up leaving some buffers 3116 * in the ARC due to lock misses. 3117 * 3118 * When 'retry' is set to TRUE, the function will continually retry the 3119 * eviction until *all* evictable buffers have been removed from the 3120 * state. As a result, if concurrent insertions into the state are 3121 * allowed (e.g. if the ARC isn't shutting down), this function might 3122 * wind up in an infinite loop, continually trying to evict buffers. 3123 */ 3124 static uint64_t 3125 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 3126 boolean_t retry) 3127 { 3128 uint64_t evicted = 0; 3129 3130 while (state->arcs_lsize[type] != 0) { 3131 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 3132 3133 if (!retry) 3134 break; 3135 } 3136 3137 return (evicted); 3138 } 3139 3140 /* 3141 * Evict the specified number of bytes from the state specified, 3142 * restricting eviction to the spa and type given. This function 3143 * prevents us from trying to evict more from a state's list than 3144 * is "evictable", and to skip evicting altogether when passed a 3145 * negative value for "bytes". In contrast, arc_evict_state() will 3146 * evict everything it can, when passed a negative value for "bytes". 3147 */ 3148 static uint64_t 3149 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 3150 arc_buf_contents_t type) 3151 { 3152 int64_t delta; 3153 3154 if (bytes > 0 && state->arcs_lsize[type] > 0) { 3155 delta = MIN(state->arcs_lsize[type], bytes); 3156 return (arc_evict_state(state, spa, delta, type)); 3157 } 3158 3159 return (0); 3160 } 3161 3162 /* 3163 * Evict metadata buffers from the cache, such that arc_meta_used is 3164 * capped by the arc_meta_limit tunable. 
3165 */ 3166 static uint64_t 3167 arc_adjust_meta(void) 3168 { 3169 uint64_t total_evicted = 0; 3170 int64_t target; 3171 3172 /* 3173 * If we're over the meta limit, we want to evict enough 3174 * metadata to get back under the meta limit. We don't want to 3175 * evict so much that we drop the MRU below arc_p, though. If 3176 * we're over the meta limit more than we're over arc_p, we 3177 * evict some from the MRU here, and some from the MFU below. 3178 */ 3179 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3180 (int64_t)(refcount_count(&arc_anon->arcs_size) + 3181 refcount_count(&arc_mru->arcs_size) - arc_p)); 3182 3183 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 3184 3185 /* 3186 * Similar to the above, we want to evict enough bytes to get us 3187 * below the meta limit, but not so much as to drop us below the 3188 * space alloted to the MFU (which is defined as arc_c - arc_p). 3189 */ 3190 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 3191 (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); 3192 3193 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3194 3195 return (total_evicted); 3196 } 3197 3198 /* 3199 * Return the type of the oldest buffer in the given arc state 3200 * 3201 * This function will select a random sublist of type ARC_BUFC_DATA and 3202 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 3203 * is compared, and the type which contains the "older" buffer will be 3204 * returned. 
 *
 * "Older" means a smaller b_arc_access value. If both sublists are
 * empty (ignoring markers) ARC_BUFC_DATA is returned; if the two access
 * times are equal ARC_BUFC_METADATA is returned.
 */
static arc_buf_contents_t
arc_adjust_type(arc_state_t *state)
{
	multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA];
	multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA];
	int data_idx = multilist_get_random_index(data_ml);
	int meta_idx = multilist_get_random_index(meta_ml);
	multilist_sublist_t *data_mls;
	multilist_sublist_t *meta_mls;
	arc_buf_contents_t type;
	arc_buf_hdr_t *data_hdr;
	arc_buf_hdr_t *meta_hdr;

	/*
	 * We keep the sublist lock until we're finished, to prevent
	 * the headers from being destroyed via arc_evict_state().
	 */
	data_mls = multilist_sublist_lock(data_ml, data_idx);
	meta_mls = multilist_sublist_lock(meta_ml, meta_idx);

	/*
	 * These two loops are to ensure we skip any markers that
	 * might be at the tail of the lists due to arc_evict_state().
	 * Markers are recognized by a b_spa of zero.
	 */

	for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
	    data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
		if (data_hdr->b_spa != 0)
			break;
	}

	for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
	    meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
		if (meta_hdr->b_spa != 0)
			break;
	}

	if (data_hdr == NULL && meta_hdr == NULL) {
		type = ARC_BUFC_DATA;
	} else if (data_hdr == NULL) {
		ASSERT3P(meta_hdr, !=, NULL);
		type = ARC_BUFC_METADATA;
	} else if (meta_hdr == NULL) {
		ASSERT3P(data_hdr, !=, NULL);
		type = ARC_BUFC_DATA;
	} else {
		ASSERT3P(data_hdr, !=, NULL);
		ASSERT3P(meta_hdr, !=, NULL);

		/* The headers can't be on the sublist without an L1 header */
		ASSERT(HDR_HAS_L1HDR(data_hdr));
		ASSERT(HDR_HAS_L1HDR(meta_hdr));

		if (data_hdr->b_l1hdr.b_arc_access <
		    meta_hdr->b_l1hdr.b_arc_access) {
			type = ARC_BUFC_DATA;
		} else {
			type = ARC_BUFC_METADATA;
		}
	}

	/* Release in the reverse order of acquisition. */
	multilist_sublist_unlock(meta_mls);
	multilist_sublist_unlock(data_mls);

	return (type);
}

/*
 * Evict buffers from the cache, such that arc_size is capped by arc_c.
 *
 * Eviction proceeds in stages: metadata over arc_meta_limit, then the
 * MRU, then the MFU, and finally the two ghost lists. Returns the total
 * number of bytes evicted across all stages.
 */
static uint64_t
arc_adjust(void)
{
	uint64_t total_evicted = 0;
	uint64_t bytes;
	int64_t target;

	/*
	 * If we're over arc_meta_limit, we want to correct that before
	 * potentially evicting data buffers below.
	 */
	total_evicted += arc_adjust_meta();

	/*
	 * Adjust MRU size
	 *
	 * If we're over the target cache size, we want to evict enough
	 * from the list to get back to our target size. We don't want
	 * to evict too much from the MRU, such that it drops below
	 * arc_p. So, if we're over our target cache size more than
	 * the MRU is over arc_p, we'll evict enough to get back to
	 * arc_p here, and then evict more from the MFU below.
	 */
	target = MIN((int64_t)(arc_size - arc_c),
	    (int64_t)(refcount_count(&arc_anon->arcs_size) +
	    refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p));

	/*
	 * If we're below arc_meta_min, always prefer to evict data.
	 * Otherwise, try to satisfy the requested number of bytes to
	 * evict from the type which contains older buffers; in an
	 * effort to keep newer buffers in the cache regardless of their
	 * type. If we cannot satisfy the number of bytes from this
	 * type, spill over into the next type.
	 */
	if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA &&
	    arc_meta_used > arc_meta_min) {
		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * metadata, we try to get the rest from data.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
	} else {
		bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * data, we try to get the rest from metadata.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
	}

	/*
	 * Adjust MFU size
	 *
	 * Now that we've tried to evict enough from the MRU to get its
	 * size back to arc_p, if we're still above the target cache
	 * size, we evict the rest from the MFU.
	 */
	target = arc_size - arc_c;

	if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA &&
	    arc_meta_used > arc_meta_min) {
		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * metadata, we try to get the rest from data.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
	} else {
		bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
		total_evicted += bytes;

		/*
		 * If we couldn't evict our target number of bytes from
		 * data, we try to get the rest from metadata.
		 */
		target -= bytes;

		total_evicted +=
		    arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
	}

	/*
	 * Adjust ghost lists
	 *
	 * In addition to the above, the ARC also defines target values
	 * for the ghost lists. The sum of the mru list and mru ghost
	 * list should never exceed the target size of the cache, and
	 * the sum of the mru list, mfu list, mru ghost list, and mfu
	 * ghost list should never exceed twice the target size of the
	 * cache. The following logic enforces these limits on the ghost
	 * caches, and evicts from them as needed.
	 */
	target = refcount_count(&arc_mru->arcs_size) +
	    refcount_count(&arc_mru_ghost->arcs_size) - arc_c;

	bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	    arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);

	/*
	 * We assume the sum of the mru list and mfu list is less than
	 * or equal to arc_c (we enforced this above), which means we
	 * can use the simpler of the two equations below:
	 *
	 *	mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
	 *		    mru ghost + mfu ghost <= arc_c
	 */
	target = refcount_count(&arc_mru_ghost->arcs_size) +
	    refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;

	bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
	total_evicted += bytes;

	target -= bytes;

	total_evicted +=
	    arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);

	return (total_evicted);
}

/*
 * Drain arc_eviction_list. Each buffer is unlinked from the list and
 * detached from its header (b_hdr set to NULL under b_evict_lock), then
 * arc_user_evicts_lock is dropped while the buffer's eviction callback
 * (b_efunc, if any) is invoked with b_private; the callback must return
 * zero. Finally the callback fields are cleared and the arc_buf_t is
 * returned to buf_cache.
 */
static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_user_evicts_lock);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		mutex_enter(&buf->b_evict_lock);
		buf->b_hdr = NULL;
		mutex_exit(&buf->b_evict_lock);
		/* Don't hold the list lock across the user callback. */
		mutex_exit(&arc_user_evicts_lock);

		if (buf->b_efunc != NULL)
			VERIFY0(buf->b_efunc(buf->b_private));

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		mutex_enter(&arc_user_evicts_lock);
	}
	mutex_exit(&arc_user_evicts_lock);
}

/*
 * Flush evictable data and metadata from the mru, mfu, mru_ghost and
 * mfu_ghost states, then run any pending user eviction callbacks.
 *
 * When 'spa' is non-NULL only buffers belonging to that pool (matched
 * by its load guid) are flushed; when it is NULL all pools are flushed.
 * 'retry' is passed through to arc_flush_state().
 */
void
arc_flush(spa_t *spa, boolean_t retry)
{
	uint64_t guid = 0;

	/*
	 * If retry is TRUE, a spa must not be specified since we have
	 * no good way to determine if all of a spa's buffers have been
	 * evicted from an arc state.
	 */
	ASSERT(!retry || spa == 0);

	if (spa != NULL)
		guid = spa_load_guid(spa);

	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
	(void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);

	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
	(void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);

	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
	(void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);

	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
	(void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);

	arc_do_user_evicts();
	ASSERT(spa || arc_eviction_list == NULL);
}

/*
 * Reduce the target cache size arc_c by 'to_free' bytes, never going
 * below arc_c_min, and reduce arc_p by arc_p >> arc_shrink_shift. The
 * targets are then clamped (arc_c is not left above arc_size unless
 * that would drop it below arc_c_min; arc_p is not left above arc_c).
 * If arc_size ends up above arc_c, arc_adjust() is called to evict.
 */
void
arc_shrink(int64_t to_free)
{
	if (arc_c > arc_c_min) {

		if (arc_c > arc_c_min + to_free)
			atomic_add_64(&arc_c, -to_free);
		else
			arc_c = arc_c_min;

		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
		if (arc_c > arc_size)
			arc_c = MAX(arc_size, arc_c_min);
		if (arc_p > arc_c)
			arc_p = (arc_c >> 1);
		ASSERT(arc_c >= arc_c_min);
		ASSERT((int64_t)arc_p >= 0);
	}

	if (arc_size > arc_c)
		(void) arc_adjust();
}

/*
 * Identifies which of the checks in arc_available_memory() produced
 * the lowest (most constraining) value.
 */
typedef enum free_memory_reason_t {
	FMR_UNKNOWN,
	FMR_NEEDFREE,
	FMR_LOTSFREE,
	FMR_SWAPFS_MINFREE,
	FMR_PAGES_PP_MAXIMUM,
	FMR_HEAP_ARENA,
	FMR_ZIO_ARENA,
} free_memory_reason_t;

/*
 * Result and limiting reason recorded by the most recent call to
 * arc_available_memory().
 */
int64_t last_free_memory;
free_memory_reason_t last_free_reason;

/*
 * Additional reserve of pages for pp_reserve.
 */
int64_t arc_pages_pp_reserve = 64;

/*
 * Additional reserve of pages for swapfs.
 */
int64_t arc_swapfs_reserve = 64;

/*
 * Return the amount of memory that can be consumed before reclaim will be
 * needed.
Positive if there is sufficient free memory, negative indicates
 * the amount of memory that needs to be freed up.
 *
 * Several independent limits are evaluated and the smallest result is
 * returned. The result is also stored in last_free_memory, and the check
 * that produced it in last_free_reason.
 */
static int64_t
arc_available_memory(void)
{
	int64_t lowest = INT64_MAX;
	int64_t n;
	free_memory_reason_t r = FMR_UNKNOWN;

#ifdef _KERNEL
	/* Pages have been explicitly requested: report them as a deficit. */
	if (needfree > 0) {
		n = PAGESIZE * (-needfree);
		if (n < lowest) {
			lowest = n;
			r = FMR_NEEDFREE;
		}
	}

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	n = PAGESIZE * (freemem - lotsfree - needfree - desfree);
	if (n < lowest) {
		lowest = n;
		r = FMR_LOTSFREE;
	}

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed. anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve -
	    desfree - arc_swapfs_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_SWAPFS_MINFREE;
	}


	/*
	 * Check that we have enough availrmem that memory locking (e.g., via
	 * mlock(3C) or memcntl(2)) can still succeed.  (pages_pp_maximum
	 * stores the number of pages that cannot be locked; when availrmem
	 * drops below pages_pp_maximum, page locking mechanisms such as
	 * page_pp_lock() will fail.)
	 */
	n = PAGESIZE * (availrmem - pages_pp_maximum -
	    arc_pages_pp_reserve);
	if (n < lowest) {
		lowest = n;
		r = FMR_PAGES_PP_MAXIMUM;
	}

#if defined(__i386)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	n = vmem_size(heap_arena, VMEM_FREE) -
	    (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2);
	if (n < lowest) {
		lowest = n;
		r = FMR_HEAP_ARENA;
	}
#endif

	/*
	 * If zio data pages are being allocated out of a separate heap segment,
	 * then enforce that the size of available vmem for this arena remains
	 * above about 1/16th free.
	 *
	 * Note: The 1/16th arena free requirement was put in place
	 * to aggressively evict memory from the arc in order to avoid
	 * memory fragmentation issues.
	 */
	if (zio_arena != NULL) {
		n = vmem_size(zio_arena, VMEM_FREE) -
		    (vmem_size(zio_arena, VMEM_ALLOC) >> 4);
		if (n < lowest) {
			lowest = n;
			r = FMR_ZIO_ARENA;
		}
	}
#else
	/*
	 * Outside the kernel there is nothing to measure. Report a small
	 * deficit whenever spa_get_random(100) returns zero (presumably
	 * about one call in 100), so that the reclaim path still runs.
	 */
	if (spa_get_random(100) == 0)
		lowest = -1024;
#endif

	last_free_memory = lowest;
	last_free_reason = r;

	return (lowest);
}


/*
 * Determine if the system is under memory pressure and is asking
 * to reclaim memory.
A return value of TRUE indicates that the system 3634 * is under memory pressure and that the arc should adjust accordingly. 3635 */ 3636 static boolean_t 3637 arc_reclaim_needed(void) 3638 { 3639 return (arc_available_memory() < 0); 3640 } 3641 3642 static void 3643 arc_kmem_reap_now(void) 3644 { 3645 size_t i; 3646 kmem_cache_t *prev_cache = NULL; 3647 kmem_cache_t *prev_data_cache = NULL; 3648 extern kmem_cache_t *zio_buf_cache[]; 3649 extern kmem_cache_t *zio_data_buf_cache[]; 3650 extern kmem_cache_t *range_seg_cache; 3651 3652 #ifdef _KERNEL 3653 if (arc_meta_used >= arc_meta_limit) { 3654 /* 3655 * We are exceeding our meta-data cache limit. 3656 * Purge some DNLC entries to release holds on meta-data. 3657 */ 3658 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3659 } 3660 #if defined(__i386) 3661 /* 3662 * Reclaim unused memory from all kmem caches. 3663 */ 3664 kmem_reap(); 3665 #endif 3666 #endif 3667 3668 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3669 if (zio_buf_cache[i] != prev_cache) { 3670 prev_cache = zio_buf_cache[i]; 3671 kmem_cache_reap_now(zio_buf_cache[i]); 3672 } 3673 if (zio_data_buf_cache[i] != prev_data_cache) { 3674 prev_data_cache = zio_data_buf_cache[i]; 3675 kmem_cache_reap_now(zio_data_buf_cache[i]); 3676 } 3677 } 3678 kmem_cache_reap_now(buf_cache); 3679 kmem_cache_reap_now(hdr_full_cache); 3680 kmem_cache_reap_now(hdr_l2only_cache); 3681 kmem_cache_reap_now(range_seg_cache); 3682 3683 if (zio_arena != NULL) { 3684 /* 3685 * Ask the vmem arena to reclaim unused memory from its 3686 * quantum caches. 3687 */ 3688 vmem_qcache_reap(zio_arena); 3689 } 3690 } 3691 3692 /* 3693 * Threads can block in arc_get_data_buf() waiting for this thread to evict 3694 * enough data and signal them to proceed. When this happens, the threads in 3695 * arc_get_data_buf() are sleeping while holding the hash lock for their 3696 * particular arc header. 
Thus, we must be careful to never sleep on a
 * hash lock in this thread. This is to prevent the following deadlock:
 *
 *  - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L",
 *    waiting for the reclaim thread to signal it.
 *
 *  - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter,
 *    fails, and goes to sleep forever.
 *
 * This possible deadlock is avoided by always acquiring a hash lock
 * using mutex_tryenter() from arc_reclaim_thread().
 *
 * The thread loops until arc_reclaim_thread_exit is set. On the way out
 * it clears that flag and broadcasts arc_reclaim_thread_cv before exiting.
 */
static void
arc_reclaim_thread(void)
{
	hrtime_t		growtime = 0;
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_lock);
	while (!arc_reclaim_thread_exit) {
		int64_t free_memory = arc_available_memory();
		uint64_t evicted = 0;

		/* Eviction below runs without arc_reclaim_lock held. */
		mutex_exit(&arc_reclaim_lock);

		if (free_memory < 0) {

			arc_no_grow = B_TRUE;
			arc_warm = B_TRUE;

			/*
			 * Wait at least zfs_grow_retry (default 60) seconds
			 * before considering growing; the deadline is
			 * computed from arc_grow_retry.
			 */
			growtime = gethrtime() + SEC2NSEC(arc_grow_retry);

			arc_kmem_reap_now();

			/*
			 * If we are still low on memory, shrink the ARC
			 * so that (arc_c >> arc_shrink_shift) bytes are
			 * free.
			 */
			free_memory = arc_available_memory();

			int64_t to_free =
			    (arc_c >> arc_shrink_shift) - free_memory;
			if (to_free > 0) {
#ifdef _KERNEL
				to_free = MAX(to_free, ptob(needfree));
#endif
				arc_shrink(to_free);
			}
		} else if (free_memory < arc_c >> arc_no_grow_shift) {
			/* Not short of memory, but too close to allow growth. */
			arc_no_grow = B_TRUE;
		} else if (gethrtime() >= growtime) {
			arc_no_grow = B_FALSE;
		}

		evicted = arc_adjust();

		mutex_enter(&arc_reclaim_lock);

		/*
		 * If evicted is zero, we couldn't evict anything via
		 * arc_adjust(). This could be due to hash lock
		 * collisions, but more likely due to the majority of
		 * arc buffers being unevictable. Therefore, even if
		 * arc_size is above arc_c, another pass is unlikely to
		 * be helpful and could potentially cause us to enter an
		 * infinite loop.
		 */
		if (arc_size <= arc_c || evicted == 0) {
			/*
			 * We're either no longer overflowing, or we
			 * can't evict anything more, so we should wake
			 * up any threads before we go to sleep.
			 */
			cv_broadcast(&arc_reclaim_waiters_cv);

			/*
			 * Block until signaled, or after one second (we
			 * might need to perform arc_kmem_reap_now()
			 * even if we aren't being signalled)
			 */
			CALLB_CPR_SAFE_BEGIN(&cpr);
			(void) cv_timedwait_hires(&arc_reclaim_thread_cv,
			    &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
			CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock);
		}
	}

	/* Acknowledge the exit request to whoever is waiting on the cv. */
	arc_reclaim_thread_exit = FALSE;
	cv_broadcast(&arc_reclaim_thread_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_lock */
	thread_exit();
}

/*
 * Thread body that runs pending user eviction callbacks via
 * arc_do_user_evicts() and refreshes the ARC kstats, then sleeps on
 * arc_user_evicts_cv until signaled or for up to one second. It loops
 * until arc_user_evicts_thread_exit is set. On the way out it clears
 * that flag and broadcasts arc_user_evicts_cv before exiting.
 */
static void
arc_user_evicts_thread(void)
{
	callb_cpr_t cpr;

	CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_user_evicts_lock);
	while (!arc_user_evicts_thread_exit) {
		/* arc_do_user_evicts() takes arc_user_evicts_lock itself. */
		mutex_exit(&arc_user_evicts_lock);

		arc_do_user_evicts();

		/*
		 * This is necessary in order for the mdb ::arc dcmd to
		 * show up to date information. Since the ::arc command
		 * does not call the kstat's update function, without
		 * this call, the command may show stale stats for the
		 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
		 * with this change, the data might be up to 1 second
		 * out of date; but that should suffice. The arc_state_t
		 * structures can be queried directly if more accurate
		 * information is needed.
		 */
		if (arc_ksp != NULL)
			arc_ksp->ks_update(arc_ksp, KSTAT_READ);

		mutex_enter(&arc_user_evicts_lock);

		/*
		 * Block until signaled, or after one second (we need to
		 * call the arc's kstat update function regularly).
		 */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_user_evicts_cv,
		    &arc_user_evicts_lock, ddi_get_lbolt() + hz);
		CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock);
	}

	/* Acknowledge the exit request to whoever is waiting on the cv. */
	arc_user_evicts_thread_exit = FALSE;
	cv_broadcast(&arc_user_evicts_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_user_evicts_lock */
	thread_exit();
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from.  This function is only called
 * when we are adding new content to the cache.
 *
 * 'bytes' must be positive (asserted) unless 'state' is arc_l2c_only,
 * in which case the function returns without doing anything. Besides
 * adjusting arc_p, this may signal the reclaim thread when memory is
 * tight, or grow arc_c (up to arc_c_max) when growth is permitted.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;
	uint64_t arc_p_min = (arc_c >> arc_p_min_shift);
	int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size);
	int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size);

	if (state == arc_l2c_only)
		return;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 *	- if we just hit in the MRU ghost list, then increase
	 *	  the target size of the MRU list.
	 *	- if we just hit in the MFU ghost list, then increase
	 *	  the target size of the MFU list by decreasing the
	 *	  target size of the MRU list.
	 *
	 * The step is scaled by the ratio of the two ghost list sizes
	 * (capped at 10), and arc_p is kept within
	 * [arc_p_min, arc_c - arc_p_min].
	 */
	if (state == arc_mru_ghost) {
		mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size);
		mult = MIN(mult, 10); /* avoid wild arc_p adjustment */

		arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult);
	} else if (state == arc_mfu_ghost) {
		uint64_t delta;

		mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size);
		mult = MIN(mult, 10);

		/* never subtract more than arc_p itself */
		delta = MIN(bytes * mult, arc_p);
		arc_p = MAX(arc_p_min, arc_p - delta);
	}
	ASSERT((int64_t)arc_p >= 0);

	/* Under memory pressure: wake the reclaim thread and don't grow. */
	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thread_cv);
		return;
	}

	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		atomic_add_64(&arc_c, (int64_t)bytes);
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
		else if (state == arc_anon)
			atomic_add_64(&arc_p, (int64_t)bytes);
		if (arc_p > arc_c)
			arc_p = arc_c;
	}
	ASSERT((int64_t)arc_p >= 0);
}

/*
 * Check if arc_size has grown past our upper threshold, determined by
 * zfs_arc_overflow_shift.
 *
 * The threshold is arc_c plus the larger of SPA_MAXBLOCKSIZE and
 * (arc_c >> zfs_arc_overflow_shift).
 */
static boolean_t
arc_is_overflowing(void)
{
	/* Always allow at least one block of overflow */
	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
	    arc_c >> zfs_arc_overflow_shift);

	return (arc_size >= arc_c + overflow);
}

/*
 * The buffer, supplied as the first argument, needs a data block.  If we
 * are hitting the hard limit for the cache size, we must sleep, waiting
 * for the eviction thread to catch up.  If we're past the target size
 * but below the hard limit, we'll only signal the reclaim thread and
 * continue on.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
	arc_state_t		*state = buf->b_hdr->b_l1hdr.b_state;
	uint64_t		size = buf->b_hdr->b_size;
	arc_buf_contents_t	type = arc_buf_type(buf->b_hdr);

	arc_adapt(size, state);

	/*
	 * If arc_size is currently overflowing, and has grown past our
	 * upper limit, we must be adding data faster than the evict
	 * thread can evict.
Thus, to ensure we don't compound the 3942 * problem by adding more data and forcing arc_size to grow even 3943 * further past it's target size, we halt and wait for the 3944 * eviction thread to catch up. 3945 * 3946 * It's also possible that the reclaim thread is unable to evict 3947 * enough buffers to get arc_size below the overflow limit (e.g. 3948 * due to buffers being un-evictable, or hash lock collisions). 3949 * In this case, we want to proceed regardless if we're 3950 * overflowing; thus we don't use a while loop here. 3951 */ 3952 if (arc_is_overflowing()) { 3953 mutex_enter(&arc_reclaim_lock); 3954 3955 /* 3956 * Now that we've acquired the lock, we may no longer be 3957 * over the overflow limit, lets check. 3958 * 3959 * We're ignoring the case of spurious wake ups. If that 3960 * were to happen, it'd let this thread consume an ARC 3961 * buffer before it should have (i.e. before we're under 3962 * the overflow limit and were signalled by the reclaim 3963 * thread). As long as that is a rare occurrence, it 3964 * shouldn't cause any harm. 3965 */ 3966 if (arc_is_overflowing()) { 3967 cv_signal(&arc_reclaim_thread_cv); 3968 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 3969 } 3970 3971 mutex_exit(&arc_reclaim_lock); 3972 } 3973 3974 if (type == ARC_BUFC_METADATA) { 3975 buf->b_data = zio_buf_alloc(size); 3976 arc_space_consume(size, ARC_SPACE_META); 3977 } else { 3978 ASSERT(type == ARC_BUFC_DATA); 3979 buf->b_data = zio_data_buf_alloc(size); 3980 arc_space_consume(size, ARC_SPACE_DATA); 3981 } 3982 3983 /* 3984 * Update the state size. Note that ghost states have a 3985 * "ghost size" and so don't need to be updated. 3986 */ 3987 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3988 arc_buf_hdr_t *hdr = buf->b_hdr; 3989 arc_state_t *state = hdr->b_l1hdr.b_state; 3990 3991 (void) refcount_add_many(&state->arcs_size, size, buf); 3992 3993 /* 3994 * If this is reached via arc_read, the link is 3995 * protected by the hash lock. 
If reached via 3996 * arc_buf_alloc, the header should not be accessed by 3997 * any other thread. And, if reached via arc_read_done, 3998 * the hash lock will protect it if it's found in the 3999 * hash table; otherwise no other thread should be 4000 * trying to [add|remove]_reference it. 4001 */ 4002 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 4003 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4004 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 4005 size); 4006 } 4007 /* 4008 * If we are growing the cache, and we are adding anonymous 4009 * data, and we have outgrown arc_p, update arc_p 4010 */ 4011 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 4012 (refcount_count(&arc_anon->arcs_size) + 4013 refcount_count(&arc_mru->arcs_size) > arc_p)) 4014 arc_p = MIN(arc_c, arc_p + size); 4015 } 4016 } 4017 4018 /* 4019 * This routine is called whenever a buffer is accessed. 4020 * NOTE: the hash lock is dropped in this function. 4021 */ 4022 static void 4023 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4024 { 4025 clock_t now; 4026 4027 ASSERT(MUTEX_HELD(hash_lock)); 4028 ASSERT(HDR_HAS_L1HDR(hdr)); 4029 4030 if (hdr->b_l1hdr.b_state == arc_anon) { 4031 /* 4032 * This buffer is not in the cache, and does not 4033 * appear in our "ghost" list. Add the new buffer 4034 * to the MRU state. 4035 */ 4036 4037 ASSERT0(hdr->b_l1hdr.b_arc_access); 4038 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4039 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4040 arc_change_state(arc_mru, hdr, hash_lock); 4041 4042 } else if (hdr->b_l1hdr.b_state == arc_mru) { 4043 now = ddi_get_lbolt(); 4044 4045 /* 4046 * If this buffer is here because of a prefetch, then either: 4047 * - clear the flag if this is a "referencing" read 4048 * (any subsequent access will bump this into the MFU state). 4049 * or 4050 * - move the buffer to the head of the list if this is 4051 * another prefetch (to make it less likely to be evicted). 
4052 */ 4053 if (HDR_PREFETCH(hdr)) { 4054 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4055 /* link protected by hash lock */ 4056 ASSERT(multilist_link_active( 4057 &hdr->b_l1hdr.b_arc_node)); 4058 } else { 4059 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4060 ARCSTAT_BUMP(arcstat_mru_hits); 4061 } 4062 hdr->b_l1hdr.b_arc_access = now; 4063 return; 4064 } 4065 4066 /* 4067 * This buffer has been "accessed" only once so far, 4068 * but it is still in the cache. Move it to the MFU 4069 * state. 4070 */ 4071 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 4072 /* 4073 * More than 125ms have passed since we 4074 * instantiated this buffer. Move it to the 4075 * most frequently used state. 4076 */ 4077 hdr->b_l1hdr.b_arc_access = now; 4078 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4079 arc_change_state(arc_mfu, hdr, hash_lock); 4080 } 4081 ARCSTAT_BUMP(arcstat_mru_hits); 4082 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 4083 arc_state_t *new_state; 4084 /* 4085 * This buffer has been "accessed" recently, but 4086 * was evicted from the cache. Move it to the 4087 * MFU state. 4088 */ 4089 4090 if (HDR_PREFETCH(hdr)) { 4091 new_state = arc_mru; 4092 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 4093 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 4094 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 4095 } else { 4096 new_state = arc_mfu; 4097 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4098 } 4099 4100 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4101 arc_change_state(new_state, hdr, hash_lock); 4102 4103 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 4104 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 4105 /* 4106 * This buffer has been accessed more than once and is 4107 * still in the cache. Keep it in the MFU state. 4108 * 4109 * NOTE: an add_reference() that occurred when we did 4110 * the arc_read() will have kicked this off the list. 4111 * If it was a prefetch, we will explicitly move it to 4112 * the head of the list now. 
4113 */ 4114 if ((HDR_PREFETCH(hdr)) != 0) { 4115 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4116 /* link protected by hash_lock */ 4117 ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4118 } 4119 ARCSTAT_BUMP(arcstat_mfu_hits); 4120 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4121 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 4122 arc_state_t *new_state = arc_mfu; 4123 /* 4124 * This buffer has been accessed more than once but has 4125 * been evicted from the cache. Move it back to the 4126 * MFU state. 4127 */ 4128 4129 if (HDR_PREFETCH(hdr)) { 4130 /* 4131 * This is a prefetch access... 4132 * move this block back to the MRU state. 4133 */ 4134 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 4135 new_state = arc_mru; 4136 } 4137 4138 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4139 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4140 arc_change_state(new_state, hdr, hash_lock); 4141 4142 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 4143 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 4144 /* 4145 * This buffer is on the 2nd Level ARC. 
4146 */ 4147 4148 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 4149 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 4150 arc_change_state(arc_mfu, hdr, hash_lock); 4151 } else { 4152 ASSERT(!"invalid arc state"); 4153 } 4154 } 4155 4156 /* a generic arc_done_func_t which you can use */ 4157 /* ARGSUSED */ 4158 void 4159 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 4160 { 4161 if (zio == NULL || zio->io_error == 0) 4162 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 4163 VERIFY(arc_buf_remove_ref(buf, arg)); 4164 } 4165 4166 /* a generic arc_done_func_t */ 4167 void 4168 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 4169 { 4170 arc_buf_t **bufp = arg; 4171 if (zio && zio->io_error) { 4172 VERIFY(arc_buf_remove_ref(buf, arg)); 4173 *bufp = NULL; 4174 } else { 4175 *bufp = buf; 4176 ASSERT(buf->b_data); 4177 } 4178 } 4179 4180 static void 4181 arc_read_done(zio_t *zio) 4182 { 4183 arc_buf_hdr_t *hdr; 4184 arc_buf_t *buf; 4185 arc_buf_t *abuf; /* buffer we're assigning to callback */ 4186 kmutex_t *hash_lock = NULL; 4187 arc_callback_t *callback_list, *acb; 4188 int freeable = FALSE; 4189 4190 buf = zio->io_private; 4191 hdr = buf->b_hdr; 4192 4193 /* 4194 * The hdr was inserted into hash-table and removed from lists 4195 * prior to starting I/O. We should find this header, since 4196 * it's in the hash table, and it should be legit since it's 4197 * not possible to evict it during the I/O. The only possible 4198 * reason for it not to be found is if we were freed during the 4199 * read. 
4200 */ 4201 if (HDR_IN_HASH_TABLE(hdr)) { 4202 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 4203 ASSERT3U(hdr->b_dva.dva_word[0], ==, 4204 BP_IDENTITY(zio->io_bp)->dva_word[0]); 4205 ASSERT3U(hdr->b_dva.dva_word[1], ==, 4206 BP_IDENTITY(zio->io_bp)->dva_word[1]); 4207 4208 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 4209 &hash_lock); 4210 4211 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 4212 hash_lock == NULL) || 4213 (found == hdr && 4214 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 4215 (found == hdr && HDR_L2_READING(hdr))); 4216 } 4217 4218 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 4219 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 4220 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 4221 4222 /* byteswap if necessary */ 4223 callback_list = hdr->b_l1hdr.b_acb; 4224 ASSERT(callback_list != NULL); 4225 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 4226 dmu_object_byteswap_t bswap = 4227 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 4228 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 4229 byteswap_uint64_array : 4230 dmu_ot_byteswap[bswap].ob_func; 4231 func(buf->b_data, hdr->b_size); 4232 } 4233 4234 arc_cksum_compute(buf, B_FALSE); 4235 arc_buf_watch(buf); 4236 4237 if (hash_lock && zio->io_error == 0 && 4238 hdr->b_l1hdr.b_state == arc_anon) { 4239 /* 4240 * Only call arc_access on anonymous buffers. This is because 4241 * if we've issued an I/O for an evicted buffer, we've already 4242 * called arc_access (to prevent any simultaneous readers from 4243 * getting confused). 
4244 */ 4245 arc_access(hdr, hash_lock); 4246 } 4247 4248 /* create copies of the data buffer for the callers */ 4249 abuf = buf; 4250 for (acb = callback_list; acb; acb = acb->acb_next) { 4251 if (acb->acb_done) { 4252 if (abuf == NULL) { 4253 ARCSTAT_BUMP(arcstat_duplicate_reads); 4254 abuf = arc_buf_clone(buf); 4255 } 4256 acb->acb_buf = abuf; 4257 abuf = NULL; 4258 } 4259 } 4260 hdr->b_l1hdr.b_acb = NULL; 4261 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4262 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 4263 if (abuf == buf) { 4264 ASSERT(buf->b_efunc == NULL); 4265 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4266 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4267 } 4268 4269 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 4270 callback_list != NULL); 4271 4272 if (zio->io_error != 0) { 4273 hdr->b_flags |= ARC_FLAG_IO_ERROR; 4274 if (hdr->b_l1hdr.b_state != arc_anon) 4275 arc_change_state(arc_anon, hdr, hash_lock); 4276 if (HDR_IN_HASH_TABLE(hdr)) 4277 buf_hash_remove(hdr); 4278 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4279 } 4280 4281 /* 4282 * Broadcast before we drop the hash_lock to avoid the possibility 4283 * that the hdr (and hence the cv) might be freed before we get to 4284 * the cv_broadcast(). 4285 */ 4286 cv_broadcast(&hdr->b_l1hdr.b_cv); 4287 4288 if (hash_lock != NULL) { 4289 mutex_exit(hash_lock); 4290 } else { 4291 /* 4292 * This block was freed while we waited for the read to 4293 * complete. It has been removed from the hash table and 4294 * moved to the anonymous state (so that it won't show up 4295 * in the cache). 
4296 */ 4297 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 4298 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 4299 } 4300 4301 /* execute each callback and free its structure */ 4302 while ((acb = callback_list) != NULL) { 4303 if (acb->acb_done) 4304 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 4305 4306 if (acb->acb_zio_dummy != NULL) { 4307 acb->acb_zio_dummy->io_error = zio->io_error; 4308 zio_nowait(acb->acb_zio_dummy); 4309 } 4310 4311 callback_list = acb->acb_next; 4312 kmem_free(acb, sizeof (arc_callback_t)); 4313 } 4314 4315 if (freeable) 4316 arc_hdr_destroy(hdr); 4317 } 4318 4319 /* 4320 * "Read" the block at the specified DVA (in bp) via the 4321 * cache. If the block is found in the cache, invoke the provided 4322 * callback immediately and return. Note that the `zio' parameter 4323 * in the callback will be NULL in this case, since no IO was 4324 * required. If the block is not in the cache pass the read request 4325 * on to the spa with a substitute callback function, so that the 4326 * requested block will be added to the cache. 4327 * 4328 * If a read request arrives for a block that has a read in-progress, 4329 * either wait for the in-progress read to complete (and return the 4330 * results); or, if this is a read with a "done" func, add a record 4331 * to the read to invoke the "done" func when the read completes, 4332 * and return; or just return. 4333 * 4334 * arc_read_done() will invoke all the requested "done" functions 4335 * for readers of this block. 
*/
int
arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
    void *private, zio_priority_t priority, int zio_flags,
    arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
{
	arc_buf_hdr_t *hdr = NULL;
	arc_buf_t *buf = NULL;
	kmutex_t *hash_lock = NULL;
	zio_t *rzio;
	uint64_t guid = spa_load_guid(spa);

	ASSERT(!BP_IS_EMBEDDED(bp) ||
	    BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);

top:
	if (!BP_IS_EMBEDDED(bp)) {
		/*
		 * Only non-embedded BPs are looked up in the hash table.
		 * Embedded BP's have no DVA and require no I/O to "read";
		 * for those hdr stays NULL and an anonymous arc buf is
		 * created to back the data on the miss path below.
		 */
		hdr = buf_hash_find(guid, bp, &hash_lock);
	}

	if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) {

		*arc_flags |= ARC_FLAG_CACHED;

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
			    priority == ZIO_PRIORITY_SYNC_READ) {
				/*
				 * This sync read must wait for an
				 * in-progress async read (e.g. a predictive
				 * prefetch).  Async reads are queued
				 * separately at the vdev_queue layer, so
				 * this is a form of priority inversion.
				 * Ideally, we would "inherit" the demand
				 * i/o's priority by moving the i/o from
				 * the async queue to the synchronous queue,
				 * but there is currently no mechanism to do
				 * so.  Track this so that we can evaluate
				 * the magnitude of this potential performance
				 * problem.
				 *
				 * Note that if the prefetch i/o is already
				 * active (has been issued to the device),
				 * the prefetch improved performance, because
				 * we issued it sooner than we would have
				 * without the prefetch.
				 */
				DTRACE_PROBE1(arc__sync__wait__for__async,
				    arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(arcstat_sync_wait_for_async);
			}
			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
				hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
			}

			if (*arc_flags & ARC_FLAG_WAIT) {
				/*
				 * Sleep until the in-progress read wakes
				 * us, then redo the lookup from scratch.
				 */
				cv_wait(&hdr->b_l1hdr.b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			ASSERT(*arc_flags & ARC_FLAG_NOWAIT);

			if (done) {
				arc_callback_t *acb = NULL;

				/*
				 * Queue our callback on the in-progress
				 * read; arc_read_done() will invoke it.
				 */
				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, NULL, zio_flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_l1hdr.b_acb;
				hdr->b_l1hdr.b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			}
			mutex_exit(hash_lock);
			return (0);
		}

		ASSERT(hdr->b_l1hdr.b_state == arc_mru ||
		    hdr->b_l1hdr.b_state == arc_mfu);

		if (done) {
			if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) {
				/*
				 * This is a demand read which does not have to
				 * wait for i/o because we did a predictive
				 * prefetch i/o for it, which has completed.
				 */
				DTRACE_PROBE1(
				    arc__demand__hit__predictive__prefetch,
				    arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(
				    arcstat_demand_hit_predictive_prefetch);
				hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH;
			}
			add_reference(hdr, hash_lock, private);
			/*
			 * If this block is already in use, create a new
			 * copy of the data so that we will be guaranteed
			 * that arc_release() will always succeed.
			 */
			buf = hdr->b_l1hdr.b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			if (HDR_BUF_AVAILABLE(hdr)) {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
			} else {
				buf = arc_buf_clone(buf);
			}

		} else if (*arc_flags & ARC_FLAG_PREFETCH &&
		    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
			hdr->b_flags |= ARC_FLAG_PREFETCH;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		if (*arc_flags & ARC_FLAG_L2CACHE)
			hdr->b_flags |= ARC_FLAG_L2CACHE;
		if (*arc_flags & ARC_FLAG_L2COMPRESS)
			hdr->b_flags |= ARC_FLAG_L2COMPRESS;
		mutex_exit(hash_lock);
		ARCSTAT_BUMP(arcstat_hits);
		arc_update_hit_stat(hdr, B_TRUE);

		/* cache hit: the callback runs with a NULL zio */
		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;
		vdev_t *vd = NULL;
		uint64_t addr = 0;
		boolean_t devw = B_FALSE;
		enum zio_compress b_compress = ZIO_COMPRESS_OFF;
		int32_t b_asize = 0;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists = NULL;
			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
			buf = arc_buf_alloc(spa, size, private, type);
			hdr = buf->b_hdr;
			if (!BP_IS_EMBEDDED(bp)) {
				hdr->b_dva = *BP_IDENTITY(bp);
				hdr->b_birth = BP_PHYSICAL_BIRTH(bp);
				exists = buf_hash_insert(hdr, &hash_lock);
			}
			if (exists != NULL) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				buf_discard_identity(hdr);
				(void) arc_buf_remove_ref(buf, private);
				goto top; /* restart the IO request */
			}

			/*
			 * If there is a callback, we pass our reference to
			 * it; otherwise we remove our reference.
			 */
			if (done == NULL) {
				(void) remove_reference(hdr, hash_lock,
				    private);
			}
			if (*arc_flags & ARC_FLAG_PREFETCH)
				hdr->b_flags |= ARC_FLAG_PREFETCH;
			if (*arc_flags & ARC_FLAG_L2CACHE)
				hdr->b_flags |= ARC_FLAG_L2CACHE;
			if (*arc_flags & ARC_FLAG_L2COMPRESS)
				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
			if (BP_GET_LEVEL(bp) > 0)
				hdr->b_flags |= ARC_FLAG_INDIRECT;
		} else {
			/*
			 * This block is in the ghost cache. If it was L2-only
			 * (and thus didn't have an L1 hdr), we realloc the
			 * header to add an L1 hdr.
			 */
			if (!HDR_HAS_L1HDR(hdr)) {
				hdr = arc_hdr_realloc(hdr, hdr_l2only_cache,
				    hdr_full_cache);
			}

			ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
			ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);

			/*
			 * If there is a callback, we pass a reference to it.
			 */
			if (done != NULL)
				add_reference(hdr, hash_lock, private);
			if (*arc_flags & ARC_FLAG_PREFETCH)
				hdr->b_flags |= ARC_FLAG_PREFETCH;
			if (*arc_flags & ARC_FLAG_L2CACHE)
				hdr->b_flags |= ARC_FLAG_L2CACHE;
			if (*arc_flags & ARC_FLAG_L2COMPRESS)
				hdr->b_flags |= ARC_FLAG_L2COMPRESS;
			/* attach a fresh, empty buf to the ghost header */
			buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE);
			buf->b_hdr = hdr;
			buf->b_data = NULL;
			buf->b_efunc = NULL;
			buf->b_private = NULL;
			buf->b_next = NULL;
			hdr->b_l1hdr.b_buf = buf;
			ASSERT0(hdr->b_l1hdr.b_datacnt);
			hdr->b_l1hdr.b_datacnt = 1;
			arc_get_data_buf(buf);
			arc_access(hdr, hash_lock);
		}

		if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
			hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH;
		ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;

		ASSERT(hdr->b_l1hdr.b_acb == NULL);
		hdr->b_l1hdr.b_acb = acb;
		hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS;

		/*
		 * Snapshot the L2ARC location into locals here, before
		 * the hash lock is released below.
		 */
		if (HDR_HAS_L2HDR(hdr) &&
		    (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
			devw = hdr->b_l2hdr.b_dev->l2ad_writing;
			addr = hdr->b_l2hdr.b_daddr;
			b_compress = hdr->b_l2hdr.b_compress;
			b_asize = hdr->b_l2hdr.b_asize;
			/*
			 * Lock out device removal.
			 */
			if (vdev_is_dead(vd) ||
			    !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
				vd = NULL;
		}

		/* hash_lock is NULL for embedded BPs (no hash insert) */
		if (hash_lock != NULL)
			mutex_exit(hash_lock);

		/*
		 * At this point, we have a level 1 cache miss.  Try again in
		 * L2ARC if possible.
		 */
		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
		    uint64_t, size, zbookmark_phys_t *, zb);
		ARCSTAT_BUMP(arcstat_misses);
		arc_update_hit_stat(hdr, B_FALSE);

		if (priority == ZIO_PRIORITY_ASYNC_READ)
			hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ;
		else
			hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ;

		if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
			/*
			 * Read from the L2ARC if the following are true:
			 * 1. The L2ARC vdev was previously cached.
			 * 2. This buffer still has L2ARC metadata.
			 * 3. This buffer isn't currently writing to the L2ARC.
			 * 4. The L2ARC entry wasn't evicted, which may
			 *    also have invalidated the vdev.
			 * 5. It is not the case that this is a prefetch
			 *    while l2arc_noprefetch is set.
			 */
			if (HDR_HAS_L2HDR(hdr) &&
			    !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
			    !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
				l2arc_read_callback_t *cb;

				DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(arcstat_l2_hits);

				cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
				    KM_SLEEP);
				cb->l2rcb_buf = buf;
				cb->l2rcb_spa = spa;
				cb->l2rcb_bp = *bp;
				cb->l2rcb_zb = *zb;
				cb->l2rcb_flags = zio_flags;
				cb->l2rcb_compress = b_compress;

				ASSERT(addr >= VDEV_LABEL_START_SIZE &&
				    addr + size < vd->vdev_psize -
				    VDEV_LABEL_END_SIZE);

				/*
				 * l2arc read.  The SCL_L2ARC lock will be
				 * released by l2arc_read_done().
				 * Issue a null zio if the underlying buffer
				 * was squashed to zero size by compression.
				 */
				if (b_compress == ZIO_COMPRESS_EMPTY) {
					rzio = zio_null(pio, spa, vd,
					    l2arc_read_done, cb,
					    zio_flags | ZIO_FLAG_DONT_CACHE |
					    ZIO_FLAG_CANFAIL |
					    ZIO_FLAG_DONT_PROPAGATE |
					    ZIO_FLAG_DONT_RETRY);
				} else {
					rzio = zio_read_phys(pio, vd, addr,
					    b_asize, buf->b_data,
					    ZIO_CHECKSUM_OFF,
					    l2arc_read_done, cb, priority,
					    zio_flags | ZIO_FLAG_DONT_CACHE |
					    ZIO_FLAG_CANFAIL |
					    ZIO_FLAG_DONT_PROPAGATE |
					    ZIO_FLAG_DONT_RETRY, B_FALSE);
				}
				DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
				    zio_t *, rzio);
				ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize);

				if (*arc_flags & ARC_FLAG_NOWAIT) {
					zio_nowait(rzio);
					return (0);
				}

				ASSERT(*arc_flags & ARC_FLAG_WAIT);
				if (zio_wait(rzio) == 0)
					return (0);

				/* l2arc read error; goto zio_read() */
			} else {
				DTRACE_PROBE1(l2arc__miss,
				    arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(arcstat_l2_misses);
				if (HDR_L2_WRITING(hdr))
					ARCSTAT_BUMP(arcstat_l2_rw_clash);
				spa_config_exit(spa, SCL_L2ARC, vd);
			}
		} else {
			/* release the config lock taken by tryenter above */
			if (vd != NULL)
				spa_config_exit(spa, SCL_L2ARC, vd);
			if (l2arc_ndev != 0) {
				DTRACE_PROBE1(l2arc__miss,
				    arc_buf_hdr_t *, hdr);
				ARCSTAT_BUMP(arcstat_l2_misses);
			}
		}

		/* Read from the pool; arc_read_done() completes the buf. */
		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, zio_flags, zb);

		if (*arc_flags & ARC_FLAG_WAIT)
			return (zio_wait(rzio));

		ASSERT(*arc_flags & ARC_FLAG_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}

/*
 * Install an eviction callback (func, called with private) on buf.
 * The asserts require that buf has a header which is not in the
 * arc_anon state, that the header is referenced unless func is NULL,
 * that no callback is installed yet, and that the header's buffer is
 * not flagged as available.
 */
void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
	ASSERT(buf->b_hdr != NULL);
	ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon);
	ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) ||
	    func == NULL);
	ASSERT(buf->b_efunc == NULL);
	ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr));

	buf->b_efunc = func;
	buf->b_private = private;
}

/*
 * Notify the arc that a block was freed, and thus will never be used again.
 *
 * If the block is cached and its buffer is flagged available, the
 * buffer is claimed with a temporary FTAG reference, released from the
 * cache with arc_release() and then dropped.  Otherwise nothing is done
 * beyond releasing the hash lock.
 */
void
arc_freed(spa_t *spa, const blkptr_t *bp)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	uint64_t guid = spa_load_guid(spa);

	ASSERT(!BP_IS_EMBEDDED(bp));

	hdr = buf_hash_find(guid, bp, &hash_lock);
	if (hdr == NULL)
		return;
	if (HDR_BUF_AVAILABLE(hdr)) {
		arc_buf_t *buf = hdr->b_l1hdr.b_buf;
		add_reference(hdr, hash_lock, FTAG);
		hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE;
		mutex_exit(hash_lock);

		arc_release(buf, FTAG);
		(void) arc_buf_remove_ref(buf, FTAG);
	} else {
		mutex_exit(hash_lock);
	}

}

/*
 * Clear the user eviction callback set by arc_set_callback(), first calling
 * it if it exists.  Because the presence of a callback keeps an arc_buf cached
 * clearing the callback may result in the arc_buf being destroyed.  However,
 * it will not result in the *last* arc_buf being destroyed, hence the data
 * will remain cached in the ARC.
We make a copy of the arc buffer here so 4749 * that we can process the callback without holding any locks. 4750 * 4751 * It's possible that the callback is already in the process of being cleared 4752 * by another thread. In this case we can not clear the callback. 4753 * 4754 * Returns B_TRUE if the callback was successfully called and cleared. 4755 */ 4756 boolean_t 4757 arc_clear_callback(arc_buf_t *buf) 4758 { 4759 arc_buf_hdr_t *hdr; 4760 kmutex_t *hash_lock; 4761 arc_evict_func_t *efunc = buf->b_efunc; 4762 void *private = buf->b_private; 4763 4764 mutex_enter(&buf->b_evict_lock); 4765 hdr = buf->b_hdr; 4766 if (hdr == NULL) { 4767 /* 4768 * We are in arc_do_user_evicts(). 4769 */ 4770 ASSERT(buf->b_data == NULL); 4771 mutex_exit(&buf->b_evict_lock); 4772 return (B_FALSE); 4773 } else if (buf->b_data == NULL) { 4774 /* 4775 * We are on the eviction list; process this buffer now 4776 * but let arc_do_user_evicts() do the reaping. 4777 */ 4778 buf->b_efunc = NULL; 4779 mutex_exit(&buf->b_evict_lock); 4780 VERIFY0(efunc(private)); 4781 return (B_TRUE); 4782 } 4783 hash_lock = HDR_LOCK(hdr); 4784 mutex_enter(hash_lock); 4785 hdr = buf->b_hdr; 4786 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4787 4788 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4789 hdr->b_l1hdr.b_datacnt); 4790 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4791 hdr->b_l1hdr.b_state == arc_mfu); 4792 4793 buf->b_efunc = NULL; 4794 buf->b_private = NULL; 4795 4796 if (hdr->b_l1hdr.b_datacnt > 1) { 4797 mutex_exit(&buf->b_evict_lock); 4798 arc_buf_destroy(buf, TRUE); 4799 } else { 4800 ASSERT(buf == hdr->b_l1hdr.b_buf); 4801 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4802 mutex_exit(&buf->b_evict_lock); 4803 } 4804 4805 mutex_exit(hash_lock); 4806 VERIFY0(efunc(private)); 4807 return (B_TRUE); 4808 } 4809 4810 /* 4811 * Release this buffer from the cache, making it an anonymous buffer. This 4812 * must be done after a read and prior to modifying the buffer contents. 
4813 * If the buffer has more than one reference, we must make 4814 * a new hdr for the buffer. 4815 */ 4816 void 4817 arc_release(arc_buf_t *buf, void *tag) 4818 { 4819 arc_buf_hdr_t *hdr = buf->b_hdr; 4820 4821 /* 4822 * It would be nice to assert that if it's DMU metadata (level > 4823 * 0 || it's the dnode file), then it must be syncing context. 4824 * But we don't know that information at this level. 4825 */ 4826 4827 mutex_enter(&buf->b_evict_lock); 4828 4829 ASSERT(HDR_HAS_L1HDR(hdr)); 4830 4831 /* 4832 * We don't grab the hash lock prior to this check, because if 4833 * the buffer's header is in the arc_anon state, it won't be 4834 * linked into the hash table. 4835 */ 4836 if (hdr->b_l1hdr.b_state == arc_anon) { 4837 mutex_exit(&buf->b_evict_lock); 4838 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4839 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4840 ASSERT(!HDR_HAS_L2HDR(hdr)); 4841 ASSERT(BUF_EMPTY(hdr)); 4842 4843 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4844 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4845 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4846 4847 ASSERT3P(buf->b_efunc, ==, NULL); 4848 ASSERT3P(buf->b_private, ==, NULL); 4849 4850 hdr->b_l1hdr.b_arc_access = 0; 4851 arc_buf_thaw(buf); 4852 4853 return; 4854 } 4855 4856 kmutex_t *hash_lock = HDR_LOCK(hdr); 4857 mutex_enter(hash_lock); 4858 4859 /* 4860 * This assignment is only valid as long as the hash_lock is 4861 * held, we must be careful not to reference state or the 4862 * b_state field after dropping the lock. 
4863 */ 4864 arc_state_t *state = hdr->b_l1hdr.b_state; 4865 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4866 ASSERT3P(state, !=, arc_anon); 4867 4868 /* this buffer is not on any list */ 4869 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4870 4871 if (HDR_HAS_L2HDR(hdr)) { 4872 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4873 4874 /* 4875 * We have to recheck this conditional again now that 4876 * we're holding the l2ad_mtx to prevent a race with 4877 * another thread which might be concurrently calling 4878 * l2arc_evict(). In that case, l2arc_evict() might have 4879 * destroyed the header's L2 portion as we were waiting 4880 * to acquire the l2ad_mtx. 4881 */ 4882 if (HDR_HAS_L2HDR(hdr)) 4883 arc_hdr_l2hdr_destroy(hdr); 4884 4885 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4886 } 4887 4888 /* 4889 * Do we have more than one buf? 4890 */ 4891 if (hdr->b_l1hdr.b_datacnt > 1) { 4892 arc_buf_hdr_t *nhdr; 4893 arc_buf_t **bufp; 4894 uint64_t blksz = hdr->b_size; 4895 uint64_t spa = hdr->b_spa; 4896 arc_buf_contents_t type = arc_buf_type(hdr); 4897 uint32_t flags = hdr->b_flags; 4898 4899 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4900 /* 4901 * Pull the data off of this hdr and attach it to 4902 * a new anonymous hdr. 4903 */ 4904 (void) remove_reference(hdr, hash_lock, tag); 4905 bufp = &hdr->b_l1hdr.b_buf; 4906 while (*bufp != buf) 4907 bufp = &(*bufp)->b_next; 4908 *bufp = buf->b_next; 4909 buf->b_next = NULL; 4910 4911 ASSERT3P(state, !=, arc_l2c_only); 4912 4913 (void) refcount_remove_many( 4914 &state->arcs_size, hdr->b_size, buf); 4915 4916 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4917 ASSERT3P(state, !=, arc_l2c_only); 4918 uint64_t *size = &state->arcs_lsize[type]; 4919 ASSERT3U(*size, >=, hdr->b_size); 4920 atomic_add_64(size, -hdr->b_size); 4921 } 4922 4923 /* 4924 * We're releasing a duplicate user data buffer, update 4925 * our statistics accordingly. 
4926 */ 4927 if (HDR_ISTYPE_DATA(hdr)) { 4928 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4929 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4930 -hdr->b_size); 4931 } 4932 hdr->b_l1hdr.b_datacnt -= 1; 4933 arc_cksum_verify(buf); 4934 arc_buf_unwatch(buf); 4935 4936 mutex_exit(hash_lock); 4937 4938 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4939 nhdr->b_size = blksz; 4940 nhdr->b_spa = spa; 4941 4942 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4943 nhdr->b_flags |= arc_bufc_to_flags(type); 4944 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4945 4946 nhdr->b_l1hdr.b_buf = buf; 4947 nhdr->b_l1hdr.b_datacnt = 1; 4948 nhdr->b_l1hdr.b_state = arc_anon; 4949 nhdr->b_l1hdr.b_arc_access = 0; 4950 nhdr->b_l1hdr.b_tmp_cdata = NULL; 4951 nhdr->b_freeze_cksum = NULL; 4952 4953 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4954 buf->b_hdr = nhdr; 4955 mutex_exit(&buf->b_evict_lock); 4956 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); 4957 } else { 4958 mutex_exit(&buf->b_evict_lock); 4959 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4960 /* protected by hash lock, or hdr is on arc_anon */ 4961 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4962 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4963 arc_change_state(arc_anon, hdr, hash_lock); 4964 hdr->b_l1hdr.b_arc_access = 0; 4965 mutex_exit(hash_lock); 4966 4967 buf_discard_identity(hdr); 4968 arc_buf_thaw(buf); 4969 } 4970 buf->b_efunc = NULL; 4971 buf->b_private = NULL; 4972 } 4973 4974 int 4975 arc_released(arc_buf_t *buf) 4976 { 4977 int released; 4978 4979 mutex_enter(&buf->b_evict_lock); 4980 released = (buf->b_data != NULL && 4981 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4982 mutex_exit(&buf->b_evict_lock); 4983 return (released); 4984 } 4985 4986 #ifdef ZFS_DEBUG 4987 int 4988 arc_referenced(arc_buf_t *buf) 4989 { 4990 int referenced; 4991 4992 mutex_enter(&buf->b_evict_lock); 4993 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4994 mutex_exit(&buf->b_evict_lock); 4995 return 
(referenced); 4996 } 4997 #endif 4998 4999 static void 5000 arc_write_ready(zio_t *zio) 5001 { 5002 arc_write_callback_t *callback = zio->io_private; 5003 arc_buf_t *buf = callback->awcb_buf; 5004 arc_buf_hdr_t *hdr = buf->b_hdr; 5005 5006 ASSERT(HDR_HAS_L1HDR(hdr)); 5007 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 5008 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5009 callback->awcb_ready(zio, buf, callback->awcb_private); 5010 5011 /* 5012 * If the IO is already in progress, then this is a re-write 5013 * attempt, so we need to thaw and re-compute the cksum. 5014 * It is the responsibility of the callback to handle the 5015 * accounting for any re-write attempt. 5016 */ 5017 if (HDR_IO_IN_PROGRESS(hdr)) { 5018 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 5019 if (hdr->b_freeze_cksum != NULL) { 5020 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 5021 hdr->b_freeze_cksum = NULL; 5022 } 5023 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 5024 } 5025 arc_cksum_compute(buf, B_FALSE); 5026 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 5027 } 5028 5029 /* 5030 * The SPA calls this callback for each physical write that happens on behalf 5031 * of a logical write. See the comment in dbuf_write_physdone() for details. 
5032 */ 5033 static void 5034 arc_write_physdone(zio_t *zio) 5035 { 5036 arc_write_callback_t *cb = zio->io_private; 5037 if (cb->awcb_physdone != NULL) 5038 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 5039 } 5040 5041 static void 5042 arc_write_done(zio_t *zio) 5043 { 5044 arc_write_callback_t *callback = zio->io_private; 5045 arc_buf_t *buf = callback->awcb_buf; 5046 arc_buf_hdr_t *hdr = buf->b_hdr; 5047 5048 ASSERT(hdr->b_l1hdr.b_acb == NULL); 5049 5050 if (zio->io_error == 0) { 5051 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 5052 buf_discard_identity(hdr); 5053 } else { 5054 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 5055 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 5056 } 5057 } else { 5058 ASSERT(BUF_EMPTY(hdr)); 5059 } 5060 5061 /* 5062 * If the block to be written was all-zero or compressed enough to be 5063 * embedded in the BP, no write was performed so there will be no 5064 * dva/birth/checksum. The buffer must therefore remain anonymous 5065 * (and uncached). 5066 */ 5067 if (!BUF_EMPTY(hdr)) { 5068 arc_buf_hdr_t *exists; 5069 kmutex_t *hash_lock; 5070 5071 ASSERT(zio->io_error == 0); 5072 5073 arc_cksum_verify(buf); 5074 5075 exists = buf_hash_insert(hdr, &hash_lock); 5076 if (exists != NULL) { 5077 /* 5078 * This can only happen if we overwrite for 5079 * sync-to-convergence, because we remove 5080 * buffers from the hash table when we arc_free(). 
5081 */ 5082 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 5083 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5084 panic("bad overwrite, hdr=%p exists=%p", 5085 (void *)hdr, (void *)exists); 5086 ASSERT(refcount_is_zero( 5087 &exists->b_l1hdr.b_refcnt)); 5088 arc_change_state(arc_anon, exists, hash_lock); 5089 mutex_exit(hash_lock); 5090 arc_hdr_destroy(exists); 5091 exists = buf_hash_insert(hdr, &hash_lock); 5092 ASSERT3P(exists, ==, NULL); 5093 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 5094 /* nopwrite */ 5095 ASSERT(zio->io_prop.zp_nopwrite); 5096 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 5097 panic("bad nopwrite, hdr=%p exists=%p", 5098 (void *)hdr, (void *)exists); 5099 } else { 5100 /* Dedup */ 5101 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 5102 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 5103 ASSERT(BP_GET_DEDUP(zio->io_bp)); 5104 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 5105 } 5106 } 5107 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5108 /* if it's not anon, we are doing a scrub */ 5109 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 5110 arc_access(hdr, hash_lock); 5111 mutex_exit(hash_lock); 5112 } else { 5113 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 5114 } 5115 5116 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5117 callback->awcb_done(zio, buf, callback->awcb_private); 5118 5119 kmem_free(callback, sizeof (arc_write_callback_t)); 5120 } 5121 5122 zio_t * 5123 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 5124 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 5125 const zio_prop_t *zp, arc_done_func_t *ready, arc_done_func_t *physdone, 5126 arc_done_func_t *done, void *private, zio_priority_t priority, 5127 int zio_flags, const zbookmark_phys_t *zb) 5128 { 5129 arc_buf_hdr_t *hdr = buf->b_hdr; 5130 arc_write_callback_t *callback; 5131 zio_t *zio; 5132 5133 ASSERT(ready != NULL); 5134 ASSERT(done != NULL); 5135 ASSERT(!HDR_IO_ERROR(hdr)); 5136 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 5137 ASSERT(hdr->b_l1hdr.b_acb == NULL); 5138 
ASSERT(hdr->b_l1hdr.b_datacnt > 0); 5139 if (l2arc) 5140 hdr->b_flags |= ARC_FLAG_L2CACHE; 5141 if (l2arc_compress) 5142 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 5143 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 5144 callback->awcb_ready = ready; 5145 callback->awcb_physdone = physdone; 5146 callback->awcb_done = done; 5147 callback->awcb_private = private; 5148 callback->awcb_buf = buf; 5149 5150 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 5151 arc_write_ready, arc_write_physdone, arc_write_done, callback, 5152 priority, zio_flags, zb); 5153 5154 return (zio); 5155 } 5156 5157 static int 5158 arc_memory_throttle(uint64_t reserve, uint64_t txg) 5159 { 5160 #ifdef _KERNEL 5161 uint64_t available_memory = ptob(freemem); 5162 static uint64_t page_load = 0; 5163 static uint64_t last_txg = 0; 5164 5165 #if defined(__i386) 5166 available_memory = 5167 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 5168 #endif 5169 5170 if (freemem > physmem * arc_lotsfree_percent / 100) 5171 return (0); 5172 5173 if (txg > last_txg) { 5174 last_txg = txg; 5175 page_load = 0; 5176 } 5177 /* 5178 * If we are in pageout, we know that memory is already tight, 5179 * the arc is already going to be evicting, so we just want to 5180 * continue to let page writes occur as quickly as possible. 
5181 */ 5182 if (curproc == proc_pageout) { 5183 if (page_load > MAX(ptob(minfree), available_memory) / 4) 5184 return (SET_ERROR(ERESTART)); 5185 /* Note: reserve is inflated, so we deflate */ 5186 page_load += reserve / 8; 5187 return (0); 5188 } else if (page_load > 0 && arc_reclaim_needed()) { 5189 /* memory is low, delay before restarting */ 5190 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 5191 return (SET_ERROR(EAGAIN)); 5192 } 5193 page_load = 0; 5194 #endif 5195 return (0); 5196 } 5197 5198 void 5199 arc_tempreserve_clear(uint64_t reserve) 5200 { 5201 atomic_add_64(&arc_tempreserve, -reserve); 5202 ASSERT((int64_t)arc_tempreserve >= 0); 5203 } 5204 5205 int 5206 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 5207 { 5208 int error; 5209 uint64_t anon_size; 5210 5211 if (reserve > arc_c/4 && !arc_no_grow) 5212 arc_c = MIN(arc_c_max, reserve * 4); 5213 if (reserve > arc_c) 5214 return (SET_ERROR(ENOMEM)); 5215 5216 /* 5217 * Don't count loaned bufs as in flight dirty data to prevent long 5218 * network delays from blocking transactions that are ready to be 5219 * assigned to a txg. 5220 */ 5221 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 5222 arc_loaned_bytes), 0); 5223 5224 /* 5225 * Writes will, almost always, require additional memory allocations 5226 * in order to compress/encrypt/etc the data. We therefore need to 5227 * make sure that there is sufficient available memory for this. 5228 */ 5229 error = arc_memory_throttle(reserve, txg); 5230 if (error != 0) 5231 return (error); 5232 5233 /* 5234 * Throttle writes when the amount of dirty data in the cache 5235 * gets too large. We try to keep the cache less than half full 5236 * of dirty blocks so that our sync times don't grow too large. 5237 * Note: if two requests come in concurrently, we might let them 5238 * both succeed, when one of them should fail. Not a huge deal. 
5239 */ 5240 5241 if (reserve + arc_tempreserve + anon_size > arc_c / 2 && 5242 anon_size > arc_c / 4) { 5243 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 5244 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 5245 arc_tempreserve>>10, 5246 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 5247 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 5248 reserve>>10, arc_c>>10); 5249 return (SET_ERROR(ERESTART)); 5250 } 5251 atomic_add_64(&arc_tempreserve, reserve); 5252 return (0); 5253 } 5254 5255 static void 5256 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 5257 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 5258 { 5259 size->value.ui64 = refcount_count(&state->arcs_size); 5260 evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA]; 5261 evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA]; 5262 } 5263 5264 static int 5265 arc_kstat_update(kstat_t *ksp, int rw) 5266 { 5267 arc_stats_t *as = ksp->ks_data; 5268 5269 if (rw == KSTAT_WRITE) { 5270 return (EACCES); 5271 } else { 5272 arc_kstat_update_state(arc_anon, 5273 &as->arcstat_anon_size, 5274 &as->arcstat_anon_evictable_data, 5275 &as->arcstat_anon_evictable_metadata); 5276 arc_kstat_update_state(arc_mru, 5277 &as->arcstat_mru_size, 5278 &as->arcstat_mru_evictable_data, 5279 &as->arcstat_mru_evictable_metadata); 5280 arc_kstat_update_state(arc_mru_ghost, 5281 &as->arcstat_mru_ghost_size, 5282 &as->arcstat_mru_ghost_evictable_data, 5283 &as->arcstat_mru_ghost_evictable_metadata); 5284 arc_kstat_update_state(arc_mfu, 5285 &as->arcstat_mfu_size, 5286 &as->arcstat_mfu_evictable_data, 5287 &as->arcstat_mfu_evictable_metadata); 5288 arc_kstat_update_state(arc_mfu_ghost, 5289 &as->arcstat_mfu_ghost_size, 5290 &as->arcstat_mfu_ghost_evictable_data, 5291 &as->arcstat_mfu_ghost_evictable_metadata); 5292 } 5293 5294 return (0); 5295 } 5296 5297 /* 5298 * This function *must* return indices evenly distributed between all 5299 * sublists of the multilist. 
This is needed due to how the ARC eviction 5300 * code is laid out; arc_evict_state() assumes ARC buffers are evenly 5301 * distributed between all sublists and uses this assumption when 5302 * deciding which sublist to evict from and how much to evict from it. 5303 */ 5304 unsigned int 5305 arc_state_multilist_index_func(multilist_t *ml, void *obj) 5306 { 5307 arc_buf_hdr_t *hdr = obj; 5308 5309 /* 5310 * We rely on b_dva to generate evenly distributed index 5311 * numbers using buf_hash below. So, as an added precaution, 5312 * let's make sure we never add empty buffers to the arc lists. 5313 */ 5314 ASSERT(!BUF_EMPTY(hdr)); 5315 5316 /* 5317 * The assumption here, is the hash value for a given 5318 * arc_buf_hdr_t will remain constant throughout it's lifetime 5319 * (i.e. it's b_spa, b_dva, and b_birth fields don't change). 5320 * Thus, we don't need to store the header's sublist index 5321 * on insertion, as this index can be recalculated on removal. 5322 * 5323 * Also, the low order bits of the hash value are thought to be 5324 * distributed evenly. Otherwise, in the case that the multilist 5325 * has a power of two number of sublists, each sublists' usage 5326 * would not be evenly distributed. 5327 */ 5328 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 5329 multilist_get_num_sublists(ml)); 5330 } 5331 5332 void 5333 arc_init(void) 5334 { 5335 /* 5336 * allmem is "all memory that we could possibly use". 
5337 */ 5338 #ifdef _KERNEL 5339 uint64_t allmem = ptob(physmem - swapfs_minfree); 5340 #else 5341 uint64_t allmem = (physmem * PAGESIZE) / 2; 5342 #endif 5343 5344 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 5345 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); 5346 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); 5347 5348 mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); 5349 cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); 5350 5351 /* Convert seconds to clock ticks */ 5352 arc_min_prefetch_lifespan = 1 * hz; 5353 5354 /* Start out with 1/8 of all memory */ 5355 arc_c = allmem / 8; 5356 5357 #ifdef _KERNEL 5358 /* 5359 * On architectures where the physical memory can be larger 5360 * than the addressable space (intel in 32-bit mode), we may 5361 * need to limit the cache to 1/8 of VM size. 5362 */ 5363 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 5364 #endif 5365 5366 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 5367 arc_c_min = MAX(allmem / 32, 64 << 20); 5368 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 5369 if (allmem >= 1 << 30) 5370 arc_c_max = allmem - (1 << 30); 5371 else 5372 arc_c_max = arc_c_min; 5373 arc_c_max = MAX(allmem * 3 / 4, arc_c_max); 5374 5375 /* 5376 * In userland, there's only the memory pressure that we artificially 5377 * create (see arc_available_memory()). Don't let arc_c get too 5378 * small, because it can cause transactions to be larger than 5379 * arc_c, causing arc_tempreserve_space() to fail. 5380 */ 5381 #ifndef _KERNEL 5382 arc_c_min = arc_c_max / 2; 5383 #endif 5384 5385 /* 5386 * Allow the tunables to override our calculations if they are 5387 * reasonable (ie. 
over 64MB) 5388 */ 5389 if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem) 5390 arc_c_max = zfs_arc_max; 5391 if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max) 5392 arc_c_min = zfs_arc_min; 5393 5394 arc_c = arc_c_max; 5395 arc_p = (arc_c >> 1); 5396 5397 /* limit meta-data to 1/4 of the arc capacity */ 5398 arc_meta_limit = arc_c_max / 4; 5399 5400 /* Allow the tunable to override if it is reasonable */ 5401 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 5402 arc_meta_limit = zfs_arc_meta_limit; 5403 5404 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 5405 arc_c_min = arc_meta_limit / 2; 5406 5407 if (zfs_arc_meta_min > 0) { 5408 arc_meta_min = zfs_arc_meta_min; 5409 } else { 5410 arc_meta_min = arc_c_min / 2; 5411 } 5412 5413 if (zfs_arc_grow_retry > 0) 5414 arc_grow_retry = zfs_arc_grow_retry; 5415 5416 if (zfs_arc_shrink_shift > 0) 5417 arc_shrink_shift = zfs_arc_shrink_shift; 5418 5419 /* 5420 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
5421 */ 5422 if (arc_no_grow_shift >= arc_shrink_shift) 5423 arc_no_grow_shift = arc_shrink_shift - 1; 5424 5425 if (zfs_arc_p_min_shift > 0) 5426 arc_p_min_shift = zfs_arc_p_min_shift; 5427 5428 if (zfs_arc_num_sublists_per_state < 1) 5429 zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1); 5430 5431 /* if kmem_flags are set, lets try to use less memory */ 5432 if (kmem_debugging()) 5433 arc_c = arc_c / 2; 5434 if (arc_c < arc_c_min) 5435 arc_c = arc_c_min; 5436 5437 arc_anon = &ARC_anon; 5438 arc_mru = &ARC_mru; 5439 arc_mru_ghost = &ARC_mru_ghost; 5440 arc_mfu = &ARC_mfu; 5441 arc_mfu_ghost = &ARC_mfu_ghost; 5442 arc_l2c_only = &ARC_l2c_only; 5443 arc_size = 0; 5444 5445 multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 5446 sizeof (arc_buf_hdr_t), 5447 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5448 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5449 multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 5450 sizeof (arc_buf_hdr_t), 5451 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5452 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5453 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 5454 sizeof (arc_buf_hdr_t), 5455 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5456 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5457 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 5458 sizeof (arc_buf_hdr_t), 5459 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5460 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5461 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 5462 sizeof (arc_buf_hdr_t), 5463 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5464 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5465 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 5466 sizeof (arc_buf_hdr_t), 5467 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5468 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5469 
multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 5470 sizeof (arc_buf_hdr_t), 5471 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5472 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5473 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 5474 sizeof (arc_buf_hdr_t), 5475 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5476 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5477 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 5478 sizeof (arc_buf_hdr_t), 5479 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5480 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5481 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 5482 sizeof (arc_buf_hdr_t), 5483 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5484 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5485 5486 refcount_create(&arc_anon->arcs_size); 5487 refcount_create(&arc_mru->arcs_size); 5488 refcount_create(&arc_mru_ghost->arcs_size); 5489 refcount_create(&arc_mfu->arcs_size); 5490 refcount_create(&arc_mfu_ghost->arcs_size); 5491 refcount_create(&arc_l2c_only->arcs_size); 5492 5493 buf_init(); 5494 5495 arc_reclaim_thread_exit = FALSE; 5496 arc_user_evicts_thread_exit = FALSE; 5497 arc_eviction_list = NULL; 5498 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 5499 5500 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 5501 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 5502 5503 if (arc_ksp != NULL) { 5504 arc_ksp->ks_data = &arc_stats; 5505 arc_ksp->ks_update = arc_kstat_update; 5506 kstat_install(arc_ksp); 5507 } 5508 5509 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 5510 TS_RUN, minclsyspri); 5511 5512 (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, 5513 TS_RUN, minclsyspri); 5514 5515 arc_dead = FALSE; 5516 arc_warm = B_FALSE; 5517 5518 /* 5519 * Calculate maximum amount of dirty data per pool. 
5520 * 5521 * If it has been set by /etc/system, take that. 5522 * Otherwise, use a percentage of physical memory defined by 5523 * zfs_dirty_data_max_percent (default 10%) with a cap at 5524 * zfs_dirty_data_max_max (default 4GB). 5525 */ 5526 if (zfs_dirty_data_max == 0) { 5527 zfs_dirty_data_max = physmem * PAGESIZE * 5528 zfs_dirty_data_max_percent / 100; 5529 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 5530 zfs_dirty_data_max_max); 5531 } 5532 } 5533 5534 void 5535 arc_fini(void) 5536 { 5537 mutex_enter(&arc_reclaim_lock); 5538 arc_reclaim_thread_exit = TRUE; 5539 /* 5540 * The reclaim thread will set arc_reclaim_thread_exit back to 5541 * FALSE when it is finished exiting; we're waiting for that. 5542 */ 5543 while (arc_reclaim_thread_exit) { 5544 cv_signal(&arc_reclaim_thread_cv); 5545 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); 5546 } 5547 mutex_exit(&arc_reclaim_lock); 5548 5549 mutex_enter(&arc_user_evicts_lock); 5550 arc_user_evicts_thread_exit = TRUE; 5551 /* 5552 * The user evicts thread will set arc_user_evicts_thread_exit 5553 * to FALSE when it is finished exiting; we're waiting for that. 
5554 */ 5555 while (arc_user_evicts_thread_exit) { 5556 cv_signal(&arc_user_evicts_cv); 5557 cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); 5558 } 5559 mutex_exit(&arc_user_evicts_lock); 5560 5561 /* Use TRUE to ensure *all* buffers are evicted */ 5562 arc_flush(NULL, TRUE); 5563 5564 arc_dead = TRUE; 5565 5566 if (arc_ksp != NULL) { 5567 kstat_delete(arc_ksp); 5568 arc_ksp = NULL; 5569 } 5570 5571 mutex_destroy(&arc_reclaim_lock); 5572 cv_destroy(&arc_reclaim_thread_cv); 5573 cv_destroy(&arc_reclaim_waiters_cv); 5574 5575 mutex_destroy(&arc_user_evicts_lock); 5576 cv_destroy(&arc_user_evicts_cv); 5577 5578 refcount_destroy(&arc_anon->arcs_size); 5579 refcount_destroy(&arc_mru->arcs_size); 5580 refcount_destroy(&arc_mru_ghost->arcs_size); 5581 refcount_destroy(&arc_mfu->arcs_size); 5582 refcount_destroy(&arc_mfu_ghost->arcs_size); 5583 refcount_destroy(&arc_l2c_only->arcs_size); 5584 5585 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 5586 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 5587 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 5588 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 5589 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); 5590 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 5591 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 5592 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 5593 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 5594 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); 5595 5596 buf_fini(); 5597 5598 ASSERT0(arc_loaned_bytes); 5599 } 5600 5601 /* 5602 * Level 2 ARC 5603 * 5604 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 5605 * It uses dedicated storage devices to hold cached data, which are populated 5606 * using large infrequent writes. The main role of this cache is to boost 5607 * the performance of random read workloads. 
The intended L2ARC devices 5608 * include short-stroked disks, solid state disks, and other media with 5609 * substantially faster read latency than disk. 5610 * 5611 * +-----------------------+ 5612 * | ARC | 5613 * +-----------------------+ 5614 * | ^ ^ 5615 * | | | 5616 * l2arc_feed_thread() arc_read() 5617 * | | | 5618 * | l2arc read | 5619 * V | | 5620 * +---------------+ | 5621 * | L2ARC | | 5622 * +---------------+ | 5623 * | ^ | 5624 * l2arc_write() | | 5625 * | | | 5626 * V | | 5627 * +-------+ +-------+ 5628 * | vdev | | vdev | 5629 * | cache | | cache | 5630 * +-------+ +-------+ 5631 * +=========+ .-----. 5632 * : L2ARC : |-_____-| 5633 * : devices : | Disks | 5634 * +=========+ `-_____-' 5635 * 5636 * Read requests are satisfied from the following sources, in order: 5637 * 5638 * 1) ARC 5639 * 2) vdev cache of L2ARC devices 5640 * 3) L2ARC devices 5641 * 4) vdev cache of disks 5642 * 5) disks 5643 * 5644 * Some L2ARC device types exhibit extremely slow write performance. 5645 * To accommodate for this there are some significant differences between 5646 * the L2ARC and traditional cache design: 5647 * 5648 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5649 * the ARC behave as usual, freeing buffers and placing headers on ghost 5650 * lists. The ARC does not send buffers to the L2ARC during eviction as 5651 * this would add inflated write latencies for all ARC memory pressure. 5652 * 5653 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5654 * It does this by periodically scanning buffers from the eviction-end of 5655 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5656 * not already there. It scans until a headroom of buffers is satisfied, 5657 * which itself is a buffer for ARC eviction. 
If a compressible buffer is 5658 * found during scanning and selected for writing to an L2ARC device, we 5659 * temporarily boost scanning headroom during the next scan cycle to make 5660 * sure we adapt to compression effects (which might significantly reduce 5661 * the data volume we write to L2ARC). The thread that does this is 5662 * l2arc_feed_thread(), illustrated below; example sizes are included to 5663 * provide a better sense of ratio than this diagram: 5664 * 5665 * head --> tail 5666 * +---------------------+----------+ 5667 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5668 * +---------------------+----------+ | o L2ARC eligible 5669 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5670 * +---------------------+----------+ | 5671 * 15.9 Gbytes ^ 32 Mbytes | 5672 * headroom | 5673 * l2arc_feed_thread() 5674 * | 5675 * l2arc write hand <--[oooo]--' 5676 * | 8 Mbyte 5677 * | write max 5678 * V 5679 * +==============================+ 5680 * L2ARC dev |####|#|###|###| |####| ... | 5681 * +==============================+ 5682 * 32 Gbytes 5683 * 5684 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5685 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5686 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5687 * safe to say that this is an uncommon case, since buffers at the end of 5688 * the ARC lists have moved there due to inactivity. 5689 * 5690 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5691 * then the L2ARC simply misses copying some buffers. This serves as a 5692 * pressure valve to prevent heavy read workloads from both stalling the ARC 5693 * with waits and clogging the L2ARC with writes. This also helps prevent 5694 * the potential for the L2ARC to churn if it attempts to cache content too 5695 * quickly, such as during backups of the entire pool. 5696 * 5697 * 5. 
After system boot and before the ARC has filled main memory, there are 5698 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5699 * lists can remain mostly static. Instead of searching from tail of these 5700 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5701 * for eligible buffers, greatly increasing its chance of finding them. 5702 * 5703 * The L2ARC device write speed is also boosted during this time so that 5704 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5705 * there are no L2ARC reads, and no fear of degrading read performance 5706 * through increased writes. 5707 * 5708 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5709 * the vdev queue can aggregate them into larger and fewer writes. Each 5710 * device is written to in a rotor fashion, sweeping writes through 5711 * available space then repeating. 5712 * 5713 * 7. The L2ARC does not store dirty content. It never needs to flush 5714 * write buffers back to disk based storage. 5715 * 5716 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5717 * L2ARC, the now stale L2ARC buffer is immediately dropped. 
5718 * 5719 * The performance of the L2ARC can be tweaked by a number of tunables, which 5720 * may be necessary for different workloads: 5721 * 5722 * l2arc_write_max max write bytes per interval 5723 * l2arc_write_boost extra write bytes during device warmup 5724 * l2arc_noprefetch skip caching prefetched buffers 5725 * l2arc_headroom number of max device writes to precache 5726 * l2arc_headroom_boost when we find compressed buffers during ARC 5727 * scanning, we multiply headroom by this 5728 * percentage factor for the next scan cycle, 5729 * since more compressed buffers are likely to 5730 * be present 5731 * l2arc_feed_secs seconds between L2ARC writing 5732 * 5733 * Tunables may be removed or added as future performance improvements are 5734 * integrated, and also may become zpool properties. 5735 * 5736 * There are three key functions that control how the L2ARC warms up: 5737 * 5738 * l2arc_write_eligible() check if a buffer is eligible to cache 5739 * l2arc_write_size() calculate how much to write 5740 * l2arc_write_interval() calculate sleep delay between writes 5741 * 5742 * These three functions determine what to write, how much, and how quickly 5743 * to send writes. 5744 * 5745 * L2ARC persistency: 5746 * 5747 * When writing buffers to L2ARC, we periodically add some metadata to 5748 * make sure we can pick them up after reboot, thus dramatically reducing 5749 * the impact that any downtime has on the performance of storage systems 5750 * with large caches. 5751 * 5752 * The implementation works fairly simply by integrating the following two 5753 * modifications: 5754 * 5755 * *) Every now and then we mix in a piece of metadata (called a log block) 5756 * into the L2ARC write. This allows us to understand what's been written, 5757 * so that we can rebuild the arc_buf_hdr_t structures of the main ARC 5758 * buffers. 
The log block also includes a "2-back-reference" pointer to 5759 * the second-to-previous block, forming a back-linked list of blocks on 5760 * the L2ARC device. 5761 * 5762 * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device 5763 * for our header bookkeeping purposes. This contains a device header, 5764 * which contains our top-level reference structures. We update it each 5765 * time we write a new log block, so that we're able to locate it in the 5766 * L2ARC device. If this write results in an inconsistent device header 5767 * (e.g. due to power failure), we detect this by verifying the header's 5768 * checksum and simply drop the entries from L2ARC. 5769 * 5770 * Implementation diagram: 5771 * 5772 * +=== L2ARC device (not to scale) ======================================+ 5773 * | ___two newest log block pointers__.__________ | 5774 * | / \1 back \latest | 5775 * |.____/_. V V | 5776 * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---| 5777 * || hdr| ^ /^ /^ / / | 5778 * |+------+ ...--\-------/ \-----/--\------/ / | 5779 * | \--------------/ \--------------/ | 5780 * +======================================================================+ 5781 * 5782 * As can be seen on the diagram, rather than using a simple linked list, 5783 * we use a pair of linked lists with alternating elements. This is a 5784 * performance enhancement due to the fact that we only find out the 5785 * address of the next log block once the current block has been 5786 * completely read in. Obviously, this hurts performance, because we'd be 5787 * keeping the device's I/O queue only 1 operation deep, thus 5788 * incurring a large amount of I/O round-trip latency. Having two lists 5789 * allows us to "prefetch" two log blocks ahead of where we are currently 5790 * rebuilding L2ARC buffers.
5791 * 5792 * On-device data structures: 5793 * 5794 * L2ARC device header: l2arc_dev_hdr_phys_t 5795 * L2ARC log block: l2arc_log_blk_phys_t 5796 * 5797 * L2ARC reconstruction: 5798 * 5799 * When writing data, we simply write in the standard rotary fashion, 5800 * evicting buffers as we go and simply writing new data over them (writing 5801 * a new log block every now and then). This obviously means that once we 5802 * loop around the end of the device, we will start cutting into an already 5803 * committed log block (and its referenced data buffers), like so: 5804 * 5805 * current write head__ __old tail 5806 * \ / 5807 * V V 5808 * <--|bufs |lb |bufs |lb | |bufs |lb |bufs |lb |--> 5809 * ^ ^^^^^^^^^___________________________________ 5810 * | \ 5811 * <<nextwrite>> may overwrite this blk and/or its bufs --' 5812 * 5813 * When importing the pool, we detect this situation and use it to stop 5814 * our scanning process (see l2arc_rebuild). 5815 * 5816 * There is one significant caveat to consider when rebuilding ARC contents 5817 * from an L2ARC device: what about invalidated buffers? Given the above 5818 * construction, we cannot update blocks which we've already written to amend 5819 * them to remove buffers which were invalidated. Thus, during reconstruction, 5820 * we might be populating the cache with buffers for data that's not on the 5821 * main pool anymore, or may have been overwritten! 5822 * 5823 * As it turns out, this isn't a problem. Every arc_read request includes 5824 * both the DVA and, crucially, the birth TXG of the BP the caller is 5825 * looking for. So even if the cache were populated by completely rotten 5826 * blocks for data that had been long deleted and/or overwritten, we'll 5827 * never actually return bad data from the cache, since the DVA with the 5828 * birth TXG uniquely identify a block in space and time - once created, 5829 * a block is immutable on disk. 
The worst thing we have done is wasted 5830 * some time and memory at l2arc rebuild to reconstruct outdated ARC 5831 * entries that will get dropped from the l2arc as it is being updated 5832 * with new blocks. 5833 */ 5834 5835 static boolean_t 5836 l2arc_write_eligible(uint64_t spa_guid, uint64_t sync_txg, arc_buf_hdr_t *hdr) 5837 { 5838 /* 5839 * A buffer is *not* eligible for the L2ARC if it: 5840 * 1. belongs to a different spa. 5841 * 2. is already cached on the L2ARC. 5842 * 3. has an I/O in progress (it may be an incomplete read). 5843 * 4. is flagged not eligible (zfs property). 5844 * 5. is part of the syncing txg (and thus subject to change). 5845 */ 5846 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || 5847 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr) || 5848 hdr->b_birth >= sync_txg) 5849 return (B_FALSE); 5850 5851 return (B_TRUE); 5852 } 5853 5854 static uint64_t 5855 l2arc_write_size(void) 5856 { 5857 uint64_t size; 5858 5859 /* 5860 * Make sure our globals have meaningful values in case the user 5861 * altered them. 5862 */ 5863 size = l2arc_write_max; 5864 if (size == 0) { 5865 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5866 "be greater than zero, resetting it to the default (%d)", 5867 L2ARC_WRITE_SIZE); 5868 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5869 } 5870 5871 if (arc_warm == B_FALSE) 5872 size += l2arc_write_boost; 5873 5874 return (size); 5875 5876 } 5877 5878 static clock_t 5879 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5880 { 5881 clock_t interval, next, now; 5882 5883 /* 5884 * If the ARC lists are busy, increase our write rate; if the 5885 * lists are stale, idle back. This is achieved by checking 5886 * how much we previously wrote - if it was more than half of 5887 * what we wanted, schedule the next write much sooner. 
5888 */ 5889 if (l2arc_feed_again && wrote > (wanted / 2)) 5890 interval = (hz * l2arc_feed_min_ms) / 1000; 5891 else 5892 interval = hz * l2arc_feed_secs; 5893 5894 now = ddi_get_lbolt(); 5895 next = MAX(now, MIN(now + interval, began + interval)); 5896 5897 return (next); 5898 } 5899 5900 /* 5901 * Cycle through L2ARC devices. This is how L2ARC load balances. 5902 * If a device is returned, this also returns holding the spa config lock. 5903 */ 5904 static l2arc_dev_t * 5905 l2arc_dev_get_next(void) 5906 { 5907 l2arc_dev_t *first, *next = NULL; 5908 5909 /* 5910 * Lock out the removal of spas (spa_namespace_lock), then removal 5911 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5912 * both locks will be dropped and a spa config lock held instead. 5913 */ 5914 mutex_enter(&spa_namespace_lock); 5915 mutex_enter(&l2arc_dev_mtx); 5916 5917 /* if there are no vdevs, there is nothing to do */ 5918 if (l2arc_ndev == 0) 5919 goto out; 5920 5921 first = NULL; 5922 next = l2arc_dev_last; 5923 do { 5924 /* loop around the list looking for a non-faulted vdev */ 5925 if (next == NULL) { 5926 next = list_head(l2arc_dev_list); 5927 } else { 5928 next = list_next(l2arc_dev_list, next); 5929 if (next == NULL) 5930 next = list_head(l2arc_dev_list); 5931 } 5932 5933 /* if we have come back to the start, bail out */ 5934 if (first == NULL) 5935 first = next; 5936 else if (next == first) 5937 break; 5938 5939 } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild); 5940 5941 /* if we were unable to find any usable vdevs, return NULL */ 5942 if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild) 5943 next = NULL; 5944 5945 l2arc_dev_last = next; 5946 5947 out: 5948 mutex_exit(&l2arc_dev_mtx); 5949 5950 /* 5951 * Grab the config lock to prevent the 'next' device from being 5952 * removed while we are writing to it. 
5953 */ 5954 if (next != NULL) 5955 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5956 mutex_exit(&spa_namespace_lock); 5957 5958 return (next); 5959 } 5960 5961 /* 5962 * Free buffers that were tagged for destruction. 5963 */ 5964 static void 5965 l2arc_do_free_on_write() 5966 { 5967 list_t *buflist; 5968 l2arc_data_free_t *df, *df_prev; 5969 5970 mutex_enter(&l2arc_free_on_write_mtx); 5971 buflist = l2arc_free_on_write; 5972 5973 for (df = list_tail(buflist); df; df = df_prev) { 5974 df_prev = list_prev(buflist, df); 5975 ASSERT(df->l2df_data != NULL); 5976 ASSERT(df->l2df_func != NULL); 5977 df->l2df_func(df->l2df_data, df->l2df_size); 5978 list_remove(buflist, df); 5979 kmem_free(df, sizeof (l2arc_data_free_t)); 5980 } 5981 5982 mutex_exit(&l2arc_free_on_write_mtx); 5983 } 5984 5985 /* 5986 * A write to a cache device has completed. Update all headers to allow 5987 * reads from these buffers to begin. 5988 */ 5989 static void 5990 l2arc_write_done(zio_t *zio) 5991 { 5992 l2arc_write_callback_t *cb; 5993 l2arc_dev_t *dev; 5994 list_t *buflist; 5995 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5996 kmutex_t *hash_lock; 5997 int64_t bytes_dropped = 0; 5998 l2arc_log_blk_buf_t *lb_buf; 5999 6000 cb = zio->io_private; 6001 ASSERT(cb != NULL); 6002 dev = cb->l2wcb_dev; 6003 ASSERT(dev != NULL); 6004 head = cb->l2wcb_head; 6005 ASSERT(head != NULL); 6006 buflist = &dev->l2ad_buflist; 6007 ASSERT(buflist != NULL); 6008 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 6009 l2arc_write_callback_t *, cb); 6010 6011 if (zio->io_error != 0) 6012 ARCSTAT_BUMP(arcstat_l2_writes_error); 6013 6014 /* 6015 * All writes completed, or an error was hit. 
6016 */ 6017 top: 6018 mutex_enter(&dev->l2ad_mtx); 6019 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 6020 hdr_prev = list_prev(buflist, hdr); 6021 6022 hash_lock = HDR_LOCK(hdr); 6023 6024 /* 6025 * We cannot use mutex_enter or else we can deadlock 6026 * with l2arc_write_buffers (due to swapping the order 6027 * the hash lock and l2ad_mtx are taken). 6028 */ 6029 if (!mutex_tryenter(hash_lock)) { 6030 /* 6031 * Missed the hash lock. We must retry so we 6032 * don't leave the ARC_FLAG_L2_WRITING bit set. 6033 */ 6034 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 6035 6036 /* 6037 * We don't want to rescan the headers we've 6038 * already marked as having been written out, so 6039 * we reinsert the head node so we can pick up 6040 * where we left off. 6041 */ 6042 list_remove(buflist, head); 6043 list_insert_after(buflist, hdr, head); 6044 6045 mutex_exit(&dev->l2ad_mtx); 6046 6047 /* 6048 * We wait for the hash lock to become available 6049 * to try and prevent busy waiting, and increase 6050 * the chance we'll be able to acquire the lock 6051 * the next time around. 6052 */ 6053 mutex_enter(hash_lock); 6054 mutex_exit(hash_lock); 6055 goto top; 6056 } 6057 6058 /* 6059 * We could not have been moved into the arc_l2c_only 6060 * state while in-flight due to our ARC_FLAG_L2_WRITING 6061 * bit being set. Let's just ensure that's being enforced. 6062 */ 6063 ASSERT(HDR_HAS_L1HDR(hdr)); 6064 6065 /* 6066 * We may have allocated a buffer for L2ARC compression, 6067 * we must release it to avoid leaking this data. 6068 */ 6069 l2arc_release_cdata_buf(hdr); 6070 6071 if (zio->io_error != 0) { 6072 /* 6073 * Error - drop L2ARC entry. 
6074 */ 6075 list_remove(buflist, hdr); 6076 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 6077 6078 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 6079 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 6080 6081 bytes_dropped += hdr->b_l2hdr.b_asize; 6082 (void) refcount_remove_many(&dev->l2ad_alloc, 6083 hdr->b_l2hdr.b_asize, hdr); 6084 } 6085 6086 /* 6087 * Allow ARC to begin reads and ghost list evictions to 6088 * this L2ARC entry. 6089 */ 6090 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 6091 6092 mutex_exit(hash_lock); 6093 } 6094 6095 atomic_inc_64(&l2arc_writes_done); 6096 list_remove(buflist, head); 6097 ASSERT(!HDR_HAS_L1HDR(head)); 6098 kmem_cache_free(hdr_l2only_cache, head); 6099 mutex_exit(&dev->l2ad_mtx); 6100 6101 ASSERT(dev->l2ad_vdev != NULL); 6102 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 6103 6104 l2arc_do_free_on_write(); 6105 6106 while ((lb_buf = list_remove_tail(&cb->l2wcb_log_blk_buflist)) != NULL) 6107 kmem_free(lb_buf, sizeof (*lb_buf)); 6108 list_destroy(&cb->l2wcb_log_blk_buflist); 6109 kmem_free(cb, sizeof (l2arc_write_callback_t)); 6110 } 6111 6112 /* 6113 * A read to a cache device completed. Validate buffer contents before 6114 * handing over to the regular ARC routines. 6115 */ 6116 static void 6117 l2arc_read_done(zio_t *zio) 6118 { 6119 l2arc_read_callback_t *cb; 6120 arc_buf_hdr_t *hdr; 6121 arc_buf_t *buf; 6122 kmutex_t *hash_lock; 6123 int equal; 6124 6125 ASSERT(zio->io_vd != NULL); 6126 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 6127 6128 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 6129 6130 cb = zio->io_private; 6131 ASSERT(cb != NULL); 6132 buf = cb->l2rcb_buf; 6133 ASSERT(buf != NULL); 6134 6135 hash_lock = HDR_LOCK(buf->b_hdr); 6136 mutex_enter(hash_lock); 6137 hdr = buf->b_hdr; 6138 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 6139 6140 /* 6141 * If the buffer was compressed, decompress it first. 
6142 */ 6143 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 6144 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 6145 ASSERT(zio->io_data != NULL); 6146 ASSERT3U(zio->io_size, ==, hdr->b_size); 6147 ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); 6148 6149 /* 6150 * Check this survived the L2ARC journey. 6151 */ 6152 equal = arc_cksum_equal(buf); 6153 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 6154 mutex_exit(hash_lock); 6155 zio->io_private = buf; 6156 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 6157 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 6158 arc_read_done(zio); 6159 } else { 6160 mutex_exit(hash_lock); 6161 /* 6162 * Buffer didn't survive caching. Increment stats and 6163 * reissue to the original storage device. 6164 */ 6165 if (zio->io_error != 0) { 6166 ARCSTAT_BUMP(arcstat_l2_io_error); 6167 } else { 6168 zio->io_error = SET_ERROR(EIO); 6169 } 6170 if (!equal) 6171 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 6172 6173 /* 6174 * If there's no waiter, issue an async i/o to the primary 6175 * storage now. If there *is* a waiter, the caller must 6176 * issue the i/o in a context where it's OK to block. 6177 */ 6178 if (zio->io_waiter == NULL) { 6179 zio_t *pio = zio_unique_parent(zio); 6180 6181 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 6182 6183 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 6184 buf->b_data, hdr->b_size, arc_read_done, buf, 6185 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 6186 } 6187 } 6188 6189 kmem_free(cb, sizeof (l2arc_read_callback_t)); 6190 } 6191 6192 /* 6193 * This is the list priority from which the L2ARC will search for pages to 6194 * cache. This is used within loops (0..3) to cycle through lists in the 6195 * desired order. This order can have a significant effect on cache 6196 * performance. 6197 * 6198 * Currently the metadata lists are hit first, MFU then MRU, followed by 6199 * the data lists. 
This function returns a locked list, and also returns 6200 * the lock pointer. 6201 */ 6202 static multilist_sublist_t * 6203 l2arc_sublist_lock(int list_num) 6204 { 6205 multilist_t *ml = NULL; 6206 unsigned int idx; 6207 6208 ASSERT(list_num >= 0 && list_num <= 3); 6209 6210 switch (list_num) { 6211 case 0: 6212 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 6213 break; 6214 case 1: 6215 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 6216 break; 6217 case 2: 6218 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 6219 break; 6220 case 3: 6221 ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; 6222 break; 6223 } 6224 6225 /* 6226 * Return a randomly-selected sublist. This is acceptable 6227 * because the caller feeds only a little bit of data for each 6228 * call (8MB). Subsequent calls will result in different 6229 * sublists being selected. 6230 */ 6231 idx = multilist_get_random_index(ml); 6232 return (multilist_sublist_lock(ml, idx)); 6233 } 6234 6235 /* 6236 * Calculates the maximum overhead of L2ARC metadata log blocks for a given 6237 * L2ARC write size. l2arc_evict and l2arc_write_buffers need to include this 6238 * overhead in processing to make sure there is enough headroom available 6239 * when writing buffers. 6240 */ 6241 static inline uint64_t 6242 l2arc_log_blk_overhead(uint64_t write_sz) 6243 { 6244 return ((write_sz / SPA_MINBLOCKSIZE / L2ARC_LOG_BLK_ENTRIES) + 1) * 6245 L2ARC_LOG_BLK_SIZE; 6246 } 6247 6248 /* 6249 * Evict buffers from the device write hand to the distance specified in 6250 * bytes. This distance may span populated buffers, it may span nothing. 6251 * This is clearing a region on the L2ARC device ready for writing. 6252 * If the 'all' boolean is set, every buffer is evicted. 
6253 */ 6254 static void 6255 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 6256 { 6257 list_t *buflist; 6258 arc_buf_hdr_t *hdr, *hdr_prev; 6259 kmutex_t *hash_lock; 6260 uint64_t taddr; 6261 6262 buflist = &dev->l2ad_buflist; 6263 6264 if (!all && dev->l2ad_first) { 6265 /* 6266 * This is the first sweep through the device. There is 6267 * nothing to evict. 6268 */ 6269 return; 6270 } 6271 6272 /* 6273 * We need to add in the worst case scenario of log block overhead. 6274 */ 6275 distance += l2arc_log_blk_overhead(distance); 6276 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 6277 /* 6278 * When nearing the end of the device, evict to the end 6279 * before the device write hand jumps to the start. 6280 */ 6281 taddr = dev->l2ad_end; 6282 } else { 6283 taddr = dev->l2ad_hand + distance; 6284 } 6285 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 6286 uint64_t, taddr, boolean_t, all); 6287 6288 top: 6289 mutex_enter(&dev->l2ad_mtx); 6290 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 6291 hdr_prev = list_prev(buflist, hdr); 6292 6293 hash_lock = HDR_LOCK(hdr); 6294 6295 /* 6296 * We cannot use mutex_enter or else we can deadlock 6297 * with l2arc_write_buffers (due to swapping the order 6298 * the hash lock and l2ad_mtx are taken). 6299 */ 6300 if (!mutex_tryenter(hash_lock)) { 6301 /* 6302 * Missed the hash lock. Retry. 6303 */ 6304 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 6305 mutex_exit(&dev->l2ad_mtx); 6306 mutex_enter(hash_lock); 6307 mutex_exit(hash_lock); 6308 goto top; 6309 } 6310 6311 if (HDR_L2_WRITE_HEAD(hdr)) { 6312 /* 6313 * We hit a write head node. Leave it for 6314 * l2arc_write_done(). 6315 */ 6316 list_remove(buflist, hdr); 6317 mutex_exit(hash_lock); 6318 continue; 6319 } 6320 6321 if (!all && HDR_HAS_L2HDR(hdr) && 6322 (hdr->b_l2hdr.b_daddr > taddr || 6323 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 6324 /* 6325 * We've evicted to the target address, 6326 * or the end of the device. 
6327 */ 6328 mutex_exit(hash_lock); 6329 break; 6330 } 6331 6332 ASSERT(HDR_HAS_L2HDR(hdr)); 6333 if (!HDR_HAS_L1HDR(hdr)) { 6334 ASSERT(!HDR_L2_READING(hdr)); 6335 /* 6336 * This doesn't exist in the ARC. Destroy. 6337 * arc_hdr_destroy() will call list_remove() 6338 * and decrement arcstat_l2_size. 6339 */ 6340 arc_change_state(arc_anon, hdr, hash_lock); 6341 arc_hdr_destroy(hdr); 6342 } else { 6343 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 6344 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 6345 /* 6346 * Invalidate issued or about to be issued 6347 * reads, since we may be about to write 6348 * over this location. 6349 */ 6350 if (HDR_L2_READING(hdr)) { 6351 ARCSTAT_BUMP(arcstat_l2_evict_reading); 6352 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 6353 } 6354 6355 /* Ensure this header has finished being written */ 6356 ASSERT(!HDR_L2_WRITING(hdr)); 6357 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 6358 6359 arc_hdr_l2hdr_destroy(hdr); 6360 } 6361 mutex_exit(hash_lock); 6362 } 6363 mutex_exit(&dev->l2ad_mtx); 6364 } 6365 6366 /* 6367 * Find and write ARC buffers to the L2ARC device. 6368 * 6369 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 6370 * for reading until they have completed writing. 6371 * The headroom_boost is an in-out parameter used to maintain headroom boost 6372 * state between calls to this function. 6373 * 6374 * Returns the number of bytes actually written (which may be smaller than 6375 * the delta by which the device hand has changed due to alignment). 
6376 */ 6377 static uint64_t 6378 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 6379 boolean_t *headroom_boost) 6380 { 6381 arc_buf_hdr_t *hdr, *hdr_prev, *head; 6382 uint64_t write_asize, write_sz, headroom, 6383 buf_compress_minsz; 6384 void *buf_data; 6385 boolean_t full; 6386 l2arc_write_callback_t *cb; 6387 zio_t *pio, *wzio; 6388 uint64_t guid = spa_load_guid(spa); 6389 uint64_t sync_txg = spa_syncing_txg(spa); 6390 const boolean_t do_headroom_boost = *headroom_boost; 6391 boolean_t dev_hdr_update = B_FALSE; 6392 6393 ASSERT(dev->l2ad_vdev != NULL); 6394 6395 /* Lower the flag now, we might want to raise it again later. */ 6396 *headroom_boost = B_FALSE; 6397 6398 pio = NULL; 6399 cb = NULL; 6400 write_sz = write_asize = 0; 6401 full = B_FALSE; 6402 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 6403 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 6404 head->b_flags |= ARC_FLAG_HAS_L2HDR; 6405 6406 /* 6407 * We will want to try to compress buffers that are at least 2x the 6408 * device sector size. 6409 */ 6410 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 6411 6412 /* 6413 * Copy buffers for L2ARC writing. 6414 */ 6415 for (int try = 0; try <= 3; try++) { 6416 multilist_sublist_t *mls = l2arc_sublist_lock(try); 6417 uint64_t passed_sz = 0; 6418 6419 /* 6420 * L2ARC fast warmup. 6421 * 6422 * Until the ARC is warm and starts to evict, read from the 6423 * head of the ARC lists rather than the tail. 
6424 */ 6425 if (arc_warm == B_FALSE) 6426 hdr = multilist_sublist_head(mls); 6427 else 6428 hdr = multilist_sublist_tail(mls); 6429 6430 headroom = target_sz * l2arc_headroom; 6431 if (do_headroom_boost) 6432 headroom = (headroom * l2arc_headroom_boost) / 100; 6433 6434 for (; hdr; hdr = hdr_prev) { 6435 kmutex_t *hash_lock; 6436 uint64_t buf_sz; 6437 uint64_t buf_a_sz; 6438 6439 if (arc_warm == B_FALSE) 6440 hdr_prev = multilist_sublist_next(mls, hdr); 6441 else 6442 hdr_prev = multilist_sublist_prev(mls, hdr); 6443 6444 hash_lock = HDR_LOCK(hdr); 6445 if (!mutex_tryenter(hash_lock)) { 6446 /* 6447 * Skip this buffer rather than waiting. 6448 */ 6449 continue; 6450 } 6451 6452 passed_sz += hdr->b_size; 6453 if (passed_sz > headroom) { 6454 /* 6455 * Searched too far. 6456 */ 6457 mutex_exit(hash_lock); 6458 break; 6459 } 6460 6461 if (!l2arc_write_eligible(guid, sync_txg, hdr)) { 6462 mutex_exit(hash_lock); 6463 continue; 6464 } 6465 6466 /* 6467 * Assume that the buffer is not going to be compressed 6468 * and could take more space on disk because of a larger 6469 * disk block size. 6470 */ 6471 buf_sz = hdr->b_size; 6472 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 6473 6474 if ((write_asize + buf_a_sz) > target_sz) { 6475 full = B_TRUE; 6476 mutex_exit(hash_lock); 6477 break; 6478 } 6479 6480 if (pio == NULL) { 6481 /* 6482 * Insert a dummy header on the buflist so 6483 * l2arc_write_done() can find where the 6484 * write buffers begin without searching. 
6485 */ 6486 mutex_enter(&dev->l2ad_mtx); 6487 list_insert_head(&dev->l2ad_buflist, head); 6488 mutex_exit(&dev->l2ad_mtx); 6489 6490 cb = kmem_zalloc( 6491 sizeof (l2arc_write_callback_t), KM_SLEEP); 6492 cb->l2wcb_dev = dev; 6493 cb->l2wcb_head = head; 6494 list_create(&cb->l2wcb_log_blk_buflist, 6495 sizeof (l2arc_log_blk_buf_t), 6496 offsetof(l2arc_log_blk_buf_t, lbb_node)); 6497 pio = zio_root(spa, l2arc_write_done, cb, 6498 ZIO_FLAG_CANFAIL); 6499 } 6500 6501 /* 6502 * Create and add a new L2ARC header. 6503 */ 6504 hdr->b_l2hdr.b_dev = dev; 6505 hdr->b_flags |= ARC_FLAG_L2_WRITING; 6506 /* 6507 * Temporarily stash the data buffer in b_tmp_cdata. 6508 * The subsequent write step will pick it up from 6509 * there. This is because can't access b_l1hdr.b_buf 6510 * without holding the hash_lock, which we in turn 6511 * can't access without holding the ARC list locks 6512 * (which we want to avoid during compression/writing). 6513 */ 6514 hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; 6515 hdr->b_l2hdr.b_asize = hdr->b_size; 6516 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; 6517 6518 /* 6519 * Explicitly set the b_daddr field to a known 6520 * value which means "invalid address". This 6521 * enables us to differentiate which stage of 6522 * l2arc_write_buffers() the particular header 6523 * is in (e.g. this loop, or the one below). 6524 * ARC_FLAG_L2_WRITING is not enough to make 6525 * this distinction, and we need to know in 6526 * order to do proper l2arc vdev accounting in 6527 * arc_release() and arc_hdr_destroy(). 6528 * 6529 * Note, we can't use a new flag to distinguish 6530 * the two stages because we don't hold the 6531 * header's hash_lock below, in the second stage 6532 * of this function. Thus, we can't simply 6533 * change the b_flags field to denote that the 6534 * IO has been sent. 
We can change the b_daddr 6535 * field of the L2 portion, though, since we'll 6536 * be holding the l2ad_mtx; which is why we're 6537 * using it to denote the header's state change. 6538 */ 6539 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; 6540 6541 hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 6542 6543 mutex_enter(&dev->l2ad_mtx); 6544 list_insert_head(&dev->l2ad_buflist, hdr); 6545 mutex_exit(&dev->l2ad_mtx); 6546 6547 /* 6548 * Compute and store the buffer cksum before 6549 * writing. On debug the cksum is verified first. 6550 */ 6551 arc_cksum_verify(hdr->b_l1hdr.b_buf); 6552 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); 6553 6554 mutex_exit(hash_lock); 6555 6556 write_sz += buf_sz; 6557 write_asize += buf_a_sz; 6558 } 6559 6560 multilist_sublist_unlock(mls); 6561 6562 if (full == B_TRUE) 6563 break; 6564 } 6565 6566 /* No buffers selected for writing? */ 6567 if (pio == NULL) { 6568 ASSERT0(write_sz); 6569 ASSERT(!HDR_HAS_L1HDR(head)); 6570 kmem_cache_free(hdr_l2only_cache, head); 6571 return (0); 6572 } 6573 6574 mutex_enter(&dev->l2ad_mtx); 6575 6576 /* 6577 * Note that elsewhere in this file arcstat_l2_asize 6578 * and the used space on l2ad_vdev are updated using b_asize, 6579 * which is not necessarily rounded up to the device block size. 6580 * Too keep accounting consistent we do the same here as well: 6581 * stats_size accumulates the sum of b_asize of the written buffers, 6582 * while write_asize accumulates the sum of b_asize rounded up 6583 * to the device block size. 6584 * The latter sum is used only to validate the corectness of the code. 6585 */ 6586 uint64_t stats_size = 0; 6587 write_asize = 0; 6588 6589 /* 6590 * Now start writing the buffers. We're starting at the write head 6591 * and work backwards, retracing the course of the buffer selector 6592 * loop above. 
6593 */ 6594 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; 6595 hdr = list_prev(&dev->l2ad_buflist, hdr)) { 6596 uint64_t buf_sz; 6597 6598 /* 6599 * We rely on the L1 portion of the header below, so 6600 * it's invalid for this header to have been evicted out 6601 * of the ghost cache, prior to being written out. The 6602 * ARC_FLAG_L2_WRITING bit ensures this won't happen. 6603 */ 6604 ASSERT(HDR_HAS_L1HDR(hdr)); 6605 6606 /* 6607 * We shouldn't need to lock the buffer here, since we flagged 6608 * it as ARC_FLAG_L2_WRITING in the previous step, but we must 6609 * take care to only access its L2 cache parameters. In 6610 * particular, hdr->l1hdr.b_buf may be invalid by now due to 6611 * ARC eviction. 6612 */ 6613 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 6614 6615 if ((HDR_L2COMPRESS(hdr)) && 6616 hdr->b_l2hdr.b_asize >= buf_compress_minsz) { 6617 if (l2arc_compress_buf(hdr)) { 6618 /* 6619 * If compression succeeded, enable headroom 6620 * boost on the next scan cycle. 6621 */ 6622 *headroom_boost = B_TRUE; 6623 } 6624 } 6625 6626 /* 6627 * Pick up the buffer data we had previously stashed away 6628 * (and now potentially also compressed). 6629 */ 6630 buf_data = hdr->b_l1hdr.b_tmp_cdata; 6631 buf_sz = hdr->b_l2hdr.b_asize; 6632 6633 /* 6634 * We need to do this regardless if buf_sz is zero or 6635 * not, otherwise, when this l2hdr is evicted we'll 6636 * remove a reference that was never added. 6637 */ 6638 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); 6639 6640 /* Compression may have squashed the buffer to zero length. 
*/ 6641 if (buf_sz != 0) { 6642 uint64_t buf_a_sz; 6643 6644 wzio = zio_write_phys(pio, dev->l2ad_vdev, 6645 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 6646 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 6647 ZIO_FLAG_CANFAIL, B_FALSE); 6648 6649 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 6650 zio_t *, wzio); 6651 (void) zio_nowait(wzio); 6652 6653 stats_size += buf_sz; 6654 6655 /* 6656 * Keep the clock hand suitably device-aligned. 6657 */ 6658 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 6659 write_asize += buf_a_sz; 6660 dev->l2ad_hand += buf_a_sz; 6661 } 6662 6663 /* 6664 * Append buf info to current log and commit if full. 6665 * arcstat_l2_{size,asize} kstats are updated internally. 6666 */ 6667 if (l2arc_log_blk_insert(dev, hdr)) { 6668 l2arc_log_blk_commit(dev, pio, cb); 6669 dev_hdr_update = B_TRUE; 6670 } 6671 } 6672 6673 mutex_exit(&dev->l2ad_mtx); 6674 6675 /* 6676 * If we wrote any logs as part of this write, update dev hdr 6677 * to point to it. 6678 */ 6679 if (dev_hdr_update) 6680 l2arc_dev_hdr_update(dev, pio); 6681 6682 VERIFY3U(write_asize, <=, target_sz); 6683 ARCSTAT_BUMP(arcstat_l2_writes_sent); 6684 ARCSTAT_INCR(arcstat_l2_write_bytes, stats_size); 6685 ARCSTAT_INCR(arcstat_l2_size, write_sz); 6686 ARCSTAT_INCR(arcstat_l2_asize, stats_size); 6687 vdev_space_update(dev->l2ad_vdev, write_asize, 0, 0); 6688 6689 /* 6690 * Bump device hand to the device start if it is approaching the end. 6691 * l2arc_evict() will already have evicted ahead for this case. 6692 */ 6693 if (dev->l2ad_hand + target_sz + l2arc_log_blk_overhead(target_sz) >= 6694 dev->l2ad_end) { 6695 dev->l2ad_hand = dev->l2ad_start; 6696 dev->l2ad_first = B_FALSE; 6697 } 6698 6699 dev->l2ad_writing = B_TRUE; 6700 (void) zio_wait(pio); 6701 dev->l2ad_writing = B_FALSE; 6702 6703 return (stats_size); 6704 } 6705 6706 /* 6707 * Compresses an L2ARC buffer. 
6708 * The data to be compressed must be prefilled in l1hdr.b_tmp_cdata and its 6709 * size in l2hdr->b_asize. This routine tries to compress the data and 6710 * depending on the compression result there are three possible outcomes: 6711 * *) The buffer was incompressible. The original l2hdr contents were left 6712 * untouched and are ready for writing to an L2 device. 6713 * *) The buffer was all-zeros, so there is no need to write it to an L2 6714 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 6715 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 6716 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 6717 * data buffer which holds the compressed data to be written, and b_asize 6718 * tells us how much data there is. b_compress is set to the appropriate 6719 * compression algorithm. Once writing is done, invoke 6720 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 6721 * 6722 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 6723 * buffer was incompressible). 
6724 */ 6725 static boolean_t 6726 l2arc_compress_buf(arc_buf_hdr_t *hdr) 6727 { 6728 void *cdata; 6729 size_t csize, len, rounded; 6730 ASSERT(HDR_HAS_L2HDR(hdr)); 6731 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 6732 6733 ASSERT(HDR_HAS_L1HDR(hdr)); 6734 ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); 6735 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6736 6737 len = l2hdr->b_asize; 6738 cdata = zio_data_buf_alloc(len); 6739 ASSERT3P(cdata, !=, NULL); 6740 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 6741 cdata, l2hdr->b_asize); 6742 6743 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); 6744 if (rounded > csize) { 6745 bzero((char *)cdata + csize, rounded - csize); 6746 csize = rounded; 6747 } 6748 6749 if (csize == 0) { 6750 /* zero block, indicate that there's nothing to write */ 6751 zio_data_buf_free(cdata, len); 6752 l2hdr->b_compress = ZIO_COMPRESS_EMPTY; 6753 l2hdr->b_asize = 0; 6754 hdr->b_l1hdr.b_tmp_cdata = NULL; 6755 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 6756 return (B_TRUE); 6757 } else if (csize > 0 && csize < len) { 6758 /* 6759 * Compression succeeded, we'll keep the cdata around for 6760 * writing and release it afterwards. 6761 */ 6762 l2hdr->b_compress = ZIO_COMPRESS_LZ4; 6763 l2hdr->b_asize = csize; 6764 hdr->b_l1hdr.b_tmp_cdata = cdata; 6765 ARCSTAT_BUMP(arcstat_l2_compress_successes); 6766 return (B_TRUE); 6767 } else { 6768 /* 6769 * Compression failed, release the compressed buffer. 6770 * l2hdr will be left unmodified. 6771 */ 6772 zio_data_buf_free(cdata, len); 6773 ARCSTAT_BUMP(arcstat_l2_compress_failures); 6774 return (B_FALSE); 6775 } 6776 } 6777 6778 /* 6779 * Decompresses a zio read back from an l2arc device. On success, the 6780 * underlying zio's io_data buffer is overwritten by the uncompressed 6781 * version. On decompression error (corrupt compressed stream), the 6782 * zio->io_error value is set to signal an I/O error. 
6783 * 6784 * Please note that the compressed data stream is not checksummed, so 6785 * if the underlying device is experiencing data corruption, we may feed 6786 * corrupt data to the decompressor, so the decompressor needs to be 6787 * able to handle this situation (LZ4 does). 6788 */ 6789 static void 6790 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 6791 { 6792 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 6793 6794 if (zio->io_error != 0) { 6795 /* 6796 * An io error has occured, just restore the original io 6797 * size in preparation for a main pool read. 6798 */ 6799 zio->io_orig_size = zio->io_size = hdr->b_size; 6800 return; 6801 } 6802 6803 if (c == ZIO_COMPRESS_EMPTY) { 6804 /* 6805 * An empty buffer results in a null zio, which means we 6806 * need to fill its io_data after we're done restoring the 6807 * buffer's contents. 6808 */ 6809 ASSERT(hdr->b_l1hdr.b_buf != NULL); 6810 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); 6811 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; 6812 } else { 6813 ASSERT(zio->io_data != NULL); 6814 /* 6815 * We copy the compressed data from the start of the arc buffer 6816 * (the zio_read will have pulled in only what we need, the 6817 * rest is garbage which we will overwrite at decompression) 6818 * and then decompress back to the ARC data buffer. This way we 6819 * can minimize copying by simply decompressing back over the 6820 * original compressed data (rather than decompressing to an 6821 * aux buffer and then copying back the uncompressed buffer, 6822 * which is likely to be much larger). 6823 */ 6824 uint64_t csize; 6825 void *cdata; 6826 6827 csize = zio->io_size; 6828 cdata = zio_data_buf_alloc(csize); 6829 bcopy(zio->io_data, cdata, csize); 6830 if (zio_decompress_data(c, cdata, zio->io_data, csize, 6831 hdr->b_size) != 0) 6832 zio->io_error = EIO; 6833 zio_data_buf_free(cdata, csize); 6834 } 6835 6836 /* Restore the expected uncompressed IO size. 
*/ 6837 zio->io_orig_size = zio->io_size = hdr->b_size; 6838 } 6839 6840 /* 6841 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 6842 * This buffer serves as a temporary holder of compressed data while 6843 * the buffer entry is being written to an l2arc device. Once that is 6844 * done, we can dispose of it. 6845 */ 6846 static void 6847 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) 6848 { 6849 ASSERT(HDR_HAS_L2HDR(hdr)); 6850 enum zio_compress comp = hdr->b_l2hdr.b_compress; 6851 6852 ASSERT(HDR_HAS_L1HDR(hdr)); 6853 ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); 6854 6855 if (comp == ZIO_COMPRESS_OFF) { 6856 /* 6857 * In this case, b_tmp_cdata points to the same buffer 6858 * as the arc_buf_t's b_data field. We don't want to 6859 * free it, since the arc_buf_t will handle that. 6860 */ 6861 hdr->b_l1hdr.b_tmp_cdata = NULL; 6862 } else if (comp == ZIO_COMPRESS_EMPTY) { 6863 /* 6864 * In this case, b_tmp_cdata was compressed to an empty 6865 * buffer, thus there's nothing to free and b_tmp_cdata 6866 * should have been set to NULL in l2arc_write_buffers(). 6867 */ 6868 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 6869 } else { 6870 /* 6871 * If the data was compressed, then we've allocated a 6872 * temporary buffer for it, so now we need to release it. 6873 */ 6874 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6875 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, 6876 hdr->b_size); 6877 hdr->b_l1hdr.b_tmp_cdata = NULL; 6878 } 6879 6880 } 6881 6882 /* 6883 * This thread feeds the L2ARC at regular intervals. This is the beating 6884 * heart of the L2ARC. 
6885 */ 6886 static void 6887 l2arc_feed_thread(void) 6888 { 6889 callb_cpr_t cpr; 6890 l2arc_dev_t *dev; 6891 spa_t *spa; 6892 uint64_t size, wrote; 6893 clock_t begin, next = ddi_get_lbolt(); 6894 boolean_t headroom_boost = B_FALSE; 6895 6896 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6897 6898 mutex_enter(&l2arc_feed_thr_lock); 6899 6900 while (l2arc_thread_exit == 0) { 6901 CALLB_CPR_SAFE_BEGIN(&cpr); 6902 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6903 next); 6904 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6905 next = ddi_get_lbolt() + hz; 6906 6907 /* 6908 * Quick check for L2ARC devices. 6909 */ 6910 mutex_enter(&l2arc_dev_mtx); 6911 if (l2arc_ndev == 0) { 6912 mutex_exit(&l2arc_dev_mtx); 6913 continue; 6914 } 6915 mutex_exit(&l2arc_dev_mtx); 6916 begin = ddi_get_lbolt(); 6917 6918 /* 6919 * This selects the next l2arc device to write to, and in 6920 * doing so the next spa to feed from: dev->l2ad_spa. This 6921 * will return NULL if there are now no l2arc devices or if 6922 * they are all faulted. 6923 * 6924 * If a device is returned, its spa's config lock is also 6925 * held to prevent device removal. l2arc_dev_get_next() 6926 * will grab and release l2arc_dev_mtx. 6927 */ 6928 if ((dev = l2arc_dev_get_next()) == NULL) 6929 continue; 6930 6931 spa = dev->l2ad_spa; 6932 ASSERT(spa != NULL); 6933 6934 /* 6935 * If the pool is read-only then force the feed thread to 6936 * sleep a little longer. 6937 */ 6938 if (!spa_writeable(spa)) { 6939 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6940 spa_config_exit(spa, SCL_L2ARC, dev); 6941 continue; 6942 } 6943 6944 /* 6945 * Avoid contributing to memory pressure. 6946 */ 6947 if (arc_reclaim_needed()) { 6948 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6949 spa_config_exit(spa, SCL_L2ARC, dev); 6950 continue; 6951 } 6952 6953 ARCSTAT_BUMP(arcstat_l2_feeds); 6954 6955 size = l2arc_write_size(); 6956 6957 /* 6958 * Evict L2ARC buffers that will be overwritten. 
6959 */ 6960 l2arc_evict(dev, size, B_FALSE); 6961 6962 /* 6963 * Write ARC buffers. 6964 */ 6965 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6966 6967 /* 6968 * Calculate interval between writes. 6969 */ 6970 next = l2arc_write_interval(begin, size, wrote); 6971 spa_config_exit(spa, SCL_L2ARC, dev); 6972 } 6973 6974 l2arc_thread_exit = 0; 6975 cv_broadcast(&l2arc_feed_thr_cv); 6976 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6977 thread_exit(); 6978 } 6979 6980 boolean_t 6981 l2arc_vdev_present(vdev_t *vd) 6982 { 6983 return (l2arc_vdev_get(vd) != NULL); 6984 } 6985 6986 /* 6987 * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if 6988 * the vdev_t isn't an L2ARC device. 6989 */ 6990 static l2arc_dev_t * 6991 l2arc_vdev_get(vdev_t *vd) 6992 { 6993 l2arc_dev_t *dev; 6994 boolean_t held = MUTEX_HELD(&l2arc_dev_mtx); 6995 6996 if (!held) 6997 mutex_enter(&l2arc_dev_mtx); 6998 for (dev = list_head(l2arc_dev_list); dev != NULL; 6999 dev = list_next(l2arc_dev_list, dev)) { 7000 if (dev->l2ad_vdev == vd) 7001 break; 7002 } 7003 if (!held) 7004 mutex_exit(&l2arc_dev_mtx); 7005 7006 return (dev); 7007 } 7008 7009 /* 7010 * Add a vdev for use by the L2ARC. By this point the spa has already 7011 * validated the vdev and opened it. The `rebuild' flag indicates whether 7012 * we should attempt an L2ARC persistency rebuild. 7013 */ 7014 void 7015 l2arc_add_vdev(spa_t *spa, vdev_t *vd, boolean_t rebuild) 7016 { 7017 l2arc_dev_t *adddev; 7018 7019 ASSERT(!l2arc_vdev_present(vd)); 7020 7021 /* 7022 * Create a new l2arc device entry. 
7023 */ 7024 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 7025 adddev->l2ad_spa = spa; 7026 adddev->l2ad_vdev = vd; 7027 /* leave extra size for an l2arc device header */ 7028 adddev->l2ad_dev_hdr_asize = MAX(sizeof (*adddev->l2ad_dev_hdr), 7029 1 << vd->vdev_ashift); 7030 adddev->l2ad_start = VDEV_LABEL_START_SIZE + adddev->l2ad_dev_hdr_asize; 7031 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 7032 ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); 7033 adddev->l2ad_hand = adddev->l2ad_start; 7034 adddev->l2ad_first = B_TRUE; 7035 adddev->l2ad_writing = B_FALSE; 7036 adddev->l2ad_dev_hdr = kmem_zalloc(adddev->l2ad_dev_hdr_asize, 7037 KM_SLEEP); 7038 7039 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 7040 /* 7041 * This is a list of all ARC buffers that are still valid on the 7042 * device. 7043 */ 7044 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 7045 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 7046 7047 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 7048 refcount_create(&adddev->l2ad_alloc); 7049 7050 /* 7051 * Add device to global list 7052 */ 7053 mutex_enter(&l2arc_dev_mtx); 7054 list_insert_head(l2arc_dev_list, adddev); 7055 atomic_inc_64(&l2arc_ndev); 7056 if (rebuild && l2arc_rebuild_enabled && 7057 adddev->l2ad_end - adddev->l2ad_start > L2ARC_PERSIST_MIN_SIZE) { 7058 /* 7059 * Just mark the device as pending for a rebuild. We won't 7060 * be starting a rebuild in line here as it would block pool 7061 * import. Instead spa_load_impl will hand that off to an 7062 * async task which will call l2arc_spa_rebuild_start. 7063 */ 7064 adddev->l2ad_rebuild = B_TRUE; 7065 } 7066 mutex_exit(&l2arc_dev_mtx); 7067 } 7068 7069 /* 7070 * Remove a vdev from the L2ARC. 
7071 */ 7072 void 7073 l2arc_remove_vdev(vdev_t *vd) 7074 { 7075 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 7076 7077 /* 7078 * Find the device by vdev 7079 */ 7080 mutex_enter(&l2arc_dev_mtx); 7081 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 7082 nextdev = list_next(l2arc_dev_list, dev); 7083 if (vd == dev->l2ad_vdev) { 7084 remdev = dev; 7085 break; 7086 } 7087 } 7088 ASSERT(remdev != NULL); 7089 7090 /* 7091 * Cancel any ongoing or scheduled rebuild (race protection with 7092 * l2arc_spa_rebuild_start provided via l2arc_dev_mtx). 7093 */ 7094 remdev->l2ad_rebuild_cancel = B_TRUE; 7095 if (remdev->l2ad_rebuild_did != 0) { 7096 /* 7097 * N.B. it should be safe to thread_join with the rebuild 7098 * thread while holding l2arc_dev_mtx because it is not 7099 * accessed from anywhere in the l2arc rebuild code below 7100 * (except for l2arc_spa_rebuild_start, which is ok). 7101 */ 7102 thread_join(remdev->l2ad_rebuild_did); 7103 } 7104 7105 /* 7106 * Remove device from global list 7107 */ 7108 list_remove(l2arc_dev_list, remdev); 7109 l2arc_dev_last = NULL; /* may have been invalidated */ 7110 atomic_dec_64(&l2arc_ndev); 7111 mutex_exit(&l2arc_dev_mtx); 7112 7113 /* 7114 * Clear all buflists and ARC references. L2ARC device flush. 
7115 */ 7116 l2arc_evict(remdev, 0, B_TRUE); 7117 list_destroy(&remdev->l2ad_buflist); 7118 mutex_destroy(&remdev->l2ad_mtx); 7119 refcount_destroy(&remdev->l2ad_alloc); 7120 kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); 7121 kmem_free(remdev, sizeof (l2arc_dev_t)); 7122 } 7123 7124 void 7125 l2arc_init(void) 7126 { 7127 l2arc_thread_exit = 0; 7128 l2arc_ndev = 0; 7129 l2arc_writes_sent = 0; 7130 l2arc_writes_done = 0; 7131 7132 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 7133 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 7134 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 7135 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 7136 7137 l2arc_dev_list = &L2ARC_dev_list; 7138 l2arc_free_on_write = &L2ARC_free_on_write; 7139 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 7140 offsetof(l2arc_dev_t, l2ad_node)); 7141 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 7142 offsetof(l2arc_data_free_t, l2df_list_node)); 7143 } 7144 7145 void 7146 l2arc_fini(void) 7147 { 7148 /* 7149 * This is called from dmu_fini(), which is called from spa_fini(); 7150 * Because of this, we can assume that all l2arc devices have 7151 * already been removed when the pools themselves were removed. 
7152 */ 7153 7154 l2arc_do_free_on_write(); 7155 7156 mutex_destroy(&l2arc_feed_thr_lock); 7157 cv_destroy(&l2arc_feed_thr_cv); 7158 mutex_destroy(&l2arc_dev_mtx); 7159 mutex_destroy(&l2arc_free_on_write_mtx); 7160 7161 list_destroy(l2arc_dev_list); 7162 list_destroy(l2arc_free_on_write); 7163 } 7164 7165 void 7166 l2arc_start(void) 7167 { 7168 if (!(spa_mode_global & FWRITE)) 7169 return; 7170 7171 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 7172 TS_RUN, minclsyspri); 7173 } 7174 7175 void 7176 l2arc_stop(void) 7177 { 7178 if (!(spa_mode_global & FWRITE)) 7179 return; 7180 7181 mutex_enter(&l2arc_feed_thr_lock); 7182 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 7183 l2arc_thread_exit = 1; 7184 while (l2arc_thread_exit != 0) 7185 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 7186 mutex_exit(&l2arc_feed_thr_lock); 7187 } 7188 7189 /* 7190 * Punches out rebuild threads for the L2ARC devices in a spa. This should 7191 * be called after pool import from the spa async thread, since starting 7192 * these threads directly from spa_import() will make them part of the 7193 * "zpool import" context and delay process exit (and thus pool import). 7194 */ 7195 void 7196 l2arc_spa_rebuild_start(spa_t *spa) 7197 { 7198 /* 7199 * Locate the spa's l2arc devices and kick off rebuild threads. 7200 */ 7201 mutex_enter(&l2arc_dev_mtx); 7202 for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { 7203 l2arc_dev_t *dev = 7204 l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); 7205 ASSERT(dev != NULL); 7206 if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { 7207 VERIFY3U(dev->l2ad_rebuild_did, ==, 0); 7208 #ifdef _KERNEL 7209 dev->l2ad_rebuild_did = thread_create(NULL, 0, 7210 l2arc_dev_rebuild_start, dev, 0, &p0, TS_RUN, 7211 minclsyspri)->t_did; 7212 #endif 7213 } 7214 } 7215 mutex_exit(&l2arc_dev_mtx); 7216 } 7217 7218 /* 7219 * Main entry point for L2ARC rebuilding. 
7220 */ 7221 static void 7222 l2arc_dev_rebuild_start(l2arc_dev_t *dev) 7223 { 7224 if (!dev->l2ad_rebuild_cancel) { 7225 VERIFY(dev->l2ad_rebuild); 7226 (void) l2arc_rebuild(dev); 7227 dev->l2ad_rebuild = B_FALSE; 7228 } 7229 } 7230 7231 /* 7232 * This function implements the actual L2ARC metadata rebuild. It: 7233 * 7234 * 1) reads the device's header 7235 * 2) if a good device header is found, starts reading the log block chain 7236 * 3) restores each block's contents to memory (reconstructing arc_buf_hdr_t's) 7237 * 7238 * Operation stops under any of the following conditions: 7239 * 7240 * 1) We reach the end of the log blk chain (the back-reference in the blk is 7241 * invalid or loops over our starting point). 7242 * 2) We encounter *any* error condition (cksum errors, io errors, looped 7243 * blocks, etc.). 7244 */ 7245 static int 7246 l2arc_rebuild(l2arc_dev_t *dev) 7247 { 7248 vdev_t *vd = dev->l2ad_vdev; 7249 spa_t *spa = vd->vdev_spa; 7250 int err; 7251 l2arc_log_blk_phys_t *this_lb, *next_lb; 7252 uint8_t *this_lb_buf, *next_lb_buf; 7253 zio_t *this_io = NULL, *next_io = NULL; 7254 l2arc_log_blkptr_t lb_ptrs[2]; 7255 boolean_t first_pass, lock_held; 7256 uint64_t load_guid; 7257 7258 this_lb = kmem_zalloc(sizeof (*this_lb), KM_SLEEP); 7259 next_lb = kmem_zalloc(sizeof (*next_lb), KM_SLEEP); 7260 this_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP); 7261 next_lb_buf = kmem_zalloc(sizeof (l2arc_log_blk_phys_t), KM_SLEEP); 7262 7263 /* 7264 * We prevent device removal while issuing reads to the device, 7265 * then during the rebuilding phases we drop this lock again so 7266 * that a spa_unload or device remove can be initiated - this is 7267 * safe, because the spa will signal us to stop before removing 7268 * our device and wait for us to stop. 7269 */ 7270 spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); 7271 lock_held = B_TRUE; 7272 7273 load_guid = spa_load_guid(dev->l2ad_vdev->vdev_spa); 7274 /* 7275 * Device header processing phase. 
7276 */ 7277 if ((err = l2arc_dev_hdr_read(dev)) != 0) { 7278 /* device header corrupted, start a new one */ 7279 bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); 7280 goto out; 7281 } 7282 7283 /* Retrieve the persistent L2ARC device state */ 7284 dev->l2ad_hand = vdev_psize_to_asize(dev->l2ad_vdev, 7285 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr + 7286 LBP_GET_PSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0])); 7287 dev->l2ad_first = !!(dev->l2ad_dev_hdr->dh_flags & 7288 L2ARC_DEV_HDR_EVICT_FIRST); 7289 7290 /* Prepare the rebuild processing state */ 7291 bcopy(dev->l2ad_dev_hdr->dh_start_lbps, lb_ptrs, sizeof (lb_ptrs)); 7292 first_pass = B_TRUE; 7293 7294 /* Start the rebuild process */ 7295 for (;;) { 7296 if (!l2arc_log_blkptr_valid(dev, &lb_ptrs[0])) 7297 /* We hit an invalid block address, end the rebuild. */ 7298 break; 7299 7300 if ((err = l2arc_log_blk_read(dev, &lb_ptrs[0], &lb_ptrs[1], 7301 this_lb, next_lb, this_lb_buf, next_lb_buf, 7302 this_io, &next_io)) != 0) 7303 break; 7304 7305 spa_config_exit(spa, SCL_L2ARC, vd); 7306 lock_held = B_FALSE; 7307 7308 /* Protection against infinite loops of log blocks. */ 7309 if (l2arc_range_check_overlap(lb_ptrs[1].lbp_daddr, 7310 lb_ptrs[0].lbp_daddr, 7311 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr) && 7312 !first_pass) { 7313 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_loop_errors); 7314 err = SET_ERROR(ELOOP); 7315 break; 7316 } 7317 7318 /* 7319 * Our memory pressure valve. If the system is running low 7320 * on memory, rather than swamping memory with new ARC buf 7321 * hdrs, we opt not to rebuild the L2ARC. At this point, 7322 * however, we have already set up our L2ARC dev to chain in 7323 * new metadata log blk, so the user may choose to re-add the 7324 * L2ARC dev at a later time to reconstruct it (when there's 7325 * less memory pressure). 
7326 */ 7327 if (arc_reclaim_needed()) { 7328 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem); 7329 cmn_err(CE_NOTE, "System running low on memory, " 7330 "aborting L2ARC rebuild."); 7331 err = SET_ERROR(ENOMEM); 7332 break; 7333 } 7334 7335 /* 7336 * Now that we know that the next_lb checks out alright, we 7337 * can start reconstruction from this lb - we can be sure 7338 * that the L2ARC write hand has not yet reached any of our 7339 * buffers. 7340 */ 7341 l2arc_log_blk_restore(dev, load_guid, this_lb, 7342 LBP_GET_PSIZE(&lb_ptrs[0])); 7343 7344 /* 7345 * End of list detection. We can look ahead two steps in the 7346 * blk chain and if the 2nd blk from this_lb dips below the 7347 * initial chain starting point, then we know two things: 7348 * 1) it can't be valid, and 7349 * 2) the next_lb's ARC entries might have already been 7350 * partially overwritten and so we should stop before 7351 * we restore it 7352 */ 7353 if (l2arc_range_check_overlap( 7354 this_lb->lb_back2_lbp.lbp_daddr, lb_ptrs[0].lbp_daddr, 7355 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr) && 7356 !first_pass) 7357 break; 7358 7359 /* log blk restored, continue with next one in the list */ 7360 lb_ptrs[0] = lb_ptrs[1]; 7361 lb_ptrs[1] = this_lb->lb_back2_lbp; 7362 PTR_SWAP(this_lb, next_lb); 7363 PTR_SWAP(this_lb_buf, next_lb_buf); 7364 this_io = next_io; 7365 next_io = NULL; 7366 first_pass = B_FALSE; 7367 7368 for (;;) { 7369 if (dev->l2ad_rebuild_cancel) { 7370 err = SET_ERROR(ECANCELED); 7371 goto out; 7372 } 7373 if (spa_config_tryenter(spa, SCL_L2ARC, vd, 7374 RW_READER)) { 7375 lock_held = B_TRUE; 7376 break; 7377 } 7378 /* 7379 * L2ARC config lock held by somebody in writer, 7380 * possibly due to them trying to remove us. They'll 7381 * likely to want us to shut down, so after a little 7382 * delay, we check l2ad_rebuild_cancel and retry 7383 * the lock again. 
7384 */ 7385 delay(1); 7386 } 7387 } 7388 out: 7389 if (next_io != NULL) 7390 l2arc_log_blk_prefetch_abort(next_io); 7391 kmem_free(this_lb, sizeof (*this_lb)); 7392 kmem_free(next_lb, sizeof (*next_lb)); 7393 kmem_free(this_lb_buf, sizeof (l2arc_log_blk_phys_t)); 7394 kmem_free(next_lb_buf, sizeof (l2arc_log_blk_phys_t)); 7395 if (err == 0) 7396 ARCSTAT_BUMP(arcstat_l2_rebuild_successes); 7397 7398 if (lock_held) 7399 spa_config_exit(spa, SCL_L2ARC, vd); 7400 7401 return (err); 7402 } 7403 7404 /* 7405 * Attempts to read the device header on the provided L2ARC device and writes 7406 * it to `hdr'. On success, this function returns 0, otherwise the appropriate 7407 * error code is returned. 7408 */ 7409 static int 7410 l2arc_dev_hdr_read(l2arc_dev_t *dev) 7411 { 7412 int err; 7413 uint64_t guid; 7414 zio_cksum_t cksum; 7415 l2arc_dev_hdr_phys_t *hdr = dev->l2ad_dev_hdr; 7416 const uint64_t hdr_asize = dev->l2ad_dev_hdr_asize; 7417 7418 guid = spa_guid(dev->l2ad_vdev->vdev_spa); 7419 7420 if ((err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, 7421 VDEV_LABEL_START_SIZE, hdr_asize, hdr, 7422 ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 7423 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 7424 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE))) != 0) { 7425 spa_config_exit(dev->l2ad_vdev->vdev_spa, SCL_L2ARC, 7426 dev->l2ad_vdev); 7427 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); 7428 return (err); 7429 } 7430 7431 if (hdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) 7432 byteswap_uint64_array(hdr, sizeof (*hdr)); 7433 7434 if (hdr->dh_magic != L2ARC_DEV_HDR_MAGIC || hdr->dh_spa_guid != guid) { 7435 /* 7436 * Attempt to rebuild a device containing no actual dev hdr 7437 * or containing a header from some other pool. 
7438 */ 7439 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); 7440 return (SET_ERROR(ENOTSUP)); 7441 } 7442 7443 l2arc_dev_hdr_checksum(hdr, &cksum); 7444 if (!ZIO_CHECKSUM_EQUAL(hdr->dh_self_cksum, cksum)) { 7445 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors); 7446 return (SET_ERROR(EINVAL)); 7447 } 7448 7449 return (0); 7450 } 7451 7452 /* 7453 * Reads L2ARC log blocks from storage and validates their contents. 7454 * 7455 * This function implements a simple prefetcher to make sure that while 7456 * we're processing one buffer the L2ARC is already prefetching the next 7457 * one in the chain. 7458 * 7459 * The arguments this_lp and next_lp point to the current and next log blk 7460 * address in the block chain. Similarly, this_lb and next_lb hold the 7461 * l2arc_log_blk_phys_t's of the current and next L2ARC blk. The this_lb_buf 7462 * and next_lb_buf must be buffers of appropriate to hold a raw 7463 * l2arc_log_blk_phys_t (they are used as catch buffers for read ops prior 7464 * to buffer decompression). 7465 * 7466 * The `this_io' and `next_io' arguments are used for block prefetching. 7467 * When issuing the first blk IO during rebuild, you should pass NULL for 7468 * `this_io'. This function will then issue a sync IO to read the block and 7469 * also issue an async IO to fetch the next block in the block chain. The 7470 * prefetch IO is returned in `next_io'. On subsequent calls to this 7471 * function, pass the value returned in `next_io' from the previous call 7472 * as `this_io' and a fresh `next_io' pointer to hold the next prefetch IO. 7473 * Prior to the call, you should initialize your `next_io' pointer to be 7474 * NULL. If no prefetch IO was issued, the pointer is left set at NULL. 7475 * 7476 * On success, this function returns 0, otherwise it returns an appropriate 7477 * error code. On error the prefetching IO is aborted and cleared before 7478 * returning from this function. 
Therefore, if we return `success', the 7479 * caller can assume that we have taken care of cleanup of prefetch IOs. 7480 */ 7481 static int 7482 l2arc_log_blk_read(l2arc_dev_t *dev, 7483 const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp, 7484 l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, 7485 uint8_t *this_lb_buf, uint8_t *next_lb_buf, 7486 zio_t *this_io, zio_t **next_io) 7487 { 7488 int err = 0; 7489 zio_cksum_t cksum; 7490 7491 ASSERT(this_lbp != NULL && next_lbp != NULL); 7492 ASSERT(this_lb != NULL && next_lb != NULL); 7493 ASSERT(this_lb_buf != NULL && next_lb_buf != NULL); 7494 ASSERT(next_io != NULL && *next_io == NULL); 7495 ASSERT(l2arc_log_blkptr_valid(dev, this_lbp)); 7496 7497 /* 7498 * Check to see if we have issued the IO for this log blk in a 7499 * previous run. If not, this is the first call, so issue it now. 7500 */ 7501 if (this_io == NULL) { 7502 this_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, this_lbp, 7503 this_lb_buf); 7504 } 7505 7506 /* 7507 * Peek to see if we can start issuing the next IO immediately. 7508 */ 7509 if (l2arc_log_blkptr_valid(dev, next_lbp)) { 7510 /* 7511 * Start issuing IO for the next log blk early - this 7512 * should help keep the L2ARC device busy while we 7513 * decompress and restore this log blk. 
7514 */ 7515 *next_io = l2arc_log_blk_prefetch(dev->l2ad_vdev, next_lbp, 7516 next_lb_buf); 7517 } 7518 7519 /* Wait for the IO to read this log block to complete */ 7520 if ((err = zio_wait(this_io)) != 0) { 7521 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors); 7522 goto cleanup; 7523 } 7524 7525 /* Make sure the buffer checks out */ 7526 fletcher_4_native(this_lb_buf, LBP_GET_PSIZE(this_lbp), NULL, &cksum); 7527 if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { 7528 ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_errors); 7529 err = SET_ERROR(EINVAL); 7530 goto cleanup; 7531 } 7532 7533 /* Now we can take our time decoding this buffer */ 7534 switch (LBP_GET_COMPRESS(this_lbp)) { 7535 case ZIO_COMPRESS_OFF: 7536 bcopy(this_lb_buf, this_lb, sizeof (*this_lb)); 7537 break; 7538 case ZIO_COMPRESS_LZ4: 7539 if ((err = zio_decompress_data(LBP_GET_COMPRESS(this_lbp), 7540 this_lb_buf, this_lb, LBP_GET_PSIZE(this_lbp), 7541 sizeof (*this_lb))) != 0) { 7542 err = SET_ERROR(EINVAL); 7543 goto cleanup; 7544 } 7545 break; 7546 default: 7547 err = SET_ERROR(EINVAL); 7548 goto cleanup; 7549 } 7550 if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) 7551 byteswap_uint64_array(this_lb, sizeof (*this_lb)); 7552 if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { 7553 err = SET_ERROR(EINVAL); 7554 goto cleanup; 7555 } 7556 cleanup: 7557 /* Abort an in-flight prefetch I/O in case of error */ 7558 if (err != 0 && *next_io != NULL) { 7559 l2arc_log_blk_prefetch_abort(*next_io); 7560 *next_io = NULL; 7561 } 7562 return (err); 7563 } 7564 7565 /* 7566 * Restores the payload of a log blk to ARC. This creates empty ARC hdr 7567 * entries which only contain an l2arc hdr, essentially restoring the 7568 * buffers to their L2ARC evicted state. This function also updates space 7569 * usage on the L2ARC vdev to make sure it tracks restored buffers. 
7570 */ 7571 static void 7572 l2arc_log_blk_restore(l2arc_dev_t *dev, uint64_t load_guid, 7573 const l2arc_log_blk_phys_t *lb, uint64_t lb_psize) 7574 { 7575 uint64_t size = 0, psize = 0; 7576 7577 for (int i = L2ARC_LOG_BLK_ENTRIES - 1; i >= 0; i--) { 7578 /* 7579 * Restore goes in the reverse temporal direction to preserve 7580 * correct temporal ordering of buffers in the l2ad_buflist. 7581 * l2arc_hdr_restore also does a list_insert_tail instead of 7582 * list_insert_head on the l2ad_buflist: 7583 * 7584 * LIST l2ad_buflist LIST 7585 * HEAD <------ (time) ------ TAIL 7586 * direction +-----+-----+-----+-----+-----+ direction 7587 * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild 7588 * fill +-----+-----+-----+-----+-----+ 7589 * ^ ^ 7590 * | | 7591 * | | 7592 * l2arc_fill_thread l2arc_rebuild 7593 * places new bufs here restores bufs here 7594 * 7595 * This also works when the restored bufs get evicted at any 7596 * point during the rebuild. 7597 */ 7598 l2arc_hdr_restore(&lb->lb_entries[i], dev, load_guid); 7599 size += LE_GET_LSIZE(&lb->lb_entries[i]); 7600 psize += LE_GET_PSIZE(&lb->lb_entries[i]); 7601 } 7602 7603 /* 7604 * Record rebuild stats: 7605 * size In-memory size of restored buffer data in ARC 7606 * psize Physical size of restored buffers in the L2ARC 7607 * bufs # of ARC buffer headers restored 7608 * log_blks # of L2ARC log entries processed during restore 7609 */ 7610 ARCSTAT_INCR(arcstat_l2_rebuild_size, size); 7611 ARCSTAT_INCR(arcstat_l2_rebuild_psize, psize); 7612 ARCSTAT_INCR(arcstat_l2_rebuild_bufs, L2ARC_LOG_BLK_ENTRIES); 7613 ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); 7614 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, lb_psize); 7615 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, psize / lb_psize); 7616 vdev_space_update(dev->l2ad_vdev, psize, 0, 0); 7617 } 7618 7619 /* 7620 * Restores a single ARC buf hdr from a log block. The ARC buffer is put 7621 * into a state indicating that it has been evicted to L2ARC. 
7622 */ 7623 static void 7624 l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev, 7625 uint64_t load_guid) 7626 { 7627 arc_buf_hdr_t *hdr, *exists; 7628 kmutex_t *hash_lock; 7629 arc_buf_contents_t type = LE_GET_TYPE(le); 7630 7631 /* 7632 * Do all the allocation before grabbing any locks, this lets us 7633 * sleep if memory is full and we don't have to deal with failed 7634 * allocations. 7635 */ 7636 ASSERT(L2ARC_IS_VALID_COMPRESS(LE_GET_COMPRESS(le)) || 7637 LE_GET_COMPRESS(le) == ZIO_COMPRESS_OFF); 7638 hdr = arc_buf_alloc_l2only(load_guid, LE_GET_LSIZE(le), type, 7639 dev, le->le_dva, le->le_daddr, LE_GET_PSIZE(le), le->le_birth, 7640 le->le_freeze_cksum, LE_GET_COMPRESS(le)); 7641 if (hdr->b_l2hdr.b_daddr != L2ARC_ADDR_UNSET) { 7642 ARCSTAT_INCR(arcstat_l2_size, hdr->b_size); 7643 ARCSTAT_INCR(arcstat_l2_asize, hdr->b_l2hdr.b_asize); 7644 } 7645 7646 mutex_enter(&dev->l2ad_mtx); 7647 /* 7648 * We connect the l2hdr to the hdr only after the hdr is in the hash 7649 * table, otherwise the rest of the arc hdr manipulation machinery 7650 * might get confused. 7651 */ 7652 list_insert_tail(&dev->l2ad_buflist, hdr); 7653 (void) refcount_add_many(&dev->l2ad_alloc, hdr->b_l2hdr.b_asize, hdr); 7654 mutex_exit(&dev->l2ad_mtx); 7655 7656 exists = buf_hash_insert(hdr, &hash_lock); 7657 if (exists) { 7658 /* Buffer was already cached, no need to restore it. */ 7659 mutex_exit(hash_lock); 7660 arc_hdr_destroy(hdr); 7661 ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached); 7662 return; 7663 } 7664 7665 mutex_exit(hash_lock); 7666 } 7667 7668 /* 7669 * Starts an asynchronous read IO to read a log block. This is used in log 7670 * block reconstruction to start reading the next block before we are done 7671 * decoding and reconstructing the current block, to keep the l2arc device 7672 * nice and hot with read IO to process. 
7673 * The returned zio will contain a newly allocated memory buffers for the IO 7674 * data which should then be freed by the caller once the zio is no longer 7675 * needed (i.e. due to it having completed). If you wish to abort this 7676 * zio, you should do so using l2arc_log_blk_prefetch_abort, which takes 7677 * care of disposing of the allocated buffers correctly. 7678 */ 7679 static zio_t * 7680 l2arc_log_blk_prefetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, 7681 uint8_t *lb_buf) 7682 { 7683 uint32_t psize; 7684 zio_t *pio; 7685 7686 psize = LBP_GET_PSIZE(lbp); 7687 ASSERT(psize <= sizeof (l2arc_log_blk_phys_t)); 7688 pio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_DONT_CACHE | 7689 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | 7690 ZIO_FLAG_DONT_RETRY); 7691 (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, psize, 7692 lb_buf, ZIO_CHECKSUM_OFF, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, 7693 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | 7694 ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); 7695 7696 return (pio); 7697 } 7698 7699 /* 7700 * Aborts a zio returned from l2arc_log_blk_prefetch and frees the data 7701 * buffers allocated for it. 7702 */ 7703 static void 7704 l2arc_log_blk_prefetch_abort(zio_t *zio) 7705 { 7706 (void) zio_wait(zio); 7707 } 7708 7709 /* 7710 * Creates a zio to update the device header on an l2arc device. The zio is 7711 * initiated as a child of `pio'. 
7712 */ 7713 static void 7714 l2arc_dev_hdr_update(l2arc_dev_t *dev, zio_t *pio) 7715 { 7716 zio_t *wzio; 7717 l2arc_dev_hdr_phys_t *hdr = dev->l2ad_dev_hdr; 7718 const uint64_t hdr_asize = dev->l2ad_dev_hdr_asize; 7719 7720 hdr->dh_magic = L2ARC_DEV_HDR_MAGIC; 7721 hdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); 7722 hdr->dh_alloc_space = refcount_count(&dev->l2ad_alloc); 7723 hdr->dh_flags = 0; 7724 if (dev->l2ad_first) 7725 hdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; 7726 7727 /* checksum operation goes last */ 7728 l2arc_dev_hdr_checksum(hdr, &hdr->dh_self_cksum); 7729 7730 wzio = zio_write_phys(pio, dev->l2ad_vdev, VDEV_LABEL_START_SIZE, 7731 hdr_asize, hdr, ZIO_CHECKSUM_OFF, NULL, NULL, 7732 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); 7733 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); 7734 (void) zio_nowait(wzio); 7735 } 7736 7737 /* 7738 * Commits a log block to the L2ARC device. This routine is invoked from 7739 * l2arc_write_buffers when the log block fills up. 7740 * This function allocates some memory to temporarily hold the serialized 7741 * buffer to be written. This is then released in l2arc_write_done. 
7742 */ 7743 static void 7744 l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, 7745 l2arc_write_callback_t *cb) 7746 { 7747 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; 7748 uint64_t psize, asize; 7749 l2arc_log_blk_buf_t *lb_buf; 7750 zio_t *wzio; 7751 7752 VERIFY(dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES); 7753 7754 /* link the buffer into the block chain */ 7755 lb->lb_back2_lbp = dev->l2ad_dev_hdr->dh_start_lbps[1]; 7756 lb->lb_magic = L2ARC_LOG_BLK_MAGIC; 7757 7758 /* try to compress the buffer */ 7759 lb_buf = kmem_zalloc(sizeof (*lb_buf), KM_SLEEP); 7760 list_insert_tail(&cb->l2wcb_log_blk_buflist, lb_buf); 7761 psize = zio_compress_data(ZIO_COMPRESS_LZ4, lb, lb_buf->lbb_log_blk, 7762 sizeof (*lb)); 7763 /* a log block is never entirely zero */ 7764 ASSERT(psize != 0); 7765 asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); 7766 ASSERT(asize <= sizeof (lb_buf->lbb_log_blk)); 7767 7768 /* 7769 * Update the start log blk pointer in the device header to point 7770 * to the log block we're about to write. 
7771 */ 7772 dev->l2ad_dev_hdr->dh_start_lbps[1] = 7773 dev->l2ad_dev_hdr->dh_start_lbps[0]; 7774 dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; 7775 _NOTE(CONSTCOND) 7776 LBP_SET_LSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0], sizeof (*lb)); 7777 LBP_SET_PSIZE(&dev->l2ad_dev_hdr->dh_start_lbps[0], asize); 7778 LBP_SET_CHECKSUM(&dev->l2ad_dev_hdr->dh_start_lbps[0], 7779 ZIO_CHECKSUM_FLETCHER_4); 7780 LBP_SET_TYPE(&dev->l2ad_dev_hdr->dh_start_lbps[0], 0); 7781 if (asize < sizeof (*lb)) { 7782 /* compression succeeded */ 7783 bzero(lb_buf->lbb_log_blk + psize, asize - psize); 7784 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr->dh_start_lbps[0], 7785 ZIO_COMPRESS_LZ4); 7786 } else { 7787 /* compression failed */ 7788 bcopy(lb, lb_buf->lbb_log_blk, sizeof (*lb)); 7789 LBP_SET_COMPRESS(&dev->l2ad_dev_hdr->dh_start_lbps[0], 7790 ZIO_COMPRESS_OFF); 7791 } 7792 /* checksum what we're about to write */ 7793 fletcher_4_native(lb_buf->lbb_log_blk, asize, NULL, 7794 &dev->l2ad_dev_hdr->dh_start_lbps[0].lbp_cksum); 7795 7796 /* perform the write itself */ 7797 CTASSERT(L2ARC_LOG_BLK_SIZE >= SPA_MINBLOCKSIZE && 7798 L2ARC_LOG_BLK_SIZE <= SPA_MAXBLOCKSIZE); 7799 wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, 7800 asize, lb_buf->lbb_log_blk, ZIO_CHECKSUM_OFF, NULL, NULL, 7801 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); 7802 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); 7803 (void) zio_nowait(wzio); 7804 7805 dev->l2ad_hand += asize; 7806 vdev_space_update(dev->l2ad_vdev, asize, 0, 0); 7807 7808 /* bump the kstats */ 7809 ARCSTAT_INCR(arcstat_l2_write_bytes, asize); 7810 ARCSTAT_BUMP(arcstat_l2_log_blk_writes); 7811 ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_size, asize); 7812 ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, 7813 dev->l2ad_log_blk_payload_asize / asize); 7814 7815 /* start a new log block */ 7816 dev->l2ad_log_ent_idx = 0; 7817 dev->l2ad_log_blk_payload_asize = 0; 7818 } 7819 7820 /* 7821 * Validates an L2ARC log blk address 
to make sure that it can be read 7822 * from the provided L2ARC device. Returns B_TRUE if the address is 7823 * within the device's bounds, or B_FALSE if not. 7824 */ 7825 static boolean_t 7826 l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) 7827 { 7828 uint64_t psize = LBP_GET_PSIZE(lbp); 7829 uint64_t end = lbp->lbp_daddr + psize; 7830 7831 /* 7832 * A log block is valid if all of the following conditions are true: 7833 * - it fits entirely between l2ad_start and l2ad_end 7834 * - it has a valid size 7835 */ 7836 return (lbp->lbp_daddr >= dev->l2ad_start && end <= dev->l2ad_end && 7837 psize > 0 && psize <= sizeof (l2arc_log_blk_phys_t)); 7838 } 7839 7840 /* 7841 * Computes the checksum of `hdr' and stores it in `cksum'. 7842 */ 7843 static void 7844 l2arc_dev_hdr_checksum(const l2arc_dev_hdr_phys_t *hdr, zio_cksum_t *cksum) 7845 { 7846 fletcher_4_native((uint8_t *)hdr + 7847 offsetof(l2arc_dev_hdr_phys_t, dh_spa_guid), 7848 sizeof (*hdr) - offsetof(l2arc_dev_hdr_phys_t, dh_spa_guid), 7849 NULL, cksum); 7850 } 7851 7852 /* 7853 * Inserts ARC buffer `ab' into the current L2ARC log blk on the device. 7854 * The buffer being inserted must be present in L2ARC. 7855 * Returns B_TRUE if the L2ARC log blk is full and needs to be committed 7856 * to L2ARC, or B_FALSE if it still has room for more ARC buffers. 
7857 */ 7858 static boolean_t 7859 l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *ab) 7860 { 7861 l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; 7862 l2arc_log_ent_phys_t *le; 7863 int index = dev->l2ad_log_ent_idx++; 7864 7865 ASSERT(index < L2ARC_LOG_BLK_ENTRIES); 7866 7867 le = &lb->lb_entries[index]; 7868 bzero(le, sizeof (*le)); 7869 le->le_dva = ab->b_dva; 7870 le->le_birth = ab->b_birth; 7871 le->le_daddr = ab->b_l2hdr.b_daddr; 7872 LE_SET_LSIZE(le, ab->b_size); 7873 LE_SET_PSIZE(le, ab->b_l2hdr.b_asize); 7874 LE_SET_COMPRESS(le, ab->b_l2hdr.b_compress); 7875 if (ab->b_l2hdr.b_compress != ZIO_COMPRESS_OFF) { 7876 ASSERT(L2ARC_IS_VALID_COMPRESS(ab->b_l2hdr.b_compress)); 7877 ASSERT(L2ARC_IS_VALID_COMPRESS(LE_GET_COMPRESS(le))); 7878 } 7879 le->le_freeze_cksum = *ab->b_freeze_cksum; 7880 LE_SET_CHECKSUM(le, ZIO_CHECKSUM_FLETCHER_2); 7881 LE_SET_TYPE(le, arc_flags_to_bufc(ab->b_flags)); 7882 dev->l2ad_log_blk_payload_asize += ab->b_l2hdr.b_asize; 7883 7884 return (dev->l2ad_log_ent_idx == L2ARC_LOG_BLK_ENTRIES); 7885 } 7886 7887 /* 7888 * Checks whether a given L2ARC device address sits in a time-sequential 7889 * range. The trick here is that the L2ARC is a rotary buffer, so we can't 7890 * just do a range comparison, we need to handle the situation in which the 7891 * range wraps around the end of the L2ARC device. Arguments: 7892 * bottom Lower end of the range to check (written to earlier). 7893 * top Upper end of the range to check (written to later). 7894 * check The address for which we want to determine if it sits in 7895 * between the top and bottom. 7896 * 7897 * The 3-way conditional below represents the following cases: 7898 * 7899 * bottom < top : Sequentially ordered case: 7900 * <check>--------+-------------------+ 7901 * | (overlap here?) 
| 7902 * L2ARC dev V V 7903 * |---------------<bottom>============<top>--------------| 7904 * 7905 * bottom > top: Looped-around case: 7906 * <check>--------+------------------+ 7907 * | (overlap here?) | 7908 * L2ARC dev V V 7909 * |===============<top>---------------<bottom>===========| 7910 * ^ ^ 7911 * | (or here?) | 7912 * +---------------+---------<check> 7913 * 7914 * top == bottom : Just a single address comparison. 7915 */ 7916 static inline boolean_t 7917 l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) 7918 { 7919 if (bottom < top) 7920 return (bottom <= check && check <= top); 7921 else if (bottom > top) 7922 return (check <= top || bottom <= check); 7923 else 7924 return (check == top); 7925 } 7926 7927 /* 7928 * dump arc cache to user mode for debugging purposes 7929 */ 7930 static void 7931 arc_dump_entry(arc_buf_hdr_t *entry, arc_info_t *outp) 7932 { 7933 outp->ai_dva = entry->b_dva; 7934 outp->ai_birth = entry->b_birth; 7935 outp->ai_flags = entry->b_flags; 7936 outp->ai_spa = entry->b_spa; 7937 outp->ai_size = entry->b_size; 7938 if (HDR_HAS_L1HDR(entry)) { 7939 arc_state_t *state = entry->b_l1hdr.b_state; 7940 if (state == arc_anon) 7941 outp->ai_state = AIS_ANON; 7942 else if (state == arc_mru) 7943 outp->ai_state = AIS_MRU; 7944 else if (state == arc_mru_ghost) 7945 outp->ai_state = AIS_MRU_GHOST; 7946 else if (state == arc_mfu) 7947 outp->ai_state = AIS_MFU; 7948 else if (state == arc_mfu_ghost) 7949 outp->ai_state = AIS_MFU_GHOST; 7950 else if (state == arc_l2c_only) 7951 outp->ai_state = AIS_L2C_ONLY; 7952 else 7953 outp->ai_state = AIS_UNKNOWN; 7954 } else { 7955 outp->ai_state = AIS_NO_L1HDR; 7956 } 7957 } 7958 7959 int 7960 arc_dump(int start_bucket, void *buf, size_t bufsize, size_t *returned_bytes) 7961 { 7962 int i; 7963 arc_info_t *outp = buf + sizeof(arc_info_hdr_t); 7964 arc_info_t *maxp = buf + bufsize; 7965 arc_info_hdr_t *aih = buf; 7966 size_t nbuckets = buf_hash_table.ht_mask + 1; 7967 size_t bph = 
nbuckets / BUF_LOCKS; /* buckets per hash */ 7968 kmutex_t *last_lock = NULL; 7969 7970 if (bufsize < sizeof(arc_info_hdr_t)) 7971 return (ENOMEM); 7972 7973 aih->aih_buckets = nbuckets; 7974 aih->aih_buf_locks = BUF_LOCKS; 7975 7976 ASSERT(start_bucket >= 0); 7977 ASSERT(start_bucket < nbuckets); 7978 7979 for (i = start_bucket; i < nbuckets; ++i) { 7980 kmutex_t *hash_lock; 7981 arc_buf_hdr_t *entry; 7982 arc_info_t *dryrun = outp; 7983 int bucket; 7984 7985 /* 7986 * transform index. We want to enumerate the buckets in an 7987 * order that allows us to keep the mutex as long as possible 7988 */ 7989 bucket = (i / bph) + (i % bph) * BUF_LOCKS; 7990 7991 hash_lock = BUF_HASH_LOCK(bucket); 7992 if (hash_lock != last_lock) { 7993 if (last_lock) 7994 mutex_exit(last_lock); 7995 mutex_enter(hash_lock); 7996 } 7997 last_lock = hash_lock; 7998 /* count entries to see if they will fit */ 7999 entry = buf_hash_table.ht_table[bucket]; 8000 while (entry != NULL) { 8001 ++dryrun; 8002 entry = entry->b_hash_next; 8003 } 8004 if (dryrun > maxp) { 8005 break; 8006 } 8007 /* actually copy entries */ 8008 entry = buf_hash_table.ht_table[bucket]; 8009 while (entry != NULL) { 8010 arc_dump_entry(entry, outp); 8011 ++outp; 8012 entry = entry->b_hash_next; 8013 } 8014 } 8015 if (last_lock) 8016 mutex_exit(last_lock); 8017 8018 *returned_bytes = (void *)outp - buf; 8019 aih->aih_entries = (*returned_bytes - sizeof(*aih)) / sizeof(*outp); 8020 8021 if (i <= buf_hash_table.ht_mask) 8022 aih->aih_next = i; 8023 else 8024 aih->aih_next = 0; 8025 8026 return (0); 8027 } 8028