1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright (c) 2012, Joyent, Inc. All rights reserved. 24 * Copyright (c) 2011, 2016 by Delphix. All rights reserved. 25 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. 26 * Copyright 2015 Nexenta Systems, Inc. All rights reserved. 27 */ 28 29 /* 30 * DVA-based Adjustable Replacement Cache 31 * 32 * While much of the theory of operation used here is 33 * based on the self-tuning, low overhead replacement cache 34 * presented by Megiddo and Modha at FAST 2003, there are some 35 * significant differences: 36 * 37 * 1. The Megiddo and Modha model assumes any page is evictable. 38 * Pages in its cache cannot be "locked" into memory. This makes 39 * the eviction algorithm simple: evict the last page in the list. 40 * This also makes the performance characteristics easy to reason 41 * about. Our cache is not so simple. At any given moment, some 42 * subset of the blocks in the cache are un-evictable because we 43 * have handed out a reference to them. Blocks are only evictable 44 * when there are no external references active. This makes 45 * eviction far more problematic: we choose to evict the evictable 46 * blocks that are the "lowest" in the list. 47 * 48 * There are times when it is not possible to evict the requested 49 * space. In these circumstances we are unable to adjust the cache 50 * size. To prevent the cache growing unbounded at these times we 51 * implement a "cache throttle" that slows the flow of new data 52 * into the cache until we can make space available. 53 * 54 * 2. The Megiddo and Modha model assumes a fixed cache size. 55 * Pages are evicted when the cache is full and there is a cache 56 * miss. Our model has a variable sized cache. It grows with 57 * high use, but also tries to react to memory pressure from the 58 * operating system: decreasing its size when system memory is 59 * tight. 60 * 61 * 3. The Megiddo and Modha model assumes a fixed page size. All 62 * elements of the cache are therefore exactly the same size. So 63 * when adjusting the cache size following a cache miss, it's simply 64 * a matter of choosing a single page to evict. In our model, we 65 * have variable sized cache blocks (ranging from 512 bytes to 66 * 128K bytes). We therefore choose a set of blocks to evict to make 67 * space for a cache miss that approximates as closely as possible 68 * the space used by the new block. 69 * 70 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 71 * by N. Megiddo & D.
Modha, FAST 2003 72 */ 73 74 /* 75 * The locking model: 76 * 77 * A new reference to a cache buffer can be obtained in two 78 * ways: 1) via a hash table lookup using the DVA as a key, 79 * or 2) via one of the ARC lists. The arc_read() interface 80 * uses method 1, while the internal arc algorithms for 81 * adjusting the cache use method 2. We therefore provide two 82 * types of locks: 1) the hash table lock array, and 2) the 83 * arc list locks. 84 * 85 * Buffers do not have their own mutexes, rather they rely on the 86 * hash table mutexes for the bulk of their protection (i.e. most 87 * fields in the arc_buf_hdr_t are protected by these mutexes). 88 * 89 * buf_hash_find() returns the appropriate mutex (held) when it 90 * locates the requested buffer in the hash table. It returns 91 * NULL for the mutex if the buffer was not in the table. 92 * 93 * buf_hash_remove() expects the appropriate hash mutex to be 94 * already held before it is invoked. 95 * 96 * Each arc state also has a mutex which is used to protect the 97 * buffer list associated with the state. When attempting to 98 * obtain a hash table lock while holding an arc list lock you 99 * must use: mutex_tryenter() to avoid deadlock. Also note that 100 * the active state mutex must be held before the ghost state mutex. 101 * 102 * Arc buffers may have an associated eviction callback function. 103 * This function will be invoked prior to removing the buffer (e.g. 104 * in arc_do_user_evicts()). Note however that the data associated 105 * with the buffer may be evicted prior to the callback. The callback 106 * must be made with *no locks held* (to prevent deadlock). Additionally, 107 * the users of callbacks must ensure that their private data is 108 * protected from simultaneous callbacks from arc_clear_callback() 109 * and arc_do_user_evicts(). 110 * 111 * Note that the majority of the performance stats are manipulated 112 * with atomic operations. 113 * 114 * The L2ARC uses the l2ad_mtx on each vdev for the following: 115 * 116 * - L2ARC buflist creation 117 * - L2ARC buflist eviction 118 * - L2ARC write completion, which walks L2ARC buflists 119 * - ARC header destruction, as it removes from L2ARC buflists 120 * - ARC header release, as it removes from L2ARC buflists 121 */ 122 123 #include <sys/spa.h> 124 #include <sys/zio.h> 125 #include <sys/zio_compress.h> 126 #include <sys/zfs_context.h> 127 #include <sys/arc.h> 128 #include <sys/refcount.h> 129 #include <sys/vdev.h> 130 #include <sys/vdev_impl.h> 131 #include <sys/dsl_pool.h> 132 #include <sys/multilist.h> 133 #ifdef _KERNEL 134 #include <sys/vmsystm.h> 135 #include <vm/anon.h> 136 #include <sys/fs/swapnode.h> 137 #include <sys/dnlc.h> 138 #endif 139 #include <sys/callb.h> 140 #include <sys/kstat.h> 141 #include <zfs_fletcher.h> 142 143 #ifndef _KERNEL 144 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 145 boolean_t arc_watch = B_FALSE; 146 int arc_procfd; 147 #endif 148 149 static kmutex_t arc_reclaim_lock; 150 static kcondvar_t arc_reclaim_thread_cv; 151 static boolean_t arc_reclaim_thread_exit; 152 static kcondvar_t arc_reclaim_waiters_cv; 153 154 static kmutex_t arc_user_evicts_lock; 155 static kcondvar_t arc_user_evicts_cv; 156 static boolean_t arc_user_evicts_thread_exit; 157 158 uint_t arc_reduce_dnlc_percent = 3; 159 160 /* 161 * The number of headers to evict in arc_evict_state_impl() before 162 * dropping the sublist lock and evicting from another sublist. A lower 163 * value means we're more likely to evict the "correct" header (i.e. 
the 164 * oldest header in the arc state), but comes with higher overhead 165 * (i.e. more invocations of arc_evict_state_impl()). 166 */ 167 int zfs_arc_evict_batch_limit = 10; 168 169 /* 170 * The number of sublists used for each of the arc state lists. If this 171 * is not set to a suitable value by the user, it will be configured to 172 * the number of CPUs on the system in arc_init(). 173 */ 174 int zfs_arc_num_sublists_per_state = 0; 175 176 /* number of seconds before growing cache again */ 177 static int arc_grow_retry = 60; 178 179 /* shift of arc_c for calculating overflow limit in arc_get_data_buf */ 180 int zfs_arc_overflow_shift = 8; 181 182 /* shift of arc_c for calculating both min and max arc_p */ 183 static int arc_p_min_shift = 4; 184 185 /* log2(fraction of arc to reclaim) */ 186 static int arc_shrink_shift = 7; 187 188 /* 189 * log2(fraction of ARC which must be free to allow growing). 190 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, 191 * when reading a new block into the ARC, we will evict an equal-sized block 192 * from the ARC. 193 * 194 * This must be less than arc_shrink_shift, so that when we shrink the ARC, 195 * we will still not allow it to grow. 196 */ 197 int arc_no_grow_shift = 5; 198 199 200 /* 201 * minimum lifespan of a prefetch block in clock ticks 202 * (initialized in arc_init()) 203 */ 204 static int arc_min_prefetch_lifespan; 205 206 /* 207 * If this percent of memory is free, don't throttle. 208 */ 209 int arc_lotsfree_percent = 10; 210 211 static int arc_dead; 212 213 /* 214 * The arc has filled available memory and has now warmed up. 215 */ 216 static boolean_t arc_warm; 217 218 /* 219 * log2 fraction of the zio arena to keep free. 220 */ 221 int arc_zio_arena_free_shift = 2; 222 223 /* 224 * These tunables are for performance analysis. 225 */ 226 uint64_t zfs_arc_max; 227 uint64_t zfs_arc_min; 228 uint64_t zfs_arc_meta_limit = 0; 229 uint64_t zfs_arc_meta_min = 0; 230 int zfs_arc_grow_retry = 0; 231 int zfs_arc_shrink_shift = 0; 232 int zfs_arc_p_min_shift = 0; 233 int zfs_disable_dup_eviction = 0; 234 int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ 235 236 /* 237 * Note that buffers can be in one of 6 states: 238 * ARC_anon - anonymous (discussed below) 239 * ARC_mru - recently used, currently cached 240 * ARC_mru_ghost - recentely used, no longer in cache 241 * ARC_mfu - frequently used, currently cached 242 * ARC_mfu_ghost - frequently used, no longer in cache 243 * ARC_l2c_only - exists in L2ARC but not other states 244 * When there are no active references to the buffer, they are 245 * are linked onto a list in one of these arc states. These are 246 * the only buffers that can be evicted or deleted. Within each 247 * state there are multiple lists, one for meta-data and one for 248 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 249 * etc.) is tracked separately so that it can be managed more 250 * explicitly: favored over data, limited explicitly. 251 * 252 * Anonymous buffers are buffers that are not associated with 253 * a DVA. These are buffers that hold dirty block copies 254 * before they are written to stable storage. By definition, 255 * they are "ref'd" and are considered part of arc_mru 256 * that cannot be freed. Generally, they will aquire a DVA 257 * as they are written and migrate onto the arc_mru list. 258 * 259 * The ARC_l2c_only state is for buffers that are in the second 260 * level ARC but no longer in any of the ARC_m* lists. 
The second 261 * level ARC itself may also contain buffers that are in any of 262 * the ARC_m* states - meaning that a buffer can exist in two 263 * places. The reason for the ARC_l2c_only state is to keep the 264 * buffer header in the hash table, so that reads that hit the 265 * second level ARC benefit from these fast lookups. 266 */ 267 268 typedef struct arc_state { 269 /* 270 * list of evictable buffers 271 */ 272 multilist_t arcs_list[ARC_BUFC_NUMTYPES]; 273 /* 274 * total amount of evictable data in this state 275 */ 276 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; 277 /* 278 * total amount of data in this state; this includes: evictable, 279 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. 280 */ 281 refcount_t arcs_size; 282 } arc_state_t; 283 284 /* The 6 states: */ 285 static arc_state_t ARC_anon; 286 static arc_state_t ARC_mru; 287 static arc_state_t ARC_mru_ghost; 288 static arc_state_t ARC_mfu; 289 static arc_state_t ARC_mfu_ghost; 290 static arc_state_t ARC_l2c_only; 291 292 typedef struct arc_stats { 293 kstat_named_t arcstat_hits; 294 kstat_named_t arcstat_misses; 295 kstat_named_t arcstat_demand_data_hits; 296 kstat_named_t arcstat_demand_data_misses; 297 kstat_named_t arcstat_demand_metadata_hits; 298 kstat_named_t arcstat_demand_metadata_misses; 299 kstat_named_t arcstat_prefetch_data_hits; 300 kstat_named_t arcstat_prefetch_data_misses; 301 kstat_named_t arcstat_prefetch_metadata_hits; 302 kstat_named_t arcstat_prefetch_metadata_misses; 303 kstat_named_t arcstat_mru_hits; 304 kstat_named_t arcstat_mru_ghost_hits; 305 kstat_named_t arcstat_mfu_hits; 306 kstat_named_t arcstat_mfu_ghost_hits; 307 kstat_named_t arcstat_deleted; 308 /* 309 * Number of buffers that could not be evicted because the hash lock 310 * was held by another thread. The lock may not necessarily be held 311 * by something using the same buffer, since hash locks are shared 312 * by multiple buffers. 313 */ 314 kstat_named_t arcstat_mutex_miss; 315 /* 316 * Number of buffers skipped because they have I/O in progress, are 317 * indrect prefetch buffers that have not lived long enough, or are 318 * not from the spa we're trying to evict from. 319 */ 320 kstat_named_t arcstat_evict_skip; 321 /* 322 * Number of times arc_evict_state() was unable to evict enough 323 * buffers to reach it's target amount. 324 */ 325 kstat_named_t arcstat_evict_not_enough; 326 kstat_named_t arcstat_evict_l2_cached; 327 kstat_named_t arcstat_evict_l2_eligible; 328 kstat_named_t arcstat_evict_l2_ineligible; 329 kstat_named_t arcstat_evict_l2_skip; 330 kstat_named_t arcstat_hash_elements; 331 kstat_named_t arcstat_hash_elements_max; 332 kstat_named_t arcstat_hash_collisions; 333 kstat_named_t arcstat_hash_chains; 334 kstat_named_t arcstat_hash_chain_max; 335 kstat_named_t arcstat_p; 336 kstat_named_t arcstat_c; 337 kstat_named_t arcstat_c_min; 338 kstat_named_t arcstat_c_max; 339 kstat_named_t arcstat_size; 340 /* 341 * Number of bytes consumed by internal ARC structures necessary 342 * for tracking purposes; these structures are not actually 343 * backed by ARC buffers. This includes arc_buf_hdr_t structures 344 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 345 * caches), and arc_buf_t structures (allocated via arc_buf_t 346 * cache). 347 */ 348 kstat_named_t arcstat_hdr_size; 349 /* 350 * Number of bytes consumed by ARC buffers of type equal to 351 * ARC_BUFC_DATA. This is generally consumed by buffers backing 352 * on disk user data (e.g. plain file contents). 
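 *
 * A brief sketch tying these stats back to the arc_state_t definition
 * above (the helper name is hypothetical and not part of this file): the
 * evictable portion of a state is simply the sum of its per-type
 * arcs_lsize counters, while arcs_size also covers buffers that still
 * have active holds.
 *
 *	static uint64_t
 *	example_state_evictable(arc_state_t *state)
 *	{
 *		return (state->arcs_lsize[ARC_BUFC_DATA] +
 *		    state->arcs_lsize[ARC_BUFC_METADATA]);
 *	}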
353 */ 354 kstat_named_t arcstat_data_size; 355 /* 356 * Number of bytes consumed by ARC buffers of type equal to 357 * ARC_BUFC_METADATA. This is generally consumed by buffers 358 * backing on disk data that is used for internal ZFS 359 * structures (e.g. ZAP, dnode, indirect blocks, etc). 360 */ 361 kstat_named_t arcstat_metadata_size; 362 /* 363 * Number of bytes consumed by various buffers and structures 364 * not actually backed with ARC buffers. This includes bonus 365 * buffers (allocated directly via zio_buf_* functions), 366 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 367 * cache), and dnode_t structures (allocated via dnode_t cache). 368 */ 369 kstat_named_t arcstat_other_size; 370 /* 371 * Total number of bytes consumed by ARC buffers residing in the 372 * arc_anon state. This includes *all* buffers in the arc_anon 373 * state; e.g. data, metadata, evictable, and unevictable buffers 374 * are all included in this value. 375 */ 376 kstat_named_t arcstat_anon_size; 377 /* 378 * Number of bytes consumed by ARC buffers that meet the 379 * following criteria: backing buffers of type ARC_BUFC_DATA, 380 * residing in the arc_anon state, and are eligible for eviction 381 * (e.g. have no outstanding holds on the buffer). 382 */ 383 kstat_named_t arcstat_anon_evictable_data; 384 /* 385 * Number of bytes consumed by ARC buffers that meet the 386 * following criteria: backing buffers of type ARC_BUFC_METADATA, 387 * residing in the arc_anon state, and are eligible for eviction 388 * (e.g. have no outstanding holds on the buffer). 389 */ 390 kstat_named_t arcstat_anon_evictable_metadata; 391 /* 392 * Total number of bytes consumed by ARC buffers residing in the 393 * arc_mru state. This includes *all* buffers in the arc_mru 394 * state; e.g. data, metadata, evictable, and unevictable buffers 395 * are all included in this value. 396 */ 397 kstat_named_t arcstat_mru_size; 398 /* 399 * Number of bytes consumed by ARC buffers that meet the 400 * following criteria: backing buffers of type ARC_BUFC_DATA, 401 * residing in the arc_mru state, and are eligible for eviction 402 * (e.g. have no outstanding holds on the buffer). 403 */ 404 kstat_named_t arcstat_mru_evictable_data; 405 /* 406 * Number of bytes consumed by ARC buffers that meet the 407 * following criteria: backing buffers of type ARC_BUFC_METADATA, 408 * residing in the arc_mru state, and are eligible for eviction 409 * (e.g. have no outstanding holds on the buffer). 410 */ 411 kstat_named_t arcstat_mru_evictable_metadata; 412 /* 413 * Total number of bytes that *would have been* consumed by ARC 414 * buffers in the arc_mru_ghost state. The key thing to note 415 * here, is the fact that this size doesn't actually indicate 416 * RAM consumption. The ghost lists only consist of headers and 417 * don't actually have ARC buffers linked off of these headers. 418 * Thus, *if* the headers had associated ARC buffers, these 419 * buffers *would have* consumed this number of bytes. 420 */ 421 kstat_named_t arcstat_mru_ghost_size; 422 /* 423 * Number of bytes that *would have been* consumed by ARC 424 * buffers that are eligible for eviction, of type 425 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 426 */ 427 kstat_named_t arcstat_mru_ghost_evictable_data; 428 /* 429 * Number of bytes that *would have been* consumed by ARC 430 * buffers that are eligible for eviction, of type 431 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 
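 *
 * A worked example with hypothetical numbers: if the arc_mru_ghost list
 * holds 1000 headers that each describe a 128K block, then
 * arcstat_mru_ghost_size reports 1000 * 128K = 128000K (125MB), even
 * though the only RAM actually consumed is the headers themselves
 * (accounted for in arcstat_hdr_size), since ghost headers have no data
 * buffers attached.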
432 */ 433 kstat_named_t arcstat_mru_ghost_evictable_metadata; 434 /* 435 * Total number of bytes consumed by ARC buffers residing in the 436 * arc_mfu state. This includes *all* buffers in the arc_mfu 437 * state; e.g. data, metadata, evictable, and unevictable buffers 438 * are all included in this value. 439 */ 440 kstat_named_t arcstat_mfu_size; 441 /* 442 * Number of bytes consumed by ARC buffers that are eligible for 443 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 444 * state. 445 */ 446 kstat_named_t arcstat_mfu_evictable_data; 447 /* 448 * Number of bytes consumed by ARC buffers that are eligible for 449 * eviction, of type ARC_BUFC_METADATA, and reside in the 450 * arc_mfu state. 451 */ 452 kstat_named_t arcstat_mfu_evictable_metadata; 453 /* 454 * Total number of bytes that *would have been* consumed by ARC 455 * buffers in the arc_mfu_ghost state. See the comment above 456 * arcstat_mru_ghost_size for more details. 457 */ 458 kstat_named_t arcstat_mfu_ghost_size; 459 /* 460 * Number of bytes that *would have been* consumed by ARC 461 * buffers that are eligible for eviction, of type 462 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 463 */ 464 kstat_named_t arcstat_mfu_ghost_evictable_data; 465 /* 466 * Number of bytes that *would have been* consumed by ARC 467 * buffers that are eligible for eviction, of type 468 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 469 */ 470 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 471 kstat_named_t arcstat_l2_hits; 472 kstat_named_t arcstat_l2_misses; 473 kstat_named_t arcstat_l2_feeds; 474 kstat_named_t arcstat_l2_rw_clash; 475 kstat_named_t arcstat_l2_read_bytes; 476 kstat_named_t arcstat_l2_write_bytes; 477 kstat_named_t arcstat_l2_writes_sent; 478 kstat_named_t arcstat_l2_writes_done; 479 kstat_named_t arcstat_l2_writes_error; 480 kstat_named_t arcstat_l2_writes_lock_retry; 481 kstat_named_t arcstat_l2_evict_lock_retry; 482 kstat_named_t arcstat_l2_evict_reading; 483 kstat_named_t arcstat_l2_evict_l1cached; 484 kstat_named_t arcstat_l2_free_on_write; 485 kstat_named_t arcstat_l2_cdata_free_on_write; 486 kstat_named_t arcstat_l2_abort_lowmem; 487 kstat_named_t arcstat_l2_cksum_bad; 488 kstat_named_t arcstat_l2_io_error; 489 kstat_named_t arcstat_l2_size; 490 kstat_named_t arcstat_l2_asize; 491 kstat_named_t arcstat_l2_hdr_size; 492 kstat_named_t arcstat_l2_compress_successes; 493 kstat_named_t arcstat_l2_compress_zeros; 494 kstat_named_t arcstat_l2_compress_failures; 495 kstat_named_t arcstat_memory_throttle_count; 496 kstat_named_t arcstat_duplicate_buffers; 497 kstat_named_t arcstat_duplicate_buffers_size; 498 kstat_named_t arcstat_duplicate_reads; 499 kstat_named_t arcstat_meta_used; 500 kstat_named_t arcstat_meta_limit; 501 kstat_named_t arcstat_meta_max; 502 kstat_named_t arcstat_meta_min; 503 kstat_named_t arcstat_sync_wait_for_async; 504 kstat_named_t arcstat_demand_hit_predictive_prefetch; 505 } arc_stats_t; 506 507 static arc_stats_t arc_stats = { 508 { "hits", KSTAT_DATA_UINT64 }, 509 { "misses", KSTAT_DATA_UINT64 }, 510 { "demand_data_hits", KSTAT_DATA_UINT64 }, 511 { "demand_data_misses", KSTAT_DATA_UINT64 }, 512 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 513 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 514 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 515 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 516 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 517 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 518 { "mru_hits", KSTAT_DATA_UINT64 }, 519 { "mru_ghost_hits", 
KSTAT_DATA_UINT64 }, 520 { "mfu_hits", KSTAT_DATA_UINT64 }, 521 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 522 { "deleted", KSTAT_DATA_UINT64 }, 523 { "mutex_miss", KSTAT_DATA_UINT64 }, 524 { "evict_skip", KSTAT_DATA_UINT64 }, 525 { "evict_not_enough", KSTAT_DATA_UINT64 }, 526 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 527 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 528 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 529 { "evict_l2_skip", KSTAT_DATA_UINT64 }, 530 { "hash_elements", KSTAT_DATA_UINT64 }, 531 { "hash_elements_max", KSTAT_DATA_UINT64 }, 532 { "hash_collisions", KSTAT_DATA_UINT64 }, 533 { "hash_chains", KSTAT_DATA_UINT64 }, 534 { "hash_chain_max", KSTAT_DATA_UINT64 }, 535 { "p", KSTAT_DATA_UINT64 }, 536 { "c", KSTAT_DATA_UINT64 }, 537 { "c_min", KSTAT_DATA_UINT64 }, 538 { "c_max", KSTAT_DATA_UINT64 }, 539 { "size", KSTAT_DATA_UINT64 }, 540 { "hdr_size", KSTAT_DATA_UINT64 }, 541 { "data_size", KSTAT_DATA_UINT64 }, 542 { "metadata_size", KSTAT_DATA_UINT64 }, 543 { "other_size", KSTAT_DATA_UINT64 }, 544 { "anon_size", KSTAT_DATA_UINT64 }, 545 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 546 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 547 { "mru_size", KSTAT_DATA_UINT64 }, 548 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 549 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 550 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 551 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, 552 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 553 { "mfu_size", KSTAT_DATA_UINT64 }, 554 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 555 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 556 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 557 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 558 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 559 { "l2_hits", KSTAT_DATA_UINT64 }, 560 { "l2_misses", KSTAT_DATA_UINT64 }, 561 { "l2_feeds", KSTAT_DATA_UINT64 }, 562 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 563 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 564 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 565 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 566 { "l2_writes_done", KSTAT_DATA_UINT64 }, 567 { "l2_writes_error", KSTAT_DATA_UINT64 }, 568 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, 569 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 570 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 571 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 572 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 573 { "l2_cdata_free_on_write", KSTAT_DATA_UINT64 }, 574 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 575 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 576 { "l2_io_error", KSTAT_DATA_UINT64 }, 577 { "l2_size", KSTAT_DATA_UINT64 }, 578 { "l2_asize", KSTAT_DATA_UINT64 }, 579 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 580 { "l2_compress_successes", KSTAT_DATA_UINT64 }, 581 { "l2_compress_zeros", KSTAT_DATA_UINT64 }, 582 { "l2_compress_failures", KSTAT_DATA_UINT64 }, 583 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 584 { "duplicate_buffers", KSTAT_DATA_UINT64 }, 585 { "duplicate_buffers_size", KSTAT_DATA_UINT64 }, 586 { "duplicate_reads", KSTAT_DATA_UINT64 }, 587 { "arc_meta_used", KSTAT_DATA_UINT64 }, 588 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 589 { "arc_meta_max", KSTAT_DATA_UINT64 }, 590 { "arc_meta_min", KSTAT_DATA_UINT64 }, 591 { "sync_wait_for_async", KSTAT_DATA_UINT64 }, 592 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, 593 }; 594 595 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 596 597 #define ARCSTAT_INCR(stat, val) \ 598 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 599 600 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 601 
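/*
 * Example usage of the kstat helpers defined above (an illustrative
 * sketch; equivalent calls appear throughout this file).  Each one
 * expands to an atomic_add_64() on the named arc_stats field, so no
 * lock needs to be held:
 *
 *	ARCSTAT_BUMP(arcstat_hits);		count one more cache hit
 *	ARCSTAT_INCR(arcstat_size, 8192);	grow the reported ARC size by 8K
 *	ARCSTAT_INCR(arcstat_size, -8192);	and shrink it again
 */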
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 602 603 #define ARCSTAT_MAX(stat, val) { \ 604 uint64_t m; \ 605 while ((val) > (m = arc_stats.stat.value.ui64) && \ 606 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 607 continue; \ 608 } 609 610 #define ARCSTAT_MAXSTAT(stat) \ 611 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 612 613 /* 614 * We define a macro to allow ARC hits/misses to be easily broken down by 615 * two separate conditions, giving a total of four different subtypes for 616 * each of hits and misses (so eight statistics total). 617 */ 618 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 619 if (cond1) { \ 620 if (cond2) { \ 621 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 622 } else { \ 623 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 624 } \ 625 } else { \ 626 if (cond2) { \ 627 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 628 } else { \ 629 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 630 } \ 631 } 632 633 kstat_t *arc_ksp; 634 static arc_state_t *arc_anon; 635 static arc_state_t *arc_mru; 636 static arc_state_t *arc_mru_ghost; 637 static arc_state_t *arc_mfu; 638 static arc_state_t *arc_mfu_ghost; 639 static arc_state_t *arc_l2c_only; 640 641 /* 642 * There are several ARC variables that are critical to export as kstats -- 643 * but we don't want to have to grovel around in the kstat whenever we wish to 644 * manipulate them. For these variables, we therefore define them to be in 645 * terms of the statistic variable. This assures that we are not introducing 646 * the possibility of inconsistency by having shadow copies of the variables, 647 * while still allowing the code to be readable. 648 */ 649 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 650 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 651 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 652 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 653 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 654 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 655 #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 656 #define arc_meta_used ARCSTAT(arcstat_meta_used) /* size of metadata */ 657 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 658 659 #define L2ARC_IS_VALID_COMPRESS(_c_) \ 660 ((_c_) == ZIO_COMPRESS_LZ4 || (_c_) == ZIO_COMPRESS_EMPTY) 661 662 static int arc_no_grow; /* Don't try to grow cache size */ 663 static uint64_t arc_tempreserve; 664 static uint64_t arc_loaned_bytes; 665 666 typedef struct arc_callback arc_callback_t; 667 668 struct arc_callback { 669 void *acb_private; 670 arc_done_func_t *acb_done; 671 arc_buf_t *acb_buf; 672 zio_t *acb_zio_dummy; 673 arc_callback_t *acb_next; 674 }; 675 676 typedef struct arc_write_callback arc_write_callback_t; 677 678 struct arc_write_callback { 679 void *awcb_private; 680 arc_done_func_t *awcb_ready; 681 arc_done_func_t *awcb_children_ready; 682 arc_done_func_t *awcb_physdone; 683 arc_done_func_t *awcb_done; 684 arc_buf_t *awcb_buf; 685 }; 686 687 /* 688 * ARC buffers are separated into multiple structs as a memory saving measure: 689 * - Common fields struct, always defined, and embedded within it: 690 * - L2-only fields, always allocated but undefined when not in L2ARC 691 * - L1-only fields, only allocated when in L1ARC 692 * 693 * Buffer in L1 Buffer only in L2 694 * +------------------------+ 
+------------------------+ 695 * | arc_buf_hdr_t | | arc_buf_hdr_t | 696 * | | | | 697 * | | | | 698 * | | | | 699 * +------------------------+ +------------------------+ 700 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 701 * | (undefined if L1-only) | | | 702 * +------------------------+ +------------------------+ 703 * | l1arc_buf_hdr_t | 704 * | | 705 * | | 706 * | | 707 * | | 708 * +------------------------+ 709 * 710 * Because it's possible for the L2ARC to become extremely large, we can wind 711 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 712 * is minimized by only allocating the fields necessary for an L1-cached buffer 713 * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and 714 * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple 715 * words in pointers. arc_hdr_realloc() is used to switch a header between 716 * these two allocation states. 717 */ 718 typedef struct l1arc_buf_hdr { 719 kmutex_t b_freeze_lock; 720 #ifdef ZFS_DEBUG 721 /* 722 * used for debugging wtih kmem_flags - by allocating and freeing 723 * b_thawed when the buffer is thawed, we get a record of the stack 724 * trace that thawed it. 725 */ 726 void *b_thawed; 727 #endif 728 729 arc_buf_t *b_buf; 730 uint32_t b_datacnt; 731 /* for waiting on writes to complete */ 732 kcondvar_t b_cv; 733 734 /* protected by arc state mutex */ 735 arc_state_t *b_state; 736 multilist_node_t b_arc_node; 737 738 /* updated atomically */ 739 clock_t b_arc_access; 740 741 /* self protecting */ 742 refcount_t b_refcnt; 743 744 arc_callback_t *b_acb; 745 /* temporary buffer holder for in-flight compressed data */ 746 void *b_tmp_cdata; 747 } l1arc_buf_hdr_t; 748 749 typedef struct l2arc_dev l2arc_dev_t; 750 751 typedef struct l2arc_buf_hdr { 752 /* protected by arc_buf_hdr mutex */ 753 l2arc_dev_t *b_dev; /* L2ARC device */ 754 uint64_t b_daddr; /* disk address, offset byte */ 755 /* real alloc'd buffer size depending on b_compress applied */ 756 int32_t b_asize; 757 uint8_t b_compress; 758 759 list_node_t b_l2node; 760 } l2arc_buf_hdr_t; 761 762 struct arc_buf_hdr { 763 /* protected by hash lock */ 764 dva_t b_dva; 765 uint64_t b_birth; 766 /* 767 * Even though this checksum is only set/verified when a buffer is in 768 * the L1 cache, it needs to be in the set of common fields because it 769 * must be preserved from the time before a buffer is written out to 770 * L2ARC until after it is read back in. 771 */ 772 zio_cksum_t *b_freeze_cksum; 773 774 arc_buf_hdr_t *b_hash_next; 775 arc_flags_t b_flags; 776 777 /* immutable */ 778 int32_t b_size; 779 uint64_t b_spa; 780 781 /* L2ARC fields. Undefined when not in L2ARC. */ 782 l2arc_buf_hdr_t b_l2hdr; 783 /* L1ARC fields. 
Undefined when in l2arc_only state */ 784 l1arc_buf_hdr_t b_l1hdr; 785 }; 786 787 static arc_buf_t *arc_eviction_list; 788 static arc_buf_hdr_t arc_eviction_hdr; 789 790 #define GHOST_STATE(state) \ 791 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 792 (state) == arc_l2c_only) 793 794 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 795 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 796 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 797 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 798 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FLAG_FREED_IN_READ) 799 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_FLAG_BUF_AVAILABLE) 800 801 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 802 #define HDR_L2COMPRESS(hdr) ((hdr)->b_flags & ARC_FLAG_L2COMPRESS) 803 #define HDR_L2_READING(hdr) \ 804 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 805 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 806 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 807 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 808 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 809 810 #define HDR_ISTYPE_METADATA(hdr) \ 811 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 812 #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 813 814 #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 815 #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 816 817 /* 818 * Other sizes 819 */ 820 821 #define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 822 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 823 824 /* 825 * Hash table routines 826 */ 827 828 #define HT_LOCK_PAD 64 829 830 struct ht_lock { 831 kmutex_t ht_lock; 832 #ifdef _KERNEL 833 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 834 #endif 835 }; 836 837 #define BUF_LOCKS 256 838 typedef struct buf_hash_table { 839 uint64_t ht_mask; 840 arc_buf_hdr_t **ht_table; 841 struct ht_lock ht_locks[BUF_LOCKS]; 842 } buf_hash_table_t; 843 844 static buf_hash_table_t buf_hash_table; 845 846 #define BUF_HASH_INDEX(spa, dva, birth) \ 847 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 848 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 849 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 850 #define HDR_LOCK(hdr) \ 851 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 852 853 uint64_t zfs_crc64_table[256]; 854 855 /* 856 * Level 2 ARC 857 */ 858 859 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 860 #define L2ARC_HEADROOM 2 /* num of writes */ 861 /* 862 * If we discover during ARC scan any buffers to be compressed, we boost 863 * our headroom for the next scanning cycle by this percentage multiple. 864 */ 865 #define L2ARC_HEADROOM_BOOST 200 866 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 867 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 868 869 /* 870 * Used to distinguish headers that are being process by 871 * l2arc_write_buffers(), but have yet to be assigned to a l2arc disk 872 * address. This can happen when the header is added to the l2arc's list 873 * of buffers to write in the first stage of l2arc_write_buffers(), but 874 * has not yet been written out which happens in the second stage of 875 * l2arc_write_buffers(). 
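 *
 * In other words (an illustrative sketch, not code quoted from this
 * file): between the two stages such a header can be recognized by
 *
 *	hdr->b_l2hdr.b_daddr == L2ARC_ADDR_UNSET
 *
 * and only once the write has been issued does b_daddr hold a real
 * device offset between l2ad_start and l2ad_end.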
876 */ 877 #define L2ARC_ADDR_UNSET ((uint64_t)(-1)) 878 879 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 880 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 881 882 /* L2ARC Performance Tunables */ 883 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 884 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 885 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 886 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 887 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 888 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 889 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 890 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 891 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 892 893 /* 894 * L2ARC Internals 895 */ 896 struct l2arc_dev { 897 vdev_t *l2ad_vdev; /* vdev */ 898 spa_t *l2ad_spa; /* spa */ 899 uint64_t l2ad_hand; /* next write location */ 900 uint64_t l2ad_start; /* first addr on device */ 901 uint64_t l2ad_end; /* last addr on device */ 902 boolean_t l2ad_first; /* first sweep through */ 903 boolean_t l2ad_writing; /* currently writing */ 904 kmutex_t l2ad_mtx; /* lock for buffer list */ 905 list_t l2ad_buflist; /* buffer list */ 906 list_node_t l2ad_node; /* device list node */ 907 refcount_t l2ad_alloc; /* allocated bytes */ 908 }; 909 910 static list_t L2ARC_dev_list; /* device list */ 911 static list_t *l2arc_dev_list; /* device list pointer */ 912 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 913 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 914 static list_t L2ARC_free_on_write; /* free after write buf list */ 915 static list_t *l2arc_free_on_write; /* free after write list ptr */ 916 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 917 static uint64_t l2arc_ndev; /* number of devices */ 918 919 typedef struct l2arc_read_callback { 920 arc_buf_t *l2rcb_buf; /* read buffer */ 921 spa_t *l2rcb_spa; /* spa */ 922 blkptr_t l2rcb_bp; /* original blkptr */ 923 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 924 int l2rcb_flags; /* original flags */ 925 enum zio_compress l2rcb_compress; /* applied compress */ 926 } l2arc_read_callback_t; 927 928 typedef struct l2arc_write_callback { 929 l2arc_dev_t *l2wcb_dev; /* device info */ 930 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 931 } l2arc_write_callback_t; 932 933 typedef struct l2arc_data_free { 934 /* protected by l2arc_free_on_write_mtx */ 935 void *l2df_data; 936 size_t l2df_size; 937 void (*l2df_func)(void *, size_t); 938 list_node_t l2df_list_node; 939 } l2arc_data_free_t; 940 941 static kmutex_t l2arc_feed_thr_lock; 942 static kcondvar_t l2arc_feed_thr_cv; 943 static uint8_t l2arc_thread_exit; 944 945 static void arc_get_data_buf(arc_buf_t *); 946 static void arc_access(arc_buf_hdr_t *, kmutex_t *); 947 static boolean_t arc_is_overflowing(); 948 static void arc_buf_watch(arc_buf_t *); 949 950 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 951 static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 952 953 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 954 static void l2arc_read_done(zio_t *); 955 956 static boolean_t l2arc_compress_buf(arc_buf_hdr_t *); 957 static void l2arc_decompress_zio(zio_t *, arc_buf_hdr_t *, enum zio_compress); 958 static void l2arc_release_cdata_buf(arc_buf_hdr_t *); 959 960 static uint64_t 961 buf_hash(uint64_t spa, const dva_t *dva, uint64_t 
birth) 962 { 963 uint8_t *vdva = (uint8_t *)dva; 964 uint64_t crc = -1ULL; 965 int i; 966 967 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 968 969 for (i = 0; i < sizeof (dva_t); i++) 970 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 971 972 crc ^= (spa>>8) ^ birth; 973 974 return (crc); 975 } 976 977 #define BUF_EMPTY(buf) \ 978 ((buf)->b_dva.dva_word[0] == 0 && \ 979 (buf)->b_dva.dva_word[1] == 0) 980 981 #define BUF_EQUAL(spa, dva, birth, buf) \ 982 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 983 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 984 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 985 986 static void 987 buf_discard_identity(arc_buf_hdr_t *hdr) 988 { 989 hdr->b_dva.dva_word[0] = 0; 990 hdr->b_dva.dva_word[1] = 0; 991 hdr->b_birth = 0; 992 } 993 994 static arc_buf_hdr_t * 995 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 996 { 997 const dva_t *dva = BP_IDENTITY(bp); 998 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 999 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1000 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1001 arc_buf_hdr_t *hdr; 1002 1003 mutex_enter(hash_lock); 1004 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1005 hdr = hdr->b_hash_next) { 1006 if (BUF_EQUAL(spa, dva, birth, hdr)) { 1007 *lockp = hash_lock; 1008 return (hdr); 1009 } 1010 } 1011 mutex_exit(hash_lock); 1012 *lockp = NULL; 1013 return (NULL); 1014 } 1015 1016 /* 1017 * Insert an entry into the hash table. If there is already an element 1018 * equal to elem in the hash table, then the already existing element 1019 * will be returned and the new element will not be inserted. 1020 * Otherwise returns NULL. 1021 * If lockp == NULL, the caller is assumed to already hold the hash lock. 1022 */ 1023 static arc_buf_hdr_t * 1024 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1025 { 1026 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1027 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1028 arc_buf_hdr_t *fhdr; 1029 uint32_t i; 1030 1031 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1032 ASSERT(hdr->b_birth != 0); 1033 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1034 1035 if (lockp != NULL) { 1036 *lockp = hash_lock; 1037 mutex_enter(hash_lock); 1038 } else { 1039 ASSERT(MUTEX_HELD(hash_lock)); 1040 } 1041 1042 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1043 fhdr = fhdr->b_hash_next, i++) { 1044 if (BUF_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1045 return (fhdr); 1046 } 1047 1048 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1049 buf_hash_table.ht_table[idx] = hdr; 1050 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 1051 1052 /* collect some hash table performance data */ 1053 if (i > 0) { 1054 ARCSTAT_BUMP(arcstat_hash_collisions); 1055 if (i == 1) 1056 ARCSTAT_BUMP(arcstat_hash_chains); 1057 1058 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1059 } 1060 1061 ARCSTAT_BUMP(arcstat_hash_elements); 1062 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1063 1064 return (NULL); 1065 } 1066 1067 static void 1068 buf_hash_remove(arc_buf_hdr_t *hdr) 1069 { 1070 arc_buf_hdr_t *fhdr, **hdrp; 1071 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1072 1073 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1074 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1075 1076 hdrp = &buf_hash_table.ht_table[idx]; 1077 while ((fhdr = *hdrp) != hdr) { 1078 ASSERT(fhdr != NULL); 1079 hdrp = &fhdr->b_hash_next; 1080 } 1081 *hdrp = hdr->b_hash_next; 1082 hdr->b_hash_next = NULL; 1083 hdr->b_flags &= ~ARC_FLAG_IN_HASH_TABLE; 1084 1085 /* collect some hash table 
performance data */ 1086 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1087 1088 if (buf_hash_table.ht_table[idx] && 1089 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1090 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1091 } 1092 1093 /* 1094 * Global data structures and functions for the buf kmem cache. 1095 */ 1096 static kmem_cache_t *hdr_full_cache; 1097 static kmem_cache_t *hdr_l2only_cache; 1098 static kmem_cache_t *buf_cache; 1099 1100 static void 1101 buf_fini(void) 1102 { 1103 int i; 1104 1105 kmem_free(buf_hash_table.ht_table, 1106 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1107 for (i = 0; i < BUF_LOCKS; i++) 1108 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1109 kmem_cache_destroy(hdr_full_cache); 1110 kmem_cache_destroy(hdr_l2only_cache); 1111 kmem_cache_destroy(buf_cache); 1112 } 1113 1114 /* 1115 * Constructor callback - called when the cache is empty 1116 * and a new buf is requested. 1117 */ 1118 /* ARGSUSED */ 1119 static int 1120 hdr_full_cons(void *vbuf, void *unused, int kmflag) 1121 { 1122 arc_buf_hdr_t *hdr = vbuf; 1123 1124 bzero(hdr, HDR_FULL_SIZE); 1125 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1126 refcount_create(&hdr->b_l1hdr.b_refcnt); 1127 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1128 multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1129 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1130 1131 return (0); 1132 } 1133 1134 /* ARGSUSED */ 1135 static int 1136 hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1137 { 1138 arc_buf_hdr_t *hdr = vbuf; 1139 1140 bzero(hdr, HDR_L2ONLY_SIZE); 1141 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1142 1143 return (0); 1144 } 1145 1146 /* ARGSUSED */ 1147 static int 1148 buf_cons(void *vbuf, void *unused, int kmflag) 1149 { 1150 arc_buf_t *buf = vbuf; 1151 1152 bzero(buf, sizeof (arc_buf_t)); 1153 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1154 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1155 1156 return (0); 1157 } 1158 1159 /* 1160 * Destructor callback - called when a cached buf is 1161 * no longer required. 1162 */ 1163 /* ARGSUSED */ 1164 static void 1165 hdr_full_dest(void *vbuf, void *unused) 1166 { 1167 arc_buf_hdr_t *hdr = vbuf; 1168 1169 ASSERT(BUF_EMPTY(hdr)); 1170 cv_destroy(&hdr->b_l1hdr.b_cv); 1171 refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1172 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1173 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1174 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1175 } 1176 1177 /* ARGSUSED */ 1178 static void 1179 hdr_l2only_dest(void *vbuf, void *unused) 1180 { 1181 arc_buf_hdr_t *hdr = vbuf; 1182 1183 ASSERT(BUF_EMPTY(hdr)); 1184 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1185 } 1186 1187 /* ARGSUSED */ 1188 static void 1189 buf_dest(void *vbuf, void *unused) 1190 { 1191 arc_buf_t *buf = vbuf; 1192 1193 mutex_destroy(&buf->b_evict_lock); 1194 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1195 } 1196 1197 /* 1198 * Reclaim callback -- invoked when memory is low. 1199 */ 1200 /* ARGSUSED */ 1201 static void 1202 hdr_recl(void *unused) 1203 { 1204 dprintf("hdr_recl called\n"); 1205 /* 1206 * umem calls the reclaim func when we destroy the buf cache, 1207 * which is after we do arc_fini(). 
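 *
 * For context (a sketch based on buf_init() below, where this callback
 * is registered as the reclaim hook for the header caches):
 *
 *	hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full",
 *	    HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl,
 *	    NULL, NULL, 0);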
1208 */ 1209 if (!arc_dead) 1210 cv_signal(&arc_reclaim_thread_cv); 1211 } 1212 1213 static void 1214 buf_init(void) 1215 { 1216 uint64_t *ct; 1217 uint64_t hsize = 1ULL << 12; 1218 int i, j; 1219 1220 /* 1221 * The hash table is big enough to fill all of physical memory 1222 * with an average block size of zfs_arc_average_blocksize (default 8K). 1223 * By default, the table will take up 1224 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 1225 */ 1226 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) 1227 hsize <<= 1; 1228 retry: 1229 buf_hash_table.ht_mask = hsize - 1; 1230 buf_hash_table.ht_table = 1231 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1232 if (buf_hash_table.ht_table == NULL) { 1233 ASSERT(hsize > (1ULL << 8)); 1234 hsize >>= 1; 1235 goto retry; 1236 } 1237 1238 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1239 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1240 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1241 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1242 NULL, NULL, 0); 1243 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1244 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1245 1246 for (i = 0; i < 256; i++) 1247 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1248 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1249 1250 for (i = 0; i < BUF_LOCKS; i++) { 1251 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1252 NULL, MUTEX_DEFAULT, NULL); 1253 } 1254 } 1255 1256 /* 1257 * Transition between the two allocation states for the arc_buf_hdr struct. 1258 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 1259 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 1260 * version is used when a cache buffer is only in the L2ARC in order to reduce 1261 * memory usage. 1262 */ 1263 static arc_buf_hdr_t * 1264 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 1265 { 1266 ASSERT(HDR_HAS_L2HDR(hdr)); 1267 1268 arc_buf_hdr_t *nhdr; 1269 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 1270 1271 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 1272 (old == hdr_l2only_cache && new == hdr_full_cache)); 1273 1274 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 1275 1276 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 1277 buf_hash_remove(hdr); 1278 1279 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 1280 1281 if (new == hdr_full_cache) { 1282 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1283 /* 1284 * arc_access and arc_change_state need to be aware that a 1285 * header has just come out of L2ARC, so we set its state to 1286 * l2c_only even though it's about to change. 1287 */ 1288 nhdr->b_l1hdr.b_state = arc_l2c_only; 1289 1290 /* Verify previous threads set to NULL before freeing */ 1291 ASSERT3P(nhdr->b_l1hdr.b_tmp_cdata, ==, NULL); 1292 } else { 1293 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1294 ASSERT0(hdr->b_l1hdr.b_datacnt); 1295 1296 /* 1297 * If we've reached here, We must have been called from 1298 * arc_evict_hdr(), as such we should have already been 1299 * removed from any ghost list we were previously on 1300 * (which protects us from racing with arc_evict_state), 1301 * thus no locking is needed during this check. 1302 */ 1303 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1304 1305 /* 1306 * A buffer must not be moved into the arc_l2c_only 1307 * state if it's not finished being written out to the 1308 * l2arc device. 
Otherwise, the b_l1hdr.b_tmp_cdata field 1309 * might try to be accessed, even though it was removed. 1310 */ 1311 VERIFY(!HDR_L2_WRITING(hdr)); 1312 VERIFY3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 1313 1314 #ifdef ZFS_DEBUG 1315 if (hdr->b_l1hdr.b_thawed != NULL) { 1316 kmem_free(hdr->b_l1hdr.b_thawed, 1); 1317 hdr->b_l1hdr.b_thawed = NULL; 1318 } 1319 #endif 1320 1321 nhdr->b_flags &= ~ARC_FLAG_HAS_L1HDR; 1322 } 1323 /* 1324 * The header has been reallocated so we need to re-insert it into any 1325 * lists it was on. 1326 */ 1327 (void) buf_hash_insert(nhdr, NULL); 1328 1329 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 1330 1331 mutex_enter(&dev->l2ad_mtx); 1332 1333 /* 1334 * We must place the realloc'ed header back into the list at 1335 * the same spot. Otherwise, if it's placed earlier in the list, 1336 * l2arc_write_buffers() could find it during the function's 1337 * write phase, and try to write it out to the l2arc. 1338 */ 1339 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 1340 list_remove(&dev->l2ad_buflist, hdr); 1341 1342 mutex_exit(&dev->l2ad_mtx); 1343 1344 /* 1345 * Since we're using the pointer address as the tag when 1346 * incrementing and decrementing the l2ad_alloc refcount, we 1347 * must remove the old pointer (that we're about to destroy) and 1348 * add the new pointer to the refcount. Otherwise we'd remove 1349 * the wrong pointer address when calling arc_hdr_destroy() later. 1350 */ 1351 1352 (void) refcount_remove_many(&dev->l2ad_alloc, 1353 hdr->b_l2hdr.b_asize, hdr); 1354 1355 (void) refcount_add_many(&dev->l2ad_alloc, 1356 nhdr->b_l2hdr.b_asize, nhdr); 1357 1358 buf_discard_identity(hdr); 1359 hdr->b_freeze_cksum = NULL; 1360 kmem_cache_free(old, hdr); 1361 1362 return (nhdr); 1363 } 1364 1365 1366 #define ARC_MINTIME (hz>>4) /* 62 ms */ 1367 1368 static void 1369 arc_cksum_verify(arc_buf_t *buf) 1370 { 1371 zio_cksum_t zc; 1372 1373 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1374 return; 1375 1376 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1377 if (buf->b_hdr->b_freeze_cksum == NULL || HDR_IO_ERROR(buf->b_hdr)) { 1378 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1379 return; 1380 } 1381 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); 1382 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 1383 panic("buffer modified while frozen!"); 1384 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1385 } 1386 1387 static int 1388 arc_cksum_equal(arc_buf_t *buf) 1389 { 1390 zio_cksum_t zc; 1391 int equal; 1392 1393 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1394 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, NULL, &zc); 1395 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 1396 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1397 1398 return (equal); 1399 } 1400 1401 static void 1402 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 1403 { 1404 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 1405 return; 1406 1407 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1408 if (buf->b_hdr->b_freeze_cksum != NULL) { 1409 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1410 return; 1411 } 1412 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 1413 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 1414 NULL, buf->b_hdr->b_freeze_cksum); 1415 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1416 arc_buf_watch(buf); 1417 } 1418 1419 #ifndef _KERNEL 1420 typedef struct procctl { 1421 long cmd; 1422 prwatch_t prwatch; 1423 } procctl_t; 1424 #endif 1425 1426 /* ARGSUSED */ 1427 static void 1428 
arc_buf_unwatch(arc_buf_t *buf) 1429 { 1430 #ifndef _KERNEL 1431 if (arc_watch) { 1432 int result; 1433 procctl_t ctl; 1434 ctl.cmd = PCWATCH; 1435 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1436 ctl.prwatch.pr_size = 0; 1437 ctl.prwatch.pr_wflags = 0; 1438 result = write(arc_procfd, &ctl, sizeof (ctl)); 1439 ASSERT3U(result, ==, sizeof (ctl)); 1440 } 1441 #endif 1442 } 1443 1444 /* ARGSUSED */ 1445 static void 1446 arc_buf_watch(arc_buf_t *buf) 1447 { 1448 #ifndef _KERNEL 1449 if (arc_watch) { 1450 int result; 1451 procctl_t ctl; 1452 ctl.cmd = PCWATCH; 1453 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1454 ctl.prwatch.pr_size = buf->b_hdr->b_size; 1455 ctl.prwatch.pr_wflags = WA_WRITE; 1456 result = write(arc_procfd, &ctl, sizeof (ctl)); 1457 ASSERT3U(result, ==, sizeof (ctl)); 1458 } 1459 #endif 1460 } 1461 1462 static arc_buf_contents_t 1463 arc_buf_type(arc_buf_hdr_t *hdr) 1464 { 1465 if (HDR_ISTYPE_METADATA(hdr)) { 1466 return (ARC_BUFC_METADATA); 1467 } else { 1468 return (ARC_BUFC_DATA); 1469 } 1470 } 1471 1472 static uint32_t 1473 arc_bufc_to_flags(arc_buf_contents_t type) 1474 { 1475 switch (type) { 1476 case ARC_BUFC_DATA: 1477 /* metadata field is 0 if buffer contains normal data */ 1478 return (0); 1479 case ARC_BUFC_METADATA: 1480 return (ARC_FLAG_BUFC_METADATA); 1481 default: 1482 break; 1483 } 1484 panic("undefined ARC buffer type!"); 1485 return ((uint32_t)-1); 1486 } 1487 1488 void 1489 arc_buf_thaw(arc_buf_t *buf) 1490 { 1491 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1492 if (buf->b_hdr->b_l1hdr.b_state != arc_anon) 1493 panic("modifying non-anon buffer!"); 1494 if (HDR_IO_IN_PROGRESS(buf->b_hdr)) 1495 panic("modifying buffer while i/o in progress!"); 1496 arc_cksum_verify(buf); 1497 } 1498 1499 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1500 if (buf->b_hdr->b_freeze_cksum != NULL) { 1501 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1502 buf->b_hdr->b_freeze_cksum = NULL; 1503 } 1504 1505 #ifdef ZFS_DEBUG 1506 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1507 if (buf->b_hdr->b_l1hdr.b_thawed != NULL) 1508 kmem_free(buf->b_hdr->b_l1hdr.b_thawed, 1); 1509 buf->b_hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1510 } 1511 #endif 1512 1513 mutex_exit(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1514 1515 arc_buf_unwatch(buf); 1516 } 1517 1518 void 1519 arc_buf_freeze(arc_buf_t *buf) 1520 { 1521 kmutex_t *hash_lock; 1522 1523 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1524 return; 1525 1526 hash_lock = HDR_LOCK(buf->b_hdr); 1527 mutex_enter(hash_lock); 1528 1529 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 1530 buf->b_hdr->b_l1hdr.b_state == arc_anon); 1531 arc_cksum_compute(buf, B_FALSE); 1532 mutex_exit(hash_lock); 1533 1534 } 1535 1536 static void 1537 add_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1538 { 1539 ASSERT(HDR_HAS_L1HDR(hdr)); 1540 ASSERT(MUTEX_HELD(hash_lock)); 1541 arc_state_t *state = hdr->b_l1hdr.b_state; 1542 1543 if ((refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 1544 (state != arc_anon)) { 1545 /* We don't use the L2-only state list. 
*/ 1546 if (state != arc_l2c_only) { 1547 arc_buf_contents_t type = arc_buf_type(hdr); 1548 uint64_t delta = hdr->b_size * hdr->b_l1hdr.b_datacnt; 1549 multilist_t *list = &state->arcs_list[type]; 1550 uint64_t *size = &state->arcs_lsize[type]; 1551 1552 multilist_remove(list, hdr); 1553 1554 if (GHOST_STATE(state)) { 1555 ASSERT0(hdr->b_l1hdr.b_datacnt); 1556 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 1557 delta = hdr->b_size; 1558 } 1559 ASSERT(delta > 0); 1560 ASSERT3U(*size, >=, delta); 1561 atomic_add_64(size, -delta); 1562 } 1563 /* remove the prefetch flag if we get a reference */ 1564 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 1565 } 1566 } 1567 1568 static int 1569 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 1570 { 1571 int cnt; 1572 arc_state_t *state = hdr->b_l1hdr.b_state; 1573 1574 ASSERT(HDR_HAS_L1HDR(hdr)); 1575 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 1576 ASSERT(!GHOST_STATE(state)); 1577 1578 /* 1579 * arc_l2c_only counts as a ghost state so we don't need to explicitly 1580 * check to prevent usage of the arc_l2c_only list. 1581 */ 1582 if (((cnt = refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 1583 (state != arc_anon)) { 1584 arc_buf_contents_t type = arc_buf_type(hdr); 1585 multilist_t *list = &state->arcs_list[type]; 1586 uint64_t *size = &state->arcs_lsize[type]; 1587 1588 multilist_insert(list, hdr); 1589 1590 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 1591 atomic_add_64(size, hdr->b_size * 1592 hdr->b_l1hdr.b_datacnt); 1593 } 1594 return (cnt); 1595 } 1596 1597 /* 1598 * Move the supplied buffer to the indicated state. The hash lock 1599 * for the buffer must be held by the caller. 1600 */ 1601 static void 1602 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 1603 kmutex_t *hash_lock) 1604 { 1605 arc_state_t *old_state; 1606 int64_t refcnt; 1607 uint32_t datacnt; 1608 uint64_t from_delta, to_delta; 1609 arc_buf_contents_t buftype = arc_buf_type(hdr); 1610 1611 /* 1612 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 1613 * in arc_read() when bringing a buffer out of the L2ARC. However, the 1614 * L1 hdr doesn't always exist when we change state to arc_anon before 1615 * destroying a header, in which case reallocating to add the L1 hdr is 1616 * pointless. 1617 */ 1618 if (HDR_HAS_L1HDR(hdr)) { 1619 old_state = hdr->b_l1hdr.b_state; 1620 refcnt = refcount_count(&hdr->b_l1hdr.b_refcnt); 1621 datacnt = hdr->b_l1hdr.b_datacnt; 1622 } else { 1623 old_state = arc_l2c_only; 1624 refcnt = 0; 1625 datacnt = 0; 1626 } 1627 1628 ASSERT(MUTEX_HELD(hash_lock)); 1629 ASSERT3P(new_state, !=, old_state); 1630 ASSERT(refcnt == 0 || datacnt > 0); 1631 ASSERT(!GHOST_STATE(new_state) || datacnt == 0); 1632 ASSERT(old_state != arc_anon || datacnt <= 1); 1633 1634 from_delta = to_delta = datacnt * hdr->b_size; 1635 1636 /* 1637 * If this buffer is evictable, transfer it from the 1638 * old state list to the new state list. 1639 */ 1640 if (refcnt == 0) { 1641 if (old_state != arc_anon && old_state != arc_l2c_only) { 1642 uint64_t *size = &old_state->arcs_lsize[buftype]; 1643 1644 ASSERT(HDR_HAS_L1HDR(hdr)); 1645 multilist_remove(&old_state->arcs_list[buftype], hdr); 1646 1647 /* 1648 * If prefetching out of the ghost cache, 1649 * we will have a non-zero datacnt. 
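 *
 * A worked example with hypothetical numbers: a 16K header carrying two
 * arc_buf_t's yields from_delta = to_delta = 2 * 16K = 32K above,
 * whereas a header leaving a ghost list with no buffers attached only
 * contributes its nominal b_size of 16K.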
1650 */ 1651 if (GHOST_STATE(old_state) && datacnt == 0) { 1652 /* ghost elements have a ghost size */ 1653 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1654 from_delta = hdr->b_size; 1655 } 1656 ASSERT3U(*size, >=, from_delta); 1657 atomic_add_64(size, -from_delta); 1658 } 1659 if (new_state != arc_anon && new_state != arc_l2c_only) { 1660 uint64_t *size = &new_state->arcs_lsize[buftype]; 1661 1662 /* 1663 * An L1 header always exists here, since if we're 1664 * moving to some L1-cached state (i.e. not l2c_only or 1665 * anonymous), we realloc the header to add an L1hdr 1666 * beforehand. 1667 */ 1668 ASSERT(HDR_HAS_L1HDR(hdr)); 1669 multilist_insert(&new_state->arcs_list[buftype], hdr); 1670 1671 /* ghost elements have a ghost size */ 1672 if (GHOST_STATE(new_state)) { 1673 ASSERT0(datacnt); 1674 ASSERT(hdr->b_l1hdr.b_buf == NULL); 1675 to_delta = hdr->b_size; 1676 } 1677 atomic_add_64(size, to_delta); 1678 } 1679 } 1680 1681 ASSERT(!BUF_EMPTY(hdr)); 1682 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 1683 buf_hash_remove(hdr); 1684 1685 /* adjust state sizes (ignore arc_l2c_only) */ 1686 1687 if (to_delta && new_state != arc_l2c_only) { 1688 ASSERT(HDR_HAS_L1HDR(hdr)); 1689 if (GHOST_STATE(new_state)) { 1690 ASSERT0(datacnt); 1691 1692 /* 1693 * We moving a header to a ghost state, we first 1694 * remove all arc buffers. Thus, we'll have a 1695 * datacnt of zero, and no arc buffer to use for 1696 * the reference. As a result, we use the arc 1697 * header pointer for the reference. 1698 */ 1699 (void) refcount_add_many(&new_state->arcs_size, 1700 hdr->b_size, hdr); 1701 } else { 1702 ASSERT3U(datacnt, !=, 0); 1703 1704 /* 1705 * Each individual buffer holds a unique reference, 1706 * thus we must remove each of these references one 1707 * at a time. 1708 */ 1709 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 1710 buf = buf->b_next) { 1711 (void) refcount_add_many(&new_state->arcs_size, 1712 hdr->b_size, buf); 1713 } 1714 } 1715 } 1716 1717 if (from_delta && old_state != arc_l2c_only) { 1718 ASSERT(HDR_HAS_L1HDR(hdr)); 1719 if (GHOST_STATE(old_state)) { 1720 /* 1721 * When moving a header off of a ghost state, 1722 * there's the possibility for datacnt to be 1723 * non-zero. This is because we first add the 1724 * arc buffer to the header prior to changing 1725 * the header's state. Since we used the header 1726 * for the reference when putting the header on 1727 * the ghost state, we must balance that and use 1728 * the header when removing off the ghost state 1729 * (even though datacnt is non zero). 1730 */ 1731 1732 IMPLY(datacnt == 0, new_state == arc_anon || 1733 new_state == arc_l2c_only); 1734 1735 (void) refcount_remove_many(&old_state->arcs_size, 1736 hdr->b_size, hdr); 1737 } else { 1738 ASSERT3P(datacnt, !=, 0); 1739 1740 /* 1741 * Each individual buffer holds a unique reference, 1742 * thus we must remove each of these references one 1743 * at a time. 1744 */ 1745 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 1746 buf = buf->b_next) { 1747 (void) refcount_remove_many( 1748 &old_state->arcs_size, hdr->b_size, buf); 1749 } 1750 } 1751 } 1752 1753 if (HDR_HAS_L1HDR(hdr)) 1754 hdr->b_l1hdr.b_state = new_state; 1755 1756 /* 1757 * L2 headers should never be on the L2 state list since they don't 1758 * have L1 headers allocated. 
1759 */ 1760 ASSERT(multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 1761 multilist_is_empty(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 1762 } 1763 1764 void 1765 arc_space_consume(uint64_t space, arc_space_type_t type) 1766 { 1767 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1768 1769 switch (type) { 1770 case ARC_SPACE_DATA: 1771 ARCSTAT_INCR(arcstat_data_size, space); 1772 break; 1773 case ARC_SPACE_META: 1774 ARCSTAT_INCR(arcstat_metadata_size, space); 1775 break; 1776 case ARC_SPACE_OTHER: 1777 ARCSTAT_INCR(arcstat_other_size, space); 1778 break; 1779 case ARC_SPACE_HDRS: 1780 ARCSTAT_INCR(arcstat_hdr_size, space); 1781 break; 1782 case ARC_SPACE_L2HDRS: 1783 ARCSTAT_INCR(arcstat_l2_hdr_size, space); 1784 break; 1785 } 1786 1787 if (type != ARC_SPACE_DATA) 1788 ARCSTAT_INCR(arcstat_meta_used, space); 1789 1790 atomic_add_64(&arc_size, space); 1791 } 1792 1793 void 1794 arc_space_return(uint64_t space, arc_space_type_t type) 1795 { 1796 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 1797 1798 switch (type) { 1799 case ARC_SPACE_DATA: 1800 ARCSTAT_INCR(arcstat_data_size, -space); 1801 break; 1802 case ARC_SPACE_META: 1803 ARCSTAT_INCR(arcstat_metadata_size, -space); 1804 break; 1805 case ARC_SPACE_OTHER: 1806 ARCSTAT_INCR(arcstat_other_size, -space); 1807 break; 1808 case ARC_SPACE_HDRS: 1809 ARCSTAT_INCR(arcstat_hdr_size, -space); 1810 break; 1811 case ARC_SPACE_L2HDRS: 1812 ARCSTAT_INCR(arcstat_l2_hdr_size, -space); 1813 break; 1814 } 1815 1816 if (type != ARC_SPACE_DATA) { 1817 ASSERT(arc_meta_used >= space); 1818 if (arc_meta_max < arc_meta_used) 1819 arc_meta_max = arc_meta_used; 1820 ARCSTAT_INCR(arcstat_meta_used, -space); 1821 } 1822 1823 ASSERT(arc_size >= space); 1824 atomic_add_64(&arc_size, -space); 1825 } 1826 1827 arc_buf_t * 1828 arc_buf_alloc(spa_t *spa, int32_t size, void *tag, arc_buf_contents_t type) 1829 { 1830 arc_buf_hdr_t *hdr; 1831 arc_buf_t *buf; 1832 1833 ASSERT3U(size, >, 0); 1834 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 1835 ASSERT(BUF_EMPTY(hdr)); 1836 ASSERT3P(hdr->b_freeze_cksum, ==, NULL); 1837 hdr->b_size = size; 1838 hdr->b_spa = spa_load_guid(spa); 1839 1840 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1841 buf->b_hdr = hdr; 1842 buf->b_data = NULL; 1843 buf->b_efunc = NULL; 1844 buf->b_private = NULL; 1845 buf->b_next = NULL; 1846 1847 hdr->b_flags = arc_bufc_to_flags(type); 1848 hdr->b_flags |= ARC_FLAG_HAS_L1HDR; 1849 1850 hdr->b_l1hdr.b_buf = buf; 1851 hdr->b_l1hdr.b_state = arc_anon; 1852 hdr->b_l1hdr.b_arc_access = 0; 1853 hdr->b_l1hdr.b_datacnt = 1; 1854 hdr->b_l1hdr.b_tmp_cdata = NULL; 1855 1856 arc_get_data_buf(buf); 1857 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 1858 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1859 1860 return (buf); 1861 } 1862 1863 static char *arc_onloan_tag = "onloan"; 1864 1865 /* 1866 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 1867 * flight data by arc_tempreserve_space() until they are "returned". Loaned 1868 * buffers must be returned to the arc before they can be used by the DMU or 1869 * freed. 1870 */ 1871 arc_buf_t * 1872 arc_loan_buf(spa_t *spa, int size) 1873 { 1874 arc_buf_t *buf; 1875 1876 buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); 1877 1878 atomic_add_64(&arc_loaned_bytes, size); 1879 return (buf); 1880 } 1881 1882 /* 1883 * Return a loaned arc buffer to the arc. 
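 *
 * An illustrative sketch of the loan cycle, based only on the
 * interfaces in this file (the caller, tag, and copy are invented
 * for the example):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, size);
 *	bcopy(src, abuf->b_data, size);
 *	arc_return_buf(abuf, FTAG);
 *
 * While on loan the bytes are tracked in arc_loaned_bytes and ignored
 * by arc_tempreserve_space(); arc_return_buf() reverses that, and
 * arc_loan_inuse_buf() goes the other way by detaching a buffer that
 * is currently owned by a dbuf and putting it back on loan.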
1884 */ 1885 void 1886 arc_return_buf(arc_buf_t *buf, void *tag) 1887 { 1888 arc_buf_hdr_t *hdr = buf->b_hdr; 1889 1890 ASSERT(buf->b_data != NULL); 1891 ASSERT(HDR_HAS_L1HDR(hdr)); 1892 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 1893 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1894 1895 atomic_add_64(&arc_loaned_bytes, -hdr->b_size); 1896 } 1897 1898 /* Detach an arc_buf from a dbuf (tag) */ 1899 void 1900 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 1901 { 1902 arc_buf_hdr_t *hdr = buf->b_hdr; 1903 1904 ASSERT(buf->b_data != NULL); 1905 ASSERT(HDR_HAS_L1HDR(hdr)); 1906 (void) refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 1907 (void) refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 1908 buf->b_efunc = NULL; 1909 buf->b_private = NULL; 1910 1911 atomic_add_64(&arc_loaned_bytes, hdr->b_size); 1912 } 1913 1914 static arc_buf_t * 1915 arc_buf_clone(arc_buf_t *from) 1916 { 1917 arc_buf_t *buf; 1918 arc_buf_hdr_t *hdr = from->b_hdr; 1919 uint64_t size = hdr->b_size; 1920 1921 ASSERT(HDR_HAS_L1HDR(hdr)); 1922 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 1923 1924 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1925 buf->b_hdr = hdr; 1926 buf->b_data = NULL; 1927 buf->b_efunc = NULL; 1928 buf->b_private = NULL; 1929 buf->b_next = hdr->b_l1hdr.b_buf; 1930 hdr->b_l1hdr.b_buf = buf; 1931 arc_get_data_buf(buf); 1932 bcopy(from->b_data, buf->b_data, size); 1933 1934 /* 1935 * This buffer already exists in the arc so create a duplicate 1936 * copy for the caller. If the buffer is associated with user data 1937 * then track the size and number of duplicates. These stats will be 1938 * updated as duplicate buffers are created and destroyed. 1939 */ 1940 if (HDR_ISTYPE_DATA(hdr)) { 1941 ARCSTAT_BUMP(arcstat_duplicate_buffers); 1942 ARCSTAT_INCR(arcstat_duplicate_buffers_size, size); 1943 } 1944 hdr->b_l1hdr.b_datacnt += 1; 1945 return (buf); 1946 } 1947 1948 void 1949 arc_buf_add_ref(arc_buf_t *buf, void* tag) 1950 { 1951 arc_buf_hdr_t *hdr; 1952 kmutex_t *hash_lock; 1953 1954 /* 1955 * Check to see if this buffer is evicted. Callers 1956 * must verify b_data != NULL to know if the add_ref 1957 * was successful. 1958 */ 1959 mutex_enter(&buf->b_evict_lock); 1960 if (buf->b_data == NULL) { 1961 mutex_exit(&buf->b_evict_lock); 1962 return; 1963 } 1964 hash_lock = HDR_LOCK(buf->b_hdr); 1965 mutex_enter(hash_lock); 1966 hdr = buf->b_hdr; 1967 ASSERT(HDR_HAS_L1HDR(hdr)); 1968 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 1969 mutex_exit(&buf->b_evict_lock); 1970 1971 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 1972 hdr->b_l1hdr.b_state == arc_mfu); 1973 1974 add_reference(hdr, hash_lock, tag); 1975 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 1976 arc_access(hdr, hash_lock); 1977 mutex_exit(hash_lock); 1978 ARCSTAT_BUMP(arcstat_hits); 1979 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 1980 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 1981 data, metadata, hits); 1982 } 1983 1984 static void 1985 arc_buf_free_on_write(void *data, size_t size, 1986 void (*free_func)(void *, size_t)) 1987 { 1988 l2arc_data_free_t *df; 1989 1990 df = kmem_alloc(sizeof (*df), KM_SLEEP); 1991 df->l2df_data = data; 1992 df->l2df_size = size; 1993 df->l2df_func = free_func; 1994 mutex_enter(&l2arc_free_on_write_mtx); 1995 list_insert_head(l2arc_free_on_write, df); 1996 mutex_exit(&l2arc_free_on_write_mtx); 1997 } 1998 1999 /* 2000 * Free the arc data buffer. If it is an l2arc write in progress, 2001 * the buffer is placed on l2arc_free_on_write to be freed later. 
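 * (The deferral is there so that an in-flight L2ARC device write is
 * not left reading b_data after it has been freed. The record queued
 * by arc_buf_free_on_write() for this caller effectively remembers
 *
 *	df->l2df_data = buf->b_data;
 *	df->l2df_size = hdr->b_size;
 *	df->l2df_func = free_func;
 *
 * and the l2arc_free_on_write list is drained later, once the write
 * is done. This is only a sketch of the call made just below, not
 * additional behavior.)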
2002 */ 2003 static void 2004 arc_buf_data_free(arc_buf_t *buf, void (*free_func)(void *, size_t)) 2005 { 2006 arc_buf_hdr_t *hdr = buf->b_hdr; 2007 2008 if (HDR_L2_WRITING(hdr)) { 2009 arc_buf_free_on_write(buf->b_data, hdr->b_size, free_func); 2010 ARCSTAT_BUMP(arcstat_l2_free_on_write); 2011 } else { 2012 free_func(buf->b_data, hdr->b_size); 2013 } 2014 } 2015 2016 static void 2017 arc_buf_l2_cdata_free(arc_buf_hdr_t *hdr) 2018 { 2019 ASSERT(HDR_HAS_L2HDR(hdr)); 2020 ASSERT(MUTEX_HELD(&hdr->b_l2hdr.b_dev->l2ad_mtx)); 2021 2022 /* 2023 * The b_tmp_cdata field is linked off of the b_l1hdr, so if 2024 * that doesn't exist, the header is in the arc_l2c_only state, 2025 * and there isn't anything to free (it's already been freed). 2026 */ 2027 if (!HDR_HAS_L1HDR(hdr)) 2028 return; 2029 2030 /* 2031 * The header isn't being written to the l2arc device, thus it 2032 * shouldn't have a b_tmp_cdata to free. 2033 */ 2034 if (!HDR_L2_WRITING(hdr)) { 2035 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 2036 return; 2037 } 2038 2039 /* 2040 * The header does not have compression enabled. This can be due 2041 * to the buffer not being compressible, or because we're 2042 * freeing the buffer before the second phase of 2043 * l2arc_write_buffer() has started (which does the compression 2044 * step). In either case, b_tmp_cdata does not point to a 2045 * separately compressed buffer, so there's nothing to free (it 2046 * points to the same buffer as the arc_buf_t's b_data field). 2047 */ 2048 if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_OFF) { 2049 hdr->b_l1hdr.b_tmp_cdata = NULL; 2050 return; 2051 } 2052 2053 /* 2054 * There's nothing to free since the buffer was all zero's and 2055 * compressed to a zero length buffer. 2056 */ 2057 if (hdr->b_l2hdr.b_compress == ZIO_COMPRESS_EMPTY) { 2058 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 2059 return; 2060 } 2061 2062 ASSERT(L2ARC_IS_VALID_COMPRESS(hdr->b_l2hdr.b_compress)); 2063 2064 arc_buf_free_on_write(hdr->b_l1hdr.b_tmp_cdata, 2065 hdr->b_size, zio_data_buf_free); 2066 2067 ARCSTAT_BUMP(arcstat_l2_cdata_free_on_write); 2068 hdr->b_l1hdr.b_tmp_cdata = NULL; 2069 } 2070 2071 /* 2072 * Free up buf->b_data and if 'remove' is set, then pull the 2073 * arc_buf_t off of the the arc_buf_hdr_t's list and free it. 
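 * (remove == FALSE is used on the paths that must keep the arc_buf_t
 * alive long enough to run its eviction callback: arc_hdr_destroy()
 * and arc_evict_hdr() pass FALSE when buf->b_efunc != NULL and splice
 * the buf onto arc_eviction_list themselves, while callers that are
 * completely done with the buf pass TRUE and let this function unlink
 * and free it.)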
2074 */ 2075 static void 2076 arc_buf_destroy(arc_buf_t *buf, boolean_t remove) 2077 { 2078 arc_buf_t **bufp; 2079 2080 /* free up data associated with the buf */ 2081 if (buf->b_data != NULL) { 2082 arc_state_t *state = buf->b_hdr->b_l1hdr.b_state; 2083 uint64_t size = buf->b_hdr->b_size; 2084 arc_buf_contents_t type = arc_buf_type(buf->b_hdr); 2085 2086 arc_cksum_verify(buf); 2087 arc_buf_unwatch(buf); 2088 2089 if (type == ARC_BUFC_METADATA) { 2090 arc_buf_data_free(buf, zio_buf_free); 2091 arc_space_return(size, ARC_SPACE_META); 2092 } else { 2093 ASSERT(type == ARC_BUFC_DATA); 2094 arc_buf_data_free(buf, zio_data_buf_free); 2095 arc_space_return(size, ARC_SPACE_DATA); 2096 } 2097 2098 /* protected by hash lock, if in the hash table */ 2099 if (multilist_link_active(&buf->b_hdr->b_l1hdr.b_arc_node)) { 2100 uint64_t *cnt = &state->arcs_lsize[type]; 2101 2102 ASSERT(refcount_is_zero( 2103 &buf->b_hdr->b_l1hdr.b_refcnt)); 2104 ASSERT(state != arc_anon && state != arc_l2c_only); 2105 2106 ASSERT3U(*cnt, >=, size); 2107 atomic_add_64(cnt, -size); 2108 } 2109 2110 (void) refcount_remove_many(&state->arcs_size, size, buf); 2111 buf->b_data = NULL; 2112 2113 /* 2114 * If we're destroying a duplicate buffer make sure 2115 * that the appropriate statistics are updated. 2116 */ 2117 if (buf->b_hdr->b_l1hdr.b_datacnt > 1 && 2118 HDR_ISTYPE_DATA(buf->b_hdr)) { 2119 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 2120 ARCSTAT_INCR(arcstat_duplicate_buffers_size, -size); 2121 } 2122 ASSERT(buf->b_hdr->b_l1hdr.b_datacnt > 0); 2123 buf->b_hdr->b_l1hdr.b_datacnt -= 1; 2124 } 2125 2126 /* only remove the buf if requested */ 2127 if (!remove) 2128 return; 2129 2130 /* remove the buf from the hdr list */ 2131 for (bufp = &buf->b_hdr->b_l1hdr.b_buf; *bufp != buf; 2132 bufp = &(*bufp)->b_next) 2133 continue; 2134 *bufp = buf->b_next; 2135 buf->b_next = NULL; 2136 2137 ASSERT(buf->b_efunc == NULL); 2138 2139 /* clean up the buf */ 2140 buf->b_hdr = NULL; 2141 kmem_cache_free(buf_cache, buf); 2142 } 2143 2144 static void 2145 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 2146 { 2147 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 2148 l2arc_dev_t *dev = l2hdr->b_dev; 2149 2150 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 2151 ASSERT(HDR_HAS_L2HDR(hdr)); 2152 2153 list_remove(&dev->l2ad_buflist, hdr); 2154 2155 /* 2156 * We don't want to leak the b_tmp_cdata buffer that was 2157 * allocated in l2arc_write_buffers() 2158 */ 2159 arc_buf_l2_cdata_free(hdr); 2160 2161 /* 2162 * If the l2hdr's b_daddr is equal to L2ARC_ADDR_UNSET, then 2163 * this header is being processed by l2arc_write_buffers() (i.e. 2164 * it's in the first stage of l2arc_write_buffers()). 2165 * Re-affirming that truth here, just to serve as a reminder. If 2166 * b_daddr does not equal L2ARC_ADDR_UNSET, then the header may or 2167 * may not have its HDR_L2_WRITING flag set. (the write may have 2168 * completed, in which case HDR_L2_WRITING will be false and the 2169 * b_daddr field will point to the address of the buffer on disk). 2170 */ 2171 IMPLY(l2hdr->b_daddr == L2ARC_ADDR_UNSET, HDR_L2_WRITING(hdr)); 2172 2173 /* 2174 * If b_daddr is equal to L2ARC_ADDR_UNSET, we're racing with 2175 * l2arc_write_buffers(). Since we've just removed this header 2176 * from the l2arc buffer list, this header will never reach the 2177 * second stage of l2arc_write_buffers(), which increments the 2178 * accounting stats for this header. Thus, we must be careful 2179 * not to decrement them for this header either. 
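 * In short: a b_daddr of L2ARC_ADDR_UNSET means the first stage of
 * l2arc_write_buffers() still owns this header and has not yet added
 * it to the l2arc accounting, so the block below is skipped; any
 * other b_daddr means the l2_asize/l2_size stats, the vdev space, and
 * the l2ad_alloc hold all exist and must be released here.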
2180 */ 2181 if (l2hdr->b_daddr != L2ARC_ADDR_UNSET) { 2182 ARCSTAT_INCR(arcstat_l2_asize, -l2hdr->b_asize); 2183 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 2184 2185 vdev_space_update(dev->l2ad_vdev, 2186 -l2hdr->b_asize, 0, 0); 2187 2188 (void) refcount_remove_many(&dev->l2ad_alloc, 2189 l2hdr->b_asize, hdr); 2190 } 2191 2192 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 2193 } 2194 2195 static void 2196 arc_hdr_destroy(arc_buf_hdr_t *hdr) 2197 { 2198 if (HDR_HAS_L1HDR(hdr)) { 2199 ASSERT(hdr->b_l1hdr.b_buf == NULL || 2200 hdr->b_l1hdr.b_datacnt > 0); 2201 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2202 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 2203 } 2204 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2205 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 2206 2207 if (HDR_HAS_L2HDR(hdr)) { 2208 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 2209 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 2210 2211 if (!buflist_held) 2212 mutex_enter(&dev->l2ad_mtx); 2213 2214 /* 2215 * Even though we checked this conditional above, we 2216 * need to check this again now that we have the 2217 * l2ad_mtx. This is because we could be racing with 2218 * another thread calling l2arc_evict() which might have 2219 * destroyed this header's L2 portion as we were waiting 2220 * to acquire the l2ad_mtx. If that happens, we don't 2221 * want to re-destroy the header's L2 portion. 2222 */ 2223 if (HDR_HAS_L2HDR(hdr)) 2224 arc_hdr_l2hdr_destroy(hdr); 2225 2226 if (!buflist_held) 2227 mutex_exit(&dev->l2ad_mtx); 2228 } 2229 2230 if (!BUF_EMPTY(hdr)) 2231 buf_discard_identity(hdr); 2232 2233 if (hdr->b_freeze_cksum != NULL) { 2234 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2235 hdr->b_freeze_cksum = NULL; 2236 } 2237 2238 if (HDR_HAS_L1HDR(hdr)) { 2239 while (hdr->b_l1hdr.b_buf) { 2240 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2241 2242 if (buf->b_efunc != NULL) { 2243 mutex_enter(&arc_user_evicts_lock); 2244 mutex_enter(&buf->b_evict_lock); 2245 ASSERT(buf->b_hdr != NULL); 2246 arc_buf_destroy(hdr->b_l1hdr.b_buf, FALSE); 2247 hdr->b_l1hdr.b_buf = buf->b_next; 2248 buf->b_hdr = &arc_eviction_hdr; 2249 buf->b_next = arc_eviction_list; 2250 arc_eviction_list = buf; 2251 mutex_exit(&buf->b_evict_lock); 2252 cv_signal(&arc_user_evicts_cv); 2253 mutex_exit(&arc_user_evicts_lock); 2254 } else { 2255 arc_buf_destroy(hdr->b_l1hdr.b_buf, TRUE); 2256 } 2257 } 2258 #ifdef ZFS_DEBUG 2259 if (hdr->b_l1hdr.b_thawed != NULL) { 2260 kmem_free(hdr->b_l1hdr.b_thawed, 1); 2261 hdr->b_l1hdr.b_thawed = NULL; 2262 } 2263 #endif 2264 } 2265 2266 ASSERT3P(hdr->b_hash_next, ==, NULL); 2267 if (HDR_HAS_L1HDR(hdr)) { 2268 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 2269 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 2270 kmem_cache_free(hdr_full_cache, hdr); 2271 } else { 2272 kmem_cache_free(hdr_l2only_cache, hdr); 2273 } 2274 } 2275 2276 void 2277 arc_buf_free(arc_buf_t *buf, void *tag) 2278 { 2279 arc_buf_hdr_t *hdr = buf->b_hdr; 2280 int hashed = hdr->b_l1hdr.b_state != arc_anon; 2281 2282 ASSERT(buf->b_efunc == NULL); 2283 ASSERT(buf->b_data != NULL); 2284 2285 if (hashed) { 2286 kmutex_t *hash_lock = HDR_LOCK(hdr); 2287 2288 mutex_enter(hash_lock); 2289 hdr = buf->b_hdr; 2290 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2291 2292 (void) remove_reference(hdr, hash_lock, tag); 2293 if (hdr->b_l1hdr.b_datacnt > 1) { 2294 arc_buf_destroy(buf, TRUE); 2295 } else { 2296 ASSERT(buf == hdr->b_l1hdr.b_buf); 2297 ASSERT(buf->b_efunc == NULL); 2298 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2299 } 2300 mutex_exit(hash_lock); 2301 } else if (HDR_IO_IN_PROGRESS(hdr)) { 
2302 int destroy_hdr; 2303 /* 2304 * We are in the middle of an async write. Don't destroy 2305 * this buffer unless the write completes before we finish 2306 * decrementing the reference count. 2307 */ 2308 mutex_enter(&arc_user_evicts_lock); 2309 (void) remove_reference(hdr, NULL, tag); 2310 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2311 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 2312 mutex_exit(&arc_user_evicts_lock); 2313 if (destroy_hdr) 2314 arc_hdr_destroy(hdr); 2315 } else { 2316 if (remove_reference(hdr, NULL, tag) > 0) 2317 arc_buf_destroy(buf, TRUE); 2318 else 2319 arc_hdr_destroy(hdr); 2320 } 2321 } 2322 2323 boolean_t 2324 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 2325 { 2326 arc_buf_hdr_t *hdr = buf->b_hdr; 2327 kmutex_t *hash_lock = HDR_LOCK(hdr); 2328 boolean_t no_callback = (buf->b_efunc == NULL); 2329 2330 if (hdr->b_l1hdr.b_state == arc_anon) { 2331 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 2332 arc_buf_free(buf, tag); 2333 return (no_callback); 2334 } 2335 2336 mutex_enter(hash_lock); 2337 hdr = buf->b_hdr; 2338 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 2339 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 2340 ASSERT(hdr->b_l1hdr.b_state != arc_anon); 2341 ASSERT(buf->b_data != NULL); 2342 2343 (void) remove_reference(hdr, hash_lock, tag); 2344 if (hdr->b_l1hdr.b_datacnt > 1) { 2345 if (no_callback) 2346 arc_buf_destroy(buf, TRUE); 2347 } else if (no_callback) { 2348 ASSERT(hdr->b_l1hdr.b_buf == buf && buf->b_next == NULL); 2349 ASSERT(buf->b_efunc == NULL); 2350 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 2351 } 2352 ASSERT(no_callback || hdr->b_l1hdr.b_datacnt > 1 || 2353 refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2354 mutex_exit(hash_lock); 2355 return (no_callback); 2356 } 2357 2358 int32_t 2359 arc_buf_size(arc_buf_t *buf) 2360 { 2361 return (buf->b_hdr->b_size); 2362 } 2363 2364 /* 2365 * Called from the DMU to determine if the current buffer should be 2366 * evicted. In order to ensure proper locking, the eviction must be initiated 2367 * from the DMU. Return true if the buffer is associated with user data and 2368 * duplicate buffers still exist. 2369 */ 2370 boolean_t 2371 arc_buf_eviction_needed(arc_buf_t *buf) 2372 { 2373 arc_buf_hdr_t *hdr; 2374 boolean_t evict_needed = B_FALSE; 2375 2376 if (zfs_disable_dup_eviction) 2377 return (B_FALSE); 2378 2379 mutex_enter(&buf->b_evict_lock); 2380 hdr = buf->b_hdr; 2381 if (hdr == NULL) { 2382 /* 2383 * We are in arc_do_user_evicts(); let that function 2384 * perform the eviction. 2385 */ 2386 ASSERT(buf->b_data == NULL); 2387 mutex_exit(&buf->b_evict_lock); 2388 return (B_FALSE); 2389 } else if (buf->b_data == NULL) { 2390 /* 2391 * We have already been added to the arc eviction list; 2392 * recommend eviction. 2393 */ 2394 ASSERT3P(hdr, ==, &arc_eviction_hdr); 2395 mutex_exit(&buf->b_evict_lock); 2396 return (B_TRUE); 2397 } 2398 2399 if (hdr->b_l1hdr.b_datacnt > 1 && HDR_ISTYPE_DATA(hdr)) 2400 evict_needed = B_TRUE; 2401 2402 mutex_exit(&buf->b_evict_lock); 2403 return (evict_needed); 2404 } 2405 2406 /* 2407 * Evict the arc_buf_hdr that is provided as a parameter. The resultant 2408 * state of the header is dependent on it's state prior to entering this 2409 * function. 
The following transitions are possible: 2410 * 2411 * - arc_mru -> arc_mru_ghost 2412 * - arc_mfu -> arc_mfu_ghost 2413 * - arc_mru_ghost -> arc_l2c_only 2414 * - arc_mru_ghost -> deleted 2415 * - arc_mfu_ghost -> arc_l2c_only 2416 * - arc_mfu_ghost -> deleted 2417 */ 2418 static int64_t 2419 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 2420 { 2421 arc_state_t *evicted_state, *state; 2422 int64_t bytes_evicted = 0; 2423 2424 ASSERT(MUTEX_HELD(hash_lock)); 2425 ASSERT(HDR_HAS_L1HDR(hdr)); 2426 2427 state = hdr->b_l1hdr.b_state; 2428 if (GHOST_STATE(state)) { 2429 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2430 ASSERT(hdr->b_l1hdr.b_buf == NULL); 2431 2432 /* 2433 * l2arc_write_buffers() relies on a header's L1 portion 2434 * (i.e. its b_tmp_cdata field) during its write phase. 2435 * Thus, we cannot push a header onto the arc_l2c_only 2436 * state (removing its L1 piece) until the header is 2437 * done being written to the l2arc. 2438 */ 2439 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { 2440 ARCSTAT_BUMP(arcstat_evict_l2_skip); 2441 return (bytes_evicted); 2442 } 2443 2444 ARCSTAT_BUMP(arcstat_deleted); 2445 bytes_evicted += hdr->b_size; 2446 2447 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 2448 2449 if (HDR_HAS_L2HDR(hdr)) { 2450 /* 2451 * This buffer is cached on the 2nd Level ARC; 2452 * don't destroy the header. 2453 */ 2454 arc_change_state(arc_l2c_only, hdr, hash_lock); 2455 /* 2456 * dropping from L1+L2 cached to L2-only, 2457 * realloc to remove the L1 header. 2458 */ 2459 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 2460 hdr_l2only_cache); 2461 } else { 2462 arc_change_state(arc_anon, hdr, hash_lock); 2463 arc_hdr_destroy(hdr); 2464 } 2465 return (bytes_evicted); 2466 } 2467 2468 ASSERT(state == arc_mru || state == arc_mfu); 2469 evicted_state = (state == arc_mru) ?
arc_mru_ghost : arc_mfu_ghost; 2470 2471 /* prefetch buffers have a minimum lifespan */ 2472 if (HDR_IO_IN_PROGRESS(hdr) || 2473 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 2474 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < 2475 arc_min_prefetch_lifespan)) { 2476 ARCSTAT_BUMP(arcstat_evict_skip); 2477 return (bytes_evicted); 2478 } 2479 2480 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 2481 ASSERT3U(hdr->b_l1hdr.b_datacnt, >, 0); 2482 while (hdr->b_l1hdr.b_buf) { 2483 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 2484 if (!mutex_tryenter(&buf->b_evict_lock)) { 2485 ARCSTAT_BUMP(arcstat_mutex_miss); 2486 break; 2487 } 2488 if (buf->b_data != NULL) 2489 bytes_evicted += hdr->b_size; 2490 if (buf->b_efunc != NULL) { 2491 mutex_enter(&arc_user_evicts_lock); 2492 arc_buf_destroy(buf, FALSE); 2493 hdr->b_l1hdr.b_buf = buf->b_next; 2494 buf->b_hdr = &arc_eviction_hdr; 2495 buf->b_next = arc_eviction_list; 2496 arc_eviction_list = buf; 2497 cv_signal(&arc_user_evicts_cv); 2498 mutex_exit(&arc_user_evicts_lock); 2499 mutex_exit(&buf->b_evict_lock); 2500 } else { 2501 mutex_exit(&buf->b_evict_lock); 2502 arc_buf_destroy(buf, TRUE); 2503 } 2504 } 2505 2506 if (HDR_HAS_L2HDR(hdr)) { 2507 ARCSTAT_INCR(arcstat_evict_l2_cached, hdr->b_size); 2508 } else { 2509 if (l2arc_write_eligible(hdr->b_spa, hdr)) 2510 ARCSTAT_INCR(arcstat_evict_l2_eligible, hdr->b_size); 2511 else 2512 ARCSTAT_INCR(arcstat_evict_l2_ineligible, hdr->b_size); 2513 } 2514 2515 if (hdr->b_l1hdr.b_datacnt == 0) { 2516 arc_change_state(evicted_state, hdr, hash_lock); 2517 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2518 hdr->b_flags |= ARC_FLAG_IN_HASH_TABLE; 2519 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 2520 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 2521 } 2522 2523 return (bytes_evicted); 2524 } 2525 2526 static uint64_t 2527 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 2528 uint64_t spa, int64_t bytes) 2529 { 2530 multilist_sublist_t *mls; 2531 uint64_t bytes_evicted = 0; 2532 arc_buf_hdr_t *hdr; 2533 kmutex_t *hash_lock; 2534 int evict_count = 0; 2535 2536 ASSERT3P(marker, !=, NULL); 2537 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2538 2539 mls = multilist_sublist_lock(ml, idx); 2540 2541 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 2542 hdr = multilist_sublist_prev(mls, marker)) { 2543 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 2544 (evict_count >= zfs_arc_evict_batch_limit)) 2545 break; 2546 2547 /* 2548 * To keep our iteration location, move the marker 2549 * forward. Since we're not holding hdr's hash lock, we 2550 * must be very careful and not remove 'hdr' from the 2551 * sublist. Otherwise, other consumers might mistake the 2552 * 'hdr' as not being on a sublist when they call the 2553 * multilist_link_active() function (they all rely on 2554 * the hash lock protecting concurrent insertions and 2555 * removals). multilist_sublist_move_forward() was 2556 * specifically implemented to ensure this is the case 2557 * (only 'marker' will be removed and re-inserted). 2558 */ 2559 multilist_sublist_move_forward(mls, marker); 2560 2561 /* 2562 * The only case where the b_spa field should ever be 2563 * zero, is the marker headers inserted by 2564 * arc_evict_state(). It's possible for multiple threads 2565 * to be calling arc_evict_state() concurrently (e.g. 2566 * dsl_pool_close() and zio_inject_fault()), so we must 2567 * skip any markers we see from these other threads. 
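 * As a concrete sketch of what such a marker is (taken from the
 * setup code in arc_evict_state() below), it is just a header
 * allocated from hdr_full_cache with its b_spa explicitly zeroed:
 *
 *	markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP);
 *	markers[i]->b_spa = 0;
 *
 * which is why checking for a b_spa of 0 is sufficient to skip it.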
2568 */ 2569 if (hdr->b_spa == 0) 2570 continue; 2571 2572 /* we're only interested in evicting buffers of a certain spa */ 2573 if (spa != 0 && hdr->b_spa != spa) { 2574 ARCSTAT_BUMP(arcstat_evict_skip); 2575 continue; 2576 } 2577 2578 hash_lock = HDR_LOCK(hdr); 2579 2580 /* 2581 * We aren't calling this function from any code path 2582 * that would already be holding a hash lock, so we're 2583 * asserting on this assumption to be defensive in case 2584 * this ever changes. Without this check, it would be 2585 * possible to incorrectly increment arcstat_mutex_miss 2586 * below (e.g. if the code changed such that we called 2587 * this function with a hash lock held). 2588 */ 2589 ASSERT(!MUTEX_HELD(hash_lock)); 2590 2591 if (mutex_tryenter(hash_lock)) { 2592 uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 2593 mutex_exit(hash_lock); 2594 2595 bytes_evicted += evicted; 2596 2597 /* 2598 * If evicted is zero, arc_evict_hdr() must have 2599 * decided to skip this header, don't increment 2600 * evict_count in this case. 2601 */ 2602 if (evicted != 0) 2603 evict_count++; 2604 2605 /* 2606 * If arc_size isn't overflowing, signal any 2607 * threads that might happen to be waiting. 2608 * 2609 * For each header evicted, we wake up a single 2610 * thread. If we used cv_broadcast, we could 2611 * wake up "too many" threads causing arc_size 2612 * to significantly overflow arc_c; since 2613 * arc_get_data_buf() doesn't check for overflow 2614 * when it's woken up (it doesn't because it's 2615 * possible for the ARC to be overflowing while 2616 * full of un-evictable buffers, and the 2617 * function should proceed in this case). 2618 * 2619 * If threads are left sleeping, due to not 2620 * using cv_broadcast, they will be woken up 2621 * just before arc_reclaim_thread() sleeps. 2622 */ 2623 mutex_enter(&arc_reclaim_lock); 2624 if (!arc_is_overflowing()) 2625 cv_signal(&arc_reclaim_waiters_cv); 2626 mutex_exit(&arc_reclaim_lock); 2627 } else { 2628 ARCSTAT_BUMP(arcstat_mutex_miss); 2629 } 2630 } 2631 2632 multilist_sublist_unlock(mls); 2633 2634 return (bytes_evicted); 2635 } 2636 2637 /* 2638 * Evict buffers from the given arc state, until we've removed the 2639 * specified number of bytes. Move the removed buffers to the 2640 * appropriate evict state. 2641 * 2642 * This function makes a "best effort". It skips over any buffers 2643 * it can't get a hash_lock on, and so, may not catch all candidates. 2644 * It may also return without evicting as much space as requested. 2645 * 2646 * If bytes is specified using the special value ARC_EVICT_ALL, this 2647 * will evict all available (i.e. unlocked and evictable) buffers from 2648 * the given arc state; which is used by arc_flush(). 2649 */ 2650 static uint64_t 2651 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 2652 arc_buf_contents_t type) 2653 { 2654 uint64_t total_evicted = 0; 2655 multilist_t *ml = &state->arcs_list[type]; 2656 int num_sublists; 2657 arc_buf_hdr_t **markers; 2658 2659 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 2660 2661 num_sublists = multilist_get_num_sublists(ml); 2662 2663 /* 2664 * If we've tried to evict from each sublist, made some 2665 * progress, but still have not hit the target number of bytes 2666 * to evict, we want to keep trying. The markers allow us to 2667 * pick up where we left off for each individual sublist, rather 2668 * than starting from the tail each time. 
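 * The marker lifecycle, briefly: one marker is inserted at the tail
 * of every sublist below, arc_evict_state_impl() iterates backwards
 * from its marker and advances it with
 * multilist_sublist_move_forward() as it goes, and once the target
 * has been met the markers are removed and freed at the bottom of
 * this function.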
2669 */ 2670 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 2671 for (int i = 0; i < num_sublists; i++) { 2672 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 2673 2674 /* 2675 * A b_spa of 0 is used to indicate that this header is 2676 * a marker. This fact is used in arc_adjust_type() and 2677 * arc_evict_state_impl(). 2678 */ 2679 markers[i]->b_spa = 0; 2680 2681 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 2682 multilist_sublist_insert_tail(mls, markers[i]); 2683 multilist_sublist_unlock(mls); 2684 } 2685 2686 /* 2687 * While we haven't hit our target number of bytes to evict, or 2688 * we're evicting all available buffers. 2689 */ 2690 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 2691 /* 2692 * Start eviction using a randomly selected sublist, 2693 * this is to try and evenly balance eviction across all 2694 * sublists. Always starting at the same sublist 2695 * (e.g. index 0) would cause evictions to favor certain 2696 * sublists over others. 2697 */ 2698 int sublist_idx = multilist_get_random_index(ml); 2699 uint64_t scan_evicted = 0; 2700 2701 for (int i = 0; i < num_sublists; i++) { 2702 uint64_t bytes_remaining; 2703 uint64_t bytes_evicted; 2704 2705 if (bytes == ARC_EVICT_ALL) 2706 bytes_remaining = ARC_EVICT_ALL; 2707 else if (total_evicted < bytes) 2708 bytes_remaining = bytes - total_evicted; 2709 else 2710 break; 2711 2712 bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 2713 markers[sublist_idx], spa, bytes_remaining); 2714 2715 scan_evicted += bytes_evicted; 2716 total_evicted += bytes_evicted; 2717 2718 /* we've reached the end, wrap to the beginning */ 2719 if (++sublist_idx >= num_sublists) 2720 sublist_idx = 0; 2721 } 2722 2723 /* 2724 * If we didn't evict anything during this scan, we have 2725 * no reason to believe we'll evict more during another 2726 * scan, so break the loop. 2727 */ 2728 if (scan_evicted == 0) { 2729 /* This isn't possible, let's make that obvious */ 2730 ASSERT3S(bytes, !=, 0); 2731 2732 /* 2733 * When bytes is ARC_EVICT_ALL, the only way to 2734 * break the loop is when scan_evicted is zero. 2735 * In that case, we actually have evicted enough, 2736 * so we don't want to increment the kstat. 2737 */ 2738 if (bytes != ARC_EVICT_ALL) { 2739 ASSERT3S(total_evicted, <, bytes); 2740 ARCSTAT_BUMP(arcstat_evict_not_enough); 2741 } 2742 2743 break; 2744 } 2745 } 2746 2747 for (int i = 0; i < num_sublists; i++) { 2748 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 2749 multilist_sublist_remove(mls, markers[i]); 2750 multilist_sublist_unlock(mls); 2751 2752 kmem_cache_free(hdr_full_cache, markers[i]); 2753 } 2754 kmem_free(markers, sizeof (*markers) * num_sublists); 2755 2756 return (total_evicted); 2757 } 2758 2759 /* 2760 * Flush all "evictable" data of the given type from the arc state 2761 * specified. This will not evict any "active" buffers (i.e. referenced). 2762 * 2763 * When 'retry' is set to FALSE, the function will make a single pass 2764 * over the state and evict any buffers that it can. Since it doesn't 2765 * continually retry the eviction, it might end up leaving some buffers 2766 * in the ARC due to lock misses. 2767 * 2768 * When 'retry' is set to TRUE, the function will continually retry the 2769 * eviction until *all* evictable buffers have been removed from the 2770 * state. As a result, if concurrent insertions into the state are 2771 * allowed (e.g. 
if the ARC isn't shutting down), this function might 2772 * wind up in an infinite loop, continually trying to evict buffers. 2773 */ 2774 static uint64_t 2775 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 2776 boolean_t retry) 2777 { 2778 uint64_t evicted = 0; 2779 2780 while (state->arcs_lsize[type] != 0) { 2781 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 2782 2783 if (!retry) 2784 break; 2785 } 2786 2787 return (evicted); 2788 } 2789 2790 /* 2791 * Evict the specified number of bytes from the state specified, 2792 * restricting eviction to the spa and type given. This function 2793 * prevents us from trying to evict more from a state's list than 2794 * is "evictable", and to skip evicting altogether when passed a 2795 * negative value for "bytes". In contrast, arc_evict_state() will 2796 * evict everything it can, when passed a negative value for "bytes". 2797 */ 2798 static uint64_t 2799 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 2800 arc_buf_contents_t type) 2801 { 2802 int64_t delta; 2803 2804 if (bytes > 0 && state->arcs_lsize[type] > 0) { 2805 delta = MIN(state->arcs_lsize[type], bytes); 2806 return (arc_evict_state(state, spa, delta, type)); 2807 } 2808 2809 return (0); 2810 } 2811 2812 /* 2813 * Evict metadata buffers from the cache, such that arc_meta_used is 2814 * capped by the arc_meta_limit tunable. 2815 */ 2816 static uint64_t 2817 arc_adjust_meta(void) 2818 { 2819 uint64_t total_evicted = 0; 2820 int64_t target; 2821 2822 /* 2823 * If we're over the meta limit, we want to evict enough 2824 * metadata to get back under the meta limit. We don't want to 2825 * evict so much that we drop the MRU below arc_p, though. If 2826 * we're over the meta limit more than we're over arc_p, we 2827 * evict some from the MRU here, and some from the MFU below. 2828 */ 2829 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 2830 (int64_t)(refcount_count(&arc_anon->arcs_size) + 2831 refcount_count(&arc_mru->arcs_size) - arc_p)); 2832 2833 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 2834 2835 /* 2836 * Similar to the above, we want to evict enough bytes to get us 2837 * below the meta limit, but not so much as to drop us below the 2838 * space alloted to the MFU (which is defined as arc_c - arc_p). 2839 */ 2840 target = MIN((int64_t)(arc_meta_used - arc_meta_limit), 2841 (int64_t)(refcount_count(&arc_mfu->arcs_size) - (arc_c - arc_p))); 2842 2843 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 2844 2845 return (total_evicted); 2846 } 2847 2848 /* 2849 * Return the type of the oldest buffer in the given arc state 2850 * 2851 * This function will select a random sublist of type ARC_BUFC_DATA and 2852 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 2853 * is compared, and the type which contains the "older" buffer will be 2854 * returned. 
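 * (Marker headers inserted by arc_evict_state() have a b_spa of 0 and
 * are skipped while walking in from the tails; if one of the sublists
 * is empty, whichever type actually has a buffer is returned, and if
 * both are empty the function defaults to ARC_BUFC_DATA.)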
2855 */ 2856 static arc_buf_contents_t 2857 arc_adjust_type(arc_state_t *state) 2858 { 2859 multilist_t *data_ml = &state->arcs_list[ARC_BUFC_DATA]; 2860 multilist_t *meta_ml = &state->arcs_list[ARC_BUFC_METADATA]; 2861 int data_idx = multilist_get_random_index(data_ml); 2862 int meta_idx = multilist_get_random_index(meta_ml); 2863 multilist_sublist_t *data_mls; 2864 multilist_sublist_t *meta_mls; 2865 arc_buf_contents_t type; 2866 arc_buf_hdr_t *data_hdr; 2867 arc_buf_hdr_t *meta_hdr; 2868 2869 /* 2870 * We keep the sublist lock until we're finished, to prevent 2871 * the headers from being destroyed via arc_evict_state(). 2872 */ 2873 data_mls = multilist_sublist_lock(data_ml, data_idx); 2874 meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 2875 2876 /* 2877 * These two loops are to ensure we skip any markers that 2878 * might be at the tail of the lists due to arc_evict_state(). 2879 */ 2880 2881 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 2882 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 2883 if (data_hdr->b_spa != 0) 2884 break; 2885 } 2886 2887 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 2888 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 2889 if (meta_hdr->b_spa != 0) 2890 break; 2891 } 2892 2893 if (data_hdr == NULL && meta_hdr == NULL) { 2894 type = ARC_BUFC_DATA; 2895 } else if (data_hdr == NULL) { 2896 ASSERT3P(meta_hdr, !=, NULL); 2897 type = ARC_BUFC_METADATA; 2898 } else if (meta_hdr == NULL) { 2899 ASSERT3P(data_hdr, !=, NULL); 2900 type = ARC_BUFC_DATA; 2901 } else { 2902 ASSERT3P(data_hdr, !=, NULL); 2903 ASSERT3P(meta_hdr, !=, NULL); 2904 2905 /* The headers can't be on the sublist without an L1 header */ 2906 ASSERT(HDR_HAS_L1HDR(data_hdr)); 2907 ASSERT(HDR_HAS_L1HDR(meta_hdr)); 2908 2909 if (data_hdr->b_l1hdr.b_arc_access < 2910 meta_hdr->b_l1hdr.b_arc_access) { 2911 type = ARC_BUFC_DATA; 2912 } else { 2913 type = ARC_BUFC_METADATA; 2914 } 2915 } 2916 2917 multilist_sublist_unlock(meta_mls); 2918 multilist_sublist_unlock(data_mls); 2919 2920 return (type); 2921 } 2922 2923 /* 2924 * Evict buffers from the cache, such that arc_size is capped by arc_c. 2925 */ 2926 static uint64_t 2927 arc_adjust(void) 2928 { 2929 uint64_t total_evicted = 0; 2930 uint64_t bytes; 2931 int64_t target; 2932 2933 /* 2934 * If we're over arc_meta_limit, we want to correct that before 2935 * potentially evicting data buffers below. 2936 */ 2937 total_evicted += arc_adjust_meta(); 2938 2939 /* 2940 * Adjust MRU size 2941 * 2942 * If we're over the target cache size, we want to evict enough 2943 * from the list to get back to our target size. We don't want 2944 * to evict too much from the MRU, such that it drops below 2945 * arc_p. So, if we're over our target cache size more than 2946 * the MRU is over arc_p, we'll evict enough to get back to 2947 * arc_p here, and then evict more from the MFU below. 2948 */ 2949 target = MIN((int64_t)(arc_size - arc_c), 2950 (int64_t)(refcount_count(&arc_anon->arcs_size) + 2951 refcount_count(&arc_mru->arcs_size) + arc_meta_used - arc_p)); 2952 2953 /* 2954 * If we're below arc_meta_min, always prefer to evict data. 2955 * Otherwise, try to satisfy the requested number of bytes to 2956 * evict from the type which contains older buffers; in an 2957 * effort to keep newer buffers in the cache regardless of their 2958 * type. If we cannot satisfy the number of bytes from this 2959 * type, spill over into the next type. 
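 *
 * A purely illustrative example (numbers invented): with arc_c = 100,
 * arc_p = 60, arc_size = 110, and anon + mru + arc_meta_used = 75,
 * the target computed above is MIN(110 - 100, 75 - 60) = 10. If
 * arc_adjust_type(arc_mru) picks metadata (and arc_meta_used exceeds
 * arc_meta_min) but only 6 of those 10 bytes can be evicted from the
 * metadata list, the remaining 4 spill over to the data list.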
2960 */ 2961 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 2962 arc_meta_used > arc_meta_min) { 2963 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 2964 total_evicted += bytes; 2965 2966 /* 2967 * If we couldn't evict our target number of bytes from 2968 * metadata, we try to get the rest from data. 2969 */ 2970 target -= bytes; 2971 2972 total_evicted += 2973 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 2974 } else { 2975 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 2976 total_evicted += bytes; 2977 2978 /* 2979 * If we couldn't evict our target number of bytes from 2980 * data, we try to get the rest from metadata. 2981 */ 2982 target -= bytes; 2983 2984 total_evicted += 2985 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 2986 } 2987 2988 /* 2989 * Adjust MFU size 2990 * 2991 * Now that we've tried to evict enough from the MRU to get its 2992 * size back to arc_p, if we're still above the target cache 2993 * size, we evict the rest from the MFU. 2994 */ 2995 target = arc_size - arc_c; 2996 2997 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 2998 arc_meta_used > arc_meta_min) { 2999 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3000 total_evicted += bytes; 3001 3002 /* 3003 * If we couldn't evict our target number of bytes from 3004 * metadata, we try to get the rest from data. 3005 */ 3006 target -= bytes; 3007 3008 total_evicted += 3009 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 3010 } else { 3011 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 3012 total_evicted += bytes; 3013 3014 /* 3015 * If we couldn't evict our target number of bytes from 3016 * data, we try to get the rest from metadata. 3017 */ 3018 target -= bytes; 3019 3020 total_evicted += 3021 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 3022 } 3023 3024 /* 3025 * Adjust ghost lists 3026 * 3027 * In addition to the above, the ARC also defines target values 3028 * for the ghost lists. The sum of the mru list and mru ghost 3029 * list should never exceed the target size of the cache, and 3030 * the sum of the mru list, mfu list, mru ghost list, and mfu 3031 * ghost list should never exceed twice the target size of the 3032 * cache. The following logic enforces these limits on the ghost 3033 * caches, and evicts from them as needed.
3034 */ 3035 target = refcount_count(&arc_mru->arcs_size) + 3036 refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 3037 3038 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 3039 total_evicted += bytes; 3040 3041 target -= bytes; 3042 3043 total_evicted += 3044 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 3045 3046 /* 3047 * We assume the sum of the mru list and mfu list is less than 3048 * or equal to arc_c (we enforced this above), which means we 3049 * can use the simpler of the two equations below: 3050 * 3051 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 3052 * mru ghost + mfu ghost <= arc_c 3053 */ 3054 target = refcount_count(&arc_mru_ghost->arcs_size) + 3055 refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 3056 3057 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 3058 total_evicted += bytes; 3059 3060 target -= bytes; 3061 3062 total_evicted += 3063 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 3064 3065 return (total_evicted); 3066 } 3067 3068 static void 3069 arc_do_user_evicts(void) 3070 { 3071 mutex_enter(&arc_user_evicts_lock); 3072 while (arc_eviction_list != NULL) { 3073 arc_buf_t *buf = arc_eviction_list; 3074 arc_eviction_list = buf->b_next; 3075 mutex_enter(&buf->b_evict_lock); 3076 buf->b_hdr = NULL; 3077 mutex_exit(&buf->b_evict_lock); 3078 mutex_exit(&arc_user_evicts_lock); 3079 3080 if (buf->b_efunc != NULL) 3081 VERIFY0(buf->b_efunc(buf->b_private)); 3082 3083 buf->b_efunc = NULL; 3084 buf->b_private = NULL; 3085 kmem_cache_free(buf_cache, buf); 3086 mutex_enter(&arc_user_evicts_lock); 3087 } 3088 mutex_exit(&arc_user_evicts_lock); 3089 } 3090 3091 void 3092 arc_flush(spa_t *spa, boolean_t retry) 3093 { 3094 uint64_t guid = 0; 3095 3096 /* 3097 * If retry is TRUE, a spa must not be specified since we have 3098 * no good way to determine if all of a spa's buffers have been 3099 * evicted from an arc state. 
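 *
 * An illustrative usage sketch (the call sites are invented for the
 * example, not quoted from the rest of the tree):
 *
 *	arc_flush(spa, B_FALSE);	single best-effort pass over each
 *					state for one pool's buffers
 *	arc_flush(NULL, B_TRUE);	retry until every evictable buffer
 *					is gone; only safe when no new
 *					buffers can be inserted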
3100 */ 3101 ASSERT(!retry || spa == 0); 3102 3103 if (spa != NULL) 3104 guid = spa_load_guid(spa); 3105 3106 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 3107 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 3108 3109 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 3110 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 3111 3112 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 3113 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 3114 3115 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 3116 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 3117 3118 arc_do_user_evicts(); 3119 ASSERT(spa || arc_eviction_list == NULL); 3120 } 3121 3122 void 3123 arc_shrink(int64_t to_free) 3124 { 3125 if (arc_c > arc_c_min) { 3126 3127 if (arc_c > arc_c_min + to_free) 3128 atomic_add_64(&arc_c, -to_free); 3129 else 3130 arc_c = arc_c_min; 3131 3132 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 3133 if (arc_c > arc_size) 3134 arc_c = MAX(arc_size, arc_c_min); 3135 if (arc_p > arc_c) 3136 arc_p = (arc_c >> 1); 3137 ASSERT(arc_c >= arc_c_min); 3138 ASSERT((int64_t)arc_p >= 0); 3139 } 3140 3141 if (arc_size > arc_c) 3142 (void) arc_adjust(); 3143 } 3144 3145 typedef enum free_memory_reason_t { 3146 FMR_UNKNOWN, 3147 FMR_NEEDFREE, 3148 FMR_LOTSFREE, 3149 FMR_SWAPFS_MINFREE, 3150 FMR_PAGES_PP_MAXIMUM, 3151 FMR_HEAP_ARENA, 3152 FMR_ZIO_ARENA, 3153 } free_memory_reason_t; 3154 3155 int64_t last_free_memory; 3156 free_memory_reason_t last_free_reason; 3157 3158 /* 3159 * Additional reserve of pages for pp_reserve. 3160 */ 3161 int64_t arc_pages_pp_reserve = 64; 3162 3163 /* 3164 * Additional reserve of pages for swapfs. 3165 */ 3166 int64_t arc_swapfs_reserve = 64; 3167 3168 /* 3169 * Return the amount of memory that can be consumed before reclaim will be 3170 * needed. Positive if there is sufficient free memory, negative indicates 3171 * the amount of memory that needs to be freed up. 3172 */ 3173 static int64_t 3174 arc_available_memory(void) 3175 { 3176 int64_t lowest = INT64_MAX; 3177 int64_t n; 3178 free_memory_reason_t r = FMR_UNKNOWN; 3179 3180 #ifdef _KERNEL 3181 if (needfree > 0) { 3182 n = PAGESIZE * (-needfree); 3183 if (n < lowest) { 3184 lowest = n; 3185 r = FMR_NEEDFREE; 3186 } 3187 } 3188 3189 /* 3190 * check that we're out of range of the pageout scanner. It starts to 3191 * schedule paging if freemem is less than lotsfree and needfree. 3192 * lotsfree is the high-water mark for pageout, and needfree is the 3193 * number of needed free pages. We add extra pages here to make sure 3194 * the scanner doesn't start up while we're freeing memory. 3195 */ 3196 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 3197 if (n < lowest) { 3198 lowest = n; 3199 r = FMR_LOTSFREE; 3200 } 3201 3202 /* 3203 * check to make sure that swapfs has enough space so that anon 3204 * reservations can still succeed. anon_resvmem() checks that the 3205 * availrmem is greater than swapfs_minfree, and the number of reserved 3206 * swap pages. We also add a bit of extra here just to prevent 3207 * circumstances from getting really dire. 3208 */ 3209 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 3210 desfree - arc_swapfs_reserve); 3211 if (n < lowest) { 3212 lowest = n; 3213 r = FMR_SWAPFS_MINFREE; 3214 } 3215 3216 3217 /* 3218 * Check that we have enough availrmem that memory locking (e.g., via 3219 * mlock(3C) or memcntl(2)) can still succeed. 
(pages_pp_maximum 3220 * stores the number of pages that cannot be locked; when availrmem 3221 * drops below pages_pp_maximum, page locking mechanisms such as 3222 * page_pp_lock() will fail.) 3223 */ 3224 n = PAGESIZE * (availrmem - pages_pp_maximum - 3225 arc_pages_pp_reserve); 3226 if (n < lowest) { 3227 lowest = n; 3228 r = FMR_PAGES_PP_MAXIMUM; 3229 } 3230 3231 #if defined(__i386) 3232 /* 3233 * If we're on an i386 platform, it's possible that we'll exhaust the 3234 * kernel heap space before we ever run out of available physical 3235 * memory. Most checks of the size of the heap_area compare against 3236 * tune.t_minarmem, which is the minimum available real memory that we 3237 * can have in the system. However, this is generally fixed at 25 pages 3238 * which is so low that it's useless. In this comparison, we seek to 3239 * calculate the total heap-size, and reclaim if more than 3/4ths of the 3240 * heap is allocated. (Or, in the calculation, if less than 1/4th is 3241 * free) 3242 */ 3243 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 3244 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 3245 if (n < lowest) { 3246 lowest = n; 3247 r = FMR_HEAP_ARENA; 3248 } 3249 #endif 3250 3251 /* 3252 * If zio data pages are being allocated out of a separate heap segment, 3253 * then enforce that the size of available vmem for this arena remains 3254 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. 3255 * 3256 * Note that reducing the arc_zio_arena_free_shift keeps more virtual 3257 * memory (in the zio_arena) free, which can avoid memory 3258 * fragmentation issues. 3259 */ 3260 if (zio_arena != NULL) { 3261 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 3262 (vmem_size(zio_arena, VMEM_ALLOC) >> 3263 arc_zio_arena_free_shift); 3264 if (n < lowest) { 3265 lowest = n; 3266 r = FMR_ZIO_ARENA; 3267 } 3268 } 3269 #else 3270 /* Every 100 calls, free a small amount */ 3271 if (spa_get_random(100) == 0) 3272 lowest = -1024; 3273 #endif 3274 3275 last_free_memory = lowest; 3276 last_free_reason = r; 3277 3278 return (lowest); 3279 } 3280 3281 3282 /* 3283 * Determine if the system is under memory pressure and is asking 3284 * to reclaim memory. A return value of TRUE indicates that the system 3285 * is under memory pressure and that the arc should adjust accordingly. 3286 */ 3287 static boolean_t 3288 arc_reclaim_needed(void) 3289 { 3290 return (arc_available_memory() < 0); 3291 } 3292 3293 static void 3294 arc_kmem_reap_now(void) 3295 { 3296 size_t i; 3297 kmem_cache_t *prev_cache = NULL; 3298 kmem_cache_t *prev_data_cache = NULL; 3299 extern kmem_cache_t *zio_buf_cache[]; 3300 extern kmem_cache_t *zio_data_buf_cache[]; 3301 extern kmem_cache_t *range_seg_cache; 3302 3303 #ifdef _KERNEL 3304 if (arc_meta_used >= arc_meta_limit) { 3305 /* 3306 * We are exceeding our meta-data cache limit. 3307 * Purge some DNLC entries to release holds on meta-data. 3308 */ 3309 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 3310 } 3311 #if defined(__i386) 3312 /* 3313 * Reclaim unused memory from all kmem caches. 
3314 */ 3315 kmem_reap(); 3316 #endif 3317 #endif 3318 3319 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 3320 if (zio_buf_cache[i] != prev_cache) { 3321 prev_cache = zio_buf_cache[i]; 3322 kmem_cache_reap_now(zio_buf_cache[i]); 3323 } 3324 if (zio_data_buf_cache[i] != prev_data_cache) { 3325 prev_data_cache = zio_data_buf_cache[i]; 3326 kmem_cache_reap_now(zio_data_buf_cache[i]); 3327 } 3328 } 3329 kmem_cache_reap_now(buf_cache); 3330 kmem_cache_reap_now(hdr_full_cache); 3331 kmem_cache_reap_now(hdr_l2only_cache); 3332 kmem_cache_reap_now(range_seg_cache); 3333 3334 if (zio_arena != NULL) { 3335 /* 3336 * Ask the vmem arena to reclaim unused memory from its 3337 * quantum caches. 3338 */ 3339 vmem_qcache_reap(zio_arena); 3340 } 3341 } 3342 3343 /* 3344 * Threads can block in arc_get_data_buf() waiting for this thread to evict 3345 * enough data and signal them to proceed. When this happens, the threads in 3346 * arc_get_data_buf() are sleeping while holding the hash lock for their 3347 * particular arc header. Thus, we must be careful to never sleep on a 3348 * hash lock in this thread. This is to prevent the following deadlock: 3349 * 3350 * - Thread A sleeps on CV in arc_get_data_buf() holding hash lock "L", 3351 * waiting for the reclaim thread to signal it. 3352 * 3353 * - arc_reclaim_thread() tries to acquire hash lock "L" using mutex_enter, 3354 * fails, and goes to sleep forever. 3355 * 3356 * This possible deadlock is avoided by always acquiring a hash lock 3357 * using mutex_tryenter() from arc_reclaim_thread(). 3358 */ 3359 static void 3360 arc_reclaim_thread(void) 3361 { 3362 hrtime_t growtime = 0; 3363 callb_cpr_t cpr; 3364 3365 CALLB_CPR_INIT(&cpr, &arc_reclaim_lock, callb_generic_cpr, FTAG); 3366 3367 mutex_enter(&arc_reclaim_lock); 3368 while (!arc_reclaim_thread_exit) { 3369 int64_t free_memory = arc_available_memory(); 3370 uint64_t evicted = 0; 3371 3372 mutex_exit(&arc_reclaim_lock); 3373 3374 if (free_memory < 0) { 3375 3376 arc_no_grow = B_TRUE; 3377 arc_warm = B_TRUE; 3378 3379 /* 3380 * Wait at least zfs_grow_retry (default 60) seconds 3381 * before considering growing. 3382 */ 3383 growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 3384 3385 arc_kmem_reap_now(); 3386 3387 /* 3388 * If we are still low on memory, shrink the ARC 3389 * so that we have arc_shrink_min free space. 3390 */ 3391 free_memory = arc_available_memory(); 3392 3393 int64_t to_free = 3394 (arc_c >> arc_shrink_shift) - free_memory; 3395 if (to_free > 0) { 3396 #ifdef _KERNEL 3397 to_free = MAX(to_free, ptob(needfree)); 3398 #endif 3399 arc_shrink(to_free); 3400 } 3401 } else if (free_memory < arc_c >> arc_no_grow_shift) { 3402 arc_no_grow = B_TRUE; 3403 } else if (gethrtime() >= growtime) { 3404 arc_no_grow = B_FALSE; 3405 } 3406 3407 evicted = arc_adjust(); 3408 3409 mutex_enter(&arc_reclaim_lock); 3410 3411 /* 3412 * If evicted is zero, we couldn't evict anything via 3413 * arc_adjust(). This could be due to hash lock 3414 * collisions, but more likely due to the majority of 3415 * arc buffers being unevictable. Therefore, even if 3416 * arc_size is above arc_c, another pass is unlikely to 3417 * be helpful and could potentially cause us to enter an 3418 * infinite loop. 3419 */ 3420 if (arc_size <= arc_c || evicted == 0) { 3421 /* 3422 * We're either no longer overflowing, or we 3423 * can't evict anything more, so we should wake 3424 * up any threads before we go to sleep. 
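 * The waiters in question are threads blocked in arc_get_data_buf();
 * their half of the handshake (see that function below) is, as a
 * sketch:
 *
 *	cv_signal(&arc_reclaim_thread_cv);
 *	cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock);
 *
 * so the broadcast here is what guarantees no allocator is left
 * stranded before this thread goes back to sleep.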
3425 */ 3426 cv_broadcast(&arc_reclaim_waiters_cv); 3427 3428 /* 3429 * Block until signaled, or after one second (we 3430 * might need to perform arc_kmem_reap_now() 3431 * even if we aren't being signalled) 3432 */ 3433 CALLB_CPR_SAFE_BEGIN(&cpr); 3434 (void) cv_timedwait_hires(&arc_reclaim_thread_cv, 3435 &arc_reclaim_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); 3436 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_lock); 3437 } 3438 } 3439 3440 arc_reclaim_thread_exit = FALSE; 3441 cv_broadcast(&arc_reclaim_thread_cv); 3442 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_lock */ 3443 thread_exit(); 3444 } 3445 3446 static void 3447 arc_user_evicts_thread(void) 3448 { 3449 callb_cpr_t cpr; 3450 3451 CALLB_CPR_INIT(&cpr, &arc_user_evicts_lock, callb_generic_cpr, FTAG); 3452 3453 mutex_enter(&arc_user_evicts_lock); 3454 while (!arc_user_evicts_thread_exit) { 3455 mutex_exit(&arc_user_evicts_lock); 3456 3457 arc_do_user_evicts(); 3458 3459 /* 3460 * This is necessary in order for the mdb ::arc dcmd to 3461 * show up to date information. Since the ::arc command 3462 * does not call the kstat's update function, without 3463 * this call, the command may show stale stats for the 3464 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 3465 * with this change, the data might be up to 1 second 3466 * out of date; but that should suffice. The arc_state_t 3467 * structures can be queried directly if more accurate 3468 * information is needed. 3469 */ 3470 if (arc_ksp != NULL) 3471 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 3472 3473 mutex_enter(&arc_user_evicts_lock); 3474 3475 /* 3476 * Block until signaled, or after one second (we need to 3477 * call the arc's kstat update function regularly). 3478 */ 3479 CALLB_CPR_SAFE_BEGIN(&cpr); 3480 (void) cv_timedwait(&arc_user_evicts_cv, 3481 &arc_user_evicts_lock, ddi_get_lbolt() + hz); 3482 CALLB_CPR_SAFE_END(&cpr, &arc_user_evicts_lock); 3483 } 3484 3485 arc_user_evicts_thread_exit = FALSE; 3486 cv_broadcast(&arc_user_evicts_cv); 3487 CALLB_CPR_EXIT(&cpr); /* drops arc_user_evicts_lock */ 3488 thread_exit(); 3489 } 3490 3491 /* 3492 * Adapt arc info given the number of bytes we are trying to add and 3493 * the state that we are comming from. This function is only called 3494 * when we are adding new content to the cache. 3495 */ 3496 static void 3497 arc_adapt(int bytes, arc_state_t *state) 3498 { 3499 int mult; 3500 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 3501 int64_t mrug_size = refcount_count(&arc_mru_ghost->arcs_size); 3502 int64_t mfug_size = refcount_count(&arc_mfu_ghost->arcs_size); 3503 3504 if (state == arc_l2c_only) 3505 return; 3506 3507 ASSERT(bytes > 0); 3508 /* 3509 * Adapt the target size of the MRU list: 3510 * - if we just hit in the MRU ghost list, then increase 3511 * the target size of the MRU list. 3512 * - if we just hit in the MFU ghost list, then increase 3513 * the target size of the MFU list by decreasing the 3514 * target size of the MRU list. 3515 */ 3516 if (state == arc_mru_ghost) { 3517 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 3518 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 3519 3520 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 3521 } else if (state == arc_mfu_ghost) { 3522 uint64_t delta; 3523 3524 mult = (mfug_size >= mrug_size) ? 
		    1 : (mrug_size / mfug_size);
		mult = MIN(mult, 10);

		delta = MIN(bytes * mult, arc_p);
		arc_p = MAX(arc_p_min, arc_p - delta);
	}
	ASSERT((int64_t)arc_p >= 0);

	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thread_cv);
		return;
	}

	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		atomic_add_64(&arc_c, (int64_t)bytes);
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
		else if (state == arc_anon)
			atomic_add_64(&arc_p, (int64_t)bytes);
		if (arc_p > arc_c)
			arc_p = arc_c;
	}
	ASSERT((int64_t)arc_p >= 0);
}

/*
 * Check if arc_size has grown past our upper threshold, determined by
 * zfs_arc_overflow_shift.
 */
static boolean_t
arc_is_overflowing(void)
{
	/* Always allow at least one block of overflow */
	uint64_t overflow = MAX(SPA_MAXBLOCKSIZE,
	    arc_c >> zfs_arc_overflow_shift);

	return (arc_size >= arc_c + overflow);
}

/*
 * The buffer, supplied as the first argument, needs a data block. If we
 * are hitting the hard limit for the cache size, we must sleep, waiting
 * for the eviction thread to catch up. If we're past the target size
 * but below the hard limit, we'll only signal the reclaim thread and
 * continue on.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
	arc_state_t *state = buf->b_hdr->b_l1hdr.b_state;
	uint64_t size = buf->b_hdr->b_size;
	arc_buf_contents_t type = arc_buf_type(buf->b_hdr);

	arc_adapt(size, state);

	/*
	 * If arc_size is currently overflowing, and has grown past our
	 * upper limit, we must be adding data faster than the evict
	 * thread can evict. Thus, to ensure we don't compound the
	 * problem by adding more data and forcing arc_size to grow even
	 * further past its target size, we halt and wait for the
	 * eviction thread to catch up.
	 *
	 * It's also possible that the reclaim thread is unable to evict
	 * enough buffers to get arc_size below the overflow limit (e.g.
	 * due to buffers being un-evictable, or hash lock collisions).
	 * In this case, we want to proceed regardless of whether we're
	 * overflowing; thus we don't use a while loop here.
	 */
	if (arc_is_overflowing()) {
		mutex_enter(&arc_reclaim_lock);

		/*
		 * Now that we've acquired the lock, we may no longer be
		 * over the overflow limit; let's check.
		 *
		 * We're ignoring the case of spurious wake ups. If that
		 * were to happen, it'd let this thread consume an ARC
		 * buffer before it should have (i.e. before we're under
		 * the overflow limit and were signalled by the reclaim
		 * thread). As long as that is a rare occurrence, it
		 * shouldn't cause any harm.
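		 *
		 * As a worked illustration (all values hypothetical):
		 * taking SPA_MAXBLOCKSIZE as 16 MB, zfs_arc_overflow_shift
		 * as 8 and arc_c as 1 GB, arc_is_overflowing() reports
		 * overflow only once arc_size reaches
		 * arc_c + MAX(16 MB, 1 GB >> 8) = arc_c + 16 MB; only past
		 * that point does this thread block here on
		 * arc_reclaim_waiters_cv.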
3616 */ 3617 if (arc_is_overflowing()) { 3618 cv_signal(&arc_reclaim_thread_cv); 3619 cv_wait(&arc_reclaim_waiters_cv, &arc_reclaim_lock); 3620 } 3621 3622 mutex_exit(&arc_reclaim_lock); 3623 } 3624 3625 if (type == ARC_BUFC_METADATA) { 3626 buf->b_data = zio_buf_alloc(size); 3627 arc_space_consume(size, ARC_SPACE_META); 3628 } else { 3629 ASSERT(type == ARC_BUFC_DATA); 3630 buf->b_data = zio_data_buf_alloc(size); 3631 arc_space_consume(size, ARC_SPACE_DATA); 3632 } 3633 3634 /* 3635 * Update the state size. Note that ghost states have a 3636 * "ghost size" and so don't need to be updated. 3637 */ 3638 if (!GHOST_STATE(buf->b_hdr->b_l1hdr.b_state)) { 3639 arc_buf_hdr_t *hdr = buf->b_hdr; 3640 arc_state_t *state = hdr->b_l1hdr.b_state; 3641 3642 (void) refcount_add_many(&state->arcs_size, size, buf); 3643 3644 /* 3645 * If this is reached via arc_read, the link is 3646 * protected by the hash lock. If reached via 3647 * arc_buf_alloc, the header should not be accessed by 3648 * any other thread. And, if reached via arc_read_done, 3649 * the hash lock will protect it if it's found in the 3650 * hash table; otherwise no other thread should be 3651 * trying to [add|remove]_reference it. 3652 */ 3653 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 3654 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3655 atomic_add_64(&hdr->b_l1hdr.b_state->arcs_lsize[type], 3656 size); 3657 } 3658 /* 3659 * If we are growing the cache, and we are adding anonymous 3660 * data, and we have outgrown arc_p, update arc_p 3661 */ 3662 if (arc_size < arc_c && hdr->b_l1hdr.b_state == arc_anon && 3663 (refcount_count(&arc_anon->arcs_size) + 3664 refcount_count(&arc_mru->arcs_size) > arc_p)) 3665 arc_p = MIN(arc_c, arc_p + size); 3666 } 3667 } 3668 3669 /* 3670 * This routine is called whenever a buffer is accessed. 3671 * NOTE: the hash lock is dropped in this function. 3672 */ 3673 static void 3674 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 3675 { 3676 clock_t now; 3677 3678 ASSERT(MUTEX_HELD(hash_lock)); 3679 ASSERT(HDR_HAS_L1HDR(hdr)); 3680 3681 if (hdr->b_l1hdr.b_state == arc_anon) { 3682 /* 3683 * This buffer is not in the cache, and does not 3684 * appear in our "ghost" list. Add the new buffer 3685 * to the MRU state. 3686 */ 3687 3688 ASSERT0(hdr->b_l1hdr.b_arc_access); 3689 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3690 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3691 arc_change_state(arc_mru, hdr, hash_lock); 3692 3693 } else if (hdr->b_l1hdr.b_state == arc_mru) { 3694 now = ddi_get_lbolt(); 3695 3696 /* 3697 * If this buffer is here because of a prefetch, then either: 3698 * - clear the flag if this is a "referencing" read 3699 * (any subsequent access will bump this into the MFU state). 3700 * or 3701 * - move the buffer to the head of the list if this is 3702 * another prefetch (to make it less likely to be evicted). 3703 */ 3704 if (HDR_PREFETCH(hdr)) { 3705 if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 3706 /* link protected by hash lock */ 3707 ASSERT(multilist_link_active( 3708 &hdr->b_l1hdr.b_arc_node)); 3709 } else { 3710 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3711 ARCSTAT_BUMP(arcstat_mru_hits); 3712 } 3713 hdr->b_l1hdr.b_arc_access = now; 3714 return; 3715 } 3716 3717 /* 3718 * This buffer has been "accessed" only once so far, 3719 * but it is still in the cache. Move it to the MFU 3720 * state. 3721 */ 3722 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 3723 /* 3724 * More than 125ms have passed since we 3725 * instantiated this buffer. 
Move it to the 3726 * most frequently used state. 3727 */ 3728 hdr->b_l1hdr.b_arc_access = now; 3729 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3730 arc_change_state(arc_mfu, hdr, hash_lock); 3731 } 3732 ARCSTAT_BUMP(arcstat_mru_hits); 3733 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 3734 arc_state_t *new_state; 3735 /* 3736 * This buffer has been "accessed" recently, but 3737 * was evicted from the cache. Move it to the 3738 * MFU state. 3739 */ 3740 3741 if (HDR_PREFETCH(hdr)) { 3742 new_state = arc_mru; 3743 if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) 3744 hdr->b_flags &= ~ARC_FLAG_PREFETCH; 3745 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 3746 } else { 3747 new_state = arc_mfu; 3748 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3749 } 3750 3751 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3752 arc_change_state(new_state, hdr, hash_lock); 3753 3754 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 3755 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 3756 /* 3757 * This buffer has been accessed more than once and is 3758 * still in the cache. Keep it in the MFU state. 3759 * 3760 * NOTE: an add_reference() that occurred when we did 3761 * the arc_read() will have kicked this off the list. 3762 * If it was a prefetch, we will explicitly move it to 3763 * the head of the list now. 3764 */ 3765 if ((HDR_PREFETCH(hdr)) != 0) { 3766 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3767 /* link protected by hash_lock */ 3768 ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3769 } 3770 ARCSTAT_BUMP(arcstat_mfu_hits); 3771 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3772 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 3773 arc_state_t *new_state = arc_mfu; 3774 /* 3775 * This buffer has been accessed more than once but has 3776 * been evicted from the cache. Move it back to the 3777 * MFU state. 3778 */ 3779 3780 if (HDR_PREFETCH(hdr)) { 3781 /* 3782 * This is a prefetch access... 3783 * move this block back to the MRU state. 3784 */ 3785 ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt)); 3786 new_state = arc_mru; 3787 } 3788 3789 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3790 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3791 arc_change_state(new_state, hdr, hash_lock); 3792 3793 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 3794 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 3795 /* 3796 * This buffer is on the 2nd Level ARC. 
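 * Its data survives only on an L2ARC device; the in-memory copy
 * was evicted. Treat the access as a hit and move the header to
 * the MFU state.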
3797 */ 3798 3799 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 3800 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 3801 arc_change_state(arc_mfu, hdr, hash_lock); 3802 } else { 3803 ASSERT(!"invalid arc state"); 3804 } 3805 } 3806 3807 /* a generic arc_done_func_t which you can use */ 3808 /* ARGSUSED */ 3809 void 3810 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 3811 { 3812 if (zio == NULL || zio->io_error == 0) 3813 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 3814 VERIFY(arc_buf_remove_ref(buf, arg)); 3815 } 3816 3817 /* a generic arc_done_func_t */ 3818 void 3819 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 3820 { 3821 arc_buf_t **bufp = arg; 3822 if (zio && zio->io_error) { 3823 VERIFY(arc_buf_remove_ref(buf, arg)); 3824 *bufp = NULL; 3825 } else { 3826 *bufp = buf; 3827 ASSERT(buf->b_data); 3828 } 3829 } 3830 3831 static void 3832 arc_read_done(zio_t *zio) 3833 { 3834 arc_buf_hdr_t *hdr; 3835 arc_buf_t *buf; 3836 arc_buf_t *abuf; /* buffer we're assigning to callback */ 3837 kmutex_t *hash_lock = NULL; 3838 arc_callback_t *callback_list, *acb; 3839 int freeable = FALSE; 3840 3841 buf = zio->io_private; 3842 hdr = buf->b_hdr; 3843 3844 /* 3845 * The hdr was inserted into hash-table and removed from lists 3846 * prior to starting I/O. We should find this header, since 3847 * it's in the hash table, and it should be legit since it's 3848 * not possible to evict it during the I/O. The only possible 3849 * reason for it not to be found is if we were freed during the 3850 * read. 3851 */ 3852 if (HDR_IN_HASH_TABLE(hdr)) { 3853 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 3854 ASSERT3U(hdr->b_dva.dva_word[0], ==, 3855 BP_IDENTITY(zio->io_bp)->dva_word[0]); 3856 ASSERT3U(hdr->b_dva.dva_word[1], ==, 3857 BP_IDENTITY(zio->io_bp)->dva_word[1]); 3858 3859 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 3860 &hash_lock); 3861 3862 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && 3863 hash_lock == NULL) || 3864 (found == hdr && 3865 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 3866 (found == hdr && HDR_L2_READING(hdr))); 3867 } 3868 3869 hdr->b_flags &= ~ARC_FLAG_L2_EVICTED; 3870 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 3871 hdr->b_flags &= ~ARC_FLAG_L2CACHE; 3872 3873 /* byteswap if necessary */ 3874 callback_list = hdr->b_l1hdr.b_acb; 3875 ASSERT(callback_list != NULL); 3876 if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { 3877 dmu_object_byteswap_t bswap = 3878 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 3879 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 3880 byteswap_uint64_array : 3881 dmu_ot_byteswap[bswap].ob_func; 3882 func(buf->b_data, hdr->b_size); 3883 } 3884 3885 arc_cksum_compute(buf, B_FALSE); 3886 arc_buf_watch(buf); 3887 3888 if (hash_lock && zio->io_error == 0 && 3889 hdr->b_l1hdr.b_state == arc_anon) { 3890 /* 3891 * Only call arc_access on anonymous buffers. This is because 3892 * if we've issued an I/O for an evicted buffer, we've already 3893 * called arc_access (to prevent any simultaneous readers from 3894 * getting confused). 
3895 */ 3896 arc_access(hdr, hash_lock); 3897 } 3898 3899 /* create copies of the data buffer for the callers */ 3900 abuf = buf; 3901 for (acb = callback_list; acb; acb = acb->acb_next) { 3902 if (acb->acb_done) { 3903 if (abuf == NULL) { 3904 ARCSTAT_BUMP(arcstat_duplicate_reads); 3905 abuf = arc_buf_clone(buf); 3906 } 3907 acb->acb_buf = abuf; 3908 abuf = NULL; 3909 } 3910 } 3911 hdr->b_l1hdr.b_acb = NULL; 3912 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 3913 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 3914 if (abuf == buf) { 3915 ASSERT(buf->b_efunc == NULL); 3916 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 3917 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 3918 } 3919 3920 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 3921 callback_list != NULL); 3922 3923 if (zio->io_error != 0) { 3924 hdr->b_flags |= ARC_FLAG_IO_ERROR; 3925 if (hdr->b_l1hdr.b_state != arc_anon) 3926 arc_change_state(arc_anon, hdr, hash_lock); 3927 if (HDR_IN_HASH_TABLE(hdr)) 3928 buf_hash_remove(hdr); 3929 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3930 } 3931 3932 /* 3933 * Broadcast before we drop the hash_lock to avoid the possibility 3934 * that the hdr (and hence the cv) might be freed before we get to 3935 * the cv_broadcast(). 3936 */ 3937 cv_broadcast(&hdr->b_l1hdr.b_cv); 3938 3939 if (hash_lock != NULL) { 3940 mutex_exit(hash_lock); 3941 } else { 3942 /* 3943 * This block was freed while we waited for the read to 3944 * complete. It has been removed from the hash table and 3945 * moved to the anonymous state (so that it won't show up 3946 * in the cache). 3947 */ 3948 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3949 freeable = refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 3950 } 3951 3952 /* execute each callback and free its structure */ 3953 while ((acb = callback_list) != NULL) { 3954 if (acb->acb_done) 3955 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 3956 3957 if (acb->acb_zio_dummy != NULL) { 3958 acb->acb_zio_dummy->io_error = zio->io_error; 3959 zio_nowait(acb->acb_zio_dummy); 3960 } 3961 3962 callback_list = acb->acb_next; 3963 kmem_free(acb, sizeof (arc_callback_t)); 3964 } 3965 3966 if (freeable) 3967 arc_hdr_destroy(hdr); 3968 } 3969 3970 /* 3971 * "Read" the block at the specified DVA (in bp) via the 3972 * cache. If the block is found in the cache, invoke the provided 3973 * callback immediately and return. Note that the `zio' parameter 3974 * in the callback will be NULL in this case, since no IO was 3975 * required. If the block is not in the cache pass the read request 3976 * on to the spa with a substitute callback function, so that the 3977 * requested block will be added to the cache. 3978 * 3979 * If a read request arrives for a block that has a read in-progress, 3980 * either wait for the in-progress read to complete (and return the 3981 * results); or, if this is a read with a "done" func, add a record 3982 * to the read to invoke the "done" func when the read completes, 3983 * and return; or just return. 3984 * 3985 * arc_read_done() will invoke all the requested "done" functions 3986 * for readers of this block. 
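 *
 * An illustrative caller sketch (not authoritative; assumes a
 * bookmark zb already set up with SET_BOOKMARK()), waiting
 * synchronously for the data:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 *
 * On success arc_getbuf_func() stores the (referenced) buffer in
 * abuf; the caller drops that hold with arc_buf_remove_ref(abuf,
 * &abuf) when it is done with the data.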
3987 */ 3988 int 3989 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, 3990 void *private, zio_priority_t priority, int zio_flags, 3991 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 3992 { 3993 arc_buf_hdr_t *hdr = NULL; 3994 arc_buf_t *buf = NULL; 3995 kmutex_t *hash_lock = NULL; 3996 zio_t *rzio; 3997 uint64_t guid = spa_load_guid(spa); 3998 3999 ASSERT(!BP_IS_EMBEDDED(bp) || 4000 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 4001 4002 top: 4003 if (!BP_IS_EMBEDDED(bp)) { 4004 /* 4005 * Embedded BP's have no DVA and require no I/O to "read". 4006 * Create an anonymous arc buf to back it. 4007 */ 4008 hdr = buf_hash_find(guid, bp, &hash_lock); 4009 } 4010 4011 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_datacnt > 0) { 4012 4013 *arc_flags |= ARC_FLAG_CACHED; 4014 4015 if (HDR_IO_IN_PROGRESS(hdr)) { 4016 4017 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 4018 priority == ZIO_PRIORITY_SYNC_READ) { 4019 /* 4020 * This sync read must wait for an 4021 * in-progress async read (e.g. a predictive 4022 * prefetch). Async reads are queued 4023 * separately at the vdev_queue layer, so 4024 * this is a form of priority inversion. 4025 * Ideally, we would "inherit" the demand 4026 * i/o's priority by moving the i/o from 4027 * the async queue to the synchronous queue, 4028 * but there is currently no mechanism to do 4029 * so. Track this so that we can evaluate 4030 * the magnitude of this potential performance 4031 * problem. 4032 * 4033 * Note that if the prefetch i/o is already 4034 * active (has been issued to the device), 4035 * the prefetch improved performance, because 4036 * we issued it sooner than we would have 4037 * without the prefetch. 4038 */ 4039 DTRACE_PROBE1(arc__sync__wait__for__async, 4040 arc_buf_hdr_t *, hdr); 4041 ARCSTAT_BUMP(arcstat_sync_wait_for_async); 4042 } 4043 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4044 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4045 } 4046 4047 if (*arc_flags & ARC_FLAG_WAIT) { 4048 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 4049 mutex_exit(hash_lock); 4050 goto top; 4051 } 4052 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4053 4054 if (done) { 4055 arc_callback_t *acb = NULL; 4056 4057 acb = kmem_zalloc(sizeof (arc_callback_t), 4058 KM_SLEEP); 4059 acb->acb_done = done; 4060 acb->acb_private = private; 4061 if (pio != NULL) 4062 acb->acb_zio_dummy = zio_null(pio, 4063 spa, NULL, NULL, NULL, zio_flags); 4064 4065 ASSERT(acb->acb_done != NULL); 4066 acb->acb_next = hdr->b_l1hdr.b_acb; 4067 hdr->b_l1hdr.b_acb = acb; 4068 add_reference(hdr, hash_lock, private); 4069 mutex_exit(hash_lock); 4070 return (0); 4071 } 4072 mutex_exit(hash_lock); 4073 return (0); 4074 } 4075 4076 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4077 hdr->b_l1hdr.b_state == arc_mfu); 4078 4079 if (done) { 4080 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 4081 /* 4082 * This is a demand read which does not have to 4083 * wait for i/o because we did a predictive 4084 * prefetch i/o for it, which has completed. 4085 */ 4086 DTRACE_PROBE1( 4087 arc__demand__hit__predictive__prefetch, 4088 arc_buf_hdr_t *, hdr); 4089 ARCSTAT_BUMP( 4090 arcstat_demand_hit_predictive_prefetch); 4091 hdr->b_flags &= ~ARC_FLAG_PREDICTIVE_PREFETCH; 4092 } 4093 add_reference(hdr, hash_lock, private); 4094 /* 4095 * If this block is already in use, create a new 4096 * copy of the data so that we will be guaranteed 4097 * that arc_release() will always succeed. 
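 * (The clone carries its own private copy of the data, so a later
 * arc_release() of it simply detaches that copy without disturbing
 * other holders of the original buffer.)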
4098 */ 4099 buf = hdr->b_l1hdr.b_buf; 4100 ASSERT(buf); 4101 ASSERT(buf->b_data); 4102 if (HDR_BUF_AVAILABLE(hdr)) { 4103 ASSERT(buf->b_efunc == NULL); 4104 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4105 } else { 4106 buf = arc_buf_clone(buf); 4107 } 4108 4109 } else if (*arc_flags & ARC_FLAG_PREFETCH && 4110 refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 4111 hdr->b_flags |= ARC_FLAG_PREFETCH; 4112 } 4113 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 4114 arc_access(hdr, hash_lock); 4115 if (*arc_flags & ARC_FLAG_L2CACHE) 4116 hdr->b_flags |= ARC_FLAG_L2CACHE; 4117 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4118 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4119 mutex_exit(hash_lock); 4120 ARCSTAT_BUMP(arcstat_hits); 4121 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4122 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4123 data, metadata, hits); 4124 4125 if (done) 4126 done(NULL, buf, private); 4127 } else { 4128 uint64_t size = BP_GET_LSIZE(bp); 4129 arc_callback_t *acb; 4130 vdev_t *vd = NULL; 4131 uint64_t addr = 0; 4132 boolean_t devw = B_FALSE; 4133 enum zio_compress b_compress = ZIO_COMPRESS_OFF; 4134 int32_t b_asize = 0; 4135 4136 if (hdr == NULL) { 4137 /* this block is not in the cache */ 4138 arc_buf_hdr_t *exists = NULL; 4139 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 4140 buf = arc_buf_alloc(spa, size, private, type); 4141 hdr = buf->b_hdr; 4142 if (!BP_IS_EMBEDDED(bp)) { 4143 hdr->b_dva = *BP_IDENTITY(bp); 4144 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 4145 exists = buf_hash_insert(hdr, &hash_lock); 4146 } 4147 if (exists != NULL) { 4148 /* somebody beat us to the hash insert */ 4149 mutex_exit(hash_lock); 4150 buf_discard_identity(hdr); 4151 (void) arc_buf_remove_ref(buf, private); 4152 goto top; /* restart the IO request */ 4153 } 4154 4155 /* 4156 * If there is a callback, we pass our reference to 4157 * it; otherwise we remove our reference. 4158 */ 4159 if (done == NULL) { 4160 (void) remove_reference(hdr, hash_lock, 4161 private); 4162 } 4163 if (*arc_flags & ARC_FLAG_PREFETCH) 4164 hdr->b_flags |= ARC_FLAG_PREFETCH; 4165 if (*arc_flags & ARC_FLAG_L2CACHE) 4166 hdr->b_flags |= ARC_FLAG_L2CACHE; 4167 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4168 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4169 if (BP_GET_LEVEL(bp) > 0) 4170 hdr->b_flags |= ARC_FLAG_INDIRECT; 4171 } else { 4172 /* 4173 * This block is in the ghost cache. If it was L2-only 4174 * (and thus didn't have an L1 hdr), we realloc the 4175 * header to add an L1 hdr. 4176 */ 4177 if (!HDR_HAS_L1HDR(hdr)) { 4178 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 4179 hdr_full_cache); 4180 } 4181 4182 ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); 4183 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4184 ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4185 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 4186 4187 /* 4188 * If there is a callback, we pass a reference to it. 
4189 */ 4190 if (done != NULL) 4191 add_reference(hdr, hash_lock, private); 4192 if (*arc_flags & ARC_FLAG_PREFETCH) 4193 hdr->b_flags |= ARC_FLAG_PREFETCH; 4194 if (*arc_flags & ARC_FLAG_L2CACHE) 4195 hdr->b_flags |= ARC_FLAG_L2CACHE; 4196 if (*arc_flags & ARC_FLAG_L2COMPRESS) 4197 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4198 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 4199 buf->b_hdr = hdr; 4200 buf->b_data = NULL; 4201 buf->b_efunc = NULL; 4202 buf->b_private = NULL; 4203 buf->b_next = NULL; 4204 hdr->b_l1hdr.b_buf = buf; 4205 ASSERT0(hdr->b_l1hdr.b_datacnt); 4206 hdr->b_l1hdr.b_datacnt = 1; 4207 arc_get_data_buf(buf); 4208 arc_access(hdr, hash_lock); 4209 } 4210 4211 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 4212 hdr->b_flags |= ARC_FLAG_PREDICTIVE_PREFETCH; 4213 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 4214 4215 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 4216 acb->acb_done = done; 4217 acb->acb_private = private; 4218 4219 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4220 hdr->b_l1hdr.b_acb = acb; 4221 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4222 4223 if (HDR_HAS_L2HDR(hdr) && 4224 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 4225 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 4226 addr = hdr->b_l2hdr.b_daddr; 4227 b_compress = hdr->b_l2hdr.b_compress; 4228 b_asize = hdr->b_l2hdr.b_asize; 4229 /* 4230 * Lock out device removal. 4231 */ 4232 if (vdev_is_dead(vd) || 4233 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 4234 vd = NULL; 4235 } 4236 4237 if (hash_lock != NULL) 4238 mutex_exit(hash_lock); 4239 4240 /* 4241 * At this point, we have a level 1 cache miss. Try again in 4242 * L2ARC if possible. 4243 */ 4244 ASSERT3U(hdr->b_size, ==, size); 4245 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 4246 uint64_t, size, zbookmark_phys_t *, zb); 4247 ARCSTAT_BUMP(arcstat_misses); 4248 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 4249 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 4250 data, metadata, misses); 4251 4252 if (priority == ZIO_PRIORITY_ASYNC_READ) 4253 hdr->b_flags |= ARC_FLAG_PRIO_ASYNC_READ; 4254 else 4255 hdr->b_flags &= ~ARC_FLAG_PRIO_ASYNC_READ; 4256 4257 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 4258 /* 4259 * Read from the L2ARC if the following are true: 4260 * 1. The L2ARC vdev was previously cached. 4261 * 2. This buffer still has L2ARC metadata. 4262 * 3. This buffer isn't currently writing to the L2ARC. 4263 * 4. The L2ARC entry wasn't evicted, which may 4264 * also have invalidated the vdev. 4265 * 5. This isn't prefetch and l2arc_noprefetch is set. 4266 */ 4267 if (HDR_HAS_L2HDR(hdr) && 4268 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 4269 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 4270 l2arc_read_callback_t *cb; 4271 4272 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 4273 ARCSTAT_BUMP(arcstat_l2_hits); 4274 4275 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 4276 KM_SLEEP); 4277 cb->l2rcb_buf = buf; 4278 cb->l2rcb_spa = spa; 4279 cb->l2rcb_bp = *bp; 4280 cb->l2rcb_zb = *zb; 4281 cb->l2rcb_flags = zio_flags; 4282 cb->l2rcb_compress = b_compress; 4283 4284 ASSERT(addr >= VDEV_LABEL_START_SIZE && 4285 addr + size < vd->vdev_psize - 4286 VDEV_LABEL_END_SIZE); 4287 4288 /* 4289 * l2arc read. The SCL_L2ARC lock will be 4290 * released by l2arc_read_done(). 4291 * Issue a null zio if the underlying buffer 4292 * was squashed to zero size by compression. 
4293 */ 4294 if (b_compress == ZIO_COMPRESS_EMPTY) { 4295 rzio = zio_null(pio, spa, vd, 4296 l2arc_read_done, cb, 4297 zio_flags | ZIO_FLAG_DONT_CACHE | 4298 ZIO_FLAG_CANFAIL | 4299 ZIO_FLAG_DONT_PROPAGATE | 4300 ZIO_FLAG_DONT_RETRY); 4301 } else { 4302 rzio = zio_read_phys(pio, vd, addr, 4303 b_asize, buf->b_data, 4304 ZIO_CHECKSUM_OFF, 4305 l2arc_read_done, cb, priority, 4306 zio_flags | ZIO_FLAG_DONT_CACHE | 4307 ZIO_FLAG_CANFAIL | 4308 ZIO_FLAG_DONT_PROPAGATE | 4309 ZIO_FLAG_DONT_RETRY, B_FALSE); 4310 } 4311 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 4312 zio_t *, rzio); 4313 ARCSTAT_INCR(arcstat_l2_read_bytes, b_asize); 4314 4315 if (*arc_flags & ARC_FLAG_NOWAIT) { 4316 zio_nowait(rzio); 4317 return (0); 4318 } 4319 4320 ASSERT(*arc_flags & ARC_FLAG_WAIT); 4321 if (zio_wait(rzio) == 0) 4322 return (0); 4323 4324 /* l2arc read error; goto zio_read() */ 4325 } else { 4326 DTRACE_PROBE1(l2arc__miss, 4327 arc_buf_hdr_t *, hdr); 4328 ARCSTAT_BUMP(arcstat_l2_misses); 4329 if (HDR_L2_WRITING(hdr)) 4330 ARCSTAT_BUMP(arcstat_l2_rw_clash); 4331 spa_config_exit(spa, SCL_L2ARC, vd); 4332 } 4333 } else { 4334 if (vd != NULL) 4335 spa_config_exit(spa, SCL_L2ARC, vd); 4336 if (l2arc_ndev != 0) { 4337 DTRACE_PROBE1(l2arc__miss, 4338 arc_buf_hdr_t *, hdr); 4339 ARCSTAT_BUMP(arcstat_l2_misses); 4340 } 4341 } 4342 4343 rzio = zio_read(pio, spa, bp, buf->b_data, size, 4344 arc_read_done, buf, priority, zio_flags, zb); 4345 4346 if (*arc_flags & ARC_FLAG_WAIT) 4347 return (zio_wait(rzio)); 4348 4349 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 4350 zio_nowait(rzio); 4351 } 4352 return (0); 4353 } 4354 4355 void 4356 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 4357 { 4358 ASSERT(buf->b_hdr != NULL); 4359 ASSERT(buf->b_hdr->b_l1hdr.b_state != arc_anon); 4360 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt) || 4361 func == NULL); 4362 ASSERT(buf->b_efunc == NULL); 4363 ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); 4364 4365 buf->b_efunc = func; 4366 buf->b_private = private; 4367 } 4368 4369 /* 4370 * Notify the arc that a block was freed, and thus will never be used again. 4371 */ 4372 void 4373 arc_freed(spa_t *spa, const blkptr_t *bp) 4374 { 4375 arc_buf_hdr_t *hdr; 4376 kmutex_t *hash_lock; 4377 uint64_t guid = spa_load_guid(spa); 4378 4379 ASSERT(!BP_IS_EMBEDDED(bp)); 4380 4381 hdr = buf_hash_find(guid, bp, &hash_lock); 4382 if (hdr == NULL) 4383 return; 4384 if (HDR_BUF_AVAILABLE(hdr)) { 4385 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4386 add_reference(hdr, hash_lock, FTAG); 4387 hdr->b_flags &= ~ARC_FLAG_BUF_AVAILABLE; 4388 mutex_exit(hash_lock); 4389 4390 arc_release(buf, FTAG); 4391 (void) arc_buf_remove_ref(buf, FTAG); 4392 } else { 4393 mutex_exit(hash_lock); 4394 } 4395 4396 } 4397 4398 /* 4399 * Clear the user eviction callback set by arc_set_callback(), first calling 4400 * it if it exists. Because the presence of a callback keeps an arc_buf cached 4401 * clearing the callback may result in the arc_buf being destroyed. However, 4402 * it will not result in the *last* arc_buf being destroyed, hence the data 4403 * will remain cached in the ARC. We make a copy of the arc buffer here so 4404 * that we can process the callback without holding any locks. 4405 * 4406 * It's possible that the callback is already in the process of being cleared 4407 * by another thread. In this case we can not clear the callback. 4408 * 4409 * Returns B_TRUE if the callback was successfully called and cleared. 
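 *
 * If B_FALSE is returned, the callback is instead being handled by
 * another thread (a concurrent arc_clear_callback() or
 * arc_do_user_evicts()), so the caller must not assume its eviction
 * function has already run on its behalf.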
4410 */ 4411 boolean_t 4412 arc_clear_callback(arc_buf_t *buf) 4413 { 4414 arc_buf_hdr_t *hdr; 4415 kmutex_t *hash_lock; 4416 arc_evict_func_t *efunc = buf->b_efunc; 4417 void *private = buf->b_private; 4418 4419 mutex_enter(&buf->b_evict_lock); 4420 hdr = buf->b_hdr; 4421 if (hdr == NULL) { 4422 /* 4423 * We are in arc_do_user_evicts(). 4424 */ 4425 ASSERT(buf->b_data == NULL); 4426 mutex_exit(&buf->b_evict_lock); 4427 return (B_FALSE); 4428 } else if (buf->b_data == NULL) { 4429 /* 4430 * We are on the eviction list; process this buffer now 4431 * but let arc_do_user_evicts() do the reaping. 4432 */ 4433 buf->b_efunc = NULL; 4434 mutex_exit(&buf->b_evict_lock); 4435 VERIFY0(efunc(private)); 4436 return (B_TRUE); 4437 } 4438 hash_lock = HDR_LOCK(hdr); 4439 mutex_enter(hash_lock); 4440 hdr = buf->b_hdr; 4441 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4442 4443 ASSERT3U(refcount_count(&hdr->b_l1hdr.b_refcnt), <, 4444 hdr->b_l1hdr.b_datacnt); 4445 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 4446 hdr->b_l1hdr.b_state == arc_mfu); 4447 4448 buf->b_efunc = NULL; 4449 buf->b_private = NULL; 4450 4451 if (hdr->b_l1hdr.b_datacnt > 1) { 4452 mutex_exit(&buf->b_evict_lock); 4453 arc_buf_destroy(buf, TRUE); 4454 } else { 4455 ASSERT(buf == hdr->b_l1hdr.b_buf); 4456 hdr->b_flags |= ARC_FLAG_BUF_AVAILABLE; 4457 mutex_exit(&buf->b_evict_lock); 4458 } 4459 4460 mutex_exit(hash_lock); 4461 VERIFY0(efunc(private)); 4462 return (B_TRUE); 4463 } 4464 4465 /* 4466 * Release this buffer from the cache, making it an anonymous buffer. This 4467 * must be done after a read and prior to modifying the buffer contents. 4468 * If the buffer has more than one reference, we must make 4469 * a new hdr for the buffer. 4470 */ 4471 void 4472 arc_release(arc_buf_t *buf, void *tag) 4473 { 4474 arc_buf_hdr_t *hdr = buf->b_hdr; 4475 4476 /* 4477 * It would be nice to assert that if it's DMU metadata (level > 4478 * 0 || it's the dnode file), then it must be syncing context. 4479 * But we don't know that information at this level. 4480 */ 4481 4482 mutex_enter(&buf->b_evict_lock); 4483 4484 ASSERT(HDR_HAS_L1HDR(hdr)); 4485 4486 /* 4487 * We don't grab the hash lock prior to this check, because if 4488 * the buffer's header is in the arc_anon state, it won't be 4489 * linked into the hash table. 4490 */ 4491 if (hdr->b_l1hdr.b_state == arc_anon) { 4492 mutex_exit(&buf->b_evict_lock); 4493 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4494 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 4495 ASSERT(!HDR_HAS_L2HDR(hdr)); 4496 ASSERT(BUF_EMPTY(hdr)); 4497 4498 ASSERT3U(hdr->b_l1hdr.b_datacnt, ==, 1); 4499 ASSERT3S(refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 4500 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 4501 4502 ASSERT3P(buf->b_efunc, ==, NULL); 4503 ASSERT3P(buf->b_private, ==, NULL); 4504 4505 hdr->b_l1hdr.b_arc_access = 0; 4506 arc_buf_thaw(buf); 4507 4508 return; 4509 } 4510 4511 kmutex_t *hash_lock = HDR_LOCK(hdr); 4512 mutex_enter(hash_lock); 4513 4514 /* 4515 * This assignment is only valid as long as the hash_lock is 4516 * held, we must be careful not to reference state or the 4517 * b_state field after dropping the lock. 
4518 */ 4519 arc_state_t *state = hdr->b_l1hdr.b_state; 4520 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4521 ASSERT3P(state, !=, arc_anon); 4522 4523 /* this buffer is not on any list */ 4524 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) > 0); 4525 4526 if (HDR_HAS_L2HDR(hdr)) { 4527 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4528 4529 /* 4530 * We have to recheck this conditional again now that 4531 * we're holding the l2ad_mtx to prevent a race with 4532 * another thread which might be concurrently calling 4533 * l2arc_evict(). In that case, l2arc_evict() might have 4534 * destroyed the header's L2 portion as we were waiting 4535 * to acquire the l2ad_mtx. 4536 */ 4537 if (HDR_HAS_L2HDR(hdr)) 4538 arc_hdr_l2hdr_destroy(hdr); 4539 4540 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 4541 } 4542 4543 /* 4544 * Do we have more than one buf? 4545 */ 4546 if (hdr->b_l1hdr.b_datacnt > 1) { 4547 arc_buf_hdr_t *nhdr; 4548 arc_buf_t **bufp; 4549 uint64_t blksz = hdr->b_size; 4550 uint64_t spa = hdr->b_spa; 4551 arc_buf_contents_t type = arc_buf_type(hdr); 4552 uint32_t flags = hdr->b_flags; 4553 4554 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 4555 /* 4556 * Pull the data off of this hdr and attach it to 4557 * a new anonymous hdr. 4558 */ 4559 (void) remove_reference(hdr, hash_lock, tag); 4560 bufp = &hdr->b_l1hdr.b_buf; 4561 while (*bufp != buf) 4562 bufp = &(*bufp)->b_next; 4563 *bufp = buf->b_next; 4564 buf->b_next = NULL; 4565 4566 ASSERT3P(state, !=, arc_l2c_only); 4567 4568 (void) refcount_remove_many( 4569 &state->arcs_size, hdr->b_size, buf); 4570 4571 if (refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 4572 ASSERT3P(state, !=, arc_l2c_only); 4573 uint64_t *size = &state->arcs_lsize[type]; 4574 ASSERT3U(*size, >=, hdr->b_size); 4575 atomic_add_64(size, -hdr->b_size); 4576 } 4577 4578 /* 4579 * We're releasing a duplicate user data buffer, update 4580 * our statistics accordingly. 
4581 */ 4582 if (HDR_ISTYPE_DATA(hdr)) { 4583 ARCSTAT_BUMPDOWN(arcstat_duplicate_buffers); 4584 ARCSTAT_INCR(arcstat_duplicate_buffers_size, 4585 -hdr->b_size); 4586 } 4587 hdr->b_l1hdr.b_datacnt -= 1; 4588 arc_cksum_verify(buf); 4589 arc_buf_unwatch(buf); 4590 4591 mutex_exit(hash_lock); 4592 4593 nhdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 4594 nhdr->b_size = blksz; 4595 nhdr->b_spa = spa; 4596 4597 nhdr->b_flags = flags & ARC_FLAG_L2_WRITING; 4598 nhdr->b_flags |= arc_bufc_to_flags(type); 4599 nhdr->b_flags |= ARC_FLAG_HAS_L1HDR; 4600 4601 nhdr->b_l1hdr.b_buf = buf; 4602 nhdr->b_l1hdr.b_datacnt = 1; 4603 nhdr->b_l1hdr.b_state = arc_anon; 4604 nhdr->b_l1hdr.b_arc_access = 0; 4605 nhdr->b_l1hdr.b_tmp_cdata = NULL; 4606 nhdr->b_freeze_cksum = NULL; 4607 4608 (void) refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 4609 buf->b_hdr = nhdr; 4610 mutex_exit(&buf->b_evict_lock); 4611 (void) refcount_add_many(&arc_anon->arcs_size, blksz, buf); 4612 } else { 4613 mutex_exit(&buf->b_evict_lock); 4614 ASSERT(refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 4615 /* protected by hash lock, or hdr is on arc_anon */ 4616 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4617 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4618 arc_change_state(arc_anon, hdr, hash_lock); 4619 hdr->b_l1hdr.b_arc_access = 0; 4620 mutex_exit(hash_lock); 4621 4622 buf_discard_identity(hdr); 4623 arc_buf_thaw(buf); 4624 } 4625 buf->b_efunc = NULL; 4626 buf->b_private = NULL; 4627 } 4628 4629 int 4630 arc_released(arc_buf_t *buf) 4631 { 4632 int released; 4633 4634 mutex_enter(&buf->b_evict_lock); 4635 released = (buf->b_data != NULL && 4636 buf->b_hdr->b_l1hdr.b_state == arc_anon); 4637 mutex_exit(&buf->b_evict_lock); 4638 return (released); 4639 } 4640 4641 #ifdef ZFS_DEBUG 4642 int 4643 arc_referenced(arc_buf_t *buf) 4644 { 4645 int referenced; 4646 4647 mutex_enter(&buf->b_evict_lock); 4648 referenced = (refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 4649 mutex_exit(&buf->b_evict_lock); 4650 return (referenced); 4651 } 4652 #endif 4653 4654 static void 4655 arc_write_ready(zio_t *zio) 4656 { 4657 arc_write_callback_t *callback = zio->io_private; 4658 arc_buf_t *buf = callback->awcb_buf; 4659 arc_buf_hdr_t *hdr = buf->b_hdr; 4660 4661 ASSERT(HDR_HAS_L1HDR(hdr)); 4662 ASSERT(!refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 4663 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4664 callback->awcb_ready(zio, buf, callback->awcb_private); 4665 4666 /* 4667 * If the IO is already in progress, then this is a re-write 4668 * attempt, so we need to thaw and re-compute the cksum. 4669 * It is the responsibility of the callback to handle the 4670 * accounting for any re-write attempt. 4671 */ 4672 if (HDR_IO_IN_PROGRESS(hdr)) { 4673 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 4674 if (hdr->b_freeze_cksum != NULL) { 4675 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 4676 hdr->b_freeze_cksum = NULL; 4677 } 4678 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 4679 } 4680 arc_cksum_compute(buf, B_FALSE); 4681 hdr->b_flags |= ARC_FLAG_IO_IN_PROGRESS; 4682 } 4683 4684 static void 4685 arc_write_children_ready(zio_t *zio) 4686 { 4687 arc_write_callback_t *callback = zio->io_private; 4688 arc_buf_t *buf = callback->awcb_buf; 4689 4690 callback->awcb_children_ready(zio, buf, callback->awcb_private); 4691 } 4692 4693 /* 4694 * The SPA calls this callback for each physical write that happens on behalf 4695 * of a logical write. See the comment in dbuf_write_physdone() for details. 
4696 */ 4697 static void 4698 arc_write_physdone(zio_t *zio) 4699 { 4700 arc_write_callback_t *cb = zio->io_private; 4701 if (cb->awcb_physdone != NULL) 4702 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 4703 } 4704 4705 static void 4706 arc_write_done(zio_t *zio) 4707 { 4708 arc_write_callback_t *callback = zio->io_private; 4709 arc_buf_t *buf = callback->awcb_buf; 4710 arc_buf_hdr_t *hdr = buf->b_hdr; 4711 4712 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4713 4714 if (zio->io_error == 0) { 4715 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 4716 buf_discard_identity(hdr); 4717 } else { 4718 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 4719 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 4720 } 4721 } else { 4722 ASSERT(BUF_EMPTY(hdr)); 4723 } 4724 4725 /* 4726 * If the block to be written was all-zero or compressed enough to be 4727 * embedded in the BP, no write was performed so there will be no 4728 * dva/birth/checksum. The buffer must therefore remain anonymous 4729 * (and uncached). 4730 */ 4731 if (!BUF_EMPTY(hdr)) { 4732 arc_buf_hdr_t *exists; 4733 kmutex_t *hash_lock; 4734 4735 ASSERT(zio->io_error == 0); 4736 4737 arc_cksum_verify(buf); 4738 4739 exists = buf_hash_insert(hdr, &hash_lock); 4740 if (exists != NULL) { 4741 /* 4742 * This can only happen if we overwrite for 4743 * sync-to-convergence, because we remove 4744 * buffers from the hash table when we arc_free(). 4745 */ 4746 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 4747 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4748 panic("bad overwrite, hdr=%p exists=%p", 4749 (void *)hdr, (void *)exists); 4750 ASSERT(refcount_is_zero( 4751 &exists->b_l1hdr.b_refcnt)); 4752 arc_change_state(arc_anon, exists, hash_lock); 4753 mutex_exit(hash_lock); 4754 arc_hdr_destroy(exists); 4755 exists = buf_hash_insert(hdr, &hash_lock); 4756 ASSERT3P(exists, ==, NULL); 4757 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 4758 /* nopwrite */ 4759 ASSERT(zio->io_prop.zp_nopwrite); 4760 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 4761 panic("bad nopwrite, hdr=%p exists=%p", 4762 (void *)hdr, (void *)exists); 4763 } else { 4764 /* Dedup */ 4765 ASSERT(hdr->b_l1hdr.b_datacnt == 1); 4766 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 4767 ASSERT(BP_GET_DEDUP(zio->io_bp)); 4768 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 4769 } 4770 } 4771 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4772 /* if it's not anon, we are doing a scrub */ 4773 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 4774 arc_access(hdr, hash_lock); 4775 mutex_exit(hash_lock); 4776 } else { 4777 hdr->b_flags &= ~ARC_FLAG_IO_IN_PROGRESS; 4778 } 4779 4780 ASSERT(!refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 4781 callback->awcb_done(zio, buf, callback->awcb_private); 4782 4783 kmem_free(callback, sizeof (arc_write_callback_t)); 4784 } 4785 4786 zio_t * 4787 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, 4788 blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, boolean_t l2arc_compress, 4789 const zio_prop_t *zp, arc_done_func_t *ready, 4790 arc_done_func_t *children_ready, arc_done_func_t *physdone, 4791 arc_done_func_t *done, void *private, zio_priority_t priority, 4792 int zio_flags, const zbookmark_phys_t *zb) 4793 { 4794 arc_buf_hdr_t *hdr = buf->b_hdr; 4795 arc_write_callback_t *callback; 4796 zio_t *zio; 4797 4798 ASSERT(ready != NULL); 4799 ASSERT(done != NULL); 4800 ASSERT(!HDR_IO_ERROR(hdr)); 4801 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4802 ASSERT(hdr->b_l1hdr.b_acb == NULL); 4803 ASSERT(hdr->b_l1hdr.b_datacnt > 0); 4804 if (l2arc) 4805 hdr->b_flags |= ARC_FLAG_L2CACHE; 4806 if 
(l2arc_compress) 4807 hdr->b_flags |= ARC_FLAG_L2COMPRESS; 4808 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 4809 callback->awcb_ready = ready; 4810 callback->awcb_children_ready = children_ready; 4811 callback->awcb_physdone = physdone; 4812 callback->awcb_done = done; 4813 callback->awcb_private = private; 4814 callback->awcb_buf = buf; 4815 4816 zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, 4817 arc_write_ready, 4818 (children_ready != NULL) ? arc_write_children_ready : NULL, 4819 arc_write_physdone, arc_write_done, callback, 4820 priority, zio_flags, zb); 4821 4822 return (zio); 4823 } 4824 4825 static int 4826 arc_memory_throttle(uint64_t reserve, uint64_t txg) 4827 { 4828 #ifdef _KERNEL 4829 uint64_t available_memory = ptob(freemem); 4830 static uint64_t page_load = 0; 4831 static uint64_t last_txg = 0; 4832 4833 #if defined(__i386) 4834 available_memory = 4835 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 4836 #endif 4837 4838 if (freemem > physmem * arc_lotsfree_percent / 100) 4839 return (0); 4840 4841 if (txg > last_txg) { 4842 last_txg = txg; 4843 page_load = 0; 4844 } 4845 /* 4846 * If we are in pageout, we know that memory is already tight, 4847 * the arc is already going to be evicting, so we just want to 4848 * continue to let page writes occur as quickly as possible. 4849 */ 4850 if (curproc == proc_pageout) { 4851 if (page_load > MAX(ptob(minfree), available_memory) / 4) 4852 return (SET_ERROR(ERESTART)); 4853 /* Note: reserve is inflated, so we deflate */ 4854 page_load += reserve / 8; 4855 return (0); 4856 } else if (page_load > 0 && arc_reclaim_needed()) { 4857 /* memory is low, delay before restarting */ 4858 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 4859 return (SET_ERROR(EAGAIN)); 4860 } 4861 page_load = 0; 4862 #endif 4863 return (0); 4864 } 4865 4866 void 4867 arc_tempreserve_clear(uint64_t reserve) 4868 { 4869 atomic_add_64(&arc_tempreserve, -reserve); 4870 ASSERT((int64_t)arc_tempreserve >= 0); 4871 } 4872 4873 int 4874 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 4875 { 4876 int error; 4877 uint64_t anon_size; 4878 4879 if (reserve > arc_c/4 && !arc_no_grow) 4880 arc_c = MIN(arc_c_max, reserve * 4); 4881 if (reserve > arc_c) 4882 return (SET_ERROR(ENOMEM)); 4883 4884 /* 4885 * Don't count loaned bufs as in flight dirty data to prevent long 4886 * network delays from blocking transactions that are ready to be 4887 * assigned to a txg. 4888 */ 4889 anon_size = MAX((int64_t)(refcount_count(&arc_anon->arcs_size) - 4890 arc_loaned_bytes), 0); 4891 4892 /* 4893 * Writes will, almost always, require additional memory allocations 4894 * in order to compress/encrypt/etc the data. We therefore need to 4895 * make sure that there is sufficient available memory for this. 4896 */ 4897 error = arc_memory_throttle(reserve, txg); 4898 if (error != 0) 4899 return (error); 4900 4901 /* 4902 * Throttle writes when the amount of dirty data in the cache 4903 * gets too large. We try to keep the cache less than half full 4904 * of dirty blocks so that our sync times don't grow too large. 4905 * Note: if two requests come in concurrently, we might let them 4906 * both succeed, when one of them should fail. Not a huge deal. 
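 *
 * For example (numbers purely illustrative): with arc_c at 4 GB,
 * the check below refuses the reservation with ERESTART once
 * reserve + arc_tempreserve + anon_size would exceed 2 GB
 * (arc_c / 2) while anonymous data alone exceeds 1 GB (arc_c / 4).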
	 */

	if (reserve + arc_tempreserve + anon_size > arc_c / 2 &&
	    anon_size > arc_c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
		    reserve>>10, arc_c>>10);
		return (SET_ERROR(ERESTART));
	}
	atomic_add_64(&arc_tempreserve, reserve);
	return (0);
}

static void
arc_kstat_update_state(arc_state_t *state, kstat_named_t *size,
    kstat_named_t *evict_data, kstat_named_t *evict_metadata)
{
	size->value.ui64 = refcount_count(&state->arcs_size);
	evict_data->value.ui64 = state->arcs_lsize[ARC_BUFC_DATA];
	evict_metadata->value.ui64 = state->arcs_lsize[ARC_BUFC_METADATA];
}

static int
arc_kstat_update(kstat_t *ksp, int rw)
{
	arc_stats_t *as = ksp->ks_data;

	if (rw == KSTAT_WRITE) {
		return (EACCES);
	} else {
		arc_kstat_update_state(arc_anon,
		    &as->arcstat_anon_size,
		    &as->arcstat_anon_evictable_data,
		    &as->arcstat_anon_evictable_metadata);
		arc_kstat_update_state(arc_mru,
		    &as->arcstat_mru_size,
		    &as->arcstat_mru_evictable_data,
		    &as->arcstat_mru_evictable_metadata);
		arc_kstat_update_state(arc_mru_ghost,
		    &as->arcstat_mru_ghost_size,
		    &as->arcstat_mru_ghost_evictable_data,
		    &as->arcstat_mru_ghost_evictable_metadata);
		arc_kstat_update_state(arc_mfu,
		    &as->arcstat_mfu_size,
		    &as->arcstat_mfu_evictable_data,
		    &as->arcstat_mfu_evictable_metadata);
		arc_kstat_update_state(arc_mfu_ghost,
		    &as->arcstat_mfu_ghost_size,
		    &as->arcstat_mfu_ghost_evictable_data,
		    &as->arcstat_mfu_ghost_evictable_metadata);
	}

	return (0);
}

/*
 * This function *must* return indices evenly distributed between all
 * sublists of the multilist. This is needed due to how the ARC eviction
 * code is laid out; arc_evict_state() assumes ARC buffers are evenly
 * distributed between all sublists and uses this assumption when
 * deciding which sublist to evict from and how much to evict from it.
 */
unsigned int
arc_state_multilist_index_func(multilist_t *ml, void *obj)
{
	arc_buf_hdr_t *hdr = obj;

	/*
	 * We rely on b_dva to generate evenly distributed index
	 * numbers using buf_hash below. So, as an added precaution,
	 * let's make sure we never add empty buffers to the arc lists.
	 */
	ASSERT(!BUF_EMPTY(hdr));

	/*
	 * The assumption here is that the hash value for a given
	 * arc_buf_hdr_t will remain constant throughout its lifetime
	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
	 * Thus, we don't need to store the header's sublist index
	 * on insertion, as this index can be recalculated on removal.
	 *
	 * Also, the low order bits of the hash value are thought to be
	 * distributed evenly. Otherwise, in the case that the multilist
	 * has a power of two number of sublists, each sublist's usage
	 * would not be evenly distributed.
	 */
	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
	    multilist_get_num_sublists(ml));
}

void
arc_init(void)
{
	/*
	 * allmem is "all memory that we could possibly use".
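	 *
	 * For example (illustrative only): on a kernel system with 16 GB
	 * of physical memory and a negligible swapfs_minfree, allmem is
	 * roughly 16 GB; the initial sizing below then yields an arc_c_min
	 * of 512 MB (allmem / 32, which exceeds the 64 MB floor) and an
	 * arc_c_max of 15 GB (all but 1 GB, which exceeds 3/4 of allmem).
	 * The tunables and the metadata limit may adjust both further.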
5005 */ 5006 #ifdef _KERNEL 5007 uint64_t allmem = ptob(physmem - swapfs_minfree); 5008 #else 5009 uint64_t allmem = (physmem * PAGESIZE) / 2; 5010 #endif 5011 5012 mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); 5013 cv_init(&arc_reclaim_thread_cv, NULL, CV_DEFAULT, NULL); 5014 cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL); 5015 5016 mutex_init(&arc_user_evicts_lock, NULL, MUTEX_DEFAULT, NULL); 5017 cv_init(&arc_user_evicts_cv, NULL, CV_DEFAULT, NULL); 5018 5019 /* Convert seconds to clock ticks */ 5020 arc_min_prefetch_lifespan = 1 * hz; 5021 5022 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 5023 arc_c_min = MAX(allmem / 32, 64 << 20); 5024 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 5025 if (allmem >= 1 << 30) 5026 arc_c_max = allmem - (1 << 30); 5027 else 5028 arc_c_max = arc_c_min; 5029 arc_c_max = MAX(allmem * 3 / 4, arc_c_max); 5030 5031 /* 5032 * In userland, there's only the memory pressure that we artificially 5033 * create (see arc_available_memory()). Don't let arc_c get too 5034 * small, because it can cause transactions to be larger than 5035 * arc_c, causing arc_tempreserve_space() to fail. 5036 */ 5037 #ifndef _KERNEL 5038 arc_c_min = arc_c_max / 2; 5039 #endif 5040 5041 /* 5042 * Allow the tunables to override our calculations if they are 5043 * reasonable (ie. over 64MB) 5044 */ 5045 if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem) 5046 arc_c_max = zfs_arc_max; 5047 if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max) 5048 arc_c_min = zfs_arc_min; 5049 5050 arc_c = arc_c_max; 5051 arc_p = (arc_c >> 1); 5052 5053 /* limit meta-data to 1/4 of the arc capacity */ 5054 arc_meta_limit = arc_c_max / 4; 5055 5056 #ifdef _KERNEL 5057 /* 5058 * Metadata is stored in the kernel's heap. Don't let us 5059 * use more than half the heap for the ARC. 5060 */ 5061 arc_meta_limit = MIN(arc_meta_limit, 5062 vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); 5063 #endif 5064 5065 /* Allow the tunable to override if it is reasonable */ 5066 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 5067 arc_meta_limit = zfs_arc_meta_limit; 5068 5069 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 5070 arc_c_min = arc_meta_limit / 2; 5071 5072 if (zfs_arc_meta_min > 0) { 5073 arc_meta_min = zfs_arc_meta_min; 5074 } else { 5075 arc_meta_min = arc_c_min / 2; 5076 } 5077 5078 if (zfs_arc_grow_retry > 0) 5079 arc_grow_retry = zfs_arc_grow_retry; 5080 5081 if (zfs_arc_shrink_shift > 0) 5082 arc_shrink_shift = zfs_arc_shrink_shift; 5083 5084 /* 5085 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 
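 * For example, if arc_shrink_shift were tuned down to 5, an
 * arc_no_grow_shift of 5 or more would be clamped to 4 here.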
5086 */ 5087 if (arc_no_grow_shift >= arc_shrink_shift) 5088 arc_no_grow_shift = arc_shrink_shift - 1; 5089 5090 if (zfs_arc_p_min_shift > 0) 5091 arc_p_min_shift = zfs_arc_p_min_shift; 5092 5093 if (zfs_arc_num_sublists_per_state < 1) 5094 zfs_arc_num_sublists_per_state = MAX(boot_ncpus, 1); 5095 5096 /* if kmem_flags are set, lets try to use less memory */ 5097 if (kmem_debugging()) 5098 arc_c = arc_c / 2; 5099 if (arc_c < arc_c_min) 5100 arc_c = arc_c_min; 5101 5102 arc_anon = &ARC_anon; 5103 arc_mru = &ARC_mru; 5104 arc_mru_ghost = &ARC_mru_ghost; 5105 arc_mfu = &ARC_mfu; 5106 arc_mfu_ghost = &ARC_mfu_ghost; 5107 arc_l2c_only = &ARC_l2c_only; 5108 arc_size = 0; 5109 5110 multilist_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 5111 sizeof (arc_buf_hdr_t), 5112 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5113 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5114 multilist_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 5115 sizeof (arc_buf_hdr_t), 5116 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5117 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5118 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 5119 sizeof (arc_buf_hdr_t), 5120 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5121 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5122 multilist_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 5123 sizeof (arc_buf_hdr_t), 5124 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5125 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5126 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 5127 sizeof (arc_buf_hdr_t), 5128 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5129 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5130 multilist_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 5131 sizeof (arc_buf_hdr_t), 5132 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5133 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5134 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 5135 sizeof (arc_buf_hdr_t), 5136 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5137 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5138 multilist_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 5139 sizeof (arc_buf_hdr_t), 5140 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5141 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5142 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 5143 sizeof (arc_buf_hdr_t), 5144 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5145 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5146 multilist_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 5147 sizeof (arc_buf_hdr_t), 5148 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 5149 zfs_arc_num_sublists_per_state, arc_state_multilist_index_func); 5150 5151 refcount_create(&arc_anon->arcs_size); 5152 refcount_create(&arc_mru->arcs_size); 5153 refcount_create(&arc_mru_ghost->arcs_size); 5154 refcount_create(&arc_mfu->arcs_size); 5155 refcount_create(&arc_mfu_ghost->arcs_size); 5156 refcount_create(&arc_l2c_only->arcs_size); 5157 5158 buf_init(); 5159 5160 arc_reclaim_thread_exit = FALSE; 5161 arc_user_evicts_thread_exit = FALSE; 5162 arc_eviction_list = NULL; 5163 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 5164 5165 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 5166 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 5167 5168 if (arc_ksp != NULL) { 5169 arc_ksp->ks_data = &arc_stats; 5170 arc_ksp->ks_update = arc_kstat_update; 5171 
kstat_install(arc_ksp); 5172 } 5173 5174 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 5175 TS_RUN, minclsyspri); 5176 5177 (void) thread_create(NULL, 0, arc_user_evicts_thread, NULL, 0, &p0, 5178 TS_RUN, minclsyspri); 5179 5180 arc_dead = FALSE; 5181 arc_warm = B_FALSE; 5182 5183 /* 5184 * Calculate maximum amount of dirty data per pool. 5185 * 5186 * If it has been set by /etc/system, take that. 5187 * Otherwise, use a percentage of physical memory defined by 5188 * zfs_dirty_data_max_percent (default 10%) with a cap at 5189 * zfs_dirty_data_max_max (default 4GB). 5190 */ 5191 if (zfs_dirty_data_max == 0) { 5192 zfs_dirty_data_max = physmem * PAGESIZE * 5193 zfs_dirty_data_max_percent / 100; 5194 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 5195 zfs_dirty_data_max_max); 5196 } 5197 } 5198 5199 void 5200 arc_fini(void) 5201 { 5202 mutex_enter(&arc_reclaim_lock); 5203 arc_reclaim_thread_exit = TRUE; 5204 /* 5205 * The reclaim thread will set arc_reclaim_thread_exit back to 5206 * FALSE when it is finished exiting; we're waiting for that. 5207 */ 5208 while (arc_reclaim_thread_exit) { 5209 cv_signal(&arc_reclaim_thread_cv); 5210 cv_wait(&arc_reclaim_thread_cv, &arc_reclaim_lock); 5211 } 5212 mutex_exit(&arc_reclaim_lock); 5213 5214 mutex_enter(&arc_user_evicts_lock); 5215 arc_user_evicts_thread_exit = TRUE; 5216 /* 5217 * The user evicts thread will set arc_user_evicts_thread_exit 5218 * to FALSE when it is finished exiting; we're waiting for that. 5219 */ 5220 while (arc_user_evicts_thread_exit) { 5221 cv_signal(&arc_user_evicts_cv); 5222 cv_wait(&arc_user_evicts_cv, &arc_user_evicts_lock); 5223 } 5224 mutex_exit(&arc_user_evicts_lock); 5225 5226 /* Use TRUE to ensure *all* buffers are evicted */ 5227 arc_flush(NULL, TRUE); 5228 5229 arc_dead = TRUE; 5230 5231 if (arc_ksp != NULL) { 5232 kstat_delete(arc_ksp); 5233 arc_ksp = NULL; 5234 } 5235 5236 mutex_destroy(&arc_reclaim_lock); 5237 cv_destroy(&arc_reclaim_thread_cv); 5238 cv_destroy(&arc_reclaim_waiters_cv); 5239 5240 mutex_destroy(&arc_user_evicts_lock); 5241 cv_destroy(&arc_user_evicts_cv); 5242 5243 refcount_destroy(&arc_anon->arcs_size); 5244 refcount_destroy(&arc_mru->arcs_size); 5245 refcount_destroy(&arc_mru_ghost->arcs_size); 5246 refcount_destroy(&arc_mfu->arcs_size); 5247 refcount_destroy(&arc_mfu_ghost->arcs_size); 5248 refcount_destroy(&arc_l2c_only->arcs_size); 5249 5250 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 5251 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 5252 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 5253 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 5254 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); 5255 multilist_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 5256 multilist_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 5257 multilist_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 5258 multilist_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 5259 multilist_destroy(&arc_l2c_only->arcs_list[ARC_BUFC_DATA]); 5260 5261 buf_fini(); 5262 5263 ASSERT0(arc_loaned_bytes); 5264 } 5265 5266 /* 5267 * Level 2 ARC 5268 * 5269 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 5270 * It uses dedicated storage devices to hold cached data, which are populated 5271 * using large infrequent writes. The main role of this cache is to boost 5272 * the performance of random read workloads. 
The intended L2ARC devices 5273 * include short-stroked disks, solid state disks, and other media with 5274 * substantially faster read latency than disk. 5275 * 5276 * +-----------------------+ 5277 * | ARC | 5278 * +-----------------------+ 5279 * | ^ ^ 5280 * | | | 5281 * l2arc_feed_thread() arc_read() 5282 * | | | 5283 * | l2arc read | 5284 * V | | 5285 * +---------------+ | 5286 * | L2ARC | | 5287 * +---------------+ | 5288 * | ^ | 5289 * l2arc_write() | | 5290 * | | | 5291 * V | | 5292 * +-------+ +-------+ 5293 * | vdev | | vdev | 5294 * | cache | | cache | 5295 * +-------+ +-------+ 5296 * +=========+ .-----. 5297 * : L2ARC : |-_____-| 5298 * : devices : | Disks | 5299 * +=========+ `-_____-' 5300 * 5301 * Read requests are satisfied from the following sources, in order: 5302 * 5303 * 1) ARC 5304 * 2) vdev cache of L2ARC devices 5305 * 3) L2ARC devices 5306 * 4) vdev cache of disks 5307 * 5) disks 5308 * 5309 * Some L2ARC device types exhibit extremely slow write performance. 5310 * To accommodate for this there are some significant differences between 5311 * the L2ARC and traditional cache design: 5312 * 5313 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 5314 * the ARC behave as usual, freeing buffers and placing headers on ghost 5315 * lists. The ARC does not send buffers to the L2ARC during eviction as 5316 * this would add inflated write latencies for all ARC memory pressure. 5317 * 5318 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 5319 * It does this by periodically scanning buffers from the eviction-end of 5320 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 5321 * not already there. It scans until a headroom of buffers is satisfied, 5322 * which itself is a buffer for ARC eviction. If a compressible buffer is 5323 * found during scanning and selected for writing to an L2ARC device, we 5324 * temporarily boost scanning headroom during the next scan cycle to make 5325 * sure we adapt to compression effects (which might significantly reduce 5326 * the data volume we write to L2ARC). The thread that does this is 5327 * l2arc_feed_thread(), illustrated below; example sizes are included to 5328 * provide a better sense of ratio than this diagram: 5329 * 5330 * head --> tail 5331 * +---------------------+----------+ 5332 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 5333 * +---------------------+----------+ | o L2ARC eligible 5334 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 5335 * +---------------------+----------+ | 5336 * 15.9 Gbytes ^ 32 Mbytes | 5337 * headroom | 5338 * l2arc_feed_thread() 5339 * | 5340 * l2arc write hand <--[oooo]--' 5341 * | 8 Mbyte 5342 * | write max 5343 * V 5344 * +==============================+ 5345 * L2ARC dev |####|#|###|###| |####| ... | 5346 * +==============================+ 5347 * 32 Gbytes 5348 * 5349 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 5350 * evicted, then the L2ARC has cached a buffer much sooner than it probably 5351 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 5352 * safe to say that this is an uncommon case, since buffers at the end of 5353 * the ARC lists have moved there due to inactivity. 5354 * 5355 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 5356 * then the L2ARC simply misses copying some buffers. 
This serves as a 5357 * pressure valve to prevent heavy read workloads from both stalling the ARC 5358 * with waits and clogging the L2ARC with writes. This also helps prevent 5359 * the potential for the L2ARC to churn if it attempts to cache content too 5360 * quickly, such as during backups of the entire pool. 5361 * 5362 * 5. After system boot and before the ARC has filled main memory, there are 5363 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 5364 * lists can remain mostly static. Instead of searching from tail of these 5365 * lists as pictured, the l2arc_feed_thread() will search from the list heads 5366 * for eligible buffers, greatly increasing its chance of finding them. 5367 * 5368 * The L2ARC device write speed is also boosted during this time so that 5369 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 5370 * there are no L2ARC reads, and no fear of degrading read performance 5371 * through increased writes. 5372 * 5373 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 5374 * the vdev queue can aggregate them into larger and fewer writes. Each 5375 * device is written to in a rotor fashion, sweeping writes through 5376 * available space then repeating. 5377 * 5378 * 7. The L2ARC does not store dirty content. It never needs to flush 5379 * write buffers back to disk based storage. 5380 * 5381 * 8. If an ARC buffer is written (and dirtied) which also exists in the 5382 * L2ARC, the now stale L2ARC buffer is immediately dropped. 5383 * 5384 * The performance of the L2ARC can be tweaked by a number of tunables, which 5385 * may be necessary for different workloads: 5386 * 5387 * l2arc_write_max max write bytes per interval 5388 * l2arc_write_boost extra write bytes during device warmup 5389 * l2arc_noprefetch skip caching prefetched buffers 5390 * l2arc_headroom number of max device writes to precache 5391 * l2arc_headroom_boost when we find compressed buffers during ARC 5392 * scanning, we multiply headroom by this 5393 * percentage factor for the next scan cycle, 5394 * since more compressed buffers are likely to 5395 * be present 5396 * l2arc_feed_secs seconds between L2ARC writing 5397 * 5398 * Tunables may be removed or added as future performance improvements are 5399 * integrated, and also may become zpool properties. 5400 * 5401 * There are three key functions that control how the L2ARC warms up: 5402 * 5403 * l2arc_write_eligible() check if a buffer is eligible to cache 5404 * l2arc_write_size() calculate how much to write 5405 * l2arc_write_interval() calculate sleep delay between writes 5406 * 5407 * These three functions determine what to write, how much, and how quickly 5408 * to send writes. 5409 */ 5410 5411 static boolean_t 5412 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 5413 { 5414 /* 5415 * A buffer is *not* eligible for the L2ARC if it: 5416 * 1. belongs to a different spa. 5417 * 2. is already cached on the L2ARC. 5418 * 3. has an I/O in progress (it may be an incomplete read). 5419 * 4. is flagged not eligible (zfs property). 5420 */ 5421 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || 5422 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) 5423 return (B_FALSE); 5424 5425 return (B_TRUE); 5426 } 5427 5428 static uint64_t 5429 l2arc_write_size(void) 5430 { 5431 uint64_t size; 5432 5433 /* 5434 * Make sure our globals have meaningful values in case the user 5435 * altered them. 
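 *
 * For example, with the 8 Mbyte write max shown in the diagram above and a
 * comparable l2arc_write_boost (the exact default may differ), a feed pass
 * issued while the ARC is still cold (arc_warm == B_FALSE) would target
 * roughly 8 + 8 = 16 Mbytes, while a pass after the ARC has warmed up would
 * target only l2arc_write_max.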
5436 */ 5437 size = l2arc_write_max; 5438 if (size == 0) { 5439 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 5440 "be greater than zero, resetting it to the default (%d)", 5441 L2ARC_WRITE_SIZE); 5442 size = l2arc_write_max = L2ARC_WRITE_SIZE; 5443 } 5444 5445 if (arc_warm == B_FALSE) 5446 size += l2arc_write_boost; 5447 5448 return (size); 5449 5450 } 5451 5452 static clock_t 5453 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 5454 { 5455 clock_t interval, next, now; 5456 5457 /* 5458 * If the ARC lists are busy, increase our write rate; if the 5459 * lists are stale, idle back. This is achieved by checking 5460 * how much we previously wrote - if it was more than half of 5461 * what we wanted, schedule the next write much sooner. 5462 */ 5463 if (l2arc_feed_again && wrote > (wanted / 2)) 5464 interval = (hz * l2arc_feed_min_ms) / 1000; 5465 else 5466 interval = hz * l2arc_feed_secs; 5467 5468 now = ddi_get_lbolt(); 5469 next = MAX(now, MIN(now + interval, began + interval)); 5470 5471 return (next); 5472 } 5473 5474 /* 5475 * Cycle through L2ARC devices. This is how L2ARC load balances. 5476 * If a device is returned, this also returns holding the spa config lock. 5477 */ 5478 static l2arc_dev_t * 5479 l2arc_dev_get_next(void) 5480 { 5481 l2arc_dev_t *first, *next = NULL; 5482 5483 /* 5484 * Lock out the removal of spas (spa_namespace_lock), then removal 5485 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 5486 * both locks will be dropped and a spa config lock held instead. 5487 */ 5488 mutex_enter(&spa_namespace_lock); 5489 mutex_enter(&l2arc_dev_mtx); 5490 5491 /* if there are no vdevs, there is nothing to do */ 5492 if (l2arc_ndev == 0) 5493 goto out; 5494 5495 first = NULL; 5496 next = l2arc_dev_last; 5497 do { 5498 /* loop around the list looking for a non-faulted vdev */ 5499 if (next == NULL) { 5500 next = list_head(l2arc_dev_list); 5501 } else { 5502 next = list_next(l2arc_dev_list, next); 5503 if (next == NULL) 5504 next = list_head(l2arc_dev_list); 5505 } 5506 5507 /* if we have come back to the start, bail out */ 5508 if (first == NULL) 5509 first = next; 5510 else if (next == first) 5511 break; 5512 5513 } while (vdev_is_dead(next->l2ad_vdev)); 5514 5515 /* if we were unable to find any usable vdevs, return NULL */ 5516 if (vdev_is_dead(next->l2ad_vdev)) 5517 next = NULL; 5518 5519 l2arc_dev_last = next; 5520 5521 out: 5522 mutex_exit(&l2arc_dev_mtx); 5523 5524 /* 5525 * Grab the config lock to prevent the 'next' device from being 5526 * removed while we are writing to it. 5527 */ 5528 if (next != NULL) 5529 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 5530 mutex_exit(&spa_namespace_lock); 5531 5532 return (next); 5533 } 5534 5535 /* 5536 * Free buffers that were tagged for destruction. 5537 */ 5538 static void 5539 l2arc_do_free_on_write() 5540 { 5541 list_t *buflist; 5542 l2arc_data_free_t *df, *df_prev; 5543 5544 mutex_enter(&l2arc_free_on_write_mtx); 5545 buflist = l2arc_free_on_write; 5546 5547 for (df = list_tail(buflist); df; df = df_prev) { 5548 df_prev = list_prev(buflist, df); 5549 ASSERT(df->l2df_data != NULL); 5550 ASSERT(df->l2df_func != NULL); 5551 df->l2df_func(df->l2df_data, df->l2df_size); 5552 list_remove(buflist, df); 5553 kmem_free(df, sizeof (l2arc_data_free_t)); 5554 } 5555 5556 mutex_exit(&l2arc_free_on_write_mtx); 5557 } 5558 5559 /* 5560 * A write to a cache device has completed. Update all headers to allow 5561 * reads from these buffers to begin. 
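 *
 * This is the done callback of the root zio built in l2arc_write_buffers().
 * It walks the device's buflist back from the dummy write-head marker,
 * clears ARC_FLAG_L2_WRITING on each header so reads may proceed, and, if
 * the write failed, drops the affected L2ARC entries along with their space
 * accounting.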
5562 */ 5563 static void 5564 l2arc_write_done(zio_t *zio) 5565 { 5566 l2arc_write_callback_t *cb; 5567 l2arc_dev_t *dev; 5568 list_t *buflist; 5569 arc_buf_hdr_t *head, *hdr, *hdr_prev; 5570 kmutex_t *hash_lock; 5571 int64_t bytes_dropped = 0; 5572 5573 cb = zio->io_private; 5574 ASSERT(cb != NULL); 5575 dev = cb->l2wcb_dev; 5576 ASSERT(dev != NULL); 5577 head = cb->l2wcb_head; 5578 ASSERT(head != NULL); 5579 buflist = &dev->l2ad_buflist; 5580 ASSERT(buflist != NULL); 5581 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 5582 l2arc_write_callback_t *, cb); 5583 5584 if (zio->io_error != 0) 5585 ARCSTAT_BUMP(arcstat_l2_writes_error); 5586 5587 /* 5588 * All writes completed, or an error was hit. 5589 */ 5590 top: 5591 mutex_enter(&dev->l2ad_mtx); 5592 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 5593 hdr_prev = list_prev(buflist, hdr); 5594 5595 hash_lock = HDR_LOCK(hdr); 5596 5597 /* 5598 * We cannot use mutex_enter or else we can deadlock 5599 * with l2arc_write_buffers (due to swapping the order 5600 * the hash lock and l2ad_mtx are taken). 5601 */ 5602 if (!mutex_tryenter(hash_lock)) { 5603 /* 5604 * Missed the hash lock. We must retry so we 5605 * don't leave the ARC_FLAG_L2_WRITING bit set. 5606 */ 5607 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 5608 5609 /* 5610 * We don't want to rescan the headers we've 5611 * already marked as having been written out, so 5612 * we reinsert the head node so we can pick up 5613 * where we left off. 5614 */ 5615 list_remove(buflist, head); 5616 list_insert_after(buflist, hdr, head); 5617 5618 mutex_exit(&dev->l2ad_mtx); 5619 5620 /* 5621 * We wait for the hash lock to become available 5622 * to try and prevent busy waiting, and increase 5623 * the chance we'll be able to acquire the lock 5624 * the next time around. 5625 */ 5626 mutex_enter(hash_lock); 5627 mutex_exit(hash_lock); 5628 goto top; 5629 } 5630 5631 /* 5632 * We could not have been moved into the arc_l2c_only 5633 * state while in-flight due to our ARC_FLAG_L2_WRITING 5634 * bit being set. Let's just ensure that's being enforced. 5635 */ 5636 ASSERT(HDR_HAS_L1HDR(hdr)); 5637 5638 /* 5639 * We may have allocated a buffer for L2ARC compression, 5640 * we must release it to avoid leaking this data. 5641 */ 5642 l2arc_release_cdata_buf(hdr); 5643 5644 if (zio->io_error != 0) { 5645 /* 5646 * Error - drop L2ARC entry. 5647 */ 5648 list_remove(buflist, hdr); 5649 hdr->b_flags &= ~ARC_FLAG_HAS_L2HDR; 5650 5651 ARCSTAT_INCR(arcstat_l2_asize, -hdr->b_l2hdr.b_asize); 5652 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 5653 5654 bytes_dropped += hdr->b_l2hdr.b_asize; 5655 (void) refcount_remove_many(&dev->l2ad_alloc, 5656 hdr->b_l2hdr.b_asize, hdr); 5657 } 5658 5659 /* 5660 * Allow ARC to begin reads and ghost list evictions to 5661 * this L2ARC entry. 5662 */ 5663 hdr->b_flags &= ~ARC_FLAG_L2_WRITING; 5664 5665 mutex_exit(hash_lock); 5666 } 5667 5668 atomic_inc_64(&l2arc_writes_done); 5669 list_remove(buflist, head); 5670 ASSERT(!HDR_HAS_L1HDR(head)); 5671 kmem_cache_free(hdr_l2only_cache, head); 5672 mutex_exit(&dev->l2ad_mtx); 5673 5674 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 5675 5676 l2arc_do_free_on_write(); 5677 5678 kmem_free(cb, sizeof (l2arc_write_callback_t)); 5679 } 5680 5681 /* 5682 * A read to a cache device completed. Validate buffer contents before 5683 * handing over to the regular ARC routines. 
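 *
 * If the device read failed or the checksum no longer matches, the block is
 * re-read from the main pool: asynchronously here when there is no waiter,
 * or by the waiting caller otherwise. A failing cache device therefore
 * degrades into a cache miss rather than an application-visible error.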
5684 */ 5685 static void 5686 l2arc_read_done(zio_t *zio) 5687 { 5688 l2arc_read_callback_t *cb; 5689 arc_buf_hdr_t *hdr; 5690 arc_buf_t *buf; 5691 kmutex_t *hash_lock; 5692 int equal; 5693 5694 ASSERT(zio->io_vd != NULL); 5695 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 5696 5697 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 5698 5699 cb = zio->io_private; 5700 ASSERT(cb != NULL); 5701 buf = cb->l2rcb_buf; 5702 ASSERT(buf != NULL); 5703 5704 hash_lock = HDR_LOCK(buf->b_hdr); 5705 mutex_enter(hash_lock); 5706 hdr = buf->b_hdr; 5707 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 5708 5709 /* 5710 * If the buffer was compressed, decompress it first. 5711 */ 5712 if (cb->l2rcb_compress != ZIO_COMPRESS_OFF) 5713 l2arc_decompress_zio(zio, hdr, cb->l2rcb_compress); 5714 ASSERT(zio->io_data != NULL); 5715 ASSERT3U(zio->io_size, ==, hdr->b_size); 5716 ASSERT3U(BP_GET_LSIZE(&cb->l2rcb_bp), ==, hdr->b_size); 5717 5718 /* 5719 * Check this survived the L2ARC journey. 5720 */ 5721 equal = arc_cksum_equal(buf); 5722 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 5723 mutex_exit(hash_lock); 5724 zio->io_private = buf; 5725 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 5726 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 5727 arc_read_done(zio); 5728 } else { 5729 mutex_exit(hash_lock); 5730 /* 5731 * Buffer didn't survive caching. Increment stats and 5732 * reissue to the original storage device. 5733 */ 5734 if (zio->io_error != 0) { 5735 ARCSTAT_BUMP(arcstat_l2_io_error); 5736 } else { 5737 zio->io_error = SET_ERROR(EIO); 5738 } 5739 if (!equal) 5740 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 5741 5742 /* 5743 * If there's no waiter, issue an async i/o to the primary 5744 * storage now. If there *is* a waiter, the caller must 5745 * issue the i/o in a context where it's OK to block. 5746 */ 5747 if (zio->io_waiter == NULL) { 5748 zio_t *pio = zio_unique_parent(zio); 5749 5750 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 5751 5752 zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, 5753 buf->b_data, hdr->b_size, arc_read_done, buf, 5754 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); 5755 } 5756 } 5757 5758 kmem_free(cb, sizeof (l2arc_read_callback_t)); 5759 } 5760 5761 /* 5762 * This is the list priority from which the L2ARC will search for pages to 5763 * cache. This is used within loops (0..3) to cycle through lists in the 5764 * desired order. This order can have a significant effect on cache 5765 * performance. 5766 * 5767 * Currently the metadata lists are hit first, MFU then MRU, followed by 5768 * the data lists. This function returns a locked list, and also returns 5769 * the lock pointer. 5770 */ 5771 static multilist_sublist_t * 5772 l2arc_sublist_lock(int list_num) 5773 { 5774 multilist_t *ml = NULL; 5775 unsigned int idx; 5776 5777 ASSERT(list_num >= 0 && list_num <= 3); 5778 5779 switch (list_num) { 5780 case 0: 5781 ml = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 5782 break; 5783 case 1: 5784 ml = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 5785 break; 5786 case 2: 5787 ml = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 5788 break; 5789 case 3: 5790 ml = &arc_mru->arcs_list[ARC_BUFC_DATA]; 5791 break; 5792 } 5793 5794 /* 5795 * Return a randomly-selected sublist. This is acceptable 5796 * because the caller feeds only a little bit of data for each 5797 * call (8MB). Subsequent calls will result in different 5798 * sublists being selected. 
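 *
 * The caller is expected to drop the returned sublist lock with
 * multilist_sublist_unlock() once it has finished walking the list. A
 * sketch of the usage pattern, as in l2arc_write_buffers() below:
 *
 *	for (int try = 0; try <= 3; try++) {
 *		multilist_sublist_t *mls = l2arc_sublist_lock(try);
 *		... walk mls via multilist_sublist_head()/_next() (cold ARC)
 *		    or multilist_sublist_tail()/_prev() (warm ARC) ...
 *		multilist_sublist_unlock(mls);
 *	}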
5799 */ 5800 idx = multilist_get_random_index(ml); 5801 return (multilist_sublist_lock(ml, idx)); 5802 } 5803 5804 /* 5805 * Evict buffers from the device write hand to the distance specified in 5806 * bytes. This distance may span populated buffers, it may span nothing. 5807 * This is clearing a region on the L2ARC device ready for writing. 5808 * If the 'all' boolean is set, every buffer is evicted. 5809 */ 5810 static void 5811 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 5812 { 5813 list_t *buflist; 5814 arc_buf_hdr_t *hdr, *hdr_prev; 5815 kmutex_t *hash_lock; 5816 uint64_t taddr; 5817 5818 buflist = &dev->l2ad_buflist; 5819 5820 if (!all && dev->l2ad_first) { 5821 /* 5822 * This is the first sweep through the device. There is 5823 * nothing to evict. 5824 */ 5825 return; 5826 } 5827 5828 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 5829 /* 5830 * When nearing the end of the device, evict to the end 5831 * before the device write hand jumps to the start. 5832 */ 5833 taddr = dev->l2ad_end; 5834 } else { 5835 taddr = dev->l2ad_hand + distance; 5836 } 5837 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 5838 uint64_t, taddr, boolean_t, all); 5839 5840 top: 5841 mutex_enter(&dev->l2ad_mtx); 5842 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 5843 hdr_prev = list_prev(buflist, hdr); 5844 5845 hash_lock = HDR_LOCK(hdr); 5846 5847 /* 5848 * We cannot use mutex_enter or else we can deadlock 5849 * with l2arc_write_buffers (due to swapping the order 5850 * the hash lock and l2ad_mtx are taken). 5851 */ 5852 if (!mutex_tryenter(hash_lock)) { 5853 /* 5854 * Missed the hash lock. Retry. 5855 */ 5856 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 5857 mutex_exit(&dev->l2ad_mtx); 5858 mutex_enter(hash_lock); 5859 mutex_exit(hash_lock); 5860 goto top; 5861 } 5862 5863 if (HDR_L2_WRITE_HEAD(hdr)) { 5864 /* 5865 * We hit a write head node. Leave it for 5866 * l2arc_write_done(). 5867 */ 5868 list_remove(buflist, hdr); 5869 mutex_exit(hash_lock); 5870 continue; 5871 } 5872 5873 if (!all && HDR_HAS_L2HDR(hdr) && 5874 (hdr->b_l2hdr.b_daddr > taddr || 5875 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 5876 /* 5877 * We've evicted to the target address, 5878 * or the end of the device. 5879 */ 5880 mutex_exit(hash_lock); 5881 break; 5882 } 5883 5884 ASSERT(HDR_HAS_L2HDR(hdr)); 5885 if (!HDR_HAS_L1HDR(hdr)) { 5886 ASSERT(!HDR_L2_READING(hdr)); 5887 /* 5888 * This doesn't exist in the ARC. Destroy. 5889 * arc_hdr_destroy() will call list_remove() 5890 * and decrement arcstat_l2_size. 5891 */ 5892 arc_change_state(arc_anon, hdr, hash_lock); 5893 arc_hdr_destroy(hdr); 5894 } else { 5895 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 5896 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 5897 /* 5898 * Invalidate issued or about to be issued 5899 * reads, since we may be about to write 5900 * over this location. 5901 */ 5902 if (HDR_L2_READING(hdr)) { 5903 ARCSTAT_BUMP(arcstat_l2_evict_reading); 5904 hdr->b_flags |= ARC_FLAG_L2_EVICTED; 5905 } 5906 5907 /* Ensure this header has finished being written */ 5908 ASSERT(!HDR_L2_WRITING(hdr)); 5909 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 5910 5911 arc_hdr_l2hdr_destroy(hdr); 5912 } 5913 mutex_exit(hash_lock); 5914 } 5915 mutex_exit(&dev->l2ad_mtx); 5916 } 5917 5918 /* 5919 * Find and write ARC buffers to the L2ARC device. 5920 * 5921 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 5922 * for reading until they have completed writing. 
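 * (The flag is cleared again in l2arc_write_done() once the device write has
 * completed.)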
5923 * The headroom_boost is an in-out parameter used to maintain headroom boost 5924 * state between calls to this function. 5925 * 5926 * Returns the number of bytes actually written (which may be smaller than 5927 * the delta by which the device hand has changed due to alignment). 5928 */ 5929 static uint64_t 5930 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz, 5931 boolean_t *headroom_boost) 5932 { 5933 arc_buf_hdr_t *hdr, *hdr_prev, *head; 5934 uint64_t write_asize, write_sz, headroom, 5935 buf_compress_minsz; 5936 void *buf_data; 5937 boolean_t full; 5938 l2arc_write_callback_t *cb; 5939 zio_t *pio, *wzio; 5940 uint64_t guid = spa_load_guid(spa); 5941 const boolean_t do_headroom_boost = *headroom_boost; 5942 5943 ASSERT(dev->l2ad_vdev != NULL); 5944 5945 /* Lower the flag now, we might want to raise it again later. */ 5946 *headroom_boost = B_FALSE; 5947 5948 pio = NULL; 5949 write_sz = write_asize = 0; 5950 full = B_FALSE; 5951 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 5952 head->b_flags |= ARC_FLAG_L2_WRITE_HEAD; 5953 head->b_flags |= ARC_FLAG_HAS_L2HDR; 5954 5955 /* 5956 * We will want to try to compress buffers that are at least 2x the 5957 * device sector size. 5958 */ 5959 buf_compress_minsz = 2 << dev->l2ad_vdev->vdev_ashift; 5960 5961 /* 5962 * Copy buffers for L2ARC writing. 5963 */ 5964 for (int try = 0; try <= 3; try++) { 5965 multilist_sublist_t *mls = l2arc_sublist_lock(try); 5966 uint64_t passed_sz = 0; 5967 5968 /* 5969 * L2ARC fast warmup. 5970 * 5971 * Until the ARC is warm and starts to evict, read from the 5972 * head of the ARC lists rather than the tail. 5973 */ 5974 if (arc_warm == B_FALSE) 5975 hdr = multilist_sublist_head(mls); 5976 else 5977 hdr = multilist_sublist_tail(mls); 5978 5979 headroom = target_sz * l2arc_headroom; 5980 if (do_headroom_boost) 5981 headroom = (headroom * l2arc_headroom_boost) / 100; 5982 5983 for (; hdr; hdr = hdr_prev) { 5984 kmutex_t *hash_lock; 5985 uint64_t buf_sz; 5986 uint64_t buf_a_sz; 5987 5988 if (arc_warm == B_FALSE) 5989 hdr_prev = multilist_sublist_next(mls, hdr); 5990 else 5991 hdr_prev = multilist_sublist_prev(mls, hdr); 5992 5993 hash_lock = HDR_LOCK(hdr); 5994 if (!mutex_tryenter(hash_lock)) { 5995 /* 5996 * Skip this buffer rather than waiting. 5997 */ 5998 continue; 5999 } 6000 6001 passed_sz += hdr->b_size; 6002 if (passed_sz > headroom) { 6003 /* 6004 * Searched too far. 6005 */ 6006 mutex_exit(hash_lock); 6007 break; 6008 } 6009 6010 if (!l2arc_write_eligible(guid, hdr)) { 6011 mutex_exit(hash_lock); 6012 continue; 6013 } 6014 6015 /* 6016 * Assume that the buffer is not going to be compressed 6017 * and could take more space on disk because of a larger 6018 * disk block size. 6019 */ 6020 buf_sz = hdr->b_size; 6021 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 6022 6023 if ((write_asize + buf_a_sz) > target_sz) { 6024 full = B_TRUE; 6025 mutex_exit(hash_lock); 6026 break; 6027 } 6028 6029 if (pio == NULL) { 6030 /* 6031 * Insert a dummy header on the buflist so 6032 * l2arc_write_done() can find where the 6033 * write buffers begin without searching. 6034 */ 6035 mutex_enter(&dev->l2ad_mtx); 6036 list_insert_head(&dev->l2ad_buflist, head); 6037 mutex_exit(&dev->l2ad_mtx); 6038 6039 cb = kmem_alloc( 6040 sizeof (l2arc_write_callback_t), KM_SLEEP); 6041 cb->l2wcb_dev = dev; 6042 cb->l2wcb_head = head; 6043 pio = zio_root(spa, l2arc_write_done, cb, 6044 ZIO_FLAG_CANFAIL); 6045 } 6046 6047 /* 6048 * Create and add a new L2ARC header. 
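 *
 * The hash lock is still held at this point, so dereferencing
 * hdr->b_l1hdr.b_buf below is safe; it is dropped again once the checksum
 * has been computed and the header is on the device's buflist.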
6049 */ 6050 hdr->b_l2hdr.b_dev = dev; 6051 hdr->b_flags |= ARC_FLAG_L2_WRITING; 6052 /* 6053 * Temporarily stash the data buffer in b_tmp_cdata. 6054 * The subsequent write step will pick it up from 6055 * there. This is because we can't access b_l1hdr.b_buf 6056 * without holding the hash_lock, which we in turn 6057 * can't access without holding the ARC list locks 6058 * (which we want to avoid during compression/writing). 6059 */ 6060 hdr->b_l2hdr.b_compress = ZIO_COMPRESS_OFF; 6061 hdr->b_l2hdr.b_asize = hdr->b_size; 6062 hdr->b_l1hdr.b_tmp_cdata = hdr->b_l1hdr.b_buf->b_data; 6063 6064 /* 6065 * Explicitly set the b_daddr field to a known 6066 * value which means "invalid address". This 6067 * enables us to differentiate which stage of 6068 * l2arc_write_buffers() the particular header 6069 * is in (e.g. this loop, or the one below). 6070 * ARC_FLAG_L2_WRITING is not enough to make 6071 * this distinction, and we need to know in 6072 * order to do proper l2arc vdev accounting in 6073 * arc_release() and arc_hdr_destroy(). 6074 * 6075 * Note, we can't use a new flag to distinguish 6076 * the two stages because we don't hold the 6077 * header's hash_lock below, in the second stage 6078 * of this function. Thus, we can't simply 6079 * change the b_flags field to denote that the 6080 * IO has been sent. We can change the b_daddr 6081 * field of the L2 portion, though, since we'll 6082 * be holding the l2ad_mtx; which is why we're 6083 * using it to denote the header's state change. 6084 */ 6085 hdr->b_l2hdr.b_daddr = L2ARC_ADDR_UNSET; 6086 6087 hdr->b_flags |= ARC_FLAG_HAS_L2HDR; 6088 6089 mutex_enter(&dev->l2ad_mtx); 6090 list_insert_head(&dev->l2ad_buflist, hdr); 6091 mutex_exit(&dev->l2ad_mtx); 6092 6093 /* 6094 * Compute and store the buffer cksum before 6095 * writing. On debug the cksum is verified first. 6096 */ 6097 arc_cksum_verify(hdr->b_l1hdr.b_buf); 6098 arc_cksum_compute(hdr->b_l1hdr.b_buf, B_TRUE); 6099 6100 mutex_exit(hash_lock); 6101 6102 write_sz += buf_sz; 6103 write_asize += buf_a_sz; 6104 } 6105 6106 multilist_sublist_unlock(mls); 6107 6108 if (full == B_TRUE) 6109 break; 6110 } 6111 6112 /* No buffers selected for writing? */ 6113 if (pio == NULL) { 6114 ASSERT0(write_sz); 6115 ASSERT(!HDR_HAS_L1HDR(head)); 6116 kmem_cache_free(hdr_l2only_cache, head); 6117 return (0); 6118 } 6119 6120 mutex_enter(&dev->l2ad_mtx); 6121 6122 /* 6123 * Note that elsewhere in this file arcstat_l2_asize 6124 * and the used space on l2ad_vdev are updated using b_asize, 6125 * which is not necessarily rounded up to the device block size. 6126 * To keep accounting consistent we do the same here as well: 6127 * stats_size accumulates the sum of b_asize of the written buffers, 6128 * while write_asize accumulates the sum of b_asize rounded up 6129 * to the device block size. 6130 * The latter sum is used only to validate the correctness of the code. 6131 */ 6132 uint64_t stats_size = 0; 6133 write_asize = 0; 6134 6135 /* 6136 * Now start writing the buffers. We start at the write head 6137 * and work backwards, retracing the course of the buffer selector 6138 * loop above. 6139 */ 6140 for (hdr = list_prev(&dev->l2ad_buflist, head); hdr; 6141 hdr = list_prev(&dev->l2ad_buflist, hdr)) { 6142 uint64_t buf_sz; 6143 6144 /* 6145 * We rely on the L1 portion of the header below, so 6146 * it's invalid for this header to have been evicted out 6147 * of the ghost cache prior to being written out. The 6148 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
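 *
 * Note that what actually gets written below is b_tmp_cdata, stashed (and
 * possibly compressed) in this loop; b_l1hdr.b_buf itself is not
 * dereferenced here.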
6149 */ 6150 ASSERT(HDR_HAS_L1HDR(hdr)); 6151 6152 /* 6153 * We shouldn't need to lock the buffer here, since we flagged 6154 * it as ARC_FLAG_L2_WRITING in the previous step, but we must 6155 * take care to only access its L2 cache parameters. In 6156 * particular, hdr->b_l1hdr.b_buf may be invalid by now due to 6157 * ARC eviction. 6158 */ 6159 hdr->b_l2hdr.b_daddr = dev->l2ad_hand; 6160 6161 if ((HDR_L2COMPRESS(hdr)) && 6162 hdr->b_l2hdr.b_asize >= buf_compress_minsz) { 6163 if (l2arc_compress_buf(hdr)) { 6164 /* 6165 * If compression succeeded, enable headroom 6166 * boost on the next scan cycle. 6167 */ 6168 *headroom_boost = B_TRUE; 6169 } 6170 } 6171 6172 /* 6173 * Pick up the buffer data we had previously stashed away 6174 * (and now potentially also compressed). 6175 */ 6176 buf_data = hdr->b_l1hdr.b_tmp_cdata; 6177 buf_sz = hdr->b_l2hdr.b_asize; 6178 6179 /* 6180 * We need to do this regardless of whether buf_sz is zero 6181 * or not; otherwise, when this l2hdr is evicted we'd 6182 * remove a reference that was never added. 6183 */ 6184 (void) refcount_add_many(&dev->l2ad_alloc, buf_sz, hdr); 6185 6186 /* Compression may have squashed the buffer to zero length. */ 6187 if (buf_sz != 0) { 6188 uint64_t buf_a_sz; 6189 6190 wzio = zio_write_phys(pio, dev->l2ad_vdev, 6191 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 6192 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 6193 ZIO_FLAG_CANFAIL, B_FALSE); 6194 6195 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 6196 zio_t *, wzio); 6197 (void) zio_nowait(wzio); 6198 6199 stats_size += buf_sz; 6200 6201 /* 6202 * Keep the clock hand suitably device-aligned. 6203 */ 6204 buf_a_sz = vdev_psize_to_asize(dev->l2ad_vdev, buf_sz); 6205 write_asize += buf_a_sz; 6206 dev->l2ad_hand += buf_a_sz; 6207 } 6208 } 6209 6210 mutex_exit(&dev->l2ad_mtx); 6211 6212 ASSERT3U(write_asize, <=, target_sz); 6213 ARCSTAT_BUMP(arcstat_l2_writes_sent); 6214 ARCSTAT_INCR(arcstat_l2_write_bytes, write_asize); 6215 ARCSTAT_INCR(arcstat_l2_size, write_sz); 6216 ARCSTAT_INCR(arcstat_l2_asize, stats_size); 6217 vdev_space_update(dev->l2ad_vdev, stats_size, 0, 0); 6218 6219 /* 6220 * Bump device hand to the device start if it is approaching the end. 6221 * l2arc_evict() will already have evicted ahead for this case. 6222 */ 6223 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 6224 dev->l2ad_hand = dev->l2ad_start; 6225 dev->l2ad_first = B_FALSE; 6226 } 6227 6228 dev->l2ad_writing = B_TRUE; 6229 (void) zio_wait(pio); 6230 dev->l2ad_writing = B_FALSE; 6231 6232 return (write_asize); 6233 } 6234 6235 /* 6236 * Compresses an L2ARC buffer. 6237 * The data to be compressed must be prefilled in b_l1hdr.b_tmp_cdata and its 6238 * size in l2hdr->b_asize. This routine tries to compress the data and 6239 * depending on the compression result there are three possible outcomes: 6240 * *) The buffer was incompressible. The original l2hdr contents were left 6241 * untouched and are ready for writing to an L2 device. 6242 * *) The buffer was all-zeros, so there is no need to write it to an L2 6243 * device. To indicate this situation b_tmp_cdata is NULL'ed, b_asize is 6244 * set to zero and b_compress is set to ZIO_COMPRESS_EMPTY. 6245 * *) Compression succeeded and b_tmp_cdata was replaced with a temporary 6246 * data buffer which holds the compressed data to be written, and b_asize 6247 * tells us how much data there is. b_compress is set to the appropriate 6248 * compression algorithm.
Once writing is done, invoke 6249 * l2arc_release_cdata_buf on this l2hdr to free this temporary buffer. 6250 * 6251 * Returns B_TRUE if compression succeeded, or B_FALSE if it didn't (the 6252 * buffer was incompressible). 6253 */ 6254 static boolean_t 6255 l2arc_compress_buf(arc_buf_hdr_t *hdr) 6256 { 6257 void *cdata; 6258 size_t csize, len, rounded; 6259 ASSERT(HDR_HAS_L2HDR(hdr)); 6260 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 6261 6262 ASSERT(HDR_HAS_L1HDR(hdr)); 6263 ASSERT3S(l2hdr->b_compress, ==, ZIO_COMPRESS_OFF); 6264 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6265 6266 len = l2hdr->b_asize; 6267 cdata = zio_data_buf_alloc(len); 6268 ASSERT3P(cdata, !=, NULL); 6269 csize = zio_compress_data(ZIO_COMPRESS_LZ4, hdr->b_l1hdr.b_tmp_cdata, 6270 cdata, l2hdr->b_asize); 6271 6272 rounded = P2ROUNDUP(csize, (size_t)SPA_MINBLOCKSIZE); 6273 if (rounded > csize) { 6274 bzero((char *)cdata + csize, rounded - csize); 6275 csize = rounded; 6276 } 6277 6278 if (csize == 0) { 6279 /* zero block, indicate that there's nothing to write */ 6280 zio_data_buf_free(cdata, len); 6281 l2hdr->b_compress = ZIO_COMPRESS_EMPTY; 6282 l2hdr->b_asize = 0; 6283 hdr->b_l1hdr.b_tmp_cdata = NULL; 6284 ARCSTAT_BUMP(arcstat_l2_compress_zeros); 6285 return (B_TRUE); 6286 } else if (csize > 0 && csize < len) { 6287 /* 6288 * Compression succeeded, we'll keep the cdata around for 6289 * writing and release it afterwards. 6290 */ 6291 l2hdr->b_compress = ZIO_COMPRESS_LZ4; 6292 l2hdr->b_asize = csize; 6293 hdr->b_l1hdr.b_tmp_cdata = cdata; 6294 ARCSTAT_BUMP(arcstat_l2_compress_successes); 6295 return (B_TRUE); 6296 } else { 6297 /* 6298 * Compression failed, release the compressed buffer. 6299 * l2hdr will be left unmodified. 6300 */ 6301 zio_data_buf_free(cdata, len); 6302 ARCSTAT_BUMP(arcstat_l2_compress_failures); 6303 return (B_FALSE); 6304 } 6305 } 6306 6307 /* 6308 * Decompresses a zio read back from an l2arc device. On success, the 6309 * underlying zio's io_data buffer is overwritten by the uncompressed 6310 * version. On decompression error (corrupt compressed stream), the 6311 * zio->io_error value is set to signal an I/O error. 6312 * 6313 * Please note that the compressed data stream is not checksummed, so 6314 * if the underlying device is experiencing data corruption, we may feed 6315 * corrupt data to the decompressor, so the decompressor needs to be 6316 * able to handle this situation (LZ4 does). 6317 */ 6318 static void 6319 l2arc_decompress_zio(zio_t *zio, arc_buf_hdr_t *hdr, enum zio_compress c) 6320 { 6321 ASSERT(L2ARC_IS_VALID_COMPRESS(c)); 6322 6323 if (zio->io_error != 0) { 6324 /* 6325 * An io error has occured, just restore the original io 6326 * size in preparation for a main pool read. 6327 */ 6328 zio->io_orig_size = zio->io_size = hdr->b_size; 6329 return; 6330 } 6331 6332 if (c == ZIO_COMPRESS_EMPTY) { 6333 /* 6334 * An empty buffer results in a null zio, which means we 6335 * need to fill its io_data after we're done restoring the 6336 * buffer's contents. 6337 */ 6338 ASSERT(hdr->b_l1hdr.b_buf != NULL); 6339 bzero(hdr->b_l1hdr.b_buf->b_data, hdr->b_size); 6340 zio->io_data = zio->io_orig_data = hdr->b_l1hdr.b_buf->b_data; 6341 } else { 6342 ASSERT(zio->io_data != NULL); 6343 /* 6344 * We copy the compressed data from the start of the arc buffer 6345 * (the zio_read will have pulled in only what we need, the 6346 * rest is garbage which we will overwrite at decompression) 6347 * and then decompress back to the ARC data buffer. 
This way we 6348 * can minimize copying by simply decompressing back over the 6349 * original compressed data (rather than decompressing to an 6350 * aux buffer and then copying back the uncompressed buffer, 6351 * which is likely to be much larger). 6352 */ 6353 uint64_t csize; 6354 void *cdata; 6355 6356 csize = zio->io_size; 6357 cdata = zio_data_buf_alloc(csize); 6358 bcopy(zio->io_data, cdata, csize); 6359 if (zio_decompress_data(c, cdata, zio->io_data, csize, 6360 hdr->b_size) != 0) 6361 zio->io_error = EIO; 6362 zio_data_buf_free(cdata, csize); 6363 } 6364 6365 /* Restore the expected uncompressed IO size. */ 6366 zio->io_orig_size = zio->io_size = hdr->b_size; 6367 } 6368 6369 /* 6370 * Releases the temporary b_tmp_cdata buffer in an l2arc header structure. 6371 * This buffer serves as a temporary holder of compressed data while 6372 * the buffer entry is being written to an l2arc device. Once that is 6373 * done, we can dispose of it. 6374 */ 6375 static void 6376 l2arc_release_cdata_buf(arc_buf_hdr_t *hdr) 6377 { 6378 ASSERT(HDR_HAS_L2HDR(hdr)); 6379 enum zio_compress comp = hdr->b_l2hdr.b_compress; 6380 6381 ASSERT(HDR_HAS_L1HDR(hdr)); 6382 ASSERT(comp == ZIO_COMPRESS_OFF || L2ARC_IS_VALID_COMPRESS(comp)); 6383 6384 if (comp == ZIO_COMPRESS_OFF) { 6385 /* 6386 * In this case, b_tmp_cdata points to the same buffer 6387 * as the arc_buf_t's b_data field. We don't want to 6388 * free it, since the arc_buf_t will handle that. 6389 */ 6390 hdr->b_l1hdr.b_tmp_cdata = NULL; 6391 } else if (comp == ZIO_COMPRESS_EMPTY) { 6392 /* 6393 * In this case, b_tmp_cdata was compressed to an empty 6394 * buffer, thus there's nothing to free and b_tmp_cdata 6395 * should have been set to NULL in l2arc_write_buffers(). 6396 */ 6397 ASSERT3P(hdr->b_l1hdr.b_tmp_cdata, ==, NULL); 6398 } else { 6399 /* 6400 * If the data was compressed, then we've allocated a 6401 * temporary buffer for it, so now we need to release it. 6402 */ 6403 ASSERT(hdr->b_l1hdr.b_tmp_cdata != NULL); 6404 zio_data_buf_free(hdr->b_l1hdr.b_tmp_cdata, 6405 hdr->b_size); 6406 hdr->b_l1hdr.b_tmp_cdata = NULL; 6407 } 6408 6409 } 6410 6411 /* 6412 * This thread feeds the L2ARC at regular intervals. This is the beating 6413 * heart of the L2ARC. 6414 */ 6415 static void 6416 l2arc_feed_thread(void) 6417 { 6418 callb_cpr_t cpr; 6419 l2arc_dev_t *dev; 6420 spa_t *spa; 6421 uint64_t size, wrote; 6422 clock_t begin, next = ddi_get_lbolt(); 6423 boolean_t headroom_boost = B_FALSE; 6424 6425 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 6426 6427 mutex_enter(&l2arc_feed_thr_lock); 6428 6429 while (l2arc_thread_exit == 0) { 6430 CALLB_CPR_SAFE_BEGIN(&cpr); 6431 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 6432 next); 6433 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 6434 next = ddi_get_lbolt() + hz; 6435 6436 /* 6437 * Quick check for L2ARC devices. 6438 */ 6439 mutex_enter(&l2arc_dev_mtx); 6440 if (l2arc_ndev == 0) { 6441 mutex_exit(&l2arc_dev_mtx); 6442 continue; 6443 } 6444 mutex_exit(&l2arc_dev_mtx); 6445 begin = ddi_get_lbolt(); 6446 6447 /* 6448 * This selects the next l2arc device to write to, and in 6449 * doing so the next spa to feed from: dev->l2ad_spa. This 6450 * will return NULL if there are now no l2arc devices or if 6451 * they are all faulted. 6452 * 6453 * If a device is returned, its spa's config lock is also 6454 * held to prevent device removal. l2arc_dev_get_next() 6455 * will grab and release l2arc_dev_mtx. 
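 *
 * The SCL_L2ARC config lock taken on our behalf is dropped again via
 * spa_config_exit() on every path out of this loop iteration: read-only
 * pool, memory pressure, or after the next write interval has been
 * computed.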
6456 */ 6457 if ((dev = l2arc_dev_get_next()) == NULL) 6458 continue; 6459 6460 spa = dev->l2ad_spa; 6461 ASSERT(spa != NULL); 6462 6463 /* 6464 * If the pool is read-only then force the feed thread to 6465 * sleep a little longer. 6466 */ 6467 if (!spa_writeable(spa)) { 6468 next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; 6469 spa_config_exit(spa, SCL_L2ARC, dev); 6470 continue; 6471 } 6472 6473 /* 6474 * Avoid contributing to memory pressure. 6475 */ 6476 if (arc_reclaim_needed()) { 6477 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 6478 spa_config_exit(spa, SCL_L2ARC, dev); 6479 continue; 6480 } 6481 6482 ARCSTAT_BUMP(arcstat_l2_feeds); 6483 6484 size = l2arc_write_size(); 6485 6486 /* 6487 * Evict L2ARC buffers that will be overwritten. 6488 */ 6489 l2arc_evict(dev, size, B_FALSE); 6490 6491 /* 6492 * Write ARC buffers. 6493 */ 6494 wrote = l2arc_write_buffers(spa, dev, size, &headroom_boost); 6495 6496 /* 6497 * Calculate interval between writes. 6498 */ 6499 next = l2arc_write_interval(begin, size, wrote); 6500 spa_config_exit(spa, SCL_L2ARC, dev); 6501 } 6502 6503 l2arc_thread_exit = 0; 6504 cv_broadcast(&l2arc_feed_thr_cv); 6505 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 6506 thread_exit(); 6507 } 6508 6509 boolean_t 6510 l2arc_vdev_present(vdev_t *vd) 6511 { 6512 l2arc_dev_t *dev; 6513 6514 mutex_enter(&l2arc_dev_mtx); 6515 for (dev = list_head(l2arc_dev_list); dev != NULL; 6516 dev = list_next(l2arc_dev_list, dev)) { 6517 if (dev->l2ad_vdev == vd) 6518 break; 6519 } 6520 mutex_exit(&l2arc_dev_mtx); 6521 6522 return (dev != NULL); 6523 } 6524 6525 /* 6526 * Add a vdev for use by the L2ARC. By this point the spa has already 6527 * validated the vdev and opened it. 6528 */ 6529 void 6530 l2arc_add_vdev(spa_t *spa, vdev_t *vd) 6531 { 6532 l2arc_dev_t *adddev; 6533 6534 ASSERT(!l2arc_vdev_present(vd)); 6535 6536 /* 6537 * Create a new l2arc device entry. 6538 */ 6539 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 6540 adddev->l2ad_spa = spa; 6541 adddev->l2ad_vdev = vd; 6542 adddev->l2ad_start = VDEV_LABEL_START_SIZE; 6543 adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); 6544 adddev->l2ad_hand = adddev->l2ad_start; 6545 adddev->l2ad_first = B_TRUE; 6546 adddev->l2ad_writing = B_FALSE; 6547 6548 mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); 6549 /* 6550 * This is a list of all ARC buffers that are still valid on the 6551 * device. 6552 */ 6553 list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 6554 offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); 6555 6556 vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); 6557 refcount_create(&adddev->l2ad_alloc); 6558 6559 /* 6560 * Add device to global list 6561 */ 6562 mutex_enter(&l2arc_dev_mtx); 6563 list_insert_head(l2arc_dev_list, adddev); 6564 atomic_inc_64(&l2arc_ndev); 6565 mutex_exit(&l2arc_dev_mtx); 6566 } 6567 6568 /* 6569 * Remove a vdev from the L2ARC. 
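 *
 * Any buffers still resident on the device are evicted first (l2arc_evict()
 * with 'all' set), then the per-device buflist, mutex and refcount are torn
 * down before the l2arc_dev_t itself is freed.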
6570 */ 6571 void 6572 l2arc_remove_vdev(vdev_t *vd) 6573 { 6574 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 6575 6576 /* 6577 * Find the device by vdev 6578 */ 6579 mutex_enter(&l2arc_dev_mtx); 6580 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 6581 nextdev = list_next(l2arc_dev_list, dev); 6582 if (vd == dev->l2ad_vdev) { 6583 remdev = dev; 6584 break; 6585 } 6586 } 6587 ASSERT(remdev != NULL); 6588 6589 /* 6590 * Remove device from global list 6591 */ 6592 list_remove(l2arc_dev_list, remdev); 6593 l2arc_dev_last = NULL; /* may have been invalidated */ 6594 atomic_dec_64(&l2arc_ndev); 6595 mutex_exit(&l2arc_dev_mtx); 6596 6597 /* 6598 * Clear all buflists and ARC references. L2ARC device flush. 6599 */ 6600 l2arc_evict(remdev, 0, B_TRUE); 6601 list_destroy(&remdev->l2ad_buflist); 6602 mutex_destroy(&remdev->l2ad_mtx); 6603 refcount_destroy(&remdev->l2ad_alloc); 6604 kmem_free(remdev, sizeof (l2arc_dev_t)); 6605 } 6606 6607 void 6608 l2arc_init(void) 6609 { 6610 l2arc_thread_exit = 0; 6611 l2arc_ndev = 0; 6612 l2arc_writes_sent = 0; 6613 l2arc_writes_done = 0; 6614 6615 mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); 6616 cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); 6617 mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); 6618 mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); 6619 6620 l2arc_dev_list = &L2ARC_dev_list; 6621 l2arc_free_on_write = &L2ARC_free_on_write; 6622 list_create(l2arc_dev_list, sizeof (l2arc_dev_t), 6623 offsetof(l2arc_dev_t, l2ad_node)); 6624 list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), 6625 offsetof(l2arc_data_free_t, l2df_list_node)); 6626 } 6627 6628 void 6629 l2arc_fini(void) 6630 { 6631 /* 6632 * This is called from dmu_fini(), which is called from spa_fini(); 6633 * Because of this, we can assume that all l2arc devices have 6634 * already been removed when the pools themselves were removed. 6635 */ 6636 6637 l2arc_do_free_on_write(); 6638 6639 mutex_destroy(&l2arc_feed_thr_lock); 6640 cv_destroy(&l2arc_feed_thr_cv); 6641 mutex_destroy(&l2arc_dev_mtx); 6642 mutex_destroy(&l2arc_free_on_write_mtx); 6643 6644 list_destroy(l2arc_dev_list); 6645 list_destroy(l2arc_free_on_write); 6646 } 6647 6648 void 6649 l2arc_start(void) 6650 { 6651 if (!(spa_mode_global & FWRITE)) 6652 return; 6653 6654 (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, 6655 TS_RUN, minclsyspri); 6656 } 6657 6658 void 6659 l2arc_stop(void) 6660 { 6661 if (!(spa_mode_global & FWRITE)) 6662 return; 6663 6664 mutex_enter(&l2arc_feed_thr_lock); 6665 cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ 6666 l2arc_thread_exit = 1; 6667 while (l2arc_thread_exit != 0) 6668 cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); 6669 mutex_exit(&l2arc_feed_thr_lock); 6670 } 6671
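/*
 * Illustration only: a minimal userland sketch of the feed pacing performed
 * by l2arc_write_interval() above. HZ, FEED_SECS, FEED_MIN_MS and FEED_AGAIN
 * are hypothetical stand-ins for the kernel's hz, l2arc_feed_secs,
 * l2arc_feed_min_ms and l2arc_feed_again; the values chosen here are for
 * demonstration and are not necessarily the shipped defaults. The point is
 * simply that writing more than half of the requested bytes schedules the
 * next feed cycle much sooner than the idle interval.
 *
 *	#include <stdio.h>
 *	#include <inttypes.h>
 *
 *	#define	HZ		100	// ticks per second (hypothetical)
 *	#define	FEED_SECS	1	// idle interval, seconds
 *	#define	FEED_MIN_MS	200	// busy interval, milliseconds
 *	#define	FEED_AGAIN	1	// allow the faster interval
 *
 *	#define	MIN(a, b)	((a) < (b) ? (a) : (b))
 *	#define	MAX(a, b)	((a) > (b) ? (a) : (b))
 *
 *	// Mirrors l2arc_write_interval(): returns the tick at which the
 *	// next feed cycle should run.
 *	static long
 *	next_feed_tick(long began, long now, uint64_t wanted, uint64_t wrote)
 *	{
 *		long interval;
 *
 *		if (FEED_AGAIN && wrote > (wanted / 2))
 *			interval = (HZ * FEED_MIN_MS) / 1000;
 *		else
 *			interval = HZ * FEED_SECS;
 *
 *		return (MAX(now, MIN(now + interval, began + interval)));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		// Busy pass: wrote 12 of 16 MB, next feed due 20 ticks
 *		// (200 ms) after the pass began.
 *		printf("busy: %ld\n",
 *		    next_feed_tick(1000, 1005, 16 << 20, 12 << 20));
 *		// Stale pass: wrote 1 of 16 MB, next feed due a full
 *		// 100 ticks (1 s) after the pass began.
 *		printf("stale: %ld\n",
 *		    next_feed_tick(1000, 1005, 16 << 20, 1 << 20));
 *		return (0);
 *	}
 */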