/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes; rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#include <sys/vdev.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

extern int zfs_write_limit_shift;
extern uint64_t zfs_write_limit_max;
extern uint64_t zfs_write_limit_inflated;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * The arc has filled available memory and has now warmed up.
 */
static boolean_t arc_warm;

/*
 * These tunables are for performance analysis.
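 *
 * As an illustrative example (an assumption, not part of the original
 * source), a system could cap the ARC at 1 GB by setting the tunable
 * below in /etc/system before boot:
 *
 *	set zfs:zfs_arc_max = 0x40000000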
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
int zfs_mdcomp_disable = 0;

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are the
 * only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists.  The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places.  The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
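 *
 * As a rough, illustrative sketch (not part of the original comment),
 * a typical buffer might move through the states like this:
 *
 *	arc_buf_alloc()          ARC_anon       (dirty, ref'd, no DVA yet)
 *	write completes          ARC_mru        (DVA assigned, hashed)
 *	second read hit          ARC_mfu
 *	evicted (data freed)     ARC_mru_ghost / ARC_mfu_ghost (header only)
 *	present only on L2ARC    ARC_l2c_only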
208 */ 209 210 typedef struct arc_state { 211 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 212 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 213 uint64_t arcs_size; /* total amount of data in this state */ 214 kmutex_t arcs_mtx; 215 } arc_state_t; 216 217 /* The 6 states: */ 218 static arc_state_t ARC_anon; 219 static arc_state_t ARC_mru; 220 static arc_state_t ARC_mru_ghost; 221 static arc_state_t ARC_mfu; 222 static arc_state_t ARC_mfu_ghost; 223 static arc_state_t ARC_l2c_only; 224 225 typedef struct arc_stats { 226 kstat_named_t arcstat_hits; 227 kstat_named_t arcstat_misses; 228 kstat_named_t arcstat_demand_data_hits; 229 kstat_named_t arcstat_demand_data_misses; 230 kstat_named_t arcstat_demand_metadata_hits; 231 kstat_named_t arcstat_demand_metadata_misses; 232 kstat_named_t arcstat_prefetch_data_hits; 233 kstat_named_t arcstat_prefetch_data_misses; 234 kstat_named_t arcstat_prefetch_metadata_hits; 235 kstat_named_t arcstat_prefetch_metadata_misses; 236 kstat_named_t arcstat_mru_hits; 237 kstat_named_t arcstat_mru_ghost_hits; 238 kstat_named_t arcstat_mfu_hits; 239 kstat_named_t arcstat_mfu_ghost_hits; 240 kstat_named_t arcstat_deleted; 241 kstat_named_t arcstat_recycle_miss; 242 kstat_named_t arcstat_mutex_miss; 243 kstat_named_t arcstat_evict_skip; 244 kstat_named_t arcstat_hash_elements; 245 kstat_named_t arcstat_hash_elements_max; 246 kstat_named_t arcstat_hash_collisions; 247 kstat_named_t arcstat_hash_chains; 248 kstat_named_t arcstat_hash_chain_max; 249 kstat_named_t arcstat_p; 250 kstat_named_t arcstat_c; 251 kstat_named_t arcstat_c_min; 252 kstat_named_t arcstat_c_max; 253 kstat_named_t arcstat_size; 254 kstat_named_t arcstat_hdr_size; 255 kstat_named_t arcstat_l2_hits; 256 kstat_named_t arcstat_l2_misses; 257 kstat_named_t arcstat_l2_feeds; 258 kstat_named_t arcstat_l2_rw_clash; 259 kstat_named_t arcstat_l2_writes_sent; 260 kstat_named_t arcstat_l2_writes_done; 261 kstat_named_t arcstat_l2_writes_error; 262 kstat_named_t arcstat_l2_writes_hdr_miss; 263 kstat_named_t arcstat_l2_evict_lock_retry; 264 kstat_named_t arcstat_l2_evict_reading; 265 kstat_named_t arcstat_l2_free_on_write; 266 kstat_named_t arcstat_l2_abort_lowmem; 267 kstat_named_t arcstat_l2_cksum_bad; 268 kstat_named_t arcstat_l2_io_error; 269 kstat_named_t arcstat_l2_size; 270 kstat_named_t arcstat_l2_hdr_size; 271 kstat_named_t arcstat_memory_throttle_count; 272 } arc_stats_t; 273 274 static arc_stats_t arc_stats = { 275 { "hits", KSTAT_DATA_UINT64 }, 276 { "misses", KSTAT_DATA_UINT64 }, 277 { "demand_data_hits", KSTAT_DATA_UINT64 }, 278 { "demand_data_misses", KSTAT_DATA_UINT64 }, 279 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 280 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 281 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 282 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 283 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 284 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 285 { "mru_hits", KSTAT_DATA_UINT64 }, 286 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 287 { "mfu_hits", KSTAT_DATA_UINT64 }, 288 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 289 { "deleted", KSTAT_DATA_UINT64 }, 290 { "recycle_miss", KSTAT_DATA_UINT64 }, 291 { "mutex_miss", KSTAT_DATA_UINT64 }, 292 { "evict_skip", KSTAT_DATA_UINT64 }, 293 { "hash_elements", KSTAT_DATA_UINT64 }, 294 { "hash_elements_max", KSTAT_DATA_UINT64 }, 295 { "hash_collisions", KSTAT_DATA_UINT64 }, 296 { "hash_chains", KSTAT_DATA_UINT64 }, 297 { "hash_chain_max", KSTAT_DATA_UINT64 }, 298 { "p", KSTAT_DATA_UINT64 
}, 299 { "c", KSTAT_DATA_UINT64 }, 300 { "c_min", KSTAT_DATA_UINT64 }, 301 { "c_max", KSTAT_DATA_UINT64 }, 302 { "size", KSTAT_DATA_UINT64 }, 303 { "hdr_size", KSTAT_DATA_UINT64 }, 304 { "l2_hits", KSTAT_DATA_UINT64 }, 305 { "l2_misses", KSTAT_DATA_UINT64 }, 306 { "l2_feeds", KSTAT_DATA_UINT64 }, 307 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 308 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 309 { "l2_writes_done", KSTAT_DATA_UINT64 }, 310 { "l2_writes_error", KSTAT_DATA_UINT64 }, 311 { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, 312 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 313 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 314 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 315 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 316 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 317 { "l2_io_error", KSTAT_DATA_UINT64 }, 318 { "l2_size", KSTAT_DATA_UINT64 }, 319 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 320 { "memory_throttle_count", KSTAT_DATA_UINT64 } 321 }; 322 323 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 324 325 #define ARCSTAT_INCR(stat, val) \ 326 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 327 328 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 329 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 330 331 #define ARCSTAT_MAX(stat, val) { \ 332 uint64_t m; \ 333 while ((val) > (m = arc_stats.stat.value.ui64) && \ 334 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 335 continue; \ 336 } 337 338 #define ARCSTAT_MAXSTAT(stat) \ 339 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 340 341 /* 342 * We define a macro to allow ARC hits/misses to be easily broken down by 343 * two separate conditions, giving a total of four different subtypes for 344 * each of hits and misses (so eight statistics total). 345 */ 346 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 347 if (cond1) { \ 348 if (cond2) { \ 349 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 350 } else { \ 351 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 352 } \ 353 } else { \ 354 if (cond2) { \ 355 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 356 } else { \ 357 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 358 } \ 359 } 360 361 kstat_t *arc_ksp; 362 static arc_state_t *arc_anon; 363 static arc_state_t *arc_mru; 364 static arc_state_t *arc_mru_ghost; 365 static arc_state_t *arc_mfu; 366 static arc_state_t *arc_mfu_ghost; 367 static arc_state_t *arc_l2c_only; 368 369 /* 370 * There are several ARC variables that are critical to export as kstats -- 371 * but we don't want to have to grovel around in the kstat whenever we wish to 372 * manipulate them. For these variables, we therefore define them to be in 373 * terms of the statistic variable. This assures that we are not introducing 374 * the possibility of inconsistency by having shadow copies of the variables, 375 * while still allowing the code to be readable. 
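 *
 * For example, arc_size below is literally ARCSTAT(arcstat_size), so a
 * statement such as
 *
 *	atomic_add_64(&arc_size, space);
 *
 * updates arc_stats.arcstat_size.value.ui64 in place and is immediately
 * visible to consumers of the ARC kstats.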
376 */ 377 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 378 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 379 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 380 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 381 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 382 383 static int arc_no_grow; /* Don't try to grow cache size */ 384 static uint64_t arc_tempreserve; 385 static uint64_t arc_meta_used; 386 static uint64_t arc_meta_limit; 387 static uint64_t arc_meta_max = 0; 388 389 typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; 390 391 typedef struct arc_callback arc_callback_t; 392 393 struct arc_callback { 394 void *acb_private; 395 arc_done_func_t *acb_done; 396 arc_buf_t *acb_buf; 397 zio_t *acb_zio_dummy; 398 arc_callback_t *acb_next; 399 }; 400 401 typedef struct arc_write_callback arc_write_callback_t; 402 403 struct arc_write_callback { 404 void *awcb_private; 405 arc_done_func_t *awcb_ready; 406 arc_done_func_t *awcb_done; 407 arc_buf_t *awcb_buf; 408 }; 409 410 struct arc_buf_hdr { 411 /* protected by hash lock */ 412 dva_t b_dva; 413 uint64_t b_birth; 414 uint64_t b_cksum0; 415 416 kmutex_t b_freeze_lock; 417 zio_cksum_t *b_freeze_cksum; 418 419 arc_buf_hdr_t *b_hash_next; 420 arc_buf_t *b_buf; 421 uint32_t b_flags; 422 uint32_t b_datacnt; 423 424 arc_callback_t *b_acb; 425 kcondvar_t b_cv; 426 427 /* immutable */ 428 arc_buf_contents_t b_type; 429 uint64_t b_size; 430 spa_t *b_spa; 431 432 /* protected by arc state mutex */ 433 arc_state_t *b_state; 434 list_node_t b_arc_node; 435 436 /* updated atomically */ 437 clock_t b_arc_access; 438 439 /* self protecting */ 440 refcount_t b_refcnt; 441 442 l2arc_buf_hdr_t *b_l2hdr; 443 list_node_t b_l2node; 444 /* 445 * scrub code can lockout access to the buf while it changes 446 * bp's contained within it. 447 */ 448 krwlock_t b_datalock; 449 }; 450 451 static arc_buf_t *arc_eviction_list; 452 static kmutex_t arc_eviction_mtx; 453 static arc_buf_hdr_t arc_eviction_hdr; 454 static void arc_get_data_buf(arc_buf_t *buf); 455 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 456 static int arc_evict_needed(arc_buf_contents_t type); 457 static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); 458 459 #define GHOST_STATE(state) \ 460 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 461 (state) == arc_l2c_only) 462 463 /* 464 * Private ARC flags. These flags are private ARC only flags that will show up 465 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 466 * be passed in as arc_flags in things like arc_read. However, these flags 467 * should never be passed and should only be set by ARC code. When adding new 468 * public flags, make sure not to smash the private ones. 
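 *
 * As an illustrative sketch (not part of the original comment), a hashed
 * header with a read in flight has both ARC_IN_HASH_TABLE and
 * ARC_IO_IN_PROGRESS set, and is normally tested via the wrapper macros
 * below, e.g.:
 *
 *	while (HDR_IO_IN_PROGRESS(hdr))
 *		cv_wait(&hdr->b_cv, hash_lock);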
469 */ 470 471 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 472 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 473 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 474 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 475 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 476 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 477 #define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ 478 #define ARC_DONT_L2CACHE (1 << 16) /* originated by prefetch */ 479 #define ARC_L2_WRITING (1 << 17) /* L2ARC write in progress */ 480 #define ARC_L2_EVICTED (1 << 18) /* evicted during I/O */ 481 #define ARC_L2_WRITE_HEAD (1 << 19) /* head of write list */ 482 #define ARC_STORED (1 << 20) /* has been store()d to */ 483 484 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 485 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 486 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 487 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 488 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 489 #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) 490 #define HDR_DONT_L2CACHE(hdr) ((hdr)->b_flags & ARC_DONT_L2CACHE) 491 #define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS && \ 492 (hdr)->b_l2hdr != NULL) 493 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) 494 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) 495 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) 496 497 /* 498 * Other sizes 499 */ 500 501 #define HDR_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) 502 #define L2HDR_SIZE ((int64_t)sizeof (l2arc_buf_hdr_t)) 503 504 /* 505 * Hash table routines 506 */ 507 508 #define HT_LOCK_PAD 64 509 510 struct ht_lock { 511 kmutex_t ht_lock; 512 #ifdef _KERNEL 513 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 514 #endif 515 }; 516 517 #define BUF_LOCKS 256 518 typedef struct buf_hash_table { 519 uint64_t ht_mask; 520 arc_buf_hdr_t **ht_table; 521 struct ht_lock ht_locks[BUF_LOCKS]; 522 } buf_hash_table_t; 523 524 static buf_hash_table_t buf_hash_table; 525 526 #define BUF_HASH_INDEX(spa, dva, birth) \ 527 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 528 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 529 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 530 #define HDR_LOCK(buf) \ 531 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 532 533 uint64_t zfs_crc64_table[256]; 534 535 /* 536 * Level 2 ARC 537 */ 538 539 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 540 #define L2ARC_HEADROOM 4 /* num of writes */ 541 #define L2ARC_FEED_SECS 1 /* caching interval */ 542 543 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 544 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 545 546 /* 547 * L2ARC Performance Tunables 548 */ 549 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 550 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 551 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 552 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 553 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 554 555 /* 556 * L2ARC Internals 557 */ 558 typedef struct l2arc_dev { 559 vdev_t *l2ad_vdev; /* vdev */ 560 spa_t *l2ad_spa; /* spa */ 561 uint64_t l2ad_hand; /* 
next write location */ 562 uint64_t l2ad_write; /* desired write size, bytes */ 563 uint64_t l2ad_boost; /* warmup write boost, bytes */ 564 uint64_t l2ad_start; /* first addr on device */ 565 uint64_t l2ad_end; /* last addr on device */ 566 uint64_t l2ad_evict; /* last addr eviction reached */ 567 boolean_t l2ad_first; /* first sweep through */ 568 list_t *l2ad_buflist; /* buffer list */ 569 list_node_t l2ad_node; /* device list node */ 570 } l2arc_dev_t; 571 572 static list_t L2ARC_dev_list; /* device list */ 573 static list_t *l2arc_dev_list; /* device list pointer */ 574 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 575 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 576 static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ 577 static list_t L2ARC_free_on_write; /* free after write buf list */ 578 static list_t *l2arc_free_on_write; /* free after write list ptr */ 579 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 580 static uint64_t l2arc_ndev; /* number of devices */ 581 582 typedef struct l2arc_read_callback { 583 arc_buf_t *l2rcb_buf; /* read buffer */ 584 spa_t *l2rcb_spa; /* spa */ 585 blkptr_t l2rcb_bp; /* original blkptr */ 586 zbookmark_t l2rcb_zb; /* original bookmark */ 587 int l2rcb_flags; /* original flags */ 588 } l2arc_read_callback_t; 589 590 typedef struct l2arc_write_callback { 591 l2arc_dev_t *l2wcb_dev; /* device info */ 592 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 593 } l2arc_write_callback_t; 594 595 struct l2arc_buf_hdr { 596 /* protected by arc_buf_hdr mutex */ 597 l2arc_dev_t *b_dev; /* L2ARC device */ 598 daddr_t b_daddr; /* disk address, offset byte */ 599 }; 600 601 typedef struct l2arc_data_free { 602 /* protected by l2arc_free_on_write_mtx */ 603 void *l2df_data; 604 size_t l2df_size; 605 void (*l2df_func)(void *, size_t); 606 list_node_t l2df_list_node; 607 } l2arc_data_free_t; 608 609 static kmutex_t l2arc_feed_thr_lock; 610 static kcondvar_t l2arc_feed_thr_cv; 611 static uint8_t l2arc_thread_exit; 612 613 static void l2arc_read_done(zio_t *zio); 614 static void l2arc_hdr_stat_add(void); 615 static void l2arc_hdr_stat_remove(void); 616 617 static uint64_t 618 buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) 619 { 620 uintptr_t spav = (uintptr_t)spa; 621 uint8_t *vdva = (uint8_t *)dva; 622 uint64_t crc = -1ULL; 623 int i; 624 625 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 626 627 for (i = 0; i < sizeof (dva_t); i++) 628 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 629 630 crc ^= (spav>>8) ^ birth; 631 632 return (crc); 633 } 634 635 #define BUF_EMPTY(buf) \ 636 ((buf)->b_dva.dva_word[0] == 0 && \ 637 (buf)->b_dva.dva_word[1] == 0 && \ 638 (buf)->b_birth == 0) 639 640 #define BUF_EQUAL(spa, dva, birth, buf) \ 641 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 642 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 643 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 644 645 static arc_buf_hdr_t * 646 buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) 647 { 648 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 649 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 650 arc_buf_hdr_t *buf; 651 652 mutex_enter(hash_lock); 653 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 654 buf = buf->b_hash_next) { 655 if (BUF_EQUAL(spa, dva, birth, buf)) { 656 *lockp = hash_lock; 657 return (buf); 658 } 659 } 660 mutex_exit(hash_lock); 661 *lockp = NULL; 662 return (NULL); 663 } 664 665 /* 666 * Insert an entry into the hash table. 
If there is already an element 667 * equal to elem in the hash table, then the already existing element 668 * will be returned and the new element will not be inserted. 669 * Otherwise returns NULL. 670 */ 671 static arc_buf_hdr_t * 672 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 673 { 674 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 675 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 676 arc_buf_hdr_t *fbuf; 677 uint32_t i; 678 679 ASSERT(!HDR_IN_HASH_TABLE(buf)); 680 *lockp = hash_lock; 681 mutex_enter(hash_lock); 682 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 683 fbuf = fbuf->b_hash_next, i++) { 684 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 685 return (fbuf); 686 } 687 688 buf->b_hash_next = buf_hash_table.ht_table[idx]; 689 buf_hash_table.ht_table[idx] = buf; 690 buf->b_flags |= ARC_IN_HASH_TABLE; 691 692 /* collect some hash table performance data */ 693 if (i > 0) { 694 ARCSTAT_BUMP(arcstat_hash_collisions); 695 if (i == 1) 696 ARCSTAT_BUMP(arcstat_hash_chains); 697 698 ARCSTAT_MAX(arcstat_hash_chain_max, i); 699 } 700 701 ARCSTAT_BUMP(arcstat_hash_elements); 702 ARCSTAT_MAXSTAT(arcstat_hash_elements); 703 704 return (NULL); 705 } 706 707 static void 708 buf_hash_remove(arc_buf_hdr_t *buf) 709 { 710 arc_buf_hdr_t *fbuf, **bufp; 711 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 712 713 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 714 ASSERT(HDR_IN_HASH_TABLE(buf)); 715 716 bufp = &buf_hash_table.ht_table[idx]; 717 while ((fbuf = *bufp) != buf) { 718 ASSERT(fbuf != NULL); 719 bufp = &fbuf->b_hash_next; 720 } 721 *bufp = buf->b_hash_next; 722 buf->b_hash_next = NULL; 723 buf->b_flags &= ~ARC_IN_HASH_TABLE; 724 725 /* collect some hash table performance data */ 726 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 727 728 if (buf_hash_table.ht_table[idx] && 729 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 730 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 731 } 732 733 /* 734 * Global data structures and functions for the buf kmem cache. 735 */ 736 static kmem_cache_t *hdr_cache; 737 static kmem_cache_t *buf_cache; 738 739 static void 740 buf_fini(void) 741 { 742 int i; 743 744 kmem_free(buf_hash_table.ht_table, 745 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 746 for (i = 0; i < BUF_LOCKS; i++) 747 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 748 kmem_cache_destroy(hdr_cache); 749 kmem_cache_destroy(buf_cache); 750 } 751 752 /* 753 * Constructor callback - called when the cache is empty 754 * and a new buf is requested. 755 */ 756 /* ARGSUSED */ 757 static int 758 hdr_cons(void *vbuf, void *unused, int kmflag) 759 { 760 arc_buf_hdr_t *buf = vbuf; 761 762 bzero(buf, sizeof (arc_buf_hdr_t)); 763 refcount_create(&buf->b_refcnt); 764 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 765 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 766 rw_init(&buf->b_datalock, NULL, RW_DEFAULT, NULL); 767 768 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 769 return (0); 770 } 771 772 /* 773 * Destructor callback - called when a cached buf is 774 * no longer required. 775 */ 776 /* ARGSUSED */ 777 static void 778 hdr_dest(void *vbuf, void *unused) 779 { 780 arc_buf_hdr_t *buf = vbuf; 781 782 refcount_destroy(&buf->b_refcnt); 783 cv_destroy(&buf->b_cv); 784 mutex_destroy(&buf->b_freeze_lock); 785 rw_destroy(&buf->b_datalock); 786 787 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 788 } 789 790 /* 791 * Reclaim callback -- invoked when memory is low. 
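 * (It is registered with kmem_cache_create() for the header cache in
 * buf_init() below, and simply wakes the ARC reclaim thread rather than
 * freeing memory itself.)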
792 */ 793 /* ARGSUSED */ 794 static void 795 hdr_recl(void *unused) 796 { 797 dprintf("hdr_recl called\n"); 798 /* 799 * umem calls the reclaim func when we destroy the buf cache, 800 * which is after we do arc_fini(). 801 */ 802 if (!arc_dead) 803 cv_signal(&arc_reclaim_thr_cv); 804 } 805 806 static void 807 buf_init(void) 808 { 809 uint64_t *ct; 810 uint64_t hsize = 1ULL << 12; 811 int i, j; 812 813 /* 814 * The hash table is big enough to fill all of physical memory 815 * with an average 64K block size. The table will take up 816 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 817 */ 818 while (hsize * 65536 < physmem * PAGESIZE) 819 hsize <<= 1; 820 retry: 821 buf_hash_table.ht_mask = hsize - 1; 822 buf_hash_table.ht_table = 823 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 824 if (buf_hash_table.ht_table == NULL) { 825 ASSERT(hsize > (1ULL << 8)); 826 hsize >>= 1; 827 goto retry; 828 } 829 830 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 831 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 832 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 833 0, NULL, NULL, NULL, NULL, NULL, 0); 834 835 for (i = 0; i < 256; i++) 836 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 837 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 838 839 for (i = 0; i < BUF_LOCKS; i++) { 840 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 841 NULL, MUTEX_DEFAULT, NULL); 842 } 843 } 844 845 #define ARC_MINTIME (hz>>4) /* 62 ms */ 846 847 static void 848 arc_cksum_verify(arc_buf_t *buf) 849 { 850 zio_cksum_t zc; 851 852 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 853 return; 854 855 mutex_enter(&buf->b_hdr->b_freeze_lock); 856 if (buf->b_hdr->b_freeze_cksum == NULL || 857 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 858 mutex_exit(&buf->b_hdr->b_freeze_lock); 859 return; 860 } 861 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 862 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 863 panic("buffer modified while frozen!"); 864 mutex_exit(&buf->b_hdr->b_freeze_lock); 865 } 866 867 static int 868 arc_cksum_equal(arc_buf_t *buf) 869 { 870 zio_cksum_t zc; 871 int equal; 872 873 mutex_enter(&buf->b_hdr->b_freeze_lock); 874 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 875 equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); 876 mutex_exit(&buf->b_hdr->b_freeze_lock); 877 878 return (equal); 879 } 880 881 static void 882 arc_cksum_compute(arc_buf_t *buf, boolean_t force) 883 { 884 if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) 885 return; 886 887 mutex_enter(&buf->b_hdr->b_freeze_lock); 888 if (buf->b_hdr->b_freeze_cksum != NULL) { 889 mutex_exit(&buf->b_hdr->b_freeze_lock); 890 return; 891 } 892 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 893 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 894 buf->b_hdr->b_freeze_cksum); 895 mutex_exit(&buf->b_hdr->b_freeze_lock); 896 } 897 898 void 899 arc_buf_thaw(arc_buf_t *buf) 900 { 901 if (zfs_flags & ZFS_DEBUG_MODIFY) { 902 if (buf->b_hdr->b_state != arc_anon) 903 panic("modifying non-anon buffer!"); 904 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 905 panic("modifying buffer while i/o in progress!"); 906 arc_cksum_verify(buf); 907 } 908 909 mutex_enter(&buf->b_hdr->b_freeze_lock); 910 if (buf->b_hdr->b_freeze_cksum != NULL) { 911 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 912 buf->b_hdr->b_freeze_cksum = NULL; 913 } 914 mutex_exit(&buf->b_hdr->b_freeze_lock); 915 } 916 917 void 918 arc_buf_freeze(arc_buf_t *buf) 919 { 920 if 
(!(zfs_flags & ZFS_DEBUG_MODIFY)) 921 return; 922 923 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 924 buf->b_hdr->b_state == arc_anon); 925 arc_cksum_compute(buf, B_FALSE); 926 } 927 928 static void 929 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 930 { 931 ASSERT(MUTEX_HELD(hash_lock)); 932 933 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 934 (ab->b_state != arc_anon)) { 935 uint64_t delta = ab->b_size * ab->b_datacnt; 936 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 937 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 938 939 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 940 mutex_enter(&ab->b_state->arcs_mtx); 941 ASSERT(list_link_active(&ab->b_arc_node)); 942 list_remove(list, ab); 943 if (GHOST_STATE(ab->b_state)) { 944 ASSERT3U(ab->b_datacnt, ==, 0); 945 ASSERT3P(ab->b_buf, ==, NULL); 946 delta = ab->b_size; 947 } 948 ASSERT(delta > 0); 949 ASSERT3U(*size, >=, delta); 950 atomic_add_64(size, -delta); 951 mutex_exit(&ab->b_state->arcs_mtx); 952 /* remove the prefetch flag if we get a reference */ 953 if (ab->b_flags & ARC_PREFETCH) 954 ab->b_flags &= ~ARC_PREFETCH; 955 } 956 } 957 958 static int 959 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 960 { 961 int cnt; 962 arc_state_t *state = ab->b_state; 963 964 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 965 ASSERT(!GHOST_STATE(state)); 966 967 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 968 (state != arc_anon)) { 969 uint64_t *size = &state->arcs_lsize[ab->b_type]; 970 971 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 972 mutex_enter(&state->arcs_mtx); 973 ASSERT(!list_link_active(&ab->b_arc_node)); 974 list_insert_head(&state->arcs_list[ab->b_type], ab); 975 ASSERT(ab->b_datacnt > 0); 976 atomic_add_64(size, ab->b_size * ab->b_datacnt); 977 mutex_exit(&state->arcs_mtx); 978 } 979 return (cnt); 980 } 981 982 /* 983 * Move the supplied buffer to the indicated state. The mutex 984 * for the buffer must be held by the caller. 985 */ 986 static void 987 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 988 { 989 arc_state_t *old_state = ab->b_state; 990 int64_t refcnt = refcount_count(&ab->b_refcnt); 991 uint64_t from_delta, to_delta; 992 993 ASSERT(MUTEX_HELD(hash_lock)); 994 ASSERT(new_state != old_state); 995 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 996 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 997 998 from_delta = to_delta = ab->b_datacnt * ab->b_size; 999 1000 /* 1001 * If this buffer is evictable, transfer it from the 1002 * old state list to the new state list. 1003 */ 1004 if (refcnt == 0) { 1005 if (old_state != arc_anon) { 1006 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 1007 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 1008 1009 if (use_mutex) 1010 mutex_enter(&old_state->arcs_mtx); 1011 1012 ASSERT(list_link_active(&ab->b_arc_node)); 1013 list_remove(&old_state->arcs_list[ab->b_type], ab); 1014 1015 /* 1016 * If prefetching out of the ghost cache, 1017 * we will have a non-null datacnt. 
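 * In that case from_delta keeps its b_datacnt * b_size value; only a
 * header with no data left (b_datacnt == 0) is charged at the ghost
 * rate of a single b_size.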
1018 */ 1019 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 1020 /* ghost elements have a ghost size */ 1021 ASSERT(ab->b_buf == NULL); 1022 from_delta = ab->b_size; 1023 } 1024 ASSERT3U(*size, >=, from_delta); 1025 atomic_add_64(size, -from_delta); 1026 1027 if (use_mutex) 1028 mutex_exit(&old_state->arcs_mtx); 1029 } 1030 if (new_state != arc_anon) { 1031 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 1032 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 1033 1034 if (use_mutex) 1035 mutex_enter(&new_state->arcs_mtx); 1036 1037 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 1038 1039 /* ghost elements have a ghost size */ 1040 if (GHOST_STATE(new_state)) { 1041 ASSERT(ab->b_datacnt == 0); 1042 ASSERT(ab->b_buf == NULL); 1043 to_delta = ab->b_size; 1044 } 1045 atomic_add_64(size, to_delta); 1046 1047 if (use_mutex) 1048 mutex_exit(&new_state->arcs_mtx); 1049 } 1050 } 1051 1052 ASSERT(!BUF_EMPTY(ab)); 1053 if (new_state == arc_anon) { 1054 buf_hash_remove(ab); 1055 } 1056 1057 /* adjust state sizes */ 1058 if (to_delta) 1059 atomic_add_64(&new_state->arcs_size, to_delta); 1060 if (from_delta) { 1061 ASSERT3U(old_state->arcs_size, >=, from_delta); 1062 atomic_add_64(&old_state->arcs_size, -from_delta); 1063 } 1064 ab->b_state = new_state; 1065 1066 /* adjust l2arc hdr stats */ 1067 if (new_state == arc_l2c_only) 1068 l2arc_hdr_stat_add(); 1069 else if (old_state == arc_l2c_only) 1070 l2arc_hdr_stat_remove(); 1071 } 1072 1073 void 1074 arc_space_consume(uint64_t space) 1075 { 1076 atomic_add_64(&arc_meta_used, space); 1077 atomic_add_64(&arc_size, space); 1078 } 1079 1080 void 1081 arc_space_return(uint64_t space) 1082 { 1083 ASSERT(arc_meta_used >= space); 1084 if (arc_meta_max < arc_meta_used) 1085 arc_meta_max = arc_meta_used; 1086 atomic_add_64(&arc_meta_used, -space); 1087 ASSERT(arc_size >= space); 1088 atomic_add_64(&arc_size, -space); 1089 } 1090 1091 void * 1092 arc_data_buf_alloc(uint64_t size) 1093 { 1094 if (arc_evict_needed(ARC_BUFC_DATA)) 1095 cv_signal(&arc_reclaim_thr_cv); 1096 atomic_add_64(&arc_size, size); 1097 return (zio_data_buf_alloc(size)); 1098 } 1099 1100 void 1101 arc_data_buf_free(void *buf, uint64_t size) 1102 { 1103 zio_data_buf_free(buf, size); 1104 ASSERT(arc_size >= size); 1105 atomic_add_64(&arc_size, -size); 1106 } 1107 1108 arc_buf_t * 1109 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 1110 { 1111 arc_buf_hdr_t *hdr; 1112 arc_buf_t *buf; 1113 1114 ASSERT3U(size, >, 0); 1115 hdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 1116 ASSERT(BUF_EMPTY(hdr)); 1117 hdr->b_size = size; 1118 hdr->b_type = type; 1119 hdr->b_spa = spa; 1120 hdr->b_state = arc_anon; 1121 hdr->b_arc_access = 0; 1122 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1123 buf->b_hdr = hdr; 1124 buf->b_data = NULL; 1125 buf->b_efunc = NULL; 1126 buf->b_private = NULL; 1127 buf->b_next = NULL; 1128 hdr->b_buf = buf; 1129 arc_get_data_buf(buf); 1130 hdr->b_datacnt = 1; 1131 hdr->b_flags = 0; 1132 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1133 (void) refcount_add(&hdr->b_refcnt, tag); 1134 1135 return (buf); 1136 } 1137 1138 static arc_buf_t * 1139 arc_buf_clone(arc_buf_t *from) 1140 { 1141 arc_buf_t *buf; 1142 arc_buf_hdr_t *hdr = from->b_hdr; 1143 uint64_t size = hdr->b_size; 1144 1145 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 1146 buf->b_hdr = hdr; 1147 buf->b_data = NULL; 1148 buf->b_efunc = NULL; 1149 buf->b_private = NULL; 1150 buf->b_next = hdr->b_buf; 1151 hdr->b_buf = buf; 1152 arc_get_data_buf(buf); 1153 bcopy(from->b_data, 
buf->b_data, size); 1154 hdr->b_datacnt += 1; 1155 return (buf); 1156 } 1157 1158 void 1159 arc_buf_add_ref(arc_buf_t *buf, void* tag) 1160 { 1161 arc_buf_hdr_t *hdr; 1162 kmutex_t *hash_lock; 1163 1164 /* 1165 * Check to see if this buffer is currently being evicted via 1166 * arc_do_user_evicts(). 1167 */ 1168 mutex_enter(&arc_eviction_mtx); 1169 hdr = buf->b_hdr; 1170 if (hdr == NULL) { 1171 mutex_exit(&arc_eviction_mtx); 1172 return; 1173 } 1174 hash_lock = HDR_LOCK(hdr); 1175 mutex_exit(&arc_eviction_mtx); 1176 1177 mutex_enter(hash_lock); 1178 if (buf->b_data == NULL) { 1179 /* 1180 * This buffer is evicted. 1181 */ 1182 mutex_exit(hash_lock); 1183 return; 1184 } 1185 1186 ASSERT(buf->b_hdr == hdr); 1187 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 1188 add_reference(hdr, hash_lock, tag); 1189 arc_access(hdr, hash_lock); 1190 mutex_exit(hash_lock); 1191 ARCSTAT_BUMP(arcstat_hits); 1192 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 1193 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 1194 data, metadata, hits); 1195 } 1196 1197 /* 1198 * Free the arc data buffer. If it is an l2arc write in progress, 1199 * the buffer is placed on l2arc_free_on_write to be freed later. 1200 */ 1201 static void 1202 arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), 1203 void *data, size_t size) 1204 { 1205 if (HDR_L2_WRITING(hdr)) { 1206 l2arc_data_free_t *df; 1207 df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); 1208 df->l2df_data = data; 1209 df->l2df_size = size; 1210 df->l2df_func = free_func; 1211 mutex_enter(&l2arc_free_on_write_mtx); 1212 list_insert_head(l2arc_free_on_write, df); 1213 mutex_exit(&l2arc_free_on_write_mtx); 1214 ARCSTAT_BUMP(arcstat_l2_free_on_write); 1215 } else { 1216 free_func(data, size); 1217 } 1218 } 1219 1220 static void 1221 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 1222 { 1223 arc_buf_t **bufp; 1224 1225 /* free up data associated with the buf */ 1226 if (buf->b_data) { 1227 arc_state_t *state = buf->b_hdr->b_state; 1228 uint64_t size = buf->b_hdr->b_size; 1229 arc_buf_contents_t type = buf->b_hdr->b_type; 1230 1231 arc_cksum_verify(buf); 1232 if (!recycle) { 1233 if (type == ARC_BUFC_METADATA) { 1234 arc_buf_data_free(buf->b_hdr, zio_buf_free, 1235 buf->b_data, size); 1236 arc_space_return(size); 1237 } else { 1238 ASSERT(type == ARC_BUFC_DATA); 1239 arc_buf_data_free(buf->b_hdr, 1240 zio_data_buf_free, buf->b_data, size); 1241 atomic_add_64(&arc_size, -size); 1242 } 1243 } 1244 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1245 uint64_t *cnt = &state->arcs_lsize[type]; 1246 1247 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1248 ASSERT(state != arc_anon); 1249 1250 ASSERT3U(*cnt, >=, size); 1251 atomic_add_64(cnt, -size); 1252 } 1253 ASSERT3U(state->arcs_size, >=, size); 1254 atomic_add_64(&state->arcs_size, -size); 1255 buf->b_data = NULL; 1256 ASSERT(buf->b_hdr->b_datacnt > 0); 1257 buf->b_hdr->b_datacnt -= 1; 1258 } 1259 1260 /* only remove the buf if requested */ 1261 if (!all) 1262 return; 1263 1264 /* remove the buf from the hdr list */ 1265 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1266 continue; 1267 *bufp = buf->b_next; 1268 1269 ASSERT(buf->b_efunc == NULL); 1270 1271 /* clean up the buf */ 1272 buf->b_hdr = NULL; 1273 kmem_cache_free(buf_cache, buf); 1274 } 1275 1276 static void 1277 arc_hdr_destroy(arc_buf_hdr_t *hdr) 1278 { 1279 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1280 ASSERT3P(hdr->b_state, ==, arc_anon); 1281 
ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1282 ASSERT(!(hdr->b_flags & ARC_STORED)); 1283 1284 if (hdr->b_l2hdr != NULL) { 1285 if (!MUTEX_HELD(&l2arc_buflist_mtx)) { 1286 /* 1287 * To prevent arc_free() and l2arc_evict() from 1288 * attempting to free the same buffer at the same time, 1289 * a FREE_IN_PROGRESS flag is given to arc_free() to 1290 * give it priority. l2arc_evict() can't destroy this 1291 * header while we are waiting on l2arc_buflist_mtx. 1292 */ 1293 mutex_enter(&l2arc_buflist_mtx); 1294 ASSERT(hdr->b_l2hdr != NULL); 1295 1296 list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); 1297 mutex_exit(&l2arc_buflist_mtx); 1298 } else { 1299 list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); 1300 } 1301 ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); 1302 kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); 1303 if (hdr->b_state == arc_l2c_only) 1304 l2arc_hdr_stat_remove(); 1305 hdr->b_l2hdr = NULL; 1306 } 1307 1308 if (!BUF_EMPTY(hdr)) { 1309 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1310 bzero(&hdr->b_dva, sizeof (dva_t)); 1311 hdr->b_birth = 0; 1312 hdr->b_cksum0 = 0; 1313 } 1314 while (hdr->b_buf) { 1315 arc_buf_t *buf = hdr->b_buf; 1316 1317 if (buf->b_efunc) { 1318 mutex_enter(&arc_eviction_mtx); 1319 ASSERT(buf->b_hdr != NULL); 1320 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1321 hdr->b_buf = buf->b_next; 1322 buf->b_hdr = &arc_eviction_hdr; 1323 buf->b_next = arc_eviction_list; 1324 arc_eviction_list = buf; 1325 mutex_exit(&arc_eviction_mtx); 1326 } else { 1327 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1328 } 1329 } 1330 if (hdr->b_freeze_cksum != NULL) { 1331 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1332 hdr->b_freeze_cksum = NULL; 1333 } 1334 1335 ASSERT(!list_link_active(&hdr->b_arc_node)); 1336 ASSERT3P(hdr->b_hash_next, ==, NULL); 1337 ASSERT3P(hdr->b_acb, ==, NULL); 1338 kmem_cache_free(hdr_cache, hdr); 1339 } 1340 1341 void 1342 arc_buf_free(arc_buf_t *buf, void *tag) 1343 { 1344 arc_buf_hdr_t *hdr = buf->b_hdr; 1345 int hashed = hdr->b_state != arc_anon; 1346 1347 ASSERT(buf->b_efunc == NULL); 1348 ASSERT(buf->b_data != NULL); 1349 1350 if (hashed) { 1351 kmutex_t *hash_lock = HDR_LOCK(hdr); 1352 1353 mutex_enter(hash_lock); 1354 (void) remove_reference(hdr, hash_lock, tag); 1355 if (hdr->b_datacnt > 1) 1356 arc_buf_destroy(buf, FALSE, TRUE); 1357 else 1358 hdr->b_flags |= ARC_BUF_AVAILABLE; 1359 mutex_exit(hash_lock); 1360 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1361 int destroy_hdr; 1362 /* 1363 * We are in the middle of an async write. Don't destroy 1364 * this buffer unless the write completes before we finish 1365 * decrementing the reference count. 
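 * (HDR_IO_IN_PROGRESS is re-checked under arc_eviction_mtx below, so the
 * header is destroyed here only if the write has already completed.)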
1366 */ 1367 mutex_enter(&arc_eviction_mtx); 1368 (void) remove_reference(hdr, NULL, tag); 1369 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1370 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1371 mutex_exit(&arc_eviction_mtx); 1372 if (destroy_hdr) 1373 arc_hdr_destroy(hdr); 1374 } else { 1375 if (remove_reference(hdr, NULL, tag) > 0) { 1376 ASSERT(HDR_IO_ERROR(hdr)); 1377 arc_buf_destroy(buf, FALSE, TRUE); 1378 } else { 1379 arc_hdr_destroy(hdr); 1380 } 1381 } 1382 } 1383 1384 int 1385 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1386 { 1387 arc_buf_hdr_t *hdr = buf->b_hdr; 1388 kmutex_t *hash_lock = HDR_LOCK(hdr); 1389 int no_callback = (buf->b_efunc == NULL); 1390 1391 if (hdr->b_state == arc_anon) { 1392 arc_buf_free(buf, tag); 1393 return (no_callback); 1394 } 1395 1396 mutex_enter(hash_lock); 1397 ASSERT(hdr->b_state != arc_anon); 1398 ASSERT(buf->b_data != NULL); 1399 1400 (void) remove_reference(hdr, hash_lock, tag); 1401 if (hdr->b_datacnt > 1) { 1402 if (no_callback) 1403 arc_buf_destroy(buf, FALSE, TRUE); 1404 } else if (no_callback) { 1405 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1406 hdr->b_flags |= ARC_BUF_AVAILABLE; 1407 } 1408 ASSERT(no_callback || hdr->b_datacnt > 1 || 1409 refcount_is_zero(&hdr->b_refcnt)); 1410 mutex_exit(hash_lock); 1411 return (no_callback); 1412 } 1413 1414 int 1415 arc_buf_size(arc_buf_t *buf) 1416 { 1417 return (buf->b_hdr->b_size); 1418 } 1419 1420 /* 1421 * Evict buffers from list until we've removed the specified number of 1422 * bytes. Move the removed buffers to the appropriate evict state. 1423 * If the recycle flag is set, then attempt to "recycle" a buffer: 1424 * - look for a buffer to evict that is `bytes' long. 1425 * - return the data block from this buffer rather than freeing it. 1426 * This flag is used by callers that are trying to make space for a 1427 * new buffer in a full arc cache. 1428 * 1429 * This function makes a "best effort". It skips over any buffers 1430 * it can't get a hash_lock on, and so may not catch all candidates. 1431 * It may also return without evicting as much space as requested. 1432 */ 1433 static void * 1434 arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, 1435 arc_buf_contents_t type) 1436 { 1437 arc_state_t *evicted_state; 1438 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1439 arc_buf_hdr_t *ab, *ab_prev = NULL; 1440 list_t *list = &state->arcs_list[type]; 1441 kmutex_t *hash_lock; 1442 boolean_t have_lock; 1443 void *stolen = NULL; 1444 1445 ASSERT(state == arc_mru || state == arc_mfu); 1446 1447 evicted_state = (state == arc_mru) ? 
arc_mru_ghost : arc_mfu_ghost; 1448 1449 mutex_enter(&state->arcs_mtx); 1450 mutex_enter(&evicted_state->arcs_mtx); 1451 1452 for (ab = list_tail(list); ab; ab = ab_prev) { 1453 ab_prev = list_prev(list, ab); 1454 /* prefetch buffers have a minimum lifespan */ 1455 if (HDR_IO_IN_PROGRESS(ab) || 1456 (spa && ab->b_spa != spa) || 1457 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1458 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1459 skipped++; 1460 continue; 1461 } 1462 /* "lookahead" for better eviction candidate */ 1463 if (recycle && ab->b_size != bytes && 1464 ab_prev && ab_prev->b_size == bytes) 1465 continue; 1466 hash_lock = HDR_LOCK(ab); 1467 have_lock = MUTEX_HELD(hash_lock); 1468 if (have_lock || mutex_tryenter(hash_lock)) { 1469 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1470 ASSERT(ab->b_datacnt > 0); 1471 while (ab->b_buf) { 1472 arc_buf_t *buf = ab->b_buf; 1473 if (buf->b_data) { 1474 bytes_evicted += ab->b_size; 1475 if (recycle && ab->b_type == type && 1476 ab->b_size == bytes && 1477 !HDR_L2_WRITING(ab)) { 1478 stolen = buf->b_data; 1479 recycle = FALSE; 1480 } 1481 } 1482 if (buf->b_efunc) { 1483 mutex_enter(&arc_eviction_mtx); 1484 arc_buf_destroy(buf, 1485 buf->b_data == stolen, FALSE); 1486 ab->b_buf = buf->b_next; 1487 buf->b_hdr = &arc_eviction_hdr; 1488 buf->b_next = arc_eviction_list; 1489 arc_eviction_list = buf; 1490 mutex_exit(&arc_eviction_mtx); 1491 } else { 1492 arc_buf_destroy(buf, 1493 buf->b_data == stolen, TRUE); 1494 } 1495 } 1496 ASSERT(ab->b_datacnt == 0); 1497 arc_change_state(evicted_state, ab, hash_lock); 1498 ASSERT(HDR_IN_HASH_TABLE(ab)); 1499 ab->b_flags |= ARC_IN_HASH_TABLE; 1500 ab->b_flags &= ~ARC_BUF_AVAILABLE; 1501 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1502 if (!have_lock) 1503 mutex_exit(hash_lock); 1504 if (bytes >= 0 && bytes_evicted >= bytes) 1505 break; 1506 } else { 1507 missed += 1; 1508 } 1509 } 1510 1511 mutex_exit(&evicted_state->arcs_mtx); 1512 mutex_exit(&state->arcs_mtx); 1513 1514 if (bytes_evicted < bytes) 1515 dprintf("only evicted %lld bytes from %x", 1516 (longlong_t)bytes_evicted, state); 1517 1518 if (skipped) 1519 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1520 1521 if (missed) 1522 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1523 1524 /* 1525 * We have just evicted some date into the ghost state, make 1526 * sure we also adjust the ghost state size if necessary. 1527 */ 1528 if (arc_no_grow && 1529 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 1530 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 1531 arc_mru_ghost->arcs_size - arc_c; 1532 1533 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 1534 int64_t todelete = 1535 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 1536 arc_evict_ghost(arc_mru_ghost, NULL, todelete); 1537 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 1538 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 1539 arc_mru_ghost->arcs_size + 1540 arc_mfu_ghost->arcs_size - arc_c); 1541 arc_evict_ghost(arc_mfu_ghost, NULL, todelete); 1542 } 1543 } 1544 1545 return (stolen); 1546 } 1547 1548 /* 1549 * Remove buffers from list until we've removed the specified number of 1550 * bytes. Destroy the buffers that are removed. 
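 * (A negative 'bytes' value means "no limit": arc_flush() passes -1 to
 * empty the ghost lists entirely.)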
1551 */ 1552 static void 1553 arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) 1554 { 1555 arc_buf_hdr_t *ab, *ab_prev; 1556 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1557 kmutex_t *hash_lock; 1558 uint64_t bytes_deleted = 0; 1559 uint64_t bufs_skipped = 0; 1560 1561 ASSERT(GHOST_STATE(state)); 1562 top: 1563 mutex_enter(&state->arcs_mtx); 1564 for (ab = list_tail(list); ab; ab = ab_prev) { 1565 ab_prev = list_prev(list, ab); 1566 if (spa && ab->b_spa != spa) 1567 continue; 1568 hash_lock = HDR_LOCK(ab); 1569 if (mutex_tryenter(hash_lock)) { 1570 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1571 ASSERT(ab->b_buf == NULL); 1572 ARCSTAT_BUMP(arcstat_deleted); 1573 bytes_deleted += ab->b_size; 1574 1575 if (ab->b_l2hdr != NULL) { 1576 /* 1577 * This buffer is cached on the 2nd Level ARC; 1578 * don't destroy the header. 1579 */ 1580 arc_change_state(arc_l2c_only, ab, hash_lock); 1581 mutex_exit(hash_lock); 1582 } else { 1583 arc_change_state(arc_anon, ab, hash_lock); 1584 mutex_exit(hash_lock); 1585 arc_hdr_destroy(ab); 1586 } 1587 1588 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1589 if (bytes >= 0 && bytes_deleted >= bytes) 1590 break; 1591 } else { 1592 if (bytes < 0) { 1593 mutex_exit(&state->arcs_mtx); 1594 mutex_enter(hash_lock); 1595 mutex_exit(hash_lock); 1596 goto top; 1597 } 1598 bufs_skipped += 1; 1599 } 1600 } 1601 mutex_exit(&state->arcs_mtx); 1602 1603 if (list == &state->arcs_list[ARC_BUFC_DATA] && 1604 (bytes < 0 || bytes_deleted < bytes)) { 1605 list = &state->arcs_list[ARC_BUFC_METADATA]; 1606 goto top; 1607 } 1608 1609 if (bufs_skipped) { 1610 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1611 ASSERT(bytes >= 0); 1612 } 1613 1614 if (bytes_deleted < bytes) 1615 dprintf("only deleted %lld bytes from %p", 1616 (longlong_t)bytes_deleted, state); 1617 } 1618 1619 static void 1620 arc_adjust(void) 1621 { 1622 int64_t top_sz, mru_over, arc_over, todelete; 1623 1624 top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; 1625 1626 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 1627 int64_t toevict = 1628 MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); 1629 (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); 1630 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1631 } 1632 1633 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1634 int64_t toevict = 1635 MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); 1636 (void) arc_evict(arc_mru, NULL, toevict, FALSE, 1637 ARC_BUFC_METADATA); 1638 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1639 } 1640 1641 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1642 1643 if (mru_over > 0) { 1644 if (arc_mru_ghost->arcs_size > 0) { 1645 todelete = MIN(arc_mru_ghost->arcs_size, mru_over); 1646 arc_evict_ghost(arc_mru_ghost, NULL, todelete); 1647 } 1648 } 1649 1650 if ((arc_over = arc_size - arc_c) > 0) { 1651 int64_t tbl_over; 1652 1653 if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 1654 int64_t toevict = 1655 MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); 1656 (void) arc_evict(arc_mfu, NULL, toevict, FALSE, 1657 ARC_BUFC_DATA); 1658 arc_over = arc_size - arc_c; 1659 } 1660 1661 if (arc_over > 0 && 1662 arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1663 int64_t toevict = 1664 MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], 1665 arc_over); 1666 (void) arc_evict(arc_mfu, NULL, toevict, FALSE, 1667 ARC_BUFC_METADATA); 1668 } 1669 1670 tbl_over = arc_size + arc_mru_ghost->arcs_size + 1671 arc_mfu_ghost->arcs_size - arc_c * 2; 1672 1673 if (tbl_over > 
0 && arc_mfu_ghost->arcs_size > 0) { 1674 todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); 1675 arc_evict_ghost(arc_mfu_ghost, NULL, todelete); 1676 } 1677 } 1678 } 1679 1680 static void 1681 arc_do_user_evicts(void) 1682 { 1683 mutex_enter(&arc_eviction_mtx); 1684 while (arc_eviction_list != NULL) { 1685 arc_buf_t *buf = arc_eviction_list; 1686 arc_eviction_list = buf->b_next; 1687 buf->b_hdr = NULL; 1688 mutex_exit(&arc_eviction_mtx); 1689 1690 if (buf->b_efunc != NULL) 1691 VERIFY(buf->b_efunc(buf) == 0); 1692 1693 buf->b_efunc = NULL; 1694 buf->b_private = NULL; 1695 kmem_cache_free(buf_cache, buf); 1696 mutex_enter(&arc_eviction_mtx); 1697 } 1698 mutex_exit(&arc_eviction_mtx); 1699 } 1700 1701 /* 1702 * Flush all *evictable* data from the cache for the given spa. 1703 * NOTE: this will not touch "active" (i.e. referenced) data. 1704 */ 1705 void 1706 arc_flush(spa_t *spa) 1707 { 1708 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { 1709 (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); 1710 if (spa) 1711 break; 1712 } 1713 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { 1714 (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); 1715 if (spa) 1716 break; 1717 } 1718 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { 1719 (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); 1720 if (spa) 1721 break; 1722 } 1723 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { 1724 (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); 1725 if (spa) 1726 break; 1727 } 1728 1729 arc_evict_ghost(arc_mru_ghost, spa, -1); 1730 arc_evict_ghost(arc_mfu_ghost, spa, -1); 1731 1732 mutex_enter(&arc_reclaim_thr_lock); 1733 arc_do_user_evicts(); 1734 mutex_exit(&arc_reclaim_thr_lock); 1735 ASSERT(spa || arc_eviction_list == NULL); 1736 } 1737 1738 int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1739 1740 void 1741 arc_shrink(void) 1742 { 1743 if (arc_c > arc_c_min) { 1744 uint64_t to_free; 1745 1746 #ifdef _KERNEL 1747 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 1748 #else 1749 to_free = arc_c >> arc_shrink_shift; 1750 #endif 1751 if (arc_c > arc_c_min + to_free) 1752 atomic_add_64(&arc_c, -to_free); 1753 else 1754 arc_c = arc_c_min; 1755 1756 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1757 if (arc_c > arc_size) 1758 arc_c = MAX(arc_size, arc_c_min); 1759 if (arc_p > arc_c) 1760 arc_p = (arc_c >> 1); 1761 ASSERT(arc_c >= arc_c_min); 1762 ASSERT((int64_t)arc_p >= 0); 1763 } 1764 1765 if (arc_size > arc_c) 1766 arc_adjust(); 1767 } 1768 1769 static int 1770 arc_reclaim_needed(void) 1771 { 1772 uint64_t extra; 1773 1774 #ifdef _KERNEL 1775 1776 if (needfree) 1777 return (1); 1778 1779 /* 1780 * take 'desfree' extra pages, so we reclaim sooner, rather than later 1781 */ 1782 extra = desfree; 1783 1784 /* 1785 * check that we're out of range of the pageout scanner. It starts to 1786 * schedule paging if freemem is less than lotsfree and needfree. 1787 * lotsfree is the high-water mark for pageout, and needfree is the 1788 * number of needed free pages. We add extra pages here to make sure 1789 * the scanner doesn't start up while we're freeing memory. 1790 */ 1791 if (freemem < lotsfree + needfree + extra) 1792 return (1); 1793 1794 /* 1795 * check to make sure that swapfs has enough space so that anon 1796 * reservations can still succeed. anon_resvmem() checks that the 1797 * availrmem is greater than swapfs_minfree, and the number of reserved 1798 * swap pages. 
We also add a bit of extra here just to prevent 1799 * circumstances from getting really dire. 1800 */ 1801 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1802 return (1); 1803 1804 #if defined(__i386) 1805 /* 1806 * If we're on an i386 platform, it's possible that we'll exhaust the 1807 * kernel heap space before we ever run out of available physical 1808 * memory. Most checks of the size of the heap_area compare against 1809 * tune.t_minarmem, which is the minimum available real memory that we 1810 * can have in the system. However, this is generally fixed at 25 pages 1811 * which is so low that it's useless. In this comparison, we seek to 1812 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1813 * heap is allocated. (Or, in the calculation, if less than 1/4th is 1814 * free) 1815 */ 1816 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1817 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1818 return (1); 1819 #endif 1820 1821 #else 1822 if (spa_get_random(100) == 0) 1823 return (1); 1824 #endif 1825 return (0); 1826 } 1827 1828 static void 1829 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1830 { 1831 size_t i; 1832 kmem_cache_t *prev_cache = NULL; 1833 kmem_cache_t *prev_data_cache = NULL; 1834 extern kmem_cache_t *zio_buf_cache[]; 1835 extern kmem_cache_t *zio_data_buf_cache[]; 1836 1837 #ifdef _KERNEL 1838 if (arc_meta_used >= arc_meta_limit) { 1839 /* 1840 * We are exceeding our meta-data cache limit. 1841 * Purge some DNLC entries to release holds on meta-data. 1842 */ 1843 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1844 } 1845 #if defined(__i386) 1846 /* 1847 * Reclaim unused memory from all kmem caches. 1848 */ 1849 kmem_reap(); 1850 #endif 1851 #endif 1852 1853 /* 1854 * An aggressive reclamation will shrink the cache size as well as 1855 * reap free buffers from the arc kmem caches. 
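 * As a rough sense of scale (illustrative figures, not measurements):
 * with the default arc_shrink_shift of 5, arc_shrink() trims at least
 * arc_c >> 5, i.e. about 1/32nd of the target size per aggressive pass
 * (roughly 128 Mbytes of a 4 Gbyte cache), so sustained pressure is
 * relieved over several passes rather than in one step.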
1856 */ 1857 if (strat == ARC_RECLAIM_AGGR) 1858 arc_shrink(); 1859 1860 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1861 if (zio_buf_cache[i] != prev_cache) { 1862 prev_cache = zio_buf_cache[i]; 1863 kmem_cache_reap_now(zio_buf_cache[i]); 1864 } 1865 if (zio_data_buf_cache[i] != prev_data_cache) { 1866 prev_data_cache = zio_data_buf_cache[i]; 1867 kmem_cache_reap_now(zio_data_buf_cache[i]); 1868 } 1869 } 1870 kmem_cache_reap_now(buf_cache); 1871 kmem_cache_reap_now(hdr_cache); 1872 } 1873 1874 static void 1875 arc_reclaim_thread(void) 1876 { 1877 clock_t growtime = 0; 1878 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1879 callb_cpr_t cpr; 1880 1881 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1882 1883 mutex_enter(&arc_reclaim_thr_lock); 1884 while (arc_thread_exit == 0) { 1885 if (arc_reclaim_needed()) { 1886 1887 if (arc_no_grow) { 1888 if (last_reclaim == ARC_RECLAIM_CONS) { 1889 last_reclaim = ARC_RECLAIM_AGGR; 1890 } else { 1891 last_reclaim = ARC_RECLAIM_CONS; 1892 } 1893 } else { 1894 arc_no_grow = TRUE; 1895 last_reclaim = ARC_RECLAIM_AGGR; 1896 membar_producer(); 1897 } 1898 1899 /* reset the growth delay for every reclaim */ 1900 growtime = lbolt + (arc_grow_retry * hz); 1901 1902 arc_kmem_reap_now(last_reclaim); 1903 arc_warm = B_TRUE; 1904 1905 } else if (arc_no_grow && lbolt >= growtime) { 1906 arc_no_grow = FALSE; 1907 } 1908 1909 if (2 * arc_c < arc_size + 1910 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) 1911 arc_adjust(); 1912 1913 if (arc_eviction_list != NULL) 1914 arc_do_user_evicts(); 1915 1916 /* block until needed, or one second, whichever is shorter */ 1917 CALLB_CPR_SAFE_BEGIN(&cpr); 1918 (void) cv_timedwait(&arc_reclaim_thr_cv, 1919 &arc_reclaim_thr_lock, (lbolt + hz)); 1920 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1921 } 1922 1923 arc_thread_exit = 0; 1924 cv_broadcast(&arc_reclaim_thr_cv); 1925 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1926 thread_exit(); 1927 } 1928 1929 /* 1930 * Adapt arc info given the number of bytes we are trying to add and 1931 * the state that we are coming from. This function is only called 1932 * when we are adding new content to the cache. 1933 */ 1934 static void 1935 arc_adapt(int bytes, arc_state_t *state) 1936 { 1937 int mult; 1938 1939 if (state == arc_l2c_only) 1940 return; 1941 1942 ASSERT(bytes > 0); 1943 /* 1944 * Adapt the target size of the MRU list: 1945 * - if we just hit in the MRU ghost list, then increase 1946 * the target size of the MRU list. 1947 * - if we just hit in the MFU ghost list, then increase 1948 * the target size of the MFU list by decreasing the 1949 * target size of the MRU list. 1950 */ 1951 if (state == arc_mru_ghost) { 1952 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1953 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1954 1955 arc_p = MIN(arc_c, arc_p + bytes * mult); 1956 } else if (state == arc_mfu_ghost) { 1957 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
1958 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1959 1960 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1961 } 1962 ASSERT((int64_t)arc_p >= 0); 1963 1964 if (arc_reclaim_needed()) { 1965 cv_signal(&arc_reclaim_thr_cv); 1966 return; 1967 } 1968 1969 if (arc_no_grow) 1970 return; 1971 1972 if (arc_c >= arc_c_max) 1973 return; 1974 1975 /* 1976 * If we're within (2 * maxblocksize) bytes of the target 1977 * cache size, increment the target cache size 1978 */ 1979 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1980 atomic_add_64(&arc_c, (int64_t)bytes); 1981 if (arc_c > arc_c_max) 1982 arc_c = arc_c_max; 1983 else if (state == arc_anon) 1984 atomic_add_64(&arc_p, (int64_t)bytes); 1985 if (arc_p > arc_c) 1986 arc_p = arc_c; 1987 } 1988 ASSERT((int64_t)arc_p >= 0); 1989 } 1990 1991 /* 1992 * Check if the cache has reached its limits and eviction is required 1993 * prior to insert. 1994 */ 1995 static int 1996 arc_evict_needed(arc_buf_contents_t type) 1997 { 1998 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 1999 return (1); 2000 2001 #ifdef _KERNEL 2002 /* 2003 * If zio data pages are being allocated out of a separate heap segment, 2004 * then enforce that the size of available vmem for this area remains 2005 * above about 1/32nd free. 2006 */ 2007 if (type == ARC_BUFC_DATA && zio_arena != NULL && 2008 vmem_size(zio_arena, VMEM_FREE) < 2009 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 2010 return (1); 2011 #endif 2012 2013 if (arc_reclaim_needed()) 2014 return (1); 2015 2016 return (arc_size > arc_c); 2017 } 2018 2019 /* 2020 * The buffer, supplied as the first argument, needs a data block. 2021 * So, if we are at cache max, determine which cache should be victimized. 2022 * We have the following cases: 2023 * 2024 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 2025 * In this situation if we're out of space, but the resident size of the MFU is 2026 * under the limit, victimize the MFU cache to satisfy this insertion request. 2027 * 2028 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 2029 * Here, we've used up all of the available space for the MRU, so we need to 2030 * evict from our own cache instead. Evict from the set of resident MRU 2031 * entries. 2032 * 2033 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 2034 * c minus p represents the MFU space in the cache, since p is the size of the 2035 * cache that is dedicated to the MRU. In this situation there's still space on 2036 * the MFU side, so the MRU side needs to be victimized. 2037 * 2038 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 2039 * MFU's resident set is consuming more space than it has been allotted. In 2040 * this situation, we must victimize our own cache, the MFU, for this insertion. 2041 */ 2042 static void 2043 arc_get_data_buf(arc_buf_t *buf) 2044 { 2045 arc_state_t *state = buf->b_hdr->b_state; 2046 uint64_t size = buf->b_hdr->b_size; 2047 arc_buf_contents_t type = buf->b_hdr->b_type; 2048 2049 arc_adapt(size, state); 2050 2051 /* 2052 * We have not yet reached cache maximum size, 2053 * just allocate a new buffer. 2054 */ 2055 if (!arc_evict_needed(type)) { 2056 if (type == ARC_BUFC_METADATA) { 2057 buf->b_data = zio_buf_alloc(size); 2058 arc_space_consume(size); 2059 } else { 2060 ASSERT(type == ARC_BUFC_DATA); 2061 buf->b_data = zio_data_buf_alloc(size); 2062 atomic_add_64(&arc_size, size); 2063 } 2064 goto out; 2065 } 2066 2067 /* 2068 * If we are prefetching from the mfu ghost list, this buffer 2069 * will end up on the mru list; so steal space from there. 
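 * To illustrate the selection below with made-up sizes: if arc_p is
 * 2 Gbytes while arc_anon plus arc_mru hold only 1.5 Gbytes, the MRU
 * side is still under its target, so the insert steals from arc_mfu
 * (provided the MFU list has evictable buffers of this type); once
 * arc_anon + arc_mru exceeds arc_p, the insert evicts from arc_mru
 * itself instead.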
2070 */ 2071 if (state == arc_mfu_ghost) 2072 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 2073 else if (state == arc_mru_ghost) 2074 state = arc_mru; 2075 2076 if (state == arc_mru || state == arc_anon) { 2077 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 2078 state = (arc_mfu->arcs_lsize[type] > 0 && 2079 arc_p > mru_used) ? arc_mfu : arc_mru; 2080 } else { 2081 /* MFU cases */ 2082 uint64_t mfu_space = arc_c - arc_p; 2083 state = (arc_mru->arcs_lsize[type] > 0 && 2084 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 2085 } 2086 if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { 2087 if (type == ARC_BUFC_METADATA) { 2088 buf->b_data = zio_buf_alloc(size); 2089 arc_space_consume(size); 2090 } else { 2091 ASSERT(type == ARC_BUFC_DATA); 2092 buf->b_data = zio_data_buf_alloc(size); 2093 atomic_add_64(&arc_size, size); 2094 } 2095 ARCSTAT_BUMP(arcstat_recycle_miss); 2096 } 2097 ASSERT(buf->b_data != NULL); 2098 out: 2099 /* 2100 * Update the state size. Note that ghost states have a 2101 * "ghost size" and so don't need to be updated. 2102 */ 2103 if (!GHOST_STATE(buf->b_hdr->b_state)) { 2104 arc_buf_hdr_t *hdr = buf->b_hdr; 2105 2106 atomic_add_64(&hdr->b_state->arcs_size, size); 2107 if (list_link_active(&hdr->b_arc_node)) { 2108 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2109 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 2110 } 2111 /* 2112 * If we are growing the cache, and we are adding anonymous 2113 * data, and we have outgrown arc_p, update arc_p 2114 */ 2115 if (arc_size < arc_c && hdr->b_state == arc_anon && 2116 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 2117 arc_p = MIN(arc_c, arc_p + size); 2118 } 2119 } 2120 2121 /* 2122 * This routine is called whenever a buffer is accessed. 2123 * NOTE: the hash lock is dropped in this function. 2124 */ 2125 static void 2126 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 2127 { 2128 ASSERT(MUTEX_HELD(hash_lock)); 2129 2130 if (buf->b_state == arc_anon) { 2131 /* 2132 * This buffer is not in the cache, and does not 2133 * appear in our "ghost" list. Add the new buffer 2134 * to the MRU state. 2135 */ 2136 2137 ASSERT(buf->b_arc_access == 0); 2138 buf->b_arc_access = lbolt; 2139 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2140 arc_change_state(arc_mru, buf, hash_lock); 2141 2142 } else if (buf->b_state == arc_mru) { 2143 /* 2144 * If this buffer is here because of a prefetch, then either: 2145 * - clear the flag if this is a "referencing" read 2146 * (any subsequent access will bump this into the MFU state). 2147 * or 2148 * - move the buffer to the head of the list if this is 2149 * another prefetch (to make it less likely to be evicted). 2150 */ 2151 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2152 if (refcount_count(&buf->b_refcnt) == 0) { 2153 ASSERT(list_link_active(&buf->b_arc_node)); 2154 } else { 2155 buf->b_flags &= ~ARC_PREFETCH; 2156 ARCSTAT_BUMP(arcstat_mru_hits); 2157 } 2158 buf->b_arc_access = lbolt; 2159 return; 2160 } 2161 2162 /* 2163 * This buffer has been "accessed" only once so far, 2164 * but it is still in the cache. Move it to the MFU 2165 * state. 2166 */ 2167 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 2168 /* 2169 * More than 125ms have passed since we 2170 * instantiated this buffer. Move it to the 2171 * most frequently used state. 
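 * As a summary of the transitions this function makes (restating the
 * branches above and below, not adding new policy): anon -> mru on
 * first insert; mru -> mfu on a second access more than ARC_MINTIME
 * after the first; mru_ghost and mfu_ghost hits return to mfu, or to
 * mru when the access is a prefetch; l2c_only headers rejoin via mfu.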
2172 */ 2173 buf->b_arc_access = lbolt; 2174 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2175 arc_change_state(arc_mfu, buf, hash_lock); 2176 } 2177 ARCSTAT_BUMP(arcstat_mru_hits); 2178 } else if (buf->b_state == arc_mru_ghost) { 2179 arc_state_t *new_state; 2180 /* 2181 * This buffer has been "accessed" recently, but 2182 * was evicted from the cache. Move it to the 2183 * MFU state. 2184 */ 2185 2186 if (buf->b_flags & ARC_PREFETCH) { 2187 new_state = arc_mru; 2188 if (refcount_count(&buf->b_refcnt) > 0) 2189 buf->b_flags &= ~ARC_PREFETCH; 2190 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 2191 } else { 2192 new_state = arc_mfu; 2193 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2194 } 2195 2196 buf->b_arc_access = lbolt; 2197 arc_change_state(new_state, buf, hash_lock); 2198 2199 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 2200 } else if (buf->b_state == arc_mfu) { 2201 /* 2202 * This buffer has been accessed more than once and is 2203 * still in the cache. Keep it in the MFU state. 2204 * 2205 * NOTE: an add_reference() that occurred when we did 2206 * the arc_read() will have kicked this off the list. 2207 * If it was a prefetch, we will explicitly move it to 2208 * the head of the list now. 2209 */ 2210 if ((buf->b_flags & ARC_PREFETCH) != 0) { 2211 ASSERT(refcount_count(&buf->b_refcnt) == 0); 2212 ASSERT(list_link_active(&buf->b_arc_node)); 2213 } 2214 ARCSTAT_BUMP(arcstat_mfu_hits); 2215 buf->b_arc_access = lbolt; 2216 } else if (buf->b_state == arc_mfu_ghost) { 2217 arc_state_t *new_state = arc_mfu; 2218 /* 2219 * This buffer has been accessed more than once but has 2220 * been evicted from the cache. Move it back to the 2221 * MFU state. 2222 */ 2223 2224 if (buf->b_flags & ARC_PREFETCH) { 2225 /* 2226 * This is a prefetch access... 2227 * move this block back to the MRU state. 2228 */ 2229 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 2230 new_state = arc_mru; 2231 } 2232 2233 buf->b_arc_access = lbolt; 2234 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2235 arc_change_state(new_state, buf, hash_lock); 2236 2237 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 2238 } else if (buf->b_state == arc_l2c_only) { 2239 /* 2240 * This buffer is on the 2nd Level ARC. 2241 */ 2242 2243 buf->b_arc_access = lbolt; 2244 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 2245 arc_change_state(arc_mfu, buf, hash_lock); 2246 } else { 2247 ASSERT(!"invalid arc state"); 2248 } 2249 } 2250 2251 /* a generic arc_done_func_t which you can use */ 2252 /* ARGSUSED */ 2253 void 2254 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 2255 { 2256 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 2257 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2258 } 2259 2260 /* a generic arc_done_func_t */ 2261 void 2262 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 2263 { 2264 arc_buf_t **bufp = arg; 2265 if (zio && zio->io_error) { 2266 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 2267 *bufp = NULL; 2268 } else { 2269 *bufp = buf; 2270 } 2271 } 2272 2273 static void 2274 arc_read_done(zio_t *zio) 2275 { 2276 arc_buf_hdr_t *hdr, *found; 2277 arc_buf_t *buf; 2278 arc_buf_t *abuf; /* buffer we're assigning to callback */ 2279 kmutex_t *hash_lock; 2280 arc_callback_t *callback_list, *acb; 2281 int freeable = FALSE; 2282 2283 buf = zio->io_private; 2284 hdr = buf->b_hdr; 2285 2286 /* 2287 * The hdr was inserted into hash-table and removed from lists 2288 * prior to starting I/O. 
2288 We should find this header, since 2289 * it's in the hash table, and it should be legit since it's 2290 * not possible to evict it during the I/O. The only possible 2291 * reason for it not to be found is if we were freed during the 2292 * read. 2293 */ 2294 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 2295 &hash_lock); 2296 2297 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 2298 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 2299 (found == hdr && HDR_L2_READING(hdr))); 2300 2301 hdr->b_flags &= ~ARC_L2_EVICTED; 2302 if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) 2303 hdr->b_flags |= ARC_DONT_L2CACHE; 2304 2305 /* byteswap if necessary */ 2306 callback_list = hdr->b_acb; 2307 ASSERT(callback_list != NULL); 2308 if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 2309 arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? 2310 byteswap_uint64_array : 2311 dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; 2312 func(buf->b_data, hdr->b_size); 2313 } 2314 2315 arc_cksum_compute(buf, B_FALSE); 2316 2317 /* create copies of the data buffer for the callers */ 2318 abuf = buf; 2319 for (acb = callback_list; acb; acb = acb->acb_next) { 2320 if (acb->acb_done) { 2321 if (abuf == NULL) 2322 abuf = arc_buf_clone(buf); 2323 acb->acb_buf = abuf; 2324 abuf = NULL; 2325 } 2326 } 2327 hdr->b_acb = NULL; 2328 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2329 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2330 if (abuf == buf) 2331 hdr->b_flags |= ARC_BUF_AVAILABLE; 2332 2333 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2334 2335 if (zio->io_error != 0) { 2336 hdr->b_flags |= ARC_IO_ERROR; 2337 if (hdr->b_state != arc_anon) 2338 arc_change_state(arc_anon, hdr, hash_lock); 2339 if (HDR_IN_HASH_TABLE(hdr)) 2340 buf_hash_remove(hdr); 2341 freeable = refcount_is_zero(&hdr->b_refcnt); 2342 /* convert checksum errors into IO errors */ 2343 if (zio->io_error == ECKSUM) 2344 zio->io_error = EIO; 2345 } 2346 2347 /* 2348 * Broadcast before we drop the hash_lock to avoid the possibility 2349 * that the hdr (and hence the cv) might be freed before we get to 2350 * the cv_broadcast(). 2351 */ 2352 cv_broadcast(&hdr->b_cv); 2353 2354 if (hash_lock) { 2355 /* 2356 * Only call arc_access on anonymous buffers. This is because 2357 * if we've issued an I/O for an evicted buffer, we've already 2358 * called arc_access (to prevent any simultaneous readers from 2359 * getting confused). 2360 */ 2361 if (zio->io_error == 0 && hdr->b_state == arc_anon) 2362 arc_access(hdr, hash_lock); 2363 mutex_exit(hash_lock); 2364 } else { 2365 /* 2366 * This block was freed while we waited for the read to 2367 * complete. It has been removed from the hash table and 2368 * moved to the anonymous state (so that it won't show up 2369 * in the cache). 2370 */ 2371 ASSERT3P(hdr->b_state, ==, arc_anon); 2372 freeable = refcount_is_zero(&hdr->b_refcnt); 2373 } 2374 2375 /* execute each callback and free its structure */ 2376 while ((acb = callback_list) != NULL) { 2377 if (acb->acb_done) 2378 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2379 2380 if (acb->acb_zio_dummy != NULL) { 2381 acb->acb_zio_dummy->io_error = zio->io_error; 2382 zio_nowait(acb->acb_zio_dummy); 2383 } 2384 2385 callback_list = acb->acb_next; 2386 kmem_free(acb, sizeof (arc_callback_t)); 2387 } 2388 2389 if (freeable) 2390 arc_hdr_destroy(hdr); 2391 } 2392 2393 /* 2394 * "Read" the block at the specified DVA (in bp) via the 2395 * cache.
If the block is found in the cache, invoke the provided 2396 * callback immediately and return. Note that the `zio' parameter 2397 * in the callback will be NULL in this case, since no IO was 2398 * required. If the block is not in the cache pass the read request 2399 * on to the spa with a substitute callback function, so that the 2400 * requested block will be added to the cache. 2401 * 2402 * If a read request arrives for a block that has a read in-progress, 2403 * either wait for the in-progress read to complete (and return the 2404 * results); or, if this is a read with a "done" func, add a record 2405 * to the read to invoke the "done" func when the read completes, 2406 * and return; or just return. 2407 * 2408 * arc_read_done() will invoke all the requested "done" functions 2409 * for readers of this block. 2410 * 2411 * Normal callers should use arc_read and pass the arc buffer and offset 2412 * for the bp. But if you know you don't need locking, you can use 2413 * arc_read_bp. 2414 */ 2415 int 2416 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, 2417 arc_done_func_t *done, void *private, int priority, int flags, 2418 uint32_t *arc_flags, const zbookmark_t *zb) 2419 { 2420 int err; 2421 2422 ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); 2423 ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); 2424 rw_enter(&pbuf->b_hdr->b_datalock, RW_READER); 2425 2426 err = arc_read_nolock(pio, spa, bp, done, private, priority, 2427 flags, arc_flags, zb); 2428 2429 rw_exit(&pbuf->b_hdr->b_datalock); 2430 return (err); 2431 } 2432 2433 int 2434 arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, 2435 arc_done_func_t *done, void *private, int priority, int flags, 2436 uint32_t *arc_flags, const zbookmark_t *zb) 2437 { 2438 arc_buf_hdr_t *hdr; 2439 arc_buf_t *buf; 2440 kmutex_t *hash_lock; 2441 zio_t *rzio; 2442 2443 top: 2444 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2445 if (hdr && hdr->b_datacnt > 0) { 2446 2447 *arc_flags |= ARC_CACHED; 2448 2449 if (HDR_IO_IN_PROGRESS(hdr)) { 2450 2451 if (*arc_flags & ARC_WAIT) { 2452 cv_wait(&hdr->b_cv, hash_lock); 2453 mutex_exit(hash_lock); 2454 goto top; 2455 } 2456 ASSERT(*arc_flags & ARC_NOWAIT); 2457 2458 if (done) { 2459 arc_callback_t *acb = NULL; 2460 2461 acb = kmem_zalloc(sizeof (arc_callback_t), 2462 KM_SLEEP); 2463 acb->acb_done = done; 2464 acb->acb_private = private; 2465 if (pio != NULL) 2466 acb->acb_zio_dummy = zio_null(pio, 2467 spa, NULL, NULL, flags); 2468 2469 ASSERT(acb->acb_done != NULL); 2470 acb->acb_next = hdr->b_acb; 2471 hdr->b_acb = acb; 2472 add_reference(hdr, hash_lock, private); 2473 mutex_exit(hash_lock); 2474 return (0); 2475 } 2476 mutex_exit(hash_lock); 2477 return (0); 2478 } 2479 2480 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2481 2482 if (done) { 2483 add_reference(hdr, hash_lock, private); 2484 /* 2485 * If this block is already in use, create a new 2486 * copy of the data so that we will be guaranteed 2487 * that arc_release() will always succeed. 
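 * Concretely: if another caller already holds this cached buffer, the
 * clone gives the new caller a private arc_buf_t, so either holder can
 * later arc_release() and modify its own copy without disturbing the
 * other's view of the block.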
2488 */ 2489 buf = hdr->b_buf; 2490 ASSERT(buf); 2491 ASSERT(buf->b_data); 2492 if (HDR_BUF_AVAILABLE(hdr)) { 2493 ASSERT(buf->b_efunc == NULL); 2494 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2495 } else { 2496 buf = arc_buf_clone(buf); 2497 } 2498 } else if (*arc_flags & ARC_PREFETCH && 2499 refcount_count(&hdr->b_refcnt) == 0) { 2500 hdr->b_flags |= ARC_PREFETCH; 2501 } 2502 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2503 arc_access(hdr, hash_lock); 2504 mutex_exit(hash_lock); 2505 ARCSTAT_BUMP(arcstat_hits); 2506 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2507 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2508 data, metadata, hits); 2509 2510 if (done) 2511 done(NULL, buf, private); 2512 } else { 2513 uint64_t size = BP_GET_LSIZE(bp); 2514 arc_callback_t *acb; 2515 vdev_t *vd = NULL; 2516 daddr_t addr; 2517 2518 if (hdr == NULL) { 2519 /* this block is not in the cache */ 2520 arc_buf_hdr_t *exists; 2521 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2522 buf = arc_buf_alloc(spa, size, private, type); 2523 hdr = buf->b_hdr; 2524 hdr->b_dva = *BP_IDENTITY(bp); 2525 hdr->b_birth = bp->blk_birth; 2526 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2527 exists = buf_hash_insert(hdr, &hash_lock); 2528 if (exists) { 2529 /* somebody beat us to the hash insert */ 2530 mutex_exit(hash_lock); 2531 bzero(&hdr->b_dva, sizeof (dva_t)); 2532 hdr->b_birth = 0; 2533 hdr->b_cksum0 = 0; 2534 (void) arc_buf_remove_ref(buf, private); 2535 goto top; /* restart the IO request */ 2536 } 2537 /* if this is a prefetch, we don't have a reference */ 2538 if (*arc_flags & ARC_PREFETCH) { 2539 (void) remove_reference(hdr, hash_lock, 2540 private); 2541 hdr->b_flags |= ARC_PREFETCH; 2542 } 2543 if (BP_GET_LEVEL(bp) > 0) 2544 hdr->b_flags |= ARC_INDIRECT; 2545 } else { 2546 /* this block is in the ghost cache */ 2547 ASSERT(GHOST_STATE(hdr->b_state)); 2548 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2549 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2550 ASSERT(hdr->b_buf == NULL); 2551 2552 /* if this is a prefetch, we don't have a reference */ 2553 if (*arc_flags & ARC_PREFETCH) 2554 hdr->b_flags |= ARC_PREFETCH; 2555 else 2556 add_reference(hdr, hash_lock, private); 2557 buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 2558 buf->b_hdr = hdr; 2559 buf->b_data = NULL; 2560 buf->b_efunc = NULL; 2561 buf->b_private = NULL; 2562 buf->b_next = NULL; 2563 hdr->b_buf = buf; 2564 arc_get_data_buf(buf); 2565 ASSERT(hdr->b_datacnt == 0); 2566 hdr->b_datacnt = 1; 2567 2568 } 2569 2570 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2571 acb->acb_done = done; 2572 acb->acb_private = private; 2573 2574 ASSERT(hdr->b_acb == NULL); 2575 hdr->b_acb = acb; 2576 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2577 2578 /* 2579 * If the buffer has been evicted, migrate it to a present state 2580 * before issuing the I/O. Once we drop the hash-table lock, 2581 * the header will be marked as I/O in progress and have an 2582 * attached buffer. At this point, anybody who finds this 2583 * buffer ought to notice that it's legit but has a pending I/O. 
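 * For example, a second arc_read() for the same block that takes the
 * hash lock after we drop it will see ARC_IO_IN_PROGRESS and either
 * cv_wait() for this read to finish or chain its callback onto b_acb,
 * rather than issuing a duplicate I/O.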
2584 */ 2585 2586 if (GHOST_STATE(hdr->b_state)) 2587 arc_access(hdr, hash_lock); 2588 2589 if (hdr->b_l2hdr != NULL) { 2590 vd = hdr->b_l2hdr->b_dev->l2ad_vdev; 2591 addr = hdr->b_l2hdr->b_daddr; 2592 } 2593 2594 mutex_exit(hash_lock); 2595 2596 ASSERT3U(hdr->b_size, ==, size); 2597 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2598 zbookmark_t *, zb); 2599 ARCSTAT_BUMP(arcstat_misses); 2600 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2601 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2602 data, metadata, misses); 2603 2604 if (l2arc_ndev != 0) { 2605 /* 2606 * Lock out device removal. 2607 */ 2608 spa_config_enter(spa, RW_READER, FTAG); 2609 2610 /* 2611 * Read from the L2ARC if the following are true: 2612 * 1. The L2ARC vdev was previously cached. 2613 * 2. This buffer still has L2ARC metadata. 2614 * 3. This buffer isn't currently writing to the L2ARC. 2615 * 4. The L2ARC entry wasn't evicted, which may 2616 * also have invalidated the vdev. 2617 */ 2618 if (vd != NULL && hdr->b_l2hdr != NULL && 2619 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { 2620 l2arc_read_callback_t *cb; 2621 2622 if (vdev_is_dead(vd)) 2623 goto l2skip; 2624 2625 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 2626 ARCSTAT_BUMP(arcstat_l2_hits); 2627 2628 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 2629 KM_SLEEP); 2630 cb->l2rcb_buf = buf; 2631 cb->l2rcb_spa = spa; 2632 cb->l2rcb_bp = *bp; 2633 cb->l2rcb_zb = *zb; 2634 cb->l2rcb_flags = flags; 2635 2636 /* 2637 * l2arc read. 2638 */ 2639 rzio = zio_read_phys(pio, vd, addr, size, 2640 buf->b_data, ZIO_CHECKSUM_OFF, 2641 l2arc_read_done, cb, priority, flags | 2642 ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL, 2643 B_FALSE); 2644 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 2645 zio_t *, rzio); 2646 spa_config_exit(spa, FTAG); 2647 2648 if (*arc_flags & ARC_NOWAIT) { 2649 zio_nowait(rzio); 2650 return (0); 2651 } 2652 2653 ASSERT(*arc_flags & ARC_WAIT); 2654 if (zio_wait(rzio) == 0) 2655 return (0); 2656 2657 /* l2arc read error; goto zio_read() */ 2658 } else { 2659 DTRACE_PROBE1(l2arc__miss, 2660 arc_buf_hdr_t *, hdr); 2661 ARCSTAT_BUMP(arcstat_l2_misses); 2662 if (HDR_L2_WRITING(hdr)) 2663 ARCSTAT_BUMP(arcstat_l2_rw_clash); 2664 l2skip: 2665 spa_config_exit(spa, FTAG); 2666 } 2667 } 2668 2669 rzio = zio_read(pio, spa, bp, buf->b_data, size, 2670 arc_read_done, buf, priority, flags, zb); 2671 2672 if (*arc_flags & ARC_WAIT) 2673 return (zio_wait(rzio)); 2674 2675 ASSERT(*arc_flags & ARC_NOWAIT); 2676 zio_nowait(rzio); 2677 } 2678 return (0); 2679 } 2680 2681 /* 2682 * arc_read() variant to support pool traversal. If the block is already 2683 * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 2684 * The idea is that we don't want pool traversal filling up memory, but 2685 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 
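 * A hypothetical caller might look like this (sketch only; the
 * traversal helper name is made up, not part of this file):
 *
 *	void *data = kmem_alloc(BP_GET_LSIZE(bp), KM_SLEEP);
 *	if (arc_tryread(spa, bp, data) == ENOENT)
 *		(void) traverse_read(spa, bp, data);	(caller's own I/O)
 *
 * A hit costs only a bcopy(); a miss returns ENOENT and leaves the
 * caller to perform, and pay for, the read itself.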
2686 */ 2687 int 2688 arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2689 { 2690 arc_buf_hdr_t *hdr; 2691 kmutex_t *hash_mtx; 2692 int rc = 0; 2693 2694 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2695 2696 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2697 arc_buf_t *buf = hdr->b_buf; 2698 2699 ASSERT(buf); 2700 while (buf->b_data == NULL) { 2701 buf = buf->b_next; 2702 ASSERT(buf); 2703 } 2704 bcopy(buf->b_data, data, hdr->b_size); 2705 } else { 2706 rc = ENOENT; 2707 } 2708 2709 if (hash_mtx) 2710 mutex_exit(hash_mtx); 2711 2712 return (rc); 2713 } 2714 2715 void 2716 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2717 { 2718 ASSERT(buf->b_hdr != NULL); 2719 ASSERT(buf->b_hdr->b_state != arc_anon); 2720 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2721 buf->b_efunc = func; 2722 buf->b_private = private; 2723 } 2724 2725 /* 2726 * This is used by the DMU to let the ARC know that a buffer is 2727 * being evicted, so the ARC should clean up. If this arc buf 2728 * is not yet in the evicted state, it will be put there. 2729 */ 2730 int 2731 arc_buf_evict(arc_buf_t *buf) 2732 { 2733 arc_buf_hdr_t *hdr; 2734 kmutex_t *hash_lock; 2735 arc_buf_t **bufp; 2736 2737 mutex_enter(&arc_eviction_mtx); 2738 hdr = buf->b_hdr; 2739 if (hdr == NULL) { 2740 /* 2741 * We are in arc_do_user_evicts(). 2742 */ 2743 ASSERT(buf->b_data == NULL); 2744 mutex_exit(&arc_eviction_mtx); 2745 return (0); 2746 } 2747 hash_lock = HDR_LOCK(hdr); 2748 mutex_exit(&arc_eviction_mtx); 2749 2750 mutex_enter(hash_lock); 2751 2752 if (buf->b_data == NULL) { 2753 /* 2754 * We are on the eviction list. 2755 */ 2756 mutex_exit(hash_lock); 2757 mutex_enter(&arc_eviction_mtx); 2758 if (buf->b_hdr == NULL) { 2759 /* 2760 * We are already in arc_do_user_evicts(). 2761 */ 2762 mutex_exit(&arc_eviction_mtx); 2763 return (0); 2764 } else { 2765 arc_buf_t copy = *buf; /* structure assignment */ 2766 /* 2767 * Process this buffer now 2768 * but let arc_do_user_evicts() do the reaping. 2769 */ 2770 buf->b_efunc = NULL; 2771 mutex_exit(&arc_eviction_mtx); 2772 VERIFY(copy.b_efunc(&copy) == 0); 2773 return (1); 2774 } 2775 } 2776 2777 ASSERT(buf->b_hdr == hdr); 2778 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2779 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2780 2781 /* 2782 * Pull this buffer off of the hdr 2783 */ 2784 bufp = &hdr->b_buf; 2785 while (*bufp != buf) 2786 bufp = &(*bufp)->b_next; 2787 *bufp = buf->b_next; 2788 2789 ASSERT(buf->b_data != NULL); 2790 arc_buf_destroy(buf, FALSE, FALSE); 2791 2792 if (hdr->b_datacnt == 0) { 2793 arc_state_t *old_state = hdr->b_state; 2794 arc_state_t *evicted_state; 2795 2796 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2797 2798 evicted_state = 2799 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2800 2801 mutex_enter(&old_state->arcs_mtx); 2802 mutex_enter(&evicted_state->arcs_mtx); 2803 2804 arc_change_state(evicted_state, hdr, hash_lock); 2805 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2806 hdr->b_flags |= ARC_IN_HASH_TABLE; 2807 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2808 2809 mutex_exit(&evicted_state->arcs_mtx); 2810 mutex_exit(&old_state->arcs_mtx); 2811 } 2812 mutex_exit(hash_lock); 2813 2814 VERIFY(buf->b_efunc(buf) == 0); 2815 buf->b_efunc = NULL; 2816 buf->b_private = NULL; 2817 buf->b_hdr = NULL; 2818 kmem_cache_free(buf_cache, buf); 2819 return (1); 2820 } 2821 2822 /* 2823 * Release this buffer from the cache.
This must be done 2824 * after a read and prior to modifying the buffer contents. 2825 * If the buffer has more than one reference, we must make 2826 * a new hdr for the buffer. 2827 */ 2828 void 2829 arc_release(arc_buf_t *buf, void *tag) 2830 { 2831 arc_buf_hdr_t *hdr = buf->b_hdr; 2832 kmutex_t *hash_lock = HDR_LOCK(hdr); 2833 l2arc_buf_hdr_t *l2hdr = NULL; 2834 uint64_t buf_size; 2835 2836 /* this buffer is not on any list */ 2837 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 2838 ASSERT(!(hdr->b_flags & ARC_STORED)); 2839 2840 if (hdr->b_state == arc_anon) { 2841 /* this buffer is already released */ 2842 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 2843 ASSERT(BUF_EMPTY(hdr)); 2844 ASSERT(buf->b_efunc == NULL); 2845 arc_buf_thaw(buf); 2846 return; 2847 } 2848 2849 mutex_enter(hash_lock); 2850 2851 /* 2852 * Do we have more than one buf? 2853 */ 2854 if (hdr->b_buf != buf || buf->b_next != NULL) { 2855 arc_buf_hdr_t *nhdr; 2856 arc_buf_t **bufp; 2857 uint64_t blksz = hdr->b_size; 2858 spa_t *spa = hdr->b_spa; 2859 arc_buf_contents_t type = hdr->b_type; 2860 uint32_t flags = hdr->b_flags; 2861 2862 ASSERT(hdr->b_datacnt > 1); 2863 /* 2864 * Pull the data off of this buf and attach it to 2865 * a new anonymous buf. 2866 */ 2867 (void) remove_reference(hdr, hash_lock, tag); 2868 bufp = &hdr->b_buf; 2869 while (*bufp != buf) 2870 bufp = &(*bufp)->b_next; 2871 *bufp = (*bufp)->b_next; 2872 buf->b_next = NULL; 2873 2874 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 2875 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2876 if (refcount_is_zero(&hdr->b_refcnt)) { 2877 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 2878 ASSERT3U(*size, >=, hdr->b_size); 2879 atomic_add_64(size, -hdr->b_size); 2880 } 2881 hdr->b_datacnt -= 1; 2882 if (hdr->b_l2hdr != NULL) { 2883 mutex_enter(&l2arc_buflist_mtx); 2884 l2hdr = hdr->b_l2hdr; 2885 hdr->b_l2hdr = NULL; 2886 buf_size = hdr->b_size; 2887 } 2888 arc_cksum_verify(buf); 2889 2890 mutex_exit(hash_lock); 2891 2892 nhdr = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 2893 nhdr->b_size = blksz; 2894 nhdr->b_spa = spa; 2895 nhdr->b_type = type; 2896 nhdr->b_buf = buf; 2897 nhdr->b_state = arc_anon; 2898 nhdr->b_arc_access = 0; 2899 nhdr->b_flags = flags & ARC_L2_WRITING; 2900 nhdr->b_l2hdr = NULL; 2901 nhdr->b_datacnt = 1; 2902 nhdr->b_freeze_cksum = NULL; 2903 (void) refcount_add(&nhdr->b_refcnt, tag); 2904 buf->b_hdr = nhdr; 2905 atomic_add_64(&arc_anon->arcs_size, blksz); 2906 } else { 2907 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 2908 ASSERT(!list_link_active(&hdr->b_arc_node)); 2909 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2910 arc_change_state(arc_anon, hdr, hash_lock); 2911 hdr->b_arc_access = 0; 2912 if (hdr->b_l2hdr != NULL) { 2913 mutex_enter(&l2arc_buflist_mtx); 2914 l2hdr = hdr->b_l2hdr; 2915 hdr->b_l2hdr = NULL; 2916 buf_size = hdr->b_size; 2917 } 2918 mutex_exit(hash_lock); 2919 2920 bzero(&hdr->b_dva, sizeof (dva_t)); 2921 hdr->b_birth = 0; 2922 hdr->b_cksum0 = 0; 2923 arc_buf_thaw(buf); 2924 } 2925 buf->b_efunc = NULL; 2926 buf->b_private = NULL; 2927 2928 if (l2hdr) { 2929 list_remove(l2hdr->b_dev->l2ad_buflist, hdr); 2930 kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); 2931 ARCSTAT_INCR(arcstat_l2_size, -buf_size); 2932 } 2933 if (MUTEX_HELD(&l2arc_buflist_mtx)) 2934 mutex_exit(&l2arc_buflist_mtx); 2935 } 2936 2937 int 2938 arc_released(arc_buf_t *buf) 2939 { 2940 return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2941 } 2942 2943 int 2944 arc_has_callback(arc_buf_t *buf) 2945 { 2946 return (buf->b_efunc != NULL); 
2947 } 2948 2949 #ifdef ZFS_DEBUG 2950 int 2951 arc_referenced(arc_buf_t *buf) 2952 { 2953 return (refcount_count(&buf->b_hdr->b_refcnt)); 2954 } 2955 #endif 2956 2957 static void 2958 arc_write_ready(zio_t *zio) 2959 { 2960 arc_write_callback_t *callback = zio->io_private; 2961 arc_buf_t *buf = callback->awcb_buf; 2962 arc_buf_hdr_t *hdr = buf->b_hdr; 2963 2964 if (zio->io_error == 0 && callback->awcb_ready) { 2965 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2966 callback->awcb_ready(zio, buf, callback->awcb_private); 2967 } 2968 /* 2969 * If the IO is already in progress, then this is a re-write 2970 * attempt, so we need to thaw and re-compute the cksum. It is 2971 * the responsibility of the callback to handle the freeing 2972 * and accounting for any re-write attempt. If we don't have a 2973 * callback registered then simply free the block here. 2974 */ 2975 if (HDR_IO_IN_PROGRESS(hdr)) { 2976 if (!BP_IS_HOLE(&zio->io_bp_orig) && 2977 callback->awcb_ready == NULL) { 2978 zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg, 2979 &zio->io_bp_orig, NULL, NULL)); 2980 } 2981 mutex_enter(&hdr->b_freeze_lock); 2982 if (hdr->b_freeze_cksum != NULL) { 2983 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 2984 hdr->b_freeze_cksum = NULL; 2985 } 2986 mutex_exit(&hdr->b_freeze_lock); 2987 } 2988 arc_cksum_compute(buf, B_FALSE); 2989 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2990 } 2991 2992 static void 2993 arc_write_done(zio_t *zio) 2994 { 2995 arc_write_callback_t *callback = zio->io_private; 2996 arc_buf_t *buf = callback->awcb_buf; 2997 arc_buf_hdr_t *hdr = buf->b_hdr; 2998 2999 hdr->b_acb = NULL; 3000 3001 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 3002 hdr->b_birth = zio->io_bp->blk_birth; 3003 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 3004 /* 3005 * If the block to be written was all-zero, we may have 3006 * compressed it away. In this case no write was performed 3007 * so there will be no dva/birth-date/checksum. The buffer 3008 * must therefor remain anonymous (and uncached). 3009 */ 3010 if (!BUF_EMPTY(hdr)) { 3011 arc_buf_hdr_t *exists; 3012 kmutex_t *hash_lock; 3013 3014 arc_cksum_verify(buf); 3015 3016 exists = buf_hash_insert(hdr, &hash_lock); 3017 if (exists) { 3018 /* 3019 * This can only happen if we overwrite for 3020 * sync-to-convergence, because we remove 3021 * buffers from the hash table when we arc_free(). 3022 */ 3023 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 3024 BP_IDENTITY(zio->io_bp))); 3025 ASSERT3U(zio->io_bp_orig.blk_birth, ==, 3026 zio->io_bp->blk_birth); 3027 3028 ASSERT(refcount_is_zero(&exists->b_refcnt)); 3029 arc_change_state(arc_anon, exists, hash_lock); 3030 mutex_exit(hash_lock); 3031 arc_hdr_destroy(exists); 3032 exists = buf_hash_insert(hdr, &hash_lock); 3033 ASSERT3P(exists, ==, NULL); 3034 } 3035 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3036 /* if it's not anon, we are doing a scrub */ 3037 if (hdr->b_state == arc_anon) 3038 arc_access(hdr, hash_lock); 3039 mutex_exit(hash_lock); 3040 } else if (callback->awcb_done == NULL) { 3041 int destroy_hdr; 3042 /* 3043 * This is an anonymous buffer with no user callback, 3044 * destroy it if there are no active references. 
3045 */ 3046 mutex_enter(&arc_eviction_mtx); 3047 destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 3048 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3049 mutex_exit(&arc_eviction_mtx); 3050 if (destroy_hdr) 3051 arc_hdr_destroy(hdr); 3052 } else { 3053 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 3054 } 3055 hdr->b_flags &= ~ARC_STORED; 3056 3057 if (callback->awcb_done) { 3058 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 3059 callback->awcb_done(zio, buf, callback->awcb_private); 3060 } 3061 3062 kmem_free(callback, sizeof (arc_write_callback_t)); 3063 } 3064 3065 static void 3066 write_policy(spa_t *spa, const writeprops_t *wp, 3067 int *cksump, int *compp, int *copiesp) 3068 { 3069 int copies = wp->wp_copies; 3070 boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); 3071 3072 /* Determine copies setting */ 3073 if (ismd) 3074 copies++; 3075 *copiesp = MIN(copies, spa_max_replication(spa)); 3076 3077 /* Determine checksum setting */ 3078 if (ismd) { 3079 /* 3080 * Metadata always gets checksummed. If the data 3081 * checksum is multi-bit correctable, and it's not a 3082 * ZBT-style checksum, then it's suitable for metadata 3083 * as well. Otherwise, the metadata checksum defaults 3084 * to fletcher4. 3085 */ 3086 if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && 3087 !zio_checksum_table[wp->wp_oschecksum].ci_zbt) 3088 *cksump = wp->wp_oschecksum; 3089 else 3090 *cksump = ZIO_CHECKSUM_FLETCHER_4; 3091 } else { 3092 *cksump = zio_checksum_select(wp->wp_dnchecksum, 3093 wp->wp_oschecksum); 3094 } 3095 3096 /* Determine compression setting */ 3097 if (ismd) { 3098 /* 3099 * XXX -- we should design a compression algorithm 3100 * that specializes in arrays of bps. 3101 */ 3102 *compp = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : 3103 ZIO_COMPRESS_LZJB; 3104 } else { 3105 *compp = zio_compress_select(wp->wp_dncompress, 3106 wp->wp_oscompress); 3107 } 3108 } 3109 3110 zio_t * 3111 arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, 3112 uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 3113 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 3114 int flags, const zbookmark_t *zb) 3115 { 3116 arc_buf_hdr_t *hdr = buf->b_hdr; 3117 arc_write_callback_t *callback; 3118 zio_t *zio; 3119 int cksum, comp, copies; 3120 3121 ASSERT(!HDR_IO_ERROR(hdr)); 3122 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 3123 ASSERT(hdr->b_acb == 0); 3124 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 3125 callback->awcb_ready = ready; 3126 callback->awcb_done = done; 3127 callback->awcb_private = private; 3128 callback->awcb_buf = buf; 3129 3130 write_policy(spa, wp, &cksum, &comp, &copies); 3131 zio = zio_write(pio, spa, cksum, comp, copies, txg, bp, 3132 buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, 3133 callback, priority, flags, zb); 3134 3135 return (zio); 3136 } 3137 3138 int 3139 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 3140 zio_done_func_t *done, void *private, uint32_t arc_flags) 3141 { 3142 arc_buf_hdr_t *ab; 3143 kmutex_t *hash_lock; 3144 zio_t *zio; 3145 3146 /* 3147 * If this buffer is in the cache, release it, so it 3148 * can be re-used. 3149 */ 3150 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 3151 if (ab != NULL) { 3152 /* 3153 * The checksum of blocks to free is not always 3154 * preserved (eg. on the deadlist). However, if it is 3155 * nonzero, it should match what we have in the cache. 
3156 */ 3157 ASSERT(bp->blk_cksum.zc_word[0] == 0 || 3158 ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 3159 if (ab->b_state != arc_anon) 3160 arc_change_state(arc_anon, ab, hash_lock); 3161 if (HDR_IO_IN_PROGRESS(ab)) { 3162 /* 3163 * This should only happen when we prefetch. 3164 */ 3165 ASSERT(ab->b_flags & ARC_PREFETCH); 3166 ASSERT3U(ab->b_datacnt, ==, 1); 3167 ab->b_flags |= ARC_FREED_IN_READ; 3168 if (HDR_IN_HASH_TABLE(ab)) 3169 buf_hash_remove(ab); 3170 ab->b_arc_access = 0; 3171 bzero(&ab->b_dva, sizeof (dva_t)); 3172 ab->b_birth = 0; 3173 ab->b_cksum0 = 0; 3174 ab->b_buf->b_efunc = NULL; 3175 ab->b_buf->b_private = NULL; 3176 mutex_exit(hash_lock); 3177 } else if (refcount_is_zero(&ab->b_refcnt)) { 3178 ab->b_flags |= ARC_FREE_IN_PROGRESS; 3179 mutex_exit(hash_lock); 3180 arc_hdr_destroy(ab); 3181 ARCSTAT_BUMP(arcstat_deleted); 3182 } else { 3183 /* 3184 * We still have an active reference on this 3185 * buffer. This can happen, e.g., from 3186 * dbuf_unoverride(). 3187 */ 3188 ASSERT(!HDR_IN_HASH_TABLE(ab)); 3189 ab->b_arc_access = 0; 3190 bzero(&ab->b_dva, sizeof (dva_t)); 3191 ab->b_birth = 0; 3192 ab->b_cksum0 = 0; 3193 ab->b_buf->b_efunc = NULL; 3194 ab->b_buf->b_private = NULL; 3195 mutex_exit(hash_lock); 3196 } 3197 } 3198 3199 zio = zio_free(pio, spa, txg, bp, done, private); 3200 3201 if (arc_flags & ARC_WAIT) 3202 return (zio_wait(zio)); 3203 3204 ASSERT(arc_flags & ARC_NOWAIT); 3205 zio_nowait(zio); 3206 3207 return (0); 3208 } 3209 3210 static int 3211 arc_memory_throttle(uint64_t reserve, uint64_t txg) 3212 { 3213 #ifdef _KERNEL 3214 uint64_t inflight_data = arc_anon->arcs_size; 3215 uint64_t available_memory = ptob(freemem); 3216 static uint64_t page_load = 0; 3217 static uint64_t last_txg = 0; 3218 3219 #if defined(__i386) 3220 available_memory = 3221 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 3222 #endif 3223 if (available_memory >= zfs_write_limit_max) 3224 return (0); 3225 3226 if (txg > last_txg) { 3227 last_txg = txg; 3228 page_load = 0; 3229 } 3230 /* 3231 * If we are in pageout, we know that memory is already tight, 3232 * the arc is already going to be evicting, so we just want to 3233 * continue to let page writes occur as quickly as possible. 3234 */ 3235 if (curproc == proc_pageout) { 3236 if (page_load > MAX(ptob(minfree), available_memory) / 4) 3237 return (ERESTART); 3238 /* Note: reserve is inflated, so we deflate */ 3239 page_load += reserve / 8; 3240 return (0); 3241 } else if (page_load > 0 && arc_reclaim_needed()) { 3242 /* memory is low, delay before restarting */ 3243 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3244 return (EAGAIN); 3245 } 3246 page_load = 0; 3247 3248 if (arc_size > arc_c_min) { 3249 uint64_t evictable_memory = 3250 arc_mru->arcs_lsize[ARC_BUFC_DATA] + 3251 arc_mru->arcs_lsize[ARC_BUFC_METADATA] + 3252 arc_mfu->arcs_lsize[ARC_BUFC_DATA] + 3253 arc_mfu->arcs_lsize[ARC_BUFC_METADATA]; 3254 available_memory += MIN(evictable_memory, arc_size - arc_c_min); 3255 } 3256 3257 if (inflight_data > available_memory / 4) { 3258 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 3259 return (ERESTART); 3260 } 3261 #endif 3262 return (0); 3263 } 3264 3265 void 3266 arc_tempreserve_clear(uint64_t reserve) 3267 { 3268 atomic_add_64(&arc_tempreserve, -reserve); 3269 ASSERT((int64_t)arc_tempreserve >= 0); 3270 } 3271 3272 int 3273 arc_tempreserve_space(uint64_t reserve, uint64_t txg) 3274 { 3275 int error; 3276 3277 #ifdef ZFS_DEBUG 3278 /* 3279 * Once in a while, fail for no reason. Everything should cope. 
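 * ("Once in a while" here means roughly one call in ten thousand, per
 * the spa_get_random(10000) check below.)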
3280 */ 3281 if (spa_get_random(10000) == 0) { 3282 dprintf("forcing random failure\n"); 3283 return (ERESTART); 3284 } 3285 #endif 3286 if (reserve > arc_c/4 && !arc_no_grow) 3287 arc_c = MIN(arc_c_max, reserve * 4); 3288 if (reserve > arc_c) 3289 return (ENOMEM); 3290 3291 /* 3292 * Writes will, almost always, require additional memory allocations 3293 * in order to compress/encrypt/etc the data. We therefor need to 3294 * make sure that there is sufficient available memory for this. 3295 */ 3296 if (error = arc_memory_throttle(reserve, txg)) 3297 return (error); 3298 3299 /* 3300 * Throttle writes when the amount of dirty data in the cache 3301 * gets too large. We try to keep the cache less than half full 3302 * of dirty blocks so that our sync times don't grow too large. 3303 * Note: if two requests come in concurrently, we might let them 3304 * both succeed, when one of them should fail. Not a huge deal. 3305 */ 3306 if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 3307 arc_anon->arcs_size > arc_c / 4) { 3308 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 3309 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 3310 arc_tempreserve>>10, 3311 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 3312 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 3313 reserve>>10, arc_c>>10); 3314 return (ERESTART); 3315 } 3316 atomic_add_64(&arc_tempreserve, reserve); 3317 return (0); 3318 } 3319 3320 void 3321 arc_init(void) 3322 { 3323 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 3324 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 3325 3326 /* Convert seconds to clock ticks */ 3327 arc_min_prefetch_lifespan = 1 * hz; 3328 3329 /* Start out with 1/8 of all memory */ 3330 arc_c = physmem * PAGESIZE / 8; 3331 3332 #ifdef _KERNEL 3333 /* 3334 * On architectures where the physical memory can be larger 3335 * than the addressable space (intel in 32-bit mode), we may 3336 * need to limit the cache to 1/8 of VM size. 3337 */ 3338 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 3339 #endif 3340 3341 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 3342 arc_c_min = MAX(arc_c / 4, 64<<20); 3343 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 3344 if (arc_c * 8 >= 1<<30) 3345 arc_c_max = (arc_c * 8) - (1<<30); 3346 else 3347 arc_c_max = arc_c_min; 3348 arc_c_max = MAX(arc_c * 6, arc_c_max); 3349 3350 /* 3351 * Allow the tunables to override our calculations if they are 3352 * reasonable (ie. 
over 64MB) 3353 */ 3354 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 3355 arc_c_max = zfs_arc_max; 3356 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 3357 arc_c_min = zfs_arc_min; 3358 3359 arc_c = arc_c_max; 3360 arc_p = (arc_c >> 1); 3361 3362 /* limit meta-data to 1/4 of the arc capacity */ 3363 arc_meta_limit = arc_c_max / 4; 3364 3365 /* Allow the tunable to override if it is reasonable */ 3366 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 3367 arc_meta_limit = zfs_arc_meta_limit; 3368 3369 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 3370 arc_c_min = arc_meta_limit / 2; 3371 3372 /* if kmem_flags are set, lets try to use less memory */ 3373 if (kmem_debugging()) 3374 arc_c = arc_c / 2; 3375 if (arc_c < arc_c_min) 3376 arc_c = arc_c_min; 3377 3378 arc_anon = &ARC_anon; 3379 arc_mru = &ARC_mru; 3380 arc_mru_ghost = &ARC_mru_ghost; 3381 arc_mfu = &ARC_mfu; 3382 arc_mfu_ghost = &ARC_mfu_ghost; 3383 arc_l2c_only = &ARC_l2c_only; 3384 arc_size = 0; 3385 3386 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3387 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3388 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3389 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3390 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3391 mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 3392 3393 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 3394 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3395 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 3396 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3397 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 3398 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3399 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 3400 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3401 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 3402 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3403 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 3404 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3405 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 3406 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3407 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 3408 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3409 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], 3410 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3411 list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], 3412 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 3413 3414 buf_init(); 3415 3416 arc_thread_exit = 0; 3417 arc_eviction_list = NULL; 3418 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 3419 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 3420 3421 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 3422 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 3423 3424 if (arc_ksp != NULL) { 3425 arc_ksp->ks_data = &arc_stats; 3426 kstat_install(arc_ksp); 3427 } 3428 3429 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 3430 TS_RUN, minclsyspri); 3431 3432 arc_dead = FALSE; 3433 arc_warm = B_FALSE; 3434 3435 if (zfs_write_limit_max == 0) 3436 zfs_write_limit_max = physmem * PAGESIZE >> 3437 zfs_write_limit_shift; 3438 else 3439 zfs_write_limit_shift = 0; 3440 } 3441 3442 void 3443 arc_fini(void) 3444 { 3445 mutex_enter(&arc_reclaim_thr_lock); 3446 
arc_thread_exit = 1; 3447 while (arc_thread_exit != 0) 3448 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 3449 mutex_exit(&arc_reclaim_thr_lock); 3450 3451 arc_flush(NULL); 3452 3453 arc_dead = TRUE; 3454 3455 if (arc_ksp != NULL) { 3456 kstat_delete(arc_ksp); 3457 arc_ksp = NULL; 3458 } 3459 3460 mutex_destroy(&arc_eviction_mtx); 3461 mutex_destroy(&arc_reclaim_thr_lock); 3462 cv_destroy(&arc_reclaim_thr_cv); 3463 3464 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 3465 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 3466 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 3467 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 3468 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 3469 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 3470 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 3471 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 3472 3473 mutex_destroy(&arc_anon->arcs_mtx); 3474 mutex_destroy(&arc_mru->arcs_mtx); 3475 mutex_destroy(&arc_mru_ghost->arcs_mtx); 3476 mutex_destroy(&arc_mfu->arcs_mtx); 3477 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 3478 3479 buf_fini(); 3480 } 3481 3482 /* 3483 * Level 2 ARC 3484 * 3485 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 3486 * It uses dedicated storage devices to hold cached data, which are populated 3487 * using large infrequent writes. The main role of this cache is to boost 3488 * the performance of random read workloads. The intended L2ARC devices 3489 * include short-stroked disks, solid state disks, and other media with 3490 * substantially faster read latency than disk. 3491 * 3492 * +-----------------------+ 3493 * | ARC | 3494 * +-----------------------+ 3495 * | ^ ^ 3496 * | | | 3497 * l2arc_feed_thread() arc_read() 3498 * | | | 3499 * | l2arc read | 3500 * V | | 3501 * +---------------+ | 3502 * | L2ARC | | 3503 * +---------------+ | 3504 * | ^ | 3505 * l2arc_write() | | 3506 * | | | 3507 * V | | 3508 * +-------+ +-------+ 3509 * | vdev | | vdev | 3510 * | cache | | cache | 3511 * +-------+ +-------+ 3512 * +=========+ .-----. 3513 * : L2ARC : |-_____-| 3514 * : devices : | Disks | 3515 * +=========+ `-_____-' 3516 * 3517 * Read requests are satisfied from the following sources, in order: 3518 * 3519 * 1) ARC 3520 * 2) vdev cache of L2ARC devices 3521 * 3) L2ARC devices 3522 * 4) vdev cache of disks 3523 * 5) disks 3524 * 3525 * Some L2ARC device types exhibit extremely slow write performance. 3526 * To accommodate for this there are some significant differences between 3527 * the L2ARC and traditional cache design: 3528 * 3529 * 1. There is no eviction path from the ARC to the L2ARC. Evictions from 3530 * the ARC behave as usual, freeing buffers and placing headers on ghost 3531 * lists. The ARC does not send buffers to the L2ARC during eviction as 3532 * this would add inflated write latencies for all ARC memory pressure. 3533 * 3534 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 3535 * It does this by periodically scanning buffers from the eviction-end of 3536 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 3537 * not already there. It scans until a headroom of buffers is satisfied, 3538 * which itself is a buffer for ARC eviction. 
The thread that does this is 3539 * l2arc_feed_thread(), illustrated below; example sizes are included to 3540 * provide a better sense of ratio than this diagram: 3541 * 3542 * head --> tail 3543 * +---------------------+----------+ 3544 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 3545 * +---------------------+----------+ | o L2ARC eligible 3546 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 3547 * +---------------------+----------+ | 3548 * 15.9 Gbytes ^ 32 Mbytes | 3549 * headroom | 3550 * l2arc_feed_thread() 3551 * | 3552 * l2arc write hand <--[oooo]--' 3553 * | 8 Mbyte 3554 * | write max 3555 * V 3556 * +==============================+ 3557 * L2ARC dev |####|#|###|###| |####| ... | 3558 * +==============================+ 3559 * 32 Gbytes 3560 * 3561 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 3562 * evicted, then the L2ARC has cached a buffer much sooner than it probably 3563 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 3564 * safe to say that this is an uncommon case, since buffers at the end of 3565 * the ARC lists have moved there due to inactivity. 3566 * 3567 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 3568 * then the L2ARC simply misses copying some buffers. This serves as a 3569 * pressure valve to prevent heavy read workloads from both stalling the ARC 3570 * with waits and clogging the L2ARC with writes. This also helps prevent 3571 * the potential for the L2ARC to churn if it attempts to cache content too 3572 * quickly, such as during backups of the entire pool. 3573 * 3574 * 5. After system boot and before the ARC has filled main memory, there are 3575 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 3576 * lists can remain mostly static. Instead of searching from tail of these 3577 * lists as pictured, the l2arc_feed_thread() will search from the list heads 3578 * for eligible buffers, greatly increasing its chance of finding them. 3579 * 3580 * The L2ARC device write speed is also boosted during this time so that 3581 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 3582 * there are no L2ARC reads, and no fear of degrading read performance 3583 * through increased writes. 3584 * 3585 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 3586 * the vdev queue can aggregate them into larger and fewer writes. Each 3587 * device is written to in a rotor fashion, sweeping writes through 3588 * available space then repeating. 3589 * 3590 * 7. The L2ARC does not store dirty content. It never needs to flush 3591 * write buffers back to disk based storage. 3592 * 3593 * 8. If an ARC buffer is written (and dirtied) which also exists in the 3594 * L2ARC, the now stale L2ARC buffer is immediately dropped. 3595 * 3596 * The performance of the L2ARC can be tweaked by a number of tunables, which 3597 * may be necessary for different workloads: 3598 * 3599 * l2arc_write_max max write bytes per interval 3600 * l2arc_write_boost extra write bytes during device warmup 3601 * l2arc_noprefetch skip caching prefetched buffers 3602 * l2arc_headroom number of max device writes to precache 3603 * l2arc_feed_secs seconds between L2ARC writing 3604 * 3605 * Tunables may be removed or added as future performance improvements are 3606 * integrated, and also may become zpool properties. 
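 * As a worked example of how these tunables interact (using the sizes
 * in the diagram above for illustration, not necessarily the shipped
 * defaults): with an 8 Mbyte l2arc_write_max, an l2arc_feed_secs of
 * one second and an l2arc_headroom of 2, each feed cycle scans roughly
 * 2 x 8 Mbytes from the list tails and writes at most 8 Mbytes, so a
 * cold 32 Gbyte device takes on the order of an hour to fill, even
 * before l2arc_write_boost raises the rate during warmup.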
3607 */ 3608 3609 static void 3610 l2arc_hdr_stat_add(void) 3611 { 3612 ARCSTAT_INCR(arcstat_l2_hdr_size, HDR_SIZE + L2HDR_SIZE); 3613 ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); 3614 } 3615 3616 static void 3617 l2arc_hdr_stat_remove(void) 3618 { 3619 ARCSTAT_INCR(arcstat_l2_hdr_size, -(HDR_SIZE + L2HDR_SIZE)); 3620 ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); 3621 } 3622 3623 /* 3624 * Cycle through L2ARC devices. This is how L2ARC load balances. 3625 * If a device is returned, this also returns holding the spa config lock. 3626 */ 3627 static l2arc_dev_t * 3628 l2arc_dev_get_next(void) 3629 { 3630 l2arc_dev_t *first, *next = NULL; 3631 3632 /* 3633 * Lock out the removal of spas (spa_namespace_lock), then removal 3634 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 3635 * both locks will be dropped and a spa config lock held instead. 3636 */ 3637 mutex_enter(&spa_namespace_lock); 3638 mutex_enter(&l2arc_dev_mtx); 3639 3640 /* if there are no vdevs, there is nothing to do */ 3641 if (l2arc_ndev == 0) 3642 goto out; 3643 3644 first = NULL; 3645 next = l2arc_dev_last; 3646 do { 3647 /* loop around the list looking for a non-faulted vdev */ 3648 if (next == NULL) { 3649 next = list_head(l2arc_dev_list); 3650 } else { 3651 next = list_next(l2arc_dev_list, next); 3652 if (next == NULL) 3653 next = list_head(l2arc_dev_list); 3654 } 3655 3656 /* if we have come back to the start, bail out */ 3657 if (first == NULL) 3658 first = next; 3659 else if (next == first) 3660 break; 3661 3662 } while (vdev_is_dead(next->l2ad_vdev)); 3663 3664 /* if we were unable to find any usable vdevs, return NULL */ 3665 if (vdev_is_dead(next->l2ad_vdev)) 3666 next = NULL; 3667 3668 l2arc_dev_last = next; 3669 3670 out: 3671 mutex_exit(&l2arc_dev_mtx); 3672 3673 /* 3674 * Grab the config lock to prevent the 'next' device from being 3675 * removed while we are writing to it. 3676 */ 3677 if (next != NULL) 3678 spa_config_enter(next->l2ad_spa, RW_READER, next); 3679 mutex_exit(&spa_namespace_lock); 3680 3681 return (next); 3682 } 3683 3684 /* 3685 * Free buffers that were tagged for destruction. 3686 */ 3687 static void 3688 l2arc_do_free_on_write() 3689 { 3690 list_t *buflist; 3691 l2arc_data_free_t *df, *df_prev; 3692 3693 mutex_enter(&l2arc_free_on_write_mtx); 3694 buflist = l2arc_free_on_write; 3695 3696 for (df = list_tail(buflist); df; df = df_prev) { 3697 df_prev = list_prev(buflist, df); 3698 ASSERT(df->l2df_data != NULL); 3699 ASSERT(df->l2df_func != NULL); 3700 df->l2df_func(df->l2df_data, df->l2df_size); 3701 list_remove(buflist, df); 3702 kmem_free(df, sizeof (l2arc_data_free_t)); 3703 } 3704 3705 mutex_exit(&l2arc_free_on_write_mtx); 3706 } 3707 3708 /* 3709 * A write to a cache device has completed. Update all headers to allow 3710 * reads from these buffers to begin. 
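 * If the write failed, the L2ARC entries for these buffers are dropped
 * instead, so reads continue to be served from the ARC or the main pool
 * devices; no dirty data is ever at risk (see item 7 above).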
3711 */ 3712 static void 3713 l2arc_write_done(zio_t *zio) 3714 { 3715 l2arc_write_callback_t *cb; 3716 l2arc_dev_t *dev; 3717 list_t *buflist; 3718 arc_buf_hdr_t *head, *ab, *ab_prev; 3719 l2arc_buf_hdr_t *abl2; 3720 kmutex_t *hash_lock; 3721 3722 cb = zio->io_private; 3723 ASSERT(cb != NULL); 3724 dev = cb->l2wcb_dev; 3725 ASSERT(dev != NULL); 3726 head = cb->l2wcb_head; 3727 ASSERT(head != NULL); 3728 buflist = dev->l2ad_buflist; 3729 ASSERT(buflist != NULL); 3730 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 3731 l2arc_write_callback_t *, cb); 3732 3733 if (zio->io_error != 0) 3734 ARCSTAT_BUMP(arcstat_l2_writes_error); 3735 3736 mutex_enter(&l2arc_buflist_mtx); 3737 3738 /* 3739 * All writes completed, or an error was hit. 3740 */ 3741 for (ab = list_prev(buflist, head); ab; ab = ab_prev) { 3742 ab_prev = list_prev(buflist, ab); 3743 3744 hash_lock = HDR_LOCK(ab); 3745 if (!mutex_tryenter(hash_lock)) { 3746 /* 3747 * This buffer misses out. It may be in a stage 3748 * of eviction. Its ARC_L2_WRITING flag will be 3749 * left set, denying reads to this buffer. 3750 */ 3751 ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); 3752 continue; 3753 } 3754 3755 if (zio->io_error != 0) { 3756 /* 3757 * Error - drop L2ARC entry. 3758 */ 3759 list_remove(buflist, ab); 3760 abl2 = ab->b_l2hdr; 3761 ab->b_l2hdr = NULL; 3762 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 3763 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 3764 } 3765 3766 /* 3767 * Allow ARC to begin reads to this L2ARC entry. 3768 */ 3769 ab->b_flags &= ~ARC_L2_WRITING; 3770 3771 mutex_exit(hash_lock); 3772 } 3773 3774 atomic_inc_64(&l2arc_writes_done); 3775 list_remove(buflist, head); 3776 kmem_cache_free(hdr_cache, head); 3777 mutex_exit(&l2arc_buflist_mtx); 3778 3779 l2arc_do_free_on_write(); 3780 3781 kmem_free(cb, sizeof (l2arc_write_callback_t)); 3782 } 3783 3784 /* 3785 * A read to a cache device completed. Validate buffer contents before 3786 * handing over to the regular ARC routines. 3787 */ 3788 static void 3789 l2arc_read_done(zio_t *zio) 3790 { 3791 l2arc_read_callback_t *cb; 3792 arc_buf_hdr_t *hdr; 3793 arc_buf_t *buf; 3794 zio_t *rzio; 3795 kmutex_t *hash_lock; 3796 int equal; 3797 3798 cb = zio->io_private; 3799 ASSERT(cb != NULL); 3800 buf = cb->l2rcb_buf; 3801 ASSERT(buf != NULL); 3802 hdr = buf->b_hdr; 3803 ASSERT(hdr != NULL); 3804 3805 hash_lock = HDR_LOCK(hdr); 3806 mutex_enter(hash_lock); 3807 3808 /* 3809 * Check this survived the L2ARC journey. 3810 */ 3811 equal = arc_cksum_equal(buf); 3812 if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { 3813 mutex_exit(hash_lock); 3814 zio->io_private = buf; 3815 arc_read_done(zio); 3816 } else { 3817 mutex_exit(hash_lock); 3818 /* 3819 * Buffer didn't survive caching. Increment stats and 3820 * reissue to the original storage device. 3821 */ 3822 if (zio->io_error != 0) { 3823 ARCSTAT_BUMP(arcstat_l2_io_error); 3824 } else { 3825 zio->io_error = EIO; 3826 } 3827 if (!equal) 3828 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 3829 3830 if (zio->io_waiter == NULL) { 3831 /* 3832 * Let the resent I/O call arc_read_done() instead. 3833 */ 3834 zio->io_done = NULL; 3835 zio->io_flags &= ~ZIO_FLAG_DONT_CACHE; 3836 3837 rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp, 3838 buf->b_data, zio->io_size, arc_read_done, buf, 3839 zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); 3840 3841 (void) zio_nowait(rzio); 3842 } 3843 } 3844 3845 kmem_free(cb, sizeof (l2arc_read_callback_t)); 3846 } 3847 3848 /* 3849 * This is the list priority from which the L2ARC will search for pages to 3850 * cache. 
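 * Only the MRU and MFU lists are scanned; the other ARC states are not
 * candidates for copying to an L2ARC device.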
This is used within loops (0..3) to cycle through lists in the 3851 * desired order. This order can have a significant effect on cache 3852 * performance. 3853 * 3854 * Currently the metadata lists are hit first, MFU then MRU, followed by 3855 * the data lists. This function returns a locked list, and also returns 3856 * the lock pointer. 3857 */ 3858 static list_t * 3859 l2arc_list_locked(int list_num, kmutex_t **lock) 3860 { 3861 list_t *list; 3862 3863 ASSERT(list_num >= 0 && list_num <= 3); 3864 3865 switch (list_num) { 3866 case 0: 3867 list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; 3868 *lock = &arc_mfu->arcs_mtx; 3869 break; 3870 case 1: 3871 list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; 3872 *lock = &arc_mru->arcs_mtx; 3873 break; 3874 case 2: 3875 list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; 3876 *lock = &arc_mfu->arcs_mtx; 3877 break; 3878 case 3: 3879 list = &arc_mru->arcs_list[ARC_BUFC_DATA]; 3880 *lock = &arc_mru->arcs_mtx; 3881 break; 3882 } 3883 3884 ASSERT(!(MUTEX_HELD(*lock))); 3885 mutex_enter(*lock); 3886 return (list); 3887 } 3888 3889 /* 3890 * Evict buffers from the device write hand to the distance specified in 3891 * bytes. This distance may span populated buffers, it may span nothing. 3892 * This is clearing a region on the L2ARC device ready for writing. 3893 * If the 'all' boolean is set, every buffer is evicted. 3894 */ 3895 static void 3896 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 3897 { 3898 list_t *buflist; 3899 l2arc_buf_hdr_t *abl2; 3900 arc_buf_hdr_t *ab, *ab_prev; 3901 kmutex_t *hash_lock; 3902 uint64_t taddr; 3903 3904 buflist = dev->l2ad_buflist; 3905 3906 if (buflist == NULL) 3907 return; 3908 3909 if (!all && dev->l2ad_first) { 3910 /* 3911 * This is the first sweep through the device. There is 3912 * nothing to evict. 3913 */ 3914 return; 3915 } 3916 3917 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 3918 /* 3919 * When nearing the end of the device, evict to the end 3920 * before the device write hand jumps to the start. 3921 */ 3922 taddr = dev->l2ad_end; 3923 } else { 3924 taddr = dev->l2ad_hand + distance; 3925 } 3926 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 3927 uint64_t, taddr, boolean_t, all); 3928 3929 top: 3930 mutex_enter(&l2arc_buflist_mtx); 3931 for (ab = list_tail(buflist); ab; ab = ab_prev) { 3932 ab_prev = list_prev(buflist, ab); 3933 3934 hash_lock = HDR_LOCK(ab); 3935 if (!mutex_tryenter(hash_lock)) { 3936 /* 3937 * Missed the hash lock. Retry. 3938 */ 3939 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 3940 mutex_exit(&l2arc_buflist_mtx); 3941 mutex_enter(hash_lock); 3942 mutex_exit(hash_lock); 3943 goto top; 3944 } 3945 3946 if (HDR_L2_WRITE_HEAD(ab)) { 3947 /* 3948 * We hit a write head node. Leave it for 3949 * l2arc_write_done(). 3950 */ 3951 list_remove(buflist, ab); 3952 mutex_exit(hash_lock); 3953 continue; 3954 } 3955 3956 if (!all && ab->b_l2hdr != NULL && 3957 (ab->b_l2hdr->b_daddr > taddr || 3958 ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { 3959 /* 3960 * We've evicted to the target address, 3961 * or the end of the device. 3962 */ 3963 mutex_exit(hash_lock); 3964 break; 3965 } 3966 3967 if (HDR_FREE_IN_PROGRESS(ab)) { 3968 /* 3969 * Already on the path to destruction. 3970 */ 3971 mutex_exit(hash_lock); 3972 continue; 3973 } 3974 3975 if (ab->b_state == arc_l2c_only) { 3976 ASSERT(!HDR_L2_READING(ab)); 3977 /* 3978 * This doesn't exist in the ARC. Destroy. 3979 * arc_hdr_destroy() will call list_remove() 3980 * and decrement arcstat_l2_size. 
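			 * (An arc_l2c_only header tracks data that survives
			 * only on the L2ARC device, so once that copy is
			 * evicted there is nothing left to keep the header
			 * for.)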
3981 */ 3982 arc_change_state(arc_anon, ab, hash_lock); 3983 arc_hdr_destroy(ab); 3984 } else { 3985 /* 3986 * Invalidate issued or about to be issued 3987 * reads, since we may be about to write 3988 * over this location. 3989 */ 3990 if (HDR_L2_READING(ab)) { 3991 ARCSTAT_BUMP(arcstat_l2_evict_reading); 3992 ab->b_flags |= ARC_L2_EVICTED; 3993 } 3994 3995 /* 3996 * Tell ARC this no longer exists in L2ARC. 3997 */ 3998 if (ab->b_l2hdr != NULL) { 3999 abl2 = ab->b_l2hdr; 4000 ab->b_l2hdr = NULL; 4001 kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); 4002 ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); 4003 } 4004 list_remove(buflist, ab); 4005 4006 /* 4007 * This may have been leftover after a 4008 * failed write. 4009 */ 4010 ab->b_flags &= ~ARC_L2_WRITING; 4011 } 4012 mutex_exit(hash_lock); 4013 } 4014 mutex_exit(&l2arc_buflist_mtx); 4015 4016 spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); 4017 dev->l2ad_evict = taddr; 4018 } 4019 4020 /* 4021 * Find and write ARC buffers to the L2ARC device. 4022 * 4023 * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid 4024 * for reading until they have completed writing. 4025 */ 4026 static void 4027 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 4028 { 4029 arc_buf_hdr_t *ab, *ab_prev, *head; 4030 l2arc_buf_hdr_t *hdrl2; 4031 list_t *list; 4032 uint64_t passed_sz, write_sz, buf_sz, headroom; 4033 void *buf_data; 4034 kmutex_t *hash_lock, *list_lock; 4035 boolean_t have_lock, full; 4036 l2arc_write_callback_t *cb; 4037 zio_t *pio, *wzio; 4038 4039 ASSERT(dev->l2ad_vdev != NULL); 4040 4041 pio = NULL; 4042 write_sz = 0; 4043 full = B_FALSE; 4044 head = kmem_cache_alloc(hdr_cache, KM_PUSHPAGE); 4045 head->b_flags |= ARC_L2_WRITE_HEAD; 4046 4047 /* 4048 * Copy buffers for L2ARC writing. 4049 */ 4050 mutex_enter(&l2arc_buflist_mtx); 4051 for (int try = 0; try <= 3; try++) { 4052 list = l2arc_list_locked(try, &list_lock); 4053 passed_sz = 0; 4054 4055 /* 4056 * L2ARC fast warmup. 4057 * 4058 * Until the ARC is warm and starts to evict, read from the 4059 * head of the ARC lists rather than the tail. 4060 */ 4061 headroom = target_sz * l2arc_headroom; 4062 if (arc_warm == B_FALSE) 4063 ab = list_head(list); 4064 else 4065 ab = list_tail(list); 4066 4067 for (; ab; ab = ab_prev) { 4068 if (arc_warm == B_FALSE) 4069 ab_prev = list_next(list, ab); 4070 else 4071 ab_prev = list_prev(list, ab); 4072 4073 hash_lock = HDR_LOCK(ab); 4074 have_lock = MUTEX_HELD(hash_lock); 4075 if (!have_lock && !mutex_tryenter(hash_lock)) { 4076 /* 4077 * Skip this buffer rather than waiting. 4078 */ 4079 continue; 4080 } 4081 4082 passed_sz += ab->b_size; 4083 if (passed_sz > headroom) { 4084 /* 4085 * Searched too far. 4086 */ 4087 mutex_exit(hash_lock); 4088 break; 4089 } 4090 4091 if (ab->b_spa != spa) { 4092 mutex_exit(hash_lock); 4093 continue; 4094 } 4095 4096 if (ab->b_l2hdr != NULL) { 4097 /* 4098 * Already in L2ARC. 4099 */ 4100 mutex_exit(hash_lock); 4101 continue; 4102 } 4103 4104 if (HDR_IO_IN_PROGRESS(ab) || HDR_DONT_L2CACHE(ab)) { 4105 mutex_exit(hash_lock); 4106 continue; 4107 } 4108 4109 if ((write_sz + ab->b_size) > target_sz) { 4110 full = B_TRUE; 4111 mutex_exit(hash_lock); 4112 break; 4113 } 4114 4115 if (ab->b_buf == NULL) { 4116 DTRACE_PROBE1(l2arc__buf__null, void *, ab); 4117 mutex_exit(hash_lock); 4118 continue; 4119 } 4120 4121 if (pio == NULL) { 4122 /* 4123 * Insert a dummy header on the buflist so 4124 * l2arc_write_done() can find where the 4125 * write buffers begin without searching. 
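				 * The dummy header carries ARC_L2_WRITE_HEAD
				 * (set above), so other buflist walkers such
				 * as l2arc_evict() can recognize it and leave
				 * it for l2arc_write_done().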
4126 */ 4127 list_insert_head(dev->l2ad_buflist, head); 4128 4129 cb = kmem_alloc( 4130 sizeof (l2arc_write_callback_t), KM_SLEEP); 4131 cb->l2wcb_dev = dev; 4132 cb->l2wcb_head = head; 4133 pio = zio_root(spa, l2arc_write_done, cb, 4134 ZIO_FLAG_CANFAIL); 4135 } 4136 4137 /* 4138 * Create and add a new L2ARC header. 4139 */ 4140 hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); 4141 hdrl2->b_dev = dev; 4142 hdrl2->b_daddr = dev->l2ad_hand; 4143 4144 ab->b_flags |= ARC_L2_WRITING; 4145 ab->b_l2hdr = hdrl2; 4146 list_insert_head(dev->l2ad_buflist, ab); 4147 buf_data = ab->b_buf->b_data; 4148 buf_sz = ab->b_size; 4149 4150 /* 4151 * Compute and store the buffer cksum before 4152 * writing. On debug the cksum is verified first. 4153 */ 4154 arc_cksum_verify(ab->b_buf); 4155 arc_cksum_compute(ab->b_buf, B_TRUE); 4156 4157 mutex_exit(hash_lock); 4158 4159 wzio = zio_write_phys(pio, dev->l2ad_vdev, 4160 dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, 4161 NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, 4162 ZIO_FLAG_CANFAIL, B_FALSE); 4163 4164 DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, 4165 zio_t *, wzio); 4166 (void) zio_nowait(wzio); 4167 4168 write_sz += buf_sz; 4169 dev->l2ad_hand += buf_sz; 4170 } 4171 4172 mutex_exit(list_lock); 4173 4174 if (full == B_TRUE) 4175 break; 4176 } 4177 mutex_exit(&l2arc_buflist_mtx); 4178 4179 if (pio == NULL) { 4180 ASSERT3U(write_sz, ==, 0); 4181 kmem_cache_free(hdr_cache, head); 4182 return; 4183 } 4184 4185 ASSERT3U(write_sz, <=, target_sz); 4186 ARCSTAT_BUMP(arcstat_l2_writes_sent); 4187 ARCSTAT_INCR(arcstat_l2_size, write_sz); 4188 spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); 4189 4190 /* 4191 * Bump device hand to the device start if it is approaching the end. 4192 * l2arc_evict() will already have evicted ahead for this case. 4193 */ 4194 if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { 4195 spa_l2cache_space_update(dev->l2ad_vdev, 0, 4196 dev->l2ad_end - dev->l2ad_hand); 4197 dev->l2ad_hand = dev->l2ad_start; 4198 dev->l2ad_evict = dev->l2ad_start; 4199 dev->l2ad_first = B_FALSE; 4200 } 4201 4202 (void) zio_wait(pio); 4203 } 4204 4205 /* 4206 * This thread feeds the L2ARC at regular intervals. This is the beating 4207 * heart of the L2ARC. 4208 */ 4209 static void 4210 l2arc_feed_thread(void) 4211 { 4212 callb_cpr_t cpr; 4213 l2arc_dev_t *dev; 4214 spa_t *spa; 4215 uint64_t size; 4216 4217 CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); 4218 4219 mutex_enter(&l2arc_feed_thr_lock); 4220 4221 while (l2arc_thread_exit == 0) { 4222 /* 4223 * Pause for l2arc_feed_secs seconds between writes. 4224 */ 4225 CALLB_CPR_SAFE_BEGIN(&cpr); 4226 (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, 4227 lbolt + (hz * l2arc_feed_secs)); 4228 CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); 4229 4230 /* 4231 * Quick check for L2ARC devices. 4232 */ 4233 mutex_enter(&l2arc_dev_mtx); 4234 if (l2arc_ndev == 0) { 4235 mutex_exit(&l2arc_dev_mtx); 4236 continue; 4237 } 4238 mutex_exit(&l2arc_dev_mtx); 4239 4240 /* 4241 * This selects the next l2arc device to write to, and in 4242 * doing so the next spa to feed from: dev->l2ad_spa. This 4243 * will return NULL if there are now no l2arc devices or if 4244 * they are all faulted. 4245 * 4246 * If a device is returned, its spa's config lock is also 4247 * held to prevent device removal. l2arc_dev_get_next() 4248 * will grab and release l2arc_dev_mtx. 
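		 * That config lock is dropped again via spa_config_exit()
		 * once this interval's eviction and write have been issued,
		 * or immediately if we bail out due to memory pressure.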
4249 */ 4250 if ((dev = l2arc_dev_get_next()) == NULL) 4251 continue; 4252 4253 spa = dev->l2ad_spa; 4254 ASSERT(spa != NULL); 4255 4256 /* 4257 * Avoid contributing to memory pressure. 4258 */ 4259 if (arc_reclaim_needed()) { 4260 ARCSTAT_BUMP(arcstat_l2_abort_lowmem); 4261 spa_config_exit(spa, dev); 4262 continue; 4263 } 4264 4265 ARCSTAT_BUMP(arcstat_l2_feeds); 4266 4267 size = dev->l2ad_write; 4268 if (arc_warm == B_FALSE) 4269 size += dev->l2ad_boost; 4270 4271 /* 4272 * Evict L2ARC buffers that will be overwritten. 4273 */ 4274 l2arc_evict(dev, size, B_FALSE); 4275 4276 /* 4277 * Write ARC buffers. 4278 */ 4279 l2arc_write_buffers(spa, dev, size); 4280 spa_config_exit(spa, dev); 4281 } 4282 4283 l2arc_thread_exit = 0; 4284 cv_broadcast(&l2arc_feed_thr_cv); 4285 CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ 4286 thread_exit(); 4287 } 4288 4289 boolean_t 4290 l2arc_vdev_present(vdev_t *vd) 4291 { 4292 l2arc_dev_t *dev; 4293 4294 mutex_enter(&l2arc_dev_mtx); 4295 for (dev = list_head(l2arc_dev_list); dev != NULL; 4296 dev = list_next(l2arc_dev_list, dev)) { 4297 if (dev->l2ad_vdev == vd) 4298 break; 4299 } 4300 mutex_exit(&l2arc_dev_mtx); 4301 4302 return (dev != NULL); 4303 } 4304 4305 /* 4306 * Add a vdev for use by the L2ARC. By this point the spa has already 4307 * validated the vdev and opened it. 4308 */ 4309 void 4310 l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) 4311 { 4312 l2arc_dev_t *adddev; 4313 4314 ASSERT(!l2arc_vdev_present(vd)); 4315 4316 /* 4317 * Create a new l2arc device entry. 4318 */ 4319 adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); 4320 adddev->l2ad_spa = spa; 4321 adddev->l2ad_vdev = vd; 4322 adddev->l2ad_write = l2arc_write_max; 4323 adddev->l2ad_boost = l2arc_write_boost; 4324 adddev->l2ad_start = start; 4325 adddev->l2ad_end = end; 4326 adddev->l2ad_hand = adddev->l2ad_start; 4327 adddev->l2ad_evict = adddev->l2ad_start; 4328 adddev->l2ad_first = B_TRUE; 4329 ASSERT3U(adddev->l2ad_write, >, 0); 4330 4331 /* 4332 * This is a list of all ARC buffers that are still valid on the 4333 * device. 4334 */ 4335 adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); 4336 list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), 4337 offsetof(arc_buf_hdr_t, b_l2node)); 4338 4339 spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); 4340 4341 /* 4342 * Add device to global list 4343 */ 4344 mutex_enter(&l2arc_dev_mtx); 4345 list_insert_head(l2arc_dev_list, adddev); 4346 atomic_inc_64(&l2arc_ndev); 4347 mutex_exit(&l2arc_dev_mtx); 4348 } 4349 4350 /* 4351 * Remove a vdev from the L2ARC. 4352 */ 4353 void 4354 l2arc_remove_vdev(vdev_t *vd) 4355 { 4356 l2arc_dev_t *dev, *nextdev, *remdev = NULL; 4357 4358 /* 4359 * Find the device by vdev 4360 */ 4361 mutex_enter(&l2arc_dev_mtx); 4362 for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { 4363 nextdev = list_next(l2arc_dev_list, dev); 4364 if (vd == dev->l2ad_vdev) { 4365 remdev = dev; 4366 break; 4367 } 4368 } 4369 ASSERT(remdev != NULL); 4370 4371 /* 4372 * Remove device from global list 4373 */ 4374 list_remove(l2arc_dev_list, remdev); 4375 l2arc_dev_last = NULL; /* may have been invalidated */ 4376 atomic_dec_64(&l2arc_ndev); 4377 mutex_exit(&l2arc_dev_mtx); 4378 4379 /* 4380 * Clear all buflists and ARC references. L2ARC device flush. 
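	 * Passing the 'all' flag to l2arc_evict() below drops every
	 * remaining buffer, regardless of where the device write hand
	 * currently points.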
4381	 */
4382		l2arc_evict(remdev, 0, B_TRUE);
4383		list_destroy(remdev->l2ad_buflist);
4384		kmem_free(remdev->l2ad_buflist, sizeof (list_t));
4385		kmem_free(remdev, sizeof (l2arc_dev_t));
4386	}
4387	
4388	void
4389	l2arc_init()
4390	{
4391		l2arc_thread_exit = 0;
4392		l2arc_ndev = 0;
4393		l2arc_writes_sent = 0;
4394		l2arc_writes_done = 0;
4395	
4396		mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
4397		cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
4398		mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
4399		mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
4400		mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
4401	
4402		l2arc_dev_list = &L2ARC_dev_list;
4403		l2arc_free_on_write = &L2ARC_free_on_write;
4404		list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
4405		    offsetof(l2arc_dev_t, l2ad_node));
4406		list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
4407		    offsetof(l2arc_data_free_t, l2df_list_node));
4408	
4409		(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
4410		    TS_RUN, minclsyspri);
4411	}
4412	
4413	void
4414	l2arc_fini()
4415	{
4416		/*
4417		 * This is called from dmu_fini(), which is called from spa_fini().
4418		 * Because of this, we can assume that all l2arc devices have
4419		 * already been removed when the pools themselves were removed.
4420		 */
4421	
4422		mutex_enter(&l2arc_feed_thr_lock);
4423		cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
4424		l2arc_thread_exit = 1;
4425		while (l2arc_thread_exit != 0)
4426			cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
4427		mutex_exit(&l2arc_feed_thr_lock);
4428	
4429		l2arc_do_free_on_write();
4430	
4431		mutex_destroy(&l2arc_feed_thr_lock);
4432		cv_destroy(&l2arc_feed_thr_cv);
4433		mutex_destroy(&l2arc_dev_mtx);
4434		mutex_destroy(&l2arc_buflist_mtx);
4435		mutex_destroy(&l2arc_free_on_write_mtx);
4436	
4437		list_destroy(l2arc_dev_list);
4438		list_destroy(l2arc_free_on_write);
4439	}
4440	