/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note however that the data associated
 * with the buffer may be evicted prior to the callback. The callback
 * must be made with *no locks held* (to prevent deadlock). Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
112 */ 113 114 #include <sys/spa.h> 115 #include <sys/zio.h> 116 #include <sys/zio_checksum.h> 117 #include <sys/zfs_context.h> 118 #include <sys/arc.h> 119 #include <sys/refcount.h> 120 #ifdef _KERNEL 121 #include <sys/vmsystm.h> 122 #include <vm/anon.h> 123 #include <sys/fs/swapnode.h> 124 #include <sys/dnlc.h> 125 #endif 126 #include <sys/callb.h> 127 #include <sys/kstat.h> 128 129 static kmutex_t arc_reclaim_thr_lock; 130 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 131 static uint8_t arc_thread_exit; 132 133 #define ARC_REDUCE_DNLC_PERCENT 3 134 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 135 136 typedef enum arc_reclaim_strategy { 137 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 138 ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 139 } arc_reclaim_strategy_t; 140 141 /* number of seconds before growing cache again */ 142 static int arc_grow_retry = 60; 143 144 /* 145 * minimum lifespan of a prefetch block in clock ticks 146 * (initialized in arc_init()) 147 */ 148 static int arc_min_prefetch_lifespan; 149 150 static int arc_dead; 151 152 /* 153 * These tunables are for performance analysis. 154 */ 155 uint64_t zfs_arc_max; 156 uint64_t zfs_arc_min; 157 uint64_t zfs_arc_meta_limit = 0; 158 159 /* 160 * Note that buffers can be in one of 5 states: 161 * ARC_anon - anonymous (discussed below) 162 * ARC_mru - recently used, currently cached 163 * ARC_mru_ghost - recentely used, no longer in cache 164 * ARC_mfu - frequently used, currently cached 165 * ARC_mfu_ghost - frequently used, no longer in cache 166 * When there are no active references to the buffer, they are 167 * are linked onto a list in one of these arc states. These are 168 * the only buffers that can be evicted or deleted. Within each 169 * state there are multiple lists, one for meta-data and one for 170 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 171 * etc.) 
is tracked separately so that it can be managed more 172 * explicitly: favored over data, limited explicitely. 173 * 174 * Anonymous buffers are buffers that are not associated with 175 * a DVA. These are buffers that hold dirty block copies 176 * before they are written to stable storage. By definition, 177 * they are "ref'd" and are considered part of arc_mru 178 * that cannot be freed. Generally, they will aquire a DVA 179 * as they are written and migrate onto the arc_mru list. 180 */ 181 182 typedef struct arc_state { 183 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 184 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 185 uint64_t arcs_size; /* total amount of data in this state */ 186 kmutex_t arcs_mtx; 187 } arc_state_t; 188 189 /* The 5 states: */ 190 static arc_state_t ARC_anon; 191 static arc_state_t ARC_mru; 192 static arc_state_t ARC_mru_ghost; 193 static arc_state_t ARC_mfu; 194 static arc_state_t ARC_mfu_ghost; 195 196 typedef struct arc_stats { 197 kstat_named_t arcstat_hits; 198 kstat_named_t arcstat_misses; 199 kstat_named_t arcstat_demand_data_hits; 200 kstat_named_t arcstat_demand_data_misses; 201 kstat_named_t arcstat_demand_metadata_hits; 202 kstat_named_t arcstat_demand_metadata_misses; 203 kstat_named_t arcstat_prefetch_data_hits; 204 kstat_named_t arcstat_prefetch_data_misses; 205 kstat_named_t arcstat_prefetch_metadata_hits; 206 kstat_named_t arcstat_prefetch_metadata_misses; 207 kstat_named_t arcstat_mru_hits; 208 kstat_named_t arcstat_mru_ghost_hits; 209 kstat_named_t arcstat_mfu_hits; 210 kstat_named_t arcstat_mfu_ghost_hits; 211 kstat_named_t arcstat_deleted; 212 kstat_named_t arcstat_recycle_miss; 213 kstat_named_t arcstat_mutex_miss; 214 kstat_named_t arcstat_evict_skip; 215 kstat_named_t arcstat_hash_elements; 216 kstat_named_t arcstat_hash_elements_max; 217 kstat_named_t arcstat_hash_collisions; 218 kstat_named_t arcstat_hash_chains; 219 kstat_named_t arcstat_hash_chain_max; 220 
kstat_named_t arcstat_p; 221 kstat_named_t arcstat_c; 222 kstat_named_t arcstat_c_min; 223 kstat_named_t arcstat_c_max; 224 kstat_named_t arcstat_size; 225 } arc_stats_t; 226 227 static arc_stats_t arc_stats = { 228 { "hits", KSTAT_DATA_UINT64 }, 229 { "misses", KSTAT_DATA_UINT64 }, 230 { "demand_data_hits", KSTAT_DATA_UINT64 }, 231 { "demand_data_misses", KSTAT_DATA_UINT64 }, 232 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 233 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 234 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 235 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 236 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 237 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 238 { "mru_hits", KSTAT_DATA_UINT64 }, 239 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 240 { "mfu_hits", KSTAT_DATA_UINT64 }, 241 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 242 { "deleted", KSTAT_DATA_UINT64 }, 243 { "recycle_miss", KSTAT_DATA_UINT64 }, 244 { "mutex_miss", KSTAT_DATA_UINT64 }, 245 { "evict_skip", KSTAT_DATA_UINT64 }, 246 { "hash_elements", KSTAT_DATA_UINT64 }, 247 { "hash_elements_max", KSTAT_DATA_UINT64 }, 248 { "hash_collisions", KSTAT_DATA_UINT64 }, 249 { "hash_chains", KSTAT_DATA_UINT64 }, 250 { "hash_chain_max", KSTAT_DATA_UINT64 }, 251 { "p", KSTAT_DATA_UINT64 }, 252 { "c", KSTAT_DATA_UINT64 }, 253 { "c_min", KSTAT_DATA_UINT64 }, 254 { "c_max", KSTAT_DATA_UINT64 }, 255 { "size", KSTAT_DATA_UINT64 } 256 }; 257 258 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 259 260 #define ARCSTAT_INCR(stat, val) \ 261 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 262 263 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 264 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 265 266 #define ARCSTAT_MAX(stat, val) { \ 267 uint64_t m; \ 268 while ((val) > (m = arc_stats.stat.value.ui64) && \ 269 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 270 continue; \ 271 } 272 273 #define ARCSTAT_MAXSTAT(stat) \ 274 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 275 
276 /* 277 * We define a macro to allow ARC hits/misses to be easily broken down by 278 * two separate conditions, giving a total of four different subtypes for 279 * each of hits and misses (so eight statistics total). 280 */ 281 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 282 if (cond1) { \ 283 if (cond2) { \ 284 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 285 } else { \ 286 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 287 } \ 288 } else { \ 289 if (cond2) { \ 290 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 291 } else { \ 292 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 293 } \ 294 } 295 296 kstat_t *arc_ksp; 297 static arc_state_t *arc_anon; 298 static arc_state_t *arc_mru; 299 static arc_state_t *arc_mru_ghost; 300 static arc_state_t *arc_mfu; 301 static arc_state_t *arc_mfu_ghost; 302 303 /* 304 * There are several ARC variables that are critical to export as kstats -- 305 * but we don't want to have to grovel around in the kstat whenever we wish to 306 * manipulate them. For these variables, we therefore define them to be in 307 * terms of the statistic variable. This assures that we are not introducing 308 * the possibility of inconsistency by having shadow copies of the variables, 309 * while still allowing the code to be readable. 
310 */ 311 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 312 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 313 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 314 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 315 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 316 317 static int arc_no_grow; /* Don't try to grow cache size */ 318 static uint64_t arc_tempreserve; 319 static uint64_t arc_meta_used; 320 static uint64_t arc_meta_limit; 321 static uint64_t arc_meta_max = 0; 322 323 typedef struct arc_callback arc_callback_t; 324 325 struct arc_callback { 326 void *acb_private; 327 arc_done_func_t *acb_done; 328 arc_byteswap_func_t *acb_byteswap; 329 arc_buf_t *acb_buf; 330 zio_t *acb_zio_dummy; 331 arc_callback_t *acb_next; 332 }; 333 334 typedef struct arc_write_callback arc_write_callback_t; 335 336 struct arc_write_callback { 337 void *awcb_private; 338 arc_done_func_t *awcb_ready; 339 arc_done_func_t *awcb_done; 340 arc_buf_t *awcb_buf; 341 }; 342 343 struct arc_buf_hdr { 344 /* protected by hash lock */ 345 dva_t b_dva; 346 uint64_t b_birth; 347 uint64_t b_cksum0; 348 349 kmutex_t b_freeze_lock; 350 zio_cksum_t *b_freeze_cksum; 351 352 arc_buf_hdr_t *b_hash_next; 353 arc_buf_t *b_buf; 354 uint32_t b_flags; 355 uint32_t b_datacnt; 356 357 arc_callback_t *b_acb; 358 kcondvar_t b_cv; 359 360 /* immutable */ 361 arc_buf_contents_t b_type; 362 uint64_t b_size; 363 spa_t *b_spa; 364 365 /* protected by arc state mutex */ 366 arc_state_t *b_state; 367 list_node_t b_arc_node; 368 369 /* updated atomically */ 370 clock_t b_arc_access; 371 372 /* self protecting */ 373 refcount_t b_refcnt; 374 }; 375 376 static arc_buf_t *arc_eviction_list; 377 static kmutex_t arc_eviction_mtx; 378 static arc_buf_hdr_t arc_eviction_hdr; 379 static void arc_get_data_buf(arc_buf_t *buf); 380 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 381 static int arc_evict_needed(arc_buf_contents_t 
type); 382 static void arc_evict_ghost(arc_state_t *state, int64_t bytes); 383 384 #define GHOST_STATE(state) \ 385 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) 386 387 /* 388 * Private ARC flags. These flags are private ARC only flags that will show up 389 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 390 * be passed in as arc_flags in things like arc_read. However, these flags 391 * should never be passed and should only be set by ARC code. When adding new 392 * public flags, make sure not to smash the private ones. 393 */ 394 395 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 396 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 397 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 398 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 399 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 400 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 401 402 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 403 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 404 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 405 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 406 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 407 408 /* 409 * Hash table routines 410 */ 411 412 #define HT_LOCK_PAD 64 413 414 struct ht_lock { 415 kmutex_t ht_lock; 416 #ifdef _KERNEL 417 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 418 #endif 419 }; 420 421 #define BUF_LOCKS 256 422 typedef struct buf_hash_table { 423 uint64_t ht_mask; 424 arc_buf_hdr_t **ht_table; 425 struct ht_lock ht_locks[BUF_LOCKS]; 426 } buf_hash_table_t; 427 428 static buf_hash_table_t buf_hash_table; 429 430 #define BUF_HASH_INDEX(spa, dva, birth) \ 431 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 432 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 433 #define 
BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 434 #define HDR_LOCK(buf) \ 435 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 436 437 uint64_t zfs_crc64_table[256]; 438 439 static uint64_t 440 buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 441 { 442 uintptr_t spav = (uintptr_t)spa; 443 uint8_t *vdva = (uint8_t *)dva; 444 uint64_t crc = -1ULL; 445 int i; 446 447 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 448 449 for (i = 0; i < sizeof (dva_t); i++) 450 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 451 452 crc ^= (spav>>8) ^ birth; 453 454 return (crc); 455 } 456 457 #define BUF_EMPTY(buf) \ 458 ((buf)->b_dva.dva_word[0] == 0 && \ 459 (buf)->b_dva.dva_word[1] == 0 && \ 460 (buf)->b_birth == 0) 461 462 #define BUF_EQUAL(spa, dva, birth, buf) \ 463 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 464 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 465 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 466 467 static arc_buf_hdr_t * 468 buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 469 { 470 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 471 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 472 arc_buf_hdr_t *buf; 473 474 mutex_enter(hash_lock); 475 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 476 buf = buf->b_hash_next) { 477 if (BUF_EQUAL(spa, dva, birth, buf)) { 478 *lockp = hash_lock; 479 return (buf); 480 } 481 } 482 mutex_exit(hash_lock); 483 *lockp = NULL; 484 return (NULL); 485 } 486 487 /* 488 * Insert an entry into the hash table. If there is already an element 489 * equal to elem in the hash table, then the already existing element 490 * will be returned and the new element will not be inserted. 491 * Otherwise returns NULL. 
492 */ 493 static arc_buf_hdr_t * 494 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 495 { 496 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 497 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 498 arc_buf_hdr_t *fbuf; 499 uint32_t i; 500 501 ASSERT(!HDR_IN_HASH_TABLE(buf)); 502 *lockp = hash_lock; 503 mutex_enter(hash_lock); 504 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 505 fbuf = fbuf->b_hash_next, i++) { 506 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 507 return (fbuf); 508 } 509 510 buf->b_hash_next = buf_hash_table.ht_table[idx]; 511 buf_hash_table.ht_table[idx] = buf; 512 buf->b_flags |= ARC_IN_HASH_TABLE; 513 514 /* collect some hash table performance data */ 515 if (i > 0) { 516 ARCSTAT_BUMP(arcstat_hash_collisions); 517 if (i == 1) 518 ARCSTAT_BUMP(arcstat_hash_chains); 519 520 ARCSTAT_MAX(arcstat_hash_chain_max, i); 521 } 522 523 ARCSTAT_BUMP(arcstat_hash_elements); 524 ARCSTAT_MAXSTAT(arcstat_hash_elements); 525 526 return (NULL); 527 } 528 529 static void 530 buf_hash_remove(arc_buf_hdr_t *buf) 531 { 532 arc_buf_hdr_t *fbuf, **bufp; 533 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 534 535 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 536 ASSERT(HDR_IN_HASH_TABLE(buf)); 537 538 bufp = &buf_hash_table.ht_table[idx]; 539 while ((fbuf = *bufp) != buf) { 540 ASSERT(fbuf != NULL); 541 bufp = &fbuf->b_hash_next; 542 } 543 *bufp = buf->b_hash_next; 544 buf->b_hash_next = NULL; 545 buf->b_flags &= ~ARC_IN_HASH_TABLE; 546 547 /* collect some hash table performance data */ 548 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 549 550 if (buf_hash_table.ht_table[idx] && 551 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 552 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 553 } 554 555 /* 556 * Global data structures and functions for the buf kmem cache. 
557 */ 558 static kmem_cache_t *hdr_cache; 559 static kmem_cache_t *buf_cache; 560 561 static void 562 buf_fini(void) 563 { 564 int i; 565 566 kmem_free(buf_hash_table.ht_table, 567 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 568 for (i = 0; i < BUF_LOCKS; i++) 569 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 570 kmem_cache_destroy(hdr_cache); 571 kmem_cache_destroy(buf_cache); 572 } 573 574 /* 575 * Constructor callback - called when the cache is empty 576 * and a new buf is requested. 577 */ 578 /* ARGSUSED */ 579 static int 580 hdr_cons(void *vbuf, void *unused, int kmflag) 581 { 582 arc_buf_hdr_t *buf = vbuf; 583 584 bzero(buf, sizeof (arc_buf_hdr_t)); 585 refcount_create(&buf->b_refcnt); 586 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 587 mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 588 return (0); 589 } 590 591 /* 592 * Destructor callback - called when a cached buf is 593 * no longer required. 594 */ 595 /* ARGSUSED */ 596 static void 597 hdr_dest(void *vbuf, void *unused) 598 { 599 arc_buf_hdr_t *buf = vbuf; 600 601 refcount_destroy(&buf->b_refcnt); 602 cv_destroy(&buf->b_cv); 603 mutex_destroy(&buf->b_freeze_lock); 604 } 605 606 /* 607 * Reclaim callback -- invoked when memory is low. 608 */ 609 /* ARGSUSED */ 610 static void 611 hdr_recl(void *unused) 612 { 613 dprintf("hdr_recl called\n"); 614 /* 615 * umem calls the reclaim func when we destroy the buf cache, 616 * which is after we do arc_fini(). 617 */ 618 if (!arc_dead) 619 cv_signal(&arc_reclaim_thr_cv); 620 } 621 622 static void 623 buf_init(void) 624 { 625 uint64_t *ct; 626 uint64_t hsize = 1ULL << 12; 627 int i, j; 628 629 /* 630 * The hash table is big enough to fill all of physical memory 631 * with an average 64K block size. The table will take up 632 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
633 */ 634 while (hsize * 65536 < physmem * PAGESIZE) 635 hsize <<= 1; 636 retry: 637 buf_hash_table.ht_mask = hsize - 1; 638 buf_hash_table.ht_table = 639 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 640 if (buf_hash_table.ht_table == NULL) { 641 ASSERT(hsize > (1ULL << 8)); 642 hsize >>= 1; 643 goto retry; 644 } 645 646 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 647 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 648 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 649 0, NULL, NULL, NULL, NULL, NULL, 0); 650 651 for (i = 0; i < 256; i++) 652 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 653 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 654 655 for (i = 0; i < BUF_LOCKS; i++) { 656 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 657 NULL, MUTEX_DEFAULT, NULL); 658 } 659 } 660 661 #define ARC_MINTIME (hz>>4) /* 62 ms */ 662 663 static void 664 arc_cksum_verify(arc_buf_t *buf) 665 { 666 zio_cksum_t zc; 667 668 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 669 return; 670 671 mutex_enter(&buf->b_hdr->b_freeze_lock); 672 if (buf->b_hdr->b_freeze_cksum == NULL || 673 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 674 mutex_exit(&buf->b_hdr->b_freeze_lock); 675 return; 676 } 677 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 678 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 679 panic("buffer modified while frozen!"); 680 mutex_exit(&buf->b_hdr->b_freeze_lock); 681 } 682 683 static void 684 arc_cksum_compute(arc_buf_t *buf) 685 { 686 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 687 return; 688 689 mutex_enter(&buf->b_hdr->b_freeze_lock); 690 if (buf->b_hdr->b_freeze_cksum != NULL) { 691 mutex_exit(&buf->b_hdr->b_freeze_lock); 692 return; 693 } 694 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 695 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 696 buf->b_hdr->b_freeze_cksum); 697 mutex_exit(&buf->b_hdr->b_freeze_lock); 698 } 699 700 void 701 arc_buf_thaw(arc_buf_t *buf) 702 { 703 
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	if (buf->b_hdr->b_state != arc_anon)
		panic("modifying non-anon buffer!");
	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
		panic("modifying buffer while i/o in progress!");
	/* check the data against the recorded checksum, then discard it */
	arc_cksum_verify(buf);
	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

/*
 * Debug helper (only when ZFS_DEBUG_MODIFY is set): record a checksum of
 * the buffer via arc_cksum_compute() so that a later arc_cksum_verify()
 * can detect modification.
 */
void
arc_buf_freeze(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_state == arc_anon);
	arc_cksum_compute(buf);
}

/*
 * Take a reference on the header on behalf of tag. The caller must hold
 * hash_lock. If this is the first reference and the header is not in the
 * anonymous state, the header is taken off its state's evictable list,
 * the list's size is reduced accordingly, and ARC_PREFETCH is cleared.
 */
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc_anon)) {
		uint64_t delta = ab->b_size * ab->b_datacnt;
		list_t *list = &ab->b_state->arcs_list[ab->b_type];
		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
		mutex_enter(&ab->b_state->arcs_mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(list, ab);
		if (GHOST_STATE(ab->b_state)) {
			/* ghost headers hold no data; account b_size once */
			ASSERT3U(ab->b_datacnt, ==, 0);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(*size, >=, delta);
		atomic_add_64(size, -delta);
		mutex_exit(&ab->b_state->arcs_mtx);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

/*
 * Drop tag's reference on the header and return the remaining count.
 * Unless the header is anonymous, the caller must hold hash_lock. When
 * the count reaches zero on a non-anonymous header, the header is put at
 * the head of its state's evictable list and the list's size is increased.
 */
static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;
	arc_state_t *state = ab->b_state;

	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (state != arc_anon)) {
		uint64_t *size = &state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
		mutex_enter(&state->arcs_mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&state->arcs_list[ab->b_type], ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(size, ab->b_size * ab->b_datacnt);
		mutex_exit(&state->arcs_mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state. The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int64_t refcnt = refcount_count(&ab->b_refcnt);
	uint64_t from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));

	/* bytes to subtract from the old state and add to the new one */
	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon) {
			/* only take the list mutex if not already held */
			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
			uint64_t *size = &old_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&old_state->arcs_mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->arcs_list[ab->b_type], ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-null datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(*size, >=, from_delta);
			atomic_add_64(size, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->arcs_mtx);
		}
		if (new_state != arc_anon) {
			/* only take the list mutex if not already held */
			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
			uint64_t *size = &new_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&new_state->arcs_mtx);

			list_insert_head(&new_state->arcs_list[ab->b_type], ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(size, to_delta);

			if (use_mutex)
				mutex_exit(&new_state->arcs_mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	/* a header that becomes anonymous leaves the hash table */
	if (new_state == arc_anon && old_state != arc_anon) {
		buf_hash_remove(ab);
	}

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->arcs_size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->arcs_size, >=, from_delta);
		atomic_add_64(&old_state->arcs_size, -from_delta);
	}
	ab->b_state = new_state;
}

/*
 * Charge `space' bytes to both the metadata usage counter and the
 * total ARC size.
 */
void
arc_space_consume(uint64_t space)
{
	atomic_add_64(&arc_meta_used, space);
	atomic_add_64(&arc_size, space);
}

/*
 * Undo arc_space_consume() for `space' bytes. Before subtracting, the
 * current metadata usage is recorded in arc_meta_max if it is a new high.
 */
void
arc_space_return(uint64_t space)
{
	ASSERT(arc_meta_used >= space);
	if (arc_meta_max < arc_meta_used)
		arc_meta_max = arc_meta_used;
	atomic_add_64(&arc_meta_used, -space);
	ASSERT(arc_size >= space);
	atomic_add_64(&arc_size, -space);
}

/*
 * Allocate a data buffer of `size' bytes with zio_data_buf_alloc() and
 * charge it to arc_size. The reclaim thread is signalled first if
 * arc_evict_needed() reports that eviction is needed for data buffers.
 */
void *
arc_data_buf_alloc(uint64_t size)
{
	if (arc_evict_needed(ARC_BUFC_DATA))
		cv_signal(&arc_reclaim_thr_cv);
	atomic_add_64(&arc_size, size);
	return (zio_data_buf_alloc(size));
}

/*
 * Free a data buffer of `size' bytes with zio_data_buf_free() and
 * subtract it from arc_size.
 */
void
arc_data_buf_free(void *buf, uint64_t size)
{
	zio_data_buf_free(buf, size);
	ASSERT(arc_size >= size);
	atomic_add_64(&arc_size, -size);
}

/*
 * Allocate a new anonymous header of the given size and type together
 * with one arc_buf_t (and its data, via arc_get_data_buf()). The header
 * is returned holding a single reference owned by tag.
 */
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_type = type;
	hdr->b_spa = spa;
	hdr->b_state = arc_anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	return (buf);
}

/*
 * Create another arc_buf_t on the same header as `from', linked at the
 * front of the header's buf list, with its own copy of the data.
 */
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);
	hdr->b_datacnt += 1;
	return (buf);
}

/*
 * Take a new reference on an existing buf on behalf of tag. Returns
 * without taking a reference if the buf has no header or no data.
 */
void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is currently being evicted via
	 * arc_do_user_evicts().
	 */
	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		mutex_exit(&arc_eviction_mtx);
		return;
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);
	if (buf->b_data == NULL) {
		/*
		 * This buffer is evicted.
		 */
		mutex_exit(hash_lock);
		return;
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
	add_reference(hdr, hash_lock, tag);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	/* account this as a cache hit, split by demand/prefetch and type */
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
}

/*
 * Release the data held by a buf and, optionally, the buf itself.
 *
 * recycle - when TRUE the data block is not freed here and arc_size is
 *	not adjusted; the state accounting is still updated.
 * all - when FALSE only the data is released and the arc_buf_t stays
 *	linked to its header; when TRUE the buf is also unlinked from the
 *	header's buf list and returned to buf_cache.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				zio_buf_free(buf->b_data, size);
				arc_space_return(size);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				zio_data_buf_free(buf->b_data, size);
				atomic_add_64(&arc_size, -size);
			}
		}
		/* header is on an evictable list: shrink that list's size */
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1051 ASSERT3P(hdr->b_state, ==, arc_anon); 1052 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1053 1054 if (!BUF_EMPTY(hdr)) { 1055 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1056 bzero(&hdr->b_dva, sizeof (dva_t)); 1057 hdr->b_birth = 0; 1058 hdr->b_cksum0 = 0; 1059 } 1060 while (hdr->b_buf) { 1061 arc_buf_t *buf = hdr->b_buf; 1062 1063 if (buf->b_efunc) { 1064 mutex_enter(&arc_eviction_mtx); 1065 ASSERT(buf->b_hdr != NULL); 1066 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1067 hdr->b_buf = buf->b_next; 1068 buf->b_hdr = &arc_eviction_hdr; 1069 buf->b_next = arc_eviction_list; 1070 arc_eviction_list = buf; 1071 mutex_exit(&arc_eviction_mtx); 1072 } else { 1073 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1074 } 1075 } 1076 if (hdr->b_freeze_cksum != NULL) { 1077 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1078 hdr->b_freeze_cksum = NULL; 1079 } 1080 1081 ASSERT(!list_link_active(&hdr->b_arc_node)); 1082 ASSERT3P(hdr->b_hash_next, ==, NULL); 1083 ASSERT3P(hdr->b_acb, ==, NULL); 1084 kmem_cache_free(hdr_cache, hdr); 1085 } 1086 1087 void 1088 arc_buf_free(arc_buf_t *buf, void *tag) 1089 { 1090 arc_buf_hdr_t *hdr = buf->b_hdr; 1091 int hashed = hdr->b_state != arc_anon; 1092 1093 ASSERT(buf->b_efunc == NULL); 1094 ASSERT(buf->b_data != NULL); 1095 1096 if (hashed) { 1097 kmutex_t *hash_lock = HDR_LOCK(hdr); 1098 1099 mutex_enter(hash_lock); 1100 (void) remove_reference(hdr, hash_lock, tag); 1101 if (hdr->b_datacnt > 1) 1102 arc_buf_destroy(buf, FALSE, TRUE); 1103 else 1104 hdr->b_flags |= ARC_BUF_AVAILABLE; 1105 mutex_exit(hash_lock); 1106 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1107 int destroy_hdr; 1108 /* 1109 * We are in the middle of an async write. Don't destroy 1110 * this buffer unless the write completes before we finish 1111 * decrementing the reference count. 
1112 */ 1113 mutex_enter(&arc_eviction_mtx); 1114 (void) remove_reference(hdr, NULL, tag); 1115 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1116 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1117 mutex_exit(&arc_eviction_mtx); 1118 if (destroy_hdr) 1119 arc_hdr_destroy(hdr); 1120 } else { 1121 if (remove_reference(hdr, NULL, tag) > 0) { 1122 ASSERT(HDR_IO_ERROR(hdr)); 1123 arc_buf_destroy(buf, FALSE, TRUE); 1124 } else { 1125 arc_hdr_destroy(hdr); 1126 } 1127 } 1128 } 1129 1130 int 1131 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1132 { 1133 arc_buf_hdr_t *hdr = buf->b_hdr; 1134 kmutex_t *hash_lock = HDR_LOCK(hdr); 1135 int no_callback = (buf->b_efunc == NULL); 1136 1137 if (hdr->b_state == arc_anon) { 1138 arc_buf_free(buf, tag); 1139 return (no_callback); 1140 } 1141 1142 mutex_enter(hash_lock); 1143 ASSERT(hdr->b_state != arc_anon); 1144 ASSERT(buf->b_data != NULL); 1145 1146 (void) remove_reference(hdr, hash_lock, tag); 1147 if (hdr->b_datacnt > 1) { 1148 if (no_callback) 1149 arc_buf_destroy(buf, FALSE, TRUE); 1150 } else if (no_callback) { 1151 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1152 hdr->b_flags |= ARC_BUF_AVAILABLE; 1153 } 1154 ASSERT(no_callback || hdr->b_datacnt > 1 || 1155 refcount_is_zero(&hdr->b_refcnt)); 1156 mutex_exit(hash_lock); 1157 return (no_callback); 1158 } 1159 1160 int 1161 arc_buf_size(arc_buf_t *buf) 1162 { 1163 return (buf->b_hdr->b_size); 1164 } 1165 1166 /* 1167 * Evict buffers from list until we've removed the specified number of 1168 * bytes. Move the removed buffers to the appropriate evict state. 1169 * If the recycle flag is set, then attempt to "recycle" a buffer: 1170 * - look for a buffer to evict that is `bytes' long. 1171 * - return the data block from this buffer rather than freeing it. 1172 * This flag is used by callers that are trying to make space for a 1173 * new buffer in a full arc cache. 
1174 */ 1175 static void * 1176 arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, 1177 arc_buf_contents_t type) 1178 { 1179 arc_state_t *evicted_state; 1180 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1181 arc_buf_hdr_t *ab, *ab_prev = NULL; 1182 list_t *list = &state->arcs_list[type]; 1183 kmutex_t *hash_lock; 1184 boolean_t have_lock; 1185 void *stolen = NULL; 1186 1187 ASSERT(state == arc_mru || state == arc_mfu); 1188 1189 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1190 1191 mutex_enter(&state->arcs_mtx); 1192 mutex_enter(&evicted_state->arcs_mtx); 1193 1194 for (ab = list_tail(list); ab; ab = ab_prev) { 1195 ab_prev = list_prev(list, ab); 1196 /* prefetch buffers have a minimum lifespan */ 1197 if (HDR_IO_IN_PROGRESS(ab) || 1198 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1199 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1200 skipped++; 1201 continue; 1202 } 1203 /* "lookahead" for better eviction candidate */ 1204 if (recycle && ab->b_size != bytes && 1205 ab_prev && ab_prev->b_size == bytes) 1206 continue; 1207 hash_lock = HDR_LOCK(ab); 1208 have_lock = MUTEX_HELD(hash_lock); 1209 if (have_lock || mutex_tryenter(hash_lock)) { 1210 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1211 ASSERT(ab->b_datacnt > 0); 1212 while (ab->b_buf) { 1213 arc_buf_t *buf = ab->b_buf; 1214 if (buf->b_data) { 1215 bytes_evicted += ab->b_size; 1216 if (recycle && ab->b_type == type && 1217 ab->b_size == bytes) { 1218 stolen = buf->b_data; 1219 recycle = FALSE; 1220 } 1221 } 1222 if (buf->b_efunc) { 1223 mutex_enter(&arc_eviction_mtx); 1224 arc_buf_destroy(buf, 1225 buf->b_data == stolen, FALSE); 1226 ab->b_buf = buf->b_next; 1227 buf->b_hdr = &arc_eviction_hdr; 1228 buf->b_next = arc_eviction_list; 1229 arc_eviction_list = buf; 1230 mutex_exit(&arc_eviction_mtx); 1231 } else { 1232 arc_buf_destroy(buf, 1233 buf->b_data == stolen, TRUE); 1234 } 1235 } 1236 ASSERT(ab->b_datacnt == 0); 1237 
arc_change_state(evicted_state, ab, hash_lock); 1238 ASSERT(HDR_IN_HASH_TABLE(ab)); 1239 ab->b_flags = ARC_IN_HASH_TABLE; 1240 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1241 if (!have_lock) 1242 mutex_exit(hash_lock); 1243 if (bytes >= 0 && bytes_evicted >= bytes) 1244 break; 1245 } else { 1246 missed += 1; 1247 } 1248 } 1249 1250 mutex_exit(&evicted_state->arcs_mtx); 1251 mutex_exit(&state->arcs_mtx); 1252 1253 if (bytes_evicted < bytes) 1254 dprintf("only evicted %lld bytes from %x", 1255 (longlong_t)bytes_evicted, state); 1256 1257 if (skipped) 1258 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1259 1260 if (missed) 1261 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1262 1263 /* 1264 * We have just evicted some date into the ghost state, make 1265 * sure we also adjust the ghost state size if necessary. 1266 */ 1267 if (arc_no_grow && 1268 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 1269 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 1270 arc_mru_ghost->arcs_size - arc_c; 1271 1272 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 1273 int64_t todelete = 1274 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 1275 arc_evict_ghost(arc_mru_ghost, todelete); 1276 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 1277 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 1278 arc_mru_ghost->arcs_size + 1279 arc_mfu_ghost->arcs_size - arc_c); 1280 arc_evict_ghost(arc_mfu_ghost, todelete); 1281 } 1282 } 1283 1284 return (stolen); 1285 } 1286 1287 /* 1288 * Remove buffers from list until we've removed the specified number of 1289 * bytes. Destroy the buffers that are removed. 
1290 */ 1291 static void 1292 arc_evict_ghost(arc_state_t *state, int64_t bytes) 1293 { 1294 arc_buf_hdr_t *ab, *ab_prev; 1295 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1296 kmutex_t *hash_lock; 1297 uint64_t bytes_deleted = 0; 1298 uint64_t bufs_skipped = 0; 1299 1300 ASSERT(GHOST_STATE(state)); 1301 top: 1302 mutex_enter(&state->arcs_mtx); 1303 for (ab = list_tail(list); ab; ab = ab_prev) { 1304 ab_prev = list_prev(list, ab); 1305 hash_lock = HDR_LOCK(ab); 1306 if (mutex_tryenter(hash_lock)) { 1307 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1308 ASSERT(ab->b_buf == NULL); 1309 arc_change_state(arc_anon, ab, hash_lock); 1310 mutex_exit(hash_lock); 1311 ARCSTAT_BUMP(arcstat_deleted); 1312 bytes_deleted += ab->b_size; 1313 arc_hdr_destroy(ab); 1314 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1315 if (bytes >= 0 && bytes_deleted >= bytes) 1316 break; 1317 } else { 1318 if (bytes < 0) { 1319 mutex_exit(&state->arcs_mtx); 1320 mutex_enter(hash_lock); 1321 mutex_exit(hash_lock); 1322 goto top; 1323 } 1324 bufs_skipped += 1; 1325 } 1326 } 1327 mutex_exit(&state->arcs_mtx); 1328 1329 if (list == &state->arcs_list[ARC_BUFC_DATA] && 1330 (bytes < 0 || bytes_deleted < bytes)) { 1331 list = &state->arcs_list[ARC_BUFC_METADATA]; 1332 goto top; 1333 } 1334 1335 if (bufs_skipped) { 1336 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1337 ASSERT(bytes >= 0); 1338 } 1339 1340 if (bytes_deleted < bytes) 1341 dprintf("only deleted %lld bytes from %p", 1342 (longlong_t)bytes_deleted, state); 1343 } 1344 1345 static void 1346 arc_adjust(void) 1347 { 1348 int64_t top_sz, mru_over, arc_over, todelete; 1349 1350 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1351 1352 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 1353 int64_t toevict = 1354 MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); 1355 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_DATA); 1356 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1357 } 1358 1359 if (top_sz > arc_p && 
arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1360 int64_t toevict = 1361 MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); 1362 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_METADATA); 1363 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1364 } 1365 1366 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1367 1368 if (mru_over > 0) { 1369 if (arc_mru_ghost->arcs_size > 0) { 1370 todelete = MIN(arc_mru_ghost->arcs_size, mru_over); 1371 arc_evict_ghost(arc_mru_ghost, todelete); 1372 } 1373 } 1374 1375 if ((arc_over = arc_size - arc_c) > 0) { 1376 int64_t tbl_over; 1377 1378 if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 1379 int64_t toevict = 1380 MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); 1381 (void) arc_evict(arc_mfu, toevict, FALSE, 1382 ARC_BUFC_DATA); 1383 arc_over = arc_size - arc_c; 1384 } 1385 1386 if (arc_over > 0 && 1387 arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1388 int64_t toevict = 1389 MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], 1390 arc_over); 1391 (void) arc_evict(arc_mfu, toevict, FALSE, 1392 ARC_BUFC_METADATA); 1393 } 1394 1395 tbl_over = arc_size + arc_mru_ghost->arcs_size + 1396 arc_mfu_ghost->arcs_size - arc_c * 2; 1397 1398 if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { 1399 todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); 1400 arc_evict_ghost(arc_mfu_ghost, todelete); 1401 } 1402 } 1403 } 1404 1405 static void 1406 arc_do_user_evicts(void) 1407 { 1408 mutex_enter(&arc_eviction_mtx); 1409 while (arc_eviction_list != NULL) { 1410 arc_buf_t *buf = arc_eviction_list; 1411 arc_eviction_list = buf->b_next; 1412 buf->b_hdr = NULL; 1413 mutex_exit(&arc_eviction_mtx); 1414 1415 if (buf->b_efunc != NULL) 1416 VERIFY(buf->b_efunc(buf) == 0); 1417 1418 buf->b_efunc = NULL; 1419 buf->b_private = NULL; 1420 kmem_cache_free(buf_cache, buf); 1421 mutex_enter(&arc_eviction_mtx); 1422 } 1423 mutex_exit(&arc_eviction_mtx); 1424 } 1425 1426 /* 1427 * Flush all *evictable* data from the cache. 
1428 * NOTE: this will not touch "active" (i.e. referenced) data. 1429 */ 1430 void 1431 arc_flush(void) 1432 { 1433 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) 1434 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_DATA); 1435 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) 1436 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_METADATA); 1437 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) 1438 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_DATA); 1439 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) 1440 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_METADATA); 1441 1442 arc_evict_ghost(arc_mru_ghost, -1); 1443 arc_evict_ghost(arc_mfu_ghost, -1); 1444 1445 mutex_enter(&arc_reclaim_thr_lock); 1446 arc_do_user_evicts(); 1447 mutex_exit(&arc_reclaim_thr_lock); 1448 ASSERT(arc_eviction_list == NULL); 1449 } 1450 1451 int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1452 1453 void 1454 arc_shrink(void) 1455 { 1456 if (arc_c > arc_c_min) { 1457 uint64_t to_free; 1458 1459 #ifdef _KERNEL 1460 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 1461 #else 1462 to_free = arc_c >> arc_shrink_shift; 1463 #endif 1464 if (arc_c > arc_c_min + to_free) 1465 atomic_add_64(&arc_c, -to_free); 1466 else 1467 arc_c = arc_c_min; 1468 1469 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1470 if (arc_c > arc_size) 1471 arc_c = MAX(arc_size, arc_c_min); 1472 if (arc_p > arc_c) 1473 arc_p = (arc_c >> 1); 1474 ASSERT(arc_c >= arc_c_min); 1475 ASSERT((int64_t)arc_p >= 0); 1476 } 1477 1478 if (arc_size > arc_c) 1479 arc_adjust(); 1480 } 1481 1482 static int 1483 arc_reclaim_needed(void) 1484 { 1485 uint64_t extra; 1486 1487 #ifdef _KERNEL 1488 1489 if (needfree) 1490 return (1); 1491 1492 /* 1493 * take 'desfree' extra pages, so we reclaim sooner, rather than later 1494 */ 1495 extra = desfree; 1496 1497 /* 1498 * check that we're out of range of the pageout scanner. 
It starts to 1499 * schedule paging if freemem is less than lotsfree and needfree. 1500 * lotsfree is the high-water mark for pageout, and needfree is the 1501 * number of needed free pages. We add extra pages here to make sure 1502 * the scanner doesn't start up while we're freeing memory. 1503 */ 1504 if (freemem < lotsfree + needfree + extra) 1505 return (1); 1506 1507 /* 1508 * check to make sure that swapfs has enough space so that anon 1509 * reservations can still succeeed. anon_resvmem() checks that the 1510 * availrmem is greater than swapfs_minfree, and the number of reserved 1511 * swap pages. We also add a bit of extra here just to prevent 1512 * circumstances from getting really dire. 1513 */ 1514 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1515 return (1); 1516 1517 #if defined(__i386) 1518 /* 1519 * If we're on an i386 platform, it's possible that we'll exhaust the 1520 * kernel heap space before we ever run out of available physical 1521 * memory. Most checks of the size of the heap_area compare against 1522 * tune.t_minarmem, which is the minimum available real memory that we 1523 * can have in the system. However, this is generally fixed at 25 pages 1524 * which is so low that it's useless. In this comparison, we seek to 1525 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1526 * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is 1527 * free) 1528 */ 1529 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1530 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1531 return (1); 1532 #endif 1533 1534 #else 1535 if (spa_get_random(100) == 0) 1536 return (1); 1537 #endif 1538 return (0); 1539 } 1540 1541 static void 1542 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1543 { 1544 size_t i; 1545 kmem_cache_t *prev_cache = NULL; 1546 kmem_cache_t *prev_data_cache = NULL; 1547 extern kmem_cache_t *zio_buf_cache[]; 1548 extern kmem_cache_t *zio_data_buf_cache[]; 1549 1550 #ifdef _KERNEL 1551 if (arc_meta_used >= arc_meta_limit) { 1552 /* 1553 * We are exceeding our meta-data cache limit. 1554 * Purge some DNLC entries to release holds on meta-data. 1555 */ 1556 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1557 } 1558 #if defined(__i386) 1559 /* 1560 * Reclaim unused memory from all kmem caches. 1561 */ 1562 kmem_reap(); 1563 #endif 1564 #endif 1565 1566 /* 1567 * An agressive reclamation will shrink the cache size as well as 1568 * reap free buffers from the arc kmem caches. 
1569 */ 1570 if (strat == ARC_RECLAIM_AGGR) 1571 arc_shrink(); 1572 1573 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1574 if (zio_buf_cache[i] != prev_cache) { 1575 prev_cache = zio_buf_cache[i]; 1576 kmem_cache_reap_now(zio_buf_cache[i]); 1577 } 1578 if (zio_data_buf_cache[i] != prev_data_cache) { 1579 prev_data_cache = zio_data_buf_cache[i]; 1580 kmem_cache_reap_now(zio_data_buf_cache[i]); 1581 } 1582 } 1583 kmem_cache_reap_now(buf_cache); 1584 kmem_cache_reap_now(hdr_cache); 1585 } 1586 1587 static void 1588 arc_reclaim_thread(void) 1589 { 1590 clock_t growtime = 0; 1591 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1592 callb_cpr_t cpr; 1593 1594 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1595 1596 mutex_enter(&arc_reclaim_thr_lock); 1597 while (arc_thread_exit == 0) { 1598 if (arc_reclaim_needed()) { 1599 1600 if (arc_no_grow) { 1601 if (last_reclaim == ARC_RECLAIM_CONS) { 1602 last_reclaim = ARC_RECLAIM_AGGR; 1603 } else { 1604 last_reclaim = ARC_RECLAIM_CONS; 1605 } 1606 } else { 1607 arc_no_grow = TRUE; 1608 last_reclaim = ARC_RECLAIM_AGGR; 1609 membar_producer(); 1610 } 1611 1612 /* reset the growth delay for every reclaim */ 1613 growtime = lbolt + (arc_grow_retry * hz); 1614 1615 arc_kmem_reap_now(last_reclaim); 1616 1617 } else if (arc_no_grow && lbolt >= growtime) { 1618 arc_no_grow = FALSE; 1619 } 1620 1621 if (2 * arc_c < arc_size + 1622 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) 1623 arc_adjust(); 1624 1625 if (arc_eviction_list != NULL) 1626 arc_do_user_evicts(); 1627 1628 /* block until needed, or one second, whichever is shorter */ 1629 CALLB_CPR_SAFE_BEGIN(&cpr); 1630 (void) cv_timedwait(&arc_reclaim_thr_cv, 1631 &arc_reclaim_thr_lock, (lbolt + hz)); 1632 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1633 } 1634 1635 arc_thread_exit = 0; 1636 cv_broadcast(&arc_reclaim_thr_cv); 1637 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1638 thread_exit(); 1639 } 1640 1641 /* 
1642 * Adapt arc info given the number of bytes we are trying to add and 1643 * the state that we are comming from. This function is only called 1644 * when we are adding new content to the cache. 1645 */ 1646 static void 1647 arc_adapt(int bytes, arc_state_t *state) 1648 { 1649 int mult; 1650 1651 ASSERT(bytes > 0); 1652 /* 1653 * Adapt the target size of the MRU list: 1654 * - if we just hit in the MRU ghost list, then increase 1655 * the target size of the MRU list. 1656 * - if we just hit in the MFU ghost list, then increase 1657 * the target size of the MFU list by decreasing the 1658 * target size of the MRU list. 1659 */ 1660 if (state == arc_mru_ghost) { 1661 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1662 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1663 1664 arc_p = MIN(arc_c, arc_p + bytes * mult); 1665 } else if (state == arc_mfu_ghost) { 1666 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1667 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1668 1669 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1670 } 1671 ASSERT((int64_t)arc_p >= 0); 1672 1673 if (arc_reclaim_needed()) { 1674 cv_signal(&arc_reclaim_thr_cv); 1675 return; 1676 } 1677 1678 if (arc_no_grow) 1679 return; 1680 1681 if (arc_c >= arc_c_max) 1682 return; 1683 1684 /* 1685 * If we're within (2 * maxblocksize) bytes of the target 1686 * cache size, increment the target cache size 1687 */ 1688 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1689 atomic_add_64(&arc_c, (int64_t)bytes); 1690 if (arc_c > arc_c_max) 1691 arc_c = arc_c_max; 1692 else if (state == arc_anon) 1693 atomic_add_64(&arc_p, (int64_t)bytes); 1694 if (arc_p > arc_c) 1695 arc_p = arc_c; 1696 } 1697 ASSERT((int64_t)arc_p >= 0); 1698 } 1699 1700 /* 1701 * Check if the cache has reached its limits and eviction is required 1702 * prior to insert. 
1703 */ 1704 static int 1705 arc_evict_needed(arc_buf_contents_t type) 1706 { 1707 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 1708 return (1); 1709 1710 #ifdef _KERNEL 1711 /* 1712 * If zio data pages are being allocated out of a separate heap segment, 1713 * then enforce that the size of available vmem for this area remains 1714 * above about 1/32nd free. 1715 */ 1716 if (type == ARC_BUFC_DATA && zio_arena != NULL && 1717 vmem_size(zio_arena, VMEM_FREE) < 1718 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 1719 return (1); 1720 #endif 1721 1722 if (arc_reclaim_needed()) 1723 return (1); 1724 1725 return (arc_size > arc_c); 1726 } 1727 1728 /* 1729 * The buffer, supplied as the first argument, needs a data block. 1730 * So, if we are at cache max, determine which cache should be victimized. 1731 * We have the following cases: 1732 * 1733 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1734 * In this situation if we're out of space, but the resident size of the MFU is 1735 * under the limit, victimize the MFU cache to satisfy this insertion request. 1736 * 1737 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 1738 * Here, we've used up all of the available space for the MRU, so we need to 1739 * evict from our own cache instead. Evict from the set of resident MRU 1740 * entries. 1741 * 1742 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 1743 * c minus p represents the MFU space in the cache, since p is the size of the 1744 * cache that is dedicated to the MRU. In this situation there's still space on 1745 * the MFU side, so the MRU side needs to be victimized. 1746 * 1747 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 1748 * MFU's resident set is consuming more space than it has been allotted. In 1749 * this situation, we must victimize our own cache, the MFU, for this insertion. 
1750 */ 1751 static void 1752 arc_get_data_buf(arc_buf_t *buf) 1753 { 1754 arc_state_t *state = buf->b_hdr->b_state; 1755 uint64_t size = buf->b_hdr->b_size; 1756 arc_buf_contents_t type = buf->b_hdr->b_type; 1757 1758 arc_adapt(size, state); 1759 1760 /* 1761 * We have not yet reached cache maximum size, 1762 * just allocate a new buffer. 1763 */ 1764 if (!arc_evict_needed(type)) { 1765 if (type == ARC_BUFC_METADATA) { 1766 buf->b_data = zio_buf_alloc(size); 1767 arc_space_consume(size); 1768 } else { 1769 ASSERT(type == ARC_BUFC_DATA); 1770 buf->b_data = zio_data_buf_alloc(size); 1771 atomic_add_64(&arc_size, size); 1772 } 1773 goto out; 1774 } 1775 1776 /* 1777 * If we are prefetching from the mfu ghost list, this buffer 1778 * will end up on the mru list; so steal space from there. 1779 */ 1780 if (state == arc_mfu_ghost) 1781 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 1782 else if (state == arc_mru_ghost) 1783 state = arc_mru; 1784 1785 if (state == arc_mru || state == arc_anon) { 1786 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 1787 state = (arc_mfu->arcs_lsize[type] > 0 && 1788 arc_p > mru_used) ? arc_mfu : arc_mru; 1789 } else { 1790 /* MFU cases */ 1791 uint64_t mfu_space = arc_c - arc_p; 1792 state = (arc_mru->arcs_lsize[type] > 0 && 1793 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 1794 } 1795 if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { 1796 if (type == ARC_BUFC_METADATA) { 1797 buf->b_data = zio_buf_alloc(size); 1798 arc_space_consume(size); 1799 } else { 1800 ASSERT(type == ARC_BUFC_DATA); 1801 buf->b_data = zio_data_buf_alloc(size); 1802 atomic_add_64(&arc_size, size); 1803 } 1804 ARCSTAT_BUMP(arcstat_recycle_miss); 1805 } 1806 ASSERT(buf->b_data != NULL); 1807 out: 1808 /* 1809 * Update the state size. Note that ghost states have a 1810 * "ghost size" and so don't need to be updated. 
1811 */ 1812 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1813 arc_buf_hdr_t *hdr = buf->b_hdr; 1814 1815 atomic_add_64(&hdr->b_state->arcs_size, size); 1816 if (list_link_active(&hdr->b_arc_node)) { 1817 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1818 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 1819 } 1820 /* 1821 * If we are growing the cache, and we are adding anonymous 1822 * data, and we have outgrown arc_p, update arc_p 1823 */ 1824 if (arc_size < arc_c && hdr->b_state == arc_anon && 1825 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 1826 arc_p = MIN(arc_c, arc_p + size); 1827 } 1828 } 1829 1830 /* 1831 * This routine is called whenever a buffer is accessed. 1832 * NOTE: the hash lock is dropped in this function. 1833 */ 1834 static void 1835 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1836 { 1837 ASSERT(MUTEX_HELD(hash_lock)); 1838 1839 if (buf->b_state == arc_anon) { 1840 /* 1841 * This buffer is not in the cache, and does not 1842 * appear in our "ghost" list. Add the new buffer 1843 * to the MRU state. 1844 */ 1845 1846 ASSERT(buf->b_arc_access == 0); 1847 buf->b_arc_access = lbolt; 1848 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1849 arc_change_state(arc_mru, buf, hash_lock); 1850 1851 } else if (buf->b_state == arc_mru) { 1852 /* 1853 * If this buffer is here because of a prefetch, then either: 1854 * - clear the flag if this is a "referencing" read 1855 * (any subsequent access will bump this into the MFU state). 1856 * or 1857 * - move the buffer to the head of the list if this is 1858 * another prefetch (to make it less likely to be evicted). 
1859 */ 1860 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1861 if (refcount_count(&buf->b_refcnt) == 0) { 1862 ASSERT(list_link_active(&buf->b_arc_node)); 1863 } else { 1864 buf->b_flags &= ~ARC_PREFETCH; 1865 ARCSTAT_BUMP(arcstat_mru_hits); 1866 } 1867 buf->b_arc_access = lbolt; 1868 return; 1869 } 1870 1871 /* 1872 * This buffer has been "accessed" only once so far, 1873 * but it is still in the cache. Move it to the MFU 1874 * state. 1875 */ 1876 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1877 /* 1878 * More than 125ms have passed since we 1879 * instantiated this buffer. Move it to the 1880 * most frequently used state. 1881 */ 1882 buf->b_arc_access = lbolt; 1883 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1884 arc_change_state(arc_mfu, buf, hash_lock); 1885 } 1886 ARCSTAT_BUMP(arcstat_mru_hits); 1887 } else if (buf->b_state == arc_mru_ghost) { 1888 arc_state_t *new_state; 1889 /* 1890 * This buffer has been "accessed" recently, but 1891 * was evicted from the cache. Move it to the 1892 * MFU state. 1893 */ 1894 1895 if (buf->b_flags & ARC_PREFETCH) { 1896 new_state = arc_mru; 1897 if (refcount_count(&buf->b_refcnt) > 0) 1898 buf->b_flags &= ~ARC_PREFETCH; 1899 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1900 } else { 1901 new_state = arc_mfu; 1902 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1903 } 1904 1905 buf->b_arc_access = lbolt; 1906 arc_change_state(new_state, buf, hash_lock); 1907 1908 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 1909 } else if (buf->b_state == arc_mfu) { 1910 /* 1911 * This buffer has been accessed more than once and is 1912 * still in the cache. Keep it in the MFU state. 1913 * 1914 * NOTE: an add_reference() that occurred when we did 1915 * the arc_read() will have kicked this off the list. 1916 * If it was a prefetch, we will explicitly move it to 1917 * the head of the list now. 
1918 */ 1919 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1920 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1921 ASSERT(list_link_active(&buf->b_arc_node)); 1922 } 1923 ARCSTAT_BUMP(arcstat_mfu_hits); 1924 buf->b_arc_access = lbolt; 1925 } else if (buf->b_state == arc_mfu_ghost) { 1926 arc_state_t *new_state = arc_mfu; 1927 /* 1928 * This buffer has been accessed more than once but has 1929 * been evicted from the cache. Move it back to the 1930 * MFU state. 1931 */ 1932 1933 if (buf->b_flags & ARC_PREFETCH) { 1934 /* 1935 * This is a prefetch access... 1936 * move this block back to the MRU state. 1937 */ 1938 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1939 new_state = arc_mru; 1940 } 1941 1942 buf->b_arc_access = lbolt; 1943 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1944 arc_change_state(new_state, buf, hash_lock); 1945 1946 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1947 } else { 1948 ASSERT(!"invalid arc state"); 1949 } 1950 } 1951 1952 /* a generic arc_done_func_t which you can use */ 1953 /* ARGSUSED */ 1954 void 1955 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1956 { 1957 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1958 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1959 } 1960 1961 /* a generic arc_done_func_t */ 1962 void 1963 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1964 { 1965 arc_buf_t **bufp = arg; 1966 if (zio && zio->io_error) { 1967 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1968 *bufp = NULL; 1969 } else { 1970 *bufp = buf; 1971 } 1972 } 1973 1974 static void 1975 arc_read_done(zio_t *zio) 1976 { 1977 arc_buf_hdr_t *hdr, *found; 1978 arc_buf_t *buf; 1979 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1980 kmutex_t *hash_lock; 1981 arc_callback_t *callback_list, *acb; 1982 int freeable = FALSE; 1983 1984 buf = zio->io_private; 1985 hdr = buf->b_hdr; 1986 1987 /* 1988 * The hdr was inserted into hash-table and removed from lists 1989 * prior to starting I/O. 
We should find this header, since 1990 * it's in the hash table, and it should be legit since it's 1991 * not possible to evict it during the I/O. The only possible 1992 * reason for it not to be found is if we were freed during the 1993 * read. 1994 */ 1995 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1996 &hash_lock); 1997 1998 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1999 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 2000 2001 /* byteswap if necessary */ 2002 callback_list = hdr->b_acb; 2003 ASSERT(callback_list != NULL); 2004 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 2005 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 2006 2007 arc_cksum_compute(buf); 2008 2009 /* create copies of the data buffer for the callers */ 2010 abuf = buf; 2011 for (acb = callback_list; acb; acb = acb->acb_next) { 2012 if (acb->acb_done) { 2013 if (abuf == NULL) 2014 abuf = arc_buf_clone(buf); 2015 acb->acb_buf = abuf; 2016 abuf = NULL; 2017 } 2018 } 2019 hdr->b_acb = NULL; 2020 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2021 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2022 if (abuf == buf) 2023 hdr->b_flags |= ARC_BUF_AVAILABLE; 2024 2025 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2026 2027 if (zio->io_error != 0) { 2028 hdr->b_flags |= ARC_IO_ERROR; 2029 if (hdr->b_state != arc_anon) 2030 arc_change_state(arc_anon, hdr, hash_lock); 2031 if (HDR_IN_HASH_TABLE(hdr)) 2032 buf_hash_remove(hdr); 2033 freeable = refcount_is_zero(&hdr->b_refcnt); 2034 /* convert checksum errors into IO errors */ 2035 if (zio->io_error == ECKSUM) 2036 zio->io_error = EIO; 2037 } 2038 2039 /* 2040 * Broadcast before we drop the hash_lock to avoid the possibility 2041 * that the hdr (and hence the cv) might be freed before we get to 2042 * the cv_broadcast(). 2043 */ 2044 cv_broadcast(&hdr->b_cv); 2045 2046 if (hash_lock) { 2047 /* 2048 * Only call arc_access on anonymous buffers. 
This is because 2049 * if we've issued an I/O for an evicted buffer, we've already 2050 * called arc_access (to prevent any simultaneous readers from 2051 * getting confused). 2052 */ 2053 if (zio->io_error == 0 && hdr->b_state == arc_anon) 2054 arc_access(hdr, hash_lock); 2055 mutex_exit(hash_lock); 2056 } else { 2057 /* 2058 * This block was freed while we waited for the read to 2059 * complete. It has been removed from the hash table and 2060 * moved to the anonymous state (so that it won't show up 2061 * in the cache). 2062 */ 2063 ASSERT3P(hdr->b_state, ==, arc_anon); 2064 freeable = refcount_is_zero(&hdr->b_refcnt); 2065 } 2066 2067 /* execute each callback and free its structure */ 2068 while ((acb = callback_list) != NULL) { 2069 if (acb->acb_done) 2070 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2071 2072 if (acb->acb_zio_dummy != NULL) { 2073 acb->acb_zio_dummy->io_error = zio->io_error; 2074 zio_nowait(acb->acb_zio_dummy); 2075 } 2076 2077 callback_list = acb->acb_next; 2078 kmem_free(acb, sizeof (arc_callback_t)); 2079 } 2080 2081 if (freeable) 2082 arc_hdr_destroy(hdr); 2083 } 2084 2085 /* 2086 * "Read" the block block at the specified DVA (in bp) via the 2087 * cache. If the block is found in the cache, invoke the provided 2088 * callback immediately and return. Note that the `zio' parameter 2089 * in the callback will be NULL in this case, since no IO was 2090 * required. If the block is not in the cache pass the read request 2091 * on to the spa with a substitute callback function, so that the 2092 * requested block will be added to the cache. 2093 * 2094 * If a read request arrives for a block that has a read in-progress, 2095 * either wait for the in-progress read to complete (and return the 2096 * results); or, if this is a read with a "done" func, add a record 2097 * to the read to invoke the "done" func when the read completes, 2098 * and return; or just return. 
 *
 * arc_read_done() will invoke all the requested "done" functions
 * for readers of this block.
 *
 * ARC_CACHED is set in *arc_flags whenever a header with attached data
 * is found.  The return value is 0 on every path except a miss issued
 * with ARC_WAIT set, where the result of zio_wait() on the read zio is
 * returned instead.
 */
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t *arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	kmutex_t *hash_lock;
	zio_t *rzio;

top:
	/*
	 * Look the block up by identity and birth txg.  When a header is
	 * found, the hash_lock handed back here is released with
	 * mutex_exit() on each of the paths below.
	 */
	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (hdr && hdr->b_datacnt > 0) {

		*arc_flags |= ARC_CACHED;

		if (HDR_IO_IN_PROGRESS(hdr)) {

			if (*arc_flags & ARC_WAIT) {
				/*
				 * Sleep on the header's cv, then drop the
				 * lock and redo the lookup from scratch.
				 */
				cv_wait(&hdr->b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			ASSERT(*arc_flags & ARC_NOWAIT);

			if (done) {
				arc_callback_t *acb = NULL;

				/*
				 * Queue a callback record at the head of the
				 * header's b_acb list and take a reference
				 * tagged with `private' before returning.
				 */
				acb = kmem_zalloc(sizeof (arc_callback_t),
				    KM_SLEEP);
				acb->acb_done = done;
				acb->acb_private = private;
				acb->acb_byteswap = swap;
				if (pio != NULL)
					acb->acb_zio_dummy = zio_null(pio,
					    spa, NULL, NULL, flags);

				ASSERT(acb->acb_done != NULL);
				acb->acb_next = hdr->b_acb;
				hdr->b_acb = acb;
				add_reference(hdr, hash_lock, private);
				mutex_exit(hash_lock);
				return (0);
			}
			/* no callback requested: nothing to queue */
			mutex_exit(hash_lock);
			return (0);
		}

		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);

		/*
		 * Note: `buf' is assigned only inside the `done' branch
		 * below and is consumed only by the done() call at the end
		 * of this hit path.
		 */
		if (done) {
			add_reference(hdr, hash_lock, private);
			/*
			 * If this block is already in use, create a new
			 * copy of the data so that we will be guaranteed
			 * that arc_release() will always succeed.
			 */
			buf = hdr->b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			if (HDR_BUF_AVAILABLE(hdr)) {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
			} else {
				buf = arc_buf_clone(buf);
			}
		} else if (*arc_flags & ARC_PREFETCH &&
		    refcount_count(&hdr->b_refcnt) == 0) {
			/* prefetch of a header nobody holds: just tag it */
			hdr->b_flags |= ARC_PREFETCH;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		ARCSTAT_BUMP(arcstat_hits);
		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
		    data, metadata, hits);

		/* cache hit: no zio was needed, so the callback gets NULL */
		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists;
			arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp);
			buf = arc_buf_alloc(spa, size, private, type);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				/*
				 * Wipe the identity we just filled in and
				 * give back our buffer before retrying.
				 */
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				(void) arc_buf_remove_ref(buf, private);
				goto top; /* restart the IO request */
			}
			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH) {
				(void) remove_reference(hdr, hash_lock,
				    private);
				hdr->b_flags |= ARC_PREFETCH;
			}
			if (BP_GET_LEVEL(bp) > 0)
				hdr->b_flags |= ARC_INDIRECT;
		} else {
			/* this block is in the ghost cache */
			ASSERT(GHOST_STATE(hdr->b_state));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
			ASSERT(hdr->b_buf == NULL);

			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH)
				hdr->b_flags |= ARC_PREFETCH;
			else
				add_reference(hdr, hash_lock, private);
			/*
			 * The ghost header has no buffer: build one by hand,
			 * hang it off the header and obtain data space for it.
			 */
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_hdr = hdr;
			buf->b_data = NULL;
			buf->b_efunc = NULL;
			buf->b_private = NULL;
			buf->b_next = NULL;
			hdr->b_buf = buf;
			arc_get_data_buf(buf);
			ASSERT(hdr->b_datacnt == 0);
			hdr->b_datacnt = 1;

		}

		/*
		 * Record the caller's callback as the sole entry on b_acb
		 * and mark the header busy; this record carries no
		 * acb_zio_dummy (it stays zeroed from kmem_zalloc()).
		 */
		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present state
		 * before issuing the I/O. Once we drop the hash-table lock,
		 * the header will be marked as I/O in progress and have an
		 * attached buffer. At this point, anybody who finds this
		 * buffer ought to notice that it's legit but has a pending I/O.
		 */

		if (GHOST_STATE(hdr->b_state))
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
		    zbookmark_t *, zb);
		ARCSTAT_BUMP(arcstat_misses);
		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
		    data, metadata, misses);

		/* read into the new buffer; arc_read_done() gets `buf' */
		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags, zb);

		if (*arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(*arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}

/*
 * arc_read() variant to support pool traversal. If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2283 * The idea is that we don't want pool traversal filling up memory, but 2284 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2285 */ 2286 int 2287 arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2288 { 2289 arc_buf_hdr_t *hdr; 2290 kmutex_t *hash_mtx; 2291 int rc = 0; 2292 2293 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2294 2295 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2296 arc_buf_t *buf = hdr->b_buf; 2297 2298 ASSERT(buf); 2299 while (buf->b_data == NULL) { 2300 buf = buf->b_next; 2301 ASSERT(buf); 2302 } 2303 bcopy(buf->b_data, data, hdr->b_size); 2304 } else { 2305 rc = ENOENT; 2306 } 2307 2308 if (hash_mtx) 2309 mutex_exit(hash_mtx); 2310 2311 return (rc); 2312 } 2313 2314 void 2315 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2316 { 2317 ASSERT(buf->b_hdr != NULL); 2318 ASSERT(buf->b_hdr->b_state != arc_anon); 2319 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2320 buf->b_efunc = func; 2321 buf->b_private = private; 2322 } 2323 2324 /* 2325 * This is used by the DMU to let the ARC know that a buffer is 2326 * being evicted, so the ARC should clean up. If this arc buf 2327 * is not yet in the evicted state, it will be put there. 2328 */ 2329 int 2330 arc_buf_evict(arc_buf_t *buf) 2331 { 2332 arc_buf_hdr_t *hdr; 2333 kmutex_t *hash_lock; 2334 arc_buf_t **bufp; 2335 2336 mutex_enter(&arc_eviction_mtx); 2337 hdr = buf->b_hdr; 2338 if (hdr == NULL) { 2339 /* 2340 * We are in arc_do_user_evicts(). 2341 */ 2342 ASSERT(buf->b_data == NULL); 2343 mutex_exit(&arc_eviction_mtx); 2344 return (0); 2345 } 2346 hash_lock = HDR_LOCK(hdr); 2347 mutex_exit(&arc_eviction_mtx); 2348 2349 mutex_enter(hash_lock); 2350 2351 if (buf->b_data == NULL) { 2352 /* 2353 * We are on the eviction list. 2354 */ 2355 mutex_exit(hash_lock); 2356 mutex_enter(&arc_eviction_mtx); 2357 if (buf->b_hdr == NULL) { 2358 /* 2359 * We are already in arc_do_user_evicts(). 
2360 */ 2361 mutex_exit(&arc_eviction_mtx); 2362 return (0); 2363 } else { 2364 arc_buf_t copy = *buf; /* structure assignment */ 2365 /* 2366 * Process this buffer now 2367 * but let arc_do_user_evicts() do the reaping. 2368 */ 2369 buf->b_efunc = NULL; 2370 mutex_exit(&arc_eviction_mtx); 2371 VERIFY(copy.b_efunc(©) == 0); 2372 return (1); 2373 } 2374 } 2375 2376 ASSERT(buf->b_hdr == hdr); 2377 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2378 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2379 2380 /* 2381 * Pull this buffer off of the hdr 2382 */ 2383 bufp = &hdr->b_buf; 2384 while (*bufp != buf) 2385 bufp = &(*bufp)->b_next; 2386 *bufp = buf->b_next; 2387 2388 ASSERT(buf->b_data != NULL); 2389 arc_buf_destroy(buf, FALSE, FALSE); 2390 2391 if (hdr->b_datacnt == 0) { 2392 arc_state_t *old_state = hdr->b_state; 2393 arc_state_t *evicted_state; 2394 2395 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2396 2397 evicted_state = 2398 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2399 2400 mutex_enter(&old_state->arcs_mtx); 2401 mutex_enter(&evicted_state->arcs_mtx); 2402 2403 arc_change_state(evicted_state, hdr, hash_lock); 2404 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2405 hdr->b_flags = ARC_IN_HASH_TABLE; 2406 2407 mutex_exit(&evicted_state->arcs_mtx); 2408 mutex_exit(&old_state->arcs_mtx); 2409 } 2410 mutex_exit(hash_lock); 2411 2412 VERIFY(buf->b_efunc(buf) == 0); 2413 buf->b_efunc = NULL; 2414 buf->b_private = NULL; 2415 buf->b_hdr = NULL; 2416 kmem_cache_free(buf_cache, buf); 2417 return (1); 2418 } 2419 2420 /* 2421 * Release this buffer from the cache. This must be done 2422 * after a read and prior to modifying the buffer contents. 2423 * If the buffer has more than one reference, we must make 2424 * make a new hdr for the buffer. 
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc_anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		ASSERT(buf->b_efunc == NULL);
		/* hash_lock is never taken on this path */
		arc_buf_thaw(buf);
		return;
	}

	mutex_enter(hash_lock);

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_buf != buf || buf->b_next != NULL) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		/* snapshot what the new header needs while the lock is held */
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;
		arc_buf_contents_t type = hdr->b_type;

		ASSERT(hdr->b_datacnt > 1);
		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		/* unlink buf from the header's singly linked b_buf list */
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = (*bufp)->b_next;
		buf->b_next = NULL;

		/*
		 * Take this buffer's bytes out of the old state's size, and
		 * out of the per-type lsize as well once the old header has
		 * no references left.
		 */
		ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type];
			ASSERT3U(*size, >=, hdr->b_size);
			atomic_add_64(size, -hdr->b_size);
		}
		hdr->b_datacnt -= 1;
		arc_cksum_verify(buf);

		mutex_exit(hash_lock);

		/*
		 * Build a fresh anonymous header that owns only this buf
		 * and holds a single reference on behalf of `tag'.
		 */
		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_type = type;
		nhdr->b_buf = buf;
		nhdr->b_state = arc_anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		nhdr->b_datacnt = 1;
		nhdr->b_freeze_cksum = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		buf->b_hdr = nhdr;
		atomic_add_64(&arc_anon->arcs_size, blksz);

		hdr = nhdr;
	} else {
		/*
		 * Sole buffer on the header: move the existing header to
		 * arc_anon and wipe its identity (dva, birth, cksum0).
		 */
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc_anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
		arc_buf_thaw(buf);
	}
	/* a released buffer carries no eviction callback */
	buf->b_efunc = NULL;
	buf->b_private = NULL;
}

/*
 * Returns nonzero when buf has data attached and its header is in the
 * arc_anon state.
 */
int
arc_released(arc_buf_t *buf)
{
	return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon);
}

/*
 * Returns nonzero when an eviction callback (b_efunc) is set on buf.
 */
int
arc_has_callback(arc_buf_t *buf)
{
	return (buf->b_efunc != NULL);
}

#ifdef ZFS_DEBUG
/*
 * Debug builds only: returns the reference count of buf's header.
 */
int
arc_referenced(arc_buf_t *buf)
{
	return (refcount_count(&buf->b_hdr->b_refcnt));
}
#endif

/*
 * zio callback handed to zio_write() by arc_write(), together with
 * arc_write_done().  Runs the caller's awcb_ready callback (when the zio
 * has no error), then computes the buffer checksum and marks the header
 * ARC_IO_IN_PROGRESS.
 */
static void
arc_write_ready(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	if (zio->io_error == 0 && callback->awcb_ready) {
		ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt));
		callback->awcb_ready(zio, buf, callback->awcb_private);
	}
	/*
	 * If the IO is already in progress, then this is a re-write
	 * attempt, so we need to thaw and re-compute the cksum. It is
	 * the responsibility of the callback to handle the freeing
	 * and accounting for any re-write attempt. If we don't have a
	 * callback registered then simply free the block here.
	 */
	if (HDR_IO_IN_PROGRESS(hdr)) {
		if (!BP_IS_HOLE(&zio->io_bp_orig) &&
		    callback->awcb_ready == NULL) {
			zio_nowait(zio_free(zio, zio->io_spa, zio->io_txg,
			    &zio->io_bp_orig, NULL, NULL));
		}
		/* discard the saved freeze checksum under b_freeze_lock */
		mutex_enter(&hdr->b_freeze_lock);
		if (hdr->b_freeze_cksum != NULL) {
			kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
			hdr->b_freeze_cksum = NULL;
		}
		mutex_exit(&hdr->b_freeze_lock);
	}
	arc_cksum_compute(buf);
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
}

/*
 * zio callback handed to zio_write() by arc_write().  Copies the written
 * block's identity into the header, inserts the header into the hash
 * table when the block is non-empty, clears ARC_IO_IN_PROGRESS, invokes
 * the caller's awcb_done callback if there is one, and frees the
 * arc_write_callback_t allocated by arc_write().
 */
static void
arc_write_done(zio_t *zio)
{
	arc_write_callback_t *callback = zio->io_private;
	arc_buf_t *buf = callback->awcb_buf;
	arc_buf_hdr_t *hdr = buf->b_hdr;

	hdr->b_acb = NULL;

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc_anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away. In this case no write was performed
	 * so there will be no dva/birth-date/checksum. The buffer
	 * must therefore remain anonymous (and uncached).
	 */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		arc_cksum_verify(buf);

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
			    BP_IDENTITY(zio->io_bp)));
			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
			    zio->io_bp->blk_birth);

			/*
			 * Retire the stale header, then retry the insert,
			 * which is asserted to succeed this time.
			 */
			ASSERT(refcount_is_zero(&exists->b_refcnt));
			arc_change_state(arc_anon, exists, hash_lock);
			mutex_exit(hash_lock);
			arc_hdr_destroy(exists);
			exists = buf_hash_insert(hdr, &hash_lock);
			ASSERT3P(exists, ==, NULL);
		}
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else if (callback->awcb_done == NULL) {
		int destroy_hdr;
		/*
		 * This is an anonymous buffer with no user callback,
		 * destroy it if there are no active references.
		 */
		mutex_enter(&arc_eviction_mtx);
		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	}

	if (callback->awcb_done) {
		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
		callback->awcb_done(zio, buf, callback->awcb_private);
	}

	kmem_free(callback, sizeof (arc_write_callback_t));
}

/*
 * Build a write zio for an anonymous ARC buffer.  The `ready', `done'
 * and `private' arguments are stashed in an arc_write_callback_t that
 * arc_write_ready()/arc_write_done() consume.  The zio returned by
 * zio_write() is handed back to the caller; this function does not call
 * zio_wait() or zio_nowait() on it.
 */
zio_t *
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
    arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority,
    int flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_write_callback_t *callback;
	zio_t *zio;

	/* this is a private buffer - no locking required */
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(BUF_EMPTY(hdr));
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
	ASSERT(hdr->b_acb == 0);
	callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP);
	callback->awcb_ready = ready;
	callback->awcb_done = done;
	callback->awcb_private = private;
	callback->awcb_buf = buf;
	zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
	    buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback,
	    priority, flags, zb);

	return (zio);
}

/*
 * Free the block described by bp, first detaching any header the cache
 * holds for it.  With ARC_WAIT set in arc_flags the result of zio_wait()
 * on the free zio is returned; otherwise the zio is issued with
 * zio_nowait() and 0 is returned.
 */
int
arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, uint32_t arc_flags)
{
	arc_buf_hdr_t *ab;
	kmutex_t *hash_lock;
	zio_t *zio;

	/*
	 * If this buffer is in the cache, release it, so it
	 * can be re-used.
	 */
	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (ab != NULL) {
		/*
		 * The checksum of blocks to free is not always
		 * preserved (eg. on the deadlist). However, if it is
		 * nonzero, it should match what we have in the cache.
		 */
		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
		if (ab->b_state != arc_anon)
			arc_change_state(arc_anon, ab, hash_lock);
		if (HDR_IO_IN_PROGRESS(ab)) {
			/*
			 * This should only happen when we prefetch.
			 */
			ASSERT(ab->b_flags & ARC_PREFETCH);
			ASSERT3U(ab->b_datacnt, ==, 1);
			/* flag the header for the in-flight read, then wipe it */
			ab->b_flags |= ARC_FREED_IN_READ;
			if (HDR_IN_HASH_TABLE(ab))
				buf_hash_remove(ab);
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		} else if (refcount_is_zero(&ab->b_refcnt)) {
			/* nobody holds it: destroy the header outright */
			mutex_exit(hash_lock);
			arc_hdr_destroy(ab);
			ARCSTAT_BUMP(arcstat_deleted);
		} else {
			/*
			 * We still have an active reference on this
			 * buffer. This can happen, e.g., from
			 * dbuf_unoverride().
			 */
			ASSERT(!HDR_IN_HASH_TABLE(ab));
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		}
	}

	zio = zio_free(pio, spa, txg, bp, done, private);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(zio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(zio);

	return (0);
}

/*
 * Give back `tempreserve' bytes previously added to the global
 * arc_tempreserve counter.
 */
void
arc_tempreserve_clear(uint64_t tempreserve)
{
	atomic_add_64(&arc_tempreserve, -tempreserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

/*
 * Try to reserve `tempreserve' bytes against the cache.  Returns 0 and
 * adds the amount to arc_tempreserve on success, ENOMEM when the request
 * exceeds arc_c, or ERESTART when the throttle below trips (and, in
 * ZFS_DEBUG builds, occasionally at random).
 */
int
arc_tempreserve_space(uint64_t tempreserve)
{
#ifdef ZFS_DEBUG
	/*
	 * Once in a while, fail for no reason. Everything should cope.
	 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (ERESTART);
	}
#endif
	/* grow the target size for a large request, capped at arc_c_max */
	if (tempreserve > arc_c/4 && !arc_no_grow)
		arc_c = MIN(arc_c_max, tempreserve * 4);
	if (tempreserve > arc_c)
		return (ENOMEM);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large. We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail. Not a huge deal.
	 *
	 * XXX The limit should be adjusted dynamically to keep the time
	 * to sync a dataset fixed (around 1-5 seconds?).
	 */

	if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 &&
	    arc_tempreserve + arc_anon->arcs_size > arc_c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK "
		    "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n",
		    arc_tempreserve>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10,
		    arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10,
		    tempreserve>>10, arc_c>>10);
		return (ERESTART);
	}
	atomic_add_64(&arc_tempreserve, tempreserve);
	return (0);
}

/*
 * One-time ARC setup: size the cache limits, initialize the state
 * mutexes and lists, the eviction bookkeeping and the "arcstats" kstat,
 * and start the reclaim thread.
 */
void
arc_init(void)
{
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan = 1 * hz;

	/* Start out with 1/8 of all memory */
	arc_c = physmem * PAGESIZE / 8;

#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
	 */
	arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif

	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
	arc_c_min = MAX(arc_c / 4, 64<<20);
	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
	if (arc_c * 8 >= 1<<30)
		arc_c_max = (arc_c * 8) - (1<<30);
	else
		arc_c_max = arc_c_min;
	arc_c_max = MAX(arc_c * 6, arc_c_max);

	/*
	 * Allow the tunables to override our calculations if they are
	 * reasonable (ie. over 64MB)
	 */
	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
		arc_c_max = zfs_arc_max;
	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max)
		arc_c_min = zfs_arc_min;

	/* start the target size at the maximum, split evenly by arc_p */
	arc_c = arc_c_max;
	arc_p = (arc_c >> 1);

	/* limit meta-data to 1/4 of the arc capacity */
	arc_meta_limit = arc_c_max / 4;

	/* Allow the tunable to override if it is reasonable */
	if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max)
		arc_meta_limit = zfs_arc_meta_limit;

	/* unless zfs_arc_min was set, keep the min at half the meta limit */
	if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0)
		arc_c_min = arc_meta_limit / 2;

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc_c = arc_c / 2;
	if (arc_c < arc_c_min)
		arc_c = arc_c_min;

	arc_anon = &ARC_anon;
	arc_mru = &ARC_mru;
	arc_mru_ghost = &ARC_mru_ghost;
	arc_mfu = &ARC_mfu;
	arc_mfu_ghost = &ARC_mfu_ghost;
	arc_size = 0;

	mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);

	/* one metadata and one data list per state; arc_anon gets none */
	list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mru->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
	    sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));

	buf_init();

	arc_thread_exit = 0;
	arc_eviction_list = NULL;
	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));

	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);

	/* the kstat is optional: carry on without it if creation failed */
	if (arc_ksp != NULL) {
		arc_ksp->ks_data = &arc_stats;
		kstat_install(arc_ksp);
	}

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	arc_dead = FALSE;
}

/*
 * Tear down what arc_init() set up: stop the reclaim thread, flush the
 * cache, remove the kstat and destroy the mutexes, cv and lists.
 */
void
arc_fini(void)
{
	/*
	 * Request reclaim-thread exit and wait on arc_reclaim_thr_cv until
	 * arc_thread_exit has been reset to 0.
	 */
	mutex_enter(&arc_reclaim_thr_lock);
	arc_thread_exit = 1;
	while (arc_thread_exit != 0)
		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
	mutex_exit(&arc_reclaim_thr_lock);

	arc_flush();

	arc_dead = TRUE;

	if (arc_ksp != NULL) {
		kstat_delete(arc_ksp);
		arc_ksp = NULL;
	}

	mutex_destroy(&arc_eviction_mtx);
	mutex_destroy(&arc_reclaim_thr_lock);
	cv_destroy(&arc_reclaim_thr_cv);

	list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]);
	list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]);
	list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]);
	list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]);
	list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]);

	mutex_destroy(&arc_anon->arcs_mtx);
	mutex_destroy(&arc_mru->arcs_mtx);
	mutex_destroy(&arc_mru_ghost->arcs_mtx);
	mutex_destroy(&arc_mfu->arcs_mtx);
	mutex_destroy(&arc_mfu_ghost->arcs_mtx);

	buf_fini();
}
