/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 * 88 * buf_hash_find() returns the appropriate mutex (held) when it 89 * locates the requested buffer in the hash table. It returns 90 * NULL for the mutex if the buffer was not in the table. 91 * 92 * buf_hash_remove() expects the appropriate hash mutex to be 93 * already held before it is invoked. 94 * 95 * Each arc state also has a mutex which is used to protect the 96 * buffer list associated with the state. When attempting to 97 * obtain a hash table lock while holding an arc list lock you 98 * must use: mutex_tryenter() to avoid deadlock. Also note that 99 * the active state mutex must be held before the ghost state mutex. 100 * 101 * Arc buffers may have an associated eviction callback function. 102 * This function will be invoked prior to removing the buffer (e.g. 103 * in arc_do_user_evicts()). Note however that the data associated 104 * with the buffer may be evicted prior to the callback. The callback 105 * must be made with *no locks held* (to prevent deadlock). Additionally, 106 * the users of callbacks must ensure that their private data is 107 * protected from simultaneous callbacks from arc_buf_evict() 108 * and arc_do_user_evicts(). 109 * 110 * Note that the majority of the performance stats are manipulated 111 * with atomic operations. 
112 */ 113 114 #include <sys/spa.h> 115 #include <sys/zio.h> 116 #include <sys/zio_checksum.h> 117 #include <sys/zfs_context.h> 118 #include <sys/arc.h> 119 #include <sys/refcount.h> 120 #ifdef _KERNEL 121 #include <sys/vmsystm.h> 122 #include <vm/anon.h> 123 #include <sys/fs/swapnode.h> 124 #include <sys/dnlc.h> 125 #endif 126 #include <sys/callb.h> 127 #include <sys/kstat.h> 128 129 static kmutex_t arc_reclaim_thr_lock; 130 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 131 static uint8_t arc_thread_exit; 132 133 #define ARC_REDUCE_DNLC_PERCENT 3 134 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 135 136 typedef enum arc_reclaim_strategy { 137 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 138 ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 139 } arc_reclaim_strategy_t; 140 141 /* number of seconds before growing cache again */ 142 static int arc_grow_retry = 60; 143 144 /* 145 * minimum lifespan of a prefetch block in clock ticks 146 * (initialized in arc_init()) 147 */ 148 static int arc_min_prefetch_lifespan; 149 150 static int arc_dead; 151 152 /* 153 * These tunables are for performance analysis. 154 */ 155 uint64_t zfs_arc_max; 156 uint64_t zfs_arc_min; 157 uint64_t zfs_arc_meta_limit = 0; 158 159 /* 160 * Note that buffers can be in one of 5 states: 161 * ARC_anon - anonymous (discussed below) 162 * ARC_mru - recently used, currently cached 163 * ARC_mru_ghost - recentely used, no longer in cache 164 * ARC_mfu - frequently used, currently cached 165 * ARC_mfu_ghost - frequently used, no longer in cache 166 * When there are no active references to the buffer, they are 167 * are linked onto a list in one of these arc states. These are 168 * the only buffers that can be evicted or deleted. Within each 169 * state there are multiple lists, one for meta-data and one for 170 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, 171 * etc.) 
is tracked separately so that it can be managed more 172 * explicitly: favored over data, limited explicitely. 173 * 174 * Anonymous buffers are buffers that are not associated with 175 * a DVA. These are buffers that hold dirty block copies 176 * before they are written to stable storage. By definition, 177 * they are "ref'd" and are considered part of arc_mru 178 * that cannot be freed. Generally, they will aquire a DVA 179 * as they are written and migrate onto the arc_mru list. 180 */ 181 182 typedef struct arc_state { 183 list_t arcs_list[ARC_BUFC_NUMTYPES]; /* list of evictable buffers */ 184 uint64_t arcs_lsize[ARC_BUFC_NUMTYPES]; /* amount of evictable data */ 185 uint64_t arcs_size; /* total amount of data in this state */ 186 kmutex_t arcs_mtx; 187 } arc_state_t; 188 189 /* The 5 states: */ 190 static arc_state_t ARC_anon; 191 static arc_state_t ARC_mru; 192 static arc_state_t ARC_mru_ghost; 193 static arc_state_t ARC_mfu; 194 static arc_state_t ARC_mfu_ghost; 195 196 typedef struct arc_stats { 197 kstat_named_t arcstat_hits; 198 kstat_named_t arcstat_misses; 199 kstat_named_t arcstat_demand_data_hits; 200 kstat_named_t arcstat_demand_data_misses; 201 kstat_named_t arcstat_demand_metadata_hits; 202 kstat_named_t arcstat_demand_metadata_misses; 203 kstat_named_t arcstat_prefetch_data_hits; 204 kstat_named_t arcstat_prefetch_data_misses; 205 kstat_named_t arcstat_prefetch_metadata_hits; 206 kstat_named_t arcstat_prefetch_metadata_misses; 207 kstat_named_t arcstat_mru_hits; 208 kstat_named_t arcstat_mru_ghost_hits; 209 kstat_named_t arcstat_mfu_hits; 210 kstat_named_t arcstat_mfu_ghost_hits; 211 kstat_named_t arcstat_deleted; 212 kstat_named_t arcstat_recycle_miss; 213 kstat_named_t arcstat_mutex_miss; 214 kstat_named_t arcstat_evict_skip; 215 kstat_named_t arcstat_hash_elements; 216 kstat_named_t arcstat_hash_elements_max; 217 kstat_named_t arcstat_hash_collisions; 218 kstat_named_t arcstat_hash_chains; 219 kstat_named_t arcstat_hash_chain_max; 220 
kstat_named_t arcstat_p; 221 kstat_named_t arcstat_c; 222 kstat_named_t arcstat_c_min; 223 kstat_named_t arcstat_c_max; 224 kstat_named_t arcstat_size; 225 } arc_stats_t; 226 227 static arc_stats_t arc_stats = { 228 { "hits", KSTAT_DATA_UINT64 }, 229 { "misses", KSTAT_DATA_UINT64 }, 230 { "demand_data_hits", KSTAT_DATA_UINT64 }, 231 { "demand_data_misses", KSTAT_DATA_UINT64 }, 232 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 233 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 234 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 235 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 236 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 237 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 238 { "mru_hits", KSTAT_DATA_UINT64 }, 239 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 240 { "mfu_hits", KSTAT_DATA_UINT64 }, 241 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 242 { "deleted", KSTAT_DATA_UINT64 }, 243 { "recycle_miss", KSTAT_DATA_UINT64 }, 244 { "mutex_miss", KSTAT_DATA_UINT64 }, 245 { "evict_skip", KSTAT_DATA_UINT64 }, 246 { "hash_elements", KSTAT_DATA_UINT64 }, 247 { "hash_elements_max", KSTAT_DATA_UINT64 }, 248 { "hash_collisions", KSTAT_DATA_UINT64 }, 249 { "hash_chains", KSTAT_DATA_UINT64 }, 250 { "hash_chain_max", KSTAT_DATA_UINT64 }, 251 { "p", KSTAT_DATA_UINT64 }, 252 { "c", KSTAT_DATA_UINT64 }, 253 { "c_min", KSTAT_DATA_UINT64 }, 254 { "c_max", KSTAT_DATA_UINT64 }, 255 { "size", KSTAT_DATA_UINT64 } 256 }; 257 258 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 259 260 #define ARCSTAT_INCR(stat, val) \ 261 atomic_add_64(&arc_stats.stat.value.ui64, (val)); 262 263 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 264 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 265 266 #define ARCSTAT_MAX(stat, val) { \ 267 uint64_t m; \ 268 while ((val) > (m = arc_stats.stat.value.ui64) && \ 269 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 270 continue; \ 271 } 272 273 #define ARCSTAT_MAXSTAT(stat) \ 274 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 275 
276 /* 277 * We define a macro to allow ARC hits/misses to be easily broken down by 278 * two separate conditions, giving a total of four different subtypes for 279 * each of hits and misses (so eight statistics total). 280 */ 281 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 282 if (cond1) { \ 283 if (cond2) { \ 284 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 285 } else { \ 286 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 287 } \ 288 } else { \ 289 if (cond2) { \ 290 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 291 } else { \ 292 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 293 } \ 294 } 295 296 kstat_t *arc_ksp; 297 static arc_state_t *arc_anon; 298 static arc_state_t *arc_mru; 299 static arc_state_t *arc_mru_ghost; 300 static arc_state_t *arc_mfu; 301 static arc_state_t *arc_mfu_ghost; 302 303 /* 304 * There are several ARC variables that are critical to export as kstats -- 305 * but we don't want to have to grovel around in the kstat whenever we wish to 306 * manipulate them. For these variables, we therefore define them to be in 307 * terms of the statistic variable. This assures that we are not introducing 308 * the possibility of inconsistency by having shadow copies of the variables, 309 * while still allowing the code to be readable. 
310 */ 311 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 312 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 313 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 314 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 315 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 316 317 static int arc_no_grow; /* Don't try to grow cache size */ 318 static uint64_t arc_tempreserve; 319 static uint64_t arc_meta_used; 320 static uint64_t arc_meta_limit; 321 static uint64_t arc_meta_max = 0; 322 323 typedef struct arc_callback arc_callback_t; 324 325 struct arc_callback { 326 void *acb_private; 327 arc_done_func_t *acb_done; 328 arc_byteswap_func_t *acb_byteswap; 329 arc_buf_t *acb_buf; 330 zio_t *acb_zio_dummy; 331 arc_callback_t *acb_next; 332 }; 333 334 typedef struct arc_write_callback arc_write_callback_t; 335 336 struct arc_write_callback { 337 void *awcb_private; 338 arc_done_func_t *awcb_ready; 339 arc_done_func_t *awcb_done; 340 arc_buf_t *awcb_buf; 341 }; 342 343 struct arc_buf_hdr { 344 /* protected by hash lock */ 345 dva_t b_dva; 346 uint64_t b_birth; 347 uint64_t b_cksum0; 348 349 kmutex_t b_freeze_lock; 350 zio_cksum_t *b_freeze_cksum; 351 352 arc_buf_hdr_t *b_hash_next; 353 arc_buf_t *b_buf; 354 uint32_t b_flags; 355 uint32_t b_datacnt; 356 357 arc_callback_t *b_acb; 358 kcondvar_t b_cv; 359 360 /* immutable */ 361 arc_buf_contents_t b_type; 362 uint64_t b_size; 363 spa_t *b_spa; 364 365 /* protected by arc state mutex */ 366 arc_state_t *b_state; 367 list_node_t b_arc_node; 368 369 /* updated atomically */ 370 clock_t b_arc_access; 371 372 /* self protecting */ 373 refcount_t b_refcnt; 374 }; 375 376 static arc_buf_t *arc_eviction_list; 377 static kmutex_t arc_eviction_mtx; 378 static arc_buf_hdr_t arc_eviction_hdr; 379 static void arc_get_data_buf(arc_buf_t *buf); 380 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 381 static int arc_evict_needed(arc_buf_contents_t 
type); 382 static void arc_evict_ghost(arc_state_t *state, int64_t bytes); 383 384 #define GHOST_STATE(state) \ 385 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) 386 387 /* 388 * Private ARC flags. These flags are private ARC only flags that will show up 389 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 390 * be passed in as arc_flags in things like arc_read. However, these flags 391 * should never be passed and should only be set by ARC code. When adding new 392 * public flags, make sure not to smash the private ones. 393 */ 394 395 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 396 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 397 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 398 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 399 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 400 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 401 402 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 403 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 404 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 405 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 406 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 407 408 /* 409 * Hash table routines 410 */ 411 412 #define HT_LOCK_PAD 64 413 414 struct ht_lock { 415 kmutex_t ht_lock; 416 #ifdef _KERNEL 417 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 418 #endif 419 }; 420 421 #define BUF_LOCKS 256 422 typedef struct buf_hash_table { 423 uint64_t ht_mask; 424 arc_buf_hdr_t **ht_table; 425 struct ht_lock ht_locks[BUF_LOCKS]; 426 } buf_hash_table_t; 427 428 static buf_hash_table_t buf_hash_table; 429 430 #define BUF_HASH_INDEX(spa, dva, birth) \ 431 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 432 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 433 #define 
BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 434 #define HDR_LOCK(buf) \ 435 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 436 437 uint64_t zfs_crc64_table[256]; 438 439 static uint64_t 440 buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 441 { 442 uintptr_t spav = (uintptr_t)spa; 443 uint8_t *vdva = (uint8_t *)dva; 444 uint64_t crc = -1ULL; 445 int i; 446 447 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 448 449 for (i = 0; i < sizeof (dva_t); i++) 450 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 451 452 crc ^= (spav>>8) ^ birth; 453 454 return (crc); 455 } 456 457 #define BUF_EMPTY(buf) \ 458 ((buf)->b_dva.dva_word[0] == 0 && \ 459 (buf)->b_dva.dva_word[1] == 0 && \ 460 (buf)->b_birth == 0) 461 462 #define BUF_EQUAL(spa, dva, birth, buf) \ 463 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 464 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 465 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 466 467 static arc_buf_hdr_t * 468 buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 469 { 470 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 471 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 472 arc_buf_hdr_t *buf; 473 474 mutex_enter(hash_lock); 475 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 476 buf = buf->b_hash_next) { 477 if (BUF_EQUAL(spa, dva, birth, buf)) { 478 *lockp = hash_lock; 479 return (buf); 480 } 481 } 482 mutex_exit(hash_lock); 483 *lockp = NULL; 484 return (NULL); 485 } 486 487 /* 488 * Insert an entry into the hash table. If there is already an element 489 * equal to elem in the hash table, then the already existing element 490 * will be returned and the new element will not be inserted. 491 * Otherwise returns NULL. 
492 */ 493 static arc_buf_hdr_t * 494 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 495 { 496 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 497 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 498 arc_buf_hdr_t *fbuf; 499 uint32_t i; 500 501 ASSERT(!HDR_IN_HASH_TABLE(buf)); 502 *lockp = hash_lock; 503 mutex_enter(hash_lock); 504 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 505 fbuf = fbuf->b_hash_next, i++) { 506 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 507 return (fbuf); 508 } 509 510 buf->b_hash_next = buf_hash_table.ht_table[idx]; 511 buf_hash_table.ht_table[idx] = buf; 512 buf->b_flags |= ARC_IN_HASH_TABLE; 513 514 /* collect some hash table performance data */ 515 if (i > 0) { 516 ARCSTAT_BUMP(arcstat_hash_collisions); 517 if (i == 1) 518 ARCSTAT_BUMP(arcstat_hash_chains); 519 520 ARCSTAT_MAX(arcstat_hash_chain_max, i); 521 } 522 523 ARCSTAT_BUMP(arcstat_hash_elements); 524 ARCSTAT_MAXSTAT(arcstat_hash_elements); 525 526 return (NULL); 527 } 528 529 static void 530 buf_hash_remove(arc_buf_hdr_t *buf) 531 { 532 arc_buf_hdr_t *fbuf, **bufp; 533 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 534 535 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 536 ASSERT(HDR_IN_HASH_TABLE(buf)); 537 538 bufp = &buf_hash_table.ht_table[idx]; 539 while ((fbuf = *bufp) != buf) { 540 ASSERT(fbuf != NULL); 541 bufp = &fbuf->b_hash_next; 542 } 543 *bufp = buf->b_hash_next; 544 buf->b_hash_next = NULL; 545 buf->b_flags &= ~ARC_IN_HASH_TABLE; 546 547 /* collect some hash table performance data */ 548 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 549 550 if (buf_hash_table.ht_table[idx] && 551 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 552 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 553 } 554 555 /* 556 * Global data structures and functions for the buf kmem cache. 
557 */ 558 static kmem_cache_t *hdr_cache; 559 static kmem_cache_t *buf_cache; 560 561 static void 562 buf_fini(void) 563 { 564 int i; 565 566 kmem_free(buf_hash_table.ht_table, 567 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 568 for (i = 0; i < BUF_LOCKS; i++) 569 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 570 kmem_cache_destroy(hdr_cache); 571 kmem_cache_destroy(buf_cache); 572 } 573 574 /* 575 * Constructor callback - called when the cache is empty 576 * and a new buf is requested. 577 */ 578 /* ARGSUSED */ 579 static int 580 hdr_cons(void *vbuf, void *unused, int kmflag) 581 { 582 arc_buf_hdr_t *buf = vbuf; 583 584 bzero(buf, sizeof (arc_buf_hdr_t)); 585 refcount_create(&buf->b_refcnt); 586 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 587 return (0); 588 } 589 590 /* 591 * Destructor callback - called when a cached buf is 592 * no longer required. 593 */ 594 /* ARGSUSED */ 595 static void 596 hdr_dest(void *vbuf, void *unused) 597 { 598 arc_buf_hdr_t *buf = vbuf; 599 600 refcount_destroy(&buf->b_refcnt); 601 cv_destroy(&buf->b_cv); 602 } 603 604 /* 605 * Reclaim callback -- invoked when memory is low. 606 */ 607 /* ARGSUSED */ 608 static void 609 hdr_recl(void *unused) 610 { 611 dprintf("hdr_recl called\n"); 612 /* 613 * umem calls the reclaim func when we destroy the buf cache, 614 * which is after we do arc_fini(). 615 */ 616 if (!arc_dead) 617 cv_signal(&arc_reclaim_thr_cv); 618 } 619 620 static void 621 buf_init(void) 622 { 623 uint64_t *ct; 624 uint64_t hsize = 1ULL << 12; 625 int i, j; 626 627 /* 628 * The hash table is big enough to fill all of physical memory 629 * with an average 64K block size. The table will take up 630 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
631 */ 632 while (hsize * 65536 < physmem * PAGESIZE) 633 hsize <<= 1; 634 retry: 635 buf_hash_table.ht_mask = hsize - 1; 636 buf_hash_table.ht_table = 637 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 638 if (buf_hash_table.ht_table == NULL) { 639 ASSERT(hsize > (1ULL << 8)); 640 hsize >>= 1; 641 goto retry; 642 } 643 644 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 645 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 646 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 647 0, NULL, NULL, NULL, NULL, NULL, 0); 648 649 for (i = 0; i < 256; i++) 650 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 651 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 652 653 for (i = 0; i < BUF_LOCKS; i++) { 654 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 655 NULL, MUTEX_DEFAULT, NULL); 656 } 657 } 658 659 #define ARC_MINTIME (hz>>4) /* 62 ms */ 660 661 static void 662 arc_cksum_verify(arc_buf_t *buf) 663 { 664 zio_cksum_t zc; 665 666 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 667 return; 668 669 mutex_enter(&buf->b_hdr->b_freeze_lock); 670 if (buf->b_hdr->b_freeze_cksum == NULL || 671 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 672 mutex_exit(&buf->b_hdr->b_freeze_lock); 673 return; 674 } 675 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 676 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 677 panic("buffer modified while frozen!"); 678 mutex_exit(&buf->b_hdr->b_freeze_lock); 679 } 680 681 static void 682 arc_cksum_compute(arc_buf_t *buf) 683 { 684 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 685 return; 686 687 mutex_enter(&buf->b_hdr->b_freeze_lock); 688 if (buf->b_hdr->b_freeze_cksum != NULL) { 689 mutex_exit(&buf->b_hdr->b_freeze_lock); 690 return; 691 } 692 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 693 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 694 buf->b_hdr->b_freeze_cksum); 695 mutex_exit(&buf->b_hdr->b_freeze_lock); 696 } 697 698 void 699 arc_buf_thaw(arc_buf_t *buf) 700 { 701 
if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 702 return; 703 704 if (buf->b_hdr->b_state != arc_anon) 705 panic("modifying non-anon buffer!"); 706 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 707 panic("modifying buffer while i/o in progress!"); 708 arc_cksum_verify(buf); 709 mutex_enter(&buf->b_hdr->b_freeze_lock); 710 if (buf->b_hdr->b_freeze_cksum != NULL) { 711 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 712 buf->b_hdr->b_freeze_cksum = NULL; 713 } 714 mutex_exit(&buf->b_hdr->b_freeze_lock); 715 } 716 717 void 718 arc_buf_freeze(arc_buf_t *buf) 719 { 720 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 721 return; 722 723 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 724 buf->b_hdr->b_state == arc_anon); 725 arc_cksum_compute(buf); 726 } 727 728 static void 729 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 730 { 731 ASSERT(MUTEX_HELD(hash_lock)); 732 733 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 734 (ab->b_state != arc_anon)) { 735 uint64_t delta = ab->b_size * ab->b_datacnt; 736 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 737 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 738 739 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 740 mutex_enter(&ab->b_state->arcs_mtx); 741 ASSERT(list_link_active(&ab->b_arc_node)); 742 list_remove(list, ab); 743 if (GHOST_STATE(ab->b_state)) { 744 ASSERT3U(ab->b_datacnt, ==, 0); 745 ASSERT3P(ab->b_buf, ==, NULL); 746 delta = ab->b_size; 747 } 748 ASSERT(delta > 0); 749 ASSERT3U(*size, >=, delta); 750 atomic_add_64(size, -delta); 751 mutex_exit(&ab->b_state->arcs_mtx); 752 /* remove the prefetch flag is we get a reference */ 753 if (ab->b_flags & ARC_PREFETCH) 754 ab->b_flags &= ~ARC_PREFETCH; 755 } 756 } 757 758 static int 759 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 760 { 761 int cnt; 762 arc_state_t *state = ab->b_state; 763 764 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 765 ASSERT(!GHOST_STATE(state)); 766 767 if (((cnt = refcount_remove(&ab->b_refcnt, 
tag)) == 0) && 768 (state != arc_anon)) { 769 uint64_t *size = &state->arcs_lsize[ab->b_type]; 770 771 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 772 mutex_enter(&state->arcs_mtx); 773 ASSERT(!list_link_active(&ab->b_arc_node)); 774 list_insert_head(&state->arcs_list[ab->b_type], ab); 775 ASSERT(ab->b_datacnt > 0); 776 atomic_add_64(size, ab->b_size * ab->b_datacnt); 777 mutex_exit(&state->arcs_mtx); 778 } 779 return (cnt); 780 } 781 782 /* 783 * Move the supplied buffer to the indicated state. The mutex 784 * for the buffer must be held by the caller. 785 */ 786 static void 787 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 788 { 789 arc_state_t *old_state = ab->b_state; 790 int64_t refcnt = refcount_count(&ab->b_refcnt); 791 uint64_t from_delta, to_delta; 792 793 ASSERT(MUTEX_HELD(hash_lock)); 794 ASSERT(new_state != old_state); 795 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 796 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 797 798 from_delta = to_delta = ab->b_datacnt * ab->b_size; 799 800 /* 801 * If this buffer is evictable, transfer it from the 802 * old state list to the new state list. 803 */ 804 if (refcnt == 0) { 805 if (old_state != arc_anon) { 806 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 807 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 808 809 if (use_mutex) 810 mutex_enter(&old_state->arcs_mtx); 811 812 ASSERT(list_link_active(&ab->b_arc_node)); 813 list_remove(&old_state->arcs_list[ab->b_type], ab); 814 815 /* 816 * If prefetching out of the ghost cache, 817 * we will have a non-null datacnt. 
818 */ 819 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 820 /* ghost elements have a ghost size */ 821 ASSERT(ab->b_buf == NULL); 822 from_delta = ab->b_size; 823 } 824 ASSERT3U(*size, >=, from_delta); 825 atomic_add_64(size, -from_delta); 826 827 if (use_mutex) 828 mutex_exit(&old_state->arcs_mtx); 829 } 830 if (new_state != arc_anon) { 831 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 832 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 833 834 if (use_mutex) 835 mutex_enter(&new_state->arcs_mtx); 836 837 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 838 839 /* ghost elements have a ghost size */ 840 if (GHOST_STATE(new_state)) { 841 ASSERT(ab->b_datacnt == 0); 842 ASSERT(ab->b_buf == NULL); 843 to_delta = ab->b_size; 844 } 845 atomic_add_64(size, to_delta); 846 847 if (use_mutex) 848 mutex_exit(&new_state->arcs_mtx); 849 } 850 } 851 852 ASSERT(!BUF_EMPTY(ab)); 853 if (new_state == arc_anon && old_state != arc_anon) { 854 buf_hash_remove(ab); 855 } 856 857 /* adjust state sizes */ 858 if (to_delta) 859 atomic_add_64(&new_state->arcs_size, to_delta); 860 if (from_delta) { 861 ASSERT3U(old_state->arcs_size, >=, from_delta); 862 atomic_add_64(&old_state->arcs_size, -from_delta); 863 } 864 ab->b_state = new_state; 865 } 866 867 void 868 arc_space_consume(uint64_t space) 869 { 870 atomic_add_64(&arc_meta_used, space); 871 atomic_add_64(&arc_size, space); 872 } 873 874 void 875 arc_space_return(uint64_t space) 876 { 877 ASSERT(arc_meta_used >= space); 878 if (arc_meta_max < arc_meta_used) 879 arc_meta_max = arc_meta_used; 880 atomic_add_64(&arc_meta_used, -space); 881 ASSERT(arc_size >= space); 882 atomic_add_64(&arc_size, -space); 883 } 884 885 void * 886 arc_data_buf_alloc(uint64_t size) 887 { 888 if (arc_evict_needed(ARC_BUFC_DATA)) 889 cv_signal(&arc_reclaim_thr_cv); 890 atomic_add_64(&arc_size, size); 891 return (zio_data_buf_alloc(size)); 892 } 893 894 void 895 arc_data_buf_free(void *buf, uint64_t size) 896 { 897 zio_data_buf_free(buf, 
size); 898 ASSERT(arc_size >= size); 899 atomic_add_64(&arc_size, -size); 900 } 901 902 arc_buf_t * 903 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 904 { 905 arc_buf_hdr_t *hdr; 906 arc_buf_t *buf; 907 908 ASSERT3U(size, >, 0); 909 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 910 ASSERT(BUF_EMPTY(hdr)); 911 hdr->b_size = size; 912 hdr->b_type = type; 913 hdr->b_spa = spa; 914 hdr->b_state = arc_anon; 915 hdr->b_arc_access = 0; 916 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 917 buf->b_hdr = hdr; 918 buf->b_data = NULL; 919 buf->b_efunc = NULL; 920 buf->b_private = NULL; 921 buf->b_next = NULL; 922 hdr->b_buf = buf; 923 arc_get_data_buf(buf); 924 hdr->b_datacnt = 1; 925 hdr->b_flags = 0; 926 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 927 (void) refcount_add(&hdr->b_refcnt, tag); 928 929 return (buf); 930 } 931 932 static arc_buf_t * 933 arc_buf_clone(arc_buf_t *from) 934 { 935 arc_buf_t *buf; 936 arc_buf_hdr_t *hdr = from->b_hdr; 937 uint64_t size = hdr->b_size; 938 939 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 940 buf->b_hdr = hdr; 941 buf->b_data = NULL; 942 buf->b_efunc = NULL; 943 buf->b_private = NULL; 944 buf->b_next = hdr->b_buf; 945 hdr->b_buf = buf; 946 arc_get_data_buf(buf); 947 bcopy(from->b_data, buf->b_data, size); 948 hdr->b_datacnt += 1; 949 return (buf); 950 } 951 952 void 953 arc_buf_add_ref(arc_buf_t *buf, void* tag) 954 { 955 arc_buf_hdr_t *hdr; 956 kmutex_t *hash_lock; 957 958 /* 959 * Check to see if this buffer is currently being evicted via 960 * arc_do_user_evicts(). 961 */ 962 mutex_enter(&arc_eviction_mtx); 963 hdr = buf->b_hdr; 964 if (hdr == NULL) { 965 mutex_exit(&arc_eviction_mtx); 966 return; 967 } 968 hash_lock = HDR_LOCK(hdr); 969 mutex_exit(&arc_eviction_mtx); 970 971 mutex_enter(hash_lock); 972 if (buf->b_data == NULL) { 973 /* 974 * This buffer is evicted. 
975 */ 976 mutex_exit(hash_lock); 977 return; 978 } 979 980 ASSERT(buf->b_hdr == hdr); 981 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 982 add_reference(hdr, hash_lock, tag); 983 arc_access(hdr, hash_lock); 984 mutex_exit(hash_lock); 985 ARCSTAT_BUMP(arcstat_hits); 986 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 987 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 988 data, metadata, hits); 989 } 990 991 static void 992 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 993 { 994 arc_buf_t **bufp; 995 996 /* free up data associated with the buf */ 997 if (buf->b_data) { 998 arc_state_t *state = buf->b_hdr->b_state; 999 uint64_t size = buf->b_hdr->b_size; 1000 arc_buf_contents_t type = buf->b_hdr->b_type; 1001 1002 arc_cksum_verify(buf); 1003 if (!recycle) { 1004 if (type == ARC_BUFC_METADATA) { 1005 zio_buf_free(buf->b_data, size); 1006 arc_space_return(size); 1007 } else { 1008 ASSERT(type == ARC_BUFC_DATA); 1009 zio_data_buf_free(buf->b_data, size); 1010 atomic_add_64(&arc_size, -size); 1011 } 1012 } 1013 if (list_link_active(&buf->b_hdr->b_arc_node)) { 1014 uint64_t *cnt = &state->arcs_lsize[type]; 1015 1016 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 1017 ASSERT(state != arc_anon); 1018 1019 ASSERT3U(*cnt, >=, size); 1020 atomic_add_64(cnt, -size); 1021 } 1022 ASSERT3U(state->arcs_size, >=, size); 1023 atomic_add_64(&state->arcs_size, -size); 1024 buf->b_data = NULL; 1025 ASSERT(buf->b_hdr->b_datacnt > 0); 1026 buf->b_hdr->b_datacnt -= 1; 1027 } 1028 1029 /* only remove the buf if requested */ 1030 if (!all) 1031 return; 1032 1033 /* remove the buf from the hdr list */ 1034 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 1035 continue; 1036 *bufp = buf->b_next; 1037 1038 ASSERT(buf->b_efunc == NULL); 1039 1040 /* clean up the buf */ 1041 buf->b_hdr = NULL; 1042 kmem_cache_free(buf_cache, buf); 1043 } 1044 1045 static void 1046 arc_hdr_destroy(arc_buf_hdr_t *hdr) 1047 { 1048 
ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1049 ASSERT3P(hdr->b_state, ==, arc_anon); 1050 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1051 1052 if (!BUF_EMPTY(hdr)) { 1053 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1054 bzero(&hdr->b_dva, sizeof (dva_t)); 1055 hdr->b_birth = 0; 1056 hdr->b_cksum0 = 0; 1057 } 1058 while (hdr->b_buf) { 1059 arc_buf_t *buf = hdr->b_buf; 1060 1061 if (buf->b_efunc) { 1062 mutex_enter(&arc_eviction_mtx); 1063 ASSERT(buf->b_hdr != NULL); 1064 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1065 hdr->b_buf = buf->b_next; 1066 buf->b_hdr = &arc_eviction_hdr; 1067 buf->b_next = arc_eviction_list; 1068 arc_eviction_list = buf; 1069 mutex_exit(&arc_eviction_mtx); 1070 } else { 1071 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1072 } 1073 } 1074 if (hdr->b_freeze_cksum != NULL) { 1075 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1076 hdr->b_freeze_cksum = NULL; 1077 } 1078 1079 ASSERT(!list_link_active(&hdr->b_arc_node)); 1080 ASSERT3P(hdr->b_hash_next, ==, NULL); 1081 ASSERT3P(hdr->b_acb, ==, NULL); 1082 kmem_cache_free(hdr_cache, hdr); 1083 } 1084 1085 void 1086 arc_buf_free(arc_buf_t *buf, void *tag) 1087 { 1088 arc_buf_hdr_t *hdr = buf->b_hdr; 1089 int hashed = hdr->b_state != arc_anon; 1090 1091 ASSERT(buf->b_efunc == NULL); 1092 ASSERT(buf->b_data != NULL); 1093 1094 if (hashed) { 1095 kmutex_t *hash_lock = HDR_LOCK(hdr); 1096 1097 mutex_enter(hash_lock); 1098 (void) remove_reference(hdr, hash_lock, tag); 1099 if (hdr->b_datacnt > 1) 1100 arc_buf_destroy(buf, FALSE, TRUE); 1101 else 1102 hdr->b_flags |= ARC_BUF_AVAILABLE; 1103 mutex_exit(hash_lock); 1104 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1105 int destroy_hdr; 1106 /* 1107 * We are in the middle of an async write. Don't destroy 1108 * this buffer unless the write completes before we finish 1109 * decrementing the reference count. 
1110 */ 1111 mutex_enter(&arc_eviction_mtx); 1112 (void) remove_reference(hdr, NULL, tag); 1113 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1114 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1115 mutex_exit(&arc_eviction_mtx); 1116 if (destroy_hdr) 1117 arc_hdr_destroy(hdr); 1118 } else { 1119 if (remove_reference(hdr, NULL, tag) > 0) { 1120 ASSERT(HDR_IO_ERROR(hdr)); 1121 arc_buf_destroy(buf, FALSE, TRUE); 1122 } else { 1123 arc_hdr_destroy(hdr); 1124 } 1125 } 1126 } 1127 1128 int 1129 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1130 { 1131 arc_buf_hdr_t *hdr = buf->b_hdr; 1132 kmutex_t *hash_lock = HDR_LOCK(hdr); 1133 int no_callback = (buf->b_efunc == NULL); 1134 1135 if (hdr->b_state == arc_anon) { 1136 arc_buf_free(buf, tag); 1137 return (no_callback); 1138 } 1139 1140 mutex_enter(hash_lock); 1141 ASSERT(hdr->b_state != arc_anon); 1142 ASSERT(buf->b_data != NULL); 1143 1144 (void) remove_reference(hdr, hash_lock, tag); 1145 if (hdr->b_datacnt > 1) { 1146 if (no_callback) 1147 arc_buf_destroy(buf, FALSE, TRUE); 1148 } else if (no_callback) { 1149 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1150 hdr->b_flags |= ARC_BUF_AVAILABLE; 1151 } 1152 ASSERT(no_callback || hdr->b_datacnt > 1 || 1153 refcount_is_zero(&hdr->b_refcnt)); 1154 mutex_exit(hash_lock); 1155 return (no_callback); 1156 } 1157 1158 int 1159 arc_buf_size(arc_buf_t *buf) 1160 { 1161 return (buf->b_hdr->b_size); 1162 } 1163 1164 /* 1165 * Evict buffers from list until we've removed the specified number of 1166 * bytes. Move the removed buffers to the appropriate evict state. 1167 * If the recycle flag is set, then attempt to "recycle" a buffer: 1168 * - look for a buffer to evict that is `bytes' long. 1169 * - return the data block from this buffer rather than freeing it. 1170 * This flag is used by callers that are trying to make space for a 1171 * new buffer in a full arc cache. 
1172 */ 1173 static void * 1174 arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, 1175 arc_buf_contents_t type) 1176 { 1177 arc_state_t *evicted_state; 1178 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1179 arc_buf_hdr_t *ab, *ab_prev = NULL; 1180 list_t *list = &state->arcs_list[type]; 1181 kmutex_t *hash_lock; 1182 boolean_t have_lock; 1183 void *stolen = NULL; 1184 1185 ASSERT(state == arc_mru || state == arc_mfu); 1186 1187 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1188 1189 mutex_enter(&state->arcs_mtx); 1190 mutex_enter(&evicted_state->arcs_mtx); 1191 1192 for (ab = list_tail(list); ab; ab = ab_prev) { 1193 ab_prev = list_prev(list, ab); 1194 /* prefetch buffers have a minimum lifespan */ 1195 if (HDR_IO_IN_PROGRESS(ab) || 1196 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1197 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1198 skipped++; 1199 continue; 1200 } 1201 /* "lookahead" for better eviction candidate */ 1202 if (recycle && ab->b_size != bytes && 1203 ab_prev && ab_prev->b_size == bytes) 1204 continue; 1205 hash_lock = HDR_LOCK(ab); 1206 have_lock = MUTEX_HELD(hash_lock); 1207 if (have_lock || mutex_tryenter(hash_lock)) { 1208 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1209 ASSERT(ab->b_datacnt > 0); 1210 while (ab->b_buf) { 1211 arc_buf_t *buf = ab->b_buf; 1212 if (buf->b_data) { 1213 bytes_evicted += ab->b_size; 1214 if (recycle && ab->b_type == type && 1215 ab->b_size == bytes) { 1216 stolen = buf->b_data; 1217 recycle = FALSE; 1218 } 1219 } 1220 if (buf->b_efunc) { 1221 mutex_enter(&arc_eviction_mtx); 1222 arc_buf_destroy(buf, 1223 buf->b_data == stolen, FALSE); 1224 ab->b_buf = buf->b_next; 1225 buf->b_hdr = &arc_eviction_hdr; 1226 buf->b_next = arc_eviction_list; 1227 arc_eviction_list = buf; 1228 mutex_exit(&arc_eviction_mtx); 1229 } else { 1230 arc_buf_destroy(buf, 1231 buf->b_data == stolen, TRUE); 1232 } 1233 } 1234 ASSERT(ab->b_datacnt == 0); 1235 
arc_change_state(evicted_state, ab, hash_lock); 1236 ASSERT(HDR_IN_HASH_TABLE(ab)); 1237 ab->b_flags = ARC_IN_HASH_TABLE; 1238 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1239 if (!have_lock) 1240 mutex_exit(hash_lock); 1241 if (bytes >= 0 && bytes_evicted >= bytes) 1242 break; 1243 } else { 1244 missed += 1; 1245 } 1246 } 1247 1248 mutex_exit(&evicted_state->arcs_mtx); 1249 mutex_exit(&state->arcs_mtx); 1250 1251 if (bytes_evicted < bytes) 1252 dprintf("only evicted %lld bytes from %x", 1253 (longlong_t)bytes_evicted, state); 1254 1255 if (skipped) 1256 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1257 1258 if (missed) 1259 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1260 1261 /* 1262 * We have just evicted some date into the ghost state, make 1263 * sure we also adjust the ghost state size if necessary. 1264 */ 1265 if (arc_no_grow && 1266 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size > arc_c) { 1267 int64_t mru_over = arc_anon->arcs_size + arc_mru->arcs_size + 1268 arc_mru_ghost->arcs_size - arc_c; 1269 1270 if (mru_over > 0 && arc_mru_ghost->arcs_lsize[type] > 0) { 1271 int64_t todelete = 1272 MIN(arc_mru_ghost->arcs_lsize[type], mru_over); 1273 arc_evict_ghost(arc_mru_ghost, todelete); 1274 } else if (arc_mfu_ghost->arcs_lsize[type] > 0) { 1275 int64_t todelete = MIN(arc_mfu_ghost->arcs_lsize[type], 1276 arc_mru_ghost->arcs_size + 1277 arc_mfu_ghost->arcs_size - arc_c); 1278 arc_evict_ghost(arc_mfu_ghost, todelete); 1279 } 1280 } 1281 1282 return (stolen); 1283 } 1284 1285 /* 1286 * Remove buffers from list until we've removed the specified number of 1287 * bytes. Destroy the buffers that are removed. 
1288 */ 1289 static void 1290 arc_evict_ghost(arc_state_t *state, int64_t bytes) 1291 { 1292 arc_buf_hdr_t *ab, *ab_prev; 1293 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1294 kmutex_t *hash_lock; 1295 uint64_t bytes_deleted = 0; 1296 uint64_t bufs_skipped = 0; 1297 1298 ASSERT(GHOST_STATE(state)); 1299 top: 1300 mutex_enter(&state->arcs_mtx); 1301 for (ab = list_tail(list); ab; ab = ab_prev) { 1302 ab_prev = list_prev(list, ab); 1303 hash_lock = HDR_LOCK(ab); 1304 if (mutex_tryenter(hash_lock)) { 1305 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1306 ASSERT(ab->b_buf == NULL); 1307 arc_change_state(arc_anon, ab, hash_lock); 1308 mutex_exit(hash_lock); 1309 ARCSTAT_BUMP(arcstat_deleted); 1310 bytes_deleted += ab->b_size; 1311 arc_hdr_destroy(ab); 1312 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1313 if (bytes >= 0 && bytes_deleted >= bytes) 1314 break; 1315 } else { 1316 if (bytes < 0) { 1317 mutex_exit(&state->arcs_mtx); 1318 mutex_enter(hash_lock); 1319 mutex_exit(hash_lock); 1320 goto top; 1321 } 1322 bufs_skipped += 1; 1323 } 1324 } 1325 mutex_exit(&state->arcs_mtx); 1326 1327 if (list == &state->arcs_list[ARC_BUFC_DATA] && 1328 (bytes < 0 || bytes_deleted < bytes)) { 1329 list = &state->arcs_list[ARC_BUFC_METADATA]; 1330 goto top; 1331 } 1332 1333 if (bufs_skipped) { 1334 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1335 ASSERT(bytes >= 0); 1336 } 1337 1338 if (bytes_deleted < bytes) 1339 dprintf("only deleted %lld bytes from %p", 1340 (longlong_t)bytes_deleted, state); 1341 } 1342 1343 static void 1344 arc_adjust(void) 1345 { 1346 int64_t top_sz, mru_over, arc_over, todelete; 1347 1348 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1349 1350 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 1351 int64_t toevict = 1352 MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); 1353 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_DATA); 1354 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1355 } 1356 1357 if (top_sz > arc_p && 
arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1358 int64_t toevict = 1359 MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); 1360 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_METADATA); 1361 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1362 } 1363 1364 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1365 1366 if (mru_over > 0) { 1367 if (arc_mru_ghost->arcs_size > 0) { 1368 todelete = MIN(arc_mru_ghost->arcs_size, mru_over); 1369 arc_evict_ghost(arc_mru_ghost, todelete); 1370 } 1371 } 1372 1373 if ((arc_over = arc_size - arc_c) > 0) { 1374 int64_t tbl_over; 1375 1376 if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 1377 int64_t toevict = 1378 MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); 1379 (void) arc_evict(arc_mfu, toevict, FALSE, 1380 ARC_BUFC_DATA); 1381 arc_over = arc_size - arc_c; 1382 } 1383 1384 if (arc_over > 0 && 1385 arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1386 int64_t toevict = 1387 MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], 1388 arc_over); 1389 (void) arc_evict(arc_mfu, toevict, FALSE, 1390 ARC_BUFC_METADATA); 1391 } 1392 1393 tbl_over = arc_size + arc_mru_ghost->arcs_size + 1394 arc_mfu_ghost->arcs_size - arc_c * 2; 1395 1396 if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { 1397 todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); 1398 arc_evict_ghost(arc_mfu_ghost, todelete); 1399 } 1400 } 1401 } 1402 1403 static void 1404 arc_do_user_evicts(void) 1405 { 1406 mutex_enter(&arc_eviction_mtx); 1407 while (arc_eviction_list != NULL) { 1408 arc_buf_t *buf = arc_eviction_list; 1409 arc_eviction_list = buf->b_next; 1410 buf->b_hdr = NULL; 1411 mutex_exit(&arc_eviction_mtx); 1412 1413 if (buf->b_efunc != NULL) 1414 VERIFY(buf->b_efunc(buf) == 0); 1415 1416 buf->b_efunc = NULL; 1417 buf->b_private = NULL; 1418 kmem_cache_free(buf_cache, buf); 1419 mutex_enter(&arc_eviction_mtx); 1420 } 1421 mutex_exit(&arc_eviction_mtx); 1422 } 1423 1424 /* 1425 * Flush all *evictable* data from the cache. 
1426 * NOTE: this will not touch "active" (i.e. referenced) data. 1427 */ 1428 void 1429 arc_flush(void) 1430 { 1431 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) 1432 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_DATA); 1433 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) 1434 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_METADATA); 1435 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) 1436 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_DATA); 1437 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) 1438 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_METADATA); 1439 1440 arc_evict_ghost(arc_mru_ghost, -1); 1441 arc_evict_ghost(arc_mfu_ghost, -1); 1442 1443 mutex_enter(&arc_reclaim_thr_lock); 1444 arc_do_user_evicts(); 1445 mutex_exit(&arc_reclaim_thr_lock); 1446 ASSERT(arc_eviction_list == NULL); 1447 } 1448 1449 int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1450 1451 void 1452 arc_shrink(void) 1453 { 1454 if (arc_c > arc_c_min) { 1455 uint64_t to_free; 1456 1457 #ifdef _KERNEL 1458 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 1459 #else 1460 to_free = arc_c >> arc_shrink_shift; 1461 #endif 1462 if (arc_c > arc_c_min + to_free) 1463 atomic_add_64(&arc_c, -to_free); 1464 else 1465 arc_c = arc_c_min; 1466 1467 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1468 if (arc_c > arc_size) 1469 arc_c = MAX(arc_size, arc_c_min); 1470 if (arc_p > arc_c) 1471 arc_p = (arc_c >> 1); 1472 ASSERT(arc_c >= arc_c_min); 1473 ASSERT((int64_t)arc_p >= 0); 1474 } 1475 1476 if (arc_size > arc_c) 1477 arc_adjust(); 1478 } 1479 1480 static int 1481 arc_reclaim_needed(void) 1482 { 1483 uint64_t extra; 1484 1485 #ifdef _KERNEL 1486 1487 if (needfree) 1488 return (1); 1489 1490 /* 1491 * take 'desfree' extra pages, so we reclaim sooner, rather than later 1492 */ 1493 extra = desfree; 1494 1495 /* 1496 * check that we're out of range of the pageout scanner. 
It starts to 1497 * schedule paging if freemem is less than lotsfree and needfree. 1498 * lotsfree is the high-water mark for pageout, and needfree is the 1499 * number of needed free pages. We add extra pages here to make sure 1500 * the scanner doesn't start up while we're freeing memory. 1501 */ 1502 if (freemem < lotsfree + needfree + extra) 1503 return (1); 1504 1505 /* 1506 * check to make sure that swapfs has enough space so that anon 1507 * reservations can still succeeed. anon_resvmem() checks that the 1508 * availrmem is greater than swapfs_minfree, and the number of reserved 1509 * swap pages. We also add a bit of extra here just to prevent 1510 * circumstances from getting really dire. 1511 */ 1512 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1513 return (1); 1514 1515 #if defined(__i386) 1516 /* 1517 * If we're on an i386 platform, it's possible that we'll exhaust the 1518 * kernel heap space before we ever run out of available physical 1519 * memory. Most checks of the size of the heap_area compare against 1520 * tune.t_minarmem, which is the minimum available real memory that we 1521 * can have in the system. However, this is generally fixed at 25 pages 1522 * which is so low that it's useless. In this comparison, we seek to 1523 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1524 * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is 1525 * free) 1526 */ 1527 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1528 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1529 return (1); 1530 #endif 1531 1532 #else 1533 if (spa_get_random(100) == 0) 1534 return (1); 1535 #endif 1536 return (0); 1537 } 1538 1539 static void 1540 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1541 { 1542 size_t i; 1543 kmem_cache_t *prev_cache = NULL; 1544 kmem_cache_t *prev_data_cache = NULL; 1545 extern kmem_cache_t *zio_buf_cache[]; 1546 extern kmem_cache_t *zio_data_buf_cache[]; 1547 1548 #ifdef _KERNEL 1549 if (arc_meta_used >= arc_meta_limit) { 1550 /* 1551 * We are exceeding our meta-data cache limit. 1552 * Purge some DNLC entries to release holds on meta-data. 1553 */ 1554 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1555 } 1556 #if defined(__i386) 1557 /* 1558 * Reclaim unused memory from all kmem caches. 1559 */ 1560 kmem_reap(); 1561 #endif 1562 #endif 1563 1564 /* 1565 * An agressive reclamation will shrink the cache size as well as 1566 * reap free buffers from the arc kmem caches. 
1567 */ 1568 if (strat == ARC_RECLAIM_AGGR) 1569 arc_shrink(); 1570 1571 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1572 if (zio_buf_cache[i] != prev_cache) { 1573 prev_cache = zio_buf_cache[i]; 1574 kmem_cache_reap_now(zio_buf_cache[i]); 1575 } 1576 if (zio_data_buf_cache[i] != prev_data_cache) { 1577 prev_data_cache = zio_data_buf_cache[i]; 1578 kmem_cache_reap_now(zio_data_buf_cache[i]); 1579 } 1580 } 1581 kmem_cache_reap_now(buf_cache); 1582 kmem_cache_reap_now(hdr_cache); 1583 } 1584 1585 static void 1586 arc_reclaim_thread(void) 1587 { 1588 clock_t growtime = 0; 1589 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1590 callb_cpr_t cpr; 1591 1592 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1593 1594 mutex_enter(&arc_reclaim_thr_lock); 1595 while (arc_thread_exit == 0) { 1596 if (arc_reclaim_needed()) { 1597 1598 if (arc_no_grow) { 1599 if (last_reclaim == ARC_RECLAIM_CONS) { 1600 last_reclaim = ARC_RECLAIM_AGGR; 1601 } else { 1602 last_reclaim = ARC_RECLAIM_CONS; 1603 } 1604 } else { 1605 arc_no_grow = TRUE; 1606 last_reclaim = ARC_RECLAIM_AGGR; 1607 membar_producer(); 1608 } 1609 1610 /* reset the growth delay for every reclaim */ 1611 growtime = lbolt + (arc_grow_retry * hz); 1612 1613 arc_kmem_reap_now(last_reclaim); 1614 1615 } else if (arc_no_grow && lbolt >= growtime) { 1616 arc_no_grow = FALSE; 1617 } 1618 1619 if (2 * arc_c < arc_size + 1620 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) 1621 arc_adjust(); 1622 1623 if (arc_eviction_list != NULL) 1624 arc_do_user_evicts(); 1625 1626 /* block until needed, or one second, whichever is shorter */ 1627 CALLB_CPR_SAFE_BEGIN(&cpr); 1628 (void) cv_timedwait(&arc_reclaim_thr_cv, 1629 &arc_reclaim_thr_lock, (lbolt + hz)); 1630 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1631 } 1632 1633 arc_thread_exit = 0; 1634 cv_broadcast(&arc_reclaim_thr_cv); 1635 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1636 thread_exit(); 1637 } 1638 1639 /* 
1640 * Adapt arc info given the number of bytes we are trying to add and 1641 * the state that we are comming from. This function is only called 1642 * when we are adding new content to the cache. 1643 */ 1644 static void 1645 arc_adapt(int bytes, arc_state_t *state) 1646 { 1647 int mult; 1648 1649 ASSERT(bytes > 0); 1650 /* 1651 * Adapt the target size of the MRU list: 1652 * - if we just hit in the MRU ghost list, then increase 1653 * the target size of the MRU list. 1654 * - if we just hit in the MFU ghost list, then increase 1655 * the target size of the MFU list by decreasing the 1656 * target size of the MRU list. 1657 */ 1658 if (state == arc_mru_ghost) { 1659 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1660 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1661 1662 arc_p = MIN(arc_c, arc_p + bytes * mult); 1663 } else if (state == arc_mfu_ghost) { 1664 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1665 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1666 1667 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1668 } 1669 ASSERT((int64_t)arc_p >= 0); 1670 1671 if (arc_reclaim_needed()) { 1672 cv_signal(&arc_reclaim_thr_cv); 1673 return; 1674 } 1675 1676 if (arc_no_grow) 1677 return; 1678 1679 if (arc_c >= arc_c_max) 1680 return; 1681 1682 /* 1683 * If we're within (2 * maxblocksize) bytes of the target 1684 * cache size, increment the target cache size 1685 */ 1686 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1687 atomic_add_64(&arc_c, (int64_t)bytes); 1688 if (arc_c > arc_c_max) 1689 arc_c = arc_c_max; 1690 else if (state == arc_anon) 1691 atomic_add_64(&arc_p, (int64_t)bytes); 1692 if (arc_p > arc_c) 1693 arc_p = arc_c; 1694 } 1695 ASSERT((int64_t)arc_p >= 0); 1696 } 1697 1698 /* 1699 * Check if the cache has reached its limits and eviction is required 1700 * prior to insert. 
1701 */ 1702 static int 1703 arc_evict_needed(arc_buf_contents_t type) 1704 { 1705 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 1706 return (1); 1707 1708 #ifdef _KERNEL 1709 /* 1710 * If zio data pages are being allocated out of a separate heap segment, 1711 * then enforce that the size of available vmem for this area remains 1712 * above about 1/32nd free. 1713 */ 1714 if (type == ARC_BUFC_DATA && zio_arena != NULL && 1715 vmem_size(zio_arena, VMEM_FREE) < 1716 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 1717 return (1); 1718 #endif 1719 1720 if (arc_reclaim_needed()) 1721 return (1); 1722 1723 return (arc_size > arc_c); 1724 } 1725 1726 /* 1727 * The buffer, supplied as the first argument, needs a data block. 1728 * So, if we are at cache max, determine which cache should be victimized. 1729 * We have the following cases: 1730 * 1731 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1732 * In this situation if we're out of space, but the resident size of the MFU is 1733 * under the limit, victimize the MFU cache to satisfy this insertion request. 1734 * 1735 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 1736 * Here, we've used up all of the available space for the MRU, so we need to 1737 * evict from our own cache instead. Evict from the set of resident MRU 1738 * entries. 1739 * 1740 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 1741 * c minus p represents the MFU space in the cache, since p is the size of the 1742 * cache that is dedicated to the MRU. In this situation there's still space on 1743 * the MFU side, so the MRU side needs to be victimized. 1744 * 1745 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 1746 * MFU's resident set is consuming more space than it has been allotted. In 1747 * this situation, we must victimize our own cache, the MFU, for this insertion. 
1748 */ 1749 static void 1750 arc_get_data_buf(arc_buf_t *buf) 1751 { 1752 arc_state_t *state = buf->b_hdr->b_state; 1753 uint64_t size = buf->b_hdr->b_size; 1754 arc_buf_contents_t type = buf->b_hdr->b_type; 1755 1756 arc_adapt(size, state); 1757 1758 /* 1759 * We have not yet reached cache maximum size, 1760 * just allocate a new buffer. 1761 */ 1762 if (!arc_evict_needed(type)) { 1763 if (type == ARC_BUFC_METADATA) { 1764 buf->b_data = zio_buf_alloc(size); 1765 arc_space_consume(size); 1766 } else { 1767 ASSERT(type == ARC_BUFC_DATA); 1768 buf->b_data = zio_data_buf_alloc(size); 1769 atomic_add_64(&arc_size, size); 1770 } 1771 goto out; 1772 } 1773 1774 /* 1775 * If we are prefetching from the mfu ghost list, this buffer 1776 * will end up on the mru list; so steal space from there. 1777 */ 1778 if (state == arc_mfu_ghost) 1779 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 1780 else if (state == arc_mru_ghost) 1781 state = arc_mru; 1782 1783 if (state == arc_mru || state == arc_anon) { 1784 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 1785 state = (arc_mfu->arcs_lsize[type] > 0 && 1786 arc_p > mru_used) ? arc_mfu : arc_mru; 1787 } else { 1788 /* MFU cases */ 1789 uint64_t mfu_space = arc_c - arc_p; 1790 state = (arc_mru->arcs_lsize[type] > 0 && 1791 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 1792 } 1793 if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { 1794 if (type == ARC_BUFC_METADATA) { 1795 buf->b_data = zio_buf_alloc(size); 1796 arc_space_consume(size); 1797 } else { 1798 ASSERT(type == ARC_BUFC_DATA); 1799 buf->b_data = zio_data_buf_alloc(size); 1800 atomic_add_64(&arc_size, size); 1801 } 1802 ARCSTAT_BUMP(arcstat_recycle_miss); 1803 } 1804 ASSERT(buf->b_data != NULL); 1805 out: 1806 /* 1807 * Update the state size. Note that ghost states have a 1808 * "ghost size" and so don't need to be updated. 
1809 */ 1810 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1811 arc_buf_hdr_t *hdr = buf->b_hdr; 1812 1813 atomic_add_64(&hdr->b_state->arcs_size, size); 1814 if (list_link_active(&hdr->b_arc_node)) { 1815 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1816 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 1817 } 1818 /* 1819 * If we are growing the cache, and we are adding anonymous 1820 * data, and we have outgrown arc_p, update arc_p 1821 */ 1822 if (arc_size < arc_c && hdr->b_state == arc_anon && 1823 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 1824 arc_p = MIN(arc_c, arc_p + size); 1825 } 1826 } 1827 1828 /* 1829 * This routine is called whenever a buffer is accessed. 1830 * NOTE: the hash lock is dropped in this function. 1831 */ 1832 static void 1833 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1834 { 1835 ASSERT(MUTEX_HELD(hash_lock)); 1836 1837 if (buf->b_state == arc_anon) { 1838 /* 1839 * This buffer is not in the cache, and does not 1840 * appear in our "ghost" list. Add the new buffer 1841 * to the MRU state. 1842 */ 1843 1844 ASSERT(buf->b_arc_access == 0); 1845 buf->b_arc_access = lbolt; 1846 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1847 arc_change_state(arc_mru, buf, hash_lock); 1848 1849 } else if (buf->b_state == arc_mru) { 1850 /* 1851 * If this buffer is here because of a prefetch, then either: 1852 * - clear the flag if this is a "referencing" read 1853 * (any subsequent access will bump this into the MFU state). 1854 * or 1855 * - move the buffer to the head of the list if this is 1856 * another prefetch (to make it less likely to be evicted). 
1857 */ 1858 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1859 if (refcount_count(&buf->b_refcnt) == 0) { 1860 ASSERT(list_link_active(&buf->b_arc_node)); 1861 } else { 1862 buf->b_flags &= ~ARC_PREFETCH; 1863 ARCSTAT_BUMP(arcstat_mru_hits); 1864 } 1865 buf->b_arc_access = lbolt; 1866 return; 1867 } 1868 1869 /* 1870 * This buffer has been "accessed" only once so far, 1871 * but it is still in the cache. Move it to the MFU 1872 * state. 1873 */ 1874 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1875 /* 1876 * More than 125ms have passed since we 1877 * instantiated this buffer. Move it to the 1878 * most frequently used state. 1879 */ 1880 buf->b_arc_access = lbolt; 1881 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1882 arc_change_state(arc_mfu, buf, hash_lock); 1883 } 1884 ARCSTAT_BUMP(arcstat_mru_hits); 1885 } else if (buf->b_state == arc_mru_ghost) { 1886 arc_state_t *new_state; 1887 /* 1888 * This buffer has been "accessed" recently, but 1889 * was evicted from the cache. Move it to the 1890 * MFU state. 1891 */ 1892 1893 if (buf->b_flags & ARC_PREFETCH) { 1894 new_state = arc_mru; 1895 if (refcount_count(&buf->b_refcnt) > 0) 1896 buf->b_flags &= ~ARC_PREFETCH; 1897 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1898 } else { 1899 new_state = arc_mfu; 1900 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1901 } 1902 1903 buf->b_arc_access = lbolt; 1904 arc_change_state(new_state, buf, hash_lock); 1905 1906 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 1907 } else if (buf->b_state == arc_mfu) { 1908 /* 1909 * This buffer has been accessed more than once and is 1910 * still in the cache. Keep it in the MFU state. 1911 * 1912 * NOTE: an add_reference() that occurred when we did 1913 * the arc_read() will have kicked this off the list. 1914 * If it was a prefetch, we will explicitly move it to 1915 * the head of the list now. 
1916 */ 1917 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1918 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1919 ASSERT(list_link_active(&buf->b_arc_node)); 1920 } 1921 ARCSTAT_BUMP(arcstat_mfu_hits); 1922 buf->b_arc_access = lbolt; 1923 } else if (buf->b_state == arc_mfu_ghost) { 1924 arc_state_t *new_state = arc_mfu; 1925 /* 1926 * This buffer has been accessed more than once but has 1927 * been evicted from the cache. Move it back to the 1928 * MFU state. 1929 */ 1930 1931 if (buf->b_flags & ARC_PREFETCH) { 1932 /* 1933 * This is a prefetch access... 1934 * move this block back to the MRU state. 1935 */ 1936 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1937 new_state = arc_mru; 1938 } 1939 1940 buf->b_arc_access = lbolt; 1941 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1942 arc_change_state(new_state, buf, hash_lock); 1943 1944 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1945 } else { 1946 ASSERT(!"invalid arc state"); 1947 } 1948 } 1949 1950 /* a generic arc_done_func_t which you can use */ 1951 /* ARGSUSED */ 1952 void 1953 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1954 { 1955 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1956 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1957 } 1958 1959 /* a generic arc_done_func_t */ 1960 void 1961 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1962 { 1963 arc_buf_t **bufp = arg; 1964 if (zio && zio->io_error) { 1965 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1966 *bufp = NULL; 1967 } else { 1968 *bufp = buf; 1969 } 1970 } 1971 1972 static void 1973 arc_read_done(zio_t *zio) 1974 { 1975 arc_buf_hdr_t *hdr, *found; 1976 arc_buf_t *buf; 1977 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1978 kmutex_t *hash_lock; 1979 arc_callback_t *callback_list, *acb; 1980 int freeable = FALSE; 1981 1982 buf = zio->io_private; 1983 hdr = buf->b_hdr; 1984 1985 /* 1986 * The hdr was inserted into hash-table and removed from lists 1987 * prior to starting I/O. 
We should find this header, since 1988 * it's in the hash table, and it should be legit since it's 1989 * not possible to evict it during the I/O. The only possible 1990 * reason for it not to be found is if we were freed during the 1991 * read. 1992 */ 1993 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1994 &hash_lock); 1995 1996 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1997 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1998 1999 /* byteswap if necessary */ 2000 callback_list = hdr->b_acb; 2001 ASSERT(callback_list != NULL); 2002 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 2003 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 2004 2005 arc_cksum_compute(buf); 2006 2007 /* create copies of the data buffer for the callers */ 2008 abuf = buf; 2009 for (acb = callback_list; acb; acb = acb->acb_next) { 2010 if (acb->acb_done) { 2011 if (abuf == NULL) 2012 abuf = arc_buf_clone(buf); 2013 acb->acb_buf = abuf; 2014 abuf = NULL; 2015 } 2016 } 2017 hdr->b_acb = NULL; 2018 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2019 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 2020 if (abuf == buf) 2021 hdr->b_flags |= ARC_BUF_AVAILABLE; 2022 2023 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2024 2025 if (zio->io_error != 0) { 2026 hdr->b_flags |= ARC_IO_ERROR; 2027 if (hdr->b_state != arc_anon) 2028 arc_change_state(arc_anon, hdr, hash_lock); 2029 if (HDR_IN_HASH_TABLE(hdr)) 2030 buf_hash_remove(hdr); 2031 freeable = refcount_is_zero(&hdr->b_refcnt); 2032 /* convert checksum errors into IO errors */ 2033 if (zio->io_error == ECKSUM) 2034 zio->io_error = EIO; 2035 } 2036 2037 /* 2038 * Broadcast before we drop the hash_lock to avoid the possibility 2039 * that the hdr (and hence the cv) might be freed before we get to 2040 * the cv_broadcast(). 2041 */ 2042 cv_broadcast(&hdr->b_cv); 2043 2044 if (hash_lock) { 2045 /* 2046 * Only call arc_access on anonymous buffers. 
This is because 2047 * if we've issued an I/O for an evicted buffer, we've already 2048 * called arc_access (to prevent any simultaneous readers from 2049 * getting confused). 2050 */ 2051 if (zio->io_error == 0 && hdr->b_state == arc_anon) 2052 arc_access(hdr, hash_lock); 2053 mutex_exit(hash_lock); 2054 } else { 2055 /* 2056 * This block was freed while we waited for the read to 2057 * complete. It has been removed from the hash table and 2058 * moved to the anonymous state (so that it won't show up 2059 * in the cache). 2060 */ 2061 ASSERT3P(hdr->b_state, ==, arc_anon); 2062 freeable = refcount_is_zero(&hdr->b_refcnt); 2063 } 2064 2065 /* execute each callback and free its structure */ 2066 while ((acb = callback_list) != NULL) { 2067 if (acb->acb_done) 2068 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2069 2070 if (acb->acb_zio_dummy != NULL) { 2071 acb->acb_zio_dummy->io_error = zio->io_error; 2072 zio_nowait(acb->acb_zio_dummy); 2073 } 2074 2075 callback_list = acb->acb_next; 2076 kmem_free(acb, sizeof (arc_callback_t)); 2077 } 2078 2079 if (freeable) 2080 arc_hdr_destroy(hdr); 2081 } 2082 2083 /* 2084 * "Read" the block block at the specified DVA (in bp) via the 2085 * cache. If the block is found in the cache, invoke the provided 2086 * callback immediately and return. Note that the `zio' parameter 2087 * in the callback will be NULL in this case, since no IO was 2088 * required. If the block is not in the cache pass the read request 2089 * on to the spa with a substitute callback function, so that the 2090 * requested block will be added to the cache. 2091 * 2092 * If a read request arrives for a block that has a read in-progress, 2093 * either wait for the in-progress read to complete (and return the 2094 * results); or, if this is a read with a "done" func, add a record 2095 * to the read to invoke the "done" func when the read completes, 2096 * and return; or just return. 
2097 * 2098 * arc_read_done() will invoke all the requested "done" functions 2099 * for readers of this block. 2100 */ 2101 int 2102 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 2103 arc_done_func_t *done, void *private, int priority, int flags, 2104 uint32_t *arc_flags, zbookmark_t *zb) 2105 { 2106 arc_buf_hdr_t *hdr; 2107 arc_buf_t *buf; 2108 kmutex_t *hash_lock; 2109 zio_t *rzio; 2110 2111 top: 2112 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2113 if (hdr && hdr->b_datacnt > 0) { 2114 2115 *arc_flags |= ARC_CACHED; 2116 2117 if (HDR_IO_IN_PROGRESS(hdr)) { 2118 2119 if (*arc_flags & ARC_WAIT) { 2120 cv_wait(&hdr->b_cv, hash_lock); 2121 mutex_exit(hash_lock); 2122 goto top; 2123 } 2124 ASSERT(*arc_flags & ARC_NOWAIT); 2125 2126 if (done) { 2127 arc_callback_t *acb = NULL; 2128 2129 acb = kmem_zalloc(sizeof (arc_callback_t), 2130 KM_SLEEP); 2131 acb->acb_done = done; 2132 acb->acb_private = private; 2133 acb->acb_byteswap = swap; 2134 if (pio != NULL) 2135 acb->acb_zio_dummy = zio_null(pio, 2136 spa, NULL, NULL, flags); 2137 2138 ASSERT(acb->acb_done != NULL); 2139 acb->acb_next = hdr->b_acb; 2140 hdr->b_acb = acb; 2141 add_reference(hdr, hash_lock, private); 2142 mutex_exit(hash_lock); 2143 return (0); 2144 } 2145 mutex_exit(hash_lock); 2146 return (0); 2147 } 2148 2149 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2150 2151 if (done) { 2152 add_reference(hdr, hash_lock, private); 2153 /* 2154 * If this block is already in use, create a new 2155 * copy of the data so that we will be guaranteed 2156 * that arc_release() will always succeed. 
2157 */ 2158 buf = hdr->b_buf; 2159 ASSERT(buf); 2160 ASSERT(buf->b_data); 2161 if (HDR_BUF_AVAILABLE(hdr)) { 2162 ASSERT(buf->b_efunc == NULL); 2163 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2164 } else { 2165 buf = arc_buf_clone(buf); 2166 } 2167 } else if (*arc_flags & ARC_PREFETCH && 2168 refcount_count(&hdr->b_refcnt) == 0) { 2169 hdr->b_flags |= ARC_PREFETCH; 2170 } 2171 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2172 arc_access(hdr, hash_lock); 2173 mutex_exit(hash_lock); 2174 ARCSTAT_BUMP(arcstat_hits); 2175 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2176 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2177 data, metadata, hits); 2178 2179 if (done) 2180 done(NULL, buf, private); 2181 } else { 2182 uint64_t size = BP_GET_LSIZE(bp); 2183 arc_callback_t *acb; 2184 2185 if (hdr == NULL) { 2186 /* this block is not in the cache */ 2187 arc_buf_hdr_t *exists; 2188 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2189 buf = arc_buf_alloc(spa, size, private, type); 2190 hdr = buf->b_hdr; 2191 hdr->b_dva = *BP_IDENTITY(bp); 2192 hdr->b_birth = bp->blk_birth; 2193 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2194 exists = buf_hash_insert(hdr, &hash_lock); 2195 if (exists) { 2196 /* somebody beat us to the hash insert */ 2197 mutex_exit(hash_lock); 2198 bzero(&hdr->b_dva, sizeof (dva_t)); 2199 hdr->b_birth = 0; 2200 hdr->b_cksum0 = 0; 2201 (void) arc_buf_remove_ref(buf, private); 2202 goto top; /* restart the IO request */ 2203 } 2204 /* if this is a prefetch, we don't have a reference */ 2205 if (*arc_flags & ARC_PREFETCH) { 2206 (void) remove_reference(hdr, hash_lock, 2207 private); 2208 hdr->b_flags |= ARC_PREFETCH; 2209 } 2210 if (BP_GET_LEVEL(bp) > 0) 2211 hdr->b_flags |= ARC_INDIRECT; 2212 } else { 2213 /* this block is in the ghost cache */ 2214 ASSERT(GHOST_STATE(hdr->b_state)); 2215 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2216 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2217 ASSERT(hdr->b_buf == NULL); 2218 2219 /* if this is a prefetch, we don't have a 
reference */ 2220 if (*arc_flags & ARC_PREFETCH) 2221 hdr->b_flags |= ARC_PREFETCH; 2222 else 2223 add_reference(hdr, hash_lock, private); 2224 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 2225 buf->b_hdr = hdr; 2226 buf->b_data = NULL; 2227 buf->b_efunc = NULL; 2228 buf->b_private = NULL; 2229 buf->b_next = NULL; 2230 hdr->b_buf = buf; 2231 arc_get_data_buf(buf); 2232 ASSERT(hdr->b_datacnt == 0); 2233 hdr->b_datacnt = 1; 2234 2235 } 2236 2237 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2238 acb->acb_done = done; 2239 acb->acb_private = private; 2240 acb->acb_byteswap = swap; 2241 2242 ASSERT(hdr->b_acb == NULL); 2243 hdr->b_acb = acb; 2244 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2245 2246 /* 2247 * If the buffer has been evicted, migrate it to a present state 2248 * before issuing the I/O. Once we drop the hash-table lock, 2249 * the header will be marked as I/O in progress and have an 2250 * attached buffer. At this point, anybody who finds this 2251 * buffer ought to notice that it's legit but has a pending I/O. 2252 */ 2253 2254 if (GHOST_STATE(hdr->b_state)) 2255 arc_access(hdr, hash_lock); 2256 mutex_exit(hash_lock); 2257 2258 ASSERT3U(hdr->b_size, ==, size); 2259 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2260 zbookmark_t *, zb); 2261 ARCSTAT_BUMP(arcstat_misses); 2262 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2263 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2264 data, metadata, misses); 2265 2266 rzio = zio_read(pio, spa, bp, buf->b_data, size, 2267 arc_read_done, buf, priority, flags, zb); 2268 2269 if (*arc_flags & ARC_WAIT) 2270 return (zio_wait(rzio)); 2271 2272 ASSERT(*arc_flags & ARC_NOWAIT); 2273 zio_nowait(rzio); 2274 } 2275 return (0); 2276 } 2277 2278 /* 2279 * arc_read() variant to support pool traversal. If the block is already 2280 * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 
2281 * The idea is that we don't want pool traversal filling up memory, but 2282 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2283 */ 2284 int 2285 arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2286 { 2287 arc_buf_hdr_t *hdr; 2288 kmutex_t *hash_mtx; 2289 int rc = 0; 2290 2291 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2292 2293 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2294 arc_buf_t *buf = hdr->b_buf; 2295 2296 ASSERT(buf); 2297 while (buf->b_data == NULL) { 2298 buf = buf->b_next; 2299 ASSERT(buf); 2300 } 2301 bcopy(buf->b_data, data, hdr->b_size); 2302 } else { 2303 rc = ENOENT; 2304 } 2305 2306 if (hash_mtx) 2307 mutex_exit(hash_mtx); 2308 2309 return (rc); 2310 } 2311 2312 void 2313 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2314 { 2315 ASSERT(buf->b_hdr != NULL); 2316 ASSERT(buf->b_hdr->b_state != arc_anon); 2317 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2318 buf->b_efunc = func; 2319 buf->b_private = private; 2320 } 2321 2322 /* 2323 * This is used by the DMU to let the ARC know that a buffer is 2324 * being evicted, so the ARC should clean up. If this arc buf 2325 * is not yet in the evicted state, it will be put there. 2326 */ 2327 int 2328 arc_buf_evict(arc_buf_t *buf) 2329 { 2330 arc_buf_hdr_t *hdr; 2331 kmutex_t *hash_lock; 2332 arc_buf_t **bufp; 2333 2334 mutex_enter(&arc_eviction_mtx); 2335 hdr = buf->b_hdr; 2336 if (hdr == NULL) { 2337 /* 2338 * We are in arc_do_user_evicts(). 2339 */ 2340 ASSERT(buf->b_data == NULL); 2341 mutex_exit(&arc_eviction_mtx); 2342 return (0); 2343 } 2344 hash_lock = HDR_LOCK(hdr); 2345 mutex_exit(&arc_eviction_mtx); 2346 2347 mutex_enter(hash_lock); 2348 2349 if (buf->b_data == NULL) { 2350 /* 2351 * We are on the eviction list. 2352 */ 2353 mutex_exit(hash_lock); 2354 mutex_enter(&arc_eviction_mtx); 2355 if (buf->b_hdr == NULL) { 2356 /* 2357 * We are already in arc_do_user_evicts(). 
2358 */ 2359 mutex_exit(&arc_eviction_mtx); 2360 return (0); 2361 } else { 2362 arc_buf_t copy = *buf; /* structure assignment */ 2363 /* 2364 * Process this buffer now 2365 * but let arc_do_user_evicts() do the reaping. 2366 */ 2367 buf->b_efunc = NULL; 2368 mutex_exit(&arc_eviction_mtx); 2369 VERIFY(copy.b_efunc(©) == 0); 2370 return (1); 2371 } 2372 } 2373 2374 ASSERT(buf->b_hdr == hdr); 2375 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2376 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2377 2378 /* 2379 * Pull this buffer off of the hdr 2380 */ 2381 bufp = &hdr->b_buf; 2382 while (*bufp != buf) 2383 bufp = &(*bufp)->b_next; 2384 *bufp = buf->b_next; 2385 2386 ASSERT(buf->b_data != NULL); 2387 arc_buf_destroy(buf, FALSE, FALSE); 2388 2389 if (hdr->b_datacnt == 0) { 2390 arc_state_t *old_state = hdr->b_state; 2391 arc_state_t *evicted_state; 2392 2393 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2394 2395 evicted_state = 2396 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2397 2398 mutex_enter(&old_state->arcs_mtx); 2399 mutex_enter(&evicted_state->arcs_mtx); 2400 2401 arc_change_state(evicted_state, hdr, hash_lock); 2402 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2403 hdr->b_flags = ARC_IN_HASH_TABLE; 2404 2405 mutex_exit(&evicted_state->arcs_mtx); 2406 mutex_exit(&old_state->arcs_mtx); 2407 } 2408 mutex_exit(hash_lock); 2409 2410 VERIFY(buf->b_efunc(buf) == 0); 2411 buf->b_efunc = NULL; 2412 buf->b_private = NULL; 2413 buf->b_hdr = NULL; 2414 kmem_cache_free(buf_cache, buf); 2415 return (1); 2416 } 2417 2418 /* 2419 * Release this buffer from the cache. This must be done 2420 * after a read and prior to modifying the buffer contents. 2421 * If the buffer has more than one reference, we must make 2422 * make a new hdr for the buffer. 
2423 */ 2424 void 2425 arc_release(arc_buf_t *buf, void *tag) 2426 { 2427 arc_buf_hdr_t *hdr = buf->b_hdr; 2428 kmutex_t *hash_lock = HDR_LOCK(hdr); 2429 2430 /* this buffer is not on any list */ 2431 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 2432 2433 if (hdr->b_state == arc_anon) { 2434 /* this buffer is already released */ 2435 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 2436 ASSERT(BUF_EMPTY(hdr)); 2437 ASSERT(buf->b_efunc == NULL); 2438 arc_buf_thaw(buf); 2439 return; 2440 } 2441 2442 mutex_enter(hash_lock); 2443 2444 /* 2445 * Do we have more than one buf? 2446 */ 2447 if (hdr->b_buf != buf || buf->b_next != NULL) { 2448 arc_buf_hdr_t *nhdr; 2449 arc_buf_t **bufp; 2450 uint64_t blksz = hdr->b_size; 2451 spa_t *spa = hdr->b_spa; 2452 arc_buf_contents_t type = hdr->b_type; 2453 2454 ASSERT(hdr->b_datacnt > 1); 2455 /* 2456 * Pull the data off of this buf and attach it to 2457 * a new anonymous buf. 2458 */ 2459 (void) remove_reference(hdr, hash_lock, tag); 2460 bufp = &hdr->b_buf; 2461 while (*bufp != buf) 2462 bufp = &(*bufp)->b_next; 2463 *bufp = (*bufp)->b_next; 2464 buf->b_next = NULL; 2465 2466 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 2467 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2468 if (refcount_is_zero(&hdr->b_refcnt)) { 2469 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 2470 ASSERT3U(*size, >=, hdr->b_size); 2471 atomic_add_64(size, -hdr->b_size); 2472 } 2473 hdr->b_datacnt -= 1; 2474 arc_cksum_verify(buf); 2475 2476 mutex_exit(hash_lock); 2477 2478 nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 2479 nhdr->b_size = blksz; 2480 nhdr->b_spa = spa; 2481 nhdr->b_type = type; 2482 nhdr->b_buf = buf; 2483 nhdr->b_state = arc_anon; 2484 nhdr->b_arc_access = 0; 2485 nhdr->b_flags = 0; 2486 nhdr->b_datacnt = 1; 2487 nhdr->b_freeze_cksum = NULL; 2488 (void) refcount_add(&nhdr->b_refcnt, tag); 2489 buf->b_hdr = nhdr; 2490 atomic_add_64(&arc_anon->arcs_size, blksz); 2491 2492 hdr = nhdr; 2493 } else { 2494 
ASSERT(refcount_count(&hdr->b_refcnt) == 1); 2495 ASSERT(!list_link_active(&hdr->b_arc_node)); 2496 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2497 arc_change_state(arc_anon, hdr, hash_lock); 2498 hdr->b_arc_access = 0; 2499 mutex_exit(hash_lock); 2500 bzero(&hdr->b_dva, sizeof (dva_t)); 2501 hdr->b_birth = 0; 2502 hdr->b_cksum0 = 0; 2503 arc_buf_thaw(buf); 2504 } 2505 buf->b_efunc = NULL; 2506 buf->b_private = NULL; 2507 } 2508 2509 int 2510 arc_released(arc_buf_t *buf) 2511 { 2512 return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2513 } 2514 2515 int 2516 arc_has_callback(arc_buf_t *buf) 2517 { 2518 return (buf->b_efunc != NULL); 2519 } 2520 2521 #ifdef ZFS_DEBUG 2522 int 2523 arc_referenced(arc_buf_t *buf) 2524 { 2525 return (refcount_count(&buf->b_hdr->b_refcnt)); 2526 } 2527 #endif 2528 2529 static void 2530 arc_write_ready(zio_t *zio) 2531 { 2532 arc_write_callback_t *callback = zio->io_private; 2533 arc_buf_t *buf = callback->awcb_buf; 2534 2535 if (callback->awcb_ready) { 2536 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2537 callback->awcb_ready(zio, buf, callback->awcb_private); 2538 } 2539 arc_cksum_compute(buf); 2540 } 2541 2542 static void 2543 arc_write_done(zio_t *zio) 2544 { 2545 arc_write_callback_t *callback = zio->io_private; 2546 arc_buf_t *buf = callback->awcb_buf; 2547 arc_buf_hdr_t *hdr = buf->b_hdr; 2548 2549 hdr->b_acb = NULL; 2550 2551 /* this buffer is on no lists and is not in the hash table */ 2552 ASSERT3P(hdr->b_state, ==, arc_anon); 2553 2554 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 2555 hdr->b_birth = zio->io_bp->blk_birth; 2556 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 2557 /* 2558 * If the block to be written was all-zero, we may have 2559 * compressed it away. In this case no write was performed 2560 * so there will be no dva/birth-date/checksum. The buffer 2561 * must therefor remain anonymous (and uncached). 
2562 */ 2563 if (!BUF_EMPTY(hdr)) { 2564 arc_buf_hdr_t *exists; 2565 kmutex_t *hash_lock; 2566 2567 arc_cksum_verify(buf); 2568 2569 exists = buf_hash_insert(hdr, &hash_lock); 2570 if (exists) { 2571 /* 2572 * This can only happen if we overwrite for 2573 * sync-to-convergence, because we remove 2574 * buffers from the hash table when we arc_free(). 2575 */ 2576 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2577 BP_IDENTITY(zio->io_bp))); 2578 ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2579 zio->io_bp->blk_birth); 2580 2581 ASSERT(refcount_is_zero(&exists->b_refcnt)); 2582 arc_change_state(arc_anon, exists, hash_lock); 2583 mutex_exit(hash_lock); 2584 arc_hdr_destroy(exists); 2585 exists = buf_hash_insert(hdr, &hash_lock); 2586 ASSERT3P(exists, ==, NULL); 2587 } 2588 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2589 arc_access(hdr, hash_lock); 2590 mutex_exit(hash_lock); 2591 } else if (callback->awcb_done == NULL) { 2592 int destroy_hdr; 2593 /* 2594 * This is an anonymous buffer with no user callback, 2595 * destroy it if there are no active references. 
2596 */ 2597 mutex_enter(&arc_eviction_mtx); 2598 destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 2599 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2600 mutex_exit(&arc_eviction_mtx); 2601 if (destroy_hdr) 2602 arc_hdr_destroy(hdr); 2603 } else { 2604 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2605 } 2606 2607 if (callback->awcb_done) { 2608 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 2609 callback->awcb_done(zio, buf, callback->awcb_private); 2610 } 2611 2612 kmem_free(callback, sizeof (arc_write_callback_t)); 2613 } 2614 2615 zio_t * 2616 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2617 uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 2618 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 2619 int flags, zbookmark_t *zb) 2620 { 2621 arc_buf_hdr_t *hdr = buf->b_hdr; 2622 arc_write_callback_t *callback; 2623 zio_t *zio; 2624 2625 /* this is a private buffer - no locking required */ 2626 ASSERT3P(hdr->b_state, ==, arc_anon); 2627 ASSERT(BUF_EMPTY(hdr)); 2628 ASSERT(!HDR_IO_ERROR(hdr)); 2629 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 2630 ASSERT(hdr->b_acb == 0); 2631 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 2632 callback->awcb_ready = ready; 2633 callback->awcb_done = done; 2634 callback->awcb_private = private; 2635 callback->awcb_buf = buf; 2636 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2637 zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, 2638 buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 2639 priority, flags, zb); 2640 2641 return (zio); 2642 } 2643 2644 int 2645 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2646 zio_done_func_t *done, void *private, uint32_t arc_flags) 2647 { 2648 arc_buf_hdr_t *ab; 2649 kmutex_t *hash_lock; 2650 zio_t *zio; 2651 2652 /* 2653 * If this buffer is in the cache, release it, so it 2654 * can be re-used. 
2655 */ 2656 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2657 if (ab != NULL) { 2658 /* 2659 * The checksum of blocks to free is not always 2660 * preserved (eg. on the deadlist). However, if it is 2661 * nonzero, it should match what we have in the cache. 2662 */ 2663 ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2664 ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 2665 if (ab->b_state != arc_anon) 2666 arc_change_state(arc_anon, ab, hash_lock); 2667 if (HDR_IO_IN_PROGRESS(ab)) { 2668 /* 2669 * This should only happen when we prefetch. 2670 */ 2671 ASSERT(ab->b_flags & ARC_PREFETCH); 2672 ASSERT3U(ab->b_datacnt, ==, 1); 2673 ab->b_flags |= ARC_FREED_IN_READ; 2674 if (HDR_IN_HASH_TABLE(ab)) 2675 buf_hash_remove(ab); 2676 ab->b_arc_access = 0; 2677 bzero(&ab->b_dva, sizeof (dva_t)); 2678 ab->b_birth = 0; 2679 ab->b_cksum0 = 0; 2680 ab->b_buf->b_efunc = NULL; 2681 ab->b_buf->b_private = NULL; 2682 mutex_exit(hash_lock); 2683 } else if (refcount_is_zero(&ab->b_refcnt)) { 2684 mutex_exit(hash_lock); 2685 arc_hdr_destroy(ab); 2686 ARCSTAT_BUMP(arcstat_deleted); 2687 } else { 2688 /* 2689 * We still have an active reference on this 2690 * buffer. This can happen, e.g., from 2691 * dbuf_unoverride(). 
2692 */ 2693 ASSERT(!HDR_IN_HASH_TABLE(ab)); 2694 ab->b_arc_access = 0; 2695 bzero(&ab->b_dva, sizeof (dva_t)); 2696 ab->b_birth = 0; 2697 ab->b_cksum0 = 0; 2698 ab->b_buf->b_efunc = NULL; 2699 ab->b_buf->b_private = NULL; 2700 mutex_exit(hash_lock); 2701 } 2702 } 2703 2704 zio = zio_free(pio, spa, txg, bp, done, private); 2705 2706 if (arc_flags & ARC_WAIT) 2707 return (zio_wait(zio)); 2708 2709 ASSERT(arc_flags & ARC_NOWAIT); 2710 zio_nowait(zio); 2711 2712 return (0); 2713 } 2714 2715 void 2716 arc_tempreserve_clear(uint64_t tempreserve) 2717 { 2718 atomic_add_64(&arc_tempreserve, -tempreserve); 2719 ASSERT((int64_t)arc_tempreserve >= 0); 2720 } 2721 2722 int 2723 arc_tempreserve_space(uint64_t tempreserve) 2724 { 2725 #ifdef ZFS_DEBUG 2726 /* 2727 * Once in a while, fail for no reason. Everything should cope. 2728 */ 2729 if (spa_get_random(10000) == 0) { 2730 dprintf("forcing random failure\n"); 2731 return (ERESTART); 2732 } 2733 #endif 2734 if (tempreserve > arc_c/4 && !arc_no_grow) 2735 arc_c = MIN(arc_c_max, tempreserve * 4); 2736 if (tempreserve > arc_c) 2737 return (ENOMEM); 2738 2739 /* 2740 * Throttle writes when the amount of dirty data in the cache 2741 * gets too large. We try to keep the cache less than half full 2742 * of dirty blocks so that our sync times don't grow too large. 2743 * Note: if two requests come in concurrently, we might let them 2744 * both succeed, when one of them should fail. Not a huge deal. 2745 * 2746 * XXX The limit should be adjusted dynamically to keep the time 2747 * to sync a dataset fixed (around 1-5 seconds?). 
2748 */ 2749 2750 if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 2751 arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { 2752 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 2753 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 2754 arc_tempreserve>>10, 2755 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 2756 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 2757 tempreserve>>10, arc_c>>10); 2758 return (ERESTART); 2759 } 2760 atomic_add_64(&arc_tempreserve, tempreserve); 2761 return (0); 2762 } 2763 2764 void 2765 arc_init(void) 2766 { 2767 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2768 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2769 2770 /* Convert seconds to clock ticks */ 2771 arc_min_prefetch_lifespan = 1 * hz; 2772 2773 /* Start out with 1/8 of all memory */ 2774 arc_c = physmem * PAGESIZE / 8; 2775 2776 #ifdef _KERNEL 2777 /* 2778 * On architectures where the physical memory can be larger 2779 * than the addressable space (intel in 32-bit mode), we may 2780 * need to limit the cache to 1/8 of VM size. 2781 */ 2782 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2783 #endif 2784 2785 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2786 arc_c_min = MAX(arc_c / 4, 64<<20); 2787 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2788 if (arc_c * 8 >= 1<<30) 2789 arc_c_max = (arc_c * 8) - (1<<30); 2790 else 2791 arc_c_max = arc_c_min; 2792 arc_c_max = MAX(arc_c * 6, arc_c_max); 2793 2794 /* 2795 * Allow the tunables to override our calculations if they are 2796 * reasonable (ie. 
over 64MB) 2797 */ 2798 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 2799 arc_c_max = zfs_arc_max; 2800 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 2801 arc_c_min = zfs_arc_min; 2802 2803 arc_c = arc_c_max; 2804 arc_p = (arc_c >> 1); 2805 2806 /* limit meta-data to 1/4 of the arc capacity */ 2807 arc_meta_limit = arc_c_max / 4; 2808 2809 /* Allow the tunable to override if it is reasonable */ 2810 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 2811 arc_meta_limit = zfs_arc_meta_limit; 2812 2813 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 2814 arc_c_min = arc_meta_limit / 2; 2815 2816 /* if kmem_flags are set, lets try to use less memory */ 2817 if (kmem_debugging()) 2818 arc_c = arc_c / 2; 2819 if (arc_c < arc_c_min) 2820 arc_c = arc_c_min; 2821 2822 arc_anon = &ARC_anon; 2823 arc_mru = &ARC_mru; 2824 arc_mru_ghost = &ARC_mru_ghost; 2825 arc_mfu = &ARC_mfu; 2826 arc_mfu_ghost = &ARC_mfu_ghost; 2827 arc_size = 0; 2828 2829 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2830 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2831 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2832 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2833 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2834 2835 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 2836 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2837 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 2838 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2839 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 2840 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2841 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 2842 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2843 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 2844 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2845 
list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 2846 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2847 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 2848 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2849 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 2850 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2851 2852 buf_init(); 2853 2854 arc_thread_exit = 0; 2855 arc_eviction_list = NULL; 2856 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2857 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2858 2859 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 2860 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 2861 2862 if (arc_ksp != NULL) { 2863 arc_ksp->ks_data = &arc_stats; 2864 kstat_install(arc_ksp); 2865 } 2866 2867 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2868 TS_RUN, minclsyspri); 2869 2870 arc_dead = FALSE; 2871 } 2872 2873 void 2874 arc_fini(void) 2875 { 2876 mutex_enter(&arc_reclaim_thr_lock); 2877 arc_thread_exit = 1; 2878 while (arc_thread_exit != 0) 2879 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2880 mutex_exit(&arc_reclaim_thr_lock); 2881 2882 arc_flush(); 2883 2884 arc_dead = TRUE; 2885 2886 if (arc_ksp != NULL) { 2887 kstat_delete(arc_ksp); 2888 arc_ksp = NULL; 2889 } 2890 2891 mutex_destroy(&arc_eviction_mtx); 2892 mutex_destroy(&arc_reclaim_thr_lock); 2893 cv_destroy(&arc_reclaim_thr_cv); 2894 2895 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 2896 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 2897 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 2898 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 2899 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 2900 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 2901 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 2902 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 2903 2904 
mutex_destroy(&arc_anon->arcs_mtx); 2905 mutex_destroy(&arc_mru->arcs_mtx); 2906 mutex_destroy(&arc_mru_ghost->arcs_mtx); 2907 mutex_destroy(&arc_mfu->arcs_mtx); 2908 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 2909 2910 buf_fini(); 2911 } 2912