/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()). Note, however, that the data associated
 * with the buffer may be evicted prior to the callback. The callback
 * must be made with *no locks held* (to prevent deadlock). Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * When non-zero, hdr_recl() no longer signals the reclaim thread
 * (the kmem reclaim callback can fire after arc_fini()).
 */
static int arc_dead;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states. These are
 * the only buffers that can be evicted or deleted. Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA. These are buffers that hold dirty block copies
 * before they are written to stable storage. By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */

/*
 * Per-state bookkeeping. The list and lsize arrays are indexed by
 * buffer contents type (arc_buf_contents_t).
 */
typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;	/* protects the buffer lists of this state */
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

/*
 * Named kstat counters exported by the ARC. The arc_stats initializer
 * that follows is positional, so it must list its entries in exactly
 * the order of the members declared here.
 */
typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 }
};

/* Lvalue for the 64-bit value of the named statistic. */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/*
 * Atomically add val to the named statistic.
 * NOTE: the expansion already ends in a semicolon, so an invocation
 * followed by ';' produces an extra empty statement; do not use it as
 * the sole body of an unbraced if that has an else.
 */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val));

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

/*
 * Raise the named statistic to val if val is larger, retrying the
 * compare-and-swap until it succeeds or the stored value is no longer
 * smaller. val is evaluated more than once, and the expansion is a
 * bare brace block rather than a single statement.
 */
#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

/* Track the high-water mark of `stat' in the companion `stat_max'. */
#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 *
 * The counter that gets bumped is named by token pasting:
 *	arcstat_<stat1 or notstat1>_<stat2 or notstat2>_<stat>
 * where stat1/stat2 are chosen when cond1/cond2 are true and the
 * "not" names otherwise (e.g. arcstat_demand_data_hits).
 *
 * NOTE: the expansion is a bare if/else statement, not wrapped in
 * do { } while (0); brace any enclosing if that uses it.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}

kstat_t			*arc_ksp;
/*
 * Pointers to the five states. NOTE(review): presumably aimed at the
 * ARC_* state objects during initialization, which is not visible in
 * this part of the file -- confirm against arc_init().
 */
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them. For these variables, we therefore define them to be in
 * terms of the statistic variable. This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
310 */ 311 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 312 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 313 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 314 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 315 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 316 317 static int arc_no_grow; /* Don't try to grow cache size */ 318 static uint64_t arc_tempreserve; 319 static uint64_t arc_meta_used; 320 static uint64_t arc_meta_limit; 321 static uint64_t arc_meta_max = 0; 322 323 typedef struct arc_callback arc_callback_t; 324 325 struct arc_callback { 326 void *acb_private; 327 arc_done_func_t *acb_done; 328 arc_byteswap_func_t *acb_byteswap; 329 arc_buf_t *acb_buf; 330 zio_t *acb_zio_dummy; 331 arc_callback_t *acb_next; 332 }; 333 334 typedef struct arc_write_callback arc_write_callback_t; 335 336 struct arc_write_callback { 337 void *awcb_private; 338 arc_done_func_t *awcb_ready; 339 arc_done_func_t *awcb_done; 340 arc_buf_t *awcb_buf; 341 }; 342 343 struct arc_buf_hdr { 344 /* protected by hash lock */ 345 dva_t b_dva; 346 uint64_t b_birth; 347 uint64_t b_cksum0; 348 349 kmutex_t b_freeze_lock; 350 zio_cksum_t *b_freeze_cksum; 351 352 arc_buf_hdr_t *b_hash_next; 353 arc_buf_t *b_buf; 354 uint32_t b_flags; 355 uint32_t b_datacnt; 356 357 arc_callback_t *b_acb; 358 kcondvar_t b_cv; 359 360 /* immutable */ 361 arc_buf_contents_t b_type; 362 uint64_t b_size; 363 spa_t *b_spa; 364 365 /* protected by arc state mutex */ 366 arc_state_t *b_state; 367 list_node_t b_arc_node; 368 369 /* updated atomically */ 370 clock_t b_arc_access; 371 372 /* self protecting */ 373 refcount_t b_refcnt; 374 }; 375 376 static arc_buf_t *arc_eviction_list; 377 static kmutex_t arc_eviction_mtx; 378 static arc_buf_hdr_t arc_eviction_hdr; 379 static void arc_get_data_buf(arc_buf_t *buf); 380 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 381 static int arc_evict_needed(arc_buf_contents_t 
type); 382 383 #define GHOST_STATE(state) \ 384 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) 385 386 /* 387 * Private ARC flags. These flags are private ARC only flags that will show up 388 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 389 * be passed in as arc_flags in things like arc_read. However, these flags 390 * should never be passed and should only be set by ARC code. When adding new 391 * public flags, make sure not to smash the private ones. 392 */ 393 394 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 395 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 396 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 397 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 398 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 399 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 400 401 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 402 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 403 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 404 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 405 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 406 407 /* 408 * Hash table routines 409 */ 410 411 #define HT_LOCK_PAD 64 412 413 struct ht_lock { 414 kmutex_t ht_lock; 415 #ifdef _KERNEL 416 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 417 #endif 418 }; 419 420 #define BUF_LOCKS 256 421 typedef struct buf_hash_table { 422 uint64_t ht_mask; 423 arc_buf_hdr_t **ht_table; 424 struct ht_lock ht_locks[BUF_LOCKS]; 425 } buf_hash_table_t; 426 427 static buf_hash_table_t buf_hash_table; 428 429 #define BUF_HASH_INDEX(spa, dva, birth) \ 430 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 431 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 432 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 433 #define HDR_LOCK(buf) 
\ 434 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 435 436 uint64_t zfs_crc64_table[256]; 437 438 static uint64_t 439 buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 440 { 441 uintptr_t spav = (uintptr_t)spa; 442 uint8_t *vdva = (uint8_t *)dva; 443 uint64_t crc = -1ULL; 444 int i; 445 446 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 447 448 for (i = 0; i < sizeof (dva_t); i++) 449 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 450 451 crc ^= (spav>>8) ^ birth; 452 453 return (crc); 454 } 455 456 #define BUF_EMPTY(buf) \ 457 ((buf)->b_dva.dva_word[0] == 0 && \ 458 (buf)->b_dva.dva_word[1] == 0 && \ 459 (buf)->b_birth == 0) 460 461 #define BUF_EQUAL(spa, dva, birth, buf) \ 462 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 463 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 464 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 465 466 static arc_buf_hdr_t * 467 buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 468 { 469 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 470 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 471 arc_buf_hdr_t *buf; 472 473 mutex_enter(hash_lock); 474 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 475 buf = buf->b_hash_next) { 476 if (BUF_EQUAL(spa, dva, birth, buf)) { 477 *lockp = hash_lock; 478 return (buf); 479 } 480 } 481 mutex_exit(hash_lock); 482 *lockp = NULL; 483 return (NULL); 484 } 485 486 /* 487 * Insert an entry into the hash table. If there is already an element 488 * equal to elem in the hash table, then the already existing element 489 * will be returned and the new element will not be inserted. 490 * Otherwise returns NULL. 
491 */ 492 static arc_buf_hdr_t * 493 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 494 { 495 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 496 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 497 arc_buf_hdr_t *fbuf; 498 uint32_t i; 499 500 ASSERT(!HDR_IN_HASH_TABLE(buf)); 501 *lockp = hash_lock; 502 mutex_enter(hash_lock); 503 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 504 fbuf = fbuf->b_hash_next, i++) { 505 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 506 return (fbuf); 507 } 508 509 buf->b_hash_next = buf_hash_table.ht_table[idx]; 510 buf_hash_table.ht_table[idx] = buf; 511 buf->b_flags |= ARC_IN_HASH_TABLE; 512 513 /* collect some hash table performance data */ 514 if (i > 0) { 515 ARCSTAT_BUMP(arcstat_hash_collisions); 516 if (i == 1) 517 ARCSTAT_BUMP(arcstat_hash_chains); 518 519 ARCSTAT_MAX(arcstat_hash_chain_max, i); 520 } 521 522 ARCSTAT_BUMP(arcstat_hash_elements); 523 ARCSTAT_MAXSTAT(arcstat_hash_elements); 524 525 return (NULL); 526 } 527 528 static void 529 buf_hash_remove(arc_buf_hdr_t *buf) 530 { 531 arc_buf_hdr_t *fbuf, **bufp; 532 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 533 534 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 535 ASSERT(HDR_IN_HASH_TABLE(buf)); 536 537 bufp = &buf_hash_table.ht_table[idx]; 538 while ((fbuf = *bufp) != buf) { 539 ASSERT(fbuf != NULL); 540 bufp = &fbuf->b_hash_next; 541 } 542 *bufp = buf->b_hash_next; 543 buf->b_hash_next = NULL; 544 buf->b_flags &= ~ARC_IN_HASH_TABLE; 545 546 /* collect some hash table performance data */ 547 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 548 549 if (buf_hash_table.ht_table[idx] && 550 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 551 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 552 } 553 554 /* 555 * Global data structures and functions for the buf kmem cache. 
556 */ 557 static kmem_cache_t *hdr_cache; 558 static kmem_cache_t *buf_cache; 559 560 static void 561 buf_fini(void) 562 { 563 int i; 564 565 kmem_free(buf_hash_table.ht_table, 566 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 567 for (i = 0; i < BUF_LOCKS; i++) 568 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 569 kmem_cache_destroy(hdr_cache); 570 kmem_cache_destroy(buf_cache); 571 } 572 573 /* 574 * Constructor callback - called when the cache is empty 575 * and a new buf is requested. 576 */ 577 /* ARGSUSED */ 578 static int 579 hdr_cons(void *vbuf, void *unused, int kmflag) 580 { 581 arc_buf_hdr_t *buf = vbuf; 582 583 bzero(buf, sizeof (arc_buf_hdr_t)); 584 refcount_create(&buf->b_refcnt); 585 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 586 return (0); 587 } 588 589 /* 590 * Destructor callback - called when a cached buf is 591 * no longer required. 592 */ 593 /* ARGSUSED */ 594 static void 595 hdr_dest(void *vbuf, void *unused) 596 { 597 arc_buf_hdr_t *buf = vbuf; 598 599 refcount_destroy(&buf->b_refcnt); 600 cv_destroy(&buf->b_cv); 601 } 602 603 /* 604 * Reclaim callback -- invoked when memory is low. 605 */ 606 /* ARGSUSED */ 607 static void 608 hdr_recl(void *unused) 609 { 610 dprintf("hdr_recl called\n"); 611 /* 612 * umem calls the reclaim func when we destroy the buf cache, 613 * which is after we do arc_fini(). 614 */ 615 if (!arc_dead) 616 cv_signal(&arc_reclaim_thr_cv); 617 } 618 619 static void 620 buf_init(void) 621 { 622 uint64_t *ct; 623 uint64_t hsize = 1ULL << 12; 624 int i, j; 625 626 /* 627 * The hash table is big enough to fill all of physical memory 628 * with an average 64K block size. The table will take up 629 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
630 */ 631 while (hsize * 65536 < physmem * PAGESIZE) 632 hsize <<= 1; 633 retry: 634 buf_hash_table.ht_mask = hsize - 1; 635 buf_hash_table.ht_table = 636 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 637 if (buf_hash_table.ht_table == NULL) { 638 ASSERT(hsize > (1ULL << 8)); 639 hsize >>= 1; 640 goto retry; 641 } 642 643 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 644 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 645 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 646 0, NULL, NULL, NULL, NULL, NULL, 0); 647 648 for (i = 0; i < 256; i++) 649 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 650 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 651 652 for (i = 0; i < BUF_LOCKS; i++) { 653 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 654 NULL, MUTEX_DEFAULT, NULL); 655 } 656 } 657 658 #define ARC_MINTIME (hz>>4) /* 62 ms */ 659 660 static void 661 arc_cksum_verify(arc_buf_t *buf) 662 { 663 zio_cksum_t zc; 664 665 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 666 return; 667 668 mutex_enter(&buf->b_hdr->b_freeze_lock); 669 if (buf->b_hdr->b_freeze_cksum == NULL || 670 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 671 mutex_exit(&buf->b_hdr->b_freeze_lock); 672 return; 673 } 674 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 675 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 676 panic("buffer modified while frozen!"); 677 mutex_exit(&buf->b_hdr->b_freeze_lock); 678 } 679 680 static void 681 arc_cksum_compute(arc_buf_t *buf) 682 { 683 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 684 return; 685 686 mutex_enter(&buf->b_hdr->b_freeze_lock); 687 if (buf->b_hdr->b_freeze_cksum != NULL) { 688 mutex_exit(&buf->b_hdr->b_freeze_lock); 689 return; 690 } 691 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 692 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 693 buf->b_hdr->b_freeze_cksum); 694 mutex_exit(&buf->b_hdr->b_freeze_lock); 695 } 696 697 void 698 arc_buf_thaw(arc_buf_t *buf) 699 { 700 
if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 701 return; 702 703 if (buf->b_hdr->b_state != arc_anon) 704 panic("modifying non-anon buffer!"); 705 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 706 panic("modifying buffer while i/o in progress!"); 707 arc_cksum_verify(buf); 708 mutex_enter(&buf->b_hdr->b_freeze_lock); 709 if (buf->b_hdr->b_freeze_cksum != NULL) { 710 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 711 buf->b_hdr->b_freeze_cksum = NULL; 712 } 713 mutex_exit(&buf->b_hdr->b_freeze_lock); 714 } 715 716 void 717 arc_buf_freeze(arc_buf_t *buf) 718 { 719 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 720 return; 721 722 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 723 buf->b_hdr->b_state == arc_anon); 724 arc_cksum_compute(buf); 725 } 726 727 static void 728 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 729 { 730 ASSERT(MUTEX_HELD(hash_lock)); 731 732 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 733 (ab->b_state != arc_anon)) { 734 uint64_t delta = ab->b_size * ab->b_datacnt; 735 list_t *list = &ab->b_state->arcs_list[ab->b_type]; 736 uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type]; 737 738 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 739 mutex_enter(&ab->b_state->arcs_mtx); 740 ASSERT(list_link_active(&ab->b_arc_node)); 741 list_remove(list, ab); 742 if (GHOST_STATE(ab->b_state)) { 743 ASSERT3U(ab->b_datacnt, ==, 0); 744 ASSERT3P(ab->b_buf, ==, NULL); 745 delta = ab->b_size; 746 } 747 ASSERT(delta > 0); 748 ASSERT3U(*size, >=, delta); 749 atomic_add_64(size, -delta); 750 mutex_exit(&ab->b_state->arcs_mtx); 751 /* remove the prefetch flag is we get a reference */ 752 if (ab->b_flags & ARC_PREFETCH) 753 ab->b_flags &= ~ARC_PREFETCH; 754 } 755 } 756 757 static int 758 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 759 { 760 int cnt; 761 arc_state_t *state = ab->b_state; 762 763 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 764 ASSERT(!GHOST_STATE(state)); 765 766 if (((cnt = refcount_remove(&ab->b_refcnt, 
tag)) == 0) && 767 (state != arc_anon)) { 768 uint64_t *size = &state->arcs_lsize[ab->b_type]; 769 770 ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 771 mutex_enter(&state->arcs_mtx); 772 ASSERT(!list_link_active(&ab->b_arc_node)); 773 list_insert_head(&state->arcs_list[ab->b_type], ab); 774 ASSERT(ab->b_datacnt > 0); 775 atomic_add_64(size, ab->b_size * ab->b_datacnt); 776 mutex_exit(&state->arcs_mtx); 777 } 778 return (cnt); 779 } 780 781 /* 782 * Move the supplied buffer to the indicated state. The mutex 783 * for the buffer must be held by the caller. 784 */ 785 static void 786 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 787 { 788 arc_state_t *old_state = ab->b_state; 789 int64_t refcnt = refcount_count(&ab->b_refcnt); 790 uint64_t from_delta, to_delta; 791 792 ASSERT(MUTEX_HELD(hash_lock)); 793 ASSERT(new_state != old_state); 794 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 795 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 796 797 from_delta = to_delta = ab->b_datacnt * ab->b_size; 798 799 /* 800 * If this buffer is evictable, transfer it from the 801 * old state list to the new state list. 802 */ 803 if (refcnt == 0) { 804 if (old_state != arc_anon) { 805 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 806 uint64_t *size = &old_state->arcs_lsize[ab->b_type]; 807 808 if (use_mutex) 809 mutex_enter(&old_state->arcs_mtx); 810 811 ASSERT(list_link_active(&ab->b_arc_node)); 812 list_remove(&old_state->arcs_list[ab->b_type], ab); 813 814 /* 815 * If prefetching out of the ghost cache, 816 * we will have a non-null datacnt. 
817 */ 818 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 819 /* ghost elements have a ghost size */ 820 ASSERT(ab->b_buf == NULL); 821 from_delta = ab->b_size; 822 } 823 ASSERT3U(*size, >=, from_delta); 824 atomic_add_64(size, -from_delta); 825 826 if (use_mutex) 827 mutex_exit(&old_state->arcs_mtx); 828 } 829 if (new_state != arc_anon) { 830 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 831 uint64_t *size = &new_state->arcs_lsize[ab->b_type]; 832 833 if (use_mutex) 834 mutex_enter(&new_state->arcs_mtx); 835 836 list_insert_head(&new_state->arcs_list[ab->b_type], ab); 837 838 /* ghost elements have a ghost size */ 839 if (GHOST_STATE(new_state)) { 840 ASSERT(ab->b_datacnt == 0); 841 ASSERT(ab->b_buf == NULL); 842 to_delta = ab->b_size; 843 } 844 atomic_add_64(size, to_delta); 845 ASSERT3U(new_state->arcs_size + to_delta, >=, *size); 846 847 if (use_mutex) 848 mutex_exit(&new_state->arcs_mtx); 849 } 850 } 851 852 ASSERT(!BUF_EMPTY(ab)); 853 if (new_state == arc_anon && old_state != arc_anon) { 854 buf_hash_remove(ab); 855 } 856 857 /* adjust state sizes */ 858 if (to_delta) 859 atomic_add_64(&new_state->arcs_size, to_delta); 860 if (from_delta) { 861 ASSERT3U(old_state->arcs_size, >=, from_delta); 862 atomic_add_64(&old_state->arcs_size, -from_delta); 863 } 864 ab->b_state = new_state; 865 } 866 867 void 868 arc_space_consume(uint64_t space) 869 { 870 atomic_add_64(&arc_meta_used, space); 871 atomic_add_64(&arc_size, space); 872 } 873 874 void 875 arc_space_return(uint64_t space) 876 { 877 ASSERT(arc_meta_used >= space); 878 if (arc_meta_max < arc_meta_used) 879 arc_meta_max = arc_meta_used; 880 atomic_add_64(&arc_meta_used, -space); 881 ASSERT(arc_size >= space); 882 atomic_add_64(&arc_size, -space); 883 } 884 885 void * 886 arc_data_buf_alloc(uint64_t size) 887 { 888 if (arc_evict_needed(ARC_BUFC_DATA)) 889 cv_signal(&arc_reclaim_thr_cv); 890 atomic_add_64(&arc_size, size); 891 return (zio_data_buf_alloc(size)); 892 } 893 894 void 895 
arc_data_buf_free(void *buf, uint64_t size)
{
	zio_data_buf_free(buf, size);
	ASSERT(arc_size >= size);
	atomic_add_64(&arc_size, -size);
}

/*
 * Allocate a new, anonymous buffer of `size' bytes.
 *
 * A fresh header is taken from hdr_cache and placed in the arc_anon
 * state, a single arc_buf_t is attached to it, and the data block is
 * obtained through arc_get_data_buf() (which may evict to make room).
 * The header is returned holding exactly one reference, owned by `tag'.
 */
arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_type = type;
	hdr->b_spa = spa;
	hdr->b_state = arc_anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	hdr->b_buf = buf;
	/* needs b_hdr, b_size, b_type and b_state set up first */
	arc_get_data_buf(buf);
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	return (buf);
}

/*
 * Attach one more arc_buf_t to the header that owns `from', giving it
 * its own data block filled with a copy of from's data.  The new buf is
 * pushed on the front of the header's buf list and b_datacnt is bumped.
 * No reference is taken here; that is left to the caller.
 */
static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);
	hdr->b_datacnt += 1;
	return (buf);
}

/*
 * Take a new reference (owned by `tag') on a buffer the caller already
 * has a pointer to.  If the buffer is in the middle of being evicted
 * (b_hdr already cleared) or its data has been released (b_data is
 * NULL), return without taking a reference.  Otherwise the access is
 * recorded through arc_access() and counted as a cache hit.
 */
void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is currently being evicted via
	 * arc_do_user_evicts().
	 */
	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		mutex_exit(&arc_eviction_mtx);
		return;
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);
	if (buf->b_data == NULL) {
		/*
		 * This buffer is evicted.
		 */
		mutex_exit(hash_lock);
		return;
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
	add_reference(hdr, hash_lock, tag);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	/*
	 * NOTE(review): b_flags/b_type are read after hash_lock has been
	 * dropped.  We hold a reference at this point, so presumably the
	 * header cannot go away, but the flags may change underneath us.
	 */
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
}

/*
 * Release the data block held by `buf' and, optionally, the buf itself.
 *
 *	recycle	- do not free the data block and do not subtract it from
 *		  the global cache size; the caller is taking the block
 *		  over (see arc_evict()).
 *	all	- also unlink the buf from its header and free the
 *		  arc_buf_t.  When FALSE only the data is released and
 *		  the buf stays on the header's list.
 *
 * The per-state size counters are adjusted whether or not the data is
 * being recycled.
 */
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				zio_buf_free(buf->b_data, size);
				arc_space_return(size);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				zio_data_buf_free(buf->b_data, size);
				atomic_add_64(&arc_size, -size);
			}
		}
		/* headers linked on a state list also count in arcs_lsize */
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

/*
 * Destroy a header.  The header must be anonymous, unreferenced and have
 * no I/O in progress (all asserted).  Its identity is cleared, every buf
 * still attached is destroyed, any freeze checksum is freed, and the
 * header goes back to hdr_cache.
 *
 * Bufs that carry an eviction callback are not freed here: their data is
 * released and they are queued on arc_eviction_list, to be finished off
 * by arc_do_user_evicts().
 */
static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			mutex_enter(&arc_eviction_mtx);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
		}
	}
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

/*
 * Drop the reference held by `tag' and free `buf'.  The buf must have
 * data and no eviction callback (both asserted).  Three cases:
 *
 *  - The header is not anonymous: under the hash lock, drop the
 *    reference; if the header has other data copies destroy this buf,
 *    otherwise keep it and mark the header ARC_BUF_AVAILABLE.
 *  - The header is anonymous with I/O in progress: drop the reference
 *    under arc_eviction_mtx and destroy the header only if the I/O flag
 *    has already been cleared.
 *  - The header is anonymous and idle: if other references remain
 *    destroy just this buf, otherwise destroy the whole header.
 */
void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write. Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0) {
			ASSERT(HDR_IO_ERROR(hdr));
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			arc_hdr_destroy(hdr);
		}
	}
}

/*
 * Drop the reference held by `tag' on `buf'.
 *
 * Returns non-zero if the buf has no eviction callback (b_efunc is NULL)
 * and zero if it has one.  A buf with a callback is never destroyed
 * here; a buf without one is destroyed when the header has other data
 * copies, or otherwise left in place with the header marked
 * ARC_BUF_AVAILABLE.  Anonymous headers are handed to arc_buf_free().
 */
int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc_anon) {
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	ASSERT(hdr->b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

/*
 * Return the size, in bytes, recorded in the header that owns `buf'.
 */
int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes. Move the removed buffers to the appropriate evict state.
 * If the recycle flag is set, then attempt to "recycle" a buffer:
 * - look for a buffer to evict that is `bytes' long.
 * - return the data block from this buffer rather than freeing it.
 * This flag is used by callers that are trying to make space for a
 * new buffer in a full arc cache.
1172 */ 1173 static void * 1174 arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, 1175 arc_buf_contents_t type) 1176 { 1177 arc_state_t *evicted_state; 1178 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1179 arc_buf_hdr_t *ab, *ab_prev = NULL; 1180 list_t *list = &state->arcs_list[type]; 1181 kmutex_t *hash_lock; 1182 boolean_t have_lock; 1183 void *stolen = NULL; 1184 1185 ASSERT(state == arc_mru || state == arc_mfu); 1186 1187 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1188 1189 mutex_enter(&state->arcs_mtx); 1190 mutex_enter(&evicted_state->arcs_mtx); 1191 1192 for (ab = list_tail(list); ab; ab = ab_prev) { 1193 ab_prev = list_prev(list, ab); 1194 /* prefetch buffers have a minimum lifespan */ 1195 if (HDR_IO_IN_PROGRESS(ab) || 1196 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1197 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1198 skipped++; 1199 continue; 1200 } 1201 /* "lookahead" for better eviction candidate */ 1202 if (recycle && ab->b_size != bytes && 1203 ab_prev && ab_prev->b_size == bytes) 1204 continue; 1205 hash_lock = HDR_LOCK(ab); 1206 have_lock = MUTEX_HELD(hash_lock); 1207 if (have_lock || mutex_tryenter(hash_lock)) { 1208 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1209 ASSERT(ab->b_datacnt > 0); 1210 while (ab->b_buf) { 1211 arc_buf_t *buf = ab->b_buf; 1212 if (buf->b_data) { 1213 bytes_evicted += ab->b_size; 1214 if (recycle && ab->b_type == type && 1215 ab->b_size == bytes) { 1216 stolen = buf->b_data; 1217 recycle = FALSE; 1218 } 1219 } 1220 if (buf->b_efunc) { 1221 mutex_enter(&arc_eviction_mtx); 1222 arc_buf_destroy(buf, 1223 buf->b_data == stolen, FALSE); 1224 ab->b_buf = buf->b_next; 1225 buf->b_hdr = &arc_eviction_hdr; 1226 buf->b_next = arc_eviction_list; 1227 arc_eviction_list = buf; 1228 mutex_exit(&arc_eviction_mtx); 1229 } else { 1230 arc_buf_destroy(buf, 1231 buf->b_data == stolen, TRUE); 1232 } 1233 } 1234 ASSERT(ab->b_datacnt == 0); 1235 
arc_change_state(evicted_state, ab, hash_lock); 1236 ASSERT(HDR_IN_HASH_TABLE(ab)); 1237 ab->b_flags = ARC_IN_HASH_TABLE; 1238 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1239 if (!have_lock) 1240 mutex_exit(hash_lock); 1241 if (bytes >= 0 && bytes_evicted >= bytes) 1242 break; 1243 } else { 1244 missed += 1; 1245 } 1246 } 1247 1248 mutex_exit(&evicted_state->arcs_mtx); 1249 mutex_exit(&state->arcs_mtx); 1250 1251 if (bytes_evicted < bytes) 1252 dprintf("only evicted %lld bytes from %x", 1253 (longlong_t)bytes_evicted, state); 1254 1255 if (skipped) 1256 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1257 1258 if (missed) 1259 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1260 1261 return (stolen); 1262 } 1263 1264 /* 1265 * Remove buffers from list until we've removed the specified number of 1266 * bytes. Destroy the buffers that are removed. 1267 */ 1268 static void 1269 arc_evict_ghost(arc_state_t *state, int64_t bytes) 1270 { 1271 arc_buf_hdr_t *ab, *ab_prev; 1272 list_t *list = &state->arcs_list[ARC_BUFC_DATA]; 1273 kmutex_t *hash_lock; 1274 uint64_t bytes_deleted = 0; 1275 uint64_t bufs_skipped = 0; 1276 1277 ASSERT(GHOST_STATE(state)); 1278 top: 1279 mutex_enter(&state->arcs_mtx); 1280 for (ab = list_tail(list); ab; ab = ab_prev) { 1281 ab_prev = list_prev(list, ab); 1282 hash_lock = HDR_LOCK(ab); 1283 if (mutex_tryenter(hash_lock)) { 1284 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1285 ASSERT(ab->b_buf == NULL); 1286 arc_change_state(arc_anon, ab, hash_lock); 1287 mutex_exit(hash_lock); 1288 ARCSTAT_BUMP(arcstat_deleted); 1289 bytes_deleted += ab->b_size; 1290 arc_hdr_destroy(ab); 1291 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1292 if (bytes >= 0 && bytes_deleted >= bytes) 1293 break; 1294 } else { 1295 if (bytes < 0) { 1296 mutex_exit(&state->arcs_mtx); 1297 mutex_enter(hash_lock); 1298 mutex_exit(hash_lock); 1299 goto top; 1300 } 1301 bufs_skipped += 1; 1302 } 1303 } 1304 mutex_exit(&state->arcs_mtx); 1305 1306 if (list == &state->arcs_list[ARC_BUFC_DATA] && 
1307 (bytes < 0 || bytes_deleted < bytes)) { 1308 list = &state->arcs_list[ARC_BUFC_METADATA]; 1309 goto top; 1310 } 1311 1312 if (bufs_skipped) { 1313 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1314 ASSERT(bytes >= 0); 1315 } 1316 1317 if (bytes_deleted < bytes) 1318 dprintf("only deleted %lld bytes from %p", 1319 (longlong_t)bytes_deleted, state); 1320 } 1321 1322 static void 1323 arc_adjust(void) 1324 { 1325 int64_t top_sz, mru_over, arc_over, todelete; 1326 1327 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1328 1329 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { 1330 int64_t toevict = 1331 MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); 1332 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_DATA); 1333 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1334 } 1335 1336 if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1337 int64_t toevict = 1338 MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); 1339 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_METADATA); 1340 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1341 } 1342 1343 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1344 1345 if (mru_over > 0) { 1346 if (arc_mru_ghost->arcs_size > 0) { 1347 todelete = MIN(arc_mru_ghost->arcs_size, mru_over); 1348 arc_evict_ghost(arc_mru_ghost, todelete); 1349 } 1350 } 1351 1352 if ((arc_over = arc_size - arc_c) > 0) { 1353 int64_t tbl_over; 1354 1355 if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { 1356 int64_t toevict = 1357 MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); 1358 (void) arc_evict(arc_mfu, toevict, FALSE, 1359 ARC_BUFC_DATA); 1360 arc_over = arc_size - arc_c; 1361 } 1362 1363 if (arc_over > 0 && 1364 arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { 1365 int64_t toevict = 1366 MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], 1367 arc_over); 1368 (void) arc_evict(arc_mfu, toevict, FALSE, 1369 ARC_BUFC_METADATA); 1370 } 1371 1372 tbl_over = arc_size + arc_mru_ghost->arcs_size + 1373 
arc_mfu_ghost->arcs_size - arc_c * 2; 1374 1375 if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { 1376 todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); 1377 arc_evict_ghost(arc_mfu_ghost, todelete); 1378 } 1379 } 1380 } 1381 1382 static void 1383 arc_do_user_evicts(void) 1384 { 1385 mutex_enter(&arc_eviction_mtx); 1386 while (arc_eviction_list != NULL) { 1387 arc_buf_t *buf = arc_eviction_list; 1388 arc_eviction_list = buf->b_next; 1389 buf->b_hdr = NULL; 1390 mutex_exit(&arc_eviction_mtx); 1391 1392 if (buf->b_efunc != NULL) 1393 VERIFY(buf->b_efunc(buf) == 0); 1394 1395 buf->b_efunc = NULL; 1396 buf->b_private = NULL; 1397 kmem_cache_free(buf_cache, buf); 1398 mutex_enter(&arc_eviction_mtx); 1399 } 1400 mutex_exit(&arc_eviction_mtx); 1401 } 1402 1403 /* 1404 * Flush all *evictable* data from the cache. 1405 * NOTE: this will not touch "active" (i.e. referenced) data. 1406 */ 1407 void 1408 arc_flush(void) 1409 { 1410 while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) 1411 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_DATA); 1412 while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) 1413 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_METADATA); 1414 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) 1415 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_DATA); 1416 while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) 1417 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_METADATA); 1418 1419 arc_evict_ghost(arc_mru_ghost, -1); 1420 arc_evict_ghost(arc_mfu_ghost, -1); 1421 1422 mutex_enter(&arc_reclaim_thr_lock); 1423 arc_do_user_evicts(); 1424 mutex_exit(&arc_reclaim_thr_lock); 1425 ASSERT(arc_eviction_list == NULL); 1426 } 1427 1428 int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1429 1430 void 1431 arc_shrink(void) 1432 { 1433 if (arc_c > arc_c_min) { 1434 uint64_t to_free; 1435 1436 #ifdef _KERNEL 1437 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 1438 #else 1439 to_free = arc_c >> arc_shrink_shift; 1440 #endif 1441 if 
(arc_c > arc_c_min + to_free) 1442 atomic_add_64(&arc_c, -to_free); 1443 else 1444 arc_c = arc_c_min; 1445 1446 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1447 if (arc_c > arc_size) 1448 arc_c = MAX(arc_size, arc_c_min); 1449 if (arc_p > arc_c) 1450 arc_p = (arc_c >> 1); 1451 ASSERT(arc_c >= arc_c_min); 1452 ASSERT((int64_t)arc_p >= 0); 1453 } 1454 1455 if (arc_size > arc_c) 1456 arc_adjust(); 1457 } 1458 1459 static int 1460 arc_reclaim_needed(void) 1461 { 1462 uint64_t extra; 1463 1464 #ifdef _KERNEL 1465 1466 if (needfree) 1467 return (1); 1468 1469 /* 1470 * take 'desfree' extra pages, so we reclaim sooner, rather than later 1471 */ 1472 extra = desfree; 1473 1474 /* 1475 * check that we're out of range of the pageout scanner. It starts to 1476 * schedule paging if freemem is less than lotsfree and needfree. 1477 * lotsfree is the high-water mark for pageout, and needfree is the 1478 * number of needed free pages. We add extra pages here to make sure 1479 * the scanner doesn't start up while we're freeing memory. 1480 */ 1481 if (freemem < lotsfree + needfree + extra) 1482 return (1); 1483 1484 /* 1485 * check to make sure that swapfs has enough space so that anon 1486 * reservations can still succeeed. anon_resvmem() checks that the 1487 * availrmem is greater than swapfs_minfree, and the number of reserved 1488 * swap pages. We also add a bit of extra here just to prevent 1489 * circumstances from getting really dire. 1490 */ 1491 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1492 return (1); 1493 1494 #if defined(__i386) 1495 /* 1496 * If we're on an i386 platform, it's possible that we'll exhaust the 1497 * kernel heap space before we ever run out of available physical 1498 * memory. Most checks of the size of the heap_area compare against 1499 * tune.t_minarmem, which is the minimum available real memory that we 1500 * can have in the system. However, this is generally fixed at 25 pages 1501 * which is so low that it's useless. 
In this comparison, we seek to 1502 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1503 * heap is allocated. (Or, in the caclulation, if less than 1/4th is 1504 * free) 1505 */ 1506 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1507 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1508 return (1); 1509 #endif 1510 1511 #else 1512 if (spa_get_random(100) == 0) 1513 return (1); 1514 #endif 1515 return (0); 1516 } 1517 1518 static void 1519 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1520 { 1521 size_t i; 1522 kmem_cache_t *prev_cache = NULL; 1523 kmem_cache_t *prev_data_cache = NULL; 1524 extern kmem_cache_t *zio_buf_cache[]; 1525 extern kmem_cache_t *zio_data_buf_cache[]; 1526 1527 #ifdef _KERNEL 1528 if (arc_meta_used >= arc_meta_limit) { 1529 /* 1530 * We are exceeding our meta-data cache limit. 1531 * Purge some DNLC entries to release holds on meta-data. 1532 */ 1533 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1534 } 1535 #if defined(__i386) 1536 /* 1537 * Reclaim unused memory from all kmem caches. 1538 */ 1539 kmem_reap(); 1540 #endif 1541 #endif 1542 1543 /* 1544 * An agressive reclamation will shrink the cache size as well as 1545 * reap free buffers from the arc kmem caches. 
1546 */ 1547 if (strat == ARC_RECLAIM_AGGR) 1548 arc_shrink(); 1549 1550 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1551 if (zio_buf_cache[i] != prev_cache) { 1552 prev_cache = zio_buf_cache[i]; 1553 kmem_cache_reap_now(zio_buf_cache[i]); 1554 } 1555 if (zio_data_buf_cache[i] != prev_data_cache) { 1556 prev_data_cache = zio_data_buf_cache[i]; 1557 kmem_cache_reap_now(zio_data_buf_cache[i]); 1558 } 1559 } 1560 kmem_cache_reap_now(buf_cache); 1561 kmem_cache_reap_now(hdr_cache); 1562 } 1563 1564 static void 1565 arc_reclaim_thread(void) 1566 { 1567 clock_t growtime = 0; 1568 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1569 callb_cpr_t cpr; 1570 1571 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1572 1573 mutex_enter(&arc_reclaim_thr_lock); 1574 while (arc_thread_exit == 0) { 1575 if (arc_reclaim_needed()) { 1576 1577 if (arc_no_grow) { 1578 if (last_reclaim == ARC_RECLAIM_CONS) { 1579 last_reclaim = ARC_RECLAIM_AGGR; 1580 } else { 1581 last_reclaim = ARC_RECLAIM_CONS; 1582 } 1583 } else { 1584 arc_no_grow = TRUE; 1585 last_reclaim = ARC_RECLAIM_AGGR; 1586 membar_producer(); 1587 } 1588 1589 /* reset the growth delay for every reclaim */ 1590 growtime = lbolt + (arc_grow_retry * hz); 1591 1592 arc_kmem_reap_now(last_reclaim); 1593 1594 } else if (arc_no_grow && lbolt >= growtime) { 1595 arc_no_grow = FALSE; 1596 } 1597 1598 if (2 * arc_c < arc_size + 1599 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) 1600 arc_adjust(); 1601 1602 if (arc_eviction_list != NULL) 1603 arc_do_user_evicts(); 1604 1605 /* block until needed, or one second, whichever is shorter */ 1606 CALLB_CPR_SAFE_BEGIN(&cpr); 1607 (void) cv_timedwait(&arc_reclaim_thr_cv, 1608 &arc_reclaim_thr_lock, (lbolt + hz)); 1609 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1610 } 1611 1612 arc_thread_exit = 0; 1613 cv_broadcast(&arc_reclaim_thr_cv); 1614 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1615 thread_exit(); 1616 } 1617 1618 /* 
1619 * Adapt arc info given the number of bytes we are trying to add and 1620 * the state that we are comming from. This function is only called 1621 * when we are adding new content to the cache. 1622 */ 1623 static void 1624 arc_adapt(int bytes, arc_state_t *state) 1625 { 1626 int mult; 1627 1628 ASSERT(bytes > 0); 1629 /* 1630 * Adapt the target size of the MRU list: 1631 * - if we just hit in the MRU ghost list, then increase 1632 * the target size of the MRU list. 1633 * - if we just hit in the MFU ghost list, then increase 1634 * the target size of the MFU list by decreasing the 1635 * target size of the MRU list. 1636 */ 1637 if (state == arc_mru_ghost) { 1638 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1639 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1640 1641 arc_p = MIN(arc_c, arc_p + bytes * mult); 1642 } else if (state == arc_mfu_ghost) { 1643 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1644 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1645 1646 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1647 } 1648 ASSERT((int64_t)arc_p >= 0); 1649 1650 if (arc_reclaim_needed()) { 1651 cv_signal(&arc_reclaim_thr_cv); 1652 return; 1653 } 1654 1655 if (arc_no_grow) 1656 return; 1657 1658 if (arc_c >= arc_c_max) 1659 return; 1660 1661 /* 1662 * If we're within (2 * maxblocksize) bytes of the target 1663 * cache size, increment the target cache size 1664 */ 1665 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1666 atomic_add_64(&arc_c, (int64_t)bytes); 1667 if (arc_c > arc_c_max) 1668 arc_c = arc_c_max; 1669 else if (state == arc_anon) 1670 atomic_add_64(&arc_p, (int64_t)bytes); 1671 if (arc_p > arc_c) 1672 arc_p = arc_c; 1673 } 1674 ASSERT((int64_t)arc_p >= 0); 1675 } 1676 1677 /* 1678 * Check if the cache has reached its limits and eviction is required 1679 * prior to insert. 
1680 */ 1681 static int 1682 arc_evict_needed(arc_buf_contents_t type) 1683 { 1684 if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit) 1685 return (1); 1686 1687 #ifdef _KERNEL 1688 /* 1689 * If zio data pages are being allocated out of a separate heap segment, 1690 * then enforce that the size of available vmem for this area remains 1691 * above about 1/32nd free. 1692 */ 1693 if (type == ARC_BUFC_DATA && zio_arena != NULL && 1694 vmem_size(zio_arena, VMEM_FREE) < 1695 (vmem_size(zio_arena, VMEM_ALLOC) >> 5)) 1696 return (1); 1697 #endif 1698 1699 if (arc_reclaim_needed()) 1700 return (1); 1701 1702 return (arc_size > arc_c); 1703 } 1704 1705 /* 1706 * The buffer, supplied as the first argument, needs a data block. 1707 * So, if we are at cache max, determine which cache should be victimized. 1708 * We have the following cases: 1709 * 1710 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1711 * In this situation if we're out of space, but the resident size of the MFU is 1712 * under the limit, victimize the MFU cache to satisfy this insertion request. 1713 * 1714 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 1715 * Here, we've used up all of the available space for the MRU, so we need to 1716 * evict from our own cache instead. Evict from the set of resident MRU 1717 * entries. 1718 * 1719 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 1720 * c minus p represents the MFU space in the cache, since p is the size of the 1721 * cache that is dedicated to the MRU. In this situation there's still space on 1722 * the MFU side, so the MRU side needs to be victimized. 1723 * 1724 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 1725 * MFU's resident set is consuming more space than it has been allotted. In 1726 * this situation, we must victimize our own cache, the MFU, for this insertion. 
1727 */ 1728 static void 1729 arc_get_data_buf(arc_buf_t *buf) 1730 { 1731 arc_state_t *state = buf->b_hdr->b_state; 1732 uint64_t size = buf->b_hdr->b_size; 1733 arc_buf_contents_t type = buf->b_hdr->b_type; 1734 1735 arc_adapt(size, state); 1736 1737 /* 1738 * We have not yet reached cache maximum size, 1739 * just allocate a new buffer. 1740 */ 1741 if (!arc_evict_needed(type)) { 1742 if (type == ARC_BUFC_METADATA) { 1743 buf->b_data = zio_buf_alloc(size); 1744 arc_space_consume(size); 1745 } else { 1746 ASSERT(type == ARC_BUFC_DATA); 1747 buf->b_data = zio_data_buf_alloc(size); 1748 atomic_add_64(&arc_size, size); 1749 } 1750 goto out; 1751 } 1752 1753 /* 1754 * If we are prefetching from the mfu ghost list, this buffer 1755 * will end up on the mru list; so steal space from there. 1756 */ 1757 if (state == arc_mfu_ghost) 1758 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 1759 else if (state == arc_mru_ghost) 1760 state = arc_mru; 1761 1762 if (state == arc_mru || state == arc_anon) { 1763 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 1764 state = (arc_mfu->arcs_lsize[type] > 0 && 1765 arc_p > mru_used) ? arc_mfu : arc_mru; 1766 } else { 1767 /* MFU cases */ 1768 uint64_t mfu_space = arc_c - arc_p; 1769 state = (arc_mru->arcs_lsize[type] > 0 && 1770 mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 1771 } 1772 if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { 1773 if (type == ARC_BUFC_METADATA) { 1774 buf->b_data = zio_buf_alloc(size); 1775 arc_space_consume(size); 1776 } else { 1777 ASSERT(type == ARC_BUFC_DATA); 1778 buf->b_data = zio_data_buf_alloc(size); 1779 atomic_add_64(&arc_size, size); 1780 } 1781 ARCSTAT_BUMP(arcstat_recycle_miss); 1782 } 1783 ASSERT(buf->b_data != NULL); 1784 out: 1785 /* 1786 * Update the state size. Note that ghost states have a 1787 * "ghost size" and so don't need to be updated. 
1788 */ 1789 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1790 arc_buf_hdr_t *hdr = buf->b_hdr; 1791 1792 atomic_add_64(&hdr->b_state->arcs_size, size); 1793 if (list_link_active(&hdr->b_arc_node)) { 1794 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1795 atomic_add_64(&hdr->b_state->arcs_lsize[type], size); 1796 } 1797 /* 1798 * If we are growing the cache, and we are adding anonymous 1799 * data, and we have outgrown arc_p, update arc_p 1800 */ 1801 if (arc_size < arc_c && hdr->b_state == arc_anon && 1802 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 1803 arc_p = MIN(arc_c, arc_p + size); 1804 } 1805 } 1806 1807 /* 1808 * This routine is called whenever a buffer is accessed. 1809 * NOTE: the hash lock is dropped in this function. 1810 */ 1811 static void 1812 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1813 { 1814 ASSERT(MUTEX_HELD(hash_lock)); 1815 1816 if (buf->b_state == arc_anon) { 1817 /* 1818 * This buffer is not in the cache, and does not 1819 * appear in our "ghost" list. Add the new buffer 1820 * to the MRU state. 1821 */ 1822 1823 ASSERT(buf->b_arc_access == 0); 1824 buf->b_arc_access = lbolt; 1825 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1826 arc_change_state(arc_mru, buf, hash_lock); 1827 1828 } else if (buf->b_state == arc_mru) { 1829 /* 1830 * If this buffer is here because of a prefetch, then either: 1831 * - clear the flag if this is a "referencing" read 1832 * (any subsequent access will bump this into the MFU state). 1833 * or 1834 * - move the buffer to the head of the list if this is 1835 * another prefetch (to make it less likely to be evicted). 
1836 */ 1837 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1838 if (refcount_count(&buf->b_refcnt) == 0) { 1839 ASSERT(list_link_active(&buf->b_arc_node)); 1840 } else { 1841 buf->b_flags &= ~ARC_PREFETCH; 1842 ARCSTAT_BUMP(arcstat_mru_hits); 1843 } 1844 buf->b_arc_access = lbolt; 1845 return; 1846 } 1847 1848 /* 1849 * This buffer has been "accessed" only once so far, 1850 * but it is still in the cache. Move it to the MFU 1851 * state. 1852 */ 1853 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1854 /* 1855 * More than 125ms have passed since we 1856 * instantiated this buffer. Move it to the 1857 * most frequently used state. 1858 */ 1859 buf->b_arc_access = lbolt; 1860 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1861 arc_change_state(arc_mfu, buf, hash_lock); 1862 } 1863 ARCSTAT_BUMP(arcstat_mru_hits); 1864 } else if (buf->b_state == arc_mru_ghost) { 1865 arc_state_t *new_state; 1866 /* 1867 * This buffer has been "accessed" recently, but 1868 * was evicted from the cache. Move it to the 1869 * MFU state. 1870 */ 1871 1872 if (buf->b_flags & ARC_PREFETCH) { 1873 new_state = arc_mru; 1874 if (refcount_count(&buf->b_refcnt) > 0) 1875 buf->b_flags &= ~ARC_PREFETCH; 1876 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1877 } else { 1878 new_state = arc_mfu; 1879 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1880 } 1881 1882 buf->b_arc_access = lbolt; 1883 arc_change_state(new_state, buf, hash_lock); 1884 1885 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 1886 } else if (buf->b_state == arc_mfu) { 1887 /* 1888 * This buffer has been accessed more than once and is 1889 * still in the cache. Keep it in the MFU state. 1890 * 1891 * NOTE: an add_reference() that occurred when we did 1892 * the arc_read() will have kicked this off the list. 1893 * If it was a prefetch, we will explicitly move it to 1894 * the head of the list now. 
1895 */ 1896 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1897 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1898 ASSERT(list_link_active(&buf->b_arc_node)); 1899 } 1900 ARCSTAT_BUMP(arcstat_mfu_hits); 1901 buf->b_arc_access = lbolt; 1902 } else if (buf->b_state == arc_mfu_ghost) { 1903 arc_state_t *new_state = arc_mfu; 1904 /* 1905 * This buffer has been accessed more than once but has 1906 * been evicted from the cache. Move it back to the 1907 * MFU state. 1908 */ 1909 1910 if (buf->b_flags & ARC_PREFETCH) { 1911 /* 1912 * This is a prefetch access... 1913 * move this block back to the MRU state. 1914 */ 1915 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1916 new_state = arc_mru; 1917 } 1918 1919 buf->b_arc_access = lbolt; 1920 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1921 arc_change_state(new_state, buf, hash_lock); 1922 1923 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1924 } else { 1925 ASSERT(!"invalid arc state"); 1926 } 1927 } 1928 1929 /* a generic arc_done_func_t which you can use */ 1930 /* ARGSUSED */ 1931 void 1932 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1933 { 1934 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1935 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1936 } 1937 1938 /* a generic arc_done_func_t */ 1939 void 1940 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1941 { 1942 arc_buf_t **bufp = arg; 1943 if (zio && zio->io_error) { 1944 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1945 *bufp = NULL; 1946 } else { 1947 *bufp = buf; 1948 } 1949 } 1950 1951 static void 1952 arc_read_done(zio_t *zio) 1953 { 1954 arc_buf_hdr_t *hdr, *found; 1955 arc_buf_t *buf; 1956 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1957 kmutex_t *hash_lock; 1958 arc_callback_t *callback_list, *acb; 1959 int freeable = FALSE; 1960 1961 buf = zio->io_private; 1962 hdr = buf->b_hdr; 1963 1964 /* 1965 * The hdr was inserted into hash-table and removed from lists 1966 * prior to starting I/O. 
We should find this header, since 1967 * it's in the hash table, and it should be legit since it's 1968 * not possible to evict it during the I/O. The only possible 1969 * reason for it not to be found is if we were freed during the 1970 * read. 1971 */ 1972 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1973 &hash_lock); 1974 1975 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1976 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1977 1978 /* byteswap if necessary */ 1979 callback_list = hdr->b_acb; 1980 ASSERT(callback_list != NULL); 1981 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1982 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1983 1984 arc_cksum_compute(buf); 1985 1986 /* create copies of the data buffer for the callers */ 1987 abuf = buf; 1988 for (acb = callback_list; acb; acb = acb->acb_next) { 1989 if (acb->acb_done) { 1990 if (abuf == NULL) 1991 abuf = arc_buf_clone(buf); 1992 acb->acb_buf = abuf; 1993 abuf = NULL; 1994 } 1995 } 1996 hdr->b_acb = NULL; 1997 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1998 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1999 if (abuf == buf) 2000 hdr->b_flags |= ARC_BUF_AVAILABLE; 2001 2002 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2003 2004 if (zio->io_error != 0) { 2005 hdr->b_flags |= ARC_IO_ERROR; 2006 if (hdr->b_state != arc_anon) 2007 arc_change_state(arc_anon, hdr, hash_lock); 2008 if (HDR_IN_HASH_TABLE(hdr)) 2009 buf_hash_remove(hdr); 2010 freeable = refcount_is_zero(&hdr->b_refcnt); 2011 /* convert checksum errors into IO errors */ 2012 if (zio->io_error == ECKSUM) 2013 zio->io_error = EIO; 2014 } 2015 2016 /* 2017 * Broadcast before we drop the hash_lock to avoid the possibility 2018 * that the hdr (and hence the cv) might be freed before we get to 2019 * the cv_broadcast(). 2020 */ 2021 cv_broadcast(&hdr->b_cv); 2022 2023 if (hash_lock) { 2024 /* 2025 * Only call arc_access on anonymous buffers. 
This is because 2026 * if we've issued an I/O for an evicted buffer, we've already 2027 * called arc_access (to prevent any simultaneous readers from 2028 * getting confused). 2029 */ 2030 if (zio->io_error == 0 && hdr->b_state == arc_anon) 2031 arc_access(hdr, hash_lock); 2032 mutex_exit(hash_lock); 2033 } else { 2034 /* 2035 * This block was freed while we waited for the read to 2036 * complete. It has been removed from the hash table and 2037 * moved to the anonymous state (so that it won't show up 2038 * in the cache). 2039 */ 2040 ASSERT3P(hdr->b_state, ==, arc_anon); 2041 freeable = refcount_is_zero(&hdr->b_refcnt); 2042 } 2043 2044 /* execute each callback and free its structure */ 2045 while ((acb = callback_list) != NULL) { 2046 if (acb->acb_done) 2047 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 2048 2049 if (acb->acb_zio_dummy != NULL) { 2050 acb->acb_zio_dummy->io_error = zio->io_error; 2051 zio_nowait(acb->acb_zio_dummy); 2052 } 2053 2054 callback_list = acb->acb_next; 2055 kmem_free(acb, sizeof (arc_callback_t)); 2056 } 2057 2058 if (freeable) 2059 arc_hdr_destroy(hdr); 2060 } 2061 2062 /* 2063 * "Read" the block block at the specified DVA (in bp) via the 2064 * cache. If the block is found in the cache, invoke the provided 2065 * callback immediately and return. Note that the `zio' parameter 2066 * in the callback will be NULL in this case, since no IO was 2067 * required. If the block is not in the cache pass the read request 2068 * on to the spa with a substitute callback function, so that the 2069 * requested block will be added to the cache. 2070 * 2071 * If a read request arrives for a block that has a read in-progress, 2072 * either wait for the in-progress read to complete (and return the 2073 * results); or, if this is a read with a "done" func, add a record 2074 * to the read to invoke the "done" func when the read completes, 2075 * and return; or just return. 
2076 * 2077 * arc_read_done() will invoke all the requested "done" functions 2078 * for readers of this block. 2079 */
/*
 * Argument usage, as visible in the body below:
 *	pio		parent zio handed to zio_read() on a miss; when it is
 *			non-NULL and we attach to an in-progress read, a
 *			zio_null() child of it is recorded in the callback
 *	spa, bp		identify the block (hash lookup on BP_IDENTITY(bp)
 *			and bp->blk_birth)
 *	swap		stored in the callback record (acb_byteswap)
 *	done, private	completion callback and its argument; private is
 *			also used as the tag for the reference taken
 *	priority, flags	passed through to zio_read()
 *	arc_flags	in: ARC_WAIT / ARC_NOWAIT / ARC_PREFETCH;
 *			out: ARC_CACHED is or'ed in when data was found
 *	zb		passed through to zio_read()
 *
 * Returns 0, except for a miss with ARC_WAIT set, in which case the
 * result of zio_wait() on the read is returned.
 */
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t *arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;
	arc_callback_t *cb;
	kmutex_t *hash_lock;
	uint64_t lsize;
	zio_t *read_zio;

top:
	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);

	if (hdr != NULL && hdr->b_datacnt > 0) {
		/*
		 * The header is present and has data attached.
		 */
		*arc_flags |= ARC_CACHED;

		if (HDR_IO_IN_PROGRESS(hdr)) {
			/*
			 * Somebody else's I/O on this block is still
			 * outstanding.  A waiting caller sleeps on the
			 * header's cv and then redoes the lookup.
			 */
			if (*arc_flags & ARC_WAIT) {
				cv_wait(&hdr->b_cv, hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			ASSERT(*arc_flags & ARC_NOWAIT);

			/* nobody to notify: nothing more to do */
			if (done == NULL) {
				mutex_exit(hash_lock);
				return (0);
			}

			/*
			 * Push a callback record onto the header's list
			 * and take a reference on behalf of the caller.
			 */
			cb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
			cb->acb_done = done;
			cb->acb_private = private;
			cb->acb_byteswap = swap;
			if (pio != NULL) {
				cb->acb_zio_dummy = zio_null(pio,
				    spa, NULL, NULL, flags);
			}
			ASSERT(cb->acb_done != NULL);

			cb->acb_next = hdr->b_acb;
			hdr->b_acb = cb;
			add_reference(hdr, hash_lock, private);
			mutex_exit(hash_lock);
			return (0);
		}

		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);

		if (done != NULL) {
			add_reference(hdr, hash_lock, private);
			buf = hdr->b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			/*
			 * Hand out the header's own buf if it is still
			 * available; otherwise clone it, so that the
			 * caller's arc_release() is guaranteed to succeed.
			 */
			if (!HDR_BUF_AVAILABLE(hdr)) {
				buf = arc_buf_clone(buf);
			} else {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
			}
		} else if ((*arc_flags & ARC_PREFETCH) &&
		    refcount_count(&hdr->b_refcnt) == 0) {
			hdr->b_flags |= ARC_PREFETCH;
		}

		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		ARCSTAT_BUMP(arcstat_hits);
		ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
		    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
		    data, metadata, hits);

		/* no I/O was needed, so the callback gets a NULL zio */
		if (done != NULL)
			done(NULL, buf, private);
		return (0);
	}

	/*
	 * Miss: either there is no header at all, or the header carries
	 * no data.  Either way a read has to be issued.
	 */
	lsize = BP_GET_LSIZE(bp);

	if (hdr != NULL) {
		/* the header is in one of the ghost states */
		ASSERT(GHOST_STATE(hdr->b_state));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
		ASSERT(hdr->b_buf == NULL);

		/* a prefetch does not hold a reference */
		if (!(*arc_flags & ARC_PREFETCH))
			add_reference(hdr, hash_lock, private);
		else
			hdr->b_flags |= ARC_PREFETCH;

		/* attach a fresh buf and give it a data buffer */
		buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
		buf->b_hdr = hdr;
		buf->b_next = NULL;
		buf->b_data = NULL;
		buf->b_efunc = NULL;
		buf->b_private = NULL;
		hdr->b_buf = buf;
		arc_get_data_buf(buf);
		ASSERT(hdr->b_datacnt == 0);
		hdr->b_datacnt = 1;
	} else {
		/* no header: allocate one and try to hash it in */
		buf = arc_buf_alloc(spa, lsize, private,
		    BP_GET_BUFC_TYPE(bp));
		hdr = buf->b_hdr;
		hdr->b_dva = *BP_IDENTITY(bp);
		hdr->b_birth = bp->blk_birth;
		hdr->b_cksum0 = bp->blk_cksum.zc_word[0];

		if (buf_hash_insert(hdr, &hash_lock) != NULL) {
			/*
			 * Another thread inserted this block first.
			 * Strip the identity from our header, drop our
			 * buf, and start the whole request over.
			 */
			mutex_exit(hash_lock);
			bzero(&hdr->b_dva, sizeof (dva_t));
			hdr->b_birth = 0;
			hdr->b_cksum0 = 0;
			(void) arc_buf_remove_ref(buf, private);
			goto top;
		}

		/* a prefetch does not hold a reference */
		if (*arc_flags & ARC_PREFETCH) {
			(void) remove_reference(hdr, hash_lock, private);
			hdr->b_flags |= ARC_PREFETCH;
		}
		if (BP_GET_LEVEL(bp) > 0)
			hdr->b_flags |= ARC_INDIRECT;
	}

	cb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
	cb->acb_done = done;
	cb->acb_private = private;
	cb->acb_byteswap = swap;

	ASSERT(hdr->b_acb == NULL);
	hdr->b_acb = cb;
	hdr->b_flags |= ARC_IO_IN_PROGRESS;

	/*
	 * A ghost header is moved to a present state before the hash
	 * lock is dropped.  From then on anyone who looks the block up
	 * finds a header with a buf attached and I/O marked in progress.
	 */
	if (GHOST_STATE(hdr->b_state))
		arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);

	ASSERT3U(hdr->b_size, ==, lsize);
	DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, lsize,
	    zbookmark_t *, zb);
	ARCSTAT_BUMP(arcstat_misses);
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, misses);

	read_zio = zio_read(pio, spa, bp, buf->b_data, lsize,
	    arc_read_done, buf, priority, flags, zb);

	if (*arc_flags & ARC_WAIT)
		return (zio_wait(read_zio));

	ASSERT(*arc_flags & ARC_NOWAIT);
	zio_nowait(read_zio);
	return (0);
}

2256 2257 /* 2258 * arc_read() variant to support pool traversal. If the block is already 2259 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
2260 * The idea is that we don't want pool traversal filling up memory, but 2261 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2262 */ 2263 int 2264 arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2265 { 2266 arc_buf_hdr_t *hdr; 2267 kmutex_t *hash_mtx; 2268 int rc = 0; 2269 2270 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2271 2272 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2273 arc_buf_t *buf = hdr->b_buf; 2274 2275 ASSERT(buf); 2276 while (buf->b_data == NULL) { 2277 buf = buf->b_next; 2278 ASSERT(buf); 2279 } 2280 bcopy(buf->b_data, data, hdr->b_size); 2281 } else { 2282 rc = ENOENT; 2283 } 2284 2285 if (hash_mtx) 2286 mutex_exit(hash_mtx); 2287 2288 return (rc); 2289 } 2290 2291 void 2292 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2293 { 2294 ASSERT(buf->b_hdr != NULL); 2295 ASSERT(buf->b_hdr->b_state != arc_anon); 2296 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2297 buf->b_efunc = func; 2298 buf->b_private = private; 2299 } 2300 2301 /* 2302 * This is used by the DMU to let the ARC know that a buffer is 2303 * being evicted, so the ARC should clean up. If this arc buf 2304 * is not yet in the evicted state, it will be put there. 2305 */ 2306 int 2307 arc_buf_evict(arc_buf_t *buf) 2308 { 2309 arc_buf_hdr_t *hdr; 2310 kmutex_t *hash_lock; 2311 arc_buf_t **bufp; 2312 2313 mutex_enter(&arc_eviction_mtx); 2314 hdr = buf->b_hdr; 2315 if (hdr == NULL) { 2316 /* 2317 * We are in arc_do_user_evicts(). 2318 */ 2319 ASSERT(buf->b_data == NULL); 2320 mutex_exit(&arc_eviction_mtx); 2321 return (0); 2322 } 2323 hash_lock = HDR_LOCK(hdr); 2324 mutex_exit(&arc_eviction_mtx); 2325 2326 mutex_enter(hash_lock); 2327 2328 if (buf->b_data == NULL) { 2329 /* 2330 * We are on the eviction list. 2331 */ 2332 mutex_exit(hash_lock); 2333 mutex_enter(&arc_eviction_mtx); 2334 if (buf->b_hdr == NULL) { 2335 /* 2336 * We are already in arc_do_user_evicts(). 
2337 */ 2338 mutex_exit(&arc_eviction_mtx); 2339 return (0); 2340 } else { 2341 arc_buf_t copy = *buf; /* structure assignment */ 2342 /* 2343 * Process this buffer now 2344 * but let arc_do_user_evicts() do the reaping. 2345 */ 2346 buf->b_efunc = NULL; 2347 mutex_exit(&arc_eviction_mtx); 2348 VERIFY(copy.b_efunc(©) == 0); 2349 return (1); 2350 } 2351 } 2352 2353 ASSERT(buf->b_hdr == hdr); 2354 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2355 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2356 2357 /* 2358 * Pull this buffer off of the hdr 2359 */ 2360 bufp = &hdr->b_buf; 2361 while (*bufp != buf) 2362 bufp = &(*bufp)->b_next; 2363 *bufp = buf->b_next; 2364 2365 ASSERT(buf->b_data != NULL); 2366 arc_buf_destroy(buf, FALSE, FALSE); 2367 2368 if (hdr->b_datacnt == 0) { 2369 arc_state_t *old_state = hdr->b_state; 2370 arc_state_t *evicted_state; 2371 2372 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2373 2374 evicted_state = 2375 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2376 2377 mutex_enter(&old_state->arcs_mtx); 2378 mutex_enter(&evicted_state->arcs_mtx); 2379 2380 arc_change_state(evicted_state, hdr, hash_lock); 2381 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2382 hdr->b_flags = ARC_IN_HASH_TABLE; 2383 2384 mutex_exit(&evicted_state->arcs_mtx); 2385 mutex_exit(&old_state->arcs_mtx); 2386 } 2387 mutex_exit(hash_lock); 2388 2389 VERIFY(buf->b_efunc(buf) == 0); 2390 buf->b_efunc = NULL; 2391 buf->b_private = NULL; 2392 buf->b_hdr = NULL; 2393 kmem_cache_free(buf_cache, buf); 2394 return (1); 2395 } 2396 2397 /* 2398 * Release this buffer from the cache. This must be done 2399 * after a read and prior to modifying the buffer contents. 2400 * If the buffer has more than one reference, we must make 2401 * make a new hdr for the buffer. 
2402 */ 2403 void 2404 arc_release(arc_buf_t *buf, void *tag) 2405 { 2406 arc_buf_hdr_t *hdr = buf->b_hdr; 2407 kmutex_t *hash_lock = HDR_LOCK(hdr); 2408 2409 /* this buffer is not on any list */ 2410 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 2411 2412 if (hdr->b_state == arc_anon) { 2413 /* this buffer is already released */ 2414 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 2415 ASSERT(BUF_EMPTY(hdr)); 2416 ASSERT(buf->b_efunc == NULL); 2417 arc_buf_thaw(buf); 2418 return; 2419 } 2420 2421 mutex_enter(hash_lock); 2422 2423 /* 2424 * Do we have more than one buf? 2425 */ 2426 if (hdr->b_buf != buf || buf->b_next != NULL) { 2427 arc_buf_hdr_t *nhdr; 2428 arc_buf_t **bufp; 2429 uint64_t blksz = hdr->b_size; 2430 spa_t *spa = hdr->b_spa; 2431 arc_buf_contents_t type = hdr->b_type; 2432 2433 ASSERT(hdr->b_datacnt > 1); 2434 /* 2435 * Pull the data off of this buf and attach it to 2436 * a new anonymous buf. 2437 */ 2438 (void) remove_reference(hdr, hash_lock, tag); 2439 bufp = &hdr->b_buf; 2440 while (*bufp != buf) 2441 bufp = &(*bufp)->b_next; 2442 *bufp = (*bufp)->b_next; 2443 buf->b_next = NULL; 2444 2445 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 2446 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2447 if (refcount_is_zero(&hdr->b_refcnt)) { 2448 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 2449 ASSERT3U(*size, >=, hdr->b_size); 2450 atomic_add_64(size, -hdr->b_size); 2451 } 2452 hdr->b_datacnt -= 1; 2453 arc_cksum_verify(buf); 2454 2455 mutex_exit(hash_lock); 2456 2457 nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 2458 nhdr->b_size = blksz; 2459 nhdr->b_spa = spa; 2460 nhdr->b_type = type; 2461 nhdr->b_buf = buf; 2462 nhdr->b_state = arc_anon; 2463 nhdr->b_arc_access = 0; 2464 nhdr->b_flags = 0; 2465 nhdr->b_datacnt = 1; 2466 nhdr->b_freeze_cksum = NULL; 2467 (void) refcount_add(&nhdr->b_refcnt, tag); 2468 buf->b_hdr = nhdr; 2469 atomic_add_64(&arc_anon->arcs_size, blksz); 2470 2471 hdr = nhdr; 2472 } else { 2473 
ASSERT(refcount_count(&hdr->b_refcnt) == 1); 2474 ASSERT(!list_link_active(&hdr->b_arc_node)); 2475 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2476 arc_change_state(arc_anon, hdr, hash_lock); 2477 hdr->b_arc_access = 0; 2478 mutex_exit(hash_lock); 2479 bzero(&hdr->b_dva, sizeof (dva_t)); 2480 hdr->b_birth = 0; 2481 hdr->b_cksum0 = 0; 2482 arc_buf_thaw(buf); 2483 } 2484 buf->b_efunc = NULL; 2485 buf->b_private = NULL; 2486 } 2487 2488 int 2489 arc_released(arc_buf_t *buf) 2490 { 2491 return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2492 } 2493 2494 int 2495 arc_has_callback(arc_buf_t *buf) 2496 { 2497 return (buf->b_efunc != NULL); 2498 } 2499 2500 #ifdef ZFS_DEBUG 2501 int 2502 arc_referenced(arc_buf_t *buf) 2503 { 2504 return (refcount_count(&buf->b_hdr->b_refcnt)); 2505 } 2506 #endif 2507 2508 static void 2509 arc_write_ready(zio_t *zio) 2510 { 2511 arc_write_callback_t *callback = zio->io_private; 2512 arc_buf_t *buf = callback->awcb_buf; 2513 2514 if (callback->awcb_ready) { 2515 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2516 callback->awcb_ready(zio, buf, callback->awcb_private); 2517 } 2518 arc_cksum_compute(buf); 2519 } 2520 2521 static void 2522 arc_write_done(zio_t *zio) 2523 { 2524 arc_write_callback_t *callback = zio->io_private; 2525 arc_buf_t *buf = callback->awcb_buf; 2526 arc_buf_hdr_t *hdr = buf->b_hdr; 2527 2528 hdr->b_acb = NULL; 2529 2530 /* this buffer is on no lists and is not in the hash table */ 2531 ASSERT3P(hdr->b_state, ==, arc_anon); 2532 2533 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 2534 hdr->b_birth = zio->io_bp->blk_birth; 2535 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 2536 /* 2537 * If the block to be written was all-zero, we may have 2538 * compressed it away. In this case no write was performed 2539 * so there will be no dva/birth-date/checksum. The buffer 2540 * must therefor remain anonymous (and uncached). 
2541 */ 2542 if (!BUF_EMPTY(hdr)) { 2543 arc_buf_hdr_t *exists; 2544 kmutex_t *hash_lock; 2545 2546 arc_cksum_verify(buf); 2547 2548 exists = buf_hash_insert(hdr, &hash_lock); 2549 if (exists) { 2550 /* 2551 * This can only happen if we overwrite for 2552 * sync-to-convergence, because we remove 2553 * buffers from the hash table when we arc_free(). 2554 */ 2555 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2556 BP_IDENTITY(zio->io_bp))); 2557 ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2558 zio->io_bp->blk_birth); 2559 2560 ASSERT(refcount_is_zero(&exists->b_refcnt)); 2561 arc_change_state(arc_anon, exists, hash_lock); 2562 mutex_exit(hash_lock); 2563 arc_hdr_destroy(exists); 2564 exists = buf_hash_insert(hdr, &hash_lock); 2565 ASSERT3P(exists, ==, NULL); 2566 } 2567 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2568 arc_access(hdr, hash_lock); 2569 mutex_exit(hash_lock); 2570 } else if (callback->awcb_done == NULL) { 2571 int destroy_hdr; 2572 /* 2573 * This is an anonymous buffer with no user callback, 2574 * destroy it if there are no active references. 
2575 */ 2576 mutex_enter(&arc_eviction_mtx); 2577 destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 2578 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2579 mutex_exit(&arc_eviction_mtx); 2580 if (destroy_hdr) 2581 arc_hdr_destroy(hdr); 2582 } else { 2583 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2584 } 2585 2586 if (callback->awcb_done) { 2587 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 2588 callback->awcb_done(zio, buf, callback->awcb_private); 2589 } 2590 2591 kmem_free(callback, sizeof (arc_write_callback_t)); 2592 } 2593 2594 zio_t * 2595 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2596 uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 2597 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 2598 int flags, zbookmark_t *zb) 2599 { 2600 arc_buf_hdr_t *hdr = buf->b_hdr; 2601 arc_write_callback_t *callback; 2602 zio_t *zio; 2603 2604 /* this is a private buffer - no locking required */ 2605 ASSERT3P(hdr->b_state, ==, arc_anon); 2606 ASSERT(BUF_EMPTY(hdr)); 2607 ASSERT(!HDR_IO_ERROR(hdr)); 2608 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 2609 ASSERT(hdr->b_acb == 0); 2610 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 2611 callback->awcb_ready = ready; 2612 callback->awcb_done = done; 2613 callback->awcb_private = private; 2614 callback->awcb_buf = buf; 2615 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2616 zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, 2617 buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 2618 priority, flags, zb); 2619 2620 return (zio); 2621 } 2622 2623 int 2624 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2625 zio_done_func_t *done, void *private, uint32_t arc_flags) 2626 { 2627 arc_buf_hdr_t *ab; 2628 kmutex_t *hash_lock; 2629 zio_t *zio; 2630 2631 /* 2632 * If this buffer is in the cache, release it, so it 2633 * can be re-used. 
2634 */ 2635 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2636 if (ab != NULL) { 2637 /* 2638 * The checksum of blocks to free is not always 2639 * preserved (eg. on the deadlist). However, if it is 2640 * nonzero, it should match what we have in the cache. 2641 */ 2642 ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2643 ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 2644 if (ab->b_state != arc_anon) 2645 arc_change_state(arc_anon, ab, hash_lock); 2646 if (HDR_IO_IN_PROGRESS(ab)) { 2647 /* 2648 * This should only happen when we prefetch. 2649 */ 2650 ASSERT(ab->b_flags & ARC_PREFETCH); 2651 ASSERT3U(ab->b_datacnt, ==, 1); 2652 ab->b_flags |= ARC_FREED_IN_READ; 2653 if (HDR_IN_HASH_TABLE(ab)) 2654 buf_hash_remove(ab); 2655 ab->b_arc_access = 0; 2656 bzero(&ab->b_dva, sizeof (dva_t)); 2657 ab->b_birth = 0; 2658 ab->b_cksum0 = 0; 2659 ab->b_buf->b_efunc = NULL; 2660 ab->b_buf->b_private = NULL; 2661 mutex_exit(hash_lock); 2662 } else if (refcount_is_zero(&ab->b_refcnt)) { 2663 mutex_exit(hash_lock); 2664 arc_hdr_destroy(ab); 2665 ARCSTAT_BUMP(arcstat_deleted); 2666 } else { 2667 /* 2668 * We still have an active reference on this 2669 * buffer. This can happen, e.g., from 2670 * dbuf_unoverride(). 
2671 */ 2672 ASSERT(!HDR_IN_HASH_TABLE(ab)); 2673 ab->b_arc_access = 0; 2674 bzero(&ab->b_dva, sizeof (dva_t)); 2675 ab->b_birth = 0; 2676 ab->b_cksum0 = 0; 2677 ab->b_buf->b_efunc = NULL; 2678 ab->b_buf->b_private = NULL; 2679 mutex_exit(hash_lock); 2680 } 2681 } 2682 2683 zio = zio_free(pio, spa, txg, bp, done, private); 2684 2685 if (arc_flags & ARC_WAIT) 2686 return (zio_wait(zio)); 2687 2688 ASSERT(arc_flags & ARC_NOWAIT); 2689 zio_nowait(zio); 2690 2691 return (0); 2692 } 2693 2694 void 2695 arc_tempreserve_clear(uint64_t tempreserve) 2696 { 2697 atomic_add_64(&arc_tempreserve, -tempreserve); 2698 ASSERT((int64_t)arc_tempreserve >= 0); 2699 } 2700 2701 int 2702 arc_tempreserve_space(uint64_t tempreserve) 2703 { 2704 #ifdef ZFS_DEBUG 2705 /* 2706 * Once in a while, fail for no reason. Everything should cope. 2707 */ 2708 if (spa_get_random(10000) == 0) { 2709 dprintf("forcing random failure\n"); 2710 return (ERESTART); 2711 } 2712 #endif 2713 if (tempreserve > arc_c/4 && !arc_no_grow) 2714 arc_c = MIN(arc_c_max, tempreserve * 4); 2715 if (tempreserve > arc_c) 2716 return (ENOMEM); 2717 2718 /* 2719 * Throttle writes when the amount of dirty data in the cache 2720 * gets too large. We try to keep the cache less than half full 2721 * of dirty blocks so that our sync times don't grow too large. 2722 * Note: if two requests come in concurrently, we might let them 2723 * both succeed, when one of them should fail. Not a huge deal. 2724 * 2725 * XXX The limit should be adjusted dynamically to keep the time 2726 * to sync a dataset fixed (around 1-5 seconds?). 
2727 */ 2728 2729 if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 2730 arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { 2731 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 2732 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 2733 arc_tempreserve>>10, 2734 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 2735 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 2736 tempreserve>>10, arc_c>>10); 2737 return (ERESTART); 2738 } 2739 atomic_add_64(&arc_tempreserve, tempreserve); 2740 return (0); 2741 } 2742 2743 void 2744 arc_init(void) 2745 { 2746 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2747 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2748 2749 /* Convert seconds to clock ticks */ 2750 arc_min_prefetch_lifespan = 1 * hz; 2751 2752 /* Start out with 1/8 of all memory */ 2753 arc_c = physmem * PAGESIZE / 8; 2754 2755 #ifdef _KERNEL 2756 /* 2757 * On architectures where the physical memory can be larger 2758 * than the addressable space (intel in 32-bit mode), we may 2759 * need to limit the cache to 1/8 of VM size. 2760 */ 2761 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2762 #endif 2763 2764 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2765 arc_c_min = MAX(arc_c / 4, 64<<20); 2766 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2767 if (arc_c * 8 >= 1<<30) 2768 arc_c_max = (arc_c * 8) - (1<<30); 2769 else 2770 arc_c_max = arc_c_min; 2771 arc_c_max = MAX(arc_c * 6, arc_c_max); 2772 2773 /* 2774 * Allow the tunables to override our calculations if they are 2775 * reasonable (ie. 
over 64MB) 2776 */ 2777 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 2778 arc_c_max = zfs_arc_max; 2779 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 2780 arc_c_min = zfs_arc_min; 2781 2782 arc_c = arc_c_max; 2783 arc_p = (arc_c >> 1); 2784 2785 /* limit meta-data to 1/4 of the arc capacity */ 2786 arc_meta_limit = arc_c_max / 4; 2787 2788 /* Allow the tunable to override if it is reasonable */ 2789 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 2790 arc_meta_limit = zfs_arc_meta_limit; 2791 2792 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 2793 arc_c_min = arc_meta_limit / 2; 2794 2795 /* if kmem_flags are set, lets try to use less memory */ 2796 if (kmem_debugging()) 2797 arc_c = arc_c / 2; 2798 if (arc_c < arc_c_min) 2799 arc_c = arc_c_min; 2800 2801 arc_anon = &ARC_anon; 2802 arc_mru = &ARC_mru; 2803 arc_mru_ghost = &ARC_mru_ghost; 2804 arc_mfu = &ARC_mfu; 2805 arc_mfu_ghost = &ARC_mfu_ghost; 2806 arc_size = 0; 2807 2808 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2809 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2810 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2811 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2812 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2813 2814 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 2815 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2816 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 2817 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2818 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 2819 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2820 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 2821 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2822 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 2823 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2824 
list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 2825 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2826 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 2827 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2828 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 2829 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2830 2831 buf_init(); 2832 2833 arc_thread_exit = 0; 2834 arc_eviction_list = NULL; 2835 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2836 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2837 2838 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 2839 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 2840 2841 if (arc_ksp != NULL) { 2842 arc_ksp->ks_data = &arc_stats; 2843 kstat_install(arc_ksp); 2844 } 2845 2846 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2847 TS_RUN, minclsyspri); 2848 2849 arc_dead = FALSE; 2850 } 2851 2852 void 2853 arc_fini(void) 2854 { 2855 mutex_enter(&arc_reclaim_thr_lock); 2856 arc_thread_exit = 1; 2857 while (arc_thread_exit != 0) 2858 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2859 mutex_exit(&arc_reclaim_thr_lock); 2860 2861 arc_flush(); 2862 2863 arc_dead = TRUE; 2864 2865 if (arc_ksp != NULL) { 2866 kstat_delete(arc_ksp); 2867 arc_ksp = NULL; 2868 } 2869 2870 mutex_destroy(&arc_eviction_mtx); 2871 mutex_destroy(&arc_reclaim_thr_lock); 2872 cv_destroy(&arc_reclaim_thr_cv); 2873 2874 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 2875 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 2876 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 2877 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 2878 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 2879 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 2880 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 2881 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 2882 2883 
mutex_destroy(&arc_anon->arcs_mtx); 2884 mutex_destroy(&arc_mru->arcs_mtx); 2885 mutex_destroy(&arc_mru_ghost->arcs_mtx); 2886 mutex_destroy(&arc_mfu->arcs_mtx); 2887 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 2888 2889 buf_fini(); 2890 } 2891