/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */
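
/*
 * A toy illustration of point 3 above (the sizes here are hypothetical,
 * not taken from the code): because cache blocks vary in size, making
 * room for a new block means walking evictable headers and accumulating
 * freed bytes until the request is covered.  Conceptually:
 *
 *	uint64_t needed = 128 << 10;	// e.g. one 128K miss
 *	uint64_t freed = 0;
 *	// sizes of evictable blocks, eviction order first (hypothetical)
 *	uint64_t sizes[] = { 16 << 10, 48 << 10, 64 << 10, 128 << 10 };
 *	for (int i = 0; i < 4 && freed < needed; i++)
 *		freed += sizes[i];	// evict 16K + 48K + 64K = 128K
 *
 * arc_evict() below implements the real version of this walk.
 */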

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use: mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */
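
/*
 * A minimal sketch of the lock-ordering rule described above (the caller
 * shown here is hypothetical): a thread that already holds an arc state
 * list lock may only try-acquire a hash lock, never block on it.
 *
 *	mutex_enter(&state->arcs_mtx);
 *	kmutex_t *hash_lock = HDR_LOCK(ab);
 *	if (mutex_tryenter(hash_lock)) {
 *		// safe: we never block on a hash lock under arcs_mtx
 *		mutex_exit(hash_lock);
 *	} else {
 *		// potential deadlock avoided; skip this header
 *	}
 *	mutex_exit(&state->arcs_mtx);
 *
 * This is the pattern arc_evict() and arc_evict_ghost() follow.
 */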

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to the buffer, they are
 * linked onto a list in one of these arc states.  These are the
 * only buffers that can be evicted or deleted.  Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data.  Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.  Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */

typedef struct arc_state {
	list_t	arcs_list[ARC_BUFC_NUMTYPES];	/* list of evictable buffers */
	uint64_t arcs_lsize[ARC_BUFC_NUMTYPES];	/* amount of evictable data */
	uint64_t arcs_size;	/* total amount of data in this state */
	kmutex_t arcs_mtx;
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

typedef struct arc_stats {
	kstat_named_t	arcstat_hits;
	kstat_named_t	arcstat_misses;
	kstat_named_t	arcstat_demand_data_hits;
	kstat_named_t	arcstat_demand_data_misses;
	kstat_named_t	arcstat_demand_metadata_hits;
	kstat_named_t	arcstat_demand_metadata_misses;
	kstat_named_t	arcstat_prefetch_data_hits;
	kstat_named_t	arcstat_prefetch_data_misses;
	kstat_named_t	arcstat_prefetch_metadata_hits;
	kstat_named_t	arcstat_prefetch_metadata_misses;
	kstat_named_t	arcstat_mru_hits;
	kstat_named_t	arcstat_mru_ghost_hits;
	kstat_named_t	arcstat_mfu_hits;
	kstat_named_t	arcstat_mfu_ghost_hits;
	kstat_named_t	arcstat_deleted;
	kstat_named_t	arcstat_recycle_miss;
	kstat_named_t	arcstat_mutex_miss;
	kstat_named_t	arcstat_evict_skip;
	kstat_named_t	arcstat_hash_elements;
	kstat_named_t	arcstat_hash_elements_max;
	kstat_named_t	arcstat_hash_collisions;
	kstat_named_t	arcstat_hash_chains;
	kstat_named_t	arcstat_hash_chain_max;
	kstat_named_t	arcstat_p;
	kstat_named_t	arcstat_c;
	kstat_named_t	arcstat_c_min;
	kstat_named_t	arcstat_c_max;
	kstat_named_t	arcstat_size;
} arc_stats_t;

static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 }
};

#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val));

#define	ARCSTAT_BUMP(stat)	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}
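
/*
 * A small usage sketch of ARCSTAT_CONDSTAT (mirroring the real call in
 * arc_buf_add_ref() further down): for a hit on a non-prefetch metadata
 * buffer this expands to a single ARCSTAT_BUMP(arcstat_demand_metadata_hits).
 *
 *	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
 *	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
 *	    data, metadata, hits);
 *
 * cond1 selects demand vs. prefetch, cond2 selects data vs. metadata,
 * and the trailing argument selects the hits or misses family.
 */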

kstat_t			*arc_ksp;
static arc_state_t	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
 */
#define	arc_size	ARCSTAT(arcstat_size)	/* actual total arc size */
#define	arc_p		ARCSTAT(arcstat_p)	/* target size of MRU */
#define	arc_c		ARCSTAT(arcstat_c)	/* target size of cache */
#define	arc_c_min	ARCSTAT(arcstat_c_min)	/* min target cache size */
#define	arc_c_max	ARCSTAT(arcstat_c_max)	/* max target cache size */

static int		arc_no_grow;	/* Don't try to grow cache size */
static uint64_t		arc_tempreserve;
static uint64_t		arc_meta_used;
static uint64_t		arc_meta_limit;
static uint64_t		arc_meta_max = 0;

typedef struct arc_callback arc_callback_t;

struct arc_callback {
	void			*acb_private;
	arc_done_func_t		*acb_done;
	arc_byteswap_func_t	*acb_byteswap;
	arc_buf_t		*acb_buf;
	zio_t			*acb_zio_dummy;
	arc_callback_t		*acb_next;
};

typedef struct arc_write_callback arc_write_callback_t;

struct arc_write_callback {
	void		*awcb_private;
	arc_done_func_t	*awcb_ready;
	arc_done_func_t	*awcb_done;
	arc_buf_t	*awcb_buf;
};

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;
	uint64_t		b_cksum0;

	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;

	arc_buf_hdr_t		*b_hash_next;
	arc_buf_t		*b_buf;
	uint32_t		b_flags;
	uint32_t		b_datacnt;

	arc_callback_t		*b_acb;
	kcondvar_t		b_cv;

	/* immutable */
	arc_buf_contents_t	b_type;
	uint64_t		b_size;
	spa_t			*b_spa;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	list_node_t		b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	refcount_t		b_refcnt;
};

static arc_buf_t *arc_eviction_list;
static kmutex_t arc_eviction_mtx;
static arc_buf_hdr_t arc_eviction_hdr;
static void arc_get_data_buf(arc_buf_t *buf);
static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
static int arc_evict_needed(arc_buf_contents_t type);

#define	GHOST_STATE(state)	\
	((state) == arc_mru_ghost || (state) == arc_mfu_ghost)

/*
 * Private ARC flags.  These flags are private ARC only flags that will show up
 * in b_flags in the arc_buf_hdr_t.  Some flags are publicly declared, and can
 * be passed in as arc_flags in things like arc_read.  However, these flags
 * should never be passed and should only be set by ARC code.  When adding new
 * public flags, make sure not to smash the private ones.
 */

#define	ARC_IN_HASH_TABLE	(1 << 9)	/* this buffer is hashed */
#define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
#define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
#define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
#define	ARC_BUF_AVAILABLE	(1 << 13)	/* block not in active use */
#define	ARC_INDIRECT		(1 << 14)	/* this is an indirect block */

#define	HDR_IN_HASH_TABLE(hdr)	((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
#define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
#define	HDR_BUF_AVAILABLE(hdr)	((hdr)->b_flags & ARC_BUF_AVAILABLE)

/*
 * Hash table routines
 */

#define	HT_LOCK_PAD	64

struct ht_lock {
	kmutex_t	ht_lock;
#ifdef _KERNEL
	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
#endif
};

#define	BUF_LOCKS 256
typedef struct buf_hash_table {
	uint64_t ht_mask;
	arc_buf_hdr_t **ht_table;
	struct ht_lock ht_locks[BUF_LOCKS];
} buf_hash_table_t;

static buf_hash_table_t buf_hash_table;

#define	BUF_HASH_INDEX(spa, dva, birth) \
	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
#define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
#define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
#define	HDR_LOCK(buf) \
	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))

uint64_t zfs_crc64_table[256];

static uint64_t
buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
{
	uintptr_t spav = (uintptr_t)spa;
	uint8_t *vdva = (uint8_t *)dva;
	uint64_t crc = -1ULL;
	int i;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);

	for (i = 0; i < sizeof (dva_t); i++)
		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];

	crc ^= (spav>>8) ^ birth;

	return (crc);
}

#define	BUF_EMPTY(buf)						\
	((buf)->b_dva.dva_word[0] == 0 &&			\
	(buf)->b_dva.dva_word[1] == 0 &&			\
	(buf)->b_birth == 0)

#define	BUF_EQUAL(spa, dva, birth, buf)				\
	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
	((buf)->b_birth == birth) && ((buf)->b_spa == spa)

static arc_buf_hdr_t *
buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *buf;

	mutex_enter(hash_lock);
	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
	    buf = buf->b_hash_next) {
		if (BUF_EQUAL(spa, dva, birth, buf)) {
			*lockp = hash_lock;
			return (buf);
		}
	}
	mutex_exit(hash_lock);
	*lockp = NULL;
	return (NULL);
}
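
/*
 * A usage sketch for buf_hash_find() (variable names here are hypothetical,
 * not taken from a real caller): on a hit the hash mutex is returned held
 * and must be dropped by the caller; on a miss *lockp is NULL.
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *hdr = buf_hash_find(spa, dva, birth, &hash_lock);
 *	if (hdr != NULL) {
 *		// hash_lock is held; hdr cannot be evicted underneath us
 *		mutex_exit(hash_lock);
 *	}
 *
 * arc_read_done() at the end of this section uses this pattern when a
 * read completes.
 */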

/*
 * Insert an entry into the hash table.  If there is already an element
 * equal to elem in the hash table, then the already existing element
 * will be returned and the new element will not be inserted.
 * Otherwise returns NULL.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
 */
static kmem_cache_t *hdr_cache;
static kmem_cache_t *buf_cache;

static void
buf_fini(void)
{
	int i;

	kmem_free(buf_hash_table.ht_table,
	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
	for (i = 0; i < BUF_LOCKS; i++)
		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
	kmem_cache_destroy(hdr_cache);
	kmem_cache_destroy(buf_cache);
}

/*
 * Constructor callback - called when the cache is empty
 * and a new buf is requested.
 */
/* ARGSUSED */
static int
hdr_cons(void *vbuf, void *unused, int kmflag)
{
	arc_buf_hdr_t *buf = vbuf;

	bzero(buf, sizeof (arc_buf_hdr_t));
	refcount_create(&buf->b_refcnt);
	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
	return (0);
}

/*
 * Destructor callback - called when a cached buf is
 * no longer required.
 */
/* ARGSUSED */
static void
hdr_dest(void *vbuf, void *unused)
{
	arc_buf_hdr_t *buf = vbuf;

	refcount_destroy(&buf->b_refcnt);
	cv_destroy(&buf->b_cv);
}

/*
 * Reclaim callback -- invoked when memory is low.
 */
/* ARGSUSED */
static void
hdr_recl(void *unused)
{
	dprintf("hdr_recl called\n");
	/*
	 * umem calls the reclaim func when we destroy the buf cache,
	 * which is after we do arc_fini().
	 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}
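
/*
 * Worked example of the sizing loop above (numbers chosen for illustration,
 * assuming 4KB pages): with 4GB of physical memory, physmem * PAGESIZE is
 * 2^32, so hsize doubles from 2^12 until hsize * 65536 >= 2^32, giving
 * hsize = 2^16 = 65536 buckets.  At 8 bytes per bucket pointer that is a
 * 512KB table, which matches the 128KB-per-GB estimate in the comment.
 */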

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL ||
	    (buf->b_hdr->b_flags & ARC_IO_ERROR)) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

static void
arc_cksum_compute(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_thaw(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	if (buf->b_hdr->b_state != arc_anon)
		panic("modifying non-anon buffer!");
	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
		panic("modifying buffer while i/o in progress!");
	arc_cksum_verify(buf);
	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_state == arc_anon);
	arc_cksum_compute(buf);
}

static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc_anon)) {
		uint64_t delta = ab->b_size * ab->b_datacnt;
		list_t *list = &ab->b_state->arcs_list[ab->b_type];
		uint64_t *size = &ab->b_state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx));
		mutex_enter(&ab->b_state->arcs_mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(list, ab);
		if (GHOST_STATE(ab->b_state)) {
			ASSERT3U(ab->b_datacnt, ==, 0);
			ASSERT3P(ab->b_buf, ==, NULL);
			delta = ab->b_size;
		}
		ASSERT(delta > 0);
		ASSERT3U(*size, >=, delta);
		atomic_add_64(size, -delta);
		mutex_exit(&ab->b_state->arcs_mtx);
		/* remove the prefetch flag if we get a reference */
		if (ab->b_flags & ARC_PREFETCH)
			ab->b_flags &= ~ARC_PREFETCH;
	}
}

static int
remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	int cnt;
	arc_state_t *state = ab->b_state;

	ASSERT(state == arc_anon || MUTEX_HELD(hash_lock));
	ASSERT(!GHOST_STATE(state));

	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
	    (state != arc_anon)) {
		uint64_t *size = &state->arcs_lsize[ab->b_type];

		ASSERT(!MUTEX_HELD(&state->arcs_mtx));
		mutex_enter(&state->arcs_mtx);
		ASSERT(!list_link_active(&ab->b_arc_node));
		list_insert_head(&state->arcs_list[ab->b_type], ab);
		ASSERT(ab->b_datacnt > 0);
		atomic_add_64(size, ab->b_size * ab->b_datacnt);
		mutex_exit(&state->arcs_mtx);
	}
	return (cnt);
}

/*
 * Move the supplied buffer to the indicated state.  The mutex
 * for the buffer must be held by the caller.
 */
static void
arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
	arc_state_t *old_state = ab->b_state;
	int64_t refcnt = refcount_count(&ab->b_refcnt);
	uint64_t from_delta, to_delta;

	ASSERT(MUTEX_HELD(hash_lock));
	ASSERT(new_state != old_state);
	ASSERT(refcnt == 0 || ab->b_datacnt > 0);
	ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));

	from_delta = to_delta = ab->b_datacnt * ab->b_size;

	/*
	 * If this buffer is evictable, transfer it from the
	 * old state list to the new state list.
	 */
	if (refcnt == 0) {
		if (old_state != arc_anon) {
			int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx);
			uint64_t *size = &old_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&old_state->arcs_mtx);

			ASSERT(list_link_active(&ab->b_arc_node));
			list_remove(&old_state->arcs_list[ab->b_type], ab);

			/*
			 * If prefetching out of the ghost cache,
			 * we will have a non-null datacnt.
			 */
			if (GHOST_STATE(old_state) && ab->b_datacnt == 0) {
				/* ghost elements have a ghost size */
				ASSERT(ab->b_buf == NULL);
				from_delta = ab->b_size;
			}
			ASSERT3U(*size, >=, from_delta);
			atomic_add_64(size, -from_delta);

			if (use_mutex)
				mutex_exit(&old_state->arcs_mtx);
		}
		if (new_state != arc_anon) {
			int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx);
			uint64_t *size = &new_state->arcs_lsize[ab->b_type];

			if (use_mutex)
				mutex_enter(&new_state->arcs_mtx);

			list_insert_head(&new_state->arcs_list[ab->b_type], ab);

			/* ghost elements have a ghost size */
			if (GHOST_STATE(new_state)) {
				ASSERT(ab->b_datacnt == 0);
				ASSERT(ab->b_buf == NULL);
				to_delta = ab->b_size;
			}
			atomic_add_64(size, to_delta);
			ASSERT3U(new_state->arcs_size + to_delta, >=, *size);

			if (use_mutex)
				mutex_exit(&new_state->arcs_mtx);
		}
	}

	ASSERT(!BUF_EMPTY(ab));
	if (new_state == arc_anon && old_state != arc_anon) {
		buf_hash_remove(ab);
	}

	/* adjust state sizes */
	if (to_delta)
		atomic_add_64(&new_state->arcs_size, to_delta);
	if (from_delta) {
		ASSERT3U(old_state->arcs_size, >=, from_delta);
		atomic_add_64(&old_state->arcs_size, -from_delta);
	}
	ab->b_state = new_state;
}

void
arc_space_consume(uint64_t space)
{
	atomic_add_64(&arc_meta_used, space);
	atomic_add_64(&arc_size, space);
}

void
arc_space_return(uint64_t space)
{
	ASSERT(arc_meta_used >= space);
	if (arc_meta_max < arc_meta_used)
		arc_meta_max = arc_meta_used;
	atomic_add_64(&arc_meta_used, -space);
	ASSERT(arc_size >= space);
	atomic_add_64(&arc_size, -space);
}

void *
arc_data_buf_alloc(uint64_t size)
{
	if (arc_evict_needed(ARC_BUFC_DATA))
		cv_signal(&arc_reclaim_thr_cv);
	atomic_add_64(&arc_size, size);
	return (zio_data_buf_alloc(size));
}

void
arc_data_buf_free(void *buf, uint64_t size)
{
	zio_data_buf_free(buf, size);
	ASSERT(arc_size >= size);
	atomic_add_64(&arc_size, -size);
}

arc_buf_t *
arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type)
{
	arc_buf_hdr_t *hdr;
	arc_buf_t *buf;

	ASSERT3U(size, >, 0);
	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
	ASSERT(BUF_EMPTY(hdr));
	hdr->b_size = size;
	hdr->b_type = type;
	hdr->b_spa = spa;
	hdr->b_state = arc_anon;
	hdr->b_arc_access = 0;
	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = NULL;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	hdr->b_datacnt = 1;
	hdr->b_flags = 0;
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	(void) refcount_add(&hdr->b_refcnt, tag);

	return (buf);
}

static arc_buf_t *
arc_buf_clone(arc_buf_t *from)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr = from->b_hdr;
	uint64_t size = hdr->b_size;

	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
	buf->b_hdr = hdr;
	buf->b_data = NULL;
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_next = hdr->b_buf;
	hdr->b_buf = buf;
	arc_get_data_buf(buf);
	bcopy(from->b_data, buf->b_data, size);
	hdr->b_datacnt += 1;
	return (buf);
}
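
/*
 * A minimal lifecycle sketch for the allocation interfaces above (the
 * caller, tag and size are hypothetical): a freshly allocated buffer is
 * anonymous and referenced by the caller's tag until it is released.
 *
 *	arc_buf_t *buf = arc_buf_alloc(spa, SPA_MAXBLOCKSIZE, FTAG,
 *	    ARC_BUFC_DATA);
 *	// ... fill buf->b_data ...
 *	(void) arc_buf_remove_ref(buf, FTAG);	// frees it while still anon
 *
 * arc_buf_remove_ref() appears further down; for an anonymous buffer it
 * simply calls arc_buf_free().
 */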

void
arc_buf_add_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;

	/*
	 * Check to see if this buffer is currently being evicted via
	 * arc_do_user_evicts().
	 */
	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		mutex_exit(&arc_eviction_mtx);
		return;
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);
	if (buf->b_data == NULL) {
		/*
		 * This buffer is evicted.
		 */
		mutex_exit(hash_lock);
		return;
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
	add_reference(hdr, hash_lock, tag);
	arc_access(hdr, hash_lock);
	mutex_exit(hash_lock);
	ARCSTAT_BUMP(arcstat_hits);
	ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH),
	    demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
	    data, metadata, hits);
}

static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
	arc_buf_t **bufp;

	/* free up data associated with the buf */
	if (buf->b_data) {
		arc_state_t *state = buf->b_hdr->b_state;
		uint64_t size = buf->b_hdr->b_size;
		arc_buf_contents_t type = buf->b_hdr->b_type;

		arc_cksum_verify(buf);
		if (!recycle) {
			if (type == ARC_BUFC_METADATA) {
				zio_buf_free(buf->b_data, size);
				arc_space_return(size);
			} else {
				ASSERT(type == ARC_BUFC_DATA);
				zio_data_buf_free(buf->b_data, size);
				atomic_add_64(&arc_size, -size);
			}
		}
		if (list_link_active(&buf->b_hdr->b_arc_node)) {
			uint64_t *cnt = &state->arcs_lsize[type];

			ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
			ASSERT(state != arc_anon);

			ASSERT3U(*cnt, >=, size);
			atomic_add_64(cnt, -size);
		}
		ASSERT3U(state->arcs_size, >=, size);
		atomic_add_64(&state->arcs_size, -size);
		buf->b_data = NULL;
		ASSERT(buf->b_hdr->b_datacnt > 0);
		buf->b_hdr->b_datacnt -= 1;
	}

	/* only remove the buf if requested */
	if (!all)
		return;

	/* remove the buf from the hdr list */
	for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
		continue;
	*bufp = buf->b_next;

	ASSERT(buf->b_efunc == NULL);

	/* clean up the buf */
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
}

static void
arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
	ASSERT(refcount_is_zero(&hdr->b_refcnt));
	ASSERT3P(hdr->b_state, ==, arc_anon);
	ASSERT(!HDR_IO_IN_PROGRESS(hdr));

	if (!BUF_EMPTY(hdr)) {
		ASSERT(!HDR_IN_HASH_TABLE(hdr));
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	while (hdr->b_buf) {
		arc_buf_t *buf = hdr->b_buf;

		if (buf->b_efunc) {
			mutex_enter(&arc_eviction_mtx);
			ASSERT(buf->b_hdr != NULL);
			arc_buf_destroy(hdr->b_buf, FALSE, FALSE);
			hdr->b_buf = buf->b_next;
			buf->b_hdr = &arc_eviction_hdr;
			buf->b_next = arc_eviction_list;
			arc_eviction_list = buf;
			mutex_exit(&arc_eviction_mtx);
		} else {
			arc_buf_destroy(hdr->b_buf, FALSE, TRUE);
		}
	}
	if (hdr->b_freeze_cksum != NULL) {
		kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		hdr->b_freeze_cksum = NULL;
	}

	ASSERT(!list_link_active(&hdr->b_arc_node));
	ASSERT3P(hdr->b_hash_next, ==, NULL);
	ASSERT3P(hdr->b_acb, ==, NULL);
	kmem_cache_free(hdr_cache, hdr);
}

void
arc_buf_free(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	int hashed = hdr->b_state != arc_anon;

	ASSERT(buf->b_efunc == NULL);
	ASSERT(buf->b_data != NULL);

	if (hashed) {
		kmutex_t *hash_lock = HDR_LOCK(hdr);

		mutex_enter(hash_lock);
		(void) remove_reference(hdr, hash_lock, tag);
		if (hdr->b_datacnt > 1)
			arc_buf_destroy(buf, FALSE, TRUE);
		else
			hdr->b_flags |= ARC_BUF_AVAILABLE;
		mutex_exit(hash_lock);
	} else if (HDR_IO_IN_PROGRESS(hdr)) {
		int destroy_hdr;
		/*
		 * We are in the middle of an async write.  Don't destroy
		 * this buffer unless the write completes before we finish
		 * decrementing the reference count.
		 */
		mutex_enter(&arc_eviction_mtx);
		(void) remove_reference(hdr, NULL, tag);
		ASSERT(refcount_is_zero(&hdr->b_refcnt));
		destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		if (remove_reference(hdr, NULL, tag) > 0) {
			ASSERT(HDR_IO_ERROR(hdr));
			arc_buf_destroy(buf, FALSE, TRUE);
		} else {
			arc_hdr_destroy(hdr);
		}
	}
}

int
arc_buf_remove_ref(arc_buf_t *buf, void* tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);
	int no_callback = (buf->b_efunc == NULL);

	if (hdr->b_state == arc_anon) {
		arc_buf_free(buf, tag);
		return (no_callback);
	}

	mutex_enter(hash_lock);
	ASSERT(hdr->b_state != arc_anon);
	ASSERT(buf->b_data != NULL);

	(void) remove_reference(hdr, hash_lock, tag);
	if (hdr->b_datacnt > 1) {
		if (no_callback)
			arc_buf_destroy(buf, FALSE, TRUE);
	} else if (no_callback) {
		ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
		hdr->b_flags |= ARC_BUF_AVAILABLE;
	}
	ASSERT(no_callback || hdr->b_datacnt > 1 ||
	    refcount_is_zero(&hdr->b_refcnt));
	mutex_exit(hash_lock);
	return (no_callback);
}

int
arc_buf_size(arc_buf_t *buf)
{
	return (buf->b_hdr->b_size);
}

/*
 * Evict buffers from list until we've removed the specified number of
 * bytes.  Move the removed buffers to the appropriate evict state.
 * If the recycle flag is set, then attempt to "recycle" a buffer:
 * - look for a buffer to evict that is `bytes' long.
 * - return the data block from this buffer rather than freeing it.
 * This flag is used by callers that are trying to make space for a
 * new buffer in a full arc cache.
 */
static void *
arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
    arc_buf_contents_t type)
{
	arc_state_t *evicted_state;
	uint64_t bytes_evicted = 0, skipped = 0, missed = 0;
	arc_buf_hdr_t *ab, *ab_prev = NULL;
	list_t *list = &state->arcs_list[type];
	kmutex_t *hash_lock;
	boolean_t have_lock;
	void *stolen = NULL;

	ASSERT(state == arc_mru || state == arc_mfu);

	evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;

	mutex_enter(&state->arcs_mtx);
	mutex_enter(&evicted_state->arcs_mtx);

	for (ab = list_tail(list); ab; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		/* prefetch buffers have a minimum lifespan */
		if (HDR_IO_IN_PROGRESS(ab) ||
		    (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) &&
		    lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) {
			skipped++;
			continue;
		}
		/* "lookahead" for better eviction candidate */
		if (recycle && ab->b_size != bytes &&
		    ab_prev && ab_prev->b_size == bytes)
			continue;
		hash_lock = HDR_LOCK(ab);
		have_lock = MUTEX_HELD(hash_lock);
		if (have_lock || mutex_tryenter(hash_lock)) {
			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
			ASSERT(ab->b_datacnt > 0);
			while (ab->b_buf) {
				arc_buf_t *buf = ab->b_buf;
				if (buf->b_data) {
					bytes_evicted += ab->b_size;
					if (recycle && ab->b_type == type &&
					    ab->b_size == bytes) {
						stolen = buf->b_data;
						recycle = FALSE;
					}
				}
				if (buf->b_efunc) {
					mutex_enter(&arc_eviction_mtx);
					arc_buf_destroy(buf,
					    buf->b_data == stolen, FALSE);
					ab->b_buf = buf->b_next;
					buf->b_hdr = &arc_eviction_hdr;
					buf->b_next = arc_eviction_list;
					arc_eviction_list = buf;
					mutex_exit(&arc_eviction_mtx);
				} else {
					arc_buf_destroy(buf,
					    buf->b_data == stolen, TRUE);
				}
			}
			ASSERT(ab->b_datacnt == 0);
			arc_change_state(evicted_state, ab, hash_lock);
			ASSERT(HDR_IN_HASH_TABLE(ab));
			ab->b_flags = ARC_IN_HASH_TABLE;
			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
			if (!have_lock)
				mutex_exit(hash_lock);
			if (bytes >= 0 && bytes_evicted >= bytes)
				break;
		} else {
			missed += 1;
		}
	}

	mutex_exit(&evicted_state->arcs_mtx);
	mutex_exit(&state->arcs_mtx);

	if (bytes_evicted < bytes)
		dprintf("only evicted %lld bytes from %x",
		    (longlong_t)bytes_evicted, state);

	if (skipped)
		ARCSTAT_INCR(arcstat_evict_skip, skipped);

	if (missed)
		ARCSTAT_INCR(arcstat_mutex_miss, missed);

	return (stolen);
}
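
/*
 * A sketch of the two ways arc_evict() is called (the surrounding logic is
 * simplified; see arc_adjust() and arc_get_data_buf() for the real callers):
 *
 *	// make the MRU list give back `toevict' bytes of data buffers
 *	(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_DATA);
 *
 *	// or steal one same-sized data block for reuse on a cache miss
 *	buf->b_data = arc_evict(state, size, TRUE, type);
 *	if (buf->b_data == NULL)	// nothing of the right size was found
 *		buf->b_data = zio_data_buf_alloc(size);
 *
 * With recycle == TRUE the return value is the recycled data block (or NULL);
 * otherwise the return value is not meaningful and is discarded.
 */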

/*
 * Remove buffers from list until we've removed the specified number of
 * bytes.  Destroy the buffers that are removed.
 */
static void
arc_evict_ghost(arc_state_t *state, int64_t bytes)
{
	arc_buf_hdr_t *ab, *ab_prev;
	list_t *list = &state->arcs_list[ARC_BUFC_DATA];
	kmutex_t *hash_lock;
	uint64_t bytes_deleted = 0;
	uint64_t bufs_skipped = 0;

	ASSERT(GHOST_STATE(state));
top:
	mutex_enter(&state->arcs_mtx);
	for (ab = list_tail(list); ab; ab = ab_prev) {
		ab_prev = list_prev(list, ab);
		hash_lock = HDR_LOCK(ab);
		if (mutex_tryenter(hash_lock)) {
			ASSERT(!HDR_IO_IN_PROGRESS(ab));
			ASSERT(ab->b_buf == NULL);
			arc_change_state(arc_anon, ab, hash_lock);
			mutex_exit(hash_lock);
			ARCSTAT_BUMP(arcstat_deleted);
			bytes_deleted += ab->b_size;
			arc_hdr_destroy(ab);
			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
			if (bytes >= 0 && bytes_deleted >= bytes)
				break;
		} else {
			if (bytes < 0) {
				mutex_exit(&state->arcs_mtx);
				mutex_enter(hash_lock);
				mutex_exit(hash_lock);
				goto top;
			}
			bufs_skipped += 1;
		}
	}
	mutex_exit(&state->arcs_mtx);

	if (list == &state->arcs_list[ARC_BUFC_DATA] &&
	    (bytes < 0 || bytes_deleted < bytes)) {
		list = &state->arcs_list[ARC_BUFC_METADATA];
		goto top;
	}

	if (bufs_skipped) {
		ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped);
		ASSERT(bytes >= 0);
	}

	if (bytes_deleted < bytes)
		dprintf("only deleted %lld bytes from %p",
		    (longlong_t)bytes_deleted, state);
}

static void
arc_adjust(void)
{
	int64_t top_sz, mru_over, arc_over, todelete;

	top_sz = arc_anon->arcs_size + arc_mru->arcs_size;

	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) {
		int64_t toevict =
		    MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p);
		(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_DATA);
		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
	}

	if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) {
		int64_t toevict =
		    MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p);
		(void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_METADATA);
		top_sz = arc_anon->arcs_size + arc_mru->arcs_size;
	}

	mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c;

	if (mru_over > 0) {
		if (arc_mru_ghost->arcs_size > 0) {
			todelete = MIN(arc_mru_ghost->arcs_size, mru_over);
			arc_evict_ghost(arc_mru_ghost, todelete);
		}
	}

	if ((arc_over = arc_size - arc_c) > 0) {
		int64_t tbl_over;

		if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) {
			int64_t toevict =
			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over);
			(void) arc_evict(arc_mfu, toevict, FALSE,
			    ARC_BUFC_DATA);
			arc_over = arc_size - arc_c;
		}

		if (arc_over > 0 &&
		    arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) {
			int64_t toevict =
			    MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA],
			    arc_over);
			(void) arc_evict(arc_mfu, toevict, FALSE,
			    ARC_BUFC_METADATA);
		}

		tbl_over = arc_size + arc_mru_ghost->arcs_size +
		    arc_mfu_ghost->arcs_size - arc_c * 2;

		if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) {
			todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over);
			arc_evict_ghost(arc_mfu_ghost, todelete);
		}
	}
}

static void
arc_do_user_evicts(void)
{
	mutex_enter(&arc_eviction_mtx);
	while (arc_eviction_list != NULL) {
		arc_buf_t *buf = arc_eviction_list;
		arc_eviction_list = buf->b_next;
		buf->b_hdr = NULL;
		mutex_exit(&arc_eviction_mtx);

		if (buf->b_efunc != NULL)
			VERIFY(buf->b_efunc(buf) == 0);

		buf->b_efunc = NULL;
		buf->b_private = NULL;
		kmem_cache_free(buf_cache, buf);
		mutex_enter(&arc_eviction_mtx);
	}
	mutex_exit(&arc_eviction_mtx);
}

/*
 * Flush all *evictable* data from the cache.
 * NOTE: this will not touch "active" (i.e. referenced) data.
 */
void
arc_flush(void)
{
	while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA]))
		(void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_DATA);
	while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA]))
		(void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_METADATA);
	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA]))
		(void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_DATA);
	while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA]))
		(void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_METADATA);

	arc_evict_ghost(arc_mru_ghost, -1);
	arc_evict_ghost(arc_mfu_ghost, -1);

	mutex_enter(&arc_reclaim_thr_lock);
	arc_do_user_evicts();
	mutex_exit(&arc_reclaim_thr_lock);
	ASSERT(arc_eviction_list == NULL);
}

int arc_shrink_shift = 5;		/* log2(fraction of arc to reclaim) */

void
arc_shrink(void)
{
	if (arc_c > arc_c_min) {
		uint64_t to_free;

#ifdef _KERNEL
		to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree));
#else
		to_free = arc_c >> arc_shrink_shift;
#endif
		if (arc_c > arc_c_min + to_free)
			atomic_add_64(&arc_c, -to_free);
		else
			arc_c = arc_c_min;

		atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
		if (arc_c > arc_size)
			arc_c = MAX(arc_size, arc_c_min);
		if (arc_p > arc_c)
			arc_p = (arc_c >> 1);
		ASSERT(arc_c >= arc_c_min);
		ASSERT((int64_t)arc_p >= 0);
	}

	if (arc_size > arc_c)
		arc_adjust();
}

static int
arc_reclaim_needed(void)
{
	uint64_t extra;

#ifdef _KERNEL

	if (needfree)
		return (1);

	/*
	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
	 */
	extra = desfree;

	/*
	 * check that we're out of range of the pageout scanner.  It starts to
	 * schedule paging if freemem is less than lotsfree and needfree.
	 * lotsfree is the high-water mark for pageout, and needfree is the
	 * number of needed free pages.  We add extra pages here to make sure
	 * the scanner doesn't start up while we're freeing memory.
	 */
	if (freemem < lotsfree + needfree + extra)
		return (1);

	/*
	 * check to make sure that swapfs has enough space so that anon
	 * reservations can still succeed.  anon_resvmem() checks that the
	 * availrmem is greater than swapfs_minfree, and the number of reserved
	 * swap pages.  We also add a bit of extra here just to prevent
	 * circumstances from getting really dire.
	 */
	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
		return (1);

#if defined(__i386)
	/*
	 * If we're on an i386 platform, it's possible that we'll exhaust the
	 * kernel heap space before we ever run out of available physical
	 * memory.  Most checks of the size of the heap_area compare against
	 * tune.t_minarmem, which is the minimum available real memory that we
	 * can have in the system.  However, this is generally fixed at 25 pages
	 * which is so low that it's useless.  In this comparison, we seek to
	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
	 * heap is allocated.  (Or, in the calculation, if less than 1/4th is
	 * free)
	 */
	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
		return (1);
#endif

#else
	if (spa_get_random(100) == 0)
		return (1);
#endif
	return (0);
}

static void
arc_kmem_reap_now(arc_reclaim_strategy_t strat)
{
	size_t			i;
	kmem_cache_t		*prev_cache = NULL;
	kmem_cache_t		*prev_data_cache = NULL;
	extern kmem_cache_t	*zio_buf_cache[];
	extern kmem_cache_t	*zio_data_buf_cache[];

#ifdef _KERNEL
	if (arc_meta_used >= arc_meta_limit) {
		/*
		 * We are exceeding our meta-data cache limit.
		 * Purge some DNLC entries to release holds on meta-data.
		 */
		dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
	}
#if defined(__i386)
	/*
	 * Reclaim unused memory from all kmem caches.
	 */
	kmem_reap();
#endif
#endif

	/*
	 * An aggressive reclamation will shrink the cache size as well as
	 * reap free buffers from the arc kmem caches.
	 */
	if (strat == ARC_RECLAIM_AGGR)
		arc_shrink();

	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
		if (zio_buf_cache[i] != prev_cache) {
			prev_cache = zio_buf_cache[i];
			kmem_cache_reap_now(zio_buf_cache[i]);
		}
		if (zio_data_buf_cache[i] != prev_data_cache) {
			prev_data_cache = zio_data_buf_cache[i];
			kmem_cache_reap_now(zio_data_buf_cache[i]);
		}
	}
	kmem_cache_reap_now(buf_cache);
	kmem_cache_reap_now(hdr_cache);
}

static void
arc_reclaim_thread(void)
{
	clock_t			growtime = 0;
	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
	callb_cpr_t		cpr;

	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&arc_reclaim_thr_lock);
	while (arc_thread_exit == 0) {
		if (arc_reclaim_needed()) {

			if (arc_no_grow) {
				if (last_reclaim == ARC_RECLAIM_CONS) {
					last_reclaim = ARC_RECLAIM_AGGR;
				} else {
					last_reclaim = ARC_RECLAIM_CONS;
				}
			} else {
				arc_no_grow = TRUE;
				last_reclaim = ARC_RECLAIM_AGGR;
				membar_producer();
			}

			/* reset the growth delay for every reclaim */
			growtime = lbolt + (arc_grow_retry * hz);

			arc_kmem_reap_now(last_reclaim);

		} else if (arc_no_grow && lbolt >= growtime) {
			arc_no_grow = FALSE;
		}

		if (2 * arc_c < arc_size +
		    arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size)
			arc_adjust();

		if (arc_eviction_list != NULL)
			arc_do_user_evicts();

		/* block until needed, or one second, whichever is shorter */
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&arc_reclaim_thr_cv,
		    &arc_reclaim_thr_lock, (lbolt + hz));
		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
	}

	arc_thread_exit = 0;
	cv_broadcast(&arc_reclaim_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
	thread_exit();
}

/*
 * Adapt arc info given the number of bytes we are trying to add and
 * the state that we are coming from.  This function is only called
 * when we are adding new content to the cache.
 */
static void
arc_adapt(int bytes, arc_state_t *state)
{
	int mult;

	ASSERT(bytes > 0);
	/*
	 * Adapt the target size of the MRU list:
	 *	- if we just hit in the MRU ghost list, then increase
	 *	  the target size of the MRU list.
	 *	- if we just hit in the MFU ghost list, then increase
	 *	  the target size of the MFU list by decreasing the
	 *	  target size of the MRU list.
	 */
	if (state == arc_mru_ghost) {
		mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ?
		    1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size));

		arc_p = MIN(arc_c, arc_p + bytes * mult);
	} else if (state == arc_mfu_ghost) {
		mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ?
		    1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size));

		arc_p = MAX(0, (int64_t)arc_p - bytes * mult);
	}
	ASSERT((int64_t)arc_p >= 0);

	if (arc_reclaim_needed()) {
		cv_signal(&arc_reclaim_thr_cv);
		return;
	}

	if (arc_no_grow)
		return;

	if (arc_c >= arc_c_max)
		return;

	/*
	 * If we're within (2 * maxblocksize) bytes of the target
	 * cache size, increment the target cache size
	 */
	if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) {
		atomic_add_64(&arc_c, (int64_t)bytes);
		if (arc_c > arc_c_max)
			arc_c = arc_c_max;
		else if (state == arc_anon)
			atomic_add_64(&arc_p, (int64_t)bytes);
		if (arc_p > arc_c)
			arc_p = arc_c;
	}
	ASSERT((int64_t)arc_p >= 0);
}
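
/*
 * Worked example of the ghost-hit adaptation above (all numbers are
 * hypothetical): suppose arc_c = 1GB, arc_p = 300MB, the MRU ghost list
 * holds 100MB and the MFU ghost list holds 400MB.  A 128KB hit in the MRU
 * ghost list gives mult = 400MB / 100MB = 4, so arc_p grows by
 * 4 * 128KB = 512KB (capped at arc_c).  A hit in the MFU ghost list with
 * the same sizes gives mult = 1 and shrinks arc_p by just 128KB.
 */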

/*
 * Check if the cache has reached its limits and eviction is required
 * prior to insert.
 */
static int
arc_evict_needed(arc_buf_contents_t type)
{
	if (type == ARC_BUFC_METADATA && arc_meta_used >= arc_meta_limit)
		return (1);

#ifdef _KERNEL
	/*
	 * If zio data pages are being allocated out of a separate heap segment,
	 * then enforce that the size of available vmem for this area remains
	 * above about 1/32nd free.
	 */
	if (type == ARC_BUFC_DATA && zio_arena != NULL &&
	    vmem_size(zio_arena, VMEM_FREE) <
	    (vmem_size(zio_arena, VMEM_ALLOC) >> 5))
		return (1);
#endif

	if (arc_reclaim_needed())
		return (1);

	return (arc_size > arc_c);
}

/*
 * The buffer, supplied as the first argument, needs a data block.
 * So, if we are at cache max, determine which cache should be victimized.
 * We have the following cases:
 *
 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) ->
 * In this situation if we're out of space, but the resident size of the MFU is
 * under the limit, victimize the MFU cache to satisfy this insertion request.
 *
 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) ->
 * Here, we've used up all of the available space for the MRU, so we need to
 * evict from our own cache instead.  Evict from the set of resident MRU
 * entries.
 *
 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) ->
 * c minus p represents the MFU space in the cache, since p is the size of the
 * cache that is dedicated to the MRU.  In this situation there's still space on
 * the MFU side, so the MRU side needs to be victimized.
 *
 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) ->
 * MFU's resident set is consuming more space than it has been allotted.  In
 * this situation, we must victimize our own cache, the MFU, for this insertion.
 */
static void
arc_get_data_buf(arc_buf_t *buf)
{
	arc_state_t		*state = buf->b_hdr->b_state;
	uint64_t		size = buf->b_hdr->b_size;
	arc_buf_contents_t	type = buf->b_hdr->b_type;

	arc_adapt(size, state);

	/*
	 * We have not yet reached cache maximum size,
	 * just allocate a new buffer.
	 */
	if (!arc_evict_needed(type)) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
			arc_space_consume(size);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
			atomic_add_64(&arc_size, size);
		}
		goto out;
	}

	/*
	 * If we are prefetching from the mfu ghost list, this buffer
	 * will end up on the mru list; so steal space from there.
	 */
	if (state == arc_mfu_ghost)
		state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu;
	else if (state == arc_mru_ghost)
		state = arc_mru;

	if (state == arc_mru || state == arc_anon) {
		uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size;
		state = (arc_mfu->arcs_lsize[type] > 0 &&
		    arc_p > mru_used) ? arc_mfu : arc_mru;
	} else {
		/* MFU cases */
		uint64_t mfu_space = arc_c - arc_p;
		state = (arc_mru->arcs_lsize[type] > 0 &&
		    mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu;
	}
	if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) {
		if (type == ARC_BUFC_METADATA) {
			buf->b_data = zio_buf_alloc(size);
			arc_space_consume(size);
		} else {
			ASSERT(type == ARC_BUFC_DATA);
			buf->b_data = zio_data_buf_alloc(size);
			atomic_add_64(&arc_size, size);
		}
		ARCSTAT_BUMP(arcstat_recycle_miss);
	}
	ASSERT(buf->b_data != NULL);
out:
	/*
	 * Update the state size.  Note that ghost states have a
	 * "ghost size" and so don't need to be updated.
	 */
	if (!GHOST_STATE(buf->b_hdr->b_state)) {
		arc_buf_hdr_t *hdr = buf->b_hdr;

		atomic_add_64(&hdr->b_state->arcs_size, size);
		if (list_link_active(&hdr->b_arc_node)) {
			ASSERT(refcount_is_zero(&hdr->b_refcnt));
			atomic_add_64(&hdr->b_state->arcs_lsize[type], size);
		}
		/*
		 * If we are growing the cache, and we are adding anonymous
		 * data, and we have outgrown arc_p, update arc_p
		 */
		if (arc_size < arc_c && hdr->b_state == arc_anon &&
		    arc_anon->arcs_size + arc_mru->arcs_size > arc_p)
			arc_p = MIN(arc_c, arc_p + size);
	}
}
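
/*
 * Worked example of the victimization cases above (hypothetical numbers):
 * with arc_c = 100 and arc_p = 60, an insert destined for the MRU side when
 * arc_anon + arc_mru = 40 falls under case 1 (p > 40), so the MFU list is
 * asked to give up space; once arc_anon + arc_mru reaches 60 or more we are
 * in case 2 and the MRU list victimizes itself.  Likewise an MFU insert
 * compares arc_mfu against c - p = 40 to choose between cases 3 and 4.
 * The state selection block near the top of arc_get_data_buf() encodes
 * exactly these comparisons.
 */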
1835 */ 1836 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1837 if (refcount_count(&buf->b_refcnt) == 0) { 1838 ASSERT(list_link_active(&buf->b_arc_node)); 1839 } else { 1840 buf->b_flags &= ~ARC_PREFETCH; 1841 ARCSTAT_BUMP(arcstat_mru_hits); 1842 } 1843 buf->b_arc_access = lbolt; 1844 return; 1845 } 1846 1847 /* 1848 * This buffer has been "accessed" only once so far, 1849 * but it is still in the cache. Move it to the MFU 1850 * state. 1851 */ 1852 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1853 /* 1854 * More than 125ms have passed since we 1855 * instantiated this buffer. Move it to the 1856 * most frequently used state. 1857 */ 1858 buf->b_arc_access = lbolt; 1859 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1860 arc_change_state(arc_mfu, buf, hash_lock); 1861 } 1862 ARCSTAT_BUMP(arcstat_mru_hits); 1863 } else if (buf->b_state == arc_mru_ghost) { 1864 arc_state_t *new_state; 1865 /* 1866 * This buffer has been "accessed" recently, but 1867 * was evicted from the cache. Move it to the 1868 * MFU state. 1869 */ 1870 1871 if (buf->b_flags & ARC_PREFETCH) { 1872 new_state = arc_mru; 1873 if (refcount_count(&buf->b_refcnt) > 0) 1874 buf->b_flags &= ~ARC_PREFETCH; 1875 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1876 } else { 1877 new_state = arc_mfu; 1878 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1879 } 1880 1881 buf->b_arc_access = lbolt; 1882 arc_change_state(new_state, buf, hash_lock); 1883 1884 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 1885 } else if (buf->b_state == arc_mfu) { 1886 /* 1887 * This buffer has been accessed more than once and is 1888 * still in the cache. Keep it in the MFU state. 1889 * 1890 * NOTE: an add_reference() that occurred when we did 1891 * the arc_read() will have kicked this off the list. 1892 * If it was a prefetch, we will explicitly move it to 1893 * the head of the list now. 1894 */ 1895 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1896 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1897 ASSERT(list_link_active(&buf->b_arc_node)); 1898 } 1899 ARCSTAT_BUMP(arcstat_mfu_hits); 1900 buf->b_arc_access = lbolt; 1901 } else if (buf->b_state == arc_mfu_ghost) { 1902 arc_state_t *new_state = arc_mfu; 1903 /* 1904 * This buffer has been accessed more than once but has 1905 * been evicted from the cache. Move it back to the 1906 * MFU state. 1907 */ 1908 1909 if (buf->b_flags & ARC_PREFETCH) { 1910 /* 1911 * This is a prefetch access... 1912 * move this block back to the MRU state. 
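			 * (A prefetch does not count as a genuine second
			 * access, so the block is not promoted to the MFU
			 * side here.)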
1913 */ 1914 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1915 new_state = arc_mru; 1916 } 1917 1918 buf->b_arc_access = lbolt; 1919 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1920 arc_change_state(new_state, buf, hash_lock); 1921 1922 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1923 } else { 1924 ASSERT(!"invalid arc state"); 1925 } 1926 } 1927 1928 /* a generic arc_done_func_t which you can use */ 1929 /* ARGSUSED */ 1930 void 1931 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1932 { 1933 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1934 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1935 } 1936 1937 /* a generic arc_done_func_t */ 1938 void 1939 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1940 { 1941 arc_buf_t **bufp = arg; 1942 if (zio && zio->io_error) { 1943 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1944 *bufp = NULL; 1945 } else { 1946 *bufp = buf; 1947 } 1948 } 1949 1950 static void 1951 arc_read_done(zio_t *zio) 1952 { 1953 arc_buf_hdr_t *hdr, *found; 1954 arc_buf_t *buf; 1955 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1956 kmutex_t *hash_lock; 1957 arc_callback_t *callback_list, *acb; 1958 int freeable = FALSE; 1959 1960 buf = zio->io_private; 1961 hdr = buf->b_hdr; 1962 1963 /* 1964 * The hdr was inserted into hash-table and removed from lists 1965 * prior to starting I/O. We should find this header, since 1966 * it's in the hash table, and it should be legit since it's 1967 * not possible to evict it during the I/O. The only possible 1968 * reason for it not to be found is if we were freed during the 1969 * read. 1970 */ 1971 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1972 &hash_lock); 1973 1974 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1975 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1976 1977 /* byteswap if necessary */ 1978 callback_list = hdr->b_acb; 1979 ASSERT(callback_list != NULL); 1980 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1981 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1982 1983 arc_cksum_compute(buf); 1984 1985 /* create copies of the data buffer for the callers */ 1986 abuf = buf; 1987 for (acb = callback_list; acb; acb = acb->acb_next) { 1988 if (acb->acb_done) { 1989 if (abuf == NULL) 1990 abuf = arc_buf_clone(buf); 1991 acb->acb_buf = abuf; 1992 abuf = NULL; 1993 } 1994 } 1995 hdr->b_acb = NULL; 1996 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1997 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1998 if (abuf == buf) 1999 hdr->b_flags |= ARC_BUF_AVAILABLE; 2000 2001 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 2002 2003 if (zio->io_error != 0) { 2004 hdr->b_flags |= ARC_IO_ERROR; 2005 if (hdr->b_state != arc_anon) 2006 arc_change_state(arc_anon, hdr, hash_lock); 2007 if (HDR_IN_HASH_TABLE(hdr)) 2008 buf_hash_remove(hdr); 2009 freeable = refcount_is_zero(&hdr->b_refcnt); 2010 /* convert checksum errors into IO errors */ 2011 if (zio->io_error == ECKSUM) 2012 zio->io_error = EIO; 2013 } 2014 2015 /* 2016 * Broadcast before we drop the hash_lock to avoid the possibility 2017 * that the hdr (and hence the cv) might be freed before we get to 2018 * the cv_broadcast(). 2019 */ 2020 cv_broadcast(&hdr->b_cv); 2021 2022 if (hash_lock) { 2023 /* 2024 * Only call arc_access on anonymous buffers. This is because 2025 * if we've issued an I/O for an evicted buffer, we've already 2026 * called arc_access (to prevent any simultaneous readers from 2027 * getting confused). 
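		 * (For the evicted case, arc_access() is called from
		 * arc_read() just before the I/O is issued, when the ghost
		 * header is migrated back to a present state.)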
2028 		 */
2029 		if (zio->io_error == 0 && hdr->b_state == arc_anon)
2030 			arc_access(hdr, hash_lock);
2031 		mutex_exit(hash_lock);
2032 	} else {
2033 		/*
2034 		 * This block was freed while we waited for the read to
2035 		 * complete. It has been removed from the hash table and
2036 		 * moved to the anonymous state (so that it won't show up
2037 		 * in the cache).
2038 		 */
2039 		ASSERT3P(hdr->b_state, ==, arc_anon);
2040 		freeable = refcount_is_zero(&hdr->b_refcnt);
2041 	}
2042 
2043 	/* execute each callback and free its structure */
2044 	while ((acb = callback_list) != NULL) {
2045 		if (acb->acb_done)
2046 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
2047 
2048 		if (acb->acb_zio_dummy != NULL) {
2049 			acb->acb_zio_dummy->io_error = zio->io_error;
2050 			zio_nowait(acb->acb_zio_dummy);
2051 		}
2052 
2053 		callback_list = acb->acb_next;
2054 		kmem_free(acb, sizeof (arc_callback_t));
2055 	}
2056 
2057 	if (freeable)
2058 		arc_hdr_destroy(hdr);
2059 }
2060 
2061 /*
2062  * "Read" the block at the specified DVA (in bp) via the
2063  * cache. If the block is found in the cache, invoke the provided
2064  * callback immediately and return. Note that the `zio' parameter
2065  * in the callback will be NULL in this case, since no IO was
2066  * required. If the block is not in the cache pass the read request
2067  * on to the spa with a substitute callback function, so that the
2068  * requested block will be added to the cache.
2069  *
2070  * If a read request arrives for a block that has a read in-progress,
2071  * either wait for the in-progress read to complete (and return the
2072  * results); or, if this is a read with a "done" func, add a record
2073  * to the read to invoke the "done" func when the read completes,
2074  * and return; or just return.
2075  *
2076  * arc_read_done() will invoke all the requested "done" functions
2077  * for readers of this block.
2078  */
2079 int
2080 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
2081     arc_done_func_t *done, void *private, int priority, int flags,
2082     uint32_t *arc_flags, zbookmark_t *zb)
2083 {
2084 	arc_buf_hdr_t *hdr;
2085 	arc_buf_t *buf;
2086 	kmutex_t *hash_lock;
2087 	zio_t *rzio;
2088 
2089 top:
2090 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
2091 	if (hdr && hdr->b_datacnt > 0) {
2092 
2093 		*arc_flags |= ARC_CACHED;
2094 
2095 		if (HDR_IO_IN_PROGRESS(hdr)) {
2096 
2097 			if (*arc_flags & ARC_WAIT) {
2098 				cv_wait(&hdr->b_cv, hash_lock);
2099 				mutex_exit(hash_lock);
2100 				goto top;
2101 			}
2102 			ASSERT(*arc_flags & ARC_NOWAIT);
2103 
2104 			if (done) {
2105 				arc_callback_t *acb = NULL;
2106 
2107 				acb = kmem_zalloc(sizeof (arc_callback_t),
2108 				    KM_SLEEP);
2109 				acb->acb_done = done;
2110 				acb->acb_private = private;
2111 				acb->acb_byteswap = swap;
2112 				if (pio != NULL)
2113 					acb->acb_zio_dummy = zio_null(pio,
2114 					    spa, NULL, NULL, flags);
2115 
2116 				ASSERT(acb->acb_done != NULL);
2117 				acb->acb_next = hdr->b_acb;
2118 				hdr->b_acb = acb;
2119 				add_reference(hdr, hash_lock, private);
2120 				mutex_exit(hash_lock);
2121 				return (0);
2122 			}
2123 			mutex_exit(hash_lock);
2124 			return (0);
2125 		}
2126 
2127 		ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2128 
2129 		if (done) {
2130 			add_reference(hdr, hash_lock, private);
2131 			/*
2132 			 * If this block is already in use, create a new
2133 			 * copy of the data so that we will be guaranteed
2134 			 * that arc_release() will always succeed.
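			 * (arc_buf_clone() hands this caller its own copy,
			 * so a later arc_release() need not worry about
			 * other holders of the same header.)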
2135 */ 2136 buf = hdr->b_buf; 2137 ASSERT(buf); 2138 ASSERT(buf->b_data); 2139 if (HDR_BUF_AVAILABLE(hdr)) { 2140 ASSERT(buf->b_efunc == NULL); 2141 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2142 } else { 2143 buf = arc_buf_clone(buf); 2144 } 2145 } else if (*arc_flags & ARC_PREFETCH && 2146 refcount_count(&hdr->b_refcnt) == 0) { 2147 hdr->b_flags |= ARC_PREFETCH; 2148 } 2149 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2150 arc_access(hdr, hash_lock); 2151 mutex_exit(hash_lock); 2152 ARCSTAT_BUMP(arcstat_hits); 2153 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2154 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2155 data, metadata, hits); 2156 2157 if (done) 2158 done(NULL, buf, private); 2159 } else { 2160 uint64_t size = BP_GET_LSIZE(bp); 2161 arc_callback_t *acb; 2162 2163 if (hdr == NULL) { 2164 /* this block is not in the cache */ 2165 arc_buf_hdr_t *exists; 2166 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2167 buf = arc_buf_alloc(spa, size, private, type); 2168 hdr = buf->b_hdr; 2169 hdr->b_dva = *BP_IDENTITY(bp); 2170 hdr->b_birth = bp->blk_birth; 2171 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2172 exists = buf_hash_insert(hdr, &hash_lock); 2173 if (exists) { 2174 /* somebody beat us to the hash insert */ 2175 mutex_exit(hash_lock); 2176 bzero(&hdr->b_dva, sizeof (dva_t)); 2177 hdr->b_birth = 0; 2178 hdr->b_cksum0 = 0; 2179 (void) arc_buf_remove_ref(buf, private); 2180 goto top; /* restart the IO request */ 2181 } 2182 /* if this is a prefetch, we don't have a reference */ 2183 if (*arc_flags & ARC_PREFETCH) { 2184 (void) remove_reference(hdr, hash_lock, 2185 private); 2186 hdr->b_flags |= ARC_PREFETCH; 2187 } 2188 if (BP_GET_LEVEL(bp) > 0) 2189 hdr->b_flags |= ARC_INDIRECT; 2190 } else { 2191 /* this block is in the ghost cache */ 2192 ASSERT(GHOST_STATE(hdr->b_state)); 2193 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2194 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2195 ASSERT(hdr->b_buf == NULL); 2196 2197 /* if this is a prefetch, we don't have a reference */ 2198 if (*arc_flags & ARC_PREFETCH) 2199 hdr->b_flags |= ARC_PREFETCH; 2200 else 2201 add_reference(hdr, hash_lock, private); 2202 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 2203 buf->b_hdr = hdr; 2204 buf->b_data = NULL; 2205 buf->b_efunc = NULL; 2206 buf->b_private = NULL; 2207 buf->b_next = NULL; 2208 hdr->b_buf = buf; 2209 arc_get_data_buf(buf); 2210 ASSERT(hdr->b_datacnt == 0); 2211 hdr->b_datacnt = 1; 2212 2213 } 2214 2215 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2216 acb->acb_done = done; 2217 acb->acb_private = private; 2218 acb->acb_byteswap = swap; 2219 2220 ASSERT(hdr->b_acb == NULL); 2221 hdr->b_acb = acb; 2222 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2223 2224 /* 2225 * If the buffer has been evicted, migrate it to a present state 2226 * before issuing the I/O. Once we drop the hash-table lock, 2227 * the header will be marked as I/O in progress and have an 2228 * attached buffer. At this point, anybody who finds this 2229 * buffer ought to notice that it's legit but has a pending I/O. 
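		 * (A concurrent arc_read() for the same block will then see
		 * HDR_IO_IN_PROGRESS and either cv_wait() on b_cv or attach
		 * its own callback, as handled at the top of this function.)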
2230 */ 2231 2232 if (GHOST_STATE(hdr->b_state)) 2233 arc_access(hdr, hash_lock); 2234 mutex_exit(hash_lock); 2235 2236 ASSERT3U(hdr->b_size, ==, size); 2237 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2238 zbookmark_t *, zb); 2239 ARCSTAT_BUMP(arcstat_misses); 2240 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2241 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2242 data, metadata, misses); 2243 2244 rzio = zio_read(pio, spa, bp, buf->b_data, size, 2245 arc_read_done, buf, priority, flags, zb); 2246 2247 if (*arc_flags & ARC_WAIT) 2248 return (zio_wait(rzio)); 2249 2250 ASSERT(*arc_flags & ARC_NOWAIT); 2251 zio_nowait(rzio); 2252 } 2253 return (0); 2254 } 2255 2256 /* 2257 * arc_read() variant to support pool traversal. If the block is already 2258 * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 2259 * The idea is that we don't want pool traversal filling up memory, but 2260 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2261 */ 2262 int 2263 arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2264 { 2265 arc_buf_hdr_t *hdr; 2266 kmutex_t *hash_mtx; 2267 int rc = 0; 2268 2269 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2270 2271 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2272 arc_buf_t *buf = hdr->b_buf; 2273 2274 ASSERT(buf); 2275 while (buf->b_data == NULL) { 2276 buf = buf->b_next; 2277 ASSERT(buf); 2278 } 2279 bcopy(buf->b_data, data, hdr->b_size); 2280 } else { 2281 rc = ENOENT; 2282 } 2283 2284 if (hash_mtx) 2285 mutex_exit(hash_mtx); 2286 2287 return (rc); 2288 } 2289 2290 void 2291 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2292 { 2293 ASSERT(buf->b_hdr != NULL); 2294 ASSERT(buf->b_hdr->b_state != arc_anon); 2295 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2296 buf->b_efunc = func; 2297 buf->b_private = private; 2298 } 2299 2300 /* 2301 * This is used by the DMU to let the ARC know that a buffer is 2302 * being evicted, so the ARC should clean up. If this arc buf 2303 * is not yet in the evicted state, it will be put there. 2304 */ 2305 int 2306 arc_buf_evict(arc_buf_t *buf) 2307 { 2308 arc_buf_hdr_t *hdr; 2309 kmutex_t *hash_lock; 2310 arc_buf_t **bufp; 2311 2312 mutex_enter(&arc_eviction_mtx); 2313 hdr = buf->b_hdr; 2314 if (hdr == NULL) { 2315 /* 2316 * We are in arc_do_user_evicts(). 2317 */ 2318 ASSERT(buf->b_data == NULL); 2319 mutex_exit(&arc_eviction_mtx); 2320 return (0); 2321 } 2322 hash_lock = HDR_LOCK(hdr); 2323 mutex_exit(&arc_eviction_mtx); 2324 2325 mutex_enter(hash_lock); 2326 2327 if (buf->b_data == NULL) { 2328 /* 2329 * We are on the eviction list. 2330 */ 2331 mutex_exit(hash_lock); 2332 mutex_enter(&arc_eviction_mtx); 2333 if (buf->b_hdr == NULL) { 2334 /* 2335 * We are already in arc_do_user_evicts(). 2336 */ 2337 mutex_exit(&arc_eviction_mtx); 2338 return (0); 2339 } else { 2340 arc_buf_t copy = *buf; /* structure assignment */ 2341 /* 2342 * Process this buffer now 2343 * but let arc_do_user_evicts() do the reaping. 
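			 * (A stack copy of the arc_buf_t is used because the
			 * original may be reclaimed by arc_do_user_evicts()
			 * as soon as arc_eviction_mtx is dropped.)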
2344 			 */
2345 			buf->b_efunc = NULL;
2346 			mutex_exit(&arc_eviction_mtx);
2347 			VERIFY(copy.b_efunc(&copy) == 0);
2348 			return (1);
2349 		}
2350 	}
2351 
2352 	ASSERT(buf->b_hdr == hdr);
2353 	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
2354 	ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
2355 
2356 	/*
2357 	 * Pull this buffer off of the hdr
2358 	 */
2359 	bufp = &hdr->b_buf;
2360 	while (*bufp != buf)
2361 		bufp = &(*bufp)->b_next;
2362 	*bufp = buf->b_next;
2363 
2364 	ASSERT(buf->b_data != NULL);
2365 	arc_buf_destroy(buf, FALSE, FALSE);
2366 
2367 	if (hdr->b_datacnt == 0) {
2368 		arc_state_t *old_state = hdr->b_state;
2369 		arc_state_t *evicted_state;
2370 
2371 		ASSERT(refcount_is_zero(&hdr->b_refcnt));
2372 
2373 		evicted_state =
2374 		    (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
2375 
2376 		mutex_enter(&old_state->arcs_mtx);
2377 		mutex_enter(&evicted_state->arcs_mtx);
2378 
2379 		arc_change_state(evicted_state, hdr, hash_lock);
2380 		ASSERT(HDR_IN_HASH_TABLE(hdr));
2381 		hdr->b_flags = ARC_IN_HASH_TABLE;
2382 
2383 		mutex_exit(&evicted_state->arcs_mtx);
2384 		mutex_exit(&old_state->arcs_mtx);
2385 	}
2386 	mutex_exit(hash_lock);
2387 
2388 	VERIFY(buf->b_efunc(buf) == 0);
2389 	buf->b_efunc = NULL;
2390 	buf->b_private = NULL;
2391 	buf->b_hdr = NULL;
2392 	kmem_cache_free(buf_cache, buf);
2393 	return (1);
2394 }
2395 
2396 /*
2397  * Release this buffer from the cache. This must be done
2398  * after a read and prior to modifying the buffer contents.
2399  * If the buffer has more than one reference, we must make
2400  * a new hdr for the buffer.
2401  */
2402 void
2403 arc_release(arc_buf_t *buf, void *tag)
2404 {
2405 	arc_buf_hdr_t *hdr = buf->b_hdr;
2406 	kmutex_t *hash_lock = HDR_LOCK(hdr);
2407 
2408 	/* this buffer is not on any list */
2409 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
2410 
2411 	if (hdr->b_state == arc_anon) {
2412 		/* this buffer is already released */
2413 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
2414 		ASSERT(BUF_EMPTY(hdr));
2415 		ASSERT(buf->b_efunc == NULL);
2416 		arc_buf_thaw(buf);
2417 		return;
2418 	}
2419 
2420 	mutex_enter(hash_lock);
2421 
2422 	/*
2423 	 * Do we have more than one buf?
2424 	 */
2425 	if (hdr->b_buf != buf || buf->b_next != NULL) {
2426 		arc_buf_hdr_t *nhdr;
2427 		arc_buf_t **bufp;
2428 		uint64_t blksz = hdr->b_size;
2429 		spa_t *spa = hdr->b_spa;
2430 		arc_buf_contents_t type = hdr->b_type;
2431 
2432 		ASSERT(hdr->b_datacnt > 1);
2433 		/*
2434 		 * Pull the data off of this buf and attach it to
2435 		 * a new anonymous buf.
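		 * (Any remaining bufs stay with the original cached hdr;
		 * only this buf moves to the fresh anonymous hdr set up
		 * below.)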
2436 */ 2437 (void) remove_reference(hdr, hash_lock, tag); 2438 bufp = &hdr->b_buf; 2439 while (*bufp != buf) 2440 bufp = &(*bufp)->b_next; 2441 *bufp = (*bufp)->b_next; 2442 buf->b_next = NULL; 2443 2444 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 2445 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2446 if (refcount_is_zero(&hdr->b_refcnt)) { 2447 uint64_t *size = &hdr->b_state->arcs_lsize[hdr->b_type]; 2448 ASSERT3U(*size, >=, hdr->b_size); 2449 atomic_add_64(size, -hdr->b_size); 2450 } 2451 hdr->b_datacnt -= 1; 2452 arc_cksum_verify(buf); 2453 2454 mutex_exit(hash_lock); 2455 2456 nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 2457 nhdr->b_size = blksz; 2458 nhdr->b_spa = spa; 2459 nhdr->b_type = type; 2460 nhdr->b_buf = buf; 2461 nhdr->b_state = arc_anon; 2462 nhdr->b_arc_access = 0; 2463 nhdr->b_flags = 0; 2464 nhdr->b_datacnt = 1; 2465 nhdr->b_freeze_cksum = NULL; 2466 (void) refcount_add(&nhdr->b_refcnt, tag); 2467 buf->b_hdr = nhdr; 2468 atomic_add_64(&arc_anon->arcs_size, blksz); 2469 2470 hdr = nhdr; 2471 } else { 2472 ASSERT(refcount_count(&hdr->b_refcnt) == 1); 2473 ASSERT(!list_link_active(&hdr->b_arc_node)); 2474 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2475 arc_change_state(arc_anon, hdr, hash_lock); 2476 hdr->b_arc_access = 0; 2477 mutex_exit(hash_lock); 2478 bzero(&hdr->b_dva, sizeof (dva_t)); 2479 hdr->b_birth = 0; 2480 hdr->b_cksum0 = 0; 2481 arc_buf_thaw(buf); 2482 } 2483 buf->b_efunc = NULL; 2484 buf->b_private = NULL; 2485 } 2486 2487 int 2488 arc_released(arc_buf_t *buf) 2489 { 2490 return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2491 } 2492 2493 int 2494 arc_has_callback(arc_buf_t *buf) 2495 { 2496 return (buf->b_efunc != NULL); 2497 } 2498 2499 #ifdef ZFS_DEBUG 2500 int 2501 arc_referenced(arc_buf_t *buf) 2502 { 2503 return (refcount_count(&buf->b_hdr->b_refcnt)); 2504 } 2505 #endif 2506 2507 static void 2508 arc_write_ready(zio_t *zio) 2509 { 2510 arc_write_callback_t *callback = zio->io_private; 2511 arc_buf_t *buf = callback->awcb_buf; 2512 2513 if (callback->awcb_ready) { 2514 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2515 callback->awcb_ready(zio, buf, callback->awcb_private); 2516 } 2517 arc_cksum_compute(buf); 2518 } 2519 2520 static void 2521 arc_write_done(zio_t *zio) 2522 { 2523 arc_write_callback_t *callback = zio->io_private; 2524 arc_buf_t *buf = callback->awcb_buf; 2525 arc_buf_hdr_t *hdr = buf->b_hdr; 2526 2527 hdr->b_acb = NULL; 2528 2529 /* this buffer is on no lists and is not in the hash table */ 2530 ASSERT3P(hdr->b_state, ==, arc_anon); 2531 2532 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 2533 hdr->b_birth = zio->io_bp->blk_birth; 2534 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 2535 /* 2536 * If the block to be written was all-zero, we may have 2537 * compressed it away. In this case no write was performed 2538 * so there will be no dva/birth-date/checksum. The buffer 2539 * must therefor remain anonymous (and uncached). 2540 */ 2541 if (!BUF_EMPTY(hdr)) { 2542 arc_buf_hdr_t *exists; 2543 kmutex_t *hash_lock; 2544 2545 arc_cksum_verify(buf); 2546 2547 exists = buf_hash_insert(hdr, &hash_lock); 2548 if (exists) { 2549 /* 2550 * This can only happen if we overwrite for 2551 * sync-to-convergence, because we remove 2552 * buffers from the hash table when we arc_free(). 
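			 * (The stale header is moved to the anonymous state
			 * and destroyed below, then the insert is retried;
			 * the second buf_hash_insert() must succeed.)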
2553 */ 2554 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2555 BP_IDENTITY(zio->io_bp))); 2556 ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2557 zio->io_bp->blk_birth); 2558 2559 ASSERT(refcount_is_zero(&exists->b_refcnt)); 2560 arc_change_state(arc_anon, exists, hash_lock); 2561 mutex_exit(hash_lock); 2562 arc_hdr_destroy(exists); 2563 exists = buf_hash_insert(hdr, &hash_lock); 2564 ASSERT3P(exists, ==, NULL); 2565 } 2566 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2567 arc_access(hdr, hash_lock); 2568 mutex_exit(hash_lock); 2569 } else if (callback->awcb_done == NULL) { 2570 int destroy_hdr; 2571 /* 2572 * This is an anonymous buffer with no user callback, 2573 * destroy it if there are no active references. 2574 */ 2575 mutex_enter(&arc_eviction_mtx); 2576 destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 2577 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2578 mutex_exit(&arc_eviction_mtx); 2579 if (destroy_hdr) 2580 arc_hdr_destroy(hdr); 2581 } else { 2582 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2583 } 2584 2585 if (callback->awcb_done) { 2586 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 2587 callback->awcb_done(zio, buf, callback->awcb_private); 2588 } 2589 2590 kmem_free(callback, sizeof (arc_write_callback_t)); 2591 } 2592 2593 zio_t * 2594 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2595 uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 2596 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 2597 int flags, zbookmark_t *zb) 2598 { 2599 arc_buf_hdr_t *hdr = buf->b_hdr; 2600 arc_write_callback_t *callback; 2601 zio_t *zio; 2602 2603 /* this is a private buffer - no locking required */ 2604 ASSERT3P(hdr->b_state, ==, arc_anon); 2605 ASSERT(BUF_EMPTY(hdr)); 2606 ASSERT(!HDR_IO_ERROR(hdr)); 2607 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 2608 ASSERT(hdr->b_acb == 0); 2609 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 2610 callback->awcb_ready = ready; 2611 callback->awcb_done = done; 2612 callback->awcb_private = private; 2613 callback->awcb_buf = buf; 2614 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2615 zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, 2616 buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 2617 priority, flags, zb); 2618 2619 return (zio); 2620 } 2621 2622 int 2623 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2624 zio_done_func_t *done, void *private, uint32_t arc_flags) 2625 { 2626 arc_buf_hdr_t *ab; 2627 kmutex_t *hash_lock; 2628 zio_t *zio; 2629 2630 /* 2631 * If this buffer is in the cache, release it, so it 2632 * can be re-used. 2633 */ 2634 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2635 if (ab != NULL) { 2636 /* 2637 * The checksum of blocks to free is not always 2638 * preserved (eg. on the deadlist). However, if it is 2639 * nonzero, it should match what we have in the cache. 2640 */ 2641 ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2642 ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 2643 if (ab->b_state != arc_anon) 2644 arc_change_state(arc_anon, ab, hash_lock); 2645 if (HDR_IO_IN_PROGRESS(ab)) { 2646 /* 2647 * This should only happen when we prefetch. 
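			 * (Presumably a demand read would still hold a
			 * reference on a block being freed; an in-flight
			 * prefetch holds none, so it is simply marked
			 * ARC_FREED_IN_READ and left for arc_read_done()
			 * to clean up.)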
2648 */ 2649 ASSERT(ab->b_flags & ARC_PREFETCH); 2650 ASSERT3U(ab->b_datacnt, ==, 1); 2651 ab->b_flags |= ARC_FREED_IN_READ; 2652 if (HDR_IN_HASH_TABLE(ab)) 2653 buf_hash_remove(ab); 2654 ab->b_arc_access = 0; 2655 bzero(&ab->b_dva, sizeof (dva_t)); 2656 ab->b_birth = 0; 2657 ab->b_cksum0 = 0; 2658 ab->b_buf->b_efunc = NULL; 2659 ab->b_buf->b_private = NULL; 2660 mutex_exit(hash_lock); 2661 } else if (refcount_is_zero(&ab->b_refcnt)) { 2662 mutex_exit(hash_lock); 2663 arc_hdr_destroy(ab); 2664 ARCSTAT_BUMP(arcstat_deleted); 2665 } else { 2666 /* 2667 * We still have an active reference on this 2668 * buffer. This can happen, e.g., from 2669 * dbuf_unoverride(). 2670 */ 2671 ASSERT(!HDR_IN_HASH_TABLE(ab)); 2672 ab->b_arc_access = 0; 2673 bzero(&ab->b_dva, sizeof (dva_t)); 2674 ab->b_birth = 0; 2675 ab->b_cksum0 = 0; 2676 ab->b_buf->b_efunc = NULL; 2677 ab->b_buf->b_private = NULL; 2678 mutex_exit(hash_lock); 2679 } 2680 } 2681 2682 zio = zio_free(pio, spa, txg, bp, done, private); 2683 2684 if (arc_flags & ARC_WAIT) 2685 return (zio_wait(zio)); 2686 2687 ASSERT(arc_flags & ARC_NOWAIT); 2688 zio_nowait(zio); 2689 2690 return (0); 2691 } 2692 2693 void 2694 arc_tempreserve_clear(uint64_t tempreserve) 2695 { 2696 atomic_add_64(&arc_tempreserve, -tempreserve); 2697 ASSERT((int64_t)arc_tempreserve >= 0); 2698 } 2699 2700 int 2701 arc_tempreserve_space(uint64_t tempreserve) 2702 { 2703 #ifdef ZFS_DEBUG 2704 /* 2705 * Once in a while, fail for no reason. Everything should cope. 2706 */ 2707 if (spa_get_random(10000) == 0) { 2708 dprintf("forcing random failure\n"); 2709 return (ERESTART); 2710 } 2711 #endif 2712 if (tempreserve > arc_c/4 && !arc_no_grow) 2713 arc_c = MIN(arc_c_max, tempreserve * 4); 2714 if (tempreserve > arc_c) 2715 return (ENOMEM); 2716 2717 /* 2718 * Throttle writes when the amount of dirty data in the cache 2719 * gets too large. We try to keep the cache less than half full 2720 * of dirty blocks so that our sync times don't grow too large. 2721 * Note: if two requests come in concurrently, we might let them 2722 * both succeed, when one of them should fail. Not a huge deal. 2723 * 2724 * XXX The limit should be adjusted dynamically to keep the time 2725 * to sync a dataset fixed (around 1-5 seconds?). 2726 */ 2727 2728 if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 2729 arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { 2730 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 2731 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 2732 arc_tempreserve>>10, 2733 arc_anon->arcs_lsize[ARC_BUFC_METADATA]>>10, 2734 arc_anon->arcs_lsize[ARC_BUFC_DATA]>>10, 2735 tempreserve>>10, arc_c>>10); 2736 return (ERESTART); 2737 } 2738 atomic_add_64(&arc_tempreserve, tempreserve); 2739 return (0); 2740 } 2741 2742 void 2743 arc_init(void) 2744 { 2745 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2746 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2747 2748 /* Convert seconds to clock ticks */ 2749 arc_min_prefetch_lifespan = 1 * hz; 2750 2751 /* Start out with 1/8 of all memory */ 2752 arc_c = physmem * PAGESIZE / 8; 2753 2754 #ifdef _KERNEL 2755 /* 2756 * On architectures where the physical memory can be larger 2757 * than the addressable space (intel in 32-bit mode), we may 2758 * need to limit the cache to 1/8 of VM size. 
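	 * (vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) reports the total
	 * kernel heap size, so the MIN() below caps arc_c at 1/8 of the
	 * smaller of physical memory and kernel virtual memory.)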
2759 */ 2760 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2761 #endif 2762 2763 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2764 arc_c_min = MAX(arc_c / 4, 64<<20); 2765 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2766 if (arc_c * 8 >= 1<<30) 2767 arc_c_max = (arc_c * 8) - (1<<30); 2768 else 2769 arc_c_max = arc_c_min; 2770 arc_c_max = MAX(arc_c * 6, arc_c_max); 2771 2772 /* 2773 * Allow the tunables to override our calculations if they are 2774 * reasonable (ie. over 64MB) 2775 */ 2776 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 2777 arc_c_max = zfs_arc_max; 2778 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 2779 arc_c_min = zfs_arc_min; 2780 2781 arc_c = arc_c_max; 2782 arc_p = (arc_c >> 1); 2783 2784 /* limit meta-data to 1/4 of the arc capacity */ 2785 arc_meta_limit = arc_c_max / 4; 2786 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 2787 arc_c_min = arc_meta_limit / 2; 2788 2789 /* if kmem_flags are set, lets try to use less memory */ 2790 if (kmem_debugging()) 2791 arc_c = arc_c / 2; 2792 if (arc_c < arc_c_min) 2793 arc_c = arc_c_min; 2794 2795 arc_anon = &ARC_anon; 2796 arc_mru = &ARC_mru; 2797 arc_mru_ghost = &ARC_mru_ghost; 2798 arc_mfu = &ARC_mfu; 2799 arc_mfu_ghost = &ARC_mfu_ghost; 2800 arc_size = 0; 2801 2802 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2803 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2804 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2805 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2806 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2807 2808 list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], 2809 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2810 list_create(&arc_mru->arcs_list[ARC_BUFC_DATA], 2811 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2812 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA], 2813 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2814 list_create(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA], 2815 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2816 list_create(&arc_mfu->arcs_list[ARC_BUFC_METADATA], 2817 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2818 list_create(&arc_mfu->arcs_list[ARC_BUFC_DATA], 2819 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2820 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA], 2821 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2822 list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], 2823 sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); 2824 2825 buf_init(); 2826 2827 arc_thread_exit = 0; 2828 arc_eviction_list = NULL; 2829 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2830 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2831 2832 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 2833 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 2834 2835 if (arc_ksp != NULL) { 2836 arc_ksp->ks_data = &arc_stats; 2837 kstat_install(arc_ksp); 2838 } 2839 2840 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2841 TS_RUN, minclsyspri); 2842 2843 arc_dead = FALSE; 2844 } 2845 2846 void 2847 arc_fini(void) 2848 { 2849 mutex_enter(&arc_reclaim_thr_lock); 2850 arc_thread_exit = 1; 2851 while (arc_thread_exit != 0) 2852 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2853 mutex_exit(&arc_reclaim_thr_lock); 2854 2855 arc_flush(); 2856 
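	/*
	 * arc_flush() should have evicted everything, so the lists torn
	 * down below are expected to be empty.
	 */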
2857 arc_dead = TRUE; 2858 2859 if (arc_ksp != NULL) { 2860 kstat_delete(arc_ksp); 2861 arc_ksp = NULL; 2862 } 2863 2864 mutex_destroy(&arc_eviction_mtx); 2865 mutex_destroy(&arc_reclaim_thr_lock); 2866 cv_destroy(&arc_reclaim_thr_cv); 2867 2868 list_destroy(&arc_mru->arcs_list[ARC_BUFC_METADATA]); 2869 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 2870 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_METADATA]); 2871 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 2872 list_destroy(&arc_mru->arcs_list[ARC_BUFC_DATA]); 2873 list_destroy(&arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 2874 list_destroy(&arc_mfu->arcs_list[ARC_BUFC_DATA]); 2875 list_destroy(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 2876 2877 mutex_destroy(&arc_anon->arcs_mtx); 2878 mutex_destroy(&arc_mru->arcs_mtx); 2879 mutex_destroy(&arc_mru_ghost->arcs_mtx); 2880 mutex_destroy(&arc_mfu->arcs_mtx); 2881 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 2882 2883 buf_fini(); 2884 } 2885