1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * DVA-based Adjustable Replacement Cache 30 * 31 * While much of the theory of operation used here is 32 * based on the self-tuning, low overhead replacement cache 33 * presented by Megiddo and Modha at FAST 2003, there are some 34 * significant differences: 35 * 36 * 1. The Megiddo and Modha model assumes any page is evictable. 37 * Pages in its cache cannot be "locked" into memory. This makes 38 * the eviction algorithm simple: evict the last page in the list. 39 * This also make the performance characteristics easy to reason 40 * about. Our cache is not so simple. At any given moment, some 41 * subset of the blocks in the cache are un-evictable because we 42 * have handed out a reference to them. Blocks are only evictable 43 * when there are no external references active. This makes 44 * eviction far more problematic: we choose to evict the evictable 45 * blocks that are the "lowest" in the list. 
 *
 *	There are times when it is not possible to evict the requested
 *	space.  In these circumstances we are unable to adjust the cache
 *	size.  To prevent the cache growing unbounded at these times we
 *	implement a "cache throttle" that slows the flow of new data
 *	into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 *	Pages are evicted when the cache is full and there is a cache
 *	miss.  Our model has a variable sized cache.  It grows with
 *	high use, but also tries to react to memory pressure from the
 *	operating system: decreasing its size when system memory is
 *	tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 *	elements of the cache are therefore exactly the same size.  So
 *	when adjusting the cache size following a cache miss, it's simply
 *	a matter of choosing a single page to evict.  In our model, we
 *	have variable sized cache blocks (ranging from 512 bytes to
 *	128K bytes).  We therefore choose a set of blocks to evict to make
 *	space for a cache miss that approximates as closely as possible
 *	the space used by the new block.
 *
 * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.  The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
87 * 88 * buf_hash_find() returns the appropriate mutex (held) when it 89 * locates the requested buffer in the hash table. It returns 90 * NULL for the mutex if the buffer was not in the table. 91 * 92 * buf_hash_remove() expects the appropriate hash mutex to be 93 * already held before it is invoked. 94 * 95 * Each arc state also has a mutex which is used to protect the 96 * buffer list associated with the state. When attempting to 97 * obtain a hash table lock while holding an arc list lock you 98 * must use: mutex_tryenter() to avoid deadlock. Also note that 99 * the active state mutex must be held before the ghost state mutex. 100 * 101 * Arc buffers may have an associated eviction callback function. 102 * This function will be invoked prior to removing the buffer (e.g. 103 * in arc_do_user_evicts()). Note however that the data associated 104 * with the buffer may be evicted prior to the callback. The callback 105 * must be made with *no locks held* (to prevent deadlock). Additionally, 106 * the users of callbacks must ensure that their private data is 107 * protected from simultaneous callbacks from arc_buf_evict() 108 * and arc_do_user_evicts(). 109 * 110 * Note that the majority of the performance stats are manipulated 111 * with atomic operations. 
*/

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>
#include <sys/kstat.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

/*
 * When non-zero, hdr_recl() no longer signals the reclaim thread.
 * NOTE(review): presumably set while the ARC is being torn down; the
 * assignment is not in this part of the file -- confirm.
 */
static int arc_dead;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to the buffer, they
 * are linked onto one of the lists in arc.  These are the
 * only buffers that can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.  These are buffers that hold dirty block copies
 * before they are written to stable storage.  By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed.
 * Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 */

typedef struct arc_state {
	list_t	arcs_list;	/* linked list of evictable buffer in state */
	uint64_t arcs_lsize;	/* total size of buffers in the linked list */
	uint64_t arcs_size;	/* total size of all buffers in this state */
	kmutex_t arcs_mtx;	/* protects arcs_list */
} arc_state_t;

/* The 5 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;

/*
 * Named kstat counters exported by the ARC.  The initializer of arc_stats
 * below is positional, so the order of the members here and the order of
 * the { name, type } pairs there must be kept in step.
 */
typedef struct arc_stats {
	kstat_named_t arcstat_hits;
	kstat_named_t arcstat_misses;
	kstat_named_t arcstat_demand_data_hits;
	kstat_named_t arcstat_demand_data_misses;
	kstat_named_t arcstat_demand_metadata_hits;
	kstat_named_t arcstat_demand_metadata_misses;
	kstat_named_t arcstat_prefetch_data_hits;
	kstat_named_t arcstat_prefetch_data_misses;
	kstat_named_t arcstat_prefetch_metadata_hits;
	kstat_named_t arcstat_prefetch_metadata_misses;
	kstat_named_t arcstat_mru_hits;
	kstat_named_t arcstat_mru_ghost_hits;
	kstat_named_t arcstat_mfu_hits;
	kstat_named_t arcstat_mfu_ghost_hits;
	kstat_named_t arcstat_deleted;
	kstat_named_t arcstat_recycle_miss;
	kstat_named_t arcstat_mutex_miss;
	kstat_named_t arcstat_evict_skip;
	kstat_named_t arcstat_hash_elements;
	kstat_named_t arcstat_hash_elements_max;
	kstat_named_t arcstat_hash_collisions;
	kstat_named_t arcstat_hash_chains;
	kstat_named_t arcstat_hash_chain_max;
	kstat_named_t arcstat_p;
	kstat_named_t arcstat_c;
	kstat_named_t arcstat_c_min;
	kstat_named_t arcstat_c_max;
	kstat_named_t arcstat_size;
} arc_stats_t;

/* One entry per arc_stats_t member, in declaration order. */
static arc_stats_t arc_stats = {
	{ "hits",			KSTAT_DATA_UINT64 },
	{ "misses",			KSTAT_DATA_UINT64 },
	{ "demand_data_hits",		KSTAT_DATA_UINT64 },
	{ "demand_data_misses",		KSTAT_DATA_UINT64 },
	{ "demand_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "demand_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_data_hits",		KSTAT_DATA_UINT64 },
	{ "prefetch_data_misses",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_hits",	KSTAT_DATA_UINT64 },
	{ "prefetch_metadata_misses",	KSTAT_DATA_UINT64 },
	{ "mru_hits",			KSTAT_DATA_UINT64 },
	{ "mru_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "mfu_hits",			KSTAT_DATA_UINT64 },
	{ "mfu_ghost_hits",		KSTAT_DATA_UINT64 },
	{ "deleted",			KSTAT_DATA_UINT64 },
	{ "recycle_miss",		KSTAT_DATA_UINT64 },
	{ "mutex_miss",			KSTAT_DATA_UINT64 },
	{ "evict_skip",			KSTAT_DATA_UINT64 },
	{ "hash_elements",		KSTAT_DATA_UINT64 },
	{ "hash_elements_max",		KSTAT_DATA_UINT64 },
	{ "hash_collisions",		KSTAT_DATA_UINT64 },
	{ "hash_chains",		KSTAT_DATA_UINT64 },
	{ "hash_chain_max",		KSTAT_DATA_UINT64 },
	{ "p",				KSTAT_DATA_UINT64 },
	{ "c",				KSTAT_DATA_UINT64 },
	{ "c_min",			KSTAT_DATA_UINT64 },
	{ "c_max",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 }
};

/* The 64-bit value of the named statistic, usable as an lvalue. */
#define	ARCSTAT(stat)	(arc_stats.stat.value.ui64)

/*
 * Atomically add `val' to the named statistic.
 * NOTE: the expansion already ends in a semicolon, so
 * "ARCSTAT_INCR(s, v);" yields an extra empty statement; take care when
 * using it (or ARCSTAT_BUMP/ARCSTAT_BUMPDOWN) as the unbraced body of an
 * if that has an else.
 */
#define	ARCSTAT_INCR(stat, val) \
	atomic_add_64(&arc_stats.stat.value.ui64, (val));

/* Atomically add or subtract one. */
#define	ARCSTAT_BUMP(stat) 	ARCSTAT_INCR(stat, 1)
#define	ARCSTAT_BUMPDOWN(stat)	ARCSTAT_INCR(stat, -1)

/*
 * Raise the named statistic to `val' if `val' is larger, retrying the
 * compare-and-swap until it either succeeds or the stored value is no
 * longer smaller than `val'.  `val' is evaluated more than once, and the
 * expansion is a bare { } block rather than a single statement.
 */
#define	ARCSTAT_MAX(stat, val) {					\
	uint64_t m;							\
	while ((val) > (m = arc_stats.stat.value.ui64) &&		\
	    (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val))))	\
		continue;						\
}

/* Raise <stat>_max to the current value of <stat>. */
#define	ARCSTAT_MAXSTAT(stat) \
	ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64)

/*
 * We define a macro to allow ARC hits/misses to be easily broken down by
 * two separate conditions, giving a total of four different subtypes for
 * each of hits and misses (so eight statistics total).
*/
/*
 * ARCSTAT_CONDSTAT bumps exactly one counter, whose name is pasted
 * together as arcstat_<A>_<B>_<stat>, where <A> is stat1 if cond1 is true
 * and notstat1 otherwise, and <B> is stat2 if cond2 is true and notstat2
 * otherwise.
 *
 * NOTE: the expansion is a bare if/else statement, not a do { } while (0)
 * block, so it must not be used as the unbraced body of another if.
 */
#define	ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \
	if (cond1) {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \
		}							\
	} else {							\
		if (cond2) {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \
		} else {						\
			ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\
		}							\
	}

kstat_t			*arc_ksp;

/*
 * Pointers through which the rest of the file refers to the five states.
 * NOTE(review): presumably set to the addresses of the ARC_* objects
 * during initialization; the assignment is not in this part of the file.
 */
static arc_state_t 	*arc_anon;
static arc_state_t	*arc_mru;
static arc_state_t	*arc_mru_ghost;
static arc_state_t	*arc_mfu;
static arc_state_t	*arc_mfu_ghost;

/*
 * There are several ARC variables that are critical to export as kstats --
 * but we don't want to have to grovel around in the kstat whenever we wish to
 * manipulate them.  For these variables, we therefore define them to be in
 * terms of the statistic variable.  This assures that we are not introducing
 * the possibility of inconsistency by having shadow copies of the variables,
 * while still allowing the code to be readable.
305 */ 306 #define arc_size ARCSTAT(arcstat_size) /* actual total arc size */ 307 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 308 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 309 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 310 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 311 312 static int arc_no_grow; /* Don't try to grow cache size */ 313 static uint64_t arc_tempreserve; 314 315 typedef struct arc_callback arc_callback_t; 316 317 struct arc_callback { 318 void *acb_private; 319 arc_done_func_t *acb_done; 320 arc_byteswap_func_t *acb_byteswap; 321 arc_buf_t *acb_buf; 322 zio_t *acb_zio_dummy; 323 arc_callback_t *acb_next; 324 }; 325 326 typedef struct arc_write_callback arc_write_callback_t; 327 328 struct arc_write_callback { 329 void *awcb_private; 330 arc_done_func_t *awcb_ready; 331 arc_done_func_t *awcb_done; 332 arc_buf_t *awcb_buf; 333 }; 334 335 struct arc_buf_hdr { 336 /* protected by hash lock */ 337 dva_t b_dva; 338 uint64_t b_birth; 339 uint64_t b_cksum0; 340 341 kmutex_t b_freeze_lock; 342 zio_cksum_t *b_freeze_cksum; 343 344 arc_buf_hdr_t *b_hash_next; 345 arc_buf_t *b_buf; 346 uint32_t b_flags; 347 uint32_t b_datacnt; 348 349 arc_callback_t *b_acb; 350 kcondvar_t b_cv; 351 352 /* immutable */ 353 arc_buf_contents_t b_type; 354 uint64_t b_size; 355 spa_t *b_spa; 356 357 /* protected by arc state mutex */ 358 arc_state_t *b_state; 359 list_node_t b_arc_node; 360 361 /* updated atomically */ 362 clock_t b_arc_access; 363 364 /* self protecting */ 365 refcount_t b_refcnt; 366 }; 367 368 static arc_buf_t *arc_eviction_list; 369 static kmutex_t arc_eviction_mtx; 370 static arc_buf_hdr_t arc_eviction_hdr; 371 static size_t arc_ziosize; 372 static void arc_get_data_buf(arc_buf_t *buf); 373 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 374 375 #define GHOST_STATE(state) \ 376 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) 377 378 /* 379 * Private ARC 
flags. These flags are private ARC only flags that will show up 380 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 381 * be passed in as arc_flags in things like arc_read. However, these flags 382 * should never be passed and should only be set by ARC code. When adding new 383 * public flags, make sure not to smash the private ones. 384 */ 385 386 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 387 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 388 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 389 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 390 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 391 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 392 393 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 394 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 395 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 396 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 397 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 398 399 /* 400 * Hash table routines 401 */ 402 403 #define HT_LOCK_PAD 64 404 405 struct ht_lock { 406 kmutex_t ht_lock; 407 #ifdef _KERNEL 408 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 409 #endif 410 }; 411 412 #define BUF_LOCKS 256 413 typedef struct buf_hash_table { 414 uint64_t ht_mask; 415 arc_buf_hdr_t **ht_table; 416 struct ht_lock ht_locks[BUF_LOCKS]; 417 } buf_hash_table_t; 418 419 static buf_hash_table_t buf_hash_table; 420 421 #define BUF_HASH_INDEX(spa, dva, birth) \ 422 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 423 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 424 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 425 #define HDR_LOCK(buf) \ 426 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 427 428 uint64_t zfs_crc64_table[256]; 429 430 static 
uint64_t 431 buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 432 { 433 uintptr_t spav = (uintptr_t)spa; 434 uint8_t *vdva = (uint8_t *)dva; 435 uint64_t crc = -1ULL; 436 int i; 437 438 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 439 440 for (i = 0; i < sizeof (dva_t); i++) 441 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 442 443 crc ^= (spav>>8) ^ birth; 444 445 return (crc); 446 } 447 448 #define BUF_EMPTY(buf) \ 449 ((buf)->b_dva.dva_word[0] == 0 && \ 450 (buf)->b_dva.dva_word[1] == 0 && \ 451 (buf)->b_birth == 0) 452 453 #define BUF_EQUAL(spa, dva, birth, buf) \ 454 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 455 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 456 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 457 458 static arc_buf_hdr_t * 459 buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 460 { 461 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 462 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 463 arc_buf_hdr_t *buf; 464 465 mutex_enter(hash_lock); 466 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 467 buf = buf->b_hash_next) { 468 if (BUF_EQUAL(spa, dva, birth, buf)) { 469 *lockp = hash_lock; 470 return (buf); 471 } 472 } 473 mutex_exit(hash_lock); 474 *lockp = NULL; 475 return (NULL); 476 } 477 478 /* 479 * Insert an entry into the hash table. If there is already an element 480 * equal to elem in the hash table, then the already existing element 481 * will be returned and the new element will not be inserted. 482 * Otherwise returns NULL. 
*/
/*
 * Locking: the hash lock covering buf's bucket is entered here and is
 * still held when this function returns, on both the "already present"
 * and the "inserted" path.  *lockp is set to that lock so the caller can
 * release it.
 */
static arc_buf_hdr_t *
buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
{
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
	arc_buf_hdr_t *fbuf;
	uint32_t i;

	ASSERT(!HDR_IN_HASH_TABLE(buf));
	*lockp = hash_lock;
	mutex_enter(hash_lock);
	/* walk the chain; i ends up as the existing chain length */
	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
	    fbuf = fbuf->b_hash_next, i++) {
		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
			return (fbuf);
	}

	/* not found: link the new header in at the head of the chain */
	buf->b_hash_next = buf_hash_table.ht_table[idx];
	buf_hash_table.ht_table[idx] = buf;
	buf->b_flags |= ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	if (i > 0) {
		ARCSTAT_BUMP(arcstat_hash_collisions);
		/* the bucket just went from one entry to two */
		if (i == 1)
			ARCSTAT_BUMP(arcstat_hash_chains);

		ARCSTAT_MAX(arcstat_hash_chain_max, i);
	}

	ARCSTAT_BUMP(arcstat_hash_elements);
	ARCSTAT_MAXSTAT(arcstat_hash_elements);

	return (NULL);
}

/*
 * Remove buf from the hash table.  The hash lock covering buf's bucket
 * must already be held by the caller (asserted); it is neither taken nor
 * dropped here.  buf must currently be in the table.
 */
static void
buf_hash_remove(arc_buf_hdr_t *buf)
{
	arc_buf_hdr_t *fbuf, **bufp;
	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);

	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
	ASSERT(HDR_IN_HASH_TABLE(buf));

	/* find the link that points at buf and splice buf out */
	bufp = &buf_hash_table.ht_table[idx];
	while ((fbuf = *bufp) != buf) {
		ASSERT(fbuf != NULL);
		bufp = &fbuf->b_hash_next;
	}
	*bufp = buf->b_hash_next;
	buf->b_hash_next = NULL;
	buf->b_flags &= ~ARC_IN_HASH_TABLE;

	/* collect some hash table performance data */
	ARCSTAT_BUMPDOWN(arcstat_hash_elements);

	/* exactly one entry is left in this bucket: no longer a chain */
	if (buf_hash_table.ht_table[idx] &&
	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
		ARCSTAT_BUMPDOWN(arcstat_hash_chains);
}

/*
 * Global data structures and functions for the buf kmem cache.
548 */ 549 static kmem_cache_t *hdr_cache; 550 static kmem_cache_t *buf_cache; 551 552 static void 553 buf_fini(void) 554 { 555 int i; 556 557 kmem_free(buf_hash_table.ht_table, 558 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 559 for (i = 0; i < BUF_LOCKS; i++) 560 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 561 kmem_cache_destroy(hdr_cache); 562 kmem_cache_destroy(buf_cache); 563 } 564 565 /* 566 * Constructor callback - called when the cache is empty 567 * and a new buf is requested. 568 */ 569 /* ARGSUSED */ 570 static int 571 hdr_cons(void *vbuf, void *unused, int kmflag) 572 { 573 arc_buf_hdr_t *buf = vbuf; 574 575 bzero(buf, sizeof (arc_buf_hdr_t)); 576 refcount_create(&buf->b_refcnt); 577 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 578 return (0); 579 } 580 581 /* 582 * Destructor callback - called when a cached buf is 583 * no longer required. 584 */ 585 /* ARGSUSED */ 586 static void 587 hdr_dest(void *vbuf, void *unused) 588 { 589 arc_buf_hdr_t *buf = vbuf; 590 591 refcount_destroy(&buf->b_refcnt); 592 cv_destroy(&buf->b_cv); 593 } 594 595 /* 596 * Reclaim callback -- invoked when memory is low. 597 */ 598 /* ARGSUSED */ 599 static void 600 hdr_recl(void *unused) 601 { 602 dprintf("hdr_recl called\n"); 603 /* 604 * umem calls the reclaim func when we destroy the buf cache, 605 * which is after we do arc_fini(). 606 */ 607 if (!arc_dead) 608 cv_signal(&arc_reclaim_thr_cv); 609 } 610 611 static void 612 buf_init(void) 613 { 614 uint64_t *ct; 615 uint64_t hsize = 1ULL << 12; 616 int i, j; 617 618 /* 619 * The hash table is big enough to fill all of physical memory 620 * with an average 64K block size. The table will take up 621 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers). 
622 */ 623 while (hsize * 65536 < physmem * PAGESIZE) 624 hsize <<= 1; 625 retry: 626 buf_hash_table.ht_mask = hsize - 1; 627 buf_hash_table.ht_table = 628 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 629 if (buf_hash_table.ht_table == NULL) { 630 ASSERT(hsize > (1ULL << 8)); 631 hsize >>= 1; 632 goto retry; 633 } 634 635 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 636 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 637 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 638 0, NULL, NULL, NULL, NULL, NULL, 0); 639 640 for (i = 0; i < 256; i++) 641 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 642 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 643 644 for (i = 0; i < BUF_LOCKS; i++) { 645 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 646 NULL, MUTEX_DEFAULT, NULL); 647 } 648 } 649 650 #define ARC_MINTIME (hz>>4) /* 62 ms */ 651 652 static void 653 arc_cksum_verify(arc_buf_t *buf) 654 { 655 zio_cksum_t zc; 656 657 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 658 return; 659 660 mutex_enter(&buf->b_hdr->b_freeze_lock); 661 if (buf->b_hdr->b_freeze_cksum == NULL || 662 (buf->b_hdr->b_flags & ARC_IO_ERROR)) { 663 mutex_exit(&buf->b_hdr->b_freeze_lock); 664 return; 665 } 666 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 667 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 668 panic("buffer modified while frozen!"); 669 mutex_exit(&buf->b_hdr->b_freeze_lock); 670 } 671 672 static void 673 arc_cksum_compute(arc_buf_t *buf) 674 { 675 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 676 return; 677 678 mutex_enter(&buf->b_hdr->b_freeze_lock); 679 if (buf->b_hdr->b_freeze_cksum != NULL) { 680 mutex_exit(&buf->b_hdr->b_freeze_lock); 681 return; 682 } 683 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 684 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 685 buf->b_hdr->b_freeze_cksum); 686 mutex_exit(&buf->b_hdr->b_freeze_lock); 687 } 688 689 void 690 arc_buf_thaw(arc_buf_t *buf) 691 { 692 
if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 693 return; 694 695 if (buf->b_hdr->b_state != arc_anon) 696 panic("modifying non-anon buffer!"); 697 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 698 panic("modifying buffer while i/o in progress!"); 699 arc_cksum_verify(buf); 700 mutex_enter(&buf->b_hdr->b_freeze_lock); 701 if (buf->b_hdr->b_freeze_cksum != NULL) { 702 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 703 buf->b_hdr->b_freeze_cksum = NULL; 704 } 705 mutex_exit(&buf->b_hdr->b_freeze_lock); 706 } 707 708 void 709 arc_buf_freeze(arc_buf_t *buf) 710 { 711 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 712 return; 713 714 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 715 buf->b_hdr->b_state == arc_anon); 716 arc_cksum_compute(buf); 717 } 718 719 static void 720 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 721 { 722 ASSERT(MUTEX_HELD(hash_lock)); 723 724 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 725 (ab->b_state != arc_anon)) { 726 uint64_t delta = ab->b_size * ab->b_datacnt; 727 728 ASSERT(!MUTEX_HELD(&ab->b_state->arcs_mtx)); 729 mutex_enter(&ab->b_state->arcs_mtx); 730 ASSERT(list_link_active(&ab->b_arc_node)); 731 list_remove(&ab->b_state->arcs_list, ab); 732 if (GHOST_STATE(ab->b_state)) { 733 ASSERT3U(ab->b_datacnt, ==, 0); 734 ASSERT3P(ab->b_buf, ==, NULL); 735 delta = ab->b_size; 736 } 737 ASSERT(delta > 0); 738 ASSERT3U(ab->b_state->arcs_lsize, >=, delta); 739 atomic_add_64(&ab->b_state->arcs_lsize, -delta); 740 mutex_exit(&ab->b_state->arcs_mtx); 741 /* remove the prefetch flag is we get a reference */ 742 if (ab->b_flags & ARC_PREFETCH) 743 ab->b_flags &= ~ARC_PREFETCH; 744 } 745 } 746 747 static int 748 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 749 { 750 int cnt; 751 arc_state_t *state = ab->b_state; 752 753 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 754 ASSERT(!GHOST_STATE(state)); 755 756 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 757 (state != arc_anon)) { 758 
ASSERT(!MUTEX_HELD(&state->arcs_mtx)); 759 mutex_enter(&state->arcs_mtx); 760 ASSERT(!list_link_active(&ab->b_arc_node)); 761 list_insert_head(&state->arcs_list, ab); 762 ASSERT(ab->b_datacnt > 0); 763 atomic_add_64(&state->arcs_lsize, ab->b_size * ab->b_datacnt); 764 ASSERT3U(state->arcs_size, >=, state->arcs_lsize); 765 mutex_exit(&state->arcs_mtx); 766 } 767 return (cnt); 768 } 769 770 /* 771 * Move the supplied buffer to the indicated state. The mutex 772 * for the buffer must be held by the caller. 773 */ 774 static void 775 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 776 { 777 arc_state_t *old_state = ab->b_state; 778 int64_t refcnt = refcount_count(&ab->b_refcnt); 779 uint64_t from_delta, to_delta; 780 781 ASSERT(MUTEX_HELD(hash_lock)); 782 ASSERT(new_state != old_state); 783 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 784 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 785 786 from_delta = to_delta = ab->b_datacnt * ab->b_size; 787 788 /* 789 * If this buffer is evictable, transfer it from the 790 * old state list to the new state list. 791 */ 792 if (refcnt == 0) { 793 if (old_state != arc_anon) { 794 int use_mutex = !MUTEX_HELD(&old_state->arcs_mtx); 795 796 if (use_mutex) 797 mutex_enter(&old_state->arcs_mtx); 798 799 ASSERT(list_link_active(&ab->b_arc_node)); 800 list_remove(&old_state->arcs_list, ab); 801 802 /* 803 * If prefetching out of the ghost cache, 804 * we will have a non-null datacnt. 
805 */ 806 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 807 /* ghost elements have a ghost size */ 808 ASSERT(ab->b_buf == NULL); 809 from_delta = ab->b_size; 810 } 811 ASSERT3U(old_state->arcs_lsize, >=, from_delta); 812 atomic_add_64(&old_state->arcs_lsize, -from_delta); 813 814 if (use_mutex) 815 mutex_exit(&old_state->arcs_mtx); 816 } 817 if (new_state != arc_anon) { 818 int use_mutex = !MUTEX_HELD(&new_state->arcs_mtx); 819 820 if (use_mutex) 821 mutex_enter(&new_state->arcs_mtx); 822 823 list_insert_head(&new_state->arcs_list, ab); 824 825 /* ghost elements have a ghost size */ 826 if (GHOST_STATE(new_state)) { 827 ASSERT(ab->b_datacnt == 0); 828 ASSERT(ab->b_buf == NULL); 829 to_delta = ab->b_size; 830 } 831 atomic_add_64(&new_state->arcs_lsize, to_delta); 832 ASSERT3U(new_state->arcs_size + to_delta, >=, 833 new_state->arcs_lsize); 834 835 if (use_mutex) 836 mutex_exit(&new_state->arcs_mtx); 837 } 838 } 839 840 ASSERT(!BUF_EMPTY(ab)); 841 if (new_state == arc_anon && old_state != arc_anon) { 842 buf_hash_remove(ab); 843 } 844 845 /* adjust state sizes */ 846 if (to_delta) 847 atomic_add_64(&new_state->arcs_size, to_delta); 848 if (from_delta) { 849 ASSERT3U(old_state->arcs_size, >=, from_delta); 850 atomic_add_64(&old_state->arcs_size, -from_delta); 851 } 852 ab->b_state = new_state; 853 } 854 855 arc_buf_t * 856 arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) 857 { 858 arc_buf_hdr_t *hdr; 859 arc_buf_t *buf; 860 861 ASSERT3U(size, >, 0); 862 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 863 ASSERT(BUF_EMPTY(hdr)); 864 hdr->b_size = size; 865 hdr->b_type = type; 866 hdr->b_spa = spa; 867 hdr->b_state = arc_anon; 868 hdr->b_arc_access = 0; 869 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 870 buf->b_hdr = hdr; 871 buf->b_data = NULL; 872 buf->b_efunc = NULL; 873 buf->b_private = NULL; 874 buf->b_next = NULL; 875 hdr->b_buf = buf; 876 arc_get_data_buf(buf); 877 hdr->b_datacnt = 1; 878 hdr->b_flags = 0; 879 
ASSERT(refcount_is_zero(&hdr->b_refcnt)); 880 (void) refcount_add(&hdr->b_refcnt, tag); 881 882 return (buf); 883 } 884 885 static arc_buf_t * 886 arc_buf_clone(arc_buf_t *from) 887 { 888 arc_buf_t *buf; 889 arc_buf_hdr_t *hdr = from->b_hdr; 890 uint64_t size = hdr->b_size; 891 892 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 893 buf->b_hdr = hdr; 894 buf->b_data = NULL; 895 buf->b_efunc = NULL; 896 buf->b_private = NULL; 897 buf->b_next = hdr->b_buf; 898 hdr->b_buf = buf; 899 arc_get_data_buf(buf); 900 bcopy(from->b_data, buf->b_data, size); 901 hdr->b_datacnt += 1; 902 return (buf); 903 } 904 905 void 906 arc_buf_add_ref(arc_buf_t *buf, void* tag) 907 { 908 arc_buf_hdr_t *hdr; 909 kmutex_t *hash_lock; 910 911 /* 912 * Check to see if this buffer is currently being evicted via 913 * arc_do_user_evicts(). 914 */ 915 mutex_enter(&arc_eviction_mtx); 916 hdr = buf->b_hdr; 917 if (hdr == NULL) { 918 mutex_exit(&arc_eviction_mtx); 919 return; 920 } 921 hash_lock = HDR_LOCK(hdr); 922 mutex_exit(&arc_eviction_mtx); 923 924 mutex_enter(hash_lock); 925 if (buf->b_data == NULL) { 926 /* 927 * This buffer is evicted. 
928 */ 929 mutex_exit(hash_lock); 930 return; 931 } 932 933 ASSERT(buf->b_hdr == hdr); 934 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 935 add_reference(hdr, hash_lock, tag); 936 arc_access(hdr, hash_lock); 937 mutex_exit(hash_lock); 938 ARCSTAT_BUMP(arcstat_hits); 939 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 940 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 941 data, metadata, hits); 942 } 943 944 static void 945 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 946 { 947 arc_buf_t **bufp; 948 949 /* free up data associated with the buf */ 950 if (buf->b_data) { 951 arc_state_t *state = buf->b_hdr->b_state; 952 uint64_t size = buf->b_hdr->b_size; 953 arc_buf_contents_t type = buf->b_hdr->b_type; 954 955 arc_cksum_verify(buf); 956 if (!recycle) { 957 if (type == ARC_BUFC_METADATA) { 958 zio_buf_free(buf->b_data, size); 959 } else { 960 ASSERT(type == ARC_BUFC_DATA); 961 zio_data_buf_free(buf->b_data, size); 962 } 963 atomic_add_64(&arc_size, -size); 964 } 965 if (list_link_active(&buf->b_hdr->b_arc_node)) { 966 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 967 ASSERT(state != arc_anon); 968 ASSERT3U(state->arcs_lsize, >=, size); 969 atomic_add_64(&state->arcs_lsize, -size); 970 } 971 ASSERT3U(state->arcs_size, >=, size); 972 atomic_add_64(&state->arcs_size, -size); 973 buf->b_data = NULL; 974 ASSERT(buf->b_hdr->b_datacnt > 0); 975 buf->b_hdr->b_datacnt -= 1; 976 } 977 978 /* only remove the buf if requested */ 979 if (!all) 980 return; 981 982 /* remove the buf from the hdr list */ 983 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 984 continue; 985 *bufp = buf->b_next; 986 987 ASSERT(buf->b_efunc == NULL); 988 989 /* clean up the buf */ 990 buf->b_hdr = NULL; 991 kmem_cache_free(buf_cache, buf); 992 } 993 994 static void 995 arc_hdr_destroy(arc_buf_hdr_t *hdr) 996 { 997 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 998 ASSERT3P(hdr->b_state, ==, arc_anon); 999 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 
1000 1001 if (!BUF_EMPTY(hdr)) { 1002 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1003 bzero(&hdr->b_dva, sizeof (dva_t)); 1004 hdr->b_birth = 0; 1005 hdr->b_cksum0 = 0; 1006 } 1007 while (hdr->b_buf) { 1008 arc_buf_t *buf = hdr->b_buf; 1009 1010 if (buf->b_efunc) { 1011 mutex_enter(&arc_eviction_mtx); 1012 ASSERT(buf->b_hdr != NULL); 1013 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 1014 hdr->b_buf = buf->b_next; 1015 buf->b_hdr = &arc_eviction_hdr; 1016 buf->b_next = arc_eviction_list; 1017 arc_eviction_list = buf; 1018 mutex_exit(&arc_eviction_mtx); 1019 } else { 1020 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 1021 } 1022 } 1023 if (hdr->b_freeze_cksum != NULL) { 1024 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 1025 hdr->b_freeze_cksum = NULL; 1026 } 1027 1028 ASSERT(!list_link_active(&hdr->b_arc_node)); 1029 ASSERT3P(hdr->b_hash_next, ==, NULL); 1030 ASSERT3P(hdr->b_acb, ==, NULL); 1031 kmem_cache_free(hdr_cache, hdr); 1032 } 1033 1034 void 1035 arc_buf_free(arc_buf_t *buf, void *tag) 1036 { 1037 arc_buf_hdr_t *hdr = buf->b_hdr; 1038 int hashed = hdr->b_state != arc_anon; 1039 1040 ASSERT(buf->b_efunc == NULL); 1041 ASSERT(buf->b_data != NULL); 1042 1043 if (hashed) { 1044 kmutex_t *hash_lock = HDR_LOCK(hdr); 1045 1046 mutex_enter(hash_lock); 1047 (void) remove_reference(hdr, hash_lock, tag); 1048 if (hdr->b_datacnt > 1) 1049 arc_buf_destroy(buf, FALSE, TRUE); 1050 else 1051 hdr->b_flags |= ARC_BUF_AVAILABLE; 1052 mutex_exit(hash_lock); 1053 } else if (HDR_IO_IN_PROGRESS(hdr)) { 1054 int destroy_hdr; 1055 /* 1056 * We are in the middle of an async write. Don't destroy 1057 * this buffer unless the write completes before we finish 1058 * decrementing the reference count. 
1059 */ 1060 mutex_enter(&arc_eviction_mtx); 1061 (void) remove_reference(hdr, NULL, tag); 1062 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1063 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 1064 mutex_exit(&arc_eviction_mtx); 1065 if (destroy_hdr) 1066 arc_hdr_destroy(hdr); 1067 } else { 1068 if (remove_reference(hdr, NULL, tag) > 0) { 1069 ASSERT(HDR_IO_ERROR(hdr)); 1070 arc_buf_destroy(buf, FALSE, TRUE); 1071 } else { 1072 arc_hdr_destroy(hdr); 1073 } 1074 } 1075 } 1076 1077 int 1078 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 1079 { 1080 arc_buf_hdr_t *hdr = buf->b_hdr; 1081 kmutex_t *hash_lock = HDR_LOCK(hdr); 1082 int no_callback = (buf->b_efunc == NULL); 1083 1084 if (hdr->b_state == arc_anon) { 1085 arc_buf_free(buf, tag); 1086 return (no_callback); 1087 } 1088 1089 mutex_enter(hash_lock); 1090 ASSERT(hdr->b_state != arc_anon); 1091 ASSERT(buf->b_data != NULL); 1092 1093 (void) remove_reference(hdr, hash_lock, tag); 1094 if (hdr->b_datacnt > 1) { 1095 if (no_callback) 1096 arc_buf_destroy(buf, FALSE, TRUE); 1097 } else if (no_callback) { 1098 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 1099 hdr->b_flags |= ARC_BUF_AVAILABLE; 1100 } 1101 ASSERT(no_callback || hdr->b_datacnt > 1 || 1102 refcount_is_zero(&hdr->b_refcnt)); 1103 mutex_exit(hash_lock); 1104 return (no_callback); 1105 } 1106 1107 int 1108 arc_buf_size(arc_buf_t *buf) 1109 { 1110 return (buf->b_hdr->b_size); 1111 } 1112 1113 /* 1114 * Evict buffers from list until we've removed the specified number of 1115 * bytes. Move the removed buffers to the appropriate evict state. 1116 * If the recycle flag is set, then attempt to "recycle" a buffer: 1117 * - look for a buffer to evict that is `bytes' long. 1118 * - return the data block from this buffer rather than freeing it. 1119 * This flag is used by callers that are trying to make space for a 1120 * new buffer in a full arc cache. 
1121 */ 1122 static void * 1123 arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, 1124 arc_buf_contents_t type) 1125 { 1126 arc_state_t *evicted_state; 1127 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1128 arc_buf_hdr_t *ab, *ab_prev = NULL; 1129 kmutex_t *hash_lock; 1130 boolean_t have_lock; 1131 void *stolen = NULL; 1132 1133 ASSERT(state == arc_mru || state == arc_mfu); 1134 1135 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 1136 1137 mutex_enter(&state->arcs_mtx); 1138 mutex_enter(&evicted_state->arcs_mtx); 1139 1140 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1141 ab_prev = list_prev(&state->arcs_list, ab); 1142 /* prefetch buffers have a minimum lifespan */ 1143 if (HDR_IO_IN_PROGRESS(ab) || 1144 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1145 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1146 skipped++; 1147 continue; 1148 } 1149 /* "lookahead" for better eviction candidate */ 1150 if (recycle && ab->b_size != bytes && 1151 ab_prev && ab_prev->b_size == bytes) 1152 continue; 1153 hash_lock = HDR_LOCK(ab); 1154 have_lock = MUTEX_HELD(hash_lock); 1155 if (have_lock || mutex_tryenter(hash_lock)) { 1156 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1157 ASSERT(ab->b_datacnt > 0); 1158 while (ab->b_buf) { 1159 arc_buf_t *buf = ab->b_buf; 1160 if (buf->b_data) { 1161 bytes_evicted += ab->b_size; 1162 if (recycle && ab->b_type == type && 1163 ab->b_size == bytes) { 1164 stolen = buf->b_data; 1165 recycle = FALSE; 1166 } 1167 } 1168 if (buf->b_efunc) { 1169 mutex_enter(&arc_eviction_mtx); 1170 arc_buf_destroy(buf, 1171 buf->b_data == stolen, FALSE); 1172 ab->b_buf = buf->b_next; 1173 buf->b_hdr = &arc_eviction_hdr; 1174 buf->b_next = arc_eviction_list; 1175 arc_eviction_list = buf; 1176 mutex_exit(&arc_eviction_mtx); 1177 } else { 1178 arc_buf_destroy(buf, 1179 buf->b_data == stolen, TRUE); 1180 } 1181 } 1182 ASSERT(ab->b_datacnt == 0); 1183 arc_change_state(evicted_state, ab, hash_lock); 
1184 ASSERT(HDR_IN_HASH_TABLE(ab)); 1185 ab->b_flags = ARC_IN_HASH_TABLE; 1186 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1187 if (!have_lock) 1188 mutex_exit(hash_lock); 1189 if (bytes >= 0 && bytes_evicted >= bytes) 1190 break; 1191 } else { 1192 missed += 1; 1193 } 1194 } 1195 1196 mutex_exit(&evicted_state->arcs_mtx); 1197 mutex_exit(&state->arcs_mtx); 1198 1199 if (bytes_evicted < bytes) 1200 dprintf("only evicted %lld bytes from %x", 1201 (longlong_t)bytes_evicted, state); 1202 1203 if (skipped) 1204 ARCSTAT_INCR(arcstat_evict_skip, skipped); 1205 1206 if (missed) 1207 ARCSTAT_INCR(arcstat_mutex_miss, missed); 1208 1209 return (stolen); 1210 } 1211 1212 /* 1213 * Remove buffers from list until we've removed the specified number of 1214 * bytes. Destroy the buffers that are removed. 1215 */ 1216 static void 1217 arc_evict_ghost(arc_state_t *state, int64_t bytes) 1218 { 1219 arc_buf_hdr_t *ab, *ab_prev; 1220 kmutex_t *hash_lock; 1221 uint64_t bytes_deleted = 0; 1222 uint64_t bufs_skipped = 0; 1223 1224 ASSERT(GHOST_STATE(state)); 1225 top: 1226 mutex_enter(&state->arcs_mtx); 1227 for (ab = list_tail(&state->arcs_list); ab; ab = ab_prev) { 1228 ab_prev = list_prev(&state->arcs_list, ab); 1229 hash_lock = HDR_LOCK(ab); 1230 if (mutex_tryenter(hash_lock)) { 1231 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1232 ASSERT(ab->b_buf == NULL); 1233 arc_change_state(arc_anon, ab, hash_lock); 1234 mutex_exit(hash_lock); 1235 ARCSTAT_BUMP(arcstat_deleted); 1236 bytes_deleted += ab->b_size; 1237 arc_hdr_destroy(ab); 1238 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1239 if (bytes >= 0 && bytes_deleted >= bytes) 1240 break; 1241 } else { 1242 if (bytes < 0) { 1243 mutex_exit(&state->arcs_mtx); 1244 mutex_enter(hash_lock); 1245 mutex_exit(hash_lock); 1246 goto top; 1247 } 1248 bufs_skipped += 1; 1249 } 1250 } 1251 mutex_exit(&state->arcs_mtx); 1252 1253 if (bufs_skipped) { 1254 ARCSTAT_INCR(arcstat_mutex_miss, bufs_skipped); 1255 ASSERT(bytes >= 0); 1256 } 1257 1258 if 
(bytes_deleted < bytes) 1259 dprintf("only deleted %lld bytes from %p", 1260 (longlong_t)bytes_deleted, state); 1261 } 1262 1263 static void 1264 arc_adjust(void) 1265 { 1266 int64_t top_sz, mru_over, arc_over, todelete; 1267 1268 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1269 1270 if (top_sz > arc_p && arc_mru->arcs_lsize > 0) { 1271 int64_t toevict = MIN(arc_mru->arcs_lsize, top_sz - arc_p); 1272 (void) arc_evict(arc_mru, toevict, FALSE, ARC_BUFC_UNDEF); 1273 top_sz = arc_anon->arcs_size + arc_mru->arcs_size; 1274 } 1275 1276 mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; 1277 1278 if (mru_over > 0) { 1279 if (arc_mru_ghost->arcs_lsize > 0) { 1280 todelete = MIN(arc_mru_ghost->arcs_lsize, mru_over); 1281 arc_evict_ghost(arc_mru_ghost, todelete); 1282 } 1283 } 1284 1285 if ((arc_over = arc_size - arc_c) > 0) { 1286 int64_t tbl_over; 1287 1288 if (arc_mfu->arcs_lsize > 0) { 1289 int64_t toevict = MIN(arc_mfu->arcs_lsize, arc_over); 1290 (void) arc_evict(arc_mfu, toevict, FALSE, 1291 ARC_BUFC_UNDEF); 1292 } 1293 1294 tbl_over = arc_size + arc_mru_ghost->arcs_lsize + 1295 arc_mfu_ghost->arcs_lsize - arc_c*2; 1296 1297 if (tbl_over > 0 && arc_mfu_ghost->arcs_lsize > 0) { 1298 todelete = MIN(arc_mfu_ghost->arcs_lsize, tbl_over); 1299 arc_evict_ghost(arc_mfu_ghost, todelete); 1300 } 1301 } 1302 } 1303 1304 static void 1305 arc_do_user_evicts(void) 1306 { 1307 mutex_enter(&arc_eviction_mtx); 1308 while (arc_eviction_list != NULL) { 1309 arc_buf_t *buf = arc_eviction_list; 1310 arc_eviction_list = buf->b_next; 1311 buf->b_hdr = NULL; 1312 mutex_exit(&arc_eviction_mtx); 1313 1314 if (buf->b_efunc != NULL) 1315 VERIFY(buf->b_efunc(buf) == 0); 1316 1317 buf->b_efunc = NULL; 1318 buf->b_private = NULL; 1319 kmem_cache_free(buf_cache, buf); 1320 mutex_enter(&arc_eviction_mtx); 1321 } 1322 mutex_exit(&arc_eviction_mtx); 1323 } 1324 1325 /* 1326 * Flush all *evictable* data from the cache. 1327 * NOTE: this will not touch "active" (i.e. referenced) data. 
1328 */ 1329 void 1330 arc_flush(void) 1331 { 1332 while (list_head(&arc_mru->arcs_list)) 1333 (void) arc_evict(arc_mru, -1, FALSE, ARC_BUFC_UNDEF); 1334 while (list_head(&arc_mfu->arcs_list)) 1335 (void) arc_evict(arc_mfu, -1, FALSE, ARC_BUFC_UNDEF); 1336 1337 arc_evict_ghost(arc_mru_ghost, -1); 1338 arc_evict_ghost(arc_mfu_ghost, -1); 1339 1340 mutex_enter(&arc_reclaim_thr_lock); 1341 arc_do_user_evicts(); 1342 mutex_exit(&arc_reclaim_thr_lock); 1343 ASSERT(arc_eviction_list == NULL); 1344 } 1345 1346 int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1347 1348 void 1349 arc_shrink(void) 1350 { 1351 if (arc_c > arc_c_min) { 1352 uint64_t to_free; 1353 1354 #ifdef _KERNEL 1355 to_free = MAX(arc_c >> arc_shrink_shift, ptob(needfree)); 1356 #else 1357 to_free = arc_c >> arc_shrink_shift; 1358 #endif 1359 if (arc_c > arc_c_min + to_free) 1360 atomic_add_64(&arc_c, -to_free); 1361 else 1362 arc_c = arc_c_min; 1363 1364 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 1365 if (arc_c > arc_size) 1366 arc_c = MAX(arc_size, arc_c_min); 1367 if (arc_p > arc_c) 1368 arc_p = (arc_c >> 1); 1369 ASSERT(arc_c >= arc_c_min); 1370 ASSERT((int64_t)arc_p >= 0); 1371 } 1372 1373 if (arc_size > arc_c) 1374 arc_adjust(); 1375 } 1376 1377 static int 1378 arc_reclaim_needed(void) 1379 { 1380 uint64_t extra; 1381 1382 #ifdef _KERNEL 1383 1384 if (needfree) 1385 return (1); 1386 1387 /* 1388 * take 'desfree' extra pages, so we reclaim sooner, rather than later 1389 */ 1390 extra = desfree; 1391 1392 /* 1393 * check that we're out of range of the pageout scanner. It starts to 1394 * schedule paging if freemem is less than lotsfree and needfree. 1395 * lotsfree is the high-water mark for pageout, and needfree is the 1396 * number of needed free pages. We add extra pages here to make sure 1397 * the scanner doesn't start up while we're freeing memory. 
1398 */ 1399 if (freemem < lotsfree + needfree + extra) 1400 return (1); 1401 1402 /* 1403 * check to make sure that swapfs has enough space so that anon 1404 * reservations can still succeeed. anon_resvmem() checks that the 1405 * availrmem is greater than swapfs_minfree, and the number of reserved 1406 * swap pages. We also add a bit of extra here just to prevent 1407 * circumstances from getting really dire. 1408 */ 1409 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1410 return (1); 1411 1412 /* 1413 * If zio data pages are being allocated out of a separate heap segment, 1414 * then check that the size of available vmem for this area remains 1415 * above 1/4th free. This needs to be done when the size of the 1416 * non-default segment is smaller than physical memory, so we could 1417 * conceivably run out of VA in that segment before running out of 1418 * physical memory. 1419 */ 1420 if (zio_arena != NULL) { 1421 if ((btop(physmem) > arc_ziosize) && 1422 (btop(vmem_size(zio_arena, VMEM_FREE)) < arc_ziosize >> 2)) 1423 return (1); 1424 } 1425 1426 #if defined(__i386) 1427 /* 1428 * If we're on an i386 platform, it's possible that we'll exhaust the 1429 * kernel heap space before we ever run out of available physical 1430 * memory. Most checks of the size of the heap_area compare against 1431 * tune.t_minarmem, which is the minimum available real memory that we 1432 * can have in the system. However, this is generally fixed at 25 pages 1433 * which is so low that it's useless. In this comparison, we seek to 1434 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1435 * heap is allocated. 
(Or, in the caclulation, if less than 1/4th is 1436 * free) 1437 */ 1438 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1439 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1440 return (1); 1441 #endif 1442 1443 #else 1444 if (spa_get_random(100) == 0) 1445 return (1); 1446 #endif 1447 return (0); 1448 } 1449 1450 static void 1451 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1452 { 1453 size_t i; 1454 kmem_cache_t *prev_cache = NULL; 1455 kmem_cache_t *prev_data_cache = NULL; 1456 extern kmem_cache_t *zio_buf_cache[]; 1457 extern kmem_cache_t *zio_data_buf_cache[]; 1458 1459 #ifdef _KERNEL 1460 /* 1461 * First purge some DNLC entries, in case the DNLC is using 1462 * up too much memory. 1463 */ 1464 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1465 1466 #if defined(__i386) 1467 /* 1468 * Reclaim unused memory from all kmem caches. 1469 */ 1470 kmem_reap(); 1471 #endif 1472 #endif 1473 1474 /* 1475 * An agressive reclamation will shrink the cache size as well as 1476 * reap free buffers from the arc kmem caches. 
1477 */ 1478 if (strat == ARC_RECLAIM_AGGR) 1479 arc_shrink(); 1480 1481 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1482 if (zio_buf_cache[i] != prev_cache) { 1483 prev_cache = zio_buf_cache[i]; 1484 kmem_cache_reap_now(zio_buf_cache[i]); 1485 } 1486 if (zio_data_buf_cache[i] != prev_data_cache) { 1487 prev_data_cache = zio_data_buf_cache[i]; 1488 kmem_cache_reap_now(zio_data_buf_cache[i]); 1489 } 1490 } 1491 kmem_cache_reap_now(buf_cache); 1492 kmem_cache_reap_now(hdr_cache); 1493 } 1494 1495 static void 1496 arc_reclaim_thread(void) 1497 { 1498 clock_t growtime = 0; 1499 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1500 callb_cpr_t cpr; 1501 1502 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1503 1504 mutex_enter(&arc_reclaim_thr_lock); 1505 while (arc_thread_exit == 0) { 1506 if (arc_reclaim_needed()) { 1507 1508 if (arc_no_grow) { 1509 if (last_reclaim == ARC_RECLAIM_CONS) { 1510 last_reclaim = ARC_RECLAIM_AGGR; 1511 } else { 1512 last_reclaim = ARC_RECLAIM_CONS; 1513 } 1514 } else { 1515 arc_no_grow = TRUE; 1516 last_reclaim = ARC_RECLAIM_AGGR; 1517 membar_producer(); 1518 } 1519 1520 /* reset the growth delay for every reclaim */ 1521 growtime = lbolt + (arc_grow_retry * hz); 1522 ASSERT(growtime > 0); 1523 1524 arc_kmem_reap_now(last_reclaim); 1525 1526 } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1527 arc_no_grow = FALSE; 1528 } 1529 1530 if (2 * arc_c < arc_size + 1531 arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size) 1532 arc_adjust(); 1533 1534 if (arc_eviction_list != NULL) 1535 arc_do_user_evicts(); 1536 1537 /* block until needed, or one second, whichever is shorter */ 1538 CALLB_CPR_SAFE_BEGIN(&cpr); 1539 (void) cv_timedwait(&arc_reclaim_thr_cv, 1540 &arc_reclaim_thr_lock, (lbolt + hz)); 1541 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1542 } 1543 1544 arc_thread_exit = 0; 1545 cv_broadcast(&arc_reclaim_thr_cv); 1546 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1547 
thread_exit(); 1548 } 1549 1550 /* 1551 * Adapt arc info given the number of bytes we are trying to add and 1552 * the state that we are comming from. This function is only called 1553 * when we are adding new content to the cache. 1554 */ 1555 static void 1556 arc_adapt(int bytes, arc_state_t *state) 1557 { 1558 int mult; 1559 1560 ASSERT(bytes > 0); 1561 /* 1562 * Adapt the target size of the MRU list: 1563 * - if we just hit in the MRU ghost list, then increase 1564 * the target size of the MRU list. 1565 * - if we just hit in the MFU ghost list, then increase 1566 * the target size of the MFU list by decreasing the 1567 * target size of the MRU list. 1568 */ 1569 if (state == arc_mru_ghost) { 1570 mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 1571 1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); 1572 1573 arc_p = MIN(arc_c, arc_p + bytes * mult); 1574 } else if (state == arc_mfu_ghost) { 1575 mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1576 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); 1577 1578 arc_p = MAX(0, (int64_t)arc_p - bytes * mult); 1579 } 1580 ASSERT((int64_t)arc_p >= 0); 1581 1582 if (arc_reclaim_needed()) { 1583 cv_signal(&arc_reclaim_thr_cv); 1584 return; 1585 } 1586 1587 if (arc_no_grow) 1588 return; 1589 1590 if (arc_c >= arc_c_max) 1591 return; 1592 1593 /* 1594 * If we're within (2 * maxblocksize) bytes of the target 1595 * cache size, increment the target cache size 1596 */ 1597 if (arc_size > arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1598 atomic_add_64(&arc_c, (int64_t)bytes); 1599 if (arc_c > arc_c_max) 1600 arc_c = arc_c_max; 1601 else if (state == arc_anon) 1602 atomic_add_64(&arc_p, (int64_t)bytes); 1603 if (arc_p > arc_c) 1604 arc_p = arc_c; 1605 } 1606 ASSERT((int64_t)arc_p >= 0); 1607 } 1608 1609 /* 1610 * Check if the cache has reached its limits and eviction is required 1611 * prior to insert. 
1612 */ 1613 static int 1614 arc_evict_needed() 1615 { 1616 if (arc_reclaim_needed()) 1617 return (1); 1618 1619 return (arc_size > arc_c); 1620 } 1621 1622 /* 1623 * The buffer, supplied as the first argument, needs a data block. 1624 * So, if we are at cache max, determine which cache should be victimized. 1625 * We have the following cases: 1626 * 1627 * 1. Insert for MRU, p > sizeof(arc_anon + arc_mru) -> 1628 * In this situation if we're out of space, but the resident size of the MFU is 1629 * under the limit, victimize the MFU cache to satisfy this insertion request. 1630 * 1631 * 2. Insert for MRU, p <= sizeof(arc_anon + arc_mru) -> 1632 * Here, we've used up all of the available space for the MRU, so we need to 1633 * evict from our own cache instead. Evict from the set of resident MRU 1634 * entries. 1635 * 1636 * 3. Insert for MFU (c - p) > sizeof(arc_mfu) -> 1637 * c minus p represents the MFU space in the cache, since p is the size of the 1638 * cache that is dedicated to the MRU. In this situation there's still space on 1639 * the MFU side, so the MRU side needs to be victimized. 1640 * 1641 * 4. Insert for MFU (c - p) < sizeof(arc_mfu) -> 1642 * MFU's resident set is consuming more space than it has been allotted. In 1643 * this situation, we must victimize our own cache, the MFU, for this insertion. 1644 */ 1645 static void 1646 arc_get_data_buf(arc_buf_t *buf) 1647 { 1648 arc_state_t *state = buf->b_hdr->b_state; 1649 uint64_t size = buf->b_hdr->b_size; 1650 arc_buf_contents_t type = buf->b_hdr->b_type; 1651 1652 arc_adapt(size, state); 1653 1654 /* 1655 * We have not yet reached cache maximum size, 1656 * just allocate a new buffer. 
1657 */ 1658 if (!arc_evict_needed()) { 1659 if (type == ARC_BUFC_METADATA) { 1660 buf->b_data = zio_buf_alloc(size); 1661 } else { 1662 ASSERT(type == ARC_BUFC_DATA); 1663 buf->b_data = zio_data_buf_alloc(size); 1664 } 1665 atomic_add_64(&arc_size, size); 1666 goto out; 1667 } 1668 1669 /* 1670 * If we are prefetching from the mfu ghost list, this buffer 1671 * will end up on the mru list; so steal space from there. 1672 */ 1673 if (state == arc_mfu_ghost) 1674 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc_mru : arc_mfu; 1675 else if (state == arc_mru_ghost) 1676 state = arc_mru; 1677 1678 if (state == arc_mru || state == arc_anon) { 1679 uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; 1680 state = (arc_p > mru_used) ? arc_mfu : arc_mru; 1681 } else { 1682 /* MFU cases */ 1683 uint64_t mfu_space = arc_c - arc_p; 1684 state = (mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; 1685 } 1686 if ((buf->b_data = arc_evict(state, size, TRUE, type)) == NULL) { 1687 if (type == ARC_BUFC_METADATA) { 1688 buf->b_data = zio_buf_alloc(size); 1689 } else { 1690 ASSERT(type == ARC_BUFC_DATA); 1691 buf->b_data = zio_data_buf_alloc(size); 1692 } 1693 atomic_add_64(&arc_size, size); 1694 ARCSTAT_BUMP(arcstat_recycle_miss); 1695 } 1696 ASSERT(buf->b_data != NULL); 1697 out: 1698 /* 1699 * Update the state size. Note that ghost states have a 1700 * "ghost size" and so don't need to be updated. 
1701 */ 1702 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1703 arc_buf_hdr_t *hdr = buf->b_hdr; 1704 1705 atomic_add_64(&hdr->b_state->arcs_size, size); 1706 if (list_link_active(&hdr->b_arc_node)) { 1707 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1708 atomic_add_64(&hdr->b_state->arcs_lsize, size); 1709 } 1710 /* 1711 * If we are growing the cache, and we are adding anonymous 1712 * data, and we have outgrown arc_p, update arc_p 1713 */ 1714 if (arc_size < arc_c && hdr->b_state == arc_anon && 1715 arc_anon->arcs_size + arc_mru->arcs_size > arc_p) 1716 arc_p = MIN(arc_c, arc_p + size); 1717 } 1718 } 1719 1720 /* 1721 * This routine is called whenever a buffer is accessed. 1722 * NOTE: the hash lock is dropped in this function. 1723 */ 1724 static void 1725 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1726 { 1727 ASSERT(MUTEX_HELD(hash_lock)); 1728 1729 if (buf->b_state == arc_anon) { 1730 /* 1731 * This buffer is not in the cache, and does not 1732 * appear in our "ghost" list. Add the new buffer 1733 * to the MRU state. 1734 */ 1735 1736 ASSERT(buf->b_arc_access == 0); 1737 buf->b_arc_access = lbolt; 1738 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1739 arc_change_state(arc_mru, buf, hash_lock); 1740 1741 } else if (buf->b_state == arc_mru) { 1742 /* 1743 * If this buffer is here because of a prefetch, then either: 1744 * - clear the flag if this is a "referencing" read 1745 * (any subsequent access will bump this into the MFU state). 1746 * or 1747 * - move the buffer to the head of the list if this is 1748 * another prefetch (to make it less likely to be evicted). 
1749 */ 1750 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1751 if (refcount_count(&buf->b_refcnt) == 0) { 1752 ASSERT(list_link_active(&buf->b_arc_node)); 1753 mutex_enter(&arc_mru->arcs_mtx); 1754 list_remove(&arc_mru->arcs_list, buf); 1755 list_insert_head(&arc_mru->arcs_list, buf); 1756 mutex_exit(&arc_mru->arcs_mtx); 1757 } else { 1758 buf->b_flags &= ~ARC_PREFETCH; 1759 ARCSTAT_BUMP(arcstat_mru_hits); 1760 } 1761 buf->b_arc_access = lbolt; 1762 return; 1763 } 1764 1765 /* 1766 * This buffer has been "accessed" only once so far, 1767 * but it is still in the cache. Move it to the MFU 1768 * state. 1769 */ 1770 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1771 /* 1772 * More than 125ms have passed since we 1773 * instantiated this buffer. Move it to the 1774 * most frequently used state. 1775 */ 1776 buf->b_arc_access = lbolt; 1777 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1778 arc_change_state(arc_mfu, buf, hash_lock); 1779 } 1780 ARCSTAT_BUMP(arcstat_mru_hits); 1781 } else if (buf->b_state == arc_mru_ghost) { 1782 arc_state_t *new_state; 1783 /* 1784 * This buffer has been "accessed" recently, but 1785 * was evicted from the cache. Move it to the 1786 * MFU state. 1787 */ 1788 1789 if (buf->b_flags & ARC_PREFETCH) { 1790 new_state = arc_mru; 1791 if (refcount_count(&buf->b_refcnt) > 0) 1792 buf->b_flags &= ~ARC_PREFETCH; 1793 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1794 } else { 1795 new_state = arc_mfu; 1796 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1797 } 1798 1799 buf->b_arc_access = lbolt; 1800 arc_change_state(new_state, buf, hash_lock); 1801 1802 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 1803 } else if (buf->b_state == arc_mfu) { 1804 /* 1805 * This buffer has been accessed more than once and is 1806 * still in the cache. Keep it in the MFU state. 1807 * 1808 * NOTE: an add_reference() that occurred when we did 1809 * the arc_read() will have kicked this off the list. 
1810 * If it was a prefetch, we will explicitly move it to 1811 * the head of the list now. 1812 */ 1813 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1814 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1815 ASSERT(list_link_active(&buf->b_arc_node)); 1816 mutex_enter(&arc_mfu->arcs_mtx); 1817 list_remove(&arc_mfu->arcs_list, buf); 1818 list_insert_head(&arc_mfu->arcs_list, buf); 1819 mutex_exit(&arc_mfu->arcs_mtx); 1820 } 1821 ARCSTAT_BUMP(arcstat_mfu_hits); 1822 buf->b_arc_access = lbolt; 1823 } else if (buf->b_state == arc_mfu_ghost) { 1824 arc_state_t *new_state = arc_mfu; 1825 /* 1826 * This buffer has been accessed more than once but has 1827 * been evicted from the cache. Move it back to the 1828 * MFU state. 1829 */ 1830 1831 if (buf->b_flags & ARC_PREFETCH) { 1832 /* 1833 * This is a prefetch access... 1834 * move this block back to the MRU state. 1835 */ 1836 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1837 new_state = arc_mru; 1838 } 1839 1840 buf->b_arc_access = lbolt; 1841 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1842 arc_change_state(new_state, buf, hash_lock); 1843 1844 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 1845 } else { 1846 ASSERT(!"invalid arc state"); 1847 } 1848 } 1849 1850 /* a generic arc_done_func_t which you can use */ 1851 /* ARGSUSED */ 1852 void 1853 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1854 { 1855 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1856 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1857 } 1858 1859 /* a generic arc_done_func_t which you can use */ 1860 void 1861 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1862 { 1863 arc_buf_t **bufp = arg; 1864 if (zio && zio->io_error) { 1865 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1866 *bufp = NULL; 1867 } else { 1868 *bufp = buf; 1869 } 1870 } 1871 1872 static void 1873 arc_read_done(zio_t *zio) 1874 { 1875 arc_buf_hdr_t *hdr, *found; 1876 arc_buf_t *buf; 1877 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1878 kmutex_t *hash_lock; 1879 
arc_callback_t *callback_list, *acb; 1880 int freeable = FALSE; 1881 1882 buf = zio->io_private; 1883 hdr = buf->b_hdr; 1884 1885 /* 1886 * The hdr was inserted into hash-table and removed from lists 1887 * prior to starting I/O. We should find this header, since 1888 * it's in the hash table, and it should be legit since it's 1889 * not possible to evict it during the I/O. The only possible 1890 * reason for it not to be found is if we were freed during the 1891 * read. 1892 */ 1893 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1894 &hash_lock); 1895 1896 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1897 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1898 1899 /* byteswap if necessary */ 1900 callback_list = hdr->b_acb; 1901 ASSERT(callback_list != NULL); 1902 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1903 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1904 1905 arc_cksum_compute(buf); 1906 1907 /* create copies of the data buffer for the callers */ 1908 abuf = buf; 1909 for (acb = callback_list; acb; acb = acb->acb_next) { 1910 if (acb->acb_done) { 1911 if (abuf == NULL) 1912 abuf = arc_buf_clone(buf); 1913 acb->acb_buf = abuf; 1914 abuf = NULL; 1915 } 1916 } 1917 hdr->b_acb = NULL; 1918 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1919 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1920 if (abuf == buf) 1921 hdr->b_flags |= ARC_BUF_AVAILABLE; 1922 1923 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 1924 1925 if (zio->io_error != 0) { 1926 hdr->b_flags |= ARC_IO_ERROR; 1927 if (hdr->b_state != arc_anon) 1928 arc_change_state(arc_anon, hdr, hash_lock); 1929 if (HDR_IN_HASH_TABLE(hdr)) 1930 buf_hash_remove(hdr); 1931 freeable = refcount_is_zero(&hdr->b_refcnt); 1932 /* convert checksum errors into IO errors */ 1933 if (zio->io_error == ECKSUM) 1934 zio->io_error = EIO; 1935 } 1936 1937 /* 1938 * Broadcast before we drop the hash_lock to avoid the possibility 1939 * that 
the hdr (and hence the cv) might be freed before we get to 1940 * the cv_broadcast(). 1941 */ 1942 cv_broadcast(&hdr->b_cv); 1943 1944 if (hash_lock) { 1945 /* 1946 * Only call arc_access on anonymous buffers. This is because 1947 * if we've issued an I/O for an evicted buffer, we've already 1948 * called arc_access (to prevent any simultaneous readers from 1949 * getting confused). 1950 */ 1951 if (zio->io_error == 0 && hdr->b_state == arc_anon) 1952 arc_access(hdr, hash_lock); 1953 mutex_exit(hash_lock); 1954 } else { 1955 /* 1956 * This block was freed while we waited for the read to 1957 * complete. It has been removed from the hash table and 1958 * moved to the anonymous state (so that it won't show up 1959 * in the cache). 1960 */ 1961 ASSERT3P(hdr->b_state, ==, arc_anon); 1962 freeable = refcount_is_zero(&hdr->b_refcnt); 1963 } 1964 1965 /* execute each callback and free its structure */ 1966 while ((acb = callback_list) != NULL) { 1967 if (acb->acb_done) 1968 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 1969 1970 if (acb->acb_zio_dummy != NULL) { 1971 acb->acb_zio_dummy->io_error = zio->io_error; 1972 zio_nowait(acb->acb_zio_dummy); 1973 } 1974 1975 callback_list = acb->acb_next; 1976 kmem_free(acb, sizeof (arc_callback_t)); 1977 } 1978 1979 if (freeable) 1980 arc_hdr_destroy(hdr); 1981 } 1982 1983 /* 1984 * "Read" the block block at the specified DVA (in bp) via the 1985 * cache. If the block is found in the cache, invoke the provided 1986 * callback immediately and return. Note that the `zio' parameter 1987 * in the callback will be NULL in this case, since no IO was 1988 * required. If the block is not in the cache pass the read request 1989 * on to the spa with a substitute callback function, so that the 1990 * requested block will be added to the cache. 
1991 * 1992 * If a read request arrives for a block that has a read in-progress, 1993 * either wait for the in-progress read to complete (and return the 1994 * results); or, if this is a read with a "done" func, add a record 1995 * to the read to invoke the "done" func when the read completes, 1996 * and return; or just return. 1997 * 1998 * arc_read_done() will invoke all the requested "done" functions 1999 * for readers of this block. 2000 */ 2001 int 2002 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 2003 arc_done_func_t *done, void *private, int priority, int flags, 2004 uint32_t *arc_flags, zbookmark_t *zb) 2005 { 2006 arc_buf_hdr_t *hdr; 2007 arc_buf_t *buf; 2008 kmutex_t *hash_lock; 2009 zio_t *rzio; 2010 2011 top: 2012 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2013 if (hdr && hdr->b_datacnt > 0) { 2014 2015 *arc_flags |= ARC_CACHED; 2016 2017 if (HDR_IO_IN_PROGRESS(hdr)) { 2018 2019 if (*arc_flags & ARC_WAIT) { 2020 cv_wait(&hdr->b_cv, hash_lock); 2021 mutex_exit(hash_lock); 2022 goto top; 2023 } 2024 ASSERT(*arc_flags & ARC_NOWAIT); 2025 2026 if (done) { 2027 arc_callback_t *acb = NULL; 2028 2029 acb = kmem_zalloc(sizeof (arc_callback_t), 2030 KM_SLEEP); 2031 acb->acb_done = done; 2032 acb->acb_private = private; 2033 acb->acb_byteswap = swap; 2034 if (pio != NULL) 2035 acb->acb_zio_dummy = zio_null(pio, 2036 spa, NULL, NULL, flags); 2037 2038 ASSERT(acb->acb_done != NULL); 2039 acb->acb_next = hdr->b_acb; 2040 hdr->b_acb = acb; 2041 add_reference(hdr, hash_lock, private); 2042 mutex_exit(hash_lock); 2043 return (0); 2044 } 2045 mutex_exit(hash_lock); 2046 return (0); 2047 } 2048 2049 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2050 2051 if (done) { 2052 add_reference(hdr, hash_lock, private); 2053 /* 2054 * If this block is already in use, create a new 2055 * copy of the data so that we will be guaranteed 2056 * that arc_release() will always succeed. 
2057 */ 2058 buf = hdr->b_buf; 2059 ASSERT(buf); 2060 ASSERT(buf->b_data); 2061 if (HDR_BUF_AVAILABLE(hdr)) { 2062 ASSERT(buf->b_efunc == NULL); 2063 hdr->b_flags &= ~ARC_BUF_AVAILABLE; 2064 } else { 2065 buf = arc_buf_clone(buf); 2066 } 2067 } else if (*arc_flags & ARC_PREFETCH && 2068 refcount_count(&hdr->b_refcnt) == 0) { 2069 hdr->b_flags |= ARC_PREFETCH; 2070 } 2071 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 2072 arc_access(hdr, hash_lock); 2073 mutex_exit(hash_lock); 2074 ARCSTAT_BUMP(arcstat_hits); 2075 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2076 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2077 data, metadata, hits); 2078 2079 if (done) 2080 done(NULL, buf, private); 2081 } else { 2082 uint64_t size = BP_GET_LSIZE(bp); 2083 arc_callback_t *acb; 2084 2085 if (hdr == NULL) { 2086 /* this block is not in the cache */ 2087 arc_buf_hdr_t *exists; 2088 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 2089 buf = arc_buf_alloc(spa, size, private, type); 2090 hdr = buf->b_hdr; 2091 hdr->b_dva = *BP_IDENTITY(bp); 2092 hdr->b_birth = bp->blk_birth; 2093 hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; 2094 exists = buf_hash_insert(hdr, &hash_lock); 2095 if (exists) { 2096 /* somebody beat us to the hash insert */ 2097 mutex_exit(hash_lock); 2098 bzero(&hdr->b_dva, sizeof (dva_t)); 2099 hdr->b_birth = 0; 2100 hdr->b_cksum0 = 0; 2101 (void) arc_buf_remove_ref(buf, private); 2102 goto top; /* restart the IO request */ 2103 } 2104 /* if this is a prefetch, we don't have a reference */ 2105 if (*arc_flags & ARC_PREFETCH) { 2106 (void) remove_reference(hdr, hash_lock, 2107 private); 2108 hdr->b_flags |= ARC_PREFETCH; 2109 } 2110 if (BP_GET_LEVEL(bp) > 0) 2111 hdr->b_flags |= ARC_INDIRECT; 2112 } else { 2113 /* this block is in the ghost cache */ 2114 ASSERT(GHOST_STATE(hdr->b_state)); 2115 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2116 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0); 2117 ASSERT(hdr->b_buf == NULL); 2118 2119 /* if this is a prefetch, we don't have a 
reference */ 2120 if (*arc_flags & ARC_PREFETCH) 2121 hdr->b_flags |= ARC_PREFETCH; 2122 else 2123 add_reference(hdr, hash_lock, private); 2124 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 2125 buf->b_hdr = hdr; 2126 buf->b_data = NULL; 2127 buf->b_efunc = NULL; 2128 buf->b_private = NULL; 2129 buf->b_next = NULL; 2130 hdr->b_buf = buf; 2131 arc_get_data_buf(buf); 2132 ASSERT(hdr->b_datacnt == 0); 2133 hdr->b_datacnt = 1; 2134 2135 } 2136 2137 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 2138 acb->acb_done = done; 2139 acb->acb_private = private; 2140 acb->acb_byteswap = swap; 2141 2142 ASSERT(hdr->b_acb == NULL); 2143 hdr->b_acb = acb; 2144 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2145 2146 /* 2147 * If the buffer has been evicted, migrate it to a present state 2148 * before issuing the I/O. Once we drop the hash-table lock, 2149 * the header will be marked as I/O in progress and have an 2150 * attached buffer. At this point, anybody who finds this 2151 * buffer ought to notice that it's legit but has a pending I/O. 2152 */ 2153 2154 if (GHOST_STATE(hdr->b_state)) 2155 arc_access(hdr, hash_lock); 2156 mutex_exit(hash_lock); 2157 2158 ASSERT3U(hdr->b_size, ==, size); 2159 DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, 2160 zbookmark_t *, zb); 2161 ARCSTAT_BUMP(arcstat_misses); 2162 ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), 2163 demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, 2164 data, metadata, misses); 2165 2166 rzio = zio_read(pio, spa, bp, buf->b_data, size, 2167 arc_read_done, buf, priority, flags, zb); 2168 2169 if (*arc_flags & ARC_WAIT) 2170 return (zio_wait(rzio)); 2171 2172 ASSERT(*arc_flags & ARC_NOWAIT); 2173 zio_nowait(rzio); 2174 } 2175 return (0); 2176 } 2177 2178 /* 2179 * arc_read() variant to support pool traversal. If the block is already 2180 * in the ARC, make a copy of it; otherwise, the caller will do the I/O. 
2181 * The idea is that we don't want pool traversal filling up memory, but 2182 * if the ARC already has the data anyway, we shouldn't pay for the I/O. 2183 */ 2184 int 2185 arc_tryread(spa_t *spa, blkptr_t *bp, void *data) 2186 { 2187 arc_buf_hdr_t *hdr; 2188 kmutex_t *hash_mtx; 2189 int rc = 0; 2190 2191 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); 2192 2193 if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { 2194 arc_buf_t *buf = hdr->b_buf; 2195 2196 ASSERT(buf); 2197 while (buf->b_data == NULL) { 2198 buf = buf->b_next; 2199 ASSERT(buf); 2200 } 2201 bcopy(buf->b_data, data, hdr->b_size); 2202 } else { 2203 rc = ENOENT; 2204 } 2205 2206 if (hash_mtx) 2207 mutex_exit(hash_mtx); 2208 2209 return (rc); 2210 } 2211 2212 void 2213 arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) 2214 { 2215 ASSERT(buf->b_hdr != NULL); 2216 ASSERT(buf->b_hdr->b_state != arc_anon); 2217 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); 2218 buf->b_efunc = func; 2219 buf->b_private = private; 2220 } 2221 2222 /* 2223 * This is used by the DMU to let the ARC know that a buffer is 2224 * being evicted, so the ARC should clean up. If this arc buf 2225 * is not yet in the evicted state, it will be put there. 2226 */ 2227 int 2228 arc_buf_evict(arc_buf_t *buf) 2229 { 2230 arc_buf_hdr_t *hdr; 2231 kmutex_t *hash_lock; 2232 arc_buf_t **bufp; 2233 2234 mutex_enter(&arc_eviction_mtx); 2235 hdr = buf->b_hdr; 2236 if (hdr == NULL) { 2237 /* 2238 * We are in arc_do_user_evicts(). 2239 */ 2240 ASSERT(buf->b_data == NULL); 2241 mutex_exit(&arc_eviction_mtx); 2242 return (0); 2243 } 2244 hash_lock = HDR_LOCK(hdr); 2245 mutex_exit(&arc_eviction_mtx); 2246 2247 mutex_enter(hash_lock); 2248 2249 if (buf->b_data == NULL) { 2250 /* 2251 * We are on the eviction list. 2252 */ 2253 mutex_exit(hash_lock); 2254 mutex_enter(&arc_eviction_mtx); 2255 if (buf->b_hdr == NULL) { 2256 /* 2257 * We are already in arc_do_user_evicts(). 
2258 */ 2259 mutex_exit(&arc_eviction_mtx); 2260 return (0); 2261 } else { 2262 arc_buf_t copy = *buf; /* structure assignment */ 2263 /* 2264 * Process this buffer now 2265 * but let arc_do_user_evicts() do the reaping. 2266 */ 2267 buf->b_efunc = NULL; 2268 mutex_exit(&arc_eviction_mtx); 2269 VERIFY(copy.b_efunc(©) == 0); 2270 return (1); 2271 } 2272 } 2273 2274 ASSERT(buf->b_hdr == hdr); 2275 ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt); 2276 ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); 2277 2278 /* 2279 * Pull this buffer off of the hdr 2280 */ 2281 bufp = &hdr->b_buf; 2282 while (*bufp != buf) 2283 bufp = &(*bufp)->b_next; 2284 *bufp = buf->b_next; 2285 2286 ASSERT(buf->b_data != NULL); 2287 arc_buf_destroy(buf, FALSE, FALSE); 2288 2289 if (hdr->b_datacnt == 0) { 2290 arc_state_t *old_state = hdr->b_state; 2291 arc_state_t *evicted_state; 2292 2293 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 2294 2295 evicted_state = 2296 (old_state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 2297 2298 mutex_enter(&old_state->arcs_mtx); 2299 mutex_enter(&evicted_state->arcs_mtx); 2300 2301 arc_change_state(evicted_state, hdr, hash_lock); 2302 ASSERT(HDR_IN_HASH_TABLE(hdr)); 2303 hdr->b_flags = ARC_IN_HASH_TABLE; 2304 2305 mutex_exit(&evicted_state->arcs_mtx); 2306 mutex_exit(&old_state->arcs_mtx); 2307 } 2308 mutex_exit(hash_lock); 2309 2310 VERIFY(buf->b_efunc(buf) == 0); 2311 buf->b_efunc = NULL; 2312 buf->b_private = NULL; 2313 buf->b_hdr = NULL; 2314 kmem_cache_free(buf_cache, buf); 2315 return (1); 2316 } 2317 2318 /* 2319 * Release this buffer from the cache. This must be done 2320 * after a read and prior to modifying the buffer contents. 2321 * If the buffer has more than one reference, we must make 2322 * make a new hdr for the buffer. 
2323 */ 2324 void 2325 arc_release(arc_buf_t *buf, void *tag) 2326 { 2327 arc_buf_hdr_t *hdr = buf->b_hdr; 2328 kmutex_t *hash_lock = HDR_LOCK(hdr); 2329 2330 /* this buffer is not on any list */ 2331 ASSERT(refcount_count(&hdr->b_refcnt) > 0); 2332 2333 if (hdr->b_state == arc_anon) { 2334 /* this buffer is already released */ 2335 ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1); 2336 ASSERT(BUF_EMPTY(hdr)); 2337 ASSERT(buf->b_efunc == NULL); 2338 arc_buf_thaw(buf); 2339 return; 2340 } 2341 2342 mutex_enter(hash_lock); 2343 2344 /* 2345 * Do we have more than one buf? 2346 */ 2347 if (hdr->b_buf != buf || buf->b_next != NULL) { 2348 arc_buf_hdr_t *nhdr; 2349 arc_buf_t **bufp; 2350 uint64_t blksz = hdr->b_size; 2351 spa_t *spa = hdr->b_spa; 2352 arc_buf_contents_t type = hdr->b_type; 2353 2354 ASSERT(hdr->b_datacnt > 1); 2355 /* 2356 * Pull the data off of this buf and attach it to 2357 * a new anonymous buf. 2358 */ 2359 (void) remove_reference(hdr, hash_lock, tag); 2360 bufp = &hdr->b_buf; 2361 while (*bufp != buf) 2362 bufp = &(*bufp)->b_next; 2363 *bufp = (*bufp)->b_next; 2364 2365 ASSERT3U(hdr->b_state->arcs_size, >=, hdr->b_size); 2366 atomic_add_64(&hdr->b_state->arcs_size, -hdr->b_size); 2367 if (refcount_is_zero(&hdr->b_refcnt)) { 2368 ASSERT3U(hdr->b_state->arcs_lsize, >=, hdr->b_size); 2369 atomic_add_64(&hdr->b_state->arcs_lsize, -hdr->b_size); 2370 } 2371 hdr->b_datacnt -= 1; 2372 arc_cksum_verify(buf); 2373 2374 mutex_exit(hash_lock); 2375 2376 nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 2377 nhdr->b_size = blksz; 2378 nhdr->b_spa = spa; 2379 nhdr->b_type = type; 2380 nhdr->b_buf = buf; 2381 nhdr->b_state = arc_anon; 2382 nhdr->b_arc_access = 0; 2383 nhdr->b_flags = 0; 2384 nhdr->b_datacnt = 1; 2385 nhdr->b_freeze_cksum = NULL; 2386 buf->b_hdr = nhdr; 2387 buf->b_next = NULL; 2388 (void) refcount_add(&nhdr->b_refcnt, tag); 2389 atomic_add_64(&arc_anon->arcs_size, blksz); 2390 2391 hdr = nhdr; 2392 } else { 2393 ASSERT(refcount_count(&hdr->b_refcnt) 
== 1); 2394 ASSERT(!list_link_active(&hdr->b_arc_node)); 2395 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 2396 arc_change_state(arc_anon, hdr, hash_lock); 2397 hdr->b_arc_access = 0; 2398 mutex_exit(hash_lock); 2399 bzero(&hdr->b_dva, sizeof (dva_t)); 2400 hdr->b_birth = 0; 2401 hdr->b_cksum0 = 0; 2402 arc_buf_thaw(buf); 2403 } 2404 buf->b_efunc = NULL; 2405 buf->b_private = NULL; 2406 } 2407 2408 int 2409 arc_released(arc_buf_t *buf) 2410 { 2411 return (buf->b_data != NULL && buf->b_hdr->b_state == arc_anon); 2412 } 2413 2414 int 2415 arc_has_callback(arc_buf_t *buf) 2416 { 2417 return (buf->b_efunc != NULL); 2418 } 2419 2420 #ifdef ZFS_DEBUG 2421 int 2422 arc_referenced(arc_buf_t *buf) 2423 { 2424 return (refcount_count(&buf->b_hdr->b_refcnt)); 2425 } 2426 #endif 2427 2428 static void 2429 arc_write_ready(zio_t *zio) 2430 { 2431 arc_write_callback_t *callback = zio->io_private; 2432 arc_buf_t *buf = callback->awcb_buf; 2433 2434 if (callback->awcb_ready) { 2435 ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt)); 2436 callback->awcb_ready(zio, buf, callback->awcb_private); 2437 } 2438 arc_cksum_compute(buf); 2439 } 2440 2441 static void 2442 arc_write_done(zio_t *zio) 2443 { 2444 arc_write_callback_t *callback = zio->io_private; 2445 arc_buf_t *buf = callback->awcb_buf; 2446 arc_buf_hdr_t *hdr = buf->b_hdr; 2447 2448 hdr->b_acb = NULL; 2449 2450 /* this buffer is on no lists and is not in the hash table */ 2451 ASSERT3P(hdr->b_state, ==, arc_anon); 2452 2453 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 2454 hdr->b_birth = zio->io_bp->blk_birth; 2455 hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; 2456 /* 2457 * If the block to be written was all-zero, we may have 2458 * compressed it away. In this case no write was performed 2459 * so there will be no dva/birth-date/checksum. The buffer 2460 * must therefor remain anonymous (and uncached). 
2461 */ 2462 if (!BUF_EMPTY(hdr)) { 2463 arc_buf_hdr_t *exists; 2464 kmutex_t *hash_lock; 2465 2466 arc_cksum_verify(buf); 2467 2468 exists = buf_hash_insert(hdr, &hash_lock); 2469 if (exists) { 2470 /* 2471 * This can only happen if we overwrite for 2472 * sync-to-convergence, because we remove 2473 * buffers from the hash table when we arc_free(). 2474 */ 2475 ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), 2476 BP_IDENTITY(zio->io_bp))); 2477 ASSERT3U(zio->io_bp_orig.blk_birth, ==, 2478 zio->io_bp->blk_birth); 2479 2480 ASSERT(refcount_is_zero(&exists->b_refcnt)); 2481 arc_change_state(arc_anon, exists, hash_lock); 2482 mutex_exit(hash_lock); 2483 arc_hdr_destroy(exists); 2484 exists = buf_hash_insert(hdr, &hash_lock); 2485 ASSERT3P(exists, ==, NULL); 2486 } 2487 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2488 arc_access(hdr, hash_lock); 2489 mutex_exit(hash_lock); 2490 } else if (callback->awcb_done == NULL) { 2491 int destroy_hdr; 2492 /* 2493 * This is an anonymous buffer with no user callback, 2494 * destroy it if there are no active references. 
2495 */ 2496 mutex_enter(&arc_eviction_mtx); 2497 destroy_hdr = refcount_is_zero(&hdr->b_refcnt); 2498 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2499 mutex_exit(&arc_eviction_mtx); 2500 if (destroy_hdr) 2501 arc_hdr_destroy(hdr); 2502 } else { 2503 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 2504 } 2505 2506 if (callback->awcb_done) { 2507 ASSERT(!refcount_is_zero(&hdr->b_refcnt)); 2508 callback->awcb_done(zio, buf, callback->awcb_private); 2509 } 2510 2511 kmem_free(callback, sizeof (arc_write_callback_t)); 2512 } 2513 2514 zio_t * 2515 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies, 2516 uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 2517 arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, 2518 int flags, zbookmark_t *zb) 2519 { 2520 arc_buf_hdr_t *hdr = buf->b_hdr; 2521 arc_write_callback_t *callback; 2522 zio_t *zio; 2523 2524 /* this is a private buffer - no locking required */ 2525 ASSERT3P(hdr->b_state, ==, arc_anon); 2526 ASSERT(BUF_EMPTY(hdr)); 2527 ASSERT(!HDR_IO_ERROR(hdr)); 2528 ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); 2529 ASSERT(hdr->b_acb == 0); 2530 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 2531 callback->awcb_ready = ready; 2532 callback->awcb_done = done; 2533 callback->awcb_private = private; 2534 callback->awcb_buf = buf; 2535 hdr->b_flags |= ARC_IO_IN_PROGRESS; 2536 zio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp, 2537 buf->b_data, hdr->b_size, arc_write_ready, arc_write_done, callback, 2538 priority, flags, zb); 2539 2540 return (zio); 2541 } 2542 2543 int 2544 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, 2545 zio_done_func_t *done, void *private, uint32_t arc_flags) 2546 { 2547 arc_buf_hdr_t *ab; 2548 kmutex_t *hash_lock; 2549 zio_t *zio; 2550 2551 /* 2552 * If this buffer is in the cache, release it, so it 2553 * can be re-used. 
2554 */ 2555 ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 2556 if (ab != NULL) { 2557 /* 2558 * The checksum of blocks to free is not always 2559 * preserved (eg. on the deadlist). However, if it is 2560 * nonzero, it should match what we have in the cache. 2561 */ 2562 ASSERT(bp->blk_cksum.zc_word[0] == 0 || 2563 ab->b_cksum0 == bp->blk_cksum.zc_word[0]); 2564 if (ab->b_state != arc_anon) 2565 arc_change_state(arc_anon, ab, hash_lock); 2566 if (HDR_IO_IN_PROGRESS(ab)) { 2567 /* 2568 * This should only happen when we prefetch. 2569 */ 2570 ASSERT(ab->b_flags & ARC_PREFETCH); 2571 ASSERT3U(ab->b_datacnt, ==, 1); 2572 ab->b_flags |= ARC_FREED_IN_READ; 2573 if (HDR_IN_HASH_TABLE(ab)) 2574 buf_hash_remove(ab); 2575 ab->b_arc_access = 0; 2576 bzero(&ab->b_dva, sizeof (dva_t)); 2577 ab->b_birth = 0; 2578 ab->b_cksum0 = 0; 2579 ab->b_buf->b_efunc = NULL; 2580 ab->b_buf->b_private = NULL; 2581 mutex_exit(hash_lock); 2582 } else if (refcount_is_zero(&ab->b_refcnt)) { 2583 mutex_exit(hash_lock); 2584 arc_hdr_destroy(ab); 2585 ARCSTAT_BUMP(arcstat_deleted); 2586 } else { 2587 /* 2588 * We still have an active reference on this 2589 * buffer. This can happen, e.g., from 2590 * dbuf_unoverride(). 
2591 */ 2592 ASSERT(!HDR_IN_HASH_TABLE(ab)); 2593 ab->b_arc_access = 0; 2594 bzero(&ab->b_dva, sizeof (dva_t)); 2595 ab->b_birth = 0; 2596 ab->b_cksum0 = 0; 2597 ab->b_buf->b_efunc = NULL; 2598 ab->b_buf->b_private = NULL; 2599 mutex_exit(hash_lock); 2600 } 2601 } 2602 2603 zio = zio_free(pio, spa, txg, bp, done, private); 2604 2605 if (arc_flags & ARC_WAIT) 2606 return (zio_wait(zio)); 2607 2608 ASSERT(arc_flags & ARC_NOWAIT); 2609 zio_nowait(zio); 2610 2611 return (0); 2612 } 2613 2614 void 2615 arc_tempreserve_clear(uint64_t tempreserve) 2616 { 2617 atomic_add_64(&arc_tempreserve, -tempreserve); 2618 ASSERT((int64_t)arc_tempreserve >= 0); 2619 } 2620 2621 int 2622 arc_tempreserve_space(uint64_t tempreserve) 2623 { 2624 #ifdef ZFS_DEBUG 2625 /* 2626 * Once in a while, fail for no reason. Everything should cope. 2627 */ 2628 if (spa_get_random(10000) == 0) { 2629 dprintf("forcing random failure\n"); 2630 return (ERESTART); 2631 } 2632 #endif 2633 if (tempreserve > arc_c/4 && !arc_no_grow) 2634 arc_c = MIN(arc_c_max, tempreserve * 4); 2635 if (tempreserve > arc_c) 2636 return (ENOMEM); 2637 2638 /* 2639 * Throttle writes when the amount of dirty data in the cache 2640 * gets too large. We try to keep the cache less than half full 2641 * of dirty blocks so that our sync times don't grow too large. 2642 * Note: if two requests come in concurrently, we might let them 2643 * both succeed, when one of them should fail. Not a huge deal. 2644 * 2645 * XXX The limit should be adjusted dynamically to keep the time 2646 * to sync a dataset fixed (around 1-5 seconds?). 
2647 */ 2648 2649 if (tempreserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && 2650 arc_tempreserve + arc_anon->arcs_size > arc_c / 4) { 2651 dprintf("failing, arc_tempreserve=%lluK anon=%lluK " 2652 "tempreserve=%lluK arc_c=%lluK\n", 2653 arc_tempreserve>>10, arc_anon->arcs_lsize>>10, 2654 tempreserve>>10, arc_c>>10); 2655 return (ERESTART); 2656 } 2657 atomic_add_64(&arc_tempreserve, tempreserve); 2658 return (0); 2659 } 2660 2661 void 2662 arc_init(void) 2663 { 2664 mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL); 2665 cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL); 2666 2667 /* Convert seconds to clock ticks */ 2668 arc_min_prefetch_lifespan = 1 * hz; 2669 2670 /* Start out with 1/8 of all memory */ 2671 arc_c = physmem * PAGESIZE / 8; 2672 2673 #ifdef _KERNEL 2674 /* 2675 * On architectures where the physical memory can be larger 2676 * than the addressable space (intel in 32-bit mode), we may 2677 * need to limit the cache to 1/8 of VM size. 2678 */ 2679 arc_c = MIN(arc_c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8); 2680 #endif 2681 2682 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 2683 arc_c_min = MAX(arc_c / 4, 64<<20); 2684 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 2685 if (arc_c * 8 >= 1<<30) 2686 arc_c_max = (arc_c * 8) - (1<<30); 2687 else 2688 arc_c_max = arc_c_min; 2689 arc_c_max = MAX(arc_c * 6, arc_c_max); 2690 2691 /* 2692 * Allow the tunables to override our calculations if they are 2693 * reasonable (ie. 
over 64MB) 2694 */ 2695 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 2696 arc_c_max = zfs_arc_max; 2697 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc_c_max) 2698 arc_c_min = zfs_arc_min; 2699 2700 arc_c = arc_c_max; 2701 arc_p = (arc_c >> 1); 2702 2703 /* if kmem_flags are set, lets try to use less memory */ 2704 if (kmem_debugging()) 2705 arc_c = arc_c / 2; 2706 if (arc_c < arc_c_min) 2707 arc_c = arc_c_min; 2708 2709 arc_anon = &ARC_anon; 2710 arc_mru = &ARC_mru; 2711 arc_mru_ghost = &ARC_mru_ghost; 2712 arc_mfu = &ARC_mfu; 2713 arc_mfu_ghost = &ARC_mfu_ghost; 2714 arc_size = 0; 2715 2716 mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2717 mutex_init(&arc_mru->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2718 mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2719 mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2720 mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); 2721 2722 list_create(&arc_mru->arcs_list, sizeof (arc_buf_hdr_t), 2723 offsetof(arc_buf_hdr_t, b_arc_node)); 2724 list_create(&arc_mru_ghost->arcs_list, sizeof (arc_buf_hdr_t), 2725 offsetof(arc_buf_hdr_t, b_arc_node)); 2726 list_create(&arc_mfu->arcs_list, sizeof (arc_buf_hdr_t), 2727 offsetof(arc_buf_hdr_t, b_arc_node)); 2728 list_create(&arc_mfu_ghost->arcs_list, sizeof (arc_buf_hdr_t), 2729 offsetof(arc_buf_hdr_t, b_arc_node)); 2730 2731 buf_init(); 2732 2733 arc_thread_exit = 0; 2734 arc_eviction_list = NULL; 2735 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2736 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2737 2738 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 2739 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 2740 2741 if (arc_ksp != NULL) { 2742 arc_ksp->ks_data = &arc_stats; 2743 kstat_install(arc_ksp); 2744 } 2745 2746 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2747 TS_RUN, minclsyspri); 2748 2749 arc_dead = FALSE; 2750 2751 #ifdef _KERNEL 2752 
if (zio_arena != NULL) 2753 arc_ziosize = 2754 btop(vmem_size(zio_arena, VMEM_FREE | VMEM_ALLOC)); 2755 #endif /* _KERNEL */ 2756 } 2757 2758 void 2759 arc_fini(void) 2760 { 2761 mutex_enter(&arc_reclaim_thr_lock); 2762 arc_thread_exit = 1; 2763 while (arc_thread_exit != 0) 2764 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2765 mutex_exit(&arc_reclaim_thr_lock); 2766 2767 arc_flush(); 2768 2769 arc_dead = TRUE; 2770 2771 if (arc_ksp != NULL) { 2772 kstat_delete(arc_ksp); 2773 arc_ksp = NULL; 2774 } 2775 2776 mutex_destroy(&arc_eviction_mtx); 2777 mutex_destroy(&arc_reclaim_thr_lock); 2778 cv_destroy(&arc_reclaim_thr_cv); 2779 2780 list_destroy(&arc_mru->arcs_list); 2781 list_destroy(&arc_mru_ghost->arcs_list); 2782 list_destroy(&arc_mfu->arcs_list); 2783 list_destroy(&arc_mfu_ghost->arcs_list); 2784 2785 mutex_destroy(&arc_anon->arcs_mtx); 2786 mutex_destroy(&arc_mru->arcs_mtx); 2787 mutex_destroy(&arc_mru_ghost->arcs_mtx); 2788 mutex_destroy(&arc_mfu->arcs_mtx); 2789 mutex_destroy(&arc_mfu_ghost->arcs_mtx); 2790 2791 buf_fini(); 2792 } 2793