/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory.  This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about.  Our cache is not so simple.  At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them.  Blocks are only evictable
 * when there are no external references active.  This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space.  In these circumstances we are unable to adjust the cache
 * size.  To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss.  Our model has a variable sized cache.  It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size.  All
 * elements of the cache are therefore exactly the same size.  So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict.  In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes).  We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists.
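 * (As an illustration of method 1 -- this mirrors what arc_read()
 * below does on entry -- a lookup takes the form
 *
 *	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
 *
 * and, on success, returns the header with the appropriate hash
 * table mutex held.)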
 * The arc_read() interface
 * uses method 1, while the internal arc algorithms for
 * adjusting the cache use method 2.  We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * arc list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table.  It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each arc state also has a mutex which is used to protect the
 * buffer list associated with the state.  When attempting to
 * obtain a hash table lock while holding an arc list lock you
 * must use mutex_tryenter() to avoid deadlock.  Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Arc buffers may have an associated eviction callback function.
 * This function will be invoked prior to removing the buffer (e.g.
 * in arc_do_user_evicts()).  Note however that the data associated
 * with the buffer may be evicted prior to the callback.  The callback
 * must be made with *no locks held* (to prevent deadlock).  Additionally,
 * the users of callbacks must ensure that their private data is
 * protected from simultaneous callbacks from arc_buf_evict()
 * and arc_do_user_evicts().
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 */

#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_context.h>
#include <sys/arc.h>
#include <sys/refcount.h>
#ifdef _KERNEL
#include <sys/vmsystm.h>
#include <vm/anon.h>
#include <sys/fs/swapnode.h>
#include <sys/dnlc.h>
#endif
#include <sys/callb.h>

static kmutex_t		arc_reclaim_thr_lock;
static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
static uint8_t		arc_thread_exit;

#define	ARC_REDUCE_DNLC_PERCENT	3
uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;

typedef enum arc_reclaim_strategy {
	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
} arc_reclaim_strategy_t;

/* number of seconds before growing cache again */
static int		arc_grow_retry = 60;

/*
 * minimum lifespan of a prefetch block in clock ticks
 * (initialized in arc_init())
 */
static int		arc_min_prefetch_lifespan;

static int arc_dead;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;

/*
 * Note that buffers can be in one of 5 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 * When there are no active references to a buffer, it is linked
 * onto one of the lists in arc.  These are the only buffers that
 * can be evicted or deleted.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA.
These are buffers that hold dirty block copies 170 * before they are written to stable storage. By definition, 171 * they are "ref'd" and are considered part of arc_mru 172 * that cannot be freed. Generally, they will aquire a DVA 173 * as they are written and migrate onto the arc_mru list. 174 */ 175 176 typedef struct arc_state { 177 list_t list; /* linked list of evictable buffer in state */ 178 uint64_t lsize; /* total size of buffers in the linked list */ 179 uint64_t size; /* total size of all buffers in this state */ 180 uint64_t hits; 181 kmutex_t mtx; 182 } arc_state_t; 183 184 /* The 5 states: */ 185 static arc_state_t ARC_anon; 186 static arc_state_t ARC_mru; 187 static arc_state_t ARC_mru_ghost; 188 static arc_state_t ARC_mfu; 189 static arc_state_t ARC_mfu_ghost; 190 191 static struct arc { 192 arc_state_t *anon; 193 arc_state_t *mru; 194 arc_state_t *mru_ghost; 195 arc_state_t *mfu; 196 arc_state_t *mfu_ghost; 197 uint64_t size; /* Actual total arc size */ 198 uint64_t p; /* Target size (in bytes) of mru */ 199 uint64_t c; /* Target size of cache (in bytes) */ 200 uint64_t c_min; /* Minimum target cache size */ 201 uint64_t c_max; /* Maximum target cache size */ 202 203 /* performance stats */ 204 uint64_t hits; 205 uint64_t misses; 206 uint64_t deleted; 207 uint64_t recycle_miss; 208 uint64_t mutex_miss; 209 uint64_t evict_skip; 210 uint64_t hash_elements; 211 uint64_t hash_elements_max; 212 uint64_t hash_collisions; 213 uint64_t hash_chains; 214 uint32_t hash_chain_max; 215 216 int no_grow; /* Don't try to grow cache size */ 217 } arc; 218 219 static uint64_t arc_tempreserve; 220 221 typedef struct arc_callback arc_callback_t; 222 223 struct arc_callback { 224 arc_done_func_t *acb_done; 225 void *acb_private; 226 arc_byteswap_func_t *acb_byteswap; 227 arc_buf_t *acb_buf; 228 zio_t *acb_zio_dummy; 229 arc_callback_t *acb_next; 230 }; 231 232 struct arc_buf_hdr { 233 /* immutable */ 234 uint64_t b_size; 235 spa_t *b_spa; 236 237 /* protected by hash lock */ 238 dva_t b_dva; 239 uint64_t b_birth; 240 uint64_t b_cksum0; 241 242 kmutex_t b_freeze_lock; 243 zio_cksum_t *b_freeze_cksum; 244 245 arc_buf_hdr_t *b_hash_next; 246 arc_buf_t *b_buf; 247 uint32_t b_flags; 248 uint32_t b_datacnt; 249 250 kcondvar_t b_cv; 251 arc_callback_t *b_acb; 252 253 /* protected by arc state mutex */ 254 arc_state_t *b_state; 255 list_node_t b_arc_node; 256 257 /* updated atomically */ 258 clock_t b_arc_access; 259 260 /* self protecting */ 261 refcount_t b_refcnt; 262 }; 263 264 static arc_buf_t *arc_eviction_list; 265 static kmutex_t arc_eviction_mtx; 266 static arc_buf_hdr_t arc_eviction_hdr; 267 static void arc_get_data_buf(arc_buf_t *buf); 268 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 269 270 #define GHOST_STATE(state) \ 271 ((state) == arc.mru_ghost || (state) == arc.mfu_ghost) 272 273 /* 274 * Private ARC flags. These flags are private ARC only flags that will show up 275 * in b_flags in the arc_hdr_buf_t. Some flags are publicly declared, and can 276 * be passed in as arc_flags in things like arc_read. However, these flags 277 * should never be passed and should only be set by ARC code. When adding new 278 * public flags, make sure not to smash the private ones. 
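 *
 * As a small illustrative sketch (this mirrors what arc_read() does
 * when it starts an I/O), the private flags are normally manipulated
 * with the header's hash table mutex held:
 *
 *	mutex_enter(hash_lock);
 *	if (!HDR_IO_IN_PROGRESS(hdr))
 *		hdr->b_flags |= ARC_IO_IN_PROGRESS;
 *	mutex_exit(hash_lock);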
279 */ 280 281 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 282 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 283 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 284 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 285 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 286 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 287 288 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 289 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 290 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 291 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 292 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 293 294 /* 295 * Hash table routines 296 */ 297 298 #define HT_LOCK_PAD 64 299 300 struct ht_lock { 301 kmutex_t ht_lock; 302 #ifdef _KERNEL 303 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 304 #endif 305 }; 306 307 #define BUF_LOCKS 256 308 typedef struct buf_hash_table { 309 uint64_t ht_mask; 310 arc_buf_hdr_t **ht_table; 311 struct ht_lock ht_locks[BUF_LOCKS]; 312 } buf_hash_table_t; 313 314 static buf_hash_table_t buf_hash_table; 315 316 #define BUF_HASH_INDEX(spa, dva, birth) \ 317 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 318 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 319 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 320 #define HDR_LOCK(buf) \ 321 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 322 323 uint64_t zfs_crc64_table[256]; 324 325 static uint64_t 326 buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 327 { 328 uintptr_t spav = (uintptr_t)spa; 329 uint8_t *vdva = (uint8_t *)dva; 330 uint64_t crc = -1ULL; 331 int i; 332 333 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 334 335 for (i = 0; i < sizeof (dva_t); i++) 336 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 337 338 crc ^= (spav>>8) ^ birth; 339 340 return (crc); 341 } 342 343 #define BUF_EMPTY(buf) \ 344 ((buf)->b_dva.dva_word[0] == 0 && \ 345 (buf)->b_dva.dva_word[1] == 0 && \ 346 (buf)->b_birth == 0) 347 348 #define BUF_EQUAL(spa, dva, birth, buf) \ 349 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 350 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 351 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 352 353 static arc_buf_hdr_t * 354 buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 355 { 356 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 357 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 358 arc_buf_hdr_t *buf; 359 360 mutex_enter(hash_lock); 361 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 362 buf = buf->b_hash_next) { 363 if (BUF_EQUAL(spa, dva, birth, buf)) { 364 *lockp = hash_lock; 365 return (buf); 366 } 367 } 368 mutex_exit(hash_lock); 369 *lockp = NULL; 370 return (NULL); 371 } 372 373 /* 374 * Insert an entry into the hash table. If there is already an element 375 * equal to elem in the hash table, then the already existing element 376 * will be returned and the new element will not be inserted. 377 * Otherwise returns NULL. 
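 *
 * A typical caller (arc_read() below uses this pattern) relies on the
 * return value to detect a race with a concurrent inserter:
 *
 *	exists = buf_hash_insert(hdr, &hash_lock);
 *	if (exists != NULL) {
 *		/* somebody beat us to the hash insert */
 *		mutex_exit(hash_lock);
 *	}
 *
 * Note that the hash lock is held on return in either case.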
378 */ 379 static arc_buf_hdr_t * 380 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 381 { 382 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 383 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 384 arc_buf_hdr_t *fbuf; 385 uint32_t max, i; 386 387 ASSERT(!HDR_IN_HASH_TABLE(buf)); 388 *lockp = hash_lock; 389 mutex_enter(hash_lock); 390 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 391 fbuf = fbuf->b_hash_next, i++) { 392 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 393 return (fbuf); 394 } 395 396 buf->b_hash_next = buf_hash_table.ht_table[idx]; 397 buf_hash_table.ht_table[idx] = buf; 398 buf->b_flags |= ARC_IN_HASH_TABLE; 399 400 /* collect some hash table performance data */ 401 if (i > 0) { 402 atomic_add_64(&arc.hash_collisions, 1); 403 if (i == 1) 404 atomic_add_64(&arc.hash_chains, 1); 405 } 406 while (i > (max = arc.hash_chain_max) && 407 max != atomic_cas_32(&arc.hash_chain_max, max, i)) { 408 continue; 409 } 410 atomic_add_64(&arc.hash_elements, 1); 411 if (arc.hash_elements > arc.hash_elements_max) 412 atomic_add_64(&arc.hash_elements_max, 1); 413 414 return (NULL); 415 } 416 417 static void 418 buf_hash_remove(arc_buf_hdr_t *buf) 419 { 420 arc_buf_hdr_t *fbuf, **bufp; 421 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 422 423 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 424 ASSERT(HDR_IN_HASH_TABLE(buf)); 425 426 bufp = &buf_hash_table.ht_table[idx]; 427 while ((fbuf = *bufp) != buf) { 428 ASSERT(fbuf != NULL); 429 bufp = &fbuf->b_hash_next; 430 } 431 *bufp = buf->b_hash_next; 432 buf->b_hash_next = NULL; 433 buf->b_flags &= ~ARC_IN_HASH_TABLE; 434 435 /* collect some hash table performance data */ 436 atomic_add_64(&arc.hash_elements, -1); 437 if (buf_hash_table.ht_table[idx] && 438 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 439 atomic_add_64(&arc.hash_chains, -1); 440 } 441 442 /* 443 * Global data structures and functions for the buf kmem cache. 444 */ 445 static kmem_cache_t *hdr_cache; 446 static kmem_cache_t *buf_cache; 447 448 static void 449 buf_fini(void) 450 { 451 int i; 452 453 kmem_free(buf_hash_table.ht_table, 454 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 455 for (i = 0; i < BUF_LOCKS; i++) 456 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 457 kmem_cache_destroy(hdr_cache); 458 kmem_cache_destroy(buf_cache); 459 } 460 461 /* 462 * Constructor callback - called when the cache is empty 463 * and a new buf is requested. 464 */ 465 /* ARGSUSED */ 466 static int 467 hdr_cons(void *vbuf, void *unused, int kmflag) 468 { 469 arc_buf_hdr_t *buf = vbuf; 470 471 bzero(buf, sizeof (arc_buf_hdr_t)); 472 refcount_create(&buf->b_refcnt); 473 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 474 return (0); 475 } 476 477 /* 478 * Destructor callback - called when a cached buf is 479 * no longer required. 480 */ 481 /* ARGSUSED */ 482 static void 483 hdr_dest(void *vbuf, void *unused) 484 { 485 arc_buf_hdr_t *buf = vbuf; 486 487 refcount_destroy(&buf->b_refcnt); 488 cv_destroy(&buf->b_cv); 489 } 490 491 /* 492 * Reclaim callback -- invoked when memory is low. 493 */ 494 /* ARGSUSED */ 495 static void 496 hdr_recl(void *unused) 497 { 498 dprintf("hdr_recl called\n"); 499 /* 500 * umem calls the reclaim func when we destroy the buf cache, 501 * which is after we do arc_fini(). 
 */
	if (!arc_dead)
		cv_signal(&arc_reclaim_thr_cv);
}

static void
buf_init(void)
{
	uint64_t *ct;
	uint64_t hsize = 1ULL << 12;
	int i, j;

	/*
	 * The hash table is big enough to fill all of physical memory
	 * with an average 64K block size.  The table will take up
	 * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
	 */
	while (hsize * 65536 < physmem * PAGESIZE)
		hsize <<= 1;
retry:
	buf_hash_table.ht_mask = hsize - 1;
	buf_hash_table.ht_table =
	    kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
	if (buf_hash_table.ht_table == NULL) {
		ASSERT(hsize > (1ULL << 8));
		hsize >>= 1;
		goto retry;
	}

	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
	    0, NULL, NULL, NULL, NULL, NULL, 0);

	/* fill in the zfs_crc64_table used by buf_hash() */
	for (i = 0; i < 256; i++)
		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);

	for (i = 0; i < BUF_LOCKS; i++) {
		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
		    NULL, MUTEX_DEFAULT, NULL);
	}
}

#define	ARC_MINTIME	(hz>>4) /* 62 ms */

static void
arc_cksum_verify(arc_buf_t *buf)
{
	zio_cksum_t zc;

	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum == NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
	if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc))
		panic("buffer modified while frozen!");
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

static void
arc_cksum_compute(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		mutex_exit(&buf->b_hdr->b_freeze_lock);
		return;
	}
	buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
	fletcher_2_native(buf->b_data, buf->b_hdr->b_size,
	    buf->b_hdr->b_freeze_cksum);
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_thaw(arc_buf_t *buf)
{
	if (!(zfs_flags & ZFS_DEBUG_MODIFY))
		return;

	if (buf->b_hdr->b_state != arc.anon)
		panic("modifying non-anon buffer!");
	if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
		panic("modifying buffer while i/o in progress!");
	arc_cksum_verify(buf);
	mutex_enter(&buf->b_hdr->b_freeze_lock);
	if (buf->b_hdr->b_freeze_cksum != NULL) {
		kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
		buf->b_hdr->b_freeze_cksum = NULL;
	}
	mutex_exit(&buf->b_hdr->b_freeze_lock);
}

void
arc_buf_freeze(arc_buf_t *buf)
{
	ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
	    buf->b_hdr->b_state == arc.anon);
	arc_cksum_compute(buf);
}

static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
	ASSERT(MUTEX_HELD(hash_lock));

	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
	    (ab->b_state != arc.anon)) {
		int delta = ab->b_size * ab->b_datacnt;

		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
		mutex_enter(&ab->b_state->mtx);
		ASSERT(list_link_active(&ab->b_arc_node));
		list_remove(&ab->b_state->list, ab);
		if (GHOST_STATE(ab->b_state)) {
			ASSERT3U(ab->b_datacnt, ==, 0);
ASSERT3P(ab->b_buf, ==, NULL); 627 delta = ab->b_size; 628 } 629 ASSERT(delta > 0); 630 ASSERT3U(ab->b_state->lsize, >=, delta); 631 atomic_add_64(&ab->b_state->lsize, -delta); 632 mutex_exit(&ab->b_state->mtx); 633 /* remove the prefetch flag is we get a reference */ 634 if (ab->b_flags & ARC_PREFETCH) 635 ab->b_flags &= ~ARC_PREFETCH; 636 } 637 } 638 639 static int 640 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 641 { 642 int cnt; 643 644 ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock)); 645 ASSERT(!GHOST_STATE(ab->b_state)); 646 647 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 648 (ab->b_state != arc.anon)) { 649 650 ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); 651 mutex_enter(&ab->b_state->mtx); 652 ASSERT(!list_link_active(&ab->b_arc_node)); 653 list_insert_head(&ab->b_state->list, ab); 654 ASSERT(ab->b_datacnt > 0); 655 atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt); 656 ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize); 657 mutex_exit(&ab->b_state->mtx); 658 } 659 return (cnt); 660 } 661 662 /* 663 * Move the supplied buffer to the indicated state. The mutex 664 * for the buffer must be held by the caller. 665 */ 666 static void 667 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 668 { 669 arc_state_t *old_state = ab->b_state; 670 int refcnt = refcount_count(&ab->b_refcnt); 671 int from_delta, to_delta; 672 673 ASSERT(MUTEX_HELD(hash_lock)); 674 ASSERT(new_state != old_state); 675 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 676 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 677 678 from_delta = to_delta = ab->b_datacnt * ab->b_size; 679 680 /* 681 * If this buffer is evictable, transfer it from the 682 * old state list to the new state list. 683 */ 684 if (refcnt == 0) { 685 if (old_state != arc.anon) { 686 int use_mutex = !MUTEX_HELD(&old_state->mtx); 687 688 if (use_mutex) 689 mutex_enter(&old_state->mtx); 690 691 ASSERT(list_link_active(&ab->b_arc_node)); 692 list_remove(&old_state->list, ab); 693 694 /* 695 * If prefetching out of the ghost cache, 696 * we will have a non-null datacnt. 
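 * (arc_read() attaches the new data buffer and sets b_datacnt before
 * it calls arc_access(), so the header leaves the ghost state with
 * data already associated with it.)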
697 */ 698 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 699 /* ghost elements have a ghost size */ 700 ASSERT(ab->b_buf == NULL); 701 from_delta = ab->b_size; 702 } 703 ASSERT3U(old_state->lsize, >=, from_delta); 704 atomic_add_64(&old_state->lsize, -from_delta); 705 706 if (use_mutex) 707 mutex_exit(&old_state->mtx); 708 } 709 if (new_state != arc.anon) { 710 int use_mutex = !MUTEX_HELD(&new_state->mtx); 711 712 if (use_mutex) 713 mutex_enter(&new_state->mtx); 714 715 list_insert_head(&new_state->list, ab); 716 717 /* ghost elements have a ghost size */ 718 if (GHOST_STATE(new_state)) { 719 ASSERT(ab->b_datacnt == 0); 720 ASSERT(ab->b_buf == NULL); 721 to_delta = ab->b_size; 722 } 723 atomic_add_64(&new_state->lsize, to_delta); 724 ASSERT3U(new_state->size + to_delta, >=, 725 new_state->lsize); 726 727 if (use_mutex) 728 mutex_exit(&new_state->mtx); 729 } 730 } 731 732 ASSERT(!BUF_EMPTY(ab)); 733 if (new_state == arc.anon && old_state != arc.anon) { 734 buf_hash_remove(ab); 735 } 736 737 /* adjust state sizes */ 738 if (to_delta) 739 atomic_add_64(&new_state->size, to_delta); 740 if (from_delta) { 741 ASSERT3U(old_state->size, >=, from_delta); 742 atomic_add_64(&old_state->size, -from_delta); 743 } 744 ab->b_state = new_state; 745 } 746 747 arc_buf_t * 748 arc_buf_alloc(spa_t *spa, int size, void *tag) 749 { 750 arc_buf_hdr_t *hdr; 751 arc_buf_t *buf; 752 753 ASSERT3U(size, >, 0); 754 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 755 ASSERT(BUF_EMPTY(hdr)); 756 hdr->b_size = size; 757 hdr->b_spa = spa; 758 hdr->b_state = arc.anon; 759 hdr->b_arc_access = 0; 760 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 761 buf->b_hdr = hdr; 762 buf->b_data = NULL; 763 buf->b_efunc = NULL; 764 buf->b_private = NULL; 765 buf->b_next = NULL; 766 hdr->b_buf = buf; 767 arc_get_data_buf(buf); 768 hdr->b_datacnt = 1; 769 hdr->b_flags = 0; 770 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 771 (void) refcount_add(&hdr->b_refcnt, tag); 772 773 return (buf); 774 } 775 776 static arc_buf_t * 777 arc_buf_clone(arc_buf_t *from) 778 { 779 arc_buf_t *buf; 780 arc_buf_hdr_t *hdr = from->b_hdr; 781 uint64_t size = hdr->b_size; 782 783 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 784 buf->b_hdr = hdr; 785 buf->b_data = NULL; 786 buf->b_efunc = NULL; 787 buf->b_private = NULL; 788 buf->b_next = hdr->b_buf; 789 hdr->b_buf = buf; 790 arc_get_data_buf(buf); 791 bcopy(from->b_data, buf->b_data, size); 792 hdr->b_datacnt += 1; 793 return (buf); 794 } 795 796 void 797 arc_buf_add_ref(arc_buf_t *buf, void* tag) 798 { 799 arc_buf_hdr_t *hdr; 800 kmutex_t *hash_lock; 801 802 /* 803 * Check to see if this buffer is currently being evicted via 804 * arc_do_user_evicts(). 805 */ 806 mutex_enter(&arc_eviction_mtx); 807 hdr = buf->b_hdr; 808 if (hdr == NULL) { 809 mutex_exit(&arc_eviction_mtx); 810 return; 811 } 812 hash_lock = HDR_LOCK(hdr); 813 mutex_exit(&arc_eviction_mtx); 814 815 mutex_enter(hash_lock); 816 if (buf->b_data == NULL) { 817 /* 818 * This buffer is evicted. 
819 */ 820 mutex_exit(hash_lock); 821 return; 822 } 823 824 ASSERT(buf->b_hdr == hdr); 825 ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu); 826 add_reference(hdr, hash_lock, tag); 827 arc_access(hdr, hash_lock); 828 mutex_exit(hash_lock); 829 atomic_add_64(&arc.hits, 1); 830 } 831 832 static void 833 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 834 { 835 arc_buf_t **bufp; 836 837 /* free up data associated with the buf */ 838 if (buf->b_data) { 839 arc_state_t *state = buf->b_hdr->b_state; 840 uint64_t size = buf->b_hdr->b_size; 841 842 arc_cksum_verify(buf); 843 if (!recycle) { 844 zio_buf_free(buf->b_data, size); 845 atomic_add_64(&arc.size, -size); 846 } 847 if (list_link_active(&buf->b_hdr->b_arc_node)) { 848 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 849 ASSERT(state != arc.anon); 850 ASSERT3U(state->lsize, >=, size); 851 atomic_add_64(&state->lsize, -size); 852 } 853 ASSERT3U(state->size, >=, size); 854 atomic_add_64(&state->size, -size); 855 buf->b_data = NULL; 856 ASSERT(buf->b_hdr->b_datacnt > 0); 857 buf->b_hdr->b_datacnt -= 1; 858 } 859 860 /* only remove the buf if requested */ 861 if (!all) 862 return; 863 864 /* remove the buf from the hdr list */ 865 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 866 continue; 867 *bufp = buf->b_next; 868 869 ASSERT(buf->b_efunc == NULL); 870 871 /* clean up the buf */ 872 buf->b_hdr = NULL; 873 kmem_cache_free(buf_cache, buf); 874 } 875 876 static void 877 arc_hdr_destroy(arc_buf_hdr_t *hdr) 878 { 879 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 880 ASSERT3P(hdr->b_state, ==, arc.anon); 881 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 882 883 if (!BUF_EMPTY(hdr)) { 884 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 885 bzero(&hdr->b_dva, sizeof (dva_t)); 886 hdr->b_birth = 0; 887 hdr->b_cksum0 = 0; 888 } 889 while (hdr->b_buf) { 890 arc_buf_t *buf = hdr->b_buf; 891 892 if (buf->b_efunc) { 893 mutex_enter(&arc_eviction_mtx); 894 ASSERT(buf->b_hdr != NULL); 895 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 896 hdr->b_buf = buf->b_next; 897 buf->b_hdr = &arc_eviction_hdr; 898 buf->b_next = arc_eviction_list; 899 arc_eviction_list = buf; 900 mutex_exit(&arc_eviction_mtx); 901 } else { 902 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 903 } 904 } 905 if (hdr->b_freeze_cksum != NULL) { 906 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 907 hdr->b_freeze_cksum = NULL; 908 } 909 910 ASSERT(!list_link_active(&hdr->b_arc_node)); 911 ASSERT3P(hdr->b_hash_next, ==, NULL); 912 ASSERT3P(hdr->b_acb, ==, NULL); 913 kmem_cache_free(hdr_cache, hdr); 914 } 915 916 void 917 arc_buf_free(arc_buf_t *buf, void *tag) 918 { 919 arc_buf_hdr_t *hdr = buf->b_hdr; 920 int hashed = hdr->b_state != arc.anon; 921 922 ASSERT(buf->b_efunc == NULL); 923 ASSERT(buf->b_data != NULL); 924 925 if (hashed) { 926 kmutex_t *hash_lock = HDR_LOCK(hdr); 927 928 mutex_enter(hash_lock); 929 (void) remove_reference(hdr, hash_lock, tag); 930 if (hdr->b_datacnt > 1) 931 arc_buf_destroy(buf, FALSE, TRUE); 932 else 933 hdr->b_flags |= ARC_BUF_AVAILABLE; 934 mutex_exit(hash_lock); 935 } else if (HDR_IO_IN_PROGRESS(hdr)) { 936 int destroy_hdr; 937 /* 938 * We are in the middle of an async write. Don't destroy 939 * this buffer unless the write completes before we finish 940 * decrementing the reference count. 
941 */ 942 mutex_enter(&arc_eviction_mtx); 943 (void) remove_reference(hdr, NULL, tag); 944 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 945 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 946 mutex_exit(&arc_eviction_mtx); 947 if (destroy_hdr) 948 arc_hdr_destroy(hdr); 949 } else { 950 if (remove_reference(hdr, NULL, tag) > 0) { 951 ASSERT(HDR_IO_ERROR(hdr)); 952 arc_buf_destroy(buf, FALSE, TRUE); 953 } else { 954 arc_hdr_destroy(hdr); 955 } 956 } 957 } 958 959 int 960 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 961 { 962 arc_buf_hdr_t *hdr = buf->b_hdr; 963 kmutex_t *hash_lock = HDR_LOCK(hdr); 964 int no_callback = (buf->b_efunc == NULL); 965 966 if (hdr->b_state == arc.anon) { 967 arc_buf_free(buf, tag); 968 return (no_callback); 969 } 970 971 mutex_enter(hash_lock); 972 ASSERT(hdr->b_state != arc.anon); 973 ASSERT(buf->b_data != NULL); 974 975 (void) remove_reference(hdr, hash_lock, tag); 976 if (hdr->b_datacnt > 1) { 977 if (no_callback) 978 arc_buf_destroy(buf, FALSE, TRUE); 979 } else if (no_callback) { 980 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 981 hdr->b_flags |= ARC_BUF_AVAILABLE; 982 } 983 ASSERT(no_callback || hdr->b_datacnt > 1 || 984 refcount_is_zero(&hdr->b_refcnt)); 985 mutex_exit(hash_lock); 986 return (no_callback); 987 } 988 989 int 990 arc_buf_size(arc_buf_t *buf) 991 { 992 return (buf->b_hdr->b_size); 993 } 994 995 /* 996 * Evict buffers from list until we've removed the specified number of 997 * bytes. Move the removed buffers to the appropriate evict state. 998 * If the recycle flag is set, then attempt to "recycle" a buffer: 999 * - look for a buffer to evict that is `bytes' long. 1000 * - return the data block from this buffer rather than freeing it. 1001 * This flag is used by callers that are trying to make space for a 1002 * new buffer in a full arc cache. 1003 */ 1004 static void * 1005 arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle) 1006 { 1007 arc_state_t *evicted_state; 1008 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1009 arc_buf_hdr_t *ab, *ab_prev = NULL; 1010 kmutex_t *hash_lock; 1011 boolean_t have_lock; 1012 void *stolen = NULL; 1013 1014 ASSERT(state == arc.mru || state == arc.mfu); 1015 1016 evicted_state = (state == arc.mru) ? 
arc.mru_ghost : arc.mfu_ghost; 1017 1018 mutex_enter(&state->mtx); 1019 mutex_enter(&evicted_state->mtx); 1020 1021 for (ab = list_tail(&state->list); ab; ab = ab_prev) { 1022 ab_prev = list_prev(&state->list, ab); 1023 /* prefetch buffers have a minimum lifespan */ 1024 if (HDR_IO_IN_PROGRESS(ab) || 1025 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1026 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1027 skipped++; 1028 continue; 1029 } 1030 /* "lookahead" for better eviction candidate */ 1031 if (recycle && ab->b_size != bytes && 1032 ab_prev && ab_prev->b_size == bytes) 1033 continue; 1034 hash_lock = HDR_LOCK(ab); 1035 have_lock = MUTEX_HELD(hash_lock); 1036 if (have_lock || mutex_tryenter(hash_lock)) { 1037 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1038 ASSERT(ab->b_datacnt > 0); 1039 while (ab->b_buf) { 1040 arc_buf_t *buf = ab->b_buf; 1041 if (buf->b_data) { 1042 bytes_evicted += ab->b_size; 1043 if (recycle && ab->b_size == bytes) { 1044 stolen = buf->b_data; 1045 recycle = FALSE; 1046 } 1047 } 1048 if (buf->b_efunc) { 1049 mutex_enter(&arc_eviction_mtx); 1050 arc_buf_destroy(buf, 1051 buf->b_data == stolen, FALSE); 1052 ab->b_buf = buf->b_next; 1053 buf->b_hdr = &arc_eviction_hdr; 1054 buf->b_next = arc_eviction_list; 1055 arc_eviction_list = buf; 1056 mutex_exit(&arc_eviction_mtx); 1057 } else { 1058 arc_buf_destroy(buf, 1059 buf->b_data == stolen, TRUE); 1060 } 1061 } 1062 ASSERT(ab->b_datacnt == 0); 1063 arc_change_state(evicted_state, ab, hash_lock); 1064 ASSERT(HDR_IN_HASH_TABLE(ab)); 1065 ab->b_flags = ARC_IN_HASH_TABLE; 1066 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1067 if (!have_lock) 1068 mutex_exit(hash_lock); 1069 if (bytes >= 0 && bytes_evicted >= bytes) 1070 break; 1071 } else { 1072 missed += 1; 1073 } 1074 } 1075 mutex_exit(&evicted_state->mtx); 1076 mutex_exit(&state->mtx); 1077 1078 if (bytes_evicted < bytes) 1079 dprintf("only evicted %lld bytes from %x", 1080 (longlong_t)bytes_evicted, state); 1081 1082 if (skipped) 1083 atomic_add_64(&arc.evict_skip, skipped); 1084 if (missed) 1085 atomic_add_64(&arc.mutex_miss, missed); 1086 return (stolen); 1087 } 1088 1089 /* 1090 * Remove buffers from list until we've removed the specified number of 1091 * bytes. Destroy the buffers that are removed. 
1092 */ 1093 static void 1094 arc_evict_ghost(arc_state_t *state, int64_t bytes) 1095 { 1096 arc_buf_hdr_t *ab, *ab_prev; 1097 kmutex_t *hash_lock; 1098 uint64_t bytes_deleted = 0; 1099 uint_t bufs_skipped = 0; 1100 1101 ASSERT(GHOST_STATE(state)); 1102 top: 1103 mutex_enter(&state->mtx); 1104 for (ab = list_tail(&state->list); ab; ab = ab_prev) { 1105 ab_prev = list_prev(&state->list, ab); 1106 hash_lock = HDR_LOCK(ab); 1107 if (mutex_tryenter(hash_lock)) { 1108 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1109 ASSERT(ab->b_buf == NULL); 1110 arc_change_state(arc.anon, ab, hash_lock); 1111 mutex_exit(hash_lock); 1112 atomic_add_64(&arc.deleted, 1); 1113 bytes_deleted += ab->b_size; 1114 arc_hdr_destroy(ab); 1115 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1116 if (bytes >= 0 && bytes_deleted >= bytes) 1117 break; 1118 } else { 1119 if (bytes < 0) { 1120 mutex_exit(&state->mtx); 1121 mutex_enter(hash_lock); 1122 mutex_exit(hash_lock); 1123 goto top; 1124 } 1125 bufs_skipped += 1; 1126 } 1127 } 1128 mutex_exit(&state->mtx); 1129 1130 if (bufs_skipped) { 1131 atomic_add_64(&arc.mutex_miss, bufs_skipped); 1132 ASSERT(bytes >= 0); 1133 } 1134 1135 if (bytes_deleted < bytes) 1136 dprintf("only deleted %lld bytes from %p", 1137 (longlong_t)bytes_deleted, state); 1138 } 1139 1140 static void 1141 arc_adjust(void) 1142 { 1143 int64_t top_sz, mru_over, arc_over; 1144 1145 top_sz = arc.anon->size + arc.mru->size; 1146 1147 if (top_sz > arc.p && arc.mru->lsize > 0) { 1148 int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p); 1149 (void) arc_evict(arc.mru, toevict, FALSE); 1150 top_sz = arc.anon->size + arc.mru->size; 1151 } 1152 1153 mru_over = top_sz + arc.mru_ghost->size - arc.c; 1154 1155 if (mru_over > 0) { 1156 if (arc.mru_ghost->lsize > 0) { 1157 int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over); 1158 arc_evict_ghost(arc.mru_ghost, todelete); 1159 } 1160 } 1161 1162 if ((arc_over = arc.size - arc.c) > 0) { 1163 int64_t tbl_over; 1164 1165 if (arc.mfu->lsize > 0) { 1166 int64_t toevict = MIN(arc.mfu->lsize, arc_over); 1167 (void) arc_evict(arc.mfu, toevict, FALSE); 1168 } 1169 1170 tbl_over = arc.size + arc.mru_ghost->lsize + 1171 arc.mfu_ghost->lsize - arc.c*2; 1172 1173 if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) { 1174 int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over); 1175 arc_evict_ghost(arc.mfu_ghost, todelete); 1176 } 1177 } 1178 } 1179 1180 static void 1181 arc_do_user_evicts(void) 1182 { 1183 mutex_enter(&arc_eviction_mtx); 1184 while (arc_eviction_list != NULL) { 1185 arc_buf_t *buf = arc_eviction_list; 1186 arc_eviction_list = buf->b_next; 1187 buf->b_hdr = NULL; 1188 mutex_exit(&arc_eviction_mtx); 1189 1190 if (buf->b_efunc != NULL) 1191 VERIFY(buf->b_efunc(buf) == 0); 1192 1193 buf->b_efunc = NULL; 1194 buf->b_private = NULL; 1195 kmem_cache_free(buf_cache, buf); 1196 mutex_enter(&arc_eviction_mtx); 1197 } 1198 mutex_exit(&arc_eviction_mtx); 1199 } 1200 1201 /* 1202 * Flush all *evictable* data from the cache. 1203 * NOTE: this will not touch "active" (i.e. referenced) data. 
1204 */ 1205 void 1206 arc_flush(void) 1207 { 1208 while (list_head(&arc.mru->list)) 1209 (void) arc_evict(arc.mru, -1, FALSE); 1210 while (list_head(&arc.mfu->list)) 1211 (void) arc_evict(arc.mfu, -1, FALSE); 1212 1213 arc_evict_ghost(arc.mru_ghost, -1); 1214 arc_evict_ghost(arc.mfu_ghost, -1); 1215 1216 mutex_enter(&arc_reclaim_thr_lock); 1217 arc_do_user_evicts(); 1218 mutex_exit(&arc_reclaim_thr_lock); 1219 ASSERT(arc_eviction_list == NULL); 1220 } 1221 1222 int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ 1223 1224 void 1225 arc_shrink(void) 1226 { 1227 if (arc.c > arc.c_min) { 1228 uint64_t to_free; 1229 1230 #ifdef _KERNEL 1231 to_free = MAX(arc.c >> arc_shrink_shift, ptob(needfree)); 1232 #else 1233 to_free = arc.c >> arc_shrink_shift; 1234 #endif 1235 if (arc.c > arc.c_min + to_free) 1236 atomic_add_64(&arc.c, -to_free); 1237 else 1238 arc.c = arc.c_min; 1239 1240 atomic_add_64(&arc.p, -(arc.p >> arc_shrink_shift)); 1241 if (arc.c > arc.size) 1242 arc.c = MAX(arc.size, arc.c_min); 1243 if (arc.p > arc.c) 1244 arc.p = (arc.c >> 1); 1245 ASSERT(arc.c >= arc.c_min); 1246 ASSERT((int64_t)arc.p >= 0); 1247 } 1248 1249 if (arc.size > arc.c) 1250 arc_adjust(); 1251 } 1252 1253 static int 1254 arc_reclaim_needed(void) 1255 { 1256 uint64_t extra; 1257 1258 #ifdef _KERNEL 1259 1260 if (needfree) 1261 return (1); 1262 1263 /* 1264 * take 'desfree' extra pages, so we reclaim sooner, rather than later 1265 */ 1266 extra = desfree; 1267 1268 /* 1269 * check that we're out of range of the pageout scanner. It starts to 1270 * schedule paging if freemem is less than lotsfree and needfree. 1271 * lotsfree is the high-water mark for pageout, and needfree is the 1272 * number of needed free pages. We add extra pages here to make sure 1273 * the scanner doesn't start up while we're freeing memory. 1274 */ 1275 if (freemem < lotsfree + needfree + extra) 1276 return (1); 1277 1278 /* 1279 * check to make sure that swapfs has enough space so that anon 1280 * reservations can still succeeed. anon_resvmem() checks that the 1281 * availrmem is greater than swapfs_minfree, and the number of reserved 1282 * swap pages. We also add a bit of extra here just to prevent 1283 * circumstances from getting really dire. 1284 */ 1285 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1286 return (1); 1287 1288 #if defined(__i386) 1289 /* 1290 * If we're on an i386 platform, it's possible that we'll exhaust the 1291 * kernel heap space before we ever run out of available physical 1292 * memory. Most checks of the size of the heap_area compare against 1293 * tune.t_minarmem, which is the minimum available real memory that we 1294 * can have in the system. However, this is generally fixed at 25 pages 1295 * which is so low that it's useless. In this comparison, we seek to 1296 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1297 * heap is allocated. (Or, in the caclulation, if less than 1/4th is 1298 * free) 1299 */ 1300 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1301 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1302 return (1); 1303 #endif 1304 1305 #else 1306 if (spa_get_random(100) == 0) 1307 return (1); 1308 #endif 1309 return (0); 1310 } 1311 1312 static void 1313 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1314 { 1315 size_t i; 1316 kmem_cache_t *prev_cache = NULL; 1317 extern kmem_cache_t *zio_buf_cache[]; 1318 1319 #ifdef _KERNEL 1320 /* 1321 * First purge some DNLC entries, in case the DNLC is using 1322 * up too much memory. 
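 * (arc_reduce_dnlc_percent, 3 by default, is the percentage of the
 * DNLC we ask dnlc_reduce_cache() to purge on each reclaim pass.)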
1323 */ 1324 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1325 1326 #if defined(__i386) 1327 /* 1328 * Reclaim unused memory from all kmem caches. 1329 */ 1330 kmem_reap(); 1331 #endif 1332 #endif 1333 1334 /* 1335 * An agressive reclamation will shrink the cache size as well as 1336 * reap free buffers from the arc kmem caches. 1337 */ 1338 if (strat == ARC_RECLAIM_AGGR) 1339 arc_shrink(); 1340 1341 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1342 if (zio_buf_cache[i] != prev_cache) { 1343 prev_cache = zio_buf_cache[i]; 1344 kmem_cache_reap_now(zio_buf_cache[i]); 1345 } 1346 } 1347 kmem_cache_reap_now(buf_cache); 1348 kmem_cache_reap_now(hdr_cache); 1349 } 1350 1351 static void 1352 arc_reclaim_thread(void) 1353 { 1354 clock_t growtime = 0; 1355 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1356 callb_cpr_t cpr; 1357 1358 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1359 1360 mutex_enter(&arc_reclaim_thr_lock); 1361 while (arc_thread_exit == 0) { 1362 if (arc_reclaim_needed()) { 1363 1364 if (arc.no_grow) { 1365 if (last_reclaim == ARC_RECLAIM_CONS) { 1366 last_reclaim = ARC_RECLAIM_AGGR; 1367 } else { 1368 last_reclaim = ARC_RECLAIM_CONS; 1369 } 1370 } else { 1371 arc.no_grow = TRUE; 1372 last_reclaim = ARC_RECLAIM_AGGR; 1373 membar_producer(); 1374 } 1375 1376 /* reset the growth delay for every reclaim */ 1377 growtime = lbolt + (arc_grow_retry * hz); 1378 ASSERT(growtime > 0); 1379 1380 arc_kmem_reap_now(last_reclaim); 1381 1382 } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1383 arc.no_grow = FALSE; 1384 } 1385 1386 if (arc_eviction_list != NULL) 1387 arc_do_user_evicts(); 1388 1389 /* block until needed, or one second, whichever is shorter */ 1390 CALLB_CPR_SAFE_BEGIN(&cpr); 1391 (void) cv_timedwait(&arc_reclaim_thr_cv, 1392 &arc_reclaim_thr_lock, (lbolt + hz)); 1393 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1394 } 1395 1396 arc_thread_exit = 0; 1397 cv_broadcast(&arc_reclaim_thr_cv); 1398 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1399 thread_exit(); 1400 } 1401 1402 /* 1403 * Adapt arc info given the number of bytes we are trying to add and 1404 * the state that we are comming from. This function is only called 1405 * when we are adding new content to the cache. 1406 */ 1407 static void 1408 arc_adapt(int bytes, arc_state_t *state) 1409 { 1410 int mult; 1411 1412 ASSERT(bytes > 0); 1413 /* 1414 * Adapt the target size of the MRU list: 1415 * - if we just hit in the MRU ghost list, then increase 1416 * the target size of the MRU list. 1417 * - if we just hit in the MFU ghost list, then increase 1418 * the target size of the MFU list by decreasing the 1419 * target size of the MRU list. 1420 */ 1421 if (state == arc.mru_ghost) { 1422 mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ? 1423 1 : (arc.mfu_ghost->size/arc.mru_ghost->size)); 1424 1425 arc.p = MIN(arc.c, arc.p + bytes * mult); 1426 } else if (state == arc.mfu_ghost) { 1427 mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ? 
1428 1 : (arc.mru_ghost->size/arc.mfu_ghost->size)); 1429 1430 arc.p = MAX(0, (int64_t)arc.p - bytes * mult); 1431 } 1432 ASSERT((int64_t)arc.p >= 0); 1433 1434 if (arc_reclaim_needed()) { 1435 cv_signal(&arc_reclaim_thr_cv); 1436 return; 1437 } 1438 1439 if (arc.no_grow) 1440 return; 1441 1442 if (arc.c >= arc.c_max) 1443 return; 1444 1445 /* 1446 * If we're within (2 * maxblocksize) bytes of the target 1447 * cache size, increment the target cache size 1448 */ 1449 if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1450 atomic_add_64(&arc.c, (int64_t)bytes); 1451 if (arc.c > arc.c_max) 1452 arc.c = arc.c_max; 1453 else if (state == arc.anon) 1454 atomic_add_64(&arc.p, (int64_t)bytes); 1455 if (arc.p > arc.c) 1456 arc.p = arc.c; 1457 } 1458 ASSERT((int64_t)arc.p >= 0); 1459 } 1460 1461 /* 1462 * Check if the cache has reached its limits and eviction is required 1463 * prior to insert. 1464 */ 1465 static int 1466 arc_evict_needed() 1467 { 1468 if (arc_reclaim_needed()) 1469 return (1); 1470 1471 return (arc.size > arc.c); 1472 } 1473 1474 /* 1475 * The buffer, supplied as the first argument, needs a data block. 1476 * So, if we are at cache max, determine which cache should be victimized. 1477 * We have the following cases: 1478 * 1479 * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) -> 1480 * In this situation if we're out of space, but the resident size of the MFU is 1481 * under the limit, victimize the MFU cache to satisfy this insertion request. 1482 * 1483 * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) -> 1484 * Here, we've used up all of the available space for the MRU, so we need to 1485 * evict from our own cache instead. Evict from the set of resident MRU 1486 * entries. 1487 * 1488 * 3. Insert for MFU (c - p) > sizeof(arc.mfu) -> 1489 * c minus p represents the MFU space in the cache, since p is the size of the 1490 * cache that is dedicated to the MRU. In this situation there's still space on 1491 * the MFU side, so the MRU side needs to be victimized. 1492 * 1493 * 4. Insert for MFU (c - p) < sizeof(arc.mfu) -> 1494 * MFU's resident set is consuming more space than it has been allotted. In 1495 * this situation, we must victimize our own cache, the MFU, for this insertion. 1496 */ 1497 static void 1498 arc_get_data_buf(arc_buf_t *buf) 1499 { 1500 arc_state_t *state = buf->b_hdr->b_state; 1501 uint64_t size = buf->b_hdr->b_size; 1502 1503 arc_adapt(size, state); 1504 1505 /* 1506 * We have not yet reached cache maximum size, 1507 * just allocate a new buffer. 1508 */ 1509 if (!arc_evict_needed()) { 1510 buf->b_data = zio_buf_alloc(size); 1511 atomic_add_64(&arc.size, size); 1512 goto out; 1513 } 1514 1515 /* 1516 * If we are prefetching from the mfu ghost list, this buffer 1517 * will end up on the mru list; so steal space from there. 1518 */ 1519 if (state == arc.mfu_ghost) 1520 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc.mru : arc.mfu; 1521 else if (state == arc.mru_ghost) 1522 state = arc.mru; 1523 1524 if (state == arc.mru || state == arc.anon) { 1525 uint64_t mru_used = arc.anon->size + arc.mru->size; 1526 state = (arc.p > mru_used) ? arc.mfu : arc.mru; 1527 } else { 1528 /* MFU cases */ 1529 uint64_t mfu_space = arc.c - arc.p; 1530 state = (mfu_space > arc.mfu->size) ? 
arc.mru : arc.mfu; 1531 } 1532 if ((buf->b_data = arc_evict(state, size, TRUE)) == NULL) { 1533 buf->b_data = zio_buf_alloc(size); 1534 atomic_add_64(&arc.size, size); 1535 atomic_add_64(&arc.recycle_miss, 1); 1536 } 1537 ASSERT(buf->b_data != NULL); 1538 out: 1539 /* 1540 * Update the state size. Note that ghost states have a 1541 * "ghost size" and so don't need to be updated. 1542 */ 1543 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1544 arc_buf_hdr_t *hdr = buf->b_hdr; 1545 1546 atomic_add_64(&hdr->b_state->size, size); 1547 if (list_link_active(&hdr->b_arc_node)) { 1548 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1549 atomic_add_64(&hdr->b_state->lsize, size); 1550 } 1551 } 1552 } 1553 1554 /* 1555 * This routine is called whenever a buffer is accessed. 1556 * NOTE: the hash lock is dropped in this function. 1557 */ 1558 static void 1559 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1560 { 1561 ASSERT(MUTEX_HELD(hash_lock)); 1562 1563 if (buf->b_state == arc.anon) { 1564 /* 1565 * This buffer is not in the cache, and does not 1566 * appear in our "ghost" list. Add the new buffer 1567 * to the MRU state. 1568 */ 1569 1570 ASSERT(buf->b_arc_access == 0); 1571 buf->b_arc_access = lbolt; 1572 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1573 arc_change_state(arc.mru, buf, hash_lock); 1574 1575 } else if (buf->b_state == arc.mru) { 1576 /* 1577 * If this buffer is here because of a prefetch, then either: 1578 * - clear the flag if this is a "referencing" read 1579 * (any subsequent access will bump this into the MFU state). 1580 * or 1581 * - move the buffer to the head of the list if this is 1582 * another prefetch (to make it less likely to be evicted). 1583 */ 1584 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1585 if (refcount_count(&buf->b_refcnt) == 0) { 1586 ASSERT(list_link_active(&buf->b_arc_node)); 1587 mutex_enter(&arc.mru->mtx); 1588 list_remove(&arc.mru->list, buf); 1589 list_insert_head(&arc.mru->list, buf); 1590 mutex_exit(&arc.mru->mtx); 1591 } else { 1592 buf->b_flags &= ~ARC_PREFETCH; 1593 atomic_add_64(&arc.mru->hits, 1); 1594 } 1595 buf->b_arc_access = lbolt; 1596 return; 1597 } 1598 1599 /* 1600 * This buffer has been "accessed" only once so far, 1601 * but it is still in the cache. Move it to the MFU 1602 * state. 1603 */ 1604 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1605 /* 1606 * More than 125ms have passed since we 1607 * instantiated this buffer. Move it to the 1608 * most frequently used state. 1609 */ 1610 buf->b_arc_access = lbolt; 1611 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1612 arc_change_state(arc.mfu, buf, hash_lock); 1613 } 1614 atomic_add_64(&arc.mru->hits, 1); 1615 } else if (buf->b_state == arc.mru_ghost) { 1616 arc_state_t *new_state; 1617 /* 1618 * This buffer has been "accessed" recently, but 1619 * was evicted from the cache. Move it to the 1620 * MFU state. 1621 */ 1622 1623 if (buf->b_flags & ARC_PREFETCH) { 1624 new_state = arc.mru; 1625 if (refcount_count(&buf->b_refcnt) > 0) 1626 buf->b_flags &= ~ARC_PREFETCH; 1627 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1628 } else { 1629 new_state = arc.mfu; 1630 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1631 } 1632 1633 buf->b_arc_access = lbolt; 1634 arc_change_state(new_state, buf, hash_lock); 1635 1636 atomic_add_64(&arc.mru_ghost->hits, 1); 1637 } else if (buf->b_state == arc.mfu) { 1638 /* 1639 * This buffer has been accessed more than once and is 1640 * still in the cache. Keep it in the MFU state. 
1641 * 1642 * NOTE: an add_reference() that occurred when we did 1643 * the arc_read() will have kicked this off the list. 1644 * If it was a prefetch, we will explicitly move it to 1645 * the head of the list now. 1646 */ 1647 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1648 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1649 ASSERT(list_link_active(&buf->b_arc_node)); 1650 mutex_enter(&arc.mfu->mtx); 1651 list_remove(&arc.mfu->list, buf); 1652 list_insert_head(&arc.mfu->list, buf); 1653 mutex_exit(&arc.mfu->mtx); 1654 } 1655 atomic_add_64(&arc.mfu->hits, 1); 1656 buf->b_arc_access = lbolt; 1657 } else if (buf->b_state == arc.mfu_ghost) { 1658 arc_state_t *new_state = arc.mfu; 1659 /* 1660 * This buffer has been accessed more than once but has 1661 * been evicted from the cache. Move it back to the 1662 * MFU state. 1663 */ 1664 1665 if (buf->b_flags & ARC_PREFETCH) { 1666 /* 1667 * This is a prefetch access... 1668 * move this block back to the MRU state. 1669 */ 1670 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1671 new_state = arc.mru; 1672 } 1673 1674 buf->b_arc_access = lbolt; 1675 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1676 arc_change_state(new_state, buf, hash_lock); 1677 1678 atomic_add_64(&arc.mfu_ghost->hits, 1); 1679 } else { 1680 ASSERT(!"invalid arc state"); 1681 } 1682 } 1683 1684 /* a generic arc_done_func_t which you can use */ 1685 /* ARGSUSED */ 1686 void 1687 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1688 { 1689 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1690 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1691 } 1692 1693 /* a generic arc_done_func_t which you can use */ 1694 void 1695 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1696 { 1697 arc_buf_t **bufp = arg; 1698 if (zio && zio->io_error) { 1699 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1700 *bufp = NULL; 1701 } else { 1702 *bufp = buf; 1703 } 1704 } 1705 1706 static void 1707 arc_read_done(zio_t *zio) 1708 { 1709 arc_buf_hdr_t *hdr, *found; 1710 arc_buf_t *buf; 1711 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1712 kmutex_t *hash_lock; 1713 arc_callback_t *callback_list, *acb; 1714 int freeable = FALSE; 1715 1716 buf = zio->io_private; 1717 hdr = buf->b_hdr; 1718 1719 /* 1720 * The hdr was inserted into hash-table and removed from lists 1721 * prior to starting I/O. We should find this header, since 1722 * it's in the hash table, and it should be legit since it's 1723 * not possible to evict it during the I/O. The only possible 1724 * reason for it not to be found is if we were freed during the 1725 * read. 
1726 */ 1727 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1728 &hash_lock); 1729 1730 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1731 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1732 1733 /* byteswap if necessary */ 1734 callback_list = hdr->b_acb; 1735 ASSERT(callback_list != NULL); 1736 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1737 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1738 1739 arc_cksum_compute(buf); 1740 1741 /* create copies of the data buffer for the callers */ 1742 abuf = buf; 1743 for (acb = callback_list; acb; acb = acb->acb_next) { 1744 if (acb->acb_done) { 1745 if (abuf == NULL) 1746 abuf = arc_buf_clone(buf); 1747 acb->acb_buf = abuf; 1748 abuf = NULL; 1749 } 1750 } 1751 hdr->b_acb = NULL; 1752 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1753 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1754 if (abuf == buf) 1755 hdr->b_flags |= ARC_BUF_AVAILABLE; 1756 1757 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 1758 1759 if (zio->io_error != 0) { 1760 hdr->b_flags |= ARC_IO_ERROR; 1761 if (hdr->b_state != arc.anon) 1762 arc_change_state(arc.anon, hdr, hash_lock); 1763 if (HDR_IN_HASH_TABLE(hdr)) 1764 buf_hash_remove(hdr); 1765 freeable = refcount_is_zero(&hdr->b_refcnt); 1766 /* convert checksum errors into IO errors */ 1767 if (zio->io_error == ECKSUM) 1768 zio->io_error = EIO; 1769 } 1770 1771 /* 1772 * Broadcast before we drop the hash_lock to avoid the possibility 1773 * that the hdr (and hence the cv) might be freed before we get to 1774 * the cv_broadcast(). 1775 */ 1776 cv_broadcast(&hdr->b_cv); 1777 1778 if (hash_lock) { 1779 /* 1780 * Only call arc_access on anonymous buffers. This is because 1781 * if we've issued an I/O for an evicted buffer, we've already 1782 * called arc_access (to prevent any simultaneous readers from 1783 * getting confused). 1784 */ 1785 if (zio->io_error == 0 && hdr->b_state == arc.anon) 1786 arc_access(hdr, hash_lock); 1787 mutex_exit(hash_lock); 1788 } else { 1789 /* 1790 * This block was freed while we waited for the read to 1791 * complete. It has been removed from the hash table and 1792 * moved to the anonymous state (so that it won't show up 1793 * in the cache). 1794 */ 1795 ASSERT3P(hdr->b_state, ==, arc.anon); 1796 freeable = refcount_is_zero(&hdr->b_refcnt); 1797 } 1798 1799 /* execute each callback and free its structure */ 1800 while ((acb = callback_list) != NULL) { 1801 if (acb->acb_done) 1802 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 1803 1804 if (acb->acb_zio_dummy != NULL) { 1805 acb->acb_zio_dummy->io_error = zio->io_error; 1806 zio_nowait(acb->acb_zio_dummy); 1807 } 1808 1809 callback_list = acb->acb_next; 1810 kmem_free(acb, sizeof (arc_callback_t)); 1811 } 1812 1813 if (freeable) 1814 arc_hdr_destroy(hdr); 1815 } 1816 1817 /* 1818 * "Read" the block block at the specified DVA (in bp) via the 1819 * cache. If the block is found in the cache, invoke the provided 1820 * callback immediately and return. Note that the `zio' parameter 1821 * in the callback will be NULL in this case, since no IO was 1822 * required. If the block is not in the cache pass the read request 1823 * on to the spa with a substitute callback function, so that the 1824 * requested block will be added to the cache. 
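 *
 * A minimal sketch of a blocking caller (the byteswap function, zio
 * priority, zio flags and bookmark below are placeholders for whatever
 * the caller would normally supply):
 *
 *	arc_buf_t *abuf = NULL;
 *	uint32_t aflags = ARC_WAIT;
 *
 *	(void) arc_read(NULL, spa, bp, swapfunc, arc_getbuf_func, &abuf,
 *	    priority, zio_flags, &aflags, &zb);
 *	if (abuf != NULL) {
 *		/* consume abuf->b_data, then drop the hold */
 *		(void) arc_buf_remove_ref(abuf, &abuf);
 *	}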
1825 * 1826 * If a read request arrives for a block that has a read in-progress, 1827 * either wait for the in-progress read to complete (and return the 1828 * results); or, if this is a read with a "done" func, add a record 1829 * to the read to invoke the "done" func when the read completes, 1830 * and return; or just return. 1831 * 1832 * arc_read_done() will invoke all the requested "done" functions 1833 * for readers of this block. 1834 */ 1835 int 1836 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 1837 arc_done_func_t *done, void *private, int priority, int flags, 1838 uint32_t *arc_flags, zbookmark_t *zb) 1839 { 1840 arc_buf_hdr_t *hdr; 1841 arc_buf_t *buf; 1842 kmutex_t *hash_lock; 1843 zio_t *rzio; 1844 1845 top: 1846 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 1847 if (hdr && hdr->b_datacnt > 0) { 1848 1849 *arc_flags |= ARC_CACHED; 1850 1851 if (HDR_IO_IN_PROGRESS(hdr)) { 1852 1853 if (*arc_flags & ARC_WAIT) { 1854 cv_wait(&hdr->b_cv, hash_lock); 1855 mutex_exit(hash_lock); 1856 goto top; 1857 } 1858 ASSERT(*arc_flags & ARC_NOWAIT); 1859 1860 if (done) { 1861 arc_callback_t *acb = NULL; 1862 1863 acb = kmem_zalloc(sizeof (arc_callback_t), 1864 KM_SLEEP); 1865 acb->acb_done = done; 1866 acb->acb_private = private; 1867 acb->acb_byteswap = swap; 1868 if (pio != NULL) 1869 acb->acb_zio_dummy = zio_null(pio, 1870 spa, NULL, NULL, flags); 1871 1872 ASSERT(acb->acb_done != NULL); 1873 acb->acb_next = hdr->b_acb; 1874 hdr->b_acb = acb; 1875 add_reference(hdr, hash_lock, private); 1876 mutex_exit(hash_lock); 1877 return (0); 1878 } 1879 mutex_exit(hash_lock); 1880 return (0); 1881 } 1882 1883 ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu); 1884 1885 if (done) { 1886 add_reference(hdr, hash_lock, private); 1887 /* 1888 * If this block is already in use, create a new 1889 * copy of the data so that we will be guaranteed 1890 * that arc_release() will always succeed. 
			 */
			buf = hdr->b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			if (HDR_BUF_AVAILABLE(hdr)) {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
			} else {
				buf = arc_buf_clone(buf);
			}
		} else if (*arc_flags & ARC_PREFETCH &&
		    refcount_count(&hdr->b_refcnt) == 0) {
			hdr->b_flags |= ARC_PREFETCH;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		atomic_add_64(&arc.hits, 1);
		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists;

			buf = arc_buf_alloc(spa, size, private);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				(void) arc_buf_remove_ref(buf, private);
				goto top; /* restart the IO request */
			}
			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH) {
				(void) remove_reference(hdr, hash_lock,
				    private);
				hdr->b_flags |= ARC_PREFETCH;
			}
			if (BP_GET_LEVEL(bp) > 0)
				hdr->b_flags |= ARC_INDIRECT;
		} else {
			/* this block is in the ghost cache */
			ASSERT(GHOST_STATE(hdr->b_state));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
			ASSERT(hdr->b_buf == NULL);

			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH)
				hdr->b_flags |= ARC_PREFETCH;
			else
				add_reference(hdr, hash_lock, private);
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_hdr = hdr;
			buf->b_data = NULL;
			buf->b_efunc = NULL;
			buf->b_private = NULL;
			buf->b_next = NULL;
			hdr->b_buf = buf;
			arc_get_data_buf(buf);
			ASSERT(hdr->b_datacnt == 0);
			hdr->b_datacnt = 1;

		}

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present state
		 * before issuing the I/O.  Once we drop the hash-table lock,
		 * the header will be marked as I/O in progress and have an
		 * attached buffer.  At this point, anybody who finds this
		 * buffer ought to notice that it's legit but has a pending I/O.
		 */

		if (GHOST_STATE(hdr->b_state))
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
		    zbookmark_t *, zb);
		atomic_add_64(&arc.misses, 1);

		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags, zb);

		if (*arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(*arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}

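/*
 * A minimal sketch of a hypothetical arc_read() caller, showing the two
 * completion models described above.  The names my_read_done, my_byteswap
 * and my_zb are invented for this example, and the priority/flag values
 * shown are merely typical choices:
 *
 *	static void
 *	my_read_done(zio_t *zio, arc_buf_t *abuf, void *arg)
 *	{
 *		*(arc_buf_t **)arg = abuf;
 *	}
 *
 *	uint32_t aflags = ARC_WAIT;
 *	arc_buf_t *abuf = NULL;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, my_byteswap, my_read_done, &abuf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &my_zb);
 *
 * Because ARC_WAIT is set, arc_read() does not return until the data is
 * cached and my_read_done() has stashed the buffer in abuf.  Passing
 * ARC_NOWAIT instead makes the call asynchronous: arc_read() returns
 * immediately and my_read_done() is invoked from arc_read_done() when
 * the I/O completes.
 */
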
/*
 * arc_read() variant to support pool traversal.  If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
 * The idea is that we don't want pool traversal filling up memory, but
 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
 */
int
arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_mtx;
	int rc = 0;

	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);

	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
		arc_buf_t *buf = hdr->b_buf;

		ASSERT(buf);
		while (buf->b_data == NULL) {
			buf = buf->b_next;
			ASSERT(buf);
		}
		bcopy(buf->b_data, data, hdr->b_size);
	} else {
		rc = ENOENT;
	}

	if (hash_mtx)
		mutex_exit(hash_mtx);

	return (rc);
}

void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
	ASSERT(buf->b_hdr != NULL);
	ASSERT(buf->b_hdr->b_state != arc.anon);
	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
	buf->b_efunc = func;
	buf->b_private = private;
}

/*
 * This is used by the DMU to let the ARC know that a buffer is
 * being evicted, so the ARC should clean up.  If this arc buf
 * is not yet in the evicted state, it will be put there.
 */
int
arc_buf_evict(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	arc_buf_t **bufp;

	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		/*
		 * We are in arc_do_user_evicts().
		 */
		ASSERT(buf->b_data == NULL);
		mutex_exit(&arc_eviction_mtx);
		return (0);
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);

	if (buf->b_data == NULL) {
		/*
		 * We are on the eviction list.
		 */
		mutex_exit(hash_lock);
		mutex_enter(&arc_eviction_mtx);
		if (buf->b_hdr == NULL) {
			/*
			 * We are already in arc_do_user_evicts().
			 */
			mutex_exit(&arc_eviction_mtx);
			return (0);
		} else {
			arc_buf_t copy = *buf; /* structure assignment */
			/*
			 * Process this buffer now
			 * but let arc_do_user_evicts() do the reaping.
			 */
			buf->b_efunc = NULL;
			mutex_exit(&arc_eviction_mtx);
			VERIFY(copy.b_efunc(&copy) == 0);
			return (1);
		}
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
	ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);

	/*
	 * Pull this buffer off of the hdr
	 */
	bufp = &hdr->b_buf;
	while (*bufp != buf)
		bufp = &(*bufp)->b_next;
	*bufp = buf->b_next;

	ASSERT(buf->b_data != NULL);
	arc_buf_destroy(buf, FALSE, FALSE);

	if (hdr->b_datacnt == 0) {
		arc_state_t *old_state = hdr->b_state;
		arc_state_t *evicted_state;

		ASSERT(refcount_is_zero(&hdr->b_refcnt));

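		/*
		 * This was the last data buffer for this hdr, so move the hdr
		 * to the corresponding ghost list; only its identity (not its
		 * data) remains cached.
		 */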
		evicted_state =
		    (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;

		mutex_enter(&old_state->mtx);
		mutex_enter(&evicted_state->mtx);

		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		hdr->b_flags = ARC_IN_HASH_TABLE;

		mutex_exit(&evicted_state->mtx);
		mutex_exit(&old_state->mtx);
	}
	mutex_exit(hash_lock);

	VERIFY(buf->b_efunc(buf) == 0);
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
	return (1);
}

/*
 * Release this buffer from the cache.  This must be done
 * after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc.anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		ASSERT(buf->b_efunc == NULL);
		arc_buf_thaw(buf);
		return;
	}

	mutex_enter(hash_lock);

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_buf != buf || buf->b_next != NULL) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;

		ASSERT(hdr->b_datacnt > 1);
		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = (*bufp)->b_next;

		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
			atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
		}
		hdr->b_datacnt -= 1;

		mutex_exit(hash_lock);

		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_buf = buf;
		nhdr->b_state = arc.anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		nhdr->b_datacnt = 1;
		nhdr->b_freeze_cksum =
		    kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
		*nhdr->b_freeze_cksum = *hdr->b_freeze_cksum; /* struct copy */
		buf->b_hdr = nhdr;
		buf->b_next = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		atomic_add_64(&arc.anon->size, blksz);

		hdr = nhdr;
	} else {
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc.anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	arc_buf_thaw(buf);
}

int
arc_released(arc_buf_t *buf)
{
	return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
}

int
arc_has_callback(arc_buf_t *buf)
{
	return (buf->b_efunc != NULL);
}

#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
	return (refcount_count(&buf->b_hdr->b_refcnt));
}
#endif

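/*
 * I/O completion callback for arc_write(): record the block's identity
 * in the hdr and, unless the write was compressed away entirely, insert
 * the hdr into the hash table before invoking the caller's done function.
 */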
static void
arc_write_done(zio_t *zio)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr;
	arc_callback_t *acb;

	buf = zio->io_private;
	hdr = buf->b_hdr;
	acb = hdr->b_acb;
	hdr->b_acb = NULL;
	ASSERT(acb != NULL);

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc.anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away.  In this case no write was performed
	 * so there will be no dva/birth-date/checksum.  The buffer
	 * must therefore remain anonymous (and uncached).
	 */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		arc_cksum_verify(buf);

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
			    BP_IDENTITY(zio->io_bp)));
			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
			    zio->io_bp->blk_birth);

			ASSERT(refcount_is_zero(&exists->b_refcnt));
			arc_change_state(arc.anon, exists, hash_lock);
			mutex_exit(hash_lock);
			arc_hdr_destroy(exists);
			exists = buf_hash_insert(hdr, &hash_lock);
			ASSERT3P(exists, ==, NULL);
		}
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else if (acb->acb_done == NULL) {
		int destroy_hdr;
		/*
		 * This is an anonymous buffer with no user callback,
		 * destroy it if there are no active references.
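		 * (The refcount check and the clearing of ARC_IO_IN_PROGRESS
		 * are done together under arc_eviction_mtx.)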
		 */
		mutex_enter(&arc_eviction_mtx);
		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	}

	if (acb->acb_done) {
		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
		acb->acb_done(zio, buf, acb->acb_private);
	}

	kmem_free(acb, sizeof (arc_callback_t));
}

int
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_callback_t *acb;
	zio_t *rzio;

	/* this is a private buffer - no locking required */
	ASSERT3P(hdr->b_state, ==, arc.anon);
	ASSERT(BUF_EMPTY(hdr));
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
	ASSERT(hdr->b_acb == 0);
	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
	acb->acb_done = done;
	acb->acb_private = private;
	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
	hdr->b_acb = acb;
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
	arc_cksum_compute(buf);
	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(rzio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(rzio);

	return (0);
}

int
arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, uint32_t arc_flags)
{
	arc_buf_hdr_t *ab;
	kmutex_t *hash_lock;
	zio_t *zio;

	/*
	 * If this buffer is in the cache, release it, so it
	 * can be re-used.
	 */
	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (ab != NULL) {
		/*
		 * The checksum of blocks to free is not always
		 * preserved (e.g. on the deadlist).  However, if it is
		 * nonzero, it should match what we have in the cache.
		 */
		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
		if (ab->b_state != arc.anon)
			arc_change_state(arc.anon, ab, hash_lock);
		if (HDR_IO_IN_PROGRESS(ab)) {
			/*
			 * This should only happen when we prefetch.
			 */
			ASSERT(ab->b_flags & ARC_PREFETCH);
			ASSERT3U(ab->b_datacnt, ==, 1);
			ab->b_flags |= ARC_FREED_IN_READ;
			if (HDR_IN_HASH_TABLE(ab))
				buf_hash_remove(ab);
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		} else if (refcount_is_zero(&ab->b_refcnt)) {
			mutex_exit(hash_lock);
			arc_hdr_destroy(ab);
			atomic_add_64(&arc.deleted, 1);
		} else {
			/*
			 * We still have an active reference on this
			 * buffer.  This can happen, e.g., from
			 * dbuf_unoverride().
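			 * In that case, just clear out the hdr's identity
			 * below so that it no longer refers to the freed block.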
			 */
			ASSERT(!HDR_IN_HASH_TABLE(ab));
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		}
	}

	zio = zio_free(pio, spa, txg, bp, done, private);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(zio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(zio);

	return (0);
}

void
arc_tempreserve_clear(uint64_t tempreserve)
{
	atomic_add_64(&arc_tempreserve, -tempreserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

int
arc_tempreserve_space(uint64_t tempreserve)
{
#ifdef ZFS_DEBUG
	/*
	 * Once in a while, fail for no reason.  Everything should cope.
	 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (ERESTART);
	}
#endif
	if (tempreserve > arc.c/4 && !arc.no_grow)
		arc.c = MIN(arc.c_max, tempreserve * 4);
	if (tempreserve > arc.c)
		return (ENOMEM);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large.  We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail.  Not a huge deal.
	 *
	 * XXX The limit should be adjusted dynamically to keep the time
	 * to sync a dataset fixed (around 1-5 seconds?).
	 */

	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
	    arc_tempreserve + arc.anon->size > arc.c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
		    "tempreserve=%lluK arc.c=%lluK\n",
		    arc_tempreserve>>10, arc.anon->lsize>>10,
		    tempreserve>>10, arc.c>>10);
		return (ERESTART);
	}
	atomic_add_64(&arc_tempreserve, tempreserve);
	return (0);
}

void
arc_init(void)
{
	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);

	/* Convert seconds to clock ticks */
	arc_min_prefetch_lifespan = 1 * hz;

	/* Start out with 1/8 of all memory */
	arc.c = physmem * PAGESIZE / 8;

#ifdef _KERNEL
	/*
	 * On architectures where the physical memory can be larger
	 * than the addressable space (intel in 32-bit mode), we may
	 * need to limit the cache to 1/8 of VM size.
	 */
	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
#endif

	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
	arc.c_min = MAX(arc.c / 4, 64<<20);
	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
	if (arc.c * 8 >= 1<<30)
		arc.c_max = (arc.c * 8) - (1<<30);
	else
		arc.c_max = arc.c_min;
	arc.c_max = MAX(arc.c * 6, arc.c_max);

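	/*
	 * For example, on a hypothetical machine with 8GB of physical
	 * memory the calculations above yield arc.c = 1GB initially,
	 * arc.c_min = MAX(256MB, 64MB) = 256MB, and
	 * arc.c_max = MAX(6GB, 8GB - 1GB) = 7GB.
	 */
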
	/*
	 * Allow the tunables to override our calculations if they are
	 * reasonable (i.e. over 64MB)
	 */
	if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE)
		arc.c_max = zfs_arc_max;
	if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc.c_max)
		arc.c_min = zfs_arc_min;

	arc.c = arc.c_max;
	arc.p = (arc.c >> 1);

	/* if kmem_flags are set, let's try to use less memory */
	if (kmem_debugging())
		arc.c = arc.c / 2;
	if (arc.c < arc.c_min)
		arc.c = arc.c_min;

	arc.anon = &ARC_anon;
	arc.mru = &ARC_mru;
	arc.mru_ghost = &ARC_mru_ghost;
	arc.mfu = &ARC_mfu;
	arc.mfu_ghost = &ARC_mfu_ghost;
	arc.size = 0;

	arc.hits = 0;
	arc.recycle_miss = 0;
	arc.evict_skip = 0;
	arc.mutex_miss = 0;

	mutex_init(&arc.anon->mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc.mru->mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc.mru_ghost->mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc.mfu->mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&arc.mfu_ghost->mtx, NULL, MUTEX_DEFAULT, NULL);

	list_create(&arc.mru->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));
	list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_arc_node));

	buf_init();

	arc_thread_exit = 0;
	arc_eviction_list = NULL;
	mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
	bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t));

	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);

	arc_dead = FALSE;
}

void
arc_fini(void)
{
	mutex_enter(&arc_reclaim_thr_lock);
	arc_thread_exit = 1;
	while (arc_thread_exit != 0)
		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
	mutex_exit(&arc_reclaim_thr_lock);

	arc_flush();

	arc_dead = TRUE;

	mutex_destroy(&arc_eviction_mtx);
	mutex_destroy(&arc_reclaim_thr_lock);
	cv_destroy(&arc_reclaim_thr_cv);

	list_destroy(&arc.mru->list);
	list_destroy(&arc.mru_ghost->list);
	list_destroy(&arc.mfu->list);
	list_destroy(&arc.mfu_ghost->list);

	mutex_destroy(&arc.anon->mtx);
	mutex_destroy(&arc.mru->mtx);
	mutex_destroy(&arc.mru_ghost->mtx);
	mutex_destroy(&arc.mfu->mtx);
	mutex_destroy(&arc.mfu_ghost->mtx);

	buf_fini();
}