1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * DVA-based Adjustable Replacement Cache 30 * 31 * While much of the theory of operation used here is 32 * based on the self-tuning, low overhead replacement cache 33 * presented by Megiddo and Modha at FAST 2003, there are some 34 * significant differences: 35 * 36 * 1. The Megiddo and Modha model assumes any page is evictable. 37 * Pages in its cache cannot be "locked" into memory. This makes 38 * the eviction algorithm simple: evict the last page in the list. 39 * This also makes the performance characteristics easy to reason 40 * about. Our cache is not so simple. At any given moment, some 41 * subset of the blocks in the cache are un-evictable because we 42 * have handed out a reference to them. Blocks are only evictable 43 * when there are no external references active. This makes 44 * eviction far more problematic: we choose to evict the evictable 45 * blocks that are the "lowest" in the list. 46 * 47 * There are times when it is not possible to evict the requested 48 * space. In these circumstances we are unable to adjust the cache 49 * size. To prevent the cache from growing unbounded at these times we 50 * implement a "cache throttle" that slows the flow of new data 51 * into the cache until we can make space available. 52 * 53 * 2. The Megiddo and Modha model assumes a fixed cache size. 54 * Pages are evicted when the cache is full and there is a cache 55 * miss. Our model has a variable sized cache. It grows with 56 * high use, but also tries to react to memory pressure from the 57 * operating system: decreasing its size when system memory is 58 * tight. 59 * 60 * 3. The Megiddo and Modha model assumes a fixed page size. All 61 * elements of the cache are therefore exactly the same size. So 62 * when adjusting the cache size following a cache miss, it is simply 63 * a matter of choosing a single page to evict. In our model, we 64 * have variable sized cache blocks (ranging from 512 bytes to 65 * 128K bytes). We therefore choose a set of blocks to evict to make 66 * space for a cache miss that approximates as closely as possible 67 * the space used by the new block. 68 * 69 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache" 70 * by N. Megiddo & D. Modha, FAST 2003 71 */ 72 73 /* 74 * The locking model: 75 * 76 * A new reference to a cache buffer can be obtained in two 77 * ways: 1) via a hash table lookup using the DVA as a key, 78 * or 2) via one of the ARC lists. 
The arc_read() interface 79 * uses method 1, while the internal arc algorithms for 80 * adjusting the cache use method 2. We therefore provide two 81 * types of locks: 1) the hash table lock array, and 2) the 82 * arc list locks. 83 * 84 * Buffers do not have their own mutexes, rather they rely on the 85 * hash table mutexes for the bulk of their protection (i.e. most 86 * fields in the arc_buf_hdr_t are protected by these mutexes). 87 * 88 * buf_hash_find() returns the appropriate mutex (held) when it 89 * locates the requested buffer in the hash table. It returns 90 * NULL for the mutex if the buffer was not in the table. 91 * 92 * buf_hash_remove() expects the appropriate hash mutex to be 93 * already held before it is invoked. 94 * 95 * Each arc state also has a mutex which is used to protect the 96 * buffer list associated with the state. When attempting to 97 * obtain a hash table lock while holding an arc list lock you 98 * must use mutex_tryenter() to avoid deadlock. Also note that 99 * the active state mutex must be held before the ghost state mutex. 100 * 101 * Arc buffers may have an associated eviction callback function. 102 * This function will be invoked prior to removing the buffer (e.g. 103 * in arc_do_user_evicts()). Note however that the data associated 104 * with the buffer may be evicted prior to the callback. The callback 105 * must be made with *no locks held* (to prevent deadlock). Additionally, 106 * the users of callbacks must ensure that their private data is 107 * protected from simultaneous callbacks from arc_buf_evict() 108 * and arc_do_user_evicts(). 109 * 110 * Note that the majority of the performance stats are manipulated 111 * with atomic operations. 112 */ 113 114 #include <sys/spa.h> 115 #include <sys/zio.h> 116 #include <sys/zio_checksum.h> 117 #include <sys/zfs_context.h> 118 #include <sys/arc.h> 119 #include <sys/refcount.h> 120 #ifdef _KERNEL 121 #include <sys/vmsystm.h> 122 #include <vm/anon.h> 123 #include <sys/fs/swapnode.h> 124 #include <sys/dnlc.h> 125 #endif 126 #include <sys/callb.h> 127 128 static kmutex_t arc_reclaim_thr_lock; 129 static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ 130 static uint8_t arc_thread_exit; 131 132 #define ARC_REDUCE_DNLC_PERCENT 3 133 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT; 134 135 typedef enum arc_reclaim_strategy { 136 ARC_RECLAIM_AGGR, /* Aggressive reclaim strategy */ 137 ARC_RECLAIM_CONS /* Conservative reclaim strategy */ 138 } arc_reclaim_strategy_t; 139 140 /* number of seconds before growing cache again */ 141 static int arc_grow_retry = 60; 142 143 /* 144 * minimum lifespan of a prefetch block in clock ticks 145 * (initialized in arc_init()) 146 */ 147 static int arc_min_prefetch_lifespan; 148 149 static kmutex_t arc_reclaim_lock; 150 static int arc_dead; 151 152 /* 153 * These tunables are for performance analysis. 154 */ 155 uint64_t zfs_arc_max; 156 uint64_t zfs_arc_min; 157 158 /* 159 * Note that buffers can be in one of 5 states: 160 * ARC_anon - anonymous (discussed below) 161 * ARC_mru - recently used, currently cached 162 * ARC_mru_ghost - recently used, no longer in cache 163 * ARC_mfu - frequently used, currently cached 164 * ARC_mfu_ghost - frequently used, no longer in cache 165 * When there are no active references to a buffer, it is 166 * linked onto one of the lists in arc. These are the 167 * only buffers that can be evicted or deleted. 168 * 169 * Anonymous buffers are buffers that are not associated with 170 * a DVA. 
These are buffers that hold dirty block copies 171 * before they are written to stable storage. By definition, 172 * they are "ref'd" and are considered part of arc_mru 173 * that cannot be freed. Generally, they will acquire a DVA 174 * as they are written and migrate onto the arc_mru list. 175 */ 176 177 typedef struct arc_state { 178 list_t list; /* linked list of evictable buffers in state */ 179 uint64_t lsize; /* total size of buffers in the linked list */ 180 uint64_t size; /* total size of all buffers in this state */ 181 uint64_t hits; 182 kmutex_t mtx; 183 } arc_state_t; 184 185 /* The 5 states: */ 186 static arc_state_t ARC_anon; 187 static arc_state_t ARC_mru; 188 static arc_state_t ARC_mru_ghost; 189 static arc_state_t ARC_mfu; 190 static arc_state_t ARC_mfu_ghost; 191 192 static struct arc { 193 arc_state_t *anon; 194 arc_state_t *mru; 195 arc_state_t *mru_ghost; 196 arc_state_t *mfu; 197 arc_state_t *mfu_ghost; 198 uint64_t size; /* Actual total arc size */ 199 uint64_t p; /* Target size (in bytes) of mru */ 200 uint64_t c; /* Target size of cache (in bytes) */ 201 uint64_t c_min; /* Minimum target cache size */ 202 uint64_t c_max; /* Maximum target cache size */ 203 204 /* performance stats */ 205 uint64_t hits; 206 uint64_t misses; 207 uint64_t deleted; 208 uint64_t recycle_miss; 209 uint64_t mutex_miss; 210 uint64_t evict_skip; 211 uint64_t hash_elements; 212 uint64_t hash_elements_max; 213 uint64_t hash_collisions; 214 uint64_t hash_chains; 215 uint32_t hash_chain_max; 216 217 int no_grow; /* Don't try to grow cache size */ 218 } arc; 219 220 static uint64_t arc_tempreserve; 221 222 typedef struct arc_callback arc_callback_t; 223 224 struct arc_callback { 225 arc_done_func_t *acb_done; 226 void *acb_private; 227 arc_byteswap_func_t *acb_byteswap; 228 arc_buf_t *acb_buf; 229 zio_t *acb_zio_dummy; 230 arc_callback_t *acb_next; 231 }; 232 233 struct arc_buf_hdr { 234 /* immutable */ 235 uint64_t b_size; 236 spa_t *b_spa; 237 238 /* protected by hash lock */ 239 dva_t b_dva; 240 uint64_t b_birth; 241 uint64_t b_cksum0; 242 243 kmutex_t b_freeze_lock; 244 zio_cksum_t *b_freeze_cksum; 245 246 arc_buf_hdr_t *b_hash_next; 247 arc_buf_t *b_buf; 248 uint32_t b_flags; 249 uint32_t b_datacnt; 250 251 kcondvar_t b_cv; 252 arc_callback_t *b_acb; 253 254 /* protected by arc state mutex */ 255 arc_state_t *b_state; 256 list_node_t b_arc_node; 257 258 /* updated atomically */ 259 clock_t b_arc_access; 260 261 /* self protecting */ 262 refcount_t b_refcnt; 263 }; 264 265 static arc_buf_t *arc_eviction_list; 266 static kmutex_t arc_eviction_mtx; 267 static arc_buf_hdr_t arc_eviction_hdr; 268 static void arc_get_data_buf(arc_buf_t *buf); 269 static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); 270 271 #define GHOST_STATE(state) \ 272 ((state) == arc.mru_ghost || (state) == arc.mfu_ghost) 273 274 /* 275 * Private ARC flags. These flags are ARC-internal flags that will show up 276 * in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can 277 * be passed in as arc_flags in things like arc_read. However, these flags 278 * should never be passed and should only be set by ARC code. When adding new 279 * public flags, make sure not to smash the private ones. 
280 */ 281 282 #define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */ 283 #define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */ 284 #define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */ 285 #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ 286 #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ 287 #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ 288 289 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) 290 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) 291 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) 292 #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) 293 #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) 294 295 /* 296 * Hash table routines 297 */ 298 299 #define HT_LOCK_PAD 64 300 301 struct ht_lock { 302 kmutex_t ht_lock; 303 #ifdef _KERNEL 304 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 305 #endif 306 }; 307 308 #define BUF_LOCKS 256 309 typedef struct buf_hash_table { 310 uint64_t ht_mask; 311 arc_buf_hdr_t **ht_table; 312 struct ht_lock ht_locks[BUF_LOCKS]; 313 } buf_hash_table_t; 314 315 static buf_hash_table_t buf_hash_table; 316 317 #define BUF_HASH_INDEX(spa, dva, birth) \ 318 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 319 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 320 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 321 #define HDR_LOCK(buf) \ 322 (BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth))) 323 324 uint64_t zfs_crc64_table[256]; 325 326 static uint64_t 327 buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) 328 { 329 uintptr_t spav = (uintptr_t)spa; 330 uint8_t *vdva = (uint8_t *)dva; 331 uint64_t crc = -1ULL; 332 int i; 333 334 ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); 335 336 for (i = 0; i < sizeof (dva_t); i++) 337 crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; 338 339 crc ^= (spav>>8) ^ birth; 340 341 return (crc); 342 } 343 344 #define BUF_EMPTY(buf) \ 345 ((buf)->b_dva.dva_word[0] == 0 && \ 346 (buf)->b_dva.dva_word[1] == 0 && \ 347 (buf)->b_birth == 0) 348 349 #define BUF_EQUAL(spa, dva, birth, buf) \ 350 ((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 351 ((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 352 ((buf)->b_birth == birth) && ((buf)->b_spa == spa) 353 354 static arc_buf_hdr_t * 355 buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp) 356 { 357 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 358 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 359 arc_buf_hdr_t *buf; 360 361 mutex_enter(hash_lock); 362 for (buf = buf_hash_table.ht_table[idx]; buf != NULL; 363 buf = buf->b_hash_next) { 364 if (BUF_EQUAL(spa, dva, birth, buf)) { 365 *lockp = hash_lock; 366 return (buf); 367 } 368 } 369 mutex_exit(hash_lock); 370 *lockp = NULL; 371 return (NULL); 372 } 373 374 /* 375 * Insert an entry into the hash table. If there is already an element 376 * equal to elem in the hash table, then the already existing element 377 * will be returned and the new element will not be inserted. 378 * Otherwise returns NULL. 
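 *
 * An illustrative sketch of the caller-side lookup-or-insert pattern
 * (this mirrors what arc_read() does later in this file; the variables
 * spa, bp and the freshly allocated header "hdr" are assumed to come
 * from the caller's context):
 *
 *	kmutex_t *hash_lock;
 *	arc_buf_hdr_t *found, *exists;
 *
 *	found = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth,
 *	    &hash_lock);
 *	if (found == NULL) {
 *		exists = buf_hash_insert(hdr, &hash_lock);
 *		if (exists != NULL) {
 *			(somebody else inserted this DVA first --
 *			drop our header and restart with "exists")
 *		}
 *	}
 *	(at this point the hash chain lock is held and must be dropped)
 *	mutex_exit(hash_lock);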
379 */ 380 static arc_buf_hdr_t * 381 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp) 382 { 383 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 384 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 385 arc_buf_hdr_t *fbuf; 386 uint32_t max, i; 387 388 ASSERT(!HDR_IN_HASH_TABLE(buf)); 389 *lockp = hash_lock; 390 mutex_enter(hash_lock); 391 for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL; 392 fbuf = fbuf->b_hash_next, i++) { 393 if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf)) 394 return (fbuf); 395 } 396 397 buf->b_hash_next = buf_hash_table.ht_table[idx]; 398 buf_hash_table.ht_table[idx] = buf; 399 buf->b_flags |= ARC_IN_HASH_TABLE; 400 401 /* collect some hash table performance data */ 402 if (i > 0) { 403 atomic_add_64(&arc.hash_collisions, 1); 404 if (i == 1) 405 atomic_add_64(&arc.hash_chains, 1); 406 } 407 while (i > (max = arc.hash_chain_max) && 408 max != atomic_cas_32(&arc.hash_chain_max, max, i)) { 409 continue; 410 } 411 atomic_add_64(&arc.hash_elements, 1); 412 if (arc.hash_elements > arc.hash_elements_max) 413 atomic_add_64(&arc.hash_elements_max, 1); 414 415 return (NULL); 416 } 417 418 static void 419 buf_hash_remove(arc_buf_hdr_t *buf) 420 { 421 arc_buf_hdr_t *fbuf, **bufp; 422 uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth); 423 424 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 425 ASSERT(HDR_IN_HASH_TABLE(buf)); 426 427 bufp = &buf_hash_table.ht_table[idx]; 428 while ((fbuf = *bufp) != buf) { 429 ASSERT(fbuf != NULL); 430 bufp = &fbuf->b_hash_next; 431 } 432 *bufp = buf->b_hash_next; 433 buf->b_hash_next = NULL; 434 buf->b_flags &= ~ARC_IN_HASH_TABLE; 435 436 /* collect some hash table performance data */ 437 atomic_add_64(&arc.hash_elements, -1); 438 if (buf_hash_table.ht_table[idx] && 439 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 440 atomic_add_64(&arc.hash_chains, -1); 441 } 442 443 /* 444 * Global data structures and functions for the buf kmem cache. 445 */ 446 static kmem_cache_t *hdr_cache; 447 static kmem_cache_t *buf_cache; 448 449 static void 450 buf_fini(void) 451 { 452 int i; 453 454 kmem_free(buf_hash_table.ht_table, 455 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 456 for (i = 0; i < BUF_LOCKS; i++) 457 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 458 kmem_cache_destroy(hdr_cache); 459 kmem_cache_destroy(buf_cache); 460 } 461 462 /* 463 * Constructor callback - called when the cache is empty 464 * and a new buf is requested. 465 */ 466 /* ARGSUSED */ 467 static int 468 hdr_cons(void *vbuf, void *unused, int kmflag) 469 { 470 arc_buf_hdr_t *buf = vbuf; 471 472 bzero(buf, sizeof (arc_buf_hdr_t)); 473 refcount_create(&buf->b_refcnt); 474 cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); 475 return (0); 476 } 477 478 /* 479 * Destructor callback - called when a cached buf is 480 * no longer required. 481 */ 482 /* ARGSUSED */ 483 static void 484 hdr_dest(void *vbuf, void *unused) 485 { 486 arc_buf_hdr_t *buf = vbuf; 487 488 refcount_destroy(&buf->b_refcnt); 489 cv_destroy(&buf->b_cv); 490 } 491 492 static int arc_reclaim_needed(void); 493 void arc_kmem_reclaim(void); 494 495 /* 496 * Reclaim callback -- invoked when memory is low. 
497 */ 498 /* ARGSUSED */ 499 static void 500 hdr_recl(void *unused) 501 { 502 dprintf("hdr_recl called\n"); 503 if (arc_reclaim_needed()) 504 arc_kmem_reclaim(); 505 } 506 507 static void 508 buf_init(void) 509 { 510 uint64_t *ct; 511 uint64_t hsize = 1ULL << 12; 512 int i, j; 513 514 /* 515 * The hash table is big enough to fill all of physical memory 516 * with an average 64K block size. The table will take up 517 * totalmem*sizeof(void*)/64K (e.g. 128KB/GB with 8-byte pointers). 518 */ 519 while (hsize * 65536 < physmem * PAGESIZE) 520 hsize <<= 1; 521 retry: 522 buf_hash_table.ht_mask = hsize - 1; 523 buf_hash_table.ht_table = 524 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 525 if (buf_hash_table.ht_table == NULL) { 526 ASSERT(hsize > (1ULL << 8)); 527 hsize >>= 1; 528 goto retry; 529 } 530 531 hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t), 532 0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0); 533 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 534 0, NULL, NULL, NULL, NULL, NULL, 0); 535 536 for (i = 0; i < 256; i++) 537 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 538 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 539 540 for (i = 0; i < BUF_LOCKS; i++) { 541 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 542 NULL, MUTEX_DEFAULT, NULL); 543 } 544 } 545 546 #define ARC_MINTIME (hz>>4) /* 62 ms */ 547 548 static void 549 arc_cksum_verify(arc_buf_t *buf) 550 { 551 zio_cksum_t zc; 552 553 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 554 return; 555 556 mutex_enter(&buf->b_hdr->b_freeze_lock); 557 if (buf->b_hdr->b_freeze_cksum == NULL) { 558 mutex_exit(&buf->b_hdr->b_freeze_lock); 559 return; 560 } 561 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); 562 if (!ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc)) 563 panic("buffer modified while frozen!"); 564 mutex_exit(&buf->b_hdr->b_freeze_lock); 565 } 566 567 static void 568 arc_cksum_compute(arc_buf_t *buf) 569 { 570 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 571 return; 572 573 mutex_enter(&buf->b_hdr->b_freeze_lock); 574 if (buf->b_hdr->b_freeze_cksum != NULL) { 575 mutex_exit(&buf->b_hdr->b_freeze_lock); 576 return; 577 } 578 buf->b_hdr->b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); 579 fletcher_2_native(buf->b_data, buf->b_hdr->b_size, 580 buf->b_hdr->b_freeze_cksum); 581 mutex_exit(&buf->b_hdr->b_freeze_lock); 582 } 583 584 void 585 arc_buf_thaw(arc_buf_t *buf) 586 { 587 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 588 return; 589 590 if (buf->b_hdr->b_state != arc.anon) 591 panic("modifying non-anon buffer!"); 592 if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) 593 panic("modifying buffer while i/o in progress!"); 594 arc_cksum_verify(buf); 595 mutex_enter(&buf->b_hdr->b_freeze_lock); 596 if (buf->b_hdr->b_freeze_cksum != NULL) { 597 kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 598 buf->b_hdr->b_freeze_cksum = NULL; 599 } 600 mutex_exit(&buf->b_hdr->b_freeze_lock); 601 } 602 603 void 604 arc_buf_freeze(arc_buf_t *buf) 605 { 606 ASSERT(buf->b_hdr->b_freeze_cksum != NULL || 607 buf->b_hdr->b_state == arc.anon); 608 arc_cksum_compute(buf); 609 } 610 611 static void 612 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 613 { 614 ASSERT(MUTEX_HELD(hash_lock)); 615 616 if ((refcount_add(&ab->b_refcnt, tag) == 1) && 617 (ab->b_state != arc.anon)) { 618 int delta = ab->b_size * ab->b_datacnt; 619 620 ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); 621 mutex_enter(&ab->b_state->mtx); 622 ASSERT(list_link_active(&ab->b_arc_node)); 623 
list_remove(&ab->b_state->list, ab); 624 if (GHOST_STATE(ab->b_state)) { 625 ASSERT3U(ab->b_datacnt, ==, 0); 626 ASSERT3P(ab->b_buf, ==, NULL); 627 delta = ab->b_size; 628 } 629 ASSERT(delta > 0); 630 ASSERT3U(ab->b_state->lsize, >=, delta); 631 atomic_add_64(&ab->b_state->lsize, -delta); 632 mutex_exit(&ab->b_state->mtx); 633 /* remove the prefetch flag if we get a reference */ 634 if (ab->b_flags & ARC_PREFETCH) 635 ab->b_flags &= ~ARC_PREFETCH; 636 } 637 } 638 639 static int 640 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag) 641 { 642 int cnt; 643 644 ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock)); 645 ASSERT(!GHOST_STATE(ab->b_state)); 646 647 if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) && 648 (ab->b_state != arc.anon)) { 649 650 ASSERT(!MUTEX_HELD(&ab->b_state->mtx)); 651 mutex_enter(&ab->b_state->mtx); 652 ASSERT(!list_link_active(&ab->b_arc_node)); 653 list_insert_head(&ab->b_state->list, ab); 654 ASSERT(ab->b_datacnt > 0); 655 atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt); 656 ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize); 657 mutex_exit(&ab->b_state->mtx); 658 } 659 return (cnt); 660 } 661 662 /* 663 * Move the supplied buffer to the indicated state. The mutex 664 * for the buffer must be held by the caller. 665 */ 666 static void 667 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) 668 { 669 arc_state_t *old_state = ab->b_state; 670 int refcnt = refcount_count(&ab->b_refcnt); 671 int from_delta, to_delta; 672 673 ASSERT(MUTEX_HELD(hash_lock)); 674 ASSERT(new_state != old_state); 675 ASSERT(refcnt == 0 || ab->b_datacnt > 0); 676 ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); 677 678 from_delta = to_delta = ab->b_datacnt * ab->b_size; 679 680 /* 681 * If this buffer is evictable, transfer it from the 682 * old state list to the new state list. 683 */ 684 if (refcnt == 0) { 685 if (old_state != arc.anon) { 686 int use_mutex = !MUTEX_HELD(&old_state->mtx); 687 688 if (use_mutex) 689 mutex_enter(&old_state->mtx); 690 691 ASSERT(list_link_active(&ab->b_arc_node)); 692 list_remove(&old_state->list, ab); 693 694 /* 695 * If prefetching out of the ghost cache, 696 * we will have a non-zero datacnt. 
697 */ 698 if (GHOST_STATE(old_state) && ab->b_datacnt == 0) { 699 /* ghost elements have a ghost size */ 700 ASSERT(ab->b_buf == NULL); 701 from_delta = ab->b_size; 702 } 703 ASSERT3U(old_state->lsize, >=, from_delta); 704 atomic_add_64(&old_state->lsize, -from_delta); 705 706 if (use_mutex) 707 mutex_exit(&old_state->mtx); 708 } 709 if (new_state != arc.anon) { 710 int use_mutex = !MUTEX_HELD(&new_state->mtx); 711 712 if (use_mutex) 713 mutex_enter(&new_state->mtx); 714 715 list_insert_head(&new_state->list, ab); 716 717 /* ghost elements have a ghost size */ 718 if (GHOST_STATE(new_state)) { 719 ASSERT(ab->b_datacnt == 0); 720 ASSERT(ab->b_buf == NULL); 721 to_delta = ab->b_size; 722 } 723 atomic_add_64(&new_state->lsize, to_delta); 724 ASSERT3U(new_state->size + to_delta, >=, 725 new_state->lsize); 726 727 if (use_mutex) 728 mutex_exit(&new_state->mtx); 729 } 730 } 731 732 ASSERT(!BUF_EMPTY(ab)); 733 if (new_state == arc.anon && old_state != arc.anon) { 734 buf_hash_remove(ab); 735 } 736 737 /* adjust state sizes */ 738 if (to_delta) 739 atomic_add_64(&new_state->size, to_delta); 740 if (from_delta) { 741 ASSERT3U(old_state->size, >=, from_delta); 742 atomic_add_64(&old_state->size, -from_delta); 743 } 744 ab->b_state = new_state; 745 } 746 747 arc_buf_t * 748 arc_buf_alloc(spa_t *spa, int size, void *tag) 749 { 750 arc_buf_hdr_t *hdr; 751 arc_buf_t *buf; 752 753 ASSERT3U(size, >, 0); 754 hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP); 755 ASSERT(BUF_EMPTY(hdr)); 756 hdr->b_size = size; 757 hdr->b_spa = spa; 758 hdr->b_state = arc.anon; 759 hdr->b_arc_access = 0; 760 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 761 buf->b_hdr = hdr; 762 buf->b_data = NULL; 763 buf->b_efunc = NULL; 764 buf->b_private = NULL; 765 buf->b_next = NULL; 766 hdr->b_buf = buf; 767 arc_get_data_buf(buf); 768 hdr->b_datacnt = 1; 769 hdr->b_flags = 0; 770 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 771 (void) refcount_add(&hdr->b_refcnt, tag); 772 773 return (buf); 774 } 775 776 static arc_buf_t * 777 arc_buf_clone(arc_buf_t *from) 778 { 779 arc_buf_t *buf; 780 arc_buf_hdr_t *hdr = from->b_hdr; 781 uint64_t size = hdr->b_size; 782 783 buf = kmem_cache_alloc(buf_cache, KM_SLEEP); 784 buf->b_hdr = hdr; 785 buf->b_data = NULL; 786 buf->b_efunc = NULL; 787 buf->b_private = NULL; 788 buf->b_next = hdr->b_buf; 789 hdr->b_buf = buf; 790 arc_get_data_buf(buf); 791 bcopy(from->b_data, buf->b_data, size); 792 hdr->b_datacnt += 1; 793 return (buf); 794 } 795 796 void 797 arc_buf_add_ref(arc_buf_t *buf, void* tag) 798 { 799 arc_buf_hdr_t *hdr; 800 kmutex_t *hash_lock; 801 802 /* 803 * Check to see if this buffer is currently being evicted via 804 * arc_do_user_evicts(). 805 */ 806 mutex_enter(&arc_eviction_mtx); 807 hdr = buf->b_hdr; 808 if (hdr == NULL) { 809 mutex_exit(&arc_eviction_mtx); 810 return; 811 } 812 hash_lock = HDR_LOCK(hdr); 813 mutex_exit(&arc_eviction_mtx); 814 815 mutex_enter(hash_lock); 816 if (buf->b_data == NULL) { 817 /* 818 * This buffer is evicted. 
819 */ 820 mutex_exit(hash_lock); 821 return; 822 } 823 824 ASSERT(buf->b_hdr == hdr); 825 ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu); 826 add_reference(hdr, hash_lock, tag); 827 arc_access(hdr, hash_lock); 828 mutex_exit(hash_lock); 829 atomic_add_64(&arc.hits, 1); 830 } 831 832 static void 833 arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) 834 { 835 arc_buf_t **bufp; 836 837 /* free up data associated with the buf */ 838 if (buf->b_data) { 839 arc_state_t *state = buf->b_hdr->b_state; 840 uint64_t size = buf->b_hdr->b_size; 841 842 arc_cksum_verify(buf); 843 if (!recycle) { 844 zio_buf_free(buf->b_data, size); 845 atomic_add_64(&arc.size, -size); 846 } 847 if (list_link_active(&buf->b_hdr->b_arc_node)) { 848 ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt)); 849 ASSERT(state != arc.anon); 850 ASSERT3U(state->lsize, >=, size); 851 atomic_add_64(&state->lsize, -size); 852 } 853 ASSERT3U(state->size, >=, size); 854 atomic_add_64(&state->size, -size); 855 buf->b_data = NULL; 856 ASSERT(buf->b_hdr->b_datacnt > 0); 857 buf->b_hdr->b_datacnt -= 1; 858 } 859 860 /* only remove the buf if requested */ 861 if (!all) 862 return; 863 864 /* remove the buf from the hdr list */ 865 for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next) 866 continue; 867 *bufp = buf->b_next; 868 869 ASSERT(buf->b_efunc == NULL); 870 871 /* clean up the buf */ 872 buf->b_hdr = NULL; 873 kmem_cache_free(buf_cache, buf); 874 } 875 876 static void 877 arc_hdr_destroy(arc_buf_hdr_t *hdr) 878 { 879 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 880 ASSERT3P(hdr->b_state, ==, arc.anon); 881 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 882 883 if (!BUF_EMPTY(hdr)) { 884 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 885 bzero(&hdr->b_dva, sizeof (dva_t)); 886 hdr->b_birth = 0; 887 hdr->b_cksum0 = 0; 888 } 889 while (hdr->b_buf) { 890 arc_buf_t *buf = hdr->b_buf; 891 892 if (buf->b_efunc) { 893 mutex_enter(&arc_eviction_mtx); 894 ASSERT(buf->b_hdr != NULL); 895 arc_buf_destroy(hdr->b_buf, FALSE, FALSE); 896 hdr->b_buf = buf->b_next; 897 buf->b_hdr = &arc_eviction_hdr; 898 buf->b_next = arc_eviction_list; 899 arc_eviction_list = buf; 900 mutex_exit(&arc_eviction_mtx); 901 } else { 902 arc_buf_destroy(hdr->b_buf, FALSE, TRUE); 903 } 904 } 905 if (hdr->b_freeze_cksum != NULL) { 906 kmem_free(hdr->b_freeze_cksum, sizeof (zio_cksum_t)); 907 hdr->b_freeze_cksum = NULL; 908 } 909 910 ASSERT(!list_link_active(&hdr->b_arc_node)); 911 ASSERT3P(hdr->b_hash_next, ==, NULL); 912 ASSERT3P(hdr->b_acb, ==, NULL); 913 kmem_cache_free(hdr_cache, hdr); 914 } 915 916 void 917 arc_buf_free(arc_buf_t *buf, void *tag) 918 { 919 arc_buf_hdr_t *hdr = buf->b_hdr; 920 int hashed = hdr->b_state != arc.anon; 921 922 ASSERT(buf->b_efunc == NULL); 923 ASSERT(buf->b_data != NULL); 924 925 if (hashed) { 926 kmutex_t *hash_lock = HDR_LOCK(hdr); 927 928 mutex_enter(hash_lock); 929 (void) remove_reference(hdr, hash_lock, tag); 930 if (hdr->b_datacnt > 1) 931 arc_buf_destroy(buf, FALSE, TRUE); 932 else 933 hdr->b_flags |= ARC_BUF_AVAILABLE; 934 mutex_exit(hash_lock); 935 } else if (HDR_IO_IN_PROGRESS(hdr)) { 936 int destroy_hdr; 937 /* 938 * We are in the middle of an async write. Don't destroy 939 * this buffer unless the write completes before we finish 940 * decrementing the reference count. 
941 */ 942 mutex_enter(&arc_eviction_mtx); 943 (void) remove_reference(hdr, NULL, tag); 944 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 945 destroy_hdr = !HDR_IO_IN_PROGRESS(hdr); 946 mutex_exit(&arc_eviction_mtx); 947 if (destroy_hdr) 948 arc_hdr_destroy(hdr); 949 } else { 950 if (remove_reference(hdr, NULL, tag) > 0) { 951 ASSERT(HDR_IO_ERROR(hdr)); 952 arc_buf_destroy(buf, FALSE, TRUE); 953 } else { 954 arc_hdr_destroy(hdr); 955 } 956 } 957 } 958 959 int 960 arc_buf_remove_ref(arc_buf_t *buf, void* tag) 961 { 962 arc_buf_hdr_t *hdr = buf->b_hdr; 963 kmutex_t *hash_lock = HDR_LOCK(hdr); 964 int no_callback = (buf->b_efunc == NULL); 965 966 if (hdr->b_state == arc.anon) { 967 arc_buf_free(buf, tag); 968 return (no_callback); 969 } 970 971 mutex_enter(hash_lock); 972 ASSERT(hdr->b_state != arc.anon); 973 ASSERT(buf->b_data != NULL); 974 975 (void) remove_reference(hdr, hash_lock, tag); 976 if (hdr->b_datacnt > 1) { 977 if (no_callback) 978 arc_buf_destroy(buf, FALSE, TRUE); 979 } else if (no_callback) { 980 ASSERT(hdr->b_buf == buf && buf->b_next == NULL); 981 hdr->b_flags |= ARC_BUF_AVAILABLE; 982 } 983 ASSERT(no_callback || hdr->b_datacnt > 1 || 984 refcount_is_zero(&hdr->b_refcnt)); 985 mutex_exit(hash_lock); 986 return (no_callback); 987 } 988 989 int 990 arc_buf_size(arc_buf_t *buf) 991 { 992 return (buf->b_hdr->b_size); 993 } 994 995 /* 996 * Evict buffers from list until we've removed the specified number of 997 * bytes. Move the removed buffers to the appropriate evict state. 998 * If the recycle flag is set, then attempt to "recycle" a buffer: 999 * - look for a buffer to evict that is `bytes' long. 1000 * - return the data block from this buffer rather than freeing it. 1001 * This flag is used by callers that are trying to make space for a 1002 * new buffer in a full arc cache. 1003 */ 1004 static void * 1005 arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle) 1006 { 1007 arc_state_t *evicted_state; 1008 uint64_t bytes_evicted = 0, skipped = 0, missed = 0; 1009 arc_buf_hdr_t *ab, *ab_prev = NULL; 1010 kmutex_t *hash_lock; 1011 boolean_t have_lock; 1012 void *stolen = NULL; 1013 1014 ASSERT(state == arc.mru || state == arc.mfu); 1015 1016 evicted_state = (state == arc.mru) ? 
arc.mru_ghost : arc.mfu_ghost; 1017 1018 mutex_enter(&state->mtx); 1019 mutex_enter(&evicted_state->mtx); 1020 1021 for (ab = list_tail(&state->list); ab; ab = ab_prev) { 1022 ab_prev = list_prev(&state->list, ab); 1023 /* prefetch buffers have a minimum lifespan */ 1024 if (HDR_IO_IN_PROGRESS(ab) || 1025 (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && 1026 lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { 1027 skipped++; 1028 continue; 1029 } 1030 /* "lookahead" for better eviction candidate */ 1031 if (recycle && ab->b_size != bytes && 1032 ab_prev && ab_prev->b_size == bytes) 1033 continue; 1034 hash_lock = HDR_LOCK(ab); 1035 have_lock = MUTEX_HELD(hash_lock); 1036 if (have_lock || mutex_tryenter(hash_lock)) { 1037 ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0); 1038 ASSERT(ab->b_datacnt > 0); 1039 while (ab->b_buf) { 1040 arc_buf_t *buf = ab->b_buf; 1041 if (buf->b_data) { 1042 bytes_evicted += ab->b_size; 1043 if (recycle && ab->b_size == bytes) { 1044 stolen = buf->b_data; 1045 recycle = FALSE; 1046 } 1047 } 1048 if (buf->b_efunc) { 1049 mutex_enter(&arc_eviction_mtx); 1050 arc_buf_destroy(buf, 1051 buf->b_data == stolen, FALSE); 1052 ab->b_buf = buf->b_next; 1053 buf->b_hdr = &arc_eviction_hdr; 1054 buf->b_next = arc_eviction_list; 1055 arc_eviction_list = buf; 1056 mutex_exit(&arc_eviction_mtx); 1057 } else { 1058 arc_buf_destroy(buf, 1059 buf->b_data == stolen, TRUE); 1060 } 1061 } 1062 ASSERT(ab->b_datacnt == 0); 1063 arc_change_state(evicted_state, ab, hash_lock); 1064 ASSERT(HDR_IN_HASH_TABLE(ab)); 1065 ab->b_flags = ARC_IN_HASH_TABLE; 1066 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); 1067 if (!have_lock) 1068 mutex_exit(hash_lock); 1069 if (bytes >= 0 && bytes_evicted >= bytes) 1070 break; 1071 } else { 1072 missed += 1; 1073 } 1074 } 1075 mutex_exit(&evicted_state->mtx); 1076 mutex_exit(&state->mtx); 1077 1078 if (bytes_evicted < bytes) 1079 dprintf("only evicted %lld bytes from %x", 1080 (longlong_t)bytes_evicted, state); 1081 1082 if (skipped) 1083 atomic_add_64(&arc.evict_skip, skipped); 1084 if (missed) 1085 atomic_add_64(&arc.mutex_miss, missed); 1086 return (stolen); 1087 } 1088 1089 /* 1090 * Remove buffers from list until we've removed the specified number of 1091 * bytes. Destroy the buffers that are removed. 
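 *
 * For example (illustrative only), arc_adjust() below trims the ghost
 * lists with calls of the form:
 *
 *	int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
 *	arc_evict_ghost(arc.mru_ghost, todelete);
 *
 * while a negative byte count, as used by arc_flush(), means "delete
 * every evictable ghost buffer".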
1092 */ 1093 static void 1094 arc_evict_ghost(arc_state_t *state, int64_t bytes) 1095 { 1096 arc_buf_hdr_t *ab, *ab_prev; 1097 kmutex_t *hash_lock; 1098 uint64_t bytes_deleted = 0; 1099 uint_t bufs_skipped = 0; 1100 1101 ASSERT(GHOST_STATE(state)); 1102 top: 1103 mutex_enter(&state->mtx); 1104 for (ab = list_tail(&state->list); ab; ab = ab_prev) { 1105 ab_prev = list_prev(&state->list, ab); 1106 hash_lock = HDR_LOCK(ab); 1107 if (mutex_tryenter(hash_lock)) { 1108 ASSERT(!HDR_IO_IN_PROGRESS(ab)); 1109 ASSERT(ab->b_buf == NULL); 1110 arc_change_state(arc.anon, ab, hash_lock); 1111 mutex_exit(hash_lock); 1112 atomic_add_64(&arc.deleted, 1); 1113 bytes_deleted += ab->b_size; 1114 arc_hdr_destroy(ab); 1115 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); 1116 if (bytes >= 0 && bytes_deleted >= bytes) 1117 break; 1118 } else { 1119 if (bytes < 0) { 1120 mutex_exit(&state->mtx); 1121 mutex_enter(hash_lock); 1122 mutex_exit(hash_lock); 1123 goto top; 1124 } 1125 bufs_skipped += 1; 1126 } 1127 } 1128 mutex_exit(&state->mtx); 1129 1130 if (bufs_skipped) { 1131 atomic_add_64(&arc.mutex_miss, bufs_skipped); 1132 ASSERT(bytes >= 0); 1133 } 1134 1135 if (bytes_deleted < bytes) 1136 dprintf("only deleted %lld bytes from %p", 1137 (longlong_t)bytes_deleted, state); 1138 } 1139 1140 static void 1141 arc_adjust(void) 1142 { 1143 int64_t top_sz, mru_over, arc_over; 1144 1145 top_sz = arc.anon->size + arc.mru->size; 1146 1147 if (top_sz > arc.p && arc.mru->lsize > 0) { 1148 int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p); 1149 (void) arc_evict(arc.mru, toevict, FALSE); 1150 top_sz = arc.anon->size + arc.mru->size; 1151 } 1152 1153 mru_over = top_sz + arc.mru_ghost->size - arc.c; 1154 1155 if (mru_over > 0) { 1156 if (arc.mru_ghost->lsize > 0) { 1157 int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over); 1158 arc_evict_ghost(arc.mru_ghost, todelete); 1159 } 1160 } 1161 1162 if ((arc_over = arc.size - arc.c) > 0) { 1163 int64_t tbl_over; 1164 1165 if (arc.mfu->lsize > 0) { 1166 int64_t toevict = MIN(arc.mfu->lsize, arc_over); 1167 (void) arc_evict(arc.mfu, toevict, FALSE); 1168 } 1169 1170 tbl_over = arc.size + arc.mru_ghost->lsize + 1171 arc.mfu_ghost->lsize - arc.c*2; 1172 1173 if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) { 1174 int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over); 1175 arc_evict_ghost(arc.mfu_ghost, todelete); 1176 } 1177 } 1178 } 1179 1180 static void 1181 arc_do_user_evicts(void) 1182 { 1183 mutex_enter(&arc_eviction_mtx); 1184 while (arc_eviction_list != NULL) { 1185 arc_buf_t *buf = arc_eviction_list; 1186 arc_eviction_list = buf->b_next; 1187 buf->b_hdr = NULL; 1188 mutex_exit(&arc_eviction_mtx); 1189 1190 if (buf->b_efunc != NULL) 1191 VERIFY(buf->b_efunc(buf) == 0); 1192 1193 buf->b_efunc = NULL; 1194 buf->b_private = NULL; 1195 kmem_cache_free(buf_cache, buf); 1196 mutex_enter(&arc_eviction_mtx); 1197 } 1198 mutex_exit(&arc_eviction_mtx); 1199 } 1200 1201 /* 1202 * Flush all *evictable* data from the cache. 1203 * NOTE: this will not touch "active" (i.e. referenced) data. 
1204 */ 1205 void 1206 arc_flush(void) 1207 { 1208 while (list_head(&arc.mru->list)) 1209 (void) arc_evict(arc.mru, -1, FALSE); 1210 while (list_head(&arc.mfu->list)) 1211 (void) arc_evict(arc.mfu, -1, FALSE); 1212 1213 arc_evict_ghost(arc.mru_ghost, -1); 1214 arc_evict_ghost(arc.mfu_ghost, -1); 1215 1216 mutex_enter(&arc_reclaim_thr_lock); 1217 arc_do_user_evicts(); 1218 mutex_exit(&arc_reclaim_thr_lock); 1219 ASSERT(arc_eviction_list == NULL); 1220 } 1221 1222 int arc_kmem_reclaim_shift = 5; /* log2(fraction of arc to reclaim) */ 1223 1224 void 1225 arc_kmem_reclaim(void) 1226 { 1227 uint64_t to_free; 1228 1229 /* 1230 * We need arc_reclaim_lock because we don't want multiple 1231 * threads trying to reclaim concurrently. 1232 */ 1233 1234 /* 1235 * umem calls the reclaim func when we destroy the buf cache, 1236 * which is after we do arc_fini(). So we set a flag to prevent 1237 * accessing the destroyed mutexes and lists. 1238 */ 1239 if (arc_dead) 1240 return; 1241 1242 if (arc.c <= arc.c_min) 1243 return; 1244 1245 mutex_enter(&arc_reclaim_lock); 1246 1247 #ifdef _KERNEL 1248 to_free = MAX(arc.c >> arc_kmem_reclaim_shift, ptob(needfree)); 1249 #else 1250 to_free = arc.c >> arc_kmem_reclaim_shift; 1251 #endif 1252 if (arc.c > to_free) 1253 atomic_add_64(&arc.c, -to_free); 1254 else 1255 arc.c = arc.c_min; 1256 1257 atomic_add_64(&arc.p, -(arc.p >> arc_kmem_reclaim_shift)); 1258 if (arc.c > arc.size) 1259 arc.c = arc.size; 1260 if (arc.c < arc.c_min) 1261 arc.c = arc.c_min; 1262 if (arc.p > arc.c) 1263 arc.p = (arc.c >> 1); 1264 ASSERT((int64_t)arc.p >= 0); 1265 1266 arc_adjust(); 1267 1268 mutex_exit(&arc_reclaim_lock); 1269 } 1270 1271 static int 1272 arc_reclaim_needed(void) 1273 { 1274 uint64_t extra; 1275 1276 #ifdef _KERNEL 1277 1278 if (needfree) 1279 return (1); 1280 1281 /* 1282 * take 'desfree' extra pages, so we reclaim sooner, rather than later 1283 */ 1284 extra = desfree; 1285 1286 /* 1287 * check that we're out of range of the pageout scanner. It starts to 1288 * schedule paging if freemem is less than lotsfree and needfree. 1289 * lotsfree is the high-water mark for pageout, and needfree is the 1290 * number of needed free pages. We add extra pages here to make sure 1291 * the scanner doesn't start up while we're freeing memory. 1292 */ 1293 if (freemem < lotsfree + needfree + extra) 1294 return (1); 1295 1296 /* 1297 * check to make sure that swapfs has enough space so that anon 1298 * reservations can still succeeed. anon_resvmem() checks that the 1299 * availrmem is greater than swapfs_minfree, and the number of reserved 1300 * swap pages. We also add a bit of extra here just to prevent 1301 * circumstances from getting really dire. 1302 */ 1303 if (availrmem < swapfs_minfree + swapfs_reserve + extra) 1304 return (1); 1305 1306 #if defined(__i386) 1307 /* 1308 * If we're on an i386 platform, it's possible that we'll exhaust the 1309 * kernel heap space before we ever run out of available physical 1310 * memory. Most checks of the size of the heap_area compare against 1311 * tune.t_minarmem, which is the minimum available real memory that we 1312 * can have in the system. However, this is generally fixed at 25 pages 1313 * which is so low that it's useless. In this comparison, we seek to 1314 * calculate the total heap-size, and reclaim if more than 3/4ths of the 1315 * heap is allocated. 
(Or, in the calculation, if less than 1/4th is 1316 * free) 1317 */ 1318 if (btop(vmem_size(heap_arena, VMEM_FREE)) < 1319 (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2)) 1320 return (1); 1321 #endif 1322 1323 #else 1324 if (spa_get_random(100) == 0) 1325 return (1); 1326 #endif 1327 return (0); 1328 } 1329 1330 static void 1331 arc_kmem_reap_now(arc_reclaim_strategy_t strat) 1332 { 1333 size_t i; 1334 kmem_cache_t *prev_cache = NULL; 1335 extern kmem_cache_t *zio_buf_cache[]; 1336 1337 #ifdef _KERNEL 1338 /* 1339 * First purge some DNLC entries, in case the DNLC is using 1340 * up too much memory. 1341 */ 1342 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 1343 1344 #if defined(__i386) 1345 /* 1346 * Reclaim unused memory from all kmem caches. 1347 */ 1348 kmem_reap(); 1349 #endif 1350 #endif 1351 1352 /* 1353 * An aggressive reclamation will shrink the cache size as well as 1354 * reap free buffers from the arc kmem caches. 1355 */ 1356 if (strat == ARC_RECLAIM_AGGR) 1357 arc_kmem_reclaim(); 1358 1359 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 1360 if (zio_buf_cache[i] != prev_cache) { 1361 prev_cache = zio_buf_cache[i]; 1362 kmem_cache_reap_now(zio_buf_cache[i]); 1363 } 1364 } 1365 kmem_cache_reap_now(buf_cache); 1366 kmem_cache_reap_now(hdr_cache); 1367 } 1368 1369 static void 1370 arc_reclaim_thread(void) 1371 { 1372 clock_t growtime = 0; 1373 arc_reclaim_strategy_t last_reclaim = ARC_RECLAIM_CONS; 1374 callb_cpr_t cpr; 1375 1376 CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG); 1377 1378 mutex_enter(&arc_reclaim_thr_lock); 1379 while (arc_thread_exit == 0) { 1380 if (arc_reclaim_needed()) { 1381 1382 if (arc.no_grow) { 1383 if (last_reclaim == ARC_RECLAIM_CONS) { 1384 last_reclaim = ARC_RECLAIM_AGGR; 1385 } else { 1386 last_reclaim = ARC_RECLAIM_CONS; 1387 } 1388 } else { 1389 arc.no_grow = TRUE; 1390 last_reclaim = ARC_RECLAIM_AGGR; 1391 membar_producer(); 1392 } 1393 1394 /* reset the growth delay for every reclaim */ 1395 growtime = lbolt + (arc_grow_retry * hz); 1396 ASSERT(growtime > 0); 1397 1398 arc_kmem_reap_now(last_reclaim); 1399 1400 } else if ((growtime > 0) && ((growtime - lbolt) <= 0)) { 1401 arc.no_grow = FALSE; 1402 } 1403 1404 if (arc_eviction_list != NULL) 1405 arc_do_user_evicts(); 1406 1407 /* block until needed, or one second, whichever is shorter */ 1408 CALLB_CPR_SAFE_BEGIN(&cpr); 1409 (void) cv_timedwait(&arc_reclaim_thr_cv, 1410 &arc_reclaim_thr_lock, (lbolt + hz)); 1411 CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); 1412 } 1413 1414 arc_thread_exit = 0; 1415 cv_broadcast(&arc_reclaim_thr_cv); 1416 CALLB_CPR_EXIT(&cpr); /* drops arc_reclaim_thr_lock */ 1417 thread_exit(); 1418 } 1419 1420 /* 1421 * Adapt arc info given the number of bytes we are trying to add and 1422 * the state that we are coming from. This function is only called 1423 * when we are adding new content to the cache. 1424 */ 1425 static void 1426 arc_adapt(int bytes, arc_state_t *state) 1427 { 1428 int mult; 1429 1430 ASSERT(bytes > 0); 1431 /* 1432 * Adapt the target size of the MRU list: 1433 * - if we just hit in the MRU ghost list, then increase 1434 * the target size of the MRU list. 1435 * - if we just hit in the MFU ghost list, then increase 1436 * the target size of the MFU list by decreasing the 1437 * target size of the MRU list. 1438 */ 1439 if (state == arc.mru_ghost) { 1440 mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ? 
1441 1 : (arc.mfu_ghost->size/arc.mru_ghost->size)); 1442 1443 arc.p = MIN(arc.c, arc.p + bytes * mult); 1444 } else if (state == arc.mfu_ghost) { 1445 mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ? 1446 1 : (arc.mru_ghost->size/arc.mfu_ghost->size)); 1447 1448 arc.p = MAX(0, (int64_t)arc.p - bytes * mult); 1449 } 1450 ASSERT((int64_t)arc.p >= 0); 1451 1452 if (arc_reclaim_needed()) { 1453 cv_signal(&arc_reclaim_thr_cv); 1454 return; 1455 } 1456 1457 if (arc.no_grow) 1458 return; 1459 1460 if (arc.c >= arc.c_max) 1461 return; 1462 1463 /* 1464 * If we're within (2 * maxblocksize) bytes of the target 1465 * cache size, increment the target cache size 1466 */ 1467 if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) { 1468 atomic_add_64(&arc.c, (int64_t)bytes); 1469 if (arc.c > arc.c_max) 1470 arc.c = arc.c_max; 1471 else if (state == arc.anon) 1472 atomic_add_64(&arc.p, (int64_t)bytes); 1473 if (arc.p > arc.c) 1474 arc.p = arc.c; 1475 } 1476 ASSERT((int64_t)arc.p >= 0); 1477 } 1478 1479 /* 1480 * Check if the cache has reached its limits and eviction is required 1481 * prior to insert. 1482 */ 1483 static int 1484 arc_evict_needed() 1485 { 1486 if (arc_reclaim_needed()) 1487 return (1); 1488 1489 return (arc.size > arc.c); 1490 } 1491 1492 /* 1493 * The buffer, supplied as the first argument, needs a data block. 1494 * So, if we are at cache max, determine which cache should be victimized. 1495 * We have the following cases: 1496 * 1497 * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) -> 1498 * In this situation if we're out of space, but the resident size of the MFU is 1499 * under the limit, victimize the MFU cache to satisfy this insertion request. 1500 * 1501 * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) -> 1502 * Here, we've used up all of the available space for the MRU, so we need to 1503 * evict from our own cache instead. Evict from the set of resident MRU 1504 * entries. 1505 * 1506 * 3. Insert for MFU (c - p) > sizeof(arc.mfu) -> 1507 * c minus p represents the MFU space in the cache, since p is the size of the 1508 * cache that is dedicated to the MRU. In this situation there's still space on 1509 * the MFU side, so the MRU side needs to be victimized. 1510 * 1511 * 4. Insert for MFU (c - p) < sizeof(arc.mfu) -> 1512 * MFU's resident set is consuming more space than it has been allotted. In 1513 * this situation, we must victimize our own cache, the MFU, for this insertion. 1514 */ 1515 static void 1516 arc_get_data_buf(arc_buf_t *buf) 1517 { 1518 arc_state_t *state = buf->b_hdr->b_state; 1519 uint64_t size = buf->b_hdr->b_size; 1520 1521 arc_adapt(size, state); 1522 1523 /* 1524 * We have not yet reached cache maximum size, 1525 * just allocate a new buffer. 1526 */ 1527 if (!arc_evict_needed()) { 1528 buf->b_data = zio_buf_alloc(size); 1529 atomic_add_64(&arc.size, size); 1530 goto out; 1531 } 1532 1533 /* 1534 * If we are prefetching from the mfu ghost list, this buffer 1535 * will end up on the mru list; so steal space from there. 1536 */ 1537 if (state == arc.mfu_ghost) 1538 state = buf->b_hdr->b_flags & ARC_PREFETCH ? arc.mru : arc.mfu; 1539 else if (state == arc.mru_ghost) 1540 state = arc.mru; 1541 1542 if (state == arc.mru || state == arc.anon) { 1543 uint64_t mru_used = arc.anon->size + arc.mru->size; 1544 state = (arc.p > mru_used) ? arc.mfu : arc.mru; 1545 } else { 1546 /* MFU cases */ 1547 uint64_t mfu_space = arc.c - arc.p; 1548 state = (mfu_space > arc.mfu->size) ? 
arc.mru : arc.mfu; 1549 } 1550 if ((buf->b_data = arc_evict(state, size, TRUE)) == NULL) { 1551 buf->b_data = zio_buf_alloc(size); 1552 atomic_add_64(&arc.size, size); 1553 atomic_add_64(&arc.recycle_miss, 1); 1554 } 1555 ASSERT(buf->b_data != NULL); 1556 out: 1557 /* 1558 * Update the state size. Note that ghost states have a 1559 * "ghost size" and so don't need to be updated. 1560 */ 1561 if (!GHOST_STATE(buf->b_hdr->b_state)) { 1562 arc_buf_hdr_t *hdr = buf->b_hdr; 1563 1564 atomic_add_64(&hdr->b_state->size, size); 1565 if (list_link_active(&hdr->b_arc_node)) { 1566 ASSERT(refcount_is_zero(&hdr->b_refcnt)); 1567 atomic_add_64(&hdr->b_state->lsize, size); 1568 } 1569 } 1570 } 1571 1572 /* 1573 * This routine is called whenever a buffer is accessed. 1574 * NOTE: the hash lock is dropped in this function. 1575 */ 1576 static void 1577 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) 1578 { 1579 ASSERT(MUTEX_HELD(hash_lock)); 1580 1581 if (buf->b_state == arc.anon) { 1582 /* 1583 * This buffer is not in the cache, and does not 1584 * appear in our "ghost" list. Add the new buffer 1585 * to the MRU state. 1586 */ 1587 1588 ASSERT(buf->b_arc_access == 0); 1589 buf->b_arc_access = lbolt; 1590 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1591 arc_change_state(arc.mru, buf, hash_lock); 1592 1593 } else if (buf->b_state == arc.mru) { 1594 /* 1595 * If this buffer is here because of a prefetch, then either: 1596 * - clear the flag if this is a "referencing" read 1597 * (any subsequent access will bump this into the MFU state). 1598 * or 1599 * - move the buffer to the head of the list if this is 1600 * another prefetch (to make it less likely to be evicted). 1601 */ 1602 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1603 if (refcount_count(&buf->b_refcnt) == 0) { 1604 ASSERT(list_link_active(&buf->b_arc_node)); 1605 mutex_enter(&arc.mru->mtx); 1606 list_remove(&arc.mru->list, buf); 1607 list_insert_head(&arc.mru->list, buf); 1608 mutex_exit(&arc.mru->mtx); 1609 } else { 1610 buf->b_flags &= ~ARC_PREFETCH; 1611 atomic_add_64(&arc.mru->hits, 1); 1612 } 1613 buf->b_arc_access = lbolt; 1614 return; 1615 } 1616 1617 /* 1618 * This buffer has been "accessed" only once so far, 1619 * but it is still in the cache. Move it to the MFU 1620 * state. 1621 */ 1622 if (lbolt > buf->b_arc_access + ARC_MINTIME) { 1623 /* 1624 * More than 125ms have passed since we 1625 * instantiated this buffer. Move it to the 1626 * most frequently used state. 1627 */ 1628 buf->b_arc_access = lbolt; 1629 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1630 arc_change_state(arc.mfu, buf, hash_lock); 1631 } 1632 atomic_add_64(&arc.mru->hits, 1); 1633 } else if (buf->b_state == arc.mru_ghost) { 1634 arc_state_t *new_state; 1635 /* 1636 * This buffer has been "accessed" recently, but 1637 * was evicted from the cache. Move it to the 1638 * MFU state. 1639 */ 1640 1641 if (buf->b_flags & ARC_PREFETCH) { 1642 new_state = arc.mru; 1643 if (refcount_count(&buf->b_refcnt) > 0) 1644 buf->b_flags &= ~ARC_PREFETCH; 1645 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); 1646 } else { 1647 new_state = arc.mfu; 1648 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1649 } 1650 1651 buf->b_arc_access = lbolt; 1652 arc_change_state(new_state, buf, hash_lock); 1653 1654 atomic_add_64(&arc.mru_ghost->hits, 1); 1655 } else if (buf->b_state == arc.mfu) { 1656 /* 1657 * This buffer has been accessed more than once and is 1658 * still in the cache. Keep it in the MFU state. 
1659 * 1660 * NOTE: an add_reference() that occurred when we did 1661 * the arc_read() will have kicked this off the list. 1662 * If it was a prefetch, we will explicitly move it to 1663 * the head of the list now. 1664 */ 1665 if ((buf->b_flags & ARC_PREFETCH) != 0) { 1666 ASSERT(refcount_count(&buf->b_refcnt) == 0); 1667 ASSERT(list_link_active(&buf->b_arc_node)); 1668 mutex_enter(&arc.mfu->mtx); 1669 list_remove(&arc.mfu->list, buf); 1670 list_insert_head(&arc.mfu->list, buf); 1671 mutex_exit(&arc.mfu->mtx); 1672 } 1673 atomic_add_64(&arc.mfu->hits, 1); 1674 buf->b_arc_access = lbolt; 1675 } else if (buf->b_state == arc.mfu_ghost) { 1676 arc_state_t *new_state = arc.mfu; 1677 /* 1678 * This buffer has been accessed more than once but has 1679 * been evicted from the cache. Move it back to the 1680 * MFU state. 1681 */ 1682 1683 if (buf->b_flags & ARC_PREFETCH) { 1684 /* 1685 * This is a prefetch access... 1686 * move this block back to the MRU state. 1687 */ 1688 ASSERT3U(refcount_count(&buf->b_refcnt), ==, 0); 1689 new_state = arc.mru; 1690 } 1691 1692 buf->b_arc_access = lbolt; 1693 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); 1694 arc_change_state(new_state, buf, hash_lock); 1695 1696 atomic_add_64(&arc.mfu_ghost->hits, 1); 1697 } else { 1698 ASSERT(!"invalid arc state"); 1699 } 1700 } 1701 1702 /* a generic arc_done_func_t which you can use */ 1703 /* ARGSUSED */ 1704 void 1705 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) 1706 { 1707 bcopy(buf->b_data, arg, buf->b_hdr->b_size); 1708 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1709 } 1710 1711 /* a generic arc_done_func_t which you can use */ 1712 void 1713 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) 1714 { 1715 arc_buf_t **bufp = arg; 1716 if (zio && zio->io_error) { 1717 VERIFY(arc_buf_remove_ref(buf, arg) == 1); 1718 *bufp = NULL; 1719 } else { 1720 *bufp = buf; 1721 } 1722 } 1723 1724 static void 1725 arc_read_done(zio_t *zio) 1726 { 1727 arc_buf_hdr_t *hdr, *found; 1728 arc_buf_t *buf; 1729 arc_buf_t *abuf; /* buffer we're assigning to callback */ 1730 kmutex_t *hash_lock; 1731 arc_callback_t *callback_list, *acb; 1732 int freeable = FALSE; 1733 1734 buf = zio->io_private; 1735 hdr = buf->b_hdr; 1736 1737 /* 1738 * The hdr was inserted into hash-table and removed from lists 1739 * prior to starting I/O. We should find this header, since 1740 * it's in the hash table, and it should be legit since it's 1741 * not possible to evict it during the I/O. The only possible 1742 * reason for it not to be found is if we were freed during the 1743 * read. 
1744 */ 1745 found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, 1746 &hash_lock); 1747 1748 ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || 1749 (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); 1750 1751 /* byteswap if necessary */ 1752 callback_list = hdr->b_acb; 1753 ASSERT(callback_list != NULL); 1754 if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) 1755 callback_list->acb_byteswap(buf->b_data, hdr->b_size); 1756 1757 arc_cksum_compute(buf); 1758 1759 /* create copies of the data buffer for the callers */ 1760 abuf = buf; 1761 for (acb = callback_list; acb; acb = acb->acb_next) { 1762 if (acb->acb_done) { 1763 if (abuf == NULL) 1764 abuf = arc_buf_clone(buf); 1765 acb->acb_buf = abuf; 1766 abuf = NULL; 1767 } 1768 } 1769 hdr->b_acb = NULL; 1770 hdr->b_flags &= ~ARC_IO_IN_PROGRESS; 1771 ASSERT(!HDR_BUF_AVAILABLE(hdr)); 1772 if (abuf == buf) 1773 hdr->b_flags |= ARC_BUF_AVAILABLE; 1774 1775 ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); 1776 1777 if (zio->io_error != 0) { 1778 hdr->b_flags |= ARC_IO_ERROR; 1779 if (hdr->b_state != arc.anon) 1780 arc_change_state(arc.anon, hdr, hash_lock); 1781 if (HDR_IN_HASH_TABLE(hdr)) 1782 buf_hash_remove(hdr); 1783 freeable = refcount_is_zero(&hdr->b_refcnt); 1784 /* convert checksum errors into IO errors */ 1785 if (zio->io_error == ECKSUM) 1786 zio->io_error = EIO; 1787 } 1788 1789 /* 1790 * Broadcast before we drop the hash_lock to avoid the possibility 1791 * that the hdr (and hence the cv) might be freed before we get to 1792 * the cv_broadcast(). 1793 */ 1794 cv_broadcast(&hdr->b_cv); 1795 1796 if (hash_lock) { 1797 /* 1798 * Only call arc_access on anonymous buffers. This is because 1799 * if we've issued an I/O for an evicted buffer, we've already 1800 * called arc_access (to prevent any simultaneous readers from 1801 * getting confused). 1802 */ 1803 if (zio->io_error == 0 && hdr->b_state == arc.anon) 1804 arc_access(hdr, hash_lock); 1805 mutex_exit(hash_lock); 1806 } else { 1807 /* 1808 * This block was freed while we waited for the read to 1809 * complete. It has been removed from the hash table and 1810 * moved to the anonymous state (so that it won't show up 1811 * in the cache). 1812 */ 1813 ASSERT3P(hdr->b_state, ==, arc.anon); 1814 freeable = refcount_is_zero(&hdr->b_refcnt); 1815 } 1816 1817 /* execute each callback and free its structure */ 1818 while ((acb = callback_list) != NULL) { 1819 if (acb->acb_done) 1820 acb->acb_done(zio, acb->acb_buf, acb->acb_private); 1821 1822 if (acb->acb_zio_dummy != NULL) { 1823 acb->acb_zio_dummy->io_error = zio->io_error; 1824 zio_nowait(acb->acb_zio_dummy); 1825 } 1826 1827 callback_list = acb->acb_next; 1828 kmem_free(acb, sizeof (arc_callback_t)); 1829 } 1830 1831 if (freeable) 1832 arc_hdr_destroy(hdr); 1833 } 1834 1835 /* 1836 * "Read" the block block at the specified DVA (in bp) via the 1837 * cache. If the block is found in the cache, invoke the provided 1838 * callback immediately and return. Note that the `zio' parameter 1839 * in the callback will be NULL in this case, since no IO was 1840 * required. If the block is not in the cache pass the read request 1841 * on to the spa with a substitute callback function, so that the 1842 * requested block will be added to the cache. 
1843 * 1844 * If a read request arrives for a block that has a read in-progress, 1845 * either wait for the in-progress read to complete (and return the 1846 * results); or, if this is a read with a "done" func, add a record 1847 * to the read to invoke the "done" func when the read completes, 1848 * and return; or just return. 1849 * 1850 * arc_read_done() will invoke all the requested "done" functions 1851 * for readers of this block. 1852 */ 1853 int 1854 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, 1855 arc_done_func_t *done, void *private, int priority, int flags, 1856 uint32_t *arc_flags, zbookmark_t *zb) 1857 { 1858 arc_buf_hdr_t *hdr; 1859 arc_buf_t *buf; 1860 kmutex_t *hash_lock; 1861 zio_t *rzio; 1862 1863 top: 1864 hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); 1865 if (hdr && hdr->b_datacnt > 0) { 1866 1867 *arc_flags |= ARC_CACHED; 1868 1869 if (HDR_IO_IN_PROGRESS(hdr)) { 1870 1871 if (*arc_flags & ARC_WAIT) { 1872 cv_wait(&hdr->b_cv, hash_lock); 1873 mutex_exit(hash_lock); 1874 goto top; 1875 } 1876 ASSERT(*arc_flags & ARC_NOWAIT); 1877 1878 if (done) { 1879 arc_callback_t *acb = NULL; 1880 1881 acb = kmem_zalloc(sizeof (arc_callback_t), 1882 KM_SLEEP); 1883 acb->acb_done = done; 1884 acb->acb_private = private; 1885 acb->acb_byteswap = swap; 1886 if (pio != NULL) 1887 acb->acb_zio_dummy = zio_null(pio, 1888 spa, NULL, NULL, flags); 1889 1890 ASSERT(acb->acb_done != NULL); 1891 acb->acb_next = hdr->b_acb; 1892 hdr->b_acb = acb; 1893 add_reference(hdr, hash_lock, private); 1894 mutex_exit(hash_lock); 1895 return (0); 1896 } 1897 mutex_exit(hash_lock); 1898 return (0); 1899 } 1900 1901 ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu); 1902 1903 if (done) { 1904 add_reference(hdr, hash_lock, private); 1905 /* 1906 * If this block is already in use, create a new 1907 * copy of the data so that we will be guaranteed 1908 * that arc_release() will always succeed. 
			 */
			buf = hdr->b_buf;
			ASSERT(buf);
			ASSERT(buf->b_data);
			if (HDR_BUF_AVAILABLE(hdr)) {
				ASSERT(buf->b_efunc == NULL);
				hdr->b_flags &= ~ARC_BUF_AVAILABLE;
			} else {
				buf = arc_buf_clone(buf);
			}
		} else if (*arc_flags & ARC_PREFETCH &&
		    refcount_count(&hdr->b_refcnt) == 0) {
			hdr->b_flags |= ARC_PREFETCH;
		}
		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
		atomic_add_64(&arc.hits, 1);
		if (done)
			done(NULL, buf, private);
	} else {
		uint64_t size = BP_GET_LSIZE(bp);
		arc_callback_t *acb;

		if (hdr == NULL) {
			/* this block is not in the cache */
			arc_buf_hdr_t *exists;

			buf = arc_buf_alloc(spa, size, private);
			hdr = buf->b_hdr;
			hdr->b_dva = *BP_IDENTITY(bp);
			hdr->b_birth = bp->blk_birth;
			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
			exists = buf_hash_insert(hdr, &hash_lock);
			if (exists) {
				/* somebody beat us to the hash insert */
				mutex_exit(hash_lock);
				bzero(&hdr->b_dva, sizeof (dva_t));
				hdr->b_birth = 0;
				hdr->b_cksum0 = 0;
				(void) arc_buf_remove_ref(buf, private);
				goto top; /* restart the IO request */
			}
			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH) {
				(void) remove_reference(hdr, hash_lock,
				    private);
				hdr->b_flags |= ARC_PREFETCH;
			}
			if (BP_GET_LEVEL(bp) > 0)
				hdr->b_flags |= ARC_INDIRECT;
		} else {
			/* this block is in the ghost cache */
			ASSERT(GHOST_STATE(hdr->b_state));
			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 0);
			ASSERT(hdr->b_buf == NULL);

			/* if this is a prefetch, we don't have a reference */
			if (*arc_flags & ARC_PREFETCH)
				hdr->b_flags |= ARC_PREFETCH;
			else
				add_reference(hdr, hash_lock, private);
			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
			buf->b_hdr = hdr;
			buf->b_data = NULL;
			buf->b_efunc = NULL;
			buf->b_private = NULL;
			buf->b_next = NULL;
			hdr->b_buf = buf;
			arc_get_data_buf(buf);
			ASSERT(hdr->b_datacnt == 0);
			hdr->b_datacnt = 1;
		}

		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
		acb->acb_done = done;
		acb->acb_private = private;
		acb->acb_byteswap = swap;

		ASSERT(hdr->b_acb == NULL);
		hdr->b_acb = acb;
		hdr->b_flags |= ARC_IO_IN_PROGRESS;

		/*
		 * If the buffer has been evicted, migrate it to a present
		 * state before issuing the I/O.  Once we drop the hash-table
		 * lock, the header will be marked as I/O in progress and
		 * have an attached buffer.  At this point, anybody who finds
		 * this buffer ought to notice that it's legit but has a
		 * pending I/O.
		 */

		if (GHOST_STATE(hdr->b_state))
			arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);

		ASSERT3U(hdr->b_size, ==, size);
		DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
		    zbookmark_t *, zb);
		atomic_add_64(&arc.misses, 1);

		rzio = zio_read(pio, spa, bp, buf->b_data, size,
		    arc_read_done, buf, priority, flags, zb);

		if (*arc_flags & ARC_WAIT)
			return (zio_wait(rzio));

		ASSERT(*arc_flags & ARC_NOWAIT);
		zio_nowait(rzio);
	}
	return (0);
}
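
/*
 * A minimal usage sketch of the two arc_read() modes described above,
 * kept out of the build with #if 0.  Everything prefixed with
 * "example_" is hypothetical; the byteswap function is omitted (NULL,
 * which arc_read_done() tolerates), and the priority/flags values are
 * simply whatever zio values the caller would normally pass through.
 */
#if 0	/* illustrative sketch, not compiled */
static void
example_read_done(zio_t *zio, arc_buf_t *buf, void *private)
{
	/*
	 * zio is NULL when the read was satisfied from the cache.
	 * The caller now holds a reference on buf, tagged with
	 * 'private', and drops it later via arc_buf_remove_ref().
	 */
}

static int
example_reads(zio_t *pio, spa_t *spa, blkptr_t *bp, zbookmark_t *zb,
    int priority, int flags, void *tag)
{
	uint32_t aflags;
	int err;

	/* synchronous read: wait for the data before returning */
	aflags = ARC_WAIT;
	err = arc_read(NULL, spa, bp, NULL, example_read_done, tag,
	    priority, flags, &aflags, zb);
	if (err != 0)
		return (err);

	/* asynchronous read: example_read_done() fires on completion */
	aflags = ARC_NOWAIT;
	return (arc_read(pio, spa, bp, NULL, example_read_done, tag,
	    priority, flags, &aflags, zb));
}
#endif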

/*
 * arc_read() variant to support pool traversal.  If the block is already
 * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
 * The idea is that we don't want pool traversal filling up memory, but
 * if the ARC already has the data anyway, we shouldn't pay for the I/O.
 */
int
arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_mtx;
	int rc = 0;

	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);

	if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
		arc_buf_t *buf = hdr->b_buf;

		ASSERT(buf);
		while (buf->b_data == NULL) {
			buf = buf->b_next;
			ASSERT(buf);
		}
		bcopy(buf->b_data, data, hdr->b_size);
	} else {
		rc = ENOENT;
	}

	if (hash_mtx)
		mutex_exit(hash_mtx);

	return (rc);
}
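
/*
 * A minimal sketch of the intended calling pattern for arc_tryread(),
 * assuming the traversal code already owns a buffer of BP_GET_LSIZE(bp)
 * bytes and has some other way to read the block itself (represented by
 * the hypothetical traverse_read_uncached()).
 */
#if 0	/* illustrative sketch, not compiled */
static int
example_traverse_read(spa_t *spa, blkptr_t *bp, void *data)
{
	/* cheap case: the ARC already holds the block, so just copy it */
	if (arc_tryread(spa, bp, data) == 0)
		return (0);

	/* ENOENT: not cached (or busy); read it without growing the ARC */
	return (traverse_read_uncached(spa, bp, data));
}
#endif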

void
arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
{
	ASSERT(buf->b_hdr != NULL);
	ASSERT(buf->b_hdr->b_state != arc.anon);
	ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
	buf->b_efunc = func;
	buf->b_private = private;
}

/*
 * This is used by the DMU to let the ARC know that a buffer is
 * being evicted, so the ARC should clean up.  If this arc buf
 * is not yet in the evicted state, it will be put there.
 */
int
arc_buf_evict(arc_buf_t *buf)
{
	arc_buf_hdr_t *hdr;
	kmutex_t *hash_lock;
	arc_buf_t **bufp;

	mutex_enter(&arc_eviction_mtx);
	hdr = buf->b_hdr;
	if (hdr == NULL) {
		/*
		 * We are in arc_do_user_evicts().
		 */
		ASSERT(buf->b_data == NULL);
		mutex_exit(&arc_eviction_mtx);
		return (0);
	}
	hash_lock = HDR_LOCK(hdr);
	mutex_exit(&arc_eviction_mtx);

	mutex_enter(hash_lock);

	if (buf->b_data == NULL) {
		/*
		 * We are on the eviction list.
		 */
		mutex_exit(hash_lock);
		mutex_enter(&arc_eviction_mtx);
		if (buf->b_hdr == NULL) {
			/*
			 * We are already in arc_do_user_evicts().
			 */
			mutex_exit(&arc_eviction_mtx);
			return (0);
		} else {
			arc_buf_t copy = *buf; /* structure assignment */
			/*
			 * Process this buffer now
			 * but let arc_do_user_evicts() do the reaping.
			 */
			buf->b_efunc = NULL;
			mutex_exit(&arc_eviction_mtx);
			VERIFY(copy.b_efunc(&copy) == 0);
			return (1);
		}
	}

	ASSERT(buf->b_hdr == hdr);
	ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
	ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);

	/*
	 * Pull this buffer off of the hdr
	 */
	bufp = &hdr->b_buf;
	while (*bufp != buf)
		bufp = &(*bufp)->b_next;
	*bufp = buf->b_next;

	ASSERT(buf->b_data != NULL);
	arc_buf_destroy(buf, FALSE, FALSE);

	if (hdr->b_datacnt == 0) {
		arc_state_t *old_state = hdr->b_state;
		arc_state_t *evicted_state;

		ASSERT(refcount_is_zero(&hdr->b_refcnt));

		evicted_state =
		    (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;

		mutex_enter(&old_state->mtx);
		mutex_enter(&evicted_state->mtx);

		arc_change_state(evicted_state, hdr, hash_lock);
		ASSERT(HDR_IN_HASH_TABLE(hdr));
		hdr->b_flags = ARC_IN_HASH_TABLE;

		mutex_exit(&evicted_state->mtx);
		mutex_exit(&old_state->mtx);
	}
	mutex_exit(hash_lock);

	VERIFY(buf->b_efunc(buf) == 0);
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	buf->b_hdr = NULL;
	kmem_cache_free(buf_cache, buf);
	return (1);
}
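
/*
 * A minimal sketch of the eviction-callback protocol, assuming (as the
 * VERIFY(buf->b_efunc(buf) == 0) calls above suggest) that an
 * arc_evict_func_t receives the arc_buf_t being evicted and returns 0
 * on success.  The callback runs with no ARC locks held and the data
 * may already be gone, so it should only tear down the registering
 * subsystem's own state; example_state_t and its teardown routine are
 * hypothetical.
 */
#if 0	/* illustrative sketch, not compiled */
static int
example_evict_cb(arc_buf_t *buf)
{
	example_state_t *es = buf->b_private;	/* registered below */

	example_state_teardown(es);
	return (0);
}

static void
example_register(arc_buf_t *buf, example_state_t *es)
{
	/* only legal while the caller still holds a reference on buf */
	arc_set_callback(buf, example_evict_cb, es);
}
#endif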

/*
 * Release this buffer from the cache.  This must be done
 * after a read and prior to modifying the buffer contents.
 * If the buffer has more than one reference, we must make
 * a new hdr for the buffer.
 */
void
arc_release(arc_buf_t *buf, void *tag)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	kmutex_t *hash_lock = HDR_LOCK(hdr);

	/* this buffer is not on any list */
	ASSERT(refcount_count(&hdr->b_refcnt) > 0);

	if (hdr->b_state == arc.anon) {
		/* this buffer is already released */
		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
		ASSERT(BUF_EMPTY(hdr));
		ASSERT(buf->b_efunc == NULL);
		arc_buf_thaw(buf);
		return;
	}

	mutex_enter(hash_lock);

	/*
	 * Do we have more than one buf?
	 */
	if (hdr->b_buf != buf || buf->b_next != NULL) {
		arc_buf_hdr_t *nhdr;
		arc_buf_t **bufp;
		uint64_t blksz = hdr->b_size;
		spa_t *spa = hdr->b_spa;

		ASSERT(hdr->b_datacnt > 1);
		/*
		 * Pull the data off of this buf and attach it to
		 * a new anonymous buf.
		 */
		(void) remove_reference(hdr, hash_lock, tag);
		bufp = &hdr->b_buf;
		while (*bufp != buf)
			bufp = &(*bufp)->b_next;
		*bufp = (*bufp)->b_next;

		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
		if (refcount_is_zero(&hdr->b_refcnt)) {
			ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
			atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
		}
		hdr->b_datacnt -= 1;

		mutex_exit(hash_lock);

		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
		nhdr->b_size = blksz;
		nhdr->b_spa = spa;
		nhdr->b_buf = buf;
		nhdr->b_state = arc.anon;
		nhdr->b_arc_access = 0;
		nhdr->b_flags = 0;
		nhdr->b_datacnt = 1;
		nhdr->b_freeze_cksum =
		    kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP);
		*nhdr->b_freeze_cksum = *hdr->b_freeze_cksum; /* struct copy */
		buf->b_hdr = nhdr;
		buf->b_next = NULL;
		(void) refcount_add(&nhdr->b_refcnt, tag);
		atomic_add_64(&arc.anon->size, blksz);

		hdr = nhdr;
	} else {
		ASSERT(refcount_count(&hdr->b_refcnt) == 1);
		ASSERT(!list_link_active(&hdr->b_arc_node));
		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
		arc_change_state(arc.anon, hdr, hash_lock);
		hdr->b_arc_access = 0;
		mutex_exit(hash_lock);
		bzero(&hdr->b_dva, sizeof (dva_t));
		hdr->b_birth = 0;
		hdr->b_cksum0 = 0;
	}
	buf->b_efunc = NULL;
	buf->b_private = NULL;
	arc_buf_thaw(buf);
}
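
/*
 * A minimal sketch of the "release before modify" rule stated above: a
 * buffer handed out by the cache must be made anonymous before its
 * contents change, and only then be rewritten.  example_fill_data() is
 * a hypothetical stand-in for whatever modification the caller makes.
 */
#if 0	/* illustrative sketch, not compiled */
static void
example_modify_block(arc_buf_t *buf, void *tag)
{
	/* detach buf from its on-disk DVA/birth identity */
	arc_release(buf, tag);
	ASSERT(arc_released(buf));

	/* now safe to scribble on the data */
	example_fill_data(buf->b_data);

	/* a subsequent arc_write() gives the buffer its new identity */
}
#endif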

int
arc_released(arc_buf_t *buf)
{
	return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
}

int
arc_has_callback(arc_buf_t *buf)
{
	return (buf->b_efunc != NULL);
}

#ifdef ZFS_DEBUG
int
arc_referenced(arc_buf_t *buf)
{
	return (refcount_count(&buf->b_hdr->b_refcnt));
}
#endif

static void
arc_write_done(zio_t *zio)
{
	arc_buf_t *buf;
	arc_buf_hdr_t *hdr;
	arc_callback_t *acb;

	buf = zio->io_private;
	hdr = buf->b_hdr;
	acb = hdr->b_acb;
	hdr->b_acb = NULL;
	ASSERT(acb != NULL);

	/* this buffer is on no lists and is not in the hash table */
	ASSERT3P(hdr->b_state, ==, arc.anon);

	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
	hdr->b_birth = zio->io_bp->blk_birth;
	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
	/*
	 * If the block to be written was all-zero, we may have
	 * compressed it away.  In this case no write was performed
	 * so there will be no dva/birth-date/checksum.  The buffer
	 * must therefore remain anonymous (and uncached).
	 */
	if (!BUF_EMPTY(hdr)) {
		arc_buf_hdr_t *exists;
		kmutex_t *hash_lock;

		arc_cksum_verify(buf);

		exists = buf_hash_insert(hdr, &hash_lock);
		if (exists) {
			/*
			 * This can only happen if we overwrite for
			 * sync-to-convergence, because we remove
			 * buffers from the hash table when we arc_free().
			 */
			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
			    BP_IDENTITY(zio->io_bp)));
			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
			    zio->io_bp->blk_birth);

			ASSERT(refcount_is_zero(&exists->b_refcnt));
			arc_change_state(arc.anon, exists, hash_lock);
			mutex_exit(hash_lock);
			arc_hdr_destroy(exists);
			exists = buf_hash_insert(hdr, &hash_lock);
			ASSERT3P(exists, ==, NULL);
		}
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		arc_access(hdr, hash_lock);
		mutex_exit(hash_lock);
	} else if (acb->acb_done == NULL) {
		int destroy_hdr;
		/*
		 * This is an anonymous buffer with no user callback,
		 * destroy it if there are no active references.
		 */
		mutex_enter(&arc_eviction_mtx);
		destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
		mutex_exit(&arc_eviction_mtx);
		if (destroy_hdr)
			arc_hdr_destroy(hdr);
	} else {
		hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
	}

	if (acb->acb_done) {
		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
		acb->acb_done(zio, buf, acb->acb_private);
	}

	kmem_free(acb, sizeof (arc_callback_t));
}

int
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, int ncopies,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
    arc_done_func_t *done, void *private, int priority, int flags,
    uint32_t arc_flags, zbookmark_t *zb)
{
	arc_buf_hdr_t *hdr = buf->b_hdr;
	arc_callback_t *acb;
	zio_t *rzio;

	/* this is a private buffer - no locking required */
	ASSERT3P(hdr->b_state, ==, arc.anon);
	ASSERT(BUF_EMPTY(hdr));
	ASSERT(!HDR_IO_ERROR(hdr));
	ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0);
	ASSERT(hdr->b_acb == 0);
	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
	acb->acb_done = done;
	acb->acb_private = private;
	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
	hdr->b_acb = acb;
	hdr->b_flags |= ARC_IO_IN_PROGRESS;
	arc_cksum_compute(buf);
	rzio = zio_write(pio, spa, checksum, compress, ncopies, txg, bp,
	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(rzio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(rzio);

	return (0);
}
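
/*
 * A minimal sketch of an arc_write() call: the buffer must already be
 * anonymous (see arc_release() above), and checksum/compress/ncopies/
 * txg/priority/flags are passed straight through to zio_write(), so the
 * values are whatever the hypothetical caller supplies.
 */
#if 0	/* illustrative sketch, not compiled */
static int
example_write_block(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    arc_buf_t *buf, int checksum, int compress, int ncopies,
    int priority, int flags, zbookmark_t *zb)
{
	ASSERT(arc_released(buf));

	/* synchronous: returns the zio error once the write completes */
	return (arc_write(pio, spa, checksum, compress, ncopies, txg, bp,
	    buf, NULL, NULL, priority, flags, ARC_WAIT, zb));
}
#endif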

int
arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    zio_done_func_t *done, void *private, uint32_t arc_flags)
{
	arc_buf_hdr_t *ab;
	kmutex_t *hash_lock;
	zio_t *zio;

	/*
	 * If this buffer is in the cache, release it, so it
	 * can be re-used.
	 */
	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
	if (ab != NULL) {
		/*
		 * The checksum of blocks to free is not always
		 * preserved (e.g. on the deadlist).  However, if it is
		 * nonzero, it should match what we have in the cache.
		 */
		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
		if (ab->b_state != arc.anon)
			arc_change_state(arc.anon, ab, hash_lock);
		if (HDR_IO_IN_PROGRESS(ab)) {
			/*
			 * This should only happen when we prefetch.
			 */
			ASSERT(ab->b_flags & ARC_PREFETCH);
			ASSERT3U(ab->b_datacnt, ==, 1);
			ab->b_flags |= ARC_FREED_IN_READ;
			if (HDR_IN_HASH_TABLE(ab))
				buf_hash_remove(ab);
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		} else if (refcount_is_zero(&ab->b_refcnt)) {
			mutex_exit(hash_lock);
			arc_hdr_destroy(ab);
			atomic_add_64(&arc.deleted, 1);
		} else {
			/*
			 * We still have an active reference on this
			 * buffer.  This can happen, e.g., from
			 * dbuf_unoverride().
			 */
			ASSERT(!HDR_IN_HASH_TABLE(ab));
			ab->b_arc_access = 0;
			bzero(&ab->b_dva, sizeof (dva_t));
			ab->b_birth = 0;
			ab->b_cksum0 = 0;
			ab->b_buf->b_efunc = NULL;
			ab->b_buf->b_private = NULL;
			mutex_exit(hash_lock);
		}
	}

	zio = zio_free(pio, spa, txg, bp, done, private);

	if (arc_flags & ARC_WAIT)
		return (zio_wait(zio));

	ASSERT(arc_flags & ARC_NOWAIT);
	zio_nowait(zio);

	return (0);
}

void
arc_tempreserve_clear(uint64_t tempreserve)
{
	atomic_add_64(&arc_tempreserve, -tempreserve);
	ASSERT((int64_t)arc_tempreserve >= 0);
}

int
arc_tempreserve_space(uint64_t tempreserve)
{
#ifdef ZFS_DEBUG
	/*
	 * Once in a while, fail for no reason.  Everything should cope.
	 */
	if (spa_get_random(10000) == 0) {
		dprintf("forcing random failure\n");
		return (ERESTART);
	}
#endif
	if (tempreserve > arc.c / 4 && !arc.no_grow)
		arc.c = MIN(arc.c_max, tempreserve * 4);
	if (tempreserve > arc.c)
		return (ENOMEM);

	/*
	 * Throttle writes when the amount of dirty data in the cache
	 * gets too large.  We try to keep the cache less than half full
	 * of dirty blocks so that our sync times don't grow too large.
	 * Note: if two requests come in concurrently, we might let them
	 * both succeed, when one of them should fail.  Not a huge deal.
	 *
	 * XXX The limit should be adjusted dynamically to keep the time
	 * to sync a dataset fixed (around 1-5 seconds?).
	 */

	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
	    arc_tempreserve + arc.anon->size > arc.c / 4) {
		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
		    "tempreserve=%lluK arc.c=%lluK\n",
		    arc_tempreserve>>10, arc.anon->lsize>>10,
		    tempreserve>>10, arc.c>>10);
		return (ERESTART);
	}
	atomic_add_64(&arc_tempreserve, tempreserve);
	return (0);
}
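
/*
 * A minimal sketch of the write-throttle protocol above: reserve space
 * for the data a transaction is about to dirty, back off and retry on
 * ERESTART, and drop the reservation once the dirty data has been
 * handed off.  The retry policy shown (a one-tick delay()) is only a
 * placeholder; in practice this bookkeeping is driven by the DMU
 * transaction code.
 */
#if 0	/* illustrative sketch, not compiled */
static int
example_reserve(uint64_t nbytes)
{
	int err;

	while ((err = arc_tempreserve_space(nbytes)) == ERESTART) {
		/* too much dirty data in flight; wait for some to sync */
		delay(1);
	}
	return (err);	/* 0 on success, or ENOMEM if nbytes cannot fit */
}

static void
example_unreserve(uint64_t nbytes)
{
	arc_tempreserve_clear(nbytes);
}
#endif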
over 64MB) 2536 */ 2537 if (zfs_arc_max > 64<<20 && zfs_arc_max < physmem * PAGESIZE) 2538 arc.c_max = zfs_arc_max; 2539 if (zfs_arc_min > 64<<20 && zfs_arc_min <= arc.c_max) 2540 arc.c_min = zfs_arc_min; 2541 2542 arc.c = arc.c_max; 2543 arc.p = (arc.c >> 1); 2544 2545 /* if kmem_flags are set, lets try to use less memory */ 2546 if (kmem_debugging()) 2547 arc.c = arc.c / 2; 2548 if (arc.c < arc.c_min) 2549 arc.c = arc.c_min; 2550 2551 arc.anon = &ARC_anon; 2552 arc.mru = &ARC_mru; 2553 arc.mru_ghost = &ARC_mru_ghost; 2554 arc.mfu = &ARC_mfu; 2555 arc.mfu_ghost = &ARC_mfu_ghost; 2556 arc.size = 0; 2557 2558 arc.hits = 0; 2559 arc.recycle_miss = 0; 2560 arc.evict_skip = 0; 2561 arc.mutex_miss = 0; 2562 2563 mutex_init(&arc.anon->mtx, NULL, MUTEX_DEFAULT, NULL); 2564 mutex_init(&arc.mru->mtx, NULL, MUTEX_DEFAULT, NULL); 2565 mutex_init(&arc.mru_ghost->mtx, NULL, MUTEX_DEFAULT, NULL); 2566 mutex_init(&arc.mfu->mtx, NULL, MUTEX_DEFAULT, NULL); 2567 mutex_init(&arc.mfu_ghost->mtx, NULL, MUTEX_DEFAULT, NULL); 2568 2569 list_create(&arc.mru->list, sizeof (arc_buf_hdr_t), 2570 offsetof(arc_buf_hdr_t, b_arc_node)); 2571 list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t), 2572 offsetof(arc_buf_hdr_t, b_arc_node)); 2573 list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t), 2574 offsetof(arc_buf_hdr_t, b_arc_node)); 2575 list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t), 2576 offsetof(arc_buf_hdr_t, b_arc_node)); 2577 2578 buf_init(); 2579 2580 arc_thread_exit = 0; 2581 arc_eviction_list = NULL; 2582 mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL); 2583 bzero(&arc_eviction_hdr, sizeof (arc_buf_hdr_t)); 2584 2585 (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0, 2586 TS_RUN, minclsyspri); 2587 } 2588 2589 void 2590 arc_fini(void) 2591 { 2592 mutex_enter(&arc_reclaim_thr_lock); 2593 arc_thread_exit = 1; 2594 while (arc_thread_exit != 0) 2595 cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock); 2596 mutex_exit(&arc_reclaim_thr_lock); 2597 2598 arc_flush(); 2599 2600 arc_dead = TRUE; 2601 2602 mutex_destroy(&arc_eviction_mtx); 2603 mutex_destroy(&arc_reclaim_lock); 2604 mutex_destroy(&arc_reclaim_thr_lock); 2605 cv_destroy(&arc_reclaim_thr_cv); 2606 2607 list_destroy(&arc.mru->list); 2608 list_destroy(&arc.mru_ghost->list); 2609 list_destroy(&arc.mfu->list); 2610 list_destroy(&arc.mfu_ghost->list); 2611 2612 mutex_destroy(&arc.anon->mtx); 2613 mutex_destroy(&arc.mru->mtx); 2614 mutex_destroy(&arc.mru_ghost->mtx); 2615 mutex_destroy(&arc.mfu->mtx); 2616 mutex_destroy(&arc.mfu_ghost->mtx); 2617 2618 buf_fini(); 2619 } 2620