/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2018, Joyent, Inc.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014 by Saso Kiselkov. All rights reserved.
 * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
 */

/*
 * DVA-based Adjustable Replacement Cache
 *
 * While much of the theory of operation used here is
 * based on the self-tuning, low overhead replacement cache
 * presented by Megiddo and Modha at FAST 2003, there are some
 * significant differences:
 *
 * 1. The Megiddo and Modha model assumes any page is evictable.
 * Pages in its cache cannot be "locked" into memory. This makes
 * the eviction algorithm simple: evict the last page in the list.
 * This also makes the performance characteristics easy to reason
 * about. Our cache is not so simple. At any given moment, some
 * subset of the blocks in the cache are un-evictable because we
 * have handed out a reference to them. Blocks are only evictable
 * when there are no external references active. This makes
 * eviction far more problematic: we choose to evict the evictable
 * blocks that are the "lowest" in the list.
 *
 * There are times when it is not possible to evict the requested
 * space. In these circumstances we are unable to adjust the cache
 * size. To prevent the cache growing unbounded at these times we
 * implement a "cache throttle" that slows the flow of new data
 * into the cache until we can make space available.
 *
 * 2. The Megiddo and Modha model assumes a fixed cache size.
 * Pages are evicted when the cache is full and there is a cache
 * miss. Our model has a variable sized cache. It grows with
 * high use, but also tries to react to memory pressure from the
 * operating system: decreasing its size when system memory is
 * tight.
 *
 * 3. The Megiddo and Modha model assumes a fixed page size. All
 * elements of the cache are therefore exactly the same size. So
 * when adjusting the cache size following a cache miss, it's simply
 * a matter of choosing a single page to evict. In our model, we
 * have variable sized cache blocks (ranging from 512 bytes to
 * 128K bytes). We therefore choose a set of blocks to evict to make
 * space for a cache miss that approximates as closely as possible
 * the space used by the new block.
 *
 * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
 * by N. Megiddo & D. Modha, FAST 2003
 */

/*
 * The locking model:
 *
 * A new reference to a cache buffer can be obtained in two
 * ways: 1) via a hash table lookup using the DVA as a key,
 * or 2) via one of the ARC lists. The arc_read() interface
 * uses method 1, while the internal ARC algorithms for
 * adjusting the cache use method 2. We therefore provide two
 * types of locks: 1) the hash table lock array, and 2) the
 * ARC list locks.
 *
 * Buffers do not have their own mutexes, rather they rely on the
 * hash table mutexes for the bulk of their protection (i.e. most
 * fields in the arc_buf_hdr_t are protected by these mutexes).
 *
 * buf_hash_find() returns the appropriate mutex (held) when it
 * locates the requested buffer in the hash table. It returns
 * NULL for the mutex if the buffer was not in the table.
 *
 * buf_hash_remove() expects the appropriate hash mutex to be
 * already held before it is invoked.
 *
 * Each ARC state also has a mutex which is used to protect the
 * buffer list associated with the state. When attempting to
 * obtain a hash table lock while holding an ARC list lock you
 * must use: mutex_tryenter() to avoid deadlock. Also note that
 * the active state mutex must be held before the ghost state mutex.
 *
 * Note that the majority of the performance stats are manipulated
 * with atomic operations.
 *
 * The L2ARC uses the l2ad_mtx on each vdev for the following:
 *
 *	- L2ARC buflist creation
 *	- L2ARC buflist eviction
 *	- L2ARC write completion, which walks L2ARC buflists
 *	- ARC header destruction, as it removes from L2ARC buflists
 *	- ARC header release, as it removes from L2ARC buflists
 */

/*
 * ARC operation:
 *
 * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
 * This structure can point either to a block that is still in the cache or to
 * one that is only accessible in an L2 ARC device, or it can provide
 * information about a block that was recently evicted. If a block is
 * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
 * information to retrieve it from the L2ARC device. This information is
 * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
 * that is in this state cannot access the data directly.
 *
 * Blocks that are actively being referenced or have not been evicted
 * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
 * the arc_buf_hdr_t that will point to the data block in memory. A block can
 * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
 * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
 * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
 *
 * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
 * ability to store the physical data (b_pabd) associated with the DVA of the
 * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
 * it will match its on-disk compression characteristics. This behavior can be
 * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
 * compressed ARC functionality is disabled, the b_pabd will point to an
 * uncompressed version of the on-disk data.
 *
 * Data in the L1ARC is not accessed by consumers of the ARC directly. Each
 * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it.
 * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC
 * consumer. The ARC will provide references to this data and will keep it
 * cached until it is no longer in use. The ARC caches only the L1ARC's physical
 * data block and will evict any arc_buf_t that is no longer referenced. The
 * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the
 * "overhead_size" kstat.
 *
 * Depending on the consumer, an arc_buf_t can be requested in uncompressed or
 * compressed form. The typical case is that consumers will want uncompressed
 * data, and when that happens a new data buffer is allocated where the data is
 * decompressed for them to use. Currently the only consumer who wants
 * compressed arc_buf_t's is "zfs send", when it streams data exactly as it
 * exists on disk. When this happens, the arc_buf_t's data buffer is shared
 * with the arc_buf_hdr_t.
 *
 * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The
 * first one is owned by a compressed send consumer (and therefore references
 * the same compressed data buffer as the arc_buf_hdr_t) and the second could be
 * used by any other consumer (and has its own uncompressed copy of the data
 * buffer).
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                | fields    |
 *                | common to |
 *                | L1- and   |
 *                | L2ARC     |
 *                +-----------+
 *                | l2arc_buf_hdr_t
 *                |           |
 *                +-----------+
 *                | l1arc_buf_hdr_t
 *                |           |              arc_buf_t
 *                | b_buf     +------------>+-----------+      arc_buf_t
 *                | b_pabd    +-+           |b_next     +---->+-----------+
 *                +-----------+ |           |-----------|     |b_next     +-->NULL
 *                              |           |b_comp = T |     +-----------+
 *                              |           |b_data     +-+   |b_comp = F |
 *                              |           +-----------+ |   |b_data     +-+
 *                              +->+------+               |   +-----------+ |
 *                   compressed    |      |               |                 |
 *                      data       |      |<--------------+                 | uncompressed
 *                                 +------+   compressed,                   | data
 *                                             shared        +-->+------+
 *                                             data              |      |
 *                                                               |      |
 *                                                               +------+
 *
 * When a consumer reads a block, the ARC must first look to see if the
 * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new
 * arc_buf_t and either copies uncompressed data into a new data buffer from an
 * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a
 * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the
 * hdr is compressed and the desired compression characteristics of the
 * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the
 * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be
 * the last buffer in the hdr's b_buf list, however a shared compressed buf can
 * be anywhere in the hdr's list.
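 *
 * As a rough sketch only (pseudo-code, not code from this file; the real
 * work happens where arc_buf_t's are allocated and filled), that decision
 * amounts to:
 *
 *	if (consumer's request matches how b_pabd is stored)
 *		share b_pabd with the new arc_buf_t;
 *	else if (an uncompressed arc_buf_t already exists on this hdr)
 *		bcopy() its contents into the new buffer;
 *	else
 *		decompress b_pabd into the new buffer;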
 *
 * The diagram below shows an example of an uncompressed ARC hdr that is
 * sharing its data with an arc_buf_t (note that the shared uncompressed buf is
 * the last element in the buf list):
 *
 *                arc_buf_hdr_t
 *                +-----------+
 *                |           |
 *                |           |
 *                |           |
 *                +-----------+
 * l2arc_buf_hdr_t|           |
 *                |           |
 *                +-----------+
 * l1arc_buf_hdr_t|           |
 *                |           |              arc_buf_t (shared)
 *                | b_buf     +------------>+---------+      arc_buf_t
 *                |           |             |b_next   +---->+---------+
 *                | b_pabd    +-+           |---------|     |b_next   +-->NULL
 *                +-----------+ |           |         |     +---------+
 *                              |           |b_data   +-+   |         |
 *                              |           +---------+ |   |b_data   +-+
 *                              +->+------+             |   +---------+ |
 *                                 |      |             |               |
 *                   uncompressed  |      |             |               |
 *                        data     +------+             |               |
 *                                    ^                 +->+------+     |
 *                                    |       uncompressed |      |     |
 *                                    |           data     |      |     |
 *                                    |                    +------+     |
 *                                    +----------------------------------+
 *
 * Writing to the ARC requires that the ARC first discard the hdr's b_pabd
 * since the physical block is about to be rewritten. The new data contents
 * will be contained in the arc_buf_t. As the I/O pipeline performs the write,
 * it may compress the data before writing it to disk. The ARC will be called
 * with the transformed data and will bcopy the transformed on-disk block into
 * a newly allocated b_pabd. Writes are always done into buffers which have
 * either been loaned (and hence are new and don't have other readers) or
 * buffers which have been released (and hence have their own hdr, if there
 * were originally other readers of the buf's original hdr). This ensures that
 * the ARC only needs to update a single buf and its hdr after a write occurs.
 *
 * When the L2ARC is in use, it will also take advantage of the b_pabd. The
 * L2ARC will always write the contents of b_pabd to the L2ARC. This means
 * that when compressed ARC is enabled, the L2ARC blocks are identical
 * to the on-disk block in the main data pool. This provides a significant
 * advantage since the ARC can leverage the bp's checksum when reading from the
 * L2ARC to determine if the contents are valid. However, if the compressed
 * ARC is disabled, then the L2ARC's block must be transformed to look
 * like the physical block in the main data pool before comparing the
 * checksum and determining its validity.
 *
 * The L1ARC has a slightly different system for storing encrypted data.
 * Raw (encrypted + possibly compressed) data has a few subtle differences from
 * data that is just compressed. The biggest difference is that it is not
 * possible to decrypt encrypted data (or vice versa) if the keys aren't loaded.
 * The other difference is that encryption cannot be treated as a suggestion.
 * If a caller would prefer compressed data but actually winds up with
 * uncompressed data, the worst thing that could happen is there might be a
 * performance hit. If the caller requests encrypted data, however, we must be
 * sure they actually get it or else secret information could be leaked. Raw
 * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore,
 * may have both an encrypted version and a decrypted version of its data at
 * once. When a caller needs a raw arc_buf_t, it is allocated and the data is
 * copied out of this header. To avoid complications with b_pabd, raw buffers
 * cannot be shared.
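 *
 * A minimal sketch of the consequence (illustrative only; HDR_HAS_RABD() and
 * the ARC_FILL_* flags are defined later in this file): a caller asking for
 * raw data is served from b_rabd, everyone else is served from b_pabd, and
 * both copies may exist under a single header at the same time:
 *
 *	abd_t *src = ((flags & ARC_FILL_ENCRYPTED) && HDR_HAS_RABD(hdr)) ?
 *	    hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd;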
268 */ 269 270 #include <sys/spa.h> 271 #include <sys/zio.h> 272 #include <sys/spa_impl.h> 273 #include <sys/zio_compress.h> 274 #include <sys/zio_checksum.h> 275 #include <sys/zfs_context.h> 276 #include <sys/arc.h> 277 #include <sys/refcount.h> 278 #include <sys/vdev.h> 279 #include <sys/vdev_impl.h> 280 #include <sys/dsl_pool.h> 281 #include <sys/zio_checksum.h> 282 #include <sys/multilist.h> 283 #include <sys/abd.h> 284 #include <sys/zil.h> 285 #include <sys/fm/fs/zfs.h> 286 #ifdef _KERNEL 287 #include <sys/vmsystm.h> 288 #include <vm/anon.h> 289 #include <sys/fs/swapnode.h> 290 #include <sys/dnlc.h> 291 #endif 292 #include <sys/callb.h> 293 #include <sys/kstat.h> 294 #include <sys/zthr.h> 295 #include <zfs_fletcher.h> 296 #include <sys/aggsum.h> 297 #include <sys/cityhash.h> 298 299 #ifndef _KERNEL 300 /* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ 301 boolean_t arc_watch = B_FALSE; 302 int arc_procfd; 303 #endif 304 305 /* 306 * This thread's job is to keep enough free memory in the system, by 307 * calling arc_kmem_reap_now() plus arc_shrink(), which improves 308 * arc_available_memory(). 309 */ 310 static zthr_t *arc_reap_zthr; 311 312 /* 313 * This thread's job is to keep arc_size under arc_c, by calling 314 * arc_adjust(), which improves arc_is_overflowing(). 315 */ 316 static zthr_t *arc_adjust_zthr; 317 318 static kmutex_t arc_adjust_lock; 319 static kcondvar_t arc_adjust_waiters_cv; 320 static boolean_t arc_adjust_needed = B_FALSE; 321 322 uint_t arc_reduce_dnlc_percent = 3; 323 324 /* 325 * The number of headers to evict in arc_evict_state_impl() before 326 * dropping the sublist lock and evicting from another sublist. A lower 327 * value means we're more likely to evict the "correct" header (i.e. the 328 * oldest header in the arc state), but comes with higher overhead 329 * (i.e. more invocations of arc_evict_state_impl()). 330 */ 331 int zfs_arc_evict_batch_limit = 10; 332 333 /* number of seconds before growing cache again */ 334 int arc_grow_retry = 60; 335 336 /* 337 * Minimum time between calls to arc_kmem_reap_soon(). Note that this will 338 * be converted to ticks, so with the default hz=100, a setting of 15 ms 339 * will actually wait 2 ticks, or 20ms. 340 */ 341 int arc_kmem_cache_reap_retry_ms = 1000; 342 343 /* shift of arc_c for calculating overflow limit in arc_get_data_impl */ 344 int zfs_arc_overflow_shift = 8; 345 346 /* shift of arc_c for calculating both min and max arc_p */ 347 int arc_p_min_shift = 4; 348 349 /* log2(fraction of arc to reclaim) */ 350 int arc_shrink_shift = 7; 351 352 /* 353 * log2(fraction of ARC which must be free to allow growing). 354 * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, 355 * when reading a new block into the ARC, we will evict an equal-sized block 356 * from the ARC. 357 * 358 * This must be less than arc_shrink_shift, so that when we shrink the ARC, 359 * we will still not allow it to grow. 360 */ 361 int arc_no_grow_shift = 5; 362 363 364 /* 365 * minimum lifespan of a prefetch block in clock ticks 366 * (initialized in arc_init()) 367 */ 368 static int zfs_arc_min_prefetch_ms = 1; 369 static int zfs_arc_min_prescient_prefetch_ms = 6; 370 371 /* 372 * If this percent of memory is free, don't throttle. 373 */ 374 int arc_lotsfree_percent = 10; 375 376 static boolean_t arc_initialized; 377 378 /* 379 * The arc has filled available memory and has now warmed up. 380 */ 381 static boolean_t arc_warm; 382 383 /* 384 * log2 fraction of the zio arena to keep free. 
 */
int arc_zio_arena_free_shift = 2;

/*
 * These tunables are for performance analysis.
 */
uint64_t zfs_arc_max;
uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
uint64_t zfs_arc_meta_min = 0;
int zfs_arc_grow_retry = 0;
int zfs_arc_shrink_shift = 0;
int zfs_arc_p_min_shift = 0;
int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */

/*
 * ARC dirty data constraints for arc_tempreserve_space() throttle
 */
uint_t zfs_arc_dirty_limit_percent = 50;	/* total dirty data limit */
uint_t zfs_arc_anon_limit_percent = 25;		/* anon block dirty limit */
uint_t zfs_arc_pool_dirty_percent = 20;		/* each pool's anon allowance */

boolean_t zfs_compressed_arc_enabled = B_TRUE;

/*
 * Note that buffers can be in one of 6 states:
 *	ARC_anon	- anonymous (discussed below)
 *	ARC_mru		- recently used, currently cached
 *	ARC_mru_ghost	- recently used, no longer in cache
 *	ARC_mfu		- frequently used, currently cached
 *	ARC_mfu_ghost	- frequently used, no longer in cache
 *	ARC_l2c_only	- exists in L2ARC but not other states
 * When there are no active references to the buffer, they
 * are linked onto a list in one of these arc states. These are
 * the only buffers that can be evicted or deleted. Within each
 * state there are multiple lists, one for meta-data and one for
 * non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
 * etc.) is tracked separately so that it can be managed more
 * explicitly: favored over data, limited explicitly.
 *
 * Anonymous buffers are buffers that are not associated with
 * a DVA. These are buffers that hold dirty block copies
 * before they are written to stable storage. By definition,
 * they are "ref'd" and are considered part of arc_mru
 * that cannot be freed. Generally, they will acquire a DVA
 * as they are written and migrate onto the arc_mru list.
 *
 * The ARC_l2c_only state is for buffers that are in the second
 * level ARC but no longer in any of the ARC_m* lists. The second
 * level ARC itself may also contain buffers that are in any of
 * the ARC_m* states - meaning that a buffer can exist in two
 * places. The reason for the ARC_l2c_only state is to keep the
 * buffer header in the hash table, so that reads that hit the
 * second level ARC benefit from these fast lookups.
 */

typedef struct arc_state {
	/*
	 * list of evictable buffers
	 */
	multilist_t *arcs_list[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of evictable data in this state
	 */
	zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES];
	/*
	 * total amount of data in this state; this includes: evictable,
	 * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA.
453 */ 454 zfs_refcount_t arcs_size; 455 } arc_state_t; 456 457 /* The 6 states: */ 458 static arc_state_t ARC_anon; 459 static arc_state_t ARC_mru; 460 static arc_state_t ARC_mru_ghost; 461 static arc_state_t ARC_mfu; 462 static arc_state_t ARC_mfu_ghost; 463 static arc_state_t ARC_l2c_only; 464 465 typedef struct arc_stats { 466 kstat_named_t arcstat_hits; 467 kstat_named_t arcstat_misses; 468 kstat_named_t arcstat_demand_data_hits; 469 kstat_named_t arcstat_demand_data_misses; 470 kstat_named_t arcstat_demand_metadata_hits; 471 kstat_named_t arcstat_demand_metadata_misses; 472 kstat_named_t arcstat_prefetch_data_hits; 473 kstat_named_t arcstat_prefetch_data_misses; 474 kstat_named_t arcstat_prefetch_metadata_hits; 475 kstat_named_t arcstat_prefetch_metadata_misses; 476 kstat_named_t arcstat_mru_hits; 477 kstat_named_t arcstat_mru_ghost_hits; 478 kstat_named_t arcstat_mfu_hits; 479 kstat_named_t arcstat_mfu_ghost_hits; 480 kstat_named_t arcstat_deleted; 481 /* 482 * Number of buffers that could not be evicted because the hash lock 483 * was held by another thread. The lock may not necessarily be held 484 * by something using the same buffer, since hash locks are shared 485 * by multiple buffers. 486 */ 487 kstat_named_t arcstat_mutex_miss; 488 /* 489 * Number of buffers skipped when updating the access state due to the 490 * header having already been released after acquiring the hash lock. 491 */ 492 kstat_named_t arcstat_access_skip; 493 /* 494 * Number of buffers skipped because they have I/O in progress, are 495 * indirect prefetch buffers that have not lived long enough, or are 496 * not from the spa we're trying to evict from. 497 */ 498 kstat_named_t arcstat_evict_skip; 499 /* 500 * Number of times arc_evict_state() was unable to evict enough 501 * buffers to reach its target amount. 502 */ 503 kstat_named_t arcstat_evict_not_enough; 504 kstat_named_t arcstat_evict_l2_cached; 505 kstat_named_t arcstat_evict_l2_eligible; 506 kstat_named_t arcstat_evict_l2_ineligible; 507 kstat_named_t arcstat_evict_l2_skip; 508 kstat_named_t arcstat_hash_elements; 509 kstat_named_t arcstat_hash_elements_max; 510 kstat_named_t arcstat_hash_collisions; 511 kstat_named_t arcstat_hash_chains; 512 kstat_named_t arcstat_hash_chain_max; 513 kstat_named_t arcstat_p; 514 kstat_named_t arcstat_c; 515 kstat_named_t arcstat_c_min; 516 kstat_named_t arcstat_c_max; 517 /* Not updated directly; only synced in arc_kstat_update. */ 518 kstat_named_t arcstat_size; 519 /* 520 * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. 521 * Note that the compressed bytes may match the uncompressed bytes 522 * if the block is either not compressed or compressed arc is disabled. 523 */ 524 kstat_named_t arcstat_compressed_size; 525 /* 526 * Uncompressed size of the data stored in b_pabd. If compressed 527 * arc is disabled then this value will be identical to the stat 528 * above. 529 */ 530 kstat_named_t arcstat_uncompressed_size; 531 /* 532 * Number of bytes stored in all the arc_buf_t's. This is classified 533 * as "overhead" since this data is typically short-lived and will 534 * be evicted from the arc when it becomes unreferenced unless the 535 * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level 536 * values have been set (see comment in dbuf.c for more information). 537 */ 538 kstat_named_t arcstat_overhead_size; 539 /* 540 * Number of bytes consumed by internal ARC structures necessary 541 * for tracking purposes; these structures are not actually 542 * backed by ARC buffers. 
This includes arc_buf_hdr_t structures 543 * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only 544 * caches), and arc_buf_t structures (allocated via arc_buf_t 545 * cache). 546 * Not updated directly; only synced in arc_kstat_update. 547 */ 548 kstat_named_t arcstat_hdr_size; 549 /* 550 * Number of bytes consumed by ARC buffers of type equal to 551 * ARC_BUFC_DATA. This is generally consumed by buffers backing 552 * on disk user data (e.g. plain file contents). 553 * Not updated directly; only synced in arc_kstat_update. 554 */ 555 kstat_named_t arcstat_data_size; 556 /* 557 * Number of bytes consumed by ARC buffers of type equal to 558 * ARC_BUFC_METADATA. This is generally consumed by buffers 559 * backing on disk data that is used for internal ZFS 560 * structures (e.g. ZAP, dnode, indirect blocks, etc). 561 * Not updated directly; only synced in arc_kstat_update. 562 */ 563 kstat_named_t arcstat_metadata_size; 564 /* 565 * Number of bytes consumed by various buffers and structures 566 * not actually backed with ARC buffers. This includes bonus 567 * buffers (allocated directly via zio_buf_* functions), 568 * dmu_buf_impl_t structures (allocated via dmu_buf_impl_t 569 * cache), and dnode_t structures (allocated via dnode_t cache). 570 * Not updated directly; only synced in arc_kstat_update. 571 */ 572 kstat_named_t arcstat_other_size; 573 /* 574 * Total number of bytes consumed by ARC buffers residing in the 575 * arc_anon state. This includes *all* buffers in the arc_anon 576 * state; e.g. data, metadata, evictable, and unevictable buffers 577 * are all included in this value. 578 * Not updated directly; only synced in arc_kstat_update. 579 */ 580 kstat_named_t arcstat_anon_size; 581 /* 582 * Number of bytes consumed by ARC buffers that meet the 583 * following criteria: backing buffers of type ARC_BUFC_DATA, 584 * residing in the arc_anon state, and are eligible for eviction 585 * (e.g. have no outstanding holds on the buffer). 586 * Not updated directly; only synced in arc_kstat_update. 587 */ 588 kstat_named_t arcstat_anon_evictable_data; 589 /* 590 * Number of bytes consumed by ARC buffers that meet the 591 * following criteria: backing buffers of type ARC_BUFC_METADATA, 592 * residing in the arc_anon state, and are eligible for eviction 593 * (e.g. have no outstanding holds on the buffer). 594 * Not updated directly; only synced in arc_kstat_update. 595 */ 596 kstat_named_t arcstat_anon_evictable_metadata; 597 /* 598 * Total number of bytes consumed by ARC buffers residing in the 599 * arc_mru state. This includes *all* buffers in the arc_mru 600 * state; e.g. data, metadata, evictable, and unevictable buffers 601 * are all included in this value. 602 * Not updated directly; only synced in arc_kstat_update. 603 */ 604 kstat_named_t arcstat_mru_size; 605 /* 606 * Number of bytes consumed by ARC buffers that meet the 607 * following criteria: backing buffers of type ARC_BUFC_DATA, 608 * residing in the arc_mru state, and are eligible for eviction 609 * (e.g. have no outstanding holds on the buffer). 610 * Not updated directly; only synced in arc_kstat_update. 611 */ 612 kstat_named_t arcstat_mru_evictable_data; 613 /* 614 * Number of bytes consumed by ARC buffers that meet the 615 * following criteria: backing buffers of type ARC_BUFC_METADATA, 616 * residing in the arc_mru state, and are eligible for eviction 617 * (e.g. have no outstanding holds on the buffer). 618 * Not updated directly; only synced in arc_kstat_update. 
619 */ 620 kstat_named_t arcstat_mru_evictable_metadata; 621 /* 622 * Total number of bytes that *would have been* consumed by ARC 623 * buffers in the arc_mru_ghost state. The key thing to note 624 * here, is the fact that this size doesn't actually indicate 625 * RAM consumption. The ghost lists only consist of headers and 626 * don't actually have ARC buffers linked off of these headers. 627 * Thus, *if* the headers had associated ARC buffers, these 628 * buffers *would have* consumed this number of bytes. 629 * Not updated directly; only synced in arc_kstat_update. 630 */ 631 kstat_named_t arcstat_mru_ghost_size; 632 /* 633 * Number of bytes that *would have been* consumed by ARC 634 * buffers that are eligible for eviction, of type 635 * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. 636 * Not updated directly; only synced in arc_kstat_update. 637 */ 638 kstat_named_t arcstat_mru_ghost_evictable_data; 639 /* 640 * Number of bytes that *would have been* consumed by ARC 641 * buffers that are eligible for eviction, of type 642 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 643 * Not updated directly; only synced in arc_kstat_update. 644 */ 645 kstat_named_t arcstat_mru_ghost_evictable_metadata; 646 /* 647 * Total number of bytes consumed by ARC buffers residing in the 648 * arc_mfu state. This includes *all* buffers in the arc_mfu 649 * state; e.g. data, metadata, evictable, and unevictable buffers 650 * are all included in this value. 651 * Not updated directly; only synced in arc_kstat_update. 652 */ 653 kstat_named_t arcstat_mfu_size; 654 /* 655 * Number of bytes consumed by ARC buffers that are eligible for 656 * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu 657 * state. 658 * Not updated directly; only synced in arc_kstat_update. 659 */ 660 kstat_named_t arcstat_mfu_evictable_data; 661 /* 662 * Number of bytes consumed by ARC buffers that are eligible for 663 * eviction, of type ARC_BUFC_METADATA, and reside in the 664 * arc_mfu state. 665 * Not updated directly; only synced in arc_kstat_update. 666 */ 667 kstat_named_t arcstat_mfu_evictable_metadata; 668 /* 669 * Total number of bytes that *would have been* consumed by ARC 670 * buffers in the arc_mfu_ghost state. See the comment above 671 * arcstat_mru_ghost_size for more details. 672 * Not updated directly; only synced in arc_kstat_update. 673 */ 674 kstat_named_t arcstat_mfu_ghost_size; 675 /* 676 * Number of bytes that *would have been* consumed by ARC 677 * buffers that are eligible for eviction, of type 678 * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. 679 * Not updated directly; only synced in arc_kstat_update. 680 */ 681 kstat_named_t arcstat_mfu_ghost_evictable_data; 682 /* 683 * Number of bytes that *would have been* consumed by ARC 684 * buffers that are eligible for eviction, of type 685 * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. 686 * Not updated directly; only synced in arc_kstat_update. 
687 */ 688 kstat_named_t arcstat_mfu_ghost_evictable_metadata; 689 kstat_named_t arcstat_l2_hits; 690 kstat_named_t arcstat_l2_misses; 691 kstat_named_t arcstat_l2_feeds; 692 kstat_named_t arcstat_l2_rw_clash; 693 kstat_named_t arcstat_l2_read_bytes; 694 kstat_named_t arcstat_l2_write_bytes; 695 kstat_named_t arcstat_l2_writes_sent; 696 kstat_named_t arcstat_l2_writes_done; 697 kstat_named_t arcstat_l2_writes_error; 698 kstat_named_t arcstat_l2_writes_lock_retry; 699 kstat_named_t arcstat_l2_evict_lock_retry; 700 kstat_named_t arcstat_l2_evict_reading; 701 kstat_named_t arcstat_l2_evict_l1cached; 702 kstat_named_t arcstat_l2_free_on_write; 703 kstat_named_t arcstat_l2_abort_lowmem; 704 kstat_named_t arcstat_l2_cksum_bad; 705 kstat_named_t arcstat_l2_io_error; 706 kstat_named_t arcstat_l2_lsize; 707 kstat_named_t arcstat_l2_psize; 708 /* Not updated directly; only synced in arc_kstat_update. */ 709 kstat_named_t arcstat_l2_hdr_size; 710 kstat_named_t arcstat_memory_throttle_count; 711 /* Not updated directly; only synced in arc_kstat_update. */ 712 kstat_named_t arcstat_meta_used; 713 kstat_named_t arcstat_meta_limit; 714 kstat_named_t arcstat_meta_max; 715 kstat_named_t arcstat_meta_min; 716 kstat_named_t arcstat_async_upgrade_sync; 717 kstat_named_t arcstat_demand_hit_predictive_prefetch; 718 kstat_named_t arcstat_demand_hit_prescient_prefetch; 719 } arc_stats_t; 720 721 static arc_stats_t arc_stats = { 722 { "hits", KSTAT_DATA_UINT64 }, 723 { "misses", KSTAT_DATA_UINT64 }, 724 { "demand_data_hits", KSTAT_DATA_UINT64 }, 725 { "demand_data_misses", KSTAT_DATA_UINT64 }, 726 { "demand_metadata_hits", KSTAT_DATA_UINT64 }, 727 { "demand_metadata_misses", KSTAT_DATA_UINT64 }, 728 { "prefetch_data_hits", KSTAT_DATA_UINT64 }, 729 { "prefetch_data_misses", KSTAT_DATA_UINT64 }, 730 { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, 731 { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, 732 { "mru_hits", KSTAT_DATA_UINT64 }, 733 { "mru_ghost_hits", KSTAT_DATA_UINT64 }, 734 { "mfu_hits", KSTAT_DATA_UINT64 }, 735 { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, 736 { "deleted", KSTAT_DATA_UINT64 }, 737 { "mutex_miss", KSTAT_DATA_UINT64 }, 738 { "access_skip", KSTAT_DATA_UINT64 }, 739 { "evict_skip", KSTAT_DATA_UINT64 }, 740 { "evict_not_enough", KSTAT_DATA_UINT64 }, 741 { "evict_l2_cached", KSTAT_DATA_UINT64 }, 742 { "evict_l2_eligible", KSTAT_DATA_UINT64 }, 743 { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, 744 { "evict_l2_skip", KSTAT_DATA_UINT64 }, 745 { "hash_elements", KSTAT_DATA_UINT64 }, 746 { "hash_elements_max", KSTAT_DATA_UINT64 }, 747 { "hash_collisions", KSTAT_DATA_UINT64 }, 748 { "hash_chains", KSTAT_DATA_UINT64 }, 749 { "hash_chain_max", KSTAT_DATA_UINT64 }, 750 { "p", KSTAT_DATA_UINT64 }, 751 { "c", KSTAT_DATA_UINT64 }, 752 { "c_min", KSTAT_DATA_UINT64 }, 753 { "c_max", KSTAT_DATA_UINT64 }, 754 { "size", KSTAT_DATA_UINT64 }, 755 { "compressed_size", KSTAT_DATA_UINT64 }, 756 { "uncompressed_size", KSTAT_DATA_UINT64 }, 757 { "overhead_size", KSTAT_DATA_UINT64 }, 758 { "hdr_size", KSTAT_DATA_UINT64 }, 759 { "data_size", KSTAT_DATA_UINT64 }, 760 { "metadata_size", KSTAT_DATA_UINT64 }, 761 { "other_size", KSTAT_DATA_UINT64 }, 762 { "anon_size", KSTAT_DATA_UINT64 }, 763 { "anon_evictable_data", KSTAT_DATA_UINT64 }, 764 { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, 765 { "mru_size", KSTAT_DATA_UINT64 }, 766 { "mru_evictable_data", KSTAT_DATA_UINT64 }, 767 { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, 768 { "mru_ghost_size", KSTAT_DATA_UINT64 }, 769 { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 
}, 770 { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 771 { "mfu_size", KSTAT_DATA_UINT64 }, 772 { "mfu_evictable_data", KSTAT_DATA_UINT64 }, 773 { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, 774 { "mfu_ghost_size", KSTAT_DATA_UINT64 }, 775 { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, 776 { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, 777 { "l2_hits", KSTAT_DATA_UINT64 }, 778 { "l2_misses", KSTAT_DATA_UINT64 }, 779 { "l2_feeds", KSTAT_DATA_UINT64 }, 780 { "l2_rw_clash", KSTAT_DATA_UINT64 }, 781 { "l2_read_bytes", KSTAT_DATA_UINT64 }, 782 { "l2_write_bytes", KSTAT_DATA_UINT64 }, 783 { "l2_writes_sent", KSTAT_DATA_UINT64 }, 784 { "l2_writes_done", KSTAT_DATA_UINT64 }, 785 { "l2_writes_error", KSTAT_DATA_UINT64 }, 786 { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, 787 { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, 788 { "l2_evict_reading", KSTAT_DATA_UINT64 }, 789 { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, 790 { "l2_free_on_write", KSTAT_DATA_UINT64 }, 791 { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, 792 { "l2_cksum_bad", KSTAT_DATA_UINT64 }, 793 { "l2_io_error", KSTAT_DATA_UINT64 }, 794 { "l2_size", KSTAT_DATA_UINT64 }, 795 { "l2_asize", KSTAT_DATA_UINT64 }, 796 { "l2_hdr_size", KSTAT_DATA_UINT64 }, 797 { "memory_throttle_count", KSTAT_DATA_UINT64 }, 798 { "arc_meta_used", KSTAT_DATA_UINT64 }, 799 { "arc_meta_limit", KSTAT_DATA_UINT64 }, 800 { "arc_meta_max", KSTAT_DATA_UINT64 }, 801 { "arc_meta_min", KSTAT_DATA_UINT64 }, 802 { "async_upgrade_sync", KSTAT_DATA_UINT64 }, 803 { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, 804 { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, 805 }; 806 807 #define ARCSTAT(stat) (arc_stats.stat.value.ui64) 808 809 #define ARCSTAT_INCR(stat, val) \ 810 atomic_add_64(&arc_stats.stat.value.ui64, (val)) 811 812 #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) 813 #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) 814 815 #define ARCSTAT_MAX(stat, val) { \ 816 uint64_t m; \ 817 while ((val) > (m = arc_stats.stat.value.ui64) && \ 818 (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ 819 continue; \ 820 } 821 822 #define ARCSTAT_MAXSTAT(stat) \ 823 ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) 824 825 /* 826 * We define a macro to allow ARC hits/misses to be easily broken down by 827 * two separate conditions, giving a total of four different subtypes for 828 * each of hits and misses (so eight statistics total). 829 */ 830 #define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ 831 if (cond1) { \ 832 if (cond2) { \ 833 ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ 834 } else { \ 835 ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ 836 } \ 837 } else { \ 838 if (cond2) { \ 839 ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ 840 } else { \ 841 ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ 842 } \ 843 } 844 845 kstat_t *arc_ksp; 846 static arc_state_t *arc_anon; 847 static arc_state_t *arc_mru; 848 static arc_state_t *arc_mru_ghost; 849 static arc_state_t *arc_mfu; 850 static arc_state_t *arc_mfu_ghost; 851 static arc_state_t *arc_l2c_only; 852 853 /* 854 * There are several ARC variables that are critical to export as kstats -- 855 * but we don't want to have to grovel around in the kstat whenever we wish to 856 * manipulate them. For these variables, we therefore define them to be in 857 * terms of the statistic variable. 
This assures that we are not introducing 858 * the possibility of inconsistency by having shadow copies of the variables, 859 * while still allowing the code to be readable. 860 */ 861 #define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ 862 #define arc_c ARCSTAT(arcstat_c) /* target size of cache */ 863 #define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ 864 #define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ 865 #define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ 866 #define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ 867 #define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ 868 869 /* compressed size of entire arc */ 870 #define arc_compressed_size ARCSTAT(arcstat_compressed_size) 871 /* uncompressed size of entire arc */ 872 #define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) 873 /* number of bytes in the arc from arc_buf_t's */ 874 #define arc_overhead_size ARCSTAT(arcstat_overhead_size) 875 876 /* 877 * There are also some ARC variables that we want to export, but that are 878 * updated so often that having the canonical representation be the statistic 879 * variable causes a performance bottleneck. We want to use aggsum_t's for these 880 * instead, but still be able to export the kstat in the same way as before. 881 * The solution is to always use the aggsum version, except in the kstat update 882 * callback. 883 */ 884 aggsum_t arc_size; 885 aggsum_t arc_meta_used; 886 aggsum_t astat_data_size; 887 aggsum_t astat_metadata_size; 888 aggsum_t astat_hdr_size; 889 aggsum_t astat_other_size; 890 aggsum_t astat_l2_hdr_size; 891 892 static int arc_no_grow; /* Don't try to grow cache size */ 893 static hrtime_t arc_growtime; 894 static uint64_t arc_tempreserve; 895 static uint64_t arc_loaned_bytes; 896 897 typedef struct arc_callback arc_callback_t; 898 899 struct arc_callback { 900 void *acb_private; 901 arc_read_done_func_t *acb_done; 902 arc_buf_t *acb_buf; 903 boolean_t acb_encrypted; 904 boolean_t acb_compressed; 905 boolean_t acb_noauth; 906 zbookmark_phys_t acb_zb; 907 zio_t *acb_zio_dummy; 908 zio_t *acb_zio_head; 909 arc_callback_t *acb_next; 910 }; 911 912 typedef struct arc_write_callback arc_write_callback_t; 913 914 struct arc_write_callback { 915 void *awcb_private; 916 arc_write_done_func_t *awcb_ready; 917 arc_write_done_func_t *awcb_children_ready; 918 arc_write_done_func_t *awcb_physdone; 919 arc_write_done_func_t *awcb_done; 920 arc_buf_t *awcb_buf; 921 }; 922 923 /* 924 * ARC buffers are separated into multiple structs as a memory saving measure: 925 * - Common fields struct, always defined, and embedded within it: 926 * - L2-only fields, always allocated but undefined when not in L2ARC 927 * - L1-only fields, only allocated when in L1ARC 928 * 929 * Buffer in L1 Buffer only in L2 930 * +------------------------+ +------------------------+ 931 * | arc_buf_hdr_t | | arc_buf_hdr_t | 932 * | | | | 933 * | | | | 934 * | | | | 935 * +------------------------+ +------------------------+ 936 * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | 937 * | (undefined if L1-only) | | | 938 * +------------------------+ +------------------------+ 939 * | l1arc_buf_hdr_t | 940 * | | 941 * | | 942 * | | 943 * | | 944 * +------------------------+ 945 * 946 * Because it's possible for the L2ARC to become extremely large, we can wind 947 * up eating a lot of memory in L2ARC buffer headers, so the size of a header 948 * is minimized by only allocating the fields necessary for an 
 * L1-cached buffer when a header is actually in the L1 cache. The sub-headers
 * (l1arc_buf_hdr and l2arc_buf_hdr) are embedded rather than allocated
 * separately to save a couple words in pointers. arc_hdr_realloc() is used to
 * switch a header between these two allocation states.
 */
typedef struct l1arc_buf_hdr {
	kmutex_t		b_freeze_lock;
	zio_cksum_t		*b_freeze_cksum;
#ifdef ZFS_DEBUG
	/*
	 * Used for debugging with kmem_flags - by allocating and freeing
	 * b_thawed when the buffer is thawed, we get a record of the stack
	 * trace that thawed it.
	 */
	void			*b_thawed;
#endif

	arc_buf_t		*b_buf;
	uint32_t		b_bufcnt;
	/* for waiting on writes to complete */
	kcondvar_t		b_cv;
	uint8_t			b_byteswap;

	/* protected by arc state mutex */
	arc_state_t		*b_state;
	multilist_node_t	b_arc_node;

	/* updated atomically */
	clock_t			b_arc_access;

	/* self protecting */
	zfs_refcount_t		b_refcnt;

	arc_callback_t		*b_acb;
	abd_t			*b_pabd;
} l1arc_buf_hdr_t;

/*
 * Encrypted blocks will need to be stored encrypted on the L2ARC
 * disk as they appear in the main pool. In order for this to work we
 * need to pass around the encryption parameters so they can be used
 * to write data to the L2ARC. This struct is only defined in the
 * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED
 * flag set.
 */
typedef struct arc_buf_hdr_crypt {
	abd_t			*b_rabd;	/* raw encrypted data */
	dmu_object_type_t	b_ot;		/* object type */
	uint32_t		b_ebufcnt;	/* number of encrypted buffers */

	/* dsobj for looking up encryption key for l2arc encryption */
	uint64_t		b_dsobj;	/* for looking up key */

	/* encryption parameters */
	uint8_t			b_salt[ZIO_DATA_SALT_LEN];
	uint8_t			b_iv[ZIO_DATA_IV_LEN];

	/*
	 * Technically this could be removed since we will always be able to
	 * get the mac from the bp when we need it. However, it is inconvenient
	 * for callers of arc code to have to pass a bp in all the time. This
	 * also allows us to assert that L2ARC data is properly encrypted to
	 * match the data in the main storage pool.
	 */
	uint8_t			b_mac[ZIO_DATA_MAC_LEN];
} arc_buf_hdr_crypt_t;

typedef struct l2arc_dev l2arc_dev_t;

typedef struct l2arc_buf_hdr {
	/* protected by arc_buf_hdr mutex */
	l2arc_dev_t		*b_dev;		/* L2ARC device */
	uint64_t		b_daddr;	/* disk address, offset byte */

	list_node_t		b_l2node;
} l2arc_buf_hdr_t;

struct arc_buf_hdr {
	/* protected by hash lock */
	dva_t			b_dva;
	uint64_t		b_birth;

	arc_buf_contents_t	b_type;
	arc_buf_hdr_t		*b_hash_next;
	arc_flags_t		b_flags;

	/*
	 * This field stores the size of the data buffer after
	 * compression, and is set in the arc's zio completion handlers.
	 * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes).
	 *
	 * While the block pointers can store up to 32MB in their psize
	 * field, we can only store up to 32MB minus 512B. This is due
	 * to the bp using a bias of 1, whereas we use a bias of 0 (i.e.
	 * a field of zeros represents 512B in the bp). We can't use a
	 * bias of 1 since we need to reserve a psize of zero, here, to
	 * represent holes and embedded blocks.
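	 *
	 * For example (illustrative arithmetic, not code from this file):
	 * a 4K physical block is recorded here as
	 *
	 *	b_psize = 4096 >> SPA_MINBLOCKSHIFT = 8
	 *
	 * while the same block's psize in a block pointer is encoded as 7
	 * because of the bp's bias of 1; b_psize == 0 is instead reserved
	 * for holes and embedded blocks rather than meaning 512 bytes.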
1046 * 1047 * This isn't a problem in practice, since the maximum size of a 1048 * buffer is limited to 16MB, so we never need to store 32MB in 1049 * this field. Even in the upstream illumos code base, the 1050 * maximum size of a buffer is limited to 16MB. 1051 */ 1052 uint16_t b_psize; 1053 1054 /* 1055 * This field stores the size of the data buffer before 1056 * compression, and cannot change once set. It is in units 1057 * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) 1058 */ 1059 uint16_t b_lsize; /* immutable */ 1060 uint64_t b_spa; /* immutable */ 1061 1062 /* L2ARC fields. Undefined when not in L2ARC. */ 1063 l2arc_buf_hdr_t b_l2hdr; 1064 /* L1ARC fields. Undefined when in l2arc_only state */ 1065 l1arc_buf_hdr_t b_l1hdr; 1066 /* 1067 * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED 1068 * is set and the L1 header exists. 1069 */ 1070 arc_buf_hdr_crypt_t b_crypt_hdr; 1071 }; 1072 1073 #define GHOST_STATE(state) \ 1074 ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ 1075 (state) == arc_l2c_only) 1076 1077 #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) 1078 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) 1079 #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) 1080 #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) 1081 #define HDR_PRESCIENT_PREFETCH(hdr) \ 1082 ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) 1083 #define HDR_COMPRESSION_ENABLED(hdr) \ 1084 ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) 1085 1086 #define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) 1087 #define HDR_L2_READING(hdr) \ 1088 (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ 1089 ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) 1090 #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) 1091 #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) 1092 #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) 1093 #define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) 1094 #define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) 1095 #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) 1096 1097 #define HDR_ISTYPE_METADATA(hdr) \ 1098 ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) 1099 #define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) 1100 1101 #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) 1102 #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) 1103 #define HDR_HAS_RABD(hdr) \ 1104 (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ 1105 (hdr)->b_crypt_hdr.b_rabd != NULL) 1106 #define HDR_ENCRYPTED(hdr) \ 1107 (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) 1108 #define HDR_AUTHENTICATED(hdr) \ 1109 (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) 1110 1111 /* For storing compression mode in b_flags */ 1112 #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) 1113 1114 #define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ 1115 HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) 1116 #define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ 1117 HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); 1118 1119 #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) 1120 #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) 1121 #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) 1122 #define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) 1123 1124 /* 1125 * Other sizes 1126 */ 1127 1128 #define HDR_FULL_CRYPT_SIZE 
((int64_t)sizeof (arc_buf_hdr_t)) 1129 #define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) 1130 #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) 1131 1132 /* 1133 * Hash table routines 1134 */ 1135 1136 #define HT_LOCK_PAD 64 1137 1138 struct ht_lock { 1139 kmutex_t ht_lock; 1140 #ifdef _KERNEL 1141 unsigned char pad[(HT_LOCK_PAD - sizeof (kmutex_t))]; 1142 #endif 1143 }; 1144 1145 #define BUF_LOCKS 256 1146 typedef struct buf_hash_table { 1147 uint64_t ht_mask; 1148 arc_buf_hdr_t **ht_table; 1149 struct ht_lock ht_locks[BUF_LOCKS]; 1150 } buf_hash_table_t; 1151 1152 static buf_hash_table_t buf_hash_table; 1153 1154 #define BUF_HASH_INDEX(spa, dva, birth) \ 1155 (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) 1156 #define BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) 1157 #define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) 1158 #define HDR_LOCK(hdr) \ 1159 (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) 1160 1161 uint64_t zfs_crc64_table[256]; 1162 1163 /* 1164 * Level 2 ARC 1165 */ 1166 1167 #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ 1168 #define L2ARC_HEADROOM 2 /* num of writes */ 1169 /* 1170 * If we discover during ARC scan any buffers to be compressed, we boost 1171 * our headroom for the next scanning cycle by this percentage multiple. 1172 */ 1173 #define L2ARC_HEADROOM_BOOST 200 1174 #define L2ARC_FEED_SECS 1 /* caching interval secs */ 1175 #define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ 1176 1177 #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) 1178 #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) 1179 1180 /* L2ARC Performance Tunables */ 1181 uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ 1182 uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ 1183 uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ 1184 uint64_t l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; 1185 uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ 1186 uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ 1187 boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ 1188 boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ 1189 boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ 1190 1191 /* 1192 * L2ARC Internals 1193 */ 1194 struct l2arc_dev { 1195 vdev_t *l2ad_vdev; /* vdev */ 1196 spa_t *l2ad_spa; /* spa */ 1197 uint64_t l2ad_hand; /* next write location */ 1198 uint64_t l2ad_start; /* first addr on device */ 1199 uint64_t l2ad_end; /* last addr on device */ 1200 boolean_t l2ad_first; /* first sweep through */ 1201 boolean_t l2ad_writing; /* currently writing */ 1202 kmutex_t l2ad_mtx; /* lock for buffer list */ 1203 list_t l2ad_buflist; /* buffer list */ 1204 list_node_t l2ad_node; /* device list node */ 1205 zfs_refcount_t l2ad_alloc; /* allocated bytes */ 1206 }; 1207 1208 static list_t L2ARC_dev_list; /* device list */ 1209 static list_t *l2arc_dev_list; /* device list pointer */ 1210 static kmutex_t l2arc_dev_mtx; /* device list mutex */ 1211 static l2arc_dev_t *l2arc_dev_last; /* last device used */ 1212 static list_t L2ARC_free_on_write; /* free after write buf list */ 1213 static list_t *l2arc_free_on_write; /* free after write list ptr */ 1214 static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ 1215 static uint64_t l2arc_ndev; /* number of devices */ 1216 1217 typedef struct l2arc_read_callback { 1218 
arc_buf_hdr_t *l2rcb_hdr; /* read header */ 1219 blkptr_t l2rcb_bp; /* original blkptr */ 1220 zbookmark_phys_t l2rcb_zb; /* original bookmark */ 1221 int l2rcb_flags; /* original flags */ 1222 abd_t *l2rcb_abd; /* temporary buffer */ 1223 } l2arc_read_callback_t; 1224 1225 typedef struct l2arc_write_callback { 1226 l2arc_dev_t *l2wcb_dev; /* device info */ 1227 arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ 1228 } l2arc_write_callback_t; 1229 1230 typedef struct l2arc_data_free { 1231 /* protected by l2arc_free_on_write_mtx */ 1232 abd_t *l2df_abd; 1233 size_t l2df_size; 1234 arc_buf_contents_t l2df_type; 1235 list_node_t l2df_list_node; 1236 } l2arc_data_free_t; 1237 1238 static kmutex_t l2arc_feed_thr_lock; 1239 static kcondvar_t l2arc_feed_thr_cv; 1240 static uint8_t l2arc_thread_exit; 1241 1242 static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); 1243 typedef enum arc_fill_flags { 1244 ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ 1245 ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ 1246 ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ 1247 ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ 1248 ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ 1249 } arc_fill_flags_t; 1250 1251 static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); 1252 static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); 1253 static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); 1254 static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); 1255 static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); 1256 static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t); 1257 static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); 1258 static void arc_access(arc_buf_hdr_t *, kmutex_t *); 1259 static boolean_t arc_is_overflowing(); 1260 static void arc_buf_watch(arc_buf_t *); 1261 1262 static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); 1263 static uint32_t arc_bufc_to_flags(arc_buf_contents_t); 1264 static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1265 static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); 1266 1267 static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); 1268 static void l2arc_read_done(zio_t *); 1269 1270 1271 /* 1272 * We use Cityhash for this. It's fast, and has good hash properties without 1273 * requiring any large static buffers. 
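 *
 * A typical lookup (a sketch of what buf_hash_find() below does) combines
 * this hash with the BUF_HASH_INDEX() and BUF_HASH_LOCK() macros defined
 * above:
 *
 *	idx = BUF_HASH_INDEX(spa, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp));
 *	mutex_enter(BUF_HASH_LOCK(idx));
 *	walk buf_hash_table.ht_table[idx] via b_hash_next until HDR_EQUAL()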
1274 */ 1275 static uint64_t 1276 buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) 1277 { 1278 return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); 1279 } 1280 1281 #define HDR_EMPTY(hdr) \ 1282 ((hdr)->b_dva.dva_word[0] == 0 && \ 1283 (hdr)->b_dva.dva_word[1] == 0) 1284 1285 #define HDR_EQUAL(spa, dva, birth, hdr) \ 1286 ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ 1287 ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ 1288 ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) 1289 1290 static void 1291 buf_discard_identity(arc_buf_hdr_t *hdr) 1292 { 1293 hdr->b_dva.dva_word[0] = 0; 1294 hdr->b_dva.dva_word[1] = 0; 1295 hdr->b_birth = 0; 1296 } 1297 1298 static arc_buf_hdr_t * 1299 buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) 1300 { 1301 const dva_t *dva = BP_IDENTITY(bp); 1302 uint64_t birth = BP_PHYSICAL_BIRTH(bp); 1303 uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); 1304 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1305 arc_buf_hdr_t *hdr; 1306 1307 mutex_enter(hash_lock); 1308 for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; 1309 hdr = hdr->b_hash_next) { 1310 if (HDR_EQUAL(spa, dva, birth, hdr)) { 1311 *lockp = hash_lock; 1312 return (hdr); 1313 } 1314 } 1315 mutex_exit(hash_lock); 1316 *lockp = NULL; 1317 return (NULL); 1318 } 1319 1320 /* 1321 * Insert an entry into the hash table. If there is already an element 1322 * equal to elem in the hash table, then the already existing element 1323 * will be returned and the new element will not be inserted. 1324 * Otherwise returns NULL. 1325 * If lockp == NULL, the caller is assumed to already hold the hash lock. 1326 */ 1327 static arc_buf_hdr_t * 1328 buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) 1329 { 1330 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1331 kmutex_t *hash_lock = BUF_HASH_LOCK(idx); 1332 arc_buf_hdr_t *fhdr; 1333 uint32_t i; 1334 1335 ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); 1336 ASSERT(hdr->b_birth != 0); 1337 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 1338 1339 if (lockp != NULL) { 1340 *lockp = hash_lock; 1341 mutex_enter(hash_lock); 1342 } else { 1343 ASSERT(MUTEX_HELD(hash_lock)); 1344 } 1345 1346 for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; 1347 fhdr = fhdr->b_hash_next, i++) { 1348 if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) 1349 return (fhdr); 1350 } 1351 1352 hdr->b_hash_next = buf_hash_table.ht_table[idx]; 1353 buf_hash_table.ht_table[idx] = hdr; 1354 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1355 1356 /* collect some hash table performance data */ 1357 if (i > 0) { 1358 ARCSTAT_BUMP(arcstat_hash_collisions); 1359 if (i == 1) 1360 ARCSTAT_BUMP(arcstat_hash_chains); 1361 1362 ARCSTAT_MAX(arcstat_hash_chain_max, i); 1363 } 1364 1365 ARCSTAT_BUMP(arcstat_hash_elements); 1366 ARCSTAT_MAXSTAT(arcstat_hash_elements); 1367 1368 return (NULL); 1369 } 1370 1371 static void 1372 buf_hash_remove(arc_buf_hdr_t *hdr) 1373 { 1374 arc_buf_hdr_t *fhdr, **hdrp; 1375 uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); 1376 1377 ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); 1378 ASSERT(HDR_IN_HASH_TABLE(hdr)); 1379 1380 hdrp = &buf_hash_table.ht_table[idx]; 1381 while ((fhdr = *hdrp) != hdr) { 1382 ASSERT3P(fhdr, !=, NULL); 1383 hdrp = &fhdr->b_hash_next; 1384 } 1385 *hdrp = hdr->b_hash_next; 1386 hdr->b_hash_next = NULL; 1387 arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 1388 1389 /* collect some hash table performance data */ 1390 ARCSTAT_BUMPDOWN(arcstat_hash_elements); 1391 1392 if 
(buf_hash_table.ht_table[idx] && 1393 buf_hash_table.ht_table[idx]->b_hash_next == NULL) 1394 ARCSTAT_BUMPDOWN(arcstat_hash_chains); 1395 } 1396 1397 /* 1398 * Global data structures and functions for the buf kmem cache. 1399 */ 1400 1401 static kmem_cache_t *hdr_full_cache; 1402 static kmem_cache_t *hdr_full_crypt_cache; 1403 static kmem_cache_t *hdr_l2only_cache; 1404 static kmem_cache_t *buf_cache; 1405 1406 static void 1407 buf_fini(void) 1408 { 1409 int i; 1410 1411 kmem_free(buf_hash_table.ht_table, 1412 (buf_hash_table.ht_mask + 1) * sizeof (void *)); 1413 for (i = 0; i < BUF_LOCKS; i++) 1414 mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); 1415 kmem_cache_destroy(hdr_full_cache); 1416 kmem_cache_destroy(hdr_full_crypt_cache); 1417 kmem_cache_destroy(hdr_l2only_cache); 1418 kmem_cache_destroy(buf_cache); 1419 } 1420 1421 /* 1422 * Constructor callback - called when the cache is empty 1423 * and a new buf is requested. 1424 */ 1425 /* ARGSUSED */ 1426 static int 1427 hdr_full_cons(void *vbuf, void *unused, int kmflag) 1428 { 1429 arc_buf_hdr_t *hdr = vbuf; 1430 1431 bzero(hdr, HDR_FULL_SIZE); 1432 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 1433 cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); 1434 zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); 1435 mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); 1436 multilist_link_init(&hdr->b_l1hdr.b_arc_node); 1437 arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1438 1439 return (0); 1440 } 1441 1442 /* ARGSUSED */ 1443 static int 1444 hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) 1445 { 1446 arc_buf_hdr_t *hdr = vbuf; 1447 1448 (void) hdr_full_cons(vbuf, unused, kmflag); 1449 bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); 1450 arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); 1451 1452 return (0); 1453 } 1454 1455 /* ARGSUSED */ 1456 static int 1457 hdr_l2only_cons(void *vbuf, void *unused, int kmflag) 1458 { 1459 arc_buf_hdr_t *hdr = vbuf; 1460 1461 bzero(hdr, HDR_L2ONLY_SIZE); 1462 arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1463 1464 return (0); 1465 } 1466 1467 /* ARGSUSED */ 1468 static int 1469 buf_cons(void *vbuf, void *unused, int kmflag) 1470 { 1471 arc_buf_t *buf = vbuf; 1472 1473 bzero(buf, sizeof (arc_buf_t)); 1474 mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); 1475 arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1476 1477 return (0); 1478 } 1479 1480 /* 1481 * Destructor callback - called when a cached buf is 1482 * no longer required. 
1483 */ 1484 /* ARGSUSED */ 1485 static void 1486 hdr_full_dest(void *vbuf, void *unused) 1487 { 1488 arc_buf_hdr_t *hdr = vbuf; 1489 1490 ASSERT(HDR_EMPTY(hdr)); 1491 cv_destroy(&hdr->b_l1hdr.b_cv); 1492 zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); 1493 mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); 1494 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 1495 arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); 1496 } 1497 1498 /* ARGSUSED */ 1499 static void 1500 hdr_full_crypt_dest(void *vbuf, void *unused) 1501 { 1502 arc_buf_hdr_t *hdr = vbuf; 1503 1504 hdr_full_dest(hdr, unused); 1505 arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); 1506 } 1507 1508 /* ARGSUSED */ 1509 static void 1510 hdr_l2only_dest(void *vbuf, void *unused) 1511 { 1512 arc_buf_hdr_t *hdr = vbuf; 1513 1514 ASSERT(HDR_EMPTY(hdr)); 1515 arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); 1516 } 1517 1518 /* ARGSUSED */ 1519 static void 1520 buf_dest(void *vbuf, void *unused) 1521 { 1522 arc_buf_t *buf = vbuf; 1523 1524 mutex_destroy(&buf->b_evict_lock); 1525 arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); 1526 } 1527 1528 /* 1529 * Reclaim callback -- invoked when memory is low. 1530 */ 1531 /* ARGSUSED */ 1532 static void 1533 hdr_recl(void *unused) 1534 { 1535 dprintf("hdr_recl called\n"); 1536 /* 1537 * umem calls the reclaim func when we destroy the buf cache, 1538 * which is after we do arc_fini(). 1539 */ 1540 if (arc_initialized) 1541 zthr_wakeup(arc_reap_zthr); 1542 } 1543 1544 static void 1545 buf_init(void) 1546 { 1547 uint64_t *ct; 1548 uint64_t hsize = 1ULL << 12; 1549 int i, j; 1550 1551 /* 1552 * The hash table is big enough to fill all of physical memory 1553 * with an average block size of zfs_arc_average_blocksize (default 8K). 1554 * By default, the table will take up 1555 * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 1556 */ 1557 while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) 1558 hsize <<= 1; 1559 retry: 1560 buf_hash_table.ht_mask = hsize - 1; 1561 buf_hash_table.ht_table = 1562 kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); 1563 if (buf_hash_table.ht_table == NULL) { 1564 ASSERT(hsize > (1ULL << 8)); 1565 hsize >>= 1; 1566 goto retry; 1567 } 1568 1569 hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 1570 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); 1571 hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", 1572 HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, 1573 hdr_recl, NULL, NULL, 0); 1574 hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", 1575 HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, 1576 NULL, NULL, 0); 1577 buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), 1578 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); 1579 1580 for (i = 0; i < 256; i++) 1581 for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) 1582 *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); 1583 1584 for (i = 0; i < BUF_LOCKS; i++) { 1585 mutex_init(&buf_hash_table.ht_locks[i].ht_lock, 1586 NULL, MUTEX_DEFAULT, NULL); 1587 } 1588 } 1589 1590 /* 1591 * This is the size that the buf occupies in memory. If the buf is compressed, 1592 * it will correspond to the compressed size. You should use this method of 1593 * getting the buf size unless you explicitly need the logical size. 1594 */ 1595 int32_t 1596 arc_buf_size(arc_buf_t *buf) 1597 { 1598 return (ARC_BUF_COMPRESSED(buf) ? 
1599 HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); 1600 } 1601 1602 int32_t 1603 arc_buf_lsize(arc_buf_t *buf) 1604 { 1605 return (HDR_GET_LSIZE(buf->b_hdr)); 1606 } 1607 1608 /* 1609 * This function will return B_TRUE if the buffer is encrypted in memory. 1610 * This buffer can be decrypted by calling arc_untransform(). 1611 */ 1612 boolean_t 1613 arc_is_encrypted(arc_buf_t *buf) 1614 { 1615 return (ARC_BUF_ENCRYPTED(buf) != 0); 1616 } 1617 1618 /* 1619 * Returns B_TRUE if the buffer represents data that has not had its MAC 1620 * verified yet. 1621 */ 1622 boolean_t 1623 arc_is_unauthenticated(arc_buf_t *buf) 1624 { 1625 return (HDR_NOAUTH(buf->b_hdr) != 0); 1626 } 1627 1628 void 1629 arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, 1630 uint8_t *iv, uint8_t *mac) 1631 { 1632 arc_buf_hdr_t *hdr = buf->b_hdr; 1633 1634 ASSERT(HDR_PROTECTED(hdr)); 1635 1636 bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); 1637 bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); 1638 bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); 1639 *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? 1640 /* CONSTCOND */ 1641 ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; 1642 } 1643 1644 /* 1645 * Indicates how this buffer is compressed in memory. If it is not compressed 1646 * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with 1647 * arc_untransform() as long as it is also unencrypted. 1648 */ 1649 enum zio_compress 1650 arc_get_compression(arc_buf_t *buf) 1651 { 1652 return (ARC_BUF_COMPRESSED(buf) ? 1653 HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); 1654 } 1655 1656 #define ARC_MINTIME (hz>>4) /* 62 ms */ 1657 1658 /* 1659 * Return the compression algorithm used to store this data in the ARC. If ARC 1660 * compression is enabled or this is an encrypted block, this will be the same 1661 * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. 1662 */ 1663 static inline enum zio_compress 1664 arc_hdr_get_compress(arc_buf_hdr_t *hdr) 1665 { 1666 return (HDR_COMPRESSION_ENABLED(hdr) ? 1667 HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); 1668 } 1669 1670 static inline boolean_t 1671 arc_buf_is_shared(arc_buf_t *buf) 1672 { 1673 boolean_t shared = (buf->b_data != NULL && 1674 buf->b_hdr->b_l1hdr.b_pabd != NULL && 1675 abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && 1676 buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); 1677 IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); 1678 IMPLY(shared, ARC_BUF_SHARED(buf)); 1679 IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); 1680 1681 /* 1682 * It would be nice to assert arc_can_share() too, but the "hdr isn't 1683 * already being shared" requirement prevents us from doing that. 1684 */ 1685 1686 return (shared); 1687 } 1688 1689 /* 1690 * Free the checksum associated with this header. If there is no checksum, this 1691 * is a no-op. 1692 */ 1693 static inline void 1694 arc_cksum_free(arc_buf_hdr_t *hdr) 1695 { 1696 ASSERT(HDR_HAS_L1HDR(hdr)); 1697 1698 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1699 if (hdr->b_l1hdr.b_freeze_cksum != NULL) { 1700 kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); 1701 hdr->b_l1hdr.b_freeze_cksum = NULL; 1702 } 1703 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1704 } 1705 1706 /* 1707 * Return true iff at least one of the bufs on hdr is not compressed. 1708 * Encrypted buffers count as compressed. 
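 *
 * For example (illustrative): a hdr whose b_buf list holds one compressed
 * and one uncompressed arc_buf_t makes this return B_TRUE, while a hdr
 * holding only encrypted bufs (which always count as compressed) makes it
 * return B_FALSE.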
1709 */ 1710 static boolean_t 1711 arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) 1712 { 1713 ASSERT(hdr->b_l1hdr.b_state == arc_anon || 1714 MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 1715 1716 for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { 1717 if (!ARC_BUF_COMPRESSED(b)) { 1718 return (B_TRUE); 1719 } 1720 } 1721 return (B_FALSE); 1722 } 1723 1724 /* 1725 * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data 1726 * matches the checksum that is stored in the hdr. If there is no checksum, 1727 * or if the buf is compressed, this is a no-op. 1728 */ 1729 static void 1730 arc_cksum_verify(arc_buf_t *buf) 1731 { 1732 arc_buf_hdr_t *hdr = buf->b_hdr; 1733 zio_cksum_t zc; 1734 1735 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1736 return; 1737 1738 if (ARC_BUF_COMPRESSED(buf)) 1739 return; 1740 1741 ASSERT(HDR_HAS_L1HDR(hdr)); 1742 1743 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1744 1745 if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { 1746 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1747 return; 1748 } 1749 1750 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); 1751 if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) 1752 panic("buffer modified while frozen!"); 1753 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1754 } 1755 1756 /* 1757 * This function makes the assumption that data stored in the L2ARC 1758 * will be transformed exactly as it is in the main pool. Because of 1759 * this we can verify the checksum against the reading process's bp. 1760 */ 1761 static boolean_t 1762 arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) 1763 { 1764 enum zio_compress compress = BP_GET_COMPRESS(zio->io_bp); 1765 boolean_t valid_cksum; 1766 1767 ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); 1768 VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); 1769 1770 /* 1771 * We rely on the blkptr's checksum to determine if the block 1772 * is valid or not. When compressed arc is enabled, the l2arc 1773 * writes the block to the l2arc just as it appears in the pool. 1774 * This allows us to use the blkptr's checksum to validate the 1775 * data that we just read off of the l2arc without having to store 1776 * a separate checksum in the arc_buf_hdr_t. However, if compressed 1777 * arc is disabled, then the data written to the l2arc is always 1778 * uncompressed and won't match the block as it exists in the main 1779 * pool. When this is the case, we must first compress it if it is 1780 * compressed on the main pool before we can validate the checksum. 1781 */ 1782 if (!HDR_COMPRESSION_ENABLED(hdr) && compress != ZIO_COMPRESS_OFF) { 1783 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 1784 uint64_t lsize = HDR_GET_LSIZE(hdr); 1785 uint64_t csize; 1786 1787 abd_t *cdata = abd_alloc_linear(HDR_GET_PSIZE(hdr), B_TRUE); 1788 csize = zio_compress_data(compress, zio->io_abd, 1789 abd_to_buf(cdata), lsize); 1790 1791 ASSERT3U(csize, <=, HDR_GET_PSIZE(hdr)); 1792 if (csize < HDR_GET_PSIZE(hdr)) { 1793 /* 1794 * Compressed blocks are always a multiple of the 1795 * smallest ashift in the pool. Ideally, we would 1796 * like to round up the csize to the next 1797 * spa_min_ashift but that value may have changed 1798 * since the block was last written. Instead, 1799 * we rely on the fact that the hdr's psize 1800 * was set to the psize of the block when it was 1801 * last written. We set the csize to that value 1802 * and zero out any part that should not contain 1803 * data. 
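 *
 * As a worked example (illustrative values): if the hdr's psize is 8192
 * bytes and recompressing the data yields csize == 7680, the call below
 * zeroes the trailing 512 bytes of cdata and bumps csize to 8192, so the
 * recompressed abd matches the block exactly as it was last written.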
1804 */ 1805 abd_zero_off(cdata, csize, HDR_GET_PSIZE(hdr) - csize); 1806 csize = HDR_GET_PSIZE(hdr); 1807 } 1808 zio_push_transform(zio, cdata, csize, HDR_GET_PSIZE(hdr), NULL); 1809 } 1810 1811 /* 1812 * Block pointers always store the checksum for the logical data. 1813 * If the block pointer has the gang bit set, then the checksum 1814 * it represents is for the reconstituted data and not for an 1815 * individual gang member. The zio pipeline, however, must be able to 1816 * determine the checksum of each of the gang constituents so it 1817 * treats the checksum comparison differently than what we need 1818 * for l2arc blocks. This prevents us from using the 1819 * zio_checksum_error() interface directly. Instead we must call the 1820 * zio_checksum_error_impl() so that we can ensure the checksum is 1821 * generated using the correct checksum algorithm and accounts for the 1822 * logical I/O size and not just a gang fragment. 1823 */ 1824 valid_cksum = (zio_checksum_error_impl(zio->io_spa, zio->io_bp, 1825 BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, 1826 zio->io_offset, NULL) == 0); 1827 zio_pop_transforms(zio); 1828 return (valid_cksum); 1829 } 1830 1831 /* 1832 * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a 1833 * checksum and attaches it to the buf's hdr so that we can ensure that the buf 1834 * isn't modified later on. If buf is compressed or there is already a checksum 1835 * on the hdr, this is a no-op (we only checksum uncompressed bufs). 1836 */ 1837 static void 1838 arc_cksum_compute(arc_buf_t *buf) 1839 { 1840 arc_buf_hdr_t *hdr = buf->b_hdr; 1841 1842 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1843 return; 1844 1845 ASSERT(HDR_HAS_L1HDR(hdr)); 1846 1847 mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); 1848 if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { 1849 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1850 return; 1851 } 1852 1853 ASSERT(!ARC_BUF_ENCRYPTED(buf)); 1854 ASSERT(!ARC_BUF_COMPRESSED(buf)); 1855 hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), 1856 KM_SLEEP); 1857 fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, 1858 hdr->b_l1hdr.b_freeze_cksum); 1859 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1860 arc_buf_watch(buf); 1861 } 1862 1863 #ifndef _KERNEL 1864 typedef struct procctl { 1865 long cmd; 1866 prwatch_t prwatch; 1867 } procctl_t; 1868 #endif 1869 1870 /* ARGSUSED */ 1871 static void 1872 arc_buf_unwatch(arc_buf_t *buf) 1873 { 1874 #ifndef _KERNEL 1875 if (arc_watch) { 1876 int result; 1877 procctl_t ctl; 1878 ctl.cmd = PCWATCH; 1879 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1880 ctl.prwatch.pr_size = 0; 1881 ctl.prwatch.pr_wflags = 0; 1882 result = write(arc_procfd, &ctl, sizeof (ctl)); 1883 ASSERT3U(result, ==, sizeof (ctl)); 1884 } 1885 #endif 1886 } 1887 1888 /* ARGSUSED */ 1889 static void 1890 arc_buf_watch(arc_buf_t *buf) 1891 { 1892 #ifndef _KERNEL 1893 if (arc_watch) { 1894 int result; 1895 procctl_t ctl; 1896 ctl.cmd = PCWATCH; 1897 ctl.prwatch.pr_vaddr = (uintptr_t)buf->b_data; 1898 ctl.prwatch.pr_size = arc_buf_size(buf); 1899 ctl.prwatch.pr_wflags = WA_WRITE; 1900 result = write(arc_procfd, &ctl, sizeof (ctl)); 1901 ASSERT3U(result, ==, sizeof (ctl)); 1902 } 1903 #endif 1904 } 1905 1906 static arc_buf_contents_t 1907 arc_buf_type(arc_buf_hdr_t *hdr) 1908 { 1909 arc_buf_contents_t type; 1910 if (HDR_ISTYPE_METADATA(hdr)) { 1911 type = ARC_BUFC_METADATA; 1912 } else { 1913 type = ARC_BUFC_DATA; 1914 } 1915 VERIFY3U(hdr->b_type, ==, type); 1916 return (type); 1917 } 1918 
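/*
 * Illustrative note on the size and compression accessors above (example
 * values, added for clarity): consider a block with lsize = 128K that
 * lz4-compressed to psize = 16K and is cached with compressed ARC enabled.
 * Then:
 *
 *	arc_buf_lsize(buf)		== 128K for every buf on the hdr
 *	arc_buf_size(buf)		== 16K if ARC_BUF_COMPRESSED(buf),
 *					   128K otherwise
 *	arc_hdr_get_compress(hdr)	== ZIO_COMPRESS_LZ4
 *	arc_get_compression(buf)	== ZIO_COMPRESS_LZ4 only while the
 *					   buf itself still holds compressed
 *					   data, ZIO_COMPRESS_OFF otherwise
 */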
1919 boolean_t 1920 arc_is_metadata(arc_buf_t *buf) 1921 { 1922 return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); 1923 } 1924 1925 static uint32_t 1926 arc_bufc_to_flags(arc_buf_contents_t type) 1927 { 1928 switch (type) { 1929 case ARC_BUFC_DATA: 1930 /* metadata field is 0 if buffer contains normal data */ 1931 return (0); 1932 case ARC_BUFC_METADATA: 1933 return (ARC_FLAG_BUFC_METADATA); 1934 default: 1935 break; 1936 } 1937 panic("undefined ARC buffer type!"); 1938 return ((uint32_t)-1); 1939 } 1940 1941 void 1942 arc_buf_thaw(arc_buf_t *buf) 1943 { 1944 arc_buf_hdr_t *hdr = buf->b_hdr; 1945 1946 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 1947 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 1948 1949 arc_cksum_verify(buf); 1950 1951 /* 1952 * Compressed buffers do not manipulate the b_freeze_cksum. 1953 */ 1954 if (ARC_BUF_COMPRESSED(buf)) 1955 return; 1956 1957 ASSERT(HDR_HAS_L1HDR(hdr)); 1958 arc_cksum_free(hdr); 1959 1960 mutex_enter(&hdr->b_l1hdr.b_freeze_lock); 1961 #ifdef ZFS_DEBUG 1962 if (zfs_flags & ZFS_DEBUG_MODIFY) { 1963 if (hdr->b_l1hdr.b_thawed != NULL) 1964 kmem_free(hdr->b_l1hdr.b_thawed, 1); 1965 hdr->b_l1hdr.b_thawed = kmem_alloc(1, KM_SLEEP); 1966 } 1967 #endif 1968 1969 mutex_exit(&hdr->b_l1hdr.b_freeze_lock); 1970 1971 arc_buf_unwatch(buf); 1972 } 1973 1974 void 1975 arc_buf_freeze(arc_buf_t *buf) 1976 { 1977 if (!(zfs_flags & ZFS_DEBUG_MODIFY)) 1978 return; 1979 1980 if (ARC_BUF_COMPRESSED(buf)) 1981 return; 1982 1983 ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); 1984 arc_cksum_compute(buf); 1985 } 1986 1987 /* 1988 * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, 1989 * the following functions should be used to ensure that the flags are 1990 * updated in a thread-safe way. When manipulating the flags either 1991 * the hash_lock must be held or the hdr must be undiscoverable. This 1992 * ensures that we're not racing with any other threads when updating 1993 * the flags. 1994 */ 1995 static inline void 1996 arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 1997 { 1998 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 1999 hdr->b_flags |= flags; 2000 } 2001 2002 static inline void 2003 arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) 2004 { 2005 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2006 hdr->b_flags &= ~flags; 2007 } 2008 2009 /* 2010 * Setting the compression bits in the arc_buf_hdr_t's b_flags is 2011 * done in a special way since we have to clear and set bits 2012 * at the same time. Consumers that wish to set the compression bits 2013 * must use this function to ensure that the flags are updated in 2014 * thread-safe manner. 2015 */ 2016 static void 2017 arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) 2018 { 2019 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2020 2021 /* 2022 * Holes and embedded blocks will always have a psize = 0 so 2023 * we ignore the compression of the blkptr and set the 2024 * arc_buf_hdr_t's compression to ZIO_COMPRESS_OFF. 2025 * Holes and embedded blocks remain anonymous so we don't 2026 * want to uncompress them. Mark them as uncompressed. 
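 *
 * For example (illustrative): a regular lz4-compressed block with
 * psize != 0 keeps ARC_FLAG_COMPRESSED_ARC set when
 * zfs_compressed_arc_enabled is on, while a hole or embedded block
 * (psize == 0) has the flag cleared. HDR_SET_COMPRESS() still records
 * cmp in either case, but arc_hdr_get_compress() reports
 * ZIO_COMPRESS_OFF whenever HDR_COMPRESSION_ENABLED() is false.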
2027 */ 2028 if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { 2029 arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2030 ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); 2031 } else { 2032 arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); 2033 ASSERT(HDR_COMPRESSION_ENABLED(hdr)); 2034 } 2035 2036 HDR_SET_COMPRESS(hdr, cmp); 2037 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); 2038 } 2039 2040 /* 2041 * Looks for another buf on the same hdr which has the data decompressed, copies 2042 * from it, and returns true. If no such buf exists, returns false. 2043 */ 2044 static boolean_t 2045 arc_buf_try_copy_decompressed_data(arc_buf_t *buf) 2046 { 2047 arc_buf_hdr_t *hdr = buf->b_hdr; 2048 boolean_t copied = B_FALSE; 2049 2050 ASSERT(HDR_HAS_L1HDR(hdr)); 2051 ASSERT3P(buf->b_data, !=, NULL); 2052 ASSERT(!ARC_BUF_COMPRESSED(buf)); 2053 2054 for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; 2055 from = from->b_next) { 2056 /* can't use our own data buffer */ 2057 if (from == buf) { 2058 continue; 2059 } 2060 2061 if (!ARC_BUF_COMPRESSED(from)) { 2062 bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); 2063 copied = B_TRUE; 2064 break; 2065 } 2066 } 2067 2068 /* 2069 * Note: With encryption support, the following assertion is no longer 2070 * necessarily valid. If we receive two back to back raw snapshots 2071 * (send -w), the second receive can use a hdr with a cksum already 2072 * calculated. This happens via: 2073 * dmu_recv_stream() -> receive_read_record() -> arc_loan_raw_buf() 2074 * The rsend/send_mixed_raw test case exercises this code path. 2075 * 2076 * There were no decompressed bufs, so there should not be a 2077 * checksum on the hdr either. 2078 * EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); 2079 */ 2080 2081 return (copied); 2082 } 2083 2084 /* 2085 * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 2086 */ 2087 static uint64_t 2088 arc_hdr_size(arc_buf_hdr_t *hdr) 2089 { 2090 uint64_t size; 2091 2092 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && 2093 HDR_GET_PSIZE(hdr) > 0) { 2094 size = HDR_GET_PSIZE(hdr); 2095 } else { 2096 ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); 2097 size = HDR_GET_LSIZE(hdr); 2098 } 2099 return (size); 2100 } 2101 2102 static int 2103 arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) 2104 { 2105 int ret; 2106 uint64_t csize; 2107 uint64_t lsize = HDR_GET_LSIZE(hdr); 2108 uint64_t psize = HDR_GET_PSIZE(hdr); 2109 void *tmpbuf = NULL; 2110 abd_t *abd = hdr->b_l1hdr.b_pabd; 2111 2112 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2113 ASSERT(HDR_AUTHENTICATED(hdr)); 2114 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2115 2116 /* 2117 * The MAC is calculated on the compressed data that is stored on disk. 2118 * However, if compressed arc is disabled we will only have the 2119 * decompressed data available to us now. Compress it into a temporary 2120 * abd so we can verify the MAC. The performance overhead of this will 2121 * be relatively low, since most objects in an encrypted objset will 2122 * be encrypted (instead of authenticated) anyway. 2123 */ 2124 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 2125 !HDR_COMPRESSION_ENABLED(hdr)) { 2126 tmpbuf = zio_buf_alloc(lsize); 2127 abd = abd_get_from_buf(tmpbuf, lsize); 2128 abd_take_ownership_of_buf(abd, B_TRUE); 2129 2130 csize = zio_compress_data(HDR_GET_COMPRESS(hdr), 2131 hdr->b_l1hdr.b_pabd, tmpbuf, lsize); 2132 ASSERT3U(csize, <=, psize); 2133 abd_zero_off(abd, csize, psize - csize); 2134 } 2135 2136 /* 2137 * Authentication is best effort. 
We authenticate whenever the key is 2138 * available. If we succeed we clear ARC_FLAG_NOAUTH. 2139 */ 2140 if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { 2141 ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); 2142 ASSERT3U(lsize, ==, psize); 2143 ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, 2144 psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); 2145 } else { 2146 ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, 2147 hdr->b_crypt_hdr.b_mac); 2148 } 2149 2150 if (ret == 0) 2151 arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); 2152 else if (ret != ENOENT) 2153 goto error; 2154 2155 if (tmpbuf != NULL) 2156 abd_free(abd); 2157 2158 return (0); 2159 2160 error: 2161 if (tmpbuf != NULL) 2162 abd_free(abd); 2163 2164 return (ret); 2165 } 2166 2167 /* 2168 * This function will take a header that only has raw encrypted data in 2169 * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in 2170 * b_l1hdr.b_pabd. If designated in the header flags, this function will 2171 * also decompress the data. 2172 */ 2173 static int 2174 arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) 2175 { 2176 int ret; 2177 abd_t *cabd = NULL; 2178 void *tmp = NULL; 2179 boolean_t no_crypt = B_FALSE; 2180 boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); 2181 2182 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2183 ASSERT(HDR_ENCRYPTED(hdr)); 2184 2185 arc_hdr_alloc_pabd(hdr, B_FALSE); 2186 2187 ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, 2188 B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, 2189 hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, 2190 hdr->b_crypt_hdr.b_rabd, &no_crypt); 2191 if (ret != 0) 2192 goto error; 2193 2194 if (no_crypt) { 2195 abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, 2196 HDR_GET_PSIZE(hdr)); 2197 } 2198 2199 /* 2200 * If this header has disabled arc compression but the b_pabd is 2201 * compressed after decrypting it, we need to decompress the newly 2202 * decrypted data. 2203 */ 2204 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 2205 !HDR_COMPRESSION_ENABLED(hdr)) { 2206 /* 2207 * We want to make sure that we are correctly honoring the 2208 * zfs_abd_scatter_enabled setting, so we allocate an abd here 2209 * and then loan a buffer from it, rather than allocating a 2210 * linear buffer and wrapping it in an abd later. 2211 */ 2212 cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); 2213 tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); 2214 2215 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), 2216 hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), 2217 HDR_GET_LSIZE(hdr)); 2218 if (ret != 0) { 2219 abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); 2220 goto error; 2221 } 2222 2223 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); 2224 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 2225 arc_hdr_size(hdr), hdr); 2226 hdr->b_l1hdr.b_pabd = cabd; 2227 } 2228 2229 return (0); 2230 2231 error: 2232 arc_hdr_free_pabd(hdr, B_FALSE); 2233 if (cabd != NULL) 2234 arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); 2235 2236 return (ret); 2237 } 2238 2239 /* 2240 * This function is called during arc_buf_fill() to prepare the header's 2241 * abd plaintext pointer for use. This involves authenticated protected 2242 * data and decrypting encrypted data into the plaintext abd. 
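 *
 * A sketch of the two cases handled below (added for clarity):
 *
 *	HDR_NOAUTH(hdr) && !noauth		-> verify the MAC via
 *						   arc_hdr_authenticate()
 *	HDR_HAS_RABD(hdr) && b_pabd == NULL	-> decrypt b_rabd into a
 *						   fresh b_pabd via
 *						   arc_hdr_decrypt()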
2243 */ 2244 static int 2245 arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, 2246 const zbookmark_phys_t *zb, boolean_t noauth) 2247 { 2248 int ret; 2249 2250 ASSERT(HDR_PROTECTED(hdr)); 2251 2252 if (hash_lock != NULL) 2253 mutex_enter(hash_lock); 2254 2255 if (HDR_NOAUTH(hdr) && !noauth) { 2256 /* 2257 * The caller requested authenticated data but our data has 2258 * not been authenticated yet. Verify the MAC now if we can. 2259 */ 2260 ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); 2261 if (ret != 0) 2262 goto error; 2263 } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { 2264 /* 2265 * If we only have the encrypted version of the data, but the 2266 * unencrypted version was requested we take this opportunity 2267 * to store the decrypted version in the header for future use. 2268 */ 2269 ret = arc_hdr_decrypt(hdr, spa, zb); 2270 if (ret != 0) 2271 goto error; 2272 } 2273 2274 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2275 2276 if (hash_lock != NULL) 2277 mutex_exit(hash_lock); 2278 2279 return (0); 2280 2281 error: 2282 if (hash_lock != NULL) 2283 mutex_exit(hash_lock); 2284 2285 return (ret); 2286 } 2287 2288 /* 2289 * This function is used by the dbuf code to decrypt bonus buffers in place. 2290 * The dbuf code itself doesn't have any locking for decrypting a shared dnode 2291 * block, so we use the hash lock here to protect against concurrent calls to 2292 * arc_buf_fill(). 2293 */ 2294 /* ARGSUSED */ 2295 static void 2296 arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) 2297 { 2298 arc_buf_hdr_t *hdr = buf->b_hdr; 2299 2300 ASSERT(HDR_ENCRYPTED(hdr)); 2301 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); 2302 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 2303 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 2304 2305 zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, 2306 arc_buf_size(buf)); 2307 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; 2308 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 2309 hdr->b_crypt_hdr.b_ebufcnt -= 1; 2310 } 2311 2312 /* 2313 * Given a buf that has a data buffer attached to it, this function will 2314 * efficiently fill the buf with data of the specified compression setting from 2315 * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr 2316 * are already sharing a data buf, no copy is performed. 2317 * 2318 * If the buf is marked as compressed but uncompressed data was requested, this 2319 * will allocate a new data buffer for the buf, remove that flag, and fill the 2320 * buf with uncompressed data. You can't request a compressed buf on a hdr with 2321 * uncompressed data, and (since we haven't added support for it yet) if you 2322 * want compressed data your buf must already be marked as compressed and have 2323 * the correct-sized data buffer. 2324 */ 2325 static int 2326 arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, 2327 arc_fill_flags_t flags) 2328 { 2329 int error = 0; 2330 arc_buf_hdr_t *hdr = buf->b_hdr; 2331 boolean_t hdr_compressed = 2332 (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); 2333 boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; 2334 boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; 2335 dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; 2336 kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? 
NULL : HDR_LOCK(hdr); 2337 2338 ASSERT3P(buf->b_data, !=, NULL); 2339 IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); 2340 IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); 2341 IMPLY(encrypted, HDR_ENCRYPTED(hdr)); 2342 IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); 2343 IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); 2344 IMPLY(encrypted, !ARC_BUF_SHARED(buf)); 2345 2346 /* 2347 * If the caller wanted encrypted data we just need to copy it from 2348 * b_rabd and potentially byteswap it. We won't be able to do any 2349 * further transforms on it. 2350 */ 2351 if (encrypted) { 2352 ASSERT(HDR_HAS_RABD(hdr)); 2353 abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, 2354 HDR_GET_PSIZE(hdr)); 2355 goto byteswap; 2356 } 2357 2358 /* 2359 * Adjust encrypted and authenticated headers to accomodate 2360 * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are 2361 * allowed to fail decryption due to keys not being loaded 2362 * without being marked as an IO error. 2363 */ 2364 if (HDR_PROTECTED(hdr)) { 2365 error = arc_fill_hdr_crypt(hdr, hash_lock, spa, 2366 zb, !!(flags & ARC_FILL_NOAUTH)); 2367 if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { 2368 return (error); 2369 } else if (error != 0) { 2370 if (hash_lock != NULL) 2371 mutex_enter(hash_lock); 2372 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 2373 if (hash_lock != NULL) 2374 mutex_exit(hash_lock); 2375 return (error); 2376 } 2377 } 2378 2379 /* 2380 * There is a special case here for dnode blocks which are 2381 * decrypting their bonus buffers. These blocks may request to 2382 * be decrypted in-place. This is necessary because there may 2383 * be many dnodes pointing into this buffer and there is 2384 * currently no method to synchronize replacing the backing 2385 * b_data buffer and updating all of the pointers. Here we use 2386 * the hash lock to ensure there are no races. If the need 2387 * arises for other types to be decrypted in-place, they must 2388 * add handling here as well. 2389 */ 2390 if ((flags & ARC_FILL_IN_PLACE) != 0) { 2391 ASSERT(!hdr_compressed); 2392 ASSERT(!compressed); 2393 ASSERT(!encrypted); 2394 2395 if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { 2396 ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); 2397 2398 if (hash_lock != NULL) 2399 mutex_enter(hash_lock); 2400 arc_buf_untransform_in_place(buf, hash_lock); 2401 if (hash_lock != NULL) 2402 mutex_exit(hash_lock); 2403 2404 /* Compute the hdr's checksum if necessary */ 2405 arc_cksum_compute(buf); 2406 } 2407 2408 return (0); 2409 } 2410 2411 if (hdr_compressed == compressed) { 2412 if (!arc_buf_is_shared(buf)) { 2413 abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, 2414 arc_buf_size(buf)); 2415 } 2416 } else { 2417 ASSERT(hdr_compressed); 2418 ASSERT(!compressed); 2419 ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); 2420 2421 /* 2422 * If the buf is sharing its data with the hdr, unlink it and 2423 * allocate a new data buffer for the buf. 
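 * (In this branch the hdr's b_pabd holds compressed bytes, so it cannot
 * continue to back a buf that must now present uncompressed data.)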
2424 */ 2425 if (arc_buf_is_shared(buf)) { 2426 ASSERT(ARC_BUF_COMPRESSED(buf)); 2427 2428 /* We need to give the buf its own b_data */ 2429 buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 2430 buf->b_data = 2431 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2432 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 2433 2434 /* Previously overhead was 0; just add new overhead */ 2435 ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); 2436 } else if (ARC_BUF_COMPRESSED(buf)) { 2437 /* We need to reallocate the buf's b_data */ 2438 arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), 2439 buf); 2440 buf->b_data = 2441 arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); 2442 2443 /* We increased the size of b_data; update overhead */ 2444 ARCSTAT_INCR(arcstat_overhead_size, 2445 HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); 2446 } 2447 2448 /* 2449 * Regardless of the buf's previous compression settings, it 2450 * should not be compressed at the end of this function. 2451 */ 2452 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 2453 2454 /* 2455 * Try copying the data from another buf which already has a 2456 * decompressed version. If that's not possible, it's time to 2457 * bite the bullet and decompress the data from the hdr. 2458 */ 2459 if (arc_buf_try_copy_decompressed_data(buf)) { 2460 /* Skip byteswapping and checksumming (already done) */ 2461 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); 2462 return (0); 2463 } else { 2464 error = zio_decompress_data(HDR_GET_COMPRESS(hdr), 2465 hdr->b_l1hdr.b_pabd, buf->b_data, 2466 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2467 2468 /* 2469 * Absent hardware errors or software bugs, this should 2470 * be impossible, but log it anyway so we can debug it. 2471 */ 2472 if (error != 0) { 2473 zfs_dbgmsg( 2474 "hdr %p, compress %d, psize %d, lsize %d", 2475 hdr, arc_hdr_get_compress(hdr), 2476 HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); 2477 if (hash_lock != NULL) 2478 mutex_enter(hash_lock); 2479 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 2480 if (hash_lock != NULL) 2481 mutex_exit(hash_lock); 2482 return (SET_ERROR(EIO)); 2483 } 2484 } 2485 } 2486 2487 byteswap: 2488 /* Byteswap the buf's data if necessary */ 2489 if (bswap != DMU_BSWAP_NUMFUNCS) { 2490 ASSERT(!HDR_SHARED_DATA(hdr)); 2491 ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); 2492 dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); 2493 } 2494 2495 /* Compute the hdr's checksum if necessary */ 2496 arc_cksum_compute(buf); 2497 2498 return (0); 2499 } 2500 2501 /* 2502 * If this function is being called to decrypt an encrypted buffer or verify an 2503 * authenticated one, the key must be loaded and a mapping must be made 2504 * available in the keystore via spa_keystore_create_mapping() or one of its 2505 * callers. 2506 */ 2507 int 2508 arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, 2509 boolean_t in_place) 2510 { 2511 int ret; 2512 arc_fill_flags_t flags = 0; 2513 2514 if (in_place) 2515 flags |= ARC_FILL_IN_PLACE; 2516 2517 ret = arc_buf_fill(buf, spa, zb, flags); 2518 if (ret == ECKSUM) { 2519 /* 2520 * Convert authentication and decryption errors to EIO 2521 * (and generate an ereport) before leaving the ARC. 2522 */ 2523 ret = SET_ERROR(EIO); 2524 spa_log_error(spa, zb); 2525 zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, 2526 spa, NULL, zb, NULL, 0, 0); 2527 } 2528 2529 return (ret); 2530 } 2531 2532 /* 2533 * Increment the amount of evictable space in the arc_state_t's refcount. 
2534 * We account for the space used by the hdr and the arc buf individually 2535 * so that we can add and remove them from the refcount individually. 2536 */ 2537 static void 2538 arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) 2539 { 2540 arc_buf_contents_t type = arc_buf_type(hdr); 2541 2542 ASSERT(HDR_HAS_L1HDR(hdr)); 2543 2544 if (GHOST_STATE(state)) { 2545 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2546 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2547 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2548 ASSERT(!HDR_HAS_RABD(hdr)); 2549 (void) zfs_refcount_add_many(&state->arcs_esize[type], 2550 HDR_GET_LSIZE(hdr), hdr); 2551 return; 2552 } 2553 2554 ASSERT(!GHOST_STATE(state)); 2555 if (hdr->b_l1hdr.b_pabd != NULL) { 2556 (void) zfs_refcount_add_many(&state->arcs_esize[type], 2557 arc_hdr_size(hdr), hdr); 2558 } 2559 if (HDR_HAS_RABD(hdr)) { 2560 (void) zfs_refcount_add_many(&state->arcs_esize[type], 2561 HDR_GET_PSIZE(hdr), hdr); 2562 } 2563 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2564 buf = buf->b_next) { 2565 if (arc_buf_is_shared(buf)) 2566 continue; 2567 (void) zfs_refcount_add_many(&state->arcs_esize[type], 2568 arc_buf_size(buf), buf); 2569 } 2570 } 2571 2572 /* 2573 * Decrement the amount of evictable space in the arc_state_t's refcount. 2574 * We account for the space used by the hdr and the arc buf individually 2575 * so that we can add and remove them from the refcount individually. 2576 */ 2577 static void 2578 arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) 2579 { 2580 arc_buf_contents_t type = arc_buf_type(hdr); 2581 2582 ASSERT(HDR_HAS_L1HDR(hdr)); 2583 2584 if (GHOST_STATE(state)) { 2585 ASSERT0(hdr->b_l1hdr.b_bufcnt); 2586 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2587 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2588 ASSERT(!HDR_HAS_RABD(hdr)); 2589 (void) zfs_refcount_remove_many(&state->arcs_esize[type], 2590 HDR_GET_LSIZE(hdr), hdr); 2591 return; 2592 } 2593 2594 ASSERT(!GHOST_STATE(state)); 2595 if (hdr->b_l1hdr.b_pabd != NULL) { 2596 (void) zfs_refcount_remove_many(&state->arcs_esize[type], 2597 arc_hdr_size(hdr), hdr); 2598 } 2599 if (HDR_HAS_RABD(hdr)) { 2600 (void) zfs_refcount_remove_many(&state->arcs_esize[type], 2601 HDR_GET_PSIZE(hdr), hdr); 2602 } 2603 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2604 buf = buf->b_next) { 2605 if (arc_buf_is_shared(buf)) 2606 continue; 2607 (void) zfs_refcount_remove_many(&state->arcs_esize[type], 2608 arc_buf_size(buf), buf); 2609 } 2610 } 2611 2612 /* 2613 * Add a reference to this hdr indicating that someone is actively 2614 * referencing that memory. When the refcount transitions from 0 to 1, 2615 * we remove it from the respective arc_state_t list to indicate that 2616 * it is not evictable. 2617 */ 2618 static void 2619 add_reference(arc_buf_hdr_t *hdr, void *tag) 2620 { 2621 ASSERT(HDR_HAS_L1HDR(hdr)); 2622 if (!MUTEX_HELD(HDR_LOCK(hdr))) { 2623 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 2624 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 2625 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2626 } 2627 2628 arc_state_t *state = hdr->b_l1hdr.b_state; 2629 2630 if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && 2631 (state != arc_anon)) { 2632 /* We don't use the L2-only state list. 
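Headers in arc_l2c_only have no L1 header and are never linked onto an eviction list; see the assertion at the end of arc_change_state().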
*/ 2633 if (state != arc_l2c_only) { 2634 multilist_remove(state->arcs_list[arc_buf_type(hdr)], 2635 hdr); 2636 arc_evictable_space_decrement(hdr, state); 2637 } 2638 /* remove the prefetch flag if we get a reference */ 2639 arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); 2640 } 2641 } 2642 2643 /* 2644 * Remove a reference from this hdr. When the reference transitions from 2645 * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's 2646 * list making it eligible for eviction. 2647 */ 2648 static int 2649 remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) 2650 { 2651 int cnt; 2652 arc_state_t *state = hdr->b_l1hdr.b_state; 2653 2654 ASSERT(HDR_HAS_L1HDR(hdr)); 2655 ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); 2656 ASSERT(!GHOST_STATE(state)); 2657 2658 /* 2659 * arc_l2c_only counts as a ghost state so we don't need to explicitly 2660 * check to prevent usage of the arc_l2c_only list. 2661 */ 2662 if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && 2663 (state != arc_anon)) { 2664 multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); 2665 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 2666 arc_evictable_space_increment(hdr, state); 2667 } 2668 return (cnt); 2669 } 2670 2671 /* 2672 * Move the supplied buffer to the indicated state. The hash lock 2673 * for the buffer must be held by the caller. 2674 */ 2675 static void 2676 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, 2677 kmutex_t *hash_lock) 2678 { 2679 arc_state_t *old_state; 2680 int64_t refcnt; 2681 uint32_t bufcnt; 2682 boolean_t update_old, update_new; 2683 arc_buf_contents_t buftype = arc_buf_type(hdr); 2684 2685 /* 2686 * We almost always have an L1 hdr here, since we call arc_hdr_realloc() 2687 * in arc_read() when bringing a buffer out of the L2ARC. However, the 2688 * L1 hdr doesn't always exist when we change state to arc_anon before 2689 * destroying a header, in which case reallocating to add the L1 hdr is 2690 * pointless. 2691 */ 2692 if (HDR_HAS_L1HDR(hdr)) { 2693 old_state = hdr->b_l1hdr.b_state; 2694 refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); 2695 bufcnt = hdr->b_l1hdr.b_bufcnt; 2696 2697 update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || 2698 HDR_HAS_RABD(hdr)); 2699 } else { 2700 old_state = arc_l2c_only; 2701 refcnt = 0; 2702 bufcnt = 0; 2703 update_old = B_FALSE; 2704 } 2705 update_new = update_old; 2706 2707 ASSERT(MUTEX_HELD(hash_lock)); 2708 ASSERT3P(new_state, !=, old_state); 2709 ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); 2710 ASSERT(old_state != arc_anon || bufcnt <= 1); 2711 2712 /* 2713 * If this buffer is evictable, transfer it from the 2714 * old state list to the new state list. 2715 */ 2716 if (refcnt == 0) { 2717 if (old_state != arc_anon && old_state != arc_l2c_only) { 2718 ASSERT(HDR_HAS_L1HDR(hdr)); 2719 multilist_remove(old_state->arcs_list[buftype], hdr); 2720 2721 if (GHOST_STATE(old_state)) { 2722 ASSERT0(bufcnt); 2723 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2724 update_old = B_TRUE; 2725 } 2726 arc_evictable_space_decrement(hdr, old_state); 2727 } 2728 if (new_state != arc_anon && new_state != arc_l2c_only) { 2729 2730 /* 2731 * An L1 header always exists here, since if we're 2732 * moving to some L1-cached state (i.e. not l2c_only or 2733 * anonymous), we realloc the header to add an L1hdr 2734 * beforehand. 
2735 */ 2736 ASSERT(HDR_HAS_L1HDR(hdr)); 2737 multilist_insert(new_state->arcs_list[buftype], hdr); 2738 2739 if (GHOST_STATE(new_state)) { 2740 ASSERT0(bufcnt); 2741 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 2742 update_new = B_TRUE; 2743 } 2744 arc_evictable_space_increment(hdr, new_state); 2745 } 2746 } 2747 2748 ASSERT(!HDR_EMPTY(hdr)); 2749 if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) 2750 buf_hash_remove(hdr); 2751 2752 /* adjust state sizes (ignore arc_l2c_only) */ 2753 2754 if (update_new && new_state != arc_l2c_only) { 2755 ASSERT(HDR_HAS_L1HDR(hdr)); 2756 if (GHOST_STATE(new_state)) { 2757 ASSERT0(bufcnt); 2758 2759 /* 2760 * When moving a header to a ghost state, we first 2761 * remove all arc buffers. Thus, we'll have a 2762 * bufcnt of zero, and no arc buffer to use for 2763 * the reference. As a result, we use the arc 2764 * header pointer for the reference. 2765 */ 2766 (void) zfs_refcount_add_many(&new_state->arcs_size, 2767 HDR_GET_LSIZE(hdr), hdr); 2768 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2769 ASSERT(!HDR_HAS_RABD(hdr)); 2770 } else { 2771 uint32_t buffers = 0; 2772 2773 /* 2774 * Each individual buffer holds a unique reference, 2775 * thus we must remove each of these references one 2776 * at a time. 2777 */ 2778 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2779 buf = buf->b_next) { 2780 ASSERT3U(bufcnt, !=, 0); 2781 buffers++; 2782 2783 /* 2784 * When the arc_buf_t is sharing the data 2785 * block with the hdr, the owner of the 2786 * reference belongs to the hdr. Only 2787 * add to the refcount if the arc_buf_t is 2788 * not shared. 2789 */ 2790 if (arc_buf_is_shared(buf)) 2791 continue; 2792 2793 (void) zfs_refcount_add_many( 2794 &new_state->arcs_size, 2795 arc_buf_size(buf), buf); 2796 } 2797 ASSERT3U(bufcnt, ==, buffers); 2798 2799 if (hdr->b_l1hdr.b_pabd != NULL) { 2800 (void) zfs_refcount_add_many( 2801 &new_state->arcs_size, 2802 arc_hdr_size(hdr), hdr); 2803 } 2804 2805 if (HDR_HAS_RABD(hdr)) { 2806 (void) zfs_refcount_add_many( 2807 &new_state->arcs_size, 2808 HDR_GET_PSIZE(hdr), hdr); 2809 } 2810 } 2811 } 2812 2813 if (update_old && old_state != arc_l2c_only) { 2814 ASSERT(HDR_HAS_L1HDR(hdr)); 2815 if (GHOST_STATE(old_state)) { 2816 ASSERT0(bufcnt); 2817 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 2818 ASSERT(!HDR_HAS_RABD(hdr)); 2819 2820 /* 2821 * When moving a header off of a ghost state, 2822 * the header will not contain any arc buffers. 2823 * We use the arc header pointer for the reference 2824 * which is exactly what we did when we put the 2825 * header on the ghost state. 2826 */ 2827 2828 (void) zfs_refcount_remove_many(&old_state->arcs_size, 2829 HDR_GET_LSIZE(hdr), hdr); 2830 } else { 2831 uint32_t buffers = 0; 2832 2833 /* 2834 * Each individual buffer holds a unique reference, 2835 * thus we must remove each of these references one 2836 * at a time. 2837 */ 2838 for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; 2839 buf = buf->b_next) { 2840 ASSERT3U(bufcnt, !=, 0); 2841 buffers++; 2842 2843 /* 2844 * When the arc_buf_t is sharing the data 2845 * block with the hdr, the owner of the 2846 * reference belongs to the hdr. Only 2847 * add to the refcount if the arc_buf_t is 2848 * not shared. 
2849 */ 2850 if (arc_buf_is_shared(buf)) 2851 continue; 2852 2853 (void) zfs_refcount_remove_many( 2854 &old_state->arcs_size, arc_buf_size(buf), 2855 buf); 2856 } 2857 ASSERT3U(bufcnt, ==, buffers); 2858 ASSERT(hdr->b_l1hdr.b_pabd != NULL || 2859 HDR_HAS_RABD(hdr)); 2860 2861 if (hdr->b_l1hdr.b_pabd != NULL) { 2862 (void) zfs_refcount_remove_many( 2863 &old_state->arcs_size, arc_hdr_size(hdr), 2864 hdr); 2865 } 2866 2867 if (HDR_HAS_RABD(hdr)) { 2868 (void) zfs_refcount_remove_many( 2869 &old_state->arcs_size, HDR_GET_PSIZE(hdr), 2870 hdr); 2871 } 2872 } 2873 } 2874 2875 if (HDR_HAS_L1HDR(hdr)) 2876 hdr->b_l1hdr.b_state = new_state; 2877 2878 /* 2879 * L2 headers should never be on the L2 state list since they don't 2880 * have L1 headers allocated. 2881 */ 2882 ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && 2883 multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); 2884 } 2885 2886 void 2887 arc_space_consume(uint64_t space, arc_space_type_t type) 2888 { 2889 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2890 2891 switch (type) { 2892 case ARC_SPACE_DATA: 2893 aggsum_add(&astat_data_size, space); 2894 break; 2895 case ARC_SPACE_META: 2896 aggsum_add(&astat_metadata_size, space); 2897 break; 2898 case ARC_SPACE_OTHER: 2899 aggsum_add(&astat_other_size, space); 2900 break; 2901 case ARC_SPACE_HDRS: 2902 aggsum_add(&astat_hdr_size, space); 2903 break; 2904 case ARC_SPACE_L2HDRS: 2905 aggsum_add(&astat_l2_hdr_size, space); 2906 break; 2907 } 2908 2909 if (type != ARC_SPACE_DATA) 2910 aggsum_add(&arc_meta_used, space); 2911 2912 aggsum_add(&arc_size, space); 2913 } 2914 2915 void 2916 arc_space_return(uint64_t space, arc_space_type_t type) 2917 { 2918 ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); 2919 2920 switch (type) { 2921 case ARC_SPACE_DATA: 2922 aggsum_add(&astat_data_size, -space); 2923 break; 2924 case ARC_SPACE_META: 2925 aggsum_add(&astat_metadata_size, -space); 2926 break; 2927 case ARC_SPACE_OTHER: 2928 aggsum_add(&astat_other_size, -space); 2929 break; 2930 case ARC_SPACE_HDRS: 2931 aggsum_add(&astat_hdr_size, -space); 2932 break; 2933 case ARC_SPACE_L2HDRS: 2934 aggsum_add(&astat_l2_hdr_size, -space); 2935 break; 2936 } 2937 2938 if (type != ARC_SPACE_DATA) { 2939 ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); 2940 /* 2941 * We use the upper bound here rather than the precise value 2942 * because the arc_meta_max value doesn't need to be 2943 * precise. It's only consumed by humans via arcstats. 2944 */ 2945 if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) 2946 arc_meta_max = aggsum_upper_bound(&arc_meta_used); 2947 aggsum_add(&arc_meta_used, -space); 2948 } 2949 2950 ASSERT(aggsum_compare(&arc_size, space) >= 0); 2951 aggsum_add(&arc_size, -space); 2952 } 2953 2954 /* 2955 * Given a hdr and a buf, returns whether that buf can share its b_data buffer 2956 * with the hdr's b_pabd. 2957 */ 2958 static boolean_t 2959 arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) 2960 { 2961 /* 2962 * The criteria for sharing a hdr's data are: 2963 * 1. the buffer is not encrypted 2964 * 2. the hdr's compression matches the buf's compression 2965 * 3. the hdr doesn't need to be byteswapped 2966 * 4. the hdr isn't already being shared 2967 * 5. the buf is either compressed or it is the last buf in the hdr list 2968 * 2969 * Criterion #5 maintains the invariant that shared uncompressed 2970 * bufs must be the final buf in the hdr's b_buf list. 
Reading this, you 2971 * might ask, "if a compressed buf is allocated first, won't that be the 2972 * last thing in the list?", but in that case it's impossible to create 2973 * a shared uncompressed buf anyway (because the hdr must be compressed 2974 * to have the compressed buf). You might also think that #3 is 2975 * sufficient to make this guarantee, however it's possible 2976 * (specifically in the rare L2ARC write race mentioned in 2977 * arc_buf_alloc_impl()) there will be an existing uncompressed buf that 2978 * is sharable, but wasn't at the time of its allocation. Rather than 2979 * allow a new shared uncompressed buf to be created and then shuffle 2980 * the list around to make it the last element, this simply disallows 2981 * sharing if the new buf isn't the first to be added. 2982 */ 2983 ASSERT3P(buf->b_hdr, ==, hdr); 2984 boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != 2985 ZIO_COMPRESS_OFF; 2986 boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; 2987 return (!ARC_BUF_ENCRYPTED(buf) && 2988 buf_compressed == hdr_compressed && 2989 hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && 2990 !HDR_SHARED_DATA(hdr) && 2991 (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); 2992 } 2993 2994 /* 2995 * Allocate a buf for this hdr. If you care about the data that's in the hdr, 2996 * or if you want a compressed buffer, pass those flags in. Returns 0 if the 2997 * copy was made successfully, or an error code otherwise. 2998 */ 2999 static int 3000 arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, 3001 void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, 3002 boolean_t fill, arc_buf_t **ret) 3003 { 3004 arc_buf_t *buf; 3005 arc_fill_flags_t flags = ARC_FILL_LOCKED; 3006 3007 ASSERT(HDR_HAS_L1HDR(hdr)); 3008 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 3009 VERIFY(hdr->b_type == ARC_BUFC_DATA || 3010 hdr->b_type == ARC_BUFC_METADATA); 3011 ASSERT3P(ret, !=, NULL); 3012 ASSERT3P(*ret, ==, NULL); 3013 IMPLY(encrypted, compressed); 3014 3015 buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); 3016 buf->b_hdr = hdr; 3017 buf->b_data = NULL; 3018 buf->b_next = hdr->b_l1hdr.b_buf; 3019 buf->b_flags = 0; 3020 3021 add_reference(hdr, tag); 3022 3023 /* 3024 * We're about to change the hdr's b_flags. We must either 3025 * hold the hash_lock or be undiscoverable. 3026 */ 3027 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3028 3029 /* 3030 * Only honor requests for compressed bufs if the hdr is actually 3031 * compressed. This must be overridden if the buffer is encrypted since 3032 * encrypted buffers cannot be decompressed. 3033 */ 3034 if (encrypted) { 3035 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; 3036 buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; 3037 flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; 3038 } else if (compressed && 3039 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { 3040 buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; 3041 flags |= ARC_FILL_COMPRESSED; 3042 } 3043 3044 if (noauth) { 3045 ASSERT0(encrypted); 3046 flags |= ARC_FILL_NOAUTH; 3047 } 3048 3049 /* 3050 * If the hdr's data can be shared then we share the data buffer and 3051 * set the appropriate bit in the hdr's b_flags to indicate the hdr is 3052 * sharing its b_pabd with the arc_buf_t. Otherwise, we allocate a new buffer to store the buf's data. 3053 * 3054 * There are two additional restrictions here because we're sharing 3055 * hdr -> buf instead of the usual buf -> hdr.
First, the hdr can't be 3056 * actively involved in an L2ARC write, because if this buf is used by 3057 * an arc_write() then the hdr's data buffer will be released when the 3058 * write completes, even though the L2ARC write might still be using it. 3059 * Second, the hdr's ABD must be linear so that the buf's user doesn't 3060 * need to be ABD-aware. 3061 */ 3062 boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && 3063 hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd); 3064 3065 /* Set up b_data and sharing */ 3066 if (can_share) { 3067 buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd); 3068 buf->b_flags |= ARC_BUF_FLAG_SHARED; 3069 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 3070 } else { 3071 buf->b_data = 3072 arc_get_data_buf(hdr, arc_buf_size(buf), buf); 3073 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 3074 } 3075 VERIFY3P(buf->b_data, !=, NULL); 3076 3077 hdr->b_l1hdr.b_buf = buf; 3078 hdr->b_l1hdr.b_bufcnt += 1; 3079 if (encrypted) 3080 hdr->b_crypt_hdr.b_ebufcnt += 1; 3081 3082 /* 3083 * If the user wants the data from the hdr, we need to either copy or 3084 * decompress the data. 3085 */ 3086 if (fill) { 3087 ASSERT3P(zb, !=, NULL); 3088 return (arc_buf_fill(buf, spa, zb, flags)); 3089 } 3090 3091 return (0); 3092 } 3093 3094 static char *arc_onloan_tag = "onloan"; 3095 3096 static inline void 3097 arc_loaned_bytes_update(int64_t delta) 3098 { 3099 atomic_add_64(&arc_loaned_bytes, delta); 3100 3101 /* assert that it did not wrap around */ 3102 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 3103 } 3104 3105 /* 3106 * Loan out an anonymous arc buffer. Loaned buffers are not counted as in 3107 * flight data by arc_tempreserve_space() until they are "returned". Loaned 3108 * buffers must be returned to the arc before they can be used by the DMU or 3109 * freed. 3110 */ 3111 arc_buf_t * 3112 arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size) 3113 { 3114 arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag, 3115 is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size); 3116 3117 arc_loaned_bytes_update(arc_buf_size(buf)); 3118 3119 return (buf); 3120 } 3121 3122 arc_buf_t * 3123 arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, 3124 enum zio_compress compression_type) 3125 { 3126 arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag, 3127 psize, lsize, compression_type); 3128 3129 arc_loaned_bytes_update(arc_buf_size(buf)); 3130 3131 return (buf); 3132 } 3133 3134 arc_buf_t * 3135 arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, 3136 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, 3137 dmu_object_type_t ot, uint64_t psize, uint64_t lsize, 3138 enum zio_compress compression_type) 3139 { 3140 arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, 3141 byteorder, salt, iv, mac, ot, psize, lsize, compression_type); 3142 3143 atomic_add_64(&arc_loaned_bytes, psize); 3144 return (buf); 3145 } 3146 3147 3148 /* 3149 * Return a loaned arc buffer to the arc. 
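 *
 * Illustrative usage sketch (hypothetical caller, for exposition only):
 *
 *	arc_buf_t *abuf = arc_loan_buf(spa, B_FALSE, size);
 *	(fill abuf->b_data with the data to be written)
 *	arc_return_buf(abuf, FTAG);
 *
 * after which the reference previously held by the "onloan" tag is held
 * by FTAG instead, and arc_loaned_bytes drops by arc_buf_size(abuf).
 * arc_loan_inuse_buf() below performs the opposite transfer.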
3150 */ 3151 void 3152 arc_return_buf(arc_buf_t *buf, void *tag) 3153 { 3154 arc_buf_hdr_t *hdr = buf->b_hdr; 3155 3156 ASSERT3P(buf->b_data, !=, NULL); 3157 ASSERT(HDR_HAS_L1HDR(hdr)); 3158 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag); 3159 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 3160 3161 arc_loaned_bytes_update(-arc_buf_size(buf)); 3162 } 3163 3164 /* Detach an arc_buf from a dbuf (tag) */ 3165 void 3166 arc_loan_inuse_buf(arc_buf_t *buf, void *tag) 3167 { 3168 arc_buf_hdr_t *hdr = buf->b_hdr; 3169 3170 ASSERT3P(buf->b_data, !=, NULL); 3171 ASSERT(HDR_HAS_L1HDR(hdr)); 3172 (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag); 3173 (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag); 3174 3175 arc_loaned_bytes_update(arc_buf_size(buf)); 3176 } 3177 3178 static void 3179 l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) 3180 { 3181 l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP); 3182 3183 df->l2df_abd = abd; 3184 df->l2df_size = size; 3185 df->l2df_type = type; 3186 mutex_enter(&l2arc_free_on_write_mtx); 3187 list_insert_head(l2arc_free_on_write, df); 3188 mutex_exit(&l2arc_free_on_write_mtx); 3189 } 3190 3191 static void 3192 arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) 3193 { 3194 arc_state_t *state = hdr->b_l1hdr.b_state; 3195 arc_buf_contents_t type = arc_buf_type(hdr); 3196 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); 3197 3198 /* protected by hash lock, if in the hash table */ 3199 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 3200 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3201 ASSERT(state != arc_anon && state != arc_l2c_only); 3202 3203 (void) zfs_refcount_remove_many(&state->arcs_esize[type], 3204 size, hdr); 3205 } 3206 (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr); 3207 if (type == ARC_BUFC_METADATA) { 3208 arc_space_return(size, ARC_SPACE_META); 3209 } else { 3210 ASSERT(type == ARC_BUFC_DATA); 3211 arc_space_return(size, ARC_SPACE_DATA); 3212 } 3213 3214 if (free_rdata) { 3215 l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); 3216 } else { 3217 l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); 3218 } 3219 } 3220 3221 /* 3222 * Share the arc_buf_t's data with the hdr. Whenever we are sharing the 3223 * data buffer, we transfer the refcount ownership to the hdr and update 3224 * the appropriate kstats. 3225 */ 3226 static void 3227 arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3228 { 3229 /* LINTED */ 3230 arc_state_t *state = hdr->b_l1hdr.b_state; 3231 3232 ASSERT(arc_can_share(hdr, buf)); 3233 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3234 ASSERT(!ARC_BUF_ENCRYPTED(buf)); 3235 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3236 3237 /* 3238 * Start sharing the data buffer. We transfer the 3239 * refcount ownership to the hdr since it always owns 3240 * the refcount whenever an arc_buf_t is shared. 3241 */ 3242 zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, 3243 arc_hdr_size(hdr), buf, hdr); 3244 hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); 3245 abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, 3246 HDR_ISTYPE_METADATA(hdr)); 3247 arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA); 3248 buf->b_flags |= ARC_BUF_FLAG_SHARED; 3249 3250 /* 3251 * Since we've transferred ownership to the hdr we need 3252 * to increment its compressed and uncompressed kstats and 3253 * decrement the overhead size. 
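 *
 * For example (illustrative): sharing an uncompressed 128K buf adds 128K
 * to both arcstat_compressed_size and arcstat_uncompressed_size
 * (arc_hdr_size() equals lsize for an uncompressed hdr) and subtracts
 * 128K from arcstat_overhead_size, since the single copy of the data is
 * now accounted to the hdr rather than to the buf.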
3254 */ 3255 ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); 3256 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3257 ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); 3258 } 3259 3260 static void 3261 arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3262 { 3263 /* LINTED */ 3264 arc_state_t *state = hdr->b_l1hdr.b_state; 3265 3266 ASSERT(arc_buf_is_shared(buf)); 3267 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3268 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3269 3270 /* 3271 * We are no longer sharing this buffer so we need 3272 * to transfer its ownership to the rightful owner. 3273 */ 3274 zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, 3275 arc_hdr_size(hdr), hdr, buf); 3276 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3277 abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); 3278 abd_put(hdr->b_l1hdr.b_pabd); 3279 hdr->b_l1hdr.b_pabd = NULL; 3280 buf->b_flags &= ~ARC_BUF_FLAG_SHARED; 3281 3282 /* 3283 * Since the buffer is no longer shared between 3284 * the arc buf and the hdr, count it as overhead. 3285 */ 3286 ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); 3287 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3288 ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); 3289 } 3290 3291 /* 3292 * Remove an arc_buf_t from the hdr's buf list and return the last 3293 * arc_buf_t on the list. If no buffers remain on the list then return 3294 * NULL. 3295 */ 3296 static arc_buf_t * 3297 arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) 3298 { 3299 arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; 3300 arc_buf_t *lastbuf = NULL; 3301 3302 ASSERT(HDR_HAS_L1HDR(hdr)); 3303 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3304 3305 /* 3306 * Remove the buf from the hdr list and locate the last 3307 * remaining buffer on the list. 3308 */ 3309 while (*bufp != NULL) { 3310 if (*bufp == buf) 3311 *bufp = buf->b_next; 3312 3313 /* 3314 * If we've removed a buffer in the middle of 3315 * the list then update the lastbuf and update 3316 * bufp. 3317 */ 3318 if (*bufp != NULL) { 3319 lastbuf = *bufp; 3320 bufp = &(*bufp)->b_next; 3321 } 3322 } 3323 buf->b_next = NULL; 3324 ASSERT3P(lastbuf, !=, buf); 3325 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); 3326 IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); 3327 IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); 3328 3329 return (lastbuf); 3330 } 3331 3332 /* 3333 * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's 3334 * list and free it. 3335 */ 3336 static void 3337 arc_buf_destroy_impl(arc_buf_t *buf) 3338 { 3339 arc_buf_hdr_t *hdr = buf->b_hdr; 3340 3341 /* 3342 * Free up the data associated with the buf but only if we're not 3343 * sharing this with the hdr. If we are sharing it with the hdr, the 3344 * hdr is responsible for doing the free. 3345 */ 3346 if (buf->b_data != NULL) { 3347 /* 3348 * We're about to change the hdr's b_flags. We must either 3349 * hold the hash_lock or be undiscoverable.
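 * ("Undiscoverable" here means HDR_EMPTY(hdr) holds, i.e. the header
 * has no DVA identity yet, so no other thread can find it through the
 * hash table and race with us.)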
3350 */ 3351 ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); 3352 3353 arc_cksum_verify(buf); 3354 arc_buf_unwatch(buf); 3355 3356 if (arc_buf_is_shared(buf)) { 3357 arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); 3358 } else { 3359 uint64_t size = arc_buf_size(buf); 3360 arc_free_data_buf(hdr, buf->b_data, size, buf); 3361 ARCSTAT_INCR(arcstat_overhead_size, -size); 3362 } 3363 buf->b_data = NULL; 3364 3365 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 3366 hdr->b_l1hdr.b_bufcnt -= 1; 3367 3368 if (ARC_BUF_ENCRYPTED(buf)) { 3369 hdr->b_crypt_hdr.b_ebufcnt -= 1; 3370 3371 /* 3372 * If we have no more encrypted buffers and we've 3373 * already gotten a copy of the decrypted data we can 3374 * free b_rabd to save some space. 3375 */ 3376 if (hdr->b_crypt_hdr.b_ebufcnt == 0 && 3377 HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && 3378 !HDR_IO_IN_PROGRESS(hdr)) { 3379 arc_hdr_free_pabd(hdr, B_TRUE); 3380 } 3381 } 3382 } 3383 3384 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 3385 3386 if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) { 3387 /* 3388 * If the current arc_buf_t is sharing its data buffer with the 3389 * hdr, then reassign the hdr's b_pabd to share it with the new 3390 * buffer at the end of the list. The shared buffer is always 3391 * the last one on the hdr's buffer list. 3392 * 3393 * There is an equivalent case for compressed bufs, but since 3394 * they aren't guaranteed to be the last buf in the list and 3395 * that is an exceedingly rare case, we just allow that space be 3396 * wasted temporarily. We must also be careful not to share 3397 * encrypted buffers, since they cannot be shared. 3398 */ 3399 if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { 3400 /* Only one buf can be shared at once */ 3401 VERIFY(!arc_buf_is_shared(lastbuf)); 3402 /* hdr is uncompressed so can't have compressed buf */ 3403 VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); 3404 3405 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3406 arc_hdr_free_pabd(hdr, B_FALSE); 3407 3408 /* 3409 * We must setup a new shared block between the 3410 * last buffer and the hdr. The data would have 3411 * been allocated by the arc buf so we need to transfer 3412 * ownership to the hdr since it's now being shared. 3413 */ 3414 arc_share_buf(hdr, lastbuf); 3415 } 3416 } else if (HDR_SHARED_DATA(hdr)) { 3417 /* 3418 * Uncompressed shared buffers are always at the end 3419 * of the list. Compressed buffers don't have the 3420 * same requirements. This makes it hard to 3421 * simply assert that the lastbuf is shared so 3422 * we rely on the hdr's compression flags to determine 3423 * if we have a compressed, shared buffer. 3424 */ 3425 ASSERT3P(lastbuf, !=, NULL); 3426 ASSERT(arc_buf_is_shared(lastbuf) || 3427 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); 3428 } 3429 3430 /* 3431 * Free the checksum if we're removing the last uncompressed buf from 3432 * this hdr. 
3433 */ 3434 if (!arc_hdr_has_uncompressed_buf(hdr)) { 3435 arc_cksum_free(hdr); 3436 } 3437 3438 /* clean up the buf */ 3439 buf->b_hdr = NULL; 3440 kmem_cache_free(buf_cache, buf); 3441 } 3442 3443 static void 3444 arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata) 3445 { 3446 uint64_t size; 3447 3448 ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); 3449 ASSERT(HDR_HAS_L1HDR(hdr)); 3450 ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); 3451 IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); 3452 3453 if (alloc_rdata) { 3454 size = HDR_GET_PSIZE(hdr); 3455 ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); 3456 hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr); 3457 ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); 3458 } else { 3459 size = arc_hdr_size(hdr); 3460 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3461 hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr); 3462 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 3463 } 3464 3465 ARCSTAT_INCR(arcstat_compressed_size, size); 3466 ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); 3467 } 3468 3469 static void 3470 arc_hdr_free_pabd(arc_buf_hdr_t *hdr, boolean_t free_rdata) 3471 { 3472 uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); 3473 3474 ASSERT(HDR_HAS_L1HDR(hdr)); 3475 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); 3476 IMPLY(free_rdata, HDR_HAS_RABD(hdr)); 3477 3478 3479 /* 3480 * If the hdr is currently being written to the l2arc then 3481 * we defer freeing the data by adding it to the l2arc_free_on_write 3482 * list. The l2arc will free the data once it's finished 3483 * writing it to the l2arc device. 3484 */ 3485 if (HDR_L2_WRITING(hdr)) { 3486 arc_hdr_free_on_write(hdr, free_rdata); 3487 ARCSTAT_BUMP(arcstat_l2_free_on_write); 3488 } else if (free_rdata) { 3489 arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); 3490 } else { 3491 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 3492 size, hdr); 3493 } 3494 3495 if (free_rdata) { 3496 hdr->b_crypt_hdr.b_rabd = NULL; 3497 } else { 3498 hdr->b_l1hdr.b_pabd = NULL; 3499 } 3500 3501 if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) 3502 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 3503 3504 ARCSTAT_INCR(arcstat_compressed_size, -size); 3505 ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); 3506 } 3507 3508 static arc_buf_hdr_t * 3509 arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, 3510 boolean_t protected, enum zio_compress compression_type, 3511 arc_buf_contents_t type, boolean_t alloc_rdata) 3512 { 3513 arc_buf_hdr_t *hdr; 3514 3515 VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); 3516 if (protected) { 3517 hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); 3518 } else { 3519 hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); 3520 } 3521 ASSERT(HDR_EMPTY(hdr)); 3522 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3523 ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); 3524 HDR_SET_PSIZE(hdr, psize); 3525 HDR_SET_LSIZE(hdr, lsize); 3526 hdr->b_spa = spa; 3527 hdr->b_type = type; 3528 hdr->b_flags = 0; 3529 arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); 3530 arc_hdr_set_compress(hdr, compression_type); 3531 if (protected) 3532 arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); 3533 3534 hdr->b_l1hdr.b_state = arc_anon; 3535 hdr->b_l1hdr.b_arc_access = 0; 3536 hdr->b_l1hdr.b_bufcnt = 0; 3537 hdr->b_l1hdr.b_buf = NULL; 3538 3539 /* 3540 * Allocate the hdr's buffer. 
This will contain either 3541 * the compressed or uncompressed data depending on the block 3542 * it references and compressed arc enablement. 3543 */ 3544 arc_hdr_alloc_pabd(hdr, alloc_rdata); 3545 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3546 3547 return (hdr); 3548 } 3549 3550 /* 3551 * Transition between the two allocation states for the arc_buf_hdr struct. 3552 * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without 3553 * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller 3554 * version is used when a cache buffer is only in the L2ARC in order to reduce 3555 * memory usage. 3556 */ 3557 static arc_buf_hdr_t * 3558 arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) 3559 { 3560 ASSERT(HDR_HAS_L2HDR(hdr)); 3561 3562 arc_buf_hdr_t *nhdr; 3563 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3564 3565 ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || 3566 (old == hdr_l2only_cache && new == hdr_full_cache)); 3567 3568 /* 3569 * if the caller wanted a new full header and the header is to be 3570 * encrypted we will actually allocate the header from the full crypt 3571 * cache instead. The same applies to freeing from the old cache. 3572 */ 3573 if (HDR_PROTECTED(hdr) && new == hdr_full_cache) 3574 new = hdr_full_crypt_cache; 3575 if (HDR_PROTECTED(hdr) && old == hdr_full_cache) 3576 old = hdr_full_crypt_cache; 3577 3578 nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); 3579 3580 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 3581 buf_hash_remove(hdr); 3582 3583 bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); 3584 3585 if (new == hdr_full_cache || new == hdr_full_crypt_cache) { 3586 arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3587 /* 3588 * arc_access and arc_change_state need to be aware that a 3589 * header has just come out of L2ARC, so we set its state to 3590 * l2c_only even though it's about to change. 3591 */ 3592 nhdr->b_l1hdr.b_state = arc_l2c_only; 3593 3594 /* Verify previous threads set to NULL before freeing */ 3595 ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); 3596 ASSERT(!HDR_HAS_RABD(hdr)); 3597 } else { 3598 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 3599 ASSERT0(hdr->b_l1hdr.b_bufcnt); 3600 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3601 3602 /* 3603 * If we've reached here, We must have been called from 3604 * arc_evict_hdr(), as such we should have already been 3605 * removed from any ghost list we were previously on 3606 * (which protects us from racing with arc_evict_state), 3607 * thus no locking is needed during this check. 3608 */ 3609 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3610 3611 /* 3612 * A buffer must not be moved into the arc_l2c_only 3613 * state if it's not finished being written out to the 3614 * l2arc device. Otherwise, the b_l1hdr.b_pabd field 3615 * might try to be accessed, even though it was removed. 3616 */ 3617 VERIFY(!HDR_L2_WRITING(hdr)); 3618 VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); 3619 ASSERT(!HDR_HAS_RABD(hdr)); 3620 3621 #ifdef ZFS_DEBUG 3622 if (hdr->b_l1hdr.b_thawed != NULL) { 3623 kmem_free(hdr->b_l1hdr.b_thawed, 1); 3624 hdr->b_l1hdr.b_thawed = NULL; 3625 } 3626 #endif 3627 3628 arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR); 3629 } 3630 /* 3631 * The header has been reallocated so we need to re-insert it into any 3632 * lists it was on. 
3633 */ 3634 (void) buf_hash_insert(nhdr, NULL); 3635 3636 ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node)); 3637 3638 mutex_enter(&dev->l2ad_mtx); 3639 3640 /* 3641 * We must place the realloc'ed header back into the list at 3642 * the same spot. Otherwise, if it's placed earlier in the list, 3643 * l2arc_write_buffers() could find it during the function's 3644 * write phase, and try to write it out to the l2arc. 3645 */ 3646 list_insert_after(&dev->l2ad_buflist, hdr, nhdr); 3647 list_remove(&dev->l2ad_buflist, hdr); 3648 3649 mutex_exit(&dev->l2ad_mtx); 3650 3651 /* 3652 * Since we're using the pointer address as the tag when 3653 * incrementing and decrementing the l2ad_alloc refcount, we 3654 * must remove the old pointer (that we're about to destroy) and 3655 * add the new pointer to the refcount. Otherwise we'd remove 3656 * the wrong pointer address when calling arc_hdr_destroy() later. 3657 */ 3658 3659 (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), 3660 hdr); 3661 (void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(nhdr), 3662 nhdr); 3663 3664 buf_discard_identity(hdr); 3665 kmem_cache_free(old, hdr); 3666 3667 return (nhdr); 3668 } 3669 3670 /* 3671 * This function allows an L1 header to be reallocated as a crypt 3672 * header and vice versa. If we are going to a crypt header, the 3673 * new fields will be zeroed out. 3674 */ 3675 static arc_buf_hdr_t * 3676 arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) 3677 { 3678 arc_buf_hdr_t *nhdr; 3679 arc_buf_t *buf; 3680 kmem_cache_t *ncache, *ocache; 3681 3682 ASSERT(HDR_HAS_L1HDR(hdr)); 3683 ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); 3684 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3685 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 3686 ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); 3687 ASSERT3P(hdr->b_hash_next, ==, NULL); 3688 3689 if (need_crypt) { 3690 ncache = hdr_full_crypt_cache; 3691 ocache = hdr_full_cache; 3692 } else { 3693 ncache = hdr_full_cache; 3694 ocache = hdr_full_crypt_cache; 3695 } 3696 3697 nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); 3698 3699 /* 3700 * Copy all members that aren't locks or condvars to the new header. 3701 * No lists are pointing to us (as we asserted above), so we don't 3702 * need to worry about the list nodes. 3703 */ 3704 nhdr->b_dva = hdr->b_dva; 3705 nhdr->b_birth = hdr->b_birth; 3706 nhdr->b_type = hdr->b_type; 3707 nhdr->b_flags = hdr->b_flags; 3708 nhdr->b_psize = hdr->b_psize; 3709 nhdr->b_lsize = hdr->b_lsize; 3710 nhdr->b_spa = hdr->b_spa; 3711 nhdr->b_l2hdr.b_dev = hdr->b_l2hdr.b_dev; 3712 nhdr->b_l2hdr.b_daddr = hdr->b_l2hdr.b_daddr; 3713 nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; 3714 nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; 3715 nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; 3716 nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; 3717 nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; 3718 nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; 3719 nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; 3720 #ifdef ZFS_DEBUG 3721 if (hdr->b_l1hdr.b_thawed != NULL) { 3722 nhdr->b_l1hdr.b_thawed = hdr->b_l1hdr.b_thawed; 3723 hdr->b_l1hdr.b_thawed = NULL; 3724 } 3725 #endif 3726 3727 /* 3728 * This refcount_add() exists only to ensure that the individual 3729 * arc buffers always point to a header that is referenced, avoiding 3730 * a small race condition that could trigger ASSERTs. 
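 * The FTAG hold is taken before any buf is re-pointed at nhdr below,
 * and is dropped only after the existing references have been
 * transferred from the old header, so nhdr is never visible with a
 * zero refcount while bufs reference it.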
3731 */ 3732 (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); 3733 nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; 3734 for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { 3735 mutex_enter(&buf->b_evict_lock); 3736 buf->b_hdr = nhdr; 3737 mutex_exit(&buf->b_evict_lock); 3738 } 3739 zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); 3740 (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); 3741 ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); 3742 3743 if (need_crypt) { 3744 arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); 3745 } else { 3746 arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); 3747 } 3748 3749 /* unset all members of the original hdr */ 3750 bzero(&hdr->b_dva, sizeof (dva_t)); 3751 hdr->b_birth = 0; 3752 hdr->b_type = ARC_BUFC_INVALID; 3753 hdr->b_flags = 0; 3754 hdr->b_psize = 0; 3755 hdr->b_lsize = 0; 3756 hdr->b_spa = 0; 3757 hdr->b_l2hdr.b_dev = NULL; 3758 hdr->b_l2hdr.b_daddr = 0; 3759 hdr->b_l1hdr.b_freeze_cksum = NULL; 3760 hdr->b_l1hdr.b_buf = NULL; 3761 hdr->b_l1hdr.b_bufcnt = 0; 3762 hdr->b_l1hdr.b_byteswap = 0; 3763 hdr->b_l1hdr.b_state = NULL; 3764 hdr->b_l1hdr.b_arc_access = 0; 3765 hdr->b_l1hdr.b_acb = NULL; 3766 hdr->b_l1hdr.b_pabd = NULL; 3767 3768 if (ocache == hdr_full_crypt_cache) { 3769 ASSERT(!HDR_HAS_RABD(hdr)); 3770 hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; 3771 hdr->b_crypt_hdr.b_ebufcnt = 0; 3772 hdr->b_crypt_hdr.b_dsobj = 0; 3773 bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); 3774 bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); 3775 bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); 3776 } 3777 3778 buf_discard_identity(hdr); 3779 kmem_cache_free(ocache, hdr); 3780 3781 return (nhdr); 3782 } 3783 3784 /* 3785 * This function is used by the send / receive code to convert a newly 3786 * allocated arc_buf_t to one that is suitable for a raw encrypted write. It 3787 * is also used to allow the root objset block to be updated without altering 3788 * its embedded MACs. Both block types will always be uncompressed so we do not 3789 * have to worry about compression type or psize. 3790 */ 3791 void 3792 arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, 3793 dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, 3794 const uint8_t *mac) 3795 { 3796 arc_buf_hdr_t *hdr = buf->b_hdr; 3797 3798 ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); 3799 ASSERT(HDR_HAS_L1HDR(hdr)); 3800 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3801 3802 buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); 3803 if (!HDR_PROTECTED(hdr)) 3804 hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); 3805 hdr->b_crypt_hdr.b_dsobj = dsobj; 3806 hdr->b_crypt_hdr.b_ot = ot; 3807 hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? 3808 DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); 3809 if (!arc_hdr_has_uncompressed_buf(hdr)) 3810 arc_cksum_free(hdr); 3811 3812 if (salt != NULL) 3813 bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); 3814 if (iv != NULL) 3815 bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); 3816 if (mac != NULL) 3817 bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); 3818 } 3819 3820 /* 3821 * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. 3822 * The buf is returned thawed since we expect the consumer to modify it.
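 *
 * A minimal usage sketch (hypothetical, for illustration only):
 *
 *	buf = arc_alloc_buf(spa, FTAG, ARC_BUFC_DATA, size);
 *	bcopy(src, buf->b_data, size);
 *	... use the buffer ...
 *	arc_buf_destroy(buf, FTAG);
 *
 * The tag passed to arc_buf_destroy() must match the tag used here,
 * since it identifies the reference held on the header.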
3823 */ 3824 arc_buf_t * 3825 arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) 3826 { 3827 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, 3828 B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); 3829 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3830 3831 arc_buf_t *buf = NULL; 3832 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, 3833 B_FALSE, B_FALSE, &buf)); 3834 arc_buf_thaw(buf); 3835 3836 return (buf); 3837 } 3838 3839 /* 3840 * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this 3841 * for bufs containing metadata. 3842 */ 3843 arc_buf_t * 3844 arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, 3845 enum zio_compress compression_type) 3846 { 3847 ASSERT3U(lsize, >, 0); 3848 ASSERT3U(lsize, >=, psize); 3849 ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); 3850 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); 3851 3852 arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 3853 B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); 3854 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3855 3856 arc_buf_t *buf = NULL; 3857 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, 3858 B_TRUE, B_FALSE, B_FALSE, &buf)); 3859 arc_buf_thaw(buf); 3860 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3861 3862 if (!arc_buf_is_shared(buf)) { 3863 /* 3864 * To ensure that the hdr has the correct data in it if we call 3865 * arc_untransform() on this buf before it's been written to 3866 * disk, it's easiest if we just set up sharing between the 3867 * buf and the hdr. 3868 */ 3869 ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); 3870 arc_hdr_free_pabd(hdr, B_FALSE); 3871 arc_share_buf(hdr, buf); 3872 } 3873 3874 return (buf); 3875 } 3876 3877 arc_buf_t * 3878 arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, 3879 const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, 3880 dmu_object_type_t ot, uint64_t psize, uint64_t lsize, 3881 enum zio_compress compression_type) 3882 { 3883 arc_buf_hdr_t *hdr; 3884 arc_buf_t *buf; 3885 arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? 3886 ARC_BUFC_METADATA : ARC_BUFC_DATA; 3887 3888 ASSERT3U(lsize, >, 0); 3889 ASSERT3U(lsize, >=, psize); 3890 ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); 3891 ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); 3892 3893 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, 3894 compression_type, type, B_TRUE); 3895 ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); 3896 3897 hdr->b_crypt_hdr.b_dsobj = dsobj; 3898 hdr->b_crypt_hdr.b_ot = ot; 3899 hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? 3900 DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); 3901 bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); 3902 bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); 3903 bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); 3904 3905 /* 3906 * This buffer will be considered encrypted even if the ot is not an 3907 * encrypted type. It will become authenticated instead in 3908 * arc_write_ready(). 
3909 */ 3910 buf = NULL; 3911 VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, 3912 B_FALSE, B_FALSE, &buf)); 3913 arc_buf_thaw(buf); 3914 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 3915 3916 return (buf); 3917 } 3918 3919 static void 3920 arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) 3921 { 3922 l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr; 3923 l2arc_dev_t *dev = l2hdr->b_dev; 3924 uint64_t psize = HDR_GET_PSIZE(hdr); 3925 uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); 3926 3927 ASSERT(MUTEX_HELD(&dev->l2ad_mtx)); 3928 ASSERT(HDR_HAS_L2HDR(hdr)); 3929 3930 list_remove(&dev->l2ad_buflist, hdr); 3931 3932 ARCSTAT_INCR(arcstat_l2_psize, -psize); 3933 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 3934 3935 vdev_space_update(dev->l2ad_vdev, -asize, 0, 0); 3936 3937 (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr), 3938 hdr); 3939 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 3940 } 3941 3942 static void 3943 arc_hdr_destroy(arc_buf_hdr_t *hdr) 3944 { 3945 if (HDR_HAS_L1HDR(hdr)) { 3946 ASSERT(hdr->b_l1hdr.b_buf == NULL || 3947 hdr->b_l1hdr.b_bufcnt > 0); 3948 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 3949 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 3950 } 3951 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 3952 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 3953 3954 if (!HDR_EMPTY(hdr)) 3955 buf_discard_identity(hdr); 3956 3957 if (HDR_HAS_L2HDR(hdr)) { 3958 l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; 3959 boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx); 3960 3961 if (!buflist_held) 3962 mutex_enter(&dev->l2ad_mtx); 3963 3964 /* 3965 * Even though we checked this conditional above, we 3966 * need to check this again now that we have the 3967 * l2ad_mtx. This is because we could be racing with 3968 * another thread calling l2arc_evict() which might have 3969 * destroyed this header's L2 portion as we were waiting 3970 * to acquire the l2ad_mtx. If that happens, we don't 3971 * want to re-destroy the header's L2 portion. 
3972 */ 3973 if (HDR_HAS_L2HDR(hdr)) 3974 arc_hdr_l2hdr_destroy(hdr); 3975 3976 if (!buflist_held) 3977 mutex_exit(&dev->l2ad_mtx); 3978 } 3979 3980 if (HDR_HAS_L1HDR(hdr)) { 3981 arc_cksum_free(hdr); 3982 3983 while (hdr->b_l1hdr.b_buf != NULL) 3984 arc_buf_destroy_impl(hdr->b_l1hdr.b_buf); 3985 3986 #ifdef ZFS_DEBUG 3987 if (hdr->b_l1hdr.b_thawed != NULL) { 3988 kmem_free(hdr->b_l1hdr.b_thawed, 1); 3989 hdr->b_l1hdr.b_thawed = NULL; 3990 } 3991 #endif 3992 3993 if (hdr->b_l1hdr.b_pabd != NULL) { 3994 arc_hdr_free_pabd(hdr, B_FALSE); 3995 } 3996 3997 if (HDR_HAS_RABD(hdr)) 3998 arc_hdr_free_pabd(hdr, B_TRUE); 3999 } 4000 4001 ASSERT3P(hdr->b_hash_next, ==, NULL); 4002 if (HDR_HAS_L1HDR(hdr)) { 4003 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 4004 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 4005 4006 if (!HDR_PROTECTED(hdr)) { 4007 kmem_cache_free(hdr_full_cache, hdr); 4008 } else { 4009 kmem_cache_free(hdr_full_crypt_cache, hdr); 4010 } 4011 } else { 4012 kmem_cache_free(hdr_l2only_cache, hdr); 4013 } 4014 } 4015 4016 void 4017 arc_buf_destroy(arc_buf_t *buf, void* tag) 4018 { 4019 arc_buf_hdr_t *hdr = buf->b_hdr; 4020 kmutex_t *hash_lock = HDR_LOCK(hdr); 4021 4022 if (hdr->b_l1hdr.b_state == arc_anon) { 4023 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 4024 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4025 VERIFY0(remove_reference(hdr, NULL, tag)); 4026 arc_hdr_destroy(hdr); 4027 return; 4028 } 4029 4030 mutex_enter(hash_lock); 4031 ASSERT3P(hdr, ==, buf->b_hdr); 4032 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 4033 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 4034 ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon); 4035 ASSERT3P(buf->b_data, !=, NULL); 4036 4037 (void) remove_reference(hdr, hash_lock, tag); 4038 arc_buf_destroy_impl(buf); 4039 mutex_exit(hash_lock); 4040 } 4041 4042 /* 4043 * Evict the arc_buf_hdr that is provided as a parameter. The resultant 4044 * state of the header is dependent on its state prior to entering this 4045 * function. The following transitions are possible: 4046 * 4047 * - arc_mru -> arc_mru_ghost 4048 * - arc_mfu -> arc_mfu_ghost 4049 * - arc_mru_ghost -> arc_l2c_only 4050 * - arc_mru_ghost -> deleted 4051 * - arc_mfu_ghost -> arc_l2c_only 4052 * - arc_mfu_ghost -> deleted 4053 */ 4054 static int64_t 4055 arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 4056 { 4057 arc_state_t *evicted_state, *state; 4058 int64_t bytes_evicted = 0; 4059 int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 4060 zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; 4061 4062 ASSERT(MUTEX_HELD(hash_lock)); 4063 ASSERT(HDR_HAS_L1HDR(hdr)); 4064 4065 state = hdr->b_l1hdr.b_state; 4066 if (GHOST_STATE(state)) { 4067 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 4068 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 4069 4070 /* 4071 * l2arc_write_buffers() relies on a header's L1 portion 4072 * (i.e. its b_pabd field) during its write phase. 4073 * Thus, we cannot push a header onto the arc_l2c_only 4074 * state (removing its L1 piece) until the header is 4075 * done being written to the l2arc. 4076 */ 4077 if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { 4078 ARCSTAT_BUMP(arcstat_evict_l2_skip); 4079 return (bytes_evicted); 4080 } 4081 4082 ARCSTAT_BUMP(arcstat_deleted); 4083 bytes_evicted += HDR_GET_LSIZE(hdr); 4084 4085 DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); 4086 4087 if (HDR_HAS_L2HDR(hdr)) { 4088 ASSERT(hdr->b_l1hdr.b_pabd == NULL); 4089 ASSERT(!HDR_HAS_RABD(hdr)); 4090 /* 4091 * This buffer is cached on the 2nd Level ARC; 4092 * don't destroy the header. 
4093 */ 4094 arc_change_state(arc_l2c_only, hdr, hash_lock); 4095 /* 4096 * dropping from L1+L2 cached to L2-only, 4097 * realloc to remove the L1 header. 4098 */ 4099 hdr = arc_hdr_realloc(hdr, hdr_full_cache, 4100 hdr_l2only_cache); 4101 } else { 4102 arc_change_state(arc_anon, hdr, hash_lock); 4103 arc_hdr_destroy(hdr); 4104 } 4105 return (bytes_evicted); 4106 } 4107 4108 ASSERT(state == arc_mru || state == arc_mfu); 4109 evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost; 4110 4111 /* prefetch buffers have a minimum lifespan */ 4112 if (HDR_IO_IN_PROGRESS(hdr) || 4113 ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && 4114 ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { 4115 ARCSTAT_BUMP(arcstat_evict_skip); 4116 return (bytes_evicted); 4117 } 4118 4119 ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); 4120 while (hdr->b_l1hdr.b_buf) { 4121 arc_buf_t *buf = hdr->b_l1hdr.b_buf; 4122 if (!mutex_tryenter(&buf->b_evict_lock)) { 4123 ARCSTAT_BUMP(arcstat_mutex_miss); 4124 break; 4125 } 4126 if (buf->b_data != NULL) 4127 bytes_evicted += HDR_GET_LSIZE(hdr); 4128 mutex_exit(&buf->b_evict_lock); 4129 arc_buf_destroy_impl(buf); 4130 } 4131 4132 if (HDR_HAS_L2HDR(hdr)) { 4133 ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr)); 4134 } else { 4135 if (l2arc_write_eligible(hdr->b_spa, hdr)) { 4136 ARCSTAT_INCR(arcstat_evict_l2_eligible, 4137 HDR_GET_LSIZE(hdr)); 4138 } else { 4139 ARCSTAT_INCR(arcstat_evict_l2_ineligible, 4140 HDR_GET_LSIZE(hdr)); 4141 } 4142 } 4143 4144 if (hdr->b_l1hdr.b_bufcnt == 0) { 4145 arc_cksum_free(hdr); 4146 4147 bytes_evicted += arc_hdr_size(hdr); 4148 4149 /* 4150 * If this hdr is being evicted and has a compressed 4151 * buffer then we discard it here before we change states. 4152 * This ensures that the accounting is updated correctly 4153 * in arc_free_data_impl(). 4154 */ 4155 if (hdr->b_l1hdr.b_pabd != NULL) 4156 arc_hdr_free_pabd(hdr, B_FALSE); 4157 4158 if (HDR_HAS_RABD(hdr)) 4159 arc_hdr_free_pabd(hdr, B_TRUE); 4160 4161 arc_change_state(evicted_state, hdr, hash_lock); 4162 ASSERT(HDR_IN_HASH_TABLE(hdr)); 4163 arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); 4164 DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); 4165 } 4166 4167 return (bytes_evicted); 4168 } 4169 4170 static uint64_t 4171 arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, 4172 uint64_t spa, int64_t bytes) 4173 { 4174 multilist_sublist_t *mls; 4175 uint64_t bytes_evicted = 0; 4176 arc_buf_hdr_t *hdr; 4177 kmutex_t *hash_lock; 4178 int evict_count = 0; 4179 4180 ASSERT3P(marker, !=, NULL); 4181 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 4182 4183 mls = multilist_sublist_lock(ml, idx); 4184 4185 for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; 4186 hdr = multilist_sublist_prev(mls, marker)) { 4187 if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || 4188 (evict_count >= zfs_arc_evict_batch_limit)) 4189 break; 4190 4191 /* 4192 * To keep our iteration location, move the marker 4193 * forward. Since we're not holding hdr's hash lock, we 4194 * must be very careful and not remove 'hdr' from the 4195 * sublist. Otherwise, other consumers might mistake the 4196 * 'hdr' as not being on a sublist when they call the 4197 * multilist_link_active() function (they all rely on 4198 * the hash lock protecting concurrent insertions and 4199 * removals). multilist_sublist_move_forward() was 4200 * specifically implemented to ensure this is the case 4201 * (only 'marker' will be removed and re-inserted). 
4202 */ 4203 multilist_sublist_move_forward(mls, marker); 4204 4205 /* 4206 * The only case where the b_spa field should ever be 4207 * zero, is the marker headers inserted by 4208 * arc_evict_state(). It's possible for multiple threads 4209 * to be calling arc_evict_state() concurrently (e.g. 4210 * dsl_pool_close() and zio_inject_fault()), so we must 4211 * skip any markers we see from these other threads. 4212 */ 4213 if (hdr->b_spa == 0) 4214 continue; 4215 4216 /* we're only interested in evicting buffers of a certain spa */ 4217 if (spa != 0 && hdr->b_spa != spa) { 4218 ARCSTAT_BUMP(arcstat_evict_skip); 4219 continue; 4220 } 4221 4222 hash_lock = HDR_LOCK(hdr); 4223 4224 /* 4225 * We aren't calling this function from any code path 4226 * that would already be holding a hash lock, so we're 4227 * asserting on this assumption to be defensive in case 4228 * this ever changes. Without this check, it would be 4229 * possible to incorrectly increment arcstat_mutex_miss 4230 * below (e.g. if the code changed such that we called 4231 * this function with a hash lock held). 4232 */ 4233 ASSERT(!MUTEX_HELD(hash_lock)); 4234 4235 if (mutex_tryenter(hash_lock)) { 4236 uint64_t evicted = arc_evict_hdr(hdr, hash_lock); 4237 mutex_exit(hash_lock); 4238 4239 bytes_evicted += evicted; 4240 4241 /* 4242 * If evicted is zero, arc_evict_hdr() must have 4243 * decided to skip this header, don't increment 4244 * evict_count in this case. 4245 */ 4246 if (evicted != 0) 4247 evict_count++; 4248 4249 /* 4250 * If arc_size isn't overflowing, signal any 4251 * threads that might happen to be waiting. 4252 * 4253 * For each header evicted, we wake up a single 4254 * thread. If we used cv_broadcast, we could 4255 * wake up "too many" threads causing arc_size 4256 * to significantly overflow arc_c; since 4257 * arc_get_data_impl() doesn't check for overflow 4258 * when it's woken up (it doesn't because it's 4259 * possible for the ARC to be overflowing while 4260 * full of un-evictable buffers, and the 4261 * function should proceed in this case). 4262 * 4263 * If threads are left sleeping, due to not 4264 * using cv_broadcast here, they will be woken 4265 * up via cv_broadcast in arc_adjust_cb() just 4266 * before arc_adjust_zthr sleeps. 4267 */ 4268 mutex_enter(&arc_adjust_lock); 4269 if (!arc_is_overflowing()) 4270 cv_signal(&arc_adjust_waiters_cv); 4271 mutex_exit(&arc_adjust_lock); 4272 } else { 4273 ARCSTAT_BUMP(arcstat_mutex_miss); 4274 } 4275 } 4276 4277 multilist_sublist_unlock(mls); 4278 4279 return (bytes_evicted); 4280 } 4281 4282 /* 4283 * Evict buffers from the given arc state, until we've removed the 4284 * specified number of bytes. Move the removed buffers to the 4285 * appropriate evict state. 4286 * 4287 * This function makes a "best effort". It skips over any buffers 4288 * it can't get a hash_lock on, and so, may not catch all candidates. 4289 * It may also return without evicting as much space as requested. 4290 * 4291 * If bytes is specified using the special value ARC_EVICT_ALL, this 4292 * will evict all available (i.e. unlocked and evictable) buffers from 4293 * the given arc state; which is used by arc_flush(). 
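 *
 * For example, arc_flush_state() below drains a state with calls of
 * the form:
 *
 *	arc_evict_state(state, spa, ARC_EVICT_ALL, type);
 *
 * repeating (when retry is set) until arcs_esize[type] for that state
 * drops to zero.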
4294 */ 4295 static uint64_t 4296 arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, 4297 arc_buf_contents_t type) 4298 { 4299 uint64_t total_evicted = 0; 4300 multilist_t *ml = state->arcs_list[type]; 4301 int num_sublists; 4302 arc_buf_hdr_t **markers; 4303 4304 IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); 4305 4306 num_sublists = multilist_get_num_sublists(ml); 4307 4308 /* 4309 * If we've tried to evict from each sublist, made some 4310 * progress, but still have not hit the target number of bytes 4311 * to evict, we want to keep trying. The markers allow us to 4312 * pick up where we left off for each individual sublist, rather 4313 * than starting from the tail each time. 4314 */ 4315 markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); 4316 for (int i = 0; i < num_sublists; i++) { 4317 markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); 4318 4319 /* 4320 * A b_spa of 0 is used to indicate that this header is 4321 * a marker. This fact is used in arc_adjust_type() and 4322 * arc_evict_state_impl(). 4323 */ 4324 markers[i]->b_spa = 0; 4325 4326 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 4327 multilist_sublist_insert_tail(mls, markers[i]); 4328 multilist_sublist_unlock(mls); 4329 } 4330 4331 /* 4332 * While we haven't hit our target number of bytes to evict, or 4333 * we're evicting all available buffers. 4334 */ 4335 while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { 4336 /* 4337 * Start eviction using a randomly selected sublist, 4338 * this is to try and evenly balance eviction across all 4339 * sublists. Always starting at the same sublist 4340 * (e.g. index 0) would cause evictions to favor certain 4341 * sublists over others. 4342 */ 4343 int sublist_idx = multilist_get_random_index(ml); 4344 uint64_t scan_evicted = 0; 4345 4346 for (int i = 0; i < num_sublists; i++) { 4347 uint64_t bytes_remaining; 4348 uint64_t bytes_evicted; 4349 4350 if (bytes == ARC_EVICT_ALL) 4351 bytes_remaining = ARC_EVICT_ALL; 4352 else if (total_evicted < bytes) 4353 bytes_remaining = bytes - total_evicted; 4354 else 4355 break; 4356 4357 bytes_evicted = arc_evict_state_impl(ml, sublist_idx, 4358 markers[sublist_idx], spa, bytes_remaining); 4359 4360 scan_evicted += bytes_evicted; 4361 total_evicted += bytes_evicted; 4362 4363 /* we've reached the end, wrap to the beginning */ 4364 if (++sublist_idx >= num_sublists) 4365 sublist_idx = 0; 4366 } 4367 4368 /* 4369 * If we didn't evict anything during this scan, we have 4370 * no reason to believe we'll evict more during another 4371 * scan, so break the loop. 4372 */ 4373 if (scan_evicted == 0) { 4374 /* This isn't possible, let's make that obvious */ 4375 ASSERT3S(bytes, !=, 0); 4376 4377 /* 4378 * When bytes is ARC_EVICT_ALL, the only way to 4379 * break the loop is when scan_evicted is zero. 4380 * In that case, we actually have evicted enough, 4381 * so we don't want to increment the kstat. 
4382 */ 4383 if (bytes != ARC_EVICT_ALL) { 4384 ASSERT3S(total_evicted, <, bytes); 4385 ARCSTAT_BUMP(arcstat_evict_not_enough); 4386 } 4387 4388 break; 4389 } 4390 } 4391 4392 for (int i = 0; i < num_sublists; i++) { 4393 multilist_sublist_t *mls = multilist_sublist_lock(ml, i); 4394 multilist_sublist_remove(mls, markers[i]); 4395 multilist_sublist_unlock(mls); 4396 4397 kmem_cache_free(hdr_full_cache, markers[i]); 4398 } 4399 kmem_free(markers, sizeof (*markers) * num_sublists); 4400 4401 return (total_evicted); 4402 } 4403 4404 /* 4405 * Flush all "evictable" data of the given type from the arc state 4406 * specified. This will not evict any "active" buffers (i.e. referenced). 4407 * 4408 * When 'retry' is set to B_FALSE, the function will make a single pass 4409 * over the state and evict any buffers that it can. Since it doesn't 4410 * continually retry the eviction, it might end up leaving some buffers 4411 * in the ARC due to lock misses. 4412 * 4413 * When 'retry' is set to B_TRUE, the function will continually retry the 4414 * eviction until *all* evictable buffers have been removed from the 4415 * state. As a result, if concurrent insertions into the state are 4416 * allowed (e.g. if the ARC isn't shutting down), this function might 4417 * wind up in an infinite loop, continually trying to evict buffers. 4418 */ 4419 static uint64_t 4420 arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, 4421 boolean_t retry) 4422 { 4423 uint64_t evicted = 0; 4424 4425 while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { 4426 evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); 4427 4428 if (!retry) 4429 break; 4430 } 4431 4432 return (evicted); 4433 } 4434 4435 /* 4436 * Evict the specified number of bytes from the state specified, 4437 * restricting eviction to the spa and type given. This function 4438 * prevents us from trying to evict more from a state's list than 4439 * is "evictable", and to skip evicting altogether when passed a 4440 * negative value for "bytes". In contrast, arc_evict_state() will 4441 * evict everything it can, when passed a negative value for "bytes". 4442 */ 4443 static uint64_t 4444 arc_adjust_impl(arc_state_t *state, uint64_t spa, int64_t bytes, 4445 arc_buf_contents_t type) 4446 { 4447 int64_t delta; 4448 4449 if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) { 4450 delta = MIN(zfs_refcount_count(&state->arcs_esize[type]), 4451 bytes); 4452 return (arc_evict_state(state, spa, delta, type)); 4453 } 4454 4455 return (0); 4456 } 4457 4458 /* 4459 * Evict metadata buffers from the cache, such that arc_meta_used is 4460 * capped by the arc_meta_limit tunable. 4461 */ 4462 static uint64_t 4463 arc_adjust_meta(uint64_t meta_used) 4464 { 4465 uint64_t total_evicted = 0; 4466 int64_t target; 4467 4468 /* 4469 * If we're over the meta limit, we want to evict enough 4470 * metadata to get back under the meta limit. We don't want to 4471 * evict so much that we drop the MRU below arc_p, though. If 4472 * we're over the meta limit more than we're over arc_p, we 4473 * evict some from the MRU here, and some from the MFU below. 
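 *
 * Worked example (illustrative numbers only): if meta_used is 200M
 * over arc_meta_limit but anon + mru is only 50M over arc_p, the
 * first pass targets MIN(200M, 50M) = 50M of metadata from the MRU;
 * the second pass below then targets
 * MIN(200M, mfu_size - (arc_c - arc_p)) from the MFU.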
4474 */ 4475 target = MIN((int64_t)(meta_used - arc_meta_limit), 4476 (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + 4477 zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); 4478 4479 total_evicted += arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4480 4481 /* 4482 * Similar to the above, we want to evict enough bytes to get us 4483 * below the meta limit, but not so much as to drop us below the 4484 * space allotted to the MFU (which is defined as arc_c - arc_p). 4485 */ 4486 target = MIN((int64_t)(meta_used - arc_meta_limit), 4487 (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - 4488 (arc_c - arc_p))); 4489 4490 total_evicted += arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4491 4492 return (total_evicted); 4493 } 4494 4495 /* 4496 * Return the type of the oldest buffer in the given arc state 4497 * 4498 * This function will select a random sublist of type ARC_BUFC_DATA and 4499 * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist 4500 * is compared, and the type which contains the "older" buffer will be 4501 * returned. 4502 */ 4503 static arc_buf_contents_t 4504 arc_adjust_type(arc_state_t *state) 4505 { 4506 multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA]; 4507 multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA]; 4508 int data_idx = multilist_get_random_index(data_ml); 4509 int meta_idx = multilist_get_random_index(meta_ml); 4510 multilist_sublist_t *data_mls; 4511 multilist_sublist_t *meta_mls; 4512 arc_buf_contents_t type; 4513 arc_buf_hdr_t *data_hdr; 4514 arc_buf_hdr_t *meta_hdr; 4515 4516 /* 4517 * We keep the sublist lock until we're finished, to prevent 4518 * the headers from being destroyed via arc_evict_state(). 4519 */ 4520 data_mls = multilist_sublist_lock(data_ml, data_idx); 4521 meta_mls = multilist_sublist_lock(meta_ml, meta_idx); 4522 4523 /* 4524 * These two loops are to ensure we skip any markers that 4525 * might be at the tail of the lists due to arc_evict_state(). 4526 */ 4527 4528 for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL; 4529 data_hdr = multilist_sublist_prev(data_mls, data_hdr)) { 4530 if (data_hdr->b_spa != 0) 4531 break; 4532 } 4533 4534 for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL; 4535 meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) { 4536 if (meta_hdr->b_spa != 0) 4537 break; 4538 } 4539 4540 if (data_hdr == NULL && meta_hdr == NULL) { 4541 type = ARC_BUFC_DATA; 4542 } else if (data_hdr == NULL) { 4543 ASSERT3P(meta_hdr, !=, NULL); 4544 type = ARC_BUFC_METADATA; 4545 } else if (meta_hdr == NULL) { 4546 ASSERT3P(data_hdr, !=, NULL); 4547 type = ARC_BUFC_DATA; 4548 } else { 4549 ASSERT3P(data_hdr, !=, NULL); 4550 ASSERT3P(meta_hdr, !=, NULL); 4551 4552 /* The headers can't be on the sublist without an L1 header */ 4553 ASSERT(HDR_HAS_L1HDR(data_hdr)); 4554 ASSERT(HDR_HAS_L1HDR(meta_hdr)); 4555 4556 if (data_hdr->b_l1hdr.b_arc_access < 4557 meta_hdr->b_l1hdr.b_arc_access) { 4558 type = ARC_BUFC_DATA; 4559 } else { 4560 type = ARC_BUFC_METADATA; 4561 } 4562 } 4563 4564 multilist_sublist_unlock(meta_mls); 4565 multilist_sublist_unlock(data_mls); 4566 4567 return (type); 4568 } 4569 4570 /* 4571 * Evict buffers from the cache, such that arc_size is capped by arc_c. 
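 *
 * For example (illustrative numbers only): if arc_size is 1G over
 * arc_c but anon + mru + meta_used is only 512M over arc_p, the MRU
 * pass targets 512M and the MFU pass then targets the full 1G gap
 * (asize - arc_c), preferring metadata or data for each pass based on
 * arc_adjust_type() and arc_meta_min.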
4572 */ 4573 static uint64_t 4574 arc_adjust(void) 4575 { 4576 uint64_t total_evicted = 0; 4577 uint64_t bytes; 4578 int64_t target; 4579 uint64_t asize = aggsum_value(&arc_size); 4580 uint64_t ameta = aggsum_value(&arc_meta_used); 4581 4582 /* 4583 * If we're over arc_meta_limit, we want to correct that before 4584 * potentially evicting data buffers below. 4585 */ 4586 total_evicted += arc_adjust_meta(ameta); 4587 4588 /* 4589 * Adjust MRU size 4590 * 4591 * If we're over the target cache size, we want to evict enough 4592 * from the list to get back to our target size. We don't want 4593 * to evict too much from the MRU, such that it drops below 4594 * arc_p. So, if we're over our target cache size more than 4595 * the MRU is over arc_p, we'll evict enough to get back to 4596 * arc_p here, and then evict more from the MFU below. 4597 */ 4598 target = MIN((int64_t)(asize - arc_c), 4599 (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + 4600 zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p)); 4601 4602 /* 4603 * If we're below arc_meta_min, always prefer to evict data. 4604 * Otherwise, try to satisfy the requested number of bytes to 4605 * evict from the type which contains older buffers; in an 4606 * effort to keep newer buffers in the cache regardless of their 4607 * type. If we cannot satisfy the number of bytes from this 4608 * type, spill over into the next type. 4609 */ 4610 if (arc_adjust_type(arc_mru) == ARC_BUFC_METADATA && 4611 ameta > arc_meta_min) { 4612 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4613 total_evicted += bytes; 4614 4615 /* 4616 * If we couldn't evict our target number of bytes from 4617 * metadata, we try to get the rest from data. 4618 */ 4619 target -= bytes; 4620 4621 total_evicted += 4622 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4623 } else { 4624 bytes = arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_DATA); 4625 total_evicted += bytes; 4626 4627 /* 4628 * If we couldn't evict our target number of bytes from 4629 * data, we try to get the rest from metadata. 4630 */ 4631 target -= bytes; 4632 4633 total_evicted += 4634 arc_adjust_impl(arc_mru, 0, target, ARC_BUFC_METADATA); 4635 } 4636 4637 /* 4638 * Adjust MFU size 4639 * 4640 * Now that we've tried to evict enough from the MRU to get its 4641 * size back to arc_p, if we're still above the target cache 4642 * size, we evict the rest from the MFU. 4643 */ 4644 target = asize - arc_c; 4645 4646 if (arc_adjust_type(arc_mfu) == ARC_BUFC_METADATA && 4647 ameta > arc_meta_min) { 4648 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4649 total_evicted += bytes; 4650 4651 /* 4652 * If we couldn't evict our target number of bytes from 4653 * metadata, we try to get the rest from data. 4654 */ 4655 target -= bytes; 4656 4657 total_evicted += 4658 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4659 } else { 4660 bytes = arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_DATA); 4661 total_evicted += bytes; 4662 4663 /* 4664 * If we couldn't evict our target number of bytes from 4665 * data, we try to get the rest from metadata. 4666 */ 4667 target -= bytes; 4668 4669 total_evicted += 4670 arc_adjust_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); 4671 } 4672 4673 /* 4674 * Adjust ghost lists 4675 * 4676 * In addition to the above, the ARC also defines target values 4677 * for the ghost lists.
The sum of the mru list and mru ghost 4678 * list should never exceed the target size of the cache, and 4679 * the sum of the mru list, mfu list, mru ghost list, and mfu 4680 * ghost list should never exceed twice the target size of the 4681 * cache. The following logic enforces these limits on the ghost 4682 * caches, and evicts from them as needed. 4683 */ 4684 target = zfs_refcount_count(&arc_mru->arcs_size) + 4685 zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c; 4686 4687 bytes = arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA); 4688 total_evicted += bytes; 4689 4690 target -= bytes; 4691 4692 total_evicted += 4693 arc_adjust_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA); 4694 4695 /* 4696 * We assume the sum of the mru list and mfu list is less than 4697 * or equal to arc_c (we enforced this above), which means we 4698 * can use the simpler of the two equations below: 4699 * 4700 * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c 4701 * mru ghost + mfu ghost <= arc_c 4702 */ 4703 target = zfs_refcount_count(&arc_mru_ghost->arcs_size) + 4704 zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c; 4705 4706 bytes = arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA); 4707 total_evicted += bytes; 4708 4709 target -= bytes; 4710 4711 total_evicted += 4712 arc_adjust_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA); 4713 4714 return (total_evicted); 4715 } 4716 4717 void 4718 arc_flush(spa_t *spa, boolean_t retry) 4719 { 4720 uint64_t guid = 0; 4721 4722 /* 4723 * If retry is B_TRUE, a spa must not be specified since we have 4724 * no good way to determine if all of a spa's buffers have been 4725 * evicted from an arc state. 4726 */ 4727 ASSERT(!retry || spa == 0); 4728 4729 if (spa != NULL) 4730 guid = spa_load_guid(spa); 4731 4732 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry); 4733 (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry); 4734 4735 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry); 4736 (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry); 4737 4738 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry); 4739 (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry); 4740 4741 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry); 4742 (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry); 4743 } 4744 4745 static void 4746 arc_reduce_target_size(int64_t to_free) 4747 { 4748 uint64_t asize = aggsum_value(&arc_size); 4749 if (arc_c > arc_c_min) { 4750 4751 if (arc_c > arc_c_min + to_free) 4752 atomic_add_64(&arc_c, -to_free); 4753 else 4754 arc_c = arc_c_min; 4755 4756 atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift)); 4757 if (asize < arc_c) 4758 arc_c = MAX(asize, arc_c_min); 4759 if (arc_p > arc_c) 4760 arc_p = (arc_c >> 1); 4761 ASSERT(arc_c >= arc_c_min); 4762 ASSERT((int64_t)arc_p >= 0); 4763 } 4764 4765 if (asize > arc_c) { 4766 /* See comment in arc_adjust_cb_check() on why lock+flag */ 4767 mutex_enter(&arc_adjust_lock); 4768 arc_adjust_needed = B_TRUE; 4769 mutex_exit(&arc_adjust_lock); 4770 zthr_wakeup(arc_adjust_zthr); 4771 } 4772 } 4773 4774 typedef enum free_memory_reason_t { 4775 FMR_UNKNOWN, 4776 FMR_NEEDFREE, 4777 FMR_LOTSFREE, 4778 FMR_SWAPFS_MINFREE, 4779 FMR_PAGES_PP_MAXIMUM, 4780 FMR_HEAP_ARENA, 4781 FMR_ZIO_ARENA, 4782 } free_memory_reason_t; 4783 4784 int64_t last_free_memory; 4785 free_memory_reason_t last_free_reason; 4786 4787 /* 4788 * Additional reserve of pages for pp_reserve. 
4789 */ 4790 int64_t arc_pages_pp_reserve = 64; 4791 4792 /* 4793 * Additional reserve of pages for swapfs. 4794 */ 4795 int64_t arc_swapfs_reserve = 64; 4796 4797 /* 4798 * Return the amount of memory that can be consumed before reclaim will be 4799 * needed. Positive if there is sufficient free memory, negative indicates 4800 * the amount of memory that needs to be freed up. 4801 */ 4802 static int64_t 4803 arc_available_memory(void) 4804 { 4805 int64_t lowest = INT64_MAX; 4806 int64_t n; 4807 free_memory_reason_t r = FMR_UNKNOWN; 4808 4809 #ifdef _KERNEL 4810 if (needfree > 0) { 4811 n = PAGESIZE * (-needfree); 4812 if (n < lowest) { 4813 lowest = n; 4814 r = FMR_NEEDFREE; 4815 } 4816 } 4817 4818 /* 4819 * check that we're out of range of the pageout scanner. It starts to 4820 * schedule paging if freemem is less than lotsfree and needfree. 4821 * lotsfree is the high-water mark for pageout, and needfree is the 4822 * number of needed free pages. We add extra pages here to make sure 4823 * the scanner doesn't start up while we're freeing memory. 4824 */ 4825 n = PAGESIZE * (freemem - lotsfree - needfree - desfree); 4826 if (n < lowest) { 4827 lowest = n; 4828 r = FMR_LOTSFREE; 4829 } 4830 4831 /* 4832 * check to make sure that swapfs has enough space so that anon 4833 * reservations can still succeed. anon_resvmem() checks that the 4834 * availrmem is greater than swapfs_minfree, and the number of reserved 4835 * swap pages. We also add a bit of extra here just to prevent 4836 * circumstances from getting really dire. 4837 */ 4838 n = PAGESIZE * (availrmem - swapfs_minfree - swapfs_reserve - 4839 desfree - arc_swapfs_reserve); 4840 if (n < lowest) { 4841 lowest = n; 4842 r = FMR_SWAPFS_MINFREE; 4843 } 4844 4845 4846 /* 4847 * Check that we have enough availrmem that memory locking (e.g., via 4848 * mlock(3C) or memcntl(2)) can still succeed. (pages_pp_maximum 4849 * stores the number of pages that cannot be locked; when availrmem 4850 * drops below pages_pp_maximum, page locking mechanisms such as 4851 * page_pp_lock() will fail.) 4852 */ 4853 n = PAGESIZE * (availrmem - pages_pp_maximum - 4854 arc_pages_pp_reserve); 4855 if (n < lowest) { 4856 lowest = n; 4857 r = FMR_PAGES_PP_MAXIMUM; 4858 } 4859 4860 #if defined(__i386) 4861 /* 4862 * If we're on an i386 platform, it's possible that we'll exhaust the 4863 * kernel heap space before we ever run out of available physical 4864 * memory. Most checks of the size of the heap_area compare against 4865 * tune.t_minarmem, which is the minimum available real memory that we 4866 * can have in the system. However, this is generally fixed at 25 pages 4867 * which is so low that it's useless. In this comparison, we seek to 4868 * calculate the total heap-size, and reclaim if more than 3/4ths of the 4869 * heap is allocated. (Or, in the calculation, if less than 1/4th is 4870 * free) 4871 */ 4872 n = (int64_t)vmem_size(heap_arena, VMEM_FREE) - 4873 (vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC) >> 2); 4874 if (n < lowest) { 4875 lowest = n; 4876 r = FMR_HEAP_ARENA; 4877 } 4878 #endif 4879 4880 /* 4881 * If zio data pages are being allocated out of a separate heap segment, 4882 * then enforce that the size of available vmem for this arena remains 4883 * above about 1/4th (1/(2^arc_zio_arena_free_shift)) free. 4884 * 4885 * Note that reducing the arc_zio_arena_free_shift keeps more virtual 4886 * memory (in the zio_arena) free, which can avoid memory 4887 * fragmentation issues. 
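 *
 * For example (illustrative numbers only): with 1G currently
 * allocated from the zio_arena and a shift of 2, the value computed
 * below goes negative, and FMR_ZIO_ARENA can become the reclaim
 * reason, once less than 256M of the arena remains free.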
4888 */ 4889 if (zio_arena != NULL) { 4890 n = (int64_t)vmem_size(zio_arena, VMEM_FREE) - 4891 (vmem_size(zio_arena, VMEM_ALLOC) >> 4892 arc_zio_arena_free_shift); 4893 if (n < lowest) { 4894 lowest = n; 4895 r = FMR_ZIO_ARENA; 4896 } 4897 } 4898 #else 4899 /* Every 100 calls, free a small amount */ 4900 if (spa_get_random(100) == 0) 4901 lowest = -1024; 4902 #endif 4903 4904 last_free_memory = lowest; 4905 last_free_reason = r; 4906 4907 return (lowest); 4908 } 4909 4910 4911 /* 4912 * Determine if the system is under memory pressure and is asking 4913 * to reclaim memory. A return value of B_TRUE indicates that the system 4914 * is under memory pressure and that the arc should adjust accordingly. 4915 */ 4916 static boolean_t 4917 arc_reclaim_needed(void) 4918 { 4919 return (arc_available_memory() < 0); 4920 } 4921 4922 static void 4923 arc_kmem_reap_soon(void) 4924 { 4925 size_t i; 4926 kmem_cache_t *prev_cache = NULL; 4927 kmem_cache_t *prev_data_cache = NULL; 4928 extern kmem_cache_t *zio_buf_cache[]; 4929 extern kmem_cache_t *zio_data_buf_cache[]; 4930 extern kmem_cache_t *range_seg_cache; 4931 extern kmem_cache_t *abd_chunk_cache; 4932 4933 #ifdef _KERNEL 4934 if (aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) { 4935 /* 4936 * We are exceeding our meta-data cache limit. 4937 * Purge some DNLC entries to release holds on meta-data. 4938 */ 4939 dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent); 4940 } 4941 #if defined(__i386) 4942 /* 4943 * Reclaim unused memory from all kmem caches. 4944 */ 4945 kmem_reap(); 4946 #endif 4947 #endif 4948 4949 for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) { 4950 if (zio_buf_cache[i] != prev_cache) { 4951 prev_cache = zio_buf_cache[i]; 4952 kmem_cache_reap_soon(zio_buf_cache[i]); 4953 } 4954 if (zio_data_buf_cache[i] != prev_data_cache) { 4955 prev_data_cache = zio_data_buf_cache[i]; 4956 kmem_cache_reap_soon(zio_data_buf_cache[i]); 4957 } 4958 } 4959 kmem_cache_reap_soon(abd_chunk_cache); 4960 kmem_cache_reap_soon(buf_cache); 4961 kmem_cache_reap_soon(hdr_full_cache); 4962 kmem_cache_reap_soon(hdr_l2only_cache); 4963 kmem_cache_reap_soon(range_seg_cache); 4964 4965 if (zio_arena != NULL) { 4966 /* 4967 * Ask the vmem arena to reclaim unused memory from its 4968 * quantum caches. 4969 */ 4970 vmem_qcache_reap(zio_arena); 4971 } 4972 } 4973 4974 /* ARGSUSED */ 4975 static boolean_t 4976 arc_adjust_cb_check(void *arg, zthr_t *zthr) 4977 { 4978 /* 4979 * This is necessary in order for the mdb ::arc dcmd to 4980 * show up to date information. Since the ::arc command 4981 * does not call the kstat's update function, without 4982 * this call, the command may show stale stats for the 4983 * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even 4984 * with this change, the data might be up to 1 second 4985 * out of date(the arc_adjust_zthr has a maximum sleep 4986 * time of 1 second); but that should suffice. The 4987 * arc_state_t structures can be queried directly if more 4988 * accurate information is needed. 4989 */ 4990 if (arc_ksp != NULL) 4991 arc_ksp->ks_update(arc_ksp, KSTAT_READ); 4992 4993 /* 4994 * We have to rely on arc_get_data_impl() to tell us when to adjust, 4995 * rather than checking if we are overflowing here, so that we are 4996 * sure to not leave arc_get_data_impl() waiting on 4997 * arc_adjust_waiters_cv. If we have become "not overflowing" since 4998 * arc_get_data_impl() checked, we need to wake it up. We could 4999 * broadcast the CV here, but arc_get_data_impl() may have not yet 5000 * gone to sleep. 
We would need to use a mutex to ensure that this 5001 * function doesn't broadcast until arc_get_data_impl() has gone to 5002 * sleep (e.g. the arc_adjust_lock). However, the lock ordering of 5003 * such a lock would necessarily be incorrect with respect to the 5004 * zthr_lock, which is held before this function is called, and is 5005 * held by arc_get_data_impl() when it calls zthr_wakeup(). 5006 */ 5007 return (arc_adjust_needed); 5008 } 5009 5010 /* 5011 * Keep arc_size under arc_c by running arc_adjust which evicts data 5012 * from the ARC. 5013 */ 5014 /* ARGSUSED */ 5015 static void 5016 arc_adjust_cb(void *arg, zthr_t *zthr) 5017 { 5018 uint64_t evicted = 0; 5019 5020 /* Evict from cache */ 5021 evicted = arc_adjust(); 5022 5023 /* 5024 * If evicted is zero, we couldn't evict anything 5025 * via arc_adjust(). This could be due to hash lock 5026 * collisions, but more likely due to the majority of 5027 * arc buffers being unevictable. Therefore, even if 5028 * arc_size is above arc_c, another pass is unlikely to 5029 * be helpful and could potentially cause us to enter an 5030 * infinite loop. Additionally, zthr_iscancelled() is 5031 * checked here so that if the arc is shutting down, the 5032 * broadcast will wake any remaining arc adjust waiters. 5033 */ 5034 mutex_enter(&arc_adjust_lock); 5035 arc_adjust_needed = !zthr_iscancelled(arc_adjust_zthr) && 5036 evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0; 5037 if (!arc_adjust_needed) { 5038 /* 5039 * We're either no longer overflowing, or we 5040 * can't evict anything more, so we should wake 5041 * up any waiters. 5042 */ 5043 cv_broadcast(&arc_adjust_waiters_cv); 5044 } 5045 mutex_exit(&arc_adjust_lock); 5046 } 5047 5048 /* ARGSUSED */ 5049 static boolean_t 5050 arc_reap_cb_check(void *arg, zthr_t *zthr) 5051 { 5052 int64_t free_memory = arc_available_memory(); 5053 5054 /* 5055 * If a kmem reap is already active, don't schedule more. We must 5056 * check for this because kmem_cache_reap_soon() won't actually 5057 * block on the cache being reaped (this is to prevent callers from 5058 * becoming implicitly blocked by a system-wide kmem reap -- which, 5059 * on a system with many, many full magazines, can take minutes). 5060 */ 5061 if (!kmem_cache_reap_active() && 5062 free_memory < 0) { 5063 arc_no_grow = B_TRUE; 5064 arc_warm = B_TRUE; 5065 /* 5066 * Wait at least zfs_grow_retry (default 60) seconds 5067 * before considering growing. 5068 */ 5069 arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry); 5070 return (B_TRUE); 5071 } else if (free_memory < arc_c >> arc_no_grow_shift) { 5072 arc_no_grow = B_TRUE; 5073 } else if (gethrtime() >= arc_growtime) { 5074 arc_no_grow = B_FALSE; 5075 } 5076 5077 return (B_FALSE); 5078 } 5079 5080 /* 5081 * Keep enough free memory in the system by reaping the ARC's kmem 5082 * caches. To cause more slabs to be reapable, we may reduce the 5083 * target size of the cache (arc_c), causing the arc_adjust_cb() 5084 * to free more buffers. 5085 */ 5086 /* ARGSUSED */ 5087 static void 5088 arc_reap_cb(void *arg, zthr_t *zthr) 5089 { 5090 int64_t free_memory; 5091 5092 /* 5093 * Kick off asynchronous kmem_reap()'s of all our caches. 5094 */ 5095 arc_kmem_reap_soon(); 5096 5097 /* 5098 * Wait at least arc_kmem_cache_reap_retry_ms between 5099 * arc_kmem_reap_soon() calls. Without this check it is possible to 5100 * end up in a situation where we spend lots of time reaping 5101 * caches, while we're near arc_c_min. 
Waiting here also gives the 5102 * subsequent free memory check a chance of finding that the 5103 * asynchronous reap has already freed enough memory, and we don't 5104 * need to call arc_reduce_target_size(). 5105 */ 5106 delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000); 5107 5108 /* 5109 * Reduce the target size as needed to maintain the amount of free 5110 * memory in the system at a fraction of the arc_size (1/128th by 5111 * default). If oversubscribed (free_memory < 0) then reduce the 5112 * target arc_size by the deficit amount plus the fractional 5113 * amount. If free memory is positive but less than the fractional 5114 * amount, reduce by what is needed to hit the fractional amount. 5115 */ 5116 free_memory = arc_available_memory(); 5117 5118 int64_t to_free = 5119 (arc_c >> arc_shrink_shift) - free_memory; 5120 if (to_free > 0) { 5121 #ifdef _KERNEL 5122 to_free = MAX(to_free, ptob(needfree)); 5123 #endif 5124 arc_reduce_target_size(to_free); 5125 } 5126 } 5127 5128 /* 5129 * Adapt arc info given the number of bytes we are trying to add and 5130 * the state that we are coming from. This function is only called 5131 * when we are adding new content to the cache. 5132 */ 5133 static void 5134 arc_adapt(int bytes, arc_state_t *state) 5135 { 5136 int mult; 5137 uint64_t arc_p_min = (arc_c >> arc_p_min_shift); 5138 int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); 5139 int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); 5140 5141 if (state == arc_l2c_only) 5142 return; 5143 5144 ASSERT(bytes > 0); 5145 /* 5146 * Adapt the target size of the MRU list: 5147 * - if we just hit in the MRU ghost list, then increase 5148 * the target size of the MRU list. 5149 * - if we just hit in the MFU ghost list, then increase 5150 * the target size of the MFU list by decreasing the 5151 * target size of the MRU list. 5152 */ 5153 if (state == arc_mru_ghost) { 5154 mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); 5155 mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ 5156 5157 arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); 5158 } else if (state == arc_mfu_ghost) { 5159 uint64_t delta; 5160 5161 mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); 5162 mult = MIN(mult, 10); 5163 5164 delta = MIN(bytes * mult, arc_p); 5165 arc_p = MAX(arc_p_min, arc_p - delta); 5166 } 5167 ASSERT((int64_t)arc_p >= 0); 5168 5169 /* 5170 * Wake the reap thread if we do not have any available memory. 5171 */ 5172 if (arc_reclaim_needed()) { 5173 zthr_wakeup(arc_reap_zthr); 5174 return; 5175 } 5176 5177 5178 if (arc_no_grow) 5179 return; 5180 5181 if (arc_c >= arc_c_max) 5182 return; 5183 5184 /* 5185 * If we're within (2 * maxblocksize) bytes of the target 5186 * cache size, increment the target cache size. 5187 */ 5188 if (aggsum_compare(&arc_size, arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) > 5189 0) { 5190 atomic_add_64(&arc_c, (int64_t)bytes); 5191 if (arc_c > arc_c_max) 5192 arc_c = arc_c_max; 5193 else if (state == arc_anon) 5194 atomic_add_64(&arc_p, (int64_t)bytes); 5195 if (arc_p > arc_c) 5196 arc_p = arc_c; 5197 } 5198 ASSERT((int64_t)arc_p >= 0); 5199 } 5200 5201 /* 5202 * Check if arc_size has grown past our upper threshold, determined by 5203 * zfs_arc_overflow_shift.
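 *
 * As a worked example (the values here are illustrative): with a
 * target size (arc_c) of 8 GB and zfs_arc_overflow_shift of 8, the
 * allowed slack computed below is MAX(SPA_MAXBLOCKSIZE, 8 GB >> 8) =
 * 32 MB (taking SPA_MAXBLOCKSIZE to be the smaller of the two), so the
 * ARC is considered to be overflowing once the lower bound of arc_size
 * reaches arc_c + 32 MB.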
5204 */ 5205 static boolean_t 5206 arc_is_overflowing(void) 5207 { 5208 /* Always allow at least one block of overflow */ 5209 uint64_t overflow = MAX(SPA_MAXBLOCKSIZE, 5210 arc_c >> zfs_arc_overflow_shift); 5211 5212 /* 5213 * We just compare the lower bound here for performance reasons. Our 5214 * primary goals are to make sure that the arc never grows without 5215 * bound, and that it can reach its maximum size. This check 5216 * accomplishes both goals. The maximum amount we could run over by is 5217 * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block 5218 * in the ARC. In practice, that's in the tens of MB, which is low 5219 * enough to be safe. 5220 */ 5221 return (aggsum_lower_bound(&arc_size) >= arc_c + overflow); 5222 } 5223 5224 static abd_t * 5225 arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 5226 { 5227 arc_buf_contents_t type = arc_buf_type(hdr); 5228 5229 arc_get_data_impl(hdr, size, tag); 5230 if (type == ARC_BUFC_METADATA) { 5231 return (abd_alloc(size, B_TRUE)); 5232 } else { 5233 ASSERT(type == ARC_BUFC_DATA); 5234 return (abd_alloc(size, B_FALSE)); 5235 } 5236 } 5237 5238 static void * 5239 arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 5240 { 5241 arc_buf_contents_t type = arc_buf_type(hdr); 5242 5243 arc_get_data_impl(hdr, size, tag); 5244 if (type == ARC_BUFC_METADATA) { 5245 return (zio_buf_alloc(size)); 5246 } else { 5247 ASSERT(type == ARC_BUFC_DATA); 5248 return (zio_data_buf_alloc(size)); 5249 } 5250 } 5251 5252 /* 5253 * Allocate a block and return it to the caller. If we are hitting the 5254 * hard limit for the cache size, we must sleep, waiting for the eviction 5255 * thread to catch up. If we're past the target size but below the hard 5256 * limit, we'll only signal the reclaim thread and continue on. 5257 */ 5258 static void 5259 arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 5260 { 5261 arc_state_t *state = hdr->b_l1hdr.b_state; 5262 arc_buf_contents_t type = arc_buf_type(hdr); 5263 5264 arc_adapt(size, state); 5265 5266 /* 5267 * If arc_size is currently overflowing, and has grown past our 5268 * upper limit, we must be adding data faster than the evict 5269 * thread can evict. Thus, to ensure we don't compound the 5270 * problem by adding more data and forcing arc_size to grow even 5271 * further past its target size, we halt and wait for the 5272 * eviction thread to catch up. 5273 * 5274 * It's also possible that the reclaim thread is unable to evict 5275 * enough buffers to get arc_size below the overflow limit (e.g. 5276 * due to buffers being un-evictable, or hash lock collisions). 5277 * In this case, we want to proceed regardless if we're 5278 * overflowing; thus we don't use a while loop here. 5279 */ 5280 if (arc_is_overflowing()) { 5281 mutex_enter(&arc_adjust_lock); 5282 5283 /* 5284 * Now that we've acquired the lock, we may no longer be 5285 * over the overflow limit, lets check. 5286 * 5287 * We're ignoring the case of spurious wake ups. If that 5288 * were to happen, it'd let this thread consume an ARC 5289 * buffer before it should have (i.e. before we're under 5290 * the overflow limit and were signalled by the reclaim 5291 * thread). As long as that is a rare occurrence, it 5292 * shouldn't cause any harm. 
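 *
 * In outline: having taken arc_adjust_lock above, re-check
 * arc_is_overflowing() below; if we are still overflowing, set
 * arc_adjust_needed, wake the arc_adjust_zthr, and cv_wait() on
 * arc_adjust_waiters_cv until arc_adjust_cb() broadcasts it.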
5293 */ 5294 if (arc_is_overflowing()) { 5295 arc_adjust_needed = B_TRUE; 5296 zthr_wakeup(arc_adjust_zthr); 5297 (void) cv_wait(&arc_adjust_waiters_cv, 5298 &arc_adjust_lock); 5299 } 5300 mutex_exit(&arc_adjust_lock); 5301 } 5302 5303 VERIFY3U(hdr->b_type, ==, type); 5304 if (type == ARC_BUFC_METADATA) { 5305 arc_space_consume(size, ARC_SPACE_META); 5306 } else { 5307 arc_space_consume(size, ARC_SPACE_DATA); 5308 } 5309 5310 /* 5311 * Update the state size. Note that ghost states have a 5312 * "ghost size" and so don't need to be updated. 5313 */ 5314 if (!GHOST_STATE(state)) { 5315 5316 (void) zfs_refcount_add_many(&state->arcs_size, size, tag); 5317 5318 /* 5319 * If this is reached via arc_read, the link is 5320 * protected by the hash lock. If reached via 5321 * arc_buf_alloc, the header should not be accessed by 5322 * any other thread. And, if reached via arc_read_done, 5323 * the hash lock will protect it if it's found in the 5324 * hash table; otherwise no other thread should be 5325 * trying to [add|remove]_reference it. 5326 */ 5327 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 5328 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5329 (void) zfs_refcount_add_many(&state->arcs_esize[type], 5330 size, tag); 5331 } 5332 5333 /* 5334 * If we are growing the cache, and we are adding anonymous 5335 * data, and we have outgrown arc_p, update arc_p 5336 */ 5337 if (aggsum_compare(&arc_size, arc_c) < 0 && 5338 hdr->b_l1hdr.b_state == arc_anon && 5339 (zfs_refcount_count(&arc_anon->arcs_size) + 5340 zfs_refcount_count(&arc_mru->arcs_size) > arc_p)) 5341 arc_p = MIN(arc_c, arc_p + size); 5342 } 5343 } 5344 5345 static void 5346 arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag) 5347 { 5348 arc_free_data_impl(hdr, size, tag); 5349 abd_free(abd); 5350 } 5351 5352 static void 5353 arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag) 5354 { 5355 arc_buf_contents_t type = arc_buf_type(hdr); 5356 5357 arc_free_data_impl(hdr, size, tag); 5358 if (type == ARC_BUFC_METADATA) { 5359 zio_buf_free(buf, size); 5360 } else { 5361 ASSERT(type == ARC_BUFC_DATA); 5362 zio_data_buf_free(buf, size); 5363 } 5364 } 5365 5366 /* 5367 * Free the arc data buffer. 5368 */ 5369 static void 5370 arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) 5371 { 5372 arc_state_t *state = hdr->b_l1hdr.b_state; 5373 arc_buf_contents_t type = arc_buf_type(hdr); 5374 5375 /* protected by hash lock, if in the hash table */ 5376 if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { 5377 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 5378 ASSERT(state != arc_anon && state != arc_l2c_only); 5379 5380 (void) zfs_refcount_remove_many(&state->arcs_esize[type], 5381 size, tag); 5382 } 5383 (void) zfs_refcount_remove_many(&state->arcs_size, size, tag); 5384 5385 VERIFY3U(hdr->b_type, ==, type); 5386 if (type == ARC_BUFC_METADATA) { 5387 arc_space_return(size, ARC_SPACE_META); 5388 } else { 5389 ASSERT(type == ARC_BUFC_DATA); 5390 arc_space_return(size, ARC_SPACE_DATA); 5391 } 5392 } 5393 5394 /* 5395 * This routine is called whenever a buffer is accessed. 5396 * NOTE: the hash lock is dropped in this function. 5397 */ 5398 static void 5399 arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) 5400 { 5401 clock_t now; 5402 5403 ASSERT(MUTEX_HELD(hash_lock)); 5404 ASSERT(HDR_HAS_L1HDR(hdr)); 5405 5406 if (hdr->b_l1hdr.b_state == arc_anon) { 5407 /* 5408 * This buffer is not in the cache, and does not 5409 * appear in our "ghost" list. 
Add the new buffer 5410 * to the MRU state. 5411 */ 5412 5413 ASSERT0(hdr->b_l1hdr.b_arc_access); 5414 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5415 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5416 arc_change_state(arc_mru, hdr, hash_lock); 5417 5418 } else if (hdr->b_l1hdr.b_state == arc_mru) { 5419 now = ddi_get_lbolt(); 5420 5421 /* 5422 * If this buffer is here because of a prefetch, then either: 5423 * - clear the flag if this is a "referencing" read 5424 * (any subsequent access will bump this into the MFU state). 5425 * or 5426 * - move the buffer to the head of the list if this is 5427 * another prefetch (to make it less likely to be evicted). 5428 */ 5429 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5430 if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 5431 /* link protected by hash lock */ 5432 ASSERT(multilist_link_active( 5433 &hdr->b_l1hdr.b_arc_node)); 5434 } else { 5435 arc_hdr_clear_flags(hdr, 5436 ARC_FLAG_PREFETCH | 5437 ARC_FLAG_PRESCIENT_PREFETCH); 5438 ARCSTAT_BUMP(arcstat_mru_hits); 5439 } 5440 hdr->b_l1hdr.b_arc_access = now; 5441 return; 5442 } 5443 5444 /* 5445 * This buffer has been "accessed" only once so far, 5446 * but it is still in the cache. Move it to the MFU 5447 * state. 5448 */ 5449 if (now > hdr->b_l1hdr.b_arc_access + ARC_MINTIME) { 5450 /* 5451 * More than 125ms have passed since we 5452 * instantiated this buffer. Move it to the 5453 * most frequently used state. 5454 */ 5455 hdr->b_l1hdr.b_arc_access = now; 5456 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5457 arc_change_state(arc_mfu, hdr, hash_lock); 5458 } 5459 ARCSTAT_BUMP(arcstat_mru_hits); 5460 } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) { 5461 arc_state_t *new_state; 5462 /* 5463 * This buffer has been "accessed" recently, but 5464 * was evicted from the cache. Move it to the 5465 * MFU state. 5466 */ 5467 5468 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5469 new_state = arc_mru; 5470 if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { 5471 arc_hdr_clear_flags(hdr, 5472 ARC_FLAG_PREFETCH | 5473 ARC_FLAG_PRESCIENT_PREFETCH); 5474 } 5475 DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); 5476 } else { 5477 new_state = arc_mfu; 5478 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5479 } 5480 5481 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5482 arc_change_state(new_state, hdr, hash_lock); 5483 5484 ARCSTAT_BUMP(arcstat_mru_ghost_hits); 5485 } else if (hdr->b_l1hdr.b_state == arc_mfu) { 5486 /* 5487 * This buffer has been accessed more than once and is 5488 * still in the cache. Keep it in the MFU state. 5489 * 5490 * NOTE: an add_reference() that occurred when we did 5491 * the arc_read() will have kicked this off the list. 5492 * If it was a prefetch, we will explicitly move it to 5493 * the head of the list now. 5494 */ 5495 ARCSTAT_BUMP(arcstat_mfu_hits); 5496 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5497 } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { 5498 arc_state_t *new_state = arc_mfu; 5499 /* 5500 * This buffer has been accessed more than once but has 5501 * been evicted from the cache. Move it back to the 5502 * MFU state. 5503 */ 5504 5505 if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { 5506 /* 5507 * This is a prefetch access... 5508 * move this block back to the MRU state. 
5509 */ 5510 new_state = arc_mru; 5511 } 5512 5513 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5514 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5515 arc_change_state(new_state, hdr, hash_lock); 5516 5517 ARCSTAT_BUMP(arcstat_mfu_ghost_hits); 5518 } else if (hdr->b_l1hdr.b_state == arc_l2c_only) { 5519 /* 5520 * This buffer is on the 2nd Level ARC. 5521 */ 5522 5523 hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); 5524 DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); 5525 arc_change_state(arc_mfu, hdr, hash_lock); 5526 } else { 5527 ASSERT(!"invalid arc state"); 5528 } 5529 } 5530 5531 /* 5532 * This routine is called by dbuf_hold() to update the arc_access() state 5533 * which otherwise would be skipped for entries in the dbuf cache. 5534 */ 5535 void 5536 arc_buf_access(arc_buf_t *buf) 5537 { 5538 mutex_enter(&buf->b_evict_lock); 5539 arc_buf_hdr_t *hdr = buf->b_hdr; 5540 5541 /* 5542 * Avoid taking the hash_lock when possible as an optimization. 5543 * The header must be checked again under the hash_lock in order 5544 * to handle the case where it is concurrently being released. 5545 */ 5546 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5547 mutex_exit(&buf->b_evict_lock); 5548 return; 5549 } 5550 5551 kmutex_t *hash_lock = HDR_LOCK(hdr); 5552 mutex_enter(hash_lock); 5553 5554 if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { 5555 mutex_exit(hash_lock); 5556 mutex_exit(&buf->b_evict_lock); 5557 ARCSTAT_BUMP(arcstat_access_skip); 5558 return; 5559 } 5560 5561 mutex_exit(&buf->b_evict_lock); 5562 5563 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 5564 hdr->b_l1hdr.b_state == arc_mfu); 5565 5566 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 5567 arc_access(hdr, hash_lock); 5568 mutex_exit(hash_lock); 5569 5570 ARCSTAT_BUMP(arcstat_hits); 5571 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 5572 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); 5573 } 5574 5575 /* a generic arc_read_done_func_t which you can use */ 5576 /* ARGSUSED */ 5577 void 5578 arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5579 arc_buf_t *buf, void *arg) 5580 { 5581 if (buf == NULL) 5582 return; 5583 5584 bcopy(buf->b_data, arg, arc_buf_size(buf)); 5585 arc_buf_destroy(buf, arg); 5586 } 5587 5588 /* a generic arc_read_done_func_t */ 5589 void 5590 arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, 5591 arc_buf_t *buf, void *arg) 5592 { 5593 arc_buf_t **bufp = arg; 5594 5595 if (buf == NULL) { 5596 ASSERT(zio == NULL || zio->io_error != 0); 5597 *bufp = NULL; 5598 } else { 5599 ASSERT(zio == NULL || zio->io_error == 0); 5600 *bufp = buf; 5601 ASSERT(buf->b_data != NULL); 5602 } 5603 } 5604 5605 static void 5606 arc_hdr_verify(arc_buf_hdr_t *hdr, const blkptr_t *bp) 5607 { 5608 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 5609 ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); 5610 ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); 5611 } else { 5612 if (HDR_COMPRESSION_ENABLED(hdr)) { 5613 ASSERT3U(arc_hdr_get_compress(hdr), ==, 5614 BP_GET_COMPRESS(bp)); 5615 } 5616 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 5617 ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); 5618 ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); 5619 } 5620 } 5621 5622 /* 5623 * XXX this should be changed to return an error, and callers 5624 * re-read from disk on failure (on nondebug bits). 
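 *
 * (For context, a typical synchronous consumer of arc_read() that uses
 * the generic arc_getbuf_func() callback above looks roughly like the
 * following sketch; the declarations and flags are illustrative, not
 * taken from this file:
 *
 *	arc_flags_t aflags = ARC_FLAG_WAIT;
 *	arc_buf_t *buf = NULL;
 *	int error;
 *
 *	error = arc_read(NULL, spa, bp, arc_getbuf_func, &buf,
 *	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
 *	if (error == 0) {
 *		... consume buf->b_data, arc_buf_size(buf) bytes ...
 *		arc_buf_destroy(buf, &buf);
 *	}
 *
 * Under the change suggested above, such a caller could simply retry
 * the read when this function reported a mismatch.)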
5625 */ 5626 static void 5627 arc_hdr_verify_checksum(spa_t *spa, arc_buf_hdr_t *hdr, const blkptr_t *bp) 5628 { 5629 arc_hdr_verify(hdr, bp); 5630 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) 5631 return; 5632 int err = 0; 5633 abd_t *abd = NULL; 5634 if (BP_IS_ENCRYPTED(bp)) { 5635 if (HDR_HAS_RABD(hdr)) { 5636 abd = hdr->b_crypt_hdr.b_rabd; 5637 } 5638 } else if (HDR_COMPRESSION_ENABLED(hdr)) { 5639 abd = hdr->b_l1hdr.b_pabd; 5640 } 5641 if (abd != NULL) { 5642 /* 5643 * The offset is only used for labels, which are not 5644 * cached in the ARC, so it doesn't matter what we 5645 * pass for the offset parameter. 5646 */ 5647 int psize = HDR_GET_PSIZE(hdr); 5648 err = zio_checksum_error_impl(spa, bp, 5649 BP_GET_CHECKSUM(bp), abd, psize, 0, NULL); 5650 if (err != 0) { 5651 /* 5652 * Use abd_copy_to_buf() rather than 5653 * abd_borrow_buf_copy() so that we are sure to 5654 * include the buf in crash dumps. 5655 */ 5656 void *buf = kmem_alloc(psize, KM_SLEEP); 5657 abd_copy_to_buf(buf, abd, psize); 5658 panic("checksum of cached data doesn't match BP " 5659 "err=%u hdr=%p bp=%p abd=%p buf=%p", 5660 err, (void *)hdr, (void *)bp, (void *)abd, buf); 5661 } 5662 } 5663 } 5664 5665 static void 5666 arc_read_done(zio_t *zio) 5667 { 5668 blkptr_t *bp = zio->io_bp; 5669 arc_buf_hdr_t *hdr = zio->io_private; 5670 kmutex_t *hash_lock = NULL; 5671 arc_callback_t *callback_list; 5672 arc_callback_t *acb; 5673 boolean_t freeable = B_FALSE; 5674 5675 /* 5676 * The hdr was inserted into hash-table and removed from lists 5677 * prior to starting I/O. We should find this header, since 5678 * it's in the hash table, and it should be legit since it's 5679 * not possible to evict it during the I/O. The only possible 5680 * reason for it not to be found is if we were freed during the 5681 * read. 
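 *
 * That case is handled below: hash_lock stays NULL, the header is
 * expected to be in the anonymous state, and it is destroyed after the
 * callbacks have run if no references remain.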
5682 */ 5683 if (HDR_IN_HASH_TABLE(hdr)) { 5684 ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); 5685 ASSERT3U(hdr->b_dva.dva_word[0], ==, 5686 BP_IDENTITY(zio->io_bp)->dva_word[0]); 5687 ASSERT3U(hdr->b_dva.dva_word[1], ==, 5688 BP_IDENTITY(zio->io_bp)->dva_word[1]); 5689 5690 arc_buf_hdr_t *found = buf_hash_find(hdr->b_spa, zio->io_bp, 5691 &hash_lock); 5692 5693 ASSERT((found == hdr && 5694 DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || 5695 (found == hdr && HDR_L2_READING(hdr))); 5696 ASSERT3P(hash_lock, !=, NULL); 5697 } 5698 5699 if (BP_IS_PROTECTED(bp)) { 5700 hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); 5701 hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; 5702 zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, 5703 hdr->b_crypt_hdr.b_iv); 5704 5705 if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { 5706 void *tmpbuf; 5707 5708 tmpbuf = abd_borrow_buf_copy(zio->io_abd, 5709 sizeof (zil_chain_t)); 5710 zio_crypt_decode_mac_zil(tmpbuf, 5711 hdr->b_crypt_hdr.b_mac); 5712 abd_return_buf(zio->io_abd, tmpbuf, 5713 sizeof (zil_chain_t)); 5714 } else { 5715 zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); 5716 } 5717 } 5718 5719 if (zio->io_error == 0) { 5720 /* byteswap if necessary */ 5721 if (BP_SHOULD_BYTESWAP(zio->io_bp)) { 5722 if (BP_GET_LEVEL(zio->io_bp) > 0) { 5723 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 5724 } else { 5725 hdr->b_l1hdr.b_byteswap = 5726 DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); 5727 } 5728 } else { 5729 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 5730 } 5731 } 5732 5733 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); 5734 if (l2arc_noprefetch && HDR_PREFETCH(hdr)) 5735 arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); 5736 5737 callback_list = hdr->b_l1hdr.b_acb; 5738 ASSERT3P(callback_list, !=, NULL); 5739 5740 if (hash_lock && zio->io_error == 0 && 5741 hdr->b_l1hdr.b_state == arc_anon) { 5742 /* 5743 * Only call arc_access on anonymous buffers. This is because 5744 * if we've issued an I/O for an evicted buffer, we've already 5745 * called arc_access (to prevent any simultaneous readers from 5746 * getting confused). 5747 */ 5748 arc_access(hdr, hash_lock); 5749 } 5750 5751 /* 5752 * If a read request has a callback (i.e. acb_done is not NULL), then we 5753 * make a buf containing the data according to the parameters which were 5754 * passed in. The implementation of arc_buf_alloc_impl() ensures that we 5755 * aren't needlessly decompressing the data multiple times. 5756 */ 5757 int callback_cnt = 0; 5758 for (acb = callback_list; acb != NULL; acb = acb->acb_next) { 5759 if (!acb->acb_done) 5760 continue; 5761 5762 callback_cnt++; 5763 5764 if (zio->io_error != 0) 5765 continue; 5766 5767 int error = arc_buf_alloc_impl(hdr, zio->io_spa, 5768 &acb->acb_zb, acb->acb_private, acb->acb_encrypted, 5769 acb->acb_compressed, acb->acb_noauth, B_TRUE, 5770 &acb->acb_buf); 5771 5772 /* 5773 * Assert non-speculative zios didn't fail because an 5774 * encryption key wasn't loaded 5775 */ 5776 ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || 5777 error != EACCES); 5778 5779 /* 5780 * If we failed to decrypt, report an error now (as the zio 5781 * layer would have done if it had done the transforms). 
5782 */ 5783 if (error == ECKSUM) { 5784 ASSERT(BP_IS_PROTECTED(bp)); 5785 error = SET_ERROR(EIO); 5786 if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { 5787 spa_log_error(zio->io_spa, &acb->acb_zb); 5788 zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, 5789 zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); 5790 } 5791 } 5792 5793 if (error != 0) { 5794 /* 5795 * Decompression failed. Set io_error 5796 * so that when we call acb_done (below), 5797 * we will indicate that the read failed. 5798 * Note that in the unusual case where one 5799 * callback is compressed and another 5800 * uncompressed, we will mark all of them 5801 * as failed, even though the uncompressed 5802 * one can't actually fail. In this case, 5803 * the hdr will not be anonymous, because 5804 * if there are multiple callbacks, it's 5805 * because multiple threads found the same 5806 * arc buf in the hash table. 5807 */ 5808 zio->io_error = error; 5809 } 5810 } 5811 5812 /* 5813 * If there are multiple callbacks, we must have the hash lock, 5814 * because the only way for multiple threads to find this hdr is 5815 * in the hash table. This ensures that if there are multiple 5816 * callbacks, the hdr is not anonymous. If it were anonymous, 5817 * we couldn't use arc_buf_destroy() in the error case below. 5818 */ 5819 ASSERT(callback_cnt < 2 || hash_lock != NULL); 5820 5821 hdr->b_l1hdr.b_acb = NULL; 5822 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 5823 if (callback_cnt == 0) 5824 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); 5825 5826 ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || 5827 callback_list != NULL); 5828 5829 if (zio->io_error == 0) { 5830 arc_hdr_verify(hdr, zio->io_bp); 5831 } else { 5832 arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); 5833 if (hdr->b_l1hdr.b_state != arc_anon) 5834 arc_change_state(arc_anon, hdr, hash_lock); 5835 if (HDR_IN_HASH_TABLE(hdr)) 5836 buf_hash_remove(hdr); 5837 freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5838 } 5839 5840 /* 5841 * Broadcast before we drop the hash_lock to avoid the possibility 5842 * that the hdr (and hence the cv) might be freed before we get to 5843 * the cv_broadcast(). 5844 */ 5845 cv_broadcast(&hdr->b_l1hdr.b_cv); 5846 5847 if (hash_lock != NULL) { 5848 mutex_exit(hash_lock); 5849 } else { 5850 /* 5851 * This block was freed while we waited for the read to 5852 * complete. It has been removed from the hash table and 5853 * moved to the anonymous state (so that it won't show up 5854 * in the cache). 5855 */ 5856 ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); 5857 freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); 5858 } 5859 5860 /* execute each callback and free its structure */ 5861 while ((acb = callback_list) != NULL) { 5862 5863 if (acb->acb_done != NULL) { 5864 if (zio->io_error != 0 && acb->acb_buf != NULL) { 5865 /* 5866 * If arc_buf_alloc_impl() fails during 5867 * decompression, the buf will still be 5868 * allocated, and needs to be freed here. 5869 */ 5870 arc_buf_destroy(acb->acb_buf, acb->acb_private); 5871 acb->acb_buf = NULL; 5872 } 5873 acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, 5874 acb->acb_buf, acb->acb_private); 5875 } 5876 5877 if (acb->acb_zio_dummy != NULL) { 5878 acb->acb_zio_dummy->io_error = zio->io_error; 5879 zio_nowait(acb->acb_zio_dummy); 5880 } 5881 5882 callback_list = acb->acb_next; 5883 kmem_free(acb, sizeof (arc_callback_t)); 5884 } 5885 5886 if (freeable) 5887 arc_hdr_destroy(hdr); 5888 } 5889 5890 /* 5891 * "Read" the block at the specified DVA (in bp) via the 5892 * cache. 
If the block is found in the cache, invoke the provided 5893 * callback immediately and return. Note that the `zio' parameter 5894 * in the callback will be NULL in this case, since no IO was 5895 * required. If the block is not in the cache, pass the read request 5896 * on to the spa with a substitute callback function, so that the 5897 * requested block will be added to the cache. 5898 * 5899 * If a read request arrives for a block that has a read in-progress, 5900 * either wait for the in-progress read to complete (and return the 5901 * results); or, if this is a read with a "done" func, add a record 5902 * to the read to invoke the "done" func when the read completes, 5903 * and return; or just return. 5904 * 5905 * arc_read_done() will invoke all the requested "done" functions 5906 * for readers of this block. 5907 */ 5908 int 5909 arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, 5910 void *private, zio_priority_t priority, int zio_flags, 5911 arc_flags_t *arc_flags, const zbookmark_phys_t *zb) 5912 { 5913 arc_buf_hdr_t *hdr = NULL; 5914 kmutex_t *hash_lock = NULL; 5915 zio_t *rzio; 5916 uint64_t guid = spa_load_guid(spa); 5917 boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; 5918 boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && 5919 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; 5920 boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && 5921 (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; 5922 int rc = 0; 5923 5924 ASSERT(!BP_IS_EMBEDDED(bp) || 5925 BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); 5926 5927 top: 5928 if (!BP_IS_EMBEDDED(bp)) { 5929 /* 5930 * Embedded BP's have no DVA and require no I/O to "read". 5931 * Create an anonymous arc buf to back it. 5932 */ 5933 hdr = buf_hash_find(guid, bp, &hash_lock); 5934 } 5935 5936 /* 5937 * Determine if we have an L1 cache hit or a cache miss. For simplicity 5938 * we maintain encrypted data separately from compressed / uncompressed 5939 * data. If the user is requesting raw encrypted data and we don't have 5940 * that in the header we will read from disk to guarantee that we can 5941 * get it even if the encryption keys aren't loaded. 5942 */ 5943 if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || 5944 (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { 5945 arc_buf_t *buf = NULL; 5946 *arc_flags |= ARC_FLAG_CACHED; 5947 5948 if (HDR_IO_IN_PROGRESS(hdr)) { 5949 zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; 5950 5951 ASSERT3P(head_zio, !=, NULL); 5952 if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && 5953 priority == ZIO_PRIORITY_SYNC_READ) { 5954 /* 5955 * This is a sync read that needs to wait for 5956 * an in-flight async read. Request that the 5957 * zio have its priority upgraded.
5958 */ 5959 zio_change_priority(head_zio, priority); 5960 DTRACE_PROBE1(arc__async__upgrade__sync, 5961 arc_buf_hdr_t *, hdr); 5962 ARCSTAT_BUMP(arcstat_async_upgrade_sync); 5963 } 5964 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 5965 arc_hdr_clear_flags(hdr, 5966 ARC_FLAG_PREDICTIVE_PREFETCH); 5967 } 5968 5969 if (*arc_flags & ARC_FLAG_WAIT) { 5970 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 5971 mutex_exit(hash_lock); 5972 goto top; 5973 } 5974 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 5975 5976 if (done) { 5977 arc_callback_t *acb = NULL; 5978 5979 acb = kmem_zalloc(sizeof (arc_callback_t), 5980 KM_SLEEP); 5981 acb->acb_done = done; 5982 acb->acb_private = private; 5983 acb->acb_compressed = compressed_read; 5984 acb->acb_encrypted = encrypted_read; 5985 acb->acb_noauth = noauth_read; 5986 acb->acb_zb = *zb; 5987 if (pio != NULL) 5988 acb->acb_zio_dummy = zio_null(pio, 5989 spa, NULL, NULL, NULL, zio_flags); 5990 5991 ASSERT3P(acb->acb_done, !=, NULL); 5992 acb->acb_zio_head = head_zio; 5993 acb->acb_next = hdr->b_l1hdr.b_acb; 5994 hdr->b_l1hdr.b_acb = acb; 5995 mutex_exit(hash_lock); 5996 return (0); 5997 } 5998 mutex_exit(hash_lock); 5999 return (0); 6000 } 6001 6002 ASSERT(hdr->b_l1hdr.b_state == arc_mru || 6003 hdr->b_l1hdr.b_state == arc_mfu); 6004 6005 if (done) { 6006 if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { 6007 /* 6008 * This is a demand read which does not have to 6009 * wait for i/o because we did a predictive 6010 * prefetch i/o for it, which has completed. 6011 */ 6012 DTRACE_PROBE1( 6013 arc__demand__hit__predictive__prefetch, 6014 arc_buf_hdr_t *, hdr); 6015 ARCSTAT_BUMP( 6016 arcstat_demand_hit_predictive_prefetch); 6017 arc_hdr_clear_flags(hdr, 6018 ARC_FLAG_PREDICTIVE_PREFETCH); 6019 } 6020 6021 if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { 6022 ARCSTAT_BUMP( 6023 arcstat_demand_hit_prescient_prefetch); 6024 arc_hdr_clear_flags(hdr, 6025 ARC_FLAG_PRESCIENT_PREFETCH); 6026 } 6027 6028 ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); 6029 6030 arc_hdr_verify_checksum(spa, hdr, bp); 6031 6032 /* Get a buf with the desired data in it. */ 6033 rc = arc_buf_alloc_impl(hdr, spa, zb, private, 6034 encrypted_read, compressed_read, noauth_read, 6035 B_TRUE, &buf); 6036 if (rc == ECKSUM) { 6037 /* 6038 * Convert authentication and decryption errors 6039 * to EIO (and generate an ereport if needed) 6040 * before leaving the ARC. 
6041 */ 6042 rc = SET_ERROR(EIO); 6043 if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { 6044 spa_log_error(spa, zb); 6045 zfs_ereport_post( 6046 FM_EREPORT_ZFS_AUTHENTICATION, 6047 spa, NULL, zb, NULL, 0, 0); 6048 } 6049 } 6050 if (rc != 0) { 6051 (void) remove_reference(hdr, hash_lock, 6052 private); 6053 arc_buf_destroy_impl(buf); 6054 buf = NULL; 6055 } 6056 /* assert any errors weren't due to unloaded keys */ 6057 ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || 6058 rc != EACCES); 6059 } else if (*arc_flags & ARC_FLAG_PREFETCH && 6060 zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { 6061 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 6062 } 6063 DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); 6064 arc_access(hdr, hash_lock); 6065 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) 6066 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); 6067 if (*arc_flags & ARC_FLAG_L2CACHE) 6068 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6069 mutex_exit(hash_lock); 6070 ARCSTAT_BUMP(arcstat_hits); 6071 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 6072 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 6073 data, metadata, hits); 6074 6075 if (done) 6076 done(NULL, zb, bp, buf, private); 6077 } else { 6078 uint64_t lsize = BP_GET_LSIZE(bp); 6079 uint64_t psize = BP_GET_PSIZE(bp); 6080 arc_callback_t *acb; 6081 vdev_t *vd = NULL; 6082 uint64_t addr = 0; 6083 boolean_t devw = B_FALSE; 6084 uint64_t size; 6085 abd_t *hdr_abd; 6086 6087 if (hdr == NULL) { 6088 /* this block is not in the cache */ 6089 arc_buf_hdr_t *exists = NULL; 6090 arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); 6091 hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, 6092 BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type, 6093 encrypted_read); 6094 6095 if (!BP_IS_EMBEDDED(bp)) { 6096 hdr->b_dva = *BP_IDENTITY(bp); 6097 hdr->b_birth = BP_PHYSICAL_BIRTH(bp); 6098 exists = buf_hash_insert(hdr, &hash_lock); 6099 } 6100 if (exists != NULL) { 6101 /* somebody beat us to the hash insert */ 6102 mutex_exit(hash_lock); 6103 buf_discard_identity(hdr); 6104 arc_hdr_destroy(hdr); 6105 goto top; /* restart the IO request */ 6106 } 6107 } else { 6108 /* 6109 * This block is in the ghost cache or encrypted data 6110 * was requested and we didn't have it. If it was 6111 * L2-only (and thus didn't have an L1 hdr), 6112 * we realloc the header to add an L1 hdr. 6113 */ 6114 if (!HDR_HAS_L1HDR(hdr)) { 6115 hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, 6116 hdr_full_cache); 6117 } 6118 6119 if (GHOST_STATE(hdr->b_l1hdr.b_state)) { 6120 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6121 ASSERT(!HDR_HAS_RABD(hdr)); 6122 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6123 ASSERT0(zfs_refcount_count( 6124 &hdr->b_l1hdr.b_refcnt)); 6125 ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); 6126 ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); 6127 } else if (HDR_IO_IN_PROGRESS(hdr)) { 6128 /* 6129 * If this header already had an IO in progress 6130 * and we are performing another IO to fetch 6131 * encrypted data we must wait until the first 6132 * IO completes so as not to confuse 6133 * arc_read_done(). This should be very rare 6134 * and so the performance impact shouldn't 6135 * matter. 6136 */ 6137 cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); 6138 mutex_exit(hash_lock); 6139 goto top; 6140 } 6141 6142 /* 6143 * This is a delicate dance that we play here. 6144 * This hdr might be in the ghost list so we access 6145 * it to move it out of the ghost list before we 6146 * initiate the read. If it's a prefetch then 6147 * it won't have a callback so we'll remove the 6148 * reference that arc_buf_alloc_impl() created. 
We 6149 * do this after we've called arc_access() to 6150 * avoid hitting an assert in remove_reference(). 6151 */ 6152 arc_access(hdr, hash_lock); 6153 arc_hdr_alloc_pabd(hdr, encrypted_read); 6154 } 6155 6156 if (encrypted_read) { 6157 ASSERT(HDR_HAS_RABD(hdr)); 6158 size = HDR_GET_PSIZE(hdr); 6159 hdr_abd = hdr->b_crypt_hdr.b_rabd; 6160 zio_flags |= ZIO_FLAG_RAW; 6161 } else { 6162 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 6163 size = arc_hdr_size(hdr); 6164 hdr_abd = hdr->b_l1hdr.b_pabd; 6165 6166 if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { 6167 zio_flags |= ZIO_FLAG_RAW_COMPRESS; 6168 } 6169 6170 /* 6171 * For authenticated bp's, we do not ask the ZIO layer 6172 * to authenticate them since this will cause the entire 6173 * IO to fail if the key isn't loaded. Instead, we 6174 * defer authentication until arc_buf_fill(), which will 6175 * verify the data when the key is available. 6176 */ 6177 if (BP_IS_AUTHENTICATED(bp)) 6178 zio_flags |= ZIO_FLAG_RAW_ENCRYPT; 6179 } 6180 6181 if (*arc_flags & ARC_FLAG_PREFETCH && 6182 zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) 6183 arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); 6184 if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) 6185 arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); 6186 6187 if (*arc_flags & ARC_FLAG_L2CACHE) 6188 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6189 if (BP_IS_AUTHENTICATED(bp)) 6190 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); 6191 if (BP_GET_LEVEL(bp) > 0) 6192 arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); 6193 if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) 6194 arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); 6195 ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state)); 6196 6197 acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); 6198 acb->acb_done = done; 6199 acb->acb_private = private; 6200 acb->acb_compressed = compressed_read; 6201 acb->acb_encrypted = encrypted_read; 6202 acb->acb_noauth = noauth_read; 6203 acb->acb_zb = *zb; 6204 6205 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6206 hdr->b_l1hdr.b_acb = acb; 6207 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6208 6209 if (HDR_HAS_L2HDR(hdr) && 6210 (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) { 6211 devw = hdr->b_l2hdr.b_dev->l2ad_writing; 6212 addr = hdr->b_l2hdr.b_daddr; 6213 /* 6214 * Lock out L2ARC device removal. 6215 */ 6216 if (vdev_is_dead(vd) || 6217 !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER)) 6218 vd = NULL; 6219 } 6220 6221 /* 6222 * We count both async reads and scrub IOs as asynchronous so 6223 * that both can be upgraded in the event of a cache hit while 6224 * the read IO is still in-flight. 6225 */ 6226 if (priority == ZIO_PRIORITY_ASYNC_READ || 6227 priority == ZIO_PRIORITY_SCRUB) 6228 arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 6229 else 6230 arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); 6231 6232 /* 6233 * At this point, we have a level 1 cache miss. Try again in 6234 * L2ARC if possible. 6235 */ 6236 ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize); 6237 6238 DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, 6239 uint64_t, lsize, zbookmark_phys_t *, zb); 6240 ARCSTAT_BUMP(arcstat_misses); 6241 ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), 6242 demand, prefetch, !HDR_ISTYPE_METADATA(hdr), 6243 data, metadata, misses); 6244 6245 if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { 6246 /* 6247 * Read from the L2ARC if the following are true: 6248 * 1. The L2ARC vdev was previously cached. 6249 * 2. This buffer still has L2ARC metadata. 6250 * 3. This buffer isn't currently writing to the L2ARC. 6251 * 4. 
The L2ARC entry wasn't evicted, which may 6252 * also have invalidated the vdev. 6253 * 5. This isn't prefetch and l2arc_noprefetch is set. 6254 */ 6255 if (HDR_HAS_L2HDR(hdr) && 6256 !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && 6257 !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { 6258 l2arc_read_callback_t *cb; 6259 abd_t *abd; 6260 uint64_t asize; 6261 6262 DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); 6263 ARCSTAT_BUMP(arcstat_l2_hits); 6264 6265 cb = kmem_zalloc(sizeof (l2arc_read_callback_t), 6266 KM_SLEEP); 6267 cb->l2rcb_hdr = hdr; 6268 cb->l2rcb_bp = *bp; 6269 cb->l2rcb_zb = *zb; 6270 cb->l2rcb_flags = zio_flags; 6271 6272 asize = vdev_psize_to_asize(vd, size); 6273 if (asize != size) { 6274 abd = abd_alloc_for_io(asize, 6275 HDR_ISTYPE_METADATA(hdr)); 6276 cb->l2rcb_abd = abd; 6277 } else { 6278 abd = hdr_abd; 6279 } 6280 6281 ASSERT(addr >= VDEV_LABEL_START_SIZE && 6282 addr + asize <= vd->vdev_psize - 6283 VDEV_LABEL_END_SIZE); 6284 6285 /* 6286 * l2arc read. The SCL_L2ARC lock will be 6287 * released by l2arc_read_done(). 6288 * Issue a null zio if the underlying buffer 6289 * was squashed to zero size by compression. 6290 */ 6291 ASSERT3U(arc_hdr_get_compress(hdr), !=, 6292 ZIO_COMPRESS_EMPTY); 6293 rzio = zio_read_phys(pio, vd, addr, 6294 asize, abd, 6295 ZIO_CHECKSUM_OFF, 6296 l2arc_read_done, cb, priority, 6297 zio_flags | ZIO_FLAG_DONT_CACHE | 6298 ZIO_FLAG_CANFAIL | 6299 ZIO_FLAG_DONT_PROPAGATE | 6300 ZIO_FLAG_DONT_RETRY, B_FALSE); 6301 acb->acb_zio_head = rzio; 6302 6303 if (hash_lock != NULL) 6304 mutex_exit(hash_lock); 6305 6306 DTRACE_PROBE2(l2arc__read, vdev_t *, vd, 6307 zio_t *, rzio); 6308 ARCSTAT_INCR(arcstat_l2_read_bytes, 6309 HDR_GET_PSIZE(hdr)); 6310 6311 if (*arc_flags & ARC_FLAG_NOWAIT) { 6312 zio_nowait(rzio); 6313 return (0); 6314 } 6315 6316 ASSERT(*arc_flags & ARC_FLAG_WAIT); 6317 if (zio_wait(rzio) == 0) 6318 return (0); 6319 6320 /* l2arc read error; goto zio_read() */ 6321 if (hash_lock != NULL) 6322 mutex_enter(hash_lock); 6323 } else { 6324 DTRACE_PROBE1(l2arc__miss, 6325 arc_buf_hdr_t *, hdr); 6326 ARCSTAT_BUMP(arcstat_l2_misses); 6327 if (HDR_L2_WRITING(hdr)) 6328 ARCSTAT_BUMP(arcstat_l2_rw_clash); 6329 spa_config_exit(spa, SCL_L2ARC, vd); 6330 } 6331 } else { 6332 if (vd != NULL) 6333 spa_config_exit(spa, SCL_L2ARC, vd); 6334 if (l2arc_ndev != 0) { 6335 DTRACE_PROBE1(l2arc__miss, 6336 arc_buf_hdr_t *, hdr); 6337 ARCSTAT_BUMP(arcstat_l2_misses); 6338 } 6339 } 6340 6341 rzio = zio_read(pio, spa, bp, hdr_abd, size, 6342 arc_read_done, hdr, priority, zio_flags, zb); 6343 acb->acb_zio_head = rzio; 6344 6345 if (hash_lock != NULL) 6346 mutex_exit(hash_lock); 6347 6348 if (*arc_flags & ARC_FLAG_WAIT) 6349 return (zio_wait(rzio)); 6350 6351 ASSERT(*arc_flags & ARC_FLAG_NOWAIT); 6352 zio_nowait(rzio); 6353 } 6354 return (rc); 6355 } 6356 6357 /* 6358 * Notify the arc that a block was freed, and thus will never be used again. 6359 */ 6360 void 6361 arc_freed(spa_t *spa, const blkptr_t *bp) 6362 { 6363 arc_buf_hdr_t *hdr; 6364 kmutex_t *hash_lock; 6365 uint64_t guid = spa_load_guid(spa); 6366 6367 ASSERT(!BP_IS_EMBEDDED(bp)); 6368 6369 hdr = buf_hash_find(guid, bp, &hash_lock); 6370 if (hdr == NULL) 6371 return; 6372 6373 /* 6374 * We might be trying to free a block that is still doing I/O 6375 * (i.e. prefetch) or has a reference (i.e. a dedup-ed, 6376 * dmu_sync-ed block). If this block is being prefetched, then it 6377 * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr 6378 * until the I/O completes. 
A block may also have a reference if it is 6379 * part of a dedup-ed, dmu_synced write. The dmu_sync() function would 6380 * have written the new block to its final resting place on disk but 6381 * without the dedup flag set. This would have left the hdr in the MRU 6382 * state and discoverable. When the txg finally syncs it detects that 6383 * the block was overridden in open context and issues an override I/O. 6384 * Since this is a dedup block, the override I/O will determine if the 6385 * block is already in the DDT. If so, then it will replace the io_bp 6386 * with the bp from the DDT and allow the I/O to finish. When the I/O 6387 * reaches the done callback, dbuf_write_override_done, it will 6388 * check to see if the io_bp and io_bp_override are identical. 6389 * If they are not, then it indicates that the bp was replaced with 6390 * the bp in the DDT and the override bp is freed. This allows 6391 * us to arrive here with a reference on a block that is being 6392 * freed. So if we have an I/O in progress, or a reference to 6393 * this hdr, then we don't destroy the hdr. 6394 */ 6395 if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) && 6396 zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) { 6397 arc_change_state(arc_anon, hdr, hash_lock); 6398 arc_hdr_destroy(hdr); 6399 mutex_exit(hash_lock); 6400 } else { 6401 mutex_exit(hash_lock); 6402 } 6403 6404 } 6405 6406 /* 6407 * Release this buffer from the cache, making it an anonymous buffer. This 6408 * must be done after a read and prior to modifying the buffer contents. 6409 * If the buffer has more than one reference, we must make 6410 * a new hdr for the buffer. 6411 */ 6412 void 6413 arc_release(arc_buf_t *buf, void *tag) 6414 { 6415 arc_buf_hdr_t *hdr = buf->b_hdr; 6416 6417 /* 6418 * It would be nice to assert that if its DMU metadata (level > 6419 * 0 || it's the dnode file), then it must be syncing context. 6420 * But we don't know that information at this level. 6421 */ 6422 6423 mutex_enter(&buf->b_evict_lock); 6424 6425 ASSERT(HDR_HAS_L1HDR(hdr)); 6426 6427 /* 6428 * We don't grab the hash lock prior to this check, because if 6429 * the buffer's header is in the arc_anon state, it won't be 6430 * linked into the hash table. 6431 */ 6432 if (hdr->b_l1hdr.b_state == arc_anon) { 6433 mutex_exit(&buf->b_evict_lock); 6434 /* 6435 * If we are called from dmu_convert_mdn_block_to_raw(), 6436 * a write might be in progress. This is OK because 6437 * the caller won't change the content of this buffer, 6438 * only the flags (via arc_convert_to_raw()). 6439 */ 6440 /* ASSERT(!HDR_IO_IN_PROGRESS(hdr)); */ 6441 ASSERT(!HDR_IN_HASH_TABLE(hdr)); 6442 ASSERT(!HDR_HAS_L2HDR(hdr)); 6443 ASSERT(HDR_EMPTY(hdr)); 6444 6445 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 6446 ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1); 6447 ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node)); 6448 6449 hdr->b_l1hdr.b_arc_access = 0; 6450 6451 /* 6452 * If the buf is being overridden then it may already 6453 * have a hdr that is not empty. 6454 */ 6455 buf_discard_identity(hdr); 6456 arc_buf_thaw(buf); 6457 6458 return; 6459 } 6460 6461 kmutex_t *hash_lock = HDR_LOCK(hdr); 6462 mutex_enter(hash_lock); 6463 6464 /* 6465 * This assignment is only valid as long as the hash_lock is 6466 * held, we must be careful not to reference state or the 6467 * b_state field after dropping the lock. 
6468 */ 6469 arc_state_t *state = hdr->b_l1hdr.b_state; 6470 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 6471 ASSERT3P(state, !=, arc_anon); 6472 6473 /* this buffer is not on any list */ 6474 ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); 6475 6476 if (HDR_HAS_L2HDR(hdr)) { 6477 mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); 6478 6479 /* 6480 * We have to recheck this conditional again now that 6481 * we're holding the l2ad_mtx to prevent a race with 6482 * another thread which might be concurrently calling 6483 * l2arc_evict(). In that case, l2arc_evict() might have 6484 * destroyed the header's L2 portion as we were waiting 6485 * to acquire the l2ad_mtx. 6486 */ 6487 if (HDR_HAS_L2HDR(hdr)) 6488 arc_hdr_l2hdr_destroy(hdr); 6489 6490 mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); 6491 } 6492 6493 /* 6494 * Do we have more than one buf? 6495 */ 6496 if (hdr->b_l1hdr.b_bufcnt > 1) { 6497 arc_buf_hdr_t *nhdr; 6498 uint64_t spa = hdr->b_spa; 6499 uint64_t psize = HDR_GET_PSIZE(hdr); 6500 uint64_t lsize = HDR_GET_LSIZE(hdr); 6501 boolean_t protected = HDR_PROTECTED(hdr); 6502 enum zio_compress compress = arc_hdr_get_compress(hdr); 6503 arc_buf_contents_t type = arc_buf_type(hdr); 6504 VERIFY3U(hdr->b_type, ==, type); 6505 6506 ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); 6507 (void) remove_reference(hdr, hash_lock, tag); 6508 6509 if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { 6510 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 6511 ASSERT(ARC_BUF_LAST(buf)); 6512 } 6513 6514 /* 6515 * Pull the data off of this hdr and attach it to 6516 * a new anonymous hdr. Also find the last buffer 6517 * in the hdr's buffer list. 6518 */ 6519 arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); 6520 ASSERT3P(lastbuf, !=, NULL); 6521 6522 /* 6523 * If the current arc_buf_t and the hdr are sharing their data 6524 * buffer, then we must stop sharing that block. 6525 */ 6526 if (arc_buf_is_shared(buf)) { 6527 ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); 6528 VERIFY(!arc_buf_is_shared(lastbuf)); 6529 6530 /* 6531 * First, sever the block sharing relationship between 6532 * buf and the arc_buf_hdr_t. 6533 */ 6534 arc_unshare_buf(hdr, buf); 6535 6536 /* 6537 * Now we need to recreate the hdr's b_pabd. Since we 6538 * have lastbuf handy, we try to share with it, but if 6539 * we can't then we allocate a new b_pabd and copy the 6540 * data from buf into it. 6541 */ 6542 if (arc_can_share(hdr, lastbuf)) { 6543 arc_share_buf(hdr, lastbuf); 6544 } else { 6545 arc_hdr_alloc_pabd(hdr, B_FALSE); 6546 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, 6547 buf->b_data, psize); 6548 } 6549 VERIFY3P(lastbuf->b_data, !=, NULL); 6550 } else if (HDR_SHARED_DATA(hdr)) { 6551 /* 6552 * Uncompressed shared buffers are always at the end 6553 * of the list. Compressed buffers don't have the 6554 * same requirements. This makes it hard to 6555 * simply assert that the lastbuf is shared so 6556 * we rely on the hdr's compression flags to determine 6557 * if we have a compressed, shared buffer. 
6558 */ 6559 ASSERT(arc_buf_is_shared(lastbuf) || 6560 arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); 6561 ASSERT(!ARC_BUF_SHARED(buf)); 6562 } 6563 ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); 6564 ASSERT3P(state, !=, arc_l2c_only); 6565 6566 (void) zfs_refcount_remove_many(&state->arcs_size, 6567 arc_buf_size(buf), buf); 6568 6569 if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) { 6570 ASSERT3P(state, !=, arc_l2c_only); 6571 (void) zfs_refcount_remove_many( 6572 &state->arcs_esize[type], 6573 arc_buf_size(buf), buf); 6574 } 6575 6576 hdr->b_l1hdr.b_bufcnt -= 1; 6577 if (ARC_BUF_ENCRYPTED(buf)) 6578 hdr->b_crypt_hdr.b_ebufcnt -= 1; 6579 6580 arc_cksum_verify(buf); 6581 arc_buf_unwatch(buf); 6582 6583 /* if this is the last uncompressed buf free the checksum */ 6584 if (!arc_hdr_has_uncompressed_buf(hdr)) 6585 arc_cksum_free(hdr); 6586 6587 mutex_exit(hash_lock); 6588 6589 /* 6590 * Allocate a new hdr. The new hdr will contain a b_pabd 6591 * buffer which will be freed in arc_write(). 6592 */ 6593 nhdr = arc_hdr_alloc(spa, psize, lsize, protected, 6594 compress, type, HDR_HAS_RABD(hdr)); 6595 ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); 6596 ASSERT0(nhdr->b_l1hdr.b_bufcnt); 6597 ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); 6598 VERIFY3U(nhdr->b_type, ==, type); 6599 ASSERT(!HDR_SHARED_DATA(nhdr)); 6600 6601 nhdr->b_l1hdr.b_buf = buf; 6602 nhdr->b_l1hdr.b_bufcnt = 1; 6603 if (ARC_BUF_ENCRYPTED(buf)) 6604 nhdr->b_crypt_hdr.b_ebufcnt = 1; 6605 (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); 6606 buf->b_hdr = nhdr; 6607 6608 mutex_exit(&buf->b_evict_lock); 6609 (void) zfs_refcount_add_many(&arc_anon->arcs_size, 6610 arc_buf_size(buf), buf); 6611 } else { 6612 mutex_exit(&buf->b_evict_lock); 6613 ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1); 6614 /* protected by hash lock, or hdr is on arc_anon */ 6615 ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); 6616 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6617 arc_change_state(arc_anon, hdr, hash_lock); 6618 hdr->b_l1hdr.b_arc_access = 0; 6619 6620 mutex_exit(hash_lock); 6621 buf_discard_identity(hdr); 6622 arc_buf_thaw(buf); 6623 } 6624 } 6625 6626 int 6627 arc_released(arc_buf_t *buf) 6628 { 6629 int released; 6630 6631 mutex_enter(&buf->b_evict_lock); 6632 released = (buf->b_data != NULL && 6633 buf->b_hdr->b_l1hdr.b_state == arc_anon); 6634 mutex_exit(&buf->b_evict_lock); 6635 return (released); 6636 } 6637 6638 #ifdef ZFS_DEBUG 6639 int 6640 arc_referenced(arc_buf_t *buf) 6641 { 6642 int referenced; 6643 6644 mutex_enter(&buf->b_evict_lock); 6645 referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt)); 6646 mutex_exit(&buf->b_evict_lock); 6647 return (referenced); 6648 } 6649 #endif 6650 6651 static void 6652 arc_write_ready(zio_t *zio) 6653 { 6654 arc_write_callback_t *callback = zio->io_private; 6655 arc_buf_t *buf = callback->awcb_buf; 6656 arc_buf_hdr_t *hdr = buf->b_hdr; 6657 blkptr_t *bp = zio->io_bp; 6658 uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); 6659 6660 ASSERT(HDR_HAS_L1HDR(hdr)); 6661 ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); 6662 ASSERT(hdr->b_l1hdr.b_bufcnt > 0); 6663 6664 /* 6665 * If we're reexecuting this zio because the pool suspended, then 6666 * cleanup any state that was previously set the first time the 6667 * callback was invoked. 
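 *
 * Concretely, that means freeing the header's cached checksum,
 * un-watching and (if necessary) un-sharing the buffer, and freeing
 * b_pabd and b_rabd so that the fill logic below starts from an empty
 * header.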
6668 */ 6669 if (zio->io_flags & ZIO_FLAG_REEXECUTED) { 6670 arc_cksum_free(hdr); 6671 arc_buf_unwatch(buf); 6672 if (hdr->b_l1hdr.b_pabd != NULL) { 6673 if (arc_buf_is_shared(buf)) { 6674 arc_unshare_buf(hdr, buf); 6675 } else { 6676 arc_hdr_free_pabd(hdr, B_FALSE); 6677 } 6678 } 6679 6680 if (HDR_HAS_RABD(hdr)) 6681 arc_hdr_free_pabd(hdr, B_TRUE); 6682 } 6683 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6684 ASSERT(!HDR_HAS_RABD(hdr)); 6685 ASSERT(!HDR_SHARED_DATA(hdr)); 6686 ASSERT(!arc_buf_is_shared(buf)); 6687 6688 callback->awcb_ready(zio, buf, callback->awcb_private); 6689 6690 if (HDR_IO_IN_PROGRESS(hdr)) 6691 ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); 6692 6693 arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6694 6695 if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) 6696 hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); 6697 6698 if (BP_IS_PROTECTED(bp)) { 6699 /* ZIL blocks are written through zio_rewrite */ 6700 ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); 6701 ASSERT(HDR_PROTECTED(hdr)); 6702 6703 if (BP_SHOULD_BYTESWAP(bp)) { 6704 if (BP_GET_LEVEL(bp) > 0) { 6705 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; 6706 } else { 6707 hdr->b_l1hdr.b_byteswap = 6708 DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); 6709 } 6710 } else { 6711 hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; 6712 } 6713 6714 hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); 6715 hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; 6716 zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, 6717 hdr->b_crypt_hdr.b_iv); 6718 zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); 6719 } 6720 6721 /* 6722 * If this block was written for raw encryption but the zio layer 6723 * ended up only authenticating it, adjust the buffer flags now. 6724 */ 6725 if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { 6726 arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); 6727 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; 6728 if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) 6729 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 6730 } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { 6731 buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; 6732 buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; 6733 } 6734 6735 /* this must be done after the buffer flags are adjusted */ 6736 arc_cksum_compute(buf); 6737 6738 enum zio_compress compress; 6739 if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { 6740 compress = ZIO_COMPRESS_OFF; 6741 } else { 6742 ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); 6743 compress = BP_GET_COMPRESS(bp); 6744 } 6745 HDR_SET_PSIZE(hdr, psize); 6746 arc_hdr_set_compress(hdr, compress); 6747 6748 if (zio->io_error != 0 || psize == 0) 6749 goto out; 6750 6751 /* 6752 * Fill the hdr with data. If the buffer is encrypted we have no choice 6753 * but to copy the data into b_rabd. If the hdr is compressed, the data 6754 * we want is available from the zio, otherwise we can take it from 6755 * the buf. 6756 * 6757 * We might be able to share the buf's data with the hdr here. However, 6758 * doing so would cause the ARC to be full of linear ABDs if we write a 6759 * lot of shareable data. As a compromise, we check whether scattered 6760 * ABDs are allowed, and assume that if they are then the user wants 6761 * the ARC to be primarily filled with them regardless of the data being 6762 * written. Therefore, if they're allowed then we allocate one and copy 6763 * the data into it; otherwise, we share the data directly if we can. 
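 *
 * The resulting order of preference below is roughly: an encrypted
 * buffer is always copied into b_rabd; otherwise, if scattered ABDs
 * are enabled or sharing is not possible, the data is copied into
 * b_rabd or b_pabd (from the zio if the on-disk block is encrypted or
 * the hdr is compressed, from the buf otherwise); otherwise the buf's
 * data is shared with the hdr.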
6764 */ 6765 if (ARC_BUF_ENCRYPTED(buf)) { 6766 ASSERT3U(psize, >, 0); 6767 ASSERT(ARC_BUF_COMPRESSED(buf)); 6768 arc_hdr_alloc_pabd(hdr, B_TRUE); 6769 abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); 6770 } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { 6771 /* 6772 * Ideally, we would always copy the io_abd into b_pabd, but the 6773 * user may have disabled compressed ARC, thus we must check the 6774 * hdr's compression setting rather than the io_bp's. 6775 */ 6776 if (BP_IS_ENCRYPTED(bp)) { 6777 ASSERT3U(psize, >, 0); 6778 arc_hdr_alloc_pabd(hdr, B_TRUE); 6779 abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); 6780 } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && 6781 !ARC_BUF_COMPRESSED(buf)) { 6782 ASSERT3U(psize, >, 0); 6783 arc_hdr_alloc_pabd(hdr, B_FALSE); 6784 abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); 6785 } else { 6786 ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); 6787 arc_hdr_alloc_pabd(hdr, B_FALSE); 6788 abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, 6789 arc_buf_size(buf)); 6790 } 6791 } else { 6792 ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); 6793 ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); 6794 ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); 6795 arc_share_buf(hdr, buf); 6796 } 6797 6798 out: 6799 arc_hdr_verify(hdr, bp); 6800 } 6801 6802 static void 6803 arc_write_children_ready(zio_t *zio) 6804 { 6805 arc_write_callback_t *callback = zio->io_private; 6806 arc_buf_t *buf = callback->awcb_buf; 6807 6808 callback->awcb_children_ready(zio, buf, callback->awcb_private); 6809 } 6810 6811 /* 6812 * The SPA calls this callback for each physical write that happens on behalf 6813 * of a logical write. See the comment in dbuf_write_physdone() for details. 6814 */ 6815 static void 6816 arc_write_physdone(zio_t *zio) 6817 { 6818 arc_write_callback_t *cb = zio->io_private; 6819 if (cb->awcb_physdone != NULL) 6820 cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); 6821 } 6822 6823 static void 6824 arc_write_done(zio_t *zio) 6825 { 6826 arc_write_callback_t *callback = zio->io_private; 6827 arc_buf_t *buf = callback->awcb_buf; 6828 arc_buf_hdr_t *hdr = buf->b_hdr; 6829 6830 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6831 6832 if (zio->io_error == 0) { 6833 arc_hdr_verify(hdr, zio->io_bp); 6834 6835 if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { 6836 buf_discard_identity(hdr); 6837 } else { 6838 hdr->b_dva = *BP_IDENTITY(zio->io_bp); 6839 hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); 6840 } 6841 } else { 6842 ASSERT(HDR_EMPTY(hdr)); 6843 } 6844 6845 /* 6846 * If the block to be written was all-zero or compressed enough to be 6847 * embedded in the BP, no write was performed so there will be no 6848 * dva/birth/checksum. The buffer must therefore remain anonymous 6849 * (and uncached). 6850 */ 6851 if (!HDR_EMPTY(hdr)) { 6852 arc_buf_hdr_t *exists; 6853 kmutex_t *hash_lock; 6854 6855 ASSERT3U(zio->io_error, ==, 0); 6856 6857 arc_cksum_verify(buf); 6858 6859 exists = buf_hash_insert(hdr, &hash_lock); 6860 if (exists != NULL) { 6861 /* 6862 * This can only happen if we overwrite for 6863 * sync-to-convergence, because we remove 6864 * buffers from the hash table when we arc_free(). 
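 * Three flavors of "already hashed" are handled below:
 * sync-to-convergence rewrites (the stale header is evicted and this
 * one re-inserted), nopwrites (the on-disk copy was left untouched),
 * and dedup writes (several anonymous headers can legitimately
 * describe the same on-disk block).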
6865 */ 6866 if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { 6867 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6868 panic("bad overwrite, hdr=%p exists=%p", 6869 (void *)hdr, (void *)exists); 6870 ASSERT(zfs_refcount_is_zero( 6871 &exists->b_l1hdr.b_refcnt)); 6872 arc_change_state(arc_anon, exists, hash_lock); 6873 mutex_exit(hash_lock); 6874 arc_hdr_destroy(exists); 6875 exists = buf_hash_insert(hdr, &hash_lock); 6876 ASSERT3P(exists, ==, NULL); 6877 } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { 6878 /* nopwrite */ 6879 ASSERT(zio->io_prop.zp_nopwrite); 6880 if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) 6881 panic("bad nopwrite, hdr=%p exists=%p", 6882 (void *)hdr, (void *)exists); 6883 } else { 6884 /* Dedup */ 6885 ASSERT(hdr->b_l1hdr.b_bufcnt == 1); 6886 ASSERT(hdr->b_l1hdr.b_state == arc_anon); 6887 ASSERT(BP_GET_DEDUP(zio->io_bp)); 6888 ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); 6889 } 6890 } 6891 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6892 /* if it's not anon, we are doing a scrub */ 6893 if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) 6894 arc_access(hdr, hash_lock); 6895 mutex_exit(hash_lock); 6896 } else { 6897 arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); 6898 } 6899 6900 ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); 6901 callback->awcb_done(zio, buf, callback->awcb_private); 6902 6903 abd_put(zio->io_abd); 6904 kmem_free(callback, sizeof (arc_write_callback_t)); 6905 } 6906 6907 zio_t * 6908 arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, 6909 boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, 6910 arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, 6911 arc_write_done_func_t *done, void *private, zio_priority_t priority, 6912 int zio_flags, const zbookmark_phys_t *zb) 6913 { 6914 arc_buf_hdr_t *hdr = buf->b_hdr; 6915 arc_write_callback_t *callback; 6916 zio_t *zio; 6917 zio_prop_t localprop = *zp; 6918 6919 ASSERT3P(ready, !=, NULL); 6920 ASSERT3P(done, !=, NULL); 6921 ASSERT(!HDR_IO_ERROR(hdr)); 6922 ASSERT(!HDR_IO_IN_PROGRESS(hdr)); 6923 ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); 6924 ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); 6925 if (l2arc) 6926 arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); 6927 6928 if (ARC_BUF_ENCRYPTED(buf)) { 6929 ASSERT(ARC_BUF_COMPRESSED(buf)); 6930 localprop.zp_encrypt = B_TRUE; 6931 localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6932 /* CONSTCOND */ 6933 localprop.zp_byteorder = 6934 (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? 
6935 ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; 6936 bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, 6937 ZIO_DATA_SALT_LEN); 6938 bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, 6939 ZIO_DATA_IV_LEN); 6940 bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, 6941 ZIO_DATA_MAC_LEN); 6942 if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { 6943 localprop.zp_nopwrite = B_FALSE; 6944 localprop.zp_copies = 6945 MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); 6946 } 6947 zio_flags |= ZIO_FLAG_RAW; 6948 } else if (ARC_BUF_COMPRESSED(buf)) { 6949 ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); 6950 localprop.zp_compress = HDR_GET_COMPRESS(hdr); 6951 zio_flags |= ZIO_FLAG_RAW_COMPRESS; 6952 } 6953 6954 callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); 6955 callback->awcb_ready = ready; 6956 callback->awcb_children_ready = children_ready; 6957 callback->awcb_physdone = physdone; 6958 callback->awcb_done = done; 6959 callback->awcb_private = private; 6960 callback->awcb_buf = buf; 6961 6962 /* 6963 * The hdr's b_pabd is now stale, free it now. A new data block 6964 * will be allocated when the zio pipeline calls arc_write_ready(). 6965 */ 6966 if (hdr->b_l1hdr.b_pabd != NULL) { 6967 /* 6968 * If the buf is currently sharing the data block with 6969 * the hdr then we need to break that relationship here. 6970 * The hdr will remain with a NULL data pointer and the 6971 * buf will take sole ownership of the block. 6972 */ 6973 if (arc_buf_is_shared(buf)) { 6974 arc_unshare_buf(hdr, buf); 6975 } else { 6976 arc_hdr_free_pabd(hdr, B_FALSE); 6977 } 6978 VERIFY3P(buf->b_data, !=, NULL); 6979 } 6980 6981 if (HDR_HAS_RABD(hdr)) 6982 arc_hdr_free_pabd(hdr, B_TRUE); 6983 6984 if (!(zio_flags & ZIO_FLAG_RAW)) 6985 arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); 6986 6987 ASSERT(!arc_buf_is_shared(buf)); 6988 ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); 6989 6990 zio = zio_write(pio, spa, txg, bp, 6991 abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)), 6992 HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready, 6993 (children_ready != NULL) ? arc_write_children_ready : NULL, 6994 arc_write_physdone, arc_write_done, callback, 6995 priority, zio_flags, zb); 6996 6997 return (zio); 6998 } 6999 7000 static int 7001 arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) 7002 { 7003 #ifdef _KERNEL 7004 uint64_t available_memory = ptob(freemem); 7005 7006 #if defined(__i386) 7007 available_memory = 7008 MIN(available_memory, vmem_size(heap_arena, VMEM_FREE)); 7009 #endif 7010 7011 if (freemem > physmem * arc_lotsfree_percent / 100) 7012 return (0); 7013 7014 if (txg > spa->spa_lowmem_last_txg) { 7015 spa->spa_lowmem_last_txg = txg; 7016 spa->spa_lowmem_page_load = 0; 7017 } 7018 /* 7019 * If we are in pageout, we know that memory is already tight, 7020 * the arc is already going to be evicting, so we just want to 7021 * continue to let page writes occur as quickly as possible. 
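 * As a rough illustration (numbers are made up): with about 4 GB of
 * available memory, pageout can accumulate roughly 1 GB of
 * spa_lowmem_page_load (each reservation is counted at reserve / 8,
 * since reserve is inflated) before further reservations in this txg
 * are turned back with ERESTART.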
7022 */ 7023 if (curproc == proc_pageout) { 7024 if (spa->spa_lowmem_page_load > 7025 MAX(ptob(minfree), available_memory) / 4) 7026 return (SET_ERROR(ERESTART)); 7027 /* Note: reserve is inflated, so we deflate */ 7028 atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); 7029 return (0); 7030 } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { 7031 /* memory is low, delay before restarting */ 7032 ARCSTAT_INCR(arcstat_memory_throttle_count, 1); 7033 return (SET_ERROR(EAGAIN)); 7034 } 7035 spa->spa_lowmem_page_load = 0; 7036 #endif /* _KERNEL */ 7037 return (0); 7038 } 7039 7040 void 7041 arc_tempreserve_clear(uint64_t reserve) 7042 { 7043 atomic_add_64(&arc_tempreserve, -reserve); 7044 ASSERT((int64_t)arc_tempreserve >= 0); 7045 } 7046 7047 int 7048 arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg) 7049 { 7050 int error; 7051 uint64_t anon_size; 7052 7053 if (reserve > arc_c/4 && !arc_no_grow) 7054 arc_c = MIN(arc_c_max, reserve * 4); 7055 if (reserve > arc_c) 7056 return (SET_ERROR(ENOMEM)); 7057 7058 /* 7059 * Don't count loaned bufs as in flight dirty data to prevent long 7060 * network delays from blocking transactions that are ready to be 7061 * assigned to a txg. 7062 */ 7063 7064 /* assert that it has not wrapped around */ 7065 ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0); 7066 7067 anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) - 7068 arc_loaned_bytes), 0); 7069 7070 /* 7071 * Writes will, almost always, require additional memory allocations 7072 * in order to compress/encrypt/etc the data. We therefore need to 7073 * make sure that there is sufficient available memory for this. 7074 */ 7075 error = arc_memory_throttle(spa, reserve, txg); 7076 if (error != 0) 7077 return (error); 7078 7079 /* 7080 * Throttle writes when the amount of dirty data in the cache 7081 * gets too large. We try to keep the cache less than half full 7082 * of dirty blocks so that our sync times don't grow too large. 7083 * 7084 * In the case of one pool being built on another pool, we want 7085 * to make sure we don't end up throttling the lower (backing) 7086 * pool when the upper pool is the majority contributor to dirty 7087 * data. To insure we make forward progress during throttling, we 7088 * also check the current pool's net dirty data and only throttle 7089 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty 7090 * data in the cache. 7091 * 7092 * Note: if two requests come in concurrently, we might let them 7093 * both succeed, when one of them should fail. Not a huge deal. 
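 * As a worked example (illustrative tunable values of 50%, 25% and 20%
 * with arc_c at 8 GB): the reservation is only rejected with ERESTART
 * when total in-flight dirty data exceeds 4 GB, anonymous data alone
 * exceeds 2 GB, and this pool accounts for more than 20% of that
 * anonymous dirty data; all three conditions must hold.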
7094 */ 7095 uint64_t total_dirty = reserve + arc_tempreserve + anon_size; 7096 uint64_t spa_dirty_anon = spa_dirty_data(spa); 7097 7098 if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && 7099 anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && 7100 spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { 7101 uint64_t meta_esize = 7102 zfs_refcount_count( 7103 &arc_anon->arcs_esize[ARC_BUFC_METADATA]); 7104 uint64_t data_esize = 7105 zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 7106 dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " 7107 "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", 7108 arc_tempreserve >> 10, meta_esize >> 10, 7109 data_esize >> 10, reserve >> 10, arc_c >> 10); 7110 return (SET_ERROR(ERESTART)); 7111 } 7112 atomic_add_64(&arc_tempreserve, reserve); 7113 return (0); 7114 } 7115 7116 static void 7117 arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, 7118 kstat_named_t *evict_data, kstat_named_t *evict_metadata) 7119 { 7120 size->value.ui64 = zfs_refcount_count(&state->arcs_size); 7121 evict_data->value.ui64 = 7122 zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); 7123 evict_metadata->value.ui64 = 7124 zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); 7125 } 7126 7127 static int 7128 arc_kstat_update(kstat_t *ksp, int rw) 7129 { 7130 arc_stats_t *as = ksp->ks_data; 7131 7132 if (rw == KSTAT_WRITE) { 7133 return (EACCES); 7134 } else { 7135 arc_kstat_update_state(arc_anon, 7136 &as->arcstat_anon_size, 7137 &as->arcstat_anon_evictable_data, 7138 &as->arcstat_anon_evictable_metadata); 7139 arc_kstat_update_state(arc_mru, 7140 &as->arcstat_mru_size, 7141 &as->arcstat_mru_evictable_data, 7142 &as->arcstat_mru_evictable_metadata); 7143 arc_kstat_update_state(arc_mru_ghost, 7144 &as->arcstat_mru_ghost_size, 7145 &as->arcstat_mru_ghost_evictable_data, 7146 &as->arcstat_mru_ghost_evictable_metadata); 7147 arc_kstat_update_state(arc_mfu, 7148 &as->arcstat_mfu_size, 7149 &as->arcstat_mfu_evictable_data, 7150 &as->arcstat_mfu_evictable_metadata); 7151 arc_kstat_update_state(arc_mfu_ghost, 7152 &as->arcstat_mfu_ghost_size, 7153 &as->arcstat_mfu_ghost_evictable_data, 7154 &as->arcstat_mfu_ghost_evictable_metadata); 7155 7156 ARCSTAT(arcstat_size) = aggsum_value(&arc_size); 7157 ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); 7158 ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); 7159 ARCSTAT(arcstat_metadata_size) = 7160 aggsum_value(&astat_metadata_size); 7161 ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); 7162 ARCSTAT(arcstat_other_size) = aggsum_value(&astat_other_size); 7163 ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); 7164 } 7165 7166 return (0); 7167 } 7168 7169 /* 7170 * This function *must* return indices evenly distributed between all 7171 * sublists of the multilist. This is needed due to how the ARC eviction 7172 * code is laid out; arc_evict_state() assumes ARC buffers are evenly 7173 * distributed between all sublists and uses this assumption when 7174 * deciding which sublist to evict from and how much to evict from it. 7175 */ 7176 unsigned int 7177 arc_state_multilist_index_func(multilist_t *ml, void *obj) 7178 { 7179 arc_buf_hdr_t *hdr = obj; 7180 7181 /* 7182 * We rely on b_dva to generate evenly distributed index 7183 * numbers using buf_hash below. So, as an added precaution, 7184 * let's make sure we never add empty buffers to the arc lists. 
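 * (An "empty" header has no DVA to hash, so it could not be mapped to
 * a stable sublist index; hence the HDR_EMPTY() assertion below.)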
7185 */ 7186 ASSERT(!HDR_EMPTY(hdr)); 7187 7188 /* 7189 * The assumption here, is the hash value for a given 7190 * arc_buf_hdr_t will remain constant throughout its lifetime 7191 * (i.e. its b_spa, b_dva, and b_birth fields don't change). 7192 * Thus, we don't need to store the header's sublist index 7193 * on insertion, as this index can be recalculated on removal. 7194 * 7195 * Also, the low order bits of the hash value are thought to be 7196 * distributed evenly. Otherwise, in the case that the multilist 7197 * has a power of two number of sublists, each sublists' usage 7198 * would not be evenly distributed. 7199 */ 7200 return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 7201 multilist_get_num_sublists(ml)); 7202 } 7203 7204 static void 7205 arc_state_init(void) 7206 { 7207 arc_anon = &ARC_anon; 7208 arc_mru = &ARC_mru; 7209 arc_mru_ghost = &ARC_mru_ghost; 7210 arc_mfu = &ARC_mfu; 7211 arc_mfu_ghost = &ARC_mfu_ghost; 7212 arc_l2c_only = &ARC_l2c_only; 7213 7214 arc_mru->arcs_list[ARC_BUFC_METADATA] = 7215 multilist_create(sizeof (arc_buf_hdr_t), 7216 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7217 arc_state_multilist_index_func); 7218 arc_mru->arcs_list[ARC_BUFC_DATA] = 7219 multilist_create(sizeof (arc_buf_hdr_t), 7220 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7221 arc_state_multilist_index_func); 7222 arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] = 7223 multilist_create(sizeof (arc_buf_hdr_t), 7224 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7225 arc_state_multilist_index_func); 7226 arc_mru_ghost->arcs_list[ARC_BUFC_DATA] = 7227 multilist_create(sizeof (arc_buf_hdr_t), 7228 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7229 arc_state_multilist_index_func); 7230 arc_mfu->arcs_list[ARC_BUFC_METADATA] = 7231 multilist_create(sizeof (arc_buf_hdr_t), 7232 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7233 arc_state_multilist_index_func); 7234 arc_mfu->arcs_list[ARC_BUFC_DATA] = 7235 multilist_create(sizeof (arc_buf_hdr_t), 7236 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7237 arc_state_multilist_index_func); 7238 arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] = 7239 multilist_create(sizeof (arc_buf_hdr_t), 7240 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7241 arc_state_multilist_index_func); 7242 arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] = 7243 multilist_create(sizeof (arc_buf_hdr_t), 7244 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7245 arc_state_multilist_index_func); 7246 arc_l2c_only->arcs_list[ARC_BUFC_METADATA] = 7247 multilist_create(sizeof (arc_buf_hdr_t), 7248 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7249 arc_state_multilist_index_func); 7250 arc_l2c_only->arcs_list[ARC_BUFC_DATA] = 7251 multilist_create(sizeof (arc_buf_hdr_t), 7252 offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node), 7253 arc_state_multilist_index_func); 7254 7255 zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 7256 zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 7257 zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 7258 zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 7259 zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 7260 zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 7261 zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 7262 zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 7263 zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 7264 zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 7265 zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 7266 
zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 7267 7268 zfs_refcount_create(&arc_anon->arcs_size); 7269 zfs_refcount_create(&arc_mru->arcs_size); 7270 zfs_refcount_create(&arc_mru_ghost->arcs_size); 7271 zfs_refcount_create(&arc_mfu->arcs_size); 7272 zfs_refcount_create(&arc_mfu_ghost->arcs_size); 7273 zfs_refcount_create(&arc_l2c_only->arcs_size); 7274 7275 aggsum_init(&arc_meta_used, 0); 7276 aggsum_init(&arc_size, 0); 7277 aggsum_init(&astat_data_size, 0); 7278 aggsum_init(&astat_metadata_size, 0); 7279 aggsum_init(&astat_hdr_size, 0); 7280 aggsum_init(&astat_other_size, 0); 7281 aggsum_init(&astat_l2_hdr_size, 0); 7282 } 7283 7284 static void 7285 arc_state_fini(void) 7286 { 7287 zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); 7288 zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); 7289 zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); 7290 zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); 7291 zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); 7292 zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); 7293 zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); 7294 zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); 7295 zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); 7296 zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); 7297 zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); 7298 zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); 7299 7300 zfs_refcount_destroy(&arc_anon->arcs_size); 7301 zfs_refcount_destroy(&arc_mru->arcs_size); 7302 zfs_refcount_destroy(&arc_mru_ghost->arcs_size); 7303 zfs_refcount_destroy(&arc_mfu->arcs_size); 7304 zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); 7305 zfs_refcount_destroy(&arc_l2c_only->arcs_size); 7306 7307 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); 7308 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); 7309 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); 7310 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); 7311 multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); 7312 multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); 7313 multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); 7314 multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); 7315 multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); 7316 multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); 7317 7318 aggsum_fini(&arc_meta_used); 7319 aggsum_fini(&arc_size); 7320 aggsum_fini(&astat_data_size); 7321 aggsum_fini(&astat_metadata_size); 7322 aggsum_fini(&astat_hdr_size); 7323 aggsum_fini(&astat_other_size); 7324 aggsum_fini(&astat_l2_hdr_size); 7325 7326 } 7327 7328 uint64_t 7329 arc_max_bytes(void) 7330 { 7331 return (arc_c_max); 7332 } 7333 7334 void 7335 arc_init(void) 7336 { 7337 /* 7338 * allmem is "all memory that we could possibly use". 
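 * For example (illustrative figures): with allmem at 16 GB the code
 * below picks arc_c_min = MAX(16G / 32, 64M) = 512M and
 * arc_c_max = MAX(16G * 3 / 4, 16G - 1G) = 15G, before any
 * zfs_arc_min/zfs_arc_max tunable overrides are applied.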
7339 */ 7340 #ifdef _KERNEL 7341 uint64_t allmem = ptob(physmem - swapfs_minfree); 7342 #else 7343 uint64_t allmem = (physmem * PAGESIZE) / 2; 7344 #endif 7345 mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); 7346 cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); 7347 7348 /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ 7349 arc_c_min = MAX(allmem / 32, 64 << 20); 7350 /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ 7351 if (allmem >= 1 << 30) 7352 arc_c_max = allmem - (1 << 30); 7353 else 7354 arc_c_max = arc_c_min; 7355 arc_c_max = MAX(allmem * 3 / 4, arc_c_max); 7356 7357 /* 7358 * In userland, there's only the memory pressure that we artificially 7359 * create (see arc_available_memory()). Don't let arc_c get too 7360 * small, because it can cause transactions to be larger than 7361 * arc_c, causing arc_tempreserve_space() to fail. 7362 */ 7363 #ifndef _KERNEL 7364 arc_c_min = arc_c_max / 2; 7365 #endif 7366 7367 /* 7368 * Allow the tunables to override our calculations if they are 7369 * reasonable (ie. over 64MB) 7370 */ 7371 if (zfs_arc_max > 64 << 20 && zfs_arc_max < allmem) { 7372 arc_c_max = zfs_arc_max; 7373 arc_c_min = MIN(arc_c_min, arc_c_max); 7374 } 7375 if (zfs_arc_min > 64 << 20 && zfs_arc_min <= arc_c_max) 7376 arc_c_min = zfs_arc_min; 7377 7378 arc_c = arc_c_max; 7379 arc_p = (arc_c >> 1); 7380 7381 /* limit meta-data to 1/4 of the arc capacity */ 7382 arc_meta_limit = arc_c_max / 4; 7383 7384 #ifdef _KERNEL 7385 /* 7386 * Metadata is stored in the kernel's heap. Don't let us 7387 * use more than half the heap for the ARC. 7388 */ 7389 arc_meta_limit = MIN(arc_meta_limit, 7390 vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 2); 7391 #endif 7392 7393 /* Allow the tunable to override if it is reasonable */ 7394 if (zfs_arc_meta_limit > 0 && zfs_arc_meta_limit <= arc_c_max) 7395 arc_meta_limit = zfs_arc_meta_limit; 7396 7397 if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) 7398 arc_c_min = arc_meta_limit / 2; 7399 7400 if (zfs_arc_meta_min > 0) { 7401 arc_meta_min = zfs_arc_meta_min; 7402 } else { 7403 arc_meta_min = arc_c_min / 2; 7404 } 7405 7406 if (zfs_arc_grow_retry > 0) 7407 arc_grow_retry = zfs_arc_grow_retry; 7408 7409 if (zfs_arc_shrink_shift > 0) 7410 arc_shrink_shift = zfs_arc_shrink_shift; 7411 7412 /* 7413 * Ensure that arc_no_grow_shift is less than arc_shrink_shift. 7414 */ 7415 if (arc_no_grow_shift >= arc_shrink_shift) 7416 arc_no_grow_shift = arc_shrink_shift - 1; 7417 7418 if (zfs_arc_p_min_shift > 0) 7419 arc_p_min_shift = zfs_arc_p_min_shift; 7420 7421 /* if kmem_flags are set, lets try to use less memory */ 7422 if (kmem_debugging()) 7423 arc_c = arc_c / 2; 7424 if (arc_c < arc_c_min) 7425 arc_c = arc_c_min; 7426 7427 arc_state_init(); 7428 7429 /* 7430 * The arc must be "uninitialized", so that hdr_recl() (which is 7431 * registered by buf_init()) will not access arc_reap_zthr before 7432 * it is created. 
7433 */ 7434 ASSERT(!arc_initialized); 7435 buf_init(); 7436 7437 arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED, 7438 sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); 7439 7440 if (arc_ksp != NULL) { 7441 arc_ksp->ks_data = &arc_stats; 7442 arc_ksp->ks_update = arc_kstat_update; 7443 kstat_install(arc_ksp); 7444 } 7445 7446 arc_adjust_zthr = zthr_create(arc_adjust_cb_check, 7447 arc_adjust_cb, NULL); 7448 arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, 7449 arc_reap_cb, NULL, SEC2NSEC(1)); 7450 7451 arc_initialized = B_TRUE; 7452 arc_warm = B_FALSE; 7453 7454 /* 7455 * Calculate maximum amount of dirty data per pool. 7456 * 7457 * If it has been set by /etc/system, take that. 7458 * Otherwise, use a percentage of physical memory defined by 7459 * zfs_dirty_data_max_percent (default 10%) with a cap at 7460 * zfs_dirty_data_max_max (default 4GB). 7461 */ 7462 if (zfs_dirty_data_max == 0) { 7463 zfs_dirty_data_max = physmem * PAGESIZE * 7464 zfs_dirty_data_max_percent / 100; 7465 zfs_dirty_data_max = MIN(zfs_dirty_data_max, 7466 zfs_dirty_data_max_max); 7467 } 7468 } 7469 7470 void 7471 arc_fini(void) 7472 { 7473 /* Use B_TRUE to ensure *all* buffers are evicted */ 7474 arc_flush(NULL, B_TRUE); 7475 7476 arc_initialized = B_FALSE; 7477 7478 if (arc_ksp != NULL) { 7479 kstat_delete(arc_ksp); 7480 arc_ksp = NULL; 7481 } 7482 7483 (void) zthr_cancel(arc_adjust_zthr); 7484 zthr_destroy(arc_adjust_zthr); 7485 7486 (void) zthr_cancel(arc_reap_zthr); 7487 zthr_destroy(arc_reap_zthr); 7488 7489 mutex_destroy(&arc_adjust_lock); 7490 cv_destroy(&arc_adjust_waiters_cv); 7491 7492 /* 7493 * buf_fini() must proceed arc_state_fini() because buf_fin() may 7494 * trigger the release of kmem magazines, which can callback to 7495 * arc_space_return() which accesses aggsums freed in act_state_fini(). 7496 */ 7497 buf_fini(); 7498 arc_state_fini(); 7499 7500 ASSERT0(arc_loaned_bytes); 7501 } 7502 7503 /* 7504 * Level 2 ARC 7505 * 7506 * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. 7507 * It uses dedicated storage devices to hold cached data, which are populated 7508 * using large infrequent writes. The main role of this cache is to boost 7509 * the performance of random read workloads. The intended L2ARC devices 7510 * include short-stroked disks, solid state disks, and other media with 7511 * substantially faster read latency than disk. 7512 * 7513 * +-----------------------+ 7514 * | ARC | 7515 * +-----------------------+ 7516 * | ^ ^ 7517 * | | | 7518 * l2arc_feed_thread() arc_read() 7519 * | | | 7520 * | l2arc read | 7521 * V | | 7522 * +---------------+ | 7523 * | L2ARC | | 7524 * +---------------+ | 7525 * | ^ | 7526 * l2arc_write() | | 7527 * | | | 7528 * V | | 7529 * +-------+ +-------+ 7530 * | vdev | | vdev | 7531 * | cache | | cache | 7532 * +-------+ +-------+ 7533 * +=========+ .-----. 7534 * : L2ARC : |-_____-| 7535 * : devices : | Disks | 7536 * +=========+ `-_____-' 7537 * 7538 * Read requests are satisfied from the following sources, in order: 7539 * 7540 * 1) ARC 7541 * 2) vdev cache of L2ARC devices 7542 * 3) L2ARC devices 7543 * 4) vdev cache of disks 7544 * 5) disks 7545 * 7546 * Some L2ARC device types exhibit extremely slow write performance. 7547 * To accommodate for this there are some significant differences between 7548 * the L2ARC and traditional cache design: 7549 * 7550 * 1. There is no eviction path from the ARC to the L2ARC. 
Evictions from 7551 * the ARC behave as usual, freeing buffers and placing headers on ghost 7552 * lists. The ARC does not send buffers to the L2ARC during eviction as 7553 * this would add inflated write latencies for all ARC memory pressure. 7554 * 7555 * 2. The L2ARC attempts to cache data from the ARC before it is evicted. 7556 * It does this by periodically scanning buffers from the eviction-end of 7557 * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are 7558 * not already there. It scans until a headroom of buffers is satisfied, 7559 * which itself is a buffer for ARC eviction. If a compressible buffer is 7560 * found during scanning and selected for writing to an L2ARC device, we 7561 * temporarily boost scanning headroom during the next scan cycle to make 7562 * sure we adapt to compression effects (which might significantly reduce 7563 * the data volume we write to L2ARC). The thread that does this is 7564 * l2arc_feed_thread(), illustrated below; example sizes are included to 7565 * provide a better sense of ratio than this diagram: 7566 * 7567 * head --> tail 7568 * +---------------------+----------+ 7569 * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC 7570 * +---------------------+----------+ | o L2ARC eligible 7571 * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer 7572 * +---------------------+----------+ | 7573 * 15.9 Gbytes ^ 32 Mbytes | 7574 * headroom | 7575 * l2arc_feed_thread() 7576 * | 7577 * l2arc write hand <--[oooo]--' 7578 * | 8 Mbyte 7579 * | write max 7580 * V 7581 * +==============================+ 7582 * L2ARC dev |####|#|###|###| |####| ... | 7583 * +==============================+ 7584 * 32 Gbytes 7585 * 7586 * 3. If an ARC buffer is copied to the L2ARC but then hit instead of 7587 * evicted, then the L2ARC has cached a buffer much sooner than it probably 7588 * needed to, potentially wasting L2ARC device bandwidth and storage. It is 7589 * safe to say that this is an uncommon case, since buffers at the end of 7590 * the ARC lists have moved there due to inactivity. 7591 * 7592 * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, 7593 * then the L2ARC simply misses copying some buffers. This serves as a 7594 * pressure valve to prevent heavy read workloads from both stalling the ARC 7595 * with waits and clogging the L2ARC with writes. This also helps prevent 7596 * the potential for the L2ARC to churn if it attempts to cache content too 7597 * quickly, such as during backups of the entire pool. 7598 * 7599 * 5. After system boot and before the ARC has filled main memory, there are 7600 * no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru 7601 * lists can remain mostly static. Instead of searching from tail of these 7602 * lists as pictured, the l2arc_feed_thread() will search from the list heads 7603 * for eligible buffers, greatly increasing its chance of finding them. 7604 * 7605 * The L2ARC device write speed is also boosted during this time so that 7606 * the L2ARC warms up faster. Since there have been no ARC evictions yet, 7607 * there are no L2ARC reads, and no fear of degrading read performance 7608 * through increased writes. 7609 * 7610 * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that 7611 * the vdev queue can aggregate them into larger and fewer writes. Each 7612 * device is written to in a rotor fashion, sweeping writes through 7613 * available space then repeating. 7614 * 7615 * 7. The L2ARC does not store dirty content. 
It never needs to flush 7616 * write buffers back to disk based storage. 7617 * 7618 * 8. If an ARC buffer is written (and dirtied) which also exists in the 7619 * L2ARC, the now stale L2ARC buffer is immediately dropped. 7620 * 7621 * The performance of the L2ARC can be tweaked by a number of tunables, which 7622 * may be necessary for different workloads: 7623 * 7624 * l2arc_write_max max write bytes per interval 7625 * l2arc_write_boost extra write bytes during device warmup 7626 * l2arc_noprefetch skip caching prefetched buffers 7627 * l2arc_headroom number of max device writes to precache 7628 * l2arc_headroom_boost when we find compressed buffers during ARC 7629 * scanning, we multiply headroom by this 7630 * percentage factor for the next scan cycle, 7631 * since more compressed buffers are likely to 7632 * be present 7633 * l2arc_feed_secs seconds between L2ARC writing 7634 * 7635 * Tunables may be removed or added as future performance improvements are 7636 * integrated, and also may become zpool properties. 7637 * 7638 * There are three key functions that control how the L2ARC warms up: 7639 * 7640 * l2arc_write_eligible() check if a buffer is eligible to cache 7641 * l2arc_write_size() calculate how much to write 7642 * l2arc_write_interval() calculate sleep delay between writes 7643 * 7644 * These three functions determine what to write, how much, and how quickly 7645 * to send writes. 7646 */ 7647 7648 static boolean_t 7649 l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr) 7650 { 7651 /* 7652 * A buffer is *not* eligible for the L2ARC if it: 7653 * 1. belongs to a different spa. 7654 * 2. is already cached on the L2ARC. 7655 * 3. has an I/O in progress (it may be an incomplete read). 7656 * 4. is flagged not eligible (zfs property). 7657 */ 7658 if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) || 7659 HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr)) 7660 return (B_FALSE); 7661 7662 return (B_TRUE); 7663 } 7664 7665 static uint64_t 7666 l2arc_write_size(void) 7667 { 7668 uint64_t size; 7669 7670 /* 7671 * Make sure our globals have meaningful values in case the user 7672 * altered them. 7673 */ 7674 size = l2arc_write_max; 7675 if (size == 0) { 7676 cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " 7677 "be greater than zero, resetting it to the default (%d)", 7678 L2ARC_WRITE_SIZE); 7679 size = l2arc_write_max = L2ARC_WRITE_SIZE; 7680 } 7681 7682 if (arc_warm == B_FALSE) 7683 size += l2arc_write_boost; 7684 7685 return (size); 7686 7687 } 7688 7689 static clock_t 7690 l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) 7691 { 7692 clock_t interval, next, now; 7693 7694 /* 7695 * If the ARC lists are busy, increase our write rate; if the 7696 * lists are stale, idle back. This is achieved by checking 7697 * how much we previously wrote - if it was more than half of 7698 * what we wanted, schedule the next write much sooner. 7699 */ 7700 if (l2arc_feed_again && wrote > (wanted / 2)) 7701 interval = (hz * l2arc_feed_min_ms) / 1000; 7702 else 7703 interval = hz * l2arc_feed_secs; 7704 7705 now = ddi_get_lbolt(); 7706 next = MAX(now, MIN(now + interval, began + interval)); 7707 7708 return (next); 7709 } 7710 7711 /* 7712 * Cycle through L2ARC devices. This is how L2ARC load balances. 7713 * If a device is returned, this also returns holding the spa config lock. 
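 * The rotor resumes from the last device used (l2arc_dev_last) so that
 * successive feed cycles spread writes across all cache devices; the
 * caller is expected to drop the SCL_L2ARC config lock once it has
 * finished with the returned device.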
7714 */ 7715 static l2arc_dev_t * 7716 l2arc_dev_get_next(void) 7717 { 7718 l2arc_dev_t *first, *next = NULL; 7719 7720 /* 7721 * Lock out the removal of spas (spa_namespace_lock), then removal 7722 * of cache devices (l2arc_dev_mtx). Once a device has been selected, 7723 * both locks will be dropped and a spa config lock held instead. 7724 */ 7725 mutex_enter(&spa_namespace_lock); 7726 mutex_enter(&l2arc_dev_mtx); 7727 7728 /* if there are no vdevs, there is nothing to do */ 7729 if (l2arc_ndev == 0) 7730 goto out; 7731 7732 first = NULL; 7733 next = l2arc_dev_last; 7734 do { 7735 /* loop around the list looking for a non-faulted vdev */ 7736 if (next == NULL) { 7737 next = list_head(l2arc_dev_list); 7738 } else { 7739 next = list_next(l2arc_dev_list, next); 7740 if (next == NULL) 7741 next = list_head(l2arc_dev_list); 7742 } 7743 7744 /* if we have come back to the start, bail out */ 7745 if (first == NULL) 7746 first = next; 7747 else if (next == first) 7748 break; 7749 7750 } while (vdev_is_dead(next->l2ad_vdev)); 7751 7752 /* if we were unable to find any usable vdevs, return NULL */ 7753 if (vdev_is_dead(next->l2ad_vdev)) 7754 next = NULL; 7755 7756 l2arc_dev_last = next; 7757 7758 out: 7759 mutex_exit(&l2arc_dev_mtx); 7760 7761 /* 7762 * Grab the config lock to prevent the 'next' device from being 7763 * removed while we are writing to it. 7764 */ 7765 if (next != NULL) 7766 spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); 7767 mutex_exit(&spa_namespace_lock); 7768 7769 return (next); 7770 } 7771 7772 /* 7773 * Free buffers that were tagged for destruction. 7774 */ 7775 static void 7776 l2arc_do_free_on_write() 7777 { 7778 list_t *buflist; 7779 l2arc_data_free_t *df, *df_prev; 7780 7781 mutex_enter(&l2arc_free_on_write_mtx); 7782 buflist = l2arc_free_on_write; 7783 7784 for (df = list_tail(buflist); df; df = df_prev) { 7785 df_prev = list_prev(buflist, df); 7786 ASSERT3P(df->l2df_abd, !=, NULL); 7787 abd_free(df->l2df_abd); 7788 list_remove(buflist, df); 7789 kmem_free(df, sizeof (l2arc_data_free_t)); 7790 } 7791 7792 mutex_exit(&l2arc_free_on_write_mtx); 7793 } 7794 7795 /* 7796 * A write to a cache device has completed. Update all headers to allow 7797 * reads from these buffers to begin. 7798 */ 7799 static void 7800 l2arc_write_done(zio_t *zio) 7801 { 7802 l2arc_write_callback_t *cb; 7803 l2arc_dev_t *dev; 7804 list_t *buflist; 7805 arc_buf_hdr_t *head, *hdr, *hdr_prev; 7806 kmutex_t *hash_lock; 7807 int64_t bytes_dropped = 0; 7808 7809 cb = zio->io_private; 7810 ASSERT3P(cb, !=, NULL); 7811 dev = cb->l2wcb_dev; 7812 ASSERT3P(dev, !=, NULL); 7813 head = cb->l2wcb_head; 7814 ASSERT3P(head, !=, NULL); 7815 buflist = &dev->l2ad_buflist; 7816 ASSERT3P(buflist, !=, NULL); 7817 DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, 7818 l2arc_write_callback_t *, cb); 7819 7820 if (zio->io_error != 0) 7821 ARCSTAT_BUMP(arcstat_l2_writes_error); 7822 7823 /* 7824 * All writes completed, or an error was hit. 7825 */ 7826 top: 7827 mutex_enter(&dev->l2ad_mtx); 7828 for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) { 7829 hdr_prev = list_prev(buflist, hdr); 7830 7831 hash_lock = HDR_LOCK(hdr); 7832 7833 /* 7834 * We cannot use mutex_enter or else we can deadlock 7835 * with l2arc_write_buffers (due to swapping the order 7836 * the hash lock and l2ad_mtx are taken). 7837 */ 7838 if (!mutex_tryenter(hash_lock)) { 7839 /* 7840 * Missed the hash lock. We must retry so we 7841 * don't leave the ARC_FLAG_L2_WRITING bit set. 
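 * The retry below re-inserts the dummy head node just after the header
 * we could not lock, drops l2ad_mtx, briefly enters and exits the hash
 * lock (to avoid busy-waiting), and then restarts the walk from the
 * top.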
7842 */ 7843 ARCSTAT_BUMP(arcstat_l2_writes_lock_retry); 7844 7845 /* 7846 * We don't want to rescan the headers we've 7847 * already marked as having been written out, so 7848 * we reinsert the head node so we can pick up 7849 * where we left off. 7850 */ 7851 list_remove(buflist, head); 7852 list_insert_after(buflist, hdr, head); 7853 7854 mutex_exit(&dev->l2ad_mtx); 7855 7856 /* 7857 * We wait for the hash lock to become available 7858 * to try and prevent busy waiting, and increase 7859 * the chance we'll be able to acquire the lock 7860 * the next time around. 7861 */ 7862 mutex_enter(hash_lock); 7863 mutex_exit(hash_lock); 7864 goto top; 7865 } 7866 7867 /* 7868 * We could not have been moved into the arc_l2c_only 7869 * state while in-flight due to our ARC_FLAG_L2_WRITING 7870 * bit being set. Let's just ensure that's being enforced. 7871 */ 7872 ASSERT(HDR_HAS_L1HDR(hdr)); 7873 7874 if (zio->io_error != 0) { 7875 /* 7876 * Error - drop L2ARC entry. 7877 */ 7878 list_remove(buflist, hdr); 7879 arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR); 7880 7881 uint64_t psize = HDR_GET_PSIZE(hdr); 7882 ARCSTAT_INCR(arcstat_l2_psize, -psize); 7883 ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr)); 7884 7885 bytes_dropped += 7886 vdev_psize_to_asize(dev->l2ad_vdev, psize); 7887 (void) zfs_refcount_remove_many(&dev->l2ad_alloc, 7888 arc_hdr_size(hdr), hdr); 7889 } 7890 7891 /* 7892 * Allow ARC to begin reads and ghost list evictions to 7893 * this L2ARC entry. 7894 */ 7895 arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING); 7896 7897 mutex_exit(hash_lock); 7898 } 7899 7900 atomic_inc_64(&l2arc_writes_done); 7901 list_remove(buflist, head); 7902 ASSERT(!HDR_HAS_L1HDR(head)); 7903 kmem_cache_free(hdr_l2only_cache, head); 7904 mutex_exit(&dev->l2ad_mtx); 7905 7906 vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0); 7907 7908 l2arc_do_free_on_write(); 7909 7910 kmem_free(cb, sizeof (l2arc_write_callback_t)); 7911 } 7912 7913 static int 7914 l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) 7915 { 7916 int ret; 7917 spa_t *spa = zio->io_spa; 7918 arc_buf_hdr_t *hdr = cb->l2rcb_hdr; 7919 blkptr_t *bp = zio->io_bp; 7920 uint8_t salt[ZIO_DATA_SALT_LEN]; 7921 uint8_t iv[ZIO_DATA_IV_LEN]; 7922 uint8_t mac[ZIO_DATA_MAC_LEN]; 7923 boolean_t no_crypt = B_FALSE; 7924 7925 /* 7926 * ZIL data is never be written to the L2ARC, so we don't need 7927 * special handling for its unique MAC storage. 7928 */ 7929 ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); 7930 ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); 7931 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 7932 7933 /* 7934 * If the data was encrypted, decrypt it now. Note that 7935 * we must check the bp here and not the hdr, since the 7936 * hdr does not have its encryption parameters updated 7937 * until arc_read_done(). 7938 */ 7939 if (BP_IS_ENCRYPTED(bp)) { 7940 abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); 7941 7942 zio_crypt_decode_params_bp(bp, salt, iv); 7943 zio_crypt_decode_mac_bp(bp, mac); 7944 7945 ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, 7946 BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), 7947 salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, 7948 hdr->b_l1hdr.b_pabd, &no_crypt); 7949 if (ret != 0) { 7950 arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); 7951 goto error; 7952 } 7953 7954 /* 7955 * If we actually performed decryption, replace b_pabd 7956 * with the decrypted data. Otherwise we can just throw 7957 * our decryption buffer away. 
7958 */ 7959 if (!no_crypt) { 7960 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 7961 arc_hdr_size(hdr), hdr); 7962 hdr->b_l1hdr.b_pabd = eabd; 7963 zio->io_abd = eabd; 7964 } else { 7965 arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); 7966 } 7967 } 7968 7969 /* 7970 * If the L2ARC block was compressed, but ARC compression 7971 * is disabled we decompress the data into a new buffer and 7972 * replace the existing data. 7973 */ 7974 if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 7975 !HDR_COMPRESSION_ENABLED(hdr)) { 7976 abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); 7977 void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); 7978 7979 ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), 7980 hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), 7981 HDR_GET_LSIZE(hdr)); 7982 if (ret != 0) { 7983 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); 7984 arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); 7985 goto error; 7986 } 7987 7988 abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); 7989 arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, 7990 arc_hdr_size(hdr), hdr); 7991 hdr->b_l1hdr.b_pabd = cabd; 7992 zio->io_abd = cabd; 7993 zio->io_size = HDR_GET_LSIZE(hdr); 7994 } 7995 7996 return (0); 7997 7998 error: 7999 return (ret); 8000 } 8001 8002 8003 /* 8004 * A read to a cache device completed. Validate buffer contents before 8005 * handing over to the regular ARC routines. 8006 */ 8007 static void 8008 l2arc_read_done(zio_t *zio) 8009 { 8010 int tfm_error = 0; 8011 l2arc_read_callback_t *cb = zio->io_private; 8012 arc_buf_hdr_t *hdr; 8013 kmutex_t *hash_lock; 8014 boolean_t valid_cksum; 8015 boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && 8016 (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); 8017 8018 ASSERT3P(zio->io_vd, !=, NULL); 8019 ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); 8020 8021 spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); 8022 8023 ASSERT3P(cb, !=, NULL); 8024 hdr = cb->l2rcb_hdr; 8025 ASSERT3P(hdr, !=, NULL); 8026 8027 hash_lock = HDR_LOCK(hdr); 8028 mutex_enter(hash_lock); 8029 ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); 8030 8031 /* 8032 * If the data was read into a temporary buffer, 8033 * move it and free the buffer. 8034 */ 8035 if (cb->l2rcb_abd != NULL) { 8036 ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); 8037 if (zio->io_error == 0) { 8038 if (using_rdata) { 8039 abd_copy(hdr->b_crypt_hdr.b_rabd, 8040 cb->l2rcb_abd, arc_hdr_size(hdr)); 8041 } else { 8042 abd_copy(hdr->b_l1hdr.b_pabd, 8043 cb->l2rcb_abd, arc_hdr_size(hdr)); 8044 } 8045 } 8046 8047 /* 8048 * The following must be done regardless of whether 8049 * there was an error: 8050 * - free the temporary buffer 8051 * - point zio to the real ARC buffer 8052 * - set zio size accordingly 8053 * These are required because zio is either re-used for 8054 * an I/O of the block in the case of the error 8055 * or the zio is passed to arc_read_done() and it 8056 * needs real data. 8057 */ 8058 abd_free(cb->l2rcb_abd); 8059 zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); 8060 8061 if (using_rdata) { 8062 ASSERT(HDR_HAS_RABD(hdr)); 8063 zio->io_abd = zio->io_orig_abd = 8064 hdr->b_crypt_hdr.b_rabd; 8065 } else { 8066 ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); 8067 zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; 8068 } 8069 } 8070 8071 ASSERT3P(zio->io_abd, !=, NULL); 8072 8073 /* 8074 * Check this survived the L2ARC journey. 
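 * That is, the checksum recorded when the block was first cached must
 * still match the data just read back from the cache device; on a
 * mismatch (or an I/O error) we fall through below and re-issue the
 * read against the primary pool copy.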
8075 */ 8076 ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || 8077 (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); 8078 zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ 8079 zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ 8080 8081 valid_cksum = arc_cksum_is_equal(hdr, zio); 8082 8083 /* 8084 * b_rabd will always match the data as it exists on disk if it is 8085 * being used. Therefore if we are reading into b_rabd we do not 8086 * attempt to untransform the data. 8087 */ 8088 if (valid_cksum && !using_rdata) 8089 tfm_error = l2arc_untransform(zio, cb); 8090 8091 if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && 8092 !HDR_L2_EVICTED(hdr)) { 8093 mutex_exit(hash_lock); 8094 zio->io_private = hdr; 8095 arc_read_done(zio); 8096 } else { 8097 mutex_exit(hash_lock); 8098 /* 8099 * Buffer didn't survive caching. Increment stats and 8100 * reissue to the original storage device. 8101 */ 8102 if (zio->io_error != 0) { 8103 ARCSTAT_BUMP(arcstat_l2_io_error); 8104 } else { 8105 zio->io_error = SET_ERROR(EIO); 8106 } 8107 if (!valid_cksum || tfm_error != 0) 8108 ARCSTAT_BUMP(arcstat_l2_cksum_bad); 8109 8110 /* 8111 * If there's no waiter, issue an async i/o to the primary 8112 * storage now. If there *is* a waiter, the caller must 8113 * issue the i/o in a context where it's OK to block. 8114 */ 8115 if (zio->io_waiter == NULL) { 8116 zio_t *pio = zio_unique_parent(zio); 8117 void *abd = (using_rdata) ? 8118 hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; 8119 8120 ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); 8121 8122 zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, 8123 abd, zio->io_size, arc_read_done, 8124 hdr, zio->io_priority, cb->l2rcb_flags, 8125 &cb->l2rcb_zb)); 8126 } 8127 } 8128 8129 kmem_free(cb, sizeof (l2arc_read_callback_t)); 8130 } 8131 8132 /* 8133 * This is the list priority from which the L2ARC will search for pages to 8134 * cache. This is used within loops (0..3) to cycle through lists in the 8135 * desired order. This order can have a significant effect on cache 8136 * performance. 8137 * 8138 * Currently the metadata lists are hit first, MFU then MRU, followed by 8139 * the data lists. This function returns a locked list, and also returns 8140 * the lock pointer. 8141 */ 8142 static multilist_sublist_t * 8143 l2arc_sublist_lock(int list_num) 8144 { 8145 multilist_t *ml = NULL; 8146 unsigned int idx; 8147 8148 ASSERT(list_num >= 0 && list_num <= 3); 8149 8150 switch (list_num) { 8151 case 0: 8152 ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; 8153 break; 8154 case 1: 8155 ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; 8156 break; 8157 case 2: 8158 ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; 8159 break; 8160 case 3: 8161 ml = arc_mru->arcs_list[ARC_BUFC_DATA]; 8162 break; 8163 } 8164 8165 /* 8166 * Return a randomly-selected sublist. This is acceptable 8167 * because the caller feeds only a little bit of data for each 8168 * call (8MB). Subsequent calls will result in different 8169 * sublists being selected. 8170 */ 8171 idx = multilist_get_random_index(ml); 8172 return (multilist_sublist_lock(ml, idx)); 8173 } 8174 8175 /* 8176 * Evict buffers from the device write hand to the distance specified in 8177 * bytes. This distance may span populated buffers, it may span nothing. 8178 * This is clearing a region on the L2ARC device ready for writing. 8179 * If the 'all' boolean is set, every buffer is evicted. 
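 * For example, with the write hand at offset H and a distance D, the
 * region [H, H + D) is cleared; if the hand is within 2 * D of the end
 * of the device we instead evict all the way to l2ad_end so the hand
 * can safely wrap back to the start.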
8180 */ 8181 static void 8182 l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) 8183 { 8184 list_t *buflist; 8185 arc_buf_hdr_t *hdr, *hdr_prev; 8186 kmutex_t *hash_lock; 8187 uint64_t taddr; 8188 8189 buflist = &dev->l2ad_buflist; 8190 8191 if (!all && dev->l2ad_first) { 8192 /* 8193 * This is the first sweep through the device. There is 8194 * nothing to evict. 8195 */ 8196 return; 8197 } 8198 8199 if (dev->l2ad_hand >= (dev->l2ad_end - (2 * distance))) { 8200 /* 8201 * When nearing the end of the device, evict to the end 8202 * before the device write hand jumps to the start. 8203 */ 8204 taddr = dev->l2ad_end; 8205 } else { 8206 taddr = dev->l2ad_hand + distance; 8207 } 8208 DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, 8209 uint64_t, taddr, boolean_t, all); 8210 8211 top: 8212 mutex_enter(&dev->l2ad_mtx); 8213 for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) { 8214 hdr_prev = list_prev(buflist, hdr); 8215 8216 hash_lock = HDR_LOCK(hdr); 8217 8218 /* 8219 * We cannot use mutex_enter or else we can deadlock 8220 * with l2arc_write_buffers (due to swapping the order 8221 * the hash lock and l2ad_mtx are taken). 8222 */ 8223 if (!mutex_tryenter(hash_lock)) { 8224 /* 8225 * Missed the hash lock. Retry. 8226 */ 8227 ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); 8228 mutex_exit(&dev->l2ad_mtx); 8229 mutex_enter(hash_lock); 8230 mutex_exit(hash_lock); 8231 goto top; 8232 } 8233 8234 /* 8235 * A header can't be on this list if it doesn't have L2 header. 8236 */ 8237 ASSERT(HDR_HAS_L2HDR(hdr)); 8238 8239 /* Ensure this header has finished being written. */ 8240 ASSERT(!HDR_L2_WRITING(hdr)); 8241 ASSERT(!HDR_L2_WRITE_HEAD(hdr)); 8242 8243 if (!all && (hdr->b_l2hdr.b_daddr >= taddr || 8244 hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) { 8245 /* 8246 * We've evicted to the target address, 8247 * or the end of the device. 8248 */ 8249 mutex_exit(hash_lock); 8250 break; 8251 } 8252 8253 if (!HDR_HAS_L1HDR(hdr)) { 8254 ASSERT(!HDR_L2_READING(hdr)); 8255 /* 8256 * This doesn't exist in the ARC. Destroy. 8257 * arc_hdr_destroy() will call list_remove() 8258 * and decrement arcstat_l2_lsize. 8259 */ 8260 arc_change_state(arc_anon, hdr, hash_lock); 8261 arc_hdr_destroy(hdr); 8262 } else { 8263 ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only); 8264 ARCSTAT_BUMP(arcstat_l2_evict_l1cached); 8265 /* 8266 * Invalidate issued or about to be issued 8267 * reads, since we may be about to write 8268 * over this location. 8269 */ 8270 if (HDR_L2_READING(hdr)) { 8271 ARCSTAT_BUMP(arcstat_l2_evict_reading); 8272 arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED); 8273 } 8274 8275 arc_hdr_l2hdr_destroy(hdr); 8276 } 8277 mutex_exit(hash_lock); 8278 } 8279 mutex_exit(&dev->l2ad_mtx); 8280 } 8281 8282 /* 8283 * Handle any abd transforms that might be required for writing to the L2ARC. 8284 * If successful, this function will always return an abd with the data 8285 * transformed as it is on disk in a new abd of asize bytes. 
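 * Depending on the header, that means copying an existing b_rabd,
 * copying a plain (uncompressed, unencrypted) b_pabd, recompressing
 * the data, and/or re-encrypting it with the dataset's key; in every
 * case the result is zero-padded out to asize.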
8286 */ 8287 static int 8288 l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, 8289 abd_t **abd_out) 8290 { 8291 int ret; 8292 void *tmp = NULL; 8293 abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; 8294 enum zio_compress compress = HDR_GET_COMPRESS(hdr); 8295 uint64_t psize = HDR_GET_PSIZE(hdr); 8296 uint64_t size = arc_hdr_size(hdr); 8297 boolean_t ismd = HDR_ISTYPE_METADATA(hdr); 8298 boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); 8299 dsl_crypto_key_t *dck = NULL; 8300 uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; 8301 boolean_t no_crypt = B_FALSE; 8302 8303 ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && 8304 !HDR_COMPRESSION_ENABLED(hdr)) || 8305 HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); 8306 ASSERT3U(psize, <=, asize); 8307 8308 /* 8309 * If this data simply needs its own buffer, we simply allocate it 8310 * and copy the data. This may be done to eliminate a dependency on a 8311 * shared buffer or to reallocate the buffer to match asize. 8312 */ 8313 if (HDR_HAS_RABD(hdr) && asize != psize) { 8314 ASSERT3U(asize, >=, psize); 8315 to_write = abd_alloc_for_io(asize, ismd); 8316 abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); 8317 if (psize != asize) 8318 abd_zero_off(to_write, psize, asize - psize); 8319 goto out; 8320 } 8321 8322 if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && 8323 !HDR_ENCRYPTED(hdr)) { 8324 ASSERT3U(size, ==, psize); 8325 to_write = abd_alloc_for_io(asize, ismd); 8326 abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); 8327 if (size != asize) 8328 abd_zero_off(to_write, size, asize - size); 8329 goto out; 8330 } 8331 8332 if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { 8333 cabd = abd_alloc_for_io(asize, ismd); 8334 tmp = abd_borrow_buf(cabd, asize); 8335 8336 psize = zio_compress_data(compress, to_write, tmp, size); 8337 ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); 8338 if (psize < asize) 8339 bzero((char *)tmp + psize, asize - psize); 8340 psize = HDR_GET_PSIZE(hdr); 8341 abd_return_buf_copy(cabd, tmp, asize); 8342 to_write = cabd; 8343 } 8344 8345 if (HDR_ENCRYPTED(hdr)) { 8346 eabd = abd_alloc_for_io(asize, ismd); 8347 8348 /* 8349 * If the dataset was disowned before the buffer 8350 * made it to this point, the key to re-encrypt 8351 * it won't be available. In this case we simply 8352 * won't write the buffer to the L2ARC. 
8353 */ 8354 ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, 8355 FTAG, &dck); 8356 if (ret != 0) 8357 goto error; 8358 8359 ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, 8360 hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, 8361 hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, 8362 &no_crypt); 8363 if (ret != 0) 8364 goto error; 8365 8366 if (no_crypt) 8367 abd_copy(eabd, to_write, psize); 8368 8369 if (psize != asize) 8370 abd_zero_off(eabd, psize, asize - psize); 8371 8372 /* assert that the MAC we got here matches the one we saved */ 8373 ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); 8374 spa_keystore_dsl_key_rele(spa, dck, FTAG); 8375 8376 if (to_write == cabd) 8377 abd_free(cabd); 8378 8379 to_write = eabd; 8380 } 8381 8382 out: 8383 ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); 8384 *abd_out = to_write; 8385 return (0); 8386 8387 error: 8388 if (dck != NULL) 8389 spa_keystore_dsl_key_rele(spa, dck, FTAG); 8390 if (cabd != NULL) 8391 abd_free(cabd); 8392 if (eabd != NULL) 8393 abd_free(eabd); 8394 8395 *abd_out = NULL; 8396 return (ret); 8397 } 8398 8399 /* 8400 * Find and write ARC buffers to the L2ARC device. 8401 * 8402 * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid 8403 * for reading until they have completed writing. 8404 * The headroom_boost is an in-out parameter used to maintain headroom boost 8405 * state between calls to this function. 8406 * 8407 * Returns the number of bytes actually written (which may be smaller than 8408 * the delta by which the device hand has changed due to alignment). 8409 */ 8410 static uint64_t 8411 l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) 8412 { 8413 arc_buf_hdr_t *hdr, *hdr_prev, *head; 8414 uint64_t write_asize, write_psize, write_lsize, headroom; 8415 boolean_t full; 8416 l2arc_write_callback_t *cb; 8417 zio_t *pio, *wzio; 8418 uint64_t guid = spa_load_guid(spa); 8419 8420 ASSERT3P(dev->l2ad_vdev, !=, NULL); 8421 8422 pio = NULL; 8423 write_lsize = write_asize = write_psize = 0; 8424 full = B_FALSE; 8425 head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); 8426 arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); 8427 8428 /* 8429 * Copy buffers for L2ARC writing. 8430 */ 8431 for (int try = 0; try <= 3; try++) { 8432 multilist_sublist_t *mls = l2arc_sublist_lock(try); 8433 uint64_t passed_sz = 0; 8434 8435 VERIFY3P(mls, !=, NULL); 8436 8437 /* 8438 * L2ARC fast warmup. 8439 * 8440 * Until the ARC is warm and starts to evict, read from the 8441 * head of the ARC lists rather than the tail. 8442 */ 8443 if (arc_warm == B_FALSE) 8444 hdr = multilist_sublist_head(mls); 8445 else 8446 hdr = multilist_sublist_tail(mls); 8447 8448 headroom = target_sz * l2arc_headroom; 8449 if (zfs_compressed_arc_enabled) 8450 headroom = (headroom * l2arc_headroom_boost) / 100; 8451 8452 for (; hdr; hdr = hdr_prev) { 8453 kmutex_t *hash_lock; 8454 abd_t *to_write = NULL; 8455 8456 if (arc_warm == B_FALSE) 8457 hdr_prev = multilist_sublist_next(mls, hdr); 8458 else 8459 hdr_prev = multilist_sublist_prev(mls, hdr); 8460 8461 hash_lock = HDR_LOCK(hdr); 8462 if (!mutex_tryenter(hash_lock)) { 8463 /* 8464 * Skip this buffer rather than waiting. 8465 */ 8466 continue; 8467 } 8468 8469 passed_sz += HDR_GET_LSIZE(hdr); 8470 if (passed_sz > headroom) { 8471 /* 8472 * Searched too far. 
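 * i.e. the headroom budget (a multiple of the target write size) has
 * been spent scanning this sublist, so stop here and move on to the
 * next list.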
				 */
				mutex_exit(hash_lock);
				break;
			}

			if (!l2arc_write_eligible(guid, hdr)) {
				mutex_exit(hash_lock);
				continue;
			}

			/*
			 * We rely on the L1 portion of the header below, so
			 * it's invalid for this header to have been evicted out
			 * of the ghost cache, prior to being written out. The
			 * ARC_FLAG_L2_WRITING bit ensures this won't happen.
			 */
			ASSERT(HDR_HAS_L1HDR(hdr));

			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
			ASSERT3U(arc_hdr_size(hdr), >, 0);
			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
			    HDR_HAS_RABD(hdr));
			uint64_t psize = HDR_GET_PSIZE(hdr);
			uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev,
			    psize);

			if ((write_asize + asize) > target_sz) {
				full = B_TRUE;
				mutex_exit(hash_lock);
				break;
			}

			arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING);
			ASSERT(HDR_HAS_L1HDR(hdr));

			ASSERT3U(HDR_GET_PSIZE(hdr), >, 0);
			ASSERT(hdr->b_l1hdr.b_pabd != NULL ||
			    HDR_HAS_RABD(hdr));
			ASSERT3U(arc_hdr_size(hdr), >, 0);

			/*
			 * If this header has b_rabd, we can use this since it
			 * must always match the data exactly as it exists on
			 * disk. Otherwise, the L2ARC can normally use the
			 * hdr's data, but if we're sharing data between the
			 * hdr and one of its bufs, L2ARC needs its own copy of
			 * the data so that the ZIO below can't race with the
			 * buf consumer. To ensure that this copy will be
			 * available for the lifetime of the ZIO and be cleaned
			 * up afterwards, we add it to the l2arc_free_on_write
			 * queue. If we need to apply any transforms to the
			 * data (compression, encryption) we will also need the
			 * extra buffer.
			 */
			if (HDR_HAS_RABD(hdr) && psize == asize) {
				to_write = hdr->b_crypt_hdr.b_rabd;
			} else if ((HDR_COMPRESSION_ENABLED(hdr) ||
			    HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) &&
			    !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) &&
			    psize == asize) {
				to_write = hdr->b_l1hdr.b_pabd;
			} else {
				int ret;
				arc_buf_contents_t type = arc_buf_type(hdr);

				ret = l2arc_apply_transforms(spa, hdr, asize,
				    &to_write);
				if (ret != 0) {
					arc_hdr_clear_flags(hdr,
					    ARC_FLAG_L2_WRITING);
					mutex_exit(hash_lock);
					continue;
				}

				l2arc_free_abd_on_write(to_write, asize, type);
			}

			if (pio == NULL) {
				/*
				 * Insert a dummy header on the buflist so
				 * l2arc_write_done() can find where the
				 * write buffers begin without searching.
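				 * l2arc_write_done() walks the buflist from
				 * this dummy header; the header is removed
				 * again once the write completes.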
				 */
				mutex_enter(&dev->l2ad_mtx);
				list_insert_head(&dev->l2ad_buflist, head);
				mutex_exit(&dev->l2ad_mtx);

				cb = kmem_alloc(
				    sizeof (l2arc_write_callback_t), KM_SLEEP);
				cb->l2wcb_dev = dev;
				cb->l2wcb_head = head;
				pio = zio_root(spa, l2arc_write_done, cb,
				    ZIO_FLAG_CANFAIL);
			}

			hdr->b_l2hdr.b_dev = dev;
			hdr->b_l2hdr.b_daddr = dev->l2ad_hand;
			arc_hdr_set_flags(hdr,
			    ARC_FLAG_L2_WRITING | ARC_FLAG_HAS_L2HDR);

			mutex_enter(&dev->l2ad_mtx);
			list_insert_head(&dev->l2ad_buflist, hdr);
			mutex_exit(&dev->l2ad_mtx);

			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
			    arc_hdr_size(hdr), hdr);

			wzio = zio_write_phys(pio, dev->l2ad_vdev,
			    hdr->b_l2hdr.b_daddr, asize, to_write,
			    ZIO_CHECKSUM_OFF, NULL, hdr,
			    ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_CANFAIL, B_FALSE);

			write_lsize += HDR_GET_LSIZE(hdr);
			DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
			    zio_t *, wzio);

			write_psize += psize;
			write_asize += asize;
			dev->l2ad_hand += asize;
			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);

			mutex_exit(hash_lock);

			(void) zio_nowait(wzio);
		}

		multilist_sublist_unlock(mls);

		if (full == B_TRUE)
			break;
	}

	/* No buffers selected for writing? */
	if (pio == NULL) {
		ASSERT0(write_lsize);
		ASSERT(!HDR_HAS_L1HDR(head));
		kmem_cache_free(hdr_l2only_cache, head);
		return (0);
	}

	ASSERT3U(write_asize, <=, target_sz);
	ARCSTAT_BUMP(arcstat_l2_writes_sent);
	ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize);
	ARCSTAT_INCR(arcstat_l2_lsize, write_lsize);
	ARCSTAT_INCR(arcstat_l2_psize, write_psize);

	/*
	 * Bump device hand to the device start if it is approaching the end.
	 * l2arc_evict() will already have evicted ahead for this case.
	 */
	if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) {
		dev->l2ad_hand = dev->l2ad_start;
		dev->l2ad_first = B_FALSE;
	}

	dev->l2ad_writing = B_TRUE;
	(void) zio_wait(pio);
	dev->l2ad_writing = B_FALSE;

	return (write_asize);
}

/*
 * This thread feeds the L2ARC at regular intervals.  This is the beating
 * heart of the L2ARC.
 */
/* ARGSUSED */
static void
l2arc_feed_thread(void *unused)
{
	callb_cpr_t cpr;
	l2arc_dev_t *dev;
	spa_t *spa;
	uint64_t size, wrote;
	clock_t begin, next = ddi_get_lbolt();

	CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);

	mutex_enter(&l2arc_feed_thr_lock);

	while (l2arc_thread_exit == 0) {
		CALLB_CPR_SAFE_BEGIN(&cpr);
		(void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
		    next);
		CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
		next = ddi_get_lbolt() + hz;

		/*
		 * Quick check for L2ARC devices.
		 */
		mutex_enter(&l2arc_dev_mtx);
		if (l2arc_ndev == 0) {
			mutex_exit(&l2arc_dev_mtx);
			continue;
		}
		mutex_exit(&l2arc_dev_mtx);
		begin = ddi_get_lbolt();

		/*
		 * This selects the next l2arc device to write to, and in
		 * doing so the next spa to feed from: dev->l2ad_spa.  This
		 * will return NULL if there are now no l2arc devices or if
		 * they are all faulted.
		 *
		 * If a device is returned, its spa's config lock is also
		 * held to prevent device removal.  l2arc_dev_get_next()
		 * will grab and release l2arc_dev_mtx.
		 */
		if ((dev = l2arc_dev_get_next()) == NULL)
			continue;

		spa = dev->l2ad_spa;
		ASSERT3P(spa, !=, NULL);

		/*
		 * If the pool is read-only then force the feed thread to
		 * sleep a little longer.
		 */
		if (!spa_writeable(spa)) {
			next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz;
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		/*
		 * Avoid contributing to memory pressure.
		 */
		if (arc_reclaim_needed()) {
			ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
			spa_config_exit(spa, SCL_L2ARC, dev);
			continue;
		}

		ARCSTAT_BUMP(arcstat_l2_feeds);

		size = l2arc_write_size();

		/*
		 * Evict L2ARC buffers that will be overwritten.
		 */
		l2arc_evict(dev, size, B_FALSE);

		/*
		 * Write ARC buffers.
		 */
		wrote = l2arc_write_buffers(spa, dev, size);

		/*
		 * Calculate interval between writes.
		 */
		next = l2arc_write_interval(begin, size, wrote);
		spa_config_exit(spa, SCL_L2ARC, dev);
	}

	l2arc_thread_exit = 0;
	cv_broadcast(&l2arc_feed_thr_cv);
	CALLB_CPR_EXIT(&cpr);		/* drops l2arc_feed_thr_lock */
	thread_exit();
}

boolean_t
l2arc_vdev_present(vdev_t *vd)
{
	l2arc_dev_t *dev;

	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev != NULL;
	    dev = list_next(l2arc_dev_list, dev)) {
		if (dev->l2ad_vdev == vd)
			break;
	}
	mutex_exit(&l2arc_dev_mtx);

	return (dev != NULL);
}

/*
 * Add a vdev for use by the L2ARC.  By this point the spa has already
 * validated the vdev and opened it.
 */
void
l2arc_add_vdev(spa_t *spa, vdev_t *vd)
{
	l2arc_dev_t *adddev;

	ASSERT(!l2arc_vdev_present(vd));

	/*
	 * Create a new l2arc device entry.
	 */
	adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
	adddev->l2ad_spa = spa;
	adddev->l2ad_vdev = vd;
	adddev->l2ad_start = VDEV_LABEL_START_SIZE;
	adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd);
	adddev->l2ad_hand = adddev->l2ad_start;
	adddev->l2ad_first = B_TRUE;
	adddev->l2ad_writing = B_FALSE;

	mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL);
	/*
	 * This is a list of all ARC buffers that are still valid on the
	 * device.
	 */
	list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
	    offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node));

	vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand);
	zfs_refcount_create(&adddev->l2ad_alloc);

	/*
	 * Add device to global list
	 */
	mutex_enter(&l2arc_dev_mtx);
	list_insert_head(l2arc_dev_list, adddev);
	atomic_inc_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);
}

/*
 * Remove a vdev from the L2ARC.
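 * This evicts every buffer still cached on the device and tears down the
 * per-device bookkeeping state (buflist, mutex, and refcount).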
 */
void
l2arc_remove_vdev(vdev_t *vd)
{
	l2arc_dev_t *dev, *nextdev, *remdev = NULL;

	/*
	 * Find the device by vdev
	 */
	mutex_enter(&l2arc_dev_mtx);
	for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
		nextdev = list_next(l2arc_dev_list, dev);
		if (vd == dev->l2ad_vdev) {
			remdev = dev;
			break;
		}
	}
	ASSERT3P(remdev, !=, NULL);

	/*
	 * Remove device from global list
	 */
	list_remove(l2arc_dev_list, remdev);
	l2arc_dev_last = NULL;		/* may have been invalidated */
	atomic_dec_64(&l2arc_ndev);
	mutex_exit(&l2arc_dev_mtx);

	/*
	 * Clear all buflists and ARC references.  L2ARC device flush.
	 */
	l2arc_evict(remdev, 0, B_TRUE);
	list_destroy(&remdev->l2ad_buflist);
	mutex_destroy(&remdev->l2ad_mtx);
	zfs_refcount_destroy(&remdev->l2ad_alloc);
	kmem_free(remdev, sizeof (l2arc_dev_t));
}

void
l2arc_init(void)
{
	l2arc_thread_exit = 0;
	l2arc_ndev = 0;
	l2arc_writes_sent = 0;
	l2arc_writes_done = 0;

	mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);

	l2arc_dev_list = &L2ARC_dev_list;
	l2arc_free_on_write = &L2ARC_free_on_write;
	list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
	    offsetof(l2arc_dev_t, l2ad_node));
	list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
	    offsetof(l2arc_data_free_t, l2df_list_node));
}

void
l2arc_fini(void)
{
	/*
	 * This is called from dmu_fini(), which is called from spa_fini().
	 * Because of this, we can assume that all l2arc devices have
	 * already been removed when the pools themselves were removed.
	 */

	l2arc_do_free_on_write();

	mutex_destroy(&l2arc_feed_thr_lock);
	cv_destroy(&l2arc_feed_thr_cv);
	mutex_destroy(&l2arc_dev_mtx);
	mutex_destroy(&l2arc_free_on_write_mtx);

	list_destroy(l2arc_dev_list);
	list_destroy(l2arc_free_on_write);
}

void
l2arc_start(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	(void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
	    TS_RUN, minclsyspri);
}

void
l2arc_stop(void)
{
	if (!(spa_mode_global & FWRITE))
		return;

	mutex_enter(&l2arc_feed_thr_lock);
	cv_signal(&l2arc_feed_thr_cv);	/* kick thread out of startup */
	l2arc_thread_exit = 1;
	while (l2arc_thread_exit != 0)
		cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
	mutex_exit(&l2arc_feed_thr_lock);
}