xref: /titanic_51/usr/src/uts/common/fs/zfs/arc.c (revision 3a3e8d7acddcf5f846fdd54de49bd37c17e44d43)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * DVA-based Adjustable Replacement Cache
30  *
31  * While much of the theory of operation and algorithms used here
32  * are based on the self-tuning, low overhead replacement cache
33  * presented by Megiddo and Modha at FAST 2003, there are some
34  * significant differences:
35  *
36  * 1. The Megiddo and Modha model assumes any page is evictable.
37  * Pages in its cache cannot be "locked" into memory.  This makes
38  * the eviction algorithm simple: evict the last page in the list.
39  * This also makes the performance characteristics easy to reason
40  * about.  Our cache is not so simple.  At any given moment, some
41  * subset of the blocks in the cache are un-evictable because we
42  * have handed out a reference to them.  Blocks are only evictable
43  * when there are no external references active.  This makes
44  * eviction far more problematic:  we choose to evict the evictable
45  * blocks that are the "lowest" in the list.
46  *
47  * There are times when it is not possible to evict the requested
48  * space.  In these circumstances we are unable to adjust the cache
49  * size.  To prevent the cache growing unbounded at these times we
50  * implement a "cache throttle" that slowes the flow of new data
51  * into the cache until we can make space avaiable.
52  *
53  * 2. The Megiddo and Modha model assumes a fixed cache size.
54  * Pages are evicted when the cache is full and there is a cache
55  * miss.  Our model has a variable sized cache.  It grows with
56  * high use, but also tries to react to memory pressure from the
57  * operating system: decreasing its size when system memory is
58  * tight.
59  *
60  * 3. The Megiddo and Modha model assumes a fixed page size. All
61  * elements of the cache are therefore exactly the same size.  So
62  * when adjusting the cache size following a cache miss, it's simply
63  * a matter of choosing a single page to evict.  In our model, we
64  * have variable sized cache blocks (ranging from 512 bytes to
65  * 128K bytes).  We therefore choose a set of blocks to evict to make
66  * space for a cache miss that approximates as closely as possible
67  * the space used by the new block.
68  *
69  * See also:  "ARC: A Self-Tuning, Low Overhead Replacement Cache"
70  * by N. Megiddo & D. Modha, FAST 2003
71  */
72 
73 /*
74  * The locking model:
75  *
76  * A new reference to a cache buffer can be obtained in two
77  * ways: 1) via a hash table lookup using the DVA as a key,
78  * or 2) via one of the ARC lists.  The arc_read() interface
79  * uses method 1, while the internal arc algorithms for
80  * adjusting the cache use method 2.  We therefore provide two
81  * types of locks: 1) the hash table lock array, and 2) the
82  * arc list locks.
83  *
84  * Buffers do not have their own mutexes; rather they rely on the
85  * hash table mutexes for the bulk of their protection (i.e. most
86  * fields in the arc_buf_hdr_t are protected by these mutexes).
87  *
88  * buf_hash_find() returns the appropriate mutex (held) when it
89  * locates the requested buffer in the hash table.  It returns
90  * NULL for the mutex if the buffer was not in the table.
91  *
92  * buf_hash_remove() expects the appropriate hash mutex to be
93  * already held before it is invoked.
94  *
95  * Each arc state also has a mutex which is used to protect the
96  * buffer list associated with the state.  When attempting to
97  * obtain a hash table lock while holding an arc list lock you
98  * must use mutex_tryenter() to avoid deadlock.  Also note that
99  * the "top" state mutex must be held before the "bot" state mutex.
100  *
101  * Note that the majority of the performance stats are manipulated
102  * with atomic operations.
103  */
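/*
 * As an illustrative sketch of the rule above (see arc_evict_state() for
 * the real code): while an arc list lock is held, the hash lock for a
 * buffer on that list may only be taken with mutex_tryenter(), and the
 * buffer is simply skipped when the attempt fails:
 *
 *	mutex_enter(&state->mtx);
 *	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
 *		ab_prev = list_prev(&state->list, ab);
 *		if (!mutex_tryenter(HDR_LOCK(ab)))
 *			continue;	(skip it; blocking could deadlock)
 *		... evict ab ...
 *		mutex_exit(HDR_LOCK(ab));
 *	}
 *	mutex_exit(&state->mtx);
 */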
104 
105 #include <sys/spa.h>
106 #include <sys/zio.h>
107 #include <sys/zfs_context.h>
108 #include <sys/arc.h>
109 #include <sys/refcount.h>
110 #ifdef _KERNEL
111 #include <sys/vmsystm.h>
112 #include <vm/anon.h>
113 #include <sys/fs/swapnode.h>
114 #include <sys/dnlc.h>
115 #endif
116 #include <sys/callb.h>
117 
118 static kmutex_t		arc_reclaim_thr_lock;
119 static kcondvar_t	arc_reclaim_thr_cv;	/* used to signal reclaim thr */
120 static uint8_t		arc_thread_exit;
121 
122 #define	ARC_REDUCE_DNLC_PERCENT	3
123 uint_t arc_reduce_dnlc_percent = ARC_REDUCE_DNLC_PERCENT;
124 
125 typedef enum arc_reclaim_strategy {
126 	ARC_RECLAIM_AGGR,		/* Aggressive reclaim strategy */
127 	ARC_RECLAIM_CONS		/* Conservative reclaim strategy */
128 } arc_reclaim_strategy_t;
129 
130 /* number of seconds before growing cache again */
131 static int		arc_grow_retry = 60;
132 
133 static kmutex_t arc_reclaim_lock;
134 static int arc_dead;
135 
136 /*
137  * Note that buffers can be in one of 5 states:
138  *	ARC_anon	- anonymous (discussed below)
139  *	ARC_mru_top	- recently used, currently cached
140  *	ARC_mru_bot	- recently used, no longer in cache
141  *	ARC_mfu_top	- frequently used, currently cached
142  *	ARC_mfu_bot	- frequently used, no longer in cache
143  * When there are no active references to a buffer, it is
144  * linked onto one of the lists in arc.  These are the
145  * only buffers that can be evicted or deleted.
146  *
147  * Anonymous buffers are buffers that are not associated with
148  * a DVA.  These are buffers that hold dirty block copies
149  * before they are written to stable storage.  By definition,
150  * they are "ref'd" and are considered part of arc_mru_top
151  * that cannot be freed.  Generally, they will acquire a DVA
152  * as they are written and migrate onto the arc_mru_top list.
153  */
154 
155 typedef struct arc_state {
156 	list_t	list;	/* linked list of evictable buffers in state */
157 	uint64_t lsize;	/* total size of buffers in the linked list */
158 	uint64_t size;	/* total size of all buffers in this state */
159 	uint64_t hits;
160 	kmutex_t mtx;
161 } arc_state_t;
162 
163 /* The 5 states: */
164 static arc_state_t ARC_anon;
165 static arc_state_t ARC_mru_top;
166 static arc_state_t ARC_mru_bot;
167 static arc_state_t ARC_mfu_top;
168 static arc_state_t ARC_mfu_bot;
169 
170 static struct arc {
171 	arc_state_t 	*anon;
172 	arc_state_t	*mru_top;
173 	arc_state_t	*mru_bot;
174 	arc_state_t	*mfu_top;
175 	arc_state_t	*mfu_bot;
176 	uint64_t	size;		/* Actual total arc size */
177 	uint64_t	p;		/* Target size (in bytes) of mru_top */
178 	uint64_t	c;		/* Target size of cache (in bytes) */
179 	uint64_t	c_min;		/* Minimum target cache size */
180 	uint64_t	c_max;		/* Maximum target cache size */
181 	uint64_t	incr;		/* Size by which to increment arc.c */
182 	int64_t		size_check;
183 
184 	/* performance stats */
185 	uint64_t	hits;
186 	uint64_t	misses;
187 	uint64_t	deleted;
188 	uint64_t	skipped;
189 	uint64_t	hash_elements;
190 	uint64_t	hash_elements_max;
191 	uint64_t	hash_collisions;
192 	uint64_t	hash_chains;
193 	uint32_t	hash_chain_max;
194 
195 	int		no_grow;	/* Don't try to grow cache size */
196 } arc;
197 
198 /* Default amount to grow arc.incr */
199 static int64_t arc_incr_size = 1024;
200 
201 /* > 0 ==> time to increment arc.c */
202 static int64_t arc_size_check_default = -1000;
203 
204 static uint64_t arc_tempreserve;
205 
206 typedef struct arc_callback arc_callback_t;
207 
208 struct arc_callback {
209 	arc_done_func_t		*acb_done;
210 	void			*acb_private;
211 	arc_byteswap_func_t	*acb_byteswap;
212 	arc_buf_t		*acb_buf;
213 	zio_t			*acb_zio_dummy;
214 	arc_callback_t		*acb_next;
215 };
216 
217 struct arc_buf_hdr {
218 	/* immutable */
219 	uint64_t		b_size;
220 	spa_t			*b_spa;
221 
222 	/* protected by hash lock */
223 	dva_t			b_dva;
224 	uint64_t		b_birth;
225 	uint64_t		b_cksum0;
226 
227 	arc_buf_hdr_t		*b_hash_next;
228 	arc_buf_t		*b_buf;
229 	uint32_t		b_flags;
230 
231 	kcondvar_t		b_cv;
232 	arc_callback_t		*b_acb;
233 
234 	/* protected by arc state mutex */
235 	arc_state_t		*b_state;
236 	list_node_t		b_arc_node;
237 
238 	/* updated atomically */
239 	clock_t			b_arc_access;
240 
241 	/* self protecting */
242 	refcount_t		b_refcnt;
243 };
244 
245 /*
246  * Private ARC flags.  These are ARC-only flags that will show up in
247  * b_flags in the arc_buf_hdr_t.  Some flags are publicly declared and can
248  * be passed in as arc_flags to routines like arc_read(); the private flags
249  * below, however, should never be passed in and should only be set by ARC
250  * code.  When adding new public flags, make sure not to smash the private ones.
251  */
252 
253 #define	ARC_IO_IN_PROGRESS	(1 << 10)	/* I/O in progress for buf */
254 #define	ARC_IO_ERROR		(1 << 11)	/* I/O failed for buf */
255 #define	ARC_FREED_IN_READ	(1 << 12)	/* buf freed while in read */
256 
257 #define	HDR_IO_IN_PROGRESS(hdr)	((hdr)->b_flags & ARC_IO_IN_PROGRESS)
258 #define	HDR_IO_ERROR(hdr)	((hdr)->b_flags & ARC_IO_ERROR)
259 #define	HDR_FREED_IN_READ(hdr)	((hdr)->b_flags & ARC_FREED_IN_READ)
260 
261 /*
262  * Hash table routines
263  */
264 
265 #define	HT_LOCK_PAD	64
266 
267 struct ht_lock {
268 	kmutex_t	ht_lock;
269 #ifdef _KERNEL
270 	unsigned char	pad[(HT_LOCK_PAD - sizeof (kmutex_t))];
271 #endif
272 };
273 
274 #define	BUF_LOCKS 256
275 typedef struct buf_hash_table {
276 	uint64_t ht_mask;
277 	arc_buf_hdr_t **ht_table;
278 	struct ht_lock ht_locks[BUF_LOCKS];
279 } buf_hash_table_t;
280 
281 static buf_hash_table_t buf_hash_table;
282 
283 #define	BUF_HASH_INDEX(spa, dva, birth) \
284 	(buf_hash(spa, dva, birth) & buf_hash_table.ht_mask)
285 #define	BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)])
286 #define	BUF_HASH_LOCK(idx)	(&(BUF_HASH_LOCK_NTRY(idx).ht_lock))
287 #define	HDR_LOCK(buf) \
288 	(BUF_HASH_LOCK(BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth)))
289 
290 uint64_t zfs_crc64_table[256];
291 
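/*
 * Hash the identity of a block: its spa, DVA, and birth txg.  The DVA bytes
 * are run through the ZFS CRC-64 table, then folded with the spa pointer and
 * the birth txg.  BUF_HASH_INDEX() masks this down to a hash bucket, and
 * BUF_HASH_LOCK_NTRY() maps it to one of the BUF_LOCKS bucket locks.
 */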
292 static uint64_t
293 buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
294 {
295 	uintptr_t spav = (uintptr_t)spa;
296 	uint8_t *vdva = (uint8_t *)dva;
297 	uint64_t crc = -1ULL;
298 	int i;
299 
300 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
301 
302 	for (i = 0; i < sizeof (dva_t); i++)
303 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF];
304 
305 	crc ^= (spav>>8) ^ birth;
306 
307 	return (crc);
308 }
309 
310 #define	BUF_EMPTY(buf)						\
311 	((buf)->b_dva.dva_word[0] == 0 &&			\
312 	(buf)->b_dva.dva_word[1] == 0 &&			\
313 	(buf)->b_birth == 0)
314 
315 #define	BUF_EQUAL(spa, dva, birth, buf)				\
316 	((buf)->b_dva.dva_word[0] == (dva)->dva_word[0]) &&	\
317 	((buf)->b_dva.dva_word[1] == (dva)->dva_word[1]) &&	\
318 	((buf)->b_birth == birth) && ((buf)->b_spa == spa)
319 
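/*
 * Look up a buffer header by (spa, dva, birth).  If found, return it with
 * its hash lock held in *lockp; otherwise return NULL and set *lockp to NULL.
 */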
320 static arc_buf_hdr_t *
321 buf_hash_find(spa_t *spa, dva_t *dva, uint64_t birth, kmutex_t **lockp)
322 {
323 	uint64_t idx = BUF_HASH_INDEX(spa, dva, birth);
324 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
325 	arc_buf_hdr_t *buf;
326 
327 	mutex_enter(hash_lock);
328 	for (buf = buf_hash_table.ht_table[idx]; buf != NULL;
329 	    buf = buf->b_hash_next) {
330 		if (BUF_EQUAL(spa, dva, birth, buf)) {
331 			*lockp = hash_lock;
332 			return (buf);
333 		}
334 	}
335 	mutex_exit(hash_lock);
336 	*lockp = NULL;
337 	return (NULL);
338 }
339 
340 /*
341  * Insert an entry into the hash table.  If there is already an element
342  * equal to the new one in the hash table, then the existing element
343  * will be returned and the new element will not be inserted.
344  * Otherwise returns NULL.
345  */
346 static arc_buf_hdr_t *fbufs[4]; /* XXX to find 6341326 */
347 static kthread_t *fbufs_lastthread;
348 static arc_buf_hdr_t *
349 buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
350 {
351 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
352 	kmutex_t *hash_lock = BUF_HASH_LOCK(idx);
353 	arc_buf_hdr_t *fbuf;
354 	uint32_t max, i;
355 
356 	fbufs_lastthread = curthread;
357 	*lockp = hash_lock;
358 	mutex_enter(hash_lock);
359 	for (fbuf = buf_hash_table.ht_table[idx], i = 0; fbuf != NULL;
360 	    fbuf = fbuf->b_hash_next, i++) {
361 		if (i < sizeof (fbufs) / sizeof (fbufs[0]))
362 			fbufs[i] = fbuf;
363 		if (BUF_EQUAL(buf->b_spa, &buf->b_dva, buf->b_birth, fbuf))
364 			return (fbuf);
365 	}
366 
367 	buf->b_hash_next = buf_hash_table.ht_table[idx];
368 	buf_hash_table.ht_table[idx] = buf;
369 
370 	/* collect some hash table performance data */
371 	if (i > 0) {
372 		atomic_add_64(&arc.hash_collisions, 1);
373 		if (i == 1)
374 			atomic_add_64(&arc.hash_chains, 1);
375 	}
376 	while (i > (max = arc.hash_chain_max) &&
377 	    max != atomic_cas_32(&arc.hash_chain_max, max, i)) {
378 		continue;
379 	}
380 	atomic_add_64(&arc.hash_elements, 1);
381 	if (arc.hash_elements > arc.hash_elements_max)
382 		atomic_add_64(&arc.hash_elements_max, 1);
383 
384 	return (NULL);
385 }
386 
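/*
 * Remove a header from the hash table.  The appropriate bucket lock must
 * already be held by the caller (see the locking comment above).
 */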
387 static void
388 buf_hash_remove(arc_buf_hdr_t *buf)
389 {
390 	arc_buf_hdr_t *fbuf, **bufp;
391 	uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
392 
393 	ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
394 
395 	bufp = &buf_hash_table.ht_table[idx];
396 	while ((fbuf = *bufp) != buf) {
397 		ASSERT(fbuf != NULL);
398 		bufp = &fbuf->b_hash_next;
399 	}
400 	*bufp = buf->b_hash_next;
401 	buf->b_hash_next = NULL;
402 
403 	/* collect some hash table performance data */
404 	atomic_add_64(&arc.hash_elements, -1);
405 	if (buf_hash_table.ht_table[idx] &&
406 	    buf_hash_table.ht_table[idx]->b_hash_next == NULL)
407 		atomic_add_64(&arc.hash_chains, -1);
408 }
409 
410 /*
411  * Global data structures and functions for the buf kmem cache.
412  */
413 static kmem_cache_t *hdr_cache;
414 static kmem_cache_t *buf_cache;
415 
416 static void
417 buf_fini(void)
418 {
419 	int i;
420 
421 	kmem_free(buf_hash_table.ht_table,
422 	    (buf_hash_table.ht_mask + 1) * sizeof (void *));
423 	for (i = 0; i < BUF_LOCKS; i++)
424 		mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock);
425 	kmem_cache_destroy(hdr_cache);
426 	kmem_cache_destroy(buf_cache);
427 }
428 
429 /*
430  * Constructor callback - called when the cache is empty
431  * and a new buf is requested.
432  */
433 /* ARGSUSED */
434 static int
435 hdr_cons(void *vbuf, void *unused, int kmflag)
436 {
437 	arc_buf_hdr_t *buf = vbuf;
438 
439 	bzero(buf, sizeof (arc_buf_hdr_t));
440 	refcount_create(&buf->b_refcnt);
441 	cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
442 	return (0);
443 }
444 
445 /*
446  * Destructor callback - called when a cached buf is
447  * no longer required.
448  */
449 /* ARGSUSED */
450 static void
451 hdr_dest(void *vbuf, void *unused)
452 {
453 	arc_buf_hdr_t *buf = vbuf;
454 
455 	refcount_destroy(&buf->b_refcnt);
456 	cv_destroy(&buf->b_cv);
457 }
458 
459 void arc_kmem_reclaim(void);
460 
461 /*
462  * Reclaim callback -- invoked when memory is low.
463  */
464 /* ARGSUSED */
465 static void
466 hdr_recl(void *unused)
467 {
468 	dprintf("hdr_recl called\n");
469 	arc_kmem_reclaim();
470 }
471 
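/*
 * Set up the buffer hash table (sized to cover physical memory at an
 * average 4k block size), the per-bucket locks, the CRC-64 table, and the
 * kmem caches for headers and buffers.
 */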
472 static void
473 buf_init(void)
474 {
475 	uint64_t *ct;
476 	uint64_t hsize = 1ULL << 10;
477 	int i, j;
478 
479 	/*
480 	 * The hash table is big enough to index all of physical memory
481 	 * with an average 4k block size.  The table will take up
482 	 * totalmem*sizeof(void*)/4k bytes (e.g. 2MB/GB with 8-byte
483 	 * pointers).
484 	 */
485 	while (hsize * 4096 < physmem * PAGESIZE)
486 		hsize <<= 1;
487 
488 	buf_hash_table.ht_mask = hsize - 1;
489 	buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
490 
491 	hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
492 	    0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
493 	buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t),
494 	    0, NULL, NULL, NULL, NULL, NULL, 0);
495 
496 	for (i = 0; i < 256; i++)
497 		for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--)
498 			*ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY);
499 
500 	for (i = 0; i < BUF_LOCKS; i++) {
501 		mutex_init(&buf_hash_table.ht_locks[i].ht_lock,
502 		    NULL, MUTEX_DEFAULT, NULL);
503 	}
504 }
505 
506 #define	ARC_MINTIME	(hz>>4) /* 62 ms */
507 
508 #define	ARC_TAG		(void *)0x05201962
509 
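/*
 * Take a reference on a buffer on behalf of `tag'.  If this is the first
 * reference and the buffer is not anonymous, pull it off its state's
 * evictable list, since referenced buffers may not be evicted.
 */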
510 static void
511 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
512 {
513 	ASSERT(MUTEX_HELD(hash_lock));
514 
515 	if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
516 	    (ab->b_state != arc.anon)) {
517 
518 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
519 		mutex_enter(&ab->b_state->mtx);
520 		ASSERT(!refcount_is_zero(&ab->b_refcnt));
521 		ASSERT(list_link_active(&ab->b_arc_node));
522 		list_remove(&ab->b_state->list, ab);
523 		ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
524 		ab->b_state->lsize -= ab->b_size;
525 		mutex_exit(&ab->b_state->mtx);
526 	}
527 }
528 
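/*
 * Drop the reference held by `tag'.  If this was the last reference and
 * the buffer is not anonymous, put it back on its state's evictable list.
 * Returns the number of references remaining.
 */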
529 static int
530 remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
531 {
532 	int cnt;
533 
534 	ASSERT(MUTEX_HELD(hash_lock));
535 
536 	if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
537 	    (ab->b_state != arc.anon)) {
538 
539 		ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
540 		mutex_enter(&ab->b_state->mtx);
541 		ASSERT(!list_link_active(&ab->b_arc_node));
542 		list_insert_head(&ab->b_state->list, ab);
543 		ASSERT(ab->b_buf != NULL);
544 		ab->b_state->lsize += ab->b_size;
545 		mutex_exit(&ab->b_state->mtx);
546 	}
547 	return (cnt);
548 }
549 
550 /*
551  * Move the supplied buffer to the indicated state.  The mutex
552  * for the buffer must be held by the caller.
553  */
554 static void
555 arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
556     kmutex_t *hash_lock)
557 {
558 	arc_buf_t *buf;
559 
560 	ASSERT(MUTEX_HELD(hash_lock));
561 
562 	/*
563 	 * If this buffer is evictable, transfer it from the
564 	 * old state list to the new state list.
565 	 */
566 	if (refcount_is_zero(&ab->b_refcnt)) {
567 		if (ab->b_state != arc.anon) {
568 			int drop_mutex = FALSE;
569 
570 			if (!MUTEX_HELD(&ab->b_state->mtx)) {
571 				mutex_enter(&ab->b_state->mtx);
572 				drop_mutex = TRUE;
573 			}
574 			ASSERT(list_link_active(&ab->b_arc_node));
575 			list_remove(&ab->b_state->list, ab);
576 			ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
577 			ab->b_state->lsize -= ab->b_size;
578 			if (drop_mutex)
579 				mutex_exit(&ab->b_state->mtx);
580 		}
581 		if (new_state != arc.anon) {
582 			int drop_mutex = FALSE;
583 
584 			if (!MUTEX_HELD(&new_state->mtx)) {
585 				mutex_enter(&new_state->mtx);
586 				drop_mutex = TRUE;
587 			}
588 			list_insert_head(&new_state->list, ab);
589 			ASSERT(ab->b_buf != NULL);
590 			new_state->lsize += ab->b_size;
591 			if (drop_mutex)
592 				mutex_exit(&new_state->mtx);
593 		}
594 	}
595 
596 	ASSERT(!BUF_EMPTY(ab));
597 	if (new_state == arc.anon && ab->b_state != arc.anon) {
598 		buf_hash_remove(ab);
599 	}
600 
601 	/*
602 	 * If this buffer isn't being transferred to the MRU-top
603 	 * state, it's safe to clear its prefetch flag
604 	 */
605 	if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
606 		ab->b_flags &= ~ARC_PREFETCH;
607 	}
608 
609 	buf = ab->b_buf;
610 	if (buf == NULL) {
611 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
612 		atomic_add_64(&ab->b_state->size, -ab->b_size);
613 		/* we should only be here if we are deleting state */
614 		ASSERT(new_state == arc.anon &&
615 		    (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
616 	} else while (buf) {
617 		ASSERT3U(ab->b_state->size, >=, ab->b_size);
618 		atomic_add_64(&ab->b_state->size, -ab->b_size);
619 		atomic_add_64(&new_state->size, ab->b_size);
620 		buf = buf->b_next;
621 	}
622 	ab->b_state = new_state;
623 }
624 
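/*
 * Allocate an anonymous buffer of `size' bytes, with a reference held by
 * `tag'.  The buffer only acquires a DVA (and a hash table entry) later,
 * when it is written out (see arc_write_done()).
 */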
625 arc_buf_t *
626 arc_buf_alloc(spa_t *spa, int size, void *tag)
627 {
628 	arc_buf_hdr_t *hdr;
629 	arc_buf_t *buf;
630 
631 	ASSERT3U(size, >, 0);
632 	hdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
633 	ASSERT(BUF_EMPTY(hdr));
634 	hdr->b_size = size;
635 	hdr->b_spa = spa;
636 	hdr->b_state = arc.anon;
637 	hdr->b_arc_access = 0;
638 	buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
639 	buf->b_hdr = hdr;
640 	buf->b_next = NULL;
641 	buf->b_data = zio_buf_alloc(size);
642 	hdr->b_buf = buf;
643 	hdr->b_flags = 0;
644 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
645 	(void) refcount_add(&hdr->b_refcnt, tag);
646 
647 	atomic_add_64(&arc.size, size);
648 	atomic_add_64(&arc.anon->size, size);
649 
650 	return (buf);
651 }
652 
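/*
 * Free an anonymous, unreferenced header and the data buffer attached to
 * it.  Note that this can be called with an arc state lock held, so the
 * hash lock is never taken here.
 */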
653 static void
654 arc_hdr_free(arc_buf_hdr_t *hdr)
655 {
656 	ASSERT(refcount_is_zero(&hdr->b_refcnt));
657 	ASSERT3P(hdr->b_state, ==, arc.anon);
658 
659 	if (!BUF_EMPTY(hdr)) {
660 		/*
661 		 * We can be called with an arc state lock held,
662 		 * so we can't hold a hash lock here.
663 		 * ASSERT(not in hash table)
664 		 */
665 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
666 		bzero(&hdr->b_dva, sizeof (dva_t));
667 		hdr->b_birth = 0;
668 		hdr->b_cksum0 = 0;
669 	}
670 	if (hdr->b_buf) {
671 		arc_buf_t *buf = hdr->b_buf;
672 
673 		ASSERT3U(hdr->b_size, >, 0);
674 		zio_buf_free(buf->b_data, hdr->b_size);
675 		atomic_add_64(&arc.size, -hdr->b_size);
676 		ASSERT3U(arc.anon->size, >=, hdr->b_size);
677 		atomic_add_64(&arc.anon->size, -hdr->b_size);
678 		ASSERT3P(buf->b_next, ==, NULL);
679 		kmem_cache_free(buf_cache, buf);
680 		hdr->b_buf = NULL;
681 	}
682 	ASSERT(!list_link_active(&hdr->b_arc_node));
683 	ASSERT3P(hdr->b_hash_next, ==, NULL);
684 	ASSERT3P(hdr->b_acb, ==, NULL);
685 	kmem_cache_free(hdr_cache, hdr);
686 }
687 
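/*
 * Release the reference held by `tag' on `buf'.  If other references
 * remain, just this copy of the data is freed.  Otherwise the buffer
 * either stays in the cache as evictable, or, if it is anonymous and not
 * in the middle of an async write, is freed entirely.
 */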
688 void
689 arc_buf_free(arc_buf_t *buf, void *tag)
690 {
691 	arc_buf_hdr_t *hdr = buf->b_hdr;
692 	kmutex_t *hash_lock = HDR_LOCK(hdr);
693 	int freeable;
694 
695 	mutex_enter(hash_lock);
696 	if (remove_reference(hdr, hash_lock, tag) > 0) {
697 		arc_buf_t **bufp = &hdr->b_buf;
698 		arc_state_t *state = hdr->b_state;
699 		uint64_t size = hdr->b_size;
700 
701 		ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
702 		while (*bufp != buf) {
703 			ASSERT(*bufp);
704 			bufp = &(*bufp)->b_next;
705 		}
706 		*bufp = buf->b_next;
707 		mutex_exit(hash_lock);
708 		zio_buf_free(buf->b_data, size);
709 		atomic_add_64(&arc.size, -size);
710 		kmem_cache_free(buf_cache, buf);
711 		ASSERT3U(state->size, >=, size);
712 		atomic_add_64(&state->size, -size);
713 		return;
714 	}
715 
716 	/* don't free buffers that are in the middle of an async write */
717 	freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
718 	mutex_exit(hash_lock);
719 
720 	if (freeable)
721 		arc_hdr_free(hdr);
722 }
723 
724 int
725 arc_buf_size(arc_buf_t *buf)
726 {
727 	return (buf->b_hdr->b_size);
728 }
729 
730 /*
731  * Evict buffers from list until we've removed the specified number of
732  * bytes.  Move the removed buffers to the appropriate evict state.
733  */
734 static uint64_t
735 arc_evict_state(arc_state_t *state, int64_t bytes)
736 {
737 	arc_state_t *evicted_state;
738 	uint64_t bytes_evicted = 0;
739 	arc_buf_hdr_t *ab, *ab_prev;
740 	kmutex_t *hash_lock;
741 
742 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
743 
744 	if (state == arc.mru_top)
745 		evicted_state = arc.mru_bot;
746 	else
747 		evicted_state = arc.mfu_bot;
748 
749 	mutex_enter(&state->mtx);
750 	mutex_enter(&evicted_state->mtx);
751 
752 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
753 		ab_prev = list_prev(&state->list, ab);
754 		hash_lock = HDR_LOCK(ab);
755 		if (mutex_tryenter(hash_lock)) {
756 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
757 			arc_change_state(evicted_state, ab, hash_lock);
758 			zio_buf_free(ab->b_buf->b_data, ab->b_size);
759 			atomic_add_64(&arc.size, -ab->b_size);
760 			ASSERT3P(ab->b_buf->b_next, ==, NULL);
761 			kmem_cache_free(buf_cache, ab->b_buf);
762 			ab->b_buf = NULL;
763 			DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
764 			bytes_evicted += ab->b_size;
765 			mutex_exit(hash_lock);
766 			if (bytes_evicted >= bytes)
767 				break;
768 		} else {
769 			atomic_add_64(&arc.skipped, 1);
770 		}
771 	}
772 	mutex_exit(&evicted_state->mtx);
773 	mutex_exit(&state->mtx);
774 
775 	if (bytes_evicted < bytes)
776 		dprintf("only evicted %lld bytes from %x",
777 		    (longlong_t)bytes_evicted, state);
778 
779 	return (bytes_evicted);
780 }
781 
782 /*
783  * Remove buffers from list until we've removed the specified number of
784  * bytes.  Destroy the buffers that are removed.
785  */
786 static void
787 arc_delete_state(arc_state_t *state, int64_t bytes)
788 {
789 	uint_t bufs_skipped = 0;
790 	uint64_t bytes_deleted = 0;
791 	arc_buf_hdr_t *ab, *ab_prev;
792 	kmutex_t *hash_lock;
793 
794 top:
795 	mutex_enter(&state->mtx);
796 	for (ab = list_tail(&state->list); ab; ab = ab_prev) {
797 		ab_prev = list_prev(&state->list, ab);
798 		hash_lock = HDR_LOCK(ab);
799 		if (mutex_tryenter(hash_lock)) {
800 			arc_change_state(arc.anon, ab, hash_lock);
801 			mutex_exit(hash_lock);
802 			atomic_add_64(&arc.deleted, 1);
803 			DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
804 			bytes_deleted += ab->b_size;
805 			arc_hdr_free(ab);
806 			if (bytes >= 0 && bytes_deleted >= bytes)
807 				break;
808 		} else {
809 			if (bytes < 0) {
810 				mutex_exit(&state->mtx);
811 				mutex_enter(hash_lock);
812 				mutex_exit(hash_lock);
813 				goto top;
814 			}
815 			bufs_skipped += 1;
816 		}
817 	}
818 	mutex_exit(&state->mtx);
819 
820 	if (bufs_skipped) {
821 		atomic_add_64(&arc.skipped, bufs_skipped);
822 		ASSERT(bytes >= 0);
823 	}
824 
825 	if (bytes_deleted < bytes)
826 		dprintf("only deleted %lld bytes from %p",
827 		    (longlong_t)bytes_deleted, state);
828 }
829 
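/*
 * Enforce the cache targets: evict from the MRU if anon + MRU exceeds p,
 * trim the MRU ghost list when it pushes us over c, and, if the total arc
 * size still exceeds c, evict from the MFU and trim the MFU ghost list so
 * that the cache plus its ghost lists stays within roughly 2c.
 */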
830 static void
831 arc_adjust(void)
832 {
833 	int64_t top_sz, mru_over, arc_over;
834 
835 	top_sz = arc.anon->size + arc.mru_top->size;
836 
837 	if (top_sz > arc.p && arc.mru_top->lsize > 0) {
838 		int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
839 		(void) arc_evict_state(arc.mru_top, toevict);
840 		top_sz = arc.anon->size + arc.mru_top->size;
841 	}
842 
843 	mru_over = top_sz + arc.mru_bot->size - arc.c;
844 
845 	if (mru_over > 0) {
846 		if (arc.mru_bot->lsize > 0) {
847 			int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
848 			arc_delete_state(arc.mru_bot, todelete);
849 		}
850 	}
851 
852 	if ((arc_over = arc.size - arc.c) > 0) {
853 		int64_t table_over;
854 
855 		if (arc.mfu_top->lsize > 0) {
856 			int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
857 			(void) arc_evict_state(arc.mfu_top, toevict);
858 		}
859 
860 		table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
861 		    - arc.c*2;
862 
863 		if (table_over > 0 && arc.mfu_bot->lsize > 0) {
864 			int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
865 			arc_delete_state(arc.mfu_bot, todelete);
866 		}
867 	}
868 }
869 
870 /*
871  * Flush all *evictable* data from the cache.
872  * NOTE: this will not touch "active" (i.e. referenced) data.
873  */
874 void
875 arc_flush(void)
876 {
877 	arc_delete_state(arc.mru_top, -1);
878 	arc_delete_state(arc.mfu_top, -1);
879 
880 	arc_delete_state(arc.mru_bot, -1);
881 	arc_delete_state(arc.mfu_bot, -1);
882 }
883 
884 void
885 arc_kmem_reclaim(void)
886 {
887 	/* Remove 6.25% */
888 	/*
889 	 * We need arc_reclaim_lock because we don't want multiple
890 	 * threads trying to reclaim concurrently.
891 	 */
892 
893 	/*
894 	 * umem calls the reclaim func when we destroy the buf cache,
895 	 * which is after we do arc_fini().  So we set a flag to prevent
896 	 * accessing the destroyed mutexes and lists.
897 	 */
898 	if (arc_dead)
899 		return;
900 
901 	mutex_enter(&arc_reclaim_lock);
902 
903 	atomic_add_64(&arc.c, -(arc.c >> 4));
904 	if (arc.c < arc.c_min)
905 		arc.c = arc.c_min;
906 	atomic_add_64(&arc.p, -(arc.p >> 4));
907 
908 	arc_adjust();
909 
910 	/* Cool it for a while */
911 	arc.incr = 0;
912 	arc.size_check = arc_size_check_default << 3;
913 
914 	mutex_exit(&arc_reclaim_lock);
915 }
916 
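/*
 * Return nonzero if the system is short enough on memory (or, on i386,
 * on kernel heap) that the ARC should shrink rather than grow.
 */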
917 static int
918 arc_reclaim_needed(void)
919 {
920 	uint64_t extra;
921 
922 #ifdef _KERNEL
923 	/*
924 	 * take 'desfree' extra pages, so we reclaim sooner, rather than later
925 	 */
926 	extra = desfree;
927 
928 	/*
929 	 * check that we're out of range of the pageout scanner.  It starts to
930 	 * schedule paging if freemem is less than lotsfree and needfree.
931 	 * lotsfree is the high-water mark for pageout, and needfree is the
932 	 * number of needed free pages.  We add extra pages here to make sure
933 	 * the scanner doesn't start up while we're freeing memory.
934 	 */
935 	if (freemem < lotsfree + needfree + extra)
936 		return (1);
937 
938 	/*
939 	 * check to make sure that swapfs has enough space so that anon
940  * reservations can still succeed.  anon_resvmem() checks that the
941 	 * availrmem is greater than swapfs_minfree, and the number of reserved
942 	 * swap pages.  We also add a bit of extra here just to prevent
943 	 * circumstances from getting really dire.
944 	 */
945 	if (availrmem < swapfs_minfree + swapfs_reserve + extra)
946 		return (1);
947 
948 	/*
949 	 * If we're on an i386 platform, it's possible that we'll exhaust the
950 	 * kernel heap space before we ever run out of available physical
951 	 * memory.  Most checks of the size of the heap_area compare against
952 	 * tune.t_minarmem, which is the minimum available real memory that we
953 	 * can have in the system.  However, this is generally fixed at 25 pages
954 	 * which is so low that it's useless.  In this comparison, we seek to
955 	 * calculate the total heap-size, and reclaim if more than 3/4ths of the
956  * heap is allocated.  (Or, in the calculation, if less than 1/4th is
957 	 * free)
958 	 */
959 #if defined(__i386)
960 	if (btop(vmem_size(heap_arena, VMEM_FREE)) <
961 	    (btop(vmem_size(heap_arena, VMEM_FREE | VMEM_ALLOC)) >> 2))
962 		return (1);
963 #endif
964 
965 #else
966 	if (spa_get_random(100) == 0)
967 		return (1);
968 #endif
969 	return (0);
970 }
971 
972 static void
973 arc_kmem_reap_now(arc_reclaim_strategy_t strat)
974 {
975 	size_t			i;
976 	kmem_cache_t		*prev_cache = NULL;
977 	extern kmem_cache_t	*zio_buf_cache[];
978 
979 #ifdef _KERNEL
980 	/*
981 	 * First purge some DNLC entries, in case the DNLC is using
982 	 * up too much memory.
983 	 */
984 	dnlc_reduce_cache((void *)(uintptr_t)arc_reduce_dnlc_percent);
985 #endif
986 
987 	/*
988  * An aggressive reclamation will shrink the cache size as well as reap
989 	 * free kmem buffers.  The arc_kmem_reclaim function is called when the
990 	 * header-cache is reaped, so we only reap the header cache if we're
991  * performing an aggressive reclaim.  If we're not, just clean the kmem
992 	 * buffer caches.
993 	 */
994 	if (strat == ARC_RECLAIM_AGGR)
995 		kmem_cache_reap_now(hdr_cache);
996 
997 	kmem_cache_reap_now(buf_cache);
998 
999 	for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
1000 		if (zio_buf_cache[i] != prev_cache) {
1001 			prev_cache = zio_buf_cache[i];
1002 			kmem_cache_reap_now(zio_buf_cache[i]);
1003 		}
1004 	}
1005 }
1006 
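/*
 * Reclaim thread: wakes up once a second (or when signalled) and, while
 * memory is tight, disables cache growth and reaps the kmem caches,
 * alternating between conservative and aggressive strategies.  Growth is
 * re-enabled arc_grow_retry seconds after the pressure subsides.
 */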
1007 static void
1008 arc_reclaim_thread(void)
1009 {
1010 	clock_t			growtime = 0;
1011 	arc_reclaim_strategy_t	last_reclaim = ARC_RECLAIM_CONS;
1012 	callb_cpr_t		cpr;
1013 
1014 	CALLB_CPR_INIT(&cpr, &arc_reclaim_thr_lock, callb_generic_cpr, FTAG);
1015 
1016 	mutex_enter(&arc_reclaim_thr_lock);
1017 	while (arc_thread_exit == 0) {
1018 		if (arc_reclaim_needed()) {
1019 
1020 			if (arc.no_grow) {
1021 				if (last_reclaim == ARC_RECLAIM_CONS) {
1022 					last_reclaim = ARC_RECLAIM_AGGR;
1023 				} else {
1024 					last_reclaim = ARC_RECLAIM_CONS;
1025 				}
1026 			} else {
1027 				arc.no_grow = TRUE;
1028 				last_reclaim = ARC_RECLAIM_AGGR;
1029 				membar_producer();
1030 			}
1031 
1032 			/* reset the growth delay for every reclaim */
1033 			growtime = lbolt + (arc_grow_retry * hz);
1034 
1035 			arc_kmem_reap_now(last_reclaim);
1036 
1037 		} else if ((growtime > 0) && ((growtime - lbolt) <= 0)) {
1038 			arc.no_grow = FALSE;
1039 		}
1040 
1041 		/* block until needed, or one second, whichever is shorter */
1042 		CALLB_CPR_SAFE_BEGIN(&cpr);
1043 		(void) cv_timedwait(&arc_reclaim_thr_cv,
1044 		    &arc_reclaim_thr_lock, (lbolt + hz));
1045 		CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock);
1046 	}
1047 
1048 	arc_thread_exit = 0;
1049 	cv_broadcast(&arc_reclaim_thr_cv);
1050 	CALLB_CPR_EXIT(&cpr);		/* drops arc_reclaim_thr_lock */
1051 	thread_exit();
1052 }
1053 
1054 static void
1055 arc_try_grow(int64_t bytes)
1056 {
1057 	/*
1058 	 * If we're within (2 * maxblocksize) bytes of the target
1059 	 * cache size, increment the target cache size
1060 	 */
1061 	atomic_add_64((uint64_t *)&arc.size_check, 1);
1062 
1063 	if (arc_reclaim_needed()) {
1064 		cv_signal(&arc_reclaim_thr_cv);
1065 		return;
1066 	}
1067 
1068 	if (arc.no_grow)
1069 		return;
1070 
1071 	/*
1072 	 * Grow the target cache size (and p) if we're within (2 * maxblocksize)
1073 	 * bytes of the target, or if we've already overshot it.  Above, we bail
1074 	 * out if growth is disabled or if a reclaim is needed.
1075 	 */
1076 	if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
1077 		if (arc.size_check > 0) {
1078 			arc.size_check = arc_size_check_default;
1079 			atomic_add_64(&arc.incr, arc_incr_size);
1080 		}
1081 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1082 		if (arc.c > arc.c_max)
1083 			arc.c = arc.c_max;
1084 		else
1085 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1086 	} else if (arc.size > arc.c) {
1087 		if (arc.size_check > 0) {
1088 			arc.size_check = arc_size_check_default;
1089 			atomic_add_64(&arc.incr, arc_incr_size);
1090 		}
1091 		atomic_add_64(&arc.c, MIN(bytes, arc.incr));
1092 		if (arc.c > arc.c_max)
1093 			arc.c = arc.c_max;
1094 		else
1095 			atomic_add_64(&arc.p, MIN(bytes, arc.incr));
1096 	}
1097 }
1098 
1099 /*
1100  * Check whether the cache has reached its limits and eviction is required
1101  * prior to insert.  In this situation, we want to evict if no_grow is set.
1102  * Otherwise, the cache is either big enough that we can insert, or an
1103  * arc_try_grow() will result in more space being made available.
1104  */
1105 
1106 static int
1107 arc_evict_needed()
1108 {
1109 
1110 	if (arc_reclaim_needed())
1111 		return (1);
1112 
1113 	if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
1114 		return (1);
1115 
1116 	return (0);
1117 }
1118 
1119 /*
1120  * The state, supplied as the first argument, is going to have something
1121  * inserted on its behalf. So, determine which cache must be victimized to
1122  * satisfy an insertion for this state.  We have the following cases:
1123  *
1124  * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
1125  * In this situation if we're out of space, but the resident size of the MFU is
1126  * under the limit, victimize the MFU cache to satisfy this insertion request.
1127  *
1128  * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
1129  * Here, we've used up all of the available space for the MRU, so we need to
1130  * evict from our own cache instead.  Evict from the set of resident MRU
1131  * entries.
1132  *
1133  * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
1134  * c minus p represents the MFU space in the cache, since p is the size of the
1135  * cache that is dedicated to the MRU.  In this situation there's still space on
1136  * the MFU side, so the MRU side needs to be victimized.
1137  *
1138  * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
1139  * MFU's resident set is consuming more space than it has been allotted.  In
1140  * this situation, we must victimize our own cache, the MFU, for this insertion.
1141  */
1142 static void
1143 arc_evict_for_state(arc_state_t *state, uint64_t bytes)
1144 {
1145 	uint64_t	mru_used;
1146 	uint64_t	mfu_space;
1147 	uint64_t	evicted;
1148 
1149 	ASSERT(state == arc.mru_top || state == arc.mfu_top);
1150 
1151 	if (state == arc.mru_top) {
1152 		mru_used = arc.anon->size + arc.mru_top->size;
1153 		if (arc.p > mru_used) {
1154 			/* case 1 */
1155 			evicted = arc_evict_state(arc.mfu_top, bytes);
1156 			if (evicted < bytes) {
1157 				arc_adjust();
1158 			}
1159 		} else {
1160 			/* case 2 */
1161 			evicted = arc_evict_state(arc.mru_top, bytes);
1162 			if (evicted < bytes) {
1163 				arc_adjust();
1164 			}
1165 		}
1166 	} else {
1167 		/* MFU_top case */
1168 		mfu_space = arc.c - arc.p;
1169 		if (mfu_space > arc.mfu_top->size) {
1170 			/* case 3 */
1171 			evicted = arc_evict_state(arc.mru_top, bytes);
1172 			if (evicted < bytes) {
1173 				arc_adjust();
1174 			}
1175 		} else {
1176 			/* case 4 */
1177 			evicted = arc_evict_state(arc.mfu_top, bytes);
1178 			if (evicted < bytes) {
1179 				arc_adjust();
1180 			}
1181 		}
1182 	}
1183 }
1184 
1185 /*
1186  * This routine is called whenever a buffer is accessed.
1187  */
1188 static void
1189 arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
1190 {
1191 	int		blksz, mult;
1192 
1193 	ASSERT(MUTEX_HELD(hash_lock));
1194 
1195 	blksz = buf->b_size;
1196 
1197 	if (buf->b_state == arc.anon) {
1198 		/*
1199 		 * This buffer is not in the cache, and does not
1200 		 * appear in our "ghost" list.  Add the new buffer
1201 		 * to the MRU state.
1202 		 */
1203 
1204 		arc_try_grow(blksz);
1205 		if (arc_evict_needed()) {
1206 			arc_evict_for_state(arc.mru_top, blksz);
1207 		}
1208 
1209 		ASSERT(buf->b_arc_access == 0);
1210 		buf->b_arc_access = lbolt;
1211 		DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
1212 		    buf);
1213 		arc_change_state(arc.mru_top, buf, hash_lock);
1214 
1215 		/*
1216 		 * If we are using less than 2/3 of our total target
1217 		 * cache size, bump up the target size for the MRU
1218 		 * list.
1219 		 */
1220 		if (arc.size < arc.c*2/3) {
1221 			arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
1222 		}
1223 
1224 	} else if (buf->b_state == arc.mru_top) {
1225 		/*
1226 		 * If this buffer is in the MRU-top state and has the prefetch
1227 		 * flag, the first read was actually part of a prefetch.  In
1228 		 * this situation, we simply want to clear the flag and return.
1229 		 * A subsequent access should bump this into the MFU state.
1230 		 */
1231 		if ((buf->b_flags & ARC_PREFETCH) != 0) {
1232 			buf->b_flags &= ~ARC_PREFETCH;
1233 			atomic_add_64(&arc.mru_top->hits, 1);
1234 			return;
1235 		}
1236 
1237 		/*
1238 		 * This buffer has been "accessed" only once so far,
1239 		 * but it is still in the cache. Move it to the MFU
1240 		 * state.
1241 		 */
1242 		if (lbolt > buf->b_arc_access + ARC_MINTIME) {
1243 			/*
1244 			 * More than ARC_MINTIME (about 62ms) has passed since we
1245 			 * instantiated this buffer.  Move it to the
1246 			 * most frequently used state.
1247 			 */
1248 			buf->b_arc_access = lbolt;
1249 			DTRACE_PROBE1(new_state__mfu_top,
1250 			    arc_buf_hdr_t *, buf);
1251 			arc_change_state(arc.mfu_top, buf, hash_lock);
1252 		}
1253 		atomic_add_64(&arc.mru_top->hits, 1);
1254 	} else if (buf->b_state == arc.mru_bot) {
1255 		arc_state_t	*new_state;
1256 		/*
1257 		 * This buffer has been "accessed" recently, but
1258 		 * was evicted from the cache.  Move it to the
1259 		 * MFU state.
1260 		 */
1261 
1262 		if (buf->b_flags & ARC_PREFETCH) {
1263 			new_state = arc.mru_top;
1264 			DTRACE_PROBE1(new_state__mru_top,
1265 			    arc_buf_hdr_t *, buf);
1266 		} else {
1267 			new_state = arc.mfu_top;
1268 			DTRACE_PROBE1(new_state__mfu_top,
1269 			    arc_buf_hdr_t *, buf);
1270 		}
1271 
1272 		arc_try_grow(blksz);
1273 		if (arc_evict_needed()) {
1274 			arc_evict_for_state(new_state, blksz);
1275 		}
1276 
1277 		/* Bump up the target size of the MRU list */
1278 		mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
1279 		    1 : (arc.mfu_bot->size/arc.mru_bot->size));
1280 		arc.p = MIN(arc.c, arc.p + blksz * mult);
1281 
1282 		buf->b_arc_access = lbolt;
1283 		arc_change_state(new_state, buf, hash_lock);
1284 
1285 		atomic_add_64(&arc.mru_bot->hits, 1);
1286 	} else if (buf->b_state == arc.mfu_top) {
1287 		/*
1288 		 * This buffer has been accessed more than once and is
1289 		 * still in the cache.  Keep it in the MFU state.
1290 		 *
1291 		 * NOTE: the add_reference() that occurred when we did
1292 		 * the arc_read() should have kicked this off the list,
1293 		 * so even if it was a prefetch, it will be put back at
1294 		 * the head of the list when we remove_reference().
1295 		 */
1296 		atomic_add_64(&arc.mfu_top->hits, 1);
1297 	} else if (buf->b_state == arc.mfu_bot) {
1298 		/*
1299 		 * This buffer has been accessed more than once but has
1300 		 * been evicted from the cache.  Move it back to the
1301 		 * MFU state.
1302 		 */
1303 
1304 		arc_try_grow(blksz);
1305 		if (arc_evict_needed()) {
1306 			arc_evict_for_state(arc.mfu_top, blksz);
1307 		}
1308 
1309 		/* Bump up the target size for the MFU list */
1310 		mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
1311 		    1 : (arc.mru_bot->size/arc.mfu_bot->size));
1312 		arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
1313 
1314 		buf->b_arc_access = lbolt;
1315 		DTRACE_PROBE1(new_state__mfu_top,
1316 		    arc_buf_hdr_t *, buf);
1317 		arc_change_state(arc.mfu_top, buf, hash_lock);
1318 
1319 		atomic_add_64(&arc.mfu_bot->hits, 1);
1320 	} else {
1321 		ASSERT(!"invalid arc state");
1322 	}
1323 
1324 }
1325 
1326 /* a generic arc_done_func_t which you can use */
1327 /* ARGSUSED */
1328 void
1329 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
1330 {
1331 	bcopy(buf->b_data, arg, buf->b_hdr->b_size);
1332 	arc_buf_free(buf, arg);
1333 }
1334 
1335 /* a generic arc_done_func_t which you can use */
1336 void
1337 arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
1338 {
1339 	arc_buf_t **bufp = arg;
1340 	if (zio && zio->io_error) {
1341 		arc_buf_free(buf, arg);
1342 		*bufp = NULL;
1343 	} else {
1344 		*bufp = buf;
1345 	}
1346 }
1347 
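/*
 * I/O completion callback for arc_read(): re-find the header in the hash
 * table (unless the block was freed in flight), byteswap if needed, hand
 * the data (making extra copies as needed) to each registered callback,
 * clear the IO_IN_PROGRESS flag, and wake anyone waiting on this read.
 */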
1348 static void
1349 arc_read_done(zio_t *zio)
1350 {
1351 	arc_buf_hdr_t	*hdr;
1352 	arc_buf_t	*buf;
1353 	arc_buf_t	*abuf;	/* buffer we're assigning to callback */
1354 	kmutex_t	*hash_lock;
1355 	arc_callback_t	*callback_list, *acb;
1356 	int		freeable = FALSE;
1357 
1358 	buf = zio->io_private;
1359 	hdr = buf->b_hdr;
1360 
1361 	if (!HDR_FREED_IN_READ(hdr)) {
1362 		arc_buf_hdr_t *found;
1363 
1364 		found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth,
1365 		    &hash_lock);
1366 
1367 		/*
1368 		 * Buffer was inserted into hash-table and removed from lists
1369 		 * prior to starting I/O.  We should find this header, since
1370 		 * it's in the hash table, and it should be legit since it's
1371 		 * not possible to evict it during the I/O.
1372 		 */
1373 
1374 		ASSERT(found);
1375 		ASSERT(DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)));
1376 	}
1377 
1378 	/* byteswap if necessary */
1379 	callback_list = hdr->b_acb;
1380 	ASSERT(callback_list != NULL);
1381 	if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
1382 		callback_list->acb_byteswap(buf->b_data, hdr->b_size);
1383 
1384 	/* create copies of the data buffer for the callers */
1385 	abuf = buf;
1386 	for (acb = callback_list; acb; acb = acb->acb_next) {
1387 		if (acb->acb_done) {
1388 			if (abuf == NULL) {
1389 				abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1390 				abuf->b_data = zio_buf_alloc(hdr->b_size);
1391 				atomic_add_64(&arc.size, hdr->b_size);
1392 				bcopy(buf->b_data, abuf->b_data, hdr->b_size);
1393 				abuf->b_hdr = hdr;
1394 				abuf->b_next = hdr->b_buf;
1395 				hdr->b_buf = abuf;
1396 				atomic_add_64(&hdr->b_state->size, hdr->b_size);
1397 			}
1398 			acb->acb_buf = abuf;
1399 			abuf = NULL;
1400 		} else {
1401 			/*
1402 			 * The caller did not provide a callback function.
1403 			 * In this case, we should just remove the reference.
1404 			 */
1405 			if (HDR_FREED_IN_READ(hdr)) {
1406 				ASSERT3P(hdr->b_state, ==, arc.anon);
1407 				(void) refcount_remove(&hdr->b_refcnt,
1408 				    acb->acb_private);
1409 			} else {
1410 				(void) remove_reference(hdr, hash_lock,
1411 				    acb->acb_private);
1412 			}
1413 		}
1414 	}
1415 	hdr->b_acb = NULL;
1416 	hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
1417 
1418 	ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
1419 
1420 	if (zio->io_error != 0) {
1421 		hdr->b_flags |= ARC_IO_ERROR;
1422 		if (hdr->b_state != arc.anon)
1423 			arc_change_state(arc.anon, hdr, hash_lock);
1424 		freeable = refcount_is_zero(&hdr->b_refcnt);
1425 	}
1426 
1427 	if (!HDR_FREED_IN_READ(hdr)) {
1428 		/*
1429 		 * Only call arc_access on anonymous buffers.  This is because
1430 		 * if we've issued an I/O for an evicted buffer, we've already
1431 		 * called arc_access (to prevent any simultaneous readers from
1432 		 * getting confused).
1433 		 */
1434 		if (zio->io_error == 0 && hdr->b_state == arc.anon)
1435 			arc_access(hdr, hash_lock);
1436 		mutex_exit(hash_lock);
1437 	} else {
1438 		/*
1439 		 * This block was freed while we waited for the read to
1440 		 * complete.  It has been removed from the hash table and
1441 		 * moved to the anonymous state (so that it won't show up
1442 		 * in the cache).
1443 		 */
1444 		ASSERT3P(hdr->b_state, ==, arc.anon);
1445 		freeable = refcount_is_zero(&hdr->b_refcnt);
1446 	}
1447 
1448 	cv_broadcast(&hdr->b_cv);
1449 
1450 	/* execute each callback and free its structure */
1451 	while ((acb = callback_list) != NULL) {
1452 		if (acb->acb_done)
1453 			acb->acb_done(zio, acb->acb_buf, acb->acb_private);
1454 
1455 		if (acb->acb_zio_dummy != NULL) {
1456 			acb->acb_zio_dummy->io_error = zio->io_error;
1457 			zio_nowait(acb->acb_zio_dummy);
1458 		}
1459 
1460 		callback_list = acb->acb_next;
1461 		kmem_free(acb, sizeof (arc_callback_t));
1462 	}
1463 
1464 	if (freeable)
1465 		arc_hdr_free(hdr);
1466 }
1467 
1468 /*
1469  * "Read" the block block at the specified DVA (in bp) via the
1470  * cache.  If the block is found in the cache, invoke the provided
1471  * callback immediately and return.  Note that the `zio' parameter
1472  * in the callback will be NULL in this case, since no IO was
1473  * required.  If the block is not in the cache pass the read request
1474  * on to the spa with a substitute callback function, so that the
1475  * requested block will be added to the cache.
1476  *
1477  * If a read request arrives for a block that has a read in-progress,
1478  * either wait for the in-progress read to complete (and return the
1479  * results); or, if this is a read with a "done" func, add a record
1480  * to the read to invoke the "done" func when the read completes,
1481  * and return; or just return.
1482  *
1483  * arc_read_done() will invoke all the requested "done" functions
1484  * for readers of this block.
1485  */
1486 int
1487 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
1488     arc_done_func_t *done, void *private, int priority, int flags,
1489     uint32_t arc_flags)
1490 {
1491 	arc_buf_hdr_t *hdr;
1492 	arc_buf_t *buf;
1493 	kmutex_t *hash_lock;
1494 	zio_t	*rzio;
1495 
1496 top:
1497 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1498 	if (hdr && hdr->b_buf) {
1499 
1500 		ASSERT((hdr->b_state == arc.mru_top) ||
1501 		    (hdr->b_state == arc.mfu_top) ||
1502 		    ((hdr->b_state == arc.anon) &&
1503 		    (HDR_IO_IN_PROGRESS(hdr))));
1504 
1505 		if (HDR_IO_IN_PROGRESS(hdr)) {
1506 
1507 			if ((arc_flags & ARC_NOWAIT) && done) {
1508 				arc_callback_t	*acb = NULL;
1509 
1510 				acb = kmem_zalloc(sizeof (arc_callback_t),
1511 				    KM_SLEEP);
1512 				acb->acb_done = done;
1513 				acb->acb_private = private;
1514 				acb->acb_byteswap = swap;
1515 				if (pio != NULL)
1516 					acb->acb_zio_dummy = zio_null(pio,
1517 					    spa, NULL, NULL, flags);
1518 
1519 				ASSERT(acb->acb_done != NULL);
1520 				acb->acb_next = hdr->b_acb;
1521 				hdr->b_acb = acb;
1522 				add_reference(hdr, hash_lock, private);
1523 				mutex_exit(hash_lock);
1524 				return (0);
1525 			} else if (arc_flags & ARC_WAIT) {
1526 				cv_wait(&hdr->b_cv, hash_lock);
1527 				mutex_exit(hash_lock);
1528 				goto top;
1529 			}
1530 
1531 			mutex_exit(hash_lock);
1532 			return (0);
1533 		}
1534 
1535 		/*
1536 		 * If there is already a reference on this block, create
1537 		 * a new copy of the data so that we will be guaranteed
1538 		 * that arc_release() will always succeed.
1539 		 */
1540 
1541 		if (done)
1542 			add_reference(hdr, hash_lock, private);
1543 		if (done && refcount_count(&hdr->b_refcnt) > 1) {
1544 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1545 			buf->b_data = zio_buf_alloc(hdr->b_size);
1546 			ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
1547 			atomic_add_64(&arc.size, hdr->b_size);
1548 			bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
1549 			buf->b_hdr = hdr;
1550 			buf->b_next = hdr->b_buf;
1551 			hdr->b_buf = buf;
1552 			atomic_add_64(&hdr->b_state->size, hdr->b_size);
1553 		} else {
1554 			buf = hdr->b_buf;
1555 		}
1556 		DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
1557 		arc_access(hdr, hash_lock);
1558 		mutex_exit(hash_lock);
1559 		atomic_add_64(&arc.hits, 1);
1560 		if (done)
1561 			done(NULL, buf, private);
1562 	} else {
1563 		uint64_t size = BP_GET_LSIZE(bp);
1564 		arc_callback_t	*acb;
1565 
1566 		if (hdr == NULL) {
1567 			/* this block is not in the cache */
1568 			arc_buf_hdr_t	*exists;
1569 
1570 			buf = arc_buf_alloc(spa, size, private);
1571 			hdr = buf->b_hdr;
1572 			hdr->b_dva = *BP_IDENTITY(bp);
1573 			hdr->b_birth = bp->blk_birth;
1574 			hdr->b_cksum0 = bp->blk_cksum.zc_word[0];
1575 			exists = buf_hash_insert(hdr, &hash_lock);
1576 			if (exists) {
1577 				/* somebody beat us to the hash insert */
1578 				mutex_exit(hash_lock);
1579 				bzero(&hdr->b_dva, sizeof (dva_t));
1580 				hdr->b_birth = 0;
1581 				hdr->b_cksum0 = 0;
1582 				arc_buf_free(buf, private);
1583 				goto top; /* restart the IO request */
1584 			}
1585 
1586 		} else {
1587 			/* this block is in the ghost cache */
1588 			ASSERT((hdr->b_state == arc.mru_bot) ||
1589 			    (hdr->b_state == arc.mfu_bot));
1590 			add_reference(hdr, hash_lock, private);
1591 
1592 			buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
1593 			buf->b_data = zio_buf_alloc(hdr->b_size);
1594 			atomic_add_64(&arc.size, hdr->b_size);
1595 			ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1596 			ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1597 			buf->b_hdr = hdr;
1598 			buf->b_next = NULL;
1599 			hdr->b_buf = buf;
1600 		}
1601 
1602 		acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1603 		acb->acb_done = done;
1604 		acb->acb_private = private;
1605 		acb->acb_byteswap = swap;
1606 
1607 		ASSERT(hdr->b_acb == NULL);
1608 		hdr->b_acb = acb;
1609 
1610 		/*
1611 		 * If this DVA is part of a prefetch, mark the buf
1612 		 * header with the prefetch flag
1613 		 */
1614 		if (arc_flags & ARC_PREFETCH)
1615 			hdr->b_flags |= ARC_PREFETCH;
1616 		hdr->b_flags |= ARC_IO_IN_PROGRESS;
1617 
1618 		/*
1619 		 * If the buffer has been evicted, migrate it to a present state
1620 		 * before issuing the I/O.  Once we drop the hash-table lock,
1621 		 * the header will be marked as I/O in progress and have an
1622 		 * attached buffer.  At this point, anybody who finds this
1623 		 * buffer ought to notice that it's legit but has a pending I/O.
1624 		 */
1625 
1626 		if ((hdr->b_state == arc.mru_bot) ||
1627 		    (hdr->b_state == arc.mfu_bot))
1628 			arc_access(hdr, hash_lock);
1629 
1630 		mutex_exit(hash_lock);
1631 
1632 		ASSERT3U(hdr->b_size, ==, size);
1633 		DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
1634 		    uint64_t, size);
1635 		atomic_add_64(&arc.misses, 1);
1636 		rzio = zio_read(pio, spa, bp, buf->b_data, size,
1637 		    arc_read_done, buf, priority, flags);
1638 
1639 		if (arc_flags & ARC_WAIT)
1640 			return (zio_wait(rzio));
1641 
1642 		ASSERT(arc_flags & ARC_NOWAIT);
1643 		zio_nowait(rzio);
1644 	}
1645 	return (0);
1646 }
1647 
1648 /*
1649  * arc_read() variant to support pool traversal.  If the block is already
1650  * in the ARC, make a copy of it; otherwise, the caller will do the I/O.
1651  * The idea is that we don't want pool traversal filling up memory, but
1652  * if the ARC already has the data anyway, we shouldn't pay for the I/O.
1653  */
1654 int
1655 arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
1656 {
1657 	arc_buf_hdr_t *hdr;
1658 	kmutex_t *hash_mtx;
1659 	int rc = 0;
1660 
1661 	hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
1662 
1663 	if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
1664 		bcopy(hdr->b_buf->b_data, data, hdr->b_size);
1665 	else
1666 		rc = ENOENT;
1667 
1668 	if (hash_mtx)
1669 		mutex_exit(hash_mtx);
1670 
1671 	return (rc);
1672 }
1673 
1674 /*
1675  * Release this buffer from the cache.  This must be done
1676  * after a read and prior to modifying the buffer contents.
1677  * If the buffer has more than one reference, we must make
1678  * a new hdr for the buffer.
1679  */
1680 void
1681 arc_release(arc_buf_t *buf, void *tag)
1682 {
1683 	arc_buf_hdr_t *hdr = buf->b_hdr;
1684 	kmutex_t *hash_lock = HDR_LOCK(hdr);
1685 
1686 	/* this buffer is not on any list */
1687 	ASSERT(refcount_count(&hdr->b_refcnt) > 0);
1688 
1689 	if (hdr->b_state == arc.anon) {
1690 		/* this buffer is already released */
1691 		ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
1692 		ASSERT(BUF_EMPTY(hdr));
1693 		return;
1694 	}
1695 
1696 	mutex_enter(hash_lock);
1697 
1698 	if (refcount_count(&hdr->b_refcnt) > 1) {
1699 		arc_buf_hdr_t *nhdr;
1700 		arc_buf_t **bufp;
1701 		uint64_t blksz = hdr->b_size;
1702 		spa_t *spa = hdr->b_spa;
1703 
1704 		/*
1705 		 * Pull the data off of this buf and attach it to
1706 		 * a new anonymous buf.
1707 		 */
1708 		bufp = &hdr->b_buf;
1709 		while (*bufp != buf) {
1710 			ASSERT(*bufp);
1711 			bufp = &(*bufp)->b_next;
1712 		}
1713 		*bufp = (*bufp)->b_next;
1714 		(void) refcount_remove(&hdr->b_refcnt, tag);
1715 		ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
1716 		atomic_add_64(&hdr->b_state->size, -hdr->b_size);
1717 		mutex_exit(hash_lock);
1718 
1719 		nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
1720 		nhdr->b_size = blksz;
1721 		nhdr->b_spa = spa;
1722 		nhdr->b_buf = buf;
1723 		nhdr->b_state = arc.anon;
1724 		nhdr->b_arc_access = 0;
1725 		nhdr->b_flags = 0;
1726 		buf->b_hdr = nhdr;
1727 		buf->b_next = NULL;
1728 		(void) refcount_add(&nhdr->b_refcnt, tag);
1729 		atomic_add_64(&arc.anon->size, blksz);
1730 
1731 		hdr = nhdr;
1732 	} else {
1733 		ASSERT(!list_link_active(&hdr->b_arc_node));
1734 		ASSERT(!HDR_IO_IN_PROGRESS(hdr));
1735 		arc_change_state(arc.anon, hdr, hash_lock);
1736 		hdr->b_arc_access = 0;
1737 		mutex_exit(hash_lock);
1738 		bzero(&hdr->b_dva, sizeof (dva_t));
1739 		hdr->b_birth = 0;
1740 		hdr->b_cksum0 = 0;
1741 	}
1742 }
1743 
1744 int
1745 arc_released(arc_buf_t *buf)
1746 {
1747 	return (buf->b_hdr->b_state == arc.anon);
1748 }
1749 
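/*
 * I/O completion callback for arc_write(): record the block's new DVA and
 * birth txg in the (still anonymous) header, insert it into the hash table
 * (replacing a stale copy if sync-to-convergence rewrote the same block),
 * move it into the cache via arc_access(), and invoke the caller's done
 * callback.
 */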
1750 static void
1751 arc_write_done(zio_t *zio)
1752 {
1753 	arc_buf_t *buf;
1754 	arc_buf_hdr_t *hdr;
1755 	arc_callback_t *acb;
1756 
1757 	buf = zio->io_private;
1758 	hdr = buf->b_hdr;
1759 	acb = hdr->b_acb;
1760 	hdr->b_acb = NULL;
1761 
1762 	/* this buffer is on no lists and is not in the hash table */
1763 	ASSERT3P(hdr->b_state, ==, arc.anon);
1764 
1765 	hdr->b_dva = *BP_IDENTITY(zio->io_bp);
1766 	hdr->b_birth = zio->io_bp->blk_birth;
1767 	hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
1768 	/* clear the "in-write" flag */
1769 	hdr->b_hash_next = NULL;
1770 	/* This write may be all-zero */
1771 	if (!BUF_EMPTY(hdr)) {
1772 		arc_buf_hdr_t *exists;
1773 		kmutex_t *hash_lock;
1774 
1775 		exists = buf_hash_insert(hdr, &hash_lock);
1776 		if (exists) {
1777 			/*
1778 			 * This can only happen if we overwrite for
1779 			 * sync-to-convergence, because we remove
1780 			 * buffers from the hash table when we arc_free().
1781 			 */
1782 			ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig),
1783 			    BP_IDENTITY(zio->io_bp)));
1784 			ASSERT3U(zio->io_bp_orig.blk_birth, ==,
1785 			    zio->io_bp->blk_birth);
1786 
1787 			ASSERT(refcount_is_zero(&exists->b_refcnt));
1788 			arc_change_state(arc.anon, exists, hash_lock);
1789 			mutex_exit(hash_lock);
1790 			arc_hdr_free(exists);
1791 			exists = buf_hash_insert(hdr, &hash_lock);
1792 			ASSERT3P(exists, ==, NULL);
1793 		}
1794 		arc_access(hdr, hash_lock);
1795 		mutex_exit(hash_lock);
1796 	}
1797 	if (acb && acb->acb_done) {
1798 		ASSERT(!refcount_is_zero(&hdr->b_refcnt));
1799 		acb->acb_done(zio, buf, acb->acb_private);
1800 	}
1801 
1802 	if (acb)
1803 		kmem_free(acb, sizeof (arc_callback_t));
1804 }
1805 
1806 int
1807 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
1808     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
1809     arc_done_func_t *done, void *private, int priority, int flags,
1810     uint32_t arc_flags)
1811 {
1812 	arc_buf_hdr_t *hdr = buf->b_hdr;
1813 	arc_callback_t	*acb;
1814 	zio_t	*rzio;
1815 
1816 	/* this is a private buffer - no locking required */
1817 	ASSERT3P(hdr->b_state, ==, arc.anon);
1818 	ASSERT(BUF_EMPTY(hdr));
1819 	ASSERT(!HDR_IO_ERROR(hdr));
1820 	acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
1821 	acb->acb_done = done;
1822 	acb->acb_private = private;
1823 	acb->acb_byteswap = (arc_byteswap_func_t *)-1;
1824 	hdr->b_acb = acb;
1825 	rzio = zio_write(pio, spa, checksum, compress, txg, bp,
1826 	    buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
1827 
1828 	if (arc_flags & ARC_WAIT)
1829 		return (zio_wait(rzio));
1830 
1831 	ASSERT(arc_flags & ARC_NOWAIT);
1832 	zio_nowait(rzio);
1833 
1834 	return (0);
1835 }
1836 
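/*
 * Illustrative sketches, not part of the ARC implementation: the two
 * completion models arc_write() supports.  The wrapper names, callback
 * name and pass-through parameters are hypothetical placeholders; the
 * argument order and the ARC_WAIT/ARC_NOWAIT semantics come from
 * arc_write() above.  In both cases buf must be anonymous, since
 * arc_write() asserts b_state == arc.anon.
 */
#if 0	/* usage sketch only -- not compiled */
/* Matches the arc_done_func_t shape invoked from arc_write_done(). */
static void
example_write_done(zio_t *zio, arc_buf_t *buf, void *private)
{
	/* per-caller bookkeeping for the now-written buffer */
}

/* Synchronous: block in zio_wait() until the write zio completes. */
static int
example_write_sync(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf, int priority, int flags)
{
	return (arc_write(pio, spa, checksum, compress, txg, bp, buf,
	    NULL, NULL, priority, flags, ARC_WAIT));
}

/* Asynchronous: hand off the zio and get called back on completion. */
static void
example_write_async(zio_t *pio, spa_t *spa, int checksum, int compress,
    uint64_t txg, blkptr_t *bp, arc_buf_t *buf, void *cb_state,
    int priority, int flags)
{
	(void) arc_write(pio, spa, checksum, compress, txg, bp, buf,
	    example_write_done, cb_state, priority, flags, ARC_NOWAIT);
}
#endif
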
1837 int
1838 arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
1839     zio_done_func_t *done, void *private, uint32_t arc_flags)
1840 {
1841 	arc_buf_hdr_t *ab;
1842 	kmutex_t *hash_lock;
1843 	zio_t	*zio;
1844 
1845 	/*
1846 	 * If this buffer is in the cache, release it, so it
1847 	 * can be re-used.
1848 	 */
1849 	ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
1850 	if (ab != NULL) {
1851 		/*
1852 		 * The checksum of blocks to free is not always
1853 		 * preserved (eg. on the deadlist).  However, if it is
1854 		 * preserved (e.g. on the deadlist).  However, if it is
1855 		 */
1856 		ASSERT(bp->blk_cksum.zc_word[0] == 0 ||
1857 		    ab->b_cksum0 == bp->blk_cksum.zc_word[0]);
1858 		arc_change_state(arc.anon, ab, hash_lock);
1859 		if (refcount_is_zero(&ab->b_refcnt)) {
1860 			mutex_exit(hash_lock);
1861 			arc_hdr_free(ab);
1862 			atomic_add_64(&arc.deleted, 1);
1863 		} else {
1864 			ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
1865 			if (HDR_IO_IN_PROGRESS(ab))
1866 				ab->b_flags |= ARC_FREED_IN_READ;
1867 			ab->b_arc_access = 0;
1868 			bzero(&ab->b_dva, sizeof (dva_t));
1869 			ab->b_birth = 0;
1870 			ab->b_cksum0 = 0;
1871 			mutex_exit(hash_lock);
1872 		}
1873 	}
1874 
1875 	zio = zio_free(pio, spa, txg, bp, done, private);
1876 
1877 	if (arc_flags & ARC_WAIT)
1878 		return (zio_wait(zio));
1879 
1880 	ASSERT(arc_flags & ARC_NOWAIT);
1881 	zio_nowait(zio);
1882 
1883 	return (0);
1884 }
1885 
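/*
 * Illustrative sketch, not part of the ARC implementation: freeing a
 * block through arc_free().  Any cached copy is detached first (or
 * discarded outright if unreferenced), then the free itself is issued
 * as a zio.  The wrapper name is a hypothetical placeholder.
 */
#if 0	/* usage sketch only -- not compiled */
static int
example_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp)
{
	/*
	 * No callback: with ARC_WAIT we simply block until the free
	 * zio completes and return its error code.
	 */
	return (arc_free(pio, spa, txg, bp, NULL, NULL, ARC_WAIT));
}
#endif
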
1886 void
1887 arc_tempreserve_clear(uint64_t tempreserve)
1888 {
1889 	atomic_add_64(&arc_tempreserve, -tempreserve);
1890 	ASSERT((int64_t)arc_tempreserve >= 0);
1891 }
1892 
1893 int
1894 arc_tempreserve_space(uint64_t tempreserve)
1895 {
1896 #ifdef ZFS_DEBUG
1897 	/*
1898 	 * Once in a while, fail for no reason.  Everything should cope.
1899 	 */
1900 	if (spa_get_random(10000) == 0) {
1901 		dprintf("forcing random failure\n");
1902 		return (ERESTART);
1903 	}
1904 #endif
1905 	if (tempreserve > arc.c / 4 && !arc.no_grow)
1906 		arc.c = MIN(arc.c_max, tempreserve * 4);
1907 	if (tempreserve > arc.c)
1908 		return (ENOMEM);
1909 
1910 	/*
1911 	 * Throttle writes when the amount of dirty data in the cache
1912 	 * gets too large.  We try to keep the cache less than half full
1913 	 * of dirty blocks so that our sync times don't grow too large.
1914 	 * Note: if two requests come in concurrently, we might let them
1915 	 * both succeed, when one of them should fail.  Not a huge deal.
1916 	 *
1917 	 * XXX The limit should be adjusted dynamically to keep the time
1918 	 * to sync a dataset fixed (around 1-5 seconds?).
1919 	 */
1920 
1921 	if (tempreserve + arc_tempreserve + arc.anon->size > arc.c / 2 &&
1922 	    arc_tempreserve + arc.anon->size > arc.c / 4) {
1923 		dprintf("failing, arc_tempreserve=%lluK anon=%lluK "
1924 		    "tempreserve=%lluK arc.c=%lluK\n",
1925 		    arc_tempreserve>>10, arc.anon->lsize>>10,
1926 		    tempreserve>>10, arc.c>>10);
1927 		return (ERESTART);
1928 	}
1929 	atomic_add_64(&arc_tempreserve, tempreserve);
1930 	return (0);
1931 }
1932 
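/*
 * Illustrative sketch, not part of the ARC implementation: how the
 * reserve/clear pair above is intended to bracket a batch of dirty
 * data.  The wrapper name and the delay()-based back-off are
 * hypothetical stand-ins for whatever retry policy the caller uses;
 * the ERESTART/ENOMEM returns come from arc_tempreserve_space().
 */
#if 0	/* usage sketch only -- not compiled */
static int
example_dirty_data(uint64_t nbytes)
{
	int error;

	/* Retry while the write throttle pushes back. */
	while ((error = arc_tempreserve_space(nbytes)) == ERESTART)
		delay(hz);	/* crude stand-in for waiting on a sync */

	if (error != 0)
		return (error);	/* ENOMEM: larger than the whole cache */

	/*
	 * ... generate the dirty buffers; once they have been handed
	 * to the sync path, drop the reservation ...
	 */
	arc_tempreserve_clear(nbytes);
	return (0);
}
#endif
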
1933 void
1934 arc_init(void)
1935 {
1936 	mutex_init(&arc_reclaim_lock, NULL, MUTEX_DEFAULT, NULL);
1937 	mutex_init(&arc_reclaim_thr_lock, NULL, MUTEX_DEFAULT, NULL);
1938 	cv_init(&arc_reclaim_thr_cv, NULL, CV_DEFAULT, NULL);
1939 
1940 	/* Start out with 1/8 of all memory */
1941 	arc.c = physmem * PAGESIZE / 8;
1942 
1943 #ifdef _KERNEL
1944 	/*
1945 	 * On architectures where the physical memory can be larger
1946 	 * than the addressable space (Intel in 32-bit mode), we may
1947 	 * need to limit the cache to 1/8 of VM size.
1948 	 */
1949 	arc.c = MIN(arc.c, vmem_size(heap_arena, VMEM_ALLOC | VMEM_FREE) / 8);
1950 #endif
1951 
1952 	/* set min cache to 1/32 of all memory, or 64MB, whichever is more */
1953 	arc.c_min = MAX(arc.c / 4, 64<<20);
1954 	/* set max to 3/4 of all memory, or all but 1GB, whichever is more */
1955 	if (arc.c * 8 >= 1<<30)
1956 		arc.c_max = (arc.c * 8) - (1<<30);
1957 	else
1958 		arc.c_max = arc.c_min;
1959 	arc.c_max = MAX(arc.c * 6, arc.c_max);
1960 	arc.c = arc.c_max;
1961 	arc.p = (arc.c >> 1);
1962 
1963 	/* if kmem_flags are set, let's try to use less memory */
1964 	if (kmem_debugging())
1965 		arc.c = arc.c / 2;
1966 	if (arc.c < arc.c_min)
1967 		arc.c = arc.c_min;
1968 
1969 	arc.anon = &ARC_anon;
1970 	arc.mru_top = &ARC_mru_top;
1971 	arc.mru_bot = &ARC_mru_bot;
1972 	arc.mfu_top = &ARC_mfu_top;
1973 	arc.mfu_bot = &ARC_mfu_bot;
1974 
1975 	list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
1976 	    offsetof(arc_buf_hdr_t, b_arc_node));
1977 	list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
1978 	    offsetof(arc_buf_hdr_t, b_arc_node));
1979 	list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
1980 	    offsetof(arc_buf_hdr_t, b_arc_node));
1981 	list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
1982 	    offsetof(arc_buf_hdr_t, b_arc_node));
1983 
1984 	buf_init();
1985 
1986 	arc_thread_exit = 0;
1987 
1988 	(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
1989 	    TS_RUN, minclsyspri);
1990 }
1991 
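/*
 * Worked example of the sizing logic above (illustrative only; the
 * 4GB figure is an assumption, not a tuned recommendation).  On a
 * 64-bit machine with 4GB of physical memory:
 *
 *	arc.c     = 4GB / 8                 = 512MB  (initial target)
 *	arc.c_min = MAX(512MB / 4, 64MB)    = 128MB
 *	arc.c_max = 4GB - 1GB               = 3GB    (since 4GB >= 1GB)
 *	arc.c_max = MAX(512MB * 6, 3GB)     = 3GB
 *	arc.c     = arc.c_max               = 3GB
 *	arc.p     = arc.c / 2               = 1.5GB
 *
 * With kmem debugging enabled, arc.c would then be halved (but never
 * allowed below arc.c_min).
 */
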
1992 void
1993 arc_fini(void)
1994 {
1995 	mutex_enter(&arc_reclaim_thr_lock);
1996 	arc_thread_exit = 1;
1997 	while (arc_thread_exit != 0)
1998 		cv_wait(&arc_reclaim_thr_cv, &arc_reclaim_thr_lock);
1999 	mutex_exit(&arc_reclaim_thr_lock);
2000 
2001 	arc_flush();
2002 
2003 	arc_dead = TRUE;
2004 
2005 	mutex_destroy(&arc_reclaim_lock);
2006 	mutex_destroy(&arc_reclaim_thr_lock);
2007 	cv_destroy(&arc_reclaim_thr_cv);
2008 
2009 	list_destroy(&arc.mru_top->list);
2010 	list_destroy(&arc.mru_bot->list);
2011 	list_destroy(&arc.mfu_top->list);
2012 	list_destroy(&arc.mfu_bot->list);
2013 
2014 	buf_fini();
2015 }
2016