xref: /titanic_44/usr/src/uts/common/fs/zfs/dmu_zfetch.c (revision a2cdcdd260232b58202b11a9bfc0103c9449ed52)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>

38fa9e4066Sahrens /*
39fa9e4066Sahrens  * I'm against tune-ables, but these should probably exist as tweakable globals
40fa9e4066Sahrens  * until we can get this working the way we want it to.
41fa9e4066Sahrens  */
42fa9e4066Sahrens 
43416e0cd8Sek110237 int zfs_prefetch_disable = 0;
44a2eea2e1Sahrens 
45fa9e4066Sahrens /* max # of streams per zfetch */
46fa9e4066Sahrens uint32_t	zfetch_max_streams = 8;
47fa9e4066Sahrens /* min time before stream reclaim */
48fa9e4066Sahrens uint32_t	zfetch_min_sec_reap = 2;
49fa9e4066Sahrens /* max number of blocks to fetch at a time */
5013506d1eSmaybee uint32_t	zfetch_block_cap = 256;
51fa9e4066Sahrens /* number of bytes in a array_read at which we stop prefetching (1Mb) */
52fa9e4066Sahrens uint64_t	zfetch_array_rd_sz = 1024 * 1024;
53fa9e4066Sahrens 
54fa9e4066Sahrens /* forward decls for static routines */
553e30c24aSWill Andrews static boolean_t	dmu_zfetch_colinear(zfetch_t *, zstream_t *);
56fa9e4066Sahrens static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
57fa9e4066Sahrens static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
58fa9e4066Sahrens static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
593e30c24aSWill Andrews static boolean_t	dmu_zfetch_find(zfetch_t *, zstream_t *, int);
60fa9e4066Sahrens static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
61fa9e4066Sahrens static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
62fa9e4066Sahrens static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
63fa9e4066Sahrens static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);
64fa9e4066Sahrens 
657cbf8b43SRich Morris typedef struct zfetch_stats {
667cbf8b43SRich Morris 	kstat_named_t zfetchstat_hits;
677cbf8b43SRich Morris 	kstat_named_t zfetchstat_misses;
687cbf8b43SRich Morris 	kstat_named_t zfetchstat_colinear_hits;
697cbf8b43SRich Morris 	kstat_named_t zfetchstat_colinear_misses;
707cbf8b43SRich Morris 	kstat_named_t zfetchstat_stride_hits;
717cbf8b43SRich Morris 	kstat_named_t zfetchstat_stride_misses;
727cbf8b43SRich Morris 	kstat_named_t zfetchstat_reclaim_successes;
737cbf8b43SRich Morris 	kstat_named_t zfetchstat_reclaim_failures;
747cbf8b43SRich Morris 	kstat_named_t zfetchstat_stream_resets;
757cbf8b43SRich Morris 	kstat_named_t zfetchstat_stream_noresets;
767cbf8b43SRich Morris 	kstat_named_t zfetchstat_bogus_streams;
777cbf8b43SRich Morris } zfetch_stats_t;
787cbf8b43SRich Morris 
797cbf8b43SRich Morris static zfetch_stats_t zfetch_stats = {
807cbf8b43SRich Morris 	{ "hits",			KSTAT_DATA_UINT64 },
817cbf8b43SRich Morris 	{ "misses",			KSTAT_DATA_UINT64 },
827cbf8b43SRich Morris 	{ "colinear_hits",		KSTAT_DATA_UINT64 },
837cbf8b43SRich Morris 	{ "colinear_misses",		KSTAT_DATA_UINT64 },
847cbf8b43SRich Morris 	{ "stride_hits",		KSTAT_DATA_UINT64 },
857cbf8b43SRich Morris 	{ "stride_misses",		KSTAT_DATA_UINT64 },
867cbf8b43SRich Morris 	{ "reclaim_successes",		KSTAT_DATA_UINT64 },
877cbf8b43SRich Morris 	{ "reclaim_failures",		KSTAT_DATA_UINT64 },
887cbf8b43SRich Morris 	{ "streams_resets",		KSTAT_DATA_UINT64 },
897cbf8b43SRich Morris 	{ "streams_noresets",		KSTAT_DATA_UINT64 },
907cbf8b43SRich Morris 	{ "bogus_streams",		KSTAT_DATA_UINT64 },
917cbf8b43SRich Morris };
927cbf8b43SRich Morris 
/*
 * Atomically add 'val' to the named zfetch_stats counter.
 *
 * The expansion deliberately carries no trailing semicolon so that the macro
 * behaves like an ordinary expression statement; the caller supplies the ';'.
 * This keeps unbraced uses such as "if (x) ZFETCHSTAT_BUMP(a); else ..."
 * well-formed.
 */
#define	ZFETCHSTAT_INCR(stat, val) \
	atomic_add_64(&zfetch_stats.stat.value.ui64, (val))

/* Atomically increment the named zfetch_stats counter by one. */
#define	ZFETCHSTAT_BUMP(stat)		ZFETCHSTAT_INCR(stat, 1)

987cbf8b43SRich Morris kstat_t		*zfetch_ksp;
997cbf8b43SRich Morris 
100fa9e4066Sahrens /*
101fa9e4066Sahrens  * Given a zfetch structure and a zstream structure, determine whether the
10213506d1eSmaybee  * blocks to be read are part of a co-linear pair of existing prefetch
103fa9e4066Sahrens  * streams.  If a set is found, coalesce the streams, removing one, and
104fa9e4066Sahrens  * configure the prefetch so it looks for a strided access pattern.
105fa9e4066Sahrens  *
10613506d1eSmaybee  * In other words: if we find two sequential access streams that are
10713506d1eSmaybee  * the same length and distance N appart, and this read is N from the
10813506d1eSmaybee  * last stream, then we are probably in a strided access pattern.  So
10913506d1eSmaybee  * combine the two sequential streams into a single strided stream.
11013506d1eSmaybee  *
1113e30c24aSWill Andrews  * Returns whether co-linear streams were found.
112fa9e4066Sahrens  */
1133e30c24aSWill Andrews static boolean_t
dmu_zfetch_colinear(zfetch_t * zf,zstream_t * zh)114fa9e4066Sahrens dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
115fa9e4066Sahrens {
116fa9e4066Sahrens 	zstream_t	*z_walk;
117fa9e4066Sahrens 	zstream_t	*z_comp;
118fa9e4066Sahrens 
119404ba9d7Srbourbon 	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
120404ba9d7Srbourbon 		return (0);
121fa9e4066Sahrens 
122fa9e4066Sahrens 	if (zh == NULL) {
123fa9e4066Sahrens 		rw_exit(&zf->zf_rwlock);
124fa9e4066Sahrens 		return (0);
125fa9e4066Sahrens 	}
126fa9e4066Sahrens 
127fa9e4066Sahrens 	for (z_walk = list_head(&zf->zf_stream); z_walk;
128fa9e4066Sahrens 	    z_walk = list_next(&zf->zf_stream, z_walk)) {
129fa9e4066Sahrens 		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
130fa9e4066Sahrens 		    z_comp = list_next(&zf->zf_stream, z_comp)) {
131fa9e4066Sahrens 			int64_t		diff;
132fa9e4066Sahrens 
133fa9e4066Sahrens 			if (z_walk->zst_len != z_walk->zst_stride ||
134fa9e4066Sahrens 			    z_comp->zst_len != z_comp->zst_stride) {
135fa9e4066Sahrens 				continue;
136fa9e4066Sahrens 			}
137fa9e4066Sahrens 
138fa9e4066Sahrens 			diff = z_comp->zst_offset - z_walk->zst_offset;
139fa9e4066Sahrens 			if (z_comp->zst_offset + diff == zh->zst_offset) {
140fa9e4066Sahrens 				z_walk->zst_offset = zh->zst_offset;
141fa9e4066Sahrens 				z_walk->zst_direction = diff < 0 ? -1 : 1;
142fa9e4066Sahrens 				z_walk->zst_stride =
143fa9e4066Sahrens 				    diff * z_walk->zst_direction;
144fa9e4066Sahrens 				z_walk->zst_ph_offset =
145fa9e4066Sahrens 				    zh->zst_offset + z_walk->zst_stride;
146fa9e4066Sahrens 				dmu_zfetch_stream_remove(zf, z_comp);
147fa9e4066Sahrens 				mutex_destroy(&z_comp->zst_lock);
148fa9e4066Sahrens 				kmem_free(z_comp, sizeof (zstream_t));
149fa9e4066Sahrens 
150fa9e4066Sahrens 				dmu_zfetch_dofetch(zf, z_walk);
151fa9e4066Sahrens 
152fa9e4066Sahrens 				rw_exit(&zf->zf_rwlock);
153fa9e4066Sahrens 				return (1);
154fa9e4066Sahrens 			}
155fa9e4066Sahrens 
156fa9e4066Sahrens 			diff = z_walk->zst_offset - z_comp->zst_offset;
157fa9e4066Sahrens 			if (z_walk->zst_offset + diff == zh->zst_offset) {
158fa9e4066Sahrens 				z_walk->zst_offset = zh->zst_offset;
159fa9e4066Sahrens 				z_walk->zst_direction = diff < 0 ? -1 : 1;
160fa9e4066Sahrens 				z_walk->zst_stride =
161fa9e4066Sahrens 				    diff * z_walk->zst_direction;
162fa9e4066Sahrens 				z_walk->zst_ph_offset =
163fa9e4066Sahrens 				    zh->zst_offset + z_walk->zst_stride;
164fa9e4066Sahrens 				dmu_zfetch_stream_remove(zf, z_comp);
165fa9e4066Sahrens 				mutex_destroy(&z_comp->zst_lock);
166fa9e4066Sahrens 				kmem_free(z_comp, sizeof (zstream_t));
167fa9e4066Sahrens 
168fa9e4066Sahrens 				dmu_zfetch_dofetch(zf, z_walk);
169fa9e4066Sahrens 
170fa9e4066Sahrens 				rw_exit(&zf->zf_rwlock);
171fa9e4066Sahrens 				return (1);
172fa9e4066Sahrens 			}
173fa9e4066Sahrens 		}
174fa9e4066Sahrens 	}
175fa9e4066Sahrens 
176fa9e4066Sahrens 	rw_exit(&zf->zf_rwlock);
177fa9e4066Sahrens 	return (0);
178fa9e4066Sahrens }
179fa9e4066Sahrens 
180fa9e4066Sahrens /*
181fa9e4066Sahrens  * Given a zstream_t, determine the bounds of the prefetch.  Then call the
182fa9e4066Sahrens  * routine that actually prefetches the individual blocks.
183fa9e4066Sahrens  */
184fa9e4066Sahrens static void
dmu_zfetch_dofetch(zfetch_t * zf,zstream_t * zs)185fa9e4066Sahrens dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
186fa9e4066Sahrens {
187fa9e4066Sahrens 	uint64_t	prefetch_tail;
188fa9e4066Sahrens 	uint64_t	prefetch_limit;
189fa9e4066Sahrens 	uint64_t	prefetch_ofst;
190fa9e4066Sahrens 	uint64_t	prefetch_len;
191fa9e4066Sahrens 	uint64_t	blocks_fetched;
192fa9e4066Sahrens 
193fa9e4066Sahrens 	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
194fa9e4066Sahrens 	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);
195fa9e4066Sahrens 
196fa9e4066Sahrens 	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
197fa9e4066Sahrens 	    (int64_t)(zs->zst_offset + zs->zst_stride));
198fa9e4066Sahrens 	/*
199fa9e4066Sahrens 	 * XXX: use a faster division method?
200fa9e4066Sahrens 	 */
201fa9e4066Sahrens 	prefetch_limit = zs->zst_offset + zs->zst_len +
202fa9e4066Sahrens 	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;
203fa9e4066Sahrens 
204fa9e4066Sahrens 	while (prefetch_tail < prefetch_limit) {
205fa9e4066Sahrens 		prefetch_ofst = zs->zst_offset + zs->zst_direction *
206fa9e4066Sahrens 		    (prefetch_tail - zs->zst_offset);
207fa9e4066Sahrens 
208fa9e4066Sahrens 		prefetch_len = zs->zst_len;
209fa9e4066Sahrens 
210fa9e4066Sahrens 		/*
211fa9e4066Sahrens 		 * Don't prefetch beyond the end of the file, if working
212fa9e4066Sahrens 		 * backwards.
213fa9e4066Sahrens 		 */
214fa9e4066Sahrens 		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
215fa9e4066Sahrens 		    (prefetch_ofst > prefetch_tail)) {
216fa9e4066Sahrens 			prefetch_len += prefetch_ofst;
217fa9e4066Sahrens 			prefetch_ofst = 0;
218fa9e4066Sahrens 		}
219fa9e4066Sahrens 
220fa9e4066Sahrens 		/* don't prefetch more than we're supposed to */
221fa9e4066Sahrens 		if (prefetch_len > zs->zst_len)
222fa9e4066Sahrens 			break;
223fa9e4066Sahrens 
224fa9e4066Sahrens 		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
225fa9e4066Sahrens 		    prefetch_ofst, zs->zst_len);
226fa9e4066Sahrens 
227fa9e4066Sahrens 		prefetch_tail += zs->zst_stride;
228fa9e4066Sahrens 		/* stop if we've run out of stuff to prefetch */
229fa9e4066Sahrens 		if (blocks_fetched < zs->zst_len)
230fa9e4066Sahrens 			break;
231fa9e4066Sahrens 	}
232fa9e4066Sahrens 	zs->zst_ph_offset = prefetch_tail;
233d3d50737SRafael Vanoni 	zs->zst_last = ddi_get_lbolt();
234fa9e4066Sahrens }
235fa9e4066Sahrens 
2367cbf8b43SRich Morris void
zfetch_init(void)2377cbf8b43SRich Morris zfetch_init(void)
2387cbf8b43SRich Morris {
2397cbf8b43SRich Morris 
2407cbf8b43SRich Morris 	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
2417cbf8b43SRich Morris 	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
2427cbf8b43SRich Morris 	    KSTAT_FLAG_VIRTUAL);
2437cbf8b43SRich Morris 
2447cbf8b43SRich Morris 	if (zfetch_ksp != NULL) {
2457cbf8b43SRich Morris 		zfetch_ksp->ks_data = &zfetch_stats;
2467cbf8b43SRich Morris 		kstat_install(zfetch_ksp);
2477cbf8b43SRich Morris 	}
2487cbf8b43SRich Morris }
2497cbf8b43SRich Morris 
2507cbf8b43SRich Morris void
zfetch_fini(void)2517cbf8b43SRich Morris zfetch_fini(void)
2527cbf8b43SRich Morris {
2537cbf8b43SRich Morris 	if (zfetch_ksp != NULL) {
2547cbf8b43SRich Morris 		kstat_delete(zfetch_ksp);
2557cbf8b43SRich Morris 		zfetch_ksp = NULL;
2567cbf8b43SRich Morris 	}
2577cbf8b43SRich Morris }
2587cbf8b43SRich Morris 
259fa9e4066Sahrens /*
260fa9e4066Sahrens  * This takes a pointer to a zfetch structure and a dnode.  It performs the
261fa9e4066Sahrens  * necessary setup for the zfetch structure, grokking data from the
262fa9e4066Sahrens  * associated dnode.
263fa9e4066Sahrens  */
264fa9e4066Sahrens void
dmu_zfetch_init(zfetch_t * zf,dnode_t * dno)265fa9e4066Sahrens dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
266fa9e4066Sahrens {
267fa9e4066Sahrens 	if (zf == NULL) {
268fa9e4066Sahrens 		return;
269fa9e4066Sahrens 	}
270fa9e4066Sahrens 
271fa9e4066Sahrens 	zf->zf_dnode = dno;
272fa9e4066Sahrens 	zf->zf_stream_cnt = 0;
273fa9e4066Sahrens 	zf->zf_alloc_fail = 0;
274fa9e4066Sahrens 
275fa9e4066Sahrens 	list_create(&zf->zf_stream, sizeof (zstream_t),
276fa9e4066Sahrens 	    offsetof(zstream_t, zst_node));
277fa9e4066Sahrens 
278fa9e4066Sahrens 	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
279fa9e4066Sahrens }
280fa9e4066Sahrens 
281fa9e4066Sahrens /*
282fa9e4066Sahrens  * This function computes the actual size, in blocks, that can be prefetched,
283fa9e4066Sahrens  * and fetches it.
284fa9e4066Sahrens  */
285fa9e4066Sahrens static uint64_t
dmu_zfetch_fetch(dnode_t * dn,uint64_t blkid,uint64_t nblks)286fa9e4066Sahrens dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
287fa9e4066Sahrens {
288fa9e4066Sahrens 	uint64_t	fetchsz;
289fa9e4066Sahrens 	uint64_t	i;
290fa9e4066Sahrens 
291fa9e4066Sahrens 	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);
292fa9e4066Sahrens 
293fa9e4066Sahrens 	for (i = 0; i < fetchsz; i++) {
294*a2cdcdd2SPaul Dagnelie 		dbuf_prefetch(dn, 0, blkid + i, ZIO_PRIORITY_ASYNC_READ,
295*a2cdcdd2SPaul Dagnelie 		    ARC_FLAG_PREFETCH);
296fa9e4066Sahrens 	}
297fa9e4066Sahrens 
298fa9e4066Sahrens 	return (fetchsz);
299fa9e4066Sahrens }
300fa9e4066Sahrens 
301fa9e4066Sahrens /*
302fa9e4066Sahrens  * this function returns the number of blocks that would be prefetched, based
303fa9e4066Sahrens  * upon the supplied dnode, blockid, and nblks.  This is used so that we can
304fa9e4066Sahrens  * update streams in place, and then prefetch with their old value after the
305fa9e4066Sahrens  * fact.  This way, we can delay the prefetch, but subsequent accesses to the
306fa9e4066Sahrens  * stream won't result in the same data being prefetched multiple times.
307fa9e4066Sahrens  */
308fa9e4066Sahrens static uint64_t
dmu_zfetch_fetchsz(dnode_t * dn,uint64_t blkid,uint64_t nblks)309fa9e4066Sahrens dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
310fa9e4066Sahrens {
311fa9e4066Sahrens 	uint64_t	fetchsz;
312fa9e4066Sahrens 
313fa9e4066Sahrens 	if (blkid > dn->dn_maxblkid) {
314fa9e4066Sahrens 		return (0);
315fa9e4066Sahrens 	}
316fa9e4066Sahrens 
317fa9e4066Sahrens 	/* compute fetch size */
31813506d1eSmaybee 	if (blkid + nblks + 1 > dn->dn_maxblkid) {
31913506d1eSmaybee 		fetchsz = (dn->dn_maxblkid - blkid) + 1;
32013506d1eSmaybee 		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
321fa9e4066Sahrens 	} else {
322fa9e4066Sahrens 		fetchsz = nblks;
323fa9e4066Sahrens 	}
324fa9e4066Sahrens 
325fa9e4066Sahrens 
326fa9e4066Sahrens 	return (fetchsz);
327fa9e4066Sahrens }
328fa9e4066Sahrens 
329fa9e4066Sahrens /*
3307cbf8b43SRich Morris  * given a zfetch and a zstream structure, see if there is an associated zstream
331fa9e4066Sahrens  * for this block read.  If so, it starts a prefetch for the stream it
332fa9e4066Sahrens  * located and returns true, otherwise it returns false
333fa9e4066Sahrens  */
3343e30c24aSWill Andrews static boolean_t
dmu_zfetch_find(zfetch_t * zf,zstream_t * zh,int prefetched)33513506d1eSmaybee dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
336fa9e4066Sahrens {
337fa9e4066Sahrens 	zstream_t	*zs;
338fa9e4066Sahrens 	int64_t		diff;
33913506d1eSmaybee 	int		reset = !prefetched;
340fa9e4066Sahrens 	int		rc = 0;
341fa9e4066Sahrens 
342fa9e4066Sahrens 	if (zh == NULL)
343fa9e4066Sahrens 		return (0);
344fa9e4066Sahrens 
345fa9e4066Sahrens 	/*
346fa9e4066Sahrens 	 * XXX: This locking strategy is a bit coarse; however, it's impact has
347fa9e4066Sahrens 	 * yet to be tested.  If this turns out to be an issue, it can be
348fa9e4066Sahrens 	 * modified in a number of different ways.
349fa9e4066Sahrens 	 */
350fa9e4066Sahrens 
351fa9e4066Sahrens 	rw_enter(&zf->zf_rwlock, RW_READER);
352fa9e4066Sahrens top:
353fa9e4066Sahrens 
354fa9e4066Sahrens 	for (zs = list_head(&zf->zf_stream); zs;
355fa9e4066Sahrens 	    zs = list_next(&zf->zf_stream, zs)) {
356fa9e4066Sahrens 
35713506d1eSmaybee 		/*
35813506d1eSmaybee 		 * XXX - should this be an assert?
35913506d1eSmaybee 		 */
360fa9e4066Sahrens 		if (zs->zst_len == 0) {
361fa9e4066Sahrens 			/* bogus stream */
3627cbf8b43SRich Morris 			ZFETCHSTAT_BUMP(zfetchstat_bogus_streams);
363fa9e4066Sahrens 			continue;
364fa9e4066Sahrens 		}
365fa9e4066Sahrens 
36613506d1eSmaybee 		/*
36713506d1eSmaybee 		 * We hit this case when we are in a strided prefetch stream:
36813506d1eSmaybee 		 * we will read "len" blocks before "striding".
36913506d1eSmaybee 		 */
37013506d1eSmaybee 		if (zh->zst_offset >= zs->zst_offset &&
37113506d1eSmaybee 		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
3727cbf8b43SRich Morris 			if (prefetched) {
373fa9e4066Sahrens 				/* already fetched */
3747cbf8b43SRich Morris 				ZFETCHSTAT_BUMP(zfetchstat_stride_hits);
37513506d1eSmaybee 				rc = 1;
37613506d1eSmaybee 				goto out;
3777cbf8b43SRich Morris 			} else {
3787cbf8b43SRich Morris 				ZFETCHSTAT_BUMP(zfetchstat_stride_misses);
3797cbf8b43SRich Morris 			}
380fa9e4066Sahrens 		}
381fa9e4066Sahrens 
38213506d1eSmaybee 		/*
38313506d1eSmaybee 		 * This is the forward sequential read case: we increment
38413506d1eSmaybee 		 * len by one each time we hit here, so we will enter this
38513506d1eSmaybee 		 * case on every read.
38613506d1eSmaybee 		 */
387fa9e4066Sahrens 		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {
38813506d1eSmaybee 
38913506d1eSmaybee 			reset = !prefetched && zs->zst_len > 1;
390fa9e4066Sahrens 
391fa9e4066Sahrens 			mutex_enter(&zs->zst_lock);
392fa9e4066Sahrens 
393fa9e4066Sahrens 			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
394fa9e4066Sahrens 				mutex_exit(&zs->zst_lock);
395fa9e4066Sahrens 				goto top;
396fa9e4066Sahrens 			}
397fa9e4066Sahrens 			zs->zst_len += zh->zst_len;
398fa9e4066Sahrens 			diff = zs->zst_len - zfetch_block_cap;
399fa9e4066Sahrens 			if (diff > 0) {
400fa9e4066Sahrens 				zs->zst_offset += diff;
401fa9e4066Sahrens 				zs->zst_len = zs->zst_len > diff ?
402fa9e4066Sahrens 				    zs->zst_len - diff : 0;
403fa9e4066Sahrens 			}
404fa9e4066Sahrens 			zs->zst_direction = ZFETCH_FORWARD;
405fa9e4066Sahrens 
406fa9e4066Sahrens 			break;
407fa9e4066Sahrens 
40813506d1eSmaybee 		/*
40913506d1eSmaybee 		 * Same as above, but reading backwards through the file.
41013506d1eSmaybee 		 */
411fa9e4066Sahrens 		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
412fa9e4066Sahrens 			/* backwards sequential access */
413fa9e4066Sahrens 
41413506d1eSmaybee 			reset = !prefetched && zs->zst_len > 1;
41513506d1eSmaybee 
416fa9e4066Sahrens 			mutex_enter(&zs->zst_lock);
417fa9e4066Sahrens 
418fa9e4066Sahrens 			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
419fa9e4066Sahrens 				mutex_exit(&zs->zst_lock);
420fa9e4066Sahrens 				goto top;
421fa9e4066Sahrens 			}
422fa9e4066Sahrens 
423fa9e4066Sahrens 			zs->zst_offset = zs->zst_offset > zh->zst_len ?
424fa9e4066Sahrens 			    zs->zst_offset - zh->zst_len : 0;
425fa9e4066Sahrens 			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
426fa9e4066Sahrens 			    zs->zst_ph_offset - zh->zst_len : 0;
427fa9e4066Sahrens 			zs->zst_len += zh->zst_len;
428fa9e4066Sahrens 
429fa9e4066Sahrens 			diff = zs->zst_len - zfetch_block_cap;
430fa9e4066Sahrens 			if (diff > 0) {
431fa9e4066Sahrens 				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
432fa9e4066Sahrens 				    zs->zst_ph_offset - diff : 0;
433fa9e4066Sahrens 				zs->zst_len = zs->zst_len > diff ?
434fa9e4066Sahrens 				    zs->zst_len - diff : zs->zst_len;
435fa9e4066Sahrens 			}
436fa9e4066Sahrens 			zs->zst_direction = ZFETCH_BACKWARD;
437fa9e4066Sahrens 
438fa9e4066Sahrens 			break;
439fa9e4066Sahrens 
440fa9e4066Sahrens 		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
441fa9e4066Sahrens 		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
442fa9e4066Sahrens 			/* strided forward access */
443fa9e4066Sahrens 
444fa9e4066Sahrens 			mutex_enter(&zs->zst_lock);
445fa9e4066Sahrens 
446fa9e4066Sahrens 			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
447fa9e4066Sahrens 			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
448fa9e4066Sahrens 				mutex_exit(&zs->zst_lock);
449fa9e4066Sahrens 				goto top;
450fa9e4066Sahrens 			}
451fa9e4066Sahrens 
452fa9e4066Sahrens 			zs->zst_offset += zs->zst_stride;
453fa9e4066Sahrens 			zs->zst_direction = ZFETCH_FORWARD;
454fa9e4066Sahrens 
455fa9e4066Sahrens 			break;
456fa9e4066Sahrens 
457fa9e4066Sahrens 		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
458fa9e4066Sahrens 		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
459fa9e4066Sahrens 			/* strided reverse access */
460fa9e4066Sahrens 
461fa9e4066Sahrens 			mutex_enter(&zs->zst_lock);
462fa9e4066Sahrens 
463fa9e4066Sahrens 			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
464fa9e4066Sahrens 			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
465fa9e4066Sahrens 				mutex_exit(&zs->zst_lock);
466fa9e4066Sahrens 				goto top;
467fa9e4066Sahrens 			}
468fa9e4066Sahrens 
469fa9e4066Sahrens 			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
470fa9e4066Sahrens 			    zs->zst_offset - zs->zst_stride : 0;
471fa9e4066Sahrens 			zs->zst_ph_offset = (zs->zst_ph_offset >
472fa9e4066Sahrens 			    (2 * zs->zst_stride)) ?
473fa9e4066Sahrens 			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
474fa9e4066Sahrens 			zs->zst_direction = ZFETCH_BACKWARD;
475fa9e4066Sahrens 
476fa9e4066Sahrens 			break;
477fa9e4066Sahrens 		}
478fa9e4066Sahrens 	}
479fa9e4066Sahrens 
480fa9e4066Sahrens 	if (zs) {
48113506d1eSmaybee 		if (reset) {
48213506d1eSmaybee 			zstream_t *remove = zs;
48313506d1eSmaybee 
4847cbf8b43SRich Morris 			ZFETCHSTAT_BUMP(zfetchstat_stream_resets);
48513506d1eSmaybee 			rc = 0;
48613506d1eSmaybee 			mutex_exit(&zs->zst_lock);
48713506d1eSmaybee 			rw_exit(&zf->zf_rwlock);
48813506d1eSmaybee 			rw_enter(&zf->zf_rwlock, RW_WRITER);
48913506d1eSmaybee 			/*
49013506d1eSmaybee 			 * Relocate the stream, in case someone removes
49113506d1eSmaybee 			 * it while we were acquiring the WRITER lock.
49213506d1eSmaybee 			 */
49313506d1eSmaybee 			for (zs = list_head(&zf->zf_stream); zs;
49413506d1eSmaybee 			    zs = list_next(&zf->zf_stream, zs)) {
49513506d1eSmaybee 				if (zs == remove) {
49613506d1eSmaybee 					dmu_zfetch_stream_remove(zf, zs);
49713506d1eSmaybee 					mutex_destroy(&zs->zst_lock);
49813506d1eSmaybee 					kmem_free(zs, sizeof (zstream_t));
49913506d1eSmaybee 					break;
50013506d1eSmaybee 				}
50113506d1eSmaybee 			}
50213506d1eSmaybee 		} else {
5037cbf8b43SRich Morris 			ZFETCHSTAT_BUMP(zfetchstat_stream_noresets);
504fa9e4066Sahrens 			rc = 1;
505fa9e4066Sahrens 			dmu_zfetch_dofetch(zf, zs);
506fa9e4066Sahrens 			mutex_exit(&zs->zst_lock);
507fa9e4066Sahrens 		}
50813506d1eSmaybee 	}
50913506d1eSmaybee out:
510fa9e4066Sahrens 	rw_exit(&zf->zf_rwlock);
511fa9e4066Sahrens 	return (rc);
512fa9e4066Sahrens }
513fa9e4066Sahrens 
514fa9e4066Sahrens /*
515fa9e4066Sahrens  * Clean-up state associated with a zfetch structure.  This frees allocated
516fa9e4066Sahrens  * structure members, empties the zf_stream tree, and generally makes things
517fa9e4066Sahrens  * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
518fa9e4066Sahrens  */
519fa9e4066Sahrens void
dmu_zfetch_rele(zfetch_t * zf)520fa9e4066Sahrens dmu_zfetch_rele(zfetch_t *zf)
521fa9e4066Sahrens {
522fa9e4066Sahrens 	zstream_t	*zs;
523fa9e4066Sahrens 	zstream_t	*zs_next;
524fa9e4066Sahrens 
525fa9e4066Sahrens 	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));
526fa9e4066Sahrens 
527fa9e4066Sahrens 	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
528fa9e4066Sahrens 		zs_next = list_next(&zf->zf_stream, zs);
529fa9e4066Sahrens 
530fa9e4066Sahrens 		list_remove(&zf->zf_stream, zs);
531fa9e4066Sahrens 		mutex_destroy(&zs->zst_lock);
532fa9e4066Sahrens 		kmem_free(zs, sizeof (zstream_t));
533fa9e4066Sahrens 	}
534fa9e4066Sahrens 	list_destroy(&zf->zf_stream);
535fa9e4066Sahrens 	rw_destroy(&zf->zf_rwlock);
536fa9e4066Sahrens 
537fa9e4066Sahrens 	zf->zf_dnode = NULL;
538fa9e4066Sahrens }
539fa9e4066Sahrens 
540fa9e4066Sahrens /*
541fa9e4066Sahrens  * Given a zfetch and zstream structure, insert the zstream structure into the
542fa9e4066Sahrens  * AVL tree contained within the zfetch structure.  Peform the appropriate
543fa9e4066Sahrens  * book-keeping.  It is possible that another thread has inserted a stream which
544fa9e4066Sahrens  * matches one that we are about to insert, so we must be sure to check for this
545fa9e4066Sahrens  * case.  If one is found, return failure, and let the caller cleanup the
546fa9e4066Sahrens  * duplicates.
547fa9e4066Sahrens  */
548fa9e4066Sahrens static int
dmu_zfetch_stream_insert(zfetch_t * zf,zstream_t * zs)549fa9e4066Sahrens dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
550fa9e4066Sahrens {
551fa9e4066Sahrens 	zstream_t	*zs_walk;
552fa9e4066Sahrens 	zstream_t	*zs_next;
553fa9e4066Sahrens 
554fa9e4066Sahrens 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
555fa9e4066Sahrens 
556fa9e4066Sahrens 	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
557fa9e4066Sahrens 		zs_next = list_next(&zf->zf_stream, zs_walk);
558fa9e4066Sahrens 
559fa9e4066Sahrens 		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
560fa9e4066Sahrens 			return (0);
561fa9e4066Sahrens 		}
562fa9e4066Sahrens 	}
563fa9e4066Sahrens 
564fa9e4066Sahrens 	list_insert_head(&zf->zf_stream, zs);
565fa9e4066Sahrens 	zf->zf_stream_cnt++;
566fa9e4066Sahrens 	return (1);
567fa9e4066Sahrens }
568fa9e4066Sahrens 
569fa9e4066Sahrens 
570fa9e4066Sahrens /*
571fa9e4066Sahrens  * Walk the list of zstreams in the given zfetch, find an old one (by time), and
572fa9e4066Sahrens  * reclaim it for use by the caller.
573fa9e4066Sahrens  */
574fa9e4066Sahrens static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t * zf)575fa9e4066Sahrens dmu_zfetch_stream_reclaim(zfetch_t *zf)
576fa9e4066Sahrens {
577fa9e4066Sahrens 	zstream_t	*zs;
578fa9e4066Sahrens 
579404ba9d7Srbourbon 	if (! rw_tryenter(&zf->zf_rwlock, RW_WRITER))
580404ba9d7Srbourbon 		return (0);
581fa9e4066Sahrens 
582fa9e4066Sahrens 	for (zs = list_head(&zf->zf_stream); zs;
583fa9e4066Sahrens 	    zs = list_next(&zf->zf_stream, zs)) {
584fa9e4066Sahrens 
585d3d50737SRafael Vanoni 		if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap)
586fa9e4066Sahrens 			break;
587fa9e4066Sahrens 	}
588fa9e4066Sahrens 
589fa9e4066Sahrens 	if (zs) {
590fa9e4066Sahrens 		dmu_zfetch_stream_remove(zf, zs);
591fa9e4066Sahrens 		mutex_destroy(&zs->zst_lock);
592fa9e4066Sahrens 		bzero(zs, sizeof (zstream_t));
593fa9e4066Sahrens 	} else {
594fa9e4066Sahrens 		zf->zf_alloc_fail++;
595fa9e4066Sahrens 	}
596fa9e4066Sahrens 	rw_exit(&zf->zf_rwlock);
597fa9e4066Sahrens 
598fa9e4066Sahrens 	return (zs);
599fa9e4066Sahrens }
600fa9e4066Sahrens 
601fa9e4066Sahrens /*
602fa9e4066Sahrens  * Given a zfetch and zstream structure, remove the zstream structure from its
603fa9e4066Sahrens  * container in the zfetch structure.  Perform the appropriate book-keeping.
604fa9e4066Sahrens  */
605fa9e4066Sahrens static void
dmu_zfetch_stream_remove(zfetch_t * zf,zstream_t * zs)606fa9e4066Sahrens dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
607fa9e4066Sahrens {
608fa9e4066Sahrens 	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));
609fa9e4066Sahrens 
610fa9e4066Sahrens 	list_remove(&zf->zf_stream, zs);
611fa9e4066Sahrens 	zf->zf_stream_cnt--;
612fa9e4066Sahrens }
613fa9e4066Sahrens 
614fa9e4066Sahrens static int
dmu_zfetch_streams_equal(zstream_t * zs1,zstream_t * zs2)615fa9e4066Sahrens dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
616fa9e4066Sahrens {
617fa9e4066Sahrens 	if (zs1->zst_offset != zs2->zst_offset)
618fa9e4066Sahrens 		return (0);
619fa9e4066Sahrens 
620fa9e4066Sahrens 	if (zs1->zst_len != zs2->zst_len)
621fa9e4066Sahrens 		return (0);
622fa9e4066Sahrens 
623fa9e4066Sahrens 	if (zs1->zst_stride != zs2->zst_stride)
624fa9e4066Sahrens 		return (0);
625fa9e4066Sahrens 
626fa9e4066Sahrens 	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
627fa9e4066Sahrens 		return (0);
628fa9e4066Sahrens 
629fa9e4066Sahrens 	if (zs1->zst_cap != zs2->zst_cap)
630fa9e4066Sahrens 		return (0);
631fa9e4066Sahrens 
632fa9e4066Sahrens 	if (zs1->zst_direction != zs2->zst_direction)
633fa9e4066Sahrens 		return (0);
634fa9e4066Sahrens 
635fa9e4066Sahrens 	return (1);
636fa9e4066Sahrens }
637fa9e4066Sahrens 
638fa9e4066Sahrens /*
639fa9e4066Sahrens  * This is the prefetch entry point.  It calls all of the other dmu_zfetch
640fa9e4066Sahrens  * routines to create, delete, find, or operate upon prefetch streams.
641fa9e4066Sahrens  */
642fa9e4066Sahrens void
dmu_zfetch(zfetch_t * zf,uint64_t offset,uint64_t size,int prefetched)64313506d1eSmaybee dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
644fa9e4066Sahrens {
645fa9e4066Sahrens 	zstream_t	zst;
646fa9e4066Sahrens 	zstream_t	*newstream;
6473e30c24aSWill Andrews 	boolean_t	fetched;
648fa9e4066Sahrens 	int		inserted;
649fa9e4066Sahrens 	unsigned int	blkshft;
650fa9e4066Sahrens 	uint64_t	blksz;
651fa9e4066Sahrens 
652a2eea2e1Sahrens 	if (zfs_prefetch_disable)
653fa9e4066Sahrens 		return;
654a2eea2e1Sahrens 
655a2eea2e1Sahrens 	/* files that aren't ln2 blocksz are only one block -- nothing to do */
656a2eea2e1Sahrens 	if (!zf->zf_dnode->dn_datablkshift)
657a2eea2e1Sahrens 		return;
658fa9e4066Sahrens 
659fa9e4066Sahrens 	/* convert offset and size, into blockid and nblocks */
660fa9e4066Sahrens 	blkshft = zf->zf_dnode->dn_datablkshift;
661fa9e4066Sahrens 	blksz = (1 << blkshft);
662fa9e4066Sahrens 
663fa9e4066Sahrens 	bzero(&zst, sizeof (zstream_t));
664fa9e4066Sahrens 	zst.zst_offset = offset >> blkshft;
665fa9e4066Sahrens 	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
666fa9e4066Sahrens 	    P2ALIGN(offset, blksz)) >> blkshft;
667fa9e4066Sahrens 
66813506d1eSmaybee 	fetched = dmu_zfetch_find(zf, &zst, prefetched);
6697cbf8b43SRich Morris 	if (fetched) {
6707cbf8b43SRich Morris 		ZFETCHSTAT_BUMP(zfetchstat_hits);
6717cbf8b43SRich Morris 	} else {
6727cbf8b43SRich Morris 		ZFETCHSTAT_BUMP(zfetchstat_misses);
6733e30c24aSWill Andrews 		fetched = dmu_zfetch_colinear(zf, &zst);
6743e30c24aSWill Andrews 		if (fetched) {
6757cbf8b43SRich Morris 			ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
6767cbf8b43SRich Morris 		} else {
6777cbf8b43SRich Morris 			ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
6787cbf8b43SRich Morris 		}
679fa9e4066Sahrens 	}
680fa9e4066Sahrens 
681fa9e4066Sahrens 	if (!fetched) {
682fa9e4066Sahrens 		newstream = dmu_zfetch_stream_reclaim(zf);
683fa9e4066Sahrens 
684fa9e4066Sahrens 		/*
685fa9e4066Sahrens 		 * we still couldn't find a stream, drop the lock, and allocate
686fa9e4066Sahrens 		 * one if possible.  Otherwise, give up and go home.
687fa9e4066Sahrens 		 */
6887cbf8b43SRich Morris 		if (newstream) {
6897cbf8b43SRich Morris 			ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes);
6907cbf8b43SRich Morris 		} else {
691fa9e4066Sahrens 			uint64_t	maxblocks;
692fa9e4066Sahrens 			uint32_t	max_streams;
693fa9e4066Sahrens 			uint32_t	cur_streams;
694fa9e4066Sahrens 
6957cbf8b43SRich Morris 			ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures);
696fa9e4066Sahrens 			cur_streams = zf->zf_stream_cnt;
697fa9e4066Sahrens 			maxblocks = zf->zf_dnode->dn_maxblkid;
698fa9e4066Sahrens 
699fa9e4066Sahrens 			max_streams = MIN(zfetch_max_streams,
700fa9e4066Sahrens 			    (maxblocks / zfetch_block_cap));
701fa9e4066Sahrens 			if (max_streams == 0) {
702fa9e4066Sahrens 				max_streams++;
703fa9e4066Sahrens 			}
704fa9e4066Sahrens 
705fa9e4066Sahrens 			if (cur_streams >= max_streams) {
706fa9e4066Sahrens 				return;
707fa9e4066Sahrens 			}
708fa9e4066Sahrens 			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
709fa9e4066Sahrens 		}
710fa9e4066Sahrens 
711fa9e4066Sahrens 		newstream->zst_offset = zst.zst_offset;
712fa9e4066Sahrens 		newstream->zst_len = zst.zst_len;
713fa9e4066Sahrens 		newstream->zst_stride = zst.zst_len;
714fa9e4066Sahrens 		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
715fa9e4066Sahrens 		newstream->zst_cap = zst.zst_len;
716fa9e4066Sahrens 		newstream->zst_direction = ZFETCH_FORWARD;
717d3d50737SRafael Vanoni 		newstream->zst_last = ddi_get_lbolt();
718fa9e4066Sahrens 
719fa9e4066Sahrens 		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);
720fa9e4066Sahrens 
721fa9e4066Sahrens 		rw_enter(&zf->zf_rwlock, RW_WRITER);
722fa9e4066Sahrens 		inserted = dmu_zfetch_stream_insert(zf, newstream);
723fa9e4066Sahrens 		rw_exit(&zf->zf_rwlock);
724fa9e4066Sahrens 
725fa9e4066Sahrens 		if (!inserted) {
726fa9e4066Sahrens 			mutex_destroy(&newstream->zst_lock);
727fa9e4066Sahrens 			kmem_free(newstream, sizeof (zstream_t));
728fa9e4066Sahrens 		}
729fa9e4066Sahrens 	}
730fa9e4066Sahrens }
731