1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23eda14cbcSMatt Macy * Use is subject to license terms. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 26eda14cbcSMatt Macy /* 27eda14cbcSMatt Macy * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 28eda14cbcSMatt Macy */ 29eda14cbcSMatt Macy 30eda14cbcSMatt Macy #include <sys/zfs_context.h> 312a58b312SMartin Matuska #include <sys/arc_impl.h> 32eda14cbcSMatt Macy #include <sys/dnode.h> 33eda14cbcSMatt Macy #include <sys/dmu_objset.h> 34eda14cbcSMatt Macy #include <sys/dmu_zfetch.h> 35eda14cbcSMatt Macy #include <sys/dmu.h> 36eda14cbcSMatt Macy #include <sys/dbuf.h> 37eda14cbcSMatt Macy #include <sys/kstat.h> 380d8fe237SMartin Matuska #include <sys/wmsum.h> 39eda14cbcSMatt Macy 40eda14cbcSMatt Macy /* 41eda14cbcSMatt Macy * This tunable disables predictive prefetch. Note that it leaves "prescient" 42eda14cbcSMatt Macy * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 43eda14cbcSMatt Macy * prescient prefetch never issues i/os that end up not being needed, 44eda14cbcSMatt Macy * so it can't hurt performance. 45eda14cbcSMatt Macy */ 46eda14cbcSMatt Macy 47e92ffd9bSMartin Matuska static int zfs_prefetch_disable = B_FALSE; 48eda14cbcSMatt Macy 49eda14cbcSMatt Macy /* max # of streams per zfetch */ 50e92ffd9bSMartin Matuska static unsigned int zfetch_max_streams = 8; 51eda14cbcSMatt Macy /* min time before stream reclaim */ 52e3aa18adSMartin Matuska static unsigned int zfetch_min_sec_reap = 1; 53e3aa18adSMartin Matuska /* max time before stream delete */ 54e3aa18adSMartin Matuska static unsigned int zfetch_max_sec_reap = 2; 55315ee00fSMartin Matuska #ifdef _ILP32 56315ee00fSMartin Matuska /* min bytes to prefetch per stream (default 2MB) */ 57315ee00fSMartin Matuska static unsigned int zfetch_min_distance = 2 * 1024 * 1024; 58315ee00fSMartin Matuska /* max bytes to prefetch per stream (default 8MB) */ 59315ee00fSMartin Matuska unsigned int zfetch_max_distance = 8 * 1024 * 1024; 60315ee00fSMartin Matuska #else 61e3aa18adSMartin Matuska /* min bytes to prefetch per stream (default 4MB) */ 62e3aa18adSMartin Matuska static unsigned int zfetch_min_distance = 4 * 1024 * 1024; 63e3aa18adSMartin Matuska /* max bytes to prefetch per stream (default 64MB) */ 64e3aa18adSMartin Matuska unsigned int zfetch_max_distance = 64 * 1024 * 1024; 65315ee00fSMartin Matuska #endif 66*17aab35aSMartin Matuska /* max bytes to prefetch indirects for per stream (default 128MB) */ 67*17aab35aSMartin Matuska unsigned int zfetch_max_idistance = 128 * 1024 * 1024; 681719886fSMartin Matuska /* max request reorder distance within a stream (default 16MB) */ 691719886fSMartin Matuska unsigned int zfetch_max_reorder = 16 * 1024 * 1024; 701719886fSMartin Matuska /* Max log2 fraction of holes in a stream */ 711719886fSMartin Matuska unsigned int zfetch_hole_shift = 2; 72eda14cbcSMatt Macy 73eda14cbcSMatt Macy typedef struct zfetch_stats { 74eda14cbcSMatt Macy kstat_named_t zfetchstat_hits; 751719886fSMartin Matuska kstat_named_t zfetchstat_future; 761719886fSMartin Matuska kstat_named_t zfetchstat_stride; 771719886fSMartin Matuska kstat_named_t zfetchstat_past; 78eda14cbcSMatt Macy kstat_named_t zfetchstat_misses; 79eda14cbcSMatt Macy kstat_named_t zfetchstat_max_streams; 807877fdebSMatt Macy kstat_named_t zfetchstat_io_issued; 812a58b312SMartin Matuska kstat_named_t zfetchstat_io_active; 82eda14cbcSMatt Macy } zfetch_stats_t; 83eda14cbcSMatt Macy 84eda14cbcSMatt Macy static zfetch_stats_t zfetch_stats = { 85eda14cbcSMatt Macy { "hits", KSTAT_DATA_UINT64 }, 861719886fSMartin Matuska { "future", KSTAT_DATA_UINT64 }, 871719886fSMartin Matuska { "stride", KSTAT_DATA_UINT64 }, 881719886fSMartin Matuska { "past", KSTAT_DATA_UINT64 }, 89eda14cbcSMatt Macy { "misses", KSTAT_DATA_UINT64 }, 90eda14cbcSMatt Macy { "max_streams", KSTAT_DATA_UINT64 }, 917877fdebSMatt Macy { "io_issued", KSTAT_DATA_UINT64 }, 922a58b312SMartin Matuska { "io_active", KSTAT_DATA_UINT64 }, 93eda14cbcSMatt Macy }; 94eda14cbcSMatt Macy 950d8fe237SMartin Matuska struct { 960d8fe237SMartin Matuska wmsum_t zfetchstat_hits; 971719886fSMartin Matuska wmsum_t zfetchstat_future; 981719886fSMartin Matuska wmsum_t zfetchstat_stride; 991719886fSMartin Matuska wmsum_t zfetchstat_past; 1000d8fe237SMartin Matuska wmsum_t zfetchstat_misses; 1010d8fe237SMartin Matuska wmsum_t zfetchstat_max_streams; 1020d8fe237SMartin Matuska wmsum_t zfetchstat_io_issued; 1032a58b312SMartin Matuska aggsum_t zfetchstat_io_active; 1040d8fe237SMartin Matuska } zfetch_sums; 1050d8fe237SMartin Matuska 106eda14cbcSMatt Macy #define ZFETCHSTAT_BUMP(stat) \ 1070d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, 1) 1087877fdebSMatt Macy #define ZFETCHSTAT_ADD(stat, val) \ 1090d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, val) 1107877fdebSMatt Macy 111eda14cbcSMatt Macy 112e92ffd9bSMartin Matuska static kstat_t *zfetch_ksp; 113eda14cbcSMatt Macy 1140d8fe237SMartin Matuska static int 1150d8fe237SMartin Matuska zfetch_kstats_update(kstat_t *ksp, int rw) 1160d8fe237SMartin Matuska { 1170d8fe237SMartin Matuska zfetch_stats_t *zs = ksp->ks_data; 1180d8fe237SMartin Matuska 1190d8fe237SMartin Matuska if (rw == KSTAT_WRITE) 1200d8fe237SMartin Matuska return (EACCES); 1210d8fe237SMartin Matuska zs->zfetchstat_hits.value.ui64 = 1220d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_hits); 1231719886fSMartin Matuska zs->zfetchstat_future.value.ui64 = 1241719886fSMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_future); 1251719886fSMartin Matuska zs->zfetchstat_stride.value.ui64 = 1261719886fSMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_stride); 1271719886fSMartin Matuska zs->zfetchstat_past.value.ui64 = 1281719886fSMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_past); 1290d8fe237SMartin Matuska zs->zfetchstat_misses.value.ui64 = 1300d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_misses); 1310d8fe237SMartin Matuska zs->zfetchstat_max_streams.value.ui64 = 1320d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_max_streams); 1330d8fe237SMartin Matuska zs->zfetchstat_io_issued.value.ui64 = 1340d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_io_issued); 1352a58b312SMartin Matuska zs->zfetchstat_io_active.value.ui64 = 1362a58b312SMartin Matuska aggsum_value(&zfetch_sums.zfetchstat_io_active); 1370d8fe237SMartin Matuska return (0); 1380d8fe237SMartin Matuska } 1390d8fe237SMartin Matuska 140eda14cbcSMatt Macy void 141eda14cbcSMatt Macy zfetch_init(void) 142eda14cbcSMatt Macy { 1430d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_hits, 0); 1441719886fSMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_future, 0); 1451719886fSMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_stride, 0); 1461719886fSMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_past, 0); 1470d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_misses, 0); 1480d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); 1490d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); 1502a58b312SMartin Matuska aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); 1510d8fe237SMartin Matuska 152eda14cbcSMatt Macy zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 153eda14cbcSMatt Macy KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 154eda14cbcSMatt Macy KSTAT_FLAG_VIRTUAL); 155eda14cbcSMatt Macy 156eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 157eda14cbcSMatt Macy zfetch_ksp->ks_data = &zfetch_stats; 1580d8fe237SMartin Matuska zfetch_ksp->ks_update = zfetch_kstats_update; 159eda14cbcSMatt Macy kstat_install(zfetch_ksp); 160eda14cbcSMatt Macy } 161eda14cbcSMatt Macy } 162eda14cbcSMatt Macy 163eda14cbcSMatt Macy void 164eda14cbcSMatt Macy zfetch_fini(void) 165eda14cbcSMatt Macy { 166eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 167eda14cbcSMatt Macy kstat_delete(zfetch_ksp); 168eda14cbcSMatt Macy zfetch_ksp = NULL; 169eda14cbcSMatt Macy } 1700d8fe237SMartin Matuska 1710d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_hits); 1721719886fSMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_future); 1731719886fSMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_stride); 1741719886fSMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_past); 1750d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_misses); 1760d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_max_streams); 1770d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_io_issued); 1782a58b312SMartin Matuska ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); 1792a58b312SMartin Matuska aggsum_fini(&zfetch_sums.zfetchstat_io_active); 180eda14cbcSMatt Macy } 181eda14cbcSMatt Macy 182eda14cbcSMatt Macy /* 183eda14cbcSMatt Macy * This takes a pointer to a zfetch structure and a dnode. It performs the 184eda14cbcSMatt Macy * necessary setup for the zfetch structure, grokking data from the 185eda14cbcSMatt Macy * associated dnode. 186eda14cbcSMatt Macy */ 187eda14cbcSMatt Macy void 188eda14cbcSMatt Macy dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 189eda14cbcSMatt Macy { 190eda14cbcSMatt Macy if (zf == NULL) 191eda14cbcSMatt Macy return; 192eda14cbcSMatt Macy zf->zf_dnode = dno; 1937877fdebSMatt Macy zf->zf_numstreams = 0; 194eda14cbcSMatt Macy 195eda14cbcSMatt Macy list_create(&zf->zf_stream, sizeof (zstream_t), 196eda14cbcSMatt Macy offsetof(zstream_t, zs_node)); 197eda14cbcSMatt Macy 198eda14cbcSMatt Macy mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); 199eda14cbcSMatt Macy } 200eda14cbcSMatt Macy 201eda14cbcSMatt Macy static void 2027877fdebSMatt Macy dmu_zfetch_stream_fini(zstream_t *zs) 2037877fdebSMatt Macy { 204f9693befSMartin Matuska ASSERT(!list_link_active(&zs->zs_node)); 2057cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_callers); 2067cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_refs); 2077877fdebSMatt Macy kmem_free(zs, sizeof (*zs)); 2087877fdebSMatt Macy } 2097877fdebSMatt Macy 2107877fdebSMatt Macy static void 211eda14cbcSMatt Macy dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 212eda14cbcSMatt Macy { 213eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 214eda14cbcSMatt Macy list_remove(&zf->zf_stream, zs); 215f9693befSMartin Matuska zf->zf_numstreams--; 216f9693befSMartin Matuska membar_producer(); 217f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 2187877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 219eda14cbcSMatt Macy } 220eda14cbcSMatt Macy 221eda14cbcSMatt Macy /* 222eda14cbcSMatt Macy * Clean-up state associated with a zfetch structure (e.g. destroy the 223eda14cbcSMatt Macy * streams). This doesn't free the zfetch_t itself, that's left to the caller. 224eda14cbcSMatt Macy */ 225eda14cbcSMatt Macy void 226eda14cbcSMatt Macy dmu_zfetch_fini(zfetch_t *zf) 227eda14cbcSMatt Macy { 228eda14cbcSMatt Macy zstream_t *zs; 229eda14cbcSMatt Macy 230eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 231f9693befSMartin Matuska while ((zs = list_head(&zf->zf_stream)) != NULL) 232eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 233eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 234eda14cbcSMatt Macy list_destroy(&zf->zf_stream); 235eda14cbcSMatt Macy mutex_destroy(&zf->zf_lock); 236eda14cbcSMatt Macy 237eda14cbcSMatt Macy zf->zf_dnode = NULL; 238eda14cbcSMatt Macy } 239eda14cbcSMatt Macy 240eda14cbcSMatt Macy /* 241e3aa18adSMartin Matuska * If there aren't too many active streams already, create one more. 242e3aa18adSMartin Matuska * In process delete/reuse all streams without hits for zfetch_max_sec_reap. 243e3aa18adSMartin Matuska * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. 244eda14cbcSMatt Macy * The "blkid" argument is the next block that we expect this stream to access. 245eda14cbcSMatt Macy */ 246eda14cbcSMatt Macy static void 247eda14cbcSMatt Macy dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) 248eda14cbcSMatt Macy { 249e3aa18adSMartin Matuska zstream_t *zs, *zs_next, *zs_old = NULL; 2501719886fSMartin Matuska uint_t now = gethrestime_sec(), t; 251eda14cbcSMatt Macy 252eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 253eda14cbcSMatt Macy 254eda14cbcSMatt Macy /* 255e3aa18adSMartin Matuska * Delete too old streams, reusing the first found one. 256eda14cbcSMatt Macy */ 2571719886fSMartin Matuska t = now - zfetch_max_sec_reap; 258e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { 259eda14cbcSMatt Macy zs_next = list_next(&zf->zf_stream, zs); 2607877fdebSMatt Macy /* 261f9693befSMartin Matuska * Skip if still active. 1 -- zf_stream reference. 2627877fdebSMatt Macy */ 2631719886fSMartin Matuska if ((int)(zs->zs_atime - t) >= 0) 2647877fdebSMatt Macy continue; 2651719886fSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 266e3aa18adSMartin Matuska continue; 267e3aa18adSMartin Matuska if (zs_old) 268eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 269e3aa18adSMartin Matuska else 270e3aa18adSMartin Matuska zs_old = zs; 271e3aa18adSMartin Matuska } 272e3aa18adSMartin Matuska if (zs_old) { 273e3aa18adSMartin Matuska zs = zs_old; 2741719886fSMartin Matuska list_remove(&zf->zf_stream, zs); 275e3aa18adSMartin Matuska goto reuse; 276eda14cbcSMatt Macy } 277eda14cbcSMatt Macy 278eda14cbcSMatt Macy /* 279eda14cbcSMatt Macy * The maximum number of streams is normally zfetch_max_streams, 280eda14cbcSMatt Macy * but for small files we lower it such that it's at least possible 281eda14cbcSMatt Macy * for all the streams to be non-overlapping. 282eda14cbcSMatt Macy */ 283eda14cbcSMatt Macy uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, 2841719886fSMartin Matuska (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) / 285eda14cbcSMatt Macy zfetch_max_distance)); 2867877fdebSMatt Macy if (zf->zf_numstreams >= max_streams) { 2871719886fSMartin Matuska t = now - zfetch_min_sec_reap; 288e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; 289e3aa18adSMartin Matuska zs = list_next(&zf->zf_stream, zs)) { 2901719886fSMartin Matuska if ((int)(zs->zs_atime - t) >= 0) 2911719886fSMartin Matuska continue; 292e3aa18adSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 293e3aa18adSMartin Matuska continue; 2941719886fSMartin Matuska if (zs_old == NULL || 2951719886fSMartin Matuska (int)(zs_old->zs_atime - zs->zs_atime) >= 0) 296e3aa18adSMartin Matuska zs_old = zs; 297e3aa18adSMartin Matuska } 298e3aa18adSMartin Matuska if (zs_old) { 299e3aa18adSMartin Matuska zs = zs_old; 3001719886fSMartin Matuska list_remove(&zf->zf_stream, zs); 301e3aa18adSMartin Matuska goto reuse; 302e3aa18adSMartin Matuska } 303eda14cbcSMatt Macy ZFETCHSTAT_BUMP(zfetchstat_max_streams); 304eda14cbcSMatt Macy return; 305eda14cbcSMatt Macy } 306eda14cbcSMatt Macy 307e3aa18adSMartin Matuska zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); 308f9693befSMartin Matuska zfs_refcount_create(&zs->zs_callers); 309f9693befSMartin Matuska zfs_refcount_create(&zs->zs_refs); 310f9693befSMartin Matuska /* One reference for zf_stream. */ 311f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 3127877fdebSMatt Macy zf->zf_numstreams++; 313e3aa18adSMartin Matuska 314e3aa18adSMartin Matuska reuse: 3151719886fSMartin Matuska list_insert_head(&zf->zf_stream, zs); 316e3aa18adSMartin Matuska zs->zs_blkid = blkid; 3171719886fSMartin Matuska /* Allow immediate stream reuse until first hit. */ 3181719886fSMartin Matuska zs->zs_atime = now - zfetch_min_sec_reap; 3191719886fSMartin Matuska memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges)); 320e3aa18adSMartin Matuska zs->zs_pf_dist = 0; 3211719886fSMartin Matuska zs->zs_ipf_dist = 0; 322e3aa18adSMartin Matuska zs->zs_pf_start = blkid; 323e3aa18adSMartin Matuska zs->zs_pf_end = blkid; 324e3aa18adSMartin Matuska zs->zs_ipf_start = blkid; 325e3aa18adSMartin Matuska zs->zs_ipf_end = blkid; 326e3aa18adSMartin Matuska zs->zs_missed = B_FALSE; 327e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 328eda14cbcSMatt Macy } 329eda14cbcSMatt Macy 3307877fdebSMatt Macy static void 331e3aa18adSMartin Matuska dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) 3327877fdebSMatt Macy { 3337877fdebSMatt Macy zstream_t *zs = arg; 3347877fdebSMatt Macy 335e3aa18adSMartin Matuska if (io_issued && level == 0 && blkid < zs->zs_blkid) 336e3aa18adSMartin Matuska zs->zs_more = B_TRUE; 337f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 3387877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 3392a58b312SMartin Matuska aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); 3407877fdebSMatt Macy } 3417877fdebSMatt Macy 342eda14cbcSMatt Macy /* 3431719886fSMartin Matuska * Process stream hit access for nblks blocks starting at zs_blkid. Return 3441719886fSMartin Matuska * number of blocks to proceed for after aggregation with future ranges. 3451719886fSMartin Matuska */ 3461719886fSMartin Matuska static uint64_t 3471719886fSMartin Matuska dmu_zfetch_hit(zstream_t *zs, uint64_t nblks) 3481719886fSMartin Matuska { 3491719886fSMartin Matuska uint_t i, j; 3501719886fSMartin Matuska 3511719886fSMartin Matuska /* Optimize sequential accesses (no future ranges). */ 3521719886fSMartin Matuska if (zs->zs_ranges[0].start == 0) 3531719886fSMartin Matuska goto done; 3541719886fSMartin Matuska 3551719886fSMartin Matuska /* Look for intersections with further ranges. */ 3561719886fSMartin Matuska for (i = 0; i < ZFETCH_RANGES; i++) { 3571719886fSMartin Matuska zsrange_t *r = &zs->zs_ranges[i]; 3581719886fSMartin Matuska if (r->start == 0 || r->start > nblks) 3591719886fSMartin Matuska break; 3601719886fSMartin Matuska if (r->end >= nblks) { 3611719886fSMartin Matuska nblks = r->end; 3621719886fSMartin Matuska i++; 3631719886fSMartin Matuska break; 3641719886fSMartin Matuska } 3651719886fSMartin Matuska } 3661719886fSMartin Matuska 3671719886fSMartin Matuska /* Delete all found intersecting ranges, updates remaining. */ 3681719886fSMartin Matuska for (j = 0; i < ZFETCH_RANGES; i++, j++) { 3691719886fSMartin Matuska if (zs->zs_ranges[i].start == 0) 3701719886fSMartin Matuska break; 3711719886fSMartin Matuska ASSERT3U(zs->zs_ranges[i].start, >, nblks); 3721719886fSMartin Matuska ASSERT3U(zs->zs_ranges[i].end, >, nblks); 3731719886fSMartin Matuska zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks; 3741719886fSMartin Matuska zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks; 3751719886fSMartin Matuska } 3761719886fSMartin Matuska if (j < ZFETCH_RANGES) { 3771719886fSMartin Matuska zs->zs_ranges[j].start = 0; 3781719886fSMartin Matuska zs->zs_ranges[j].end = 0; 3791719886fSMartin Matuska } 3801719886fSMartin Matuska 3811719886fSMartin Matuska done: 3821719886fSMartin Matuska zs->zs_blkid += nblks; 3831719886fSMartin Matuska return (nblks); 3841719886fSMartin Matuska } 3851719886fSMartin Matuska 3861719886fSMartin Matuska /* 3871719886fSMartin Matuska * Process future stream access for nblks blocks starting at blkid. Return 3881719886fSMartin Matuska * number of blocks to proceed for if future ranges reach fill threshold. 3891719886fSMartin Matuska */ 3901719886fSMartin Matuska static uint64_t 3911719886fSMartin Matuska dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks) 3921719886fSMartin Matuska { 3931719886fSMartin Matuska ASSERT3U(blkid, >, zs->zs_blkid); 3941719886fSMartin Matuska blkid -= zs->zs_blkid; 3951719886fSMartin Matuska ASSERT3U(blkid + nblks, <=, UINT16_MAX); 3961719886fSMartin Matuska 3971719886fSMartin Matuska /* Search for first and last intersection or insert point. */ 3981719886fSMartin Matuska uint_t f = ZFETCH_RANGES, l = 0, i; 3991719886fSMartin Matuska for (i = 0; i < ZFETCH_RANGES; i++) { 4001719886fSMartin Matuska zsrange_t *r = &zs->zs_ranges[i]; 4011719886fSMartin Matuska if (r->start == 0 || r->start > blkid + nblks) 4021719886fSMartin Matuska break; 4031719886fSMartin Matuska if (r->end < blkid) 4041719886fSMartin Matuska continue; 4051719886fSMartin Matuska if (f > i) 4061719886fSMartin Matuska f = i; 4071719886fSMartin Matuska if (l < i) 4081719886fSMartin Matuska l = i; 4091719886fSMartin Matuska } 4101719886fSMartin Matuska if (f <= l) { 4111719886fSMartin Matuska /* Got some intersecting range, expand it if needed. */ 4121719886fSMartin Matuska if (zs->zs_ranges[f].start > blkid) 4131719886fSMartin Matuska zs->zs_ranges[f].start = blkid; 4141719886fSMartin Matuska zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks); 4151719886fSMartin Matuska if (f < l) { 4161719886fSMartin Matuska /* Got more than one intersection, remove others. */ 4171719886fSMartin Matuska for (f++, l++; l < ZFETCH_RANGES; f++, l++) { 4181719886fSMartin Matuska zs->zs_ranges[f].start = zs->zs_ranges[l].start; 4191719886fSMartin Matuska zs->zs_ranges[f].end = zs->zs_ranges[l].end; 4201719886fSMartin Matuska } 4211719886fSMartin Matuska zs->zs_ranges[f].start = 0; 4221719886fSMartin Matuska zs->zs_ranges[f].end = 0; 4231719886fSMartin Matuska } 4241719886fSMartin Matuska } else if (i < ZFETCH_RANGES) { 4251719886fSMartin Matuska /* Got no intersecting ranges, insert new one. */ 4261719886fSMartin Matuska for (l = ZFETCH_RANGES - 1; l > i; l--) { 4271719886fSMartin Matuska zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start; 4281719886fSMartin Matuska zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end; 4291719886fSMartin Matuska } 4301719886fSMartin Matuska zs->zs_ranges[i].start = blkid; 4311719886fSMartin Matuska zs->zs_ranges[i].end = blkid + nblks; 4321719886fSMartin Matuska } else { 4331719886fSMartin Matuska /* No space left to insert. Drop the range. */ 4341719886fSMartin Matuska return (0); 4351719886fSMartin Matuska } 4361719886fSMartin Matuska 4371719886fSMartin Matuska /* Check if with the new access addition we reached fill threshold. */ 4381719886fSMartin Matuska if (zfetch_hole_shift >= 16) 4391719886fSMartin Matuska return (0); 4401719886fSMartin Matuska uint_t hole = 0; 4411719886fSMartin Matuska for (i = f = l = 0; i < ZFETCH_RANGES; i++) { 4421719886fSMartin Matuska zsrange_t *r = &zs->zs_ranges[i]; 4431719886fSMartin Matuska if (r->start == 0) 4441719886fSMartin Matuska break; 4451719886fSMartin Matuska hole += r->start - f; 4461719886fSMartin Matuska f = r->end; 4471719886fSMartin Matuska if (hole <= r->end >> zfetch_hole_shift) 4481719886fSMartin Matuska l = r->end; 4491719886fSMartin Matuska } 4501719886fSMartin Matuska if (l > 0) 4511719886fSMartin Matuska return (dmu_zfetch_hit(zs, l)); 4521719886fSMartin Matuska 4531719886fSMartin Matuska return (0); 4541719886fSMartin Matuska } 4551719886fSMartin Matuska 4561719886fSMartin Matuska /* 457f9693befSMartin Matuska * This is the predictive prefetch entry point. dmu_zfetch_prepare() 458f9693befSMartin Matuska * associates dnode access specified with blkid and nblks arguments with 459f9693befSMartin Matuska * prefetch stream, predicts further accesses based on that stats and returns 460f9693befSMartin Matuska * the stream pointer on success. That pointer must later be passed to 461f9693befSMartin Matuska * dmu_zfetch_run() to initiate the speculative prefetch for the stream and 462f9693befSMartin Matuska * release it. dmu_zfetch() is a wrapper for simple cases when window between 463f9693befSMartin Matuska * prediction and prefetch initiation is not needed. 464eda14cbcSMatt Macy * fetch_data argument specifies whether actual data blocks should be fetched: 465eda14cbcSMatt Macy * FALSE -- prefetch only indirect blocks for predicted data blocks; 466eda14cbcSMatt Macy * TRUE -- prefetch predicted data blocks plus following indirect blocks. 467eda14cbcSMatt Macy */ 468f9693befSMartin Matuska zstream_t * 469f9693befSMartin Matuska dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, 470f9693befSMartin Matuska boolean_t fetch_data, boolean_t have_lock) 471eda14cbcSMatt Macy { 472eda14cbcSMatt Macy zstream_t *zs; 473eda14cbcSMatt Macy spa_t *spa = zf->zf_dnode->dn_objset->os_spa; 4746c1e79dfSMartin Matuska zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; 475*17aab35aSMartin Matuska int64_t ipf_start, ipf_end; 476eda14cbcSMatt Macy 4776c1e79dfSMartin Matuska if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) 478f9693befSMartin Matuska return (NULL); 4796c1e79dfSMartin Matuska 4806c1e79dfSMartin Matuska if (os_prefetch == ZFS_PREFETCH_METADATA) 4816c1e79dfSMartin Matuska fetch_data = B_FALSE; 4826c1e79dfSMartin Matuska 483eda14cbcSMatt Macy /* 484eda14cbcSMatt Macy * If we haven't yet loaded the indirect vdevs' mappings, we 485eda14cbcSMatt Macy * can only read from blocks that we carefully ensure are on 486eda14cbcSMatt Macy * concrete vdevs (or previously-loaded indirect vdevs). So we 487eda14cbcSMatt Macy * can't allow the predictive prefetcher to attempt reads of other 488eda14cbcSMatt Macy * blocks (e.g. of the MOS's dnode object). 489eda14cbcSMatt Macy */ 490eda14cbcSMatt Macy if (!spa_indirect_vdevs_loaded(spa)) 491f9693befSMartin Matuska return (NULL); 492eda14cbcSMatt Macy 493eda14cbcSMatt Macy /* 494eda14cbcSMatt Macy * As a fast path for small (single-block) files, ignore access 495eda14cbcSMatt Macy * to the first block. 496eda14cbcSMatt Macy */ 4977877fdebSMatt Macy if (!have_lock && blkid == 0) 498f9693befSMartin Matuska return (NULL); 499eda14cbcSMatt Macy 500eda14cbcSMatt Macy if (!have_lock) 501eda14cbcSMatt Macy rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 5027877fdebSMatt Macy 5037877fdebSMatt Macy /* 5047877fdebSMatt Macy * A fast path for small files for which no prefetch will 5057877fdebSMatt Macy * happen. 5067877fdebSMatt Macy */ 507e3aa18adSMartin Matuska uint64_t maxblkid = zf->zf_dnode->dn_maxblkid; 508f9693befSMartin Matuska if (maxblkid < 2) { 5097877fdebSMatt Macy if (!have_lock) 5107877fdebSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 511f9693befSMartin Matuska return (NULL); 5127877fdebSMatt Macy } 513eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 514eda14cbcSMatt Macy 515eda14cbcSMatt Macy /* 5161719886fSMartin Matuska * Find perfect prefetch stream. Depending on whether the accesses 517eda14cbcSMatt Macy * are block-aligned, first block of the new access may either follow 518eda14cbcSMatt Macy * the last block of the previous access, or be equal to it. 519eda14cbcSMatt Macy */ 5201719886fSMartin Matuska unsigned int dbs = zf->zf_dnode->dn_datablkshift; 5211719886fSMartin Matuska uint64_t end_blkid = blkid + nblks; 522eda14cbcSMatt Macy for (zs = list_head(&zf->zf_stream); zs != NULL; 523eda14cbcSMatt Macy zs = list_next(&zf->zf_stream, zs)) { 524eda14cbcSMatt Macy if (blkid == zs->zs_blkid) { 5251719886fSMartin Matuska goto hit; 526eda14cbcSMatt Macy } else if (blkid + 1 == zs->zs_blkid) { 527eda14cbcSMatt Macy blkid++; 528eda14cbcSMatt Macy nblks--; 5291719886fSMartin Matuska goto hit; 530eda14cbcSMatt Macy } 531eda14cbcSMatt Macy } 532f9693befSMartin Matuska 533f9693befSMartin Matuska /* 5341719886fSMartin Matuska * Find close enough prefetch stream. Access crossing stream position 5351719886fSMartin Matuska * is a hit in its new part. Access ahead of stream position considered 5361719886fSMartin Matuska * a hit for metadata prefetch, since we do not care about fill percent, 5371719886fSMartin Matuska * or stored for future otherwise. Access behind stream position is 5381719886fSMartin Matuska * silently ignored, since we already skipped it reaching fill percent. 539f9693befSMartin Matuska */ 5401719886fSMartin Matuska uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX); 5411719886fSMartin Matuska uint_t t = gethrestime_sec() - zfetch_max_sec_reap; 5421719886fSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; 5431719886fSMartin Matuska zs = list_next(&zf->zf_stream, zs)) { 5441719886fSMartin Matuska if (blkid > zs->zs_blkid) { 5451719886fSMartin Matuska if (end_blkid <= zs->zs_blkid + max_reorder) { 5461719886fSMartin Matuska if (!fetch_data) { 5471719886fSMartin Matuska nblks = dmu_zfetch_hit(zs, 5481719886fSMartin Matuska end_blkid - zs->zs_blkid); 5491719886fSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_stride); 5501719886fSMartin Matuska goto future; 5511719886fSMartin Matuska } 5521719886fSMartin Matuska nblks = dmu_zfetch_future(zs, blkid, nblks); 5531719886fSMartin Matuska if (nblks > 0) 5541719886fSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_stride); 5551719886fSMartin Matuska else 5561719886fSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_future); 5571719886fSMartin Matuska goto future; 5581719886fSMartin Matuska } 5591719886fSMartin Matuska } else if (end_blkid >= zs->zs_blkid) { 5601719886fSMartin Matuska nblks -= zs->zs_blkid - blkid; 5611719886fSMartin Matuska blkid += zs->zs_blkid - blkid; 5621719886fSMartin Matuska goto hit; 5631719886fSMartin Matuska } else if (end_blkid + max_reorder > zs->zs_blkid && 5641719886fSMartin Matuska (int)(zs->zs_atime - t) >= 0) { 5651719886fSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_past); 5661719886fSMartin Matuska zs->zs_atime = gethrestime_sec(); 5671719886fSMartin Matuska goto out; 5681719886fSMartin Matuska } 569f9693befSMartin Matuska } 570f9693befSMartin Matuska 571eda14cbcSMatt Macy /* 5721719886fSMartin Matuska * This access is not part of any existing stream. Create a new 5731719886fSMartin Matuska * stream for it unless we are at the end of file. 574eda14cbcSMatt Macy */ 575*17aab35aSMartin Matuska ASSERT0P(zs); 5761719886fSMartin Matuska if (end_blkid < maxblkid) 5771719886fSMartin Matuska dmu_zfetch_stream_create(zf, end_blkid); 578eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 579f9693befSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_misses); 580*17aab35aSMartin Matuska ipf_start = 0; 581*17aab35aSMartin Matuska goto prescient; 5821719886fSMartin Matuska 5831719886fSMartin Matuska hit: 5841719886fSMartin Matuska nblks = dmu_zfetch_hit(zs, nblks); 5851719886fSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_hits); 5861719886fSMartin Matuska 5871719886fSMartin Matuska future: 5881719886fSMartin Matuska zs->zs_atime = gethrestime_sec(); 5891719886fSMartin Matuska 5901719886fSMartin Matuska /* Exit if we already prefetched for this position before. */ 5911719886fSMartin Matuska if (nblks == 0) 5921719886fSMartin Matuska goto out; 5931719886fSMartin Matuska 5941719886fSMartin Matuska /* If the file is ending, remove the stream. */ 5951719886fSMartin Matuska end_blkid = zs->zs_blkid; 5961719886fSMartin Matuska if (end_blkid >= maxblkid) { 5971719886fSMartin Matuska dmu_zfetch_stream_remove(zf, zs); 5981719886fSMartin Matuska out: 5991719886fSMartin Matuska mutex_exit(&zf->zf_lock); 6001719886fSMartin Matuska if (!have_lock) 6011719886fSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 6021719886fSMartin Matuska return (NULL); 603eda14cbcSMatt Macy } 604eda14cbcSMatt Macy 605eda14cbcSMatt Macy /* 606eda14cbcSMatt Macy * This access was to a block that we issued a prefetch for on 607e3aa18adSMartin Matuska * behalf of this stream. Calculate further prefetch distances. 608eda14cbcSMatt Macy * 609e3aa18adSMartin Matuska * Start prefetch from the demand access size (nblks). Double the 610e3aa18adSMartin Matuska * distance every access up to zfetch_min_distance. After that only 611e3aa18adSMartin Matuska * if needed increase the distance by 1/8 up to zfetch_max_distance. 6122a58b312SMartin Matuska * 6132a58b312SMartin Matuska * Don't double the distance beyond single block if we have more 6142a58b312SMartin Matuska * than ~6% of ARC held by active prefetches. It should help with 6152a58b312SMartin Matuska * getting out of RAM on some badly mispredicted read patterns. 616eda14cbcSMatt Macy */ 6172a58b312SMartin Matuska unsigned int nbytes = nblks << dbs; 618e3aa18adSMartin Matuska unsigned int pf_nblks; 619eda14cbcSMatt Macy if (fetch_data) { 620e3aa18adSMartin Matuska if (unlikely(zs->zs_pf_dist < nbytes)) 621e3aa18adSMartin Matuska zs->zs_pf_dist = nbytes; 6222a58b312SMartin Matuska else if (zs->zs_pf_dist < zfetch_min_distance && 6232a58b312SMartin Matuska (zs->zs_pf_dist < (1 << dbs) || 6242a58b312SMartin Matuska aggsum_compare(&zfetch_sums.zfetchstat_io_active, 6252a58b312SMartin Matuska arc_c_max >> (4 + dbs)) < 0)) 626e3aa18adSMartin Matuska zs->zs_pf_dist *= 2; 627e3aa18adSMartin Matuska else if (zs->zs_more) 628e3aa18adSMartin Matuska zs->zs_pf_dist += zs->zs_pf_dist / 8; 629e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 630e3aa18adSMartin Matuska if (zs->zs_pf_dist > zfetch_max_distance) 631e3aa18adSMartin Matuska zs->zs_pf_dist = zfetch_max_distance; 6322a58b312SMartin Matuska pf_nblks = zs->zs_pf_dist >> dbs; 633eda14cbcSMatt Macy } else { 634eda14cbcSMatt Macy pf_nblks = 0; 635eda14cbcSMatt Macy } 6361719886fSMartin Matuska if (zs->zs_pf_start < end_blkid) 6371719886fSMartin Matuska zs->zs_pf_start = end_blkid; 6381719886fSMartin Matuska if (zs->zs_pf_end < end_blkid + pf_nblks) 6391719886fSMartin Matuska zs->zs_pf_end = end_blkid + pf_nblks; 640eda14cbcSMatt Macy 641eda14cbcSMatt Macy /* 642e3aa18adSMartin Matuska * Do the same for indirects, starting where we will stop reading 643e3aa18adSMartin Matuska * data blocks (and the indirects that point to them). 644eda14cbcSMatt Macy */ 645e3aa18adSMartin Matuska if (unlikely(zs->zs_ipf_dist < nbytes)) 646e3aa18adSMartin Matuska zs->zs_ipf_dist = nbytes; 647e3aa18adSMartin Matuska else 648e3aa18adSMartin Matuska zs->zs_ipf_dist *= 2; 649e3aa18adSMartin Matuska if (zs->zs_ipf_dist > zfetch_max_idistance) 650e3aa18adSMartin Matuska zs->zs_ipf_dist = zfetch_max_idistance; 6512a58b312SMartin Matuska pf_nblks = zs->zs_ipf_dist >> dbs; 652e3aa18adSMartin Matuska if (zs->zs_ipf_start < zs->zs_pf_end) 653e3aa18adSMartin Matuska zs->zs_ipf_start = zs->zs_pf_end; 654*17aab35aSMartin Matuska ipf_start = zs->zs_ipf_end; 655e3aa18adSMartin Matuska if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) 656e3aa18adSMartin Matuska zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; 657eda14cbcSMatt Macy 658f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 659f9693befSMartin Matuska /* Count concurrent callers. */ 660f9693befSMartin Matuska zfs_refcount_add(&zs->zs_callers, NULL); 661eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 662f9693befSMartin Matuska 663*17aab35aSMartin Matuska prescient: 664*17aab35aSMartin Matuska /* 665*17aab35aSMartin Matuska * Prefetch the following indirect blocks for this access to reduce 666*17aab35aSMartin Matuska * dbuf_hold() sync read delays in dmu_buf_hold_array_by_dnode(). 667*17aab35aSMartin Matuska * This covers the gap during the first couple accesses when we can 668*17aab35aSMartin Matuska * not predict the future yet, but know what is needed right now. 669*17aab35aSMartin Matuska * This should be very rare for reads/writes to need more than one 670*17aab35aSMartin Matuska * indirect, but more useful for cloning due to much bigger accesses. 671*17aab35aSMartin Matuska */ 672*17aab35aSMartin Matuska ipf_start = MAX(ipf_start, blkid + 1); 673*17aab35aSMartin Matuska int epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 674*17aab35aSMartin Matuska ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; 675*17aab35aSMartin Matuska ipf_end = P2ROUNDUP(end_blkid, 1 << epbs) >> epbs; 676*17aab35aSMartin Matuska 677*17aab35aSMartin Matuska int issued = 0; 678*17aab35aSMartin Matuska for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { 679*17aab35aSMartin Matuska issued += dbuf_prefetch(zf->zf_dnode, 1, iblk, 680*17aab35aSMartin Matuska ZIO_PRIORITY_SYNC_READ, ARC_FLAG_PRESCIENT_PREFETCH); 681*17aab35aSMartin Matuska } 682*17aab35aSMartin Matuska 683f9693befSMartin Matuska if (!have_lock) 684f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 685*17aab35aSMartin Matuska if (issued) 686*17aab35aSMartin Matuska ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); 687f9693befSMartin Matuska return (zs); 688f9693befSMartin Matuska } 689f9693befSMartin Matuska 690f9693befSMartin Matuska void 6911719886fSMartin Matuska dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed, 6921719886fSMartin Matuska boolean_t have_lock) 693f9693befSMartin Matuska { 694f9693befSMartin Matuska int64_t pf_start, pf_end, ipf_start, ipf_end; 695f9693befSMartin Matuska int epbs, issued; 696f9693befSMartin Matuska 697f9693befSMartin Matuska if (missed) 698f9693befSMartin Matuska zs->zs_missed = missed; 699eda14cbcSMatt Macy 700eda14cbcSMatt Macy /* 701f9693befSMartin Matuska * Postpone the prefetch if there are more concurrent callers. 702f9693befSMartin Matuska * It happens when multiple requests are waiting for the same 703f9693befSMartin Matuska * indirect block. The last one will run the prefetch for all. 704eda14cbcSMatt Macy */ 705f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { 706f9693befSMartin Matuska /* Drop reference taken in dmu_zfetch_prepare(). */ 707f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 708f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 709f9693befSMartin Matuska return; 710f9693befSMartin Matuska } 711eda14cbcSMatt Macy 712f9693befSMartin Matuska mutex_enter(&zf->zf_lock); 713f9693befSMartin Matuska if (zs->zs_missed) { 714e3aa18adSMartin Matuska pf_start = zs->zs_pf_start; 715e3aa18adSMartin Matuska pf_end = zs->zs_pf_start = zs->zs_pf_end; 716f9693befSMartin Matuska } else { 717f9693befSMartin Matuska pf_start = pf_end = 0; 718f9693befSMartin Matuska } 719e3aa18adSMartin Matuska ipf_start = zs->zs_ipf_start; 720e3aa18adSMartin Matuska ipf_end = zs->zs_ipf_start = zs->zs_ipf_end; 721f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 722f9693befSMartin Matuska ASSERT3S(pf_start, <=, pf_end); 723f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 724f9693befSMartin Matuska 725f9693befSMartin Matuska epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 726f9693befSMartin Matuska ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; 727f9693befSMartin Matuska ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; 728f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 729f9693befSMartin Matuska issued = pf_end - pf_start + ipf_end - ipf_start; 730f9693befSMartin Matuska if (issued > 1) { 731f9693befSMartin Matuska /* More references on top of taken in dmu_zfetch_prepare(). */ 7324e8d558cSMartin Matuska zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL); 733f9693befSMartin Matuska } else if (issued == 0) { 734f9693befSMartin Matuska /* Some other thread has done our work, so drop the ref. */ 735f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 736f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 737f9693befSMartin Matuska return; 738f9693befSMartin Matuska } 7392a58b312SMartin Matuska aggsum_add(&zfetch_sums.zfetchstat_io_active, issued); 740f9693befSMartin Matuska 741f9693befSMartin Matuska if (!have_lock) 742f9693befSMartin Matuska rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 743f9693befSMartin Matuska 744f9693befSMartin Matuska issued = 0; 745f9693befSMartin Matuska for (int64_t blk = pf_start; blk < pf_end; blk++) { 746f9693befSMartin Matuska issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, 74715f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 748eda14cbcSMatt Macy } 749f9693befSMartin Matuska for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { 7507877fdebSMatt Macy issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, 75115f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 752eda14cbcSMatt Macy } 753f9693befSMartin Matuska 754eda14cbcSMatt Macy if (!have_lock) 755eda14cbcSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 7567877fdebSMatt Macy 7577877fdebSMatt Macy if (issued) 7587877fdebSMatt Macy ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); 759eda14cbcSMatt Macy } 760eda14cbcSMatt Macy 761f9693befSMartin Matuska void 762f9693befSMartin Matuska dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, 763f9693befSMartin Matuska boolean_t missed, boolean_t have_lock) 764f9693befSMartin Matuska { 765f9693befSMartin Matuska zstream_t *zs; 766f9693befSMartin Matuska 767f9693befSMartin Matuska zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); 768f9693befSMartin Matuska if (zs) 7691719886fSMartin Matuska dmu_zfetch_run(zf, zs, missed, have_lock); 770f9693befSMartin Matuska } 771f9693befSMartin Matuska 772eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, 773eda14cbcSMatt Macy "Disable all ZFS prefetching"); 774eda14cbcSMatt Macy 775eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, 776eda14cbcSMatt Macy "Max number of streams per zfetch"); 777eda14cbcSMatt Macy 778eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, 779eda14cbcSMatt Macy "Min time before stream reclaim"); 780eda14cbcSMatt Macy 781e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW, 782e3aa18adSMartin Matuska "Max time before stream delete"); 783e3aa18adSMartin Matuska 784e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW, 785e3aa18adSMartin Matuska "Min bytes to prefetch per stream"); 786e3aa18adSMartin Matuska 787eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, 788180f8225SMatt Macy "Max bytes to prefetch per stream"); 789180f8225SMatt Macy 790180f8225SMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, 791180f8225SMatt Macy "Max bytes to prefetch indirects for per stream"); 7921719886fSMartin Matuska 7931719886fSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW, 7941719886fSMartin Matuska "Max request reorder distance within a stream"); 7951719886fSMartin Matuska 7961719886fSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW, 7971719886fSMartin Matuska "Max log2 fraction of holes in a stream"); 798