1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23eda14cbcSMatt Macy * Use is subject to license terms. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 26eda14cbcSMatt Macy /* 27eda14cbcSMatt Macy * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 28eda14cbcSMatt Macy */ 29eda14cbcSMatt Macy 30eda14cbcSMatt Macy #include <sys/zfs_context.h> 31*2a58b312SMartin Matuska #include <sys/arc_impl.h> 32eda14cbcSMatt Macy #include <sys/dnode.h> 33eda14cbcSMatt Macy #include <sys/dmu_objset.h> 34eda14cbcSMatt Macy #include <sys/dmu_zfetch.h> 35eda14cbcSMatt Macy #include <sys/dmu.h> 36eda14cbcSMatt Macy #include <sys/dbuf.h> 37eda14cbcSMatt Macy #include <sys/kstat.h> 380d8fe237SMartin Matuska #include <sys/wmsum.h> 39eda14cbcSMatt Macy 40eda14cbcSMatt Macy /* 41eda14cbcSMatt Macy * This tunable disables predictive prefetch. Note that it leaves "prescient" 42eda14cbcSMatt Macy * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 43eda14cbcSMatt Macy * prescient prefetch never issues i/os that end up not being needed, 44eda14cbcSMatt Macy * so it can't hurt performance. 45eda14cbcSMatt Macy */ 46eda14cbcSMatt Macy 47e92ffd9bSMartin Matuska static int zfs_prefetch_disable = B_FALSE; 48eda14cbcSMatt Macy 49eda14cbcSMatt Macy /* max # of streams per zfetch */ 50e92ffd9bSMartin Matuska static unsigned int zfetch_max_streams = 8; 51eda14cbcSMatt Macy /* min time before stream reclaim */ 52e3aa18adSMartin Matuska static unsigned int zfetch_min_sec_reap = 1; 53e3aa18adSMartin Matuska /* max time before stream delete */ 54e3aa18adSMartin Matuska static unsigned int zfetch_max_sec_reap = 2; 55e3aa18adSMartin Matuska /* min bytes to prefetch per stream (default 4MB) */ 56e3aa18adSMartin Matuska static unsigned int zfetch_min_distance = 4 * 1024 * 1024; 57e3aa18adSMartin Matuska /* max bytes to prefetch per stream (default 64MB) */ 58e3aa18adSMartin Matuska unsigned int zfetch_max_distance = 64 * 1024 * 1024; 59eda14cbcSMatt Macy /* max bytes to prefetch indirects for per stream (default 64MB) */ 60eda14cbcSMatt Macy unsigned int zfetch_max_idistance = 64 * 1024 * 1024; 61eda14cbcSMatt Macy /* max number of bytes in an array_read in which we allow prefetching (1MB) */ 62dbd5678dSMartin Matuska uint64_t zfetch_array_rd_sz = 1024 * 1024; 63eda14cbcSMatt Macy 64eda14cbcSMatt Macy typedef struct zfetch_stats { 65eda14cbcSMatt Macy kstat_named_t zfetchstat_hits; 66eda14cbcSMatt Macy kstat_named_t zfetchstat_misses; 67eda14cbcSMatt Macy kstat_named_t zfetchstat_max_streams; 687877fdebSMatt Macy kstat_named_t zfetchstat_io_issued; 69*2a58b312SMartin Matuska kstat_named_t zfetchstat_io_active; 70eda14cbcSMatt Macy } zfetch_stats_t; 71eda14cbcSMatt Macy 72eda14cbcSMatt Macy static zfetch_stats_t zfetch_stats = { 73eda14cbcSMatt Macy { "hits", KSTAT_DATA_UINT64 }, 74eda14cbcSMatt Macy { "misses", KSTAT_DATA_UINT64 }, 75eda14cbcSMatt Macy { "max_streams", KSTAT_DATA_UINT64 }, 767877fdebSMatt Macy { "io_issued", KSTAT_DATA_UINT64 }, 77*2a58b312SMartin Matuska { "io_active", KSTAT_DATA_UINT64 }, 78eda14cbcSMatt Macy }; 79eda14cbcSMatt Macy 800d8fe237SMartin Matuska struct { 810d8fe237SMartin Matuska wmsum_t zfetchstat_hits; 820d8fe237SMartin Matuska wmsum_t zfetchstat_misses; 830d8fe237SMartin Matuska wmsum_t zfetchstat_max_streams; 840d8fe237SMartin Matuska wmsum_t zfetchstat_io_issued; 85*2a58b312SMartin Matuska aggsum_t zfetchstat_io_active; 860d8fe237SMartin Matuska } zfetch_sums; 870d8fe237SMartin Matuska 88eda14cbcSMatt Macy #define ZFETCHSTAT_BUMP(stat) \ 890d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, 1) 907877fdebSMatt Macy #define ZFETCHSTAT_ADD(stat, val) \ 910d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, val) 927877fdebSMatt Macy 93eda14cbcSMatt Macy 94e92ffd9bSMartin Matuska static kstat_t *zfetch_ksp; 95eda14cbcSMatt Macy 960d8fe237SMartin Matuska static int 970d8fe237SMartin Matuska zfetch_kstats_update(kstat_t *ksp, int rw) 980d8fe237SMartin Matuska { 990d8fe237SMartin Matuska zfetch_stats_t *zs = ksp->ks_data; 1000d8fe237SMartin Matuska 1010d8fe237SMartin Matuska if (rw == KSTAT_WRITE) 1020d8fe237SMartin Matuska return (EACCES); 1030d8fe237SMartin Matuska zs->zfetchstat_hits.value.ui64 = 1040d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_hits); 1050d8fe237SMartin Matuska zs->zfetchstat_misses.value.ui64 = 1060d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_misses); 1070d8fe237SMartin Matuska zs->zfetchstat_max_streams.value.ui64 = 1080d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_max_streams); 1090d8fe237SMartin Matuska zs->zfetchstat_io_issued.value.ui64 = 1100d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_io_issued); 111*2a58b312SMartin Matuska zs->zfetchstat_io_active.value.ui64 = 112*2a58b312SMartin Matuska aggsum_value(&zfetch_sums.zfetchstat_io_active); 1130d8fe237SMartin Matuska return (0); 1140d8fe237SMartin Matuska } 1150d8fe237SMartin Matuska 116eda14cbcSMatt Macy void 117eda14cbcSMatt Macy zfetch_init(void) 118eda14cbcSMatt Macy { 1190d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_hits, 0); 1200d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_misses, 0); 1210d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); 1220d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); 123*2a58b312SMartin Matuska aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); 1240d8fe237SMartin Matuska 125eda14cbcSMatt Macy zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 126eda14cbcSMatt Macy KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 127eda14cbcSMatt Macy KSTAT_FLAG_VIRTUAL); 128eda14cbcSMatt Macy 129eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 130eda14cbcSMatt Macy zfetch_ksp->ks_data = &zfetch_stats; 1310d8fe237SMartin Matuska zfetch_ksp->ks_update = zfetch_kstats_update; 132eda14cbcSMatt Macy kstat_install(zfetch_ksp); 133eda14cbcSMatt Macy } 134eda14cbcSMatt Macy } 135eda14cbcSMatt Macy 136eda14cbcSMatt Macy void 137eda14cbcSMatt Macy zfetch_fini(void) 138eda14cbcSMatt Macy { 139eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 140eda14cbcSMatt Macy kstat_delete(zfetch_ksp); 141eda14cbcSMatt Macy zfetch_ksp = NULL; 142eda14cbcSMatt Macy } 1430d8fe237SMartin Matuska 1440d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_hits); 1450d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_misses); 1460d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_max_streams); 1470d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_io_issued); 148*2a58b312SMartin Matuska ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); 149*2a58b312SMartin Matuska aggsum_fini(&zfetch_sums.zfetchstat_io_active); 150eda14cbcSMatt Macy } 151eda14cbcSMatt Macy 152eda14cbcSMatt Macy /* 153eda14cbcSMatt Macy * This takes a pointer to a zfetch structure and a dnode. It performs the 154eda14cbcSMatt Macy * necessary setup for the zfetch structure, grokking data from the 155eda14cbcSMatt Macy * associated dnode. 156eda14cbcSMatt Macy */ 157eda14cbcSMatt Macy void 158eda14cbcSMatt Macy dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 159eda14cbcSMatt Macy { 160eda14cbcSMatt Macy if (zf == NULL) 161eda14cbcSMatt Macy return; 162eda14cbcSMatt Macy zf->zf_dnode = dno; 1637877fdebSMatt Macy zf->zf_numstreams = 0; 164eda14cbcSMatt Macy 165eda14cbcSMatt Macy list_create(&zf->zf_stream, sizeof (zstream_t), 166eda14cbcSMatt Macy offsetof(zstream_t, zs_node)); 167eda14cbcSMatt Macy 168eda14cbcSMatt Macy mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); 169eda14cbcSMatt Macy } 170eda14cbcSMatt Macy 171eda14cbcSMatt Macy static void 1727877fdebSMatt Macy dmu_zfetch_stream_fini(zstream_t *zs) 1737877fdebSMatt Macy { 174f9693befSMartin Matuska ASSERT(!list_link_active(&zs->zs_node)); 1757cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_callers); 1767cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_refs); 1777877fdebSMatt Macy kmem_free(zs, sizeof (*zs)); 1787877fdebSMatt Macy } 1797877fdebSMatt Macy 1807877fdebSMatt Macy static void 181eda14cbcSMatt Macy dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 182eda14cbcSMatt Macy { 183eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 184eda14cbcSMatt Macy list_remove(&zf->zf_stream, zs); 185f9693befSMartin Matuska zf->zf_numstreams--; 186f9693befSMartin Matuska membar_producer(); 187f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 1887877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 189eda14cbcSMatt Macy } 190eda14cbcSMatt Macy 191eda14cbcSMatt Macy /* 192eda14cbcSMatt Macy * Clean-up state associated with a zfetch structure (e.g. destroy the 193eda14cbcSMatt Macy * streams). This doesn't free the zfetch_t itself, that's left to the caller. 194eda14cbcSMatt Macy */ 195eda14cbcSMatt Macy void 196eda14cbcSMatt Macy dmu_zfetch_fini(zfetch_t *zf) 197eda14cbcSMatt Macy { 198eda14cbcSMatt Macy zstream_t *zs; 199eda14cbcSMatt Macy 200eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 201f9693befSMartin Matuska while ((zs = list_head(&zf->zf_stream)) != NULL) 202eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 203eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 204eda14cbcSMatt Macy list_destroy(&zf->zf_stream); 205eda14cbcSMatt Macy mutex_destroy(&zf->zf_lock); 206eda14cbcSMatt Macy 207eda14cbcSMatt Macy zf->zf_dnode = NULL; 208eda14cbcSMatt Macy } 209eda14cbcSMatt Macy 210eda14cbcSMatt Macy /* 211e3aa18adSMartin Matuska * If there aren't too many active streams already, create one more. 212e3aa18adSMartin Matuska * In process delete/reuse all streams without hits for zfetch_max_sec_reap. 213e3aa18adSMartin Matuska * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. 214eda14cbcSMatt Macy * The "blkid" argument is the next block that we expect this stream to access. 215eda14cbcSMatt Macy */ 216eda14cbcSMatt Macy static void 217eda14cbcSMatt Macy dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) 218eda14cbcSMatt Macy { 219e3aa18adSMartin Matuska zstream_t *zs, *zs_next, *zs_old = NULL; 220e3aa18adSMartin Matuska hrtime_t now = gethrtime(), t; 221eda14cbcSMatt Macy 222eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 223eda14cbcSMatt Macy 224eda14cbcSMatt Macy /* 225e3aa18adSMartin Matuska * Delete too old streams, reusing the first found one. 226eda14cbcSMatt Macy */ 227e3aa18adSMartin Matuska t = now - SEC2NSEC(zfetch_max_sec_reap); 228e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { 229eda14cbcSMatt Macy zs_next = list_next(&zf->zf_stream, zs); 2307877fdebSMatt Macy /* 231f9693befSMartin Matuska * Skip if still active. 1 -- zf_stream reference. 2327877fdebSMatt Macy */ 233f9693befSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 2347877fdebSMatt Macy continue; 235e3aa18adSMartin Matuska if (zs->zs_atime > t) 236e3aa18adSMartin Matuska continue; 237e3aa18adSMartin Matuska if (zs_old) 238eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 239e3aa18adSMartin Matuska else 240e3aa18adSMartin Matuska zs_old = zs; 241e3aa18adSMartin Matuska } 242e3aa18adSMartin Matuska if (zs_old) { 243e3aa18adSMartin Matuska zs = zs_old; 244e3aa18adSMartin Matuska goto reuse; 245eda14cbcSMatt Macy } 246eda14cbcSMatt Macy 247eda14cbcSMatt Macy /* 248eda14cbcSMatt Macy * The maximum number of streams is normally zfetch_max_streams, 249eda14cbcSMatt Macy * but for small files we lower it such that it's at least possible 250eda14cbcSMatt Macy * for all the streams to be non-overlapping. 251eda14cbcSMatt Macy */ 252eda14cbcSMatt Macy uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, 253eda14cbcSMatt Macy zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / 254eda14cbcSMatt Macy zfetch_max_distance)); 2557877fdebSMatt Macy if (zf->zf_numstreams >= max_streams) { 256e3aa18adSMartin Matuska t = now - SEC2NSEC(zfetch_min_sec_reap); 257e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; 258e3aa18adSMartin Matuska zs = list_next(&zf->zf_stream, zs)) { 259e3aa18adSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 260e3aa18adSMartin Matuska continue; 261e3aa18adSMartin Matuska if (zs->zs_atime > t) 262e3aa18adSMartin Matuska continue; 263e3aa18adSMartin Matuska if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime) 264e3aa18adSMartin Matuska zs_old = zs; 265e3aa18adSMartin Matuska } 266e3aa18adSMartin Matuska if (zs_old) { 267e3aa18adSMartin Matuska zs = zs_old; 268e3aa18adSMartin Matuska goto reuse; 269e3aa18adSMartin Matuska } 270eda14cbcSMatt Macy ZFETCHSTAT_BUMP(zfetchstat_max_streams); 271eda14cbcSMatt Macy return; 272eda14cbcSMatt Macy } 273eda14cbcSMatt Macy 274e3aa18adSMartin Matuska zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); 2757877fdebSMatt Macy zs->zs_fetch = zf; 276f9693befSMartin Matuska zfs_refcount_create(&zs->zs_callers); 277f9693befSMartin Matuska zfs_refcount_create(&zs->zs_refs); 278f9693befSMartin Matuska /* One reference for zf_stream. */ 279f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 2807877fdebSMatt Macy zf->zf_numstreams++; 281eda14cbcSMatt Macy list_insert_head(&zf->zf_stream, zs); 282e3aa18adSMartin Matuska 283e3aa18adSMartin Matuska reuse: 284e3aa18adSMartin Matuska zs->zs_blkid = blkid; 285e3aa18adSMartin Matuska zs->zs_pf_dist = 0; 286e3aa18adSMartin Matuska zs->zs_pf_start = blkid; 287e3aa18adSMartin Matuska zs->zs_pf_end = blkid; 288e3aa18adSMartin Matuska zs->zs_ipf_dist = 0; 289e3aa18adSMartin Matuska zs->zs_ipf_start = blkid; 290e3aa18adSMartin Matuska zs->zs_ipf_end = blkid; 291e3aa18adSMartin Matuska /* Allow immediate stream reuse until first hit. */ 292e3aa18adSMartin Matuska zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap); 293e3aa18adSMartin Matuska zs->zs_missed = B_FALSE; 294e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 295eda14cbcSMatt Macy } 296eda14cbcSMatt Macy 2977877fdebSMatt Macy static void 298e3aa18adSMartin Matuska dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) 2997877fdebSMatt Macy { 3007877fdebSMatt Macy zstream_t *zs = arg; 3017877fdebSMatt Macy 302e3aa18adSMartin Matuska if (io_issued && level == 0 && blkid < zs->zs_blkid) 303e3aa18adSMartin Matuska zs->zs_more = B_TRUE; 304f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 3057877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 306*2a58b312SMartin Matuska aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); 3077877fdebSMatt Macy } 3087877fdebSMatt Macy 309eda14cbcSMatt Macy /* 310f9693befSMartin Matuska * This is the predictive prefetch entry point. dmu_zfetch_prepare() 311f9693befSMartin Matuska * associates dnode access specified with blkid and nblks arguments with 312f9693befSMartin Matuska * prefetch stream, predicts further accesses based on that stats and returns 313f9693befSMartin Matuska * the stream pointer on success. That pointer must later be passed to 314f9693befSMartin Matuska * dmu_zfetch_run() to initiate the speculative prefetch for the stream and 315f9693befSMartin Matuska * release it. dmu_zfetch() is a wrapper for simple cases when window between 316f9693befSMartin Matuska * prediction and prefetch initiation is not needed. 317eda14cbcSMatt Macy * fetch_data argument specifies whether actual data blocks should be fetched: 318eda14cbcSMatt Macy * FALSE -- prefetch only indirect blocks for predicted data blocks; 319eda14cbcSMatt Macy * TRUE -- prefetch predicted data blocks plus following indirect blocks. 320eda14cbcSMatt Macy */ 321f9693befSMartin Matuska zstream_t * 322f9693befSMartin Matuska dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, 323f9693befSMartin Matuska boolean_t fetch_data, boolean_t have_lock) 324eda14cbcSMatt Macy { 325eda14cbcSMatt Macy zstream_t *zs; 326eda14cbcSMatt Macy spa_t *spa = zf->zf_dnode->dn_objset->os_spa; 327eda14cbcSMatt Macy 328eda14cbcSMatt Macy if (zfs_prefetch_disable) 329f9693befSMartin Matuska return (NULL); 330eda14cbcSMatt Macy /* 331eda14cbcSMatt Macy * If we haven't yet loaded the indirect vdevs' mappings, we 332eda14cbcSMatt Macy * can only read from blocks that we carefully ensure are on 333eda14cbcSMatt Macy * concrete vdevs (or previously-loaded indirect vdevs). So we 334eda14cbcSMatt Macy * can't allow the predictive prefetcher to attempt reads of other 335eda14cbcSMatt Macy * blocks (e.g. of the MOS's dnode object). 336eda14cbcSMatt Macy */ 337eda14cbcSMatt Macy if (!spa_indirect_vdevs_loaded(spa)) 338f9693befSMartin Matuska return (NULL); 339eda14cbcSMatt Macy 340eda14cbcSMatt Macy /* 341eda14cbcSMatt Macy * As a fast path for small (single-block) files, ignore access 342eda14cbcSMatt Macy * to the first block. 343eda14cbcSMatt Macy */ 3447877fdebSMatt Macy if (!have_lock && blkid == 0) 345f9693befSMartin Matuska return (NULL); 346eda14cbcSMatt Macy 347eda14cbcSMatt Macy if (!have_lock) 348eda14cbcSMatt Macy rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 3497877fdebSMatt Macy 3507877fdebSMatt Macy /* 3517877fdebSMatt Macy * A fast path for small files for which no prefetch will 3527877fdebSMatt Macy * happen. 3537877fdebSMatt Macy */ 354e3aa18adSMartin Matuska uint64_t maxblkid = zf->zf_dnode->dn_maxblkid; 355f9693befSMartin Matuska if (maxblkid < 2) { 3567877fdebSMatt Macy if (!have_lock) 3577877fdebSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 358f9693befSMartin Matuska return (NULL); 3597877fdebSMatt Macy } 360eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 361eda14cbcSMatt Macy 362eda14cbcSMatt Macy /* 363eda14cbcSMatt Macy * Find matching prefetch stream. Depending on whether the accesses 364eda14cbcSMatt Macy * are block-aligned, first block of the new access may either follow 365eda14cbcSMatt Macy * the last block of the previous access, or be equal to it. 366eda14cbcSMatt Macy */ 367eda14cbcSMatt Macy for (zs = list_head(&zf->zf_stream); zs != NULL; 368eda14cbcSMatt Macy zs = list_next(&zf->zf_stream, zs)) { 369eda14cbcSMatt Macy if (blkid == zs->zs_blkid) { 370eda14cbcSMatt Macy break; 371eda14cbcSMatt Macy } else if (blkid + 1 == zs->zs_blkid) { 372eda14cbcSMatt Macy blkid++; 373eda14cbcSMatt Macy nblks--; 374eda14cbcSMatt Macy break; 375eda14cbcSMatt Macy } 376eda14cbcSMatt Macy } 377f9693befSMartin Matuska 378f9693befSMartin Matuska /* 379f9693befSMartin Matuska * If the file is ending, remove the matching stream if found. 380f9693befSMartin Matuska * If not found then it is too late to create a new one now. 381f9693befSMartin Matuska */ 382e3aa18adSMartin Matuska uint64_t end_of_access_blkid = blkid + nblks; 383f9693befSMartin Matuska if (end_of_access_blkid >= maxblkid) { 384f9693befSMartin Matuska if (zs != NULL) 385f9693befSMartin Matuska dmu_zfetch_stream_remove(zf, zs); 386f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 387f9693befSMartin Matuska if (!have_lock) 388f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 389f9693befSMartin Matuska return (NULL); 390f9693befSMartin Matuska } 391f9693befSMartin Matuska 392f9693befSMartin Matuska /* Exit if we already prefetched this block before. */ 393f9693befSMartin Matuska if (nblks == 0) { 394f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 395f9693befSMartin Matuska if (!have_lock) 396f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 397f9693befSMartin Matuska return (NULL); 398eda14cbcSMatt Macy } 399eda14cbcSMatt Macy 400eda14cbcSMatt Macy if (zs == NULL) { 401eda14cbcSMatt Macy /* 402eda14cbcSMatt Macy * This access is not part of any existing stream. Create 403eda14cbcSMatt Macy * a new stream for it. 404eda14cbcSMatt Macy */ 405eda14cbcSMatt Macy dmu_zfetch_stream_create(zf, end_of_access_blkid); 406eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 407eda14cbcSMatt Macy if (!have_lock) 408eda14cbcSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 409f9693befSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_misses); 410f9693befSMartin Matuska return (NULL); 411eda14cbcSMatt Macy } 412eda14cbcSMatt Macy 413eda14cbcSMatt Macy /* 414eda14cbcSMatt Macy * This access was to a block that we issued a prefetch for on 415e3aa18adSMartin Matuska * behalf of this stream. Calculate further prefetch distances. 416eda14cbcSMatt Macy * 417e3aa18adSMartin Matuska * Start prefetch from the demand access size (nblks). Double the 418e3aa18adSMartin Matuska * distance every access up to zfetch_min_distance. After that only 419e3aa18adSMartin Matuska * if needed increase the distance by 1/8 up to zfetch_max_distance. 420*2a58b312SMartin Matuska * 421*2a58b312SMartin Matuska * Don't double the distance beyond single block if we have more 422*2a58b312SMartin Matuska * than ~6% of ARC held by active prefetches. It should help with 423*2a58b312SMartin Matuska * getting out of RAM on some badly mispredicted read patterns. 424eda14cbcSMatt Macy */ 425*2a58b312SMartin Matuska unsigned int dbs = zf->zf_dnode->dn_datablkshift; 426*2a58b312SMartin Matuska unsigned int nbytes = nblks << dbs; 427e3aa18adSMartin Matuska unsigned int pf_nblks; 428eda14cbcSMatt Macy if (fetch_data) { 429e3aa18adSMartin Matuska if (unlikely(zs->zs_pf_dist < nbytes)) 430e3aa18adSMartin Matuska zs->zs_pf_dist = nbytes; 431*2a58b312SMartin Matuska else if (zs->zs_pf_dist < zfetch_min_distance && 432*2a58b312SMartin Matuska (zs->zs_pf_dist < (1 << dbs) || 433*2a58b312SMartin Matuska aggsum_compare(&zfetch_sums.zfetchstat_io_active, 434*2a58b312SMartin Matuska arc_c_max >> (4 + dbs)) < 0)) 435e3aa18adSMartin Matuska zs->zs_pf_dist *= 2; 436e3aa18adSMartin Matuska else if (zs->zs_more) 437e3aa18adSMartin Matuska zs->zs_pf_dist += zs->zs_pf_dist / 8; 438e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 439e3aa18adSMartin Matuska if (zs->zs_pf_dist > zfetch_max_distance) 440e3aa18adSMartin Matuska zs->zs_pf_dist = zfetch_max_distance; 441*2a58b312SMartin Matuska pf_nblks = zs->zs_pf_dist >> dbs; 442eda14cbcSMatt Macy } else { 443eda14cbcSMatt Macy pf_nblks = 0; 444eda14cbcSMatt Macy } 445e3aa18adSMartin Matuska if (zs->zs_pf_start < end_of_access_blkid) 446e3aa18adSMartin Matuska zs->zs_pf_start = end_of_access_blkid; 447e3aa18adSMartin Matuska if (zs->zs_pf_end < end_of_access_blkid + pf_nblks) 448e3aa18adSMartin Matuska zs->zs_pf_end = end_of_access_blkid + pf_nblks; 449eda14cbcSMatt Macy 450eda14cbcSMatt Macy /* 451e3aa18adSMartin Matuska * Do the same for indirects, starting where we will stop reading 452e3aa18adSMartin Matuska * data blocks (and the indirects that point to them). 453eda14cbcSMatt Macy */ 454e3aa18adSMartin Matuska if (unlikely(zs->zs_ipf_dist < nbytes)) 455e3aa18adSMartin Matuska zs->zs_ipf_dist = nbytes; 456e3aa18adSMartin Matuska else 457e3aa18adSMartin Matuska zs->zs_ipf_dist *= 2; 458e3aa18adSMartin Matuska if (zs->zs_ipf_dist > zfetch_max_idistance) 459e3aa18adSMartin Matuska zs->zs_ipf_dist = zfetch_max_idistance; 460*2a58b312SMartin Matuska pf_nblks = zs->zs_ipf_dist >> dbs; 461e3aa18adSMartin Matuska if (zs->zs_ipf_start < zs->zs_pf_end) 462e3aa18adSMartin Matuska zs->zs_ipf_start = zs->zs_pf_end; 463e3aa18adSMartin Matuska if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) 464e3aa18adSMartin Matuska zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; 465eda14cbcSMatt Macy 466eda14cbcSMatt Macy zs->zs_blkid = end_of_access_blkid; 467f9693befSMartin Matuska /* Protect the stream from reclamation. */ 468f9693befSMartin Matuska zs->zs_atime = gethrtime(); 469f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 470f9693befSMartin Matuska /* Count concurrent callers. */ 471f9693befSMartin Matuska zfs_refcount_add(&zs->zs_callers, NULL); 472eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 473f9693befSMartin Matuska 474f9693befSMartin Matuska if (!have_lock) 475f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 476f9693befSMartin Matuska 477f9693befSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_hits); 478f9693befSMartin Matuska return (zs); 479f9693befSMartin Matuska } 480f9693befSMartin Matuska 481f9693befSMartin Matuska void 482f9693befSMartin Matuska dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) 483f9693befSMartin Matuska { 484f9693befSMartin Matuska zfetch_t *zf = zs->zs_fetch; 485f9693befSMartin Matuska int64_t pf_start, pf_end, ipf_start, ipf_end; 486f9693befSMartin Matuska int epbs, issued; 487f9693befSMartin Matuska 488f9693befSMartin Matuska if (missed) 489f9693befSMartin Matuska zs->zs_missed = missed; 490eda14cbcSMatt Macy 491eda14cbcSMatt Macy /* 492f9693befSMartin Matuska * Postpone the prefetch if there are more concurrent callers. 493f9693befSMartin Matuska * It happens when multiple requests are waiting for the same 494f9693befSMartin Matuska * indirect block. The last one will run the prefetch for all. 495eda14cbcSMatt Macy */ 496f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { 497f9693befSMartin Matuska /* Drop reference taken in dmu_zfetch_prepare(). */ 498f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 499f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 500f9693befSMartin Matuska return; 501f9693befSMartin Matuska } 502eda14cbcSMatt Macy 503f9693befSMartin Matuska mutex_enter(&zf->zf_lock); 504f9693befSMartin Matuska if (zs->zs_missed) { 505e3aa18adSMartin Matuska pf_start = zs->zs_pf_start; 506e3aa18adSMartin Matuska pf_end = zs->zs_pf_start = zs->zs_pf_end; 507f9693befSMartin Matuska } else { 508f9693befSMartin Matuska pf_start = pf_end = 0; 509f9693befSMartin Matuska } 510e3aa18adSMartin Matuska ipf_start = zs->zs_ipf_start; 511e3aa18adSMartin Matuska ipf_end = zs->zs_ipf_start = zs->zs_ipf_end; 512f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 513f9693befSMartin Matuska ASSERT3S(pf_start, <=, pf_end); 514f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 515f9693befSMartin Matuska 516f9693befSMartin Matuska epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 517f9693befSMartin Matuska ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; 518f9693befSMartin Matuska ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; 519f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 520f9693befSMartin Matuska issued = pf_end - pf_start + ipf_end - ipf_start; 521f9693befSMartin Matuska if (issued > 1) { 522f9693befSMartin Matuska /* More references on top of taken in dmu_zfetch_prepare(). */ 5236ba2210eSMartin Matuska for (int i = 0; i < issued - 1; i++) 5246ba2210eSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 525f9693befSMartin Matuska } else if (issued == 0) { 526f9693befSMartin Matuska /* Some other thread has done our work, so drop the ref. */ 527f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 528f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 529f9693befSMartin Matuska return; 530f9693befSMartin Matuska } 531*2a58b312SMartin Matuska aggsum_add(&zfetch_sums.zfetchstat_io_active, issued); 532f9693befSMartin Matuska 533f9693befSMartin Matuska if (!have_lock) 534f9693befSMartin Matuska rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 535f9693befSMartin Matuska 536f9693befSMartin Matuska issued = 0; 537f9693befSMartin Matuska for (int64_t blk = pf_start; blk < pf_end; blk++) { 538f9693befSMartin Matuska issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, 53915f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 540eda14cbcSMatt Macy } 541f9693befSMartin Matuska for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { 5427877fdebSMatt Macy issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, 54315f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 544eda14cbcSMatt Macy } 545f9693befSMartin Matuska 546eda14cbcSMatt Macy if (!have_lock) 547eda14cbcSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 5487877fdebSMatt Macy 5497877fdebSMatt Macy if (issued) 5507877fdebSMatt Macy ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); 551eda14cbcSMatt Macy } 552eda14cbcSMatt Macy 553f9693befSMartin Matuska void 554f9693befSMartin Matuska dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, 555f9693befSMartin Matuska boolean_t missed, boolean_t have_lock) 556f9693befSMartin Matuska { 557f9693befSMartin Matuska zstream_t *zs; 558f9693befSMartin Matuska 559f9693befSMartin Matuska zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); 560f9693befSMartin Matuska if (zs) 561f9693befSMartin Matuska dmu_zfetch_run(zs, missed, have_lock); 562f9693befSMartin Matuska } 563f9693befSMartin Matuska 564eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, 565eda14cbcSMatt Macy "Disable all ZFS prefetching"); 566eda14cbcSMatt Macy 567eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, 568eda14cbcSMatt Macy "Max number of streams per zfetch"); 569eda14cbcSMatt Macy 570eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, 571eda14cbcSMatt Macy "Min time before stream reclaim"); 572eda14cbcSMatt Macy 573e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW, 574e3aa18adSMartin Matuska "Max time before stream delete"); 575e3aa18adSMartin Matuska 576e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW, 577e3aa18adSMartin Matuska "Min bytes to prefetch per stream"); 578e3aa18adSMartin Matuska 579eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, 580180f8225SMatt Macy "Max bytes to prefetch per stream"); 581180f8225SMatt Macy 582180f8225SMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, 583180f8225SMatt Macy "Max bytes to prefetch indirects for per stream"); 584eda14cbcSMatt Macy 585dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW, 586eda14cbcSMatt Macy "Number of bytes in a array_read"); 587