1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23eda14cbcSMatt Macy * Use is subject to license terms. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 26eda14cbcSMatt Macy /* 27eda14cbcSMatt Macy * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 28eda14cbcSMatt Macy */ 29eda14cbcSMatt Macy 30eda14cbcSMatt Macy #include <sys/zfs_context.h> 31eda14cbcSMatt Macy #include <sys/dnode.h> 32eda14cbcSMatt Macy #include <sys/dmu_objset.h> 33eda14cbcSMatt Macy #include <sys/dmu_zfetch.h> 34eda14cbcSMatt Macy #include <sys/dmu.h> 35eda14cbcSMatt Macy #include <sys/dbuf.h> 36eda14cbcSMatt Macy #include <sys/kstat.h> 370d8fe237SMartin Matuska #include <sys/wmsum.h> 38eda14cbcSMatt Macy 39eda14cbcSMatt Macy /* 40eda14cbcSMatt Macy * This tunable disables predictive prefetch. Note that it leaves "prescient" 41eda14cbcSMatt Macy * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 42eda14cbcSMatt Macy * prescient prefetch never issues i/os that end up not being needed, 43eda14cbcSMatt Macy * so it can't hurt performance. 44eda14cbcSMatt Macy */ 45eda14cbcSMatt Macy 46e92ffd9bSMartin Matuska static int zfs_prefetch_disable = B_FALSE; 47eda14cbcSMatt Macy 48eda14cbcSMatt Macy /* max # of streams per zfetch */ 49e92ffd9bSMartin Matuska static unsigned int zfetch_max_streams = 8; 50eda14cbcSMatt Macy /* min time before stream reclaim */ 51e3aa18adSMartin Matuska static unsigned int zfetch_min_sec_reap = 1; 52e3aa18adSMartin Matuska /* max time before stream delete */ 53e3aa18adSMartin Matuska static unsigned int zfetch_max_sec_reap = 2; 54e3aa18adSMartin Matuska /* min bytes to prefetch per stream (default 4MB) */ 55e3aa18adSMartin Matuska static unsigned int zfetch_min_distance = 4 * 1024 * 1024; 56e3aa18adSMartin Matuska /* max bytes to prefetch per stream (default 64MB) */ 57e3aa18adSMartin Matuska unsigned int zfetch_max_distance = 64 * 1024 * 1024; 58eda14cbcSMatt Macy /* max bytes to prefetch indirects for per stream (default 64MB) */ 59eda14cbcSMatt Macy unsigned int zfetch_max_idistance = 64 * 1024 * 1024; 60eda14cbcSMatt Macy /* max number of bytes in an array_read in which we allow prefetching (1MB) */ 61dbd5678dSMartin Matuska uint64_t zfetch_array_rd_sz = 1024 * 1024; 62eda14cbcSMatt Macy 63eda14cbcSMatt Macy typedef struct zfetch_stats { 64eda14cbcSMatt Macy kstat_named_t zfetchstat_hits; 65eda14cbcSMatt Macy kstat_named_t zfetchstat_misses; 66eda14cbcSMatt Macy kstat_named_t zfetchstat_max_streams; 677877fdebSMatt Macy kstat_named_t zfetchstat_io_issued; 68eda14cbcSMatt Macy } zfetch_stats_t; 69eda14cbcSMatt Macy 70eda14cbcSMatt Macy static zfetch_stats_t zfetch_stats = { 71eda14cbcSMatt Macy { "hits", KSTAT_DATA_UINT64 }, 72eda14cbcSMatt Macy { "misses", KSTAT_DATA_UINT64 }, 73eda14cbcSMatt Macy { "max_streams", KSTAT_DATA_UINT64 }, 747877fdebSMatt Macy { "io_issued", KSTAT_DATA_UINT64 }, 75eda14cbcSMatt Macy }; 76eda14cbcSMatt Macy 770d8fe237SMartin Matuska struct { 780d8fe237SMartin Matuska wmsum_t zfetchstat_hits; 790d8fe237SMartin Matuska wmsum_t zfetchstat_misses; 800d8fe237SMartin Matuska wmsum_t zfetchstat_max_streams; 810d8fe237SMartin Matuska wmsum_t zfetchstat_io_issued; 820d8fe237SMartin Matuska } zfetch_sums; 830d8fe237SMartin Matuska 84eda14cbcSMatt Macy #define ZFETCHSTAT_BUMP(stat) \ 850d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, 1) 867877fdebSMatt Macy #define ZFETCHSTAT_ADD(stat, val) \ 870d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, val) 887877fdebSMatt Macy 89eda14cbcSMatt Macy 90e92ffd9bSMartin Matuska static kstat_t *zfetch_ksp; 91eda14cbcSMatt Macy 920d8fe237SMartin Matuska static int 930d8fe237SMartin Matuska zfetch_kstats_update(kstat_t *ksp, int rw) 940d8fe237SMartin Matuska { 950d8fe237SMartin Matuska zfetch_stats_t *zs = ksp->ks_data; 960d8fe237SMartin Matuska 970d8fe237SMartin Matuska if (rw == KSTAT_WRITE) 980d8fe237SMartin Matuska return (EACCES); 990d8fe237SMartin Matuska zs->zfetchstat_hits.value.ui64 = 1000d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_hits); 1010d8fe237SMartin Matuska zs->zfetchstat_misses.value.ui64 = 1020d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_misses); 1030d8fe237SMartin Matuska zs->zfetchstat_max_streams.value.ui64 = 1040d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_max_streams); 1050d8fe237SMartin Matuska zs->zfetchstat_io_issued.value.ui64 = 1060d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_io_issued); 1070d8fe237SMartin Matuska return (0); 1080d8fe237SMartin Matuska } 1090d8fe237SMartin Matuska 110eda14cbcSMatt Macy void 111eda14cbcSMatt Macy zfetch_init(void) 112eda14cbcSMatt Macy { 1130d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_hits, 0); 1140d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_misses, 0); 1150d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); 1160d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); 1170d8fe237SMartin Matuska 118eda14cbcSMatt Macy zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 119eda14cbcSMatt Macy KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 120eda14cbcSMatt Macy KSTAT_FLAG_VIRTUAL); 121eda14cbcSMatt Macy 122eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 123eda14cbcSMatt Macy zfetch_ksp->ks_data = &zfetch_stats; 1240d8fe237SMartin Matuska zfetch_ksp->ks_update = zfetch_kstats_update; 125eda14cbcSMatt Macy kstat_install(zfetch_ksp); 126eda14cbcSMatt Macy } 127eda14cbcSMatt Macy } 128eda14cbcSMatt Macy 129eda14cbcSMatt Macy void 130eda14cbcSMatt Macy zfetch_fini(void) 131eda14cbcSMatt Macy { 132eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 133eda14cbcSMatt Macy kstat_delete(zfetch_ksp); 134eda14cbcSMatt Macy zfetch_ksp = NULL; 135eda14cbcSMatt Macy } 1360d8fe237SMartin Matuska 1370d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_hits); 1380d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_misses); 1390d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_max_streams); 1400d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_io_issued); 141eda14cbcSMatt Macy } 142eda14cbcSMatt Macy 143eda14cbcSMatt Macy /* 144eda14cbcSMatt Macy * This takes a pointer to a zfetch structure and a dnode. It performs the 145eda14cbcSMatt Macy * necessary setup for the zfetch structure, grokking data from the 146eda14cbcSMatt Macy * associated dnode. 147eda14cbcSMatt Macy */ 148eda14cbcSMatt Macy void 149eda14cbcSMatt Macy dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 150eda14cbcSMatt Macy { 151eda14cbcSMatt Macy if (zf == NULL) 152eda14cbcSMatt Macy return; 153eda14cbcSMatt Macy zf->zf_dnode = dno; 1547877fdebSMatt Macy zf->zf_numstreams = 0; 155eda14cbcSMatt Macy 156eda14cbcSMatt Macy list_create(&zf->zf_stream, sizeof (zstream_t), 157eda14cbcSMatt Macy offsetof(zstream_t, zs_node)); 158eda14cbcSMatt Macy 159eda14cbcSMatt Macy mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); 160eda14cbcSMatt Macy } 161eda14cbcSMatt Macy 162eda14cbcSMatt Macy static void 1637877fdebSMatt Macy dmu_zfetch_stream_fini(zstream_t *zs) 1647877fdebSMatt Macy { 165f9693befSMartin Matuska ASSERT(!list_link_active(&zs->zs_node)); 1667cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_callers); 1677cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_refs); 1687877fdebSMatt Macy kmem_free(zs, sizeof (*zs)); 1697877fdebSMatt Macy } 1707877fdebSMatt Macy 1717877fdebSMatt Macy static void 172eda14cbcSMatt Macy dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 173eda14cbcSMatt Macy { 174eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 175eda14cbcSMatt Macy list_remove(&zf->zf_stream, zs); 176f9693befSMartin Matuska zf->zf_numstreams--; 177f9693befSMartin Matuska membar_producer(); 178f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 1797877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 180eda14cbcSMatt Macy } 181eda14cbcSMatt Macy 182eda14cbcSMatt Macy /* 183eda14cbcSMatt Macy * Clean-up state associated with a zfetch structure (e.g. destroy the 184eda14cbcSMatt Macy * streams). This doesn't free the zfetch_t itself, that's left to the caller. 185eda14cbcSMatt Macy */ 186eda14cbcSMatt Macy void 187eda14cbcSMatt Macy dmu_zfetch_fini(zfetch_t *zf) 188eda14cbcSMatt Macy { 189eda14cbcSMatt Macy zstream_t *zs; 190eda14cbcSMatt Macy 191eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 192f9693befSMartin Matuska while ((zs = list_head(&zf->zf_stream)) != NULL) 193eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 194eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 195eda14cbcSMatt Macy list_destroy(&zf->zf_stream); 196eda14cbcSMatt Macy mutex_destroy(&zf->zf_lock); 197eda14cbcSMatt Macy 198eda14cbcSMatt Macy zf->zf_dnode = NULL; 199eda14cbcSMatt Macy } 200eda14cbcSMatt Macy 201eda14cbcSMatt Macy /* 202e3aa18adSMartin Matuska * If there aren't too many active streams already, create one more. 203e3aa18adSMartin Matuska * In process delete/reuse all streams without hits for zfetch_max_sec_reap. 204e3aa18adSMartin Matuska * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. 205eda14cbcSMatt Macy * The "blkid" argument is the next block that we expect this stream to access. 206eda14cbcSMatt Macy */ 207eda14cbcSMatt Macy static void 208eda14cbcSMatt Macy dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) 209eda14cbcSMatt Macy { 210e3aa18adSMartin Matuska zstream_t *zs, *zs_next, *zs_old = NULL; 211e3aa18adSMartin Matuska hrtime_t now = gethrtime(), t; 212eda14cbcSMatt Macy 213eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 214eda14cbcSMatt Macy 215eda14cbcSMatt Macy /* 216e3aa18adSMartin Matuska * Delete too old streams, reusing the first found one. 217eda14cbcSMatt Macy */ 218e3aa18adSMartin Matuska t = now - SEC2NSEC(zfetch_max_sec_reap); 219e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { 220eda14cbcSMatt Macy zs_next = list_next(&zf->zf_stream, zs); 2217877fdebSMatt Macy /* 222f9693befSMartin Matuska * Skip if still active. 1 -- zf_stream reference. 2237877fdebSMatt Macy */ 224f9693befSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 2257877fdebSMatt Macy continue; 226e3aa18adSMartin Matuska if (zs->zs_atime > t) 227e3aa18adSMartin Matuska continue; 228e3aa18adSMartin Matuska if (zs_old) 229eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 230e3aa18adSMartin Matuska else 231e3aa18adSMartin Matuska zs_old = zs; 232e3aa18adSMartin Matuska } 233e3aa18adSMartin Matuska if (zs_old) { 234e3aa18adSMartin Matuska zs = zs_old; 235e3aa18adSMartin Matuska goto reuse; 236eda14cbcSMatt Macy } 237eda14cbcSMatt Macy 238eda14cbcSMatt Macy /* 239eda14cbcSMatt Macy * The maximum number of streams is normally zfetch_max_streams, 240eda14cbcSMatt Macy * but for small files we lower it such that it's at least possible 241eda14cbcSMatt Macy * for all the streams to be non-overlapping. 242eda14cbcSMatt Macy */ 243eda14cbcSMatt Macy uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, 244eda14cbcSMatt Macy zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / 245eda14cbcSMatt Macy zfetch_max_distance)); 2467877fdebSMatt Macy if (zf->zf_numstreams >= max_streams) { 247e3aa18adSMartin Matuska t = now - SEC2NSEC(zfetch_min_sec_reap); 248e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; 249e3aa18adSMartin Matuska zs = list_next(&zf->zf_stream, zs)) { 250e3aa18adSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 251e3aa18adSMartin Matuska continue; 252e3aa18adSMartin Matuska if (zs->zs_atime > t) 253e3aa18adSMartin Matuska continue; 254e3aa18adSMartin Matuska if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime) 255e3aa18adSMartin Matuska zs_old = zs; 256e3aa18adSMartin Matuska } 257e3aa18adSMartin Matuska if (zs_old) { 258e3aa18adSMartin Matuska zs = zs_old; 259e3aa18adSMartin Matuska goto reuse; 260e3aa18adSMartin Matuska } 261eda14cbcSMatt Macy ZFETCHSTAT_BUMP(zfetchstat_max_streams); 262eda14cbcSMatt Macy return; 263eda14cbcSMatt Macy } 264eda14cbcSMatt Macy 265e3aa18adSMartin Matuska zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); 2667877fdebSMatt Macy zs->zs_fetch = zf; 267f9693befSMartin Matuska zfs_refcount_create(&zs->zs_callers); 268f9693befSMartin Matuska zfs_refcount_create(&zs->zs_refs); 269f9693befSMartin Matuska /* One reference for zf_stream. */ 270f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 2717877fdebSMatt Macy zf->zf_numstreams++; 272eda14cbcSMatt Macy list_insert_head(&zf->zf_stream, zs); 273e3aa18adSMartin Matuska 274e3aa18adSMartin Matuska reuse: 275e3aa18adSMartin Matuska zs->zs_blkid = blkid; 276e3aa18adSMartin Matuska zs->zs_pf_dist = 0; 277e3aa18adSMartin Matuska zs->zs_pf_start = blkid; 278e3aa18adSMartin Matuska zs->zs_pf_end = blkid; 279e3aa18adSMartin Matuska zs->zs_ipf_dist = 0; 280e3aa18adSMartin Matuska zs->zs_ipf_start = blkid; 281e3aa18adSMartin Matuska zs->zs_ipf_end = blkid; 282e3aa18adSMartin Matuska /* Allow immediate stream reuse until first hit. */ 283e3aa18adSMartin Matuska zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap); 284e3aa18adSMartin Matuska zs->zs_missed = B_FALSE; 285e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 286eda14cbcSMatt Macy } 287eda14cbcSMatt Macy 2887877fdebSMatt Macy static void 289e3aa18adSMartin Matuska dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) 2907877fdebSMatt Macy { 2917877fdebSMatt Macy zstream_t *zs = arg; 2927877fdebSMatt Macy 293e3aa18adSMartin Matuska if (io_issued && level == 0 && blkid < zs->zs_blkid) 294e3aa18adSMartin Matuska zs->zs_more = B_TRUE; 295f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 2967877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 2977877fdebSMatt Macy } 2987877fdebSMatt Macy 299eda14cbcSMatt Macy /* 300f9693befSMartin Matuska * This is the predictive prefetch entry point. dmu_zfetch_prepare() 301f9693befSMartin Matuska * associates dnode access specified with blkid and nblks arguments with 302f9693befSMartin Matuska * prefetch stream, predicts further accesses based on that stats and returns 303f9693befSMartin Matuska * the stream pointer on success. That pointer must later be passed to 304f9693befSMartin Matuska * dmu_zfetch_run() to initiate the speculative prefetch for the stream and 305f9693befSMartin Matuska * release it. dmu_zfetch() is a wrapper for simple cases when window between 306f9693befSMartin Matuska * prediction and prefetch initiation is not needed. 307eda14cbcSMatt Macy * fetch_data argument specifies whether actual data blocks should be fetched: 308eda14cbcSMatt Macy * FALSE -- prefetch only indirect blocks for predicted data blocks; 309eda14cbcSMatt Macy * TRUE -- prefetch predicted data blocks plus following indirect blocks. 310eda14cbcSMatt Macy */ 311f9693befSMartin Matuska zstream_t * 312f9693befSMartin Matuska dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, 313f9693befSMartin Matuska boolean_t fetch_data, boolean_t have_lock) 314eda14cbcSMatt Macy { 315eda14cbcSMatt Macy zstream_t *zs; 316eda14cbcSMatt Macy spa_t *spa = zf->zf_dnode->dn_objset->os_spa; 317eda14cbcSMatt Macy 318eda14cbcSMatt Macy if (zfs_prefetch_disable) 319f9693befSMartin Matuska return (NULL); 320eda14cbcSMatt Macy /* 321eda14cbcSMatt Macy * If we haven't yet loaded the indirect vdevs' mappings, we 322eda14cbcSMatt Macy * can only read from blocks that we carefully ensure are on 323eda14cbcSMatt Macy * concrete vdevs (or previously-loaded indirect vdevs). So we 324eda14cbcSMatt Macy * can't allow the predictive prefetcher to attempt reads of other 325eda14cbcSMatt Macy * blocks (e.g. of the MOS's dnode object). 326eda14cbcSMatt Macy */ 327eda14cbcSMatt Macy if (!spa_indirect_vdevs_loaded(spa)) 328f9693befSMartin Matuska return (NULL); 329eda14cbcSMatt Macy 330eda14cbcSMatt Macy /* 331eda14cbcSMatt Macy * As a fast path for small (single-block) files, ignore access 332eda14cbcSMatt Macy * to the first block. 333eda14cbcSMatt Macy */ 3347877fdebSMatt Macy if (!have_lock && blkid == 0) 335f9693befSMartin Matuska return (NULL); 336eda14cbcSMatt Macy 337eda14cbcSMatt Macy if (!have_lock) 338eda14cbcSMatt Macy rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 3397877fdebSMatt Macy 3407877fdebSMatt Macy /* 3417877fdebSMatt Macy * A fast path for small files for which no prefetch will 3427877fdebSMatt Macy * happen. 3437877fdebSMatt Macy */ 344e3aa18adSMartin Matuska uint64_t maxblkid = zf->zf_dnode->dn_maxblkid; 345f9693befSMartin Matuska if (maxblkid < 2) { 3467877fdebSMatt Macy if (!have_lock) 3477877fdebSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 348f9693befSMartin Matuska return (NULL); 3497877fdebSMatt Macy } 350eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 351eda14cbcSMatt Macy 352eda14cbcSMatt Macy /* 353eda14cbcSMatt Macy * Find matching prefetch stream. Depending on whether the accesses 354eda14cbcSMatt Macy * are block-aligned, first block of the new access may either follow 355eda14cbcSMatt Macy * the last block of the previous access, or be equal to it. 356eda14cbcSMatt Macy */ 357eda14cbcSMatt Macy for (zs = list_head(&zf->zf_stream); zs != NULL; 358eda14cbcSMatt Macy zs = list_next(&zf->zf_stream, zs)) { 359eda14cbcSMatt Macy if (blkid == zs->zs_blkid) { 360eda14cbcSMatt Macy break; 361eda14cbcSMatt Macy } else if (blkid + 1 == zs->zs_blkid) { 362eda14cbcSMatt Macy blkid++; 363eda14cbcSMatt Macy nblks--; 364eda14cbcSMatt Macy break; 365eda14cbcSMatt Macy } 366eda14cbcSMatt Macy } 367f9693befSMartin Matuska 368f9693befSMartin Matuska /* 369f9693befSMartin Matuska * If the file is ending, remove the matching stream if found. 370f9693befSMartin Matuska * If not found then it is too late to create a new one now. 371f9693befSMartin Matuska */ 372e3aa18adSMartin Matuska uint64_t end_of_access_blkid = blkid + nblks; 373f9693befSMartin Matuska if (end_of_access_blkid >= maxblkid) { 374f9693befSMartin Matuska if (zs != NULL) 375f9693befSMartin Matuska dmu_zfetch_stream_remove(zf, zs); 376f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 377f9693befSMartin Matuska if (!have_lock) 378f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 379f9693befSMartin Matuska return (NULL); 380f9693befSMartin Matuska } 381f9693befSMartin Matuska 382f9693befSMartin Matuska /* Exit if we already prefetched this block before. */ 383f9693befSMartin Matuska if (nblks == 0) { 384f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 385f9693befSMartin Matuska if (!have_lock) 386f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 387f9693befSMartin Matuska return (NULL); 388eda14cbcSMatt Macy } 389eda14cbcSMatt Macy 390eda14cbcSMatt Macy if (zs == NULL) { 391eda14cbcSMatt Macy /* 392eda14cbcSMatt Macy * This access is not part of any existing stream. Create 393eda14cbcSMatt Macy * a new stream for it. 394eda14cbcSMatt Macy */ 395eda14cbcSMatt Macy dmu_zfetch_stream_create(zf, end_of_access_blkid); 396eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 397eda14cbcSMatt Macy if (!have_lock) 398eda14cbcSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 399f9693befSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_misses); 400f9693befSMartin Matuska return (NULL); 401eda14cbcSMatt Macy } 402eda14cbcSMatt Macy 403eda14cbcSMatt Macy /* 404eda14cbcSMatt Macy * This access was to a block that we issued a prefetch for on 405e3aa18adSMartin Matuska * behalf of this stream. Calculate further prefetch distances. 406eda14cbcSMatt Macy * 407e3aa18adSMartin Matuska * Start prefetch from the demand access size (nblks). Double the 408e3aa18adSMartin Matuska * distance every access up to zfetch_min_distance. After that only 409e3aa18adSMartin Matuska * if needed increase the distance by 1/8 up to zfetch_max_distance. 410eda14cbcSMatt Macy */ 411e3aa18adSMartin Matuska unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift; 412e3aa18adSMartin Matuska unsigned int pf_nblks; 413eda14cbcSMatt Macy if (fetch_data) { 414e3aa18adSMartin Matuska if (unlikely(zs->zs_pf_dist < nbytes)) 415e3aa18adSMartin Matuska zs->zs_pf_dist = nbytes; 416e3aa18adSMartin Matuska else if (zs->zs_pf_dist < zfetch_min_distance) 417e3aa18adSMartin Matuska zs->zs_pf_dist *= 2; 418e3aa18adSMartin Matuska else if (zs->zs_more) 419e3aa18adSMartin Matuska zs->zs_pf_dist += zs->zs_pf_dist / 8; 420e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 421e3aa18adSMartin Matuska if (zs->zs_pf_dist > zfetch_max_distance) 422e3aa18adSMartin Matuska zs->zs_pf_dist = zfetch_max_distance; 423e3aa18adSMartin Matuska pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift; 424eda14cbcSMatt Macy } else { 425eda14cbcSMatt Macy pf_nblks = 0; 426eda14cbcSMatt Macy } 427e3aa18adSMartin Matuska if (zs->zs_pf_start < end_of_access_blkid) 428e3aa18adSMartin Matuska zs->zs_pf_start = end_of_access_blkid; 429e3aa18adSMartin Matuska if (zs->zs_pf_end < end_of_access_blkid + pf_nblks) 430e3aa18adSMartin Matuska zs->zs_pf_end = end_of_access_blkid + pf_nblks; 431eda14cbcSMatt Macy 432eda14cbcSMatt Macy /* 433e3aa18adSMartin Matuska * Do the same for indirects, starting where we will stop reading 434e3aa18adSMartin Matuska * data blocks (and the indirects that point to them). 435eda14cbcSMatt Macy */ 436e3aa18adSMartin Matuska if (unlikely(zs->zs_ipf_dist < nbytes)) 437e3aa18adSMartin Matuska zs->zs_ipf_dist = nbytes; 438e3aa18adSMartin Matuska else 439e3aa18adSMartin Matuska zs->zs_ipf_dist *= 2; 440e3aa18adSMartin Matuska if (zs->zs_ipf_dist > zfetch_max_idistance) 441e3aa18adSMartin Matuska zs->zs_ipf_dist = zfetch_max_idistance; 442e3aa18adSMartin Matuska pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift; 443e3aa18adSMartin Matuska if (zs->zs_ipf_start < zs->zs_pf_end) 444e3aa18adSMartin Matuska zs->zs_ipf_start = zs->zs_pf_end; 445e3aa18adSMartin Matuska if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) 446e3aa18adSMartin Matuska zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; 447eda14cbcSMatt Macy 448eda14cbcSMatt Macy zs->zs_blkid = end_of_access_blkid; 449f9693befSMartin Matuska /* Protect the stream from reclamation. */ 450f9693befSMartin Matuska zs->zs_atime = gethrtime(); 451f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 452f9693befSMartin Matuska /* Count concurrent callers. */ 453f9693befSMartin Matuska zfs_refcount_add(&zs->zs_callers, NULL); 454eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 455f9693befSMartin Matuska 456f9693befSMartin Matuska if (!have_lock) 457f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 458f9693befSMartin Matuska 459f9693befSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_hits); 460f9693befSMartin Matuska return (zs); 461f9693befSMartin Matuska } 462f9693befSMartin Matuska 463f9693befSMartin Matuska void 464f9693befSMartin Matuska dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) 465f9693befSMartin Matuska { 466f9693befSMartin Matuska zfetch_t *zf = zs->zs_fetch; 467f9693befSMartin Matuska int64_t pf_start, pf_end, ipf_start, ipf_end; 468f9693befSMartin Matuska int epbs, issued; 469f9693befSMartin Matuska 470f9693befSMartin Matuska if (missed) 471f9693befSMartin Matuska zs->zs_missed = missed; 472eda14cbcSMatt Macy 473eda14cbcSMatt Macy /* 474f9693befSMartin Matuska * Postpone the prefetch if there are more concurrent callers. 475f9693befSMartin Matuska * It happens when multiple requests are waiting for the same 476f9693befSMartin Matuska * indirect block. The last one will run the prefetch for all. 477eda14cbcSMatt Macy */ 478f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { 479f9693befSMartin Matuska /* Drop reference taken in dmu_zfetch_prepare(). */ 480f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 481f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 482f9693befSMartin Matuska return; 483f9693befSMartin Matuska } 484eda14cbcSMatt Macy 485f9693befSMartin Matuska mutex_enter(&zf->zf_lock); 486f9693befSMartin Matuska if (zs->zs_missed) { 487e3aa18adSMartin Matuska pf_start = zs->zs_pf_start; 488e3aa18adSMartin Matuska pf_end = zs->zs_pf_start = zs->zs_pf_end; 489f9693befSMartin Matuska } else { 490f9693befSMartin Matuska pf_start = pf_end = 0; 491f9693befSMartin Matuska } 492e3aa18adSMartin Matuska ipf_start = zs->zs_ipf_start; 493e3aa18adSMartin Matuska ipf_end = zs->zs_ipf_start = zs->zs_ipf_end; 494f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 495f9693befSMartin Matuska ASSERT3S(pf_start, <=, pf_end); 496f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 497f9693befSMartin Matuska 498f9693befSMartin Matuska epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 499f9693befSMartin Matuska ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; 500f9693befSMartin Matuska ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; 501f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 502f9693befSMartin Matuska issued = pf_end - pf_start + ipf_end - ipf_start; 503f9693befSMartin Matuska if (issued > 1) { 504f9693befSMartin Matuska /* More references on top of taken in dmu_zfetch_prepare(). */ 5056ba2210eSMartin Matuska for (int i = 0; i < issued - 1; i++) 5066ba2210eSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 507f9693befSMartin Matuska } else if (issued == 0) { 508f9693befSMartin Matuska /* Some other thread has done our work, so drop the ref. */ 509f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 510f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 511f9693befSMartin Matuska return; 512f9693befSMartin Matuska } 513f9693befSMartin Matuska 514f9693befSMartin Matuska if (!have_lock) 515f9693befSMartin Matuska rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 516f9693befSMartin Matuska 517f9693befSMartin Matuska issued = 0; 518f9693befSMartin Matuska for (int64_t blk = pf_start; blk < pf_end; blk++) { 519f9693befSMartin Matuska issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, 520*15f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 521eda14cbcSMatt Macy } 522f9693befSMartin Matuska for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { 5237877fdebSMatt Macy issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, 524*15f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 525eda14cbcSMatt Macy } 526f9693befSMartin Matuska 527eda14cbcSMatt Macy if (!have_lock) 528eda14cbcSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 5297877fdebSMatt Macy 5307877fdebSMatt Macy if (issued) 5317877fdebSMatt Macy ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); 532eda14cbcSMatt Macy } 533eda14cbcSMatt Macy 534f9693befSMartin Matuska void 535f9693befSMartin Matuska dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, 536f9693befSMartin Matuska boolean_t missed, boolean_t have_lock) 537f9693befSMartin Matuska { 538f9693befSMartin Matuska zstream_t *zs; 539f9693befSMartin Matuska 540f9693befSMartin Matuska zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); 541f9693befSMartin Matuska if (zs) 542f9693befSMartin Matuska dmu_zfetch_run(zs, missed, have_lock); 543f9693befSMartin Matuska } 544f9693befSMartin Matuska 545eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, 546eda14cbcSMatt Macy "Disable all ZFS prefetching"); 547eda14cbcSMatt Macy 548eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, 549eda14cbcSMatt Macy "Max number of streams per zfetch"); 550eda14cbcSMatt Macy 551eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, 552eda14cbcSMatt Macy "Min time before stream reclaim"); 553eda14cbcSMatt Macy 554e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW, 555e3aa18adSMartin Matuska "Max time before stream delete"); 556e3aa18adSMartin Matuska 557e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW, 558e3aa18adSMartin Matuska "Min bytes to prefetch per stream"); 559e3aa18adSMartin Matuska 560eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, 561180f8225SMatt Macy "Max bytes to prefetch per stream"); 562180f8225SMatt Macy 563180f8225SMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, 564180f8225SMatt Macy "Max bytes to prefetch indirects for per stream"); 565eda14cbcSMatt Macy 566dbd5678dSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, U64, ZMOD_RW, 567eda14cbcSMatt Macy "Number of bytes in a array_read"); 568