1eda14cbcSMatt Macy /* 2eda14cbcSMatt Macy * CDDL HEADER START 3eda14cbcSMatt Macy * 4eda14cbcSMatt Macy * The contents of this file are subject to the terms of the 5eda14cbcSMatt Macy * Common Development and Distribution License (the "License"). 6eda14cbcSMatt Macy * You may not use this file except in compliance with the License. 7eda14cbcSMatt Macy * 8eda14cbcSMatt Macy * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9271171e0SMartin Matuska * or https://opensource.org/licenses/CDDL-1.0. 10eda14cbcSMatt Macy * See the License for the specific language governing permissions 11eda14cbcSMatt Macy * and limitations under the License. 12eda14cbcSMatt Macy * 13eda14cbcSMatt Macy * When distributing Covered Code, include this CDDL HEADER in each 14eda14cbcSMatt Macy * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15eda14cbcSMatt Macy * If applicable, add the following below this CDDL HEADER, with the 16eda14cbcSMatt Macy * fields enclosed by brackets "[]" replaced with your own identifying 17eda14cbcSMatt Macy * information: Portions Copyright [yyyy] [name of copyright owner] 18eda14cbcSMatt Macy * 19eda14cbcSMatt Macy * CDDL HEADER END 20eda14cbcSMatt Macy */ 21eda14cbcSMatt Macy /* 22eda14cbcSMatt Macy * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23eda14cbcSMatt Macy * Use is subject to license terms. 24eda14cbcSMatt Macy */ 25eda14cbcSMatt Macy 26eda14cbcSMatt Macy /* 27eda14cbcSMatt Macy * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 28eda14cbcSMatt Macy */ 29eda14cbcSMatt Macy 30eda14cbcSMatt Macy #include <sys/zfs_context.h> 312a58b312SMartin Matuska #include <sys/arc_impl.h> 32eda14cbcSMatt Macy #include <sys/dnode.h> 33eda14cbcSMatt Macy #include <sys/dmu_objset.h> 34eda14cbcSMatt Macy #include <sys/dmu_zfetch.h> 35eda14cbcSMatt Macy #include <sys/dmu.h> 36eda14cbcSMatt Macy #include <sys/dbuf.h> 37eda14cbcSMatt Macy #include <sys/kstat.h> 380d8fe237SMartin Matuska #include <sys/wmsum.h> 39eda14cbcSMatt Macy 40eda14cbcSMatt Macy /* 41eda14cbcSMatt Macy * This tunable disables predictive prefetch. Note that it leaves "prescient" 42eda14cbcSMatt Macy * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 43eda14cbcSMatt Macy * prescient prefetch never issues i/os that end up not being needed, 44eda14cbcSMatt Macy * so it can't hurt performance. 45eda14cbcSMatt Macy */ 46eda14cbcSMatt Macy 47e92ffd9bSMartin Matuska static int zfs_prefetch_disable = B_FALSE; 48eda14cbcSMatt Macy 49eda14cbcSMatt Macy /* max # of streams per zfetch */ 50e92ffd9bSMartin Matuska static unsigned int zfetch_max_streams = 8; 51eda14cbcSMatt Macy /* min time before stream reclaim */ 52e3aa18adSMartin Matuska static unsigned int zfetch_min_sec_reap = 1; 53e3aa18adSMartin Matuska /* max time before stream delete */ 54e3aa18adSMartin Matuska static unsigned int zfetch_max_sec_reap = 2; 55315ee00fSMartin Matuska #ifdef _ILP32 56315ee00fSMartin Matuska /* min bytes to prefetch per stream (default 2MB) */ 57315ee00fSMartin Matuska static unsigned int zfetch_min_distance = 2 * 1024 * 1024; 58315ee00fSMartin Matuska /* max bytes to prefetch per stream (default 8MB) */ 59315ee00fSMartin Matuska unsigned int zfetch_max_distance = 8 * 1024 * 1024; 60315ee00fSMartin Matuska #else 61e3aa18adSMartin Matuska /* min bytes to prefetch per stream (default 4MB) */ 62e3aa18adSMartin Matuska static unsigned int zfetch_min_distance = 4 * 1024 * 1024; 63e3aa18adSMartin Matuska /* max bytes to prefetch per stream (default 64MB) */ 64e3aa18adSMartin Matuska unsigned int zfetch_max_distance = 64 * 1024 * 1024; 65315ee00fSMartin Matuska #endif 66eda14cbcSMatt Macy /* max bytes to prefetch indirects for per stream (default 64MB) */ 67eda14cbcSMatt Macy unsigned int zfetch_max_idistance = 64 * 1024 * 1024; 68eda14cbcSMatt Macy 69eda14cbcSMatt Macy typedef struct zfetch_stats { 70eda14cbcSMatt Macy kstat_named_t zfetchstat_hits; 71eda14cbcSMatt Macy kstat_named_t zfetchstat_misses; 72eda14cbcSMatt Macy kstat_named_t zfetchstat_max_streams; 737877fdebSMatt Macy kstat_named_t zfetchstat_io_issued; 742a58b312SMartin Matuska kstat_named_t zfetchstat_io_active; 75eda14cbcSMatt Macy } zfetch_stats_t; 76eda14cbcSMatt Macy 77eda14cbcSMatt Macy static zfetch_stats_t zfetch_stats = { 78eda14cbcSMatt Macy { "hits", KSTAT_DATA_UINT64 }, 79eda14cbcSMatt Macy { "misses", KSTAT_DATA_UINT64 }, 80eda14cbcSMatt Macy { "max_streams", KSTAT_DATA_UINT64 }, 817877fdebSMatt Macy { "io_issued", KSTAT_DATA_UINT64 }, 822a58b312SMartin Matuska { "io_active", KSTAT_DATA_UINT64 }, 83eda14cbcSMatt Macy }; 84eda14cbcSMatt Macy 850d8fe237SMartin Matuska struct { 860d8fe237SMartin Matuska wmsum_t zfetchstat_hits; 870d8fe237SMartin Matuska wmsum_t zfetchstat_misses; 880d8fe237SMartin Matuska wmsum_t zfetchstat_max_streams; 890d8fe237SMartin Matuska wmsum_t zfetchstat_io_issued; 902a58b312SMartin Matuska aggsum_t zfetchstat_io_active; 910d8fe237SMartin Matuska } zfetch_sums; 920d8fe237SMartin Matuska 93eda14cbcSMatt Macy #define ZFETCHSTAT_BUMP(stat) \ 940d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, 1) 957877fdebSMatt Macy #define ZFETCHSTAT_ADD(stat, val) \ 960d8fe237SMartin Matuska wmsum_add(&zfetch_sums.stat, val) 977877fdebSMatt Macy 98eda14cbcSMatt Macy 99e92ffd9bSMartin Matuska static kstat_t *zfetch_ksp; 100eda14cbcSMatt Macy 1010d8fe237SMartin Matuska static int 1020d8fe237SMartin Matuska zfetch_kstats_update(kstat_t *ksp, int rw) 1030d8fe237SMartin Matuska { 1040d8fe237SMartin Matuska zfetch_stats_t *zs = ksp->ks_data; 1050d8fe237SMartin Matuska 1060d8fe237SMartin Matuska if (rw == KSTAT_WRITE) 1070d8fe237SMartin Matuska return (EACCES); 1080d8fe237SMartin Matuska zs->zfetchstat_hits.value.ui64 = 1090d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_hits); 1100d8fe237SMartin Matuska zs->zfetchstat_misses.value.ui64 = 1110d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_misses); 1120d8fe237SMartin Matuska zs->zfetchstat_max_streams.value.ui64 = 1130d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_max_streams); 1140d8fe237SMartin Matuska zs->zfetchstat_io_issued.value.ui64 = 1150d8fe237SMartin Matuska wmsum_value(&zfetch_sums.zfetchstat_io_issued); 1162a58b312SMartin Matuska zs->zfetchstat_io_active.value.ui64 = 1172a58b312SMartin Matuska aggsum_value(&zfetch_sums.zfetchstat_io_active); 1180d8fe237SMartin Matuska return (0); 1190d8fe237SMartin Matuska } 1200d8fe237SMartin Matuska 121eda14cbcSMatt Macy void 122eda14cbcSMatt Macy zfetch_init(void) 123eda14cbcSMatt Macy { 1240d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_hits, 0); 1250d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_misses, 0); 1260d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); 1270d8fe237SMartin Matuska wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); 1282a58b312SMartin Matuska aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); 1290d8fe237SMartin Matuska 130eda14cbcSMatt Macy zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 131eda14cbcSMatt Macy KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 132eda14cbcSMatt Macy KSTAT_FLAG_VIRTUAL); 133eda14cbcSMatt Macy 134eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 135eda14cbcSMatt Macy zfetch_ksp->ks_data = &zfetch_stats; 1360d8fe237SMartin Matuska zfetch_ksp->ks_update = zfetch_kstats_update; 137eda14cbcSMatt Macy kstat_install(zfetch_ksp); 138eda14cbcSMatt Macy } 139eda14cbcSMatt Macy } 140eda14cbcSMatt Macy 141eda14cbcSMatt Macy void 142eda14cbcSMatt Macy zfetch_fini(void) 143eda14cbcSMatt Macy { 144eda14cbcSMatt Macy if (zfetch_ksp != NULL) { 145eda14cbcSMatt Macy kstat_delete(zfetch_ksp); 146eda14cbcSMatt Macy zfetch_ksp = NULL; 147eda14cbcSMatt Macy } 1480d8fe237SMartin Matuska 1490d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_hits); 1500d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_misses); 1510d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_max_streams); 1520d8fe237SMartin Matuska wmsum_fini(&zfetch_sums.zfetchstat_io_issued); 1532a58b312SMartin Matuska ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); 1542a58b312SMartin Matuska aggsum_fini(&zfetch_sums.zfetchstat_io_active); 155eda14cbcSMatt Macy } 156eda14cbcSMatt Macy 157eda14cbcSMatt Macy /* 158eda14cbcSMatt Macy * This takes a pointer to a zfetch structure and a dnode. It performs the 159eda14cbcSMatt Macy * necessary setup for the zfetch structure, grokking data from the 160eda14cbcSMatt Macy * associated dnode. 161eda14cbcSMatt Macy */ 162eda14cbcSMatt Macy void 163eda14cbcSMatt Macy dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 164eda14cbcSMatt Macy { 165eda14cbcSMatt Macy if (zf == NULL) 166eda14cbcSMatt Macy return; 167eda14cbcSMatt Macy zf->zf_dnode = dno; 1687877fdebSMatt Macy zf->zf_numstreams = 0; 169eda14cbcSMatt Macy 170eda14cbcSMatt Macy list_create(&zf->zf_stream, sizeof (zstream_t), 171eda14cbcSMatt Macy offsetof(zstream_t, zs_node)); 172eda14cbcSMatt Macy 173eda14cbcSMatt Macy mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); 174eda14cbcSMatt Macy } 175eda14cbcSMatt Macy 176eda14cbcSMatt Macy static void 1777877fdebSMatt Macy dmu_zfetch_stream_fini(zstream_t *zs) 1787877fdebSMatt Macy { 179f9693befSMartin Matuska ASSERT(!list_link_active(&zs->zs_node)); 1807cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_callers); 1817cd22ac4SMartin Matuska zfs_refcount_destroy(&zs->zs_refs); 1827877fdebSMatt Macy kmem_free(zs, sizeof (*zs)); 1837877fdebSMatt Macy } 1847877fdebSMatt Macy 1857877fdebSMatt Macy static void 186eda14cbcSMatt Macy dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 187eda14cbcSMatt Macy { 188eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 189eda14cbcSMatt Macy list_remove(&zf->zf_stream, zs); 190f9693befSMartin Matuska zf->zf_numstreams--; 191f9693befSMartin Matuska membar_producer(); 192f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 1937877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 194eda14cbcSMatt Macy } 195eda14cbcSMatt Macy 196eda14cbcSMatt Macy /* 197eda14cbcSMatt Macy * Clean-up state associated with a zfetch structure (e.g. destroy the 198eda14cbcSMatt Macy * streams). This doesn't free the zfetch_t itself, that's left to the caller. 199eda14cbcSMatt Macy */ 200eda14cbcSMatt Macy void 201eda14cbcSMatt Macy dmu_zfetch_fini(zfetch_t *zf) 202eda14cbcSMatt Macy { 203eda14cbcSMatt Macy zstream_t *zs; 204eda14cbcSMatt Macy 205eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 206f9693befSMartin Matuska while ((zs = list_head(&zf->zf_stream)) != NULL) 207eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 208eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 209eda14cbcSMatt Macy list_destroy(&zf->zf_stream); 210eda14cbcSMatt Macy mutex_destroy(&zf->zf_lock); 211eda14cbcSMatt Macy 212eda14cbcSMatt Macy zf->zf_dnode = NULL; 213eda14cbcSMatt Macy } 214eda14cbcSMatt Macy 215eda14cbcSMatt Macy /* 216e3aa18adSMartin Matuska * If there aren't too many active streams already, create one more. 217e3aa18adSMartin Matuska * In process delete/reuse all streams without hits for zfetch_max_sec_reap. 218e3aa18adSMartin Matuska * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. 219eda14cbcSMatt Macy * The "blkid" argument is the next block that we expect this stream to access. 220eda14cbcSMatt Macy */ 221eda14cbcSMatt Macy static void 222eda14cbcSMatt Macy dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) 223eda14cbcSMatt Macy { 224e3aa18adSMartin Matuska zstream_t *zs, *zs_next, *zs_old = NULL; 225e3aa18adSMartin Matuska hrtime_t now = gethrtime(), t; 226eda14cbcSMatt Macy 227eda14cbcSMatt Macy ASSERT(MUTEX_HELD(&zf->zf_lock)); 228eda14cbcSMatt Macy 229eda14cbcSMatt Macy /* 230e3aa18adSMartin Matuska * Delete too old streams, reusing the first found one. 231eda14cbcSMatt Macy */ 232e3aa18adSMartin Matuska t = now - SEC2NSEC(zfetch_max_sec_reap); 233e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { 234eda14cbcSMatt Macy zs_next = list_next(&zf->zf_stream, zs); 2357877fdebSMatt Macy /* 236f9693befSMartin Matuska * Skip if still active. 1 -- zf_stream reference. 2377877fdebSMatt Macy */ 238f9693befSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 2397877fdebSMatt Macy continue; 240e3aa18adSMartin Matuska if (zs->zs_atime > t) 241e3aa18adSMartin Matuska continue; 242e3aa18adSMartin Matuska if (zs_old) 243eda14cbcSMatt Macy dmu_zfetch_stream_remove(zf, zs); 244e3aa18adSMartin Matuska else 245e3aa18adSMartin Matuska zs_old = zs; 246e3aa18adSMartin Matuska } 247e3aa18adSMartin Matuska if (zs_old) { 248e3aa18adSMartin Matuska zs = zs_old; 249e3aa18adSMartin Matuska goto reuse; 250eda14cbcSMatt Macy } 251eda14cbcSMatt Macy 252eda14cbcSMatt Macy /* 253eda14cbcSMatt Macy * The maximum number of streams is normally zfetch_max_streams, 254eda14cbcSMatt Macy * but for small files we lower it such that it's at least possible 255eda14cbcSMatt Macy * for all the streams to be non-overlapping. 256eda14cbcSMatt Macy */ 257eda14cbcSMatt Macy uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, 258eda14cbcSMatt Macy zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / 259eda14cbcSMatt Macy zfetch_max_distance)); 2607877fdebSMatt Macy if (zf->zf_numstreams >= max_streams) { 261e3aa18adSMartin Matuska t = now - SEC2NSEC(zfetch_min_sec_reap); 262e3aa18adSMartin Matuska for (zs = list_head(&zf->zf_stream); zs != NULL; 263e3aa18adSMartin Matuska zs = list_next(&zf->zf_stream, zs)) { 264e3aa18adSMartin Matuska if (zfs_refcount_count(&zs->zs_refs) != 1) 265e3aa18adSMartin Matuska continue; 266e3aa18adSMartin Matuska if (zs->zs_atime > t) 267e3aa18adSMartin Matuska continue; 268e3aa18adSMartin Matuska if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime) 269e3aa18adSMartin Matuska zs_old = zs; 270e3aa18adSMartin Matuska } 271e3aa18adSMartin Matuska if (zs_old) { 272e3aa18adSMartin Matuska zs = zs_old; 273e3aa18adSMartin Matuska goto reuse; 274e3aa18adSMartin Matuska } 275eda14cbcSMatt Macy ZFETCHSTAT_BUMP(zfetchstat_max_streams); 276eda14cbcSMatt Macy return; 277eda14cbcSMatt Macy } 278eda14cbcSMatt Macy 279e3aa18adSMartin Matuska zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); 2807877fdebSMatt Macy zs->zs_fetch = zf; 281f9693befSMartin Matuska zfs_refcount_create(&zs->zs_callers); 282f9693befSMartin Matuska zfs_refcount_create(&zs->zs_refs); 283f9693befSMartin Matuska /* One reference for zf_stream. */ 284f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 2857877fdebSMatt Macy zf->zf_numstreams++; 286eda14cbcSMatt Macy list_insert_head(&zf->zf_stream, zs); 287e3aa18adSMartin Matuska 288e3aa18adSMartin Matuska reuse: 289e3aa18adSMartin Matuska zs->zs_blkid = blkid; 290e3aa18adSMartin Matuska zs->zs_pf_dist = 0; 291e3aa18adSMartin Matuska zs->zs_pf_start = blkid; 292e3aa18adSMartin Matuska zs->zs_pf_end = blkid; 293e3aa18adSMartin Matuska zs->zs_ipf_dist = 0; 294e3aa18adSMartin Matuska zs->zs_ipf_start = blkid; 295e3aa18adSMartin Matuska zs->zs_ipf_end = blkid; 296e3aa18adSMartin Matuska /* Allow immediate stream reuse until first hit. */ 297e3aa18adSMartin Matuska zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap); 298e3aa18adSMartin Matuska zs->zs_missed = B_FALSE; 299e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 300eda14cbcSMatt Macy } 301eda14cbcSMatt Macy 3027877fdebSMatt Macy static void 303e3aa18adSMartin Matuska dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) 3047877fdebSMatt Macy { 3057877fdebSMatt Macy zstream_t *zs = arg; 3067877fdebSMatt Macy 307e3aa18adSMartin Matuska if (io_issued && level == 0 && blkid < zs->zs_blkid) 308e3aa18adSMartin Matuska zs->zs_more = B_TRUE; 309f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 3107877fdebSMatt Macy dmu_zfetch_stream_fini(zs); 3112a58b312SMartin Matuska aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); 3127877fdebSMatt Macy } 3137877fdebSMatt Macy 314eda14cbcSMatt Macy /* 315f9693befSMartin Matuska * This is the predictive prefetch entry point. dmu_zfetch_prepare() 316f9693befSMartin Matuska * associates dnode access specified with blkid and nblks arguments with 317f9693befSMartin Matuska * prefetch stream, predicts further accesses based on that stats and returns 318f9693befSMartin Matuska * the stream pointer on success. That pointer must later be passed to 319f9693befSMartin Matuska * dmu_zfetch_run() to initiate the speculative prefetch for the stream and 320f9693befSMartin Matuska * release it. dmu_zfetch() is a wrapper for simple cases when window between 321f9693befSMartin Matuska * prediction and prefetch initiation is not needed. 322eda14cbcSMatt Macy * fetch_data argument specifies whether actual data blocks should be fetched: 323eda14cbcSMatt Macy * FALSE -- prefetch only indirect blocks for predicted data blocks; 324eda14cbcSMatt Macy * TRUE -- prefetch predicted data blocks plus following indirect blocks. 325eda14cbcSMatt Macy */ 326f9693befSMartin Matuska zstream_t * 327f9693befSMartin Matuska dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks, 328f9693befSMartin Matuska boolean_t fetch_data, boolean_t have_lock) 329eda14cbcSMatt Macy { 330eda14cbcSMatt Macy zstream_t *zs; 331eda14cbcSMatt Macy spa_t *spa = zf->zf_dnode->dn_objset->os_spa; 332*6c1e79dfSMartin Matuska zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch; 333eda14cbcSMatt Macy 334*6c1e79dfSMartin Matuska if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE) 335f9693befSMartin Matuska return (NULL); 336*6c1e79dfSMartin Matuska 337*6c1e79dfSMartin Matuska if (os_prefetch == ZFS_PREFETCH_METADATA) 338*6c1e79dfSMartin Matuska fetch_data = B_FALSE; 339*6c1e79dfSMartin Matuska 340eda14cbcSMatt Macy /* 341eda14cbcSMatt Macy * If we haven't yet loaded the indirect vdevs' mappings, we 342eda14cbcSMatt Macy * can only read from blocks that we carefully ensure are on 343eda14cbcSMatt Macy * concrete vdevs (or previously-loaded indirect vdevs). So we 344eda14cbcSMatt Macy * can't allow the predictive prefetcher to attempt reads of other 345eda14cbcSMatt Macy * blocks (e.g. of the MOS's dnode object). 346eda14cbcSMatt Macy */ 347eda14cbcSMatt Macy if (!spa_indirect_vdevs_loaded(spa)) 348f9693befSMartin Matuska return (NULL); 349eda14cbcSMatt Macy 350eda14cbcSMatt Macy /* 351eda14cbcSMatt Macy * As a fast path for small (single-block) files, ignore access 352eda14cbcSMatt Macy * to the first block. 353eda14cbcSMatt Macy */ 3547877fdebSMatt Macy if (!have_lock && blkid == 0) 355f9693befSMartin Matuska return (NULL); 356eda14cbcSMatt Macy 357eda14cbcSMatt Macy if (!have_lock) 358eda14cbcSMatt Macy rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 3597877fdebSMatt Macy 3607877fdebSMatt Macy /* 3617877fdebSMatt Macy * A fast path for small files for which no prefetch will 3627877fdebSMatt Macy * happen. 3637877fdebSMatt Macy */ 364e3aa18adSMartin Matuska uint64_t maxblkid = zf->zf_dnode->dn_maxblkid; 365f9693befSMartin Matuska if (maxblkid < 2) { 3667877fdebSMatt Macy if (!have_lock) 3677877fdebSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 368f9693befSMartin Matuska return (NULL); 3697877fdebSMatt Macy } 370eda14cbcSMatt Macy mutex_enter(&zf->zf_lock); 371eda14cbcSMatt Macy 372eda14cbcSMatt Macy /* 373eda14cbcSMatt Macy * Find matching prefetch stream. Depending on whether the accesses 374eda14cbcSMatt Macy * are block-aligned, first block of the new access may either follow 375eda14cbcSMatt Macy * the last block of the previous access, or be equal to it. 376eda14cbcSMatt Macy */ 377eda14cbcSMatt Macy for (zs = list_head(&zf->zf_stream); zs != NULL; 378eda14cbcSMatt Macy zs = list_next(&zf->zf_stream, zs)) { 379eda14cbcSMatt Macy if (blkid == zs->zs_blkid) { 380eda14cbcSMatt Macy break; 381eda14cbcSMatt Macy } else if (blkid + 1 == zs->zs_blkid) { 382eda14cbcSMatt Macy blkid++; 383eda14cbcSMatt Macy nblks--; 384eda14cbcSMatt Macy break; 385eda14cbcSMatt Macy } 386eda14cbcSMatt Macy } 387f9693befSMartin Matuska 388f9693befSMartin Matuska /* 389f9693befSMartin Matuska * If the file is ending, remove the matching stream if found. 390f9693befSMartin Matuska * If not found then it is too late to create a new one now. 391f9693befSMartin Matuska */ 392e3aa18adSMartin Matuska uint64_t end_of_access_blkid = blkid + nblks; 393f9693befSMartin Matuska if (end_of_access_blkid >= maxblkid) { 394f9693befSMartin Matuska if (zs != NULL) 395f9693befSMartin Matuska dmu_zfetch_stream_remove(zf, zs); 396f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 397f9693befSMartin Matuska if (!have_lock) 398f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 399f9693befSMartin Matuska return (NULL); 400f9693befSMartin Matuska } 401f9693befSMartin Matuska 402f9693befSMartin Matuska /* Exit if we already prefetched this block before. */ 403f9693befSMartin Matuska if (nblks == 0) { 404f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 405f9693befSMartin Matuska if (!have_lock) 406f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 407f9693befSMartin Matuska return (NULL); 408eda14cbcSMatt Macy } 409eda14cbcSMatt Macy 410eda14cbcSMatt Macy if (zs == NULL) { 411eda14cbcSMatt Macy /* 412eda14cbcSMatt Macy * This access is not part of any existing stream. Create 413eda14cbcSMatt Macy * a new stream for it. 414eda14cbcSMatt Macy */ 415eda14cbcSMatt Macy dmu_zfetch_stream_create(zf, end_of_access_blkid); 416eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 417eda14cbcSMatt Macy if (!have_lock) 418eda14cbcSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 419f9693befSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_misses); 420f9693befSMartin Matuska return (NULL); 421eda14cbcSMatt Macy } 422eda14cbcSMatt Macy 423eda14cbcSMatt Macy /* 424eda14cbcSMatt Macy * This access was to a block that we issued a prefetch for on 425e3aa18adSMartin Matuska * behalf of this stream. Calculate further prefetch distances. 426eda14cbcSMatt Macy * 427e3aa18adSMartin Matuska * Start prefetch from the demand access size (nblks). Double the 428e3aa18adSMartin Matuska * distance every access up to zfetch_min_distance. After that only 429e3aa18adSMartin Matuska * if needed increase the distance by 1/8 up to zfetch_max_distance. 4302a58b312SMartin Matuska * 4312a58b312SMartin Matuska * Don't double the distance beyond single block if we have more 4322a58b312SMartin Matuska * than ~6% of ARC held by active prefetches. It should help with 4332a58b312SMartin Matuska * getting out of RAM on some badly mispredicted read patterns. 434eda14cbcSMatt Macy */ 4352a58b312SMartin Matuska unsigned int dbs = zf->zf_dnode->dn_datablkshift; 4362a58b312SMartin Matuska unsigned int nbytes = nblks << dbs; 437e3aa18adSMartin Matuska unsigned int pf_nblks; 438eda14cbcSMatt Macy if (fetch_data) { 439e3aa18adSMartin Matuska if (unlikely(zs->zs_pf_dist < nbytes)) 440e3aa18adSMartin Matuska zs->zs_pf_dist = nbytes; 4412a58b312SMartin Matuska else if (zs->zs_pf_dist < zfetch_min_distance && 4422a58b312SMartin Matuska (zs->zs_pf_dist < (1 << dbs) || 4432a58b312SMartin Matuska aggsum_compare(&zfetch_sums.zfetchstat_io_active, 4442a58b312SMartin Matuska arc_c_max >> (4 + dbs)) < 0)) 445e3aa18adSMartin Matuska zs->zs_pf_dist *= 2; 446e3aa18adSMartin Matuska else if (zs->zs_more) 447e3aa18adSMartin Matuska zs->zs_pf_dist += zs->zs_pf_dist / 8; 448e3aa18adSMartin Matuska zs->zs_more = B_FALSE; 449e3aa18adSMartin Matuska if (zs->zs_pf_dist > zfetch_max_distance) 450e3aa18adSMartin Matuska zs->zs_pf_dist = zfetch_max_distance; 4512a58b312SMartin Matuska pf_nblks = zs->zs_pf_dist >> dbs; 452eda14cbcSMatt Macy } else { 453eda14cbcSMatt Macy pf_nblks = 0; 454eda14cbcSMatt Macy } 455e3aa18adSMartin Matuska if (zs->zs_pf_start < end_of_access_blkid) 456e3aa18adSMartin Matuska zs->zs_pf_start = end_of_access_blkid; 457e3aa18adSMartin Matuska if (zs->zs_pf_end < end_of_access_blkid + pf_nblks) 458e3aa18adSMartin Matuska zs->zs_pf_end = end_of_access_blkid + pf_nblks; 459eda14cbcSMatt Macy 460eda14cbcSMatt Macy /* 461e3aa18adSMartin Matuska * Do the same for indirects, starting where we will stop reading 462e3aa18adSMartin Matuska * data blocks (and the indirects that point to them). 463eda14cbcSMatt Macy */ 464e3aa18adSMartin Matuska if (unlikely(zs->zs_ipf_dist < nbytes)) 465e3aa18adSMartin Matuska zs->zs_ipf_dist = nbytes; 466e3aa18adSMartin Matuska else 467e3aa18adSMartin Matuska zs->zs_ipf_dist *= 2; 468e3aa18adSMartin Matuska if (zs->zs_ipf_dist > zfetch_max_idistance) 469e3aa18adSMartin Matuska zs->zs_ipf_dist = zfetch_max_idistance; 4702a58b312SMartin Matuska pf_nblks = zs->zs_ipf_dist >> dbs; 471e3aa18adSMartin Matuska if (zs->zs_ipf_start < zs->zs_pf_end) 472e3aa18adSMartin Matuska zs->zs_ipf_start = zs->zs_pf_end; 473e3aa18adSMartin Matuska if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks) 474e3aa18adSMartin Matuska zs->zs_ipf_end = zs->zs_pf_end + pf_nblks; 475eda14cbcSMatt Macy 476eda14cbcSMatt Macy zs->zs_blkid = end_of_access_blkid; 477f9693befSMartin Matuska /* Protect the stream from reclamation. */ 478f9693befSMartin Matuska zs->zs_atime = gethrtime(); 479f9693befSMartin Matuska zfs_refcount_add(&zs->zs_refs, NULL); 480f9693befSMartin Matuska /* Count concurrent callers. */ 481f9693befSMartin Matuska zfs_refcount_add(&zs->zs_callers, NULL); 482eda14cbcSMatt Macy mutex_exit(&zf->zf_lock); 483f9693befSMartin Matuska 484f9693befSMartin Matuska if (!have_lock) 485f9693befSMartin Matuska rw_exit(&zf->zf_dnode->dn_struct_rwlock); 486f9693befSMartin Matuska 487f9693befSMartin Matuska ZFETCHSTAT_BUMP(zfetchstat_hits); 488f9693befSMartin Matuska return (zs); 489f9693befSMartin Matuska } 490f9693befSMartin Matuska 491f9693befSMartin Matuska void 492f9693befSMartin Matuska dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock) 493f9693befSMartin Matuska { 494f9693befSMartin Matuska zfetch_t *zf = zs->zs_fetch; 495f9693befSMartin Matuska int64_t pf_start, pf_end, ipf_start, ipf_end; 496f9693befSMartin Matuska int epbs, issued; 497f9693befSMartin Matuska 498f9693befSMartin Matuska if (missed) 499f9693befSMartin Matuska zs->zs_missed = missed; 500eda14cbcSMatt Macy 501eda14cbcSMatt Macy /* 502f9693befSMartin Matuska * Postpone the prefetch if there are more concurrent callers. 503f9693befSMartin Matuska * It happens when multiple requests are waiting for the same 504f9693befSMartin Matuska * indirect block. The last one will run the prefetch for all. 505eda14cbcSMatt Macy */ 506f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) { 507f9693befSMartin Matuska /* Drop reference taken in dmu_zfetch_prepare(). */ 508f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 509f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 510f9693befSMartin Matuska return; 511f9693befSMartin Matuska } 512eda14cbcSMatt Macy 513f9693befSMartin Matuska mutex_enter(&zf->zf_lock); 514f9693befSMartin Matuska if (zs->zs_missed) { 515e3aa18adSMartin Matuska pf_start = zs->zs_pf_start; 516e3aa18adSMartin Matuska pf_end = zs->zs_pf_start = zs->zs_pf_end; 517f9693befSMartin Matuska } else { 518f9693befSMartin Matuska pf_start = pf_end = 0; 519f9693befSMartin Matuska } 520e3aa18adSMartin Matuska ipf_start = zs->zs_ipf_start; 521e3aa18adSMartin Matuska ipf_end = zs->zs_ipf_start = zs->zs_ipf_end; 522f9693befSMartin Matuska mutex_exit(&zf->zf_lock); 523f9693befSMartin Matuska ASSERT3S(pf_start, <=, pf_end); 524f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 525f9693befSMartin Matuska 526f9693befSMartin Matuska epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT; 527f9693befSMartin Matuska ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs; 528f9693befSMartin Matuska ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs; 529f9693befSMartin Matuska ASSERT3S(ipf_start, <=, ipf_end); 530f9693befSMartin Matuska issued = pf_end - pf_start + ipf_end - ipf_start; 531f9693befSMartin Matuska if (issued > 1) { 532f9693befSMartin Matuska /* More references on top of taken in dmu_zfetch_prepare(). */ 5334e8d558cSMartin Matuska zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL); 534f9693befSMartin Matuska } else if (issued == 0) { 535f9693befSMartin Matuska /* Some other thread has done our work, so drop the ref. */ 536f9693befSMartin Matuska if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 537f9693befSMartin Matuska dmu_zfetch_stream_fini(zs); 538f9693befSMartin Matuska return; 539f9693befSMartin Matuska } 5402a58b312SMartin Matuska aggsum_add(&zfetch_sums.zfetchstat_io_active, issued); 541f9693befSMartin Matuska 542f9693befSMartin Matuska if (!have_lock) 543f9693befSMartin Matuska rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER); 544f9693befSMartin Matuska 545f9693befSMartin Matuska issued = 0; 546f9693befSMartin Matuska for (int64_t blk = pf_start; blk < pf_end; blk++) { 547f9693befSMartin Matuska issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk, 54815f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 549eda14cbcSMatt Macy } 550f9693befSMartin Matuska for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) { 5517877fdebSMatt Macy issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk, 55215f0b8c3SMartin Matuska ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs); 553eda14cbcSMatt Macy } 554f9693befSMartin Matuska 555eda14cbcSMatt Macy if (!have_lock) 556eda14cbcSMatt Macy rw_exit(&zf->zf_dnode->dn_struct_rwlock); 5577877fdebSMatt Macy 5587877fdebSMatt Macy if (issued) 5597877fdebSMatt Macy ZFETCHSTAT_ADD(zfetchstat_io_issued, issued); 560eda14cbcSMatt Macy } 561eda14cbcSMatt Macy 562f9693befSMartin Matuska void 563f9693befSMartin Matuska dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data, 564f9693befSMartin Matuska boolean_t missed, boolean_t have_lock) 565f9693befSMartin Matuska { 566f9693befSMartin Matuska zstream_t *zs; 567f9693befSMartin Matuska 568f9693befSMartin Matuska zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock); 569f9693befSMartin Matuska if (zs) 570f9693befSMartin Matuska dmu_zfetch_run(zs, missed, have_lock); 571f9693befSMartin Matuska } 572f9693befSMartin Matuska 573eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW, 574eda14cbcSMatt Macy "Disable all ZFS prefetching"); 575eda14cbcSMatt Macy 576eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW, 577eda14cbcSMatt Macy "Max number of streams per zfetch"); 578eda14cbcSMatt Macy 579eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW, 580eda14cbcSMatt Macy "Min time before stream reclaim"); 581eda14cbcSMatt Macy 582e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW, 583e3aa18adSMartin Matuska "Max time before stream delete"); 584e3aa18adSMartin Matuska 585e3aa18adSMartin Matuska ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW, 586e3aa18adSMartin Matuska "Min bytes to prefetch per stream"); 587e3aa18adSMartin Matuska 588eda14cbcSMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW, 589180f8225SMatt Macy "Max bytes to prefetch per stream"); 590180f8225SMatt Macy 591180f8225SMatt Macy ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW, 592180f8225SMatt Macy "Max bytes to prefetch indirects for per stream"); 593