1fa9e4066Sahrens /* 2fa9e4066Sahrens * CDDL HEADER START 3fa9e4066Sahrens * 4fa9e4066Sahrens * The contents of this file are subject to the terms of the 513506d1eSmaybee * Common Development and Distribution License (the "License"). 613506d1eSmaybee * You may not use this file except in compliance with the License. 7fa9e4066Sahrens * 8fa9e4066Sahrens * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9fa9e4066Sahrens * or http://www.opensolaris.org/os/licensing. 10fa9e4066Sahrens * See the License for the specific language governing permissions 11fa9e4066Sahrens * and limitations under the License. 12fa9e4066Sahrens * 13fa9e4066Sahrens * When distributing Covered Code, include this CDDL HEADER in each 14fa9e4066Sahrens * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15fa9e4066Sahrens * If applicable, add the following below this CDDL HEADER, with the 16fa9e4066Sahrens * fields enclosed by brackets "[]" replaced with your own identifying 17fa9e4066Sahrens * information: Portions Copyright [yyyy] [name of copyright owner] 18fa9e4066Sahrens * 19fa9e4066Sahrens * CDDL HEADER END 20fa9e4066Sahrens */ 21fa9e4066Sahrens /* 227cbf8b43SRich Morris * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23fa9e4066Sahrens * Use is subject to license terms. 24fa9e4066Sahrens */ 25fa9e4066Sahrens 2669962b56SMatthew Ahrens /* 27*63280274SGeorge Wilson * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 2869962b56SMatthew Ahrens */ 2969962b56SMatthew Ahrens 30fa9e4066Sahrens #include <sys/zfs_context.h> 31fa9e4066Sahrens #include <sys/dnode.h> 32fa9e4066Sahrens #include <sys/dmu_objset.h> 33fa9e4066Sahrens #include <sys/dmu_zfetch.h> 34fa9e4066Sahrens #include <sys/dmu.h> 35fa9e4066Sahrens #include <sys/dbuf.h> 367cbf8b43SRich Morris #include <sys/kstat.h> 37fa9e4066Sahrens 38fa9e4066Sahrens /* 39cf6106c8SMatthew Ahrens * This tunable disables predictive prefetch. Note that it leaves "prescient" 40cf6106c8SMatthew Ahrens * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 41cf6106c8SMatthew Ahrens * prescient prefetch never issues i/os that end up not being needed, 42cf6106c8SMatthew Ahrens * so it can't hurt performance. 43fa9e4066Sahrens */ 44cf6106c8SMatthew Ahrens boolean_t zfs_prefetch_disable = B_FALSE; 45a2eea2e1Sahrens 46fa9e4066Sahrens /* max # of streams per zfetch */ 47fa9e4066Sahrens uint32_t zfetch_max_streams = 8; 48fa9e4066Sahrens /* min time before stream reclaim */ 49fa9e4066Sahrens uint32_t zfetch_min_sec_reap = 2; 50cf6106c8SMatthew Ahrens /* max bytes to prefetch per stream (default 8MB) */ 51cf6106c8SMatthew Ahrens uint32_t zfetch_max_distance = 8 * 1024 * 1024; 52*63280274SGeorge Wilson /* max number of bytes in an array_read in which we allow prefetching (1MB) */ 53fa9e4066Sahrens uint64_t zfetch_array_rd_sz = 1024 * 1024; 54fa9e4066Sahrens 557cbf8b43SRich Morris typedef struct zfetch_stats { 567cbf8b43SRich Morris kstat_named_t zfetchstat_hits; 577cbf8b43SRich Morris kstat_named_t zfetchstat_misses; 58cf6106c8SMatthew Ahrens kstat_named_t zfetchstat_max_streams; 597cbf8b43SRich Morris } zfetch_stats_t; 607cbf8b43SRich Morris 617cbf8b43SRich Morris static zfetch_stats_t zfetch_stats = { 627cbf8b43SRich Morris { "hits", KSTAT_DATA_UINT64 }, 637cbf8b43SRich Morris { "misses", KSTAT_DATA_UINT64 }, 64cf6106c8SMatthew Ahrens { "max_streams", KSTAT_DATA_UINT64 }, 657cbf8b43SRich Morris }; 667cbf8b43SRich Morris 67cf6106c8SMatthew Ahrens #define ZFETCHSTAT_BUMP(stat) \ 68cf6106c8SMatthew Ahrens atomic_inc_64(&zfetch_stats.stat.value.ui64); 697cbf8b43SRich Morris 707cbf8b43SRich Morris kstat_t *zfetch_ksp; 717cbf8b43SRich Morris 727cbf8b43SRich Morris void 737cbf8b43SRich Morris zfetch_init(void) 747cbf8b43SRich Morris { 757cbf8b43SRich Morris zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 767cbf8b43SRich Morris KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 777cbf8b43SRich Morris KSTAT_FLAG_VIRTUAL); 787cbf8b43SRich Morris 797cbf8b43SRich Morris if (zfetch_ksp != NULL) { 807cbf8b43SRich Morris zfetch_ksp->ks_data = &zfetch_stats; 817cbf8b43SRich Morris kstat_install(zfetch_ksp); 827cbf8b43SRich Morris } 837cbf8b43SRich Morris } 847cbf8b43SRich Morris 857cbf8b43SRich Morris void 867cbf8b43SRich Morris zfetch_fini(void) 877cbf8b43SRich Morris { 887cbf8b43SRich Morris if (zfetch_ksp != NULL) { 897cbf8b43SRich Morris kstat_delete(zfetch_ksp); 907cbf8b43SRich Morris zfetch_ksp = NULL; 917cbf8b43SRich Morris } 927cbf8b43SRich Morris } 937cbf8b43SRich Morris 94fa9e4066Sahrens /* 95fa9e4066Sahrens * This takes a pointer to a zfetch structure and a dnode. It performs the 96fa9e4066Sahrens * necessary setup for the zfetch structure, grokking data from the 97fa9e4066Sahrens * associated dnode. 98fa9e4066Sahrens */ 99fa9e4066Sahrens void 100fa9e4066Sahrens dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 101fa9e4066Sahrens { 102cf6106c8SMatthew Ahrens if (zf == NULL) 103fa9e4066Sahrens return; 104fa9e4066Sahrens 105fa9e4066Sahrens zf->zf_dnode = dno; 106fa9e4066Sahrens 107fa9e4066Sahrens list_create(&zf->zf_stream, sizeof (zstream_t), 108cf6106c8SMatthew Ahrens offsetof(zstream_t, zs_node)); 109fa9e4066Sahrens 110fa9e4066Sahrens rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL); 111fa9e4066Sahrens } 112fa9e4066Sahrens 113cf6106c8SMatthew Ahrens static void 114cf6106c8SMatthew Ahrens dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 115fa9e4066Sahrens { 116cf6106c8SMatthew Ahrens ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); 117cf6106c8SMatthew Ahrens list_remove(&zf->zf_stream, zs); 118cf6106c8SMatthew Ahrens mutex_destroy(&zs->zs_lock); 119cf6106c8SMatthew Ahrens kmem_free(zs, sizeof (*zs)); 120fa9e4066Sahrens } 121fa9e4066Sahrens 122fa9e4066Sahrens /* 123cf6106c8SMatthew Ahrens * Clean-up state associated with a zfetch structure (e.g. destroy the 124cf6106c8SMatthew Ahrens * streams). This doesn't free the zfetch_t itself, that's left to the caller. 125fa9e4066Sahrens */ 126fa9e4066Sahrens void 127cf6106c8SMatthew Ahrens dmu_zfetch_fini(zfetch_t *zf) 128fa9e4066Sahrens { 129fa9e4066Sahrens zstream_t *zs; 130fa9e4066Sahrens 131fa9e4066Sahrens ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock)); 132fa9e4066Sahrens 133cf6106c8SMatthew Ahrens rw_enter(&zf->zf_rwlock, RW_WRITER); 134cf6106c8SMatthew Ahrens while ((zs = list_head(&zf->zf_stream)) != NULL) 135cf6106c8SMatthew Ahrens dmu_zfetch_stream_remove(zf, zs); 136cf6106c8SMatthew Ahrens rw_exit(&zf->zf_rwlock); 137fa9e4066Sahrens list_destroy(&zf->zf_stream); 138fa9e4066Sahrens rw_destroy(&zf->zf_rwlock); 139fa9e4066Sahrens 140fa9e4066Sahrens zf->zf_dnode = NULL; 141fa9e4066Sahrens } 142fa9e4066Sahrens 143fa9e4066Sahrens /* 144cf6106c8SMatthew Ahrens * If there aren't too many streams already, create a new stream. 145cf6106c8SMatthew Ahrens * The "blkid" argument is the next block that we expect this stream to access. 146cf6106c8SMatthew Ahrens * While we're here, clean up old streams (which haven't been 147cf6106c8SMatthew Ahrens * accessed for at least zfetch_min_sec_reap seconds). 148fa9e4066Sahrens */ 149fa9e4066Sahrens static void 150cf6106c8SMatthew Ahrens dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) 151fa9e4066Sahrens { 152cf6106c8SMatthew Ahrens zstream_t *zs_next; 153cf6106c8SMatthew Ahrens int numstreams = 0; 154cf6106c8SMatthew Ahrens 155fa9e4066Sahrens ASSERT(RW_WRITE_HELD(&zf->zf_rwlock)); 156fa9e4066Sahrens 157cf6106c8SMatthew Ahrens /* 158cf6106c8SMatthew Ahrens * Clean up old streams. 159cf6106c8SMatthew Ahrens */ 160cf6106c8SMatthew Ahrens for (zstream_t *zs = list_head(&zf->zf_stream); 161cf6106c8SMatthew Ahrens zs != NULL; zs = zs_next) { 162cf6106c8SMatthew Ahrens zs_next = list_next(&zf->zf_stream, zs); 163cf6106c8SMatthew Ahrens if (((gethrtime() - zs->zs_atime) / NANOSEC) > 164cf6106c8SMatthew Ahrens zfetch_min_sec_reap) 165cf6106c8SMatthew Ahrens dmu_zfetch_stream_remove(zf, zs); 166cf6106c8SMatthew Ahrens else 167cf6106c8SMatthew Ahrens numstreams++; 168fa9e4066Sahrens } 169fa9e4066Sahrens 170cf6106c8SMatthew Ahrens /* 171cf6106c8SMatthew Ahrens * The maximum number of streams is normally zfetch_max_streams, 172cf6106c8SMatthew Ahrens * but for small files we lower it such that it's at least possible 173cf6106c8SMatthew Ahrens * for all the streams to be non-overlapping. 174cf6106c8SMatthew Ahrens * 175cf6106c8SMatthew Ahrens * If we are already at the maximum number of streams for this file, 176cf6106c8SMatthew Ahrens * even after removing old streams, then don't create this stream. 177cf6106c8SMatthew Ahrens */ 178cf6106c8SMatthew Ahrens uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, 179cf6106c8SMatthew Ahrens zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / 180cf6106c8SMatthew Ahrens zfetch_max_distance)); 181cf6106c8SMatthew Ahrens if (numstreams >= max_streams) { 182cf6106c8SMatthew Ahrens ZFETCHSTAT_BUMP(zfetchstat_max_streams); 183cf6106c8SMatthew Ahrens return; 184cf6106c8SMatthew Ahrens } 185fa9e4066Sahrens 186cf6106c8SMatthew Ahrens zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); 187cf6106c8SMatthew Ahrens zs->zs_blkid = blkid; 188cf6106c8SMatthew Ahrens zs->zs_pf_blkid = blkid; 189cf6106c8SMatthew Ahrens zs->zs_atime = gethrtime(); 190cf6106c8SMatthew Ahrens mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL); 191fa9e4066Sahrens 192cf6106c8SMatthew Ahrens list_insert_head(&zf->zf_stream, zs); 193fa9e4066Sahrens } 194fa9e4066Sahrens 195fa9e4066Sahrens /* 196fa9e4066Sahrens * This is the prefetch entry point. It calls all of the other dmu_zfetch 197fa9e4066Sahrens * routines to create, delete, find, or operate upon prefetch streams. 198fa9e4066Sahrens */ 199fa9e4066Sahrens void 200cf6106c8SMatthew Ahrens dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks) 201fa9e4066Sahrens { 202cf6106c8SMatthew Ahrens zstream_t *zs; 203fa9e4066Sahrens 204a2eea2e1Sahrens if (zfs_prefetch_disable) 205fa9e4066Sahrens return; 206a2eea2e1Sahrens 207cf6106c8SMatthew Ahrens /* 208cf6106c8SMatthew Ahrens * As a fast path for small (single-block) files, ignore access 209cf6106c8SMatthew Ahrens * to the first block. 210cf6106c8SMatthew Ahrens */ 211cf6106c8SMatthew Ahrens if (blkid == 0) 212a2eea2e1Sahrens return; 213fa9e4066Sahrens 214cf6106c8SMatthew Ahrens rw_enter(&zf->zf_rwlock, RW_READER); 215fa9e4066Sahrens 216cf6106c8SMatthew Ahrens for (zs = list_head(&zf->zf_stream); zs != NULL; 217cf6106c8SMatthew Ahrens zs = list_next(&zf->zf_stream, zs)) { 218cf6106c8SMatthew Ahrens if (blkid == zs->zs_blkid) { 219cf6106c8SMatthew Ahrens mutex_enter(&zs->zs_lock); 220cf6106c8SMatthew Ahrens /* 221cf6106c8SMatthew Ahrens * zs_blkid could have changed before we 222cf6106c8SMatthew Ahrens * acquired zs_lock; re-check them here. 223cf6106c8SMatthew Ahrens */ 224cf6106c8SMatthew Ahrens if (blkid != zs->zs_blkid) { 225cf6106c8SMatthew Ahrens mutex_exit(&zs->zs_lock); 226cf6106c8SMatthew Ahrens continue; 227cf6106c8SMatthew Ahrens } 228cf6106c8SMatthew Ahrens break; 229cf6106c8SMatthew Ahrens } 230cf6106c8SMatthew Ahrens } 231fa9e4066Sahrens 232cf6106c8SMatthew Ahrens if (zs == NULL) { 233cf6106c8SMatthew Ahrens /* 234cf6106c8SMatthew Ahrens * This access is not part of any existing stream. Create 235cf6106c8SMatthew Ahrens * a new stream for it. 236cf6106c8SMatthew Ahrens */ 2377cbf8b43SRich Morris ZFETCHSTAT_BUMP(zfetchstat_misses); 238cf6106c8SMatthew Ahrens if (rw_tryupgrade(&zf->zf_rwlock)) 239cf6106c8SMatthew Ahrens dmu_zfetch_stream_create(zf, blkid + nblks); 240cf6106c8SMatthew Ahrens rw_exit(&zf->zf_rwlock); 241cf6106c8SMatthew Ahrens return; 2427cbf8b43SRich Morris } 243fa9e4066Sahrens 244fa9e4066Sahrens /* 245cf6106c8SMatthew Ahrens * This access was to a block that we issued a prefetch for on 246cf6106c8SMatthew Ahrens * behalf of this stream. Issue further prefetches for this stream. 247cf6106c8SMatthew Ahrens * 248cf6106c8SMatthew Ahrens * Normally, we start prefetching where we stopped 249cf6106c8SMatthew Ahrens * prefetching last (zs_pf_blkid). But when we get our first 250cf6106c8SMatthew Ahrens * hit on this stream, zs_pf_blkid == zs_blkid, we don't 251cf6106c8SMatthew Ahrens * want to prefetch to block we just accessed. In this case, 252cf6106c8SMatthew Ahrens * start just after the block we just accessed. 253fa9e4066Sahrens */ 254cf6106c8SMatthew Ahrens int64_t pf_start = MAX(zs->zs_pf_blkid, blkid + nblks); 255fa9e4066Sahrens 256cf6106c8SMatthew Ahrens /* 257cf6106c8SMatthew Ahrens * Double our amount of prefetched data, but don't let the 258cf6106c8SMatthew Ahrens * prefetch get further ahead than zfetch_max_distance. 259cf6106c8SMatthew Ahrens */ 260cf6106c8SMatthew Ahrens int pf_nblks = 261cf6106c8SMatthew Ahrens MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks, 262cf6106c8SMatthew Ahrens zs->zs_blkid + nblks + 263cf6106c8SMatthew Ahrens (zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start); 264fa9e4066Sahrens 265cf6106c8SMatthew Ahrens zs->zs_pf_blkid = pf_start + pf_nblks; 266cf6106c8SMatthew Ahrens zs->zs_atime = gethrtime(); 267cf6106c8SMatthew Ahrens zs->zs_blkid = blkid + nblks; 268fa9e4066Sahrens 269cf6106c8SMatthew Ahrens /* 270cf6106c8SMatthew Ahrens * dbuf_prefetch() issues the prefetch i/o 271cf6106c8SMatthew Ahrens * asynchronously, but it may need to wait for an 272cf6106c8SMatthew Ahrens * indirect block to be read from disk. Therefore 273cf6106c8SMatthew Ahrens * we do not want to hold any locks while we call it. 274cf6106c8SMatthew Ahrens */ 275cf6106c8SMatthew Ahrens mutex_exit(&zs->zs_lock); 276fa9e4066Sahrens rw_exit(&zf->zf_rwlock); 277cf6106c8SMatthew Ahrens for (int i = 0; i < pf_nblks; i++) { 278cf6106c8SMatthew Ahrens dbuf_prefetch(zf->zf_dnode, 0, pf_start + i, 279cf6106c8SMatthew Ahrens ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH); 280fa9e4066Sahrens } 281cf6106c8SMatthew Ahrens ZFETCHSTAT_BUMP(zfetchstat_hits); 282fa9e4066Sahrens } 283