1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 28 */ 29 30 #include <sys/zfs_context.h> 31 #include <sys/dnode.h> 32 #include <sys/dmu_objset.h> 33 #include <sys/dmu_zfetch.h> 34 #include <sys/dmu.h> 35 #include <sys/dbuf.h> 36 #include <sys/kstat.h> 37 38 /* 39 * This tunable disables predictive prefetch. Note that it leaves "prescient" 40 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 41 * prescient prefetch never issues i/os that end up not being needed, 42 * so it can't hurt performance. 
 */

int zfs_prefetch_disable = B_FALSE;

/* max # of streams per zfetch */
unsigned int zfetch_max_streams = 8;
/* min time before stream reclaim */
unsigned int zfetch_min_sec_reap = 2;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_max_distance = 8 * 1024 * 1024;
/* max bytes to prefetch indirects for per stream (default 64MB) */
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
/* max number of bytes in an array_read in which we allow prefetching (1MB) */
unsigned long zfetch_array_rd_sz = 1024 * 1024;

/*
 * Prefetcher statistics, exported through the "zfetchstats" kstat below.
 * Field names here must stay in sync with the initializer names in
 * zfetch_stats and with the ZFETCHSTAT_* accessor macros.
 */
typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;		/* accesses matching a stream */
	kstat_named_t zfetchstat_misses;	/* accesses matching no stream */
	kstat_named_t zfetchstat_max_streams;	/* stream creations refused */
	kstat_named_t zfetchstat_io_issued;	/* prefetch i/os actually issued */
} zfetch_stats_t;

static zfetch_stats_t zfetch_stats = {
	{ "hits",		KSTAT_DATA_UINT64 },
	{ "misses",		KSTAT_DATA_UINT64 },
	{ "max_streams",	KSTAT_DATA_UINT64 },
	{ "io_issued",		KSTAT_DATA_UINT64 },
};

/* Atomic increment/add are used because stats are bumped without a lock. */
#define	ZFETCHSTAT_BUMP(stat) \
	atomic_inc_64(&zfetch_stats.stat.value.ui64)
#define	ZFETCHSTAT_ADD(stat, val) \
	atomic_add_64(&zfetch_stats.stat.value.ui64, val)
#define	ZFETCHSTAT_SET(stat, val) \
	zfetch_stats.stat.value.ui64 = val
#define	ZFETCHSTAT_GET(stat) \
	zfetch_stats.stat.value.ui64


kstat_t *zfetch_ksp;

/*
 * Register the "zfetchstats" kstat.  A NULL return from kstat_create()
 * is tolerated: statistics are simply not exported in that case.
 */
void
zfetch_init(void)
{
	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zfetch_ksp != NULL) {
		zfetch_ksp->ks_data = &zfetch_stats;
		kstat_install(zfetch_ksp);
	}
}

/*
 * Tear down the kstat registered by zfetch_init(), if any.
 */
void
zfetch_fini(void)
{
	if (zfetch_ksp != NULL) {
		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}
}

/*
 * This takes a pointer to a zfetch structure and a dnode. It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
110 */ 111 void 112 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 113 { 114 if (zf == NULL) 115 return; 116 zf->zf_dnode = dno; 117 zf->zf_numstreams = 0; 118 119 list_create(&zf->zf_stream, sizeof (zstream_t), 120 offsetof(zstream_t, zs_node)); 121 122 mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); 123 } 124 125 static void 126 dmu_zfetch_stream_fini(zstream_t *zs) 127 { 128 ASSERT(!list_link_active(&zs->zs_node)); 129 kmem_free(zs, sizeof (*zs)); 130 } 131 132 static void 133 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 134 { 135 ASSERT(MUTEX_HELD(&zf->zf_lock)); 136 list_remove(&zf->zf_stream, zs); 137 zf->zf_numstreams--; 138 membar_producer(); 139 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 140 dmu_zfetch_stream_fini(zs); 141 } 142 143 /* 144 * Clean-up state associated with a zfetch structure (e.g. destroy the 145 * streams). This doesn't free the zfetch_t itself, that's left to the caller. 146 */ 147 void 148 dmu_zfetch_fini(zfetch_t *zf) 149 { 150 zstream_t *zs; 151 152 mutex_enter(&zf->zf_lock); 153 while ((zs = list_head(&zf->zf_stream)) != NULL) 154 dmu_zfetch_stream_remove(zf, zs); 155 mutex_exit(&zf->zf_lock); 156 list_destroy(&zf->zf_stream); 157 mutex_destroy(&zf->zf_lock); 158 159 zf->zf_dnode = NULL; 160 } 161 162 /* 163 * If there aren't too many streams already, create a new stream. 164 * The "blkid" argument is the next block that we expect this stream to access. 165 * While we're here, clean up old streams (which haven't been 166 * accessed for at least zfetch_min_sec_reap seconds). 167 */ 168 static void 169 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) 170 { 171 zstream_t *zs_next; 172 hrtime_t now = gethrtime(); 173 174 ASSERT(MUTEX_HELD(&zf->zf_lock)); 175 176 /* 177 * Clean up old streams. 178 */ 179 for (zstream_t *zs = list_head(&zf->zf_stream); 180 zs != NULL; zs = zs_next) { 181 zs_next = list_next(&zf->zf_stream, zs); 182 /* 183 * Skip if still active. 1 -- zf_stream reference. 
184 */ 185 if (zfs_refcount_count(&zs->zs_refs) != 1) 186 continue; 187 if (((now - zs->zs_atime) / NANOSEC) > 188 zfetch_min_sec_reap) 189 dmu_zfetch_stream_remove(zf, zs); 190 } 191 192 /* 193 * The maximum number of streams is normally zfetch_max_streams, 194 * but for small files we lower it such that it's at least possible 195 * for all the streams to be non-overlapping. 196 * 197 * If we are already at the maximum number of streams for this file, 198 * even after removing old streams, then don't create this stream. 199 */ 200 uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, 201 zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz / 202 zfetch_max_distance)); 203 if (zf->zf_numstreams >= max_streams) { 204 ZFETCHSTAT_BUMP(zfetchstat_max_streams); 205 return; 206 } 207 208 zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); 209 zs->zs_blkid = blkid; 210 zs->zs_pf_blkid1 = blkid; 211 zs->zs_pf_blkid = blkid; 212 zs->zs_ipf_blkid1 = blkid; 213 zs->zs_ipf_blkid = blkid; 214 zs->zs_atime = now; 215 zs->zs_fetch = zf; 216 zs->zs_missed = B_FALSE; 217 zfs_refcount_create(&zs->zs_callers); 218 zfs_refcount_create(&zs->zs_refs); 219 /* One reference for zf_stream. */ 220 zfs_refcount_add(&zs->zs_refs, NULL); 221 zf->zf_numstreams++; 222 list_insert_head(&zf->zf_stream, zs); 223 } 224 225 static void 226 dmu_zfetch_stream_done(void *arg, boolean_t io_issued) 227 { 228 zstream_t *zs = arg; 229 230 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 231 dmu_zfetch_stream_fini(zs); 232 } 233 234 /* 235 * This is the predictive prefetch entry point. dmu_zfetch_prepare() 236 * associates dnode access specified with blkid and nblks arguments with 237 * prefetch stream, predicts further accesses based on that stats and returns 238 * the stream pointer on success. That pointer must later be passed to 239 * dmu_zfetch_run() to initiate the speculative prefetch for the stream and 240 * release it. 
 * dmu_zfetch() is a wrapper for simple cases when window between
 * prediction and prefetch initiation is not needed.
 * fetch_data argument specifies whether actual data blocks should be fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 *
 * Returns NULL when no prefetch will be done (prefetch disabled, small
 * file fast paths, end of file, or a miss that only creates a new stream).
 * On a stream hit, returns the stream with one zs_refs and one zs_callers
 * reference held; dmu_zfetch_run() consumes both.
 */
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
	zstream_t *zs;
	int64_t pf_start, ipf_start;
	int64_t pf_ahead_blks, max_blks;
	int max_dist_blks, pf_nblks, ipf_nblks;
	uint64_t end_of_access_blkid, maxblkid;
	end_of_access_blkid = blkid + nblks;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;

	if (zfs_prefetch_disable)
		return (NULL);
	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs). So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return (NULL);

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (!have_lock && blkid == 0)
		return (NULL);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * A fast path for small files for which no prefetch will
	 * happen.
	 */
	maxblkid = zf->zf_dnode->dn_maxblkid;
	if (maxblkid < 2) {
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}
	mutex_enter(&zf->zf_lock);

	/*
	 * Find matching prefetch stream.  Depending on whether the accesses
	 * are block-aligned, first block of the new access may either follow
	 * the last block of the previous access, or be equal to it.  In the
	 * latter case we advance blkid past the overlap (and shrink nblks)
	 * so the stream is not double-counted.
	 */
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			break;
		} else if (blkid + 1 == zs->zs_blkid) {
			blkid++;
			nblks--;
			break;
		}
	}

	/*
	 * If the file is ending, remove the matching stream if found.
	 * If not found then it is too late to create a new one now.
	 */
	if (end_of_access_blkid >= maxblkid) {
		if (zs != NULL)
			dmu_zfetch_stream_remove(zf, zs);
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	/* Exit if we already prefetched this block before. */
	if (nblks == 0) {
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	if (zs == NULL) {
		/*
		 * This access is not part of any existing stream.  Create
		 * a new stream for it.
		 */
		dmu_zfetch_stream_create(zf, end_of_access_blkid);
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		return (NULL);
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream. Issue further prefetches for this stream.
	 *
	 * Normally, we start prefetching where we stopped
	 * prefetching last (zs_pf_blkid). But when we get our first
	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
	 * want to prefetch the block we just accessed. In this case,
	 * start just after the block we just accessed.
	 */
	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
	if (zs->zs_pf_blkid1 < end_of_access_blkid)
		zs->zs_pf_blkid1 = end_of_access_blkid;
	if (zs->zs_ipf_blkid1 < end_of_access_blkid)
		zs->zs_ipf_blkid1 = end_of_access_blkid;

	/*
	 * Double our amount of prefetched data, but don't let the
	 * prefetch get further ahead than zfetch_max_distance.
	 */
	if (fetch_data) {
		max_dist_blks =
		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
		/*
		 * Previously, we were (zs_pf_blkid - blkid) ahead. We
		 * want to now be double that, so read that amount again,
		 * plus the amount we are catching up by (i.e. the amount
		 * read just now).
		 */
		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
		pf_nblks = MIN(pf_ahead_blks, max_blks);
	} else {
		pf_nblks = 0;
	}

	zs->zs_pf_blkid = pf_start + pf_nblks;

	/*
	 * Do the same for indirects, starting from where we stopped last,
	 * or where we will stop reading data blocks (and the indirects
	 * that point to them).
	 */
	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
	/*
	 * We want to double our distance ahead of the data prefetch
	 * (or reader, if we are not prefetching data). Previously, we
	 * were (zs_ipf_blkid - blkid) ahead. To double that, we read
	 * that amount again, plus the amount we are catching up by
	 * (i.e. the amount read now + the amount of data prefetched now).
	 */
	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
	max_blks = max_dist_blks - (ipf_start - zs->zs_pf_blkid);
	ipf_nblks = MIN(pf_ahead_blks, max_blks);
	zs->zs_ipf_blkid = ipf_start + ipf_nblks;

	zs->zs_blkid = end_of_access_blkid;
	/* Protect the stream from reclamation. */
	zs->zs_atime = gethrtime();
	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);
	mutex_exit(&zf->zf_lock);

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	ZFETCHSTAT_BUMP(zfetchstat_hits);
	return (zs);
}

/*
 * Issue the speculative prefetch i/os planned by dmu_zfetch_prepare() and
 * release the references it took.  "missed" records that the triggering
 * access missed the ARC, enabling data-block (not just indirect) prefetch
 * for this stream.  have_lock indicates the caller already holds
 * dn_struct_rwlock.
 */
void
dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
{
	zfetch_t *zf = zs->zs_fetch;
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

	if (missed)
		zs->zs_missed = missed;

	/*
	 * Postpone the prefetch if there are more concurrent callers.
	 * It happens when multiple requests are waiting for the same
	 * indirect block.  The last one will run the prefetch for all.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
		/* Drop reference taken in dmu_zfetch_prepare(). */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	/*
	 * Take a snapshot of the block ranges to issue ([pf_blkid1,
	 * pf_blkid) for data, [ipf_blkid1, ipf_blkid) for indirects)
	 * and advance the "issued so far" cursors under zf_lock.
	 */
	mutex_enter(&zf->zf_lock);
	if (zs->zs_missed) {
		pf_start = zs->zs_pf_blkid1;
		pf_end = zs->zs_pf_blkid1 = zs->zs_pf_blkid;
	} else {
		pf_start = pf_end = 0;
	}
	ipf_start = MAX(zs->zs_pf_blkid1, zs->zs_ipf_blkid1);
	ipf_end = zs->zs_ipf_blkid1 = zs->zs_ipf_blkid;
	mutex_exit(&zf->zf_lock);
	ASSERT3S(pf_start, <=, pf_end);
	ASSERT3S(ipf_start, <=, ipf_end);

	/* Convert data-block range to level-1 indirect block numbers. */
	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
	ASSERT3S(ipf_start, <=, ipf_end);
	issued = pf_end - pf_start + ipf_end - ipf_start;
	if (issued > 1) {
		/* More references on top of taken in dmu_zfetch_prepare(). */
		zfs_refcount_add_many(&zs->zs_refs, issued - 1, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * Each prefetch below holds one zs_refs reference, dropped in
	 * dmu_zfetch_stream_done() (called even when no i/o was needed,
	 * per dbuf_prefetch_impl()'s callback contract).
	 */
	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
		    dmu_zfetch_stream_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
		    dmu_zfetch_stream_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

/*
 * Convenience wrapper: predict and immediately issue prefetch for one
 * access, with no window between dmu_zfetch_prepare() and dmu_zfetch_run().
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zs, missed, have_lock);
}

/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
	"Number of bytes in a array_read");
/* END CSTYLED */