/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/arc_impl.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * This tunable disables predictive prefetch. Note that it leaves "prescient"
 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
 * prescient prefetch never issues i/os that end up not being needed,
 * so it can't hurt performance.
 */

static int zfs_prefetch_disable = B_FALSE;

/* max # of streams per zfetch */
static unsigned int zfetch_max_streams = 8;
/* min time before stream reclaim */
static unsigned int zfetch_min_sec_reap = 1;
/* max time before stream delete */
static unsigned int zfetch_max_sec_reap = 2;
#ifdef _ILP32
/* min bytes to prefetch per stream (default 2MB) */
static unsigned int zfetch_min_distance = 2 * 1024 * 1024;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int zfetch_max_distance = 8 * 1024 * 1024;
#else
/* min bytes to prefetch per stream (default 4MB) */
static unsigned int zfetch_min_distance = 4 * 1024 * 1024;
/* max bytes to prefetch per stream (default 64MB) */
unsigned int zfetch_max_distance = 64 * 1024 * 1024;
#endif
/* max bytes to prefetch indirects for per stream (default 128MB) */
unsigned int zfetch_max_idistance = 128 * 1024 * 1024;
/* max request reorder distance within a stream (default 16MB) */
unsigned int zfetch_max_reorder = 16 * 1024 * 1024;
/* Max log2 fraction of holes in a stream */
unsigned int zfetch_hole_shift = 2;

typedef struct zfetch_stats {
        kstat_named_t zfetchstat_hits;
        kstat_named_t zfetchstat_future;
        kstat_named_t zfetchstat_stride;
        kstat_named_t zfetchstat_past;
        kstat_named_t zfetchstat_misses;
        kstat_named_t zfetchstat_max_streams;
        kstat_named_t zfetchstat_io_issued;
        kstat_named_t zfetchstat_io_active;
} zfetch_stats_t;

static zfetch_stats_t zfetch_stats = {
        { "hits",               KSTAT_DATA_UINT64 },
        { "future",             KSTAT_DATA_UINT64 },
        { "stride",             KSTAT_DATA_UINT64 },
        { "past",               KSTAT_DATA_UINT64 },
        { "misses",             KSTAT_DATA_UINT64 },
        { "max_streams",        KSTAT_DATA_UINT64 },
        { "io_issued",          KSTAT_DATA_UINT64 },
        { "io_active",          KSTAT_DATA_UINT64 },
};
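
/*
 * The per-CPU sums below back the kstat template above. Most counters are
 * wmsums, which are cheap to update but only approximately readable;
 * io_active is an aggsum instead, because dmu_zfetch_prepare() compares it
 * against a threshold (see the aggsum_compare() call below) and
 * zfetch_fini() asserts that it drains back to zero, both of which need an
 * exact value.
 */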

struct {
        wmsum_t zfetchstat_hits;
        wmsum_t zfetchstat_future;
        wmsum_t zfetchstat_stride;
        wmsum_t zfetchstat_past;
        wmsum_t zfetchstat_misses;
        wmsum_t zfetchstat_max_streams;
        wmsum_t zfetchstat_io_issued;
        aggsum_t zfetchstat_io_active;
} zfetch_sums;

#define ZFETCHSTAT_BUMP(stat)   \
        wmsum_add(&zfetch_sums.stat, 1)
#define ZFETCHSTAT_ADD(stat, val)       \
        wmsum_add(&zfetch_sums.stat, val)


static kstat_t *zfetch_ksp;

static int
zfetch_kstats_update(kstat_t *ksp, int rw)
{
        zfetch_stats_t *zs = ksp->ks_data;

        if (rw == KSTAT_WRITE)
                return (EACCES);
        zs->zfetchstat_hits.value.ui64 =
            wmsum_value(&zfetch_sums.zfetchstat_hits);
        zs->zfetchstat_future.value.ui64 =
            wmsum_value(&zfetch_sums.zfetchstat_future);
        zs->zfetchstat_stride.value.ui64 =
            wmsum_value(&zfetch_sums.zfetchstat_stride);
        zs->zfetchstat_past.value.ui64 =
            wmsum_value(&zfetch_sums.zfetchstat_past);
        zs->zfetchstat_misses.value.ui64 =
            wmsum_value(&zfetch_sums.zfetchstat_misses);
        zs->zfetchstat_max_streams.value.ui64 =
            wmsum_value(&zfetch_sums.zfetchstat_max_streams);
        zs->zfetchstat_io_issued.value.ui64 =
            wmsum_value(&zfetch_sums.zfetchstat_io_issued);
        zs->zfetchstat_io_active.value.ui64 =
            aggsum_value(&zfetch_sums.zfetchstat_io_active);
        return (0);
}

void
zfetch_init(void)
{
        wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
        wmsum_init(&zfetch_sums.zfetchstat_future, 0);
        wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
        wmsum_init(&zfetch_sums.zfetchstat_past, 0);
        wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
        wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
        wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
        aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);

        zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
            KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL);

        if (zfetch_ksp != NULL) {
                zfetch_ksp->ks_data = &zfetch_stats;
                zfetch_ksp->ks_update = zfetch_kstats_update;
                kstat_install(zfetch_ksp);
        }
}

void
zfetch_fini(void)
{
        if (zfetch_ksp != NULL) {
                kstat_delete(zfetch_ksp);
                zfetch_ksp = NULL;
        }

        wmsum_fini(&zfetch_sums.zfetchstat_hits);
        wmsum_fini(&zfetch_sums.zfetchstat_future);
        wmsum_fini(&zfetch_sums.zfetchstat_stride);
        wmsum_fini(&zfetch_sums.zfetchstat_past);
        wmsum_fini(&zfetch_sums.zfetchstat_misses);
        wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
        wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
        ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
        aggsum_fini(&zfetch_sums.zfetchstat_io_active);
}
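
/*
 * For reference, the installed kstat surfaces as
 * /proc/spl/kstat/zfs/zfetchstats on Linux and under the
 * kstat.zfs.misc.zfetchstats sysctl tree on FreeBSD.
 */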

/*
 * This takes a pointer to a zfetch structure and a dnode. It performs the
 * necessary setup for the zfetch structure, pulling the data it needs from
 * the associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
        if (zf == NULL)
                return;
        zf->zf_dnode = dno;
        zf->zf_numstreams = 0;

        list_create(&zf->zf_stream, sizeof (zstream_t),
            offsetof(zstream_t, zs_node));

        mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
}

static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
        ASSERT(!list_link_active(&zs->zs_node));
        zfs_refcount_destroy(&zs->zs_callers);
        zfs_refcount_destroy(&zs->zs_refs);
        kmem_free(zs, sizeof (*zs));
}

static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
        ASSERT(MUTEX_HELD(&zf->zf_lock));
        list_remove(&zf->zf_stream, zs);
        zf->zf_numstreams--;
        membar_producer();
        if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
                dmu_zfetch_stream_fini(zs);
}

/*
 * Clean up state associated with a zfetch structure (e.g. destroy the
 * streams). This doesn't free the zfetch_t itself; that's left to the caller.
 */
void
dmu_zfetch_fini(zfetch_t *zf)
{
        zstream_t *zs;

        mutex_enter(&zf->zf_lock);
        while ((zs = list_head(&zf->zf_stream)) != NULL)
                dmu_zfetch_stream_remove(zf, zs);
        mutex_exit(&zf->zf_lock);
        list_destroy(&zf->zf_stream);
        mutex_destroy(&zf->zf_lock);

        zf->zf_dnode = NULL;
}

/*
 * If there aren't too many active streams already, create one more.
 * In the process, delete or reuse any stream that has had no hits for
 * zfetch_max_sec_reap seconds. If needed, reuse the oldest stream that has
 * had no hits for at least zfetch_min_sec_reap seconds.
 * The "blkid" argument is the next block that we expect this stream to access.
 */
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
        zstream_t *zs, *zs_next, *zs_old = NULL;
        uint_t now = gethrestime_sec(), t;

        ASSERT(MUTEX_HELD(&zf->zf_lock));

        /*
         * Delete too old streams, reusing the first found one.
         */
        t = now - zfetch_max_sec_reap;
        for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
                zs_next = list_next(&zf->zf_stream, zs);
                /*
                 * Skip if still active. 1 -- zf_stream reference.
                 */
                if ((int)(zs->zs_atime - t) >= 0)
                        continue;
                if (zfs_refcount_count(&zs->zs_refs) != 1)
                        continue;
                if (zs_old)
                        dmu_zfetch_stream_remove(zf, zs);
                else
                        zs_old = zs;
        }
        if (zs_old) {
                zs = zs_old;
                list_remove(&zf->zf_stream, zs);
                goto reuse;
        }

        /*
         * The maximum number of streams is normally zfetch_max_streams,
         * but for small files we lower it such that it's at least possible
         * for all the streams to be non-overlapping.
         */
        uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
            (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
            zfetch_max_distance));
        if (zf->zf_numstreams >= max_streams) {
                t = now - zfetch_min_sec_reap;
                for (zs = list_head(&zf->zf_stream); zs != NULL;
                    zs = list_next(&zf->zf_stream, zs)) {
                        if ((int)(zs->zs_atime - t) >= 0)
                                continue;
                        if (zfs_refcount_count(&zs->zs_refs) != 1)
                                continue;
                        if (zs_old == NULL ||
                            (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
                                zs_old = zs;
                }
                if (zs_old) {
                        zs = zs_old;
                        list_remove(&zf->zf_stream, zs);
                        goto reuse;
                }
                ZFETCHSTAT_BUMP(zfetchstat_max_streams);
                return;
        }

        zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
        zfs_refcount_create(&zs->zs_callers);
        zfs_refcount_create(&zs->zs_refs);
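        /*
         * Note the two counters just created: zs_refs holds existence
         * references (the zf_stream list entry and each in-flight prefetch
         * I/O), while zs_callers counts concurrent dmu_zfetch_prepare()
         * callers so that only the last of them runs the prefetch (see
         * dmu_zfetch_run()).
         */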
        /* One reference for zf_stream. */
        zfs_refcount_add(&zs->zs_refs, NULL);
        zf->zf_numstreams++;

reuse:
        list_insert_head(&zf->zf_stream, zs);
        zs->zs_blkid = blkid;
        /* Allow immediate stream reuse until first hit. */
        zs->zs_atime = now - zfetch_min_sec_reap;
        memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
        zs->zs_pf_dist = 0;
        zs->zs_ipf_dist = 0;
        zs->zs_pf_start = blkid;
        zs->zs_pf_end = blkid;
        zs->zs_ipf_start = blkid;
        zs->zs_ipf_end = blkid;
        zs->zs_missed = B_FALSE;
        zs->zs_more = B_FALSE;
}

static void
dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
{
        zstream_t *zs = arg;

        if (io_issued && level == 0 && blkid < zs->zs_blkid)
                zs->zs_more = B_TRUE;
        if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
                dmu_zfetch_stream_fini(zs);
        aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
}

/*
 * Process a stream hit access for nblks blocks starting at zs_blkid. Return
 * the number of blocks to proceed with after aggregation with future ranges.
 */
static uint64_t
dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
{
        uint_t i, j;

        /* Optimize sequential accesses (no future ranges). */
        if (zs->zs_ranges[0].start == 0)
                goto done;

        /* Look for intersections with further ranges. */
        for (i = 0; i < ZFETCH_RANGES; i++) {
                zsrange_t *r = &zs->zs_ranges[i];
                if (r->start == 0 || r->start > nblks)
                        break;
                if (r->end >= nblks) {
                        nblks = r->end;
                        i++;
                        break;
                }
        }

        /* Delete all found intersecting ranges, update the remaining ones. */
        for (j = 0; i < ZFETCH_RANGES; i++, j++) {
                if (zs->zs_ranges[i].start == 0)
                        break;
                ASSERT3U(zs->zs_ranges[i].start, >, nblks);
                ASSERT3U(zs->zs_ranges[i].end, >, nblks);
                zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
                zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
        }
        if (j < ZFETCH_RANGES) {
                zs->zs_ranges[j].start = 0;
                zs->zs_ranges[j].end = 0;
        }

done:
        zs->zs_blkid += nblks;
        return (nblks);
}

/*
 * Process a future stream access for nblks blocks starting at blkid. Return
 * the number of blocks to proceed with if the future ranges reach the fill
 * threshold.
 */
static uint64_t
dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
{
        ASSERT3U(blkid, >, zs->zs_blkid);
        blkid -= zs->zs_blkid;
        ASSERT3U(blkid + nblks, <=, UINT16_MAX);

        /* Search for first and last intersection or insert point. */
        uint_t f = ZFETCH_RANGES, l = 0, i;
        for (i = 0; i < ZFETCH_RANGES; i++) {
                zsrange_t *r = &zs->zs_ranges[i];
                if (r->start == 0 || r->start > blkid + nblks)
                        break;
                if (r->end < blkid)
                        continue;
                if (f > i)
                        f = i;
                if (l < i)
                        l = i;
        }
        if (f <= l) {
                /* Got some intersecting range, expand it if needed. */
                if (zs->zs_ranges[f].start > blkid)
                        zs->zs_ranges[f].start = blkid;
                zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
                if (f < l) {
                        /* Got more than one intersection, remove others. */
                        for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
                                zs->zs_ranges[f].start = zs->zs_ranges[l].start;
                                zs->zs_ranges[f].end = zs->zs_ranges[l].end;
                        }
                        zs->zs_ranges[f].start = 0;
                        zs->zs_ranges[f].end = 0;
                }
        } else if (i < ZFETCH_RANGES) {
                /* Got no intersecting ranges, insert new one. */
                for (l = ZFETCH_RANGES - 1; l > i; l--) {
                        zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
                        zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
                }
                zs->zs_ranges[i].start = blkid;
                zs->zs_ranges[i].end = blkid + nblks;
        } else {
                /* No space left to insert. Drop the range. */
                return (0);
        }
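
        /*
         * Worked example of the threshold below, assuming the default
         * zfetch_hole_shift of 2 (holes up to 1/4 of the covered length are
         * tolerated): with stored future ranges [2, 4) and [5, 8) relative
         * to zs_blkid, the walk accumulates hole = 2 at the first range
         * (2 > 4 >> 2, not a hit) and hole = 3 at the second (3 > 8 >> 2,
         * still not a hit), so we keep waiting. A later access filling
         * [0, 2) would instead be merged by dmu_zfetch_hit().
         */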

        /*
         * Check whether, with the new access added, we have reached the fill
         * threshold.
         */
        if (zfetch_hole_shift >= 16)
                return (0);
        uint_t hole = 0;
        for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
                zsrange_t *r = &zs->zs_ranges[i];
                if (r->start == 0)
                        break;
                hole += r->start - f;
                f = r->end;
                if (hole <= r->end >> zfetch_hole_shift)
                        l = r->end;
        }
        if (l > 0)
                return (dmu_zfetch_hit(zs, l));

        return (0);
}

/*
 * This is the predictive prefetch entry point. dmu_zfetch_prepare()
 * associates the dnode access specified by the blkid and nblks arguments
 * with a prefetch stream, predicts further accesses based on that stream's
 * statistics, and returns the stream pointer on success. That pointer must
 * later be passed to dmu_zfetch_run() to initiate the speculative prefetch
 * for the stream and release it. dmu_zfetch() is a wrapper for simple cases
 * when no window between prediction and prefetch initiation is needed.
 * The fetch_data argument specifies whether actual data blocks should be
 * fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
        zstream_t *zs;
        spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
        zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch;
        int64_t ipf_start, ipf_end;

        if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE)
                return (NULL);

        if (os_prefetch == ZFS_PREFETCH_METADATA)
                fetch_data = B_FALSE;

        /*
         * If we haven't yet loaded the indirect vdevs' mappings, we
         * can only read from blocks that we carefully ensure are on
         * concrete vdevs (or previously-loaded indirect vdevs). So we
         * can't allow the predictive prefetcher to attempt reads of other
         * blocks (e.g. of the MOS's dnode object).
         */
        if (!spa_indirect_vdevs_loaded(spa))
                return (NULL);

        /*
         * As a fast path for small (single-block) files, ignore access
         * to the first block.
         */
        if (!have_lock && blkid == 0)
                return (NULL);

        if (!have_lock)
                rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

        /*
         * A fast path for small files for which no prefetch will
         * happen.
         */
        uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
        if (maxblkid < 2) {
                if (!have_lock)
                        rw_exit(&zf->zf_dnode->dn_struct_rwlock);
                return (NULL);
        }
        mutex_enter(&zf->zf_lock);

        /*
         * Find a perfect prefetch stream. Depending on whether the accesses
         * are block-aligned, the first block of the new access may either
         * follow the last block of the previous access, or be equal to it.
         */
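        /*
         * For example, after an access covering blocks [0, 2) the stream
         * position zs_blkid is 2. A block-aligned successor starts at
         * block 2 and matches the first branch below, while an unaligned
         * successor re-reads block 1 (the tail of the previous access) and
         * matches the second branch, which skips that already-read block.
         */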
        unsigned int dbs = zf->zf_dnode->dn_datablkshift;
        uint64_t end_blkid = blkid + nblks;
        for (zs = list_head(&zf->zf_stream); zs != NULL;
            zs = list_next(&zf->zf_stream, zs)) {
                if (blkid == zs->zs_blkid) {
                        goto hit;
                } else if (blkid + 1 == zs->zs_blkid) {
                        blkid++;
                        nblks--;
                        goto hit;
                }
        }

        /*
         * Find a close enough prefetch stream. An access crossing the stream
         * position is a hit in its new part. An access ahead of the stream
         * position is considered a hit for metadata prefetch, since we do
         * not care about the fill percentage, and is otherwise stored for
         * the future. An access behind the stream position is silently
         * ignored, since we already skipped it when reaching the fill
         * percentage.
         */
        uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
        uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
        for (zs = list_head(&zf->zf_stream); zs != NULL;
            zs = list_next(&zf->zf_stream, zs)) {
                if (blkid > zs->zs_blkid) {
                        if (end_blkid <= zs->zs_blkid + max_reorder) {
                                if (!fetch_data) {
                                        nblks = dmu_zfetch_hit(zs,
                                            end_blkid - zs->zs_blkid);
                                        ZFETCHSTAT_BUMP(zfetchstat_stride);
                                        goto future;
                                }
                                nblks = dmu_zfetch_future(zs, blkid, nblks);
                                if (nblks > 0)
                                        ZFETCHSTAT_BUMP(zfetchstat_stride);
                                else
                                        ZFETCHSTAT_BUMP(zfetchstat_future);
                                goto future;
                        }
                } else if (end_blkid >= zs->zs_blkid) {
                        nblks -= zs->zs_blkid - blkid;
                        blkid += zs->zs_blkid - blkid;
                        goto hit;
                } else if (end_blkid + max_reorder > zs->zs_blkid &&
                    (int)(zs->zs_atime - t) >= 0) {
                        ZFETCHSTAT_BUMP(zfetchstat_past);
                        zs->zs_atime = gethrestime_sec();
                        goto out;
                }
        }

        /*
         * This access is not part of any existing stream. Create a new
         * stream for it unless we are at the end of file.
         */
        ASSERT0P(zs);
        if (end_blkid < maxblkid)
                dmu_zfetch_stream_create(zf, end_blkid);
        mutex_exit(&zf->zf_lock);
        ZFETCHSTAT_BUMP(zfetchstat_misses);
        ipf_start = 0;
        goto prescient;

hit:
        nblks = dmu_zfetch_hit(zs, nblks);
        ZFETCHSTAT_BUMP(zfetchstat_hits);

future:
        zs->zs_atime = gethrestime_sec();

        /* Exit if we already prefetched for this position before. */
        if (nblks == 0)
                goto out;

        /* If the file is ending, remove the stream. */
        end_blkid = zs->zs_blkid;
        if (end_blkid >= maxblkid) {
                dmu_zfetch_stream_remove(zf, zs);
out:
                mutex_exit(&zf->zf_lock);
                if (!have_lock)
                        rw_exit(&zf->zf_dnode->dn_struct_rwlock);
                return (NULL);
        }

        /*
         * This access was to a block that we issued a prefetch for on
         * behalf of this stream. Calculate further prefetch distances.
         *
         * Start prefetch from the demand access size (nblks). Double the
         * distance every access up to zfetch_min_distance. After that,
         * increase the distance by 1/8 only if needed, up to
         * zfetch_max_distance.
         *
         * Don't double the distance beyond a single block if we have more
         * than ~6% of ARC held by active prefetches. It should help with
         * getting out of RAM on some badly mispredicted read patterns.
         */
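        /*
         * For illustration, with a 128KB block size (dbs == 17) and the
         * default 4MB zfetch_min_distance: a stream of 128KB demand reads
         * grows zs_pf_dist as 128KB, 256KB, ..., 4MB over its first six
         * hits, then by 1/8 per hit (only while prefetch I/Os complete
         * behind the stream position, i.e. zs_more) up to
         * zfetch_max_distance. The ~6% ARC cap comes from arc_c_max >> 4
         * bytes (1/16), converted to block-sized I/Os by the extra >> dbs.
         */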
        unsigned int nbytes = nblks << dbs;
        unsigned int pf_nblks;
        if (fetch_data) {
                if (unlikely(zs->zs_pf_dist < nbytes))
                        zs->zs_pf_dist = nbytes;
                else if (zs->zs_pf_dist < zfetch_min_distance &&
                    (zs->zs_pf_dist < (1 << dbs) ||
                    aggsum_compare(&zfetch_sums.zfetchstat_io_active,
                    arc_c_max >> (4 + dbs)) < 0))
                        zs->zs_pf_dist *= 2;
                else if (zs->zs_more)
                        zs->zs_pf_dist += zs->zs_pf_dist / 8;
                zs->zs_more = B_FALSE;
                if (zs->zs_pf_dist > zfetch_max_distance)
                        zs->zs_pf_dist = zfetch_max_distance;
                pf_nblks = zs->zs_pf_dist >> dbs;
        } else {
                pf_nblks = 0;
        }
        if (zs->zs_pf_start < end_blkid)
                zs->zs_pf_start = end_blkid;
        if (zs->zs_pf_end < end_blkid + pf_nblks)
                zs->zs_pf_end = end_blkid + pf_nblks;

        /*
         * Do the same for indirects, starting where we will stop reading
         * data blocks (and the indirects that point to them).
         */
        if (unlikely(zs->zs_ipf_dist < nbytes))
                zs->zs_ipf_dist = nbytes;
        else
                zs->zs_ipf_dist *= 2;
        if (zs->zs_ipf_dist > zfetch_max_idistance)
                zs->zs_ipf_dist = zfetch_max_idistance;
        pf_nblks = zs->zs_ipf_dist >> dbs;
        if (zs->zs_ipf_start < zs->zs_pf_end)
                zs->zs_ipf_start = zs->zs_pf_end;
        ipf_start = zs->zs_ipf_end;
        if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
                zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

        zfs_refcount_add(&zs->zs_refs, NULL);
        /* Count concurrent callers. */
        zfs_refcount_add(&zs->zs_callers, NULL);
        mutex_exit(&zf->zf_lock);

prescient:
        /*
         * Prefetch the following indirect blocks for this access to reduce
         * dbuf_hold() sync read delays in dmu_buf_hold_array_by_dnode().
         * This covers the gap during the first couple of accesses when we
         * cannot predict the future yet, but know what is needed right now.
         * Reads and writes should very rarely need more than one indirect,
         * but cloning benefits more due to its much bigger accesses.
         */
        ipf_start = MAX(ipf_start, blkid + 1);
        int epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
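        /*
         * epbs is log2 of the number of block pointers per indirect block,
         * e.g. a 128KB indirect block (dn_indblkshift == 17) holds
         * 2^(17 - 7) == 1024 128-byte blkptrs, so epbs == 10. The shifts
         * below convert level-0 block IDs into level-1 block IDs.
         */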
        ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
        ipf_end = P2ROUNDUP(end_blkid, 1 << epbs) >> epbs;

        int issued = 0;
        for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
                issued += dbuf_prefetch(zf->zf_dnode, 1, iblk,
                    ZIO_PRIORITY_SYNC_READ, ARC_FLAG_PRESCIENT_PREFETCH);
        }

        if (!have_lock)
                rw_exit(&zf->zf_dnode->dn_struct_rwlock);
        if (issued)
                ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
        return (zs);
}

void
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
    boolean_t have_lock)
{
        int64_t pf_start, pf_end, ipf_start, ipf_end;
        int epbs, issued;

        if (missed)
                zs->zs_missed = missed;

        /*
         * Postpone the prefetch if there are more concurrent callers.
         * This happens when multiple requests are waiting for the same
         * indirect block. The last one will run the prefetch for all.
         */
        if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
                /* Drop the reference taken in dmu_zfetch_prepare(). */
                if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
                        dmu_zfetch_stream_fini(zs);
                return;
        }

        mutex_enter(&zf->zf_lock);
        if (zs->zs_missed) {
                pf_start = zs->zs_pf_start;
                pf_end = zs->zs_pf_start = zs->zs_pf_end;
        } else {
                pf_start = pf_end = 0;
        }
        ipf_start = zs->zs_ipf_start;
        ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
        mutex_exit(&zf->zf_lock);
        ASSERT3S(pf_start, <=, pf_end);
        ASSERT3S(ipf_start, <=, ipf_end);

        epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
        ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
        ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
        ASSERT3S(ipf_start, <=, ipf_end);
        issued = pf_end - pf_start + ipf_end - ipf_start;
        if (issued > 1) {
                /*
                 * More references on top of the one taken in
                 * dmu_zfetch_prepare().
                 */
                zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
        } else if (issued == 0) {
                /* Some other thread has done our work, so drop the ref. */
                if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
                        dmu_zfetch_stream_fini(zs);
                return;
        }
        aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

        if (!have_lock)
                rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

        issued = 0;
        for (int64_t blk = pf_start; blk < pf_end; blk++) {
                issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
                    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
        }
        for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
                issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
                    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
        }

        if (!have_lock)
                rw_exit(&zf->zf_dnode->dn_struct_rwlock);

        if (issued)
                ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock)
{
        zstream_t *zs;

        zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
        if (zs)
                dmu_zfetch_run(zf, zs, missed, have_lock);
}

ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
        "Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
        "Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
        "Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
        "Max time before stream delete");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
        "Min bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
        "Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
        "Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
        "Max request reorder distance within a stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
        "Max log2 fraction of holes in a stream");
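
/*
 * At runtime the parameters above surface as module parameters, e.g. on
 * Linux: echo 1 > /sys/module/zfs/parameters/zfs_prefetch_disable.
 */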