// SPDX-License-Identifier: CDDL-1.0
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/arc_impl.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * This tunable disables predictive prefetch. Note that it leaves "prescient"
 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch,
 * prescient prefetch never issues i/os that end up not being needed,
 * so it can't hurt performance.
 */

static int zfs_prefetch_disable = B_FALSE;

/* max # of streams per zfetch */
static unsigned int	zfetch_max_streams = 8;
/* min time before stream reclaim */
static unsigned int	zfetch_min_sec_reap = 1;
/* max time before stream delete */
static unsigned int	zfetch_max_sec_reap = 2;
#ifdef _ILP32
/* min bytes to prefetch per stream (default 2MB) */
static unsigned int	zfetch_min_distance = 2 * 1024 * 1024;
/* max bytes to prefetch per stream (default 8MB) */
unsigned int	zfetch_max_distance = 8 * 1024 * 1024;
#else
/* min bytes to prefetch per stream (default 4MB) */
static unsigned int	zfetch_min_distance = 4 * 1024 * 1024;
/* max bytes to prefetch per stream (default 64MB) */
unsigned int	zfetch_max_distance = 64 * 1024 * 1024;
#endif
/* max bytes to prefetch indirects for per stream (default 128MB) */
unsigned int	zfetch_max_idistance = 128 * 1024 * 1024;
/* max request reorder distance within a stream (default 16MB) */
unsigned int	zfetch_max_reorder = 16 * 1024 * 1024;
/* Max log2 fraction of holes in a stream */
unsigned int	zfetch_hole_shift = 2;

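/*
 * Note on observability: the zfetch_* knobs above are exported as module
 * parameters by the ZFS_MODULE_PARAM() declarations at the bottom of this
 * file, and the counters below are published through the "zfetchstats"
 * kstat (on Linux this typically appears as /proc/spl/kstat/zfs/zfetchstats).
 */
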
typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;
	kstat_named_t zfetchstat_future;
	kstat_named_t zfetchstat_stride;
	kstat_named_t zfetchstat_past;
	kstat_named_t zfetchstat_misses;
	kstat_named_t zfetchstat_max_streams;
	kstat_named_t zfetchstat_io_issued;
	kstat_named_t zfetchstat_io_active;
} zfetch_stats_t;

static zfetch_stats_t zfetch_stats = {
	{ "hits",		KSTAT_DATA_UINT64 },
	{ "future",		KSTAT_DATA_UINT64 },
	{ "stride",		KSTAT_DATA_UINT64 },
	{ "past",		KSTAT_DATA_UINT64 },
	{ "misses",		KSTAT_DATA_UINT64 },
	{ "max_streams",	KSTAT_DATA_UINT64 },
	{ "io_issued",		KSTAT_DATA_UINT64 },
	{ "io_active",		KSTAT_DATA_UINT64 },
};

struct {
	wmsum_t zfetchstat_hits;
	wmsum_t zfetchstat_future;
	wmsum_t zfetchstat_stride;
	wmsum_t zfetchstat_past;
	wmsum_t zfetchstat_misses;
	wmsum_t zfetchstat_max_streams;
	wmsum_t zfetchstat_io_issued;
	aggsum_t zfetchstat_io_active;
} zfetch_sums;

#define	ZFETCHSTAT_BUMP(stat)	\
	wmsum_add(&zfetch_sums.stat, 1)
#define	ZFETCHSTAT_ADD(stat, val)	\
	wmsum_add(&zfetch_sums.stat, val)


static kstat_t	*zfetch_ksp;

static int
zfetch_kstats_update(kstat_t *ksp, int rw)
{
	zfetch_stats_t *zs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	zs->zfetchstat_hits.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_hits);
	zs->zfetchstat_future.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_future);
	zs->zfetchstat_stride.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_stride);
	zs->zfetchstat_past.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_past);
	zs->zfetchstat_misses.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_misses);
	zs->zfetchstat_max_streams.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_max_streams);
	zs->zfetchstat_io_issued.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_io_issued);
	zs->zfetchstat_io_active.value.ui64 =
	    aggsum_value(&zfetch_sums.zfetchstat_io_active);
	return (0);
}

void
zfetch_init(void)
{
	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
	wmsum_init(&zfetch_sums.zfetchstat_future, 0);
	wmsum_init(&zfetch_sums.zfetchstat_stride, 0);
	wmsum_init(&zfetch_sums.zfetchstat_past, 0);
	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);
	aggsum_init(&zfetch_sums.zfetchstat_io_active, 0);

	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zfetch_ksp != NULL) {
		zfetch_ksp->ks_data = &zfetch_stats;
		zfetch_ksp->ks_update = zfetch_kstats_update;
		kstat_install(zfetch_ksp);
	}
}

void
zfetch_fini(void)
{
	if (zfetch_ksp != NULL) {
		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}

	wmsum_fini(&zfetch_sums.zfetchstat_hits);
	wmsum_fini(&zfetch_sums.zfetchstat_future);
	wmsum_fini(&zfetch_sums.zfetchstat_stride);
	wmsum_fini(&zfetch_sums.zfetchstat_past);
	wmsum_fini(&zfetch_sums.zfetchstat_misses);
	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
	ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active));
	aggsum_fini(&zfetch_sums.zfetchstat_io_active);
}

/*
 * This takes a pointer to a zfetch structure and a dnode. It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL)
		return;
	zf->zf_dnode = dno;
	zf->zf_numstreams = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zs_node));

	mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
}

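/*
 * Stream reference counting, as used below: zs_refs holds one reference for
 * the stream's membership on the zf_stream list, one for each caller between
 * dmu_zfetch_prepare() and dmu_zfetch_run(), and one for each outstanding
 * prefetch I/O (dropped in dmu_zfetch_done()); the stream is freed when the
 * last reference goes away.  zs_callers only counts concurrent callers that
 * have prepared but not yet run the prefetch, so that the last of them
 * issues it for all.
 */
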
static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
	ASSERT(!list_link_active(&zs->zs_node));
	zfs_refcount_destroy(&zs->zs_callers);
	zfs_refcount_destroy(&zs->zs_refs);
	kmem_free(zs, sizeof (*zs));
}

static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(MUTEX_HELD(&zf->zf_lock));
	list_remove(&zf->zf_stream, zs);
	zf->zf_numstreams--;
	membar_producer();
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}

/*
 * Clean-up state associated with a zfetch structure (e.g. destroy the
 * streams). This doesn't free the zfetch_t itself, that's left to the caller.
 */
void
dmu_zfetch_fini(zfetch_t *zf)
{
	zstream_t *zs;

	mutex_enter(&zf->zf_lock);
	while ((zs = list_head(&zf->zf_stream)) != NULL)
		dmu_zfetch_stream_remove(zf, zs);
	mutex_exit(&zf->zf_lock);
	list_destroy(&zf->zf_stream);
	mutex_destroy(&zf->zf_lock);

	zf->zf_dnode = NULL;
}

/*
 * If there aren't too many active streams already, create one more.
 * In the process, delete/reuse all streams without hits for
 * zfetch_max_sec_reap.  If needed, reuse the oldest stream that has had
 * no hits for zfetch_min_sec_reap, or no hits at all.
 * The "blkid" argument is the next block that we expect this stream to access.
 */
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
	zstream_t *zs, *zs_next, *zs_old = NULL;
	uint_t now = gethrestime_sec(), t;

	ASSERT(MUTEX_HELD(&zf->zf_lock));

	/*
	 * Delete too old streams, reusing the first found one.
	 */
	t = now - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);
		/*
		 * Skip if still active.  1 -- zf_stream reference.
		 */
		if ((int)(zs->zs_atime - t) >= 0)
			continue;
		if (zfs_refcount_count(&zs->zs_refs) != 1)
			continue;
		if (zs_old)
			dmu_zfetch_stream_remove(zf, zs);
		else
			zs_old = zs;
	}
	if (zs_old) {
		zs = zs_old;
		list_remove(&zf->zf_stream, zs);
		goto reuse;
	}

	/*
	 * The maximum number of streams is normally zfetch_max_streams,
	 * but for small files we lower it such that it's at least possible
	 * for all the streams to be non-overlapping.
	 */
	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
	    (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) /
	    zfetch_max_distance));
	if (zf->zf_numstreams >= max_streams) {
		t = now - zfetch_min_sec_reap;
		for (zs = list_head(&zf->zf_stream); zs != NULL;
		    zs = list_next(&zf->zf_stream, zs)) {
			if ((int)(zs->zs_atime - t) >= 0)
				continue;
			if (zfs_refcount_count(&zs->zs_refs) != 1)
				continue;
			if (zs_old == NULL ||
			    (int)(zs_old->zs_atime - zs->zs_atime) >= 0)
				zs_old = zs;
		}
		if (zs_old) {
			zs = zs_old;
			list_remove(&zf->zf_stream, zs);
			goto reuse;
		}
		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
		return;
	}

	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
	zfs_refcount_create(&zs->zs_callers);
	zfs_refcount_create(&zs->zs_refs);
	/* One reference for zf_stream. */
	zfs_refcount_add(&zs->zs_refs, NULL);
	zf->zf_numstreams++;

reuse:
	list_insert_head(&zf->zf_stream, zs);
	zs->zs_blkid = blkid;
	/* Allow immediate stream reuse until first hit. */
	zs->zs_atime = now - zfetch_min_sec_reap;
	memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges));
	zs->zs_pf_dist = 0;
	zs->zs_ipf_dist = 0;
	zs->zs_pf_start = blkid;
	zs->zs_pf_end = blkid;
	zs->zs_ipf_start = blkid;
	zs->zs_ipf_end = blkid;
	zs->zs_missed = B_FALSE;
	zs->zs_more = B_FALSE;
}

static void
dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
{
	zstream_t *zs = arg;

	if (io_issued && level == 0 && blkid < zs->zs_blkid)
		zs->zs_more = B_TRUE;
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
	aggsum_add(&zfetch_sums.zfetchstat_io_active, -1);
}

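/*
 * Each stream remembers up to ZFETCH_RANGES reordered ("future") accesses
 * as block ranges relative to zs_blkid.  For example (with zs_blkid == 100),
 * an access to blocks 103-104 that arrives early is stored as the range
 * [3, 5); when the access to blocks 100-102 finally comes in,
 * dmu_zfetch_hit() merges the two and advances zs_blkid to 105 in one step.
 */
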
/*
 * Process stream hit access for nblks blocks starting at zs_blkid.  Return
 * the number of blocks to proceed with after aggregation with future ranges.
 */
static uint64_t
dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
{
	uint_t i, j;

	/* Optimize sequential accesses (no future ranges). */
	if (zs->zs_ranges[0].start == 0)
		goto done;

	/* Look for intersections with further ranges. */
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > nblks)
			break;
		if (r->end >= nblks) {
			nblks = r->end;
			i++;
			break;
		}
	}

	/* Delete all found intersecting ranges, updating the remaining. */
	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
		if (zs->zs_ranges[i].start == 0)
			break;
		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
	}
	if (j < ZFETCH_RANGES) {
		zs->zs_ranges[j].start = 0;
		zs->zs_ranges[j].end = 0;
	}

done:
	zs->zs_blkid += nblks;
	return (nblks);
}

/*
 * Process future stream access for nblks blocks starting at blkid.  Return
 * the number of blocks to proceed with if the future ranges reach the fill
 * threshold.
 */
static uint64_t
dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
{
	ASSERT3U(blkid, >, zs->zs_blkid);
	blkid -= zs->zs_blkid;
	ASSERT3U(blkid + nblks, <=, UINT16_MAX);

	/* Search for first and last intersection or insert point. */
	uint_t f = ZFETCH_RANGES, l = 0, i;
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > blkid + nblks)
			break;
		if (r->end < blkid)
			continue;
		if (f > i)
			f = i;
		if (l < i)
			l = i;
	}
	if (f <= l) {
		/* Got some intersecting range, expand it if needed. */
		if (zs->zs_ranges[f].start > blkid)
			zs->zs_ranges[f].start = blkid;
		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
		if (f < l) {
			/* Got more than one intersection, remove others. */
			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
			}
			zs->zs_ranges[f].start = 0;
			zs->zs_ranges[f].end = 0;
		}
	} else if (i < ZFETCH_RANGES) {
		/* Got no intersecting ranges, insert a new one. */
		for (l = ZFETCH_RANGES - 1; l > i; l--) {
			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
		}
		zs->zs_ranges[i].start = blkid;
		zs->zs_ranges[i].end = blkid + nblks;
	} else {
		/* No space left to insert.  Drop the range. */
		return (0);
	}

	/* Check if with the new access addition we reached fill threshold. */
	if (zfetch_hole_shift >= 16)
		return (0);
	uint_t hole = 0;
	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0)
			break;
		hole += r->start - f;
		f = r->end;
		if (hole <= r->end >> zfetch_hole_shift)
			l = r->end;
	}
	if (l > 0)
		return (dmu_zfetch_hit(zs, l));

	return (0);
}

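/*
 * Note on the fill threshold above: with the default zfetch_hole_shift of 2,
 * the stream position is advanced over the remembered ranges as long as the
 * holes between them add up to no more than 1/4 of the total distance
 * advanced.
 */
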
/*
 * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
 * associates the dnode access specified by the blkid and nblks arguments
 * with a prefetch stream, predicts further accesses based on that stream's
 * statistics, and returns the stream pointer on success.  That pointer must
 * later be passed to dmu_zfetch_run() to initiate the speculative prefetch
 * for the stream and release it.  dmu_zfetch() is a wrapper for simple cases
 * when a window between prediction and prefetch initiation is not needed.
 * The fetch_data argument specifies whether actual data blocks should be
 * fetched:
 *   FALSE -- prefetch only indirect blocks for predicted data blocks;
 *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
 */
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
	zstream_t *zs;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
	zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch;
	int64_t ipf_start, ipf_end;

	if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE)
		return (NULL);

	if (os_prefetch == ZFS_PREFETCH_METADATA)
		fetch_data = B_FALSE;

	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return (NULL);

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (!have_lock && blkid == 0)
		return (NULL);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * A fast path for small files for which no prefetch will
	 * happen.
	 */
	uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
	if (maxblkid < 2) {
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}
	mutex_enter(&zf->zf_lock);

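	/*
	 * From here on we try, in order: an exact or adjacent stream match,
	 * a "close enough" reordered match within zfetch_max_reorder, and
	 * finally a miss, which may create a new stream.
	 */
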
	/*
	 * Find a perfect prefetch stream.  Depending on whether the accesses
	 * are block-aligned, the first block of the new access may either
	 * follow the last block of the previous access, or be equal to it.
	 */
	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
	uint64_t end_blkid = blkid + nblks;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			goto hit;
		} else if (blkid + 1 == zs->zs_blkid) {
			blkid++;
			nblks--;
			goto hit;
		}
	}

	/*
	 * Find a close enough prefetch stream.  An access crossing the stream
	 * position is a hit in its new part.  An access ahead of the stream
	 * position is considered a hit for metadata prefetch, since we do not
	 * care about fill percent, or is stored for the future otherwise.  An
	 * access behind the stream position is silently ignored, since we
	 * already skipped it when reaching the fill percent.
	 */
	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid > zs->zs_blkid) {
			if (end_blkid <= zs->zs_blkid + max_reorder) {
				if (!fetch_data) {
					nblks = dmu_zfetch_hit(zs,
					    end_blkid - zs->zs_blkid);
					ZFETCHSTAT_BUMP(zfetchstat_stride);
					goto future;
				}
				nblks = dmu_zfetch_future(zs, blkid, nblks);
				if (nblks > 0)
					ZFETCHSTAT_BUMP(zfetchstat_stride);
				else
					ZFETCHSTAT_BUMP(zfetchstat_future);
				goto future;
			}
		} else if (end_blkid >= zs->zs_blkid) {
			nblks -= zs->zs_blkid - blkid;
			blkid += zs->zs_blkid - blkid;
			goto hit;
		} else if (end_blkid + max_reorder > zs->zs_blkid &&
		    (int)(zs->zs_atime - t) >= 0) {
			ZFETCHSTAT_BUMP(zfetchstat_past);
			zs->zs_atime = gethrestime_sec();
			goto out;
		}
	}

	/*
	 * This access is not part of any existing stream.  Create a new
	 * stream for it unless we are at the end of file.
	 */
	ASSERT0P(zs);
	if (end_blkid < maxblkid)
		dmu_zfetch_stream_create(zf, end_blkid);
	mutex_exit(&zf->zf_lock);
	ZFETCHSTAT_BUMP(zfetchstat_misses);
	ipf_start = 0;
	goto prescient;

hit:
	nblks = dmu_zfetch_hit(zs, nblks);
	ZFETCHSTAT_BUMP(zfetchstat_hits);

future:
	zs->zs_atime = gethrestime_sec();

	/* Exit if we already prefetched for this position before. */
	if (nblks == 0)
		goto out;

	/* If the file is ending, remove the stream. */
	end_blkid = zs->zs_blkid;
	if (end_blkid >= maxblkid) {
		dmu_zfetch_stream_remove(zf, zs);
out:
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream.  Calculate further prefetch distances.
	 *
	 * Start prefetch from the demand access size (nblks).  Double the
	 * distance every access up to zfetch_min_distance.  After that,
	 * increase the distance by 1/8 only if needed, up to
	 * zfetch_max_distance.
	 *
	 * Don't double the distance beyond a single block if we have more
	 * than ~6% of ARC held by active prefetches.  It should help with
	 * getting out of RAM on some badly mispredicted read patterns.
	 */
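	/*
	 * Rough example of the growth policy above, assuming 128KB data
	 * blocks and the default 64-bit limits: the prefetch distance starts
	 * at the demand size, doubles on every hit until it reaches
	 * zfetch_min_distance (4MB, i.e. 32 blocks), and from then on grows
	 * by another 1/8 per hit only when prefetch reads are observed to
	 * complete behind the demand position (zs_more), capped at
	 * zfetch_max_distance.
	 */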
	unsigned int nbytes = nblks << dbs;
	unsigned int pf_nblks;
	if (fetch_data) {
		if (unlikely(zs->zs_pf_dist < nbytes))
			zs->zs_pf_dist = nbytes;
		else if (zs->zs_pf_dist < zfetch_min_distance &&
		    (zs->zs_pf_dist < (1 << dbs) ||
		    aggsum_compare(&zfetch_sums.zfetchstat_io_active,
		    arc_c_max >> (4 + dbs)) < 0))
			zs->zs_pf_dist *= 2;
		else if (zs->zs_more)
			zs->zs_pf_dist += zs->zs_pf_dist / 8;
		zs->zs_more = B_FALSE;
		if (zs->zs_pf_dist > zfetch_max_distance)
			zs->zs_pf_dist = zfetch_max_distance;
		pf_nblks = zs->zs_pf_dist >> dbs;
	} else {
		pf_nblks = 0;
	}
	if (zs->zs_pf_start < end_blkid)
		zs->zs_pf_start = end_blkid;
	if (zs->zs_pf_end < end_blkid + pf_nblks)
		zs->zs_pf_end = end_blkid + pf_nblks;

	/*
	 * Do the same for indirects, starting where we will stop reading
	 * data blocks (and the indirects that point to them).
	 */
	if (unlikely(zs->zs_ipf_dist < nbytes))
		zs->zs_ipf_dist = nbytes;
	else
		zs->zs_ipf_dist *= 2;
	if (zs->zs_ipf_dist > zfetch_max_idistance)
		zs->zs_ipf_dist = zfetch_max_idistance;
	pf_nblks = zs->zs_ipf_dist >> dbs;
	if (zs->zs_ipf_start < zs->zs_pf_end)
		zs->zs_ipf_start = zs->zs_pf_end;
	ipf_start = zs->zs_ipf_end;
	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);
	mutex_exit(&zf->zf_lock);

prescient:
	/*
	 * Prefetch the following indirect blocks for this access to reduce
	 * dbuf_hold() sync read delays in dmu_buf_hold_array_by_dnode().
	 * This covers the gap during the first couple of accesses when we
	 * can not predict the future yet, but know what is needed right now.
	 * It should be very rare for reads/writes to need more than one
	 * indirect, but this is more useful for cloning due to its much
	 * bigger accesses.
	 */
	ipf_start = MAX(ipf_start, blkid + 1);
	int epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(end_blkid, 1 << epbs) >> epbs;

	int issued = 0;
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_SYNC_READ, ARC_FLAG_PRESCIENT_PREFETCH);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
	return (zs);
}

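/*
 * Rough usage sketch of the two-phase interface above (see e.g.
 * dmu_buf_hold_array_by_dnode() for a real caller), assuming the caller
 * already holds dn_struct_rwlock (have_lock == B_TRUE):
 *
 *	zstream_t *zs = dmu_zfetch_prepare(zf, blkid, nblks, read, B_TRUE);
 *	... hold and issue reads for the demand dbufs ...
 *	if (zs != NULL)
 *		dmu_zfetch_run(zf, zs, missed, B_TRUE, B_FALSE);
 */
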
void
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
    boolean_t have_lock, boolean_t uncached)
{
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

	if (missed)
		zs->zs_missed = missed;

	/*
	 * Postpone the prefetch if there are more concurrent callers.
	 * It happens when multiple requests are waiting for the same
	 * indirect block.  The last one will run the prefetch for all.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
		/* Drop reference taken in dmu_zfetch_prepare(). */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	mutex_enter(&zf->zf_lock);
	if (zs->zs_missed) {
		pf_start = zs->zs_pf_start;
		pf_end = zs->zs_pf_start = zs->zs_pf_end;
	} else {
		pf_start = pf_end = 0;
	}
	ipf_start = zs->zs_ipf_start;
	ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
	mutex_exit(&zf->zf_lock);
	ASSERT3S(pf_start, <=, pf_end);
	ASSERT3S(ipf_start, <=, ipf_end);

	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
	ASSERT3S(ipf_start, <=, ipf_end);
	issued = pf_end - pf_start + ipf_end - ipf_start;
	if (issued > 1) {
		/* More references on top of taken in dmu_zfetch_prepare(). */
		zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}
	aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, uncached ?
		    ARC_FLAG_UNCACHED : 0, dmu_zfetch_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock, boolean_t uncached)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zf, zs, missed, have_lock, uncached);
}

ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
	"Max time before stream delete");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
	"Min bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
	"Max request reorder distance within a stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
	"Max log2 fraction of holes in a stream");