1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or https://opensource.org/licenses/CDDL-1.0. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2013, 2017 by Delphix. All rights reserved. 28 */ 29 30 #include <sys/zfs_context.h> 31 #include <sys/arc_impl.h> 32 #include <sys/dnode.h> 33 #include <sys/dmu_objset.h> 34 #include <sys/dmu_zfetch.h> 35 #include <sys/dmu.h> 36 #include <sys/dbuf.h> 37 #include <sys/kstat.h> 38 #include <sys/wmsum.h> 39 40 /* 41 * This tunable disables predictive prefetch. Note that it leaves "prescient" 42 * prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, 43 * prescient prefetch never issues i/os that end up not being needed, 44 * so it can't hurt performance. 
45 */ 46 47 static int zfs_prefetch_disable = B_FALSE; 48 49 /* max # of streams per zfetch */ 50 static unsigned int zfetch_max_streams = 8; 51 /* min time before stream reclaim */ 52 static unsigned int zfetch_min_sec_reap = 1; 53 /* max time before stream delete */ 54 static unsigned int zfetch_max_sec_reap = 2; 55 #ifdef _ILP32 56 /* min bytes to prefetch per stream (default 2MB) */ 57 static unsigned int zfetch_min_distance = 2 * 1024 * 1024; 58 /* max bytes to prefetch per stream (default 8MB) */ 59 unsigned int zfetch_max_distance = 8 * 1024 * 1024; 60 #else 61 /* min bytes to prefetch per stream (default 4MB) */ 62 static unsigned int zfetch_min_distance = 4 * 1024 * 1024; 63 /* max bytes to prefetch per stream (default 64MB) */ 64 unsigned int zfetch_max_distance = 64 * 1024 * 1024; 65 #endif 66 /* max bytes to prefetch indirects for per stream (default 64MB) */ 67 unsigned int zfetch_max_idistance = 64 * 1024 * 1024; 68 /* max request reorder distance within a stream (default 16MB) */ 69 unsigned int zfetch_max_reorder = 16 * 1024 * 1024; 70 /* Max log2 fraction of holes in a stream */ 71 unsigned int zfetch_hole_shift = 2; 72 73 typedef struct zfetch_stats { 74 kstat_named_t zfetchstat_hits; 75 kstat_named_t zfetchstat_future; 76 kstat_named_t zfetchstat_stride; 77 kstat_named_t zfetchstat_past; 78 kstat_named_t zfetchstat_misses; 79 kstat_named_t zfetchstat_max_streams; 80 kstat_named_t zfetchstat_io_issued; 81 kstat_named_t zfetchstat_io_active; 82 } zfetch_stats_t; 83 84 static zfetch_stats_t zfetch_stats = { 85 { "hits", KSTAT_DATA_UINT64 }, 86 { "future", KSTAT_DATA_UINT64 }, 87 { "stride", KSTAT_DATA_UINT64 }, 88 { "past", KSTAT_DATA_UINT64 }, 89 { "misses", KSTAT_DATA_UINT64 }, 90 { "max_streams", KSTAT_DATA_UINT64 }, 91 { "io_issued", KSTAT_DATA_UINT64 }, 92 { "io_active", KSTAT_DATA_UINT64 }, 93 }; 94 95 struct { 96 wmsum_t zfetchstat_hits; 97 wmsum_t zfetchstat_future; 98 wmsum_t zfetchstat_stride; 99 wmsum_t zfetchstat_past; 100 wmsum_t 
zfetchstat_misses; 101 wmsum_t zfetchstat_max_streams; 102 wmsum_t zfetchstat_io_issued; 103 aggsum_t zfetchstat_io_active; 104 } zfetch_sums; 105 106 #define ZFETCHSTAT_BUMP(stat) \ 107 wmsum_add(&zfetch_sums.stat, 1) 108 #define ZFETCHSTAT_ADD(stat, val) \ 109 wmsum_add(&zfetch_sums.stat, val) 110 111 112 static kstat_t *zfetch_ksp; 113 114 static int 115 zfetch_kstats_update(kstat_t *ksp, int rw) 116 { 117 zfetch_stats_t *zs = ksp->ks_data; 118 119 if (rw == KSTAT_WRITE) 120 return (EACCES); 121 zs->zfetchstat_hits.value.ui64 = 122 wmsum_value(&zfetch_sums.zfetchstat_hits); 123 zs->zfetchstat_future.value.ui64 = 124 wmsum_value(&zfetch_sums.zfetchstat_future); 125 zs->zfetchstat_stride.value.ui64 = 126 wmsum_value(&zfetch_sums.zfetchstat_stride); 127 zs->zfetchstat_past.value.ui64 = 128 wmsum_value(&zfetch_sums.zfetchstat_past); 129 zs->zfetchstat_misses.value.ui64 = 130 wmsum_value(&zfetch_sums.zfetchstat_misses); 131 zs->zfetchstat_max_streams.value.ui64 = 132 wmsum_value(&zfetch_sums.zfetchstat_max_streams); 133 zs->zfetchstat_io_issued.value.ui64 = 134 wmsum_value(&zfetch_sums.zfetchstat_io_issued); 135 zs->zfetchstat_io_active.value.ui64 = 136 aggsum_value(&zfetch_sums.zfetchstat_io_active); 137 return (0); 138 } 139 140 void 141 zfetch_init(void) 142 { 143 wmsum_init(&zfetch_sums.zfetchstat_hits, 0); 144 wmsum_init(&zfetch_sums.zfetchstat_future, 0); 145 wmsum_init(&zfetch_sums.zfetchstat_stride, 0); 146 wmsum_init(&zfetch_sums.zfetchstat_past, 0); 147 wmsum_init(&zfetch_sums.zfetchstat_misses, 0); 148 wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0); 149 wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0); 150 aggsum_init(&zfetch_sums.zfetchstat_io_active, 0); 151 152 zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", 153 KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), 154 KSTAT_FLAG_VIRTUAL); 155 156 if (zfetch_ksp != NULL) { 157 zfetch_ksp->ks_data = &zfetch_stats; 158 zfetch_ksp->ks_update = zfetch_kstats_update; 159 
kstat_install(zfetch_ksp); 160 } 161 } 162 163 void 164 zfetch_fini(void) 165 { 166 if (zfetch_ksp != NULL) { 167 kstat_delete(zfetch_ksp); 168 zfetch_ksp = NULL; 169 } 170 171 wmsum_fini(&zfetch_sums.zfetchstat_hits); 172 wmsum_fini(&zfetch_sums.zfetchstat_future); 173 wmsum_fini(&zfetch_sums.zfetchstat_stride); 174 wmsum_fini(&zfetch_sums.zfetchstat_past); 175 wmsum_fini(&zfetch_sums.zfetchstat_misses); 176 wmsum_fini(&zfetch_sums.zfetchstat_max_streams); 177 wmsum_fini(&zfetch_sums.zfetchstat_io_issued); 178 ASSERT0(aggsum_value(&zfetch_sums.zfetchstat_io_active)); 179 aggsum_fini(&zfetch_sums.zfetchstat_io_active); 180 } 181 182 /* 183 * This takes a pointer to a zfetch structure and a dnode. It performs the 184 * necessary setup for the zfetch structure, grokking data from the 185 * associated dnode. 186 */ 187 void 188 dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) 189 { 190 if (zf == NULL) 191 return; 192 zf->zf_dnode = dno; 193 zf->zf_numstreams = 0; 194 195 list_create(&zf->zf_stream, sizeof (zstream_t), 196 offsetof(zstream_t, zs_node)); 197 198 mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); 199 } 200 201 static void 202 dmu_zfetch_stream_fini(zstream_t *zs) 203 { 204 ASSERT(!list_link_active(&zs->zs_node)); 205 zfs_refcount_destroy(&zs->zs_callers); 206 zfs_refcount_destroy(&zs->zs_refs); 207 kmem_free(zs, sizeof (*zs)); 208 } 209 210 static void 211 dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) 212 { 213 ASSERT(MUTEX_HELD(&zf->zf_lock)); 214 list_remove(&zf->zf_stream, zs); 215 zf->zf_numstreams--; 216 membar_producer(); 217 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 218 dmu_zfetch_stream_fini(zs); 219 } 220 221 /* 222 * Clean-up state associated with a zfetch structure (e.g. destroy the 223 * streams). This doesn't free the zfetch_t itself, that's left to the caller. 
224 */ 225 void 226 dmu_zfetch_fini(zfetch_t *zf) 227 { 228 zstream_t *zs; 229 230 mutex_enter(&zf->zf_lock); 231 while ((zs = list_head(&zf->zf_stream)) != NULL) 232 dmu_zfetch_stream_remove(zf, zs); 233 mutex_exit(&zf->zf_lock); 234 list_destroy(&zf->zf_stream); 235 mutex_destroy(&zf->zf_lock); 236 237 zf->zf_dnode = NULL; 238 } 239 240 /* 241 * If there aren't too many active streams already, create one more. 242 * In process delete/reuse all streams without hits for zfetch_max_sec_reap. 243 * If needed, reuse oldest stream without hits for zfetch_min_sec_reap or ever. 244 * The "blkid" argument is the next block that we expect this stream to access. 245 */ 246 static void 247 dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) 248 { 249 zstream_t *zs, *zs_next, *zs_old = NULL; 250 uint_t now = gethrestime_sec(), t; 251 252 ASSERT(MUTEX_HELD(&zf->zf_lock)); 253 254 /* 255 * Delete too old streams, reusing the first found one. 256 */ 257 t = now - zfetch_max_sec_reap; 258 for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) { 259 zs_next = list_next(&zf->zf_stream, zs); 260 /* 261 * Skip if still active. 1 -- zf_stream reference. 262 */ 263 if ((int)(zs->zs_atime - t) >= 0) 264 continue; 265 if (zfs_refcount_count(&zs->zs_refs) != 1) 266 continue; 267 if (zs_old) 268 dmu_zfetch_stream_remove(zf, zs); 269 else 270 zs_old = zs; 271 } 272 if (zs_old) { 273 zs = zs_old; 274 list_remove(&zf->zf_stream, zs); 275 goto reuse; 276 } 277 278 /* 279 * The maximum number of streams is normally zfetch_max_streams, 280 * but for small files we lower it such that it's at least possible 281 * for all the streams to be non-overlapping. 
282 */ 283 uint32_t max_streams = MAX(1, MIN(zfetch_max_streams, 284 (zf->zf_dnode->dn_maxblkid << zf->zf_dnode->dn_datablkshift) / 285 zfetch_max_distance)); 286 if (zf->zf_numstreams >= max_streams) { 287 t = now - zfetch_min_sec_reap; 288 for (zs = list_head(&zf->zf_stream); zs != NULL; 289 zs = list_next(&zf->zf_stream, zs)) { 290 if ((int)(zs->zs_atime - t) >= 0) 291 continue; 292 if (zfs_refcount_count(&zs->zs_refs) != 1) 293 continue; 294 if (zs_old == NULL || 295 (int)(zs_old->zs_atime - zs->zs_atime) >= 0) 296 zs_old = zs; 297 } 298 if (zs_old) { 299 zs = zs_old; 300 list_remove(&zf->zf_stream, zs); 301 goto reuse; 302 } 303 ZFETCHSTAT_BUMP(zfetchstat_max_streams); 304 return; 305 } 306 307 zs = kmem_zalloc(sizeof (*zs), KM_SLEEP); 308 zfs_refcount_create(&zs->zs_callers); 309 zfs_refcount_create(&zs->zs_refs); 310 /* One reference for zf_stream. */ 311 zfs_refcount_add(&zs->zs_refs, NULL); 312 zf->zf_numstreams++; 313 314 reuse: 315 list_insert_head(&zf->zf_stream, zs); 316 zs->zs_blkid = blkid; 317 /* Allow immediate stream reuse until first hit. */ 318 zs->zs_atime = now - zfetch_min_sec_reap; 319 memset(zs->zs_ranges, 0, sizeof (zs->zs_ranges)); 320 zs->zs_pf_dist = 0; 321 zs->zs_ipf_dist = 0; 322 zs->zs_pf_start = blkid; 323 zs->zs_pf_end = blkid; 324 zs->zs_ipf_start = blkid; 325 zs->zs_ipf_end = blkid; 326 zs->zs_missed = B_FALSE; 327 zs->zs_more = B_FALSE; 328 } 329 330 static void 331 dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued) 332 { 333 zstream_t *zs = arg; 334 335 if (io_issued && level == 0 && blkid < zs->zs_blkid) 336 zs->zs_more = B_TRUE; 337 if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0) 338 dmu_zfetch_stream_fini(zs); 339 aggsum_add(&zfetch_sums.zfetchstat_io_active, -1); 340 } 341 342 /* 343 * Process stream hit access for nblks blocks starting at zs_blkid. Return 344 * number of blocks to proceed for after aggregation with future ranges. 
*/
/*
 * Notes on the representation used here and in dmu_zfetch_future():
 * zs_ranges[] holds "future" ranges as [start, end) block offsets relative
 * to zs_blkid, in ascending order.  An entry with start == 0 terminates the
 * list (a new stream has the whole array zeroed).  Because the offsets are
 * relative, every entry that survives a hit is shifted down by the number of
 * blocks the stream position advances.
 */
static uint64_t
dmu_zfetch_hit(zstream_t *zs, uint64_t nblks)
{
	uint_t i, j;

	/* Optimize sequential accesses (no future ranges). */
	if (zs->zs_ranges[0].start == 0)
		goto done;

	/*
	 * Look for intersections with further ranges.  Ranges ending before
	 * nblks are simply skipped over (absorbed).  The first range that
	 * reaches or crosses nblks extends the hit to its end and is
	 * consumed as well.  On exit, i indexes the first range to keep.
	 */
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > nblks)
			break;
		if (r->end >= nblks) {
			nblks = r->end;
			i++;
			break;
		}
	}

	/*
	 * Delete all found intersecting ranges, updates remaining: move the
	 * kept ranges to the front of the array, rebasing them onto the new
	 * stream position (old position + nblks).
	 */
	for (j = 0; i < ZFETCH_RANGES; i++, j++) {
		if (zs->zs_ranges[i].start == 0)
			break;
		ASSERT3U(zs->zs_ranges[i].start, >, nblks);
		ASSERT3U(zs->zs_ranges[i].end, >, nblks);
		zs->zs_ranges[j].start = zs->zs_ranges[i].start - nblks;
		zs->zs_ranges[j].end = zs->zs_ranges[i].end - nblks;
	}
	/* Re-terminate the list after the last kept range. */
	if (j < ZFETCH_RANGES) {
		zs->zs_ranges[j].start = 0;
		zs->zs_ranges[j].end = 0;
	}

done:
	/* Advance the stream position past everything that was consumed. */
	zs->zs_blkid += nblks;
	return (nblks);
}

/*
 * Process future stream access for nblks blocks starting at blkid.  Return
 * number of blocks to proceed for if future ranges reach fill threshold.
 *
 * The access must lie strictly ahead of the stream position, and its end,
 * taken relative to zs_blkid, must not exceed UINT16_MAX (both asserted).
 * The access is merged into zs_ranges[]; if the array is full and the access
 * touches no existing range, it is dropped and 0 is returned.  A non-zero
 * return means the stream position was advanced by that many blocks via
 * dmu_zfetch_hit().
 */
static uint64_t
dmu_zfetch_future(zstream_t *zs, uint64_t blkid, uint64_t nblks)
{
	ASSERT3U(blkid, >, zs->zs_blkid);
	/* From here on blkid is an offset relative to the stream position. */
	blkid -= zs->zs_blkid;
	ASSERT3U(blkid + nblks, <=, UINT16_MAX);

	/*
	 * Search for first and last intersection or insert point.
	 * f/l end up as the indexes of the first/last existing range that
	 * overlaps or directly touches [blkid, blkid + nblks); f > l means
	 * there is none, in which case i is the slot to insert at.
	 */
	uint_t f = ZFETCH_RANGES, l = 0, i;
	for (i = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0 || r->start > blkid + nblks)
			break;
		if (r->end < blkid)
			continue;
		if (f > i)
			f = i;
		if (l < i)
			l = i;
	}
	if (f <= l) {
		/* Got some intersecting range, expand it if needed. */
		if (zs->zs_ranges[f].start > blkid)
			zs->zs_ranges[f].start = blkid;
		zs->zs_ranges[f].end = MAX(zs->zs_ranges[l].end, blkid + nblks);
		if (f < l) {
			/*
			 * Got more than one intersection, remove others:
			 * slide everything after range l down to follow
			 * range f, then terminate the shortened list.
			 */
			for (f++, l++; l < ZFETCH_RANGES; f++, l++) {
				zs->zs_ranges[f].start = zs->zs_ranges[l].start;
				zs->zs_ranges[f].end = zs->zs_ranges[l].end;
			}
			zs->zs_ranges[f].start = 0;
			zs->zs_ranges[f].end = 0;
		}
	} else if (i < ZFETCH_RANGES) {
		/*
		 * Got no intersecting ranges, insert new one.  Entries from
		 * slot i onwards move up by one; the last slot's previous
		 * content is overwritten.
		 */
		for (l = ZFETCH_RANGES - 1; l > i; l--) {
			zs->zs_ranges[l].start = zs->zs_ranges[l - 1].start;
			zs->zs_ranges[l].end = zs->zs_ranges[l - 1].end;
		}
		zs->zs_ranges[i].start = blkid;
		zs->zs_ranges[i].end = blkid + nblks;
	} else {
		/* No space left to insert.  Drop the range. */
		return (0);
	}

	/*
	 * Check if with the new access addition we reached fill threshold.
	 * Walking the ranges in order, "hole" accumulates the number of
	 * not-yet-accessed blocks between the stream position and the end
	 * of the current range.  l records the furthest range end for which
	 * the holes are at most end >> zfetch_hole_shift blocks.
	 * A shift of 16 or more disables the check.
	 */
	if (zfetch_hole_shift >= 16)
		return (0);
	uint_t hole = 0;
	for (i = f = l = 0; i < ZFETCH_RANGES; i++) {
		zsrange_t *r = &zs->zs_ranges[i];
		if (r->start == 0)
			break;
		hole += r->start - f;
		f = r->end;
		if (hole <= r->end >> zfetch_hole_shift)
			l = r->end;
	}
	/* Dense enough: treat everything up to l as a hit on the stream. */
	if (l > 0)
		return (dmu_zfetch_hit(zs, l));

	return (0);
}

/*
 * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
 * associates dnode access specified with blkid and nblks arguments with
 * prefetch stream, predicts further accesses based on that stats and returns
 * the stream pointer on success.  That pointer must later be passed to
 * dmu_zfetch_run() to initiate the speculative prefetch for the stream and
 * release it.  dmu_zfetch() is a wrapper for simple cases when window between
 * prediction and prefetch initiation is not needed.
*/
/*
 * Arguments of dmu_zfetch_prepare():
 *
 * fetch_data specifies whether actual data blocks should be fetched:
 *	FALSE -- prefetch only indirect blocks for predicted data blocks;
 *	TRUE -- prefetch predicted data blocks plus following indirect blocks.
 * It is forced to FALSE when the objset's prefetch mode is
 * ZFS_PREFETCH_METADATA.
 *
 * have_lock tells whether the caller already holds the dnode's
 * dn_struct_rwlock; if not, it is taken here as reader for the duration of
 * the call.
 *
 * Returns NULL when there is nothing to prefetch (prefetch disabled, no
 * matching stream, position already covered, end of file reached, ...).
 * Otherwise returns the stream with one zs_refs and one zs_callers reference
 * added on behalf of the caller.
 */
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
	zstream_t *zs;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
	zfs_prefetch_type_t os_prefetch = zf->zf_dnode->dn_objset->os_prefetch;

	if (zfs_prefetch_disable || os_prefetch == ZFS_PREFETCH_NONE)
		return (NULL);

	if (os_prefetch == ZFS_PREFETCH_METADATA)
		fetch_data = B_FALSE;

	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return (NULL);

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (!have_lock && blkid == 0)
		return (NULL);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * A fast path for small files for which no prefetch will
	 * happen.
	 */
	uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
	if (maxblkid < 2) {
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}
	mutex_enter(&zf->zf_lock);

	/*
	 * Find perfect prefetch stream.  Depending on whether the accesses
	 * are block-aligned, first block of the new access may either follow
	 * the last block of the previous access, or be equal to it.
	 */
	unsigned int dbs = zf->zf_dnode->dn_datablkshift;
	uint64_t end_blkid = blkid + nblks;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			goto hit;
		} else if (blkid + 1 == zs->zs_blkid) {
			/* First block was already counted; skip it. */
			blkid++;
			nblks--;
			goto hit;
		}
	}

	/*
	 * Find close enough prefetch stream.  Access crossing stream position
	 * is a hit in its new part.  Access ahead of stream position considered
	 * a hit for metadata prefetch, since we do not care about fill percent,
	 * or stored for future otherwise.  Access behind stream position is
	 * silently ignored, since we already skipped it reaching fill percent.
	 *
	 * max_reorder is the reorder window in blocks, capped at UINT16_MAX.
	 */
	uint_t max_reorder = MIN((zfetch_max_reorder >> dbs) + 1, UINT16_MAX);
	uint_t t = gethrestime_sec() - zfetch_max_sec_reap;
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid > zs->zs_blkid) {
			/* Access starts ahead of the stream position. */
			if (end_blkid <= zs->zs_blkid + max_reorder) {
				if (!fetch_data) {
					nblks = dmu_zfetch_hit(zs,
					    end_blkid - zs->zs_blkid);
					ZFETCHSTAT_BUMP(zfetchstat_stride);
					goto future;
				}
				nblks = dmu_zfetch_future(zs, blkid, nblks);
				if (nblks > 0)
					ZFETCHSTAT_BUMP(zfetchstat_stride);
				else
					ZFETCHSTAT_BUMP(zfetchstat_future);
				goto future;
			}
		} else if (end_blkid >= zs->zs_blkid) {
			/* Access crosses the position: keep only the new part. */
			nblks -= zs->zs_blkid - blkid;
			blkid += zs->zs_blkid - blkid;
			goto hit;
		} else if (end_blkid + max_reorder > zs->zs_blkid &&
		    (int)(zs->zs_atime - t) >= 0) {
			/* Access just behind a recently used stream. */
			ZFETCHSTAT_BUMP(zfetchstat_past);
			zs->zs_atime = gethrestime_sec();
			goto out;
		}
	}

	/*
	 * This access is not part of any existing stream.  Create a new
	 * stream for it unless we are at the end of file.
	 */
	if (end_blkid < maxblkid)
		dmu_zfetch_stream_create(zf, end_blkid);
	mutex_exit(&zf->zf_lock);
	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
	ZFETCHSTAT_BUMP(zfetchstat_misses);
	return (NULL);

hit:
	nblks = dmu_zfetch_hit(zs, nblks);
	ZFETCHSTAT_BUMP(zfetchstat_hits);

future:
	zs->zs_atime = gethrestime_sec();

	/* Exit if we already prefetched for this position before. */
	if (nblks == 0)
		goto out;

	/* If the file is ending, remove the stream. */
	end_blkid = zs->zs_blkid;
	if (end_blkid >= maxblkid) {
		dmu_zfetch_stream_remove(zf, zs);
out:
		/* Common "nothing to prefetch" exit; also a goto target. */
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream.  Calculate further prefetch distances.
	 *
	 * Start prefetch from the demand access size (nblks).  Double the
	 * distance every access up to zfetch_min_distance.  After that only
	 * if needed increase the distance by 1/8 up to zfetch_max_distance.
	 *
	 * Don't double the distance beyond single block if we have more
	 * than ~6% of ARC held by active prefetches.  It should help with
	 * getting out of RAM on some badly mispredicted read patterns.
	 */
	unsigned int nbytes = nblks << dbs;
	unsigned int pf_nblks;
	if (fetch_data) {
		if (unlikely(zs->zs_pf_dist < nbytes))
			zs->zs_pf_dist = nbytes;
		else if (zs->zs_pf_dist < zfetch_min_distance &&
		    (zs->zs_pf_dist < (1 << dbs) ||
		    aggsum_compare(&zfetch_sums.zfetchstat_io_active,
		    arc_c_max >> (4 + dbs)) < 0))
			zs->zs_pf_dist *= 2;
		else if (zs->zs_more)
			zs->zs_pf_dist += zs->zs_pf_dist / 8;
		zs->zs_more = B_FALSE;
		if (zs->zs_pf_dist > zfetch_max_distance)
			zs->zs_pf_dist = zfetch_max_distance;
		pf_nblks = zs->zs_pf_dist >> dbs;
	} else {
		pf_nblks = 0;
	}
	/* The data prefetch window only ever moves forward. */
	if (zs->zs_pf_start < end_blkid)
		zs->zs_pf_start = end_blkid;
	if (zs->zs_pf_end < end_blkid + pf_nblks)
		zs->zs_pf_end = end_blkid + pf_nblks;

	/*
	 * Do the same for indirects, starting where we will stop reading
	 * data blocks (and the indirects that point to them).
	 */
	if (unlikely(zs->zs_ipf_dist < nbytes))
		zs->zs_ipf_dist = nbytes;
	else
		zs->zs_ipf_dist *= 2;
	if (zs->zs_ipf_dist > zfetch_max_idistance)
		zs->zs_ipf_dist = zfetch_max_idistance;
	pf_nblks = zs->zs_ipf_dist >> dbs;
	if (zs->zs_ipf_start < zs->zs_pf_end)
		zs->zs_ipf_start = zs->zs_pf_end;
	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

	/* Reference handed to the caller; released via dmu_zfetch_run(). */
	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);
	mutex_exit(&zf->zf_lock);

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
	return (zs);
}

/*
 * Issue the prefetch planned by dmu_zfetch_prepare() for stream "zs" and
 * release the references that call took.
 *
 * "missed" is latched into zs->zs_missed when set; data blocks are issued
 * only if zs_missed is set, while indirect blocks are always issued.
 * "have_lock" tells whether the caller already holds dn_struct_rwlock; if
 * not, it is taken as reader around the dbuf_prefetch_impl() calls.
 */
void
dmu_zfetch_run(zfetch_t *zf, zstream_t *zs, boolean_t missed,
    boolean_t have_lock)
{
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

	if (missed)
		zs->zs_missed = missed;

	/*
	 * Postpone the prefetch if there are more concurrent callers.
	 * It happens when multiple requests are waiting for the same
	 * indirect block.  The last one will run the prefetch for all.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
		/* Drop reference taken in dmu_zfetch_prepare(). */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	/*
	 * Claim the pending windows under the lock: take [start, end) and
	 * advance start to end so the same blocks are not issued twice.
	 */
	mutex_enter(&zf->zf_lock);
	if (zs->zs_missed) {
		pf_start = zs->zs_pf_start;
		pf_end = zs->zs_pf_start = zs->zs_pf_end;
	} else {
		pf_start = pf_end = 0;
	}
	ipf_start = zs->zs_ipf_start;
	ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
	mutex_exit(&zf->zf_lock);
	ASSERT3S(pf_start, <=, pf_end);
	ASSERT3S(ipf_start, <=, ipf_end);

	/*
	 * Convert the indirect window from level-0 block ids to level-1
	 * block ids; epbs is log2 of block pointers per indirect block.
	 */
	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
	ASSERT3S(ipf_start, <=, ipf_end);
	/*
	 * Each block to be requested holds one zs_refs reference, which
	 * dmu_zfetch_done() drops.  The reference from
	 * dmu_zfetch_prepare() covers the first one.
	 */
	issued = pf_end - pf_start + ipf_end - ipf_start;
	if (issued > 1) {
		/* More references on top of taken in dmu_zfetch_prepare(). */
		zfs_refcount_add_few(&zs->zs_refs, issued - 1, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}
	aggsum_add(&zfetch_sums.zfetchstat_io_active, issued);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * From here "issued" counts what dbuf_prefetch_impl() returns.
	 * NOTE(review): presumably that is 1 when an i/o was really started
	 * and dmu_zfetch_done() runs once per call either way -- confirm
	 * against dbuf_prefetch_impl().
	 */
	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, 0, dmu_zfetch_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

/*
 * Convenience wrapper: run dmu_zfetch_prepare() and, if it returned a
 * stream, immediately dmu_zfetch_run() on it with the same have_lock.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zf, zs, missed, have_lock);
}

/* Module parameters exposing the tunables defined at the top of the file. */
ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
	"Max time before stream delete");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
	"Min bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_reorder, UINT, ZMOD_RW,
	"Max request reorder distance within a stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, hole_shift, UINT, ZMOD_RW,
	"Max log2 fraction of holes in a stream");