/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>
#include <sys/kstat.h>
#include <sys/wmsum.h>

/*
 * This tunable disables predictive prefetch.  Note that it leaves "prescient"
 * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
 * prescient prefetch never issues i/os that end up not being needed,
 * so it can't hurt performance.
 */

static int zfs_prefetch_disable = B_FALSE;

/* max # of streams per zfetch */
static unsigned int zfetch_max_streams = 8;
/* min time before stream reclaim */
static unsigned int zfetch_min_sec_reap = 1;
/* max time before stream delete */
static unsigned int zfetch_max_sec_reap = 2;
/* min bytes to prefetch per stream (default 4MB) */
static unsigned int zfetch_min_distance = 4 * 1024 * 1024;
/* max bytes to prefetch per stream (default 64MB) */
unsigned int zfetch_max_distance = 64 * 1024 * 1024;
/* max bytes to prefetch indirects for per stream (default 64MB) */
unsigned int zfetch_max_idistance = 64 * 1024 * 1024;
/* max number of bytes in an array_read in which we allow prefetching (1MB) */
unsigned long zfetch_array_rd_sz = 1024 * 1024;

typedef struct zfetch_stats {
	kstat_named_t zfetchstat_hits;
	kstat_named_t zfetchstat_misses;
	kstat_named_t zfetchstat_max_streams;
	kstat_named_t zfetchstat_io_issued;
} zfetch_stats_t;

static zfetch_stats_t zfetch_stats = {
	{ "hits",		KSTAT_DATA_UINT64 },
	{ "misses",		KSTAT_DATA_UINT64 },
	{ "max_streams",	KSTAT_DATA_UINT64 },
	{ "io_issued",		KSTAT_DATA_UINT64 },
};

struct {
	wmsum_t zfetchstat_hits;
	wmsum_t zfetchstat_misses;
	wmsum_t zfetchstat_max_streams;
	wmsum_t zfetchstat_io_issued;
} zfetch_sums;

#define	ZFETCHSTAT_BUMP(stat)	\
	wmsum_add(&zfetch_sums.stat, 1)
#define	ZFETCHSTAT_ADD(stat, val)	\
	wmsum_add(&zfetch_sums.stat, val)

static kstat_t *zfetch_ksp;
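
/*
 * Illustrative note (an assumption about the consumer environment, not a
 * contract of this file): the wmsum counters above are folded into the
 * named kstat on demand, and the result is typically readable from
 * userspace, e.g. on Linux:
 *
 *	# cat /proc/spl/kstat/zfs/zfetchstats
 *
 * which reports the "hits", "misses", "max_streams" and "io_issued"
 * counters defined above.
 */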
static int
zfetch_kstats_update(kstat_t *ksp, int rw)
{
	zfetch_stats_t *zs = ksp->ks_data;

	if (rw == KSTAT_WRITE)
		return (EACCES);
	zs->zfetchstat_hits.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_hits);
	zs->zfetchstat_misses.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_misses);
	zs->zfetchstat_max_streams.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_max_streams);
	zs->zfetchstat_io_issued.value.ui64 =
	    wmsum_value(&zfetch_sums.zfetchstat_io_issued);
	return (0);
}

void
zfetch_init(void)
{
	wmsum_init(&zfetch_sums.zfetchstat_hits, 0);
	wmsum_init(&zfetch_sums.zfetchstat_misses, 0);
	wmsum_init(&zfetch_sums.zfetchstat_max_streams, 0);
	wmsum_init(&zfetch_sums.zfetchstat_io_issued, 0);

	zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);

	if (zfetch_ksp != NULL) {
		zfetch_ksp->ks_data = &zfetch_stats;
		zfetch_ksp->ks_update = zfetch_kstats_update;
		kstat_install(zfetch_ksp);
	}
}

void
zfetch_fini(void)
{
	if (zfetch_ksp != NULL) {
		kstat_delete(zfetch_ksp);
		zfetch_ksp = NULL;
	}

	wmsum_fini(&zfetch_sums.zfetchstat_hits);
	wmsum_fini(&zfetch_sums.zfetchstat_misses);
	wmsum_fini(&zfetch_sums.zfetchstat_max_streams);
	wmsum_fini(&zfetch_sums.zfetchstat_io_issued);
}

/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, reading the needed data from
 * the associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL)
		return;
	zf->zf_dnode = dno;
	zf->zf_numstreams = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zs_node));

	mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL);
}

static void
dmu_zfetch_stream_fini(zstream_t *zs)
{
	ASSERT(!list_link_active(&zs->zs_node));
	zfs_refcount_destroy(&zs->zs_callers);
	zfs_refcount_destroy(&zs->zs_refs);
	kmem_free(zs, sizeof (*zs));
}

static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(MUTEX_HELD(&zf->zf_lock));
	list_remove(&zf->zf_stream, zs);
	zf->zf_numstreams--;
	membar_producer();
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}

/*
 * Clean up state associated with a zfetch structure (e.g. destroy the
 * streams).  This doesn't free the zfetch_t itself; that's left to the
 * caller.
 */
void
dmu_zfetch_fini(zfetch_t *zf)
{
	zstream_t *zs;

	mutex_enter(&zf->zf_lock);
	while ((zs = list_head(&zf->zf_stream)) != NULL)
		dmu_zfetch_stream_remove(zf, zs);
	mutex_exit(&zf->zf_lock);
	list_destroy(&zf->zf_stream);
	mutex_destroy(&zf->zf_lock);

	zf->zf_dnode = NULL;
}
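
/*
 * Illustrative lifecycle sketch (a hedged example, not code called from
 * this file): a consumer such as the dnode code is expected to pair the
 * calls above roughly as follows, with dn standing for its dnode_t:
 *
 *	dmu_zfetch_init(&dn->dn_zfetch, dn);
 *	...
 *	dmu_zfetch(&dn->dn_zfetch, blkid, nblks, B_TRUE, B_FALSE, B_TRUE);
 *	...
 *	dmu_zfetch_fini(&dn->dn_zfetch);
 */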

/*
 * If there aren't too many active streams already, create one more.
 * In the process, delete/reuse any streams that have had no hits for
 * zfetch_max_sec_reap seconds.  If needed, reuse the oldest stream that
 * has had no hits for at least zfetch_min_sec_reap seconds (newly created
 * streams are immediately reusable until their first hit).
 * The "blkid" argument is the next block that we expect this stream
 * to access.
 */
static void
dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
{
	zstream_t *zs, *zs_next, *zs_old = NULL;
	hrtime_t now = gethrtime(), t;

	ASSERT(MUTEX_HELD(&zf->zf_lock));

	/*
	 * Delete streams that are too old, reusing the first one found.
	 */
	t = now - SEC2NSEC(zfetch_max_sec_reap);
	for (zs = list_head(&zf->zf_stream); zs != NULL; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);
		/*
		 * Skip if still active: a count of 1 means only the
		 * zf_stream list holds a reference.
		 */
		if (zfs_refcount_count(&zs->zs_refs) != 1)
			continue;
		if (zs->zs_atime > t)
			continue;
		if (zs_old)
			dmu_zfetch_stream_remove(zf, zs);
		else
			zs_old = zs;
	}
	if (zs_old) {
		zs = zs_old;
		goto reuse;
	}

	/*
	 * The maximum number of streams is normally zfetch_max_streams,
	 * but for small files we lower it such that it's at least possible
	 * for all the streams to be non-overlapping.
	 */
	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
	    zfetch_max_distance));
	if (zf->zf_numstreams >= max_streams) {
		t = now - SEC2NSEC(zfetch_min_sec_reap);
		for (zs = list_head(&zf->zf_stream); zs != NULL;
		    zs = list_next(&zf->zf_stream, zs)) {
			if (zfs_refcount_count(&zs->zs_refs) != 1)
				continue;
			if (zs->zs_atime > t)
				continue;
			if (zs_old == NULL || zs->zs_atime < zs_old->zs_atime)
				zs_old = zs;
		}
		if (zs_old) {
			zs = zs_old;
			goto reuse;
		}
		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
		return;
	}

	zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
	zs->zs_fetch = zf;
	zfs_refcount_create(&zs->zs_callers);
	zfs_refcount_create(&zs->zs_refs);
	/* One reference for zf_stream. */
	zfs_refcount_add(&zs->zs_refs, NULL);
	zf->zf_numstreams++;
	list_insert_head(&zf->zf_stream, zs);

reuse:
	zs->zs_blkid = blkid;
	zs->zs_pf_dist = 0;
	zs->zs_pf_start = blkid;
	zs->zs_pf_end = blkid;
	zs->zs_ipf_dist = 0;
	zs->zs_ipf_start = blkid;
	zs->zs_ipf_end = blkid;
	/* Allow immediate stream reuse until first hit. */
	zs->zs_atime = now - SEC2NSEC(zfetch_min_sec_reap);
	zs->zs_missed = B_FALSE;
	zs->zs_more = B_FALSE;
}

static void
dmu_zfetch_done(void *arg, uint64_t level, uint64_t blkid, boolean_t io_issued)
{
	zstream_t *zs = arg;

	if (io_issued && level == 0 && blkid < zs->zs_blkid)
		zs->zs_more = B_TRUE;
	if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
		dmu_zfetch_stream_fini(zs);
}
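
/*
 * A worked example of the reference protocol implemented above and below
 * (a sketch with made-up numbers): suppose dmu_zfetch_run() computes
 * issued = 3.  The stream then carries 1 reference for the zf_stream list,
 * 1 taken in dmu_zfetch_prepare(), and 2 more added in dmu_zfetch_run(),
 * i.e. one per prefetch callback.  Each dmu_zfetch_done() call drops one,
 * so after all three callbacks only the list reference remains.
 */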

/*
 * This is the predictive prefetch entry point.  dmu_zfetch_prepare()
 * associates the dnode access specified by the blkid and nblks arguments
 * with a prefetch stream, predicts further accesses based on the stream's
 * statistics, and returns the stream pointer on success.  That pointer must
 * later be passed to dmu_zfetch_run() to initiate the speculative prefetch
 * for the stream and release it.  dmu_zfetch() is a wrapper for simple
 * cases where no window between prediction and prefetch initiation is
 * needed.
 * The fetch_data argument specifies whether actual data blocks should be
 * fetched:
 *   FALSE -- prefetch only indirect blocks for the predicted data blocks;
 *   TRUE -- prefetch the predicted data blocks plus the following
 *   indirect blocks.
 */
zstream_t *
dmu_zfetch_prepare(zfetch_t *zf, uint64_t blkid, uint64_t nblks,
    boolean_t fetch_data, boolean_t have_lock)
{
	zstream_t *zs;
	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;

	if (zfs_prefetch_disable)
		return (NULL);
	/*
	 * If we haven't yet loaded the indirect vdevs' mappings, we
	 * can only read from blocks that we carefully ensure are on
	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
	 * can't allow the predictive prefetcher to attempt reads of other
	 * blocks (e.g. of the MOS's dnode object).
	 */
	if (!spa_indirect_vdevs_loaded(spa))
		return (NULL);

	/*
	 * As a fast path for small (single-block) files, ignore access
	 * to the first block.
	 */
	if (!have_lock && blkid == 0)
		return (NULL);

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	/*
	 * A fast path for small files for which no prefetch will
	 * happen.
	 */
	uint64_t maxblkid = zf->zf_dnode->dn_maxblkid;
	if (maxblkid < 2) {
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}
	mutex_enter(&zf->zf_lock);

	/*
	 * Find a matching prefetch stream.  Depending on whether the
	 * accesses are block-aligned, the first block of the new access
	 * may either follow the last block of the previous access or be
	 * equal to it.
	 */
	for (zs = list_head(&zf->zf_stream); zs != NULL;
	    zs = list_next(&zf->zf_stream, zs)) {
		if (blkid == zs->zs_blkid) {
			break;
		} else if (blkid + 1 == zs->zs_blkid) {
			blkid++;
			nblks--;
			break;
		}
	}

	/*
	 * If the file is ending, remove the matching stream if found.
	 * If not found, then it is too late to create a new one now.
	 */
	uint64_t end_of_access_blkid = blkid + nblks;
	if (end_of_access_blkid >= maxblkid) {
		if (zs != NULL)
			dmu_zfetch_stream_remove(zf, zs);
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	/* Exit if we already prefetched this block before. */
	if (nblks == 0) {
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		return (NULL);
	}

	if (zs == NULL) {
		/*
		 * This access is not part of any existing stream.  Create
		 * a new stream for it.
		 */
		dmu_zfetch_stream_create(zf, end_of_access_blkid);
		mutex_exit(&zf->zf_lock);
		if (!have_lock)
			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
		ZFETCHSTAT_BUMP(zfetchstat_misses);
		return (NULL);
	}

	/*
	 * This access was to a block that we issued a prefetch for on
	 * behalf of this stream.  Calculate further prefetch distances.
	 *
	 * Start the prefetch distance from the demand access size (nblks).
	 * Double the distance on every access until it reaches
	 * zfetch_min_distance.  After that, increase the distance by 1/8
	 * only if needed, up to zfetch_max_distance.
	 */
	unsigned int nbytes = nblks << zf->zf_dnode->dn_datablkshift;
	unsigned int pf_nblks;
	if (fetch_data) {
		if (unlikely(zs->zs_pf_dist < nbytes))
			zs->zs_pf_dist = nbytes;
		else if (zs->zs_pf_dist < zfetch_min_distance)
			zs->zs_pf_dist *= 2;
		else if (zs->zs_more)
			zs->zs_pf_dist += zs->zs_pf_dist / 8;
		zs->zs_more = B_FALSE;
		if (zs->zs_pf_dist > zfetch_max_distance)
			zs->zs_pf_dist = zfetch_max_distance;
		pf_nblks = zs->zs_pf_dist >> zf->zf_dnode->dn_datablkshift;
	} else {
		pf_nblks = 0;
	}
	if (zs->zs_pf_start < end_of_access_blkid)
		zs->zs_pf_start = end_of_access_blkid;
	if (zs->zs_pf_end < end_of_access_blkid + pf_nblks)
		zs->zs_pf_end = end_of_access_blkid + pf_nblks;
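
	/*
	 * Worked example (illustrative numbers only): for a stream of
	 * sequential single-block 128KB demand reads, zs_pf_dist starts
	 * at 128KB and doubles on each hit (256KB, 512KB, ..., 4MB) until
	 * it reaches zfetch_min_distance; after that it grows by 1/8 per
	 * hit, and only while zs_more shows that prefetch I/Os were still
	 * being issued, capped at zfetch_max_distance (64MB by default).
	 */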

	/*
	 * Do the same for indirects, starting where we will stop reading
	 * data blocks (and the indirects that point to them).
	 */
	if (unlikely(zs->zs_ipf_dist < nbytes))
		zs->zs_ipf_dist = nbytes;
	else
		zs->zs_ipf_dist *= 2;
	if (zs->zs_ipf_dist > zfetch_max_idistance)
		zs->zs_ipf_dist = zfetch_max_idistance;
	pf_nblks = zs->zs_ipf_dist >> zf->zf_dnode->dn_datablkshift;
	if (zs->zs_ipf_start < zs->zs_pf_end)
		zs->zs_ipf_start = zs->zs_pf_end;
	if (zs->zs_ipf_end < zs->zs_pf_end + pf_nblks)
		zs->zs_ipf_end = zs->zs_pf_end + pf_nblks;

	zs->zs_blkid = end_of_access_blkid;
	/* Protect the stream from reclamation. */
	zs->zs_atime = gethrtime();
	zfs_refcount_add(&zs->zs_refs, NULL);
	/* Count concurrent callers. */
	zfs_refcount_add(&zs->zs_callers, NULL);
	mutex_exit(&zf->zf_lock);

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	ZFETCHSTAT_BUMP(zfetchstat_hits);
	return (zs);
}
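
/*
 * Illustrative two-phase usage (a sketch; the in-tree consumer is the DMU
 * read path): the window between prepare and run lets the caller start its
 * demand read before the speculative I/Os are queued:
 *
 *	zstream_t *zs = dmu_zfetch_prepare(zf, blkid, nblks, B_TRUE,
 *	    B_TRUE);
 *	... issue and wait for the demand read ...
 *	if (zs != NULL)
 *		dmu_zfetch_run(zs, missed, B_TRUE);
 */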

void
dmu_zfetch_run(zstream_t *zs, boolean_t missed, boolean_t have_lock)
{
	zfetch_t *zf = zs->zs_fetch;
	int64_t pf_start, pf_end, ipf_start, ipf_end;
	int epbs, issued;

	if (missed)
		zs->zs_missed = missed;

	/*
	 * Postpone the prefetch if there are more concurrent callers.
	 * This happens when multiple requests are waiting for the same
	 * indirect block; the last one will run the prefetch for all
	 * of them.
	 */
	if (zfs_refcount_remove(&zs->zs_callers, NULL) != 0) {
		/* Drop the reference taken in dmu_zfetch_prepare(). */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	mutex_enter(&zf->zf_lock);
	if (zs->zs_missed) {
		pf_start = zs->zs_pf_start;
		pf_end = zs->zs_pf_start = zs->zs_pf_end;
	} else {
		pf_start = pf_end = 0;
	}
	ipf_start = zs->zs_ipf_start;
	ipf_end = zs->zs_ipf_start = zs->zs_ipf_end;
	mutex_exit(&zf->zf_lock);
	ASSERT3S(pf_start, <=, pf_end);
	ASSERT3S(ipf_start, <=, ipf_end);

	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
	ipf_start = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
	ipf_end = P2ROUNDUP(ipf_end, 1 << epbs) >> epbs;
	ASSERT3S(ipf_start, <=, ipf_end);
	issued = pf_end - pf_start + ipf_end - ipf_start;
	if (issued > 1) {
		/* More refs on top of the one from dmu_zfetch_prepare(). */
		for (int i = 0; i < issued - 1; i++)
			zfs_refcount_add(&zs->zs_refs, NULL);
	} else if (issued == 0) {
		/* Some other thread has done our work, so drop the ref. */
		if (zfs_refcount_remove(&zs->zs_refs, NULL) == 0)
			dmu_zfetch_stream_fini(zs);
		return;
	}

	if (!have_lock)
		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);

	issued = 0;
	for (int64_t blk = pf_start; blk < pf_end; blk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 0, blk,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
		    dmu_zfetch_done, zs);
	}
	for (int64_t iblk = ipf_start; iblk < ipf_end; iblk++) {
		issued += dbuf_prefetch_impl(zf->zf_dnode, 1, iblk,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH,
		    dmu_zfetch_done, zs);
	}

	if (!have_lock)
		rw_exit(&zf->zf_dnode->dn_struct_rwlock);

	if (issued)
		ZFETCHSTAT_ADD(zfetchstat_io_issued, issued);
}

void
dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
    boolean_t missed, boolean_t have_lock)
{
	zstream_t *zs;

	zs = dmu_zfetch_prepare(zf, blkid, nblks, fetch_data, have_lock);
	if (zs)
		dmu_zfetch_run(zs, missed, have_lock);
}

ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
	"Disable all ZFS prefetching");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
	"Max number of streams per zfetch");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
	"Min time before stream reclaim");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_sec_reap, UINT, ZMOD_RW,
	"Max time before stream delete");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_distance, UINT, ZMOD_RW,
	"Min bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
	"Max bytes to prefetch per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_idistance, UINT, ZMOD_RW,
	"Max bytes to prefetch indirects for per stream");

ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
	"Number of bytes in an array_read");
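
/*
 * Example (illustrative and platform-specific): on Linux the parameters
 * above are exposed under /sys/module/zfs/parameters/, so predictive
 * prefetch can be disabled at runtime with:
 *
 *	# echo 1 > /sys/module/zfs/parameters/zfs_prefetch_disable
 */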