/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>

/*
 * I'm against tunables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/* number of bytes in an array_read at which we stop prefetching (1MB) */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;

/* forward decls for static routines */
static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);

/*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
 * In other words: if we find two sequential access streams that are
 * the same length and distance N apart, and this read is N from the
 * last stream, then we are probably in a strided access pattern.  So
 * combine the two sequential streams into a single strided stream.
 *
 * Returns 1 if a pair was coalesced, 0 if no co-linear streams were found.
 */
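/*
 * Illustrative walk-through (hypothetical numbers, not from any real trace):
 * given streams A = { zst_offset 100, zst_len 4, zst_stride 4 } and
 * B = { zst_offset 112, zst_len 4, zst_stride 4 }, a read at block 124 gives
 * diff == 112 - 100 == 12 and B->zst_offset + diff == 124 == zh->zst_offset,
 * so A and B collapse into a single forward stream at offset 124 with
 * stride 12, and dmu_zfetch_dofetch() is kicked off on the survivor.
 */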
static int
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
	zstream_t	*z_walk;
	zstream_t	*z_comp;

	if (!rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	if (zh == NULL) {
		rw_exit(&zf->zf_rwlock);
		return (0);
	}

	for (z_walk = list_head(&zf->zf_stream); z_walk;
	    z_walk = list_next(&zf->zf_stream, z_walk)) {
		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
		    z_comp = list_next(&zf->zf_stream, z_comp)) {
			int64_t		diff;

			if (z_walk->zst_len != z_walk->zst_stride ||
			    z_comp->zst_len != z_comp->zst_stride) {
				continue;
			}

			diff = z_comp->zst_offset - z_walk->zst_offset;
			if (z_comp->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}

			diff = z_walk->zst_offset - z_comp->zst_offset;
			if (z_walk->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}
		}
	}

	rw_exit(&zf->zf_rwlock);
	return (0);
}

/*
 * Given a zstream_t, determine the bounds of the prefetch.  Then call the
 * routine that actually prefetches the individual blocks.
 */
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
	uint64_t	prefetch_tail;
	uint64_t	prefetch_limit;
	uint64_t	prefetch_ofst;
	uint64_t	prefetch_len;
	uint64_t	blocks_fetched;

	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);

	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
	    (int64_t)(zs->zst_offset + zs->zst_stride));
	/*
	 * XXX: use a faster division method?
	 */
	prefetch_limit = zs->zst_offset + zs->zst_len +
	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;

	while (prefetch_tail < prefetch_limit) {
		prefetch_ofst = zs->zst_offset + zs->zst_direction *
		    (prefetch_tail - zs->zst_offset);

		prefetch_len = zs->zst_len;

		/*
		 * Don't prefetch before the start of the file, if working
		 * backwards; the unsigned subtraction above wraps in that
		 * case, which is what the comparison below detects.
		 */
		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
		    (prefetch_ofst > prefetch_tail)) {
			prefetch_len += prefetch_ofst;
			prefetch_ofst = 0;
		}

		/* don't prefetch more than we're supposed to */
		if (prefetch_len > zs->zst_len)
			break;

		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
		    prefetch_ofst, zs->zst_len);

		prefetch_tail += zs->zst_stride;
		/* stop if we've run out of stuff to prefetch */
		if (blocks_fetched < zs->zst_len)
			break;
	}
	zs->zst_ph_offset = prefetch_tail;
	zs->zst_last = lbolt;
}
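
/*
 * A hypothetical walk-through of the arithmetic above (assuming the file
 * has at least 8 blocks): for a forward stream zs = { zst_offset 0,
 * zst_len 4, zst_stride 4, zst_cap 2, zst_ph_offset 4 }, zst_cap doubles
 * to 4, prefetch_tail starts at MAX(4, 0 + 4) == 4, and prefetch_limit is
 * 0 + 4 + (4 * 4) / 4 == 8.  One pass of the loop issues dbuf_prefetch()
 * for blocks 4..7, advances prefetch_tail to 8, and the loop terminates,
 * leaving zst_ph_offset == 8 for the next access.
 */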

/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL) {
		return;
	}

	zf->zf_dnode = dno;
	zf->zf_stream_cnt = 0;
	zf->zf_alloc_fail = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zst_node));

	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}

/*
 * This function computes the actual size, in blocks, that can be prefetched,
 * and fetches it.
 */
static uint64_t
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;
	uint64_t	i;

	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);

	for (i = 0; i < fetchsz; i++) {
		dbuf_prefetch(dn, blkid + i);
	}

	return (fetchsz);
}

/*
 * This function returns the number of blocks that would be prefetched, based
 * upon the supplied dnode, blockid, and nblks.  This is used so that we can
 * update streams in place, and then prefetch with their old value after the
 * fact.  This way, we can delay the prefetch, but subsequent accesses to the
 * stream won't result in the same data being prefetched multiple times.
 */
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;

	if (blkid > dn->dn_maxblkid) {
		return (0);
	}

	/* compute fetch size */
	if (blkid + nblks + 1 > dn->dn_maxblkid) {
		fetchsz = (dn->dn_maxblkid - blkid) + 1;
		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
	} else {
		fetchsz = nblks;
	}

	return (fetchsz);
}
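
/*
 * For illustration, the clamping above with hypothetical numbers: given
 * dn_maxblkid == 100, a request for blkid == 98, nblks == 5 takes the first
 * branch (98 + 5 + 1 > 100) and returns fetchsz == (100 - 98) + 1 == 3,
 * covering blocks 98..100 and nothing past the end of the file.
 */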

/*
 * Given a zfetch and a zstream structure (the latter used as a search key),
 * see if there is an associated zstream for this block read.  If so, start
 * a prefetch for the stream it located and return true; otherwise return
 * false.
 */
static int
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t	*zs;
	int64_t		diff;
	int		reset = !prefetched;
	int		rc = 0;

	if (zh == NULL)
		return (0);

	/*
	 * XXX: This locking strategy is a bit coarse; however, its impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			continue;
		}

		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			/* already fetched */
			rc = 1;
			goto out;
		}

		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset -
			    zs->zst_stride >= zs->zst_len) ||
			    (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset +
			    zs->zst_stride >= zs->zst_len) ||
			    (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset = (zs->zst_ph_offset >
			    (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}

	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			rc = 0;
			mutex_exit(&zs->zst_lock);
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);
	return (rc);
}
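
/*
 * An illustrative instance of the forward sequential case above
 * (hypothetical numbers): for an existing stream zs = { zst_offset 0,
 * zst_len 3 }, a one-block read described by zh = { zst_offset 3,
 * zst_len 1 } satisfies zh->zst_offset == zs->zst_offset + zs->zst_len,
 * so the stream grows to zst_len == 4; if the caller passed prefetched
 * != 0, dmu_zfetch_dofetch() then runs to stay ahead of the reader,
 * otherwise the stream is reset as described above.
 */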

/*
 * Clean-up state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream list, and generally makes things
 * nice.  This doesn't free the zfetch_t itself, that's left to the caller.
 */
void
dmu_zfetch_rele(zfetch_t *zf)
{
	zstream_t	*zs;
	zstream_t	*zs_next;

	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));

	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);

		list_remove(&zf->zf_stream, zs);
		mutex_destroy(&zs->zst_lock);
		kmem_free(zs, sizeof (zstream_t));
	}
	list_destroy(&zf->zf_stream);
	rw_destroy(&zf->zf_rwlock);

	zf->zf_dnode = NULL;
}

/*
 * Given a zfetch and zstream structure, insert the zstream structure into the
 * stream list contained within the zfetch structure.  Perform the appropriate
 * book-keeping.  It is possible that another thread has inserted a stream
 * which matches one that we are about to insert, so we must be sure to check
 * for this case.  If one is found, return failure, and let the caller clean
 * up the duplicates.
 */
static int
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
{
	zstream_t	*zs_walk;
	zstream_t	*zs_next;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs_walk);

		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
			return (0);
		}
	}

	list_insert_head(&zf->zf_stream, zs);
	zf->zf_stream_cnt++;

	return (1);
}

/*
 * Walk the list of zstreams in the given zfetch, find an old one (by time),
 * and reclaim it for use by the caller.
 */
static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t *zf)
{
	zstream_t	*zs;

	if (!rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (NULL);

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
			break;
	}

	if (zs) {
		dmu_zfetch_stream_remove(zf, zs);
		mutex_destroy(&zs->zst_lock);
		bzero(zs, sizeof (zstream_t));
	} else {
		zf->zf_alloc_fail++;
	}
	rw_exit(&zf->zf_rwlock);

	return (zs);
}
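
/*
 * Note that the reclaim criterion above makes a stream eligible once it has
 * gone untouched for more than zfetch_min_sec_reap (by default 2) seconds:
 * (lbolt - zst_last) is the idle time in clock ticks, and dividing by hz
 * converts it to seconds.
 */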

/*
 * Given a zfetch and zstream structure, remove the zstream structure from its
 * container in the zfetch structure.  Perform the appropriate book-keeping.
 */
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	list_remove(&zf->zf_stream, zs);
	zf->zf_stream_cnt--;
}

static int
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
{
	if (zs1->zst_offset != zs2->zst_offset)
		return (0);

	if (zs1->zst_len != zs2->zst_len)
		return (0);

	if (zs1->zst_stride != zs2->zst_stride)
		return (0);

	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
		return (0);

	if (zs1->zst_cap != zs2->zst_cap)
		return (0);

	if (zs1->zst_direction != zs2->zst_direction)
		return (0);

	return (1);
}

/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t	zst;
	zstream_t	*newstream;
	int		fetched;
	int		inserted;
	unsigned int	blkshft;
	uint64_t	blksz;

	/*
	 * Files whose block size is not a power of 2 consist of a single
	 * block -- nothing to prefetch.
	 */
	if (!zf->zf_dnode->dn_datablkshift) {
		return;
	}

	/* convert offset and size into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;

	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (!fetched) {
		fetched = dmu_zfetch_colinear(zf, &zst);
	}

	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * We still couldn't find a stream to reclaim, so allocate
		 * one if we're under the per-file stream limit.  Otherwise,
		 * give up and go home.
		 */
		if (newstream == NULL) {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;

			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}

			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}

		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = lbolt;

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}
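
/*
 * Sketch of the intended calling pattern, assuming a dnode that embeds a
 * zfetch_t as dn_zfetch (the actual DMU call sites may differ):
 *
 *	dmu_zfetch_init(&dn->dn_zfetch, dn);	   when the dnode is set up
 *	...
 *	dmu_zfetch(&dn->dn_zfetch, offset, length, prefetched);  on each read
 *	...
 *	dmu_zfetch_rele(&dn->dn_zfetch);	   when the dnode is torn down
 */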