/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/dnode.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_zfetch.h>
#include <sys/dmu.h>
#include <sys/dbuf.h>

/*
 * I'm against tune-ables, but these should probably exist as tweakable globals
 * until we can get this working the way we want it to.
 */

/* max # of streams per zfetch */
uint32_t	zfetch_max_streams = 8;
/* min time before stream reclaim */
uint32_t	zfetch_min_sec_reap = 2;
/* max number of blocks to fetch at a time */
uint32_t	zfetch_block_cap = 256;
/* number of bytes in an array_read at which we stop prefetching (1MB) */
uint64_t	zfetch_array_rd_sz = 1024 * 1024;

/* forward decls for static routines */
static int		dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void		dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t		dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t		dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
static int		dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int		dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t	*dmu_zfetch_stream_reclaim(zfetch_t *);
static void		dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
static void		dmu_zfetch_stream_update(zfetch_t *, zstream_t *);
static int		dmu_zfetch_streams_equal(zstream_t *, zstream_t *);

/*
 * Given a zfetch structure and a zstream structure, determine whether the
 * blocks to be read are part of a co-linear pair of existing prefetch
 * streams.  If a set is found, coalesce the streams, removing one, and
 * configure the prefetch so it looks for a strided access pattern.
 *
 * In other words: if we find two sequential access streams that are
 * the same length and distance N apart, and this read is N from the
 * last stream, then we are probably in a strided access pattern.  So
 * combine the two sequential streams into a single strided stream.
 *
 * If no co-linear streams are found, return 0.
 */
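/*
 * Worked example (illustrative numbers, not taken from real traffic):
 * suppose stream A sits at zst_offset 100 and stream B at zst_offset 200,
 * each with zst_len == zst_stride.  Then diff == 100, and an incoming
 * read at block 300 satisfies B's zst_offset + diff == 300.  A and B
 * collapse into one strided stream at offset 300 with zst_stride 100 and
 * zst_ph_offset 400, B is removed and freed, and a prefetch is kicked
 * off on the combined stream.
 */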
static int
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
	zstream_t	*z_walk;
	zstream_t	*z_comp;

	if (!rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (0);

	if (zh == NULL) {
		rw_exit(&zf->zf_rwlock);
		return (0);
	}

	for (z_walk = list_head(&zf->zf_stream); z_walk;
	    z_walk = list_next(&zf->zf_stream, z_walk)) {
		for (z_comp = list_next(&zf->zf_stream, z_walk); z_comp;
		    z_comp = list_next(&zf->zf_stream, z_comp)) {
			int64_t		diff;

			if (z_walk->zst_len != z_walk->zst_stride ||
			    z_comp->zst_len != z_comp->zst_stride) {
				continue;
			}

			diff = z_comp->zst_offset - z_walk->zst_offset;
			if (z_comp->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}

			diff = z_walk->zst_offset - z_comp->zst_offset;
			if (z_walk->zst_offset + diff == zh->zst_offset) {
				z_walk->zst_offset = zh->zst_offset;
				z_walk->zst_direction = diff < 0 ? -1 : 1;
				z_walk->zst_stride =
				    diff * z_walk->zst_direction;
				z_walk->zst_ph_offset =
				    zh->zst_offset + z_walk->zst_stride;
				dmu_zfetch_stream_remove(zf, z_comp);
				mutex_destroy(&z_comp->zst_lock);
				kmem_free(z_comp, sizeof (zstream_t));

				dmu_zfetch_dofetch(zf, z_walk);

				rw_exit(&zf->zf_rwlock);
				return (1);
			}
		}
	}

	rw_exit(&zf->zf_rwlock);
	return (0);
}

/*
 * Given a zstream_t, determine the bounds of the prefetch.  Then call the
 * routine that actually prefetches the individual blocks.
 */
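/*
 * Illustrative trace (made-up values): for a forward sequential stream
 * with zst_offset 0, zst_len 1, zst_stride 1, zst_ph_offset 1 and
 * zst_cap 2, the cap first doubles to 4.  prefetch_tail starts at
 * MAX(1, 0 + 1) == 1, and prefetch_limit is 0 + 1 + (4 * 1) / 1 == 5,
 * so the loop issues single-block fetches for blocks 1 through 4 and
 * leaves zst_ph_offset at 5 for the next call.
 */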
static void
dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs)
{
	uint64_t	prefetch_tail;
	uint64_t	prefetch_limit;
	uint64_t	prefetch_ofst;
	uint64_t	prefetch_len;
	uint64_t	blocks_fetched;

	zs->zst_stride = MAX((int64_t)zs->zst_stride, zs->zst_len);
	zs->zst_cap = MIN(zfetch_block_cap, 2 * zs->zst_cap);

	prefetch_tail = MAX((int64_t)zs->zst_ph_offset,
	    (int64_t)(zs->zst_offset + zs->zst_stride));
	/*
	 * XXX: use a faster division method?
	 */
	prefetch_limit = zs->zst_offset + zs->zst_len +
	    (zs->zst_cap * zs->zst_stride) / zs->zst_len;

	while (prefetch_tail < prefetch_limit) {
		prefetch_ofst = zs->zst_offset + zs->zst_direction *
		    (prefetch_tail - zs->zst_offset);

		prefetch_len = zs->zst_len;

		/*
		 * Don't prefetch beyond the end of the file, if working
		 * backwards.
		 */
		if ((zs->zst_direction == ZFETCH_BACKWARD) &&
		    (prefetch_ofst > prefetch_tail)) {
			prefetch_len += prefetch_ofst;
			prefetch_ofst = 0;
		}

		/* don't prefetch more than we're supposed to */
		if (prefetch_len > zs->zst_len)
			break;

		blocks_fetched = dmu_zfetch_fetch(zf->zf_dnode,
		    prefetch_ofst, zs->zst_len);

		prefetch_tail += zs->zst_stride;
		/* stop if we've run out of stuff to prefetch */
		if (blocks_fetched < zs->zst_len)
			break;
	}
	zs->zst_ph_offset = prefetch_tail;
	zs->zst_last = lbolt;
}

/*
 * This takes a pointer to a zfetch structure and a dnode.  It performs the
 * necessary setup for the zfetch structure, grokking data from the
 * associated dnode.
 */
void
dmu_zfetch_init(zfetch_t *zf, dnode_t *dno)
{
	if (zf == NULL) {
		return;
	}

	zf->zf_dnode = dno;
	zf->zf_stream_cnt = 0;
	zf->zf_alloc_fail = 0;

	list_create(&zf->zf_stream, sizeof (zstream_t),
	    offsetof(zstream_t, zst_node));

	rw_init(&zf->zf_rwlock, NULL, RW_DEFAULT, NULL);
}

/*
 * This function computes the actual size, in blocks, that can be prefetched,
 * and fetches it.
 */
static uint64_t
dmu_zfetch_fetch(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;
	uint64_t	i;

	fetchsz = dmu_zfetch_fetchsz(dn, blkid, nblks);

	for (i = 0; i < fetchsz; i++) {
		dbuf_prefetch(dn, blkid + i);
	}

	return (fetchsz);
}

/*
 * This function returns the number of blocks that would be prefetched, based
 * upon the supplied dnode, blockid, and nblks.  This is used so that we can
 * update streams in place, and then prefetch with their old value after the
 * fact.  This way, we can delay the prefetch, but subsequent accesses to the
 * stream won't result in the same data being prefetched multiple times.
 */
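/*
 * For example (hypothetical values): with dn_maxblkid == 10, a request
 * for blkid 8 and nblks 4 trips the blkid + nblks + 1 > dn_maxblkid
 * test and is clamped to fetchsz = (10 - 8) + 1 == 3 blocks, while a
 * request for blkid 11 returns 0 outright.
 */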
static uint64_t
dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
{
	uint64_t	fetchsz;

	if (blkid > dn->dn_maxblkid) {
		return (0);
	}

	/* compute fetch size */
	if (blkid + nblks + 1 > dn->dn_maxblkid) {
		fetchsz = (dn->dn_maxblkid - blkid) + 1;
		ASSERT(blkid + fetchsz - 1 <= dn->dn_maxblkid);
	} else {
		fetchsz = nblks;
	}

	return (fetchsz);
}

/*
 * Given a zfetch and a zsearch structure (a zstream_t describing the read),
 * see if there is an associated zstream for this block read.  If so, start
 * a prefetch for the located stream and return 1; otherwise return 0.
 */
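/*
 * As an illustration (made-up values): a stream with zst_offset 0 and
 * zst_len 3 matches an incoming read at block 3 via the forward
 * sequential test (3 == 0 + 3); the stream's zst_len grows to 4 and its
 * direction is set to ZFETCH_FORWARD.  Assuming the read was flagged as
 * prefetched (so no reset is pending), dmu_zfetch_dofetch() then runs
 * on the updated stream and the function returns 1.
 */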
static int
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
	zstream_t	*zs;
	int64_t		diff;
	int		reset = !prefetched;
	int		rc = 0;

	if (zh == NULL)
		return (0);

	/*
	 * XXX: This locking strategy is a bit coarse; however, its impact has
	 * yet to be tested.  If this turns out to be an issue, it can be
	 * modified in a number of different ways.
	 */

	rw_enter(&zf->zf_rwlock, RW_READER);
top:

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		/*
		 * XXX - should this be an assert?
		 */
		if (zs->zst_len == 0) {
			/* bogus stream */
			continue;
		}

		/*
		 * We hit this case when we are in a strided prefetch stream:
		 * we will read "len" blocks before "striding".
		 */
		if (zh->zst_offset >= zs->zst_offset &&
		    zh->zst_offset < zs->zst_offset + zs->zst_len) {
			/* already fetched */
			rc = 1;
			goto out;
		}

		/*
		 * This is the forward sequential read case: we increment
		 * len by one each time we hit here, so we will enter this
		 * case on every read.
		 */
		if (zh->zst_offset == zs->zst_offset + zs->zst_len) {

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset + zs->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}
			zs->zst_len += zh->zst_len;
			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_offset += diff;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : 0;
			}
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		/*
		 * Same as above, but reading backwards through the file.
		 */
		} else if (zh->zst_offset == zs->zst_offset - zh->zst_len) {
			/* backwards sequential access */

			reset = !prefetched && zs->zst_len > 1;

			mutex_enter(&zs->zst_lock);

			if (zh->zst_offset != zs->zst_offset - zh->zst_len) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zh->zst_len ?
			    zs->zst_offset - zh->zst_len : 0;
			zs->zst_ph_offset = zs->zst_ph_offset > zh->zst_len ?
			    zs->zst_ph_offset - zh->zst_len : 0;
			zs->zst_len += zh->zst_len;

			diff = zs->zst_len - zfetch_block_cap;
			if (diff > 0) {
				zs->zst_ph_offset = zs->zst_ph_offset > diff ?
				    zs->zst_ph_offset - diff : 0;
				zs->zst_len = zs->zst_len > diff ?
				    zs->zst_len - diff : zs->zst_len;
			}
			zs->zst_direction = ZFETCH_BACKWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset - zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided forward access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset - zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset += zs->zst_stride;
			zs->zst_direction = ZFETCH_FORWARD;

			break;

		} else if ((zh->zst_offset - zs->zst_offset + zs->zst_stride <
		    zs->zst_len) && (zs->zst_len != zs->zst_stride)) {
			/* strided reverse access */

			mutex_enter(&zs->zst_lock);

			if ((zh->zst_offset - zs->zst_offset + zs->zst_stride >=
			    zs->zst_len) || (zs->zst_len == zs->zst_stride)) {
				mutex_exit(&zs->zst_lock);
				goto top;
			}

			zs->zst_offset = zs->zst_offset > zs->zst_stride ?
			    zs->zst_offset - zs->zst_stride : 0;
			zs->zst_ph_offset = (zs->zst_ph_offset >
			    (2 * zs->zst_stride)) ?
			    (zs->zst_ph_offset - (2 * zs->zst_stride)) : 0;
			zs->zst_direction = ZFETCH_BACKWARD;

			break;
		}
	}

	if (zs) {
		if (reset) {
			zstream_t *remove = zs;

			rc = 0;
			mutex_exit(&zs->zst_lock);
			rw_exit(&zf->zf_rwlock);
			rw_enter(&zf->zf_rwlock, RW_WRITER);
			/*
			 * Relocate the stream, in case someone removes
			 * it while we were acquiring the WRITER lock.
			 */
			for (zs = list_head(&zf->zf_stream); zs;
			    zs = list_next(&zf->zf_stream, zs)) {
				if (zs == remove) {
					dmu_zfetch_stream_remove(zf, zs);
					mutex_destroy(&zs->zst_lock);
					kmem_free(zs, sizeof (zstream_t));
					break;
				}
			}
		} else {
			rc = 1;
			dmu_zfetch_dofetch(zf, zs);
			mutex_exit(&zs->zst_lock);
		}
	}
out:
	rw_exit(&zf->zf_rwlock);
	return (rc);
}

/*
 * Clean up state associated with a zfetch structure.  This frees allocated
 * structure members, empties the zf_stream list, and generally makes things
 * nice.  This doesn't free the zfetch_t itself; that's left to the caller.
 */
void
dmu_zfetch_rele(zfetch_t *zf)
{
	zstream_t	*zs;
	zstream_t	*zs_next;

	ASSERT(!RW_LOCK_HELD(&zf->zf_rwlock));

	for (zs = list_head(&zf->zf_stream); zs; zs = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs);

		list_remove(&zf->zf_stream, zs);
		mutex_destroy(&zs->zst_lock);
		kmem_free(zs, sizeof (zstream_t));
	}
	list_destroy(&zf->zf_stream);
	rw_destroy(&zf->zf_rwlock);

	zf->zf_dnode = NULL;
}

/*
 * Given a zfetch and zstream structure, insert the zstream structure into the
 * stream list contained within the zfetch structure.  Perform the appropriate
 * book-keeping.  It is possible that another thread has inserted a stream
 * which matches one that we are about to insert, so we must be sure to check
 * for this case.  If one is found, return failure, and let the caller clean
 * up the duplicates.
 */
static int
dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs)
{
	zstream_t	*zs_walk;
	zstream_t	*zs_next;

	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	for (zs_walk = list_head(&zf->zf_stream); zs_walk; zs_walk = zs_next) {
		zs_next = list_next(&zf->zf_stream, zs_walk);

		if (dmu_zfetch_streams_equal(zs_walk, zs)) {
			return (0);
		}
	}

	list_insert_head(&zf->zf_stream, zs);
	zf->zf_stream_cnt++;

	return (1);
}


/*
 * Walk the list of zstreams in the given zfetch, find an old one (by time),
 * and reclaim it for use by the caller.
 */
static zstream_t *
dmu_zfetch_stream_reclaim(zfetch_t *zf)
{
	zstream_t	*zs;

	if (!rw_tryenter(&zf->zf_rwlock, RW_WRITER))
		return (NULL);

	for (zs = list_head(&zf->zf_stream); zs;
	    zs = list_next(&zf->zf_stream, zs)) {

		if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap)
			break;
	}

	if (zs) {
		dmu_zfetch_stream_remove(zf, zs);
		mutex_destroy(&zs->zst_lock);
		bzero(zs, sizeof (zstream_t));
	} else {
		zf->zf_alloc_fail++;
	}
	rw_exit(&zf->zf_rwlock);

	return (zs);
}

/*
 * Given a zfetch and zstream structure, remove the zstream structure from its
 * container in the zfetch structure.  Perform the appropriate book-keeping.
 */
static void
dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs)
{
	ASSERT(RW_WRITE_HELD(&zf->zf_rwlock));

	list_remove(&zf->zf_stream, zs);
	zf->zf_stream_cnt--;
}

static int
dmu_zfetch_streams_equal(zstream_t *zs1, zstream_t *zs2)
{
	if (zs1->zst_offset != zs2->zst_offset)
		return (0);

	if (zs1->zst_len != zs2->zst_len)
		return (0);

	if (zs1->zst_stride != zs2->zst_stride)
		return (0);

	if (zs1->zst_ph_offset != zs2->zst_ph_offset)
		return (0);

	if (zs1->zst_cap != zs2->zst_cap)
		return (0);

	if (zs1->zst_direction != zs2->zst_direction)
		return (0);

	return (1);
}

/*
 * This is the prefetch entry point.  It calls all of the other dmu_zfetch
 * routines to create, delete, find, or operate upon prefetch streams.
 */
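/*
 * Hypothetical usage sketch (dn_zfetch is the per-dnode zfetch state; this
 * snippet is not code from this file): a read path sets up the prefetch
 * state once, feeds each logical read through dmu_zfetch(), and tears the
 * state down when the dnode goes away:
 *
 *	dmu_zfetch_init(&dn->dn_zfetch, dn);
 *	...
 *	dmu_zfetch(&dn->dn_zfetch, offset, size, B_TRUE);
 *	...
 *	dmu_zfetch_rele(&dn->dn_zfetch);
 *
 * For the offset-to-block conversion below, illustrative numbers: with a
 * 128K (1 << 17) block size, a 4096-byte read at byte offset 130000 yields
 * zst_offset = 130000 >> 17 == 0 and zst_len =
 * (P2ROUNDUP(134096, 131072) - P2ALIGN(130000, 131072)) >> 17 == 2,
 * since the request straddles the first block boundary.
 */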
void
dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
	zstream_t	zst;
	zstream_t	*newstream;
	int		fetched;
	int		inserted;
	unsigned int	blkshft;
	uint64_t	blksz;

	/*
	 * Files whose block size isn't a power of 2 occupy only one
	 * block -- nothing to do.
	 */
	if (!zf->zf_dnode->dn_datablkshift) {
		return;
	}

	/* convert offset and size into blockid and nblocks */
	blkshft = zf->zf_dnode->dn_datablkshift;
	blksz = (1 << blkshft);

	bzero(&zst, sizeof (zstream_t));
	zst.zst_offset = offset >> blkshft;
	zst.zst_len = (P2ROUNDUP(offset + size, blksz) -
	    P2ALIGN(offset, blksz)) >> blkshft;

	fetched = dmu_zfetch_find(zf, &zst, prefetched);
	if (!fetched) {
		fetched = dmu_zfetch_colinear(zf, &zst);
	}

	if (!fetched) {
		newstream = dmu_zfetch_stream_reclaim(zf);

		/*
		 * We still couldn't find a stream, drop the lock, and allocate
		 * one if possible.  Otherwise, give up and go home.
		 */
		if (newstream == NULL) {
			uint64_t	maxblocks;
			uint32_t	max_streams;
			uint32_t	cur_streams;

			cur_streams = zf->zf_stream_cnt;
			maxblocks = zf->zf_dnode->dn_maxblkid;

			max_streams = MIN(zfetch_max_streams,
			    (maxblocks / zfetch_block_cap));
			if (max_streams == 0) {
				max_streams++;
			}

			if (cur_streams >= max_streams) {
				return;
			}

			newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP);
		}

		newstream->zst_offset = zst.zst_offset;
		newstream->zst_len = zst.zst_len;
		newstream->zst_stride = zst.zst_len;
		newstream->zst_ph_offset = zst.zst_len + zst.zst_offset;
		newstream->zst_cap = zst.zst_len;
		newstream->zst_direction = ZFETCH_FORWARD;
		newstream->zst_last = lbolt;

		mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL);

		rw_enter(&zf->zf_rwlock, RW_WRITER);
		inserted = dmu_zfetch_stream_insert(zf, newstream);
		rw_exit(&zf->zf_rwlock);

		if (!inserted) {
			mutex_destroy(&newstream->zst_lock);
			kmem_free(newstream, sizeof (zstream_t));
		}
	}
}