/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

#include <sys/zfs_context.h>
#include <sys/vdev_impl.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>
#include <sys/dsl_pool.h>
#include <sys/metaslab_impl.h>

/*
 * ZFS I/O Scheduler
 * -----------------
 *
 * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
 * I/O scheduler determines when and in what order those operations are
 * issued.  The I/O scheduler divides operations into five I/O classes
 * prioritized in the following order: sync read, sync write, async read,
 * async write, and scrub/resilver.  Each queue defines the minimum and
 * maximum number of concurrent operations that may be issued to the device.
 * In addition, the device has an aggregate maximum.  Note that the sum of the
 * per-queue minimums must not exceed the aggregate maximum, and if the
 * aggregate maximum is equal to or greater than the sum of the per-queue
 * maximums, the per-queue minimum has no effect.
 *
 * For many physical devices, throughput increases with the number of
 * concurrent operations, but latency typically suffers.  Further, physical
 * devices typically have a limit at which more concurrent operations have no
 * effect on throughput or can actually cause it to decrease.
 *
 * The scheduler selects the next operation to issue by first looking for an
 * I/O class whose minimum has not been satisfied.  Once all are satisfied and
 * the aggregate maximum has not been hit, the scheduler looks for classes
 * whose maximum has not been satisfied.  Iteration through the I/O classes is
 * done in the order specified above.  No further operations are issued if the
 * aggregate maximum number of concurrent operations has been hit or if there
 * are no operations queued for an I/O class that has not hit its maximum.
 * Every time an i/o is queued or an operation completes, the I/O scheduler
 * looks for new operations to issue.
 *
 * All I/O classes have a fixed maximum number of outstanding operations
 * except for the async write class.  Asynchronous writes represent the data
 * that is committed to stable storage during the syncing stage for
 * transaction groups (see txg.c).  Transaction groups enter the syncing state
 * periodically so the number of queued async writes will quickly burst up and
 * then bleed down to zero.
 * Rather than servicing them as quickly as possible,
 * the I/O scheduler changes the maximum number of active async write i/os
 * according to the amount of dirty data in the pool (see dsl_pool.c).  Since
 * both throughput and latency typically increase with the number of
 * concurrent operations issued to physical devices, reducing the burstiness
 * in the number of concurrent operations also stabilizes the response time of
 * operations from other -- and in particular synchronous -- queues.  In broad
 * strokes, the I/O scheduler will issue more concurrent operations from the
 * async write queue as there's more dirty data in the pool.
 *
 * Async Writes
 *
 * The number of concurrent operations issued for the async write I/O class
 * follows a piece-wise linear function defined by a few adjustable points.
 *
 *        |                   o---------| <-- zfs_vdev_async_write_max_active
 *   ^    |                  /^         |
 *   |    |                 / |         |
 * active |                /  |         |
 *  I/O   |               /   |         |
 * count  |              /    |         |
 *        |             /     |         |
 *        |------------o      |         | <-- zfs_vdev_async_write_min_active
 *       0|____________^______|_________|
 *        0%           |      |       100% of zfs_dirty_data_max
 *                     |      |
 *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
 *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
 *
 * Until the amount of dirty data exceeds a minimum percentage of the dirty
 * data allowed in the pool, the I/O scheduler will limit the number of
 * concurrent operations to the minimum.  As that threshold is crossed, the
 * number of concurrent operations issued increases linearly to the maximum at
 * the specified maximum percentage of the dirty data allowed in the pool.
 *
 * Ideally, the amount of dirty data on a busy pool will stay in the sloped
 * part of the function between zfs_vdev_async_write_active_min_dirty_percent
 * and zfs_vdev_async_write_active_max_dirty_percent.  If it exceeds the
 * maximum percentage, this indicates that the rate of incoming data is
 * greater than the rate that the backend storage can handle.  In this case,
 * we must further throttle incoming writes (see dmu_tx_delay() for details).
 */
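
/*
 * A worked example of the interpolation above, using the default tunables
 * defined below (async write min_active = 1, max_active = 10, min dirty
 * percent = 30, max dirty percent = 60).  The 50% figure is illustrative
 * only, and the arithmetic simply restates what
 * vdev_queue_max_async_writes() computes:
 *
 *	dirty data = 50% of zfs_dirty_data_max
 *	writes = 1 + (50 - 30) * (10 - 1) / (60 - 30)
 *	       = 1 + 6
 *	       = 7 concurrent async writes allowed per leaf vdev
 */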

/*
 * The maximum number of i/os active to each device.  Ideally, this will be >=
 * the sum of each queue's max_active.  It must be at least the sum of each
 * queue's min_active.
 */
uint32_t zfs_vdev_max_active = 1000;

/*
 * Per-queue limits on the number of i/os active to each device.  If the
 * sum of the queue's max_active is < zfs_vdev_max_active, then the
 * min_active comes into play.  We will send min_active from each queue,
 * and then select from queues in the order defined by zio_priority_t.
 *
 * In general, smaller max_active's will lead to lower latency of synchronous
 * operations.  Larger max_active's may lead to higher overall throughput,
 * depending on underlying storage.
 *
 * The ratio of the queues' max_actives determines the balance of performance
 * between reads, writes, and scrubs.  E.g., increasing
 * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
 * more quickly, but reads and writes to have higher latency and lower
 * throughput.
 */
uint32_t zfs_vdev_sync_read_min_active = 10;
uint32_t zfs_vdev_sync_read_max_active = 10;
uint32_t zfs_vdev_sync_write_min_active = 10;
uint32_t zfs_vdev_sync_write_max_active = 10;
uint32_t zfs_vdev_async_read_min_active = 1;
uint32_t zfs_vdev_async_read_max_active = 3;
uint32_t zfs_vdev_async_write_min_active = 1;
uint32_t zfs_vdev_async_write_max_active = 10;
uint32_t zfs_vdev_scrub_min_active = 1;
uint32_t zfs_vdev_scrub_max_active = 2;
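
/*
 * A quick sanity check of the defaults above (illustrative only, not
 * additional logic): the per-queue minimums sum to 10 + 10 + 1 + 1 + 1 = 23,
 * and the per-queue maximums (with async writes at their maximum of 10) sum
 * to 10 + 10 + 3 + 10 + 2 = 35.  Both totals are far below
 * zfs_vdev_max_active (1000), so with the default tuning the aggregate limit
 * never constrains the per-queue settings and the min_active values come
 * into play as described above.
 */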

/*
 * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
 * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
 * zfs_vdev_async_write_active_max_dirty_percent, use
 * zfs_vdev_async_write_max_active.  The value is linearly interpolated
 * between min and max.
 */
int zfs_vdev_async_write_active_min_dirty_percent = 30;
int zfs_vdev_async_write_active_max_dirty_percent = 60;

/*
 * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
 * For read I/Os, we also aggregate across small adjacency gaps; for writes
 * we include spans of optional I/Os to aid aggregation at the disk even when
 * they aren't able to help us aggregate at this level.
 */
int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
int zfs_vdev_write_gap_limit = 4 << 10;

/*
 * Define the queue depth percentage for each top-level vdev.  This percentage
 * is used in conjunction with zfs_vdev_async_write_max_active to determine
 * how many allocations a specific top-level vdev should handle.  Once the
 * queue depth reaches zfs_vdev_queue_depth_pct *
 * zfs_vdev_async_write_max_active / 100, then the allocator will stop
 * allocating blocks on that top-level device.  The default kernel setting is
 * 1000%, which will yield 100 allocations per device.  For userland testing,
 * the default setting is 300%, which equates to 30 allocations per device.
 */
#ifdef _KERNEL
int zfs_vdev_queue_depth_pct = 1000;
#else
int zfs_vdev_queue_depth_pct = 300;
#endif

int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

static inline avl_tree_t *
vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
{
	return (&vq->vq_class[p].vqc_queued_tree);
}

static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
{
	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
	if (t == ZIO_TYPE_READ)
		return (&vq->vq_read_offset_tree);
	else
		return (&vq->vq_write_offset_tree);
}

int
vdev_queue_timestamp_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_timestamp < z2->io_timestamp)
		return (-1);
	if (z1->io_timestamp > z2->io_timestamp)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}
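
/*
 * Set up the AVL trees backing a vdev's queue: one offset-sorted tree of
 * active i/os, one offset-sorted tree per i/o type (read/write) used for
 * aggregation, and one tree per i/o class, sorted by timestamp for the
 * synchronous classes and by offset for the others.
 */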
void
vdev_queue_init(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
	vq->vq_vdev = vd;

	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
	    vdev_queue_offset_compare, sizeof (zio_t),
	    offsetof(struct zio, io_offset_node));
	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
	    vdev_queue_offset_compare, sizeof (zio_t),
	    offsetof(struct zio, io_offset_node));

	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		int (*compfn) (const void *, const void *);

		/*
		 * The synchronous i/o queues are dispatched in FIFO rather
		 * than LBA order.  This provides more consistent latency for
		 * these i/os.
		 */
		if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
			compfn = vdev_queue_timestamp_compare;
		else
			compfn = vdev_queue_offset_compare;

		avl_create(vdev_queue_class_tree(vq, p), compfn,
		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
	}
}

void
vdev_queue_fini(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
		avl_destroy(vdev_queue_class_tree(vq, p));
	avl_destroy(&vq->vq_active_tree);
	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));

	mutex_destroy(&vq->vq_lock);
}

static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);

	mutex_enter(&spa->spa_iokstat_lock);
	spa->spa_queue_stats[zio->io_priority].spa_queued++;
	if (spa->spa_iokstat != NULL)
		kstat_waitq_enter(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
}

static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;

	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);

	mutex_enter(&spa->spa_iokstat_lock);
	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
	spa->spa_queue_stats[zio->io_priority].spa_queued--;
	if (spa->spa_iokstat != NULL)
		kstat_waitq_exit(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
}

static void
vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	ASSERT(MUTEX_HELD(&vq->vq_lock));
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	vq->vq_class[zio->io_priority].vqc_active++;
	avl_add(&vq->vq_active_tree, zio);

	mutex_enter(&spa->spa_iokstat_lock);
	spa->spa_queue_stats[zio->io_priority].spa_active++;
	if (spa->spa_iokstat != NULL)
		kstat_runq_enter(spa->spa_iokstat->ks_data);
	mutex_exit(&spa->spa_iokstat_lock);
}

static void
vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	ASSERT(MUTEX_HELD(&vq->vq_lock));
	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
	vq->vq_class[zio->io_priority].vqc_active--;
	avl_remove(&vq->vq_active_tree, zio);

	mutex_enter(&spa->spa_iokstat_lock);
	ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_active, >, 0);
	spa->spa_queue_stats[zio->io_priority].spa_active--;
	if (spa->spa_iokstat != NULL) {
		kstat_io_t *ksio = spa->spa_iokstat->ks_data;

		kstat_runq_exit(spa->spa_iokstat->ks_data);
		if (zio->io_type == ZIO_TYPE_READ) {
			ksio->reads++;
			ksio->nread += zio->io_size;
		} else if (zio->io_type == ZIO_TYPE_WRITE) {
			ksio->writes++;
			ksio->nwritten += zio->io_size;
		}
	}
	mutex_exit(&spa->spa_iokstat_lock);
}
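
/*
 * Completion callback for the aggregate i/os built by vdev_queue_aggregate()
 * below.  For reads, copy each parent i/o's slice of the aggregate buffer
 * back into that parent's own buffer; in all cases, free the temporary
 * aggregation buffer.
 */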
static void
vdev_queue_agg_io_done(zio_t *aio)
{
	if (aio->io_type == ZIO_TYPE_READ) {
		zio_t *pio;
		zio_link_t *zl = NULL;
		while ((pio = zio_walk_parents(aio, &zl)) != NULL) {
			bcopy((char *)aio->io_data + (pio->io_offset -
			    aio->io_offset), pio->io_data, pio->io_size);
		}
	}

	zio_buf_free(aio->io_data, aio->io_size);
}

static int
vdev_queue_class_min_active(zio_priority_t p)
{
	switch (p) {
	case ZIO_PRIORITY_SYNC_READ:
		return (zfs_vdev_sync_read_min_active);
	case ZIO_PRIORITY_SYNC_WRITE:
		return (zfs_vdev_sync_write_min_active);
	case ZIO_PRIORITY_ASYNC_READ:
		return (zfs_vdev_async_read_min_active);
	case ZIO_PRIORITY_ASYNC_WRITE:
		return (zfs_vdev_async_write_min_active);
	case ZIO_PRIORITY_SCRUB:
		return (zfs_vdev_scrub_min_active);
	default:
		panic("invalid priority %u", p);
		return (0);
	}
}

static int
vdev_queue_max_async_writes(spa_t *spa)
{
	int writes;
	uint64_t dirty = spa->spa_dsl_pool->dp_dirty_total;
	uint64_t min_bytes = zfs_dirty_data_max *
	    zfs_vdev_async_write_active_min_dirty_percent / 100;
	uint64_t max_bytes = zfs_dirty_data_max *
	    zfs_vdev_async_write_active_max_dirty_percent / 100;

	/*
	 * Sync tasks correspond to interactive user actions.  To reduce the
	 * execution time of those actions we push data out as fast as
	 * possible.
	 */
	if (spa_has_pending_synctask(spa)) {
		return (zfs_vdev_async_write_max_active);
	}

	if (dirty < min_bytes)
		return (zfs_vdev_async_write_min_active);
	if (dirty > max_bytes)
		return (zfs_vdev_async_write_max_active);

	/*
	 * linear interpolation:
	 * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
	 * move right by min_bytes
	 * move up by min_writes
	 */
	writes = (dirty - min_bytes) *
	    (zfs_vdev_async_write_max_active -
	    zfs_vdev_async_write_min_active) /
	    (max_bytes - min_bytes) +
	    zfs_vdev_async_write_min_active;
	ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
	ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
	return (writes);
}

static int
vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
{
	switch (p) {
	case ZIO_PRIORITY_SYNC_READ:
		return (zfs_vdev_sync_read_max_active);
	case ZIO_PRIORITY_SYNC_WRITE:
		return (zfs_vdev_sync_write_max_active);
	case ZIO_PRIORITY_ASYNC_READ:
		return (zfs_vdev_async_read_max_active);
	case ZIO_PRIORITY_ASYNC_WRITE:
		return (vdev_queue_max_async_writes(spa));
	case ZIO_PRIORITY_SCRUB:
		return (zfs_vdev_scrub_max_active);
	default:
		panic("invalid priority %u", p);
		return (0);
	}
}

/*
 * Return the i/o class to issue from, or ZIO_PRIORITY_NUM_QUEUEABLE if
 * there is no eligible class.
 */
static zio_priority_t
vdev_queue_class_to_issue(vdev_queue_t *vq)
{
	spa_t *spa = vq->vq_vdev->vdev_spa;
	zio_priority_t p;

	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
		return (ZIO_PRIORITY_NUM_QUEUEABLE);

	/* find a queue that has not reached its minimum # outstanding i/os */
	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
		    vq->vq_class[p].vqc_active <
		    vdev_queue_class_min_active(p))
			return (p);
	}

	/*
	 * If we haven't found a queue, look for one that hasn't reached its
	 * maximum # outstanding i/os.
	 */
	for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
		if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
		    vq->vq_class[p].vqc_active <
		    vdev_queue_class_max_active(spa, p))
			return (p);
	}

	/* No eligible queued i/os */
	return (ZIO_PRIORITY_NUM_QUEUEABLE);
}

/*
 * Compute the range spanned by two i/os, which is the endpoint of the last
 * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
 * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 */
#define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
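
/*
 * For example (offsets are illustrative): if fio covers [0, 8K) and lio
 * covers [40K, 48K), then IO_SPAN(fio, lio) = 48K, while IO_GAP(fio, lio) =
 * -IO_SPAN(lio, fio) = -(0 + 8K - 40K) = 32K, the empty space between the
 * two i/os.  A gap of zero means lio starts exactly where fio ends, and a
 * negative gap means the two i/os overlap.
 */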
static zio_t *
vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
{
	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
	uint64_t maxgap = 0;
	uint64_t size;
	boolean_t stretch = B_FALSE;
	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
	enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;

	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
		return (NULL);

	first = last = zio;

	if (zio->io_type == ZIO_TYPE_READ)
		maxgap = zfs_vdev_read_gap_limit;

	/*
	 * We can aggregate I/Os that are sufficiently adjacent and of
	 * the same flavor, as expressed by the AGG_INHERIT flags.
	 * The latter requirement is necessary so that certain
	 * attributes of the I/O, such as whether it's a normal I/O
	 * or a scrub/resilver, can be preserved in the aggregate.
	 * We can include optional I/Os, but don't allow them
	 * to begin a range as they add no benefit in that situation.
	 */

	/*
	 * We keep track of the last non-optional I/O.
	 */
	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;

	/*
	 * Walk backwards through sufficiently contiguous I/Os
	 * recording the last non-optional I/O.
	 */
	while ((dio = AVL_PREV(t, first)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(dio, last) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(dio, first) <= maxgap) {
		first = dio;
		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = first;
	}

	/*
	 * Skip any initial optional I/Os.
	 */
	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
		first = AVL_NEXT(t, first);
		ASSERT(first != NULL);
	}

	/*
	 * Walk forward through sufficiently contiguous I/Os.
	 */
	while ((dio = AVL_NEXT(t, last)) != NULL &&
	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
	    IO_SPAN(first, dio) <= zfs_vdev_aggregation_limit &&
	    IO_GAP(last, dio) <= maxgap) {
		last = dio;
		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
			mandatory = last;
	}

	/*
	 * Now that we've established the range of the I/O aggregation
	 * we must decide what to do with trailing optional I/Os.
	 * For reads, there's nothing to do.  While we are unable to
	 * aggregate further, it's possible that a trailing optional
	 * I/O would allow the underlying device to aggregate with
	 * subsequent I/Os.  We must therefore determine if the next
	 * non-optional I/O is close enough to make aggregation
	 * worthwhile.
	 */
	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
		zio_t *nio = last;
		while ((dio = AVL_NEXT(t, nio)) != NULL &&
		    IO_GAP(nio, dio) == 0 &&
		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
			nio = dio;
			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
				stretch = B_TRUE;
				break;
			}
		}
	}

	if (stretch) {
		/* This may be a no-op. */
		dio = AVL_NEXT(t, last);
		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
	} else {
		while (last != mandatory && last != first) {
			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
			last = AVL_PREV(t, last);
			ASSERT(last != NULL);
		}
	}

	if (first == last)
		return (NULL);

	size = IO_SPAN(first, last);
	ASSERT3U(size, <=, zfs_vdev_aggregation_limit);

	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
	    zio_buf_alloc(size), size, first->io_type, zio->io_priority,
	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
	    vdev_queue_agg_io_done, NULL);
	aio->io_timestamp = first->io_timestamp;

	nio = first;
	do {
		dio = nio;
		nio = AVL_NEXT(t, dio);
		ASSERT3U(dio->io_type, ==, aio->io_type);

		if (dio->io_flags & ZIO_FLAG_NODATA) {
			ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
			bzero((char *)aio->io_data + (dio->io_offset -
			    aio->io_offset), dio->io_size);
		} else if (dio->io_type == ZIO_TYPE_WRITE) {
			bcopy(dio->io_data, (char *)aio->io_data +
			    (dio->io_offset - aio->io_offset),
			    dio->io_size);
		}

		zio_add_child(dio, aio);
		vdev_queue_io_remove(vq, dio);
		zio_vdev_io_bypass(dio);
		zio_execute(dio);
	} while (dio != last);

	return (aio);
}
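
/*
 * A worked example of the write handling above, with hypothetical offsets:
 * suppose the write offset tree holds W1 = [0, 8K) (mandatory), W2 =
 * [8K, 12K) (optional, no-data padding) and W3 = [12K, 20K) (mandatory),
 * and we enter with zio == W1.  The forward walk extends last to W3, which
 * is mandatory, so nothing is trimmed and the delegation loop emits a single
 * 20K aggregate write in which W2's region is simply zeroed (ZIO_FLAG_NODATA).
 * Had W3 not existed, the trailing optional W2 would have been trimmed back
 * off, leaving only W1 and therefore no aggregation at all.
 */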
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq)
{
	zio_t *zio, *aio;
	zio_priority_t p;
	avl_index_t idx;
	avl_tree_t *tree;
	zio_t search;

again:
	ASSERT(MUTEX_HELD(&vq->vq_lock));

	p = vdev_queue_class_to_issue(vq);

	if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
		/* No eligible queued i/os */
		return (NULL);
	}

	/*
	 * For LBA-ordered queues (async / scrub), issue the i/o which follows
	 * the most recently issued i/o in LBA (offset) order.
	 *
	 * For FIFO queues (sync), issue the i/o with the lowest timestamp.
	 */
	tree = vdev_queue_class_tree(vq, p);
	search.io_timestamp = 0;
	search.io_offset = vq->vq_last_offset + 1;
	VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
	zio = avl_nearest(tree, idx, AVL_AFTER);
	if (zio == NULL)
		zio = avl_first(tree);
	ASSERT3U(zio->io_priority, ==, p);

	aio = vdev_queue_aggregate(vq, zio);
	if (aio != NULL)
		zio = aio;
	else
		vdev_queue_io_remove(vq, zio);

	/*
	 * If the I/O is or was optional and therefore has no data, we need to
	 * simply discard it.  We need to drop the vdev queue's lock to avoid a
	 * deadlock that we could encounter since this I/O will complete
	 * immediately.
	 */
	if (zio->io_flags & ZIO_FLAG_NODATA) {
		mutex_exit(&vq->vq_lock);
		zio_vdev_io_bypass(zio);
		zio_execute(zio);
		mutex_enter(&vq->vq_lock);
		goto again;
	}

	vdev_queue_pending_add(vq, zio);
	vq->vq_last_offset = zio->io_offset;

	return (zio);
}

zio_t *
vdev_queue_io(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
		return (zio);

	/*
	 * Children i/os inherit their parent's priority, which might
	 * not match the child's i/o type.  Fix it up here.
	 */
	if (zio->io_type == ZIO_TYPE_READ) {
		if (zio->io_priority != ZIO_PRIORITY_SYNC_READ &&
		    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
		    zio->io_priority != ZIO_PRIORITY_SCRUB)
			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
		if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
		    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE)
			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
	}

	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

	mutex_enter(&vq->vq_lock);
	zio->io_timestamp = gethrtime();
	vdev_queue_io_add(vq, zio);
	nio = vdev_queue_io_to_issue(vq);
	mutex_exit(&vq->vq_lock);

	if (nio == NULL)
		return (NULL);

	if (nio->io_done == vdev_queue_agg_io_done) {
		zio_nowait(nio);
		return (NULL);
	}

	return (nio);
}

void
vdev_queue_io_done(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;

	mutex_enter(&vq->vq_lock);

	vdev_queue_pending_remove(vq, zio);

	vq->vq_io_complete_ts = gethrtime();

	while ((nio = vdev_queue_io_to_issue(vq)) != NULL) {
		mutex_exit(&vq->vq_lock);
		if (nio->io_done == vdev_queue_agg_io_done) {
			zio_nowait(nio);
		} else {
			zio_vdev_io_reissue(nio);
			zio_execute(nio);
		}
		mutex_enter(&vq->vq_lock);
	}

	mutex_exit(&vq->vq_lock);
}