module/zfs/vdev_queue.c (OpenZFS): lines matching "non-active"
9 * or https://opensource.org/licenses/CDDL-1.0.
42 * ---------------
51 * per-queue minimums must not exceed the aggregate maximum. If the
52 * sum of the per-queue maximums exceeds the aggregate maximum, then the
53 * number of active i/os may reach zfs_vdev_max_active, in which case no
54 * further i/os will be issued regardless of whether all per-queue
78 * the I/O scheduler changes the maximum number of active async write i/os
83 * operations from other -- and in particular synchronous -- queues. In broad
90 * follows a piece-wise linear function defined by a few adjustable points.
92 * | o---------| <-- zfs_vdev_async_write_max_active
95 * active | / | |
99 * |------------o | | <-- zfs_vdev_async_write_min_active
103 * | `-- zfs_vdev_async_write_active_max_dirty_percent
104 * `--------- zfs_vdev_async_write_active_min_dirty_percent
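To make the ramp above concrete, the following is a minimal standalone sketch of the interpolation, assuming hypothetical tunable values (min_active = 2, max_active = 10, min/max dirty percent = 30/60, and a 100 MiB zfs_dirty_data_max). The real computation is vdev_queue_max_async_writes(), excerpted further below.

/*
 * Sketch only: all values here are assumed, not the module defaults.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
async_write_max_active_sketch(uint64_t dirty, uint64_t dirty_data_max)
{
	const uint64_t min_writes = 2, max_writes = 10;
	const uint64_t min_bytes = 30 * dirty_data_max / 100;
	const uint64_t max_bytes = 60 * dirty_data_max / 100;

	if (dirty < min_bytes)
		return (min_writes);		/* flat section on the left */
	if (dirty >= max_bytes)
		return (max_writes);		/* flat section on the right */

	/* Linear interpolation between the two adjustable points. */
	return (min_writes + (dirty - min_bytes) *
	    (max_writes - min_writes) / (max_bytes - min_bytes));
}

int
main(void)
{
	uint64_t dmax = 100ULL << 20;	/* 100 MiB, purely illustrative */

	/* 45% dirty sits halfway up the ramp: 2 + (8 / 2) = 6. */
	printf("%llu\n", (unsigned long long)
	    async_write_max_active_sketch(45 * dmax / 100, dmax));
	return (0);
}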
121 * The maximum number of i/os active to each device. Ideally, this will be >=
127 * Per-queue limits on the number of i/os active to each device. If the
128 * number of active i/os is < zfs_vdev_max_active, then the min_active comes
129 * into play. We will send min_active from each queue round-robin, and then
131 * Some queues have additional mechanisms to limit number of active I/Os in
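Taken together, the per-queue minimums and maximums imply a two-pass selection: every class is first allowed up to its min_active in round-robin order, and only then are remaining slots (up to zfs_vdev_max_active) handed out in priority order. A hypothetical, self-contained sketch of that logic follows; the real version is vdev_queue_class_to_issue(), excerpted below.

#include <stdint.h>

/*
 * Sketch only: the limit arrays and names are stand-ins for the per-class
 * limit functions and vdev_queue_t fields used by the real code.
 */
static int
pick_class_sketch(uint32_t queued_mask, const uint32_t *active,
    const uint32_t *min_active, const uint32_t *max_active,
    uint32_t total_active, uint32_t aggregate_max, uint32_t last_prio,
    uint32_t nclasses)
{
	if (queued_mask == 0 || total_active >= aggregate_max)
		return (-1);	/* nothing queued, or the vdev is saturated */

	/* Pass 1: round-robin, each class only up to its minimum. */
	for (uint32_t i = 0, p = (last_prio + 1) % nclasses; i < nclasses;
	    i++, p = (p + 1) % nclasses) {
		if ((queued_mask & (1U << p)) != 0 &&
		    active[p] < min_active[p])
			return ((int)p);
	}

	/* Pass 2: strict priority order, each class up to its maximum. */
	for (uint32_t p = 0; p < nclasses; p++) {
		if ((queued_mask & (1U << p)) != 0 &&
		    active[p] < max_active[p])
			return ((int)p);
	}
	return (-1);
}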
174 * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
175 * the number of concurrently-active I/Os is limited to *_min_active, unless
176 * the vdev is "idle". When there are no interactive I/Os active (sync or
179 * of concurrently-active non-interactive I/Os is increased to *_max_active.
187 * *_max_active to 1 does not help. To prevent non-interactive I/Os, like
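The effect of the credit machinery described above is easiest to see on the consumer side: while interactive I/O is in flight, a non-interactive class may only use as many slots as it has earned credit for, and even after the interactive I/O drains it stays at *_min_active until zfs_vdev_nia_delay completions have accumulated. A hedged sketch, using the scrub class as an example (names mirror the tunables and vdev_queue_t fields that appear in the excerpts below):

/*
 * Sketch only: ia_active and nia_credit stand in for vq_ia_active and
 * vq_nia_credit; compare vdev_queue_class_min_active() and
 * vdev_queue_class_max_active() below.
 */
static uint32_t
scrub_max_active_sketch(uint32_t ia_active, uint32_t nia_credit)
{
	if (ia_active > 0) {
		/* Interactive I/O present: at most the earned credit. */
		return (MIN(nia_credit, zfs_vdev_scrub_min_active));
	} else if (nia_credit < zfs_vdev_nia_delay) {
		/* Not yet "idle": stay at the per-class minimum. */
		return (zfs_vdev_scrub_min_active);
	}
	/* Idle long enough: open the class up to its maximum. */
	return (zfs_vdev_scrub_max_active);
}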
207 * Define the queue depth percentage for each top-level. This percentage is
209 * allocations a specific top-level vdev should handle. Once the queue depth
211 * then the allocator will stop allocating blocks on that top-level device.
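As a worked example of this cutoff, assuming zfs_vdev_queue_depth_pct = 1000 and zfs_vdev_async_write_max_active = 10 (commonly cited defaults; treat them as assumptions here):

/*
 * Hypothetical worked example of the allocation-throttle cutoff:
 *
 *     zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100
 *   = 1000 * 10 / 100
 *   = 100 queued allocations per top-level vdev,
 *
 * beyond which the allocator stops placing new blocks on that vdev.
 */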
237 int cmp = TREE_CMP(z1->io_offset, z2->io_offset); in vdev_queue_offset_compare()
253 int tcmp = TREE_CMP(z1->io_timestamp >> VDQ_T_SHIFT, in vdev_queue_to_compare()
254 z2->io_timestamp >> VDQ_T_SHIFT); in vdev_queue_to_compare()
255 int ocmp = TREE_CMP(z1->io_offset, z2->io_offset); in vdev_queue_to_compare()
258 if (likely(cmp | (z1->io_queue_state == ZIO_QS_NONE))) in vdev_queue_to_compare()
274 zio_priority_t p = zio->io_priority; in vdev_queue_class_add()
275 vq->vq_cqueued |= 1U << p; in vdev_queue_class_add()
277 list_insert_tail(&vq->vq_class[p].vqc_list, zio); in vdev_queue_class_add()
278 vq->vq_class[p].vqc_list_numnodes++; in vdev_queue_class_add()
281 avl_add(&vq->vq_class[p].vqc_tree, zio); in vdev_queue_class_add()
287 zio_priority_t p = zio->io_priority; in vdev_queue_class_remove()
290 list_t *list = &vq->vq_class[p].vqc_list; in vdev_queue_class_remove()
293 vq->vq_class[p].vqc_list_numnodes--; in vdev_queue_class_remove()
295 avl_tree_t *tree = &vq->vq_class[p].vqc_tree; in vdev_queue_class_remove()
299 vq->vq_cqueued &= ~(empty << p); in vdev_queue_class_remove()
315 return (vq->vq_ia_active == 0 ? zfs_vdev_scrub_min_active : in vdev_queue_class_min_active()
316 MIN(vq->vq_nia_credit, zfs_vdev_scrub_min_active)); in vdev_queue_class_min_active()
318 return (vq->vq_ia_active == 0 ? zfs_vdev_removal_min_active : in vdev_queue_class_min_active()
319 MIN(vq->vq_nia_credit, zfs_vdev_removal_min_active)); in vdev_queue_class_min_active()
321 return (vq->vq_ia_active == 0 ? zfs_vdev_initializing_min_active : in vdev_queue_class_min_active()
322 MIN(vq->vq_nia_credit, zfs_vdev_initializing_min_active)); in vdev_queue_class_min_active()
326 return (vq->vq_ia_active == 0 ? zfs_vdev_rebuild_min_active : in vdev_queue_class_min_active()
327 MIN(vq->vq_nia_credit, zfs_vdev_rebuild_min_active)); in vdev_queue_class_min_active()
347 * dsl_pool_t if a self-healing zio is issued prior to the in vdev_queue_max_async_writes()
357 dirty = dp->dp_dirty_total; in vdev_queue_max_async_writes()
366 * slope = (max_writes - min_writes) / (max_bytes - min_bytes) in vdev_queue_max_async_writes()
370 writes = (dirty - min_bytes) * in vdev_queue_max_async_writes()
371 (zfs_vdev_async_write_max_active - in vdev_queue_max_async_writes()
373 (max_bytes - min_bytes) + in vdev_queue_max_async_writes()
391 return (vdev_queue_max_async_writes(vq->vq_vdev->vdev_spa)); in vdev_queue_class_max_active()
393 if (vq->vq_ia_active > 0) { in vdev_queue_class_max_active()
394 return (MIN(vq->vq_nia_credit, in vdev_queue_class_max_active()
396 } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) in vdev_queue_class_max_active()
400 if (vq->vq_ia_active > 0) { in vdev_queue_class_max_active()
401 return (MIN(vq->vq_nia_credit, in vdev_queue_class_max_active()
403 } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) in vdev_queue_class_max_active()
407 if (vq->vq_ia_active > 0) { in vdev_queue_class_max_active()
408 return (MIN(vq->vq_nia_credit, in vdev_queue_class_max_active()
410 } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) in vdev_queue_class_max_active()
416 if (vq->vq_ia_active > 0) { in vdev_queue_class_max_active()
417 return (MIN(vq->vq_nia_credit, in vdev_queue_class_max_active()
419 } else if (vq->vq_nia_credit < zfs_vdev_nia_delay) in vdev_queue_class_max_active()
435 uint32_t cq = vq->vq_cqueued; in vdev_queue_class_to_issue()
438 if (cq == 0 || vq->vq_active >= zfs_vdev_max_active) in vdev_queue_class_to_issue()
443 * Do round-robin to reduce starvation due to zfs_vdev_max_active in vdev_queue_class_to_issue()
446 p1 = vq->vq_last_prio + 1; in vdev_queue_class_to_issue()
450 if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < in vdev_queue_class_to_issue()
455 if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < in vdev_queue_class_to_issue()
465 if ((cq & (1U << p)) != 0 && vq->vq_cactive[p] < in vdev_queue_class_to_issue()
471 vq->vq_last_prio = p; in vdev_queue_class_to_issue()
478 vdev_queue_t *vq = &vd->vdev_queue; in vdev_queue_init()
481 vq->vq_vdev = vd; in vdev_queue_init()
485 list_create(&vq->vq_class[p].vqc_list, in vdev_queue_init()
489 avl_create(&vq->vq_class[p].vqc_tree, in vdev_queue_init()
494 avl_create(&vq->vq_read_offset_tree, in vdev_queue_init()
497 avl_create(&vq->vq_write_offset_tree, in vdev_queue_init()
501 vq->vq_last_offset = 0; in vdev_queue_init()
502 list_create(&vq->vq_active_list, sizeof (struct zio), in vdev_queue_init()
504 mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); in vdev_queue_init()
510 vdev_queue_t *vq = &vd->vdev_queue; in vdev_queue_fini()
514 list_destroy(&vq->vq_class[p].vqc_list); in vdev_queue_fini()
516 avl_destroy(&vq->vq_class[p].vqc_tree); in vdev_queue_fini()
518 avl_destroy(&vq->vq_read_offset_tree); in vdev_queue_fini()
519 avl_destroy(&vq->vq_write_offset_tree); in vdev_queue_fini()
521 list_destroy(&vq->vq_active_list); in vdev_queue_fini()
522 mutex_destroy(&vq->vq_lock); in vdev_queue_fini()
528 zio->io_queue_state = ZIO_QS_QUEUED; in vdev_queue_io_add()
530 if (zio->io_type == ZIO_TYPE_READ) in vdev_queue_io_add()
531 avl_add(&vq->vq_read_offset_tree, zio); in vdev_queue_io_add()
532 else if (zio->io_type == ZIO_TYPE_WRITE) in vdev_queue_io_add()
533 avl_add(&vq->vq_write_offset_tree, zio); in vdev_queue_io_add()
540 if (zio->io_type == ZIO_TYPE_READ) in vdev_queue_io_remove()
541 avl_remove(&vq->vq_read_offset_tree, zio); in vdev_queue_io_remove()
542 else if (zio->io_type == ZIO_TYPE_WRITE) in vdev_queue_io_remove()
543 avl_remove(&vq->vq_write_offset_tree, zio); in vdev_queue_io_remove()
544 zio->io_queue_state = ZIO_QS_NONE; in vdev_queue_io_remove()
564 ASSERT(MUTEX_HELD(&vq->vq_lock)); in vdev_queue_pending_add()
565 ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); in vdev_queue_pending_add()
566 vq->vq_cactive[zio->io_priority]++; in vdev_queue_pending_add()
567 vq->vq_active++; in vdev_queue_pending_add()
568 if (vdev_queue_is_interactive(zio->io_priority)) { in vdev_queue_pending_add()
569 if (++vq->vq_ia_active == 1) in vdev_queue_pending_add()
570 vq->vq_nia_credit = 1; in vdev_queue_pending_add()
571 } else if (vq->vq_ia_active > 0) { in vdev_queue_pending_add()
572 vq->vq_nia_credit--; in vdev_queue_pending_add()
574 zio->io_queue_state = ZIO_QS_ACTIVE; in vdev_queue_pending_add()
575 list_insert_tail(&vq->vq_active_list, zio); in vdev_queue_pending_add()
581 ASSERT(MUTEX_HELD(&vq->vq_lock)); in vdev_queue_pending_remove()
582 ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); in vdev_queue_pending_remove()
583 vq->vq_cactive[zio->io_priority]--; in vdev_queue_pending_remove()
584 vq->vq_active--; in vdev_queue_pending_remove()
585 if (vdev_queue_is_interactive(zio->io_priority)) { in vdev_queue_pending_remove()
586 if (--vq->vq_ia_active == 0) in vdev_queue_pending_remove()
587 vq->vq_nia_credit = 0; in vdev_queue_pending_remove()
589 vq->vq_nia_credit = zfs_vdev_nia_credit; in vdev_queue_pending_remove()
590 } else if (vq->vq_ia_active == 0) in vdev_queue_pending_remove()
591 vq->vq_nia_credit++; in vdev_queue_pending_remove()
592 list_remove(&vq->vq_active_list, zio); in vdev_queue_pending_remove()
593 zio->io_queue_state = ZIO_QS_NONE; in vdev_queue_pending_remove()
599 abd_free(aio->io_abd); in vdev_queue_agg_io_done()
604 * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
605 * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
608 #define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
609 #define IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
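A quick worked example of these two macros with hypothetical I/Os:

/*
 * Hypothetical I/Os (offsets and sizes in bytes):
 *   fio: io_offset = 0,    io_size = 4096
 *   lio: io_offset = 8192, io_size = 4096
 *
 *   IO_SPAN(fio, lio) = 8192 + 4096 - 0 = 12288
 *       (first byte of fio through last byte of lio)
 *   IO_GAP(fio, lio)  = -IO_SPAN(lio, fio) = -(0 + 4096 - 8192) = 4096
 *       (bytes between the end of fio and the start of lio)
 */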
635 if (zio->io_type == ZIO_TYPE_TRIM) in vdev_queue_aggregate()
638 if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE) in vdev_queue_aggregate()
641 if (vq->vq_vdev->vdev_nonrot) in vdev_queue_aggregate()
654 ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops); in vdev_queue_aggregate()
658 if (zio->io_type == ZIO_TYPE_READ) { in vdev_queue_aggregate()
660 t = &vq->vq_read_offset_tree; in vdev_queue_aggregate()
662 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); in vdev_queue_aggregate()
663 t = &vq->vq_write_offset_tree; in vdev_queue_aggregate()
677 * We keep track of the last non-optional I/O. in vdev_queue_aggregate()
679 mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; in vdev_queue_aggregate()
683 * recording the last non-optional I/O. in vdev_queue_aggregate()
685 zio_flag_t flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; in vdev_queue_aggregate()
687 (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && in vdev_queue_aggregate()
690 dio->io_type == zio->io_type) { in vdev_queue_aggregate()
692 if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) in vdev_queue_aggregate()
699 while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { in vdev_queue_aggregate()
712 (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && in vdev_queue_aggregate()
714 (dio->io_flags & ZIO_FLAG_OPTIONAL)) && in vdev_queue_aggregate()
717 dio->io_type == zio->io_type) { in vdev_queue_aggregate()
719 if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) in vdev_queue_aggregate()
730 * non-optional I/O is close enough to make aggregation in vdev_queue_aggregate()
733 if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { in vdev_queue_aggregate()
739 if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { in vdev_queue_aggregate()
755 dio->io_flags &= ~ZIO_FLAG_OPTIONAL; in vdev_queue_aggregate()
759 ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); in vdev_queue_aggregate()
775 aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, in vdev_queue_aggregate()
776 abd, size, first->io_type, zio->io_priority, in vdev_queue_aggregate()
778 aio->io_timestamp = first->io_timestamp; in vdev_queue_aggregate()
781 next_offset = first->io_offset; in vdev_queue_aggregate()
789 if (dio->io_offset != next_offset) { in vdev_queue_aggregate()
791 ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); in vdev_queue_aggregate()
792 ASSERT3U(dio->io_offset, >, next_offset); in vdev_queue_aggregate()
794 dio->io_offset - next_offset, B_TRUE); in vdev_queue_aggregate()
795 abd_gang_add(aio->io_abd, abd, B_TRUE); in vdev_queue_aggregate()
797 if (dio->io_abd && in vdev_queue_aggregate()
798 (dio->io_size != abd_get_size(dio->io_abd))) { in vdev_queue_aggregate()
800 ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size); in vdev_queue_aggregate()
801 abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); in vdev_queue_aggregate()
802 abd_gang_add(aio->io_abd, abd, B_TRUE); in vdev_queue_aggregate()
804 if (dio->io_flags & ZIO_FLAG_NODATA) { in vdev_queue_aggregate()
806 ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); in vdev_queue_aggregate()
807 ASSERT3P(dio->io_abd, ==, NULL); in vdev_queue_aggregate()
808 abd_gang_add(aio->io_abd, in vdev_queue_aggregate()
809 abd_get_zeros(dio->io_size), B_TRUE); in vdev_queue_aggregate()
817 abd_gang_add(aio->io_abd, dio->io_abd, in vdev_queue_aggregate()
821 next_offset = dio->io_offset + dio->io_size; in vdev_queue_aggregate()
823 ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size); in vdev_queue_aggregate()
844 ASSERT(MUTEX_HELD(&vq->vq_lock)); in vdev_queue_io_to_issue()
854 zio = list_head(&vq->vq_class[p].vqc_list); in vdev_queue_io_to_issue()
857 * For LBA-ordered queues (async / scrub / initializing), in vdev_queue_io_to_issue()
862 tree = &vq->vq_class[p].vqc_tree; in vdev_queue_io_to_issue()
864 if (zio->io_offset < vq->vq_last_offset) { in vdev_queue_io_to_issue()
865 vq->vq_io_search.io_timestamp = zio->io_timestamp; in vdev_queue_io_to_issue()
866 vq->vq_io_search.io_offset = vq->vq_last_offset; in vdev_queue_io_to_issue()
867 zio = avl_find(tree, &vq->vq_io_search, &idx); in vdev_queue_io_to_issue()
871 (zio->io_timestamp >> VDQ_T_SHIFT) != in vdev_queue_io_to_issue()
872 (aio->io_timestamp >> VDQ_T_SHIFT)) in vdev_queue_io_to_issue()
877 ASSERT3U(zio->io_priority, ==, p); in vdev_queue_io_to_issue()
891 if (zio->io_flags & ZIO_FLAG_NODATA) { in vdev_queue_io_to_issue()
892 mutex_exit(&vq->vq_lock); in vdev_queue_io_to_issue()
895 mutex_enter(&vq->vq_lock); in vdev_queue_io_to_issue()
901 vq->vq_last_offset = zio->io_offset + zio->io_size; in vdev_queue_io_to_issue()
909 vdev_queue_t *vq = &zio->io_vd->vdev_queue; in vdev_queue_io()
913 if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) in vdev_queue_io()
920 if (zio->io_type == ZIO_TYPE_READ) { in vdev_queue_io()
921 ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); in vdev_queue_io()
923 if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && in vdev_queue_io()
924 zio->io_priority != ZIO_PRIORITY_ASYNC_READ && in vdev_queue_io()
925 zio->io_priority != ZIO_PRIORITY_SCRUB && in vdev_queue_io()
926 zio->io_priority != ZIO_PRIORITY_REMOVAL && in vdev_queue_io()
927 zio->io_priority != ZIO_PRIORITY_INITIALIZING && in vdev_queue_io()
928 zio->io_priority != ZIO_PRIORITY_REBUILD) { in vdev_queue_io()
929 zio->io_priority = ZIO_PRIORITY_ASYNC_READ; in vdev_queue_io()
931 } else if (zio->io_type == ZIO_TYPE_WRITE) { in vdev_queue_io()
932 ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); in vdev_queue_io()
934 if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && in vdev_queue_io()
935 zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && in vdev_queue_io()
936 zio->io_priority != ZIO_PRIORITY_REMOVAL && in vdev_queue_io()
937 zio->io_priority != ZIO_PRIORITY_INITIALIZING && in vdev_queue_io()
938 zio->io_priority != ZIO_PRIORITY_REBUILD) { in vdev_queue_io()
939 zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; in vdev_queue_io()
942 ASSERT(zio->io_type == ZIO_TYPE_TRIM); in vdev_queue_io()
943 ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); in vdev_queue_io()
946 zio->io_flags |= ZIO_FLAG_DONT_QUEUE; in vdev_queue_io()
947 zio->io_timestamp = gethrtime(); in vdev_queue_io()
949 mutex_enter(&vq->vq_lock); in vdev_queue_io()
952 mutex_exit(&vq->vq_lock); in vdev_queue_io()
957 if (nio->io_done == vdev_queue_agg_io_done) { in vdev_queue_io()
959 ASSERT3U(dio->io_type, ==, nio->io_type); in vdev_queue_io()
973 vdev_queue_t *vq = &zio->io_vd->vdev_queue; in vdev_queue_io_done()
978 vq->vq_io_complete_ts = now; in vdev_queue_io_done()
979 vq->vq_io_delta_ts = zio->io_delta = now - zio->io_timestamp; in vdev_queue_io_done()
981 mutex_enter(&vq->vq_lock); in vdev_queue_io_done()
985 mutex_exit(&vq->vq_lock); in vdev_queue_io_done()
986 if (nio->io_done == vdev_queue_agg_io_done) { in vdev_queue_io_done()
988 ASSERT3U(dio->io_type, ==, nio->io_type); in vdev_queue_io_done()
997 mutex_enter(&vq->vq_lock); in vdev_queue_io_done()
1000 mutex_exit(&vq->vq_lock); in vdev_queue_io_done()
1006 vdev_queue_t *vq = &zio->io_vd->vdev_queue; in vdev_queue_change_io_priority()
1014 if (zio->io_priority == ZIO_PRIORITY_NOW) in vdev_queue_change_io_priority()
1017 ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); in vdev_queue_change_io_priority()
1020 if (zio->io_type == ZIO_TYPE_READ) { in vdev_queue_change_io_priority()
1026 ASSERT(zio->io_type == ZIO_TYPE_WRITE); in vdev_queue_change_io_priority()
1032 mutex_enter(&vq->vq_lock); in vdev_queue_change_io_priority()
1037 * remove it from the queue and re-insert it with the new priority. in vdev_queue_change_io_priority()
1038 * Otherwise, the zio is currently active and we cannot change its in vdev_queue_change_io_priority()
1041 if (zio->io_queue_state == ZIO_QS_QUEUED) { in vdev_queue_change_io_priority()
1043 zio->io_priority = priority; in vdev_queue_change_io_priority()
1045 } else if (zio->io_queue_state == ZIO_QS_NONE) { in vdev_queue_change_io_priority()
1046 zio->io_priority = priority; in vdev_queue_change_io_priority()
1049 mutex_exit(&vq->vq_lock); in vdev_queue_change_io_priority()
1061 return (vd->vdev_queue.vq_active); in vdev_queue_length()
1067 return (vd->vdev_queue.vq_last_offset); in vdev_queue_last_offset()
1073 vdev_queue_t *vq = &vd->vdev_queue; in vdev_queue_class_length()
1075 return (vq->vq_class[p].vqc_list_numnodes); in vdev_queue_class_length()
1077 return (avl_numnodes(&vq->vq_class[p].vqc_tree)); in vdev_queue_class_length()
1084 ZMOD_RW, "Max vdev I/O aggregation size for non-rotating media");
1093 "Maximum number of active I/Os per vdev");
1102 "Max active async read I/Os per vdev");
1105 "Min active async read I/Os per vdev");
1108 "Max active async write I/Os per vdev");
1111 "Min active async write I/Os per vdev");
1114 "Max active initializing I/Os per vdev");
1117 "Min active initializing I/Os per vdev");
1120 "Max active removal I/Os per vdev");
1123 "Min active removal I/Os per vdev");
1126 "Max active scrub I/Os per vdev");
1129 "Min active scrub I/Os per vdev");
1132 "Max active sync read I/Os per vdev");
1135 "Min active sync read I/Os per vdev");
1138 "Max active sync write I/Os per vdev");
1141 "Min active sync write I/Os per vdev");
1144 "Max active trim/discard I/Os per vdev");
1147 "Min active trim/discard I/Os per vdev");
1150 "Max active rebuild I/Os per vdev");
1153 "Min active rebuild I/Os per vdev");
1156 "Number of non-interactive I/Os to allow in sequence");
1159 "Number of non-interactive I/Os before _max_active");
1162 "Queue depth percentage for each top-level vdev");