/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>

/*
 * These tunables are for performance analysis.
 */
/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * max_pending).
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/* deadline = pri + (lbolt >> time_shift) */
int zfs_vdev_time_shift = 6;

/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;

/*
 * To reduce IOPs, we aggregate small adjacent i/os into one large i/o.
 * For read i/os, we also aggregate across small adjacency gaps.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
int zfs_vdev_read_gap_limit = 32 << 10;
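/*
 * Illustrative sketch (not part of the original file): a userland model of
 * the deadline formula noted above, showing how zfs_vdev_time_shift
 * quantizes time into buckets and how priority pushes an I/O into a later
 * bucket.  The VDEV_QUEUE_DEADLINE_EXAMPLE guard is hypothetical and left
 * undefined, so this block is compiled out; it exists only to make the
 * interaction of the tunables concrete.
 */
#ifdef VDEV_QUEUE_DEADLINE_EXAMPLE
#include <stdio.h>
#include <stdint.h>

static int64_t
example_deadline(int64_t ticks, int priority, int time_shift)
{
        /* deadline = pri + (lbolt >> time_shift), as in vdev_queue_io() */
        return ((ticks >> time_shift) + priority);
}

int
main(void)
{
        /*
         * With time_shift = 6, all I/Os issued within the same 64-tick
         * window at the same priority share one deadline, so the
         * offset-sorted elevator pass may reorder them freely; each unit
         * of priority defers an I/O by one 64-tick window.
         */
        printf("pri 0 @ tick 1000 -> %lld\n",
            (long long)example_deadline(1000, 0, 6));   /* 15 */
        printf("pri 4 @ tick 1000 -> %lld\n",
            (long long)example_deadline(1000, 4, 6));   /* 19 */
        printf("pri 0 @ tick 1023 -> %lld\n",
            (long long)example_deadline(1023, 0, 6));   /* 15: same bucket */
        return (0);
}
#endif  /* VDEV_QUEUE_DEADLINE_EXAMPLE */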
/*
 * Virtual device vector for disk I/O scheduling.
 */
int
vdev_queue_deadline_compare(const void *x1, const void *x2)
{
        const zio_t *z1 = x1;
        const zio_t *z2 = x2;

        if (z1->io_deadline < z2->io_deadline)
                return (-1);
        if (z1->io_deadline > z2->io_deadline)
                return (1);

        if (z1->io_offset < z2->io_offset)
                return (-1);
        if (z1->io_offset > z2->io_offset)
                return (1);

        if (z1 < z2)
                return (-1);
        if (z1 > z2)
                return (1);

        return (0);
}

int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
        const zio_t *z1 = x1;
        const zio_t *z2 = x2;

        if (z1->io_offset < z2->io_offset)
                return (-1);
        if (z1->io_offset > z2->io_offset)
                return (1);

        if (z1 < z2)
                return (-1);
        if (z1 > z2)
                return (1);

        return (0);
}

void
vdev_queue_init(vdev_t *vd)
{
        vdev_queue_t *vq = &vd->vdev_queue;

        mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);

        avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
            sizeof (zio_t), offsetof(struct zio, io_deadline_node));

        avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_offset_node));

        avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_offset_node));

        avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
            sizeof (zio_t), offsetof(struct zio, io_offset_node));
}

void
vdev_queue_fini(vdev_t *vd)
{
        vdev_queue_t *vq = &vd->vdev_queue;

        avl_destroy(&vq->vq_deadline_tree);
        avl_destroy(&vq->vq_read_tree);
        avl_destroy(&vq->vq_write_tree);
        avl_destroy(&vq->vq_pending_tree);

        mutex_destroy(&vq->vq_lock);
}

static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
        avl_add(&vq->vq_deadline_tree, zio);
        avl_add(zio->io_vdev_tree, zio);
}

static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
        avl_remove(&vq->vq_deadline_tree, zio);
        avl_remove(zio->io_vdev_tree, zio);
}

static void
vdev_queue_agg_io_done(zio_t *aio)
{
        zio_t *pio;

        while ((pio = zio_walk_parents(aio)) != NULL)
                if (aio->io_type == ZIO_TYPE_READ)
                        bcopy((char *)aio->io_data + (pio->io_offset -
                            aio->io_offset), pio->io_data, pio->io_size);

        zio_buf_free(aio->io_data, aio->io_size);
}

/*
 * Compute the range spanned by two i/os, which is the endpoint of the last
 * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
 * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
 * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
 */
#define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
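/*
 * Illustrative sketch (not part of the original file): the IO_SPAN/IO_GAP
 * identities exercised on a toy struct that reuses the field names the
 * macros expect.  It reuses the IO_SPAN/IO_GAP definitions above.  The
 * VDEV_QUEUE_SPAN_EXAMPLE guard is hypothetical and left undefined, so
 * this block is compiled out.
 */
#ifdef VDEV_QUEUE_SPAN_EXAMPLE
#include <stdio.h>
#include <stdint.h>

typedef struct example_io {
        uint64_t io_offset;
        uint64_t io_size;
} example_io_t;

int
main(void)
{
        example_io_t a = { .io_offset = 0,  .io_size = 8 };     /* [0, 8)   */
        example_io_t b = { .io_offset = 8,  .io_size = 8 };     /* [8, 16)  */
        example_io_t c = { .io_offset = 24, .io_size = 8 };     /* [24, 32) */

        /* Span of a..b covers both extents: 16 - 0 = 16. */
        printf("IO_SPAN(a, b) = %lld\n", (long long)IO_SPAN(&a, &b));

        /* a ends exactly where b starts, so they are adjacent: gap 0. */
        printf("IO_GAP(a, b)  = %lld\n", (long long)IO_GAP(&a, &b));

        /* b ends at 16 and c starts at 24, leaving an 8-byte gap. */
        printf("IO_GAP(b, c)  = %lld\n", (long long)IO_GAP(&b, &c));
        return (0);
}
#endif  /* VDEV_QUEUE_SPAN_EXAMPLE */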
static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
{
        zio_t *fio, *lio, *aio, *dio, *nio;
        avl_tree_t *t;
        int flags;
        uint64_t maxspan = zfs_vdev_aggregation_limit;
        uint64_t maxgap;

        ASSERT(MUTEX_HELD(&vq->vq_lock));

        if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
            avl_numnodes(&vq->vq_deadline_tree) == 0)
                return (NULL);

        fio = lio = avl_first(&vq->vq_deadline_tree);

        t = fio->io_vdev_tree;
        flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
        maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;

        if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
                /*
                 * We can aggregate I/Os that are adjacent and of the
                 * same flavor, as expressed by the AGG_INHERIT flags.
                 * The latter is necessary so that certain attributes
                 * of the I/O, such as whether it's a normal I/O or a
                 * scrub/resilver, can be preserved in the aggregate.
                 */
                while ((dio = AVL_PREV(t, fio)) != NULL &&
                    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
                    IO_SPAN(dio, lio) <= maxspan && IO_GAP(dio, fio) <= maxgap)
                        fio = dio;

                while ((dio = AVL_NEXT(t, lio)) != NULL &&
                    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
                    IO_SPAN(fio, dio) <= maxspan && IO_GAP(lio, dio) <= maxgap)
                        lio = dio;
        }

        if (fio != lio) {
                uint64_t size = IO_SPAN(fio, lio);
                ASSERT(size <= zfs_vdev_aggregation_limit);

                aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
                    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_NOW,
                    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
                    vdev_queue_agg_io_done, NULL);

                nio = fio;
                do {
                        dio = nio;
                        nio = AVL_NEXT(t, dio);
                        ASSERT(dio->io_type == aio->io_type);
                        ASSERT(dio->io_vdev_tree == t);

                        if (dio->io_type == ZIO_TYPE_WRITE)
                                bcopy(dio->io_data, (char *)aio->io_data +
                                    (dio->io_offset - aio->io_offset),
                                    dio->io_size);

                        zio_add_child(dio, aio);
                        vdev_queue_io_remove(vq, dio);
                        zio_vdev_io_bypass(dio);
                        zio_execute(dio);
                } while (dio != lio);

                avl_add(&vq->vq_pending_tree, aio);

                return (aio);
        }

        ASSERT(fio->io_vdev_tree == t);
        vdev_queue_io_remove(vq, fio);

        avl_add(&vq->vq_pending_tree, fio);

        return (fio);
}

zio_t *
vdev_queue_io(zio_t *zio)
{
        vdev_queue_t *vq = &zio->io_vd->vdev_queue;
        zio_t *nio;

        ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

        if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
                return (zio);

        zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

        if (zio->io_type == ZIO_TYPE_READ)
                zio->io_vdev_tree = &vq->vq_read_tree;
        else
                zio->io_vdev_tree = &vq->vq_write_tree;

        mutex_enter(&vq->vq_lock);

        zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority;

        vdev_queue_io_add(vq, zio);

        nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);

        mutex_exit(&vq->vq_lock);

        if (nio == NULL)
                return (NULL);

        if (nio->io_done == vdev_queue_agg_io_done) {
                zio_nowait(nio);
                return (NULL);
        }

        return (nio);
}

void
vdev_queue_io_done(zio_t *zio)
{
        vdev_queue_t *vq = &zio->io_vd->vdev_queue;

        mutex_enter(&vq->vq_lock);

        avl_remove(&vq->vq_pending_tree, zio);

        for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
                zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
                if (nio == NULL)
                        break;
                mutex_exit(&vq->vq_lock);
                if (nio->io_done == vdev_queue_agg_io_done) {
                        zio_nowait(nio);
                } else {
                        zio_vdev_io_reissue(nio);
                        zio_execute(nio);
                }
                mutex_enter(&vq->vq_lock);
        }

        mutex_exit(&vq->vq_lock);
}
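/*
 * Illustrative sketch (not part of the original file): a simplified userland
 * model of the ramp-up policy implemented by vdev_queue_io() and
 * vdev_queue_io_done().  A new arrival may issue only while fewer than
 * zfs_vdev_min_pending I/Os are outstanding, but every completion tries to
 * issue up to zfs_vdev_ramp_rate replacements under the zfs_vdev_max_pending
 * cap, so a busy queue's concurrency roughly doubles with each completed
 * generation (hence "exponential ramp-up").  The VDEV_QUEUE_RAMP_EXAMPLE
 * guard is hypothetical and left undefined, so this block is compiled out.
 */
#ifdef VDEV_QUEUE_RAMP_EXAMPLE
#include <stdio.h>

int
main(void)
{
        int min_pending = 4, max_pending = 35, ramp_rate = 2;
        int pending = min_pending;      /* arrivals alone stop here */
        int generation = 0;

        /*
         * Model one "generation" as: every currently pending I/O completes,
         * and each completion issues up to ramp_rate new I/Os (as the loop
         * in vdev_queue_io_done() does), capped at max_pending.  Prints
         * 8, 16, 32, 35.
         */
        while (pending < max_pending) {
                pending *= ramp_rate;
                if (pending > max_pending)
                        pending = max_pending;
                printf("generation %d: %d pending\n", ++generation, pending);
        }
        return (0);
}
#endif  /* VDEV_QUEUE_RAMP_EXAMPLE */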