/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/avl.h>

/*
 * These tunables are for performance analysis.
 */
/*
 * zfs_vdev_max_pending is the maximum number of i/os concurrently
 * pending to each device.  zfs_vdev_min_pending is the initial number
 * of i/os pending to each device (before it starts ramping up to
 * max_pending).
 */
int zfs_vdev_max_pending = 35;
int zfs_vdev_min_pending = 4;

/* maximum scrub/resilver I/O queue */
int zfs_scrub_limit = 70;

/* deadline = pri + (lbolt >> time_shift) */
int zfs_vdev_time_shift = 6;

/* exponential I/O issue ramp-up rate */
int zfs_vdev_ramp_rate = 2;

/*
 * i/os will be aggregated into a single large i/o up to
 * zfs_vdev_aggregation_limit bytes long.
 */
int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;

/*
 * Virtual device vector for disk I/O scheduling.
 */
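/*
 * Each queued i/o is keyed by deadline = priority + (lbolt >> time_shift):
 * a numerically lower priority issues sooner, while the elapsed-time term
 * keeps low-priority i/os from starving.  For example, with
 * zfs_vdev_time_shift = 6, a priority-4 i/o queued more than about
 * 4 << 6 = 256 ticks ago sorts ahead of a newly queued priority-0 i/o.
 *
 * The comparators below break deadline and offset ties by zio address,
 * so no two entries ever compare equal, as the AVL trees require.
 */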
int
vdev_queue_deadline_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_deadline < z2->io_deadline)
		return (-1);
	if (z1->io_deadline > z2->io_deadline)
		return (1);

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

int
vdev_queue_offset_compare(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_offset < z2->io_offset)
		return (-1);
	if (z1->io_offset > z2->io_offset)
		return (1);

	if (z1 < z2)
		return (-1);
	if (z1 > z2)
		return (1);

	return (0);
}

void
vdev_queue_init(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);

	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));

	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));

	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
}

void
vdev_queue_fini(vdev_t *vd)
{
	vdev_queue_t *vq = &vd->vdev_queue;

	ASSERT(vq->vq_scrub_count == 0);

	avl_destroy(&vq->vq_deadline_tree);
	avl_destroy(&vq->vq_read_tree);
	avl_destroy(&vq->vq_write_tree);
	avl_destroy(&vq->vq_pending_tree);

	mutex_destroy(&vq->vq_lock);
}

static void
vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
	avl_add(&vq->vq_deadline_tree, zio);
	avl_add(zio->io_vdev_tree, zio);

	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
	    ++vq->vq_scrub_count >= zfs_scrub_limit)
		spa_scrub_throttle(zio->io_spa, 1);
}

static void
vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
	if ((zio->io_flags & ZIO_FLAG_SCRUB_THREAD) &&
	    vq->vq_scrub_count-- >= zfs_scrub_limit)
		spa_scrub_throttle(zio->io_spa, -1);

	avl_remove(&vq->vq_deadline_tree, zio);
	avl_remove(zio->io_vdev_tree, zio);
}

static void
vdev_queue_agg_io_done(zio_t *aio)
{
	zio_t *dio;
	uint64_t offset = 0;

	while ((dio = aio->io_delegate_list) != NULL) {
		if (aio->io_type == ZIO_TYPE_READ)
			bcopy((char *)aio->io_data + offset, dio->io_data,
			    dio->io_size);
		offset += dio->io_size;
		aio->io_delegate_list = dio->io_delegate_next;
		dio->io_delegate_next = NULL;
		dio->io_error = aio->io_error;
		zio_next_stage(dio);
	}
	ASSERT3U(offset, ==, aio->io_size);

	zio_buf_free(aio->io_data, aio->io_size);
}

#define	IS_ADJACENT(io, nio) \
	((io)->io_offset + (io)->io_size == (nio)->io_offset)

typedef void zio_issue_func_t(zio_t *);

static zio_t *
vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
    zio_issue_func_t **funcp)
{
	zio_t *fio, *lio, *aio, *dio;
	avl_tree_t *tree;
	uint64_t size;

	ASSERT(MUTEX_HELD(&vq->vq_lock));

	*funcp = NULL;

	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
	    avl_numnodes(&vq->vq_deadline_tree) == 0)
		return (NULL);

	fio = lio = avl_first(&vq->vq_deadline_tree);

	tree = fio->io_vdev_tree;
	size = fio->io_size;
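
	/*
	 * Try to aggregate the i/o at the head of the deadline queue:
	 * walk backward from fio and forward from lio along its
	 * offset-sorted tree, chaining physically adjacent i/os while
	 * the combined size stays within zfs_vdev_aggregation_limit.
	 */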
	while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		dio->io_delegate_next = fio;
		fio = dio;
		size += dio->io_size;
	}

	while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) &&
	    size + dio->io_size <= zfs_vdev_aggregation_limit) {
		lio->io_delegate_next = dio;
		lio = dio;
		size += dio->io_size;
	}

	if (fio != lio) {
		char *buf = zio_buf_alloc(size);
		uint64_t offset = 0;
		int nagg = 0;

		ASSERT(size <= zfs_vdev_aggregation_limit);

		aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
		    fio->io_offset, buf, size, fio->io_type,
		    ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
		    ZIO_FLAG_NOBOOKMARK,
		    vdev_queue_agg_io_done, NULL);

		aio->io_delegate_list = fio;

		for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
			ASSERT(dio->io_type == aio->io_type);
			ASSERT(dio->io_vdev_tree == tree);
			if (dio->io_type == ZIO_TYPE_WRITE)
				bcopy(dio->io_data, buf + offset, dio->io_size);
			offset += dio->io_size;
			vdev_queue_io_remove(vq, dio);
			zio_vdev_io_bypass(dio);
			nagg++;
		}

		ASSERT(offset == size);

		dprintf("%5s T=%llu off=%8llx agg=%3d "
		    "old=%5llx new=%5llx\n",
		    zio_type_name[fio->io_type],
		    fio->io_deadline, fio->io_offset, nagg, fio->io_size, size);

		avl_add(&vq->vq_pending_tree, aio);

		*funcp = zio_nowait;
		return (aio);
	}

	ASSERT(fio->io_vdev_tree == tree);
	vdev_queue_io_remove(vq, fio);

	avl_add(&vq->vq_pending_tree, fio);

	*funcp = zio_next_stage;

	return (fio);
}

zio_t *
vdev_queue_io(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;
	zio_issue_func_t *func;

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
		return (zio);

	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;

	if (zio->io_type == ZIO_TYPE_READ)
		zio->io_vdev_tree = &vq->vq_read_tree;
	else
		zio->io_vdev_tree = &vq->vq_write_tree;

	mutex_enter(&vq->vq_lock);

	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
	    zio->io_priority;

	vdev_queue_io_add(vq, zio);

	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending, &func);

	mutex_exit(&vq->vq_lock);

	if (nio == NULL || func != zio_nowait)
		return (nio);

	func(nio);
	return (NULL);
}

void
vdev_queue_io_done(zio_t *zio)
{
	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
	zio_t *nio;
	zio_issue_func_t *func;
	int i;

	mutex_enter(&vq->vq_lock);

	avl_remove(&vq->vq_pending_tree, zio);

	for (i = 0; i < zfs_vdev_ramp_rate; i++) {
		nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending, &func);
		if (nio == NULL)
			break;
		mutex_exit(&vq->vq_lock);
		if (func == zio_next_stage)
			zio_vdev_io_reissue(nio);
		func(nio);
		mutex_enter(&vq->vq_lock);
	}

	mutex_exit(&vq->vq_lock);
}
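
/*
 * Note on the issue path above: vdev_queue_io() only primes a device
 * with up to zfs_vdev_min_pending outstanding i/os, and each completion
 * in vdev_queue_io_done() may issue up to zfs_vdev_ramp_rate new i/os,
 * so the pending count ramps up toward zfs_vdev_max_pending as the
 * device keeps completing work.
 */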