xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_queue.c (revision c56b380c394069fe4abc1010960870ca8b2726a8)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2012 by Delphix. All rights reserved.
28  */
29 
30 #include <sys/zfs_context.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/spa_impl.h>
33 #include <sys/zio.h>
34 #include <sys/avl.h>
35 
36 /*
37  * These tunables are for performance analysis.
38  */
39 
40 /* The maximum number of I/Os concurrently pending to each device. */
41 int zfs_vdev_max_pending = 10;
42 
43 /*
44  * The initial number of I/Os pending to each device, before it starts ramping
45  * up to zfs_vdev_max_pending.
46  */
47 int zfs_vdev_min_pending = 4;
48 
49 /*
50  * The deadlines are grouped into buckets based on zfs_vdev_time_shift:
51  * deadline = pri + gethrtime() >> time_shift)
52  */
53 int zfs_vdev_time_shift = 29; /* each bucket is 0.537 seconds */
54 
55 /* exponential I/O issue ramp-up rate */
56 int zfs_vdev_ramp_rate = 2;
57 
58 /*
59  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
60  * For read I/Os, we also aggregate across small adjacency gaps; for writes
61  * we include spans of optional I/Os to aid aggregation at the disk even when
62  * they aren't able to help us aggregate at this level.
63  */
64 int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE;
65 int zfs_vdev_read_gap_limit = 32 << 10;
66 int zfs_vdev_write_gap_limit = 4 << 10;
67 
68 /*
69  * Virtual device vector for disk I/O scheduling.
70  */
71 int
72 vdev_queue_deadline_compare(const void *x1, const void *x2)
73 {
74 	const zio_t *z1 = x1;
75 	const zio_t *z2 = x2;
76 
77 	if (z1->io_deadline < z2->io_deadline)
78 		return (-1);
79 	if (z1->io_deadline > z2->io_deadline)
80 		return (1);
81 
82 	if (z1->io_offset < z2->io_offset)
83 		return (-1);
84 	if (z1->io_offset > z2->io_offset)
85 		return (1);
86 
87 	if (z1 < z2)
88 		return (-1);
89 	if (z1 > z2)
90 		return (1);
91 
92 	return (0);
93 }
94 
95 int
96 vdev_queue_offset_compare(const void *x1, const void *x2)
97 {
98 	const zio_t *z1 = x1;
99 	const zio_t *z2 = x2;
100 
101 	if (z1->io_offset < z2->io_offset)
102 		return (-1);
103 	if (z1->io_offset > z2->io_offset)
104 		return (1);
105 
106 	if (z1 < z2)
107 		return (-1);
108 	if (z1 > z2)
109 		return (1);
110 
111 	return (0);
112 }
113 
114 void
115 vdev_queue_init(vdev_t *vd)
116 {
117 	vdev_queue_t *vq = &vd->vdev_queue;
118 
119 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
120 
121 	avl_create(&vq->vq_deadline_tree, vdev_queue_deadline_compare,
122 	    sizeof (zio_t), offsetof(struct zio, io_deadline_node));
123 
124 	avl_create(&vq->vq_read_tree, vdev_queue_offset_compare,
125 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
126 
127 	avl_create(&vq->vq_write_tree, vdev_queue_offset_compare,
128 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
129 
130 	avl_create(&vq->vq_pending_tree, vdev_queue_offset_compare,
131 	    sizeof (zio_t), offsetof(struct zio, io_offset_node));
132 }
133 
134 void
135 vdev_queue_fini(vdev_t *vd)
136 {
137 	vdev_queue_t *vq = &vd->vdev_queue;
138 
139 	avl_destroy(&vq->vq_deadline_tree);
140 	avl_destroy(&vq->vq_read_tree);
141 	avl_destroy(&vq->vq_write_tree);
142 	avl_destroy(&vq->vq_pending_tree);
143 
144 	mutex_destroy(&vq->vq_lock);
145 }
146 
147 static void
148 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
149 {
150 	spa_t *spa = zio->io_spa;
151 	avl_add(&vq->vq_deadline_tree, zio);
152 	avl_add(zio->io_vdev_tree, zio);
153 
154 	if (spa->spa_iokstat != NULL) {
155 		mutex_enter(&spa->spa_iokstat_lock);
156 		kstat_waitq_enter(spa->spa_iokstat->ks_data);
157 		mutex_exit(&spa->spa_iokstat_lock);
158 	}
159 }
160 
161 static void
162 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
163 {
164 	spa_t *spa = zio->io_spa;
165 	avl_remove(&vq->vq_deadline_tree, zio);
166 	avl_remove(zio->io_vdev_tree, zio);
167 
168 	if (spa->spa_iokstat != NULL) {
169 		mutex_enter(&spa->spa_iokstat_lock);
170 		kstat_waitq_exit(spa->spa_iokstat->ks_data);
171 		mutex_exit(&spa->spa_iokstat_lock);
172 	}
173 }
174 
175 static void
176 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
177 {
178 	spa_t *spa = zio->io_spa;
179 	avl_add(&vq->vq_pending_tree, zio);
180 	if (spa->spa_iokstat != NULL) {
181 		mutex_enter(&spa->spa_iokstat_lock);
182 		kstat_runq_enter(spa->spa_iokstat->ks_data);
183 		mutex_exit(&spa->spa_iokstat_lock);
184 	}
185 }
186 
187 static void
188 vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
189 {
190 	spa_t *spa = zio->io_spa;
191 	avl_remove(&vq->vq_pending_tree, zio);
192 	if (spa->spa_iokstat != NULL) {
193 		kstat_io_t *ksio = spa->spa_iokstat->ks_data;
194 
195 		mutex_enter(&spa->spa_iokstat_lock);
196 		kstat_runq_exit(spa->spa_iokstat->ks_data);
197 		if (zio->io_type == ZIO_TYPE_READ) {
198 			ksio->reads++;
199 			ksio->nread += zio->io_size;
200 		} else if (zio->io_type == ZIO_TYPE_WRITE) {
201 			ksio->writes++;
202 			ksio->nwritten += zio->io_size;
203 		}
204 		mutex_exit(&spa->spa_iokstat_lock);
205 	}
206 }
207 
208 static void
209 vdev_queue_agg_io_done(zio_t *aio)
210 {
211 	zio_t *pio;
212 
213 	while ((pio = zio_walk_parents(aio)) != NULL)
214 		if (aio->io_type == ZIO_TYPE_READ)
215 			bcopy((char *)aio->io_data + (pio->io_offset -
216 			    aio->io_offset), pio->io_data, pio->io_size);
217 
218 	zio_buf_free(aio->io_data, aio->io_size);
219 }
220 
/*
 * IO_SPAN(fio, lio) is the range covered by two I/Os: the end of the last
 * one (lio->io_offset + lio->io_size) minus the start of the first one
 * (fio->io_offset).
 *
 * IO_GAP(fio, lio) is the distance between the end of fio and the start of
 * lio.  It is conveniently obtained as -IO_SPAN(lio, fio), so fio and lio
 * are exactly adjacent if and only if IO_SPAN(lio, fio) == 0.
 */
#define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
229 
230 static zio_t *
231 vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit)
232 {
233 	zio_t *fio, *lio, *aio, *dio, *nio, *mio;
234 	avl_tree_t *t;
235 	int flags;
236 	uint64_t maxspan = zfs_vdev_aggregation_limit;
237 	uint64_t maxgap;
238 	int stretch;
239 
240 again:
241 	ASSERT(MUTEX_HELD(&vq->vq_lock));
242 
243 	if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit ||
244 	    avl_numnodes(&vq->vq_deadline_tree) == 0)
245 		return (NULL);
246 
247 	fio = lio = avl_first(&vq->vq_deadline_tree);
248 
249 	t = fio->io_vdev_tree;
250 	flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT;
251 	maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0;
252 
253 	if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) {
254 		/*
255 		 * We can aggregate I/Os that are sufficiently adjacent and of
256 		 * the same flavor, as expressed by the AGG_INHERIT flags.
257 		 * The latter requirement is necessary so that certain
258 		 * attributes of the I/O, such as whether it's a normal I/O
259 		 * or a scrub/resilver, can be preserved in the aggregate.
260 		 * We can include optional I/Os, but don't allow them
261 		 * to begin a range as they add no benefit in that situation.
262 		 */
263 
264 		/*
265 		 * We keep track of the last non-optional I/O.
266 		 */
267 		mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio;
268 
269 		/*
270 		 * Walk backwards through sufficiently contiguous I/Os
271 		 * recording the last non-option I/O.
272 		 */
273 		while ((dio = AVL_PREV(t, fio)) != NULL &&
274 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
275 		    IO_SPAN(dio, lio) <= maxspan &&
276 		    IO_GAP(dio, fio) <= maxgap) {
277 			fio = dio;
278 			if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL))
279 				mio = fio;
280 		}
281 
282 		/*
283 		 * Skip any initial optional I/Os.
284 		 */
285 		while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) {
286 			fio = AVL_NEXT(t, fio);
287 			ASSERT(fio != NULL);
288 		}
289 
290 		/*
291 		 * Walk forward through sufficiently contiguous I/Os.
292 		 */
293 		while ((dio = AVL_NEXT(t, lio)) != NULL &&
294 		    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
295 		    IO_SPAN(fio, dio) <= maxspan &&
296 		    IO_GAP(lio, dio) <= maxgap) {
297 			lio = dio;
298 			if (!(lio->io_flags & ZIO_FLAG_OPTIONAL))
299 				mio = lio;
300 		}
301 
302 		/*
303 		 * Now that we've established the range of the I/O aggregation
304 		 * we must decide what to do with trailing optional I/Os.
305 		 * For reads, there's nothing to do. While we are unable to
306 		 * aggregate further, it's possible that a trailing optional
307 		 * I/O would allow the underlying device to aggregate with
308 		 * subsequent I/Os. We must therefore determine if the next
309 		 * non-optional I/O is close enough to make aggregation
310 		 * worthwhile.
311 		 */
312 		stretch = B_FALSE;
313 		if (t != &vq->vq_read_tree && mio != NULL) {
314 			nio = lio;
315 			while ((dio = AVL_NEXT(t, nio)) != NULL &&
316 			    IO_GAP(nio, dio) == 0 &&
317 			    IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) {
318 				nio = dio;
319 				if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
320 					stretch = B_TRUE;
321 					break;
322 				}
323 			}
324 		}
325 
326 		if (stretch) {
327 			/* This may be a no-op. */
328 			VERIFY((dio = AVL_NEXT(t, lio)) != NULL);
329 			dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
330 		} else {
331 			while (lio != mio && lio != fio) {
332 				ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL);
333 				lio = AVL_PREV(t, lio);
334 				ASSERT(lio != NULL);
335 			}
336 		}
337 	}
338 
339 	if (fio != lio) {
340 		uint64_t size = IO_SPAN(fio, lio);
341 		ASSERT(size <= zfs_vdev_aggregation_limit);
342 
343 		aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset,
344 		    zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG,
345 		    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
346 		    vdev_queue_agg_io_done, NULL);
347 		aio->io_timestamp = fio->io_timestamp;
348 
349 		nio = fio;
350 		do {
351 			dio = nio;
352 			nio = AVL_NEXT(t, dio);
353 			ASSERT(dio->io_type == aio->io_type);
354 			ASSERT(dio->io_vdev_tree == t);
355 
356 			if (dio->io_flags & ZIO_FLAG_NODATA) {
357 				ASSERT(dio->io_type == ZIO_TYPE_WRITE);
358 				bzero((char *)aio->io_data + (dio->io_offset -
359 				    aio->io_offset), dio->io_size);
360 			} else if (dio->io_type == ZIO_TYPE_WRITE) {
361 				bcopy(dio->io_data, (char *)aio->io_data +
362 				    (dio->io_offset - aio->io_offset),
363 				    dio->io_size);
364 			}
365 
366 			zio_add_child(dio, aio);
367 			vdev_queue_io_remove(vq, dio);
368 			zio_vdev_io_bypass(dio);
369 			zio_execute(dio);
370 		} while (dio != lio);
371 
372 		vdev_queue_pending_add(vq, aio);
373 
374 		return (aio);
375 	}
376 
377 	ASSERT(fio->io_vdev_tree == t);
378 	vdev_queue_io_remove(vq, fio);
379 
380 	/*
381 	 * If the I/O is or was optional and therefore has no data, we need to
382 	 * simply discard it. We need to drop the vdev queue's lock to avoid a
383 	 * deadlock that we could encounter since this I/O will complete
384 	 * immediately.
385 	 */
386 	if (fio->io_flags & ZIO_FLAG_NODATA) {
387 		mutex_exit(&vq->vq_lock);
388 		zio_vdev_io_bypass(fio);
389 		zio_execute(fio);
390 		mutex_enter(&vq->vq_lock);
391 		goto again;
392 	}
393 
394 	vdev_queue_pending_add(vq, fio);
395 
396 	return (fio);
397 }
398 
399 zio_t *
400 vdev_queue_io(zio_t *zio)
401 {
402 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
403 	zio_t *nio;
404 
405 	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
406 
407 	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
408 		return (zio);
409 
410 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
411 
412 	if (zio->io_type == ZIO_TYPE_READ)
413 		zio->io_vdev_tree = &vq->vq_read_tree;
414 	else
415 		zio->io_vdev_tree = &vq->vq_write_tree;
416 
417 	mutex_enter(&vq->vq_lock);
418 
419 	zio->io_timestamp = gethrtime();
420 	zio->io_deadline = (zio->io_timestamp >> zfs_vdev_time_shift) +
421 	    zio->io_priority;
422 
423 	vdev_queue_io_add(vq, zio);
424 
425 	nio = vdev_queue_io_to_issue(vq, zfs_vdev_min_pending);
426 
427 	mutex_exit(&vq->vq_lock);
428 
429 	if (nio == NULL)
430 		return (NULL);
431 
432 	if (nio->io_done == vdev_queue_agg_io_done) {
433 		zio_nowait(nio);
434 		return (NULL);
435 	}
436 
437 	return (nio);
438 }
439 
440 void
441 vdev_queue_io_done(zio_t *zio)
442 {
443 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
444 
445 	if (zio_injection_enabled)
446 		delay(SEC_TO_TICK(zio_handle_io_delay(zio)));
447 
448 	mutex_enter(&vq->vq_lock);
449 
450 	vdev_queue_pending_remove(vq, zio);
451 
452 	vq->vq_io_complete_ts = gethrtime();
453 
454 	for (int i = 0; i < zfs_vdev_ramp_rate; i++) {
455 		zio_t *nio = vdev_queue_io_to_issue(vq, zfs_vdev_max_pending);
456 		if (nio == NULL)
457 			break;
458 		mutex_exit(&vq->vq_lock);
459 		if (nio->io_done == vdev_queue_agg_io_done) {
460 			zio_nowait(nio);
461 		} else {
462 			zio_vdev_io_reissue(nio);
463 			zio_execute(nio);
464 		}
465 		mutex_enter(&vq->vq_lock);
466 	}
467 
468 	mutex_exit(&vq->vq_lock);
469 }
470