xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_mirror.c (revision 240e56fe6d839850e4681b02d57a1bb22e08eb86)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/vdev_impl.h>
31 #include <sys/zio.h>
32 #include <sys/fs/zfs.h>
33 
34 /*
35  * Virtual device vector for mirroring.
36  */
37 
/*
 * Per-child I/O state for a single mirror I/O.  Allocated as an array with
 * one entry per mirror child and hung off zio->io_vsd for the I/O's lifetime
 * (see vdev_mirror_map_alloc()/vdev_mirror_map_free()).
 */
typedef struct mirror_map {
	int	mm_error;	/* error (if any) from this child's I/O */
	short	mm_tried;	/* nonzero once an I/O was issued to child */
	short	mm_skipped;	/* nonzero if child was passed over (dead/stale) */
} mirror_map_t;
43 
44 static mirror_map_t *
45 vdev_mirror_map_alloc(zio_t *zio)
46 {
47 	zio->io_vsd = kmem_zalloc(zio->io_vd->vdev_children *
48 	    sizeof (mirror_map_t), KM_SLEEP);
49 	return (zio->io_vsd);
50 }
51 
52 static void
53 vdev_mirror_map_free(zio_t *zio)
54 {
55 	kmem_free(zio->io_vsd,
56 	    zio->io_vd->vdev_children * sizeof (mirror_map_t));
57 	zio->io_vsd = NULL;
58 }
59 
60 static int
61 vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
62 {
63 	vdev_t *cvd;
64 	uint64_t c;
65 	int numerrors = 0;
66 	int ret, lasterror = 0;
67 
68 	if (vd->vdev_children == 0) {
69 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
70 		return (EINVAL);
71 	}
72 
73 	for (c = 0; c < vd->vdev_children; c++) {
74 		cvd = vd->vdev_child[c];
75 
76 		if ((ret = vdev_open(cvd)) != 0) {
77 			lasterror = ret;
78 			numerrors++;
79 			continue;
80 		}
81 
82 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
83 		*ashift = MAX(*ashift, cvd->vdev_ashift);
84 	}
85 
86 	if (numerrors == vd->vdev_children) {
87 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
88 		return (lasterror);
89 	}
90 
91 	return (0);
92 }
93 
94 static void
95 vdev_mirror_close(vdev_t *vd)
96 {
97 	uint64_t c;
98 
99 	for (c = 0; c < vd->vdev_children; c++)
100 		vdev_close(vd->vdev_child[c]);
101 }
102 
103 static void
104 vdev_mirror_child_done(zio_t *zio)
105 {
106 	mirror_map_t *mm = zio->io_private;
107 
108 	mm->mm_error = zio->io_error;
109 	mm->mm_tried = 1;
110 	mm->mm_skipped = 0;
111 }
112 
113 static void
114 vdev_mirror_scrub_done(zio_t *zio)
115 {
116 	mirror_map_t *mm = zio->io_private;
117 
118 	if (zio->io_error == 0) {
119 		zio_t *pio = zio->io_parent;
120 		mutex_enter(&pio->io_lock);
121 		bcopy(zio->io_data, pio->io_data, pio->io_size);
122 		mutex_exit(&pio->io_lock);
123 	}
124 
125 	zio_buf_free(zio->io_data, zio->io_size);
126 
127 	mm->mm_error = zio->io_error;
128 	mm->mm_tried = 1;
129 	mm->mm_skipped = 0;
130 }
131 
/*
 * Completion callback for the null zio that parents all repair writes in
 * vdev_mirror_io_done().  Its io_private is the original mirror zio (the
 * repair zio's parent, as asserted); free that zio's mirror map now that
 * every repair write has finished.
 */
static void
vdev_mirror_repair_done(zio_t *zio)
{
	ASSERT(zio->io_private == zio->io_parent);
	vdev_mirror_map_free(zio->io_private);
}
138 
139 /*
140  * Try to find a child whose DTL doesn't contain the block we want to read.
141  * If we can't, try the read on any vdev we haven't already tried.
142  */
143 static int
144 vdev_mirror_child_select(zio_t *zio)
145 {
146 	mirror_map_t *mm = zio->io_vsd;
147 	vdev_t *vd = zio->io_vd;
148 	vdev_t *cvd;
149 	uint64_t txg = zio->io_txg;
150 	int i, c;
151 
152 	ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg);
153 
154 	/*
155 	 * Select the child we'd like to read from absent any errors.
156 	 * The current policy is to alternate sides at 8M granularity.
157 	 * XXX -- investigate other policies for read distribution.
158 	 */
159 	c = (zio->io_offset >> (SPA_MAXBLOCKSHIFT + 6)) % vd->vdev_children;
160 
161 	/*
162 	 * If this is a replacing vdev, always try child 0 (the source) first.
163 	 */
164 	if (vd->vdev_ops == &vdev_replacing_ops)
165 		c = 0;
166 
167 	/*
168 	 * Try to find a child whose DTL doesn't contain the block to read.
169 	 * If a child is known to be completely inaccessible (indicated by
170 	 * vdev_is_dead() returning B_TRUE), don't even try.
171 	 */
172 	for (i = 0; i < vd->vdev_children; i++, c++) {
173 		if (c >= vd->vdev_children)
174 			c = 0;
175 		if (mm[c].mm_tried || mm[c].mm_skipped)
176 			continue;
177 		cvd = vd->vdev_child[c];
178 		if (vdev_is_dead(cvd)) {
179 			mm[c].mm_error = ENXIO;
180 			mm[c].mm_tried = 1;	/* don't even try */
181 			mm[c].mm_skipped = 1;
182 			continue;
183 		}
184 		if (!vdev_dtl_contains(&cvd->vdev_dtl_map, txg, 1))
185 			return (c);
186 		mm[c].mm_error = ESTALE;
187 		mm[c].mm_skipped = 1;
188 	}
189 
190 	/*
191 	 * Every device is either missing or has this txg in its DTL.
192 	 * If we don't have any sibling replicas to consult, look for
193 	 * any child we haven't already tried before giving up.
194 	 */
195 	if (vd == vd->vdev_top || vd->vdev_parent->vdev_children <= 1) {
196 		for (c = 0; c < vd->vdev_children; c++) {
197 			if (!mm[c].mm_tried)
198 				return (c);
199 		}
200 	}
201 
202 	/*
203 	 * Every child failed.  There's no place left to look.
204 	 */
205 	return (-1);
206 }
207 
208 static void
209 vdev_mirror_io_start(zio_t *zio)
210 {
211 	vdev_t *vd = zio->io_vd;
212 	mirror_map_t *mm;
213 	int c, children;
214 
215 	mm = vdev_mirror_map_alloc(zio);
216 
217 	if (zio->io_type == ZIO_TYPE_READ) {
218 		if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
219 		    vd->vdev_ops != &vdev_replacing_ops) {
220 			/*
221 			 * For scrubbing reads we need to allocate a read
222 			 * buffer for each child and issue reads to all
223 			 * children.  If any child succeeds, it will copy its
224 			 * data into zio->io_data in vdev_mirror_scrub_done.
225 			 */
226 			for (c = 0; c < vd->vdev_children; c++) {
227 				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
228 				    vd->vdev_child[c], zio->io_offset,
229 				    zio_buf_alloc(zio->io_size), zio->io_size,
230 				    zio->io_type, zio->io_priority,
231 				    ZIO_FLAG_CANFAIL, vdev_mirror_scrub_done,
232 				    &mm[c]));
233 			}
234 			zio_wait_children_done(zio);
235 			return;
236 		}
237 		/*
238 		 * For normal reads just pick one child.
239 		 */
240 		c = vdev_mirror_child_select(zio);
241 		children = (c >= 0);
242 	} else {
243 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
244 
245 		/*
246 		 * If this is a resilvering I/O to a replacing vdev,
247 		 * only the last child should be written -- unless the
248 		 * first child happens to have a DTL entry here as well.
249 		 * All other writes go to all children.
250 		 */
251 		if ((zio->io_flags & ZIO_FLAG_RESILVER) &&
252 		    vd->vdev_ops == &vdev_replacing_ops &&
253 		    !vdev_dtl_contains(&vd->vdev_child[0]->vdev_dtl_map,
254 		    zio->io_txg, 1)) {
255 			c = vd->vdev_children - 1;
256 			children = 1;
257 		} else {
258 			c = 0;
259 			children = vd->vdev_children;
260 		}
261 	}
262 
263 	while (children--) {
264 		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
265 		    vd->vdev_child[c], zio->io_offset, zio->io_data,
266 		    zio->io_size, zio->io_type, zio->io_priority,
267 		    ZIO_FLAG_CANFAIL, vdev_mirror_child_done, &mm[c]));
268 		c++;
269 	}
270 
271 	zio_wait_children_done(zio);
272 }
273 
/*
 * Completion routine for mirror I/O.  Tallies per-child results; treats
 * a write as successful if any child succeeded; for reads, retries on
 * another child when no good copy was obtained, and uses a good copy to
 * repair children that failed or are known stale.
 */
static void
vdev_mirror_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	mirror_map_t *mm = zio->io_vsd;
	int c;
	int good_copies = 0;		/* children that completed with no error */
	int unexpected_errors = 0;	/* real failures, not deliberate skips */

	ASSERT(mm != NULL);

	zio->io_error = 0;
	zio->io_numerrors = 0;

	/*
	 * Tally up the results from each child.
	 */
	for (c = 0; c < vd->vdev_children; c++) {
		if (mm[c].mm_tried && mm[c].mm_error == 0) {
			good_copies++;
			continue;
		}

		/*
		 * We preserve any EIOs because those may be worth retrying;
		 * whereas ECKSUM and ENXIO are more likely to be persistent.
		 */
		if (mm[c].mm_error) {
			if (zio->io_error != EIO)
				zio->io_error = mm[c].mm_error;
			if (!mm[c].mm_skipped)
				unexpected_errors++;
			zio->io_numerrors++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as success.
		 */
		/* XXPOLICY */
		if (good_copies != 0)
			zio->io_error = 0;
		ASSERT(mm != NULL);
		vdev_mirror_map_free(zio);
		zio_next_stage(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If we don't have a good copy yet, keep trying other children.
	 */
	/* XXPOLICY */
	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
		ASSERT(c >= 0 && c < vd->vdev_children);
		cvd = vd->vdev_child[c];
		dprintf("%s: retrying i/o (err=%d) on child %s\n",
		    vdev_description(zio->io_vd), zio->io_error,
		    vdev_description(cvd));
		/* Reset the error and rewind to the I/O stage for the retry. */
		zio->io_error = 0;
		zio_vdev_io_redone(zio);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp, cvd,
		    zio->io_offset, zio->io_data, zio->io_size,
		    ZIO_TYPE_READ, zio->io_priority, ZIO_FLAG_CANFAIL,
		    vdev_mirror_child_done, &mm[c]));
		zio_wait_children_done(zio);
		return;
	}

	/* XXPOLICY */
	if (good_copies)
		zio->io_error = 0;
	else
		ASSERT(zio->io_error != 0);

	/*
	 * Repair only when the pool is writable and either a child failed
	 * unexpectedly or this read is part of a resilver.
	 */
	if (good_copies && (spa_mode & FWRITE) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		zio_t *rio;

		/*
		 * Use the good data we have in hand to repair damaged children.
		 *
		 * We issue all repair I/Os as children of 'rio' to arrange
		 * that vdev_mirror_map_free(zio) will be invoked after all
		 * repairs complete, but before we advance to the next stage.
		 */
		rio = zio_null(zio, zio->io_spa,
		    vdev_mirror_repair_done, zio, ZIO_FLAG_CANFAIL);

		for (c = 0; c < vd->vdev_children; c++) {
			/*
			 * Don't rewrite known good children.
			 * Not only is it unnecessary, it could
			 * actually be harmful: if the system lost
			 * power while rewriting the only good copy,
			 * there would be no good copies left!
			 */
			cvd = vd->vdev_child[c];

			if (mm[c].mm_error == 0) {
				if (mm[c].mm_tried)
					continue;
				/* Untried but not stale: nothing to repair. */
				if (!vdev_dtl_contains(&cvd->vdev_dtl_map,
				    zio->io_txg, 1))
					continue;
				mm[c].mm_error = ESTALE;
			}

			dprintf("%s resilvered %s @ 0x%llx error %d\n",
			    vdev_description(vd),
			    vdev_description(cvd),
			    zio->io_offset, mm[c].mm_error);

			zio_nowait(zio_vdev_child_io(rio, zio->io_bp, cvd,
			    zio->io_offset, zio->io_data, zio->io_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
		}

		zio_nowait(rio);
		zio_wait_children_done(zio);
		return;
	}

	vdev_mirror_map_free(zio);
	zio_next_stage(zio);
}
402 
403 static void
404 vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
405 {
406 	if (faulted == vd->vdev_children)
407 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
408 		    VDEV_AUX_NO_REPLICAS);
409 	else if (degraded + faulted != 0)
410 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
411 	else
412 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
413 }
414 
/*
 * Operations vector for mirror vdevs (positional initializer; slots
 * annotated below).
 */
vdev_ops_t vdev_mirror_ops = {
	vdev_mirror_open,	/* open */
	vdev_mirror_close,	/* close */
	vdev_default_asize,	/* asize */
	vdev_mirror_io_start,	/* I/O start */
	vdev_mirror_io_done,	/* I/O done */
	vdev_mirror_state_change,	/* state change */
	VDEV_TYPE_MIRROR,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
425 
/*
 * Operations vector for replacing vdevs -- a two-way mirror of the old
 * and new device sharing the mirror implementation; reads prefer child 0
 * (the source) and resilver writes target the last child (see
 * vdev_mirror_child_select() and vdev_mirror_io_start()).
 */
vdev_ops_t vdev_replacing_ops = {
	vdev_mirror_open,	/* open */
	vdev_mirror_close,	/* close */
	vdev_default_asize,	/* asize */
	vdev_mirror_io_start,	/* I/O start */
	vdev_mirror_io_done,	/* I/O done */
	vdev_mirror_state_change,	/* state change */
	VDEV_TYPE_REPLACING,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};
436