xref: /titanic_41/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision f2a3c691e1fab4dee486fd83642311ec59dc3732)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 #include <sys/zfs_context.h>
29 #include <sys/spa.h>
30 #include <sys/vdev_impl.h>
31 #include <sys/zio.h>
32 #include <sys/zio_checksum.h>
33 #include <sys/fs/zfs.h>
34 #include <sys/fm/fs/zfs.h>
35 
36 /*
37  * Virtual device vector for RAID-Z.
38  */
39 
/*
 * RAID-Z currently implements only the single-parity model, i.e. it can
 * reconstruct after a single fault.  Allocated sizes on a RAID-Z vdev are
 * rounded up to a multiple of VDEV_RAIDZ_ALIGN leaf-vdev blocks.
 */
#define	VDEV_RAIDZ_ALIGN	2ULL
46 
/*
 * State for one column of a RAID-Z I/O.  One of these exists for every
 * child that participates in the I/O, parity columns first.
 */
typedef struct raidz_col {
	uint64_t	rc_col;		/* index into the parent's child array */
	uint64_t	rc_offset;	/* offset of this column's I/O on the child */
	uint64_t	rc_size;	/* size of this column's I/O, in bytes */
	void		*rc_data;	/* buffer holding this column's data */
	int		rc_error;	/* errno recorded for this column, 0 if none */
	short		rc_tried;	/* nonzero once no (further) I/O is wanted */
	short		rc_skipped;	/* nonzero if the I/O was deliberately not issued */
} raidz_col_t;
56 
57 typedef struct raidz_map {
58 	uint64_t	rm_cols;
59 	uint64_t	rm_bigcols;
60 	uint64_t	rm_asize;
61 	int		rm_missing_child;
62 	int		rm_firstdatacol;
63 	raidz_col_t	rm_col[1];
64 } raidz_map_t;
65 
66 static raidz_map_t *
67 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols)
68 {
69 	raidz_map_t *rm;
70 	uint64_t b = zio->io_offset >> unit_shift;
71 	uint64_t s = zio->io_size >> unit_shift;
72 	uint64_t f = b % dcols;
73 	uint64_t o = (b / dcols) << unit_shift;
74 	uint64_t q, r, c, bc, col, acols, coff;
75 	int firstdatacol;
76 
77 	q = s / (dcols - 1);
78 	r = s - q * (dcols - 1);
79 	bc = r + !!r;
80 	firstdatacol = 1;
81 
82 	acols = (q == 0 ? bc : dcols);
83 
84 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
85 
86 	rm->rm_cols = acols;
87 	rm->rm_bigcols = bc;
88 	rm->rm_asize = 0;
89 	rm->rm_missing_child = -1;
90 	rm->rm_firstdatacol = firstdatacol;
91 
92 	for (c = 0; c < acols; c++) {
93 		col = f + c;
94 		coff = o;
95 		if (col >= dcols) {
96 			col -= dcols;
97 			coff += 1ULL << unit_shift;
98 		}
99 		rm->rm_col[c].rc_col = col;
100 		rm->rm_col[c].rc_offset = coff;
101 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
102 		rm->rm_col[c].rc_data = NULL;
103 		rm->rm_col[c].rc_error = 0;
104 		rm->rm_col[c].rc_tried = 0;
105 		rm->rm_col[c].rc_skipped = 0;
106 		rm->rm_asize += rm->rm_col[c].rc_size;
107 	}
108 
109 	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);
110 
111 	for (c = 0; c < rm->rm_firstdatacol; c++)
112 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
113 
114 	rm->rm_col[c].rc_data = zio->io_data;
115 
116 	for (c = c + 1; c < acols; c++)
117 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
118 		    rm->rm_col[c - 1].rc_size;
119 
120 	/*
121 	 * To prevent hot parity disks, switch the parity and data
122 	 * columns every 1MB.
123 	 */
124 	ASSERT(rm->rm_cols >= 2);
125 	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
126 
127 	if (zio->io_offset & (1ULL << 20)) {
128 		col = rm->rm_col[0].rc_col;
129 		o = rm->rm_col[0].rc_offset;
130 		rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
131 		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
132 		rm->rm_col[1].rc_col = col;
133 		rm->rm_col[1].rc_offset = o;
134 	}
135 
136 	zio->io_vsd = rm;
137 	return (rm);
138 }
139 
140 static void
141 vdev_raidz_map_free(zio_t *zio)
142 {
143 	raidz_map_t *rm = zio->io_vsd;
144 	int c;
145 
146 	for (c = 0; c < rm->rm_firstdatacol; c++)
147 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
148 
149 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
150 	zio->io_vsd = NULL;
151 }
152 
153 static void
154 vdev_raidz_reconstruct(raidz_map_t *rm, int x)
155 {
156 	uint64_t *dst, *src, count, xsize, csize;
157 	int i, c;
158 
159 	for (c = 0; c < rm->rm_cols; c++) {
160 		if (c == x)
161 			continue;
162 		src = rm->rm_col[c].rc_data;
163 		dst = rm->rm_col[x].rc_data;
164 		csize = rm->rm_col[c].rc_size;
165 		xsize = rm->rm_col[x].rc_size;
166 		count = MIN(csize, xsize) / sizeof (uint64_t);
167 		if (c == !x) {
168 			/*
169 			 * The initial copy happens at either c == 0 or c == 1.
170 			 * Both of these columns are 'big' columns, so we'll
171 			 * definitely initialize all of column x.
172 			 */
173 			ASSERT3U(xsize, <=, csize);
174 			for (i = 0; i < count; i++)
175 				*dst++ = *src++;
176 		} else {
177 			for (i = 0; i < count; i++)
178 				*dst++ ^= *src++;
179 		}
180 	}
181 }
182 
183 static int
184 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
185 {
186 	vdev_t *cvd;
187 	int c, error;
188 	int lasterror = 0;
189 	int numerrors = 0;
190 
191 	/*
192 	 * XXX -- minimum children should be raid-type-specific
193 	 */
194 	if (vd->vdev_children < 2) {
195 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
196 		return (EINVAL);
197 	}
198 
199 	for (c = 0; c < vd->vdev_children; c++) {
200 		cvd = vd->vdev_child[c];
201 
202 		if ((error = vdev_open(cvd)) != 0) {
203 			lasterror = error;
204 			numerrors++;
205 			continue;
206 		}
207 
208 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
209 		*ashift = MAX(*ashift, cvd->vdev_ashift);
210 	}
211 
212 	*asize *= vd->vdev_children;
213 
214 	if (numerrors > 1) {
215 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
216 		return (lasterror);
217 	}
218 
219 	return (0);
220 }
221 
222 static void
223 vdev_raidz_close(vdev_t *vd)
224 {
225 	int c;
226 
227 	for (c = 0; c < vd->vdev_children; c++)
228 		vdev_close(vd->vdev_child[c]);
229 }
230 
231 static uint64_t
232 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
233 {
234 	uint64_t asize;
235 	uint64_t ashift = vd->vdev_top->vdev_ashift;
236 	uint64_t cols = vd->vdev_children;
237 
238 	asize = ((psize - 1) >> ashift) + 1;
239 	asize += (asize + cols - 2) / (cols - 1);
240 	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << ashift;
241 
242 	return (asize);
243 }
244 
245 static void
246 vdev_raidz_child_done(zio_t *zio)
247 {
248 	raidz_col_t *rc = zio->io_private;
249 
250 	rc->rc_error = zio->io_error;
251 	rc->rc_tried = 1;
252 	rc->rc_skipped = 0;
253 }
254 
255 static void
256 vdev_raidz_repair_done(zio_t *zio)
257 {
258 	ASSERT(zio->io_private == zio->io_parent);
259 	vdev_raidz_map_free(zio->io_private);
260 }
261 
262 static void
263 vdev_raidz_io_start(zio_t *zio)
264 {
265 	vdev_t *vd = zio->io_vd;
266 	vdev_t *tvd = vd->vdev_top;
267 	vdev_t *cvd;
268 	blkptr_t *bp = zio->io_bp;
269 	raidz_map_t *rm;
270 	raidz_col_t *rc;
271 	int c;
272 
273 	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children);
274 
275 	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
276 		ASSERT3U(rm->rm_asize, ==,
277 		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
278 	} else {
279 		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
280 	}
281 
282 	if (zio->io_type == ZIO_TYPE_WRITE) {
283 
284 		/*
285 		 * Generate RAID parity in virtual column 0.
286 		 */
287 		vdev_raidz_reconstruct(rm, 0);
288 
289 		for (c = 0; c < rm->rm_cols; c++) {
290 			rc = &rm->rm_col[c];
291 			cvd = vd->vdev_child[rc->rc_col];
292 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
293 			    rc->rc_offset, rc->rc_data, rc->rc_size,
294 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
295 			    vdev_raidz_child_done, rc));
296 		}
297 		zio_wait_children_done(zio);
298 		return;
299 	}
300 
301 	ASSERT(zio->io_type == ZIO_TYPE_READ);
302 
303 	for (c = rm->rm_cols - 1; c >= 0; c--) {
304 		rc = &rm->rm_col[c];
305 		cvd = vd->vdev_child[rc->rc_col];
306 		if (vdev_is_dead(cvd)) {
307 			rm->rm_missing_child = c;
308 			rc->rc_error = ENXIO;
309 			rc->rc_tried = 1;	/* don't even try */
310 			rc->rc_skipped = 1;
311 			continue;
312 		}
313 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
314 			rm->rm_missing_child = c;
315 			rc->rc_error = ESTALE;
316 			rc->rc_skipped = 1;
317 			continue;
318 		}
319 		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
320 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
321 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
322 			    rc->rc_offset, rc->rc_data, rc->rc_size,
323 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
324 			    vdev_raidz_child_done, rc));
325 		}
326 	}
327 
328 	zio_wait_children_done(zio);
329 }
330 
331 /*
332  * Report a checksum error for a child of a RAID-Z device.
333  */
334 static void
335 raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
336 {
337 	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col];
338 	dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
339 	    vdev_description(vd));
340 
341 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
342 		mutex_enter(&vd->vdev_stat_lock);
343 		vd->vdev_stat.vs_checksum_errors++;
344 		mutex_exit(&vd->vdev_stat_lock);
345 	}
346 
347 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
348 		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
349 		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
350 }
351 
352 
353 static void
354 vdev_raidz_io_done(zio_t *zio)
355 {
356 	vdev_t *vd = zio->io_vd;
357 	vdev_t *cvd;
358 	raidz_map_t *rm = zio->io_vsd;
359 	raidz_col_t *rc;
360 	blkptr_t *bp = zio->io_bp;
361 	int unexpected_errors = 0;
362 	int c;
363 
364 	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
365 
366 	zio->io_error = 0;
367 	zio->io_numerrors = 0;
368 
369 	for (c = 0; c < rm->rm_cols; c++) {
370 		rc = &rm->rm_col[c];
371 
372 		/*
373 		 * We preserve any EIOs because those may be worth retrying;
374 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
375 		 */
376 		if (rc->rc_error) {
377 			if (zio->io_error != EIO)
378 				zio->io_error = rc->rc_error;
379 			if (!rc->rc_skipped)
380 				unexpected_errors++;
381 			zio->io_numerrors++;
382 		}
383 	}
384 
385 	if (zio->io_type == ZIO_TYPE_WRITE) {
386 		/*
387 		 * If this is not a failfast write, and we were able to
388 		 * write enough columns to reconstruct the data, good enough.
389 		 */
390 		/* XXPOLICY */
391 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
392 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
393 			zio->io_error = 0;
394 
395 		vdev_raidz_map_free(zio);
396 		zio_next_stage(zio);
397 		return;
398 	}
399 
400 	ASSERT(zio->io_type == ZIO_TYPE_READ);
401 
402 	/*
403 	 * If there were no I/O errors, and the data checksums correctly,
404 	 * the read is complete.
405 	 */
406 	/* XXPOLICY */
407 	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
408 		ASSERT(unexpected_errors == 0);
409 		ASSERT(zio->io_error == 0);
410 
411 		/*
412 		 * We know the data's good.  If we read the parity,
413 		 * verify that it's good as well.  If not, fix it.
414 		 */
415 		for (c = 0; c < rm->rm_firstdatacol; c++) {
416 			void *orig;
417 			rc = &rm->rm_col[c];
418 			if (!rc->rc_tried)
419 				continue;
420 			orig = zio_buf_alloc(rc->rc_size);
421 			bcopy(rc->rc_data, orig, rc->rc_size);
422 			vdev_raidz_reconstruct(rm, c);
423 			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
424 				raidz_checksum_error(zio, rc);
425 				rc->rc_error = ECKSUM;
426 				unexpected_errors++;
427 			}
428 			zio_buf_free(orig, rc->rc_size);
429 		}
430 		goto done;
431 	}
432 
433 	/*
434 	 * If there was exactly one I/O error, it's the one we expected,
435 	 * and the reconstructed data checksums, the read is complete.
436 	 * This happens when one child is offline and vdev_fault_assess()
437 	 * knows it, or when one child has stale data and the DTL knows it.
438 	 */
439 	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
440 		rc = &rm->rm_col[c];
441 		ASSERT(unexpected_errors == 0);
442 		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
443 		vdev_raidz_reconstruct(rm, c);
444 		if (zio_checksum_error(zio) == 0) {
445 			zio->io_error = 0;
446 			goto done;
447 		}
448 	}
449 
450 	/*
451 	 * This isn't a typical error -- either we got a read error or
452 	 * more than one child claimed a problem.  Read every block we
453 	 * haven't already so we can try combinatorial reconstruction.
454 	 */
455 	unexpected_errors = 1;
456 	rm->rm_missing_child = -1;
457 
458 	for (c = 0; c < rm->rm_cols; c++)
459 		if (!rm->rm_col[c].rc_tried)
460 			break;
461 
462 	if (c != rm->rm_cols) {
463 		zio->io_error = 0;
464 		zio_vdev_io_redone(zio);
465 		for (c = 0; c < rm->rm_cols; c++) {
466 			rc = &rm->rm_col[c];
467 			if (rc->rc_tried)
468 				continue;
469 			zio_nowait(zio_vdev_child_io(zio, NULL,
470 			    vd->vdev_child[rc->rc_col],
471 			    rc->rc_offset, rc->rc_data, rc->rc_size,
472 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
473 			    vdev_raidz_child_done, rc));
474 		}
475 		zio_wait_children_done(zio);
476 		return;
477 	}
478 
479 	/*
480 	 * If there were more errors than parity disks, give up.
481 	 */
482 	if (zio->io_numerrors > rm->rm_firstdatacol) {
483 		ASSERT(zio->io_error != 0);
484 		goto done;
485 	}
486 
487 	/*
488 	 * The number of I/O errors is correctable.  Correct them here.
489 	 */
490 	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
491 	for (c = 0; c < rm->rm_cols; c++) {
492 		rc = &rm->rm_col[c];
493 		ASSERT(rc->rc_tried);
494 		if (rc->rc_error) {
495 			vdev_raidz_reconstruct(rm, c);
496 			if (zio_checksum_error(zio) == 0)
497 				zio->io_error = 0;
498 			else
499 				zio->io_error = rc->rc_error;
500 			goto done;
501 		}
502 	}
503 
504 	/*
505 	 * There were no I/O errors, but the data doesn't checksum.
506 	 * Try all permutations to see if we can find one that does.
507 	 */
508 	ASSERT(zio->io_numerrors == 0);
509 	for (c = 0; c < rm->rm_cols; c++) {
510 		void *orig;
511 		rc = &rm->rm_col[c];
512 
513 		orig = zio_buf_alloc(rc->rc_size);
514 		bcopy(rc->rc_data, orig, rc->rc_size);
515 		vdev_raidz_reconstruct(rm, c);
516 
517 		if (zio_checksum_error(zio) == 0) {
518 			zio_buf_free(orig, rc->rc_size);
519 			zio->io_error = 0;
520 			/*
521 			 * If this child didn't know that it returned bad data,
522 			 * inform it.
523 			 */
524 			if (rc->rc_tried && rc->rc_error == 0)
525 				raidz_checksum_error(zio, rc);
526 			rc->rc_error = ECKSUM;
527 			goto done;
528 		}
529 
530 		bcopy(orig, rc->rc_data, rc->rc_size);
531 		zio_buf_free(orig, rc->rc_size);
532 	}
533 
534 	/*
535 	 * All combinations failed to checksum.  Generate checksum ereports for
536 	 * every one.
537 	 */
538 	zio->io_error = ECKSUM;
539 	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
540 		for (c = 0; c < rm->rm_cols; c++) {
541 			rc = &rm->rm_col[c];
542 			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
543 			    zio->io_spa, vd->vdev_child[rc->rc_col], zio,
544 			    rc->rc_offset, rc->rc_size);
545 		}
546 	}
547 
548 done:
549 	zio_checksum_verified(zio);
550 
551 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
552 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
553 		zio_t *rio;
554 
555 		/*
556 		 * Use the good data we have in hand to repair damaged children.
557 		 *
558 		 * We issue all repair I/Os as children of 'rio' to arrange
559 		 * that vdev_raidz_map_free(zio) will be invoked after all
560 		 * repairs complete, but before we advance to the next stage.
561 		 */
562 		rio = zio_null(zio, zio->io_spa,
563 		    vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
564 
565 		for (c = 0; c < rm->rm_cols; c++) {
566 			rc = &rm->rm_col[c];
567 			cvd = vd->vdev_child[rc->rc_col];
568 
569 			if (rc->rc_error == 0)
570 				continue;
571 
572 			dprintf("%s resilvered %s @ 0x%llx error %d\n",
573 			    vdev_description(vd),
574 			    vdev_description(cvd),
575 			    zio->io_offset, rc->rc_error);
576 
577 			zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
578 			    rc->rc_offset, rc->rc_data, rc->rc_size,
579 			    ZIO_TYPE_WRITE, zio->io_priority,
580 			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
581 			    ZIO_FLAG_DONT_PROPAGATE, NULL, NULL));
582 		}
583 
584 		zio_nowait(rio);
585 		zio_wait_children_done(zio);
586 		return;
587 	}
588 
589 	vdev_raidz_map_free(zio);
590 	zio_next_stage(zio);
591 }
592 
593 static void
594 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
595 {
596 	if (faulted > 1)
597 		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
598 		    VDEV_AUX_NO_REPLICAS);
599 	else if (degraded + faulted != 0)
600 		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
601 	else
602 		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
603 }
604 
605 vdev_ops_t vdev_raidz_ops = {
606 	vdev_raidz_open,
607 	vdev_raidz_close,
608 	vdev_raidz_asize,
609 	vdev_raidz_io_start,
610 	vdev_raidz_io_done,
611 	vdev_raidz_state_change,
612 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
613 	B_FALSE			/* not a leaf vdev */
614 };
615