xref: /illumos-gate/usr/src/uts/common/fs/zfs/vdev_raidz.c (revision 43d18f1c320355e93c47399bea0b2e022fe06364)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/zfs_context.h>
30 #include <sys/spa.h>
31 #include <sys/vdev_impl.h>
32 #include <sys/zio.h>
33 #include <sys/zio_checksum.h>
34 #include <sys/fs/zfs.h>
35 
36 /*
37  * Virtual device vector for RAID-Z.
38  */
39 
/*
 * RAID-Z currently supports only the two-way replication model, i.e.
 * reconstruction after a single fault.  Every block stored in a RAID-Z
 * vdev must therefore be a multiple of VDEV_RAIDZ_ALIGN (two) leaf vdev
 * blocks; allocated sizes are rounded up to that multiple.
 */
#define	VDEV_RAIDZ_ALIGN	2ULL
46 
47 typedef struct raidz_col {
48 	uint64_t	rc_col;
49 	uint64_t	rc_offset;
50 	uint64_t	rc_size;
51 	void		*rc_data;
52 	int		rc_error;
53 	short		rc_tried;
54 	short		rc_skipped;
55 } raidz_col_t;
56 
57 typedef struct raidz_map {
58 	uint64_t	rm_cols;
59 	uint64_t	rm_bigcols;
60 	uint64_t	rm_asize;
61 	int		rm_missing_child;
62 	int		rm_type;
63 	int		rm_firstdatacol;
64 	raidz_col_t	rm_col[1];
65 } raidz_map_t;
66 
/*
 * Map layouts understood by vdev_raidz_map_alloc().
 */
#define	RAIDZ_SINGLE	0	/* data columns only; no parity column */
#define	RAIDZ_PARITY	1	/* column 0 carries parity for the rest */
69 
70 static raidz_map_t *
71 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
72 	int raid_type)
73 {
74 	raidz_map_t *rm;
75 	uint64_t b = zio->io_offset >> unit_shift;
76 	uint64_t s = zio->io_size >> unit_shift;
77 	uint64_t f = b % dcols;
78 	uint64_t o = (b / dcols) << unit_shift;
79 	uint64_t q, r, c, bc, col, acols, coff;
80 	int firstdatacol;
81 
82 	switch (raid_type) {
83 	case RAIDZ_SINGLE:
84 		q = s / dcols;
85 		r = s - q * dcols;
86 		bc = r;
87 		firstdatacol = 0;
88 		break;
89 	case RAIDZ_PARITY:
90 		q = s / (dcols - 1);
91 		r = s - q * (dcols - 1);
92 		bc = r + !!r;
93 		firstdatacol = 1;
94 		break;
95 	}
96 
97 	acols = (q == 0 ? bc : dcols);
98 
99 	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
100 
101 	rm->rm_cols = acols;
102 	rm->rm_bigcols = bc;
103 	rm->rm_asize = 0;
104 	rm->rm_missing_child = -1;
105 	rm->rm_type = raid_type;
106 	rm->rm_firstdatacol = firstdatacol;
107 
108 	for (c = 0; c < acols; c++) {
109 		col = f + c;
110 		coff = o;
111 		if (col >= dcols) {
112 			col -= dcols;
113 			coff += 1ULL << unit_shift;
114 		}
115 		rm->rm_col[c].rc_col = col;
116 		rm->rm_col[c].rc_offset = coff;
117 		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
118 		rm->rm_col[c].rc_data = NULL;
119 		rm->rm_col[c].rc_error = 0;
120 		rm->rm_col[c].rc_tried = 0;
121 		rm->rm_col[c].rc_skipped = 0;
122 		rm->rm_asize += rm->rm_col[c].rc_size;
123 	}
124 
125 	rm->rm_asize = P2ROUNDUP(rm->rm_asize, VDEV_RAIDZ_ALIGN << unit_shift);
126 
127 	for (c = 0; c < rm->rm_firstdatacol; c++)
128 		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
129 
130 	rm->rm_col[c].rc_data = zio->io_data;
131 
132 	for (c = c + 1; c < acols; c++)
133 		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
134 		    rm->rm_col[c - 1].rc_size;
135 
136 	if (raid_type == RAIDZ_PARITY) {
137 		/*
138 		 * To prevent hot parity disks, switch the parity and data
139 		 * columns every 1MB.
140 		 */
141 		ASSERT(rm->rm_cols >= 2);
142 		ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
143 
144 		if (zio->io_offset & (1ULL << 20)) {
145 			col = rm->rm_col[0].rc_col;
146 			o = rm->rm_col[0].rc_offset;
147 			rm->rm_col[0].rc_col = rm->rm_col[1].rc_col;
148 			rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
149 			rm->rm_col[1].rc_col = col;
150 			rm->rm_col[1].rc_offset = o;
151 		}
152 	}
153 
154 	zio->io_vsd = rm;
155 	return (rm);
156 }
157 
158 static void
159 vdev_raidz_map_free(zio_t *zio)
160 {
161 	raidz_map_t *rm = zio->io_vsd;
162 	int c;
163 
164 	for (c = 0; c < rm->rm_firstdatacol; c++)
165 		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
166 
167 	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
168 	zio->io_vsd = NULL;
169 }
170 
171 static void
172 vdev_raidz_reconstruct(raidz_map_t *rm, int x)
173 {
174 	uint64_t *dst, *src, count, xsize, csize;
175 	int i, c;
176 
177 	for (c = 0; c < rm->rm_cols; c++) {
178 		if (c == x)
179 			continue;
180 		src = rm->rm_col[c].rc_data;
181 		dst = rm->rm_col[x].rc_data;
182 		csize = rm->rm_col[c].rc_size;
183 		xsize = rm->rm_col[x].rc_size;
184 		count = MIN(csize, xsize) / sizeof (uint64_t);
185 		if (c == !x) {
186 			/*
187 			 * The initial copy happens at either c == 0 or c == 1.
188 			 * Both of these columns are 'big' columns, so we'll
189 			 * definitely initialize all of column x.
190 			 */
191 			ASSERT3U(xsize, <=, csize);
192 			for (i = 0; i < count; i++)
193 				*dst++ = *src++;
194 		} else {
195 			for (i = 0; i < count; i++)
196 				*dst++ ^= *src++;
197 		}
198 	}
199 }
200 
201 static int
202 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
203 {
204 	vdev_t *cvd;
205 	int c, error;
206 	int lasterror = 0;
207 	int numerrors = 0;
208 
209 	/*
210 	 * XXX -- minimum children should be raid-type-specific
211 	 */
212 	if (vd->vdev_children < 2) {
213 		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
214 		return (EINVAL);
215 	}
216 
217 	for (c = 0; c < vd->vdev_children; c++) {
218 		cvd = vd->vdev_child[c];
219 
220 		if ((error = vdev_open(cvd)) != 0) {
221 			lasterror = error;
222 			numerrors++;
223 			continue;
224 		}
225 
226 		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
227 		*ashift = cvd->vdev_ashift;
228 	}
229 
230 	*asize *= vd->vdev_children;
231 
232 	if (numerrors > 1) {
233 		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
234 		return (lasterror);
235 	}
236 
237 	return (0);
238 }
239 
240 static void
241 vdev_raidz_close(vdev_t *vd)
242 {
243 	int c;
244 
245 	for (c = 0; c < vd->vdev_children; c++)
246 		vdev_close(vd->vdev_child[c]);
247 }
248 
249 static uint64_t
250 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
251 {
252 	uint64_t asize;
253 	uint64_t cols = vd->vdev_children;
254 
255 	/*
256 	 * These calculations assume RAIDZ_PARITY.
257 	 */
258 	asize = psize >> vd->vdev_ashift;
259 	asize += (asize + cols - 2) / (cols - 1);
260 	asize = P2ROUNDUP(asize, VDEV_RAIDZ_ALIGN) << vd->vdev_ashift;
261 
262 	return (asize);
263 }
264 
265 static void
266 vdev_raidz_child_done(zio_t *zio)
267 {
268 	raidz_col_t *rc = zio->io_private;
269 
270 	rc->rc_error = zio->io_error;
271 	rc->rc_tried = 1;
272 	rc->rc_skipped = 0;
273 }
274 
275 static void
276 vdev_raidz_repair_done(zio_t *zio)
277 {
278 	zio_buf_free(zio->io_data, zio->io_size);
279 }
280 
281 static void
282 vdev_raidz_io_start(zio_t *zio)
283 {
284 	vdev_t *vd = zio->io_vd;
285 	vdev_t *cvd;
286 	blkptr_t *bp = zio->io_bp;
287 	raidz_map_t *rm;
288 	raidz_col_t *rc;
289 	int c;
290 
291 	rm = vdev_raidz_map_alloc(zio, vd->vdev_ashift, vd->vdev_children,
292 	    RAIDZ_PARITY);
293 
294 	if (DVA_GET_GANG(ZIO_GET_DVA(zio))) {
295 		ASSERT3U(rm->rm_asize, ==,
296 		    vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE));
297 		ASSERT3U(zio->io_size, ==, SPA_GANGBLOCKSIZE);
298 	} else {
299 		ASSERT3U(rm->rm_asize, ==, DVA_GET_ASIZE(ZIO_GET_DVA(zio)));
300 		ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
301 	}
302 
303 	if (zio->io_type == ZIO_TYPE_WRITE) {
304 
305 		/*
306 		 * Generate RAID parity in virtual column 0.
307 		 */
308 		vdev_raidz_reconstruct(rm, 0);
309 
310 		for (c = 0; c < rm->rm_cols; c++) {
311 			rc = &rm->rm_col[c];
312 			cvd = vd->vdev_child[rc->rc_col];
313 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
314 			    rc->rc_offset, rc->rc_data, rc->rc_size,
315 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
316 			    vdev_raidz_child_done, rc));
317 		}
318 		zio_wait_children_done(zio);
319 		return;
320 	}
321 
322 	ASSERT(zio->io_type == ZIO_TYPE_READ);
323 
324 	for (c = rm->rm_cols - 1; c >= 0; c--) {
325 		rc = &rm->rm_col[c];
326 		cvd = vd->vdev_child[rc->rc_col];
327 		if (vdev_is_dead(cvd)) {
328 			rm->rm_missing_child = c;
329 			rc->rc_error = ENXIO;
330 			rc->rc_tried = 1;	/* don't even try */
331 			rc->rc_skipped = 1;
332 			continue;
333 		}
334 		if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
335 			rm->rm_missing_child = c;
336 			rc->rc_error = ESTALE;
337 			rc->rc_skipped = 1;
338 			continue;
339 		}
340 		if (c >= rm->rm_firstdatacol || rm->rm_missing_child != -1 ||
341 		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
342 			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
343 			    rc->rc_offset, rc->rc_data, rc->rc_size,
344 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
345 			    vdev_raidz_child_done, rc));
346 		}
347 	}
348 
349 	zio_wait_children_done(zio);
350 }
351 
352 static void
353 vdev_raidz_io_done(zio_t *zio)
354 {
355 	vdev_t *vd = zio->io_vd;
356 	vdev_t *cvd;
357 	raidz_map_t *rm = zio->io_vsd;
358 	raidz_col_t *rc;
359 	blkptr_t *bp = zio->io_bp;
360 	int unexpected_errors = 0;
361 	int c;
362 
363 	ASSERT(bp != NULL);	/* XXX need to add code to enforce this */
364 
365 	zio->io_error = 0;
366 	zio->io_numerrors = 0;
367 
368 	for (c = 0; c < rm->rm_cols; c++) {
369 		rc = &rm->rm_col[c];
370 
371 		/*
372 		 * We preserve any EIOs because those may be worth retrying;
373 		 * whereas ECKSUM and ENXIO are more likely to be persistent.
374 		 */
375 		if (rc->rc_error) {
376 			if (zio->io_error != EIO)
377 				zio->io_error = rc->rc_error;
378 			if (!rc->rc_skipped)
379 				unexpected_errors++;
380 			zio->io_numerrors++;
381 		}
382 	}
383 
384 	if (zio->io_type == ZIO_TYPE_WRITE) {
385 		/*
386 		 * If this is not a failfast write, and we were able to
387 		 * write enough columns to reconstruct the data, good enough.
388 		 */
389 		/* XXPOLICY */
390 		if (zio->io_numerrors <= rm->rm_firstdatacol &&
391 		    !(zio->io_flags & ZIO_FLAG_FAILFAST))
392 			zio->io_error = 0;
393 
394 		vdev_raidz_map_free(zio);
395 		zio_next_stage(zio);
396 		return;
397 	}
398 
399 	ASSERT(zio->io_type == ZIO_TYPE_READ);
400 
401 	/*
402 	 * If there were no I/O errors, and the data checksums correctly,
403 	 * the read is complete.
404 	 */
405 	/* XXPOLICY */
406 	if (zio->io_numerrors == 0 && zio_checksum_error(zio) == 0) {
407 		ASSERT(unexpected_errors == 0);
408 		ASSERT(zio->io_error == 0);
409 
410 		/*
411 		 * We know the data's good.  If we read the parity,
412 		 * verify that it's good as well.  If not, fix it.
413 		 */
414 		for (c = 0; c < rm->rm_firstdatacol; c++) {
415 			void *orig;
416 			rc = &rm->rm_col[c];
417 			if (!rc->rc_tried)
418 				continue;
419 			orig = zio_buf_alloc(rc->rc_size);
420 			bcopy(rc->rc_data, orig, rc->rc_size);
421 			vdev_raidz_reconstruct(rm, c);
422 			if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
423 				vdev_checksum_error(zio,
424 				    vd->vdev_child[rc->rc_col]);
425 				rc->rc_error = ECKSUM;
426 				unexpected_errors++;
427 			}
428 			zio_buf_free(orig, rc->rc_size);
429 		}
430 		goto done;
431 	}
432 
433 	/*
434 	 * If there was exactly one I/O error, it's the one we expected,
435 	 * and the reconstructed data checksums, the read is complete.
436 	 * This happens when one child is offline and vdev_fault_assess()
437 	 * knows it, or when one child has stale data and the DTL knows it.
438 	 */
439 	if (zio->io_numerrors == 1 && (c = rm->rm_missing_child) != -1) {
440 		rc = &rm->rm_col[c];
441 		ASSERT(unexpected_errors == 0);
442 		ASSERT(rc->rc_error == ENXIO || rc->rc_error == ESTALE);
443 		vdev_raidz_reconstruct(rm, c);
444 		if (zio_checksum_error(zio) == 0) {
445 			zio->io_error = 0;
446 			goto done;
447 		}
448 	}
449 
450 	/*
451 	 * This isn't a typical error -- either we got a read error or
452 	 * more than one child claimed a problem.  Read every block we
453 	 * haven't already so we can try combinatorial reconstruction.
454 	 */
455 	unexpected_errors = 1;
456 	rm->rm_missing_child = -1;
457 
458 	for (c = 0; c < rm->rm_cols; c++)
459 		if (!rm->rm_col[c].rc_tried)
460 			break;
461 
462 	if (c != rm->rm_cols) {
463 		zio->io_error = 0;
464 		zio_vdev_io_redone(zio);
465 		for (c = 0; c < rm->rm_cols; c++) {
466 			rc = &rm->rm_col[c];
467 			if (rc->rc_tried)
468 				continue;
469 			zio_nowait(zio_vdev_child_io(zio, NULL,
470 			    vd->vdev_child[rc->rc_col],
471 			    rc->rc_offset, rc->rc_data, rc->rc_size,
472 			    zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
473 			    vdev_raidz_child_done, rc));
474 		}
475 		zio_wait_children_done(zio);
476 		return;
477 	}
478 
479 	/*
480 	 * If there were more errors than parity disks, give up.
481 	 */
482 	if (zio->io_numerrors > rm->rm_firstdatacol) {
483 		ASSERT(zio->io_error != 0);
484 		goto done;
485 	}
486 
487 	/*
488 	 * The number of I/O errors is correctable.  Correct them here.
489 	 */
490 	ASSERT(zio->io_numerrors <= rm->rm_firstdatacol);
491 	for (c = 0; c < rm->rm_cols; c++) {
492 		rc = &rm->rm_col[c];
493 		ASSERT(rc->rc_tried);
494 		if (rc->rc_error) {
495 			vdev_raidz_reconstruct(rm, c);
496 			if (zio_checksum_error(zio) == 0)
497 				zio->io_error = 0;
498 			else
499 				zio->io_error = rc->rc_error;
500 			goto done;
501 		}
502 	}
503 
504 	/*
505 	 * There were no I/O errors, but the data doesn't checksum.
506 	 * Try all permutations to see if we can find one that does.
507 	 */
508 	ASSERT(zio->io_numerrors == 0);
509 	for (c = 0; c < rm->rm_cols; c++) {
510 		void *orig;
511 		rc = &rm->rm_col[c];
512 
513 		orig = zio_buf_alloc(rc->rc_size);
514 		bcopy(rc->rc_data, orig, rc->rc_size);
515 		vdev_raidz_reconstruct(rm, c);
516 
517 		if (zio_checksum_error(zio) == 0) {
518 			zio_buf_free(orig, rc->rc_size);
519 			zio->io_error = 0;
520 			/*
521 			 * If this child didn't know that it returned bad data,
522 			 * inform it.
523 			 */
524 			if (rc->rc_tried && rc->rc_error == 0)
525 				vdev_checksum_error(zio,
526 				    vd->vdev_child[rc->rc_col]);
527 			rc->rc_error = ECKSUM;
528 			goto done;
529 		}
530 
531 		bcopy(orig, rc->rc_data, rc->rc_size);
532 		zio_buf_free(orig, rc->rc_size);
533 	}
534 
535 	/*
536 	 * All combinations failed to checksum.
537 	 */
538 	zio->io_error = ECKSUM;
539 
540 done:
541 	zio_checksum_verified(zio);
542 
543 	if (zio->io_error == 0 && (spa_mode & FWRITE) &&
544 	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
545 		/*
546 		 * Use the good data we have in hand to repair damaged children.
547 		 */
548 		for (c = 0; c < rm->rm_cols; c++) {
549 			rc = &rm->rm_col[c];
550 			cvd = vd->vdev_child[rc->rc_col];
551 
552 			if (rc->rc_error) {
553 				/*
554 				 * Make a copy of the data because we're
555 				 * going to free the RAID-Z map below.
556 				 */
557 				void *data = zio_buf_alloc(rc->rc_size);
558 				bcopy(rc->rc_data, data, rc->rc_size);
559 
560 				dprintf("%s resilvered %s @ 0x%llx error %d\n",
561 				    vdev_description(vd),
562 				    vdev_description(cvd),
563 				    zio->io_offset, rc->rc_error);
564 
565 				zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
566 				    rc->rc_offset, data, rc->rc_size,
567 				    ZIO_TYPE_WRITE, zio->io_priority,
568 				    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_CANFAIL |
569 				    ZIO_FLAG_DONT_PROPAGATE,
570 				    vdev_raidz_repair_done, NULL));
571 			}
572 		}
573 	}
574 
575 	vdev_raidz_map_free(zio);
576 	zio_next_stage(zio);
577 }
578 
579 static void
580 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
581 {
582 	if (faulted > 1)
583 		vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
584 	else if (degraded + faulted != 0)
585 		vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
586 	else
587 		vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
588 }
589 
590 vdev_ops_t vdev_raidz_ops = {
591 	vdev_raidz_open,
592 	vdev_raidz_close,
593 	vdev_raidz_asize,
594 	vdev_raidz_io_start,
595 	vdev_raidz_io_done,
596 	vdev_raidz_state_change,
597 	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
598 	B_FALSE			/* not a leaf vdev */
599 };
600