xref: /freebsd/sys/geom/raid/tr_raid1e.c (revision 8311bc5f17dec348749f763b82dfe2737bc53cd7)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/param.h>
30 #include <sys/bio.h>
31 #include <sys/endian.h>
32 #include <sys/kernel.h>
33 #include <sys/kobj.h>
34 #include <sys/limits.h>
35 #include <sys/lock.h>
36 #include <sys/malloc.h>
37 #include <sys/mutex.h>
38 #include <sys/sysctl.h>
39 #include <sys/systm.h>
40 #include <geom/geom.h>
41 #include <geom/geom_dbg.h>
42 #include "geom/raid/g_raid.h"
43 #include "g_raid_tr_if.h"
44 
45 #define N	2
46 
47 SYSCTL_DECL(_kern_geom_raid_raid1e);
48 
49 #define RAID1E_REBUILD_SLAB	(1 << 20) /* One transation in a rebuild */
50 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
52     &g_raid1e_rebuild_slab, 0,
53     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
54 
55 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
56 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
57 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
58     &g_raid1e_rebuild_fair_io, 0,
59     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
60 
61 #define RAID1E_REBUILD_CLUSTER_IDLE 100
62 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
63 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
64     &g_raid1e_rebuild_cluster_idle, 0,
65     "Number of slabs to do each time we trigger a rebuild cycle");
66 
67 #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
68 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
69 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
70     &g_raid1e_rebuild_meta_update, 0,
71     "When to update the meta data.");
72 
73 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
74 
75 #define TR_RAID1E_NONE 0
76 #define TR_RAID1E_REBUILD 1
77 #define TR_RAID1E_RESYNC 2
78 
79 #define TR_RAID1E_F_DOING_SOME	0x1
80 #define TR_RAID1E_F_LOCKED	0x2
81 #define TR_RAID1E_F_ABORT	0x4
82 
83 struct g_raid_tr_raid1e_object {
84 	struct g_raid_tr_object	 trso_base;
85 	int			 trso_starting;
86 	int			 trso_stopping;
87 	int			 trso_type;
88 	int			 trso_recover_slabs; /* slabs before rest */
89 	int			 trso_fair_io;
90 	int			 trso_meta_update;
91 	int			 trso_flags;
92 	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
93 	void			*trso_buffer;	 /* Buffer space */
94 	off_t			 trso_lock_pos; /* Locked range start. */
95 	off_t			 trso_lock_len; /* Locked range length. */
96 	struct bio		 trso_bio;
97 };
98 
99 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
100 static g_raid_tr_event_t g_raid_tr_event_raid1e;
101 static g_raid_tr_start_t g_raid_tr_start_raid1e;
102 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
103 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
104 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
105 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
106 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
107 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
108 static g_raid_tr_free_t g_raid_tr_free_raid1e;
109 
110 static kobj_method_t g_raid_tr_raid1e_methods[] = {
111 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
112 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
113 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
114 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
115 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
116 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
117 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
118 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
119 	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
120 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
121 	{ 0, 0 }
122 };
123 
124 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
125 	"RAID1E",
126 	g_raid_tr_raid1e_methods,
127 	sizeof(struct g_raid_tr_raid1e_object),
128 	.trc_enable = 1,
129 	.trc_priority = 200,
130 	.trc_accept_unmapped = 1
131 };
132 
133 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
134 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
135     struct g_raid_subdisk *sd);
136 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
137     int no, off_t off, off_t len, u_int mask);
138 
139 static inline void
140 V2P(struct g_raid_volume *vol, off_t virt,
141     int *disk, off_t *offset, off_t *start)
142 {
143 	off_t nstrip;
144 	u_int strip_size;
145 
146 	strip_size = vol->v_strip_size;
147 	/* Strip number. */
148 	nstrip = virt / strip_size;
149 	/* Start position in strip. */
150 	*start = virt % strip_size;
151 	/* Disk number. */
152 	*disk = (nstrip * N) % vol->v_disks_count;
153 	/* Strip start position in disk. */
154 	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
155 }
156 
157 static inline void
158 P2V(struct g_raid_volume *vol, int disk, off_t offset,
159     off_t *virt, int *copy)
160 {
161 	off_t nstrip, start;
162 	u_int strip_size;
163 
164 	strip_size = vol->v_strip_size;
165 	/* Start position in strip. */
166 	start = offset % strip_size;
167 	/* Physical strip number. */
168 	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
169 	/* Number of physical strip (copy) inside virtual strip. */
170 	*copy = nstrip % N;
171 	/* Offset in virtual space. */
172 	*virt = (nstrip / N) * strip_size + start;
173 }
174 
175 static int
176 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
177 {
178 	struct g_raid_tr_raid1e_object *trs;
179 
180 	trs = (struct g_raid_tr_raid1e_object *)tr;
181 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
182 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
183 		return (G_RAID_TR_TASTE_FAIL);
184 	trs->trso_starting = 1;
185 	return (G_RAID_TR_TASTE_SUCCEED);
186 }
187 
188 static int
189 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
190 {
191 	struct g_raid_softc *sc;
192 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
193 	int i, j, state, sstate;
194 
195 	sc = vol->v_softc;
196 	state = G_RAID_VOLUME_S_OPTIMAL;
197 	for (i = 0; i < vol->v_disks_count / N; i++) {
198 		bestsd = &vol->v_subdisks[i * N];
199 		for (j = 1; j < N; j++) {
200 			sd = &vol->v_subdisks[i * N + j];
201 			if (sd->sd_state > bestsd->sd_state)
202 				bestsd = sd;
203 			else if (sd->sd_state == bestsd->sd_state &&
204 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
205 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
206 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
207 				bestsd = sd;
208 		}
209 		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
210 		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
211 			/* We found reasonable candidate. */
212 			G_RAID_DEBUG1(1, sc,
213 			    "Promote subdisk %s:%d from %s to ACTIVE.",
214 			    vol->v_name, bestsd->sd_pos,
215 			    g_raid_subdisk_state2str(bestsd->sd_state));
216 			g_raid_change_subdisk_state(bestsd,
217 			    G_RAID_SUBDISK_S_ACTIVE);
218 			g_raid_write_metadata(sc,
219 			    vol, bestsd, bestsd->sd_disk);
220 		}
221 		worstsd = &vol->v_subdisks[i * N];
222 		for (j = 1; j < N; j++) {
223 			sd = &vol->v_subdisks[i * N + j];
224 			if (sd->sd_state < worstsd->sd_state)
225 				worstsd = sd;
226 		}
227 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
228 			sstate = G_RAID_VOLUME_S_OPTIMAL;
229 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
230 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
231 		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
232 			sstate = G_RAID_VOLUME_S_DEGRADED;
233 		else
234 			sstate = G_RAID_VOLUME_S_BROKEN;
235 		if (sstate < state)
236 			state = sstate;
237 	}
238 	return (state);
239 }
240 
241 static int
242 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
243 {
244 	struct g_raid_softc *sc;
245 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
246 	int i, j, state, sstate;
247 
248 	sc = vol->v_softc;
249 	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
250 	    vol->v_disks_count)
251 		return (G_RAID_VOLUME_S_OPTIMAL);
252 	for (i = 0; i < vol->v_disks_count; i++) {
253 		sd = &vol->v_subdisks[i];
254 		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
255 			/* We found reasonable candidate. */
256 			G_RAID_DEBUG1(1, sc,
257 			    "Promote subdisk %s:%d from %s to STALE.",
258 			    vol->v_name, sd->sd_pos,
259 			    g_raid_subdisk_state2str(sd->sd_state));
260 			g_raid_change_subdisk_state(sd,
261 			    G_RAID_SUBDISK_S_STALE);
262 			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
263 		}
264 	}
265 	state = G_RAID_VOLUME_S_OPTIMAL;
266 	for (i = 0; i < vol->v_disks_count; i++) {
267 		bestsd = &vol->v_subdisks[i];
268 		worstsd = &vol->v_subdisks[i];
269 		for (j = 1; j < N; j++) {
270 			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
271 			if (sd->sd_state > bestsd->sd_state)
272 				bestsd = sd;
273 			else if (sd->sd_state == bestsd->sd_state &&
274 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
275 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
276 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
277 				bestsd = sd;
278 			if (sd->sd_state < worstsd->sd_state)
279 				worstsd = sd;
280 		}
281 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
282 			sstate = G_RAID_VOLUME_S_OPTIMAL;
283 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
284 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
285 		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
286 			sstate = G_RAID_VOLUME_S_DEGRADED;
287 		else
288 			sstate = G_RAID_VOLUME_S_BROKEN;
289 		if (sstate < state)
290 			state = sstate;
291 	}
292 	return (state);
293 }
294 
295 static int
296 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
297     struct g_raid_subdisk *sd)
298 {
299 	struct g_raid_tr_raid1e_object *trs;
300 	struct g_raid_softc *sc;
301 	u_int s;
302 
303 	sc = vol->v_softc;
304 	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
305 	if (trs->trso_stopping &&
306 	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
307 		s = G_RAID_VOLUME_S_STOPPED;
308 	else if (trs->trso_starting)
309 		s = G_RAID_VOLUME_S_STARTING;
310 	else {
311 		if ((vol->v_disks_count % N) == 0)
312 			s = g_raid_tr_update_state_raid1e_even(vol);
313 		else
314 			s = g_raid_tr_update_state_raid1e_odd(vol);
315 	}
316 	if (s != vol->v_state) {
317 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
318 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
319 		    G_RAID_EVENT_VOLUME);
320 		g_raid_change_volume_state(vol, s);
321 		if (!trs->trso_starting && !trs->trso_stopping)
322 			g_raid_write_metadata(sc, vol, NULL, NULL);
323 	}
324 	if (!trs->trso_starting && !trs->trso_stopping)
325 		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
326 	return (0);
327 }
328 
329 static void
330 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
331     struct g_raid_disk *disk)
332 {
333 	struct g_raid_volume *vol;
334 
335 	vol = sd->sd_volume;
336 	/*
337 	 * We don't fail the last disk in the pack, since it still has decent
338 	 * data on it and that's better than failing the disk if it is the root
339 	 * file system.
340 	 *
341 	 * XXX should this be controlled via a tunable?  It makes sense for
342 	 * the volume that has / on it.  I can't think of a case where we'd
343 	 * want the volume to go away on this kind of event.
344 	 */
345 	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
346 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
347 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
348 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
349 	     vol->v_disks_count) &&
350 	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
351 		return;
352 	g_raid_fail_disk(sc, sd, disk);
353 }
354 
355 static void
356 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
357 {
358 	struct g_raid_volume *vol;
359 	struct g_raid_subdisk *sd;
360 
361 	vol = trs->trso_base.tro_volume;
362 	sd = trs->trso_failed_sd;
363 	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
364 	free(trs->trso_buffer, M_TR_RAID1E);
365 	trs->trso_buffer = NULL;
366 	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
367 	trs->trso_type = TR_RAID1E_NONE;
368 	trs->trso_recover_slabs = 0;
369 	trs->trso_failed_sd = NULL;
370 	g_raid_tr_update_state_raid1e(vol, NULL);
371 }
372 
373 static void
374 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
375 {
376 	struct g_raid_tr_raid1e_object *trs;
377 	struct g_raid_subdisk *sd;
378 
379 	trs = (struct g_raid_tr_raid1e_object *)tr;
380 	sd = trs->trso_failed_sd;
381 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
382 	    "Subdisk %s:%d-%s rebuild completed.",
383 	    sd->sd_volume->v_name, sd->sd_pos,
384 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
385 	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
386 	sd->sd_rebuild_pos = 0;
387 	g_raid_tr_raid1e_rebuild_done(trs);
388 }
389 
390 static void
391 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
392 {
393 	struct g_raid_tr_raid1e_object *trs;
394 	struct g_raid_subdisk *sd;
395 	struct g_raid_volume *vol;
396 
397 	vol = tr->tro_volume;
398 	trs = (struct g_raid_tr_raid1e_object *)tr;
399 	sd = trs->trso_failed_sd;
400 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
401 		G_RAID_DEBUG1(1, vol->v_softc,
402 		    "Subdisk %s:%d-%s rebuild is aborting.",
403 		    sd->sd_volume->v_name, sd->sd_pos,
404 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
405 		trs->trso_flags |= TR_RAID1E_F_ABORT;
406 	} else {
407 		G_RAID_DEBUG1(0, vol->v_softc,
408 		    "Subdisk %s:%d-%s rebuild aborted.",
409 		    sd->sd_volume->v_name, sd->sd_pos,
410 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
411 		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
412 		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
413 			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
414 			g_raid_unlock_range(tr->tro_volume,
415 			    trs->trso_lock_pos, trs->trso_lock_len);
416 		}
417 		g_raid_tr_raid1e_rebuild_done(trs);
418 	}
419 }
420 
421 static void
422 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
423 {
424 	struct g_raid_tr_raid1e_object *trs;
425 	struct g_raid_softc *sc;
426 	struct g_raid_volume *vol;
427 	struct g_raid_subdisk *sd;
428 	struct bio *bp;
429 	off_t len, virtual, vend, offset, start;
430 	int disk, copy, best;
431 
432 	trs = (struct g_raid_tr_raid1e_object *)tr;
433 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
434 		return;
435 	vol = tr->tro_volume;
436 	sc = vol->v_softc;
437 	sd = trs->trso_failed_sd;
438 
439 	while (1) {
440 		if (sd->sd_rebuild_pos >= sd->sd_size) {
441 			g_raid_tr_raid1e_rebuild_finish(tr);
442 			return;
443 		}
444 		/* Get virtual offset from physical rebuild position. */
445 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
446 		/* Get physical offset back to get first stripe position. */
447 		V2P(vol, virtual, &disk, &offset, &start);
448 		/* Calculate contignous data length. */
449 		len = MIN(g_raid1e_rebuild_slab,
450 		    sd->sd_size - sd->sd_rebuild_pos);
451 		if ((vol->v_disks_count % N) != 0)
452 			len = MIN(len, vol->v_strip_size - start);
453 		/* Find disk with most accurate data. */
454 		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
455 		    offset + start, len, 0);
456 		if (best < 0) {
457 			/* There is no any valid disk. */
458 			g_raid_tr_raid1e_rebuild_abort(tr);
459 			return;
460 		} else if (best != copy) {
461 			/* Some other disk has better data. */
462 			break;
463 		}
464 		/* We have the most accurate data. Skip the range. */
465 		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
466 		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
467 		sd->sd_rebuild_pos += len;
468 	}
469 
470 	bp = &trs->trso_bio;
471 	memset(bp, 0, sizeof(*bp));
472 	bp->bio_offset = offset + start +
473 	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
474 	bp->bio_length = len;
475 	bp->bio_data = trs->trso_buffer;
476 	bp->bio_cmd = BIO_READ;
477 	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
478 	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
479 	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
480 	/*
481 	 * If we are crossing stripe boundary, correct affected virtual
482 	 * range we should lock.
483 	 */
484 	if (start + len > vol->v_strip_size) {
485 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
486 		len = vend - virtual;
487 	}
488 	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
489 	trs->trso_flags |= TR_RAID1E_F_LOCKED;
490 	trs->trso_lock_pos = virtual;
491 	trs->trso_lock_len = len;
492 	/* Lock callback starts I/O */
493 	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
494 }
495 
496 static void
497 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
498 {
499 	struct g_raid_volume *vol;
500 	struct g_raid_tr_raid1e_object *trs;
501 	struct g_raid_subdisk *sd;
502 
503 	vol = tr->tro_volume;
504 	trs = (struct g_raid_tr_raid1e_object *)tr;
505 	if (trs->trso_failed_sd) {
506 		G_RAID_DEBUG1(1, vol->v_softc,
507 		    "Already rebuild in start rebuild. pos %jd\n",
508 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
509 		return;
510 	}
511 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
512 	if (sd == NULL)
513 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
514 	if (sd == NULL) {
515 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
516 		if (sd != NULL) {
517 			sd->sd_rebuild_pos = 0;
518 			g_raid_change_subdisk_state(sd,
519 			    G_RAID_SUBDISK_S_RESYNC);
520 			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
521 		} else {
522 			sd = g_raid_get_subdisk(vol,
523 			    G_RAID_SUBDISK_S_UNINITIALIZED);
524 			if (sd == NULL)
525 				sd = g_raid_get_subdisk(vol,
526 				    G_RAID_SUBDISK_S_NEW);
527 			if (sd != NULL) {
528 				sd->sd_rebuild_pos = 0;
529 				g_raid_change_subdisk_state(sd,
530 				    G_RAID_SUBDISK_S_REBUILD);
531 				g_raid_write_metadata(vol->v_softc,
532 				    vol, sd, NULL);
533 			}
534 		}
535 	}
536 	if (sd == NULL) {
537 		G_RAID_DEBUG1(1, vol->v_softc,
538 		    "No failed disk to rebuild.  night night.");
539 		return;
540 	}
541 	trs->trso_failed_sd = sd;
542 	G_RAID_DEBUG1(0, vol->v_softc,
543 	    "Subdisk %s:%d-%s rebuild start at %jd.",
544 	    sd->sd_volume->v_name, sd->sd_pos,
545 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
546 	    trs->trso_failed_sd->sd_rebuild_pos);
547 	trs->trso_type = TR_RAID1E_REBUILD;
548 	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
549 	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
550 	g_raid_tr_raid1e_rebuild_some(tr);
551 }
552 
553 static void
554 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
555     struct g_raid_subdisk *sd)
556 {
557 	struct g_raid_volume *vol;
558 	struct g_raid_tr_raid1e_object *trs;
559 	int nr;
560 
561 	vol = tr->tro_volume;
562 	trs = (struct g_raid_tr_raid1e_object *)tr;
563 	if (trs->trso_stopping)
564 		return;
565 	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
566 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
567 	switch(trs->trso_type) {
568 	case TR_RAID1E_NONE:
569 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
570 			return;
571 		if (nr == 0) {
572 			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
573 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
574 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
575 			if (nr == 0)
576 				return;
577 		}
578 		g_raid_tr_raid1e_rebuild_start(tr);
579 		break;
580 	case TR_RAID1E_REBUILD:
581 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
582 		    trs->trso_failed_sd == sd)
583 			g_raid_tr_raid1e_rebuild_abort(tr);
584 		break;
585 	case TR_RAID1E_RESYNC:
586 		break;
587 	}
588 }
589 
590 static int
591 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
592     struct g_raid_subdisk *sd, u_int event)
593 {
594 
595 	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
596 	return (0);
597 }
598 
599 static int
600 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
601 {
602 	struct g_raid_tr_raid1e_object *trs;
603 	struct g_raid_volume *vol;
604 
605 	trs = (struct g_raid_tr_raid1e_object *)tr;
606 	vol = tr->tro_volume;
607 	trs->trso_starting = 0;
608 	g_raid_tr_update_state_raid1e(vol, NULL);
609 	return (0);
610 }
611 
612 static int
613 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
614 {
615 	struct g_raid_tr_raid1e_object *trs;
616 	struct g_raid_volume *vol;
617 
618 	trs = (struct g_raid_tr_raid1e_object *)tr;
619 	vol = tr->tro_volume;
620 	trs->trso_starting = 0;
621 	trs->trso_stopping = 1;
622 	g_raid_tr_update_state_raid1e(vol, NULL);
623 	return (0);
624 }
625 
626 /*
627  * Select the disk to read from.  Take into account: subdisk state, running
628  * error recovery, average disk load, head position and possible cache hits.
629  */
630 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
631 static int
632 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
633     int no, off_t off, off_t len, u_int mask)
634 {
635 	struct g_raid_subdisk *sd;
636 	off_t offset;
637 	int i, best, prio, bestprio;
638 
639 	best = -1;
640 	bestprio = INT_MAX;
641 	for (i = 0; i < N; i++) {
642 		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
643 		offset = off;
644 		if (no + i >= vol->v_disks_count)
645 			offset += vol->v_strip_size;
646 
647 		prio = G_RAID_SUBDISK_LOAD(sd);
648 		if ((mask & (1 << sd->sd_pos)) != 0)
649 			continue;
650 		switch (sd->sd_state) {
651 		case G_RAID_SUBDISK_S_ACTIVE:
652 			break;
653 		case G_RAID_SUBDISK_S_RESYNC:
654 			if (offset + off < sd->sd_rebuild_pos)
655 				break;
656 			/* FALLTHROUGH */
657 		case G_RAID_SUBDISK_S_STALE:
658 			prio += i << 24;
659 			break;
660 		case G_RAID_SUBDISK_S_REBUILD:
661 			if (offset + off < sd->sd_rebuild_pos)
662 				break;
663 			/* FALLTHROUGH */
664 		default:
665 			continue;
666 		}
667 		prio += min(sd->sd_recovery, 255) << 16;
668 		/* If disk head is precisely in position - highly prefer it. */
669 		if (G_RAID_SUBDISK_POS(sd) == offset)
670 			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
671 		else
672 		/* If disk head is close to position - prefer it. */
673 		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
674 		    G_RAID_SUBDISK_TRACK_SIZE)
675 			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
676 		if (prio < bestprio) {
677 			bestprio = prio;
678 			best = i;
679 		}
680 	}
681 	return (best);
682 }
683 
684 static void
685 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
686 {
687 	struct g_raid_volume *vol;
688 	struct g_raid_subdisk *sd;
689 	struct bio_queue_head queue;
690 	struct bio *cbp;
691 	char *addr;
692 	off_t offset, start, length, remain;
693 	u_int no, strip_size;
694 	int best;
695 
696 	vol = tr->tro_volume;
697 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
698 		addr = NULL;
699 	else
700 		addr = bp->bio_data;
701 	strip_size = vol->v_strip_size;
702 	V2P(vol, bp->bio_offset, &no, &offset, &start);
703 	remain = bp->bio_length;
704 	bioq_init(&queue);
705 	while (remain > 0) {
706 		length = MIN(strip_size - start, remain);
707 		best = g_raid_tr_raid1e_select_read_disk(vol,
708 		    no, offset, length, 0);
709 		KASSERT(best >= 0, ("No readable disk in volume %s!",
710 		    vol->v_name));
711 		no += best;
712 		if (no >= vol->v_disks_count) {
713 			no -= vol->v_disks_count;
714 			offset += strip_size;
715 		}
716 		cbp = g_clone_bio(bp);
717 		if (cbp == NULL)
718 			goto failure;
719 		cbp->bio_offset = offset + start;
720 		cbp->bio_length = length;
721 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
722 			cbp->bio_ma_offset += (uintptr_t)addr;
723 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
724 			cbp->bio_ma_offset %= PAGE_SIZE;
725 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
726 			    cbp->bio_length) / PAGE_SIZE;
727 		} else
728 			cbp->bio_data = addr;
729 		cbp->bio_caller1 = &vol->v_subdisks[no];
730 		bioq_insert_tail(&queue, cbp);
731 		no += N - best;
732 		if (no >= vol->v_disks_count) {
733 			no -= vol->v_disks_count;
734 			offset += strip_size;
735 		}
736 		remain -= length;
737 		addr += length;
738 		start = 0;
739 	}
740 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
741 		sd = cbp->bio_caller1;
742 		cbp->bio_caller1 = NULL;
743 		g_raid_subdisk_iostart(sd, cbp);
744 	}
745 	return;
746 failure:
747 	while ((cbp = bioq_takefirst(&queue)) != NULL)
748 		g_destroy_bio(cbp);
749 	if (bp->bio_error == 0)
750 		bp->bio_error = ENOMEM;
751 	g_raid_iodone(bp, bp->bio_error);
752 }
753 
754 static void
755 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
756 {
757 	struct g_raid_volume *vol;
758 	struct g_raid_subdisk *sd;
759 	struct bio_queue_head queue;
760 	struct bio *cbp;
761 	char *addr;
762 	off_t offset, start, length, remain;
763 	u_int no, strip_size;
764 	int i;
765 
766 	vol = tr->tro_volume;
767 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
768 		addr = NULL;
769 	else
770 		addr = bp->bio_data;
771 	strip_size = vol->v_strip_size;
772 	V2P(vol, bp->bio_offset, &no, &offset, &start);
773 	remain = bp->bio_length;
774 	bioq_init(&queue);
775 	while (remain > 0) {
776 		length = MIN(strip_size - start, remain);
777 		for (i = 0; i < N; i++) {
778 			sd = &vol->v_subdisks[no];
779 			switch (sd->sd_state) {
780 			case G_RAID_SUBDISK_S_ACTIVE:
781 			case G_RAID_SUBDISK_S_STALE:
782 			case G_RAID_SUBDISK_S_RESYNC:
783 				break;
784 			case G_RAID_SUBDISK_S_REBUILD:
785 				if (offset + start >= sd->sd_rebuild_pos)
786 					goto nextdisk;
787 				break;
788 			default:
789 				goto nextdisk;
790 			}
791 			cbp = g_clone_bio(bp);
792 			if (cbp == NULL)
793 				goto failure;
794 			cbp->bio_offset = offset + start;
795 			cbp->bio_length = length;
796 			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
797 			    bp->bio_cmd != BIO_DELETE) {
798 				cbp->bio_ma_offset += (uintptr_t)addr;
799 				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
800 				cbp->bio_ma_offset %= PAGE_SIZE;
801 				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
802 				    cbp->bio_length) / PAGE_SIZE;
803 			} else
804 				cbp->bio_data = addr;
805 			cbp->bio_caller1 = sd;
806 			bioq_insert_tail(&queue, cbp);
807 nextdisk:
808 			if (++no >= vol->v_disks_count) {
809 				no = 0;
810 				offset += strip_size;
811 			}
812 		}
813 		remain -= length;
814 		if (bp->bio_cmd != BIO_DELETE)
815 			addr += length;
816 		start = 0;
817 	}
818 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
819 		sd = cbp->bio_caller1;
820 		cbp->bio_caller1 = NULL;
821 		g_raid_subdisk_iostart(sd, cbp);
822 	}
823 	return;
824 failure:
825 	while ((cbp = bioq_takefirst(&queue)) != NULL)
826 		g_destroy_bio(cbp);
827 	if (bp->bio_error == 0)
828 		bp->bio_error = ENOMEM;
829 	g_raid_iodone(bp, bp->bio_error);
830 }
831 
832 static void
833 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
834 {
835 	struct g_raid_volume *vol;
836 	struct g_raid_tr_raid1e_object *trs;
837 
838 	vol = tr->tro_volume;
839 	trs = (struct g_raid_tr_raid1e_object *)tr;
840 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
841 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
842 	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
843 		g_raid_iodone(bp, EIO);
844 		return;
845 	}
846 	/*
847 	 * If we're rebuilding, squeeze in rebuild activity every so often,
848 	 * even when the disk is busy.  Be sure to only count real I/O
849 	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
850 	 * by this module.
851 	 */
852 	if (trs->trso_failed_sd != NULL &&
853 	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
854 		/* Make this new or running now round short. */
855 		trs->trso_recover_slabs = 0;
856 		if (--trs->trso_fair_io <= 0) {
857 			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
858 			g_raid_tr_raid1e_rebuild_some(tr);
859 		}
860 	}
861 	switch (bp->bio_cmd) {
862 	case BIO_READ:
863 		g_raid_tr_iostart_raid1e_read(tr, bp);
864 		break;
865 	case BIO_WRITE:
866 	case BIO_DELETE:
867 		g_raid_tr_iostart_raid1e_write(tr, bp);
868 		break;
869 	case BIO_SPEEDUP:
870 	case BIO_FLUSH:
871 		g_raid_tr_flush_common(tr, bp);
872 		break;
873 	default:
874 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
875 		    bp->bio_cmd, vol->v_name));
876 		break;
877 	}
878 }
879 
880 static void
881 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
882     struct g_raid_subdisk *sd, struct bio *bp)
883 {
884 	struct bio *cbp;
885 	struct g_raid_subdisk *nsd;
886 	struct g_raid_volume *vol;
887 	struct bio *pbp;
888 	struct g_raid_tr_raid1e_object *trs;
889 	off_t virtual, offset, start;
890 	uintptr_t mask;
891 	int error, do_write, copy, disk, best;
892 
893 	trs = (struct g_raid_tr_raid1e_object *)tr;
894 	vol = tr->tro_volume;
895 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
896 		if (trs->trso_type == TR_RAID1E_REBUILD) {
897 			nsd = trs->trso_failed_sd;
898 			if (bp->bio_cmd == BIO_READ) {
899 				/* Immediately abort rebuild, if requested. */
900 				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
901 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
902 					g_raid_tr_raid1e_rebuild_abort(tr);
903 					return;
904 				}
905 
906 				/* On read error, skip and cross fingers. */
907 				if (bp->bio_error != 0) {
908 					G_RAID_LOGREQ(0, bp,
909 					    "Read error during rebuild (%d), "
910 					    "possible data loss!",
911 					    bp->bio_error);
912 					goto rebuild_round_done;
913 				}
914 
915 				/*
916 				 * The read operation finished, queue the
917 				 * write and get out.
918 				 */
919 				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
920 				    bp->bio_error);
921 				bp->bio_cmd = BIO_WRITE;
922 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
923 				bp->bio_offset = nsd->sd_rebuild_pos;
924 				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
925 				g_raid_subdisk_iostart(nsd, bp);
926 			} else {
927 				/*
928 				 * The write operation just finished.  Do
929 				 * another.  We keep cloning the master bio
930 				 * since it has the right buffers allocated to
931 				 * it.
932 				 */
933 				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
934 				    bp->bio_error);
935 				if (bp->bio_error != 0 ||
936 				    trs->trso_flags & TR_RAID1E_F_ABORT) {
937 					if ((trs->trso_flags &
938 					    TR_RAID1E_F_ABORT) == 0) {
939 						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
940 						    nsd, nsd->sd_disk);
941 					}
942 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
943 					g_raid_tr_raid1e_rebuild_abort(tr);
944 					return;
945 				}
946 rebuild_round_done:
947 				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
948 				g_raid_unlock_range(tr->tro_volume,
949 				    trs->trso_lock_pos, trs->trso_lock_len);
950 				nsd->sd_rebuild_pos += bp->bio_length;
951 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
952 					g_raid_tr_raid1e_rebuild_finish(tr);
953 					return;
954 				}
955 
956 				/* Abort rebuild if we are stopping */
957 				if (trs->trso_stopping) {
958 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
959 					g_raid_tr_raid1e_rebuild_abort(tr);
960 					return;
961 				}
962 
963 				if (--trs->trso_meta_update <= 0) {
964 					g_raid_write_metadata(vol->v_softc,
965 					    vol, nsd, nsd->sd_disk);
966 					trs->trso_meta_update =
967 					    g_raid1e_rebuild_meta_update;
968 					/* Compensate short rebuild I/Os. */
969 					if ((vol->v_disks_count % N) != 0 &&
970 					    vol->v_strip_size <
971 					     g_raid1e_rebuild_slab) {
972 						trs->trso_meta_update *=
973 						    g_raid1e_rebuild_slab;
974 						trs->trso_meta_update /=
975 						    vol->v_strip_size;
976 					}
977 				}
978 				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
979 				if (--trs->trso_recover_slabs <= 0)
980 					return;
981 				/* Run next rebuild iteration. */
982 				g_raid_tr_raid1e_rebuild_some(tr);
983 			}
984 		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
985 			/*
986 			 * read good sd, read bad sd in parallel.  when both
987 			 * done, compare the buffers.  write good to the bad
988 			 * if different.  do the next bit of work.
989 			 */
990 			panic("Somehow, we think we're doing a resync");
991 		}
992 		return;
993 	}
994 	pbp = bp->bio_parent;
995 	pbp->bio_inbed++;
996 	mask = (intptr_t)bp->bio_caller2;
997 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
998 		/*
999 		 * Read failed on first drive.  Retry the read error on
1000 		 * another disk drive, if available, before erroring out the
1001 		 * read.
1002 		 */
1003 		sd->sd_disk->d_read_errs++;
1004 		G_RAID_LOGREQ(0, bp,
1005 		    "Read error (%d), %d read errors total",
1006 		    bp->bio_error, sd->sd_disk->d_read_errs);
1007 
1008 		/*
1009 		 * If there are too many read errors, we move to degraded.
1010 		 * XXX Do we want to FAIL the drive (eg, make the user redo
1011 		 * everything to get it back in sync), or just degrade the
1012 		 * drive, which kicks off a resync?
1013 		 */
1014 		do_write = 0;
1015 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1016 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1017 		else if (mask == 0)
1018 			do_write = 1;
1019 
1020 		/* Restore what we were doing. */
1021 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1022 		V2P(vol, virtual, &disk, &offset, &start);
1023 
1024 		/* Find the other disk, and try to do the I/O to it. */
1025 		mask |= 1 << copy;
1026 		best = g_raid_tr_raid1e_select_read_disk(vol,
1027 		    disk, offset, start, mask);
1028 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1029 			disk += best;
1030 			if (disk >= vol->v_disks_count) {
1031 				disk -= vol->v_disks_count;
1032 				offset += vol->v_strip_size;
1033 			}
1034 			cbp->bio_offset = offset + start;
1035 			cbp->bio_length = bp->bio_length;
1036 			cbp->bio_data = bp->bio_data;
1037 			cbp->bio_ma = bp->bio_ma;
1038 			cbp->bio_ma_offset = bp->bio_ma_offset;
1039 			cbp->bio_ma_n = bp->bio_ma_n;
1040 			g_destroy_bio(bp);
1041 			nsd = &vol->v_subdisks[disk];
1042 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1043 			    nsd->sd_pos);
1044 			if (do_write)
1045 				mask |= 1 << 31;
1046 			if ((mask & (1U << 31)) != 0)
1047 				sd->sd_recovery++;
1048 			cbp->bio_caller2 = (void *)mask;
1049 			if (do_write) {
1050 				cbp->bio_caller1 = nsd;
1051 				/* Lock callback starts I/O */
1052 				g_raid_lock_range(sd->sd_volume,
1053 				    virtual, cbp->bio_length, pbp, cbp);
1054 			} else {
1055 				g_raid_subdisk_iostart(nsd, cbp);
1056 			}
1057 			return;
1058 		}
1059 		/*
1060 		 * We can't retry.  Return the original error by falling
1061 		 * through.  This will happen when there's only one good disk.
1062 		 * We don't need to fail the raid, since its actual state is
1063 		 * based on the state of the subdisks.
1064 		 */
1065 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1066 	}
1067 	if (bp->bio_cmd == BIO_READ &&
1068 	    bp->bio_error == 0 &&
1069 	    (mask & (1U << 31)) != 0) {
1070 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1071 
1072 		/* Restore what we were doing. */
1073 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1074 		V2P(vol, virtual, &disk, &offset, &start);
1075 
1076 		/* Find best disk to write. */
1077 		best = g_raid_tr_raid1e_select_read_disk(vol,
1078 		    disk, offset, start, ~mask);
1079 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1080 			disk += best;
1081 			if (disk >= vol->v_disks_count) {
1082 				disk -= vol->v_disks_count;
1083 				offset += vol->v_strip_size;
1084 			}
1085 			cbp->bio_offset = offset + start;
1086 			cbp->bio_cmd = BIO_WRITE;
1087 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1088 			cbp->bio_caller2 = (void *)mask;
1089 			g_destroy_bio(bp);
1090 			G_RAID_LOGREQ(2, cbp,
1091 			    "Attempting bad sector remap on failing drive.");
1092 			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1093 			return;
1094 		}
1095 	}
1096 	if ((mask & (1U << 31)) != 0) {
1097 		/*
1098 		 * We're done with a recovery, mark the range as unlocked.
1099 		 * For any write errors, we aggressively fail the disk since
1100 		 * there was both a READ and a WRITE error at this location.
1101 		 * Both types of errors generally indicates the drive is on
1102 		 * the verge of total failure anyway.  Better to stop trusting
1103 		 * it now.  However, we need to reset error to 0 in that case
1104 		 * because we're not failing the original I/O which succeeded.
1105 		 */
1106 
1107 		/* Restore what we were doing. */
1108 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1109 		V2P(vol, virtual, &disk, &offset, &start);
1110 
1111 		for (copy = 0; copy < N; copy++) {
1112 			if ((mask & (1 << copy) ) != 0)
1113 				vol->v_subdisks[(disk + copy) %
1114 				    vol->v_disks_count].sd_recovery--;
1115 		}
1116 
1117 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1118 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
1119 			    "failing subdisk.");
1120 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1121 			bp->bio_error = 0;
1122 		}
1123 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1124 		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1125 	}
1126 	if (pbp->bio_cmd != BIO_READ) {
1127 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1128 			pbp->bio_error = bp->bio_error;
1129 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1130 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1131 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1132 		}
1133 		error = pbp->bio_error;
1134 	} else
1135 		error = bp->bio_error;
1136 	g_destroy_bio(bp);
1137 	if (pbp->bio_children == pbp->bio_inbed) {
1138 		pbp->bio_completed = pbp->bio_length;
1139 		g_raid_iodone(pbp, error);
1140 	}
1141 }
1142 
1143 static int
1144 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
1145     off_t boffset, size_t blength)
1146 {
1147 	struct g_raid_volume *vol;
1148 	struct g_raid_subdisk *sd;
1149 	struct bio_queue_head queue;
1150 	char *addr;
1151 	off_t offset, start, length, remain;
1152 	u_int no, strip_size;
1153 	int i, error;
1154 
1155 	vol = tr->tro_volume;
1156 	addr = virtual;
1157 	strip_size = vol->v_strip_size;
1158 	V2P(vol, boffset, &no, &offset, &start);
1159 	remain = blength;
1160 	bioq_init(&queue);
1161 	while (remain > 0) {
1162 		length = MIN(strip_size - start, remain);
1163 		for (i = 0; i < N; i++) {
1164 			sd = &vol->v_subdisks[no];
1165 			switch (sd->sd_state) {
1166 			case G_RAID_SUBDISK_S_ACTIVE:
1167 			case G_RAID_SUBDISK_S_STALE:
1168 			case G_RAID_SUBDISK_S_RESYNC:
1169 				break;
1170 			case G_RAID_SUBDISK_S_REBUILD:
1171 				if (offset + start >= sd->sd_rebuild_pos)
1172 					goto nextdisk;
1173 				break;
1174 			default:
1175 				goto nextdisk;
1176 			}
1177 			error = g_raid_subdisk_kerneldump(sd, addr,
1178 			    offset + start, length);
1179 			if (error != 0)
1180 				return (error);
1181 nextdisk:
1182 			if (++no >= vol->v_disks_count) {
1183 				no = 0;
1184 				offset += strip_size;
1185 			}
1186 		}
1187 		remain -= length;
1188 		addr += length;
1189 		start = 0;
1190 	}
1191 	return (0);
1192 }
1193 
1194 static int
1195 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1196 {
1197 	struct bio *bp;
1198 	struct g_raid_subdisk *sd;
1199 
1200 	bp = (struct bio *)argp;
1201 	sd = (struct g_raid_subdisk *)bp->bio_caller1;
1202 	g_raid_subdisk_iostart(sd, bp);
1203 
1204 	return (0);
1205 }
1206 
1207 static int
1208 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1209 {
1210 	struct g_raid_tr_raid1e_object *trs;
1211 	struct g_raid_volume *vol;
1212 
1213 	vol = tr->tro_volume;
1214 	trs = (struct g_raid_tr_raid1e_object *)tr;
1215 	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1216 	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1217 	/* Compensate short rebuild I/Os. */
1218 	if ((vol->v_disks_count % N) != 0 &&
1219 	    vol->v_strip_size < g_raid1e_rebuild_slab) {
1220 		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1221 		trs->trso_recover_slabs /= vol->v_strip_size;
1222 	}
1223 	if (trs->trso_type == TR_RAID1E_REBUILD)
1224 		g_raid_tr_raid1e_rebuild_some(tr);
1225 	return (0);
1226 }
1227 
1228 static int
1229 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1230 {
1231 	struct g_raid_tr_raid1e_object *trs;
1232 
1233 	trs = (struct g_raid_tr_raid1e_object *)tr;
1234 
1235 	if (trs->trso_buffer != NULL) {
1236 		free(trs->trso_buffer, M_TR_RAID1E);
1237 		trs->trso_buffer = NULL;
1238 	}
1239 	return (0);
1240 }
1241 
/*
 * Declare the "RAID1E" transformation under the identifier raid1e.
 * NOTE(review): G_RAID_TR_DECLARE is defined outside this chunk
 * (presumably in g_raid.h); confirm there what it registers.
 */
G_RAID_TR_DECLARE(raid1e, "RAID1E");
1243