xref: /freebsd/sys/geom/raid/tr_raid1e.c (revision edf8578117e8844e02c0121147f45e4609b30680)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/cdefs.h>
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/endian.h>
33 #include <sys/kernel.h>
34 #include <sys/kobj.h>
35 #include <sys/limits.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include <geom/geom_dbg.h>
43 #include "geom/raid/g_raid.h"
44 #include "g_raid_tr_if.h"
45 
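/* Number of copies kept of each virtual strip in this RAID1E layout. */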
46 #define N	2
47 
48 SYSCTL_DECL(_kern_geom_raid_raid1e);
49 
50 #define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
51 static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
52 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
53     &g_raid1e_rebuild_slab, 0,
54     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
55 
56 #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
57 static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
58 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
59     &g_raid1e_rebuild_fair_io, 0,
60     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
61 
62 #define RAID1E_REBUILD_CLUSTER_IDLE 100
63 static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
64 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
65     &g_raid1e_rebuild_cluster_idle, 0,
66     "Number of slabs to do each time we trigger a rebuild cycle");
67 
68 #define RAID1E_REBUILD_META_UPDATE 1024 /* Update metadata every 1GB or so. */
69 static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
70 SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
71     &g_raid1e_rebuild_meta_update, 0,
72     "When to update the meta data.");
73 
74 static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
75 
76 #define TR_RAID1E_NONE 0
77 #define TR_RAID1E_REBUILD 1
78 #define TR_RAID1E_RESYNC 2
79 
80 #define TR_RAID1E_F_DOING_SOME	0x1
81 #define TR_RAID1E_F_LOCKED	0x2
82 #define TR_RAID1E_F_ABORT	0x4
83 
84 struct g_raid_tr_raid1e_object {
85 	struct g_raid_tr_object	 trso_base;
86 	int			 trso_starting;
87 	int			 trso_stopping;
88 	int			 trso_type;
89 	int			 trso_recover_slabs; /* slabs before rest */
90 	int			 trso_fair_io;
91 	int			 trso_meta_update;
92 	int			 trso_flags;
93 	struct g_raid_subdisk	*trso_failed_sd; /* Subdisk being rebuilt. */
94 	void			*trso_buffer;	 /* Buffer space */
95 	off_t			 trso_lock_pos; /* Locked range start. */
96 	off_t			 trso_lock_len; /* Locked range length. */
97 	struct bio		 trso_bio;
98 };
99 
100 static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
101 static g_raid_tr_event_t g_raid_tr_event_raid1e;
102 static g_raid_tr_start_t g_raid_tr_start_raid1e;
103 static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
104 static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
105 static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
106 static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
107 static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
108 static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
109 static g_raid_tr_free_t g_raid_tr_free_raid1e;
110 
111 static kobj_method_t g_raid_tr_raid1e_methods[] = {
112 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
113 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
114 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
115 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
116 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
117 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
118 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
119 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
120 	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
121 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
122 	{ 0, 0 }
123 };
124 
125 static struct g_raid_tr_class g_raid_tr_raid1e_class = {
126 	"RAID1E",
127 	g_raid_tr_raid1e_methods,
128 	sizeof(struct g_raid_tr_raid1e_object),
129 	.trc_enable = 1,
130 	.trc_priority = 200,
131 	.trc_accept_unmapped = 1
132 };
133 
134 static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
135 static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
136     struct g_raid_subdisk *sd);
137 static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
138     int no, off_t off, off_t len, u_int mask);
139 
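/*
 * Virtual <-> physical address translation for the RAID1E (adjacent copies,
 * R1EA) layout: the N copies of each virtual strip occupy N consecutive
 * physical strips, numbered row-major across the member disks.  For example,
 * with three disks and N = 2, virtual strips 0, 1 and 2 map to physical
 * strips (d0,0)(d1,0), (d2,0)(d0,1) and (d1,1)(d2,1), where (dX,Y) is the
 * strip at offset Y * strip_size on disk X.
 */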
140 static inline void
141 V2P(struct g_raid_volume *vol, off_t virt,
142     int *disk, off_t *offset, off_t *start)
143 {
144 	off_t nstrip;
145 	u_int strip_size;
146 
147 	strip_size = vol->v_strip_size;
148 	/* Strip number. */
149 	nstrip = virt / strip_size;
150 	/* Start position in strip. */
151 	*start = virt % strip_size;
152 	/* Disk number. */
153 	*disk = (nstrip * N) % vol->v_disks_count;
154 	/* Strip start position in disk. */
155 	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
156 }
157 
158 static inline void
159 P2V(struct g_raid_volume *vol, int disk, off_t offset,
160     off_t *virt, int *copy)
161 {
162 	off_t nstrip, start;
163 	u_int strip_size;
164 
165 	strip_size = vol->v_strip_size;
166 	/* Start position in strip. */
167 	start = offset % strip_size;
168 	/* Physical strip number. */
169 	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
170 	/* Number of physical strip (copy) inside virtual strip. */
171 	*copy = nstrip % N;
172 	/* Offset in virtual space. */
173 	*virt = (nstrip / N) * strip_size + start;
174 }
175 
176 static int
177 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
178 {
179 	struct g_raid_tr_raid1e_object *trs;
180 
181 	trs = (struct g_raid_tr_raid1e_object *)tr;
182 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
183 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
184 		return (G_RAID_TR_TASTE_FAIL);
185 	trs->trso_starting = 1;
186 	return (G_RAID_TR_TASTE_SUCCEED);
187 }
188 
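/*
 * When the number of disks is a multiple of N, copies never wrap around the
 * end of a row, so the subdisks form fixed groups of N mirrors.  The volume
 * state is the worst state found among those groups.
 */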
189 static int
190 g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
191 {
192 	struct g_raid_softc *sc;
193 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
194 	int i, j, state, sstate;
195 
196 	sc = vol->v_softc;
197 	state = G_RAID_VOLUME_S_OPTIMAL;
198 	for (i = 0; i < vol->v_disks_count / N; i++) {
199 		bestsd = &vol->v_subdisks[i * N];
200 		for (j = 1; j < N; j++) {
201 			sd = &vol->v_subdisks[i * N + j];
202 			if (sd->sd_state > bestsd->sd_state)
203 				bestsd = sd;
204 			else if (sd->sd_state == bestsd->sd_state &&
205 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
206 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
207 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
208 				bestsd = sd;
209 		}
210 		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
211 		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
212 			/* We found a reasonable candidate. */
213 			G_RAID_DEBUG1(1, sc,
214 			    "Promote subdisk %s:%d from %s to ACTIVE.",
215 			    vol->v_name, bestsd->sd_pos,
216 			    g_raid_subdisk_state2str(bestsd->sd_state));
217 			g_raid_change_subdisk_state(bestsd,
218 			    G_RAID_SUBDISK_S_ACTIVE);
219 			g_raid_write_metadata(sc,
220 			    vol, bestsd, bestsd->sd_disk);
221 		}
222 		worstsd = &vol->v_subdisks[i * N];
223 		for (j = 1; j < N; j++) {
224 			sd = &vol->v_subdisks[i * N + j];
225 			if (sd->sd_state < worstsd->sd_state)
226 				worstsd = sd;
227 		}
228 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
229 			sstate = G_RAID_VOLUME_S_OPTIMAL;
230 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
231 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
232 		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
233 			sstate = G_RAID_VOLUME_S_DEGRADED;
234 		else
235 			sstate = G_RAID_VOLUME_S_BROKEN;
236 		if (sstate < state)
237 			state = sstate;
238 	}
239 	return (state);
240 }
241 
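/*
 * When the number of disks is not a multiple of N, copy placement rotates
 * around the disks, so any N neighbouring subdisks (modulo v_disks_count)
 * mirror some range.  UNINITIALIZED subdisks are first promoted to STALE,
 * then every such group is examined and the worst one defines the volume
 * state.
 */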
242 static int
243 g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
244 {
245 	struct g_raid_softc *sc;
246 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
247 	int i, j, state, sstate;
248 
249 	sc = vol->v_softc;
250 	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
251 	    vol->v_disks_count)
252 		return (G_RAID_VOLUME_S_OPTIMAL);
253 	for (i = 0; i < vol->v_disks_count; i++) {
254 		sd = &vol->v_subdisks[i];
255 		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
256 			/* We found a reasonable candidate. */
257 			G_RAID_DEBUG1(1, sc,
258 			    "Promote subdisk %s:%d from %s to STALE.",
259 			    vol->v_name, sd->sd_pos,
260 			    g_raid_subdisk_state2str(sd->sd_state));
261 			g_raid_change_subdisk_state(sd,
262 			    G_RAID_SUBDISK_S_STALE);
263 			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
264 		}
265 	}
266 	state = G_RAID_VOLUME_S_OPTIMAL;
267 	for (i = 0; i < vol->v_disks_count; i++) {
268 		bestsd = &vol->v_subdisks[i];
269 		worstsd = &vol->v_subdisks[i];
270 		for (j = 1; j < N; j++) {
271 			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
272 			if (sd->sd_state > bestsd->sd_state)
273 				bestsd = sd;
274 			else if (sd->sd_state == bestsd->sd_state &&
275 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
276 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
277 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
278 				bestsd = sd;
279 			if (sd->sd_state < worstsd->sd_state)
280 				worstsd = sd;
281 		}
282 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
283 			sstate = G_RAID_VOLUME_S_OPTIMAL;
284 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
285 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
286 		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
287 			sstate = G_RAID_VOLUME_S_DEGRADED;
288 		else
289 			sstate = G_RAID_VOLUME_S_BROKEN;
290 		if (sstate < state)
291 			state = sstate;
292 	}
293 	return (state);
294 }
295 
296 static int
297 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
298     struct g_raid_subdisk *sd)
299 {
300 	struct g_raid_tr_raid1e_object *trs;
301 	struct g_raid_softc *sc;
302 	u_int s;
303 
304 	sc = vol->v_softc;
305 	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
306 	if (trs->trso_stopping &&
307 	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
308 		s = G_RAID_VOLUME_S_STOPPED;
309 	else if (trs->trso_starting)
310 		s = G_RAID_VOLUME_S_STARTING;
311 	else {
312 		if ((vol->v_disks_count % N) == 0)
313 			s = g_raid_tr_update_state_raid1e_even(vol);
314 		else
315 			s = g_raid_tr_update_state_raid1e_odd(vol);
316 	}
317 	if (s != vol->v_state) {
318 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
319 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
320 		    G_RAID_EVENT_VOLUME);
321 		g_raid_change_volume_state(vol, s);
322 		if (!trs->trso_starting && !trs->trso_stopping)
323 			g_raid_write_metadata(sc, vol, NULL, NULL);
324 	}
325 	if (!trs->trso_starting && !trs->trso_stopping)
326 		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
327 	return (0);
328 }
329 
330 static void
331 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
332     struct g_raid_disk *disk)
333 {
334 	struct g_raid_volume *vol;
335 
336 	vol = sd->sd_volume;
337 	/*
338 	 * We don't fail the last disk in the pack, since it still has decent
339 	 * data on it and that's better than failing the disk if it is the root
340 	 * file system.
341 	 *
342 	 * XXX should this be controlled via a tunable?  It makes sense for
343 	 * the volume that has / on it.  I can't think of a case where we'd
344 	 * want the volume to go away on this kind of event.
345 	 */
346 	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
347 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
348 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
349 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
350 	     vol->v_disks_count) &&
351 	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
352 		return;
353 	g_raid_fail_disk(sc, sd, disk);
354 }
355 
356 static void
357 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
358 {
359 	struct g_raid_volume *vol;
360 	struct g_raid_subdisk *sd;
361 
362 	vol = trs->trso_base.tro_volume;
363 	sd = trs->trso_failed_sd;
364 	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
365 	free(trs->trso_buffer, M_TR_RAID1E);
366 	trs->trso_buffer = NULL;
367 	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
368 	trs->trso_type = TR_RAID1E_NONE;
369 	trs->trso_recover_slabs = 0;
370 	trs->trso_failed_sd = NULL;
371 	g_raid_tr_update_state_raid1e(vol, NULL);
372 }
373 
374 static void
375 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
376 {
377 	struct g_raid_tr_raid1e_object *trs;
378 	struct g_raid_subdisk *sd;
379 
380 	trs = (struct g_raid_tr_raid1e_object *)tr;
381 	sd = trs->trso_failed_sd;
382 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
383 	    "Subdisk %s:%d-%s rebuild completed.",
384 	    sd->sd_volume->v_name, sd->sd_pos,
385 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
386 	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
387 	sd->sd_rebuild_pos = 0;
388 	g_raid_tr_raid1e_rebuild_done(trs);
389 }
390 
391 static void
392 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
393 {
394 	struct g_raid_tr_raid1e_object *trs;
395 	struct g_raid_subdisk *sd;
396 	struct g_raid_volume *vol;
397 
398 	vol = tr->tro_volume;
399 	trs = (struct g_raid_tr_raid1e_object *)tr;
400 	sd = trs->trso_failed_sd;
401 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
402 		G_RAID_DEBUG1(1, vol->v_softc,
403 		    "Subdisk %s:%d-%s rebuild is aborting.",
404 		    sd->sd_volume->v_name, sd->sd_pos,
405 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
406 		trs->trso_flags |= TR_RAID1E_F_ABORT;
407 	} else {
408 		G_RAID_DEBUG1(0, vol->v_softc,
409 		    "Subdisk %s:%d-%s rebuild aborted.",
410 		    sd->sd_volume->v_name, sd->sd_pos,
411 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
412 		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
413 		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
414 			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
415 			g_raid_unlock_range(tr->tro_volume,
416 			    trs->trso_lock_pos, trs->trso_lock_len);
417 		}
418 		g_raid_tr_raid1e_rebuild_done(trs);
419 	}
420 }
421 
422 static void
423 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
424 {
425 	struct g_raid_tr_raid1e_object *trs;
426 	struct g_raid_softc *sc;
427 	struct g_raid_volume *vol;
428 	struct g_raid_subdisk *sd;
429 	struct bio *bp;
430 	off_t len, virtual, vend, offset, start;
431 	int disk, copy, best;
432 
433 	trs = (struct g_raid_tr_raid1e_object *)tr;
434 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
435 		return;
436 	vol = tr->tro_volume;
437 	sc = vol->v_softc;
438 	sd = trs->trso_failed_sd;
439 
440 	while (1) {
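	/*
	 * Walk forward from the current rebuild position, skipping ranges
	 * where the failed subdisk itself already holds the best copy, until
	 * we find a range that has to be copied from another disk.
	 */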
441 		if (sd->sd_rebuild_pos >= sd->sd_size) {
442 			g_raid_tr_raid1e_rebuild_finish(tr);
443 			return;
444 		}
445 		/* Get virtual offset from physical rebuild position. */
446 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
447 		/* Get physical offset back to get first stripe position. */
448 		V2P(vol, virtual, &disk, &offset, &start);
449 		/* Calculate contiguous data length. */
450 		len = MIN(g_raid1e_rebuild_slab,
451 		    sd->sd_size - sd->sd_rebuild_pos);
452 		if ((vol->v_disks_count % N) != 0)
453 			len = MIN(len, vol->v_strip_size - start);
454 		/* Find disk with most accurate data. */
455 		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
456 		    offset + start, len, 0);
457 		if (best < 0) {
458 			/* There is no valid disk. */
459 			g_raid_tr_raid1e_rebuild_abort(tr);
460 			return;
461 		} else if (best != copy) {
462 			/* Some other disk has better data. */
463 			break;
464 		}
465 		/* We have the most accurate data. Skip the range. */
466 		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
467 		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
468 		sd->sd_rebuild_pos += len;
469 	}
470 
471 	bp = &trs->trso_bio;
472 	memset(bp, 0, sizeof(*bp));
473 	bp->bio_offset = offset + start +
474 	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
475 	bp->bio_length = len;
476 	bp->bio_data = trs->trso_buffer;
477 	bp->bio_cmd = BIO_READ;
478 	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
479 	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
480 	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
481 	/*
482 	 * If we are crossing stripe boundary, correct affected virtual
483 	 * range we should lock.
484 	 */
485 	if (start + len > vol->v_strip_size) {
486 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
487 		len = vend - virtual;
488 	}
489 	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
490 	trs->trso_flags |= TR_RAID1E_F_LOCKED;
491 	trs->trso_lock_pos = virtual;
492 	trs->trso_lock_len = len;
493 	/* Lock callback starts I/O */
494 	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
495 }
496 
497 static void
498 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
499 {
500 	struct g_raid_volume *vol;
501 	struct g_raid_tr_raid1e_object *trs;
502 	struct g_raid_subdisk *sd;
503 
504 	vol = tr->tro_volume;
505 	trs = (struct g_raid_tr_raid1e_object *)tr;
506 	if (trs->trso_failed_sd) {
507 		G_RAID_DEBUG1(1, vol->v_softc,
508 		    "Already rebuild in start rebuild. pos %jd\n",
509 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
510 		return;
511 	}
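	/*
	 * Prefer resuming an interrupted RESYNC or REBUILD; otherwise start
	 * a STALE subdisk as RESYNC, or an UNINITIALIZED/NEW one as REBUILD
	 * from position zero.
	 */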
512 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
513 	if (sd == NULL)
514 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
515 	if (sd == NULL) {
516 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
517 		if (sd != NULL) {
518 			sd->sd_rebuild_pos = 0;
519 			g_raid_change_subdisk_state(sd,
520 			    G_RAID_SUBDISK_S_RESYNC);
521 			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
522 		} else {
523 			sd = g_raid_get_subdisk(vol,
524 			    G_RAID_SUBDISK_S_UNINITIALIZED);
525 			if (sd == NULL)
526 				sd = g_raid_get_subdisk(vol,
527 				    G_RAID_SUBDISK_S_NEW);
528 			if (sd != NULL) {
529 				sd->sd_rebuild_pos = 0;
530 				g_raid_change_subdisk_state(sd,
531 				    G_RAID_SUBDISK_S_REBUILD);
532 				g_raid_write_metadata(vol->v_softc,
533 				    vol, sd, NULL);
534 			}
535 		}
536 	}
537 	if (sd == NULL) {
538 		G_RAID_DEBUG1(1, vol->v_softc,
539 		    "No failed disk to rebuild.  night night.");
540 		return;
541 	}
542 	trs->trso_failed_sd = sd;
543 	G_RAID_DEBUG1(0, vol->v_softc,
544 	    "Subdisk %s:%d-%s rebuild start at %jd.",
545 	    sd->sd_volume->v_name, sd->sd_pos,
546 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
547 	    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
548 	trs->trso_type = TR_RAID1E_REBUILD;
549 	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
550 	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
551 	g_raid_tr_raid1e_rebuild_some(tr);
552 }
553 
554 static void
555 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
556     struct g_raid_subdisk *sd)
557 {
558 	struct g_raid_volume *vol;
559 	struct g_raid_tr_raid1e_object *trs;
560 	int nr;
561 
562 	vol = tr->tro_volume;
563 	trs = (struct g_raid_tr_raid1e_object *)tr;
564 	if (trs->trso_stopping)
565 		return;
566 	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
567 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
568 	switch (trs->trso_type) {
569 	case TR_RAID1E_NONE:
570 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
571 			return;
572 		if (nr == 0) {
573 			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
574 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
575 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
576 			if (nr == 0)
577 				return;
578 		}
579 		g_raid_tr_raid1e_rebuild_start(tr);
580 		break;
581 	case TR_RAID1E_REBUILD:
582 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
583 		    trs->trso_failed_sd == sd)
584 			g_raid_tr_raid1e_rebuild_abort(tr);
585 		break;
586 	case TR_RAID1E_RESYNC:
587 		break;
588 	}
589 }
590 
591 static int
592 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
593     struct g_raid_subdisk *sd, u_int event)
594 {
595 
596 	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
597 	return (0);
598 }
599 
600 static int
601 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
602 {
603 	struct g_raid_tr_raid1e_object *trs;
604 	struct g_raid_volume *vol;
605 
606 	trs = (struct g_raid_tr_raid1e_object *)tr;
607 	vol = tr->tro_volume;
608 	trs->trso_starting = 0;
609 	g_raid_tr_update_state_raid1e(vol, NULL);
610 	return (0);
611 }
612 
613 static int
614 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
615 {
616 	struct g_raid_tr_raid1e_object *trs;
617 	struct g_raid_volume *vol;
618 
619 	trs = (struct g_raid_tr_raid1e_object *)tr;
620 	vol = tr->tro_volume;
621 	trs->trso_starting = 0;
622 	trs->trso_stopping = 1;
623 	g_raid_tr_update_state_raid1e(vol, NULL);
624 	return (0);
625 }
626 
627 /*
628  * Select the disk to read from.  Take into account: subdisk state, running
629  * error recovery, average disk load, head position and possible cache hits.
630  */
631 #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
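/*
 * Lower prio wins.  The value packs, from most to least significant: a
 * penalty (scaled by copy number, bits 24+) for copies that are not fully
 * synchronized at this offset, the number of recovery operations already
 * running on the disk (bits 16+), the current disk load, and a bonus when
 * the disk head is at or near the requested position.
 */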
632 static int
633 g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
634     int no, off_t off, off_t len, u_int mask)
635 {
636 	struct g_raid_subdisk *sd;
637 	off_t offset;
638 	int i, best, prio, bestprio;
639 
640 	best = -1;
641 	bestprio = INT_MAX;
642 	for (i = 0; i < N; i++) {
643 		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
644 		offset = off;
645 		if (no + i >= vol->v_disks_count)
646 			offset += vol->v_strip_size;
647 
648 		prio = G_RAID_SUBDISK_LOAD(sd);
649 		if ((mask & (1 << sd->sd_pos)) != 0)
650 			continue;
651 		switch (sd->sd_state) {
652 		case G_RAID_SUBDISK_S_ACTIVE:
653 			break;
654 		case G_RAID_SUBDISK_S_RESYNC:
655 			if (offset + off < sd->sd_rebuild_pos)
656 				break;
657 			/* FALLTHROUGH */
658 		case G_RAID_SUBDISK_S_STALE:
659 			prio += i << 24;
660 			break;
661 		case G_RAID_SUBDISK_S_REBUILD:
662 			if (offset + off < sd->sd_rebuild_pos)
663 				break;
664 			/* FALLTHROUGH */
665 		default:
666 			continue;
667 		}
668 		prio += min(sd->sd_recovery, 255) << 16;
669 		/* If disk head is precisely in position - highly prefer it. */
670 		if (G_RAID_SUBDISK_POS(sd) == offset)
671 			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
672 		else
673 		/* If disk head is close to position - prefer it. */
674 		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
675 		    G_RAID_SUBDISK_TRACK_SIZE)
676 			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
677 		if (prio < bestprio) {
678 			bestprio = prio;
679 			best = i;
680 		}
681 	}
682 	return (best);
683 }
684 
685 static void
686 g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
687 {
688 	struct g_raid_volume *vol;
689 	struct g_raid_subdisk *sd;
690 	struct bio_queue_head queue;
691 	struct bio *cbp;
692 	char *addr;
693 	off_t offset, start, length, remain;
694 	u_int no, strip_size;
695 	int best;
696 
697 	vol = tr->tro_volume;
698 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
699 		addr = NULL;
700 	else
701 		addr = bp->bio_data;
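	/*
	 * For unmapped bios addr starts at zero and is used only as a byte
	 * offset into the parent's page list; otherwise it walks the mapped
	 * data buffer directly.
	 */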
702 	strip_size = vol->v_strip_size;
703 	V2P(vol, bp->bio_offset, &no, &offset, &start);
704 	remain = bp->bio_length;
705 	bioq_init(&queue);
706 	while (remain > 0) {
707 		length = MIN(strip_size - start, remain);
708 		best = g_raid_tr_raid1e_select_read_disk(vol,
709 		    no, offset, length, 0);
710 		KASSERT(best >= 0, ("No readable disk in volume %s!",
711 		    vol->v_name));
712 		no += best;
713 		if (no >= vol->v_disks_count) {
714 			no -= vol->v_disks_count;
715 			offset += strip_size;
716 		}
717 		cbp = g_clone_bio(bp);
718 		if (cbp == NULL)
719 			goto failure;
720 		cbp->bio_offset = offset + start;
721 		cbp->bio_length = length;
722 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
723 			cbp->bio_ma_offset += (uintptr_t)addr;
724 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
725 			cbp->bio_ma_offset %= PAGE_SIZE;
726 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
727 			    cbp->bio_length) / PAGE_SIZE;
728 		} else
729 			cbp->bio_data = addr;
730 		cbp->bio_caller1 = &vol->v_subdisks[no];
731 		bioq_insert_tail(&queue, cbp);
732 		no += N - best;
733 		if (no >= vol->v_disks_count) {
734 			no -= vol->v_disks_count;
735 			offset += strip_size;
736 		}
737 		remain -= length;
738 		addr += length;
739 		start = 0;
740 	}
741 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
742 		sd = cbp->bio_caller1;
743 		cbp->bio_caller1 = NULL;
744 		g_raid_subdisk_iostart(sd, cbp);
745 	}
746 	return;
747 failure:
748 	while ((cbp = bioq_takefirst(&queue)) != NULL)
749 		g_destroy_bio(cbp);
750 	if (bp->bio_error == 0)
751 		bp->bio_error = ENOMEM;
752 	g_raid_iodone(bp, bp->bio_error);
753 }
754 
755 static void
756 g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
757 {
758 	struct g_raid_volume *vol;
759 	struct g_raid_subdisk *sd;
760 	struct bio_queue_head queue;
761 	struct bio *cbp;
762 	char *addr;
763 	off_t offset, start, length, remain;
764 	u_int no, strip_size;
765 	int i;
766 
767 	vol = tr->tro_volume;
768 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
769 		addr = NULL;
770 	else
771 		addr = bp->bio_data;
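	/*
	 * As in the read path, addr is either a pointer into the mapped data
	 * or a byte offset for unmapped bios; BIO_DELETE requests carry no
	 * data, so addr is not advanced for them.
	 */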
772 	strip_size = vol->v_strip_size;
773 	V2P(vol, bp->bio_offset, &no, &offset, &start);
774 	remain = bp->bio_length;
775 	bioq_init(&queue);
776 	while (remain > 0) {
777 		length = MIN(strip_size - start, remain);
778 		for (i = 0; i < N; i++) {
779 			sd = &vol->v_subdisks[no];
780 			switch (sd->sd_state) {
781 			case G_RAID_SUBDISK_S_ACTIVE:
782 			case G_RAID_SUBDISK_S_STALE:
783 			case G_RAID_SUBDISK_S_RESYNC:
784 				break;
785 			case G_RAID_SUBDISK_S_REBUILD:
786 				if (offset + start >= sd->sd_rebuild_pos)
787 					goto nextdisk;
788 				break;
789 			default:
790 				goto nextdisk;
791 			}
792 			cbp = g_clone_bio(bp);
793 			if (cbp == NULL)
794 				goto failure;
795 			cbp->bio_offset = offset + start;
796 			cbp->bio_length = length;
797 			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
798 			    bp->bio_cmd != BIO_DELETE) {
799 				cbp->bio_ma_offset += (uintptr_t)addr;
800 				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
801 				cbp->bio_ma_offset %= PAGE_SIZE;
802 				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
803 				    cbp->bio_length) / PAGE_SIZE;
804 			} else
805 				cbp->bio_data = addr;
806 			cbp->bio_caller1 = sd;
807 			bioq_insert_tail(&queue, cbp);
808 nextdisk:
809 			if (++no >= vol->v_disks_count) {
810 				no = 0;
811 				offset += strip_size;
812 			}
813 		}
814 		remain -= length;
815 		if (bp->bio_cmd != BIO_DELETE)
816 			addr += length;
817 		start = 0;
818 	}
819 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
820 		sd = cbp->bio_caller1;
821 		cbp->bio_caller1 = NULL;
822 		g_raid_subdisk_iostart(sd, cbp);
823 	}
824 	return;
825 failure:
826 	while ((cbp = bioq_takefirst(&queue)) != NULL)
827 		g_destroy_bio(cbp);
828 	if (bp->bio_error == 0)
829 		bp->bio_error = ENOMEM;
830 	g_raid_iodone(bp, bp->bio_error);
831 }
832 
833 static void
834 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
835 {
836 	struct g_raid_volume *vol;
837 	struct g_raid_tr_raid1e_object *trs;
838 
839 	vol = tr->tro_volume;
840 	trs = (struct g_raid_tr_raid1e_object *)tr;
841 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
842 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
843 	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
844 		g_raid_iodone(bp, EIO);
845 		return;
846 	}
847 	/*
848 	 * If we're rebuilding, squeeze in rebuild activity every so often,
849 	 * even when the disk is busy.  Be sure to only count real I/O
850 	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
851 	 * by this module.
852 	 */
853 	if (trs->trso_failed_sd != NULL &&
854 	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
855 		/* Make this new or already running round short. */
856 		trs->trso_recover_slabs = 0;
857 		if (--trs->trso_fair_io <= 0) {
858 			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
859 			g_raid_tr_raid1e_rebuild_some(tr);
860 		}
861 	}
862 	switch (bp->bio_cmd) {
863 	case BIO_READ:
864 		g_raid_tr_iostart_raid1e_read(tr, bp);
865 		break;
866 	case BIO_WRITE:
867 	case BIO_DELETE:
868 		g_raid_tr_iostart_raid1e_write(tr, bp);
869 		break;
870 	case BIO_SPEEDUP:
871 	case BIO_FLUSH:
872 		g_raid_tr_flush_common(tr, bp);
873 		break;
874 	default:
875 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
876 		    bp->bio_cmd, vol->v_name));
877 		break;
878 	}
879 }
880 
881 static void
882 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
883     struct g_raid_subdisk *sd, struct bio *bp)
884 {
885 	struct bio *cbp;
886 	struct g_raid_subdisk *nsd;
887 	struct g_raid_volume *vol;
888 	struct bio *pbp;
889 	struct g_raid_tr_raid1e_object *trs;
890 	off_t virtual, offset, start;
891 	uintptr_t mask;
892 	int error, do_write, copy, disk, best;
893 
894 	trs = (struct g_raid_tr_raid1e_object *)tr;
895 	vol = tr->tro_volume;
896 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
897 		if (trs->trso_type == TR_RAID1E_REBUILD) {
898 			nsd = trs->trso_failed_sd;
899 			if (bp->bio_cmd == BIO_READ) {
900 				/* Immediately abort rebuild, if requested. */
901 				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
902 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
903 					g_raid_tr_raid1e_rebuild_abort(tr);
904 					return;
905 				}
906 
907 				/* On read error, skip and cross fingers. */
908 				if (bp->bio_error != 0) {
909 					G_RAID_LOGREQ(0, bp,
910 					    "Read error during rebuild (%d), "
911 					    "possible data loss!",
912 					    bp->bio_error);
913 					goto rebuild_round_done;
914 				}
915 
916 				/*
917 				 * The read operation finished, queue the
918 				 * write and get out.
919 				 */
920 				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
921 				    bp->bio_error);
922 				bp->bio_cmd = BIO_WRITE;
923 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
924 				bp->bio_offset = nsd->sd_rebuild_pos;
925 				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
926 				g_raid_subdisk_iostart(nsd, bp);
927 			} else {
928 				/*
929 				 * The write operation just finished.  Do
930 				 * another.  We keep cloning the master bio
931 				 * since it has the right buffers allocated to
932 				 * it.
933 				 */
934 				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
935 				    bp->bio_error);
936 				if (bp->bio_error != 0 ||
937 				    trs->trso_flags & TR_RAID1E_F_ABORT) {
938 					if ((trs->trso_flags &
939 					    TR_RAID1E_F_ABORT) == 0) {
940 						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
941 						    nsd, nsd->sd_disk);
942 					}
943 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
944 					g_raid_tr_raid1e_rebuild_abort(tr);
945 					return;
946 				}
947 rebuild_round_done:
948 				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
949 				g_raid_unlock_range(tr->tro_volume,
950 				    trs->trso_lock_pos, trs->trso_lock_len);
951 				nsd->sd_rebuild_pos += bp->bio_length;
952 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
953 					g_raid_tr_raid1e_rebuild_finish(tr);
954 					return;
955 				}
956 
957 				/* Abort rebuild if we are stopping */
958 				if (trs->trso_stopping) {
959 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
960 					g_raid_tr_raid1e_rebuild_abort(tr);
961 					return;
962 				}
963 
964 				if (--trs->trso_meta_update <= 0) {
965 					g_raid_write_metadata(vol->v_softc,
966 					    vol, nsd, nsd->sd_disk);
967 					trs->trso_meta_update =
968 					    g_raid1e_rebuild_meta_update;
969 					/* Compensate short rebuild I/Os. */
970 					if ((vol->v_disks_count % N) != 0 &&
971 					    vol->v_strip_size <
972 					     g_raid1e_rebuild_slab) {
973 						trs->trso_meta_update *=
974 						    g_raid1e_rebuild_slab;
975 						trs->trso_meta_update /=
976 						    vol->v_strip_size;
977 					}
978 				}
979 				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
980 				if (--trs->trso_recover_slabs <= 0)
981 					return;
982 				/* Run next rebuild iteration. */
983 				g_raid_tr_raid1e_rebuild_some(tr);
984 			}
985 		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
986 			/*
987 			 * read good sd, read bad sd in parallel.  when both
988 			 * done, compare the buffers.  write good to the bad
989 			 * if different.  do the next bit of work.
990 			 */
991 			panic("Somehow, we think we're doing a resync");
992 		}
993 		return;
994 	}
995 	pbp = bp->bio_parent;
996 	pbp->bio_inbed++;
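	/*
	 * For regular requests bio_caller2 carries a bitmask of the copies
	 * already tried for this range; bit 31 marks a recovery
	 * (read-then-rewrite) pass that holds a range lock.
	 */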
997 	mask = (intptr_t)bp->bio_caller2;
998 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
999 		/*
1000 		 * Read failed on first drive.  Retry the read error on
1001 		 * another disk drive, if available, before erroring out the
1002 		 * read.
1003 		 */
1004 		sd->sd_disk->d_read_errs++;
1005 		G_RAID_LOGREQ(0, bp,
1006 		    "Read error (%d), %d read errors total",
1007 		    bp->bio_error, sd->sd_disk->d_read_errs);
1008 
1009 		/*
1010 		 * If there are too many read errors, we move to degraded.
1011 		 * XXX Do we want to FAIL the drive (eg, make the user redo
1012 		 * everything to get it back in sync), or just degrade the
1013 		 * drive, which kicks off a resync?
1014 		 */
1015 		do_write = 0;
1016 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1017 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1018 		else if (mask == 0)
1019 			do_write = 1;
1020 
1021 		/* Restore what we were doing. */
1022 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1023 		V2P(vol, virtual, &disk, &offset, &start);
1024 
1025 		/* Find the other disk, and try to do the I/O to it. */
1026 		mask |= 1 << copy;
1027 		best = g_raid_tr_raid1e_select_read_disk(vol,
1028 		    disk, offset, start, mask);
1029 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1030 			disk += best;
1031 			if (disk >= vol->v_disks_count) {
1032 				disk -= vol->v_disks_count;
1033 				offset += vol->v_strip_size;
1034 			}
1035 			cbp->bio_offset = offset + start;
1036 			cbp->bio_length = bp->bio_length;
1037 			cbp->bio_data = bp->bio_data;
1038 			cbp->bio_ma = bp->bio_ma;
1039 			cbp->bio_ma_offset = bp->bio_ma_offset;
1040 			cbp->bio_ma_n = bp->bio_ma_n;
1041 			g_destroy_bio(bp);
1042 			nsd = &vol->v_subdisks[disk];
1043 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1044 			    nsd->sd_pos);
1045 			if (do_write)
1046 				mask |= 1U << 31;
1047 			if ((mask & (1U << 31)) != 0)
1048 				sd->sd_recovery++;
1049 			cbp->bio_caller2 = (void *)mask;
1050 			if (do_write) {
1051 				cbp->bio_caller1 = nsd;
1052 				/* Lock callback starts I/O */
1053 				g_raid_lock_range(sd->sd_volume,
1054 				    virtual, cbp->bio_length, pbp, cbp);
1055 			} else {
1056 				g_raid_subdisk_iostart(nsd, cbp);
1057 			}
1058 			return;
1059 		}
1060 		/*
1061 		 * We can't retry.  Return the original error by falling
1062 		 * through.  This will happen when there's only one good disk.
1063 		 * We don't need to fail the raid, since its actual state is
1064 		 * based on the state of the subdisks.
1065 		 */
1066 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1067 	}
1068 	if (bp->bio_cmd == BIO_READ &&
1069 	    bp->bio_error == 0 &&
1070 	    (mask & (1U << 31)) != 0) {
1071 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1072 
1073 		/* Restore what we were doing. */
1074 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1075 		V2P(vol, virtual, &disk, &offset, &start);
1076 
1077 		/* Find best disk to write. */
1078 		best = g_raid_tr_raid1e_select_read_disk(vol,
1079 		    disk, offset, start, ~mask);
1080 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1081 			disk += best;
1082 			if (disk >= vol->v_disks_count) {
1083 				disk -= vol->v_disks_count;
1084 				offset += vol->v_strip_size;
1085 			}
1086 			cbp->bio_offset = offset + start;
1087 			cbp->bio_cmd = BIO_WRITE;
1088 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1089 			cbp->bio_caller2 = (void *)mask;
1090 			g_destroy_bio(bp);
1091 			G_RAID_LOGREQ(2, cbp,
1092 			    "Attempting bad sector remap on failing drive.");
1093 			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1094 			return;
1095 		}
1096 	}
1097 	if ((mask & (1U << 31)) != 0) {
1098 		/*
1099 		 * We're done with a recovery, mark the range as unlocked.
1100 		 * For any write errors, we aggressively fail the disk since
1101 		 * there was both a READ and a WRITE error at this location.
1102 		 * Both types of errors generally indicate the drive is on
1103 		 * the verge of total failure anyway.  Better to stop trusting
1104 		 * it now.  However, we need to reset error to 0 in that case
1105 		 * because we're not failing the original I/O which succeeded.
1106 		 */
1107 
1108 		/* Restore what we were doing. */
1109 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1110 		V2P(vol, virtual, &disk, &offset, &start);
1111 
1112 		for (copy = 0; copy < N; copy++) {
1113 			if ((mask & (1 << copy)) != 0)
1114 				vol->v_subdisks[(disk + copy) %
1115 				    vol->v_disks_count].sd_recovery--;
1116 		}
1117 
1118 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1119 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
1120 			    "failing subdisk.");
1121 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1122 			bp->bio_error = 0;
1123 		}
1124 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1125 		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1126 	}
1127 	if (pbp->bio_cmd != BIO_READ) {
1128 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1129 			pbp->bio_error = bp->bio_error;
1130 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1131 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1132 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1133 		}
1134 		error = pbp->bio_error;
1135 	} else
1136 		error = bp->bio_error;
1137 	g_destroy_bio(bp);
1138 	if (pbp->bio_children == pbp->bio_inbed) {
1139 		pbp->bio_completed = pbp->bio_length;
1140 		g_raid_iodone(pbp, error);
1141 	}
1142 }
1143 
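/*
 * Kernel dump requests are written synchronously to every subdisk that
 * would receive a regular write for the range, using the same layout walk
 * as the write path.
 */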
1144 static int
1145 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
1146     off_t boffset, size_t blength)
1147 {
1148 	struct g_raid_volume *vol;
1149 	struct g_raid_subdisk *sd;
1150 	struct bio_queue_head queue;
1151 	char *addr;
1152 	off_t offset, start, length, remain;
1153 	u_int no, strip_size;
1154 	int i, error;
1155 
1156 	vol = tr->tro_volume;
1157 	addr = virtual;
1158 	strip_size = vol->v_strip_size;
1159 	V2P(vol, boffset, &no, &offset, &start);
1160 	remain = blength;
1161 	bioq_init(&queue);
1162 	while (remain > 0) {
1163 		length = MIN(strip_size - start, remain);
1164 		for (i = 0; i < N; i++) {
1165 			sd = &vol->v_subdisks[no];
1166 			switch (sd->sd_state) {
1167 			case G_RAID_SUBDISK_S_ACTIVE:
1168 			case G_RAID_SUBDISK_S_STALE:
1169 			case G_RAID_SUBDISK_S_RESYNC:
1170 				break;
1171 			case G_RAID_SUBDISK_S_REBUILD:
1172 				if (offset + start >= sd->sd_rebuild_pos)
1173 					goto nextdisk;
1174 				break;
1175 			default:
1176 				goto nextdisk;
1177 			}
1178 			error = g_raid_subdisk_kerneldump(sd, addr,
1179 			    offset + start, length);
1180 			if (error != 0)
1181 				return (error);
1182 nextdisk:
1183 			if (++no >= vol->v_disks_count) {
1184 				no = 0;
1185 				offset += strip_size;
1186 			}
1187 		}
1188 		remain -= length;
1189 		addr += length;
1190 		start = 0;
1191 	}
1192 	return (0);
1193 }
1194 
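/*
 * Called when a requested range lock has been granted; start the deferred
 * bio on the subdisk stored in its bio_caller1.
 */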
1195 static int
1196 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1197 {
1198 	struct bio *bp;
1199 	struct g_raid_subdisk *sd;
1200 
1201 	bp = (struct bio *)argp;
1202 	sd = (struct g_raid_subdisk *)bp->bio_caller1;
1203 	g_raid_subdisk_iostart(sd, bp);
1204 
1205 	return (0);
1206 }
1207 
1208 static int
1209 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1210 {
1211 	struct g_raid_tr_raid1e_object *trs;
1212 	struct g_raid_volume *vol;
1213 
1214 	vol = tr->tro_volume;
1215 	trs = (struct g_raid_tr_raid1e_object *)tr;
1216 	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1217 	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1218 	/* Compensate short rebuild I/Os. */
1219 	if ((vol->v_disks_count % N) != 0 &&
1220 	    vol->v_strip_size < g_raid1e_rebuild_slab) {
1221 		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1222 		trs->trso_recover_slabs /= vol->v_strip_size;
1223 	}
1224 	if (trs->trso_type == TR_RAID1E_REBUILD)
1225 		g_raid_tr_raid1e_rebuild_some(tr);
1226 	return (0);
1227 }
1228 
1229 static int
1230 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1231 {
1232 	struct g_raid_tr_raid1e_object *trs;
1233 
1234 	trs = (struct g_raid_tr_raid1e_object *)tr;
1235 
1236 	if (trs->trso_buffer != NULL) {
1237 		free(trs->trso_buffer, M_TR_RAID1E);
1238 		trs->trso_buffer = NULL;
1239 	}
1240 	return (0);
1241 }
1242 
1243 G_RAID_TR_DECLARE(raid1e, "RAID1E");
1244