xref: /freebsd/sys/geom/raid/tr_raid1e.c (revision ce3adf4362fcca6a43e500b2531f0038adbfbd21)
1 /*-
2  * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
3  * All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include <sys/cdefs.h>
28 __FBSDID("$FreeBSD$");
29 
30 #include <sys/param.h>
31 #include <sys/bio.h>
32 #include <sys/endian.h>
33 #include <sys/kernel.h>
34 #include <sys/kobj.h>
35 #include <sys/limits.h>
36 #include <sys/lock.h>
37 #include <sys/malloc.h>
38 #include <sys/mutex.h>
39 #include <sys/sysctl.h>
40 #include <sys/systm.h>
41 #include <geom/geom.h>
42 #include "geom/raid/g_raid.h"
43 #include "g_raid_tr_if.h"
44 
/* Number of copies kept of each virtual strip; see V2P() and P2V(). */
#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

/*
 * Size in bytes of one rebuild transaction.  It bounds the length of each
 * rebuild read and is the size of the buffer allocated when a rebuild starts.
 */
#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
    &g_raid1e_rebuild_slab);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

/*
 * While a rebuild is pending, one rebuild round is squeezed in after this
 * many regular (non-SPECIAL) requests have been started.
 */
#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
    &g_raid1e_rebuild_fair_io);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when disk busy for rebuild.");

/* Default number of slabs rebuilt per triggered rebuild cycle. */
#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
    &g_raid1e_rebuild_cluster_idle);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

/*
 * Number of completed rebuild rounds between metadata writes that record
 * the rebuild position.
 */
#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
    &g_raid1e_rebuild_meta_update);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the meta data.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

/* Values of trso_type: which kind of recovery, if any, is in progress. */
#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

/* Bits of trso_flags. */
#define TR_RAID1E_F_DOING_SOME	0x1	/* A rebuild request is in flight. */
#define TR_RAID1E_F_LOCKED	0x2	/* A rebuild range lock is held/requested. */
#define TR_RAID1E_F_ABORT	0x4	/* Abort once the in-flight request ends. */
90 
/*
 * Per-volume state of the RAID1E transformation.  The generic transformation
 * object is the first member, so the code in this file casts freely between
 * struct g_raid_tr_object * and struct g_raid_tr_raid1e_object *.
 */
struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;	/* Set by taste, cleared by start/stop. */
	int			 trso_stopping;	/* Set once stop was requested. */
	int			 trso_type;	/* TR_RAID1E_NONE/REBUILD/RESYNC. */
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;	/* Requests left before a rebuild round. */
	int			 trso_meta_update; /* Rounds left before metadata write. */
	int			 trso_flags;	/* TR_RAID1E_F_* bits. */
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos; /* Locked range start. */
	off_t			 trso_lock_len; /* Locked range length. */
	struct bio		 trso_bio;	/* Reused for every rebuild read/write. */
};
106 
/* Transformation methods implemented by this module. */
static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

/* kobj dispatch table binding the generic method names to the above. */
static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

/*
 * Class descriptor: name, method table and per-object size, followed by
 * the enable flag and priority of this transformation class.
 */
static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200
};

/* Helpers that are used before their definitions. */
static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);
145 
146 static inline void
147 V2P(struct g_raid_volume *vol, off_t virt,
148     int *disk, off_t *offset, off_t *start)
149 {
150 	off_t nstrip;
151 	u_int strip_size;
152 
153 	strip_size = vol->v_strip_size;
154 	/* Strip number. */
155 	nstrip = virt / strip_size;
156 	/* Start position in strip. */
157 	*start = virt % strip_size;
158 	/* Disk number. */
159 	*disk = (nstrip * N) % vol->v_disks_count;
160 	/* Strip start position in disk. */
161 	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
162 }
163 
164 static inline void
165 P2V(struct g_raid_volume *vol, int disk, off_t offset,
166     off_t *virt, int *copy)
167 {
168 	off_t nstrip, start;
169 	u_int strip_size;
170 
171 	strip_size = vol->v_strip_size;
172 	/* Start position in strip. */
173 	start = offset % strip_size;
174 	/* Physical strip number. */
175 	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
176 	/* Number of physical strip (copy) inside virtual strip. */
177 	*copy = nstrip % N;
178 	/* Offset in virtual space. */
179 	*virt = (nstrip / N) * strip_size + start;
180 }
181 
182 static int
183 g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
184 {
185 	struct g_raid_tr_raid1e_object *trs;
186 
187 	trs = (struct g_raid_tr_raid1e_object *)tr;
188 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
189 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
190 		return (G_RAID_TR_TASTE_FAIL);
191 	trs->trso_starting = 1;
192 	return (G_RAID_TR_TASTE_SUCCEED);
193 }
194 
/*
 * Compute the volume state for a disk count that is a multiple of N.
 *
 * The subdisks are examined in groups of N consecutive entries.  In every
 * group the best subdisk is promoted to ACTIVE (with a metadata write) when
 * it is at least UNINITIALIZED, and the group is then rated from its worst
 * and best members.  The lowest group rating is returned as a
 * G_RAID_VOLUME_S_* value.
 *
 * The comparisons rely on the numeric ordering of the G_RAID_SUBDISK_S_*
 * and G_RAID_VOLUME_S_* constants (a larger value compares as better).
 */
static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		/*
		 * Pick the best member of the group; between two members that
		 * are both rebuilding or both resyncing, the one that has
		 * progressed further wins.
		 */
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		/*
		 * The worst member is searched only now, so it already sees
		 * the promotion made above.
		 */
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		/* Rate the group. */
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		/* The volume is only as good as its worst group. */
		if (sstate < state)
			state = sstate;
	}
	return (state);
}
247 
/*
 * Compute the volume state for a disk count that is not a multiple of N.
 *
 * Returns OPTIMAL right away when every subdisk is ACTIVE.  Otherwise every
 * UNINITIALIZED subdisk is first promoted to STALE (with a metadata write),
 * then each window of N cyclically adjacent subdisks is rated from its best
 * and worst members, and the lowest rating is returned as a
 * G_RAID_VOLUME_S_* value.
 *
 * The comparisons rely on the numeric ordering of the G_RAID_SUBDISK_S_*
 * and G_RAID_VOLUME_S_* constants (a larger value compares as better).
 */
static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		/* Window: subdisk i and the N - 1 following it, wrapping. */
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		/* Rate the window. */
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		/* The volume is only as good as its worst window. */
		if (sstate < state)
			state = sstate;
	}
	return (state);
}
301 
302 static int
303 g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
304     struct g_raid_subdisk *sd)
305 {
306 	struct g_raid_tr_raid1e_object *trs;
307 	struct g_raid_softc *sc;
308 	u_int s;
309 
310 	sc = vol->v_softc;
311 	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
312 	if (trs->trso_stopping &&
313 	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
314 		s = G_RAID_VOLUME_S_STOPPED;
315 	else if (trs->trso_starting)
316 		s = G_RAID_VOLUME_S_STARTING;
317 	else {
318 		if ((vol->v_disks_count % N) == 0)
319 			s = g_raid_tr_update_state_raid1e_even(vol);
320 		else
321 			s = g_raid_tr_update_state_raid1e_odd(vol);
322 	}
323 	if (s != vol->v_state) {
324 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
325 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
326 		    G_RAID_EVENT_VOLUME);
327 		g_raid_change_volume_state(vol, s);
328 		if (!trs->trso_starting && !trs->trso_stopping)
329 			g_raid_write_metadata(sc, vol, NULL, NULL);
330 	}
331 	if (!trs->trso_starting && !trs->trso_stopping)
332 		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
333 	return (0);
334 }
335 
336 static void
337 g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
338     struct g_raid_disk *disk)
339 {
340 	struct g_raid_volume *vol;
341 
342 	vol = sd->sd_volume;
343 	/*
344 	 * We don't fail the last disk in the pack, since it still has decent
345 	 * data on it and that's better than failing the disk if it is the root
346 	 * file system.
347 	 *
348 	 * XXX should this be controlled via a tunable?  It makes sense for
349 	 * the volume that has / on it.  I can't think of a case where we'd
350 	 * want the volume to go away on this kind of event.
351 	 */
352 	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
353 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
354 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
355 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
356 	     vol->v_disks_count) &&
357 	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
358 		return;
359 	g_raid_fail_disk(sc, sd, disk);
360 }
361 
362 static void
363 g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
364 {
365 	struct g_raid_volume *vol;
366 	struct g_raid_subdisk *sd;
367 
368 	vol = trs->trso_base.tro_volume;
369 	sd = trs->trso_failed_sd;
370 	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
371 	free(trs->trso_buffer, M_TR_RAID1E);
372 	trs->trso_buffer = NULL;
373 	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
374 	trs->trso_type = TR_RAID1E_NONE;
375 	trs->trso_recover_slabs = 0;
376 	trs->trso_failed_sd = NULL;
377 	g_raid_tr_update_state_raid1e(vol, NULL);
378 }
379 
380 static void
381 g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
382 {
383 	struct g_raid_tr_raid1e_object *trs;
384 	struct g_raid_subdisk *sd;
385 
386 	trs = (struct g_raid_tr_raid1e_object *)tr;
387 	sd = trs->trso_failed_sd;
388 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
389 	    "Subdisk %s:%d-%s rebuild completed.",
390 	    sd->sd_volume->v_name, sd->sd_pos,
391 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
392 	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
393 	sd->sd_rebuild_pos = 0;
394 	g_raid_tr_raid1e_rebuild_done(trs);
395 }
396 
397 static void
398 g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
399 {
400 	struct g_raid_tr_raid1e_object *trs;
401 	struct g_raid_subdisk *sd;
402 	struct g_raid_volume *vol;
403 
404 	vol = tr->tro_volume;
405 	trs = (struct g_raid_tr_raid1e_object *)tr;
406 	sd = trs->trso_failed_sd;
407 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
408 		G_RAID_DEBUG1(1, vol->v_softc,
409 		    "Subdisk %s:%d-%s rebuild is aborting.",
410 		    sd->sd_volume->v_name, sd->sd_pos,
411 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
412 		trs->trso_flags |= TR_RAID1E_F_ABORT;
413 	} else {
414 		G_RAID_DEBUG1(0, vol->v_softc,
415 		    "Subdisk %s:%d-%s rebuild aborted.",
416 		    sd->sd_volume->v_name, sd->sd_pos,
417 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
418 		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
419 		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
420 			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
421 			g_raid_unlock_range(tr->tro_volume,
422 			    trs->trso_lock_pos, trs->trso_lock_len);
423 		}
424 		g_raid_tr_raid1e_rebuild_done(trs);
425 	}
426 }
427 
428 static void
429 g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
430 {
431 	struct g_raid_tr_raid1e_object *trs;
432 	struct g_raid_softc *sc;
433 	struct g_raid_volume *vol;
434 	struct g_raid_subdisk *sd;
435 	struct bio *bp;
436 	off_t len, virtual, vend, offset, start;
437 	int disk, copy, best;
438 
439 	trs = (struct g_raid_tr_raid1e_object *)tr;
440 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
441 		return;
442 	vol = tr->tro_volume;
443 	sc = vol->v_softc;
444 	sd = trs->trso_failed_sd;
445 
446 	while (1) {
447 		if (sd->sd_rebuild_pos >= sd->sd_size) {
448 			g_raid_tr_raid1e_rebuild_finish(tr);
449 			return;
450 		}
451 		/* Get virtual offset from physical rebuild position. */
452 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
453 		/* Get physical offset back to get first stripe position. */
454 		V2P(vol, virtual, &disk, &offset, &start);
455 		/* Calculate contignous data length. */
456 		len = MIN(g_raid1e_rebuild_slab,
457 		    sd->sd_size - sd->sd_rebuild_pos);
458 		if ((vol->v_disks_count % N) != 0)
459 			len = MIN(len, vol->v_strip_size - start);
460 		/* Find disk with most accurate data. */
461 		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
462 		    offset + start, len, 0);
463 		if (best < 0) {
464 			/* There is no any valid disk. */
465 			g_raid_tr_raid1e_rebuild_abort(tr);
466 			return;
467 		} else if (best != copy) {
468 			/* Some other disk has better data. */
469 			break;
470 		}
471 		/* We have the most accurate data. Skip the range. */
472 		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
473 		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
474 		sd->sd_rebuild_pos += len;
475 	}
476 
477 	bp = &trs->trso_bio;
478 	memset(bp, 0, sizeof(*bp));
479 	bp->bio_offset = offset + start +
480 	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
481 	bp->bio_length = len;
482 	bp->bio_data = trs->trso_buffer;
483 	bp->bio_cmd = BIO_READ;
484 	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
485 	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
486 	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
487 	/*
488 	 * If we are crossing stripe boundary, correct affected virtual
489 	 * range we should lock.
490 	 */
491 	if (start + len > vol->v_strip_size) {
492 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
493 		len = vend - virtual;
494 	}
495 	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
496 	trs->trso_flags |= TR_RAID1E_F_LOCKED;
497 	trs->trso_lock_pos = virtual;
498 	trs->trso_lock_len = len;
499 	/* Lock callback starts I/O */
500 	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
501 }
502 
503 static void
504 g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
505 {
506 	struct g_raid_volume *vol;
507 	struct g_raid_tr_raid1e_object *trs;
508 	struct g_raid_subdisk *sd;
509 
510 	vol = tr->tro_volume;
511 	trs = (struct g_raid_tr_raid1e_object *)tr;
512 	if (trs->trso_failed_sd) {
513 		G_RAID_DEBUG1(1, vol->v_softc,
514 		    "Already rebuild in start rebuild. pos %jd\n",
515 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
516 		return;
517 	}
518 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
519 	if (sd == NULL)
520 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
521 	if (sd == NULL) {
522 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
523 		if (sd != NULL) {
524 			sd->sd_rebuild_pos = 0;
525 			g_raid_change_subdisk_state(sd,
526 			    G_RAID_SUBDISK_S_RESYNC);
527 			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
528 		} else {
529 			sd = g_raid_get_subdisk(vol,
530 			    G_RAID_SUBDISK_S_UNINITIALIZED);
531 			if (sd == NULL)
532 				sd = g_raid_get_subdisk(vol,
533 				    G_RAID_SUBDISK_S_NEW);
534 			if (sd != NULL) {
535 				sd->sd_rebuild_pos = 0;
536 				g_raid_change_subdisk_state(sd,
537 				    G_RAID_SUBDISK_S_REBUILD);
538 				g_raid_write_metadata(vol->v_softc,
539 				    vol, sd, NULL);
540 			}
541 		}
542 	}
543 	if (sd == NULL) {
544 		G_RAID_DEBUG1(1, vol->v_softc,
545 		    "No failed disk to rebuild.  night night.");
546 		return;
547 	}
548 	trs->trso_failed_sd = sd;
549 	G_RAID_DEBUG1(0, vol->v_softc,
550 	    "Subdisk %s:%d-%s rebuild start at %jd.",
551 	    sd->sd_volume->v_name, sd->sd_pos,
552 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
553 	    trs->trso_failed_sd->sd_rebuild_pos);
554 	trs->trso_type = TR_RAID1E_REBUILD;
555 	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
556 	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
557 	g_raid_tr_raid1e_rebuild_some(tr);
558 }
559 
560 static void
561 g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
562     struct g_raid_subdisk *sd)
563 {
564 	struct g_raid_volume *vol;
565 	struct g_raid_tr_raid1e_object *trs;
566 	int nr;
567 
568 	vol = tr->tro_volume;
569 	trs = (struct g_raid_tr_raid1e_object *)tr;
570 	if (trs->trso_stopping)
571 		return;
572 	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
573 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
574 	switch(trs->trso_type) {
575 	case TR_RAID1E_NONE:
576 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
577 			return;
578 		if (nr == 0) {
579 			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
580 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
581 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
582 			if (nr == 0)
583 				return;
584 		}
585 		g_raid_tr_raid1e_rebuild_start(tr);
586 		break;
587 	case TR_RAID1E_REBUILD:
588 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
589 		    trs->trso_failed_sd == sd)
590 			g_raid_tr_raid1e_rebuild_abort(tr);
591 		break;
592 	case TR_RAID1E_RESYNC:
593 		break;
594 	}
595 }
596 
597 static int
598 g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
599     struct g_raid_subdisk *sd, u_int event)
600 {
601 
602 	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
603 	return (0);
604 }
605 
606 static int
607 g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
608 {
609 	struct g_raid_tr_raid1e_object *trs;
610 	struct g_raid_volume *vol;
611 
612 	trs = (struct g_raid_tr_raid1e_object *)tr;
613 	vol = tr->tro_volume;
614 	trs->trso_starting = 0;
615 	g_raid_tr_update_state_raid1e(vol, NULL);
616 	return (0);
617 }
618 
619 static int
620 g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
621 {
622 	struct g_raid_tr_raid1e_object *trs;
623 	struct g_raid_volume *vol;
624 
625 	trs = (struct g_raid_tr_raid1e_object *)tr;
626 	vol = tr->tro_volume;
627 	trs->trso_starting = 0;
628 	trs->trso_stopping = 1;
629 	g_raid_tr_update_state_raid1e(vol, NULL);
630 	return (0);
631 }
632 
/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 *
 *  vol  - volume to read from.
 *  no   - index of the subdisk holding the first copy of the strip.
 *  off  - byte offset on that subdisk.
 *  len  - length of the request; currently not used by the body.
 *  mask - bit (1 << sd_pos) set for every subdisk that must not be chosen.
 *
 * Returns the number of the chosen copy (0 .. N-1, to be added to no modulo
 * the disk count) or -1 when no copy is readable.  The copy with the lowest
 * priority value wins.
 */
/* Absolute value; note that x is evaluated more than once. */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		/* A copy that wraps past the last disk is one strip lower. */
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		/* Base priority is the current load of the subdisk. */
		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		/*
		 * NOTE(review): the two rebuild-position checks below add
		 * off to offset, which already contains off, while len is
		 * never used.  This looks like it was meant to test the end
		 * of the request against sd_rebuild_pos -- confirm against
		 * all callers before changing.
		 */
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			/* Usable, but heavily penalized. */
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			/* Not readable at this position. */
			continue;
		}
		/* Penalize subdisks with error recovery in progress. */
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}
690 
/*
 * Start a read request.
 *
 * The request is cut at strip boundaries.  For every piece the best copy
 * is selected and a clone of bp is queued for the subdisk holding it; the
 * clones are only sent once all of them have been created.  If a clone
 * cannot be allocated, the clones queued so far are destroyed and bp is
 * completed with its existing error or, if it has none, ENOMEM.
 */
static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		/* Never cross a strip boundary within one piece. */
		length = MIN(strip_size - start, remain);
		/*
		 * NOTE(review): the strip start is passed here, without
		 * the in-strip offset, whereas the rebuild path passes
		 * offset + start -- confirm which one the callee expects.
		 */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		/* Move to the chosen copy, wrapping to the next row. */
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_data = addr;
		cbp->bio_length = length;
		/* Remember the target subdisk until the clone is sent. */
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		/* Advance to the first copy of the next virtual strip. */
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	/* Everything was cloned successfully; send the clones. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Throw away the clones created so far and fail the request. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}
755 
/*
 * Start a write or delete request.
 *
 * The request is cut at strip boundaries.  Every piece is cloned once for
 * each of its N copies whose subdisk accepts writes: ACTIVE, STALE and
 * RESYNC subdisks always do, a REBUILD subdisk only below its rebuild
 * position, all other states never.  The clones are only sent once all of
 * them have been created.  If a clone cannot be allocated, the clones
 * queued so far are destroyed and bp is completed with its existing error
 * or, if it has none, ENOMEM.
 */
static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		/* Never cross a strip boundary within one piece. */
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				/* Skip the part not yet reached by rebuild. */
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_data = addr;
			cbp->bio_length = length;
			/* Remember the target subdisk until the clone is sent. */
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			/* Next copy; wrap to the next row after the last disk. */
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		/* The data pointer is not advanced for BIO_DELETE. */
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	/* Everything was cloned successfully; send the clones. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	/* Throw away the clones created so far and fail the request. */
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}
827 
828 static void
829 g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
830 {
831 	struct g_raid_volume *vol;
832 	struct g_raid_tr_raid1e_object *trs;
833 
834 	vol = tr->tro_volume;
835 	trs = (struct g_raid_tr_raid1e_object *)tr;
836 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
837 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
838 	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
839 		g_raid_iodone(bp, EIO);
840 		return;
841 	}
842 	/*
843 	 * If we're rebuilding, squeeze in rebuild activity every so often,
844 	 * even when the disk is busy.  Be sure to only count real I/O
845 	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
846 	 * by this module.
847 	 */
848 	if (trs->trso_failed_sd != NULL &&
849 	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
850 		/* Make this new or running now round short. */
851 		trs->trso_recover_slabs = 0;
852 		if (--trs->trso_fair_io <= 0) {
853 			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
854 			g_raid_tr_raid1e_rebuild_some(tr);
855 		}
856 	}
857 	switch (bp->bio_cmd) {
858 	case BIO_READ:
859 		g_raid_tr_iostart_raid1e_read(tr, bp);
860 		break;
861 	case BIO_WRITE:
862 	case BIO_DELETE:
863 		g_raid_tr_iostart_raid1e_write(tr, bp);
864 		break;
865 	case BIO_FLUSH:
866 		g_raid_tr_flush_common(tr, bp);
867 		break;
868 	default:
869 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
870 		    bp->bio_cmd, vol->v_name));
871 		break;
872 	}
873 }
874 
875 static void
876 g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
877     struct g_raid_subdisk *sd, struct bio *bp)
878 {
879 	struct bio *cbp;
880 	struct g_raid_subdisk *nsd;
881 	struct g_raid_volume *vol;
882 	struct bio *pbp;
883 	struct g_raid_tr_raid1e_object *trs;
884 	off_t virtual, offset, start;
885 	uintptr_t mask;
886 	int error, do_write, copy, disk, best;
887 
888 	trs = (struct g_raid_tr_raid1e_object *)tr;
889 	vol = tr->tro_volume;
890 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
891 		if (trs->trso_type == TR_RAID1E_REBUILD) {
892 			nsd = trs->trso_failed_sd;
893 			if (bp->bio_cmd == BIO_READ) {
894 
895 				/* Immediately abort rebuild, if requested. */
896 				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
897 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
898 					g_raid_tr_raid1e_rebuild_abort(tr);
899 					return;
900 				}
901 
902 				/* On read error, skip and cross fingers. */
903 				if (bp->bio_error != 0) {
904 					G_RAID_LOGREQ(0, bp,
905 					    "Read error during rebuild (%d), "
906 					    "possible data loss!",
907 					    bp->bio_error);
908 					goto rebuild_round_done;
909 				}
910 
911 				/*
912 				 * The read operation finished, queue the
913 				 * write and get out.
914 				 */
915 				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
916 				    bp->bio_error);
917 				bp->bio_cmd = BIO_WRITE;
918 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
919 				bp->bio_offset = nsd->sd_rebuild_pos;
920 				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
921 				g_raid_subdisk_iostart(nsd, bp);
922 			} else {
923 				/*
924 				 * The write operation just finished.  Do
925 				 * another.  We keep cloning the master bio
926 				 * since it has the right buffers allocated to
927 				 * it.
928 				 */
929 				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
930 				    bp->bio_error);
931 				if (bp->bio_error != 0 ||
932 				    trs->trso_flags & TR_RAID1E_F_ABORT) {
933 					if ((trs->trso_flags &
934 					    TR_RAID1E_F_ABORT) == 0) {
935 						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
936 						    nsd, nsd->sd_disk);
937 					}
938 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
939 					g_raid_tr_raid1e_rebuild_abort(tr);
940 					return;
941 				}
942 rebuild_round_done:
943 				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
944 				g_raid_unlock_range(tr->tro_volume,
945 				    trs->trso_lock_pos, trs->trso_lock_len);
946 				nsd->sd_rebuild_pos += bp->bio_length;
947 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
948 					g_raid_tr_raid1e_rebuild_finish(tr);
949 					return;
950 				}
951 
952 				/* Abort rebuild if we are stopping */
953 				if (trs->trso_stopping) {
954 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
955 					g_raid_tr_raid1e_rebuild_abort(tr);
956 					return;
957 				}
958 
959 				if (--trs->trso_meta_update <= 0) {
960 					g_raid_write_metadata(vol->v_softc,
961 					    vol, nsd, nsd->sd_disk);
962 					trs->trso_meta_update =
963 					    g_raid1e_rebuild_meta_update;
964 					/* Compensate short rebuild I/Os. */
965 					if ((vol->v_disks_count % N) != 0 &&
966 					    vol->v_strip_size <
967 					     g_raid1e_rebuild_slab) {
968 						trs->trso_meta_update *=
969 						    g_raid1e_rebuild_slab;
970 						trs->trso_meta_update /=
971 						    vol->v_strip_size;
972 					}
973 				}
974 				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
975 				if (--trs->trso_recover_slabs <= 0)
976 					return;
977 				/* Run next rebuild iteration. */
978 				g_raid_tr_raid1e_rebuild_some(tr);
979 			}
980 		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
981 			/*
982 			 * read good sd, read bad sd in parallel.  when both
983 			 * done, compare the buffers.  write good to the bad
984 			 * if different.  do the next bit of work.
985 			 */
986 			panic("Somehow, we think we're doing a resync");
987 		}
988 		return;
989 	}
990 	pbp = bp->bio_parent;
991 	pbp->bio_inbed++;
992 	mask = (intptr_t)bp->bio_caller2;
993 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
994 		/*
995 		 * Read failed on first drive.  Retry the read error on
996 		 * another disk drive, if available, before erroring out the
997 		 * read.
998 		 */
999 		sd->sd_disk->d_read_errs++;
1000 		G_RAID_LOGREQ(0, bp,
1001 		    "Read error (%d), %d read errors total",
1002 		    bp->bio_error, sd->sd_disk->d_read_errs);
1003 
1004 		/*
1005 		 * If there are too many read errors, we move to degraded.
1006 		 * XXX Do we want to FAIL the drive (eg, make the user redo
1007 		 * everything to get it back in sync), or just degrade the
1008 		 * drive, which kicks off a resync?
1009 		 */
1010 		do_write = 0;
1011 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
1012 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1013 		else if (mask == 0)
1014 			do_write = 1;
1015 
1016 		/* Restore what we were doing. */
1017 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1018 		V2P(vol, virtual, &disk, &offset, &start);
1019 
1020 		/* Find the other disk, and try to do the I/O to it. */
1021 		mask |= 1 << copy;
1022 		best = g_raid_tr_raid1e_select_read_disk(vol,
1023 		    disk, offset, start, mask);
1024 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1025 			disk += best;
1026 			if (disk >= vol->v_disks_count) {
1027 				disk -= vol->v_disks_count;
1028 				offset += vol->v_strip_size;
1029 			}
1030 			cbp->bio_offset = offset + start;
1031 			cbp->bio_length = bp->bio_length;
1032 			cbp->bio_data = bp->bio_data;
1033 			g_destroy_bio(bp);
1034 			nsd = &vol->v_subdisks[disk];
1035 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
1036 			    nsd->sd_pos);
1037 			if (do_write)
1038 				mask |= 1 << 31;
1039 			if ((mask & (1 << 31)) != 0)
1040 				sd->sd_recovery++;
1041 			cbp->bio_caller2 = (void *)mask;
1042 			if (do_write) {
1043 				cbp->bio_caller1 = nsd;
1044 				/* Lock callback starts I/O */
1045 				g_raid_lock_range(sd->sd_volume,
1046 				    virtual, cbp->bio_length, pbp, cbp);
1047 			} else {
1048 				g_raid_subdisk_iostart(nsd, cbp);
1049 			}
1050 			return;
1051 		}
1052 		/*
1053 		 * We can't retry.  Return the original error by falling
1054 		 * through.  This will happen when there's only one good disk.
1055 		 * We don't need to fail the raid, since its actual state is
1056 		 * based on the state of the subdisks.
1057 		 */
1058 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
1059 	}
1060 	if (bp->bio_cmd == BIO_READ &&
1061 	    bp->bio_error == 0 &&
1062 	    (mask & (1 << 31)) != 0) {
1063 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
1064 
1065 		/* Restore what we were doing. */
1066 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1067 		V2P(vol, virtual, &disk, &offset, &start);
1068 
1069 		/* Find best disk to write. */
1070 		best = g_raid_tr_raid1e_select_read_disk(vol,
1071 		    disk, offset, start, ~mask);
1072 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
1073 			disk += best;
1074 			if (disk >= vol->v_disks_count) {
1075 				disk -= vol->v_disks_count;
1076 				offset += vol->v_strip_size;
1077 			}
1078 			cbp->bio_offset = offset + start;
1079 			cbp->bio_cmd = BIO_WRITE;
1080 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
1081 			cbp->bio_caller2 = (void *)mask;
1082 			g_destroy_bio(bp);
1083 			G_RAID_LOGREQ(2, cbp,
1084 			    "Attempting bad sector remap on failing drive.");
1085 			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
1086 			return;
1087 		}
1088 	}
1089 	if ((mask & (1 << 31)) != 0) {
1090 		/*
1091 		 * We're done with a recovery, mark the range as unlocked.
1092 		 * For any write errors, we aggressively fail the disk since
1093 		 * there was both a READ and a WRITE error at this location.
1094 		 * Both types of errors generally indicate the drive is on
1095 		 * the verge of total failure anyway.  Better to stop trusting
1096 		 * it now.  However, we need to reset error to 0 in that case
1097 		 * because we're not failing the original I/O which succeeded.
1098 		 */
1099 
1100 		/* Restore what we were doing. */
1101 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
1102 		V2P(vol, virtual, &disk, &offset, &start);
1103 
1104 		for (copy = 0; copy < N; copy++) {
1105 			if ((mask & (1 << copy) ) != 0)
1106 				vol->v_subdisks[(disk + copy) %
1107 				    vol->v_disks_count].sd_recovery--;
1108 		}
1109 
1110 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
1111 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
1112 			    "failing subdisk.");
1113 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1114 			bp->bio_error = 0;
1115 		}
1116 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
1117 		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
1118 	}
1119 	if (pbp->bio_cmd != BIO_READ) {
1120 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1121 			pbp->bio_error = bp->bio_error;
1122 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1123 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1124 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1125 		}
1126 		error = pbp->bio_error;
1127 	} else
1128 		error = bp->bio_error;
1129 	g_destroy_bio(bp);
1130 	if (pbp->bio_children == pbp->bio_inbed) {
1131 		pbp->bio_completed = pbp->bio_length;
1132 		g_raid_iodone(pbp, error);
1133 	}
1134 }
1135 
1136 static int
1137 g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
1138     void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
1139 {
1140 	struct g_raid_volume *vol;
1141 	struct g_raid_subdisk *sd;
1142 	struct bio_queue_head queue;
1143 	char *addr;
1144 	off_t offset, start, length, remain;
1145 	u_int no, strip_size;
1146 	int i, error;
1147 
1148 	vol = tr->tro_volume;
1149 	addr = virtual;
1150 	strip_size = vol->v_strip_size;
1151 	V2P(vol, boffset, &no, &offset, &start);
1152 	remain = blength;
1153 	bioq_init(&queue);
1154 	while (remain > 0) {
1155 		length = MIN(strip_size - start, remain);
1156 		for (i = 0; i < N; i++) {
1157 			sd = &vol->v_subdisks[no];
1158 			switch (sd->sd_state) {
1159 			case G_RAID_SUBDISK_S_ACTIVE:
1160 			case G_RAID_SUBDISK_S_STALE:
1161 			case G_RAID_SUBDISK_S_RESYNC:
1162 				break;
1163 			case G_RAID_SUBDISK_S_REBUILD:
1164 				if (offset + start >= sd->sd_rebuild_pos)
1165 					goto nextdisk;
1166 				break;
1167 			default:
1168 				goto nextdisk;
1169 			}
1170 			error = g_raid_subdisk_kerneldump(sd,
1171 			    addr, 0, offset + start, length);
1172 			if (error != 0)
1173 				return (error);
1174 nextdisk:
1175 			if (++no >= vol->v_disks_count) {
1176 				no = 0;
1177 				offset += strip_size;
1178 			}
1179 		}
1180 		remain -= length;
1181 		addr += length;
1182 		start = 0;
1183 	}
1184 	return (0);
1185 }
1186 
1187 static int
1188 g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
1189 {
1190 	struct bio *bp;
1191 	struct g_raid_subdisk *sd;
1192 
1193 	bp = (struct bio *)argp;
1194 	sd = (struct g_raid_subdisk *)bp->bio_caller1;
1195 	g_raid_subdisk_iostart(sd, bp);
1196 
1197 	return (0);
1198 }
1199 
1200 static int
1201 g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
1202 {
1203 	struct g_raid_tr_raid1e_object *trs;
1204 	struct g_raid_volume *vol;
1205 
1206 	vol = tr->tro_volume;
1207 	trs = (struct g_raid_tr_raid1e_object *)tr;
1208 	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
1209 	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
1210 	/* Compensate short rebuild I/Os. */
1211 	if ((vol->v_disks_count % N) != 0 &&
1212 	    vol->v_strip_size < g_raid1e_rebuild_slab) {
1213 		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
1214 		trs->trso_recover_slabs /= vol->v_strip_size;
1215 	}
1216 	if (trs->trso_type == TR_RAID1E_REBUILD)
1217 		g_raid_tr_raid1e_rebuild_some(tr);
1218 	return (0);
1219 }
1220 
1221 static int
1222 g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
1223 {
1224 	struct g_raid_tr_raid1e_object *trs;
1225 
1226 	trs = (struct g_raid_tr_raid1e_object *)tr;
1227 
1228 	if (trs->trso_buffer != NULL) {
1229 		free(trs->trso_buffer, M_TR_RAID1E);
1230 		trs->trso_buffer = NULL;
1231 	}
1232 	return (0);
1233 }
1234 
/*
 * Declare this transformation under the identifier "raid1e" with the
 * display name "RAID1E".
 * NOTE(review): G_RAID_TR_DECLARE is defined outside this file; presumably
 * it registers the class with the GEOM RAID framework -- confirm against
 * g_raid.h.
 */
1235 G_RAID_TR_DECLARE(raid1e, "RAID1E");
1236