xref: /freebsd/sys/geom/raid/tr_raid1e.c (revision 586f63035fbe5e45cfc971037fd76375661ece26)
/*-
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include "g_raid.h"
#include "g_raid_tr_if.h"

#define N	2
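
/*
 * RAID1E keeps N copies of every logical strip, laid out round-robin
 * across all member disks, so redundancy is available even with an odd
 * number of disks (with an even disk count the layout degenerates into
 * fixed mirror groups of N disks each).
 */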

SYSCTL_DECL(_kern_geom_raid);
static SYSCTL_NODE(_kern_geom_raid, OID_AUTO, raid1e, CTLFLAG_RW, 0,
    "RAID1E parameters");

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_slab_size",
    &g_raid1e_rebuild_slab);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RW,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_fair_io",
    &g_raid1e_rebuild_fair_io);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RW,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use for rebuild when the disk is busy.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_cluster_idle",
    &g_raid1e_rebuild_cluster_idle);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RW,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle.");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update metadata every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
TUNABLE_INT("kern.geom.raid.raid1e.rebuild_meta_update",
    &g_raid1e_rebuild_meta_update);
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RW,
    &g_raid1e_rebuild_meta_update, 0,
    "When to update the metadata.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4
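
/*
 * The trso_flags bits above track the rebuild state machine: DOING_SOME
 * means a rebuild read or write is currently in flight, LOCKED means the
 * rebuild holds a range lock on the volume, and ABORT asks the in-flight
 * operation to terminate the rebuild once it completes.
 */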

struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;
	int			 trso_stopping;
	int			 trso_type;
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;
	int			 trso_meta_update;
	int			 trso_flags;
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos; /* Locked range start. */
	off_t			 trso_lock_len; /* Locked range length. */
	struct bio		 trso_bio;
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_priority = 200
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Number of physical strip (copy) inside virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}
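
/*
 * A worked example of the mapping above (hypothetical numbers): with
 * v_disks_count = 3, v_strip_size = 64K and N = 2, virtual offset 200K
 * falls into virtual strip 3 at in-strip offset 8K, so V2P() yields
 * disk = (3 * 2) % 3 = 0 and strip offset ((3 * 2) / 3) * 64K = 128K.
 * P2V(disk 0, 128K + 8K) inverts this: physical strip 2 * 3 + 0 = 6,
 * copy = 6 % 2 = 0, virtual offset (6 / 2) * 64K + 8K = 200K again.
 */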

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_NONE)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

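/*
 * With an even number of disks the subdisks split into v_disks_count / N
 * independent mirror groups, handled above.  With an odd number every
 * disk shares strips with its neighbors, so the state scan below walks
 * each disk together with the next N - 1 disks modulo v_disks_count.
 */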
static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if (g_raid_nsubdisks(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == 1 &&
	    g_raid_get_subdisk(sd->sd_volume, G_RAID_SUBDISK_S_ACTIVE) == sd)
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    (uintmax_t)sd->sd_rebuild_pos,
		    (uintmax_t)(sd->sd_rebuild_pos + len));
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing a stripe boundary, correct the affected
	 * virtual range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}
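
/*
 * Each rebuild iteration thus reads one slab (capped to a single strip
 * when v_disks_count is odd) from the best surviving copy, locks the
 * affected virtual range, and lets the lock callback issue the read; the
 * matching write back to the failed subdisk is chained in iodone below.
 */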

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuilding in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild.  night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch (trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + len <= sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + len <= sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}
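
/*
 * The priority computed above is a composite: bits 24 and up penalize
 * later copies that are stale or not yet synced over the requested range,
 * bits 16-23 add the subdisk's active recovery count, and the low bits
 * carry the load average, minus bonuses when the disk head is already at
 * or near the requested offset.  The least loaded qualifying copy wins.
 */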

static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int strip_size;
	int no, best;

	vol = tr->tro_volume;
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_data = addr;
		cbp->bio_length = length;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int strip_size;
	int no, i;

	vol = tr->tro_volume;
	addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_data = addr;
			cbp->bio_length = length;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	for (cbp = bioq_first(&queue); cbp != NULL;
	    cbp = bioq_first(&queue)) {
		bioq_remove(&queue, cbp);
		g_destroy_bio(cbp);
	}
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}
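
/*
 * Unlike the read path, the write path clones every request to all N
 * copies of each strip; a copy on a rebuilding subdisk is skipped only
 * beyond its sd_rebuild_pos, since the area behind the rebuild cursor
 * must be kept coherent.
 */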

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Cut this new or already running round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_DELETE:
		g_raid_iodone(bp, EIO);
		break;
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *cbp;
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {
				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * read good sd, read bad sd in parallel.  when both
			 * done, compare the buffers.  write good to the bad
			 * if different.  do the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (uintptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1 << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1 << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1 << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	error = bp->bio_error;
	g_destroy_bio(bp);
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}
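
/*
 * Note on the mask handling above: bio_caller2 carries a bitmask of the
 * copies already tried through the clone chain, and bit 31 marks a
 * recovery (read retry plus remap write) in progress, so the range lock
 * and the sd_recovery counters are released exactly once.
 */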

static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr,
    void *virtual, vm_offset_t physical, off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int strip_size;
	int no, i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd,
			    addr, 0, offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}
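
/*
 * Example of the compensation above (hypothetical numbers): with an odd
 * disk count, a 64K strip and the default 1M slab, each rebuild I/O moves
 * only 64K at a time, so the slab budget is scaled by 1M / 64K = 16 to
 * keep the amount of data moved per idle burst roughly constant.
 */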

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(g_raid_tr_raid1e);