xref: /freebsd/sys/geom/raid/tr_raid1e.c (revision fdafd315ad0d0f28a11b9fb4476a9ab059c62b92)
/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
2889b17223SAlexander Motin 
2989b17223SAlexander Motin #include <sys/param.h>
3089b17223SAlexander Motin #include <sys/bio.h>
3189b17223SAlexander Motin #include <sys/endian.h>
3289b17223SAlexander Motin #include <sys/kernel.h>
3389b17223SAlexander Motin #include <sys/kobj.h>
3489b17223SAlexander Motin #include <sys/limits.h>
3589b17223SAlexander Motin #include <sys/lock.h>
3689b17223SAlexander Motin #include <sys/malloc.h>
3789b17223SAlexander Motin #include <sys/mutex.h>
3889b17223SAlexander Motin #include <sys/sysctl.h>
3989b17223SAlexander Motin #include <sys/systm.h>
4089b17223SAlexander Motin #include <geom/geom.h>
41ac03832eSConrad Meyer #include <geom/geom_dbg.h>
4289b17223SAlexander Motin #include "geom/raid/g_raid.h"
4389b17223SAlexander Motin #include "g_raid_tr_if.h"
4489b17223SAlexander Motin 
4589b17223SAlexander Motin #define N	2
4689b17223SAlexander Motin 
47c89d2fbeSAlexander Motin SYSCTL_DECL(_kern_geom_raid_raid1e);
4889b17223SAlexander Motin 
4989b17223SAlexander Motin #define RAID1E_REBUILD_SLAB	(1 << 20) /* One transation in a rebuild */
5089b17223SAlexander Motin static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
5289b17223SAlexander Motin     &g_raid1e_rebuild_slab, 0,
5389b17223SAlexander Motin     "Amount of the disk to rebuild each read/write cycle of the rebuild.");
5489b17223SAlexander Motin 
5589b17223SAlexander Motin #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
5689b17223SAlexander Motin static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
57af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
5889b17223SAlexander Motin     &g_raid1e_rebuild_fair_io, 0,
5989b17223SAlexander Motin     "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
6089b17223SAlexander Motin 
6189b17223SAlexander Motin #define RAID1E_REBUILD_CLUSTER_IDLE 100
6289b17223SAlexander Motin static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
63af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
6489b17223SAlexander Motin     &g_raid1e_rebuild_cluster_idle, 0,
6589b17223SAlexander Motin     "Number of slabs to do each time we trigger a rebuild cycle");
6689b17223SAlexander Motin 
6789b17223SAlexander Motin #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
6889b17223SAlexander Motin static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
69af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
7089b17223SAlexander Motin     &g_raid1e_rebuild_meta_update, 0,
7189b17223SAlexander Motin     "When to update the meta data.");
7289b17223SAlexander Motin 
7389b17223SAlexander Motin static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
7489b17223SAlexander Motin 
7589b17223SAlexander Motin #define TR_RAID1E_NONE 0
7689b17223SAlexander Motin #define TR_RAID1E_REBUILD 1
7789b17223SAlexander Motin #define TR_RAID1E_RESYNC 2
7889b17223SAlexander Motin 
7989b17223SAlexander Motin #define TR_RAID1E_F_DOING_SOME	0x1
8089b17223SAlexander Motin #define TR_RAID1E_F_LOCKED	0x2
8189b17223SAlexander Motin #define TR_RAID1E_F_ABORT	0x4
8289b17223SAlexander Motin 
8389b17223SAlexander Motin struct g_raid_tr_raid1e_object {
8489b17223SAlexander Motin 	struct g_raid_tr_object	 trso_base;
8589b17223SAlexander Motin 	int			 trso_starting;
8689b17223SAlexander Motin 	int			 trso_stopping;
8789b17223SAlexander Motin 	int			 trso_type;
8889b17223SAlexander Motin 	int			 trso_recover_slabs; /* slabs before rest */
8989b17223SAlexander Motin 	int			 trso_fair_io;
9089b17223SAlexander Motin 	int			 trso_meta_update;
9189b17223SAlexander Motin 	int			 trso_flags;
9289b17223SAlexander Motin 	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
9389b17223SAlexander Motin 	void			*trso_buffer;	 /* Buffer space */
9489b17223SAlexander Motin 	off_t			 trso_lock_pos; /* Locked range start. */
9589b17223SAlexander Motin 	off_t			 trso_lock_len; /* Locked range length. */
9689b17223SAlexander Motin 	struct bio		 trso_bio;
9789b17223SAlexander Motin };
9889b17223SAlexander Motin 
9989b17223SAlexander Motin static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
10089b17223SAlexander Motin static g_raid_tr_event_t g_raid_tr_event_raid1e;
10189b17223SAlexander Motin static g_raid_tr_start_t g_raid_tr_start_raid1e;
10289b17223SAlexander Motin static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
10389b17223SAlexander Motin static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
10489b17223SAlexander Motin static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
10589b17223SAlexander Motin static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
10689b17223SAlexander Motin static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
10789b17223SAlexander Motin static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
10889b17223SAlexander Motin static g_raid_tr_free_t g_raid_tr_free_raid1e;
10989b17223SAlexander Motin 
11089b17223SAlexander Motin static kobj_method_t g_raid_tr_raid1e_methods[] = {
11189b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
11289b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
11389b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
11489b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
11589b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
11689b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
11789b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
11889b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
11989b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
12089b17223SAlexander Motin 	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
12189b17223SAlexander Motin 	{ 0, 0 }
12289b17223SAlexander Motin };
12389b17223SAlexander Motin 
12489b17223SAlexander Motin static struct g_raid_tr_class g_raid_tr_raid1e_class = {
12589b17223SAlexander Motin 	"RAID1E",
12689b17223SAlexander Motin 	g_raid_tr_raid1e_methods,
12789b17223SAlexander Motin 	sizeof(struct g_raid_tr_raid1e_object),
128c89d2fbeSAlexander Motin 	.trc_enable = 1,
129b43560abSAlexander Motin 	.trc_priority = 200,
130b43560abSAlexander Motin 	.trc_accept_unmapped = 1
13189b17223SAlexander Motin };
13289b17223SAlexander Motin 
13389b17223SAlexander Motin static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
13489b17223SAlexander Motin static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
13589b17223SAlexander Motin     struct g_raid_subdisk *sd);
13689b17223SAlexander Motin static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
13789b17223SAlexander Motin     int no, off_t off, off_t len, u_int mask);
13889b17223SAlexander Motin 
13989b17223SAlexander Motin static inline void
V2P(struct g_raid_volume * vol,off_t virt,int * disk,off_t * offset,off_t * start)14089b17223SAlexander Motin V2P(struct g_raid_volume *vol, off_t virt,
14189b17223SAlexander Motin     int *disk, off_t *offset, off_t *start)
14289b17223SAlexander Motin {
14389b17223SAlexander Motin 	off_t nstrip;
14489b17223SAlexander Motin 	u_int strip_size;
14589b17223SAlexander Motin 
14689b17223SAlexander Motin 	strip_size = vol->v_strip_size;
14789b17223SAlexander Motin 	/* Strip number. */
14889b17223SAlexander Motin 	nstrip = virt / strip_size;
14989b17223SAlexander Motin 	/* Start position in strip. */
15089b17223SAlexander Motin 	*start = virt % strip_size;
15189b17223SAlexander Motin 	/* Disk number. */
15289b17223SAlexander Motin 	*disk = (nstrip * N) % vol->v_disks_count;
15389b17223SAlexander Motin 	/* Strip start position in disk. */
15489b17223SAlexander Motin 	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
15589b17223SAlexander Motin }
15689b17223SAlexander Motin 
15789b17223SAlexander Motin static inline void
P2V(struct g_raid_volume * vol,int disk,off_t offset,off_t * virt,int * copy)15889b17223SAlexander Motin P2V(struct g_raid_volume *vol, int disk, off_t offset,
15989b17223SAlexander Motin     off_t *virt, int *copy)
16089b17223SAlexander Motin {
16189b17223SAlexander Motin 	off_t nstrip, start;
16289b17223SAlexander Motin 	u_int strip_size;
16389b17223SAlexander Motin 
16489b17223SAlexander Motin 	strip_size = vol->v_strip_size;
16589b17223SAlexander Motin 	/* Start position in strip. */
16689b17223SAlexander Motin 	start = offset % strip_size;
16789b17223SAlexander Motin 	/* Physical strip number. */
16889b17223SAlexander Motin 	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
16989b17223SAlexander Motin 	/* Number of physical strip (copy) inside virtual strip. */
17089b17223SAlexander Motin 	*copy = nstrip % N;
17189b17223SAlexander Motin 	/* Offset in virtual space. */
17289b17223SAlexander Motin 	*virt = (nstrip / N) * strip_size + start;
17389b17223SAlexander Motin }
17489b17223SAlexander Motin 
17589b17223SAlexander Motin static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object * tr,struct g_raid_volume * vol)17689b17223SAlexander Motin g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
17789b17223SAlexander Motin {
17889b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
17989b17223SAlexander Motin 
18089b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
18189b17223SAlexander Motin 	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
182dbb2e755SAlexander Motin 	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
18389b17223SAlexander Motin 		return (G_RAID_TR_TASTE_FAIL);
18489b17223SAlexander Motin 	trs->trso_starting = 1;
18589b17223SAlexander Motin 	return (G_RAID_TR_TASTE_SUCCEED);
18689b17223SAlexander Motin }
18789b17223SAlexander Motin 
18889b17223SAlexander Motin static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume * vol)18989b17223SAlexander Motin g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
19089b17223SAlexander Motin {
19189b17223SAlexander Motin 	struct g_raid_softc *sc;
19289b17223SAlexander Motin 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
19389b17223SAlexander Motin 	int i, j, state, sstate;
19489b17223SAlexander Motin 
19589b17223SAlexander Motin 	sc = vol->v_softc;
19689b17223SAlexander Motin 	state = G_RAID_VOLUME_S_OPTIMAL;
19789b17223SAlexander Motin 	for (i = 0; i < vol->v_disks_count / N; i++) {
19889b17223SAlexander Motin 		bestsd = &vol->v_subdisks[i * N];
19989b17223SAlexander Motin 		for (j = 1; j < N; j++) {
20089b17223SAlexander Motin 			sd = &vol->v_subdisks[i * N + j];
20189b17223SAlexander Motin 			if (sd->sd_state > bestsd->sd_state)
20289b17223SAlexander Motin 				bestsd = sd;
20389b17223SAlexander Motin 			else if (sd->sd_state == bestsd->sd_state &&
20489b17223SAlexander Motin 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
20589b17223SAlexander Motin 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
20689b17223SAlexander Motin 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
20789b17223SAlexander Motin 				bestsd = sd;
20889b17223SAlexander Motin 		}
20989b17223SAlexander Motin 		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
21089b17223SAlexander Motin 		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
21189b17223SAlexander Motin 			/* We found reasonable candidate. */
21289b17223SAlexander Motin 			G_RAID_DEBUG1(1, sc,
21389b17223SAlexander Motin 			    "Promote subdisk %s:%d from %s to ACTIVE.",
21489b17223SAlexander Motin 			    vol->v_name, bestsd->sd_pos,
21589b17223SAlexander Motin 			    g_raid_subdisk_state2str(bestsd->sd_state));
21689b17223SAlexander Motin 			g_raid_change_subdisk_state(bestsd,
21789b17223SAlexander Motin 			    G_RAID_SUBDISK_S_ACTIVE);
21889b17223SAlexander Motin 			g_raid_write_metadata(sc,
21989b17223SAlexander Motin 			    vol, bestsd, bestsd->sd_disk);
22089b17223SAlexander Motin 		}
22189b17223SAlexander Motin 		worstsd = &vol->v_subdisks[i * N];
22289b17223SAlexander Motin 		for (j = 1; j < N; j++) {
22389b17223SAlexander Motin 			sd = &vol->v_subdisks[i * N + j];
22489b17223SAlexander Motin 			if (sd->sd_state < worstsd->sd_state)
22589b17223SAlexander Motin 				worstsd = sd;
22689b17223SAlexander Motin 		}
22789b17223SAlexander Motin 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
22889b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_OPTIMAL;
22989b17223SAlexander Motin 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
23089b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
23189b17223SAlexander Motin 		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
23289b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_DEGRADED;
23389b17223SAlexander Motin 		else
23489b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_BROKEN;
23589b17223SAlexander Motin 		if (sstate < state)
23689b17223SAlexander Motin 			state = sstate;
23789b17223SAlexander Motin 	}
23889b17223SAlexander Motin 	return (state);
23989b17223SAlexander Motin }
24089b17223SAlexander Motin 
24189b17223SAlexander Motin static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume * vol)24289b17223SAlexander Motin g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
24389b17223SAlexander Motin {
24489b17223SAlexander Motin 	struct g_raid_softc *sc;
24589b17223SAlexander Motin 	struct g_raid_subdisk *sd, *bestsd, *worstsd;
24689b17223SAlexander Motin 	int i, j, state, sstate;
24789b17223SAlexander Motin 
24889b17223SAlexander Motin 	sc = vol->v_softc;
24989b17223SAlexander Motin 	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
25089b17223SAlexander Motin 	    vol->v_disks_count)
25189b17223SAlexander Motin 		return (G_RAID_VOLUME_S_OPTIMAL);
25289b17223SAlexander Motin 	for (i = 0; i < vol->v_disks_count; i++) {
25389b17223SAlexander Motin 		sd = &vol->v_subdisks[i];
25489b17223SAlexander Motin 		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
25589b17223SAlexander Motin 			/* We found reasonable candidate. */
25689b17223SAlexander Motin 			G_RAID_DEBUG1(1, sc,
25789b17223SAlexander Motin 			    "Promote subdisk %s:%d from %s to STALE.",
25889b17223SAlexander Motin 			    vol->v_name, sd->sd_pos,
25989b17223SAlexander Motin 			    g_raid_subdisk_state2str(sd->sd_state));
26089b17223SAlexander Motin 			g_raid_change_subdisk_state(sd,
26189b17223SAlexander Motin 			    G_RAID_SUBDISK_S_STALE);
26289b17223SAlexander Motin 			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
26389b17223SAlexander Motin 		}
26489b17223SAlexander Motin 	}
26589b17223SAlexander Motin 	state = G_RAID_VOLUME_S_OPTIMAL;
26689b17223SAlexander Motin 	for (i = 0; i < vol->v_disks_count; i++) {
26789b17223SAlexander Motin 		bestsd = &vol->v_subdisks[i];
26889b17223SAlexander Motin 		worstsd = &vol->v_subdisks[i];
26989b17223SAlexander Motin 		for (j = 1; j < N; j++) {
27089b17223SAlexander Motin 			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
27189b17223SAlexander Motin 			if (sd->sd_state > bestsd->sd_state)
27289b17223SAlexander Motin 				bestsd = sd;
27389b17223SAlexander Motin 			else if (sd->sd_state == bestsd->sd_state &&
27489b17223SAlexander Motin 			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
27589b17223SAlexander Motin 			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
27689b17223SAlexander Motin 			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
27789b17223SAlexander Motin 				bestsd = sd;
27889b17223SAlexander Motin 			if (sd->sd_state < worstsd->sd_state)
27989b17223SAlexander Motin 				worstsd = sd;
28089b17223SAlexander Motin 		}
28189b17223SAlexander Motin 		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
28289b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_OPTIMAL;
28389b17223SAlexander Motin 		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
28489b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
28589b17223SAlexander Motin 		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
28689b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_DEGRADED;
28789b17223SAlexander Motin 		else
28889b17223SAlexander Motin 			sstate = G_RAID_VOLUME_S_BROKEN;
28989b17223SAlexander Motin 		if (sstate < state)
29089b17223SAlexander Motin 			state = sstate;
29189b17223SAlexander Motin 	}
29289b17223SAlexander Motin 	return (state);
29389b17223SAlexander Motin }
29489b17223SAlexander Motin 
29589b17223SAlexander Motin static int
g_raid_tr_update_state_raid1e(struct g_raid_volume * vol,struct g_raid_subdisk * sd)29689b17223SAlexander Motin g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
29789b17223SAlexander Motin     struct g_raid_subdisk *sd)
29889b17223SAlexander Motin {
29989b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
30089b17223SAlexander Motin 	struct g_raid_softc *sc;
30189b17223SAlexander Motin 	u_int s;
30289b17223SAlexander Motin 
30389b17223SAlexander Motin 	sc = vol->v_softc;
30489b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
30589b17223SAlexander Motin 	if (trs->trso_stopping &&
30689b17223SAlexander Motin 	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
30789b17223SAlexander Motin 		s = G_RAID_VOLUME_S_STOPPED;
30889b17223SAlexander Motin 	else if (trs->trso_starting)
30989b17223SAlexander Motin 		s = G_RAID_VOLUME_S_STARTING;
31089b17223SAlexander Motin 	else {
31189b17223SAlexander Motin 		if ((vol->v_disks_count % N) == 0)
31289b17223SAlexander Motin 			s = g_raid_tr_update_state_raid1e_even(vol);
31389b17223SAlexander Motin 		else
31489b17223SAlexander Motin 			s = g_raid_tr_update_state_raid1e_odd(vol);
31589b17223SAlexander Motin 	}
31689b17223SAlexander Motin 	if (s != vol->v_state) {
31789b17223SAlexander Motin 		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
31889b17223SAlexander Motin 		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
31989b17223SAlexander Motin 		    G_RAID_EVENT_VOLUME);
32089b17223SAlexander Motin 		g_raid_change_volume_state(vol, s);
32189b17223SAlexander Motin 		if (!trs->trso_starting && !trs->trso_stopping)
32289b17223SAlexander Motin 			g_raid_write_metadata(sc, vol, NULL, NULL);
32389b17223SAlexander Motin 	}
32489b17223SAlexander Motin 	if (!trs->trso_starting && !trs->trso_stopping)
32589b17223SAlexander Motin 		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
32689b17223SAlexander Motin 	return (0);
32789b17223SAlexander Motin }
32889b17223SAlexander Motin 
32989b17223SAlexander Motin static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc * sc,struct g_raid_subdisk * sd,struct g_raid_disk * disk)33089b17223SAlexander Motin g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
33189b17223SAlexander Motin     struct g_raid_disk *disk)
33289b17223SAlexander Motin {
333ef844ef7SAlexander Motin 	struct g_raid_volume *vol;
334ef844ef7SAlexander Motin 
335ef844ef7SAlexander Motin 	vol = sd->sd_volume;
33689b17223SAlexander Motin 	/*
33789b17223SAlexander Motin 	 * We don't fail the last disk in the pack, since it still has decent
33889b17223SAlexander Motin 	 * data on it and that's better than failing the disk if it is the root
33989b17223SAlexander Motin 	 * file system.
34089b17223SAlexander Motin 	 *
34189b17223SAlexander Motin 	 * XXX should this be controlled via a tunable?  It makes sense for
34289b17223SAlexander Motin 	 * the volume that has / on it.  I can't think of a case where we'd
34389b17223SAlexander Motin 	 * want the volume to go away on this kind of event.
34489b17223SAlexander Motin 	 */
345ef844ef7SAlexander Motin 	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
346ef844ef7SAlexander Motin 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
347ef844ef7SAlexander Motin 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
348ef844ef7SAlexander Motin 	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
349ef844ef7SAlexander Motin 	     vol->v_disks_count) &&
350ef844ef7SAlexander Motin 	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
35189b17223SAlexander Motin 		return;
35289b17223SAlexander Motin 	g_raid_fail_disk(sc, sd, disk);
35389b17223SAlexander Motin }
35489b17223SAlexander Motin 
35589b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object * trs)35689b17223SAlexander Motin g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
35789b17223SAlexander Motin {
35889b17223SAlexander Motin 	struct g_raid_volume *vol;
35989b17223SAlexander Motin 	struct g_raid_subdisk *sd;
36089b17223SAlexander Motin 
36189b17223SAlexander Motin 	vol = trs->trso_base.tro_volume;
36289b17223SAlexander Motin 	sd = trs->trso_failed_sd;
36389b17223SAlexander Motin 	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
36489b17223SAlexander Motin 	free(trs->trso_buffer, M_TR_RAID1E);
36589b17223SAlexander Motin 	trs->trso_buffer = NULL;
36689b17223SAlexander Motin 	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
36789b17223SAlexander Motin 	trs->trso_type = TR_RAID1E_NONE;
36889b17223SAlexander Motin 	trs->trso_recover_slabs = 0;
36989b17223SAlexander Motin 	trs->trso_failed_sd = NULL;
37089b17223SAlexander Motin 	g_raid_tr_update_state_raid1e(vol, NULL);
37189b17223SAlexander Motin }
37289b17223SAlexander Motin 
37389b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object * tr)37489b17223SAlexander Motin g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
37589b17223SAlexander Motin {
37689b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
37789b17223SAlexander Motin 	struct g_raid_subdisk *sd;
37889b17223SAlexander Motin 
37989b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
38089b17223SAlexander Motin 	sd = trs->trso_failed_sd;
38189b17223SAlexander Motin 	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
38289b17223SAlexander Motin 	    "Subdisk %s:%d-%s rebuild completed.",
38389b17223SAlexander Motin 	    sd->sd_volume->v_name, sd->sd_pos,
38489b17223SAlexander Motin 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
38589b17223SAlexander Motin 	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
38689b17223SAlexander Motin 	sd->sd_rebuild_pos = 0;
38789b17223SAlexander Motin 	g_raid_tr_raid1e_rebuild_done(trs);
38889b17223SAlexander Motin }
38989b17223SAlexander Motin 
39089b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object * tr)39189b17223SAlexander Motin g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
39289b17223SAlexander Motin {
39389b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
39489b17223SAlexander Motin 	struct g_raid_subdisk *sd;
39589b17223SAlexander Motin 	struct g_raid_volume *vol;
39689b17223SAlexander Motin 
39789b17223SAlexander Motin 	vol = tr->tro_volume;
39889b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
39989b17223SAlexander Motin 	sd = trs->trso_failed_sd;
40089b17223SAlexander Motin 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
40189b17223SAlexander Motin 		G_RAID_DEBUG1(1, vol->v_softc,
40289b17223SAlexander Motin 		    "Subdisk %s:%d-%s rebuild is aborting.",
40389b17223SAlexander Motin 		    sd->sd_volume->v_name, sd->sd_pos,
40489b17223SAlexander Motin 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
40589b17223SAlexander Motin 		trs->trso_flags |= TR_RAID1E_F_ABORT;
40689b17223SAlexander Motin 	} else {
40789b17223SAlexander Motin 		G_RAID_DEBUG1(0, vol->v_softc,
40889b17223SAlexander Motin 		    "Subdisk %s:%d-%s rebuild aborted.",
40989b17223SAlexander Motin 		    sd->sd_volume->v_name, sd->sd_pos,
41089b17223SAlexander Motin 		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
41189b17223SAlexander Motin 		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
41289b17223SAlexander Motin 		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
41389b17223SAlexander Motin 			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
41489b17223SAlexander Motin 			g_raid_unlock_range(tr->tro_volume,
41589b17223SAlexander Motin 			    trs->trso_lock_pos, trs->trso_lock_len);
41689b17223SAlexander Motin 		}
41789b17223SAlexander Motin 		g_raid_tr_raid1e_rebuild_done(trs);
41889b17223SAlexander Motin 	}
41989b17223SAlexander Motin }
42089b17223SAlexander Motin 
42189b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object * tr)42289b17223SAlexander Motin g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
42389b17223SAlexander Motin {
42489b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
42589b17223SAlexander Motin 	struct g_raid_softc *sc;
42689b17223SAlexander Motin 	struct g_raid_volume *vol;
42789b17223SAlexander Motin 	struct g_raid_subdisk *sd;
42889b17223SAlexander Motin 	struct bio *bp;
42989b17223SAlexander Motin 	off_t len, virtual, vend, offset, start;
43089b17223SAlexander Motin 	int disk, copy, best;
43189b17223SAlexander Motin 
43289b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
43389b17223SAlexander Motin 	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
43489b17223SAlexander Motin 		return;
43589b17223SAlexander Motin 	vol = tr->tro_volume;
43689b17223SAlexander Motin 	sc = vol->v_softc;
43789b17223SAlexander Motin 	sd = trs->trso_failed_sd;
43889b17223SAlexander Motin 
43989b17223SAlexander Motin 	while (1) {
44089b17223SAlexander Motin 		if (sd->sd_rebuild_pos >= sd->sd_size) {
44189b17223SAlexander Motin 			g_raid_tr_raid1e_rebuild_finish(tr);
44289b17223SAlexander Motin 			return;
44389b17223SAlexander Motin 		}
44489b17223SAlexander Motin 		/* Get virtual offset from physical rebuild position. */
44589b17223SAlexander Motin 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
44689b17223SAlexander Motin 		/* Get physical offset back to get first stripe position. */
44789b17223SAlexander Motin 		V2P(vol, virtual, &disk, &offset, &start);
44889b17223SAlexander Motin 		/* Calculate contignous data length. */
44989b17223SAlexander Motin 		len = MIN(g_raid1e_rebuild_slab,
45089b17223SAlexander Motin 		    sd->sd_size - sd->sd_rebuild_pos);
45189b17223SAlexander Motin 		if ((vol->v_disks_count % N) != 0)
45289b17223SAlexander Motin 			len = MIN(len, vol->v_strip_size - start);
45389b17223SAlexander Motin 		/* Find disk with most accurate data. */
45489b17223SAlexander Motin 		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
45589b17223SAlexander Motin 		    offset + start, len, 0);
45689b17223SAlexander Motin 		if (best < 0) {
45789b17223SAlexander Motin 			/* There is no any valid disk. */
45889b17223SAlexander Motin 			g_raid_tr_raid1e_rebuild_abort(tr);
45989b17223SAlexander Motin 			return;
46089b17223SAlexander Motin 		} else if (best != copy) {
46189b17223SAlexander Motin 			/* Some other disk has better data. */
46289b17223SAlexander Motin 			break;
46389b17223SAlexander Motin 		}
46489b17223SAlexander Motin 		/* We have the most accurate data. Skip the range. */
46589b17223SAlexander Motin 		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
46689b17223SAlexander Motin 		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
46789b17223SAlexander Motin 		sd->sd_rebuild_pos += len;
46889b17223SAlexander Motin 	}
46989b17223SAlexander Motin 
47089b17223SAlexander Motin 	bp = &trs->trso_bio;
47189b17223SAlexander Motin 	memset(bp, 0, sizeof(*bp));
47289b17223SAlexander Motin 	bp->bio_offset = offset + start +
47389b17223SAlexander Motin 	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
47489b17223SAlexander Motin 	bp->bio_length = len;
47589b17223SAlexander Motin 	bp->bio_data = trs->trso_buffer;
47689b17223SAlexander Motin 	bp->bio_cmd = BIO_READ;
47789b17223SAlexander Motin 	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
47889b17223SAlexander Motin 	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
47989b17223SAlexander Motin 	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
48089b17223SAlexander Motin 	/*
48189b17223SAlexander Motin 	 * If we are crossing stripe boundary, correct affected virtual
48289b17223SAlexander Motin 	 * range we should lock.
48389b17223SAlexander Motin 	 */
48489b17223SAlexander Motin 	if (start + len > vol->v_strip_size) {
48589b17223SAlexander Motin 		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
48689b17223SAlexander Motin 		len = vend - virtual;
48789b17223SAlexander Motin 	}
48889b17223SAlexander Motin 	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
48989b17223SAlexander Motin 	trs->trso_flags |= TR_RAID1E_F_LOCKED;
49089b17223SAlexander Motin 	trs->trso_lock_pos = virtual;
49189b17223SAlexander Motin 	trs->trso_lock_len = len;
49289b17223SAlexander Motin 	/* Lock callback starts I/O */
49389b17223SAlexander Motin 	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
49489b17223SAlexander Motin }
49589b17223SAlexander Motin 
49689b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object * tr)49789b17223SAlexander Motin g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
49889b17223SAlexander Motin {
49989b17223SAlexander Motin 	struct g_raid_volume *vol;
50089b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
50189b17223SAlexander Motin 	struct g_raid_subdisk *sd;
50289b17223SAlexander Motin 
50389b17223SAlexander Motin 	vol = tr->tro_volume;
50489b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
50589b17223SAlexander Motin 	if (trs->trso_failed_sd) {
50689b17223SAlexander Motin 		G_RAID_DEBUG1(1, vol->v_softc,
50789b17223SAlexander Motin 		    "Already rebuild in start rebuild. pos %jd\n",
50889b17223SAlexander Motin 		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
50989b17223SAlexander Motin 		return;
51089b17223SAlexander Motin 	}
51189b17223SAlexander Motin 	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
51289b17223SAlexander Motin 	if (sd == NULL)
51389b17223SAlexander Motin 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
51489b17223SAlexander Motin 	if (sd == NULL) {
51589b17223SAlexander Motin 		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
51689b17223SAlexander Motin 		if (sd != NULL) {
51789b17223SAlexander Motin 			sd->sd_rebuild_pos = 0;
51889b17223SAlexander Motin 			g_raid_change_subdisk_state(sd,
51989b17223SAlexander Motin 			    G_RAID_SUBDISK_S_RESYNC);
52089b17223SAlexander Motin 			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
52189b17223SAlexander Motin 		} else {
52289b17223SAlexander Motin 			sd = g_raid_get_subdisk(vol,
52389b17223SAlexander Motin 			    G_RAID_SUBDISK_S_UNINITIALIZED);
52489b17223SAlexander Motin 			if (sd == NULL)
52589b17223SAlexander Motin 				sd = g_raid_get_subdisk(vol,
52689b17223SAlexander Motin 				    G_RAID_SUBDISK_S_NEW);
52789b17223SAlexander Motin 			if (sd != NULL) {
52889b17223SAlexander Motin 				sd->sd_rebuild_pos = 0;
52989b17223SAlexander Motin 				g_raid_change_subdisk_state(sd,
53089b17223SAlexander Motin 				    G_RAID_SUBDISK_S_REBUILD);
53189b17223SAlexander Motin 				g_raid_write_metadata(vol->v_softc,
53289b17223SAlexander Motin 				    vol, sd, NULL);
53389b17223SAlexander Motin 			}
53489b17223SAlexander Motin 		}
53589b17223SAlexander Motin 	}
53689b17223SAlexander Motin 	if (sd == NULL) {
53789b17223SAlexander Motin 		G_RAID_DEBUG1(1, vol->v_softc,
53889b17223SAlexander Motin 		    "No failed disk to rebuild.  night night.");
53989b17223SAlexander Motin 		return;
54089b17223SAlexander Motin 	}
54189b17223SAlexander Motin 	trs->trso_failed_sd = sd;
54289b17223SAlexander Motin 	G_RAID_DEBUG1(0, vol->v_softc,
54389b17223SAlexander Motin 	    "Subdisk %s:%d-%s rebuild start at %jd.",
54489b17223SAlexander Motin 	    sd->sd_volume->v_name, sd->sd_pos,
54589b17223SAlexander Motin 	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
54689b17223SAlexander Motin 	    trs->trso_failed_sd->sd_rebuild_pos);
54789b17223SAlexander Motin 	trs->trso_type = TR_RAID1E_REBUILD;
54889b17223SAlexander Motin 	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
54989b17223SAlexander Motin 	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
55089b17223SAlexander Motin 	g_raid_tr_raid1e_rebuild_some(tr);
55189b17223SAlexander Motin }
55289b17223SAlexander Motin 
55389b17223SAlexander Motin static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd)55489b17223SAlexander Motin g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
55589b17223SAlexander Motin     struct g_raid_subdisk *sd)
55689b17223SAlexander Motin {
55789b17223SAlexander Motin 	struct g_raid_volume *vol;
55889b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
55989b17223SAlexander Motin 	int nr;
56089b17223SAlexander Motin 
56189b17223SAlexander Motin 	vol = tr->tro_volume;
56289b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
56389b17223SAlexander Motin 	if (trs->trso_stopping)
56489b17223SAlexander Motin 		return;
56589b17223SAlexander Motin 	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
56689b17223SAlexander Motin 	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
56789b17223SAlexander Motin 	switch(trs->trso_type) {
56889b17223SAlexander Motin 	case TR_RAID1E_NONE:
56989b17223SAlexander Motin 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
57089b17223SAlexander Motin 			return;
57189b17223SAlexander Motin 		if (nr == 0) {
57289b17223SAlexander Motin 			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
57389b17223SAlexander Motin 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
57489b17223SAlexander Motin 			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
57589b17223SAlexander Motin 			if (nr == 0)
57689b17223SAlexander Motin 				return;
57789b17223SAlexander Motin 		}
57889b17223SAlexander Motin 		g_raid_tr_raid1e_rebuild_start(tr);
57989b17223SAlexander Motin 		break;
58089b17223SAlexander Motin 	case TR_RAID1E_REBUILD:
58189b17223SAlexander Motin 		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
58289b17223SAlexander Motin 		    trs->trso_failed_sd == sd)
58389b17223SAlexander Motin 			g_raid_tr_raid1e_rebuild_abort(tr);
58489b17223SAlexander Motin 		break;
58589b17223SAlexander Motin 	case TR_RAID1E_RESYNC:
58689b17223SAlexander Motin 		break;
58789b17223SAlexander Motin 	}
58889b17223SAlexander Motin }
58989b17223SAlexander Motin 
59089b17223SAlexander Motin static int
g_raid_tr_event_raid1e(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,u_int event)59189b17223SAlexander Motin g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
59289b17223SAlexander Motin     struct g_raid_subdisk *sd, u_int event)
59389b17223SAlexander Motin {
59489b17223SAlexander Motin 
59589b17223SAlexander Motin 	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
59689b17223SAlexander Motin 	return (0);
59789b17223SAlexander Motin }
59889b17223SAlexander Motin 
59989b17223SAlexander Motin static int
g_raid_tr_start_raid1e(struct g_raid_tr_object * tr)60089b17223SAlexander Motin g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
60189b17223SAlexander Motin {
60289b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
60389b17223SAlexander Motin 	struct g_raid_volume *vol;
60489b17223SAlexander Motin 
60589b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
60689b17223SAlexander Motin 	vol = tr->tro_volume;
60789b17223SAlexander Motin 	trs->trso_starting = 0;
60889b17223SAlexander Motin 	g_raid_tr_update_state_raid1e(vol, NULL);
60989b17223SAlexander Motin 	return (0);
61089b17223SAlexander Motin }
61189b17223SAlexander Motin 
61289b17223SAlexander Motin static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object * tr)61389b17223SAlexander Motin g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
61489b17223SAlexander Motin {
61589b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
61689b17223SAlexander Motin 	struct g_raid_volume *vol;
61789b17223SAlexander Motin 
61889b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
61989b17223SAlexander Motin 	vol = tr->tro_volume;
62089b17223SAlexander Motin 	trs->trso_starting = 0;
62189b17223SAlexander Motin 	trs->trso_stopping = 1;
62289b17223SAlexander Motin 	g_raid_tr_update_state_raid1e(vol, NULL);
62389b17223SAlexander Motin 	return (0);
62489b17223SAlexander Motin }
62589b17223SAlexander Motin 
62689b17223SAlexander Motin /*
62789b17223SAlexander Motin  * Select the disk to read from.  Take into account: subdisk state, running
62889b17223SAlexander Motin  * error recovery, average disk load, head position and possible cache hits.
62989b17223SAlexander Motin  */
63089b17223SAlexander Motin #define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
63189b17223SAlexander Motin static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume * vol,int no,off_t off,off_t len,u_int mask)63289b17223SAlexander Motin g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
63389b17223SAlexander Motin     int no, off_t off, off_t len, u_int mask)
63489b17223SAlexander Motin {
63589b17223SAlexander Motin 	struct g_raid_subdisk *sd;
63689b17223SAlexander Motin 	off_t offset;
63789b17223SAlexander Motin 	int i, best, prio, bestprio;
63889b17223SAlexander Motin 
63989b17223SAlexander Motin 	best = -1;
64089b17223SAlexander Motin 	bestprio = INT_MAX;
64189b17223SAlexander Motin 	for (i = 0; i < N; i++) {
64289b17223SAlexander Motin 		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
64389b17223SAlexander Motin 		offset = off;
64489b17223SAlexander Motin 		if (no + i >= vol->v_disks_count)
64589b17223SAlexander Motin 			offset += vol->v_strip_size;
64689b17223SAlexander Motin 
64789b17223SAlexander Motin 		prio = G_RAID_SUBDISK_LOAD(sd);
64889b17223SAlexander Motin 		if ((mask & (1 << sd->sd_pos)) != 0)
64989b17223SAlexander Motin 			continue;
65089b17223SAlexander Motin 		switch (sd->sd_state) {
65189b17223SAlexander Motin 		case G_RAID_SUBDISK_S_ACTIVE:
65289b17223SAlexander Motin 			break;
65389b17223SAlexander Motin 		case G_RAID_SUBDISK_S_RESYNC:
65489b17223SAlexander Motin 			if (offset + off < sd->sd_rebuild_pos)
65589b17223SAlexander Motin 				break;
65689b17223SAlexander Motin 			/* FALLTHROUGH */
65789b17223SAlexander Motin 		case G_RAID_SUBDISK_S_STALE:
65889b17223SAlexander Motin 			prio += i << 24;
65989b17223SAlexander Motin 			break;
66089b17223SAlexander Motin 		case G_RAID_SUBDISK_S_REBUILD:
66189b17223SAlexander Motin 			if (offset + off < sd->sd_rebuild_pos)
66289b17223SAlexander Motin 				break;
66389b17223SAlexander Motin 			/* FALLTHROUGH */
66489b17223SAlexander Motin 		default:
66589b17223SAlexander Motin 			continue;
66689b17223SAlexander Motin 		}
66789b17223SAlexander Motin 		prio += min(sd->sd_recovery, 255) << 16;
66889b17223SAlexander Motin 		/* If disk head is precisely in position - highly prefer it. */
66989b17223SAlexander Motin 		if (G_RAID_SUBDISK_POS(sd) == offset)
67089b17223SAlexander Motin 			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
67189b17223SAlexander Motin 		else
67289b17223SAlexander Motin 		/* If disk head is close to position - prefer it. */
67389b17223SAlexander Motin 		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
67489b17223SAlexander Motin 		    G_RAID_SUBDISK_TRACK_SIZE)
67589b17223SAlexander Motin 			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
67689b17223SAlexander Motin 		if (prio < bestprio) {
67789b17223SAlexander Motin 			bestprio = prio;
67889b17223SAlexander Motin 			best = i;
67989b17223SAlexander Motin 		}
68089b17223SAlexander Motin 	}
68189b17223SAlexander Motin 	return (best);
68289b17223SAlexander Motin }
68389b17223SAlexander Motin 
68489b17223SAlexander Motin static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object * tr,struct bio * bp)68589b17223SAlexander Motin g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
68689b17223SAlexander Motin {
68789b17223SAlexander Motin 	struct g_raid_volume *vol;
68889b17223SAlexander Motin 	struct g_raid_subdisk *sd;
68989b17223SAlexander Motin 	struct bio_queue_head queue;
69089b17223SAlexander Motin 	struct bio *cbp;
69189b17223SAlexander Motin 	char *addr;
69289b17223SAlexander Motin 	off_t offset, start, length, remain;
69389b17223SAlexander Motin 	u_int no, strip_size;
69489b17223SAlexander Motin 	int best;
69589b17223SAlexander Motin 
69689b17223SAlexander Motin 	vol = tr->tro_volume;
697b43560abSAlexander Motin 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
698b43560abSAlexander Motin 		addr = NULL;
699b43560abSAlexander Motin 	else
70089b17223SAlexander Motin 		addr = bp->bio_data;
70189b17223SAlexander Motin 	strip_size = vol->v_strip_size;
70289b17223SAlexander Motin 	V2P(vol, bp->bio_offset, &no, &offset, &start);
70389b17223SAlexander Motin 	remain = bp->bio_length;
70489b17223SAlexander Motin 	bioq_init(&queue);
70589b17223SAlexander Motin 	while (remain > 0) {
70689b17223SAlexander Motin 		length = MIN(strip_size - start, remain);
70789b17223SAlexander Motin 		best = g_raid_tr_raid1e_select_read_disk(vol,
70889b17223SAlexander Motin 		    no, offset, length, 0);
70989b17223SAlexander Motin 		KASSERT(best >= 0, ("No readable disk in volume %s!",
71089b17223SAlexander Motin 		    vol->v_name));
71189b17223SAlexander Motin 		no += best;
71289b17223SAlexander Motin 		if (no >= vol->v_disks_count) {
71389b17223SAlexander Motin 			no -= vol->v_disks_count;
71489b17223SAlexander Motin 			offset += strip_size;
71589b17223SAlexander Motin 		}
71689b17223SAlexander Motin 		cbp = g_clone_bio(bp);
71789b17223SAlexander Motin 		if (cbp == NULL)
71889b17223SAlexander Motin 			goto failure;
71989b17223SAlexander Motin 		cbp->bio_offset = offset + start;
72089b17223SAlexander Motin 		cbp->bio_length = length;
721b43560abSAlexander Motin 		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
722b43560abSAlexander Motin 			cbp->bio_ma_offset += (uintptr_t)addr;
723b43560abSAlexander Motin 			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
724b43560abSAlexander Motin 			cbp->bio_ma_offset %= PAGE_SIZE;
725b43560abSAlexander Motin 			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
726b43560abSAlexander Motin 			    cbp->bio_length) / PAGE_SIZE;
727b43560abSAlexander Motin 		} else
728b43560abSAlexander Motin 			cbp->bio_data = addr;
72989b17223SAlexander Motin 		cbp->bio_caller1 = &vol->v_subdisks[no];
73089b17223SAlexander Motin 		bioq_insert_tail(&queue, cbp);
73189b17223SAlexander Motin 		no += N - best;
73289b17223SAlexander Motin 		if (no >= vol->v_disks_count) {
73389b17223SAlexander Motin 			no -= vol->v_disks_count;
73489b17223SAlexander Motin 			offset += strip_size;
73589b17223SAlexander Motin 		}
73689b17223SAlexander Motin 		remain -= length;
73789b17223SAlexander Motin 		addr += length;
73889b17223SAlexander Motin 		start = 0;
73989b17223SAlexander Motin 	}
740b43560abSAlexander Motin 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
74189b17223SAlexander Motin 		sd = cbp->bio_caller1;
74289b17223SAlexander Motin 		cbp->bio_caller1 = NULL;
74389b17223SAlexander Motin 		g_raid_subdisk_iostart(sd, cbp);
74489b17223SAlexander Motin 	}
74589b17223SAlexander Motin 	return;
74689b17223SAlexander Motin failure:
747b43560abSAlexander Motin 	while ((cbp = bioq_takefirst(&queue)) != NULL)
74889b17223SAlexander Motin 		g_destroy_bio(cbp);
74989b17223SAlexander Motin 	if (bp->bio_error == 0)
75089b17223SAlexander Motin 		bp->bio_error = ENOMEM;
75189b17223SAlexander Motin 	g_raid_iodone(bp, bp->bio_error);
75289b17223SAlexander Motin }
75389b17223SAlexander Motin 
75489b17223SAlexander Motin static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object * tr,struct bio * bp)75589b17223SAlexander Motin g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
75689b17223SAlexander Motin {
75789b17223SAlexander Motin 	struct g_raid_volume *vol;
75889b17223SAlexander Motin 	struct g_raid_subdisk *sd;
75989b17223SAlexander Motin 	struct bio_queue_head queue;
76089b17223SAlexander Motin 	struct bio *cbp;
76189b17223SAlexander Motin 	char *addr;
76289b17223SAlexander Motin 	off_t offset, start, length, remain;
76389b17223SAlexander Motin 	u_int no, strip_size;
76489b17223SAlexander Motin 	int i;
76589b17223SAlexander Motin 
76689b17223SAlexander Motin 	vol = tr->tro_volume;
767b43560abSAlexander Motin 	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
768b43560abSAlexander Motin 		addr = NULL;
769b43560abSAlexander Motin 	else
77089b17223SAlexander Motin 		addr = bp->bio_data;
77189b17223SAlexander Motin 	strip_size = vol->v_strip_size;
77289b17223SAlexander Motin 	V2P(vol, bp->bio_offset, &no, &offset, &start);
77389b17223SAlexander Motin 	remain = bp->bio_length;
77489b17223SAlexander Motin 	bioq_init(&queue);
77589b17223SAlexander Motin 	while (remain > 0) {
77689b17223SAlexander Motin 		length = MIN(strip_size - start, remain);
77789b17223SAlexander Motin 		for (i = 0; i < N; i++) {
77889b17223SAlexander Motin 			sd = &vol->v_subdisks[no];
77989b17223SAlexander Motin 			switch (sd->sd_state) {
78089b17223SAlexander Motin 			case G_RAID_SUBDISK_S_ACTIVE:
78189b17223SAlexander Motin 			case G_RAID_SUBDISK_S_STALE:
78289b17223SAlexander Motin 			case G_RAID_SUBDISK_S_RESYNC:
78389b17223SAlexander Motin 				break;
78489b17223SAlexander Motin 			case G_RAID_SUBDISK_S_REBUILD:
78589b17223SAlexander Motin 				if (offset + start >= sd->sd_rebuild_pos)
78689b17223SAlexander Motin 					goto nextdisk;
78789b17223SAlexander Motin 				break;
78889b17223SAlexander Motin 			default:
78989b17223SAlexander Motin 				goto nextdisk;
79089b17223SAlexander Motin 			}
79189b17223SAlexander Motin 			cbp = g_clone_bio(bp);
79289b17223SAlexander Motin 			if (cbp == NULL)
79389b17223SAlexander Motin 				goto failure;
79489b17223SAlexander Motin 			cbp->bio_offset = offset + start;
79589b17223SAlexander Motin 			cbp->bio_length = length;
796b43560abSAlexander Motin 			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
797b43560abSAlexander Motin 			    bp->bio_cmd != BIO_DELETE) {
798b43560abSAlexander Motin 				cbp->bio_ma_offset += (uintptr_t)addr;
799b43560abSAlexander Motin 				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
800b43560abSAlexander Motin 				cbp->bio_ma_offset %= PAGE_SIZE;
801b43560abSAlexander Motin 				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
802b43560abSAlexander Motin 				    cbp->bio_length) / PAGE_SIZE;
803b43560abSAlexander Motin 			} else
804b43560abSAlexander Motin 				cbp->bio_data = addr;
80589b17223SAlexander Motin 			cbp->bio_caller1 = sd;
80689b17223SAlexander Motin 			bioq_insert_tail(&queue, cbp);
80789b17223SAlexander Motin nextdisk:
80889b17223SAlexander Motin 			if (++no >= vol->v_disks_count) {
80989b17223SAlexander Motin 				no = 0;
81089b17223SAlexander Motin 				offset += strip_size;
81189b17223SAlexander Motin 			}
81289b17223SAlexander Motin 		}
81389b17223SAlexander Motin 		remain -= length;
814609a7474SAlexander Motin 		if (bp->bio_cmd != BIO_DELETE)
81589b17223SAlexander Motin 			addr += length;
81689b17223SAlexander Motin 		start = 0;
81789b17223SAlexander Motin 	}
818b43560abSAlexander Motin 	while ((cbp = bioq_takefirst(&queue)) != NULL) {
81989b17223SAlexander Motin 		sd = cbp->bio_caller1;
82089b17223SAlexander Motin 		cbp->bio_caller1 = NULL;
82189b17223SAlexander Motin 		g_raid_subdisk_iostart(sd, cbp);
82289b17223SAlexander Motin 	}
82389b17223SAlexander Motin 	return;
82489b17223SAlexander Motin failure:
825b43560abSAlexander Motin 	while ((cbp = bioq_takefirst(&queue)) != NULL)
82689b17223SAlexander Motin 		g_destroy_bio(cbp);
82789b17223SAlexander Motin 	if (bp->bio_error == 0)
82889b17223SAlexander Motin 		bp->bio_error = ENOMEM;
82989b17223SAlexander Motin 	g_raid_iodone(bp, bp->bio_error);
83089b17223SAlexander Motin }
83189b17223SAlexander Motin 
83289b17223SAlexander Motin static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object * tr,struct bio * bp)83389b17223SAlexander Motin g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
83489b17223SAlexander Motin {
83589b17223SAlexander Motin 	struct g_raid_volume *vol;
83689b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
83789b17223SAlexander Motin 
83889b17223SAlexander Motin 	vol = tr->tro_volume;
83989b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
84089b17223SAlexander Motin 	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
84189b17223SAlexander Motin 	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
84289b17223SAlexander Motin 	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
84389b17223SAlexander Motin 		g_raid_iodone(bp, EIO);
84489b17223SAlexander Motin 		return;
84589b17223SAlexander Motin 	}
84689b17223SAlexander Motin 	/*
84789b17223SAlexander Motin 	 * If we're rebuilding, squeeze in rebuild activity every so often,
84889b17223SAlexander Motin 	 * even when the disk is busy.  Be sure to only count real I/O
84989b17223SAlexander Motin 	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
85089b17223SAlexander Motin 	 * by this module.
85189b17223SAlexander Motin 	 */
85289b17223SAlexander Motin 	if (trs->trso_failed_sd != NULL &&
85389b17223SAlexander Motin 	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
85489b17223SAlexander Motin 		/* Make this new or running now round short. */
85589b17223SAlexander Motin 		trs->trso_recover_slabs = 0;
85689b17223SAlexander Motin 		if (--trs->trso_fair_io <= 0) {
85789b17223SAlexander Motin 			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
85889b17223SAlexander Motin 			g_raid_tr_raid1e_rebuild_some(tr);
85989b17223SAlexander Motin 		}
86089b17223SAlexander Motin 	}
86189b17223SAlexander Motin 	switch (bp->bio_cmd) {
86289b17223SAlexander Motin 	case BIO_READ:
86389b17223SAlexander Motin 		g_raid_tr_iostart_raid1e_read(tr, bp);
86489b17223SAlexander Motin 		break;
86589b17223SAlexander Motin 	case BIO_WRITE:
86689b17223SAlexander Motin 	case BIO_DELETE:
867609a7474SAlexander Motin 		g_raid_tr_iostart_raid1e_write(tr, bp);
86889b17223SAlexander Motin 		break;
8698b522bdaSWarner Losh 	case BIO_SPEEDUP:
87089b17223SAlexander Motin 	case BIO_FLUSH:
87189b17223SAlexander Motin 		g_raid_tr_flush_common(tr, bp);
87289b17223SAlexander Motin 		break;
87389b17223SAlexander Motin 	default:
87489b17223SAlexander Motin 		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
87589b17223SAlexander Motin 		    bp->bio_cmd, vol->v_name));
87689b17223SAlexander Motin 		break;
87789b17223SAlexander Motin 	}
87889b17223SAlexander Motin }
87989b17223SAlexander Motin 
88089b17223SAlexander Motin static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,struct bio * bp)88189b17223SAlexander Motin g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
88289b17223SAlexander Motin     struct g_raid_subdisk *sd, struct bio *bp)
88389b17223SAlexander Motin {
88489b17223SAlexander Motin 	struct bio *cbp;
88589b17223SAlexander Motin 	struct g_raid_subdisk *nsd;
88689b17223SAlexander Motin 	struct g_raid_volume *vol;
88789b17223SAlexander Motin 	struct bio *pbp;
88889b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
88989b17223SAlexander Motin 	off_t virtual, offset, start;
89089b17223SAlexander Motin 	uintptr_t mask;
89189b17223SAlexander Motin 	int error, do_write, copy, disk, best;
89289b17223SAlexander Motin 
89389b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
89489b17223SAlexander Motin 	vol = tr->tro_volume;
89589b17223SAlexander Motin 	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
89689b17223SAlexander Motin 		if (trs->trso_type == TR_RAID1E_REBUILD) {
89789b17223SAlexander Motin 			nsd = trs->trso_failed_sd;
89889b17223SAlexander Motin 			if (bp->bio_cmd == BIO_READ) {
89989b17223SAlexander Motin 				/* Immediately abort rebuild, if requested. */
90089b17223SAlexander Motin 				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
90189b17223SAlexander Motin 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
90289b17223SAlexander Motin 					g_raid_tr_raid1e_rebuild_abort(tr);
90389b17223SAlexander Motin 					return;
90489b17223SAlexander Motin 				}
90589b17223SAlexander Motin 
90689b17223SAlexander Motin 				/* On read error, skip and cross fingers. */
90789b17223SAlexander Motin 				if (bp->bio_error != 0) {
90889b17223SAlexander Motin 					G_RAID_LOGREQ(0, bp,
90989b17223SAlexander Motin 					    "Read error during rebuild (%d), "
91089b17223SAlexander Motin 					    "possible data loss!",
91189b17223SAlexander Motin 					    bp->bio_error);
91289b17223SAlexander Motin 					goto rebuild_round_done;
91389b17223SAlexander Motin 				}
91489b17223SAlexander Motin 
91589b17223SAlexander Motin 				/*
91689b17223SAlexander Motin 				 * The read operation finished, queue the
91789b17223SAlexander Motin 				 * write and get out.
91889b17223SAlexander Motin 				 */
91989b17223SAlexander Motin 				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
92089b17223SAlexander Motin 				    bp->bio_error);
92189b17223SAlexander Motin 				bp->bio_cmd = BIO_WRITE;
92289b17223SAlexander Motin 				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
92389b17223SAlexander Motin 				bp->bio_offset = nsd->sd_rebuild_pos;
92489b17223SAlexander Motin 				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
92589b17223SAlexander Motin 				g_raid_subdisk_iostart(nsd, bp);
92689b17223SAlexander Motin 			} else {
92789b17223SAlexander Motin 				/*
92889b17223SAlexander Motin 				 * The write operation just finished.  Do
92989b17223SAlexander Motin 				 * another.  We keep cloning the master bio
93089b17223SAlexander Motin 				 * since it has the right buffers allocated to
93189b17223SAlexander Motin 				 * it.
93289b17223SAlexander Motin 				 */
93389b17223SAlexander Motin 				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
93489b17223SAlexander Motin 				    bp->bio_error);
93589b17223SAlexander Motin 				if (bp->bio_error != 0 ||
93689b17223SAlexander Motin 				    trs->trso_flags & TR_RAID1E_F_ABORT) {
93789b17223SAlexander Motin 					if ((trs->trso_flags &
93889b17223SAlexander Motin 					    TR_RAID1E_F_ABORT) == 0) {
93989b17223SAlexander Motin 						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
94089b17223SAlexander Motin 						    nsd, nsd->sd_disk);
94189b17223SAlexander Motin 					}
94289b17223SAlexander Motin 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
94389b17223SAlexander Motin 					g_raid_tr_raid1e_rebuild_abort(tr);
94489b17223SAlexander Motin 					return;
94589b17223SAlexander Motin 				}
94689b17223SAlexander Motin rebuild_round_done:
94789b17223SAlexander Motin 				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
94889b17223SAlexander Motin 				g_raid_unlock_range(tr->tro_volume,
94989b17223SAlexander Motin 				    trs->trso_lock_pos, trs->trso_lock_len);
95089b17223SAlexander Motin 				nsd->sd_rebuild_pos += bp->bio_length;
95189b17223SAlexander Motin 				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
95289b17223SAlexander Motin 					g_raid_tr_raid1e_rebuild_finish(tr);
95389b17223SAlexander Motin 					return;
95489b17223SAlexander Motin 				}
95589b17223SAlexander Motin 
95689b17223SAlexander Motin 				/* Abort rebuild if we are stopping */
95789b17223SAlexander Motin 				if (trs->trso_stopping) {
95889b17223SAlexander Motin 					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
95989b17223SAlexander Motin 					g_raid_tr_raid1e_rebuild_abort(tr);
96089b17223SAlexander Motin 					return;
96189b17223SAlexander Motin 				}
96289b17223SAlexander Motin 
96389b17223SAlexander Motin 				if (--trs->trso_meta_update <= 0) {
96489b17223SAlexander Motin 					g_raid_write_metadata(vol->v_softc,
96589b17223SAlexander Motin 					    vol, nsd, nsd->sd_disk);
96689b17223SAlexander Motin 					trs->trso_meta_update =
96789b17223SAlexander Motin 					    g_raid1e_rebuild_meta_update;
96889b17223SAlexander Motin 					/* Compensate short rebuild I/Os. */
96989b17223SAlexander Motin 					if ((vol->v_disks_count % N) != 0 &&
97089b17223SAlexander Motin 					    vol->v_strip_size <
97189b17223SAlexander Motin 					     g_raid1e_rebuild_slab) {
97289b17223SAlexander Motin 						trs->trso_meta_update *=
97389b17223SAlexander Motin 						    g_raid1e_rebuild_slab;
97489b17223SAlexander Motin 						trs->trso_meta_update /=
97589b17223SAlexander Motin 						    vol->v_strip_size;
97689b17223SAlexander Motin 					}
97789b17223SAlexander Motin 				}
97889b17223SAlexander Motin 				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
97989b17223SAlexander Motin 				if (--trs->trso_recover_slabs <= 0)
98089b17223SAlexander Motin 					return;
98189b17223SAlexander Motin 				/* Run next rebuild iteration. */
98289b17223SAlexander Motin 				g_raid_tr_raid1e_rebuild_some(tr);
98389b17223SAlexander Motin 			}
98489b17223SAlexander Motin 		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
98589b17223SAlexander Motin 			/*
98689b17223SAlexander Motin 			 * read good sd, read bad sd in parallel.  when both
98789b17223SAlexander Motin 			 * done, compare the buffers.  write good to the bad
98889b17223SAlexander Motin 			 * if different.  do the next bit of work.
98989b17223SAlexander Motin 			 */
99089b17223SAlexander Motin 			panic("Somehow, we think we're doing a resync");
99189b17223SAlexander Motin 		}
99289b17223SAlexander Motin 		return;
99389b17223SAlexander Motin 	}
99489b17223SAlexander Motin 	pbp = bp->bio_parent;
99589b17223SAlexander Motin 	pbp->bio_inbed++;
99689b17223SAlexander Motin 	mask = (intptr_t)bp->bio_caller2;
99789b17223SAlexander Motin 	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
99889b17223SAlexander Motin 		/*
99989b17223SAlexander Motin 		 * Read failed on first drive.  Retry the read error on
100089b17223SAlexander Motin 		 * another disk drive, if available, before erroring out the
100189b17223SAlexander Motin 		 * read.
100289b17223SAlexander Motin 		 */
100389b17223SAlexander Motin 		sd->sd_disk->d_read_errs++;
100489b17223SAlexander Motin 		G_RAID_LOGREQ(0, bp,
100589b17223SAlexander Motin 		    "Read error (%d), %d read errors total",
100689b17223SAlexander Motin 		    bp->bio_error, sd->sd_disk->d_read_errs);
100789b17223SAlexander Motin 
100889b17223SAlexander Motin 		/*
100989b17223SAlexander Motin 		 * If there are too many read errors, we move to degraded.
101089b17223SAlexander Motin 		 * XXX Do we want to FAIL the drive (eg, make the user redo
101189b17223SAlexander Motin 		 * everything to get it back in sync), or just degrade the
101289b17223SAlexander Motin 		 * drive, which kicks off a resync?
101389b17223SAlexander Motin 		 */
101489b17223SAlexander Motin 		do_write = 0;
101589b17223SAlexander Motin 		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
101689b17223SAlexander Motin 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
101789b17223SAlexander Motin 		else if (mask == 0)
101889b17223SAlexander Motin 			do_write = 1;
101989b17223SAlexander Motin 
102089b17223SAlexander Motin 		/* Restore what we were doing. */
102189b17223SAlexander Motin 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
102289b17223SAlexander Motin 		V2P(vol, virtual, &disk, &offset, &start);
102389b17223SAlexander Motin 
102489b17223SAlexander Motin 		/* Find the other disk, and try to do the I/O to it. */
102589b17223SAlexander Motin 		mask |= 1 << copy;
102689b17223SAlexander Motin 		best = g_raid_tr_raid1e_select_read_disk(vol,
102789b17223SAlexander Motin 		    disk, offset, start, mask);
102889b17223SAlexander Motin 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
102989b17223SAlexander Motin 			disk += best;
103089b17223SAlexander Motin 			if (disk >= vol->v_disks_count) {
103189b17223SAlexander Motin 				disk -= vol->v_disks_count;
103289b17223SAlexander Motin 				offset += vol->v_strip_size;
103389b17223SAlexander Motin 			}
103489b17223SAlexander Motin 			cbp->bio_offset = offset + start;
103589b17223SAlexander Motin 			cbp->bio_length = bp->bio_length;
103689b17223SAlexander Motin 			cbp->bio_data = bp->bio_data;
1037b43560abSAlexander Motin 			cbp->bio_ma = bp->bio_ma;
1038b43560abSAlexander Motin 			cbp->bio_ma_offset = bp->bio_ma_offset;
1039b43560abSAlexander Motin 			cbp->bio_ma_n = bp->bio_ma_n;
104089b17223SAlexander Motin 			g_destroy_bio(bp);
104189b17223SAlexander Motin 			nsd = &vol->v_subdisks[disk];
104289b17223SAlexander Motin 			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
104389b17223SAlexander Motin 			    nsd->sd_pos);
104489b17223SAlexander Motin 			if (do_write)
104589b17223SAlexander Motin 				mask |= 1 << 31;
10467a22215cSEitan Adler 			if ((mask & (1U << 31)) != 0)
104789b17223SAlexander Motin 				sd->sd_recovery++;
104889b17223SAlexander Motin 			cbp->bio_caller2 = (void *)mask;
104989b17223SAlexander Motin 			if (do_write) {
105089b17223SAlexander Motin 				cbp->bio_caller1 = nsd;
105189b17223SAlexander Motin 				/* Lock callback starts I/O */
105289b17223SAlexander Motin 				g_raid_lock_range(sd->sd_volume,
105389b17223SAlexander Motin 				    virtual, cbp->bio_length, pbp, cbp);
105489b17223SAlexander Motin 			} else {
105589b17223SAlexander Motin 				g_raid_subdisk_iostart(nsd, cbp);
105689b17223SAlexander Motin 			}
105789b17223SAlexander Motin 			return;
105889b17223SAlexander Motin 		}
105989b17223SAlexander Motin 		/*
106089b17223SAlexander Motin 		 * We can't retry.  Return the original error by falling
106189b17223SAlexander Motin 		 * through.  This will happen when there's only one good disk.
106289b17223SAlexander Motin 		 * We don't need to fail the raid, since its actual state is
106389b17223SAlexander Motin 		 * based on the state of the subdisks.
106489b17223SAlexander Motin 		 */
106589b17223SAlexander Motin 		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
106689b17223SAlexander Motin 	}
106789b17223SAlexander Motin 	if (bp->bio_cmd == BIO_READ &&
106889b17223SAlexander Motin 	    bp->bio_error == 0 &&
10697a22215cSEitan Adler 	    (mask & (1U << 31)) != 0) {
107089b17223SAlexander Motin 		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
107189b17223SAlexander Motin 
107289b17223SAlexander Motin 		/* Restore what we were doing. */
107389b17223SAlexander Motin 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
107489b17223SAlexander Motin 		V2P(vol, virtual, &disk, &offset, &start);
107589b17223SAlexander Motin 
107689b17223SAlexander Motin 		/* Find best disk to write. */
107789b17223SAlexander Motin 		best = g_raid_tr_raid1e_select_read_disk(vol,
107889b17223SAlexander Motin 		    disk, offset, start, ~mask);
107989b17223SAlexander Motin 		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
108089b17223SAlexander Motin 			disk += best;
108189b17223SAlexander Motin 			if (disk >= vol->v_disks_count) {
108289b17223SAlexander Motin 				disk -= vol->v_disks_count;
108389b17223SAlexander Motin 				offset += vol->v_strip_size;
108489b17223SAlexander Motin 			}
108589b17223SAlexander Motin 			cbp->bio_offset = offset + start;
108689b17223SAlexander Motin 			cbp->bio_cmd = BIO_WRITE;
108789b17223SAlexander Motin 			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
108889b17223SAlexander Motin 			cbp->bio_caller2 = (void *)mask;
108989b17223SAlexander Motin 			g_destroy_bio(bp);
109089b17223SAlexander Motin 			G_RAID_LOGREQ(2, cbp,
109189b17223SAlexander Motin 			    "Attempting bad sector remap on failing drive.");
109289b17223SAlexander Motin 			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
109389b17223SAlexander Motin 			return;
109489b17223SAlexander Motin 		}
109589b17223SAlexander Motin 	}
10967a22215cSEitan Adler 	if ((mask & (1U << 31)) != 0) {
109789b17223SAlexander Motin 		/*
109889b17223SAlexander Motin 		 * We're done with a recovery, mark the range as unlocked.
1099e8d57122SPedro F. Giffuni 		 * For any write errors, we aggressively fail the disk since
110089b17223SAlexander Motin 		 * there was both a READ and a WRITE error at this location.
110189b17223SAlexander Motin 		 * Both types of errors generally indicate the drive is on
110289b17223SAlexander Motin 		 * the verge of total failure anyway.  Better to stop trusting
110389b17223SAlexander Motin 		 * it now.  However, we need to reset error to 0 in that case
110489b17223SAlexander Motin 		 * because we're not failing the original I/O which succeeded.
110589b17223SAlexander Motin 		 */
110689b17223SAlexander Motin 
110789b17223SAlexander Motin 		/* Restore what we were doing. */
110889b17223SAlexander Motin 		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
110989b17223SAlexander Motin 		V2P(vol, virtual, &disk, &offset, &start);
111089b17223SAlexander Motin 
111189b17223SAlexander Motin 		for (copy = 0; copy < N; copy++) {
111289b17223SAlexander Motin 			if ((mask & (1 << copy) ) != 0)
111389b17223SAlexander Motin 				vol->v_subdisks[(disk + copy) %
111489b17223SAlexander Motin 				    vol->v_disks_count].sd_recovery--;
111589b17223SAlexander Motin 		}
111689b17223SAlexander Motin 
111789b17223SAlexander Motin 		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
111889b17223SAlexander Motin 			G_RAID_LOGREQ(0, bp, "Remap write failed: "
111989b17223SAlexander Motin 			    "failing subdisk.");
112089b17223SAlexander Motin 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
112189b17223SAlexander Motin 			bp->bio_error = 0;
112289b17223SAlexander Motin 		}
112389b17223SAlexander Motin 		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
112489b17223SAlexander Motin 		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
112589b17223SAlexander Motin 	}
1126650e245eSAlexander Motin 	if (pbp->bio_cmd != BIO_READ) {
1127ef844ef7SAlexander Motin 		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1128ef844ef7SAlexander Motin 			pbp->bio_error = bp->bio_error;
1129650e245eSAlexander Motin 		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1130ef844ef7SAlexander Motin 			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1131ef844ef7SAlexander Motin 			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1132ef844ef7SAlexander Motin 		}
1133ef844ef7SAlexander Motin 		error = pbp->bio_error;
1134ef844ef7SAlexander Motin 	} else
113589b17223SAlexander Motin 		error = bp->bio_error;
113689b17223SAlexander Motin 	g_destroy_bio(bp);
113789b17223SAlexander Motin 	if (pbp->bio_children == pbp->bio_inbed) {
113889b17223SAlexander Motin 		pbp->bio_completed = pbp->bio_length;
113989b17223SAlexander Motin 		g_raid_iodone(pbp, error);
114089b17223SAlexander Motin 	}
114189b17223SAlexander Motin }
114289b17223SAlexander Motin 
114389b17223SAlexander Motin static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object * tr,void * virtual,off_t boffset,size_t blength)1144489ba222SMitchell Horne g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
1145489ba222SMitchell Horne     off_t boffset, size_t blength)
114689b17223SAlexander Motin {
114789b17223SAlexander Motin 	struct g_raid_volume *vol;
114889b17223SAlexander Motin 	struct g_raid_subdisk *sd;
114989b17223SAlexander Motin 	struct bio_queue_head queue;
115089b17223SAlexander Motin 	char *addr;
115189b17223SAlexander Motin 	off_t offset, start, length, remain;
115289b17223SAlexander Motin 	u_int no, strip_size;
115389b17223SAlexander Motin 	int i, error;
115489b17223SAlexander Motin 
115589b17223SAlexander Motin 	vol = tr->tro_volume;
115689b17223SAlexander Motin 	addr = virtual;
115789b17223SAlexander Motin 	strip_size = vol->v_strip_size;
115889b17223SAlexander Motin 	V2P(vol, boffset, &no, &offset, &start);
115989b17223SAlexander Motin 	remain = blength;
116089b17223SAlexander Motin 	bioq_init(&queue);
116189b17223SAlexander Motin 	while (remain > 0) {
116289b17223SAlexander Motin 		length = MIN(strip_size - start, remain);
116389b17223SAlexander Motin 		for (i = 0; i < N; i++) {
116489b17223SAlexander Motin 			sd = &vol->v_subdisks[no];
116589b17223SAlexander Motin 			switch (sd->sd_state) {
116689b17223SAlexander Motin 			case G_RAID_SUBDISK_S_ACTIVE:
116789b17223SAlexander Motin 			case G_RAID_SUBDISK_S_STALE:
116889b17223SAlexander Motin 			case G_RAID_SUBDISK_S_RESYNC:
116989b17223SAlexander Motin 				break;
117089b17223SAlexander Motin 			case G_RAID_SUBDISK_S_REBUILD:
117189b17223SAlexander Motin 				if (offset + start >= sd->sd_rebuild_pos)
117289b17223SAlexander Motin 					goto nextdisk;
117389b17223SAlexander Motin 				break;
117489b17223SAlexander Motin 			default:
117589b17223SAlexander Motin 				goto nextdisk;
117689b17223SAlexander Motin 			}
1177489ba222SMitchell Horne 			error = g_raid_subdisk_kerneldump(sd, addr,
1178489ba222SMitchell Horne 			    offset + start, length);
117989b17223SAlexander Motin 			if (error != 0)
118089b17223SAlexander Motin 				return (error);
118189b17223SAlexander Motin nextdisk:
118289b17223SAlexander Motin 			if (++no >= vol->v_disks_count) {
118389b17223SAlexander Motin 				no = 0;
118489b17223SAlexander Motin 				offset += strip_size;
118589b17223SAlexander Motin 			}
118689b17223SAlexander Motin 		}
118789b17223SAlexander Motin 		remain -= length;
118889b17223SAlexander Motin 		addr += length;
118989b17223SAlexander Motin 		start = 0;
119089b17223SAlexander Motin 	}
119189b17223SAlexander Motin 	return (0);
119289b17223SAlexander Motin }
119389b17223SAlexander Motin 
119489b17223SAlexander Motin static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object * tr,void * argp)119589b17223SAlexander Motin g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
119689b17223SAlexander Motin {
119789b17223SAlexander Motin 	struct bio *bp;
119889b17223SAlexander Motin 	struct g_raid_subdisk *sd;
119989b17223SAlexander Motin 
120089b17223SAlexander Motin 	bp = (struct bio *)argp;
120189b17223SAlexander Motin 	sd = (struct g_raid_subdisk *)bp->bio_caller1;
120289b17223SAlexander Motin 	g_raid_subdisk_iostart(sd, bp);
120389b17223SAlexander Motin 
120489b17223SAlexander Motin 	return (0);
120589b17223SAlexander Motin }
120689b17223SAlexander Motin 
120789b17223SAlexander Motin static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object * tr)120889b17223SAlexander Motin g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
120989b17223SAlexander Motin {
121089b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
121189b17223SAlexander Motin 	struct g_raid_volume *vol;
121289b17223SAlexander Motin 
121389b17223SAlexander Motin 	vol = tr->tro_volume;
121489b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
121589b17223SAlexander Motin 	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
121689b17223SAlexander Motin 	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
121789b17223SAlexander Motin 	/* Compensate short rebuild I/Os. */
121889b17223SAlexander Motin 	if ((vol->v_disks_count % N) != 0 &&
121989b17223SAlexander Motin 	    vol->v_strip_size < g_raid1e_rebuild_slab) {
122089b17223SAlexander Motin 		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
122189b17223SAlexander Motin 		trs->trso_recover_slabs /= vol->v_strip_size;
122289b17223SAlexander Motin 	}
122389b17223SAlexander Motin 	if (trs->trso_type == TR_RAID1E_REBUILD)
122489b17223SAlexander Motin 		g_raid_tr_raid1e_rebuild_some(tr);
122589b17223SAlexander Motin 	return (0);
122689b17223SAlexander Motin }
122789b17223SAlexander Motin 
122889b17223SAlexander Motin static int
g_raid_tr_free_raid1e(struct g_raid_tr_object * tr)122989b17223SAlexander Motin g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
123089b17223SAlexander Motin {
123189b17223SAlexander Motin 	struct g_raid_tr_raid1e_object *trs;
123289b17223SAlexander Motin 
123389b17223SAlexander Motin 	trs = (struct g_raid_tr_raid1e_object *)tr;
123489b17223SAlexander Motin 
123589b17223SAlexander Motin 	if (trs->trso_buffer != NULL) {
123689b17223SAlexander Motin 		free(trs->trso_buffer, M_TR_RAID1E);
123789b17223SAlexander Motin 		trs->trso_buffer = NULL;
123889b17223SAlexander Motin 	}
123989b17223SAlexander Motin 	return (0);
124089b17223SAlexander Motin }
124189b17223SAlexander Motin 
/*
 * Instantiate G_RAID_TR_DECLARE for this module with the identifier
 * raid1e and the name string "RAID1E".
 * NOTE(review): the macro is defined outside this file; presumably it
 * registers the transformation class with the GEOM RAID framework --
 * confirm against its definition.
 */
1242c89d2fbeSAlexander Motin G_RAID_TR_DECLARE(raid1e, "RAID1E");
1243