189b17223SAlexander Motin /*-
2*4d846d26SWarner Losh * SPDX-License-Identifier: BSD-2-Clause
33728855aSPedro F. Giffuni *
489b17223SAlexander Motin * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
589b17223SAlexander Motin * All rights reserved.
689b17223SAlexander Motin *
789b17223SAlexander Motin * Redistribution and use in source and binary forms, with or without
889b17223SAlexander Motin * modification, are permitted provided that the following conditions
989b17223SAlexander Motin * are met:
1089b17223SAlexander Motin * 1. Redistributions of source code must retain the above copyright
1189b17223SAlexander Motin * notice, this list of conditions and the following disclaimer.
1289b17223SAlexander Motin * 2. Redistributions in binary form must reproduce the above copyright
1389b17223SAlexander Motin * notice, this list of conditions and the following disclaimer in the
1489b17223SAlexander Motin * documentation and/or other materials provided with the distribution.
1589b17223SAlexander Motin *
1689b17223SAlexander Motin * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
1789b17223SAlexander Motin * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
1889b17223SAlexander Motin * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
1989b17223SAlexander Motin * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
2089b17223SAlexander Motin * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
2189b17223SAlexander Motin * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
2289b17223SAlexander Motin * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
2389b17223SAlexander Motin * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
2489b17223SAlexander Motin * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
2589b17223SAlexander Motin * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
2689b17223SAlexander Motin * SUCH DAMAGE.
2789b17223SAlexander Motin */
2889b17223SAlexander Motin
2989b17223SAlexander Motin #include <sys/param.h>
3089b17223SAlexander Motin #include <sys/bio.h>
3189b17223SAlexander Motin #include <sys/endian.h>
3289b17223SAlexander Motin #include <sys/kernel.h>
3389b17223SAlexander Motin #include <sys/kobj.h>
3489b17223SAlexander Motin #include <sys/limits.h>
3589b17223SAlexander Motin #include <sys/lock.h>
3689b17223SAlexander Motin #include <sys/malloc.h>
3789b17223SAlexander Motin #include <sys/mutex.h>
3889b17223SAlexander Motin #include <sys/sysctl.h>
3989b17223SAlexander Motin #include <sys/systm.h>
4089b17223SAlexander Motin #include <geom/geom.h>
41ac03832eSConrad Meyer #include <geom/geom_dbg.h>
4289b17223SAlexander Motin #include "geom/raid/g_raid.h"
4389b17223SAlexander Motin #include "g_raid_tr_if.h"
4489b17223SAlexander Motin
4589b17223SAlexander Motin #define N 2
4689b17223SAlexander Motin
47c89d2fbeSAlexander Motin SYSCTL_DECL(_kern_geom_raid_raid1e);
4889b17223SAlexander Motin
4989b17223SAlexander Motin #define RAID1E_REBUILD_SLAB (1 << 20) /* One transation in a rebuild */
5089b17223SAlexander Motin static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
51af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
5289b17223SAlexander Motin &g_raid1e_rebuild_slab, 0,
5389b17223SAlexander Motin "Amount of the disk to rebuild each read/write cycle of the rebuild.");
5489b17223SAlexander Motin
5589b17223SAlexander Motin #define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
5689b17223SAlexander Motin static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
57af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
5889b17223SAlexander Motin &g_raid1e_rebuild_fair_io, 0,
5989b17223SAlexander Motin "Fraction of the I/O bandwidth to use when disk busy for rebuild.");
6089b17223SAlexander Motin
6189b17223SAlexander Motin #define RAID1E_REBUILD_CLUSTER_IDLE 100
6289b17223SAlexander Motin static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
63af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
6489b17223SAlexander Motin &g_raid1e_rebuild_cluster_idle, 0,
6589b17223SAlexander Motin "Number of slabs to do each time we trigger a rebuild cycle");
6689b17223SAlexander Motin
6789b17223SAlexander Motin #define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
6889b17223SAlexander Motin static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
69af3b2549SHans Petter Selasky SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
7089b17223SAlexander Motin &g_raid1e_rebuild_meta_update, 0,
7189b17223SAlexander Motin "When to update the meta data.");
7289b17223SAlexander Motin
7389b17223SAlexander Motin static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");
7489b17223SAlexander Motin
7589b17223SAlexander Motin #define TR_RAID1E_NONE 0
7689b17223SAlexander Motin #define TR_RAID1E_REBUILD 1
7789b17223SAlexander Motin #define TR_RAID1E_RESYNC 2
7889b17223SAlexander Motin
7989b17223SAlexander Motin #define TR_RAID1E_F_DOING_SOME 0x1
8089b17223SAlexander Motin #define TR_RAID1E_F_LOCKED 0x2
8189b17223SAlexander Motin #define TR_RAID1E_F_ABORT 0x4
8289b17223SAlexander Motin
8389b17223SAlexander Motin struct g_raid_tr_raid1e_object {
8489b17223SAlexander Motin struct g_raid_tr_object trso_base;
8589b17223SAlexander Motin int trso_starting;
8689b17223SAlexander Motin int trso_stopping;
8789b17223SAlexander Motin int trso_type;
8889b17223SAlexander Motin int trso_recover_slabs; /* slabs before rest */
8989b17223SAlexander Motin int trso_fair_io;
9089b17223SAlexander Motin int trso_meta_update;
9189b17223SAlexander Motin int trso_flags;
9289b17223SAlexander Motin struct g_raid_subdisk *trso_failed_sd; /* like per volume */
9389b17223SAlexander Motin void *trso_buffer; /* Buffer space */
9489b17223SAlexander Motin off_t trso_lock_pos; /* Locked range start. */
9589b17223SAlexander Motin off_t trso_lock_len; /* Locked range length. */
9689b17223SAlexander Motin struct bio trso_bio;
9789b17223SAlexander Motin };
9889b17223SAlexander Motin
9989b17223SAlexander Motin static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
10089b17223SAlexander Motin static g_raid_tr_event_t g_raid_tr_event_raid1e;
10189b17223SAlexander Motin static g_raid_tr_start_t g_raid_tr_start_raid1e;
10289b17223SAlexander Motin static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
10389b17223SAlexander Motin static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
10489b17223SAlexander Motin static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
10589b17223SAlexander Motin static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
10689b17223SAlexander Motin static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
10789b17223SAlexander Motin static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
10889b17223SAlexander Motin static g_raid_tr_free_t g_raid_tr_free_raid1e;
10989b17223SAlexander Motin
11089b17223SAlexander Motin static kobj_method_t g_raid_tr_raid1e_methods[] = {
11189b17223SAlexander Motin KOBJMETHOD(g_raid_tr_taste, g_raid_tr_taste_raid1e),
11289b17223SAlexander Motin KOBJMETHOD(g_raid_tr_event, g_raid_tr_event_raid1e),
11389b17223SAlexander Motin KOBJMETHOD(g_raid_tr_start, g_raid_tr_start_raid1e),
11489b17223SAlexander Motin KOBJMETHOD(g_raid_tr_stop, g_raid_tr_stop_raid1e),
11589b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iostart, g_raid_tr_iostart_raid1e),
11689b17223SAlexander Motin KOBJMETHOD(g_raid_tr_iodone, g_raid_tr_iodone_raid1e),
11789b17223SAlexander Motin KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
11889b17223SAlexander Motin KOBJMETHOD(g_raid_tr_locked, g_raid_tr_locked_raid1e),
11989b17223SAlexander Motin KOBJMETHOD(g_raid_tr_idle, g_raid_tr_idle_raid1e),
12089b17223SAlexander Motin KOBJMETHOD(g_raid_tr_free, g_raid_tr_free_raid1e),
12189b17223SAlexander Motin { 0, 0 }
12289b17223SAlexander Motin };
12389b17223SAlexander Motin
12489b17223SAlexander Motin static struct g_raid_tr_class g_raid_tr_raid1e_class = {
12589b17223SAlexander Motin "RAID1E",
12689b17223SAlexander Motin g_raid_tr_raid1e_methods,
12789b17223SAlexander Motin sizeof(struct g_raid_tr_raid1e_object),
128c89d2fbeSAlexander Motin .trc_enable = 1,
129b43560abSAlexander Motin .trc_priority = 200,
130b43560abSAlexander Motin .trc_accept_unmapped = 1
13189b17223SAlexander Motin };
13289b17223SAlexander Motin
13389b17223SAlexander Motin static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
13489b17223SAlexander Motin static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
13589b17223SAlexander Motin struct g_raid_subdisk *sd);
13689b17223SAlexander Motin static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
13789b17223SAlexander Motin int no, off_t off, off_t len, u_int mask);
13889b17223SAlexander Motin
13989b17223SAlexander Motin static inline void
V2P(struct g_raid_volume * vol,off_t virt,int * disk,off_t * offset,off_t * start)14089b17223SAlexander Motin V2P(struct g_raid_volume *vol, off_t virt,
14189b17223SAlexander Motin int *disk, off_t *offset, off_t *start)
14289b17223SAlexander Motin {
14389b17223SAlexander Motin off_t nstrip;
14489b17223SAlexander Motin u_int strip_size;
14589b17223SAlexander Motin
14689b17223SAlexander Motin strip_size = vol->v_strip_size;
14789b17223SAlexander Motin /* Strip number. */
14889b17223SAlexander Motin nstrip = virt / strip_size;
14989b17223SAlexander Motin /* Start position in strip. */
15089b17223SAlexander Motin *start = virt % strip_size;
15189b17223SAlexander Motin /* Disk number. */
15289b17223SAlexander Motin *disk = (nstrip * N) % vol->v_disks_count;
15389b17223SAlexander Motin /* Strip start position in disk. */
15489b17223SAlexander Motin *offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
15589b17223SAlexander Motin }
15689b17223SAlexander Motin
15789b17223SAlexander Motin static inline void
P2V(struct g_raid_volume * vol,int disk,off_t offset,off_t * virt,int * copy)15889b17223SAlexander Motin P2V(struct g_raid_volume *vol, int disk, off_t offset,
15989b17223SAlexander Motin off_t *virt, int *copy)
16089b17223SAlexander Motin {
16189b17223SAlexander Motin off_t nstrip, start;
16289b17223SAlexander Motin u_int strip_size;
16389b17223SAlexander Motin
16489b17223SAlexander Motin strip_size = vol->v_strip_size;
16589b17223SAlexander Motin /* Start position in strip. */
16689b17223SAlexander Motin start = offset % strip_size;
16789b17223SAlexander Motin /* Physical strip number. */
16889b17223SAlexander Motin nstrip = (offset / strip_size) * vol->v_disks_count + disk;
16989b17223SAlexander Motin /* Number of physical strip (copy) inside virtual strip. */
17089b17223SAlexander Motin *copy = nstrip % N;
17189b17223SAlexander Motin /* Offset in virtual space. */
17289b17223SAlexander Motin *virt = (nstrip / N) * strip_size + start;
17389b17223SAlexander Motin }
17489b17223SAlexander Motin
17589b17223SAlexander Motin static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object * tr,struct g_raid_volume * vol)17689b17223SAlexander Motin g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
17789b17223SAlexander Motin {
17889b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
17989b17223SAlexander Motin
18089b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
18189b17223SAlexander Motin if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
182dbb2e755SAlexander Motin tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
18389b17223SAlexander Motin return (G_RAID_TR_TASTE_FAIL);
18489b17223SAlexander Motin trs->trso_starting = 1;
18589b17223SAlexander Motin return (G_RAID_TR_TASTE_SUCCEED);
18689b17223SAlexander Motin }
18789b17223SAlexander Motin
18889b17223SAlexander Motin static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume * vol)18989b17223SAlexander Motin g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
19089b17223SAlexander Motin {
19189b17223SAlexander Motin struct g_raid_softc *sc;
19289b17223SAlexander Motin struct g_raid_subdisk *sd, *bestsd, *worstsd;
19389b17223SAlexander Motin int i, j, state, sstate;
19489b17223SAlexander Motin
19589b17223SAlexander Motin sc = vol->v_softc;
19689b17223SAlexander Motin state = G_RAID_VOLUME_S_OPTIMAL;
19789b17223SAlexander Motin for (i = 0; i < vol->v_disks_count / N; i++) {
19889b17223SAlexander Motin bestsd = &vol->v_subdisks[i * N];
19989b17223SAlexander Motin for (j = 1; j < N; j++) {
20089b17223SAlexander Motin sd = &vol->v_subdisks[i * N + j];
20189b17223SAlexander Motin if (sd->sd_state > bestsd->sd_state)
20289b17223SAlexander Motin bestsd = sd;
20389b17223SAlexander Motin else if (sd->sd_state == bestsd->sd_state &&
20489b17223SAlexander Motin (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
20589b17223SAlexander Motin sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
20689b17223SAlexander Motin sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
20789b17223SAlexander Motin bestsd = sd;
20889b17223SAlexander Motin }
20989b17223SAlexander Motin if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
21089b17223SAlexander Motin bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
21189b17223SAlexander Motin /* We found reasonable candidate. */
21289b17223SAlexander Motin G_RAID_DEBUG1(1, sc,
21389b17223SAlexander Motin "Promote subdisk %s:%d from %s to ACTIVE.",
21489b17223SAlexander Motin vol->v_name, bestsd->sd_pos,
21589b17223SAlexander Motin g_raid_subdisk_state2str(bestsd->sd_state));
21689b17223SAlexander Motin g_raid_change_subdisk_state(bestsd,
21789b17223SAlexander Motin G_RAID_SUBDISK_S_ACTIVE);
21889b17223SAlexander Motin g_raid_write_metadata(sc,
21989b17223SAlexander Motin vol, bestsd, bestsd->sd_disk);
22089b17223SAlexander Motin }
22189b17223SAlexander Motin worstsd = &vol->v_subdisks[i * N];
22289b17223SAlexander Motin for (j = 1; j < N; j++) {
22389b17223SAlexander Motin sd = &vol->v_subdisks[i * N + j];
22489b17223SAlexander Motin if (sd->sd_state < worstsd->sd_state)
22589b17223SAlexander Motin worstsd = sd;
22689b17223SAlexander Motin }
22789b17223SAlexander Motin if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
22889b17223SAlexander Motin sstate = G_RAID_VOLUME_S_OPTIMAL;
22989b17223SAlexander Motin else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
23089b17223SAlexander Motin sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
23189b17223SAlexander Motin else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
23289b17223SAlexander Motin sstate = G_RAID_VOLUME_S_DEGRADED;
23389b17223SAlexander Motin else
23489b17223SAlexander Motin sstate = G_RAID_VOLUME_S_BROKEN;
23589b17223SAlexander Motin if (sstate < state)
23689b17223SAlexander Motin state = sstate;
23789b17223SAlexander Motin }
23889b17223SAlexander Motin return (state);
23989b17223SAlexander Motin }
24089b17223SAlexander Motin
24189b17223SAlexander Motin static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume * vol)24289b17223SAlexander Motin g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
24389b17223SAlexander Motin {
24489b17223SAlexander Motin struct g_raid_softc *sc;
24589b17223SAlexander Motin struct g_raid_subdisk *sd, *bestsd, *worstsd;
24689b17223SAlexander Motin int i, j, state, sstate;
24789b17223SAlexander Motin
24889b17223SAlexander Motin sc = vol->v_softc;
24989b17223SAlexander Motin if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
25089b17223SAlexander Motin vol->v_disks_count)
25189b17223SAlexander Motin return (G_RAID_VOLUME_S_OPTIMAL);
25289b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) {
25389b17223SAlexander Motin sd = &vol->v_subdisks[i];
25489b17223SAlexander Motin if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
25589b17223SAlexander Motin /* We found reasonable candidate. */
25689b17223SAlexander Motin G_RAID_DEBUG1(1, sc,
25789b17223SAlexander Motin "Promote subdisk %s:%d from %s to STALE.",
25889b17223SAlexander Motin vol->v_name, sd->sd_pos,
25989b17223SAlexander Motin g_raid_subdisk_state2str(sd->sd_state));
26089b17223SAlexander Motin g_raid_change_subdisk_state(sd,
26189b17223SAlexander Motin G_RAID_SUBDISK_S_STALE);
26289b17223SAlexander Motin g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
26389b17223SAlexander Motin }
26489b17223SAlexander Motin }
26589b17223SAlexander Motin state = G_RAID_VOLUME_S_OPTIMAL;
26689b17223SAlexander Motin for (i = 0; i < vol->v_disks_count; i++) {
26789b17223SAlexander Motin bestsd = &vol->v_subdisks[i];
26889b17223SAlexander Motin worstsd = &vol->v_subdisks[i];
26989b17223SAlexander Motin for (j = 1; j < N; j++) {
27089b17223SAlexander Motin sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
27189b17223SAlexander Motin if (sd->sd_state > bestsd->sd_state)
27289b17223SAlexander Motin bestsd = sd;
27389b17223SAlexander Motin else if (sd->sd_state == bestsd->sd_state &&
27489b17223SAlexander Motin (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
27589b17223SAlexander Motin sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
27689b17223SAlexander Motin sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
27789b17223SAlexander Motin bestsd = sd;
27889b17223SAlexander Motin if (sd->sd_state < worstsd->sd_state)
27989b17223SAlexander Motin worstsd = sd;
28089b17223SAlexander Motin }
28189b17223SAlexander Motin if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
28289b17223SAlexander Motin sstate = G_RAID_VOLUME_S_OPTIMAL;
28389b17223SAlexander Motin else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
28489b17223SAlexander Motin sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
28589b17223SAlexander Motin else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
28689b17223SAlexander Motin sstate = G_RAID_VOLUME_S_DEGRADED;
28789b17223SAlexander Motin else
28889b17223SAlexander Motin sstate = G_RAID_VOLUME_S_BROKEN;
28989b17223SAlexander Motin if (sstate < state)
29089b17223SAlexander Motin state = sstate;
29189b17223SAlexander Motin }
29289b17223SAlexander Motin return (state);
29389b17223SAlexander Motin }
29489b17223SAlexander Motin
29589b17223SAlexander Motin static int
g_raid_tr_update_state_raid1e(struct g_raid_volume * vol,struct g_raid_subdisk * sd)29689b17223SAlexander Motin g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
29789b17223SAlexander Motin struct g_raid_subdisk *sd)
29889b17223SAlexander Motin {
29989b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
30089b17223SAlexander Motin struct g_raid_softc *sc;
30189b17223SAlexander Motin u_int s;
30289b17223SAlexander Motin
30389b17223SAlexander Motin sc = vol->v_softc;
30489b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
30589b17223SAlexander Motin if (trs->trso_stopping &&
30689b17223SAlexander Motin (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
30789b17223SAlexander Motin s = G_RAID_VOLUME_S_STOPPED;
30889b17223SAlexander Motin else if (trs->trso_starting)
30989b17223SAlexander Motin s = G_RAID_VOLUME_S_STARTING;
31089b17223SAlexander Motin else {
31189b17223SAlexander Motin if ((vol->v_disks_count % N) == 0)
31289b17223SAlexander Motin s = g_raid_tr_update_state_raid1e_even(vol);
31389b17223SAlexander Motin else
31489b17223SAlexander Motin s = g_raid_tr_update_state_raid1e_odd(vol);
31589b17223SAlexander Motin }
31689b17223SAlexander Motin if (s != vol->v_state) {
31789b17223SAlexander Motin g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
31889b17223SAlexander Motin G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
31989b17223SAlexander Motin G_RAID_EVENT_VOLUME);
32089b17223SAlexander Motin g_raid_change_volume_state(vol, s);
32189b17223SAlexander Motin if (!trs->trso_starting && !trs->trso_stopping)
32289b17223SAlexander Motin g_raid_write_metadata(sc, vol, NULL, NULL);
32389b17223SAlexander Motin }
32489b17223SAlexander Motin if (!trs->trso_starting && !trs->trso_stopping)
32589b17223SAlexander Motin g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
32689b17223SAlexander Motin return (0);
32789b17223SAlexander Motin }
32889b17223SAlexander Motin
32989b17223SAlexander Motin static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc * sc,struct g_raid_subdisk * sd,struct g_raid_disk * disk)33089b17223SAlexander Motin g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
33189b17223SAlexander Motin struct g_raid_disk *disk)
33289b17223SAlexander Motin {
333ef844ef7SAlexander Motin struct g_raid_volume *vol;
334ef844ef7SAlexander Motin
335ef844ef7SAlexander Motin vol = sd->sd_volume;
33689b17223SAlexander Motin /*
33789b17223SAlexander Motin * We don't fail the last disk in the pack, since it still has decent
33889b17223SAlexander Motin * data on it and that's better than failing the disk if it is the root
33989b17223SAlexander Motin * file system.
34089b17223SAlexander Motin *
34189b17223SAlexander Motin * XXX should this be controlled via a tunable? It makes sense for
34289b17223SAlexander Motin * the volume that has / on it. I can't think of a case where we'd
34389b17223SAlexander Motin * want the volume to go away on this kind of event.
34489b17223SAlexander Motin */
345ef844ef7SAlexander Motin if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
346ef844ef7SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
347ef844ef7SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
348ef844ef7SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
349ef844ef7SAlexander Motin vol->v_disks_count) &&
350ef844ef7SAlexander Motin (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
35189b17223SAlexander Motin return;
35289b17223SAlexander Motin g_raid_fail_disk(sc, sd, disk);
35389b17223SAlexander Motin }
35489b17223SAlexander Motin
35589b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object * trs)35689b17223SAlexander Motin g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
35789b17223SAlexander Motin {
35889b17223SAlexander Motin struct g_raid_volume *vol;
35989b17223SAlexander Motin struct g_raid_subdisk *sd;
36089b17223SAlexander Motin
36189b17223SAlexander Motin vol = trs->trso_base.tro_volume;
36289b17223SAlexander Motin sd = trs->trso_failed_sd;
36389b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
36489b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1E);
36589b17223SAlexander Motin trs->trso_buffer = NULL;
36689b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
36789b17223SAlexander Motin trs->trso_type = TR_RAID1E_NONE;
36889b17223SAlexander Motin trs->trso_recover_slabs = 0;
36989b17223SAlexander Motin trs->trso_failed_sd = NULL;
37089b17223SAlexander Motin g_raid_tr_update_state_raid1e(vol, NULL);
37189b17223SAlexander Motin }
37289b17223SAlexander Motin
37389b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object * tr)37489b17223SAlexander Motin g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
37589b17223SAlexander Motin {
37689b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
37789b17223SAlexander Motin struct g_raid_subdisk *sd;
37889b17223SAlexander Motin
37989b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
38089b17223SAlexander Motin sd = trs->trso_failed_sd;
38189b17223SAlexander Motin G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
38289b17223SAlexander Motin "Subdisk %s:%d-%s rebuild completed.",
38389b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos,
38489b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
38589b17223SAlexander Motin g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
38689b17223SAlexander Motin sd->sd_rebuild_pos = 0;
38789b17223SAlexander Motin g_raid_tr_raid1e_rebuild_done(trs);
38889b17223SAlexander Motin }
38989b17223SAlexander Motin
39089b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object * tr)39189b17223SAlexander Motin g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
39289b17223SAlexander Motin {
39389b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
39489b17223SAlexander Motin struct g_raid_subdisk *sd;
39589b17223SAlexander Motin struct g_raid_volume *vol;
39689b17223SAlexander Motin
39789b17223SAlexander Motin vol = tr->tro_volume;
39889b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
39989b17223SAlexander Motin sd = trs->trso_failed_sd;
40089b17223SAlexander Motin if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
40189b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc,
40289b17223SAlexander Motin "Subdisk %s:%d-%s rebuild is aborting.",
40389b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos,
40489b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
40589b17223SAlexander Motin trs->trso_flags |= TR_RAID1E_F_ABORT;
40689b17223SAlexander Motin } else {
40789b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc,
40889b17223SAlexander Motin "Subdisk %s:%d-%s rebuild aborted.",
40989b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos,
41089b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
41189b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_ABORT;
41289b17223SAlexander Motin if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
41389b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
41489b17223SAlexander Motin g_raid_unlock_range(tr->tro_volume,
41589b17223SAlexander Motin trs->trso_lock_pos, trs->trso_lock_len);
41689b17223SAlexander Motin }
41789b17223SAlexander Motin g_raid_tr_raid1e_rebuild_done(trs);
41889b17223SAlexander Motin }
41989b17223SAlexander Motin }
42089b17223SAlexander Motin
42189b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object * tr)42289b17223SAlexander Motin g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
42389b17223SAlexander Motin {
42489b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
42589b17223SAlexander Motin struct g_raid_softc *sc;
42689b17223SAlexander Motin struct g_raid_volume *vol;
42789b17223SAlexander Motin struct g_raid_subdisk *sd;
42889b17223SAlexander Motin struct bio *bp;
42989b17223SAlexander Motin off_t len, virtual, vend, offset, start;
43089b17223SAlexander Motin int disk, copy, best;
43189b17223SAlexander Motin
43289b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
43389b17223SAlexander Motin if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
43489b17223SAlexander Motin return;
43589b17223SAlexander Motin vol = tr->tro_volume;
43689b17223SAlexander Motin sc = vol->v_softc;
43789b17223SAlexander Motin sd = trs->trso_failed_sd;
43889b17223SAlexander Motin
43989b17223SAlexander Motin while (1) {
44089b17223SAlexander Motin if (sd->sd_rebuild_pos >= sd->sd_size) {
44189b17223SAlexander Motin g_raid_tr_raid1e_rebuild_finish(tr);
44289b17223SAlexander Motin return;
44389b17223SAlexander Motin }
44489b17223SAlexander Motin /* Get virtual offset from physical rebuild position. */
44589b17223SAlexander Motin P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, ©);
44689b17223SAlexander Motin /* Get physical offset back to get first stripe position. */
44789b17223SAlexander Motin V2P(vol, virtual, &disk, &offset, &start);
44889b17223SAlexander Motin /* Calculate contignous data length. */
44989b17223SAlexander Motin len = MIN(g_raid1e_rebuild_slab,
45089b17223SAlexander Motin sd->sd_size - sd->sd_rebuild_pos);
45189b17223SAlexander Motin if ((vol->v_disks_count % N) != 0)
45289b17223SAlexander Motin len = MIN(len, vol->v_strip_size - start);
45389b17223SAlexander Motin /* Find disk with most accurate data. */
45489b17223SAlexander Motin best = g_raid_tr_raid1e_select_read_disk(vol, disk,
45589b17223SAlexander Motin offset + start, len, 0);
45689b17223SAlexander Motin if (best < 0) {
45789b17223SAlexander Motin /* There is no any valid disk. */
45889b17223SAlexander Motin g_raid_tr_raid1e_rebuild_abort(tr);
45989b17223SAlexander Motin return;
46089b17223SAlexander Motin } else if (best != copy) {
46189b17223SAlexander Motin /* Some other disk has better data. */
46289b17223SAlexander Motin break;
46389b17223SAlexander Motin }
46489b17223SAlexander Motin /* We have the most accurate data. Skip the range. */
46589b17223SAlexander Motin G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
46689b17223SAlexander Motin sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
46789b17223SAlexander Motin sd->sd_rebuild_pos += len;
46889b17223SAlexander Motin }
46989b17223SAlexander Motin
47089b17223SAlexander Motin bp = &trs->trso_bio;
47189b17223SAlexander Motin memset(bp, 0, sizeof(*bp));
47289b17223SAlexander Motin bp->bio_offset = offset + start +
47389b17223SAlexander Motin ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
47489b17223SAlexander Motin bp->bio_length = len;
47589b17223SAlexander Motin bp->bio_data = trs->trso_buffer;
47689b17223SAlexander Motin bp->bio_cmd = BIO_READ;
47789b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
47889b17223SAlexander Motin bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
47989b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
48089b17223SAlexander Motin /*
48189b17223SAlexander Motin * If we are crossing stripe boundary, correct affected virtual
48289b17223SAlexander Motin * range we should lock.
48389b17223SAlexander Motin */
48489b17223SAlexander Motin if (start + len > vol->v_strip_size) {
48589b17223SAlexander Motin P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, ©);
48689b17223SAlexander Motin len = vend - virtual;
48789b17223SAlexander Motin }
48889b17223SAlexander Motin trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
48989b17223SAlexander Motin trs->trso_flags |= TR_RAID1E_F_LOCKED;
49089b17223SAlexander Motin trs->trso_lock_pos = virtual;
49189b17223SAlexander Motin trs->trso_lock_len = len;
49289b17223SAlexander Motin /* Lock callback starts I/O */
49389b17223SAlexander Motin g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
49489b17223SAlexander Motin }
49589b17223SAlexander Motin
49689b17223SAlexander Motin static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object * tr)49789b17223SAlexander Motin g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
49889b17223SAlexander Motin {
49989b17223SAlexander Motin struct g_raid_volume *vol;
50089b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
50189b17223SAlexander Motin struct g_raid_subdisk *sd;
50289b17223SAlexander Motin
50389b17223SAlexander Motin vol = tr->tro_volume;
50489b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
50589b17223SAlexander Motin if (trs->trso_failed_sd) {
50689b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc,
50789b17223SAlexander Motin "Already rebuild in start rebuild. pos %jd\n",
50889b17223SAlexander Motin (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
50989b17223SAlexander Motin return;
51089b17223SAlexander Motin }
51189b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
51289b17223SAlexander Motin if (sd == NULL)
51389b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
51489b17223SAlexander Motin if (sd == NULL) {
51589b17223SAlexander Motin sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
51689b17223SAlexander Motin if (sd != NULL) {
51789b17223SAlexander Motin sd->sd_rebuild_pos = 0;
51889b17223SAlexander Motin g_raid_change_subdisk_state(sd,
51989b17223SAlexander Motin G_RAID_SUBDISK_S_RESYNC);
52089b17223SAlexander Motin g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
52189b17223SAlexander Motin } else {
52289b17223SAlexander Motin sd = g_raid_get_subdisk(vol,
52389b17223SAlexander Motin G_RAID_SUBDISK_S_UNINITIALIZED);
52489b17223SAlexander Motin if (sd == NULL)
52589b17223SAlexander Motin sd = g_raid_get_subdisk(vol,
52689b17223SAlexander Motin G_RAID_SUBDISK_S_NEW);
52789b17223SAlexander Motin if (sd != NULL) {
52889b17223SAlexander Motin sd->sd_rebuild_pos = 0;
52989b17223SAlexander Motin g_raid_change_subdisk_state(sd,
53089b17223SAlexander Motin G_RAID_SUBDISK_S_REBUILD);
53189b17223SAlexander Motin g_raid_write_metadata(vol->v_softc,
53289b17223SAlexander Motin vol, sd, NULL);
53389b17223SAlexander Motin }
53489b17223SAlexander Motin }
53589b17223SAlexander Motin }
53689b17223SAlexander Motin if (sd == NULL) {
53789b17223SAlexander Motin G_RAID_DEBUG1(1, vol->v_softc,
53889b17223SAlexander Motin "No failed disk to rebuild. night night.");
53989b17223SAlexander Motin return;
54089b17223SAlexander Motin }
54189b17223SAlexander Motin trs->trso_failed_sd = sd;
54289b17223SAlexander Motin G_RAID_DEBUG1(0, vol->v_softc,
54389b17223SAlexander Motin "Subdisk %s:%d-%s rebuild start at %jd.",
54489b17223SAlexander Motin sd->sd_volume->v_name, sd->sd_pos,
54589b17223SAlexander Motin sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
54689b17223SAlexander Motin trs->trso_failed_sd->sd_rebuild_pos);
54789b17223SAlexander Motin trs->trso_type = TR_RAID1E_REBUILD;
54889b17223SAlexander Motin trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
54989b17223SAlexander Motin trs->trso_meta_update = g_raid1e_rebuild_meta_update;
55089b17223SAlexander Motin g_raid_tr_raid1e_rebuild_some(tr);
55189b17223SAlexander Motin }
55289b17223SAlexander Motin
55389b17223SAlexander Motin static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd)55489b17223SAlexander Motin g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
55589b17223SAlexander Motin struct g_raid_subdisk *sd)
55689b17223SAlexander Motin {
55789b17223SAlexander Motin struct g_raid_volume *vol;
55889b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
55989b17223SAlexander Motin int nr;
56089b17223SAlexander Motin
56189b17223SAlexander Motin vol = tr->tro_volume;
56289b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
56389b17223SAlexander Motin if (trs->trso_stopping)
56489b17223SAlexander Motin return;
56589b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
56689b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
56789b17223SAlexander Motin switch(trs->trso_type) {
56889b17223SAlexander Motin case TR_RAID1E_NONE:
56989b17223SAlexander Motin if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
57089b17223SAlexander Motin return;
57189b17223SAlexander Motin if (nr == 0) {
57289b17223SAlexander Motin nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
57389b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
57489b17223SAlexander Motin g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
57589b17223SAlexander Motin if (nr == 0)
57689b17223SAlexander Motin return;
57789b17223SAlexander Motin }
57889b17223SAlexander Motin g_raid_tr_raid1e_rebuild_start(tr);
57989b17223SAlexander Motin break;
58089b17223SAlexander Motin case TR_RAID1E_REBUILD:
58189b17223SAlexander Motin if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
58289b17223SAlexander Motin trs->trso_failed_sd == sd)
58389b17223SAlexander Motin g_raid_tr_raid1e_rebuild_abort(tr);
58489b17223SAlexander Motin break;
58589b17223SAlexander Motin case TR_RAID1E_RESYNC:
58689b17223SAlexander Motin break;
58789b17223SAlexander Motin }
58889b17223SAlexander Motin }
58989b17223SAlexander Motin
59089b17223SAlexander Motin static int
g_raid_tr_event_raid1e(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,u_int event)59189b17223SAlexander Motin g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
59289b17223SAlexander Motin struct g_raid_subdisk *sd, u_int event)
59389b17223SAlexander Motin {
59489b17223SAlexander Motin
59589b17223SAlexander Motin g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
59689b17223SAlexander Motin return (0);
59789b17223SAlexander Motin }
59889b17223SAlexander Motin
59989b17223SAlexander Motin static int
g_raid_tr_start_raid1e(struct g_raid_tr_object * tr)60089b17223SAlexander Motin g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
60189b17223SAlexander Motin {
60289b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
60389b17223SAlexander Motin struct g_raid_volume *vol;
60489b17223SAlexander Motin
60589b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
60689b17223SAlexander Motin vol = tr->tro_volume;
60789b17223SAlexander Motin trs->trso_starting = 0;
60889b17223SAlexander Motin g_raid_tr_update_state_raid1e(vol, NULL);
60989b17223SAlexander Motin return (0);
61089b17223SAlexander Motin }
61189b17223SAlexander Motin
61289b17223SAlexander Motin static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object * tr)61389b17223SAlexander Motin g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
61489b17223SAlexander Motin {
61589b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
61689b17223SAlexander Motin struct g_raid_volume *vol;
61789b17223SAlexander Motin
61889b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
61989b17223SAlexander Motin vol = tr->tro_volume;
62089b17223SAlexander Motin trs->trso_starting = 0;
62189b17223SAlexander Motin trs->trso_stopping = 1;
62289b17223SAlexander Motin g_raid_tr_update_state_raid1e(vol, NULL);
62389b17223SAlexander Motin return (0);
62489b17223SAlexander Motin }
62589b17223SAlexander Motin
62689b17223SAlexander Motin /*
62789b17223SAlexander Motin * Select the disk to read from. Take into account: subdisk state, running
62889b17223SAlexander Motin * error recovery, average disk load, head position and possible cache hits.
62989b17223SAlexander Motin */
63089b17223SAlexander Motin #define ABS(x) (((x) >= 0) ? (x) : (-(x)))
63189b17223SAlexander Motin static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume * vol,int no,off_t off,off_t len,u_int mask)63289b17223SAlexander Motin g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
63389b17223SAlexander Motin int no, off_t off, off_t len, u_int mask)
63489b17223SAlexander Motin {
63589b17223SAlexander Motin struct g_raid_subdisk *sd;
63689b17223SAlexander Motin off_t offset;
63789b17223SAlexander Motin int i, best, prio, bestprio;
63889b17223SAlexander Motin
63989b17223SAlexander Motin best = -1;
64089b17223SAlexander Motin bestprio = INT_MAX;
64189b17223SAlexander Motin for (i = 0; i < N; i++) {
64289b17223SAlexander Motin sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
64389b17223SAlexander Motin offset = off;
64489b17223SAlexander Motin if (no + i >= vol->v_disks_count)
64589b17223SAlexander Motin offset += vol->v_strip_size;
64689b17223SAlexander Motin
64789b17223SAlexander Motin prio = G_RAID_SUBDISK_LOAD(sd);
64889b17223SAlexander Motin if ((mask & (1 << sd->sd_pos)) != 0)
64989b17223SAlexander Motin continue;
65089b17223SAlexander Motin switch (sd->sd_state) {
65189b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE:
65289b17223SAlexander Motin break;
65389b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC:
65489b17223SAlexander Motin if (offset + off < sd->sd_rebuild_pos)
65589b17223SAlexander Motin break;
65689b17223SAlexander Motin /* FALLTHROUGH */
65789b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE:
65889b17223SAlexander Motin prio += i << 24;
65989b17223SAlexander Motin break;
66089b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD:
66189b17223SAlexander Motin if (offset + off < sd->sd_rebuild_pos)
66289b17223SAlexander Motin break;
66389b17223SAlexander Motin /* FALLTHROUGH */
66489b17223SAlexander Motin default:
66589b17223SAlexander Motin continue;
66689b17223SAlexander Motin }
66789b17223SAlexander Motin prio += min(sd->sd_recovery, 255) << 16;
66889b17223SAlexander Motin /* If disk head is precisely in position - highly prefer it. */
66989b17223SAlexander Motin if (G_RAID_SUBDISK_POS(sd) == offset)
67089b17223SAlexander Motin prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
67189b17223SAlexander Motin else
67289b17223SAlexander Motin /* If disk head is close to position - prefer it. */
67389b17223SAlexander Motin if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
67489b17223SAlexander Motin G_RAID_SUBDISK_TRACK_SIZE)
67589b17223SAlexander Motin prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
67689b17223SAlexander Motin if (prio < bestprio) {
67789b17223SAlexander Motin bestprio = prio;
67889b17223SAlexander Motin best = i;
67989b17223SAlexander Motin }
68089b17223SAlexander Motin }
68189b17223SAlexander Motin return (best);
68289b17223SAlexander Motin }
68389b17223SAlexander Motin
68489b17223SAlexander Motin static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object * tr,struct bio * bp)68589b17223SAlexander Motin g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
68689b17223SAlexander Motin {
68789b17223SAlexander Motin struct g_raid_volume *vol;
68889b17223SAlexander Motin struct g_raid_subdisk *sd;
68989b17223SAlexander Motin struct bio_queue_head queue;
69089b17223SAlexander Motin struct bio *cbp;
69189b17223SAlexander Motin char *addr;
69289b17223SAlexander Motin off_t offset, start, length, remain;
69389b17223SAlexander Motin u_int no, strip_size;
69489b17223SAlexander Motin int best;
69589b17223SAlexander Motin
69689b17223SAlexander Motin vol = tr->tro_volume;
697b43560abSAlexander Motin if ((bp->bio_flags & BIO_UNMAPPED) != 0)
698b43560abSAlexander Motin addr = NULL;
699b43560abSAlexander Motin else
70089b17223SAlexander Motin addr = bp->bio_data;
70189b17223SAlexander Motin strip_size = vol->v_strip_size;
70289b17223SAlexander Motin V2P(vol, bp->bio_offset, &no, &offset, &start);
70389b17223SAlexander Motin remain = bp->bio_length;
70489b17223SAlexander Motin bioq_init(&queue);
70589b17223SAlexander Motin while (remain > 0) {
70689b17223SAlexander Motin length = MIN(strip_size - start, remain);
70789b17223SAlexander Motin best = g_raid_tr_raid1e_select_read_disk(vol,
70889b17223SAlexander Motin no, offset, length, 0);
70989b17223SAlexander Motin KASSERT(best >= 0, ("No readable disk in volume %s!",
71089b17223SAlexander Motin vol->v_name));
71189b17223SAlexander Motin no += best;
71289b17223SAlexander Motin if (no >= vol->v_disks_count) {
71389b17223SAlexander Motin no -= vol->v_disks_count;
71489b17223SAlexander Motin offset += strip_size;
71589b17223SAlexander Motin }
71689b17223SAlexander Motin cbp = g_clone_bio(bp);
71789b17223SAlexander Motin if (cbp == NULL)
71889b17223SAlexander Motin goto failure;
71989b17223SAlexander Motin cbp->bio_offset = offset + start;
72089b17223SAlexander Motin cbp->bio_length = length;
721b43560abSAlexander Motin if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
722b43560abSAlexander Motin cbp->bio_ma_offset += (uintptr_t)addr;
723b43560abSAlexander Motin cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
724b43560abSAlexander Motin cbp->bio_ma_offset %= PAGE_SIZE;
725b43560abSAlexander Motin cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
726b43560abSAlexander Motin cbp->bio_length) / PAGE_SIZE;
727b43560abSAlexander Motin } else
728b43560abSAlexander Motin cbp->bio_data = addr;
72989b17223SAlexander Motin cbp->bio_caller1 = &vol->v_subdisks[no];
73089b17223SAlexander Motin bioq_insert_tail(&queue, cbp);
73189b17223SAlexander Motin no += N - best;
73289b17223SAlexander Motin if (no >= vol->v_disks_count) {
73389b17223SAlexander Motin no -= vol->v_disks_count;
73489b17223SAlexander Motin offset += strip_size;
73589b17223SAlexander Motin }
73689b17223SAlexander Motin remain -= length;
73789b17223SAlexander Motin addr += length;
73889b17223SAlexander Motin start = 0;
73989b17223SAlexander Motin }
740b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL) {
74189b17223SAlexander Motin sd = cbp->bio_caller1;
74289b17223SAlexander Motin cbp->bio_caller1 = NULL;
74389b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp);
74489b17223SAlexander Motin }
74589b17223SAlexander Motin return;
74689b17223SAlexander Motin failure:
747b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL)
74889b17223SAlexander Motin g_destroy_bio(cbp);
74989b17223SAlexander Motin if (bp->bio_error == 0)
75089b17223SAlexander Motin bp->bio_error = ENOMEM;
75189b17223SAlexander Motin g_raid_iodone(bp, bp->bio_error);
75289b17223SAlexander Motin }
75389b17223SAlexander Motin
75489b17223SAlexander Motin static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object * tr,struct bio * bp)75589b17223SAlexander Motin g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
75689b17223SAlexander Motin {
75789b17223SAlexander Motin struct g_raid_volume *vol;
75889b17223SAlexander Motin struct g_raid_subdisk *sd;
75989b17223SAlexander Motin struct bio_queue_head queue;
76089b17223SAlexander Motin struct bio *cbp;
76189b17223SAlexander Motin char *addr;
76289b17223SAlexander Motin off_t offset, start, length, remain;
76389b17223SAlexander Motin u_int no, strip_size;
76489b17223SAlexander Motin int i;
76589b17223SAlexander Motin
76689b17223SAlexander Motin vol = tr->tro_volume;
767b43560abSAlexander Motin if ((bp->bio_flags & BIO_UNMAPPED) != 0)
768b43560abSAlexander Motin addr = NULL;
769b43560abSAlexander Motin else
77089b17223SAlexander Motin addr = bp->bio_data;
77189b17223SAlexander Motin strip_size = vol->v_strip_size;
77289b17223SAlexander Motin V2P(vol, bp->bio_offset, &no, &offset, &start);
77389b17223SAlexander Motin remain = bp->bio_length;
77489b17223SAlexander Motin bioq_init(&queue);
77589b17223SAlexander Motin while (remain > 0) {
77689b17223SAlexander Motin length = MIN(strip_size - start, remain);
77789b17223SAlexander Motin for (i = 0; i < N; i++) {
77889b17223SAlexander Motin sd = &vol->v_subdisks[no];
77989b17223SAlexander Motin switch (sd->sd_state) {
78089b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE:
78189b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE:
78289b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC:
78389b17223SAlexander Motin break;
78489b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD:
78589b17223SAlexander Motin if (offset + start >= sd->sd_rebuild_pos)
78689b17223SAlexander Motin goto nextdisk;
78789b17223SAlexander Motin break;
78889b17223SAlexander Motin default:
78989b17223SAlexander Motin goto nextdisk;
79089b17223SAlexander Motin }
79189b17223SAlexander Motin cbp = g_clone_bio(bp);
79289b17223SAlexander Motin if (cbp == NULL)
79389b17223SAlexander Motin goto failure;
79489b17223SAlexander Motin cbp->bio_offset = offset + start;
79589b17223SAlexander Motin cbp->bio_length = length;
796b43560abSAlexander Motin if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
797b43560abSAlexander Motin bp->bio_cmd != BIO_DELETE) {
798b43560abSAlexander Motin cbp->bio_ma_offset += (uintptr_t)addr;
799b43560abSAlexander Motin cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
800b43560abSAlexander Motin cbp->bio_ma_offset %= PAGE_SIZE;
801b43560abSAlexander Motin cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
802b43560abSAlexander Motin cbp->bio_length) / PAGE_SIZE;
803b43560abSAlexander Motin } else
804b43560abSAlexander Motin cbp->bio_data = addr;
80589b17223SAlexander Motin cbp->bio_caller1 = sd;
80689b17223SAlexander Motin bioq_insert_tail(&queue, cbp);
80789b17223SAlexander Motin nextdisk:
80889b17223SAlexander Motin if (++no >= vol->v_disks_count) {
80989b17223SAlexander Motin no = 0;
81089b17223SAlexander Motin offset += strip_size;
81189b17223SAlexander Motin }
81289b17223SAlexander Motin }
81389b17223SAlexander Motin remain -= length;
814609a7474SAlexander Motin if (bp->bio_cmd != BIO_DELETE)
81589b17223SAlexander Motin addr += length;
81689b17223SAlexander Motin start = 0;
81789b17223SAlexander Motin }
818b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL) {
81989b17223SAlexander Motin sd = cbp->bio_caller1;
82089b17223SAlexander Motin cbp->bio_caller1 = NULL;
82189b17223SAlexander Motin g_raid_subdisk_iostart(sd, cbp);
82289b17223SAlexander Motin }
82389b17223SAlexander Motin return;
82489b17223SAlexander Motin failure:
825b43560abSAlexander Motin while ((cbp = bioq_takefirst(&queue)) != NULL)
82689b17223SAlexander Motin g_destroy_bio(cbp);
82789b17223SAlexander Motin if (bp->bio_error == 0)
82889b17223SAlexander Motin bp->bio_error = ENOMEM;
82989b17223SAlexander Motin g_raid_iodone(bp, bp->bio_error);
83089b17223SAlexander Motin }
83189b17223SAlexander Motin
83289b17223SAlexander Motin static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object * tr,struct bio * bp)83389b17223SAlexander Motin g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
83489b17223SAlexander Motin {
83589b17223SAlexander Motin struct g_raid_volume *vol;
83689b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
83789b17223SAlexander Motin
83889b17223SAlexander Motin vol = tr->tro_volume;
83989b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
84089b17223SAlexander Motin if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
84189b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
84289b17223SAlexander Motin vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
84389b17223SAlexander Motin g_raid_iodone(bp, EIO);
84489b17223SAlexander Motin return;
84589b17223SAlexander Motin }
84689b17223SAlexander Motin /*
84789b17223SAlexander Motin * If we're rebuilding, squeeze in rebuild activity every so often,
84889b17223SAlexander Motin * even when the disk is busy. Be sure to only count real I/O
84989b17223SAlexander Motin * to the disk. All 'SPECIAL' I/O is traffic generated to the disk
85089b17223SAlexander Motin * by this module.
85189b17223SAlexander Motin */
85289b17223SAlexander Motin if (trs->trso_failed_sd != NULL &&
85389b17223SAlexander Motin !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
85489b17223SAlexander Motin /* Make this new or running now round short. */
85589b17223SAlexander Motin trs->trso_recover_slabs = 0;
85689b17223SAlexander Motin if (--trs->trso_fair_io <= 0) {
85789b17223SAlexander Motin trs->trso_fair_io = g_raid1e_rebuild_fair_io;
85889b17223SAlexander Motin g_raid_tr_raid1e_rebuild_some(tr);
85989b17223SAlexander Motin }
86089b17223SAlexander Motin }
86189b17223SAlexander Motin switch (bp->bio_cmd) {
86289b17223SAlexander Motin case BIO_READ:
86389b17223SAlexander Motin g_raid_tr_iostart_raid1e_read(tr, bp);
86489b17223SAlexander Motin break;
86589b17223SAlexander Motin case BIO_WRITE:
86689b17223SAlexander Motin case BIO_DELETE:
867609a7474SAlexander Motin g_raid_tr_iostart_raid1e_write(tr, bp);
86889b17223SAlexander Motin break;
8698b522bdaSWarner Losh case BIO_SPEEDUP:
87089b17223SAlexander Motin case BIO_FLUSH:
87189b17223SAlexander Motin g_raid_tr_flush_common(tr, bp);
87289b17223SAlexander Motin break;
87389b17223SAlexander Motin default:
87489b17223SAlexander Motin KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
87589b17223SAlexander Motin bp->bio_cmd, vol->v_name));
87689b17223SAlexander Motin break;
87789b17223SAlexander Motin }
87889b17223SAlexander Motin }
87989b17223SAlexander Motin
88089b17223SAlexander Motin static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object * tr,struct g_raid_subdisk * sd,struct bio * bp)88189b17223SAlexander Motin g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
88289b17223SAlexander Motin struct g_raid_subdisk *sd, struct bio *bp)
88389b17223SAlexander Motin {
88489b17223SAlexander Motin struct bio *cbp;
88589b17223SAlexander Motin struct g_raid_subdisk *nsd;
88689b17223SAlexander Motin struct g_raid_volume *vol;
88789b17223SAlexander Motin struct bio *pbp;
88889b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
88989b17223SAlexander Motin off_t virtual, offset, start;
89089b17223SAlexander Motin uintptr_t mask;
89189b17223SAlexander Motin int error, do_write, copy, disk, best;
89289b17223SAlexander Motin
89389b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
89489b17223SAlexander Motin vol = tr->tro_volume;
89589b17223SAlexander Motin if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
89689b17223SAlexander Motin if (trs->trso_type == TR_RAID1E_REBUILD) {
89789b17223SAlexander Motin nsd = trs->trso_failed_sd;
89889b17223SAlexander Motin if (bp->bio_cmd == BIO_READ) {
89989b17223SAlexander Motin /* Immediately abort rebuild, if requested. */
90089b17223SAlexander Motin if (trs->trso_flags & TR_RAID1E_F_ABORT) {
90189b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
90289b17223SAlexander Motin g_raid_tr_raid1e_rebuild_abort(tr);
90389b17223SAlexander Motin return;
90489b17223SAlexander Motin }
90589b17223SAlexander Motin
90689b17223SAlexander Motin /* On read error, skip and cross fingers. */
90789b17223SAlexander Motin if (bp->bio_error != 0) {
90889b17223SAlexander Motin G_RAID_LOGREQ(0, bp,
90989b17223SAlexander Motin "Read error during rebuild (%d), "
91089b17223SAlexander Motin "possible data loss!",
91189b17223SAlexander Motin bp->bio_error);
91289b17223SAlexander Motin goto rebuild_round_done;
91389b17223SAlexander Motin }
91489b17223SAlexander Motin
91589b17223SAlexander Motin /*
91689b17223SAlexander Motin * The read operation finished, queue the
91789b17223SAlexander Motin * write and get out.
91889b17223SAlexander Motin */
91989b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
92089b17223SAlexander Motin bp->bio_error);
92189b17223SAlexander Motin bp->bio_cmd = BIO_WRITE;
92289b17223SAlexander Motin bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
92389b17223SAlexander Motin bp->bio_offset = nsd->sd_rebuild_pos;
92489b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
92589b17223SAlexander Motin g_raid_subdisk_iostart(nsd, bp);
92689b17223SAlexander Motin } else {
92789b17223SAlexander Motin /*
92889b17223SAlexander Motin * The write operation just finished. Do
92989b17223SAlexander Motin * another. We keep cloning the master bio
93089b17223SAlexander Motin * since it has the right buffers allocated to
93189b17223SAlexander Motin * it.
93289b17223SAlexander Motin */
93389b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
93489b17223SAlexander Motin bp->bio_error);
93589b17223SAlexander Motin if (bp->bio_error != 0 ||
93689b17223SAlexander Motin trs->trso_flags & TR_RAID1E_F_ABORT) {
93789b17223SAlexander Motin if ((trs->trso_flags &
93889b17223SAlexander Motin TR_RAID1E_F_ABORT) == 0) {
93989b17223SAlexander Motin g_raid_tr_raid1e_fail_disk(sd->sd_softc,
94089b17223SAlexander Motin nsd, nsd->sd_disk);
94189b17223SAlexander Motin }
94289b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
94389b17223SAlexander Motin g_raid_tr_raid1e_rebuild_abort(tr);
94489b17223SAlexander Motin return;
94589b17223SAlexander Motin }
94689b17223SAlexander Motin rebuild_round_done:
94789b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
94889b17223SAlexander Motin g_raid_unlock_range(tr->tro_volume,
94989b17223SAlexander Motin trs->trso_lock_pos, trs->trso_lock_len);
95089b17223SAlexander Motin nsd->sd_rebuild_pos += bp->bio_length;
95189b17223SAlexander Motin if (nsd->sd_rebuild_pos >= nsd->sd_size) {
95289b17223SAlexander Motin g_raid_tr_raid1e_rebuild_finish(tr);
95389b17223SAlexander Motin return;
95489b17223SAlexander Motin }
95589b17223SAlexander Motin
95689b17223SAlexander Motin /* Abort rebuild if we are stopping */
95789b17223SAlexander Motin if (trs->trso_stopping) {
95889b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
95989b17223SAlexander Motin g_raid_tr_raid1e_rebuild_abort(tr);
96089b17223SAlexander Motin return;
96189b17223SAlexander Motin }
96289b17223SAlexander Motin
96389b17223SAlexander Motin if (--trs->trso_meta_update <= 0) {
96489b17223SAlexander Motin g_raid_write_metadata(vol->v_softc,
96589b17223SAlexander Motin vol, nsd, nsd->sd_disk);
96689b17223SAlexander Motin trs->trso_meta_update =
96789b17223SAlexander Motin g_raid1e_rebuild_meta_update;
96889b17223SAlexander Motin /* Compensate short rebuild I/Os. */
96989b17223SAlexander Motin if ((vol->v_disks_count % N) != 0 &&
97089b17223SAlexander Motin vol->v_strip_size <
97189b17223SAlexander Motin g_raid1e_rebuild_slab) {
97289b17223SAlexander Motin trs->trso_meta_update *=
97389b17223SAlexander Motin g_raid1e_rebuild_slab;
97489b17223SAlexander Motin trs->trso_meta_update /=
97589b17223SAlexander Motin vol->v_strip_size;
97689b17223SAlexander Motin }
97789b17223SAlexander Motin }
97889b17223SAlexander Motin trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
97989b17223SAlexander Motin if (--trs->trso_recover_slabs <= 0)
98089b17223SAlexander Motin return;
98189b17223SAlexander Motin /* Run next rebuild iteration. */
98289b17223SAlexander Motin g_raid_tr_raid1e_rebuild_some(tr);
98389b17223SAlexander Motin }
98489b17223SAlexander Motin } else if (trs->trso_type == TR_RAID1E_RESYNC) {
98589b17223SAlexander Motin /*
98689b17223SAlexander Motin * read good sd, read bad sd in parallel. when both
98789b17223SAlexander Motin * done, compare the buffers. write good to the bad
98889b17223SAlexander Motin * if different. do the next bit of work.
98989b17223SAlexander Motin */
99089b17223SAlexander Motin panic("Somehow, we think we're doing a resync");
99189b17223SAlexander Motin }
99289b17223SAlexander Motin return;
99389b17223SAlexander Motin }
99489b17223SAlexander Motin pbp = bp->bio_parent;
99589b17223SAlexander Motin pbp->bio_inbed++;
99689b17223SAlexander Motin mask = (intptr_t)bp->bio_caller2;
99789b17223SAlexander Motin if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
99889b17223SAlexander Motin /*
99989b17223SAlexander Motin * Read failed on first drive. Retry the read error on
100089b17223SAlexander Motin * another disk drive, if available, before erroring out the
100189b17223SAlexander Motin * read.
100289b17223SAlexander Motin */
100389b17223SAlexander Motin sd->sd_disk->d_read_errs++;
100489b17223SAlexander Motin G_RAID_LOGREQ(0, bp,
100589b17223SAlexander Motin "Read error (%d), %d read errors total",
100689b17223SAlexander Motin bp->bio_error, sd->sd_disk->d_read_errs);
100789b17223SAlexander Motin
100889b17223SAlexander Motin /*
100989b17223SAlexander Motin * If there are too many read errors, we move to degraded.
101089b17223SAlexander Motin * XXX Do we want to FAIL the drive (eg, make the user redo
101189b17223SAlexander Motin * everything to get it back in sync), or just degrade the
101289b17223SAlexander Motin * drive, which kicks off a resync?
101389b17223SAlexander Motin */
101489b17223SAlexander Motin do_write = 0;
101589b17223SAlexander Motin if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
101689b17223SAlexander Motin g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
101789b17223SAlexander Motin else if (mask == 0)
101889b17223SAlexander Motin do_write = 1;
101989b17223SAlexander Motin
102089b17223SAlexander Motin /* Restore what we were doing. */
102189b17223SAlexander Motin P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
102289b17223SAlexander Motin V2P(vol, virtual, &disk, &offset, &start);
102389b17223SAlexander Motin
102489b17223SAlexander Motin /* Find the other disk, and try to do the I/O to it. */
102589b17223SAlexander Motin mask |= 1 << copy;
102689b17223SAlexander Motin best = g_raid_tr_raid1e_select_read_disk(vol,
102789b17223SAlexander Motin disk, offset, start, mask);
102889b17223SAlexander Motin if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
102989b17223SAlexander Motin disk += best;
103089b17223SAlexander Motin if (disk >= vol->v_disks_count) {
103189b17223SAlexander Motin disk -= vol->v_disks_count;
103289b17223SAlexander Motin offset += vol->v_strip_size;
103389b17223SAlexander Motin }
103489b17223SAlexander Motin cbp->bio_offset = offset + start;
103589b17223SAlexander Motin cbp->bio_length = bp->bio_length;
103689b17223SAlexander Motin cbp->bio_data = bp->bio_data;
1037b43560abSAlexander Motin cbp->bio_ma = bp->bio_ma;
1038b43560abSAlexander Motin cbp->bio_ma_offset = bp->bio_ma_offset;
1039b43560abSAlexander Motin cbp->bio_ma_n = bp->bio_ma_n;
104089b17223SAlexander Motin g_destroy_bio(bp);
104189b17223SAlexander Motin nsd = &vol->v_subdisks[disk];
104289b17223SAlexander Motin G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
104389b17223SAlexander Motin nsd->sd_pos);
104489b17223SAlexander Motin if (do_write)
104589b17223SAlexander Motin mask |= 1 << 31;
10467a22215cSEitan Adler if ((mask & (1U << 31)) != 0)
104789b17223SAlexander Motin sd->sd_recovery++;
104889b17223SAlexander Motin cbp->bio_caller2 = (void *)mask;
104989b17223SAlexander Motin if (do_write) {
105089b17223SAlexander Motin cbp->bio_caller1 = nsd;
105189b17223SAlexander Motin /* Lock callback starts I/O */
105289b17223SAlexander Motin g_raid_lock_range(sd->sd_volume,
105389b17223SAlexander Motin virtual, cbp->bio_length, pbp, cbp);
105489b17223SAlexander Motin } else {
105589b17223SAlexander Motin g_raid_subdisk_iostart(nsd, cbp);
105689b17223SAlexander Motin }
105789b17223SAlexander Motin return;
105889b17223SAlexander Motin }
105989b17223SAlexander Motin /*
106089b17223SAlexander Motin * We can't retry. Return the original error by falling
106189b17223SAlexander Motin * through. This will happen when there's only one good disk.
106289b17223SAlexander Motin * We don't need to fail the raid, since its actual state is
106389b17223SAlexander Motin * based on the state of the subdisks.
106489b17223SAlexander Motin */
106589b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
106689b17223SAlexander Motin }
106789b17223SAlexander Motin if (bp->bio_cmd == BIO_READ &&
106889b17223SAlexander Motin bp->bio_error == 0 &&
10697a22215cSEitan Adler (mask & (1U << 31)) != 0) {
107089b17223SAlexander Motin G_RAID_LOGREQ(3, bp, "Recovered data from other drive");
107189b17223SAlexander Motin
107289b17223SAlexander Motin /* Restore what we were doing. */
107389b17223SAlexander Motin P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
107489b17223SAlexander Motin V2P(vol, virtual, &disk, &offset, &start);
107589b17223SAlexander Motin
107689b17223SAlexander Motin /* Find best disk to write. */
107789b17223SAlexander Motin best = g_raid_tr_raid1e_select_read_disk(vol,
107889b17223SAlexander Motin disk, offset, start, ~mask);
107989b17223SAlexander Motin if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
108089b17223SAlexander Motin disk += best;
108189b17223SAlexander Motin if (disk >= vol->v_disks_count) {
108289b17223SAlexander Motin disk -= vol->v_disks_count;
108389b17223SAlexander Motin offset += vol->v_strip_size;
108489b17223SAlexander Motin }
108589b17223SAlexander Motin cbp->bio_offset = offset + start;
108689b17223SAlexander Motin cbp->bio_cmd = BIO_WRITE;
108789b17223SAlexander Motin cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
108889b17223SAlexander Motin cbp->bio_caller2 = (void *)mask;
108989b17223SAlexander Motin g_destroy_bio(bp);
109089b17223SAlexander Motin G_RAID_LOGREQ(2, cbp,
109189b17223SAlexander Motin "Attempting bad sector remap on failing drive.");
109289b17223SAlexander Motin g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
109389b17223SAlexander Motin return;
109489b17223SAlexander Motin }
109589b17223SAlexander Motin }
10967a22215cSEitan Adler if ((mask & (1U << 31)) != 0) {
109789b17223SAlexander Motin /*
109889b17223SAlexander Motin * We're done with a recovery, mark the range as unlocked.
1099e8d57122SPedro F. Giffuni * For any write errors, we aggressively fail the disk since
110089b17223SAlexander Motin * there was both a READ and a WRITE error at this location.
110189b17223SAlexander Motin * Both types of errors generally indicates the drive is on
110289b17223SAlexander Motin * the verge of total failure anyway. Better to stop trusting
110389b17223SAlexander Motin * it now. However, we need to reset error to 0 in that case
110489b17223SAlexander Motin * because we're not failing the original I/O which succeeded.
110589b17223SAlexander Motin */
110689b17223SAlexander Motin
110789b17223SAlexander Motin /* Restore what we were doing. */
110889b17223SAlexander Motin P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, ©);
110989b17223SAlexander Motin V2P(vol, virtual, &disk, &offset, &start);
111089b17223SAlexander Motin
111189b17223SAlexander Motin for (copy = 0; copy < N; copy++) {
111289b17223SAlexander Motin if ((mask & (1 << copy) ) != 0)
111389b17223SAlexander Motin vol->v_subdisks[(disk + copy) %
111489b17223SAlexander Motin vol->v_disks_count].sd_recovery--;
111589b17223SAlexander Motin }
111689b17223SAlexander Motin
111789b17223SAlexander Motin if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
111889b17223SAlexander Motin G_RAID_LOGREQ(0, bp, "Remap write failed: "
111989b17223SAlexander Motin "failing subdisk.");
112089b17223SAlexander Motin g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
112189b17223SAlexander Motin bp->bio_error = 0;
112289b17223SAlexander Motin }
112389b17223SAlexander Motin G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
112489b17223SAlexander Motin g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
112589b17223SAlexander Motin }
1126650e245eSAlexander Motin if (pbp->bio_cmd != BIO_READ) {
1127ef844ef7SAlexander Motin if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
1128ef844ef7SAlexander Motin pbp->bio_error = bp->bio_error;
1129650e245eSAlexander Motin if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
1130ef844ef7SAlexander Motin G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
1131ef844ef7SAlexander Motin g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
1132ef844ef7SAlexander Motin }
1133ef844ef7SAlexander Motin error = pbp->bio_error;
1134ef844ef7SAlexander Motin } else
113589b17223SAlexander Motin error = bp->bio_error;
113689b17223SAlexander Motin g_destroy_bio(bp);
113789b17223SAlexander Motin if (pbp->bio_children == pbp->bio_inbed) {
113889b17223SAlexander Motin pbp->bio_completed = pbp->bio_length;
113989b17223SAlexander Motin g_raid_iodone(pbp, error);
114089b17223SAlexander Motin }
114189b17223SAlexander Motin }
114289b17223SAlexander Motin
114389b17223SAlexander Motin static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object * tr,void * virtual,off_t boffset,size_t blength)1144489ba222SMitchell Horne g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
1145489ba222SMitchell Horne off_t boffset, size_t blength)
114689b17223SAlexander Motin {
114789b17223SAlexander Motin struct g_raid_volume *vol;
114889b17223SAlexander Motin struct g_raid_subdisk *sd;
114989b17223SAlexander Motin struct bio_queue_head queue;
115089b17223SAlexander Motin char *addr;
115189b17223SAlexander Motin off_t offset, start, length, remain;
115289b17223SAlexander Motin u_int no, strip_size;
115389b17223SAlexander Motin int i, error;
115489b17223SAlexander Motin
115589b17223SAlexander Motin vol = tr->tro_volume;
115689b17223SAlexander Motin addr = virtual;
115789b17223SAlexander Motin strip_size = vol->v_strip_size;
115889b17223SAlexander Motin V2P(vol, boffset, &no, &offset, &start);
115989b17223SAlexander Motin remain = blength;
116089b17223SAlexander Motin bioq_init(&queue);
116189b17223SAlexander Motin while (remain > 0) {
116289b17223SAlexander Motin length = MIN(strip_size - start, remain);
116389b17223SAlexander Motin for (i = 0; i < N; i++) {
116489b17223SAlexander Motin sd = &vol->v_subdisks[no];
116589b17223SAlexander Motin switch (sd->sd_state) {
116689b17223SAlexander Motin case G_RAID_SUBDISK_S_ACTIVE:
116789b17223SAlexander Motin case G_RAID_SUBDISK_S_STALE:
116889b17223SAlexander Motin case G_RAID_SUBDISK_S_RESYNC:
116989b17223SAlexander Motin break;
117089b17223SAlexander Motin case G_RAID_SUBDISK_S_REBUILD:
117189b17223SAlexander Motin if (offset + start >= sd->sd_rebuild_pos)
117289b17223SAlexander Motin goto nextdisk;
117389b17223SAlexander Motin break;
117489b17223SAlexander Motin default:
117589b17223SAlexander Motin goto nextdisk;
117689b17223SAlexander Motin }
1177489ba222SMitchell Horne error = g_raid_subdisk_kerneldump(sd, addr,
1178489ba222SMitchell Horne offset + start, length);
117989b17223SAlexander Motin if (error != 0)
118089b17223SAlexander Motin return (error);
118189b17223SAlexander Motin nextdisk:
118289b17223SAlexander Motin if (++no >= vol->v_disks_count) {
118389b17223SAlexander Motin no = 0;
118489b17223SAlexander Motin offset += strip_size;
118589b17223SAlexander Motin }
118689b17223SAlexander Motin }
118789b17223SAlexander Motin remain -= length;
118889b17223SAlexander Motin addr += length;
118989b17223SAlexander Motin start = 0;
119089b17223SAlexander Motin }
119189b17223SAlexander Motin return (0);
119289b17223SAlexander Motin }
119389b17223SAlexander Motin
119489b17223SAlexander Motin static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object * tr,void * argp)119589b17223SAlexander Motin g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
119689b17223SAlexander Motin {
119789b17223SAlexander Motin struct bio *bp;
119889b17223SAlexander Motin struct g_raid_subdisk *sd;
119989b17223SAlexander Motin
120089b17223SAlexander Motin bp = (struct bio *)argp;
120189b17223SAlexander Motin sd = (struct g_raid_subdisk *)bp->bio_caller1;
120289b17223SAlexander Motin g_raid_subdisk_iostart(sd, bp);
120389b17223SAlexander Motin
120489b17223SAlexander Motin return (0);
120589b17223SAlexander Motin }
120689b17223SAlexander Motin
120789b17223SAlexander Motin static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object * tr)120889b17223SAlexander Motin g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
120989b17223SAlexander Motin {
121089b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
121189b17223SAlexander Motin struct g_raid_volume *vol;
121289b17223SAlexander Motin
121389b17223SAlexander Motin vol = tr->tro_volume;
121489b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
121589b17223SAlexander Motin trs->trso_fair_io = g_raid1e_rebuild_fair_io;
121689b17223SAlexander Motin trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
121789b17223SAlexander Motin /* Compensate short rebuild I/Os. */
121889b17223SAlexander Motin if ((vol->v_disks_count % N) != 0 &&
121989b17223SAlexander Motin vol->v_strip_size < g_raid1e_rebuild_slab) {
122089b17223SAlexander Motin trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
122189b17223SAlexander Motin trs->trso_recover_slabs /= vol->v_strip_size;
122289b17223SAlexander Motin }
122389b17223SAlexander Motin if (trs->trso_type == TR_RAID1E_REBUILD)
122489b17223SAlexander Motin g_raid_tr_raid1e_rebuild_some(tr);
122589b17223SAlexander Motin return (0);
122689b17223SAlexander Motin }
122789b17223SAlexander Motin
122889b17223SAlexander Motin static int
g_raid_tr_free_raid1e(struct g_raid_tr_object * tr)122989b17223SAlexander Motin g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
123089b17223SAlexander Motin {
123189b17223SAlexander Motin struct g_raid_tr_raid1e_object *trs;
123289b17223SAlexander Motin
123389b17223SAlexander Motin trs = (struct g_raid_tr_raid1e_object *)tr;
123489b17223SAlexander Motin
123589b17223SAlexander Motin if (trs->trso_buffer != NULL) {
123689b17223SAlexander Motin free(trs->trso_buffer, M_TR_RAID1E);
123789b17223SAlexander Motin trs->trso_buffer = NULL;
123889b17223SAlexander Motin }
123989b17223SAlexander Motin return (0);
124089b17223SAlexander Motin }
124189b17223SAlexander Motin
/*
 * Declare this transformation class through G_RAID_TR_DECLARE, passing the
 * raid1e name and the "RAID1E" label.
 * NOTE(review): the macro's expansion is not visible here -- see its
 * definition for what exactly gets generated.
 */
1242c89d2fbeSAlexander Motin G_RAID_TR_DECLARE(raid1e, "RAID1E");
1243