/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/bio.h>
#include <sys/endian.h>
#include <sys/kernel.h>
#include <sys/kobj.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <geom/geom.h>
#include <geom/geom_dbg.h>
#include "geom/raid/g_raid.h"
#include "g_raid_tr_if.h"

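/*
 * RAID1E ("interleaved mirroring") stripes data across all disks while
 * keeping N consecutive copies of every strip.  Illustrative layout for a
 * hypothetical 3-disk volume with N = 2, where "v.c" denotes copy c of
 * virtual strip v:
 *
 *	row	disk0	disk1	disk2
 *	 0	 0.0	 0.1	 1.0
 *	 1	 1.1	 2.0	 2.1
 *	 2	 3.0	 3.1	 4.0
 *
 * The V2P() and P2V() helpers below implement exactly this mapping.
 */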
#define N	2

SYSCTL_DECL(_kern_geom_raid_raid1e);

#define RAID1E_REBUILD_SLAB	(1 << 20) /* One transaction in a rebuild */
static int g_raid1e_rebuild_slab = RAID1E_REBUILD_SLAB;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_slab_size, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_slab, 0,
    "Amount of the disk to rebuild each read/write cycle of the rebuild.");

#define RAID1E_REBUILD_FAIR_IO 20 /* use 1/x of the available I/O */
static int g_raid1e_rebuild_fair_io = RAID1E_REBUILD_FAIR_IO;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_fair_io, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_fair_io, 0,
    "Fraction of the I/O bandwidth to use when the disk is busy for rebuild.");

#define RAID1E_REBUILD_CLUSTER_IDLE 100
static int g_raid1e_rebuild_cluster_idle = RAID1E_REBUILD_CLUSTER_IDLE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_cluster_idle, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_cluster_idle, 0,
    "Number of slabs to do each time we trigger a rebuild cycle");

#define RAID1E_REBUILD_META_UPDATE 1024 /* update meta data every 1GB or so */
static int g_raid1e_rebuild_meta_update = RAID1E_REBUILD_META_UPDATE;
SYSCTL_UINT(_kern_geom_raid_raid1e, OID_AUTO, rebuild_meta_update, CTLFLAG_RWTUN,
    &g_raid1e_rebuild_meta_update, 0,
    "How many rebuild slabs to process between metadata updates.");

static MALLOC_DEFINE(M_TR_RAID1E, "tr_raid1e_data", "GEOM_RAID RAID1E data");

#define TR_RAID1E_NONE 0
#define TR_RAID1E_REBUILD 1
#define TR_RAID1E_RESYNC 2

#define TR_RAID1E_F_DOING_SOME	0x1
#define TR_RAID1E_F_LOCKED	0x2
#define TR_RAID1E_F_ABORT	0x4

struct g_raid_tr_raid1e_object {
	struct g_raid_tr_object	 trso_base;
	int			 trso_starting;	/* Volume is starting up. */
	int			 trso_stopping;	/* Volume is being stopped. */
	int			 trso_type;	/* TR_RAID1E_{NONE,REBUILD,RESYNC}. */
	int			 trso_recover_slabs; /* slabs before rest */
	int			 trso_fair_io;	/* Countdown to next forced rebuild I/O. */
	int			 trso_meta_update; /* Countdown to next metadata write. */
	int			 trso_flags;	/* TR_RAID1E_F_* flags. */
	struct g_raid_subdisk	*trso_failed_sd; /* like per volume */
	void			*trso_buffer;	 /* Buffer space */
	off_t			 trso_lock_pos;	 /* Locked range start. */
	off_t			 trso_lock_len;	 /* Locked range length. */
	struct bio		 trso_bio;	 /* Rebuild I/O request. */
};

static g_raid_tr_taste_t g_raid_tr_taste_raid1e;
static g_raid_tr_event_t g_raid_tr_event_raid1e;
static g_raid_tr_start_t g_raid_tr_start_raid1e;
static g_raid_tr_stop_t g_raid_tr_stop_raid1e;
static g_raid_tr_iostart_t g_raid_tr_iostart_raid1e;
static g_raid_tr_iodone_t g_raid_tr_iodone_raid1e;
static g_raid_tr_kerneldump_t g_raid_tr_kerneldump_raid1e;
static g_raid_tr_locked_t g_raid_tr_locked_raid1e;
static g_raid_tr_idle_t g_raid_tr_idle_raid1e;
static g_raid_tr_free_t g_raid_tr_free_raid1e;

static kobj_method_t g_raid_tr_raid1e_methods[] = {
	KOBJMETHOD(g_raid_tr_taste,	g_raid_tr_taste_raid1e),
	KOBJMETHOD(g_raid_tr_event,	g_raid_tr_event_raid1e),
	KOBJMETHOD(g_raid_tr_start,	g_raid_tr_start_raid1e),
	KOBJMETHOD(g_raid_tr_stop,	g_raid_tr_stop_raid1e),
	KOBJMETHOD(g_raid_tr_iostart,	g_raid_tr_iostart_raid1e),
	KOBJMETHOD(g_raid_tr_iodone,	g_raid_tr_iodone_raid1e),
	KOBJMETHOD(g_raid_tr_kerneldump, g_raid_tr_kerneldump_raid1e),
	KOBJMETHOD(g_raid_tr_locked,	g_raid_tr_locked_raid1e),
	KOBJMETHOD(g_raid_tr_idle,	g_raid_tr_idle_raid1e),
	KOBJMETHOD(g_raid_tr_free,	g_raid_tr_free_raid1e),
	{ 0, 0 }
};

static struct g_raid_tr_class g_raid_tr_raid1e_class = {
	"RAID1E",
	g_raid_tr_raid1e_methods,
	sizeof(struct g_raid_tr_raid1e_object),
	.trc_enable = 1,
	.trc_priority = 200,
	.trc_accept_unmapped = 1
};

static void g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr);
static void g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd);
static int g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask);

static inline void
V2P(struct g_raid_volume *vol, off_t virt,
    int *disk, off_t *offset, off_t *start)
{
	off_t nstrip;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Strip number. */
	nstrip = virt / strip_size;
	/* Start position in strip. */
	*start = virt % strip_size;
	/* Disk number. */
	*disk = (nstrip * N) % vol->v_disks_count;
	/* Strip start position in disk. */
	*offset = ((nstrip * N) / vol->v_disks_count) * strip_size;
}
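
/*
 * Worked V2P() example (hypothetical volume: 3 disks, N = 2, 64 KiB strip):
 * virt = 200704 gives nstrip = 3 and start = 4096, so the first copy lands
 * on disk (3 * 2) % 3 = 0 at strip offset ((3 * 2) / 3) * 65536 = 131072;
 * the second copy sits on the next disk in the same row.
 */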

static inline void
P2V(struct g_raid_volume *vol, int disk, off_t offset,
    off_t *virt, int *copy)
{
	off_t nstrip, start;
	u_int strip_size;

	strip_size = vol->v_strip_size;
	/* Start position in strip. */
	start = offset % strip_size;
	/* Physical strip number. */
	nstrip = (offset / strip_size) * vol->v_disks_count + disk;
	/* Number of physical strip (copy) inside virtual strip. */
	*copy = nstrip % N;
	/* Offset in virtual space. */
	*virt = (nstrip / N) * strip_size + start;
}
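
/*
 * P2V() inverts the example above: disk 0 at offset 135168 gives
 * start = 4096, nstrip = (135168 / 65536) * 3 + 0 = 6, hence copy = 0 and
 * virt = (6 / 2) * 65536 + 4096 = 200704.  Disk 1 at the same offset yields
 * nstrip = 7, i.e. copy 1 of the same virtual range.
 */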

static int
g_raid_tr_taste_raid1e(struct g_raid_tr_object *tr, struct g_raid_volume *vol)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (tr->tro_volume->v_raid_level != G_RAID_VOLUME_RL_RAID1E ||
	    tr->tro_volume->v_raid_level_qualifier != G_RAID_VOLUME_RLQ_R1EA)
		return (G_RAID_TR_TASTE_FAIL);
	trs->trso_starting = 1;
	return (G_RAID_TR_TASTE_SUCCEED);
}

static int
g_raid_tr_update_state_raid1e_even(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count / N; i++) {
		bestsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
		}
		if (bestsd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED &&
		    bestsd->sd_state != G_RAID_SUBDISK_S_ACTIVE) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to ACTIVE.",
			    vol->v_name, bestsd->sd_pos,
			    g_raid_subdisk_state2str(bestsd->sd_state));
			g_raid_change_subdisk_state(bestsd,
			    G_RAID_SUBDISK_S_ACTIVE);
			g_raid_write_metadata(sc,
			    vol, bestsd, bestsd->sd_disk);
		}
		worstsd = &vol->v_subdisks[i * N];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[i * N + j];
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e_odd(struct g_raid_volume *vol)
{
	struct g_raid_softc *sc;
	struct g_raid_subdisk *sd, *bestsd, *worstsd;
	int i, j, state, sstate;

	sc = vol->v_softc;
	if (g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) ==
	    vol->v_disks_count)
		return (G_RAID_VOLUME_S_OPTIMAL);
	for (i = 0; i < vol->v_disks_count; i++) {
		sd = &vol->v_subdisks[i];
		if (sd->sd_state == G_RAID_SUBDISK_S_UNINITIALIZED) {
			/* We found a reasonable candidate. */
			G_RAID_DEBUG1(1, sc,
			    "Promote subdisk %s:%d from %s to STALE.",
			    vol->v_name, sd->sd_pos,
			    g_raid_subdisk_state2str(sd->sd_state));
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_STALE);
			g_raid_write_metadata(sc, vol, sd, sd->sd_disk);
		}
	}
	state = G_RAID_VOLUME_S_OPTIMAL;
	for (i = 0; i < vol->v_disks_count; i++) {
		bestsd = &vol->v_subdisks[i];
		worstsd = &vol->v_subdisks[i];
		for (j = 1; j < N; j++) {
			sd = &vol->v_subdisks[(i + j) % vol->v_disks_count];
			if (sd->sd_state > bestsd->sd_state)
				bestsd = sd;
			else if (sd->sd_state == bestsd->sd_state &&
			    (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			     sd->sd_state == G_RAID_SUBDISK_S_RESYNC) &&
			    sd->sd_rebuild_pos > bestsd->sd_rebuild_pos)
				bestsd = sd;
			if (sd->sd_state < worstsd->sd_state)
				worstsd = sd;
		}
		if (worstsd->sd_state == G_RAID_SUBDISK_S_ACTIVE)
			sstate = G_RAID_VOLUME_S_OPTIMAL;
		else if (worstsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_SUBOPTIMAL;
		else if (bestsd->sd_state >= G_RAID_SUBDISK_S_STALE)
			sstate = G_RAID_VOLUME_S_DEGRADED;
		else
			sstate = G_RAID_VOLUME_S_BROKEN;
		if (sstate < state)
			state = sstate;
	}
	return (state);
}

static int
g_raid_tr_update_state_raid1e(struct g_raid_volume *vol,
    struct g_raid_subdisk *sd)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	u_int s;

	sc = vol->v_softc;
	trs = (struct g_raid_tr_raid1e_object *)vol->v_tr;
	if (trs->trso_stopping &&
	    (trs->trso_flags & TR_RAID1E_F_DOING_SOME) == 0)
		s = G_RAID_VOLUME_S_STOPPED;
	else if (trs->trso_starting)
		s = G_RAID_VOLUME_S_STARTING;
	else {
		if ((vol->v_disks_count % N) == 0)
			s = g_raid_tr_update_state_raid1e_even(vol);
		else
			s = g_raid_tr_update_state_raid1e_odd(vol);
	}
	if (s != vol->v_state) {
		g_raid_event_send(vol, G_RAID_VOLUME_S_ALIVE(s) ?
		    G_RAID_VOLUME_E_UP : G_RAID_VOLUME_E_DOWN,
		    G_RAID_EVENT_VOLUME);
		g_raid_change_volume_state(vol, s);
		if (!trs->trso_starting && !trs->trso_stopping)
			g_raid_write_metadata(sc, vol, NULL, NULL);
	}
	if (!trs->trso_starting && !trs->trso_stopping)
		g_raid_tr_raid1e_maybe_rebuild(vol->v_tr, sd);
	return (0);
}

static void
g_raid_tr_raid1e_fail_disk(struct g_raid_softc *sc, struct g_raid_subdisk *sd,
    struct g_raid_disk *disk)
{
	struct g_raid_volume *vol;

	vol = sd->sd_volume;
	/*
	 * We don't fail the last disk in the pack, since it still has decent
	 * data on it and that's better than failing the disk if it is the root
	 * file system.
	 *
	 * XXX should this be controlled via a tunable?  It makes sense for
	 * the volume that has / on it.  I can't think of a case where we'd
	 * want the volume to go away on this kind of event.
	 */
	if ((g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_ACTIVE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
	     g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED) <
	     vol->v_disks_count) &&
	    (sd->sd_state >= G_RAID_SUBDISK_S_UNINITIALIZED))
		return;
	g_raid_fail_disk(sc, sd, disk);
}

static void
g_raid_tr_raid1e_rebuild_done(struct g_raid_tr_raid1e_object *trs)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;

	vol = trs->trso_base.tro_volume;
	sd = trs->trso_failed_sd;
	g_raid_write_metadata(vol->v_softc, vol, sd, sd->sd_disk);
	free(trs->trso_buffer, M_TR_RAID1E);
	trs->trso_buffer = NULL;
	trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
	trs->trso_type = TR_RAID1E_NONE;
	trs->trso_recover_slabs = 0;
	trs->trso_failed_sd = NULL;
	g_raid_tr_update_state_raid1e(vol, NULL);
}

static void
g_raid_tr_raid1e_rebuild_finish(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	G_RAID_DEBUG1(0, tr->tro_volume->v_softc,
	    "Subdisk %s:%d-%s rebuild completed.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
	g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_ACTIVE);
	sd->sd_rebuild_pos = 0;
	g_raid_tr_raid1e_rebuild_done(trs);
}

static void
g_raid_tr_raid1e_rebuild_abort(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	sd = trs->trso_failed_sd;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild is aborting.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags |= TR_RAID1E_F_ABORT;
	} else {
		G_RAID_DEBUG1(0, vol->v_softc,
		    "Subdisk %s:%d-%s rebuild aborted.",
		    sd->sd_volume->v_name, sd->sd_pos,
		    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]");
		trs->trso_flags &= ~TR_RAID1E_F_ABORT;
		if (trs->trso_flags & TR_RAID1E_F_LOCKED) {
			trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
			g_raid_unlock_range(tr->tro_volume,
			    trs->trso_lock_pos, trs->trso_lock_len);
		}
		g_raid_tr_raid1e_rebuild_done(trs);
	}
}

static void
g_raid_tr_raid1e_rebuild_some(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio *bp;
	off_t len, virtual, vend, offset, start;
	int disk, copy, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_flags & TR_RAID1E_F_DOING_SOME)
		return;
	vol = tr->tro_volume;
	sc = vol->v_softc;
	sd = trs->trso_failed_sd;

	while (1) {
		if (sd->sd_rebuild_pos >= sd->sd_size) {
			g_raid_tr_raid1e_rebuild_finish(tr);
			return;
		}
		/* Get virtual offset from physical rebuild position. */
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos, &virtual, &copy);
		/* Get physical offset back to get first stripe position. */
		V2P(vol, virtual, &disk, &offset, &start);
		/* Calculate contiguous data length. */
		len = MIN(g_raid1e_rebuild_slab,
		    sd->sd_size - sd->sd_rebuild_pos);
		if ((vol->v_disks_count % N) != 0)
			len = MIN(len, vol->v_strip_size - start);
		/* Find disk with most accurate data. */
		best = g_raid_tr_raid1e_select_read_disk(vol, disk,
		    offset + start, len, 0);
		if (best < 0) {
			/* There is no valid disk. */
			g_raid_tr_raid1e_rebuild_abort(tr);
			return;
		} else if (best != copy) {
			/* Some other disk has better data. */
			break;
		}
		/* We have the most accurate data. Skip the range. */
		G_RAID_DEBUG1(3, sc, "Skipping rebuild for range %ju - %ju",
		    sd->sd_rebuild_pos, sd->sd_rebuild_pos + len);
		sd->sd_rebuild_pos += len;
	}

	bp = &trs->trso_bio;
	memset(bp, 0, sizeof(*bp));
	bp->bio_offset = offset + start +
	    ((disk + best >= vol->v_disks_count) ? vol->v_strip_size : 0);
	bp->bio_length = len;
	bp->bio_data = trs->trso_buffer;
	bp->bio_cmd = BIO_READ;
	bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
	bp->bio_caller1 = &vol->v_subdisks[(disk + best) % vol->v_disks_count];
	G_RAID_LOGREQ(3, bp, "Queueing rebuild read");
	/*
	 * If we are crossing a stripe boundary, correct the affected virtual
	 * range we should lock.
	 */
	if (start + len > vol->v_strip_size) {
		P2V(vol, sd->sd_pos, sd->sd_rebuild_pos + len, &vend, &copy);
		len = vend - virtual;
	}
	trs->trso_flags |= TR_RAID1E_F_DOING_SOME;
	trs->trso_flags |= TR_RAID1E_F_LOCKED;
	trs->trso_lock_pos = virtual;
	trs->trso_lock_len = len;
	/* Lock callback starts I/O */
	g_raid_lock_range(sd->sd_volume, virtual, len, NULL, bp);
}
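
/*
 * Note on the rebuild I/O flow sketched above: g_raid_tr_raid1e_rebuild_some()
 * locks the affected virtual range, and the lock callback
 * (g_raid_tr_locked_raid1e() below) issues the prepared read.  On completion,
 * g_raid_tr_iodone_raid1e() reuses the same bio as a write to the rebuilding
 * subdisk, advances sd_rebuild_pos, and schedules the next iteration.
 */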

static void
g_raid_tr_raid1e_rebuild_start(struct g_raid_tr_object *tr)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_subdisk *sd;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_failed_sd) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "Already rebuilding in start rebuild. pos %jd\n",
		    (intmax_t)trs->trso_failed_sd->sd_rebuild_pos);
		return;
	}
	sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_RESYNC);
	if (sd == NULL)
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_REBUILD);
	if (sd == NULL) {
		sd = g_raid_get_subdisk(vol, G_RAID_SUBDISK_S_STALE);
		if (sd != NULL) {
			sd->sd_rebuild_pos = 0;
			g_raid_change_subdisk_state(sd,
			    G_RAID_SUBDISK_S_RESYNC);
			g_raid_write_metadata(vol->v_softc, vol, sd, NULL);
		} else {
			sd = g_raid_get_subdisk(vol,
			    G_RAID_SUBDISK_S_UNINITIALIZED);
			if (sd == NULL)
				sd = g_raid_get_subdisk(vol,
				    G_RAID_SUBDISK_S_NEW);
			if (sd != NULL) {
				sd->sd_rebuild_pos = 0;
				g_raid_change_subdisk_state(sd,
				    G_RAID_SUBDISK_S_REBUILD);
				g_raid_write_metadata(vol->v_softc,
				    vol, sd, NULL);
			}
		}
	}
	if (sd == NULL) {
		G_RAID_DEBUG1(1, vol->v_softc,
		    "No failed disk to rebuild. night night.");
		return;
	}
	trs->trso_failed_sd = sd;
	G_RAID_DEBUG1(0, vol->v_softc,
	    "Subdisk %s:%d-%s rebuild start at %jd.",
	    sd->sd_volume->v_name, sd->sd_pos,
	    sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]",
	    trs->trso_failed_sd->sd_rebuild_pos);
	trs->trso_type = TR_RAID1E_REBUILD;
	trs->trso_buffer = malloc(g_raid1e_rebuild_slab, M_TR_RAID1E, M_WAITOK);
	trs->trso_meta_update = g_raid1e_rebuild_meta_update;
	g_raid_tr_raid1e_rebuild_some(tr);
}

static void
g_raid_tr_raid1e_maybe_rebuild(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;
	int nr;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (trs->trso_stopping)
		return;
	nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_REBUILD) +
	    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_RESYNC);
	switch (trs->trso_type) {
	case TR_RAID1E_NONE:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED)
			return;
		if (nr == 0) {
			nr = g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_NEW) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_STALE) +
			    g_raid_nsubdisks(vol, G_RAID_SUBDISK_S_UNINITIALIZED);
			if (nr == 0)
				return;
		}
		g_raid_tr_raid1e_rebuild_start(tr);
		break;
	case TR_RAID1E_REBUILD:
		if (vol->v_state < G_RAID_VOLUME_S_DEGRADED || nr == 0 ||
		    trs->trso_failed_sd == sd)
			g_raid_tr_raid1e_rebuild_abort(tr);
		break;
	case TR_RAID1E_RESYNC:
		break;
	}
}

static int
g_raid_tr_event_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, u_int event)
{

	g_raid_tr_update_state_raid1e(tr->tro_volume, sd);
	return (0);
}

static int
g_raid_tr_start_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

static int
g_raid_tr_stop_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	trs->trso_starting = 0;
	trs->trso_stopping = 1;
	g_raid_tr_update_state_raid1e(vol, NULL);
	return (0);
}

/*
 * Select the disk to read from.  Take into account: subdisk state, running
 * error recovery, average disk load, head position and possible cache hits.
 */
#define ABS(x)		(((x) >= 0) ? (x) : (-(x)))
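/*
 * The priority computed below treats lower values as better: the base is the
 * subdisk's current load; stale copies (and resyncing copies past their
 * resync position) are penalized by (i << 24), copies on disks busy with
 * error recovery by (sd_recovery << 16, capped at 255), while a disk whose
 * head is already at (or within a track of) the target offset earns a
 * load-scale discount.
 */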
static int
g_raid_tr_raid1e_select_read_disk(struct g_raid_volume *vol,
    int no, off_t off, off_t len, u_int mask)
{
	struct g_raid_subdisk *sd;
	off_t offset;
	int i, best, prio, bestprio;

	best = -1;
	bestprio = INT_MAX;
	for (i = 0; i < N; i++) {
		sd = &vol->v_subdisks[(no + i) % vol->v_disks_count];
		offset = off;
		if (no + i >= vol->v_disks_count)
			offset += vol->v_strip_size;

		prio = G_RAID_SUBDISK_LOAD(sd);
		if ((mask & (1 << sd->sd_pos)) != 0)
			continue;
		switch (sd->sd_state) {
		case G_RAID_SUBDISK_S_ACTIVE:
			break;
		case G_RAID_SUBDISK_S_RESYNC:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		case G_RAID_SUBDISK_S_STALE:
			prio += i << 24;
			break;
		case G_RAID_SUBDISK_S_REBUILD:
			if (offset + off < sd->sd_rebuild_pos)
				break;
			/* FALLTHROUGH */
		default:
			continue;
		}
		prio += min(sd->sd_recovery, 255) << 16;
		/* If disk head is precisely in position - highly prefer it. */
		if (G_RAID_SUBDISK_POS(sd) == offset)
			prio -= 2 * G_RAID_SUBDISK_LOAD_SCALE;
		else
		/* If disk head is close to position - prefer it. */
		if (ABS(G_RAID_SUBDISK_POS(sd) - offset) <
		    G_RAID_SUBDISK_TRACK_SIZE)
			prio -= 1 * G_RAID_SUBDISK_LOAD_SCALE;
		if (prio < bestprio) {
			bestprio = prio;
			best = i;
		}
	}
	return (best);
}

static void
g_raid_tr_iostart_raid1e_read(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int best;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    no, offset, length, 0);
		KASSERT(best >= 0, ("No readable disk in volume %s!",
		    vol->v_name));
		no += best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		cbp = g_clone_bio(bp);
		if (cbp == NULL)
			goto failure;
		cbp->bio_offset = offset + start;
		cbp->bio_length = length;
		if ((bp->bio_flags & BIO_UNMAPPED) != 0) {
			cbp->bio_ma_offset += (uintptr_t)addr;
			cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
			cbp->bio_ma_offset %= PAGE_SIZE;
			cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
			    cbp->bio_length) / PAGE_SIZE;
		} else
			cbp->bio_data = addr;
		cbp->bio_caller1 = &vol->v_subdisks[no];
		bioq_insert_tail(&queue, cbp);
		no += N - best;
		if (no >= vol->v_disks_count) {
			no -= vol->v_disks_count;
			offset += strip_size;
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e_write(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	struct bio *cbp;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i;

	vol = tr->tro_volume;
	if ((bp->bio_flags & BIO_UNMAPPED) != 0)
		addr = NULL;
	else
		addr = bp->bio_data;
	strip_size = vol->v_strip_size;
	V2P(vol, bp->bio_offset, &no, &offset, &start);
	remain = bp->bio_length;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			cbp = g_clone_bio(bp);
			if (cbp == NULL)
				goto failure;
			cbp->bio_offset = offset + start;
			cbp->bio_length = length;
			if ((bp->bio_flags & BIO_UNMAPPED) != 0 &&
			    bp->bio_cmd != BIO_DELETE) {
				cbp->bio_ma_offset += (uintptr_t)addr;
				cbp->bio_ma += cbp->bio_ma_offset / PAGE_SIZE;
				cbp->bio_ma_offset %= PAGE_SIZE;
				cbp->bio_ma_n = round_page(cbp->bio_ma_offset +
				    cbp->bio_length) / PAGE_SIZE;
			} else
				cbp->bio_data = addr;
			cbp->bio_caller1 = sd;
			bioq_insert_tail(&queue, cbp);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		if (bp->bio_cmd != BIO_DELETE)
			addr += length;
		start = 0;
	}
	while ((cbp = bioq_takefirst(&queue)) != NULL) {
		sd = cbp->bio_caller1;
		cbp->bio_caller1 = NULL;
		g_raid_subdisk_iostart(sd, cbp);
	}
	return;
failure:
	while ((cbp = bioq_takefirst(&queue)) != NULL)
		g_destroy_bio(cbp);
	if (bp->bio_error == 0)
		bp->bio_error = ENOMEM;
	g_raid_iodone(bp, bp->bio_error);
}

static void
g_raid_tr_iostart_raid1e(struct g_raid_tr_object *tr, struct bio *bp)
{
	struct g_raid_volume *vol;
	struct g_raid_tr_raid1e_object *trs;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	if (vol->v_state != G_RAID_VOLUME_S_OPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_SUBOPTIMAL &&
	    vol->v_state != G_RAID_VOLUME_S_DEGRADED) {
		g_raid_iodone(bp, EIO);
		return;
	}
	/*
	 * If we're rebuilding, squeeze in rebuild activity every so often,
	 * even when the disk is busy.  Be sure to only count real I/O
	 * to the disk.  All 'SPECIAL' I/O is traffic generated to the disk
	 * by this module.
	 */
	if (trs->trso_failed_sd != NULL &&
	    !(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL)) {
		/* Make this new or already running rebuild round short. */
		trs->trso_recover_slabs = 0;
		if (--trs->trso_fair_io <= 0) {
			trs->trso_fair_io = g_raid1e_rebuild_fair_io;
			g_raid_tr_raid1e_rebuild_some(tr);
		}
	}
	switch (bp->bio_cmd) {
	case BIO_READ:
		g_raid_tr_iostart_raid1e_read(tr, bp);
		break;
	case BIO_WRITE:
	case BIO_DELETE:
		g_raid_tr_iostart_raid1e_write(tr, bp);
		break;
	case BIO_SPEEDUP:
	case BIO_FLUSH:
		g_raid_tr_flush_common(tr, bp);
		break;
	default:
		KASSERT(1 == 0, ("Invalid command here: %u (volume=%s)",
		    bp->bio_cmd, vol->v_name));
		break;
	}
}

static void
g_raid_tr_iodone_raid1e(struct g_raid_tr_object *tr,
    struct g_raid_subdisk *sd, struct bio *bp)
{
	struct bio *cbp;
	struct g_raid_subdisk *nsd;
	struct g_raid_volume *vol;
	struct bio *pbp;
	struct g_raid_tr_raid1e_object *trs;
	off_t virtual, offset, start;
	uintptr_t mask;
	int error, do_write, copy, disk, best;

	trs = (struct g_raid_tr_raid1e_object *)tr;
	vol = tr->tro_volume;
	if (bp->bio_cflags & G_RAID_BIO_FLAG_SYNC) {
		if (trs->trso_type == TR_RAID1E_REBUILD) {
			nsd = trs->trso_failed_sd;
			if (bp->bio_cmd == BIO_READ) {
				/* Immediately abort rebuild, if requested. */
				if (trs->trso_flags & TR_RAID1E_F_ABORT) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				/* On read error, skip and cross fingers. */
				if (bp->bio_error != 0) {
					G_RAID_LOGREQ(0, bp,
					    "Read error during rebuild (%d), "
					    "possible data loss!",
					    bp->bio_error);
					goto rebuild_round_done;
				}

				/*
				 * The read operation finished, queue the
				 * write and get out.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild read done: %d",
				    bp->bio_error);
				bp->bio_cmd = BIO_WRITE;
				bp->bio_cflags = G_RAID_BIO_FLAG_SYNC;
				bp->bio_offset = nsd->sd_rebuild_pos;
				G_RAID_LOGREQ(3, bp, "Queueing rebuild write.");
				g_raid_subdisk_iostart(nsd, bp);
			} else {
				/*
				 * The write operation just finished.  Do
				 * another.  We keep cloning the master bio
				 * since it has the right buffers allocated to
				 * it.
				 */
				G_RAID_LOGREQ(3, bp, "Rebuild write done: %d",
				    bp->bio_error);
				if (bp->bio_error != 0 ||
				    trs->trso_flags & TR_RAID1E_F_ABORT) {
					if ((trs->trso_flags &
					    TR_RAID1E_F_ABORT) == 0) {
						g_raid_tr_raid1e_fail_disk(sd->sd_softc,
						    nsd, nsd->sd_disk);
					}
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}
rebuild_round_done:
				trs->trso_flags &= ~TR_RAID1E_F_LOCKED;
				g_raid_unlock_range(tr->tro_volume,
				    trs->trso_lock_pos, trs->trso_lock_len);
				nsd->sd_rebuild_pos += bp->bio_length;
				if (nsd->sd_rebuild_pos >= nsd->sd_size) {
					g_raid_tr_raid1e_rebuild_finish(tr);
					return;
				}

				/* Abort rebuild if we are stopping */
				if (trs->trso_stopping) {
					trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
					g_raid_tr_raid1e_rebuild_abort(tr);
					return;
				}

				if (--trs->trso_meta_update <= 0) {
					g_raid_write_metadata(vol->v_softc,
					    vol, nsd, nsd->sd_disk);
					trs->trso_meta_update =
					    g_raid1e_rebuild_meta_update;
					/* Compensate short rebuild I/Os. */
					if ((vol->v_disks_count % N) != 0 &&
					    vol->v_strip_size <
					     g_raid1e_rebuild_slab) {
						trs->trso_meta_update *=
						    g_raid1e_rebuild_slab;
						trs->trso_meta_update /=
						    vol->v_strip_size;
					}
				}
				trs->trso_flags &= ~TR_RAID1E_F_DOING_SOME;
				if (--trs->trso_recover_slabs <= 0)
					return;
				/* Run next rebuild iteration. */
				g_raid_tr_raid1e_rebuild_some(tr);
			}
		} else if (trs->trso_type == TR_RAID1E_RESYNC) {
			/*
			 * Read the good sd and the bad sd in parallel.  When
			 * both are done, compare the buffers and write the
			 * good data to the bad sd if they differ.  Then do
			 * the next bit of work.
			 */
			panic("Somehow, we think we're doing a resync");
		}
		return;
	}
	pbp = bp->bio_parent;
	pbp->bio_inbed++;
	mask = (intptr_t)bp->bio_caller2;
	if (bp->bio_cmd == BIO_READ && bp->bio_error != 0) {
		/*
		 * Read failed on first drive.  Retry the read error on
		 * another disk drive, if available, before erroring out the
		 * read.
		 */
		sd->sd_disk->d_read_errs++;
		G_RAID_LOGREQ(0, bp,
		    "Read error (%d), %d read errors total",
		    bp->bio_error, sd->sd_disk->d_read_errs);

		/*
		 * If there are too many read errors, we move to degraded.
		 * XXX Do we want to FAIL the drive (eg, make the user redo
		 * everything to get it back in sync), or just degrade the
		 * drive, which kicks off a resync?
		 */
		do_write = 0;
		if (sd->sd_disk->d_read_errs > g_raid_read_err_thresh)
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		else if (mask == 0)
			do_write = 1;

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find the other disk, and try to do the I/O to it. */
		mask |= 1 << copy;
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_length = bp->bio_length;
			cbp->bio_data = bp->bio_data;
			cbp->bio_ma = bp->bio_ma;
			cbp->bio_ma_offset = bp->bio_ma_offset;
			cbp->bio_ma_n = bp->bio_ma_n;
			g_destroy_bio(bp);
			nsd = &vol->v_subdisks[disk];
			G_RAID_LOGREQ(2, cbp, "Retrying read from %d",
			    nsd->sd_pos);
			if (do_write)
				mask |= 1 << 31;
			if ((mask & (1U << 31)) != 0)
				sd->sd_recovery++;
			cbp->bio_caller2 = (void *)mask;
			if (do_write) {
				cbp->bio_caller1 = nsd;
				/* Lock callback starts I/O */
				g_raid_lock_range(sd->sd_volume,
				    virtual, cbp->bio_length, pbp, cbp);
			} else {
				g_raid_subdisk_iostart(nsd, cbp);
			}
			return;
		}
		/*
		 * We can't retry.  Return the original error by falling
		 * through.  This will happen when there's only one good disk.
		 * We don't need to fail the raid, since its actual state is
		 * based on the state of the subdisks.
		 */
		G_RAID_LOGREQ(2, bp, "Couldn't retry read, failing it");
	}
	if (bp->bio_cmd == BIO_READ &&
	    bp->bio_error == 0 &&
	    (mask & (1U << 31)) != 0) {
		G_RAID_LOGREQ(3, bp, "Recovered data from other drive");

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		/* Find best disk to write. */
		best = g_raid_tr_raid1e_select_read_disk(vol,
		    disk, offset, start, ~mask);
		if (best >= 0 && (cbp = g_clone_bio(pbp)) != NULL) {
			disk += best;
			if (disk >= vol->v_disks_count) {
				disk -= vol->v_disks_count;
				offset += vol->v_strip_size;
			}
			cbp->bio_offset = offset + start;
			cbp->bio_cmd = BIO_WRITE;
			cbp->bio_cflags = G_RAID_BIO_FLAG_REMAP;
			cbp->bio_caller2 = (void *)mask;
			g_destroy_bio(bp);
			G_RAID_LOGREQ(2, cbp,
			    "Attempting bad sector remap on failing drive.");
			g_raid_subdisk_iostart(&vol->v_subdisks[disk], cbp);
			return;
		}
	}
	if ((mask & (1U << 31)) != 0) {
		/*
		 * We're done with a recovery, mark the range as unlocked.
		 * For any write errors, we aggressively fail the disk since
		 * there was both a READ and a WRITE error at this location.
		 * Both types of errors generally indicate the drive is on
		 * the verge of total failure anyway.  Better to stop trusting
		 * it now.  However, we need to reset error to 0 in that case
		 * because we're not failing the original I/O which succeeded.
		 */

		/* Restore what we were doing. */
		P2V(vol, sd->sd_pos, bp->bio_offset, &virtual, &copy);
		V2P(vol, virtual, &disk, &offset, &start);

		for (copy = 0; copy < N; copy++) {
			if ((mask & (1 << copy)) != 0)
				vol->v_subdisks[(disk + copy) %
				    vol->v_disks_count].sd_recovery--;
		}

		if (bp->bio_cmd == BIO_WRITE && bp->bio_error) {
			G_RAID_LOGREQ(0, bp, "Remap write failed: "
			    "failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
			bp->bio_error = 0;
		}
		G_RAID_LOGREQ(2, bp, "REMAP done %d.", bp->bio_error);
		g_raid_unlock_range(sd->sd_volume, virtual, bp->bio_length);
	}
	if (pbp->bio_cmd != BIO_READ) {
		if (pbp->bio_inbed == 1 || pbp->bio_error != 0)
			pbp->bio_error = bp->bio_error;
		if (pbp->bio_cmd == BIO_WRITE && bp->bio_error != 0) {
			G_RAID_LOGREQ(0, bp, "Write failed: failing subdisk.");
			g_raid_tr_raid1e_fail_disk(sd->sd_softc, sd, sd->sd_disk);
		}
		error = pbp->bio_error;
	} else
		error = bp->bio_error;
	g_destroy_bio(bp);
	if (pbp->bio_children == pbp->bio_inbed) {
		pbp->bio_completed = pbp->bio_length;
		g_raid_iodone(pbp, error);
	}
}

static int
g_raid_tr_kerneldump_raid1e(struct g_raid_tr_object *tr, void *virtual,
    off_t boffset, size_t blength)
{
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct bio_queue_head queue;
	char *addr;
	off_t offset, start, length, remain;
	u_int no, strip_size;
	int i, error;

	vol = tr->tro_volume;
	addr = virtual;
	strip_size = vol->v_strip_size;
	V2P(vol, boffset, &no, &offset, &start);
	remain = blength;
	bioq_init(&queue);
	while (remain > 0) {
		length = MIN(strip_size - start, remain);
		for (i = 0; i < N; i++) {
			sd = &vol->v_subdisks[no];
			switch (sd->sd_state) {
			case G_RAID_SUBDISK_S_ACTIVE:
			case G_RAID_SUBDISK_S_STALE:
			case G_RAID_SUBDISK_S_RESYNC:
				break;
			case G_RAID_SUBDISK_S_REBUILD:
				if (offset + start >= sd->sd_rebuild_pos)
					goto nextdisk;
				break;
			default:
				goto nextdisk;
			}
			error = g_raid_subdisk_kerneldump(sd, addr,
			    offset + start, length);
			if (error != 0)
				return (error);
nextdisk:
			if (++no >= vol->v_disks_count) {
				no = 0;
				offset += strip_size;
			}
		}
		remain -= length;
		addr += length;
		start = 0;
	}
	return (0);
}

static int
g_raid_tr_locked_raid1e(struct g_raid_tr_object *tr, void *argp)
{
	struct bio *bp;
	struct g_raid_subdisk *sd;

	bp = (struct bio *)argp;
	sd = (struct g_raid_subdisk *)bp->bio_caller1;
	g_raid_subdisk_iostart(sd, bp);

	return (0);
}

static int
g_raid_tr_idle_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;
	struct g_raid_volume *vol;

	vol = tr->tro_volume;
	trs = (struct g_raid_tr_raid1e_object *)tr;
	trs->trso_fair_io = g_raid1e_rebuild_fair_io;
	trs->trso_recover_slabs = g_raid1e_rebuild_cluster_idle;
	/* Compensate short rebuild I/Os. */
	if ((vol->v_disks_count % N) != 0 &&
	    vol->v_strip_size < g_raid1e_rebuild_slab) {
		trs->trso_recover_slabs *= g_raid1e_rebuild_slab;
		trs->trso_recover_slabs /= vol->v_strip_size;
	}
	if (trs->trso_type == TR_RAID1E_REBUILD)
		g_raid_tr_raid1e_rebuild_some(tr);
	return (0);
}

static int
g_raid_tr_free_raid1e(struct g_raid_tr_object *tr)
{
	struct g_raid_tr_raid1e_object *trs;

	trs = (struct g_raid_tr_raid1e_object *)tr;

	if (trs->trso_buffer != NULL) {
		free(trs->trso_buffer, M_TR_RAID1E);
		trs->trso_buffer = NULL;
	}
	return (0);
}

G_RAID_TR_DECLARE(raid1e, "RAID1E");