1 /*- 2 * Copyright (c) 2004-2006 Pawel Jakub Dawidek <pjd@FreeBSD.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/kernel.h> 33 #include <sys/module.h> 34 #include <sys/limits.h> 35 #include <sys/lock.h> 36 #include <sys/mutex.h> 37 #include <sys/bio.h> 38 #include <sys/sysctl.h> 39 #include <sys/malloc.h> 40 #include <sys/eventhandler.h> 41 #include <vm/uma.h> 42 #include <geom/geom.h> 43 #include <sys/proc.h> 44 #include <sys/kthread.h> 45 #include <sys/sched.h> 46 #include <geom/raid3/g_raid3.h> 47 48 49 static MALLOC_DEFINE(M_RAID3, "raid3_data", "GEOM_RAID3 Data"); 50 51 SYSCTL_DECL(_kern_geom); 52 SYSCTL_NODE(_kern_geom, OID_AUTO, raid3, CTLFLAG_RW, 0, "GEOM_RAID3 stuff"); 53 u_int g_raid3_debug = 0; 54 TUNABLE_INT("kern.geom.raid3.debug", &g_raid3_debug); 55 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, debug, CTLFLAG_RW, &g_raid3_debug, 0, 56 "Debug level"); 57 static u_int g_raid3_timeout = 4; 58 TUNABLE_INT("kern.geom.raid3.timeout", &g_raid3_timeout); 59 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, timeout, CTLFLAG_RW, &g_raid3_timeout, 60 0, "Time to wait on all raid3 components"); 61 static u_int g_raid3_idletime = 5; 62 TUNABLE_INT("kern.geom.raid3.idletime", &g_raid3_idletime); 63 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, idletime, CTLFLAG_RW, 64 &g_raid3_idletime, 0, "Mark components as clean when idling"); 65 static u_int g_raid3_disconnect_on_failure = 1; 66 TUNABLE_INT("kern.geom.raid3.disconnect_on_failure", 67 &g_raid3_disconnect_on_failure); 68 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, disconnect_on_failure, CTLFLAG_RW, 69 &g_raid3_disconnect_on_failure, 0, "Disconnect component on I/O failure."); 70 static u_int g_raid3_syncreqs = 2; 71 TUNABLE_INT("kern.geom.raid3.sync_requests", &g_raid3_syncreqs); 72 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, sync_requests, CTLFLAG_RDTUN, 73 &g_raid3_syncreqs, 0, "Parallel synchronization I/O requests."); 74 75 static u_int g_raid3_n64k = 50; 76 TUNABLE_INT("kern.geom.raid3.n64k", &g_raid3_n64k); 77 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n64k, CTLFLAG_RD, &g_raid3_n64k, 0, 78 "Maximum number of 64kB allocations"); 79 static u_int g_raid3_n16k = 200; 80 TUNABLE_INT("kern.geom.raid3.n16k", &g_raid3_n16k); 81 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n16k, CTLFLAG_RD, &g_raid3_n16k, 0, 82 "Maximum number of 16kB allocations"); 83 static u_int g_raid3_n4k = 1200; 84 TUNABLE_INT("kern.geom.raid3.n4k", &g_raid3_n4k); 85 SYSCTL_UINT(_kern_geom_raid3, OID_AUTO, n4k, CTLFLAG_RD, &g_raid3_n4k, 0, 86 "Maximum number of 4kB allocations"); 87 88 SYSCTL_NODE(_kern_geom_raid3, OID_AUTO, stat, CTLFLAG_RW, 0, 89 "GEOM_RAID3 statistics"); 90 static u_int g_raid3_parity_mismatch = 0; 91 SYSCTL_UINT(_kern_geom_raid3_stat, OID_AUTO, parity_mismatch, CTLFLAG_RD, 92 &g_raid3_parity_mismatch, 0, "Number of failures in VERIFY mode"); 93 94 #define MSLEEP(ident, mtx, priority, wmesg, timeout) do { \ 95 G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ 96 msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ 97 G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ 98 } while (0) 99 100 static eventhandler_tag g_raid3_pre_sync = NULL, g_raid3_post_sync = NULL; 101 102 static int g_raid3_destroy_geom(struct gctl_req *req, struct g_class *mp, 103 struct g_geom *gp); 104 static g_taste_t g_raid3_taste; 105 static void g_raid3_init(struct g_class *mp); 106 static void g_raid3_fini(struct g_class *mp); 107 108 struct g_class g_raid3_class = { 109 .name = G_RAID3_CLASS_NAME, 110 .version = G_VERSION, 111 .ctlreq = g_raid3_config, 112 .taste = g_raid3_taste, 113 .destroy_geom = g_raid3_destroy_geom, 114 .init = g_raid3_init, 115 .fini = g_raid3_fini 116 }; 117 118 119 static void g_raid3_destroy_provider(struct g_raid3_softc *sc); 120 static int g_raid3_update_disk(struct g_raid3_disk *disk, u_int state); 121 static void g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force); 122 static void g_raid3_dumpconf(struct sbuf *sb, const char *indent, 123 struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 124 static void g_raid3_sync_stop(struct g_raid3_softc *sc, int type); 125 static int g_raid3_register_request(struct bio *pbp); 126 static void g_raid3_sync_release(struct g_raid3_softc *sc); 127 128 129 static const char * 130 g_raid3_disk_state2str(int state) 131 { 132 133 switch (state) { 134 case G_RAID3_DISK_STATE_NODISK: 135 return ("NODISK"); 136 case G_RAID3_DISK_STATE_NONE: 137 return ("NONE"); 138 case G_RAID3_DISK_STATE_NEW: 139 return ("NEW"); 140 case G_RAID3_DISK_STATE_ACTIVE: 141 return ("ACTIVE"); 142 case G_RAID3_DISK_STATE_STALE: 143 return ("STALE"); 144 case G_RAID3_DISK_STATE_SYNCHRONIZING: 145 return ("SYNCHRONIZING"); 146 case G_RAID3_DISK_STATE_DISCONNECTED: 147 return ("DISCONNECTED"); 148 default: 149 return ("INVALID"); 150 } 151 } 152 153 static const char * 154 g_raid3_device_state2str(int state) 155 { 156 157 switch (state) { 158 case G_RAID3_DEVICE_STATE_STARTING: 159 return ("STARTING"); 160 case G_RAID3_DEVICE_STATE_DEGRADED: 161 return ("DEGRADED"); 162 case G_RAID3_DEVICE_STATE_COMPLETE: 163 return ("COMPLETE"); 164 default: 165 return ("INVALID"); 166 } 167 } 168 169 const char * 170 g_raid3_get_diskname(struct g_raid3_disk *disk) 171 { 172 173 if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) 174 return ("[unknown]"); 175 return (disk->d_name); 176 } 177 178 static int 179 g_raid3_uma_ctor(void *mem, int size, void *arg, int flags) 180 { 181 struct g_raid3_zone *sz = arg; 182 183 if (sz->sz_inuse == sz->sz_max) 184 return (ENOMEM); 185 sz->sz_inuse++; 186 return (0); 187 } 188 189 static void 190 g_raid3_uma_dtor(void *mem, int size, void *arg) 191 { 192 struct g_raid3_zone *sz = arg; 193 194 sz->sz_inuse--; 195 } 196 197 #define g_raid3_xor(src1, src2, dst, size) \ 198 _g_raid3_xor((uint64_t *)(src1), (uint64_t *)(src2), \ 199 (uint64_t *)(dst), (size_t)size) 200 static void 201 _g_raid3_xor(uint64_t *src1, uint64_t *src2, uint64_t *dst, size_t size) 202 { 203 204 KASSERT((size % 128) == 0, ("Invalid size: %zu.", size)); 205 for (; size > 0; size -= 128) { 206 *dst++ = (*src1++) ^ (*src2++); 207 *dst++ = (*src1++) ^ (*src2++); 208 *dst++ = (*src1++) ^ (*src2++); 209 *dst++ = (*src1++) ^ (*src2++); 210 *dst++ = (*src1++) ^ (*src2++); 211 *dst++ = (*src1++) ^ (*src2++); 212 *dst++ = (*src1++) ^ (*src2++); 213 *dst++ = (*src1++) ^ (*src2++); 214 *dst++ = (*src1++) ^ (*src2++); 215 *dst++ = (*src1++) ^ (*src2++); 216 *dst++ = (*src1++) ^ (*src2++); 217 *dst++ = (*src1++) ^ (*src2++); 218 *dst++ = (*src1++) ^ (*src2++); 219 *dst++ = (*src1++) ^ (*src2++); 220 *dst++ = (*src1++) ^ (*src2++); 221 *dst++ = (*src1++) ^ (*src2++); 222 } 223 } 224 225 static int 226 g_raid3_is_zero(struct bio *bp) 227 { 228 static const uint64_t zeros[] = { 229 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 230 }; 231 u_char *addr; 232 ssize_t size; 233 234 size = bp->bio_length; 235 addr = (u_char *)bp->bio_data; 236 for (; size > 0; size -= sizeof(zeros), addr += sizeof(zeros)) { 237 if (bcmp(addr, zeros, sizeof(zeros)) != 0) 238 return (0); 239 } 240 return (1); 241 } 242 243 /* 244 * --- Events handling functions --- 245 * Events in geom_raid3 are used to maintain disks and device status 246 * from one thread to simplify locking. 247 */ 248 static void 249 g_raid3_event_free(struct g_raid3_event *ep) 250 { 251 252 free(ep, M_RAID3); 253 } 254 255 int 256 g_raid3_event_send(void *arg, int state, int flags) 257 { 258 struct g_raid3_softc *sc; 259 struct g_raid3_disk *disk; 260 struct g_raid3_event *ep; 261 int error; 262 263 ep = malloc(sizeof(*ep), M_RAID3, M_WAITOK); 264 G_RAID3_DEBUG(4, "%s: Sending event %p.", __func__, ep); 265 if ((flags & G_RAID3_EVENT_DEVICE) != 0) { 266 disk = NULL; 267 sc = arg; 268 } else { 269 disk = arg; 270 sc = disk->d_softc; 271 } 272 ep->e_disk = disk; 273 ep->e_state = state; 274 ep->e_flags = flags; 275 ep->e_error = 0; 276 mtx_lock(&sc->sc_events_mtx); 277 TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); 278 mtx_unlock(&sc->sc_events_mtx); 279 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); 280 mtx_lock(&sc->sc_queue_mtx); 281 wakeup(sc); 282 wakeup(&sc->sc_queue); 283 mtx_unlock(&sc->sc_queue_mtx); 284 if ((flags & G_RAID3_EVENT_DONTWAIT) != 0) 285 return (0); 286 sx_assert(&sc->sc_lock, SX_XLOCKED); 287 G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, ep); 288 sx_xunlock(&sc->sc_lock); 289 while ((ep->e_flags & G_RAID3_EVENT_DONE) == 0) { 290 mtx_lock(&sc->sc_events_mtx); 291 MSLEEP(ep, &sc->sc_events_mtx, PRIBIO | PDROP, "r3:event", 292 hz * 5); 293 } 294 error = ep->e_error; 295 g_raid3_event_free(ep); 296 sx_xlock(&sc->sc_lock); 297 return (error); 298 } 299 300 static struct g_raid3_event * 301 g_raid3_event_get(struct g_raid3_softc *sc) 302 { 303 struct g_raid3_event *ep; 304 305 mtx_lock(&sc->sc_events_mtx); 306 ep = TAILQ_FIRST(&sc->sc_events); 307 mtx_unlock(&sc->sc_events_mtx); 308 return (ep); 309 } 310 311 static void 312 g_raid3_event_remove(struct g_raid3_softc *sc, struct g_raid3_event *ep) 313 { 314 315 mtx_lock(&sc->sc_events_mtx); 316 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 317 mtx_unlock(&sc->sc_events_mtx); 318 } 319 320 static void 321 g_raid3_event_cancel(struct g_raid3_disk *disk) 322 { 323 struct g_raid3_softc *sc; 324 struct g_raid3_event *ep, *tmpep; 325 326 sc = disk->d_softc; 327 sx_assert(&sc->sc_lock, SX_XLOCKED); 328 329 mtx_lock(&sc->sc_events_mtx); 330 TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { 331 if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) 332 continue; 333 if (ep->e_disk != disk) 334 continue; 335 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 336 if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) 337 g_raid3_event_free(ep); 338 else { 339 ep->e_error = ECANCELED; 340 wakeup(ep); 341 } 342 } 343 mtx_unlock(&sc->sc_events_mtx); 344 } 345 346 /* 347 * Return the number of disks in the given state. 348 * If state is equal to -1, count all connected disks. 349 */ 350 u_int 351 g_raid3_ndisks(struct g_raid3_softc *sc, int state) 352 { 353 struct g_raid3_disk *disk; 354 u_int n, ndisks; 355 356 sx_assert(&sc->sc_lock, SX_LOCKED); 357 358 for (n = ndisks = 0; n < sc->sc_ndisks; n++) { 359 disk = &sc->sc_disks[n]; 360 if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 361 continue; 362 if (state == -1 || disk->d_state == state) 363 ndisks++; 364 } 365 return (ndisks); 366 } 367 368 static u_int 369 g_raid3_nrequests(struct g_raid3_softc *sc, struct g_consumer *cp) 370 { 371 struct bio *bp; 372 u_int nreqs = 0; 373 374 mtx_lock(&sc->sc_queue_mtx); 375 TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { 376 if (bp->bio_from == cp) 377 nreqs++; 378 } 379 mtx_unlock(&sc->sc_queue_mtx); 380 return (nreqs); 381 } 382 383 static int 384 g_raid3_is_busy(struct g_raid3_softc *sc, struct g_consumer *cp) 385 { 386 387 if (cp->index > 0) { 388 G_RAID3_DEBUG(2, 389 "I/O requests for %s exist, can't destroy it now.", 390 cp->provider->name); 391 return (1); 392 } 393 if (g_raid3_nrequests(sc, cp) > 0) { 394 G_RAID3_DEBUG(2, 395 "I/O requests for %s in queue, can't destroy it now.", 396 cp->provider->name); 397 return (1); 398 } 399 return (0); 400 } 401 402 static void 403 g_raid3_destroy_consumer(void *arg, int flags __unused) 404 { 405 struct g_consumer *cp; 406 407 g_topology_assert(); 408 409 cp = arg; 410 G_RAID3_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); 411 g_detach(cp); 412 g_destroy_consumer(cp); 413 } 414 415 static void 416 g_raid3_kill_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) 417 { 418 struct g_provider *pp; 419 int retaste_wait; 420 421 g_topology_assert(); 422 423 cp->private = NULL; 424 if (g_raid3_is_busy(sc, cp)) 425 return; 426 G_RAID3_DEBUG(2, "Consumer %s destroyed.", cp->provider->name); 427 pp = cp->provider; 428 retaste_wait = 0; 429 if (cp->acw == 1) { 430 if ((pp->geom->flags & G_GEOM_WITHER) == 0) 431 retaste_wait = 1; 432 } 433 G_RAID3_DEBUG(2, "Access %s r%dw%de%d = %d", pp->name, -cp->acr, 434 -cp->acw, -cp->ace, 0); 435 if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) 436 g_access(cp, -cp->acr, -cp->acw, -cp->ace); 437 if (retaste_wait) { 438 /* 439 * After retaste event was send (inside g_access()), we can send 440 * event to detach and destroy consumer. 441 * A class, which has consumer to the given provider connected 442 * will not receive retaste event for the provider. 443 * This is the way how I ignore retaste events when I close 444 * consumers opened for write: I detach and destroy consumer 445 * after retaste event is sent. 446 */ 447 g_post_event(g_raid3_destroy_consumer, cp, M_WAITOK, NULL); 448 return; 449 } 450 G_RAID3_DEBUG(1, "Consumer %s destroyed.", pp->name); 451 g_detach(cp); 452 g_destroy_consumer(cp); 453 } 454 455 static int 456 g_raid3_connect_disk(struct g_raid3_disk *disk, struct g_provider *pp) 457 { 458 struct g_consumer *cp; 459 int error; 460 461 g_topology_assert_not(); 462 KASSERT(disk->d_consumer == NULL, 463 ("Disk already connected (device %s).", disk->d_softc->sc_name)); 464 465 g_topology_lock(); 466 cp = g_new_consumer(disk->d_softc->sc_geom); 467 error = g_attach(cp, pp); 468 if (error != 0) { 469 g_destroy_consumer(cp); 470 g_topology_unlock(); 471 return (error); 472 } 473 error = g_access(cp, 1, 1, 1); 474 g_topology_unlock(); 475 if (error != 0) { 476 g_detach(cp); 477 g_destroy_consumer(cp); 478 G_RAID3_DEBUG(0, "Cannot open consumer %s (error=%d).", 479 pp->name, error); 480 return (error); 481 } 482 disk->d_consumer = cp; 483 disk->d_consumer->private = disk; 484 disk->d_consumer->index = 0; 485 G_RAID3_DEBUG(2, "Disk %s connected.", g_raid3_get_diskname(disk)); 486 return (0); 487 } 488 489 static void 490 g_raid3_disconnect_consumer(struct g_raid3_softc *sc, struct g_consumer *cp) 491 { 492 493 g_topology_assert(); 494 495 if (cp == NULL) 496 return; 497 if (cp->provider != NULL) 498 g_raid3_kill_consumer(sc, cp); 499 else 500 g_destroy_consumer(cp); 501 } 502 503 /* 504 * Initialize disk. This means allocate memory, create consumer, attach it 505 * to the provider and open access (r1w1e1) to it. 506 */ 507 static struct g_raid3_disk * 508 g_raid3_init_disk(struct g_raid3_softc *sc, struct g_provider *pp, 509 struct g_raid3_metadata *md, int *errorp) 510 { 511 struct g_raid3_disk *disk; 512 int error; 513 514 disk = &sc->sc_disks[md->md_no]; 515 error = g_raid3_connect_disk(disk, pp); 516 if (error != 0) { 517 if (errorp != NULL) 518 *errorp = error; 519 return (NULL); 520 } 521 disk->d_state = G_RAID3_DISK_STATE_NONE; 522 disk->d_flags = md->md_dflags; 523 if (md->md_provider[0] != '\0') 524 disk->d_flags |= G_RAID3_DISK_FLAG_HARDCODED; 525 disk->d_sync.ds_consumer = NULL; 526 disk->d_sync.ds_offset = md->md_sync_offset; 527 disk->d_sync.ds_offset_done = md->md_sync_offset; 528 disk->d_genid = md->md_genid; 529 disk->d_sync.ds_syncid = md->md_syncid; 530 if (errorp != NULL) 531 *errorp = 0; 532 return (disk); 533 } 534 535 static void 536 g_raid3_destroy_disk(struct g_raid3_disk *disk) 537 { 538 struct g_raid3_softc *sc; 539 540 g_topology_assert_not(); 541 sc = disk->d_softc; 542 sx_assert(&sc->sc_lock, SX_XLOCKED); 543 544 if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 545 return; 546 g_raid3_event_cancel(disk); 547 switch (disk->d_state) { 548 case G_RAID3_DISK_STATE_SYNCHRONIZING: 549 if (sc->sc_syncdisk != NULL) 550 g_raid3_sync_stop(sc, 1); 551 /* FALLTHROUGH */ 552 case G_RAID3_DISK_STATE_NEW: 553 case G_RAID3_DISK_STATE_STALE: 554 case G_RAID3_DISK_STATE_ACTIVE: 555 g_topology_lock(); 556 g_raid3_disconnect_consumer(sc, disk->d_consumer); 557 g_topology_unlock(); 558 disk->d_consumer = NULL; 559 break; 560 default: 561 KASSERT(0 == 1, ("Wrong disk state (%s, %s).", 562 g_raid3_get_diskname(disk), 563 g_raid3_disk_state2str(disk->d_state))); 564 } 565 disk->d_state = G_RAID3_DISK_STATE_NODISK; 566 } 567 568 static void 569 g_raid3_destroy_device(struct g_raid3_softc *sc) 570 { 571 struct g_raid3_event *ep; 572 struct g_raid3_disk *disk; 573 struct g_geom *gp; 574 struct g_consumer *cp; 575 u_int n; 576 577 g_topology_assert_not(); 578 sx_assert(&sc->sc_lock, SX_XLOCKED); 579 580 gp = sc->sc_geom; 581 if (sc->sc_provider != NULL) 582 g_raid3_destroy_provider(sc); 583 for (n = 0; n < sc->sc_ndisks; n++) { 584 disk = &sc->sc_disks[n]; 585 if (disk->d_state != G_RAID3_DISK_STATE_NODISK) { 586 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 587 g_raid3_update_metadata(disk); 588 g_raid3_destroy_disk(disk); 589 } 590 } 591 while ((ep = g_raid3_event_get(sc)) != NULL) { 592 g_raid3_event_remove(sc, ep); 593 if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) 594 g_raid3_event_free(ep); 595 else { 596 ep->e_error = ECANCELED; 597 ep->e_flags |= G_RAID3_EVENT_DONE; 598 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, ep); 599 mtx_lock(&sc->sc_events_mtx); 600 wakeup(ep); 601 mtx_unlock(&sc->sc_events_mtx); 602 } 603 } 604 callout_drain(&sc->sc_callout); 605 gp->softc = NULL; 606 cp = LIST_FIRST(&sc->sc_sync.ds_geom->consumer); 607 g_topology_lock(); 608 if (cp != NULL) 609 g_raid3_disconnect_consumer(sc, cp); 610 sc->sc_sync.ds_geom->softc = NULL; 611 g_wither_geom(sc->sc_sync.ds_geom, ENXIO); 612 G_RAID3_DEBUG(0, "Device %s destroyed.", gp->name); 613 g_wither_geom(gp, ENXIO); 614 g_topology_unlock(); 615 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); 616 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); 617 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); 618 mtx_destroy(&sc->sc_queue_mtx); 619 mtx_destroy(&sc->sc_events_mtx); 620 sx_xunlock(&sc->sc_lock); 621 sx_destroy(&sc->sc_lock); 622 } 623 624 static void 625 g_raid3_orphan(struct g_consumer *cp) 626 { 627 struct g_raid3_disk *disk; 628 629 g_topology_assert(); 630 631 disk = cp->private; 632 if (disk == NULL) 633 return; 634 disk->d_softc->sc_bump_id = G_RAID3_BUMP_SYNCID; 635 g_raid3_event_send(disk, G_RAID3_DISK_STATE_DISCONNECTED, 636 G_RAID3_EVENT_DONTWAIT); 637 } 638 639 static int 640 g_raid3_write_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) 641 { 642 struct g_raid3_softc *sc; 643 struct g_consumer *cp; 644 off_t offset, length; 645 u_char *sector; 646 int error = 0; 647 648 g_topology_assert_not(); 649 sc = disk->d_softc; 650 sx_assert(&sc->sc_lock, SX_LOCKED); 651 652 cp = disk->d_consumer; 653 KASSERT(cp != NULL, ("NULL consumer (%s).", sc->sc_name)); 654 KASSERT(cp->provider != NULL, ("NULL provider (%s).", sc->sc_name)); 655 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, 656 ("Consumer %s closed? (r%dw%de%d).", cp->provider->name, cp->acr, 657 cp->acw, cp->ace)); 658 length = cp->provider->sectorsize; 659 offset = cp->provider->mediasize - length; 660 sector = malloc((size_t)length, M_RAID3, M_WAITOK | M_ZERO); 661 if (md != NULL) 662 raid3_metadata_encode(md, sector); 663 error = g_write_data(cp, offset, sector, length); 664 free(sector, M_RAID3); 665 if (error != 0) { 666 if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { 667 G_RAID3_DEBUG(0, "Cannot write metadata on %s " 668 "(device=%s, error=%d).", 669 g_raid3_get_diskname(disk), sc->sc_name, error); 670 disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; 671 } else { 672 G_RAID3_DEBUG(1, "Cannot write metadata on %s " 673 "(device=%s, error=%d).", 674 g_raid3_get_diskname(disk), sc->sc_name, error); 675 } 676 if (g_raid3_disconnect_on_failure && 677 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { 678 sc->sc_bump_id |= G_RAID3_BUMP_GENID; 679 g_raid3_event_send(disk, 680 G_RAID3_DISK_STATE_DISCONNECTED, 681 G_RAID3_EVENT_DONTWAIT); 682 } 683 } 684 return (error); 685 } 686 687 int 688 g_raid3_clear_metadata(struct g_raid3_disk *disk) 689 { 690 int error; 691 692 g_topology_assert_not(); 693 sx_assert(&disk->d_softc->sc_lock, SX_LOCKED); 694 695 error = g_raid3_write_metadata(disk, NULL); 696 if (error == 0) { 697 G_RAID3_DEBUG(2, "Metadata on %s cleared.", 698 g_raid3_get_diskname(disk)); 699 } else { 700 G_RAID3_DEBUG(0, 701 "Cannot clear metadata on disk %s (error=%d).", 702 g_raid3_get_diskname(disk), error); 703 } 704 return (error); 705 } 706 707 void 708 g_raid3_fill_metadata(struct g_raid3_disk *disk, struct g_raid3_metadata *md) 709 { 710 struct g_raid3_softc *sc; 711 struct g_provider *pp; 712 713 sc = disk->d_softc; 714 strlcpy(md->md_magic, G_RAID3_MAGIC, sizeof(md->md_magic)); 715 md->md_version = G_RAID3_VERSION; 716 strlcpy(md->md_name, sc->sc_name, sizeof(md->md_name)); 717 md->md_id = sc->sc_id; 718 md->md_all = sc->sc_ndisks; 719 md->md_genid = sc->sc_genid; 720 md->md_mediasize = sc->sc_mediasize; 721 md->md_sectorsize = sc->sc_sectorsize; 722 md->md_mflags = (sc->sc_flags & G_RAID3_DEVICE_FLAG_MASK); 723 md->md_no = disk->d_no; 724 md->md_syncid = disk->d_sync.ds_syncid; 725 md->md_dflags = (disk->d_flags & G_RAID3_DISK_FLAG_MASK); 726 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) 727 md->md_sync_offset = disk->d_sync.ds_offset_done; 728 else 729 md->md_sync_offset = 0; 730 if (disk->d_consumer != NULL && disk->d_consumer->provider != NULL) 731 pp = disk->d_consumer->provider; 732 else 733 pp = NULL; 734 if ((disk->d_flags & G_RAID3_DISK_FLAG_HARDCODED) != 0 && pp != NULL) 735 strlcpy(md->md_provider, pp->name, sizeof(md->md_provider)); 736 else 737 bzero(md->md_provider, sizeof(md->md_provider)); 738 if (pp != NULL) 739 md->md_provsize = pp->mediasize; 740 else 741 md->md_provsize = 0; 742 } 743 744 void 745 g_raid3_update_metadata(struct g_raid3_disk *disk) 746 { 747 struct g_raid3_softc *sc; 748 struct g_raid3_metadata md; 749 int error; 750 751 g_topology_assert_not(); 752 sc = disk->d_softc; 753 sx_assert(&sc->sc_lock, SX_LOCKED); 754 755 g_raid3_fill_metadata(disk, &md); 756 error = g_raid3_write_metadata(disk, &md); 757 if (error == 0) { 758 G_RAID3_DEBUG(2, "Metadata on %s updated.", 759 g_raid3_get_diskname(disk)); 760 } else { 761 G_RAID3_DEBUG(0, 762 "Cannot update metadata on disk %s (error=%d).", 763 g_raid3_get_diskname(disk), error); 764 } 765 } 766 767 static void 768 g_raid3_bump_syncid(struct g_raid3_softc *sc) 769 { 770 struct g_raid3_disk *disk; 771 u_int n; 772 773 g_topology_assert_not(); 774 sx_assert(&sc->sc_lock, SX_XLOCKED); 775 KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, 776 ("%s called with no active disks (device=%s).", __func__, 777 sc->sc_name)); 778 779 sc->sc_syncid++; 780 G_RAID3_DEBUG(1, "Device %s: syncid bumped to %u.", sc->sc_name, 781 sc->sc_syncid); 782 for (n = 0; n < sc->sc_ndisks; n++) { 783 disk = &sc->sc_disks[n]; 784 if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || 785 disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 786 disk->d_sync.ds_syncid = sc->sc_syncid; 787 g_raid3_update_metadata(disk); 788 } 789 } 790 } 791 792 static void 793 g_raid3_bump_genid(struct g_raid3_softc *sc) 794 { 795 struct g_raid3_disk *disk; 796 u_int n; 797 798 g_topology_assert_not(); 799 sx_assert(&sc->sc_lock, SX_XLOCKED); 800 KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) > 0, 801 ("%s called with no active disks (device=%s).", __func__, 802 sc->sc_name)); 803 804 sc->sc_genid++; 805 G_RAID3_DEBUG(1, "Device %s: genid bumped to %u.", sc->sc_name, 806 sc->sc_genid); 807 for (n = 0; n < sc->sc_ndisks; n++) { 808 disk = &sc->sc_disks[n]; 809 if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || 810 disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 811 disk->d_genid = sc->sc_genid; 812 g_raid3_update_metadata(disk); 813 } 814 } 815 } 816 817 static int 818 g_raid3_idle(struct g_raid3_softc *sc, int acw) 819 { 820 struct g_raid3_disk *disk; 821 u_int i; 822 int timeout; 823 824 g_topology_assert_not(); 825 sx_assert(&sc->sc_lock, SX_XLOCKED); 826 827 if (sc->sc_provider == NULL) 828 return (0); 829 if (sc->sc_idle) 830 return (0); 831 if (sc->sc_writes > 0) 832 return (0); 833 if (acw > 0 || (acw == -1 && sc->sc_provider->acw > 0)) { 834 timeout = g_raid3_idletime - (time_uptime - sc->sc_last_write); 835 if (timeout > 0) 836 return (timeout); 837 } 838 sc->sc_idle = 1; 839 for (i = 0; i < sc->sc_ndisks; i++) { 840 disk = &sc->sc_disks[i]; 841 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) 842 continue; 843 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", 844 g_raid3_get_diskname(disk), sc->sc_name); 845 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 846 g_raid3_update_metadata(disk); 847 } 848 return (0); 849 } 850 851 static void 852 g_raid3_unidle(struct g_raid3_softc *sc) 853 { 854 struct g_raid3_disk *disk; 855 u_int i; 856 857 g_topology_assert_not(); 858 sx_assert(&sc->sc_lock, SX_XLOCKED); 859 860 sc->sc_idle = 0; 861 sc->sc_last_write = time_uptime; 862 for (i = 0; i < sc->sc_ndisks; i++) { 863 disk = &sc->sc_disks[i]; 864 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) 865 continue; 866 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", 867 g_raid3_get_diskname(disk), sc->sc_name); 868 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; 869 g_raid3_update_metadata(disk); 870 } 871 } 872 873 /* 874 * Treat bio_driver1 field in parent bio as list head and field bio_caller1 875 * in child bio as pointer to the next element on the list. 876 */ 877 #define G_RAID3_HEAD_BIO(pbp) (pbp)->bio_driver1 878 879 #define G_RAID3_NEXT_BIO(cbp) (cbp)->bio_caller1 880 881 #define G_RAID3_FOREACH_BIO(pbp, bp) \ 882 for ((bp) = G_RAID3_HEAD_BIO(pbp); (bp) != NULL; \ 883 (bp) = G_RAID3_NEXT_BIO(bp)) 884 885 #define G_RAID3_FOREACH_SAFE_BIO(pbp, bp, tmpbp) \ 886 for ((bp) = G_RAID3_HEAD_BIO(pbp); \ 887 (bp) != NULL && ((tmpbp) = G_RAID3_NEXT_BIO(bp), 1); \ 888 (bp) = (tmpbp)) 889 890 static void 891 g_raid3_init_bio(struct bio *pbp) 892 { 893 894 G_RAID3_HEAD_BIO(pbp) = NULL; 895 } 896 897 static void 898 g_raid3_remove_bio(struct bio *cbp) 899 { 900 struct bio *pbp, *bp; 901 902 pbp = cbp->bio_parent; 903 if (G_RAID3_HEAD_BIO(pbp) == cbp) 904 G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); 905 else { 906 G_RAID3_FOREACH_BIO(pbp, bp) { 907 if (G_RAID3_NEXT_BIO(bp) == cbp) { 908 G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); 909 break; 910 } 911 } 912 } 913 G_RAID3_NEXT_BIO(cbp) = NULL; 914 } 915 916 static void 917 g_raid3_replace_bio(struct bio *sbp, struct bio *dbp) 918 { 919 struct bio *pbp, *bp; 920 921 g_raid3_remove_bio(sbp); 922 pbp = dbp->bio_parent; 923 G_RAID3_NEXT_BIO(sbp) = G_RAID3_NEXT_BIO(dbp); 924 if (G_RAID3_HEAD_BIO(pbp) == dbp) 925 G_RAID3_HEAD_BIO(pbp) = sbp; 926 else { 927 G_RAID3_FOREACH_BIO(pbp, bp) { 928 if (G_RAID3_NEXT_BIO(bp) == dbp) { 929 G_RAID3_NEXT_BIO(bp) = sbp; 930 break; 931 } 932 } 933 } 934 G_RAID3_NEXT_BIO(dbp) = NULL; 935 } 936 937 static void 938 g_raid3_destroy_bio(struct g_raid3_softc *sc, struct bio *cbp) 939 { 940 struct bio *bp, *pbp; 941 size_t size; 942 943 pbp = cbp->bio_parent; 944 pbp->bio_children--; 945 KASSERT(cbp->bio_data != NULL, ("NULL bio_data")); 946 size = pbp->bio_length / (sc->sc_ndisks - 1); 947 uma_zfree_arg(sc->sc_zones[g_raid3_zone(size)].sz_zone, 948 cbp->bio_data, 949 &sc->sc_zones[g_raid3_zone(size)]); 950 if (G_RAID3_HEAD_BIO(pbp) == cbp) { 951 G_RAID3_HEAD_BIO(pbp) = G_RAID3_NEXT_BIO(cbp); 952 G_RAID3_NEXT_BIO(cbp) = NULL; 953 g_destroy_bio(cbp); 954 } else { 955 G_RAID3_FOREACH_BIO(pbp, bp) { 956 if (G_RAID3_NEXT_BIO(bp) == cbp) 957 break; 958 } 959 if (bp != NULL) { 960 KASSERT(G_RAID3_NEXT_BIO(bp) != NULL, 961 ("NULL bp->bio_driver1")); 962 G_RAID3_NEXT_BIO(bp) = G_RAID3_NEXT_BIO(cbp); 963 G_RAID3_NEXT_BIO(cbp) = NULL; 964 } 965 g_destroy_bio(cbp); 966 } 967 } 968 969 static struct bio * 970 g_raid3_clone_bio(struct g_raid3_softc *sc, struct bio *pbp) 971 { 972 struct bio *bp, *cbp; 973 size_t size; 974 int memflag; 975 976 cbp = g_clone_bio(pbp); 977 if (cbp == NULL) 978 return (NULL); 979 size = pbp->bio_length / (sc->sc_ndisks - 1); 980 if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) 981 memflag = M_WAITOK; 982 else 983 memflag = M_NOWAIT; 984 cbp->bio_data = uma_zalloc_arg(sc->sc_zones[g_raid3_zone(size)].sz_zone, 985 &sc->sc_zones[g_raid3_zone(size)], memflag); 986 sc->sc_zones[g_raid3_zone(size)].sz_requested++; 987 if (cbp->bio_data == NULL) { 988 sc->sc_zones[g_raid3_zone(size)].sz_failed++; 989 pbp->bio_children--; 990 g_destroy_bio(cbp); 991 return (NULL); 992 } 993 G_RAID3_NEXT_BIO(cbp) = NULL; 994 if (G_RAID3_HEAD_BIO(pbp) == NULL) 995 G_RAID3_HEAD_BIO(pbp) = cbp; 996 else { 997 G_RAID3_FOREACH_BIO(pbp, bp) { 998 if (G_RAID3_NEXT_BIO(bp) == NULL) { 999 G_RAID3_NEXT_BIO(bp) = cbp; 1000 break; 1001 } 1002 } 1003 } 1004 return (cbp); 1005 } 1006 1007 static void 1008 g_raid3_scatter(struct bio *pbp) 1009 { 1010 struct g_raid3_softc *sc; 1011 struct g_raid3_disk *disk; 1012 struct bio *bp, *cbp; 1013 off_t atom, cadd, padd, left; 1014 1015 sc = pbp->bio_to->geom->softc; 1016 bp = NULL; 1017 if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { 1018 /* 1019 * Find bio for which we should calculate data. 1020 */ 1021 G_RAID3_FOREACH_BIO(pbp, cbp) { 1022 if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { 1023 bp = cbp; 1024 break; 1025 } 1026 } 1027 KASSERT(bp != NULL, ("NULL parity bio.")); 1028 } 1029 atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); 1030 cadd = padd = 0; 1031 for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { 1032 G_RAID3_FOREACH_BIO(pbp, cbp) { 1033 if (cbp == bp) 1034 continue; 1035 bcopy(pbp->bio_data + padd, cbp->bio_data + cadd, atom); 1036 padd += atom; 1037 } 1038 cadd += atom; 1039 } 1040 if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_NOPARITY) == 0) { 1041 struct bio *tmpbp; 1042 1043 /* 1044 * Calculate parity. 1045 */ 1046 bzero(bp->bio_data, bp->bio_length); 1047 G_RAID3_FOREACH_SAFE_BIO(pbp, cbp, tmpbp) { 1048 if (cbp == bp) 1049 continue; 1050 g_raid3_xor(cbp->bio_data, bp->bio_data, bp->bio_data, 1051 bp->bio_length); 1052 if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_NODISK) != 0) 1053 g_raid3_destroy_bio(sc, cbp); 1054 } 1055 } 1056 G_RAID3_FOREACH_BIO(pbp, cbp) { 1057 struct g_consumer *cp; 1058 1059 disk = cbp->bio_caller2; 1060 cp = disk->d_consumer; 1061 cbp->bio_to = cp->provider; 1062 G_RAID3_LOGREQ(3, cbp, "Sending request."); 1063 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, 1064 ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, 1065 cp->acr, cp->acw, cp->ace)); 1066 cp->index++; 1067 sc->sc_writes++; 1068 g_io_request(cbp, cp); 1069 } 1070 } 1071 1072 static void 1073 g_raid3_gather(struct bio *pbp) 1074 { 1075 struct g_raid3_softc *sc; 1076 struct g_raid3_disk *disk; 1077 struct bio *xbp, *fbp, *cbp; 1078 off_t atom, cadd, padd, left; 1079 1080 sc = pbp->bio_to->geom->softc; 1081 /* 1082 * Find bio for which we have to calculate data. 1083 * While going through this path, check if all requests 1084 * succeeded, if not, deny whole request. 1085 * If we're in COMPLETE mode, we allow one request to fail, 1086 * so if we find one, we're sending it to the parity consumer. 1087 * If there are more failed requests, we deny whole request. 1088 */ 1089 xbp = fbp = NULL; 1090 G_RAID3_FOREACH_BIO(pbp, cbp) { 1091 if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) { 1092 KASSERT(xbp == NULL, ("More than one parity bio.")); 1093 xbp = cbp; 1094 } 1095 if (cbp->bio_error == 0) 1096 continue; 1097 /* 1098 * Found failed request. 1099 */ 1100 if (fbp == NULL) { 1101 if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_DEGRADED) != 0) { 1102 /* 1103 * We are already in degraded mode, so we can't 1104 * accept any failures. 1105 */ 1106 if (pbp->bio_error == 0) 1107 pbp->bio_error = cbp->bio_error; 1108 } else { 1109 fbp = cbp; 1110 } 1111 } else { 1112 /* 1113 * Next failed request, that's too many. 1114 */ 1115 if (pbp->bio_error == 0) 1116 pbp->bio_error = fbp->bio_error; 1117 } 1118 disk = cbp->bio_caller2; 1119 if (disk == NULL) 1120 continue; 1121 if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { 1122 disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; 1123 G_RAID3_LOGREQ(0, cbp, "Request failed (error=%d).", 1124 cbp->bio_error); 1125 } else { 1126 G_RAID3_LOGREQ(1, cbp, "Request failed (error=%d).", 1127 cbp->bio_error); 1128 } 1129 if (g_raid3_disconnect_on_failure && 1130 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { 1131 sc->sc_bump_id |= G_RAID3_BUMP_GENID; 1132 g_raid3_event_send(disk, 1133 G_RAID3_DISK_STATE_DISCONNECTED, 1134 G_RAID3_EVENT_DONTWAIT); 1135 } 1136 } 1137 if (pbp->bio_error != 0) 1138 goto finish; 1139 if (fbp != NULL && (pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { 1140 pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_VERIFY; 1141 if (xbp != fbp) 1142 g_raid3_replace_bio(xbp, fbp); 1143 g_raid3_destroy_bio(sc, fbp); 1144 } else if (fbp != NULL) { 1145 struct g_consumer *cp; 1146 1147 /* 1148 * One request failed, so send the same request to 1149 * the parity consumer. 1150 */ 1151 disk = pbp->bio_driver2; 1152 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { 1153 pbp->bio_error = fbp->bio_error; 1154 goto finish; 1155 } 1156 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; 1157 pbp->bio_inbed--; 1158 fbp->bio_flags &= ~(BIO_DONE | BIO_ERROR); 1159 if (disk->d_no == sc->sc_ndisks - 1) 1160 fbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1161 fbp->bio_error = 0; 1162 fbp->bio_completed = 0; 1163 fbp->bio_children = 0; 1164 fbp->bio_inbed = 0; 1165 cp = disk->d_consumer; 1166 fbp->bio_caller2 = disk; 1167 fbp->bio_to = cp->provider; 1168 G_RAID3_LOGREQ(3, fbp, "Sending request (recover)."); 1169 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, 1170 ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, 1171 cp->acr, cp->acw, cp->ace)); 1172 cp->index++; 1173 g_io_request(fbp, cp); 1174 return; 1175 } 1176 if (xbp != NULL) { 1177 /* 1178 * Calculate parity. 1179 */ 1180 G_RAID3_FOREACH_BIO(pbp, cbp) { 1181 if ((cbp->bio_cflags & G_RAID3_BIO_CFLAG_PARITY) != 0) 1182 continue; 1183 g_raid3_xor(cbp->bio_data, xbp->bio_data, xbp->bio_data, 1184 xbp->bio_length); 1185 } 1186 xbp->bio_cflags &= ~G_RAID3_BIO_CFLAG_PARITY; 1187 if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) { 1188 if (!g_raid3_is_zero(xbp)) { 1189 g_raid3_parity_mismatch++; 1190 pbp->bio_error = EIO; 1191 goto finish; 1192 } 1193 g_raid3_destroy_bio(sc, xbp); 1194 } 1195 } 1196 atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); 1197 cadd = padd = 0; 1198 for (left = pbp->bio_length; left > 0; left -= sc->sc_sectorsize) { 1199 G_RAID3_FOREACH_BIO(pbp, cbp) { 1200 bcopy(cbp->bio_data + cadd, pbp->bio_data + padd, atom); 1201 pbp->bio_completed += atom; 1202 padd += atom; 1203 } 1204 cadd += atom; 1205 } 1206 finish: 1207 if (pbp->bio_error == 0) 1208 G_RAID3_LOGREQ(3, pbp, "Request finished."); 1209 else { 1210 if ((pbp->bio_pflags & G_RAID3_BIO_PFLAG_VERIFY) != 0) 1211 G_RAID3_LOGREQ(1, pbp, "Verification error."); 1212 else 1213 G_RAID3_LOGREQ(0, pbp, "Request failed."); 1214 } 1215 pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_MASK; 1216 while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) 1217 g_raid3_destroy_bio(sc, cbp); 1218 g_io_deliver(pbp, pbp->bio_error); 1219 } 1220 1221 static void 1222 g_raid3_done(struct bio *bp) 1223 { 1224 struct g_raid3_softc *sc; 1225 1226 sc = bp->bio_from->geom->softc; 1227 bp->bio_cflags |= G_RAID3_BIO_CFLAG_REGULAR; 1228 G_RAID3_LOGREQ(3, bp, "Regular request done (error=%d).", bp->bio_error); 1229 mtx_lock(&sc->sc_queue_mtx); 1230 bioq_insert_head(&sc->sc_queue, bp); 1231 wakeup(sc); 1232 wakeup(&sc->sc_queue); 1233 mtx_unlock(&sc->sc_queue_mtx); 1234 } 1235 1236 static void 1237 g_raid3_regular_request(struct bio *cbp) 1238 { 1239 struct g_raid3_softc *sc; 1240 struct g_raid3_disk *disk; 1241 struct bio *pbp; 1242 1243 g_topology_assert_not(); 1244 1245 pbp = cbp->bio_parent; 1246 sc = pbp->bio_to->geom->softc; 1247 cbp->bio_from->index--; 1248 if (cbp->bio_cmd == BIO_WRITE) 1249 sc->sc_writes--; 1250 disk = cbp->bio_from->private; 1251 if (disk == NULL) { 1252 g_topology_lock(); 1253 g_raid3_kill_consumer(sc, cbp->bio_from); 1254 g_topology_unlock(); 1255 } 1256 1257 G_RAID3_LOGREQ(3, cbp, "Request finished."); 1258 pbp->bio_inbed++; 1259 KASSERT(pbp->bio_inbed <= pbp->bio_children, 1260 ("bio_inbed (%u) is bigger than bio_children (%u).", pbp->bio_inbed, 1261 pbp->bio_children)); 1262 if (pbp->bio_inbed != pbp->bio_children) 1263 return; 1264 switch (pbp->bio_cmd) { 1265 case BIO_READ: 1266 g_raid3_gather(pbp); 1267 break; 1268 case BIO_WRITE: 1269 case BIO_DELETE: 1270 { 1271 int error = 0; 1272 1273 pbp->bio_completed = pbp->bio_length; 1274 while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) { 1275 if (cbp->bio_error == 0) { 1276 g_raid3_destroy_bio(sc, cbp); 1277 continue; 1278 } 1279 1280 if (error == 0) 1281 error = cbp->bio_error; 1282 else if (pbp->bio_error == 0) { 1283 /* 1284 * Next failed request, that's too many. 1285 */ 1286 pbp->bio_error = error; 1287 } 1288 1289 disk = cbp->bio_caller2; 1290 if (disk == NULL) { 1291 g_raid3_destroy_bio(sc, cbp); 1292 continue; 1293 } 1294 1295 if ((disk->d_flags & G_RAID3_DISK_FLAG_BROKEN) == 0) { 1296 disk->d_flags |= G_RAID3_DISK_FLAG_BROKEN; 1297 G_RAID3_LOGREQ(0, cbp, 1298 "Request failed (error=%d).", 1299 cbp->bio_error); 1300 } else { 1301 G_RAID3_LOGREQ(1, cbp, 1302 "Request failed (error=%d).", 1303 cbp->bio_error); 1304 } 1305 if (g_raid3_disconnect_on_failure && 1306 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { 1307 sc->sc_bump_id |= G_RAID3_BUMP_GENID; 1308 g_raid3_event_send(disk, 1309 G_RAID3_DISK_STATE_DISCONNECTED, 1310 G_RAID3_EVENT_DONTWAIT); 1311 } 1312 g_raid3_destroy_bio(sc, cbp); 1313 } 1314 if (pbp->bio_error == 0) 1315 G_RAID3_LOGREQ(3, pbp, "Request finished."); 1316 else 1317 G_RAID3_LOGREQ(0, pbp, "Request failed."); 1318 pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_DEGRADED; 1319 pbp->bio_pflags &= ~G_RAID3_BIO_PFLAG_NOPARITY; 1320 bioq_remove(&sc->sc_inflight, pbp); 1321 /* Release delayed sync requests if possible. */ 1322 g_raid3_sync_release(sc); 1323 g_io_deliver(pbp, pbp->bio_error); 1324 break; 1325 } 1326 } 1327 } 1328 1329 static void 1330 g_raid3_sync_done(struct bio *bp) 1331 { 1332 struct g_raid3_softc *sc; 1333 1334 G_RAID3_LOGREQ(3, bp, "Synchronization request delivered."); 1335 sc = bp->bio_from->geom->softc; 1336 bp->bio_cflags |= G_RAID3_BIO_CFLAG_SYNC; 1337 mtx_lock(&sc->sc_queue_mtx); 1338 bioq_insert_head(&sc->sc_queue, bp); 1339 wakeup(sc); 1340 wakeup(&sc->sc_queue); 1341 mtx_unlock(&sc->sc_queue_mtx); 1342 } 1343 1344 static void 1345 g_raid3_start(struct bio *bp) 1346 { 1347 struct g_raid3_softc *sc; 1348 1349 sc = bp->bio_to->geom->softc; 1350 /* 1351 * If sc == NULL or there are no valid disks, provider's error 1352 * should be set and g_raid3_start() should not be called at all. 1353 */ 1354 KASSERT(sc != NULL && (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || 1355 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE), 1356 ("Provider's error should be set (error=%d)(device=%s).", 1357 bp->bio_to->error, bp->bio_to->name)); 1358 G_RAID3_LOGREQ(3, bp, "Request received."); 1359 1360 switch (bp->bio_cmd) { 1361 case BIO_READ: 1362 case BIO_WRITE: 1363 case BIO_DELETE: 1364 break; 1365 case BIO_GETATTR: 1366 default: 1367 g_io_deliver(bp, EOPNOTSUPP); 1368 return; 1369 } 1370 mtx_lock(&sc->sc_queue_mtx); 1371 bioq_insert_tail(&sc->sc_queue, bp); 1372 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); 1373 wakeup(sc); 1374 mtx_unlock(&sc->sc_queue_mtx); 1375 } 1376 1377 /* 1378 * Return TRUE if the given request is colliding with a in-progress 1379 * synchronization request. 1380 */ 1381 static int 1382 g_raid3_sync_collision(struct g_raid3_softc *sc, struct bio *bp) 1383 { 1384 struct g_raid3_disk *disk; 1385 struct bio *sbp; 1386 off_t rstart, rend, sstart, send; 1387 int i; 1388 1389 disk = sc->sc_syncdisk; 1390 if (disk == NULL) 1391 return (0); 1392 rstart = bp->bio_offset; 1393 rend = bp->bio_offset + bp->bio_length; 1394 for (i = 0; i < g_raid3_syncreqs; i++) { 1395 sbp = disk->d_sync.ds_bios[i]; 1396 if (sbp == NULL) 1397 continue; 1398 sstart = sbp->bio_offset; 1399 send = sbp->bio_length; 1400 if (sbp->bio_cmd == BIO_WRITE) { 1401 sstart *= sc->sc_ndisks - 1; 1402 send *= sc->sc_ndisks - 1; 1403 } 1404 send += sstart; 1405 if (rend > sstart && rstart < send) 1406 return (1); 1407 } 1408 return (0); 1409 } 1410 1411 /* 1412 * Return TRUE if the given sync request is colliding with a in-progress regular 1413 * request. 1414 */ 1415 static int 1416 g_raid3_regular_collision(struct g_raid3_softc *sc, struct bio *sbp) 1417 { 1418 off_t rstart, rend, sstart, send; 1419 struct bio *bp; 1420 1421 if (sc->sc_syncdisk == NULL) 1422 return (0); 1423 sstart = sbp->bio_offset; 1424 send = sstart + sbp->bio_length; 1425 TAILQ_FOREACH(bp, &sc->sc_inflight.queue, bio_queue) { 1426 rstart = bp->bio_offset; 1427 rend = bp->bio_offset + bp->bio_length; 1428 if (rend > sstart && rstart < send) 1429 return (1); 1430 } 1431 return (0); 1432 } 1433 1434 /* 1435 * Puts request onto delayed queue. 1436 */ 1437 static void 1438 g_raid3_regular_delay(struct g_raid3_softc *sc, struct bio *bp) 1439 { 1440 1441 G_RAID3_LOGREQ(2, bp, "Delaying request."); 1442 bioq_insert_head(&sc->sc_regular_delayed, bp); 1443 } 1444 1445 /* 1446 * Puts synchronization request onto delayed queue. 1447 */ 1448 static void 1449 g_raid3_sync_delay(struct g_raid3_softc *sc, struct bio *bp) 1450 { 1451 1452 G_RAID3_LOGREQ(2, bp, "Delaying synchronization request."); 1453 bioq_insert_tail(&sc->sc_sync_delayed, bp); 1454 } 1455 1456 /* 1457 * Releases delayed regular requests which don't collide anymore with sync 1458 * requests. 1459 */ 1460 static void 1461 g_raid3_regular_release(struct g_raid3_softc *sc) 1462 { 1463 struct bio *bp, *bp2; 1464 1465 TAILQ_FOREACH_SAFE(bp, &sc->sc_regular_delayed.queue, bio_queue, bp2) { 1466 if (g_raid3_sync_collision(sc, bp)) 1467 continue; 1468 bioq_remove(&sc->sc_regular_delayed, bp); 1469 G_RAID3_LOGREQ(2, bp, "Releasing delayed request (%p).", bp); 1470 mtx_lock(&sc->sc_queue_mtx); 1471 bioq_insert_head(&sc->sc_queue, bp); 1472 #if 0 1473 /* 1474 * wakeup() is not needed, because this function is called from 1475 * the worker thread. 1476 */ 1477 wakeup(&sc->sc_queue); 1478 #endif 1479 mtx_unlock(&sc->sc_queue_mtx); 1480 } 1481 } 1482 1483 /* 1484 * Releases delayed sync requests which don't collide anymore with regular 1485 * requests. 1486 */ 1487 static void 1488 g_raid3_sync_release(struct g_raid3_softc *sc) 1489 { 1490 struct bio *bp, *bp2; 1491 1492 TAILQ_FOREACH_SAFE(bp, &sc->sc_sync_delayed.queue, bio_queue, bp2) { 1493 if (g_raid3_regular_collision(sc, bp)) 1494 continue; 1495 bioq_remove(&sc->sc_sync_delayed, bp); 1496 G_RAID3_LOGREQ(2, bp, 1497 "Releasing delayed synchronization request."); 1498 g_io_request(bp, bp->bio_from); 1499 } 1500 } 1501 1502 /* 1503 * Handle synchronization requests. 1504 * Every synchronization request is two-steps process: first, READ request is 1505 * send to active provider and then WRITE request (with read data) to the provider 1506 * beeing synchronized. When WRITE is finished, new synchronization request is 1507 * send. 1508 */ 1509 static void 1510 g_raid3_sync_request(struct bio *bp) 1511 { 1512 struct g_raid3_softc *sc; 1513 struct g_raid3_disk *disk; 1514 1515 bp->bio_from->index--; 1516 sc = bp->bio_from->geom->softc; 1517 disk = bp->bio_from->private; 1518 if (disk == NULL) { 1519 sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ 1520 g_topology_lock(); 1521 g_raid3_kill_consumer(sc, bp->bio_from); 1522 g_topology_unlock(); 1523 free(bp->bio_data, M_RAID3); 1524 g_destroy_bio(bp); 1525 sx_xlock(&sc->sc_lock); 1526 return; 1527 } 1528 1529 /* 1530 * Synchronization request. 1531 */ 1532 switch (bp->bio_cmd) { 1533 case BIO_READ: 1534 { 1535 struct g_consumer *cp; 1536 u_char *dst, *src; 1537 off_t left; 1538 u_int atom; 1539 1540 if (bp->bio_error != 0) { 1541 G_RAID3_LOGREQ(0, bp, 1542 "Synchronization request failed (error=%d).", 1543 bp->bio_error); 1544 g_destroy_bio(bp); 1545 return; 1546 } 1547 G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); 1548 atom = sc->sc_sectorsize / (sc->sc_ndisks - 1); 1549 dst = src = bp->bio_data; 1550 if (disk->d_no == sc->sc_ndisks - 1) { 1551 u_int n; 1552 1553 /* Parity component. */ 1554 for (left = bp->bio_length; left > 0; 1555 left -= sc->sc_sectorsize) { 1556 bcopy(src, dst, atom); 1557 src += atom; 1558 for (n = 1; n < sc->sc_ndisks - 1; n++) { 1559 g_raid3_xor(src, dst, dst, atom); 1560 src += atom; 1561 } 1562 dst += atom; 1563 } 1564 } else { 1565 /* Regular component. */ 1566 src += atom * disk->d_no; 1567 for (left = bp->bio_length; left > 0; 1568 left -= sc->sc_sectorsize) { 1569 bcopy(src, dst, atom); 1570 src += sc->sc_sectorsize; 1571 dst += atom; 1572 } 1573 } 1574 bp->bio_driver1 = bp->bio_driver2 = NULL; 1575 bp->bio_pflags = 0; 1576 bp->bio_offset /= sc->sc_ndisks - 1; 1577 bp->bio_length /= sc->sc_ndisks - 1; 1578 bp->bio_cmd = BIO_WRITE; 1579 bp->bio_cflags = 0; 1580 bp->bio_children = bp->bio_inbed = 0; 1581 cp = disk->d_consumer; 1582 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, 1583 ("Consumer %s not opened (r%dw%de%d).", cp->provider->name, 1584 cp->acr, cp->acw, cp->ace)); 1585 cp->index++; 1586 g_io_request(bp, cp); 1587 return; 1588 } 1589 case BIO_WRITE: 1590 { 1591 struct g_raid3_disk_sync *sync; 1592 off_t boffset, moffset; 1593 void *data; 1594 int i; 1595 1596 if (bp->bio_error != 0) { 1597 G_RAID3_LOGREQ(0, bp, 1598 "Synchronization request failed (error=%d).", 1599 bp->bio_error); 1600 g_destroy_bio(bp); 1601 sc->sc_bump_id |= G_RAID3_BUMP_GENID; 1602 g_raid3_event_send(disk, 1603 G_RAID3_DISK_STATE_DISCONNECTED, 1604 G_RAID3_EVENT_DONTWAIT); 1605 return; 1606 } 1607 G_RAID3_LOGREQ(3, bp, "Synchronization request finished."); 1608 sync = &disk->d_sync; 1609 if (sync->ds_offset == sc->sc_mediasize / (sc->sc_ndisks - 1) || 1610 sync->ds_consumer == NULL || 1611 (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { 1612 /* Don't send more synchronization requests. */ 1613 sync->ds_inflight--; 1614 if (sync->ds_bios != NULL) { 1615 i = (int)(uintptr_t)bp->bio_caller1; 1616 sync->ds_bios[i] = NULL; 1617 } 1618 free(bp->bio_data, M_RAID3); 1619 g_destroy_bio(bp); 1620 if (sync->ds_inflight > 0) 1621 return; 1622 if (sync->ds_consumer == NULL || 1623 (sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) { 1624 return; 1625 } 1626 /* 1627 * Disk up-to-date, activate it. 1628 */ 1629 g_raid3_event_send(disk, G_RAID3_DISK_STATE_ACTIVE, 1630 G_RAID3_EVENT_DONTWAIT); 1631 return; 1632 } 1633 1634 /* Send next synchronization request. */ 1635 data = bp->bio_data; 1636 bzero(bp, sizeof(*bp)); 1637 bp->bio_cmd = BIO_READ; 1638 bp->bio_offset = sync->ds_offset * (sc->sc_ndisks - 1); 1639 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); 1640 sync->ds_offset += bp->bio_length / (sc->sc_ndisks - 1); 1641 bp->bio_done = g_raid3_sync_done; 1642 bp->bio_data = data; 1643 bp->bio_from = sync->ds_consumer; 1644 bp->bio_to = sc->sc_provider; 1645 G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); 1646 sync->ds_consumer->index++; 1647 /* 1648 * Delay the request if it is colliding with a regular request. 1649 */ 1650 if (g_raid3_regular_collision(sc, bp)) 1651 g_raid3_sync_delay(sc, bp); 1652 else 1653 g_io_request(bp, sync->ds_consumer); 1654 1655 /* Release delayed requests if possible. */ 1656 g_raid3_regular_release(sc); 1657 1658 /* Find the smallest offset. */ 1659 moffset = sc->sc_mediasize; 1660 for (i = 0; i < g_raid3_syncreqs; i++) { 1661 bp = sync->ds_bios[i]; 1662 boffset = bp->bio_offset; 1663 if (bp->bio_cmd == BIO_WRITE) 1664 boffset *= sc->sc_ndisks - 1; 1665 if (boffset < moffset) 1666 moffset = boffset; 1667 } 1668 if (sync->ds_offset_done + (MAXPHYS * 100) < moffset) { 1669 /* Update offset_done on every 100 blocks. */ 1670 sync->ds_offset_done = moffset; 1671 g_raid3_update_metadata(disk); 1672 } 1673 return; 1674 } 1675 default: 1676 KASSERT(1 == 0, ("Invalid command here: %u (device=%s)", 1677 bp->bio_cmd, sc->sc_name)); 1678 break; 1679 } 1680 } 1681 1682 static int 1683 g_raid3_register_request(struct bio *pbp) 1684 { 1685 struct g_raid3_softc *sc; 1686 struct g_raid3_disk *disk; 1687 struct g_consumer *cp; 1688 struct bio *cbp; 1689 off_t offset, length; 1690 u_int n, ndisks; 1691 int round_robin, verify; 1692 1693 ndisks = 0; 1694 sc = pbp->bio_to->geom->softc; 1695 if ((pbp->bio_cflags & G_RAID3_BIO_CFLAG_REGSYNC) != 0 && 1696 sc->sc_syncdisk == NULL) { 1697 g_io_deliver(pbp, EIO); 1698 return (0); 1699 } 1700 g_raid3_init_bio(pbp); 1701 length = pbp->bio_length / (sc->sc_ndisks - 1); 1702 offset = pbp->bio_offset / (sc->sc_ndisks - 1); 1703 round_robin = verify = 0; 1704 switch (pbp->bio_cmd) { 1705 case BIO_READ: 1706 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && 1707 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { 1708 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_VERIFY; 1709 verify = 1; 1710 ndisks = sc->sc_ndisks; 1711 } else { 1712 verify = 0; 1713 ndisks = sc->sc_ndisks - 1; 1714 } 1715 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0 && 1716 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { 1717 round_robin = 1; 1718 } else { 1719 round_robin = 0; 1720 } 1721 KASSERT(!round_robin || !verify, 1722 ("ROUND-ROBIN and VERIFY are mutually exclusive.")); 1723 pbp->bio_driver2 = &sc->sc_disks[sc->sc_ndisks - 1]; 1724 break; 1725 case BIO_WRITE: 1726 case BIO_DELETE: 1727 /* 1728 * Delay the request if it is colliding with a synchronization 1729 * request. 1730 */ 1731 if (g_raid3_sync_collision(sc, pbp)) { 1732 g_raid3_regular_delay(sc, pbp); 1733 return (0); 1734 } 1735 1736 if (sc->sc_idle) 1737 g_raid3_unidle(sc); 1738 else 1739 sc->sc_last_write = time_uptime; 1740 1741 ndisks = sc->sc_ndisks; 1742 break; 1743 } 1744 for (n = 0; n < ndisks; n++) { 1745 disk = &sc->sc_disks[n]; 1746 cbp = g_raid3_clone_bio(sc, pbp); 1747 if (cbp == NULL) { 1748 while ((cbp = G_RAID3_HEAD_BIO(pbp)) != NULL) 1749 g_raid3_destroy_bio(sc, cbp); 1750 /* 1751 * To prevent deadlock, we must run back up 1752 * with the ENOMEM for failed requests of any 1753 * of our consumers. Our own sync requests 1754 * can stick around, as they are finite. 1755 */ 1756 if ((pbp->bio_cflags & 1757 G_RAID3_BIO_CFLAG_REGULAR) != 0) { 1758 g_io_deliver(pbp, ENOMEM); 1759 return (0); 1760 } 1761 return (ENOMEM); 1762 } 1763 cbp->bio_offset = offset; 1764 cbp->bio_length = length; 1765 cbp->bio_done = g_raid3_done; 1766 switch (pbp->bio_cmd) { 1767 case BIO_READ: 1768 if (disk->d_state != G_RAID3_DISK_STATE_ACTIVE) { 1769 /* 1770 * Replace invalid component with the parity 1771 * component. 1772 */ 1773 disk = &sc->sc_disks[sc->sc_ndisks - 1]; 1774 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1775 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; 1776 } else if (round_robin && 1777 disk->d_no == sc->sc_round_robin) { 1778 /* 1779 * In round-robin mode skip one data component 1780 * and use parity component when reading. 1781 */ 1782 pbp->bio_driver2 = disk; 1783 disk = &sc->sc_disks[sc->sc_ndisks - 1]; 1784 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1785 sc->sc_round_robin++; 1786 round_robin = 0; 1787 } else if (verify && disk->d_no == sc->sc_ndisks - 1) { 1788 cbp->bio_cflags |= G_RAID3_BIO_CFLAG_PARITY; 1789 } 1790 break; 1791 case BIO_WRITE: 1792 case BIO_DELETE: 1793 if (disk->d_state == G_RAID3_DISK_STATE_ACTIVE || 1794 disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 1795 if (n == ndisks - 1) { 1796 /* 1797 * Active parity component, mark it as such. 1798 */ 1799 cbp->bio_cflags |= 1800 G_RAID3_BIO_CFLAG_PARITY; 1801 } 1802 } else { 1803 pbp->bio_pflags |= G_RAID3_BIO_PFLAG_DEGRADED; 1804 if (n == ndisks - 1) { 1805 /* 1806 * Parity component is not connected, 1807 * so destroy its request. 1808 */ 1809 pbp->bio_pflags |= 1810 G_RAID3_BIO_PFLAG_NOPARITY; 1811 g_raid3_destroy_bio(sc, cbp); 1812 cbp = NULL; 1813 } else { 1814 cbp->bio_cflags |= 1815 G_RAID3_BIO_CFLAG_NODISK; 1816 disk = NULL; 1817 } 1818 } 1819 break; 1820 } 1821 if (cbp != NULL) 1822 cbp->bio_caller2 = disk; 1823 } 1824 switch (pbp->bio_cmd) { 1825 case BIO_READ: 1826 if (round_robin) { 1827 /* 1828 * If we are in round-robin mode and 'round_robin' is 1829 * still 1, it means, that we skipped parity component 1830 * for this read and must reset sc_round_robin field. 1831 */ 1832 sc->sc_round_robin = 0; 1833 } 1834 G_RAID3_FOREACH_BIO(pbp, cbp) { 1835 disk = cbp->bio_caller2; 1836 cp = disk->d_consumer; 1837 cbp->bio_to = cp->provider; 1838 G_RAID3_LOGREQ(3, cbp, "Sending request."); 1839 KASSERT(cp->acr >= 1 && cp->acw >= 1 && cp->ace >= 1, 1840 ("Consumer %s not opened (r%dw%de%d).", 1841 cp->provider->name, cp->acr, cp->acw, cp->ace)); 1842 cp->index++; 1843 g_io_request(cbp, cp); 1844 } 1845 break; 1846 case BIO_WRITE: 1847 case BIO_DELETE: 1848 /* 1849 * Put request onto inflight queue, so we can check if new 1850 * synchronization requests don't collide with it. 1851 */ 1852 bioq_insert_tail(&sc->sc_inflight, pbp); 1853 1854 /* 1855 * Bump syncid on first write. 1856 */ 1857 if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) { 1858 sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; 1859 g_topology_lock(); 1860 g_raid3_bump_syncid(sc); 1861 g_topology_unlock(); 1862 } 1863 g_raid3_scatter(pbp); 1864 break; 1865 } 1866 return (0); 1867 } 1868 1869 static int 1870 g_raid3_can_destroy(struct g_raid3_softc *sc) 1871 { 1872 struct g_geom *gp; 1873 struct g_consumer *cp; 1874 1875 g_topology_assert(); 1876 gp = sc->sc_geom; 1877 LIST_FOREACH(cp, &gp->consumer, consumer) { 1878 if (g_raid3_is_busy(sc, cp)) 1879 return (0); 1880 } 1881 gp = sc->sc_sync.ds_geom; 1882 LIST_FOREACH(cp, &gp->consumer, consumer) { 1883 if (g_raid3_is_busy(sc, cp)) 1884 return (0); 1885 } 1886 G_RAID3_DEBUG(2, "No I/O requests for %s, it can be destroyed.", 1887 sc->sc_name); 1888 return (1); 1889 } 1890 1891 static int 1892 g_raid3_try_destroy(struct g_raid3_softc *sc) 1893 { 1894 1895 g_topology_assert_not(); 1896 sx_assert(&sc->sc_lock, SX_XLOCKED); 1897 1898 if (sc->sc_rootmount != NULL) { 1899 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, 1900 sc->sc_rootmount); 1901 root_mount_rel(sc->sc_rootmount); 1902 sc->sc_rootmount = NULL; 1903 } 1904 1905 g_topology_lock(); 1906 if (!g_raid3_can_destroy(sc)) { 1907 g_topology_unlock(); 1908 return (0); 1909 } 1910 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_WAIT) != 0) { 1911 g_topology_unlock(); 1912 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, 1913 &sc->sc_worker); 1914 /* Unlock sc_lock here, as it can be destroyed after wakeup. */ 1915 sx_xunlock(&sc->sc_lock); 1916 wakeup(&sc->sc_worker); 1917 sc->sc_worker = NULL; 1918 } else { 1919 g_topology_unlock(); 1920 g_raid3_destroy_device(sc); 1921 free(sc->sc_disks, M_RAID3); 1922 free(sc, M_RAID3); 1923 } 1924 return (1); 1925 } 1926 1927 /* 1928 * Worker thread. 1929 */ 1930 static void 1931 g_raid3_worker(void *arg) 1932 { 1933 struct g_raid3_softc *sc; 1934 struct g_raid3_event *ep; 1935 struct bio *bp; 1936 int timeout; 1937 1938 sc = arg; 1939 mtx_lock_spin(&sched_lock); 1940 sched_prio(curthread, PRIBIO); 1941 mtx_unlock_spin(&sched_lock); 1942 1943 sx_xlock(&sc->sc_lock); 1944 for (;;) { 1945 G_RAID3_DEBUG(5, "%s: Let's see...", __func__); 1946 /* 1947 * First take a look at events. 1948 * This is important to handle events before any I/O requests. 1949 */ 1950 ep = g_raid3_event_get(sc); 1951 if (ep != NULL) { 1952 g_raid3_event_remove(sc, ep); 1953 if ((ep->e_flags & G_RAID3_EVENT_DEVICE) != 0) { 1954 /* Update only device status. */ 1955 G_RAID3_DEBUG(3, 1956 "Running event for device %s.", 1957 sc->sc_name); 1958 ep->e_error = 0; 1959 g_raid3_update_device(sc, 1); 1960 } else { 1961 /* Update disk status. */ 1962 G_RAID3_DEBUG(3, "Running event for disk %s.", 1963 g_raid3_get_diskname(ep->e_disk)); 1964 ep->e_error = g_raid3_update_disk(ep->e_disk, 1965 ep->e_state); 1966 if (ep->e_error == 0) 1967 g_raid3_update_device(sc, 0); 1968 } 1969 if ((ep->e_flags & G_RAID3_EVENT_DONTWAIT) != 0) { 1970 KASSERT(ep->e_error == 0, 1971 ("Error cannot be handled.")); 1972 g_raid3_event_free(ep); 1973 } else { 1974 ep->e_flags |= G_RAID3_EVENT_DONE; 1975 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, 1976 ep); 1977 mtx_lock(&sc->sc_events_mtx); 1978 wakeup(ep); 1979 mtx_unlock(&sc->sc_events_mtx); 1980 } 1981 if ((sc->sc_flags & 1982 G_RAID3_DEVICE_FLAG_DESTROY) != 0) { 1983 if (g_raid3_try_destroy(sc)) { 1984 curthread->td_pflags &= ~TDP_GEOM; 1985 G_RAID3_DEBUG(1, "Thread exiting."); 1986 kthread_exit(0); 1987 } 1988 } 1989 G_RAID3_DEBUG(5, "%s: I'm here 1.", __func__); 1990 continue; 1991 } 1992 /* 1993 * Check if we can mark array as CLEAN and if we can't take 1994 * how much seconds should we wait. 1995 */ 1996 timeout = g_raid3_idle(sc, -1); 1997 /* 1998 * Now I/O requests. 1999 */ 2000 /* Get first request from the queue. */ 2001 mtx_lock(&sc->sc_queue_mtx); 2002 bp = bioq_first(&sc->sc_queue); 2003 if (bp == NULL) { 2004 if (ep != NULL) { 2005 /* 2006 * We have a pending even, try to serve it 2007 * again. 2008 */ 2009 mtx_unlock(&sc->sc_queue_mtx); 2010 tsleep(ep, PRIBIO, "r3:top1", hz / 5); 2011 continue; 2012 } 2013 if ((sc->sc_flags & 2014 G_RAID3_DEVICE_FLAG_DESTROY) != 0) { 2015 mtx_unlock(&sc->sc_queue_mtx); 2016 if (g_raid3_try_destroy(sc)) { 2017 curthread->td_pflags &= ~TDP_GEOM; 2018 G_RAID3_DEBUG(0, "Thread exiting."); 2019 kthread_exit(0); 2020 } 2021 mtx_lock(&sc->sc_queue_mtx); 2022 } 2023 sx_xunlock(&sc->sc_lock); 2024 MSLEEP(sc, &sc->sc_queue_mtx, PRIBIO | PDROP, "r3:w1", 2025 timeout * hz); 2026 sx_xlock(&sc->sc_lock); 2027 G_RAID3_DEBUG(5, "%s: I'm here 4.", __func__); 2028 continue; 2029 } 2030 bioq_remove(&sc->sc_queue, bp); 2031 mtx_unlock(&sc->sc_queue_mtx); 2032 2033 if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_REGULAR) != 0) 2034 g_raid3_regular_request(bp); 2035 else if ((bp->bio_cflags & G_RAID3_BIO_CFLAG_SYNC) != 0) 2036 g_raid3_sync_request(bp); 2037 else { 2038 if (g_raid3_register_request(bp) != 0) { 2039 mtx_lock(&sc->sc_queue_mtx); 2040 bioq_insert_head(&sc->sc_queue, bp); 2041 MSLEEP(&sc->sc_queue, &sc->sc_queue_mtx, 2042 PRIBIO | PDROP, "r3:lowmem", hz / 10); 2043 } 2044 } 2045 G_RAID3_DEBUG(5, "%s: I'm here 9.", __func__); 2046 } 2047 } 2048 2049 static void 2050 g_raid3_update_idle(struct g_raid3_softc *sc, struct g_raid3_disk *disk) 2051 { 2052 2053 sx_assert(&sc->sc_lock, SX_LOCKED); 2054 if (!sc->sc_idle && (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) == 0) { 2055 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as dirty.", 2056 g_raid3_get_diskname(disk), sc->sc_name); 2057 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; 2058 } else if (sc->sc_idle && 2059 (disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) { 2060 G_RAID3_DEBUG(1, "Disk %s (device %s) marked as clean.", 2061 g_raid3_get_diskname(disk), sc->sc_name); 2062 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 2063 } 2064 } 2065 2066 static void 2067 g_raid3_sync_start(struct g_raid3_softc *sc) 2068 { 2069 struct g_raid3_disk *disk; 2070 struct g_consumer *cp; 2071 struct bio *bp; 2072 int error; 2073 u_int n; 2074 2075 g_topology_assert_not(); 2076 sx_assert(&sc->sc_lock, SX_XLOCKED); 2077 2078 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, 2079 ("Device not in DEGRADED state (%s, %u).", sc->sc_name, 2080 sc->sc_state)); 2081 KASSERT(sc->sc_syncdisk == NULL, ("Syncdisk is not NULL (%s, %u).", 2082 sc->sc_name, sc->sc_state)); 2083 disk = NULL; 2084 for (n = 0; n < sc->sc_ndisks; n++) { 2085 if (sc->sc_disks[n].d_state != G_RAID3_DISK_STATE_SYNCHRONIZING) 2086 continue; 2087 disk = &sc->sc_disks[n]; 2088 break; 2089 } 2090 if (disk == NULL) 2091 return; 2092 2093 sx_xunlock(&sc->sc_lock); 2094 g_topology_lock(); 2095 cp = g_new_consumer(sc->sc_sync.ds_geom); 2096 error = g_attach(cp, sc->sc_provider); 2097 KASSERT(error == 0, 2098 ("Cannot attach to %s (error=%d).", sc->sc_name, error)); 2099 error = g_access(cp, 1, 0, 0); 2100 KASSERT(error == 0, ("Cannot open %s (error=%d).", sc->sc_name, error)); 2101 g_topology_unlock(); 2102 sx_xlock(&sc->sc_lock); 2103 2104 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s.", sc->sc_name, 2105 g_raid3_get_diskname(disk)); 2106 disk->d_flags |= G_RAID3_DISK_FLAG_DIRTY; 2107 KASSERT(disk->d_sync.ds_consumer == NULL, 2108 ("Sync consumer already exists (device=%s, disk=%s).", 2109 sc->sc_name, g_raid3_get_diskname(disk))); 2110 2111 disk->d_sync.ds_consumer = cp; 2112 disk->d_sync.ds_consumer->private = disk; 2113 disk->d_sync.ds_consumer->index = 0; 2114 sc->sc_syncdisk = disk; 2115 2116 /* 2117 * Allocate memory for synchronization bios and initialize them. 2118 */ 2119 disk->d_sync.ds_bios = malloc(sizeof(struct bio *) * g_raid3_syncreqs, 2120 M_RAID3, M_WAITOK); 2121 for (n = 0; n < g_raid3_syncreqs; n++) { 2122 bp = g_alloc_bio(); 2123 disk->d_sync.ds_bios[n] = bp; 2124 bp->bio_parent = NULL; 2125 bp->bio_cmd = BIO_READ; 2126 bp->bio_data = malloc(MAXPHYS, M_RAID3, M_WAITOK); 2127 bp->bio_cflags = 0; 2128 bp->bio_offset = disk->d_sync.ds_offset * (sc->sc_ndisks - 1); 2129 bp->bio_length = MIN(MAXPHYS, sc->sc_mediasize - bp->bio_offset); 2130 disk->d_sync.ds_offset += bp->bio_length / (sc->sc_ndisks - 1); 2131 bp->bio_done = g_raid3_sync_done; 2132 bp->bio_from = disk->d_sync.ds_consumer; 2133 bp->bio_to = sc->sc_provider; 2134 bp->bio_caller1 = (void *)(uintptr_t)n; 2135 } 2136 2137 /* Set the number of in-flight synchronization requests. */ 2138 disk->d_sync.ds_inflight = g_raid3_syncreqs; 2139 2140 /* 2141 * Fire off first synchronization requests. 2142 */ 2143 for (n = 0; n < g_raid3_syncreqs; n++) { 2144 bp = disk->d_sync.ds_bios[n]; 2145 G_RAID3_LOGREQ(3, bp, "Sending synchronization request."); 2146 disk->d_sync.ds_consumer->index++; 2147 /* 2148 * Delay the request if it is colliding with a regular request. 2149 */ 2150 if (g_raid3_regular_collision(sc, bp)) 2151 g_raid3_sync_delay(sc, bp); 2152 else 2153 g_io_request(bp, disk->d_sync.ds_consumer); 2154 } 2155 } 2156 2157 /* 2158 * Stop synchronization process. 2159 * type: 0 - synchronization finished 2160 * 1 - synchronization stopped 2161 */ 2162 static void 2163 g_raid3_sync_stop(struct g_raid3_softc *sc, int type) 2164 { 2165 struct g_raid3_disk *disk; 2166 struct g_consumer *cp; 2167 2168 g_topology_assert_not(); 2169 sx_assert(&sc->sc_lock, SX_LOCKED); 2170 2171 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED, 2172 ("Device not in DEGRADED state (%s, %u).", sc->sc_name, 2173 sc->sc_state)); 2174 disk = sc->sc_syncdisk; 2175 sc->sc_syncdisk = NULL; 2176 KASSERT(disk != NULL, ("No disk was synchronized (%s).", sc->sc_name)); 2177 KASSERT(disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, 2178 ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), 2179 g_raid3_disk_state2str(disk->d_state))); 2180 if (disk->d_sync.ds_consumer == NULL) 2181 return; 2182 2183 if (type == 0) { 2184 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s finished.", 2185 sc->sc_name, g_raid3_get_diskname(disk)); 2186 } else /* if (type == 1) */ { 2187 G_RAID3_DEBUG(0, "Device %s: rebuilding provider %s stopped.", 2188 sc->sc_name, g_raid3_get_diskname(disk)); 2189 } 2190 free(disk->d_sync.ds_bios, M_RAID3); 2191 disk->d_sync.ds_bios = NULL; 2192 cp = disk->d_sync.ds_consumer; 2193 disk->d_sync.ds_consumer = NULL; 2194 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 2195 sx_xunlock(&sc->sc_lock); /* Avoid recursion on sc_lock. */ 2196 g_topology_lock(); 2197 g_raid3_kill_consumer(sc, cp); 2198 g_topology_unlock(); 2199 sx_xlock(&sc->sc_lock); 2200 } 2201 2202 static void 2203 g_raid3_launch_provider(struct g_raid3_softc *sc) 2204 { 2205 struct g_provider *pp; 2206 2207 sx_assert(&sc->sc_lock, SX_LOCKED); 2208 2209 g_topology_lock(); 2210 pp = g_new_providerf(sc->sc_geom, "raid3/%s", sc->sc_name); 2211 pp->mediasize = sc->sc_mediasize; 2212 pp->sectorsize = sc->sc_sectorsize; 2213 sc->sc_provider = pp; 2214 g_error_provider(pp, 0); 2215 g_topology_unlock(); 2216 G_RAID3_DEBUG(0, "Device %s: provider %s launched.", sc->sc_name, 2217 pp->name); 2218 if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED) 2219 g_raid3_sync_start(sc); 2220 } 2221 2222 static void 2223 g_raid3_destroy_provider(struct g_raid3_softc *sc) 2224 { 2225 struct bio *bp; 2226 2227 g_topology_assert_not(); 2228 KASSERT(sc->sc_provider != NULL, ("NULL provider (device=%s).", 2229 sc->sc_name)); 2230 2231 g_topology_lock(); 2232 g_error_provider(sc->sc_provider, ENXIO); 2233 mtx_lock(&sc->sc_queue_mtx); 2234 while ((bp = bioq_first(&sc->sc_queue)) != NULL) { 2235 bioq_remove(&sc->sc_queue, bp); 2236 g_io_deliver(bp, ENXIO); 2237 } 2238 mtx_unlock(&sc->sc_queue_mtx); 2239 G_RAID3_DEBUG(0, "Device %s: provider %s destroyed.", sc->sc_name, 2240 sc->sc_provider->name); 2241 sc->sc_provider->flags |= G_PF_WITHER; 2242 g_orphan_provider(sc->sc_provider, ENXIO); 2243 g_topology_unlock(); 2244 sc->sc_provider = NULL; 2245 if (sc->sc_syncdisk != NULL) 2246 g_raid3_sync_stop(sc, 1); 2247 } 2248 2249 static void 2250 g_raid3_go(void *arg) 2251 { 2252 struct g_raid3_softc *sc; 2253 2254 sc = arg; 2255 G_RAID3_DEBUG(0, "Force device %s start due to timeout.", sc->sc_name); 2256 g_raid3_event_send(sc, 0, 2257 G_RAID3_EVENT_DONTWAIT | G_RAID3_EVENT_DEVICE); 2258 } 2259 2260 static u_int 2261 g_raid3_determine_state(struct g_raid3_disk *disk) 2262 { 2263 struct g_raid3_softc *sc; 2264 u_int state; 2265 2266 sc = disk->d_softc; 2267 if (sc->sc_syncid == disk->d_sync.ds_syncid) { 2268 if ((disk->d_flags & 2269 G_RAID3_DISK_FLAG_SYNCHRONIZING) == 0) { 2270 /* Disk does not need synchronization. */ 2271 state = G_RAID3_DISK_STATE_ACTIVE; 2272 } else { 2273 if ((sc->sc_flags & 2274 G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || 2275 (disk->d_flags & 2276 G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { 2277 /* 2278 * We can start synchronization from 2279 * the stored offset. 2280 */ 2281 state = G_RAID3_DISK_STATE_SYNCHRONIZING; 2282 } else { 2283 state = G_RAID3_DISK_STATE_STALE; 2284 } 2285 } 2286 } else if (disk->d_sync.ds_syncid < sc->sc_syncid) { 2287 /* 2288 * Reset all synchronization data for this disk, 2289 * because if it even was synchronized, it was 2290 * synchronized to disks with different syncid. 2291 */ 2292 disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; 2293 disk->d_sync.ds_offset = 0; 2294 disk->d_sync.ds_offset_done = 0; 2295 disk->d_sync.ds_syncid = sc->sc_syncid; 2296 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) == 0 || 2297 (disk->d_flags & G_RAID3_DISK_FLAG_FORCE_SYNC) != 0) { 2298 state = G_RAID3_DISK_STATE_SYNCHRONIZING; 2299 } else { 2300 state = G_RAID3_DISK_STATE_STALE; 2301 } 2302 } else /* if (sc->sc_syncid < disk->d_sync.ds_syncid) */ { 2303 /* 2304 * Not good, NOT GOOD! 2305 * It means that device was started on stale disks 2306 * and more fresh disk just arrive. 2307 * If there were writes, device is fucked up, sorry. 2308 * I think the best choice here is don't touch 2309 * this disk and inform the user laudly. 2310 */ 2311 G_RAID3_DEBUG(0, "Device %s was started before the freshest " 2312 "disk (%s) arrives!! It will not be connected to the " 2313 "running device.", sc->sc_name, 2314 g_raid3_get_diskname(disk)); 2315 g_raid3_destroy_disk(disk); 2316 state = G_RAID3_DISK_STATE_NONE; 2317 /* Return immediately, because disk was destroyed. */ 2318 return (state); 2319 } 2320 G_RAID3_DEBUG(3, "State for %s disk: %s.", 2321 g_raid3_get_diskname(disk), g_raid3_disk_state2str(state)); 2322 return (state); 2323 } 2324 2325 /* 2326 * Update device state. 2327 */ 2328 static void 2329 g_raid3_update_device(struct g_raid3_softc *sc, boolean_t force) 2330 { 2331 struct g_raid3_disk *disk; 2332 u_int state; 2333 2334 sx_assert(&sc->sc_lock, SX_XLOCKED); 2335 2336 switch (sc->sc_state) { 2337 case G_RAID3_DEVICE_STATE_STARTING: 2338 { 2339 u_int n, ndirty, ndisks, genid, syncid; 2340 2341 KASSERT(sc->sc_provider == NULL, 2342 ("Non-NULL provider in STARTING state (%s).", sc->sc_name)); 2343 /* 2344 * Are we ready? We are, if all disks are connected or 2345 * one disk is missing and 'force' is true. 2346 */ 2347 if (g_raid3_ndisks(sc, -1) + force == sc->sc_ndisks) { 2348 if (!force) 2349 callout_drain(&sc->sc_callout); 2350 } else { 2351 if (force) { 2352 /* 2353 * Timeout expired, so destroy device. 2354 */ 2355 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; 2356 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", 2357 __LINE__, sc->sc_rootmount); 2358 root_mount_rel(sc->sc_rootmount); 2359 sc->sc_rootmount = NULL; 2360 } 2361 return; 2362 } 2363 2364 /* 2365 * Find the biggest genid. 2366 */ 2367 genid = 0; 2368 for (n = 0; n < sc->sc_ndisks; n++) { 2369 disk = &sc->sc_disks[n]; 2370 if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 2371 continue; 2372 if (disk->d_genid > genid) 2373 genid = disk->d_genid; 2374 } 2375 sc->sc_genid = genid; 2376 /* 2377 * Remove all disks without the biggest genid. 2378 */ 2379 for (n = 0; n < sc->sc_ndisks; n++) { 2380 disk = &sc->sc_disks[n]; 2381 if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 2382 continue; 2383 if (disk->d_genid < genid) { 2384 G_RAID3_DEBUG(0, 2385 "Component %s (device %s) broken, skipping.", 2386 g_raid3_get_diskname(disk), sc->sc_name); 2387 g_raid3_destroy_disk(disk); 2388 } 2389 } 2390 2391 /* 2392 * There must be at least 'sc->sc_ndisks - 1' components 2393 * with the same syncid and without SYNCHRONIZING flag. 2394 */ 2395 2396 /* 2397 * Find the biggest syncid, number of valid components and 2398 * number of dirty components. 2399 */ 2400 ndirty = ndisks = syncid = 0; 2401 for (n = 0; n < sc->sc_ndisks; n++) { 2402 disk = &sc->sc_disks[n]; 2403 if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 2404 continue; 2405 if ((disk->d_flags & G_RAID3_DISK_FLAG_DIRTY) != 0) 2406 ndirty++; 2407 if (disk->d_sync.ds_syncid > syncid) { 2408 syncid = disk->d_sync.ds_syncid; 2409 ndisks = 0; 2410 } else if (disk->d_sync.ds_syncid < syncid) { 2411 continue; 2412 } 2413 if ((disk->d_flags & 2414 G_RAID3_DISK_FLAG_SYNCHRONIZING) != 0) { 2415 continue; 2416 } 2417 ndisks++; 2418 } 2419 /* 2420 * Do we have enough valid components? 2421 */ 2422 if (ndisks + 1 < sc->sc_ndisks) { 2423 G_RAID3_DEBUG(0, 2424 "Device %s is broken, too few valid components.", 2425 sc->sc_name); 2426 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; 2427 return; 2428 } 2429 /* 2430 * If there is one DIRTY component and all disks are present, 2431 * mark it for synchronization. If there is more than one DIRTY 2432 * component, mark parity component for synchronization. 2433 */ 2434 if (ndisks == sc->sc_ndisks && ndirty == 1) { 2435 for (n = 0; n < sc->sc_ndisks; n++) { 2436 disk = &sc->sc_disks[n]; 2437 if ((disk->d_flags & 2438 G_RAID3_DISK_FLAG_DIRTY) == 0) { 2439 continue; 2440 } 2441 disk->d_flags |= 2442 G_RAID3_DISK_FLAG_SYNCHRONIZING; 2443 } 2444 } else if (ndisks == sc->sc_ndisks && ndirty > 1) { 2445 disk = &sc->sc_disks[sc->sc_ndisks - 1]; 2446 disk->d_flags |= G_RAID3_DISK_FLAG_SYNCHRONIZING; 2447 } 2448 2449 sc->sc_syncid = syncid; 2450 if (force) { 2451 /* Remember to bump syncid on first write. */ 2452 sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; 2453 } 2454 if (ndisks == sc->sc_ndisks) 2455 state = G_RAID3_DEVICE_STATE_COMPLETE; 2456 else /* if (ndisks == sc->sc_ndisks - 1) */ 2457 state = G_RAID3_DEVICE_STATE_DEGRADED; 2458 G_RAID3_DEBUG(1, "Device %s state changed from %s to %s.", 2459 sc->sc_name, g_raid3_device_state2str(sc->sc_state), 2460 g_raid3_device_state2str(state)); 2461 sc->sc_state = state; 2462 for (n = 0; n < sc->sc_ndisks; n++) { 2463 disk = &sc->sc_disks[n]; 2464 if (disk->d_state == G_RAID3_DISK_STATE_NODISK) 2465 continue; 2466 state = g_raid3_determine_state(disk); 2467 g_raid3_event_send(disk, state, G_RAID3_EVENT_DONTWAIT); 2468 if (state == G_RAID3_DISK_STATE_STALE) 2469 sc->sc_bump_id |= G_RAID3_BUMP_SYNCID; 2470 } 2471 break; 2472 } 2473 case G_RAID3_DEVICE_STATE_DEGRADED: 2474 /* 2475 * Genid need to be bumped immediately, so do it here. 2476 */ 2477 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { 2478 sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; 2479 g_raid3_bump_genid(sc); 2480 } 2481 2482 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) 2483 return; 2484 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < 2485 sc->sc_ndisks - 1) { 2486 if (sc->sc_provider != NULL) 2487 g_raid3_destroy_provider(sc); 2488 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; 2489 return; 2490 } 2491 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == 2492 sc->sc_ndisks) { 2493 state = G_RAID3_DEVICE_STATE_COMPLETE; 2494 G_RAID3_DEBUG(1, 2495 "Device %s state changed from %s to %s.", 2496 sc->sc_name, g_raid3_device_state2str(sc->sc_state), 2497 g_raid3_device_state2str(state)); 2498 sc->sc_state = state; 2499 } 2500 if (sc->sc_provider == NULL) 2501 g_raid3_launch_provider(sc); 2502 if (sc->sc_rootmount != NULL) { 2503 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, 2504 sc->sc_rootmount); 2505 root_mount_rel(sc->sc_rootmount); 2506 sc->sc_rootmount = NULL; 2507 } 2508 break; 2509 case G_RAID3_DEVICE_STATE_COMPLETE: 2510 /* 2511 * Genid need to be bumped immediately, so do it here. 2512 */ 2513 if ((sc->sc_bump_id & G_RAID3_BUMP_GENID) != 0) { 2514 sc->sc_bump_id &= ~G_RAID3_BUMP_GENID; 2515 g_raid3_bump_genid(sc); 2516 } 2517 2518 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NEW) > 0) 2519 return; 2520 KASSERT(g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) >= 2521 sc->sc_ndisks - 1, 2522 ("Too few ACTIVE components in COMPLETE state (device %s).", 2523 sc->sc_name)); 2524 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) == 2525 sc->sc_ndisks - 1) { 2526 state = G_RAID3_DEVICE_STATE_DEGRADED; 2527 G_RAID3_DEBUG(1, 2528 "Device %s state changed from %s to %s.", 2529 sc->sc_name, g_raid3_device_state2str(sc->sc_state), 2530 g_raid3_device_state2str(state)); 2531 sc->sc_state = state; 2532 } 2533 if (sc->sc_provider == NULL) 2534 g_raid3_launch_provider(sc); 2535 if (sc->sc_rootmount != NULL) { 2536 G_RAID3_DEBUG(1, "root_mount_rel[%u] %p", __LINE__, 2537 sc->sc_rootmount); 2538 root_mount_rel(sc->sc_rootmount); 2539 sc->sc_rootmount = NULL; 2540 } 2541 break; 2542 default: 2543 KASSERT(1 == 0, ("Wrong device state (%s, %s).", sc->sc_name, 2544 g_raid3_device_state2str(sc->sc_state))); 2545 break; 2546 } 2547 } 2548 2549 /* 2550 * Update disk state and device state if needed. 2551 */ 2552 #define DISK_STATE_CHANGED() G_RAID3_DEBUG(1, \ 2553 "Disk %s state changed from %s to %s (device %s).", \ 2554 g_raid3_get_diskname(disk), \ 2555 g_raid3_disk_state2str(disk->d_state), \ 2556 g_raid3_disk_state2str(state), sc->sc_name) 2557 static int 2558 g_raid3_update_disk(struct g_raid3_disk *disk, u_int state) 2559 { 2560 struct g_raid3_softc *sc; 2561 2562 sc = disk->d_softc; 2563 sx_assert(&sc->sc_lock, SX_XLOCKED); 2564 2565 again: 2566 G_RAID3_DEBUG(3, "Changing disk %s state from %s to %s.", 2567 g_raid3_get_diskname(disk), g_raid3_disk_state2str(disk->d_state), 2568 g_raid3_disk_state2str(state)); 2569 switch (state) { 2570 case G_RAID3_DISK_STATE_NEW: 2571 /* 2572 * Possible scenarios: 2573 * 1. New disk arrive. 2574 */ 2575 /* Previous state should be NONE. */ 2576 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NONE, 2577 ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), 2578 g_raid3_disk_state2str(disk->d_state))); 2579 DISK_STATE_CHANGED(); 2580 2581 disk->d_state = state; 2582 G_RAID3_DEBUG(0, "Device %s: provider %s detected.", 2583 sc->sc_name, g_raid3_get_diskname(disk)); 2584 if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) 2585 break; 2586 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || 2587 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, 2588 ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, 2589 g_raid3_device_state2str(sc->sc_state), 2590 g_raid3_get_diskname(disk), 2591 g_raid3_disk_state2str(disk->d_state))); 2592 state = g_raid3_determine_state(disk); 2593 if (state != G_RAID3_DISK_STATE_NONE) 2594 goto again; 2595 break; 2596 case G_RAID3_DISK_STATE_ACTIVE: 2597 /* 2598 * Possible scenarios: 2599 * 1. New disk does not need synchronization. 2600 * 2. Synchronization process finished successfully. 2601 */ 2602 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || 2603 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, 2604 ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, 2605 g_raid3_device_state2str(sc->sc_state), 2606 g_raid3_get_diskname(disk), 2607 g_raid3_disk_state2str(disk->d_state))); 2608 /* Previous state should be NEW or SYNCHRONIZING. */ 2609 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW || 2610 disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, 2611 ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), 2612 g_raid3_disk_state2str(disk->d_state))); 2613 DISK_STATE_CHANGED(); 2614 2615 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 2616 disk->d_flags &= ~G_RAID3_DISK_FLAG_SYNCHRONIZING; 2617 disk->d_flags &= ~G_RAID3_DISK_FLAG_FORCE_SYNC; 2618 g_raid3_sync_stop(sc, 0); 2619 } 2620 disk->d_state = state; 2621 disk->d_sync.ds_offset = 0; 2622 disk->d_sync.ds_offset_done = 0; 2623 g_raid3_update_idle(sc, disk); 2624 g_raid3_update_metadata(disk); 2625 G_RAID3_DEBUG(0, "Device %s: provider %s activated.", 2626 sc->sc_name, g_raid3_get_diskname(disk)); 2627 break; 2628 case G_RAID3_DISK_STATE_STALE: 2629 /* 2630 * Possible scenarios: 2631 * 1. Stale disk was connected. 2632 */ 2633 /* Previous state should be NEW. */ 2634 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, 2635 ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), 2636 g_raid3_disk_state2str(disk->d_state))); 2637 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || 2638 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, 2639 ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, 2640 g_raid3_device_state2str(sc->sc_state), 2641 g_raid3_get_diskname(disk), 2642 g_raid3_disk_state2str(disk->d_state))); 2643 /* 2644 * STALE state is only possible if device is marked 2645 * NOAUTOSYNC. 2646 */ 2647 KASSERT((sc->sc_flags & G_RAID3_DEVICE_FLAG_NOAUTOSYNC) != 0, 2648 ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, 2649 g_raid3_device_state2str(sc->sc_state), 2650 g_raid3_get_diskname(disk), 2651 g_raid3_disk_state2str(disk->d_state))); 2652 DISK_STATE_CHANGED(); 2653 2654 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 2655 disk->d_state = state; 2656 g_raid3_update_metadata(disk); 2657 G_RAID3_DEBUG(0, "Device %s: provider %s is stale.", 2658 sc->sc_name, g_raid3_get_diskname(disk)); 2659 break; 2660 case G_RAID3_DISK_STATE_SYNCHRONIZING: 2661 /* 2662 * Possible scenarios: 2663 * 1. Disk which needs synchronization was connected. 2664 */ 2665 /* Previous state should be NEW. */ 2666 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, 2667 ("Wrong disk state (%s, %s).", g_raid3_get_diskname(disk), 2668 g_raid3_disk_state2str(disk->d_state))); 2669 KASSERT(sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || 2670 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE, 2671 ("Wrong device state (%s, %s, %s, %s).", sc->sc_name, 2672 g_raid3_device_state2str(sc->sc_state), 2673 g_raid3_get_diskname(disk), 2674 g_raid3_disk_state2str(disk->d_state))); 2675 DISK_STATE_CHANGED(); 2676 2677 if (disk->d_state == G_RAID3_DISK_STATE_NEW) 2678 disk->d_flags &= ~G_RAID3_DISK_FLAG_DIRTY; 2679 disk->d_state = state; 2680 if (sc->sc_provider != NULL) { 2681 g_raid3_sync_start(sc); 2682 g_raid3_update_metadata(disk); 2683 } 2684 break; 2685 case G_RAID3_DISK_STATE_DISCONNECTED: 2686 /* 2687 * Possible scenarios: 2688 * 1. Device wasn't running yet, but disk disappear. 2689 * 2. Disk was active and disapppear. 2690 * 3. Disk disappear during synchronization process. 2691 */ 2692 if (sc->sc_state == G_RAID3_DEVICE_STATE_DEGRADED || 2693 sc->sc_state == G_RAID3_DEVICE_STATE_COMPLETE) { 2694 /* 2695 * Previous state should be ACTIVE, STALE or 2696 * SYNCHRONIZING. 2697 */ 2698 KASSERT(disk->d_state == G_RAID3_DISK_STATE_ACTIVE || 2699 disk->d_state == G_RAID3_DISK_STATE_STALE || 2700 disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING, 2701 ("Wrong disk state (%s, %s).", 2702 g_raid3_get_diskname(disk), 2703 g_raid3_disk_state2str(disk->d_state))); 2704 } else if (sc->sc_state == G_RAID3_DEVICE_STATE_STARTING) { 2705 /* Previous state should be NEW. */ 2706 KASSERT(disk->d_state == G_RAID3_DISK_STATE_NEW, 2707 ("Wrong disk state (%s, %s).", 2708 g_raid3_get_diskname(disk), 2709 g_raid3_disk_state2str(disk->d_state))); 2710 /* 2711 * Reset bumping syncid if disk disappeared in STARTING 2712 * state. 2713 */ 2714 if ((sc->sc_bump_id & G_RAID3_BUMP_SYNCID) != 0) 2715 sc->sc_bump_id &= ~G_RAID3_BUMP_SYNCID; 2716 #ifdef INVARIANTS 2717 } else { 2718 KASSERT(1 == 0, ("Wrong device state (%s, %s, %s, %s).", 2719 sc->sc_name, 2720 g_raid3_device_state2str(sc->sc_state), 2721 g_raid3_get_diskname(disk), 2722 g_raid3_disk_state2str(disk->d_state))); 2723 #endif 2724 } 2725 DISK_STATE_CHANGED(); 2726 G_RAID3_DEBUG(0, "Device %s: provider %s disconnected.", 2727 sc->sc_name, g_raid3_get_diskname(disk)); 2728 2729 g_raid3_destroy_disk(disk); 2730 break; 2731 default: 2732 KASSERT(1 == 0, ("Unknown state (%u).", state)); 2733 break; 2734 } 2735 return (0); 2736 } 2737 #undef DISK_STATE_CHANGED 2738 2739 int 2740 g_raid3_read_metadata(struct g_consumer *cp, struct g_raid3_metadata *md) 2741 { 2742 struct g_provider *pp; 2743 u_char *buf; 2744 int error; 2745 2746 g_topology_assert(); 2747 2748 error = g_access(cp, 1, 0, 0); 2749 if (error != 0) 2750 return (error); 2751 pp = cp->provider; 2752 g_topology_unlock(); 2753 /* Metadata are stored on last sector. */ 2754 buf = g_read_data(cp, pp->mediasize - pp->sectorsize, pp->sectorsize, 2755 &error); 2756 g_topology_lock(); 2757 g_access(cp, -1, 0, 0); 2758 if (buf == NULL) { 2759 G_RAID3_DEBUG(1, "Cannot read metadata from %s (error=%d).", 2760 cp->provider->name, error); 2761 return (error); 2762 } 2763 2764 /* Decode metadata. */ 2765 error = raid3_metadata_decode(buf, md); 2766 g_free(buf); 2767 if (strcmp(md->md_magic, G_RAID3_MAGIC) != 0) 2768 return (EINVAL); 2769 if (md->md_version > G_RAID3_VERSION) { 2770 G_RAID3_DEBUG(0, 2771 "Kernel module is too old to handle metadata from %s.", 2772 cp->provider->name); 2773 return (EINVAL); 2774 } 2775 if (error != 0) { 2776 G_RAID3_DEBUG(1, "MD5 metadata hash mismatch for provider %s.", 2777 cp->provider->name); 2778 return (error); 2779 } 2780 2781 return (0); 2782 } 2783 2784 static int 2785 g_raid3_check_metadata(struct g_raid3_softc *sc, struct g_provider *pp, 2786 struct g_raid3_metadata *md) 2787 { 2788 2789 if (md->md_no >= sc->sc_ndisks) { 2790 G_RAID3_DEBUG(1, "Invalid disk %s number (no=%u), skipping.", 2791 pp->name, md->md_no); 2792 return (EINVAL); 2793 } 2794 if (sc->sc_disks[md->md_no].d_state != G_RAID3_DISK_STATE_NODISK) { 2795 G_RAID3_DEBUG(1, "Disk %s (no=%u) already exists, skipping.", 2796 pp->name, md->md_no); 2797 return (EEXIST); 2798 } 2799 if (md->md_all != sc->sc_ndisks) { 2800 G_RAID3_DEBUG(1, 2801 "Invalid '%s' field on disk %s (device %s), skipping.", 2802 "md_all", pp->name, sc->sc_name); 2803 return (EINVAL); 2804 } 2805 if (md->md_mediasize != sc->sc_mediasize) { 2806 G_RAID3_DEBUG(1, 2807 "Invalid '%s' field on disk %s (device %s), skipping.", 2808 "md_mediasize", pp->name, sc->sc_name); 2809 return (EINVAL); 2810 } 2811 if ((md->md_mediasize % (sc->sc_ndisks - 1)) != 0) { 2812 G_RAID3_DEBUG(1, 2813 "Invalid '%s' field on disk %s (device %s), skipping.", 2814 "md_mediasize", pp->name, sc->sc_name); 2815 return (EINVAL); 2816 } 2817 if ((sc->sc_mediasize / (sc->sc_ndisks - 1)) > pp->mediasize) { 2818 G_RAID3_DEBUG(1, 2819 "Invalid size of disk %s (device %s), skipping.", pp->name, 2820 sc->sc_name); 2821 return (EINVAL); 2822 } 2823 if ((md->md_sectorsize / pp->sectorsize) < sc->sc_ndisks - 1) { 2824 G_RAID3_DEBUG(1, 2825 "Invalid '%s' field on disk %s (device %s), skipping.", 2826 "md_sectorsize", pp->name, sc->sc_name); 2827 return (EINVAL); 2828 } 2829 if (md->md_sectorsize != sc->sc_sectorsize) { 2830 G_RAID3_DEBUG(1, 2831 "Invalid '%s' field on disk %s (device %s), skipping.", 2832 "md_sectorsize", pp->name, sc->sc_name); 2833 return (EINVAL); 2834 } 2835 if ((sc->sc_sectorsize % pp->sectorsize) != 0) { 2836 G_RAID3_DEBUG(1, 2837 "Invalid sector size of disk %s (device %s), skipping.", 2838 pp->name, sc->sc_name); 2839 return (EINVAL); 2840 } 2841 if ((md->md_mflags & ~G_RAID3_DEVICE_FLAG_MASK) != 0) { 2842 G_RAID3_DEBUG(1, 2843 "Invalid device flags on disk %s (device %s), skipping.", 2844 pp->name, sc->sc_name); 2845 return (EINVAL); 2846 } 2847 if ((md->md_mflags & G_RAID3_DEVICE_FLAG_VERIFY) != 0 && 2848 (md->md_mflags & G_RAID3_DEVICE_FLAG_ROUND_ROBIN) != 0) { 2849 /* 2850 * VERIFY and ROUND-ROBIN options are mutally exclusive. 2851 */ 2852 G_RAID3_DEBUG(1, "Both VERIFY and ROUND-ROBIN flags exist on " 2853 "disk %s (device %s), skipping.", pp->name, sc->sc_name); 2854 return (EINVAL); 2855 } 2856 if ((md->md_dflags & ~G_RAID3_DISK_FLAG_MASK) != 0) { 2857 G_RAID3_DEBUG(1, 2858 "Invalid disk flags on disk %s (device %s), skipping.", 2859 pp->name, sc->sc_name); 2860 return (EINVAL); 2861 } 2862 return (0); 2863 } 2864 2865 int 2866 g_raid3_add_disk(struct g_raid3_softc *sc, struct g_provider *pp, 2867 struct g_raid3_metadata *md) 2868 { 2869 struct g_raid3_disk *disk; 2870 int error; 2871 2872 g_topology_assert_not(); 2873 G_RAID3_DEBUG(2, "Adding disk %s.", pp->name); 2874 2875 error = g_raid3_check_metadata(sc, pp, md); 2876 if (error != 0) 2877 return (error); 2878 if (sc->sc_state != G_RAID3_DEVICE_STATE_STARTING && 2879 md->md_genid < sc->sc_genid) { 2880 G_RAID3_DEBUG(0, "Component %s (device %s) broken, skipping.", 2881 pp->name, sc->sc_name); 2882 return (EINVAL); 2883 } 2884 disk = g_raid3_init_disk(sc, pp, md, &error); 2885 if (disk == NULL) 2886 return (error); 2887 error = g_raid3_event_send(disk, G_RAID3_DISK_STATE_NEW, 2888 G_RAID3_EVENT_WAIT); 2889 if (error != 0) 2890 return (error); 2891 if (md->md_version < G_RAID3_VERSION) { 2892 G_RAID3_DEBUG(0, "Upgrading metadata on %s (v%d->v%d).", 2893 pp->name, md->md_version, G_RAID3_VERSION); 2894 g_raid3_update_metadata(disk); 2895 } 2896 return (0); 2897 } 2898 2899 static int 2900 g_raid3_access(struct g_provider *pp, int acr, int acw, int ace) 2901 { 2902 struct g_raid3_softc *sc; 2903 int dcr, dcw, dce, error; 2904 2905 g_topology_assert(); 2906 G_RAID3_DEBUG(2, "Access request for %s: r%dw%de%d.", pp->name, acr, 2907 acw, ace); 2908 2909 dcr = pp->acr + acr; 2910 dcw = pp->acw + acw; 2911 dce = pp->ace + ace; 2912 2913 error = 0; 2914 sc = pp->geom->softc; 2915 if (sc != NULL) { 2916 if ((sc->sc_flags & G_RAID3_DEVICE_FLAG_DESTROY) != 0) 2917 sc = NULL; 2918 else { 2919 g_topology_unlock(); 2920 sx_xlock(&sc->sc_lock); 2921 } 2922 } 2923 if (sc == NULL || 2924 g_raid3_ndisks(sc, G_RAID3_DISK_STATE_ACTIVE) < sc->sc_ndisks - 1) { 2925 if (acr > 0 || acw > 0 || ace > 0) 2926 error = ENXIO; 2927 goto end; 2928 } 2929 if (dcw == 0 && !sc->sc_idle) 2930 g_raid3_idle(sc, dcw); 2931 end: 2932 if (sc != NULL) { 2933 sx_xunlock(&sc->sc_lock); 2934 g_topology_lock(); 2935 } 2936 return (error); 2937 } 2938 2939 static struct g_geom * 2940 g_raid3_create(struct g_class *mp, const struct g_raid3_metadata *md) 2941 { 2942 struct g_raid3_softc *sc; 2943 struct g_geom *gp; 2944 int error, timeout; 2945 u_int n; 2946 2947 g_topology_assert(); 2948 G_RAID3_DEBUG(1, "Creating device %s (id=%u).", md->md_name, md->md_id); 2949 2950 /* One disk is minimum. */ 2951 if (md->md_all < 1) 2952 return (NULL); 2953 /* 2954 * Action geom. 2955 */ 2956 gp = g_new_geomf(mp, "%s", md->md_name); 2957 sc = malloc(sizeof(*sc), M_RAID3, M_WAITOK | M_ZERO); 2958 sc->sc_disks = malloc(sizeof(struct g_raid3_disk) * md->md_all, M_RAID3, 2959 M_WAITOK | M_ZERO); 2960 gp->start = g_raid3_start; 2961 gp->orphan = g_raid3_orphan; 2962 gp->access = g_raid3_access; 2963 gp->dumpconf = g_raid3_dumpconf; 2964 2965 sc->sc_id = md->md_id; 2966 sc->sc_mediasize = md->md_mediasize; 2967 sc->sc_sectorsize = md->md_sectorsize; 2968 sc->sc_ndisks = md->md_all; 2969 sc->sc_round_robin = 0; 2970 sc->sc_flags = md->md_mflags; 2971 sc->sc_bump_id = 0; 2972 sc->sc_idle = 1; 2973 sc->sc_last_write = time_uptime; 2974 sc->sc_writes = 0; 2975 for (n = 0; n < sc->sc_ndisks; n++) { 2976 sc->sc_disks[n].d_softc = sc; 2977 sc->sc_disks[n].d_no = n; 2978 sc->sc_disks[n].d_state = G_RAID3_DISK_STATE_NODISK; 2979 } 2980 sx_init(&sc->sc_lock, "graid3:lock"); 2981 bioq_init(&sc->sc_queue); 2982 mtx_init(&sc->sc_queue_mtx, "graid3:queue", NULL, MTX_DEF); 2983 bioq_init(&sc->sc_regular_delayed); 2984 bioq_init(&sc->sc_inflight); 2985 bioq_init(&sc->sc_sync_delayed); 2986 TAILQ_INIT(&sc->sc_events); 2987 mtx_init(&sc->sc_events_mtx, "graid3:events", NULL, MTX_DEF); 2988 callout_init(&sc->sc_callout, CALLOUT_MPSAFE); 2989 sc->sc_state = G_RAID3_DEVICE_STATE_STARTING; 2990 gp->softc = sc; 2991 sc->sc_geom = gp; 2992 sc->sc_provider = NULL; 2993 /* 2994 * Synchronization geom. 2995 */ 2996 gp = g_new_geomf(mp, "%s.sync", md->md_name); 2997 gp->softc = sc; 2998 gp->orphan = g_raid3_orphan; 2999 sc->sc_sync.ds_geom = gp; 3000 3001 sc->sc_zones[G_RAID3_ZONE_64K].sz_zone = uma_zcreate("gr3:64k", 65536, 3002 g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 3003 sc->sc_zones[G_RAID3_ZONE_64K].sz_inuse = 0; 3004 sc->sc_zones[G_RAID3_ZONE_64K].sz_max = g_raid3_n64k; 3005 sc->sc_zones[G_RAID3_ZONE_64K].sz_requested = 3006 sc->sc_zones[G_RAID3_ZONE_64K].sz_failed = 0; 3007 sc->sc_zones[G_RAID3_ZONE_16K].sz_zone = uma_zcreate("gr3:16k", 16384, 3008 g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 3009 sc->sc_zones[G_RAID3_ZONE_16K].sz_inuse = 0; 3010 sc->sc_zones[G_RAID3_ZONE_16K].sz_max = g_raid3_n16k; 3011 sc->sc_zones[G_RAID3_ZONE_16K].sz_requested = 3012 sc->sc_zones[G_RAID3_ZONE_16K].sz_failed = 0; 3013 sc->sc_zones[G_RAID3_ZONE_4K].sz_zone = uma_zcreate("gr3:4k", 4096, 3014 g_raid3_uma_ctor, g_raid3_uma_dtor, NULL, NULL, UMA_ALIGN_PTR, 0); 3015 sc->sc_zones[G_RAID3_ZONE_4K].sz_inuse = 0; 3016 sc->sc_zones[G_RAID3_ZONE_4K].sz_max = g_raid3_n4k; 3017 sc->sc_zones[G_RAID3_ZONE_4K].sz_requested = 3018 sc->sc_zones[G_RAID3_ZONE_4K].sz_failed = 0; 3019 3020 error = kthread_create(g_raid3_worker, sc, &sc->sc_worker, 0, 0, 3021 "g_raid3 %s", md->md_name); 3022 if (error != 0) { 3023 G_RAID3_DEBUG(1, "Cannot create kernel thread for %s.", 3024 sc->sc_name); 3025 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_64K].sz_zone); 3026 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_16K].sz_zone); 3027 uma_zdestroy(sc->sc_zones[G_RAID3_ZONE_4K].sz_zone); 3028 g_destroy_geom(sc->sc_sync.ds_geom); 3029 mtx_destroy(&sc->sc_events_mtx); 3030 mtx_destroy(&sc->sc_queue_mtx); 3031 sx_destroy(&sc->sc_lock); 3032 g_destroy_geom(sc->sc_geom); 3033 free(sc->sc_disks, M_RAID3); 3034 free(sc, M_RAID3); 3035 return (NULL); 3036 } 3037 3038 G_RAID3_DEBUG(0, "Device %s created (id=%u).", sc->sc_name, sc->sc_id); 3039 3040 sc->sc_rootmount = root_mount_hold("GRAID3"); 3041 G_RAID3_DEBUG(1, "root_mount_hold %p", sc->sc_rootmount); 3042 3043 /* 3044 * Run timeout. 3045 */ 3046 timeout = atomic_load_acq_int(&g_raid3_timeout); 3047 callout_reset(&sc->sc_callout, timeout * hz, g_raid3_go, sc); 3048 return (sc->sc_geom); 3049 } 3050 3051 int 3052 g_raid3_destroy(struct g_raid3_softc *sc, boolean_t force) 3053 { 3054 struct g_provider *pp; 3055 3056 g_topology_assert_not(); 3057 if (sc == NULL) 3058 return (ENXIO); 3059 sx_assert(&sc->sc_lock, SX_XLOCKED); 3060 3061 pp = sc->sc_provider; 3062 if (pp != NULL && (pp->acr != 0 || pp->acw != 0 || pp->ace != 0)) { 3063 if (force) { 3064 G_RAID3_DEBUG(1, "Device %s is still open, so it " 3065 "can't be definitely removed.", pp->name); 3066 } else { 3067 G_RAID3_DEBUG(1, 3068 "Device %s is still open (r%dw%de%d).", pp->name, 3069 pp->acr, pp->acw, pp->ace); 3070 return (EBUSY); 3071 } 3072 } 3073 3074 sc->sc_flags |= G_RAID3_DEVICE_FLAG_DESTROY; 3075 sc->sc_flags |= G_RAID3_DEVICE_FLAG_WAIT; 3076 G_RAID3_DEBUG(4, "%s: Waking up %p.", __func__, sc); 3077 sx_xunlock(&sc->sc_lock); 3078 mtx_lock(&sc->sc_queue_mtx); 3079 wakeup(sc); 3080 wakeup(&sc->sc_queue); 3081 mtx_unlock(&sc->sc_queue_mtx); 3082 G_RAID3_DEBUG(4, "%s: Sleeping %p.", __func__, &sc->sc_worker); 3083 while (sc->sc_worker != NULL) 3084 tsleep(&sc->sc_worker, PRIBIO, "r3:destroy", hz / 5); 3085 G_RAID3_DEBUG(4, "%s: Woken up %p.", __func__, &sc->sc_worker); 3086 sx_xlock(&sc->sc_lock); 3087 g_raid3_destroy_device(sc); 3088 free(sc->sc_disks, M_RAID3); 3089 free(sc, M_RAID3); 3090 return (0); 3091 } 3092 3093 static void 3094 g_raid3_taste_orphan(struct g_consumer *cp) 3095 { 3096 3097 KASSERT(1 == 0, ("%s called while tasting %s.", __func__, 3098 cp->provider->name)); 3099 } 3100 3101 static struct g_geom * 3102 g_raid3_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) 3103 { 3104 struct g_raid3_metadata md; 3105 struct g_raid3_softc *sc; 3106 struct g_consumer *cp; 3107 struct g_geom *gp; 3108 int error; 3109 3110 g_topology_assert(); 3111 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); 3112 G_RAID3_DEBUG(2, "Tasting %s.", pp->name); 3113 3114 gp = g_new_geomf(mp, "raid3:taste"); 3115 /* This orphan function should be never called. */ 3116 gp->orphan = g_raid3_taste_orphan; 3117 cp = g_new_consumer(gp); 3118 g_attach(cp, pp); 3119 error = g_raid3_read_metadata(cp, &md); 3120 g_detach(cp); 3121 g_destroy_consumer(cp); 3122 g_destroy_geom(gp); 3123 if (error != 0) 3124 return (NULL); 3125 gp = NULL; 3126 3127 if (md.md_provider[0] != '\0' && strcmp(md.md_provider, pp->name) != 0) 3128 return (NULL); 3129 if (md.md_provsize != 0 && md.md_provsize != pp->mediasize) 3130 return (NULL); 3131 if (g_raid3_debug >= 2) 3132 raid3_metadata_dump(&md); 3133 3134 /* 3135 * Let's check if device already exists. 3136 */ 3137 sc = NULL; 3138 LIST_FOREACH(gp, &mp->geom, geom) { 3139 sc = gp->softc; 3140 if (sc == NULL) 3141 continue; 3142 if (sc->sc_sync.ds_geom == gp) 3143 continue; 3144 if (strcmp(md.md_name, sc->sc_name) != 0) 3145 continue; 3146 if (md.md_id != sc->sc_id) { 3147 G_RAID3_DEBUG(0, "Device %s already configured.", 3148 sc->sc_name); 3149 return (NULL); 3150 } 3151 break; 3152 } 3153 if (gp == NULL) { 3154 gp = g_raid3_create(mp, &md); 3155 if (gp == NULL) { 3156 G_RAID3_DEBUG(0, "Cannot create device %s.", 3157 md.md_name); 3158 return (NULL); 3159 } 3160 sc = gp->softc; 3161 } 3162 G_RAID3_DEBUG(1, "Adding disk %s to %s.", pp->name, gp->name); 3163 g_topology_unlock(); 3164 sx_xlock(&sc->sc_lock); 3165 error = g_raid3_add_disk(sc, pp, &md); 3166 if (error != 0) { 3167 G_RAID3_DEBUG(0, "Cannot add disk %s to %s (error=%d).", 3168 pp->name, gp->name, error); 3169 if (g_raid3_ndisks(sc, G_RAID3_DISK_STATE_NODISK) == 3170 sc->sc_ndisks) { 3171 g_raid3_destroy(sc, 1); 3172 g_topology_lock(); 3173 return (NULL); 3174 } 3175 gp = NULL; 3176 } 3177 sx_xunlock(&sc->sc_lock); 3178 g_topology_lock(); 3179 return (gp); 3180 } 3181 3182 static int 3183 g_raid3_destroy_geom(struct gctl_req *req __unused, struct g_class *mp __unused, 3184 struct g_geom *gp) 3185 { 3186 struct g_raid3_softc *sc; 3187 int error; 3188 3189 g_topology_unlock(); 3190 sc = gp->softc; 3191 sx_xlock(&sc->sc_lock); 3192 error = g_raid3_destroy(gp->softc, 0); 3193 if (error != 0) 3194 sx_xunlock(&sc->sc_lock); 3195 g_topology_lock(); 3196 return (error); 3197 } 3198 3199 static void 3200 g_raid3_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp, 3201 struct g_consumer *cp, struct g_provider *pp) 3202 { 3203 struct g_raid3_softc *sc; 3204 3205 g_topology_assert(); 3206 3207 sc = gp->softc; 3208 if (sc == NULL) 3209 return; 3210 /* Skip synchronization geom. */ 3211 if (gp == sc->sc_sync.ds_geom) 3212 return; 3213 if (pp != NULL) { 3214 /* Nothing here. */ 3215 } else if (cp != NULL) { 3216 struct g_raid3_disk *disk; 3217 3218 disk = cp->private; 3219 if (disk == NULL) 3220 return; 3221 g_topology_unlock(); 3222 sx_xlock(&sc->sc_lock); 3223 sbuf_printf(sb, "%s<Type>", indent); 3224 if (disk->d_no == sc->sc_ndisks - 1) 3225 sbuf_printf(sb, "PARITY"); 3226 else 3227 sbuf_printf(sb, "DATA"); 3228 sbuf_printf(sb, "</Type>\n"); 3229 sbuf_printf(sb, "%s<Number>%u</Number>\n", indent, 3230 (u_int)disk->d_no); 3231 if (disk->d_state == G_RAID3_DISK_STATE_SYNCHRONIZING) { 3232 sbuf_printf(sb, "%s<Synchronized>", indent); 3233 if (disk->d_sync.ds_offset == 0) 3234 sbuf_printf(sb, "0%%"); 3235 else { 3236 sbuf_printf(sb, "%u%%", 3237 (u_int)((disk->d_sync.ds_offset * 100) / 3238 (sc->sc_mediasize / (sc->sc_ndisks - 1)))); 3239 } 3240 sbuf_printf(sb, "</Synchronized>\n"); 3241 } 3242 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, 3243 disk->d_sync.ds_syncid); 3244 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, disk->d_genid); 3245 sbuf_printf(sb, "%s<Flags>", indent); 3246 if (disk->d_flags == 0) 3247 sbuf_printf(sb, "NONE"); 3248 else { 3249 int first = 1; 3250 3251 #define ADD_FLAG(flag, name) do { \ 3252 if ((disk->d_flags & (flag)) != 0) { \ 3253 if (!first) \ 3254 sbuf_printf(sb, ", "); \ 3255 else \ 3256 first = 0; \ 3257 sbuf_printf(sb, name); \ 3258 } \ 3259 } while (0) 3260 ADD_FLAG(G_RAID3_DISK_FLAG_DIRTY, "DIRTY"); 3261 ADD_FLAG(G_RAID3_DISK_FLAG_HARDCODED, "HARDCODED"); 3262 ADD_FLAG(G_RAID3_DISK_FLAG_SYNCHRONIZING, 3263 "SYNCHRONIZING"); 3264 ADD_FLAG(G_RAID3_DISK_FLAG_FORCE_SYNC, "FORCE_SYNC"); 3265 ADD_FLAG(G_RAID3_DISK_FLAG_BROKEN, "BROKEN"); 3266 #undef ADD_FLAG 3267 } 3268 sbuf_printf(sb, "</Flags>\n"); 3269 sbuf_printf(sb, "%s<State>%s</State>\n", indent, 3270 g_raid3_disk_state2str(disk->d_state)); 3271 sx_xunlock(&sc->sc_lock); 3272 g_topology_lock(); 3273 } else { 3274 g_topology_unlock(); 3275 sx_xlock(&sc->sc_lock); 3276 sbuf_printf(sb, "%s<Zone4kRequested>%u</Zone4kRequested>\n", 3277 indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_requested); 3278 sbuf_printf(sb, "%s<Zone4kFailed>%u</Zone4kFailed>\n", 3279 indent, sc->sc_zones[G_RAID3_ZONE_4K].sz_failed); 3280 sbuf_printf(sb, "%s<Zone16kRequested>%u</Zone16kRequested>\n", 3281 indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_requested); 3282 sbuf_printf(sb, "%s<Zone16kFailed>%u</Zone16kFailed>\n", 3283 indent, sc->sc_zones[G_RAID3_ZONE_16K].sz_failed); 3284 sbuf_printf(sb, "%s<Zone64kRequested>%u</Zone64kRequested>\n", 3285 indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_requested); 3286 sbuf_printf(sb, "%s<Zone64kFailed>%u</Zone64kFailed>\n", 3287 indent, sc->sc_zones[G_RAID3_ZONE_64K].sz_failed); 3288 sbuf_printf(sb, "%s<ID>%u</ID>\n", indent, (u_int)sc->sc_id); 3289 sbuf_printf(sb, "%s<SyncID>%u</SyncID>\n", indent, sc->sc_syncid); 3290 sbuf_printf(sb, "%s<GenID>%u</GenID>\n", indent, sc->sc_genid); 3291 sbuf_printf(sb, "%s<Flags>", indent); 3292 if (sc->sc_flags == 0) 3293 sbuf_printf(sb, "NONE"); 3294 else { 3295 int first = 1; 3296 3297 #define ADD_FLAG(flag, name) do { \ 3298 if ((sc->sc_flags & (flag)) != 0) { \ 3299 if (!first) \ 3300 sbuf_printf(sb, ", "); \ 3301 else \ 3302 first = 0; \ 3303 sbuf_printf(sb, name); \ 3304 } \ 3305 } while (0) 3306 ADD_FLAG(G_RAID3_DEVICE_FLAG_NOAUTOSYNC, "NOAUTOSYNC"); 3307 ADD_FLAG(G_RAID3_DEVICE_FLAG_ROUND_ROBIN, 3308 "ROUND-ROBIN"); 3309 ADD_FLAG(G_RAID3_DEVICE_FLAG_VERIFY, "VERIFY"); 3310 #undef ADD_FLAG 3311 } 3312 sbuf_printf(sb, "</Flags>\n"); 3313 sbuf_printf(sb, "%s<Components>%u</Components>\n", indent, 3314 sc->sc_ndisks); 3315 sbuf_printf(sb, "%s<State>%s</State>\n", indent, 3316 g_raid3_device_state2str(sc->sc_state)); 3317 sx_xunlock(&sc->sc_lock); 3318 g_topology_lock(); 3319 } 3320 } 3321 3322 static void 3323 g_raid3_shutdown_pre_sync(void *arg, int howto) 3324 { 3325 struct g_class *mp; 3326 struct g_geom *gp, *gp2; 3327 struct g_raid3_softc *sc; 3328 3329 mp = arg; 3330 DROP_GIANT(); 3331 g_topology_lock(); 3332 LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { 3333 if ((sc = gp->softc) == NULL) 3334 continue; 3335 g_topology_unlock(); 3336 sx_xlock(&sc->sc_lock); 3337 if (sc->sc_syncdisk != NULL) 3338 g_raid3_sync_stop(sc, 1); 3339 sx_xunlock(&sc->sc_lock); 3340 g_topology_lock(); 3341 } 3342 g_topology_unlock(); 3343 PICKUP_GIANT(); 3344 } 3345 3346 static void 3347 g_raid3_shutdown_post_sync(void *arg, int howto) 3348 { 3349 struct g_class *mp; 3350 struct g_geom *gp, *gp2; 3351 struct g_raid3_softc *sc; 3352 3353 mp = arg; 3354 DROP_GIANT(); 3355 g_topology_lock(); 3356 LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { 3357 if ((sc = gp->softc) == NULL) 3358 continue; 3359 g_topology_unlock(); 3360 sx_xlock(&sc->sc_lock); 3361 g_raid3_destroy(sc, 1); 3362 g_topology_lock(); 3363 } 3364 g_topology_unlock(); 3365 PICKUP_GIANT(); 3366 #if 0 3367 tsleep(&gp, PRIBIO, "r3:shutdown", hz * 20); 3368 #endif 3369 } 3370 3371 static void 3372 g_raid3_init(struct g_class *mp) 3373 { 3374 3375 g_raid3_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, 3376 g_raid3_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); 3377 g_raid3_post_sync = EVENTHANDLER_REGISTER(shutdown_post_sync, 3378 g_raid3_shutdown_post_sync, mp, SHUTDOWN_PRI_FIRST); 3379 if (g_raid3_pre_sync == NULL || g_raid3_post_sync == NULL) 3380 G_RAID3_DEBUG(0, "Warning! Cannot register shutdown event."); 3381 } 3382 3383 static void 3384 g_raid3_fini(struct g_class *mp) 3385 { 3386 3387 if (g_raid3_pre_sync != NULL) 3388 EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid3_pre_sync); 3389 if (g_raid3_post_sync != NULL) 3390 EVENTHANDLER_DEREGISTER(shutdown_post_sync, g_raid3_post_sync); 3391 } 3392 3393 DECLARE_GEOM_CLASS(g_raid3_class, g_raid3); 3394