1 /*- 2 * Copyright (c) 2010 Alexander Motin <mav@FreeBSD.org> 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 #include <sys/param.h> 31 #include <sys/systm.h> 32 #include <sys/kernel.h> 33 #include <sys/module.h> 34 #include <sys/limits.h> 35 #include <sys/lock.h> 36 #include <sys/mutex.h> 37 #include <sys/bio.h> 38 #include <sys/sbuf.h> 39 #include <sys/sysctl.h> 40 #include <sys/malloc.h> 41 #include <sys/eventhandler.h> 42 #include <vm/uma.h> 43 #include <geom/geom.h> 44 #include <sys/proc.h> 45 #include <sys/kthread.h> 46 #include <sys/sched.h> 47 #include <geom/raid/g_raid.h> 48 #include "g_raid_md_if.h" 49 #include "g_raid_tr_if.h" 50 51 static MALLOC_DEFINE(M_RAID, "raid_data", "GEOM_RAID Data"); 52 53 SYSCTL_DECL(_kern_geom); 54 SYSCTL_NODE(_kern_geom, OID_AUTO, raid, CTLFLAG_RW, 0, "GEOM_RAID stuff"); 55 u_int g_raid_aggressive_spare = 0; 56 TUNABLE_INT("kern.geom.raid.aggressive_spare", &g_raid_aggressive_spare); 57 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, aggressive_spare, CTLFLAG_RW, 58 &g_raid_aggressive_spare, 0, "Use disks without metadata as spare"); 59 u_int g_raid_debug = 0; 60 TUNABLE_INT("kern.geom.raid.debug", &g_raid_debug); 61 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, debug, CTLFLAG_RW, &g_raid_debug, 0, 62 "Debug level"); 63 int g_raid_read_err_thresh = 10; 64 TUNABLE_INT("kern.geom.raid.read_err_thresh", &g_raid_read_err_thresh); 65 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, read_err_thresh, CTLFLAG_RW, 66 &g_raid_read_err_thresh, 0, 67 "Number of read errors equated to disk failure"); 68 u_int g_raid_start_timeout = 30; 69 TUNABLE_INT("kern.geom.raid.start_timeout", &g_raid_start_timeout); 70 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, start_timeout, CTLFLAG_RW, 71 &g_raid_start_timeout, 0, 72 "Time to wait for all array components"); 73 static u_int g_raid_clean_time = 5; 74 TUNABLE_INT("kern.geom.raid.clean_time", &g_raid_clean_time); 75 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, clean_time, CTLFLAG_RW, 76 &g_raid_clean_time, 0, "Mark volume as clean when idling"); 77 static u_int g_raid_disconnect_on_failure = 1; 78 TUNABLE_INT("kern.geom.raid.disconnect_on_failure", 79 &g_raid_disconnect_on_failure); 80 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, 
disconnect_on_failure, CTLFLAG_RW, 81 &g_raid_disconnect_on_failure, 0, "Disconnect component on I/O failure."); 82 static u_int g_raid_name_format = 0; 83 TUNABLE_INT("kern.geom.raid.name_format", &g_raid_name_format); 84 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, name_format, CTLFLAG_RW, 85 &g_raid_name_format, 0, "Providers name format."); 86 static u_int g_raid_idle_threshold = 1000000; 87 TUNABLE_INT("kern.geom.raid.idle_threshold", &g_raid_idle_threshold); 88 SYSCTL_UINT(_kern_geom_raid, OID_AUTO, idle_threshold, CTLFLAG_RW, 89 &g_raid_idle_threshold, 1000000, 90 "Time in microseconds to consider a volume idle."); 91 92 #define MSLEEP(rv, ident, mtx, priority, wmesg, timeout) do { \ 93 G_RAID_DEBUG(4, "%s: Sleeping %p.", __func__, (ident)); \ 94 rv = msleep((ident), (mtx), (priority), (wmesg), (timeout)); \ 95 G_RAID_DEBUG(4, "%s: Woken up %p.", __func__, (ident)); \ 96 } while (0) 97 98 LIST_HEAD(, g_raid_md_class) g_raid_md_classes = 99 LIST_HEAD_INITIALIZER(g_raid_md_classes); 100 101 LIST_HEAD(, g_raid_tr_class) g_raid_tr_classes = 102 LIST_HEAD_INITIALIZER(g_raid_tr_classes); 103 104 LIST_HEAD(, g_raid_volume) g_raid_volumes = 105 LIST_HEAD_INITIALIZER(g_raid_volumes); 106 107 static eventhandler_tag g_raid_pre_sync = NULL; 108 static int g_raid_started = 0; 109 110 static int g_raid_destroy_geom(struct gctl_req *req, struct g_class *mp, 111 struct g_geom *gp); 112 static g_taste_t g_raid_taste; 113 static void g_raid_init(struct g_class *mp); 114 static void g_raid_fini(struct g_class *mp); 115 116 struct g_class g_raid_class = { 117 .name = G_RAID_CLASS_NAME, 118 .version = G_VERSION, 119 .ctlreq = g_raid_ctl, 120 .taste = g_raid_taste, 121 .destroy_geom = g_raid_destroy_geom, 122 .init = g_raid_init, 123 .fini = g_raid_fini 124 }; 125 126 static void g_raid_destroy_provider(struct g_raid_volume *vol); 127 static int g_raid_update_disk(struct g_raid_disk *disk, u_int event); 128 static int g_raid_update_subdisk(struct g_raid_subdisk *subdisk, u_int event); 129 static int g_raid_update_volume(struct g_raid_volume *vol, u_int event); 130 static int g_raid_update_node(struct g_raid_softc *sc, u_int event); 131 static void g_raid_dumpconf(struct sbuf *sb, const char *indent, 132 struct g_geom *gp, struct g_consumer *cp, struct g_provider *pp); 133 static void g_raid_start(struct bio *bp); 134 static void g_raid_start_request(struct bio *bp); 135 static void g_raid_disk_done(struct bio *bp); 136 static void g_raid_poll(struct g_raid_softc *sc); 137 138 static const char * 139 g_raid_node_event2str(int event) 140 { 141 142 switch (event) { 143 case G_RAID_NODE_E_WAKE: 144 return ("WAKE"); 145 case G_RAID_NODE_E_START: 146 return ("START"); 147 default: 148 return ("INVALID"); 149 } 150 } 151 152 const char * 153 g_raid_disk_state2str(int state) 154 { 155 156 switch (state) { 157 case G_RAID_DISK_S_NONE: 158 return ("NONE"); 159 case G_RAID_DISK_S_OFFLINE: 160 return ("OFFLINE"); 161 case G_RAID_DISK_S_FAILED: 162 return ("FAILED"); 163 case G_RAID_DISK_S_STALE_FAILED: 164 return ("STALE_FAILED"); 165 case G_RAID_DISK_S_SPARE: 166 return ("SPARE"); 167 case G_RAID_DISK_S_STALE: 168 return ("STALE"); 169 case G_RAID_DISK_S_ACTIVE: 170 return ("ACTIVE"); 171 default: 172 return ("INVALID"); 173 } 174 } 175 176 static const char * 177 g_raid_disk_event2str(int event) 178 { 179 180 switch (event) { 181 case G_RAID_DISK_E_DISCONNECTED: 182 return ("DISCONNECTED"); 183 default: 184 return ("INVALID"); 185 } 186 } 187 188 const char * 189 g_raid_subdisk_state2str(int state) 190 { 191 192 switch 
(state) { 193 case G_RAID_SUBDISK_S_NONE: 194 return ("NONE"); 195 case G_RAID_SUBDISK_S_FAILED: 196 return ("FAILED"); 197 case G_RAID_SUBDISK_S_NEW: 198 return ("NEW"); 199 case G_RAID_SUBDISK_S_REBUILD: 200 return ("REBUILD"); 201 case G_RAID_SUBDISK_S_UNINITIALIZED: 202 return ("UNINITIALIZED"); 203 case G_RAID_SUBDISK_S_STALE: 204 return ("STALE"); 205 case G_RAID_SUBDISK_S_RESYNC: 206 return ("RESYNC"); 207 case G_RAID_SUBDISK_S_ACTIVE: 208 return ("ACTIVE"); 209 default: 210 return ("INVALID"); 211 } 212 } 213 214 static const char * 215 g_raid_subdisk_event2str(int event) 216 { 217 218 switch (event) { 219 case G_RAID_SUBDISK_E_NEW: 220 return ("NEW"); 221 case G_RAID_SUBDISK_E_DISCONNECTED: 222 return ("DISCONNECTED"); 223 default: 224 return ("INVALID"); 225 } 226 } 227 228 const char * 229 g_raid_volume_state2str(int state) 230 { 231 232 switch (state) { 233 case G_RAID_VOLUME_S_STARTING: 234 return ("STARTING"); 235 case G_RAID_VOLUME_S_BROKEN: 236 return ("BROKEN"); 237 case G_RAID_VOLUME_S_DEGRADED: 238 return ("DEGRADED"); 239 case G_RAID_VOLUME_S_SUBOPTIMAL: 240 return ("SUBOPTIMAL"); 241 case G_RAID_VOLUME_S_OPTIMAL: 242 return ("OPTIMAL"); 243 case G_RAID_VOLUME_S_UNSUPPORTED: 244 return ("UNSUPPORTED"); 245 case G_RAID_VOLUME_S_STOPPED: 246 return ("STOPPED"); 247 default: 248 return ("INVALID"); 249 } 250 } 251 252 static const char * 253 g_raid_volume_event2str(int event) 254 { 255 256 switch (event) { 257 case G_RAID_VOLUME_E_UP: 258 return ("UP"); 259 case G_RAID_VOLUME_E_DOWN: 260 return ("DOWN"); 261 case G_RAID_VOLUME_E_START: 262 return ("START"); 263 case G_RAID_VOLUME_E_STARTMD: 264 return ("STARTMD"); 265 default: 266 return ("INVALID"); 267 } 268 } 269 270 const char * 271 g_raid_volume_level2str(int level, int qual) 272 { 273 274 switch (level) { 275 case G_RAID_VOLUME_RL_RAID0: 276 return ("RAID0"); 277 case G_RAID_VOLUME_RL_RAID1: 278 return ("RAID1"); 279 case G_RAID_VOLUME_RL_RAID3: 280 return ("RAID3"); 281 case G_RAID_VOLUME_RL_RAID4: 282 return ("RAID4"); 283 case G_RAID_VOLUME_RL_RAID5: 284 return ("RAID5"); 285 case G_RAID_VOLUME_RL_RAID6: 286 return ("RAID6"); 287 case G_RAID_VOLUME_RL_RAID1E: 288 return ("RAID1E"); 289 case G_RAID_VOLUME_RL_SINGLE: 290 return ("SINGLE"); 291 case G_RAID_VOLUME_RL_CONCAT: 292 return ("CONCAT"); 293 case G_RAID_VOLUME_RL_RAID5E: 294 return ("RAID5E"); 295 case G_RAID_VOLUME_RL_RAID5EE: 296 return ("RAID5EE"); 297 default: 298 return ("UNKNOWN"); 299 } 300 } 301 302 int 303 g_raid_volume_str2level(const char *str, int *level, int *qual) 304 { 305 306 *level = G_RAID_VOLUME_RL_UNKNOWN; 307 *qual = G_RAID_VOLUME_RLQ_NONE; 308 if (strcasecmp(str, "RAID0") == 0) 309 *level = G_RAID_VOLUME_RL_RAID0; 310 else if (strcasecmp(str, "RAID1") == 0) 311 *level = G_RAID_VOLUME_RL_RAID1; 312 else if (strcasecmp(str, "RAID3") == 0) 313 *level = G_RAID_VOLUME_RL_RAID3; 314 else if (strcasecmp(str, "RAID4") == 0) 315 *level = G_RAID_VOLUME_RL_RAID4; 316 else if (strcasecmp(str, "RAID5") == 0) 317 *level = G_RAID_VOLUME_RL_RAID5; 318 else if (strcasecmp(str, "RAID6") == 0) 319 *level = G_RAID_VOLUME_RL_RAID6; 320 else if (strcasecmp(str, "RAID10") == 0 || 321 strcasecmp(str, "RAID1E") == 0) 322 *level = G_RAID_VOLUME_RL_RAID1E; 323 else if (strcasecmp(str, "SINGLE") == 0) 324 *level = G_RAID_VOLUME_RL_SINGLE; 325 else if (strcasecmp(str, "CONCAT") == 0) 326 *level = G_RAID_VOLUME_RL_CONCAT; 327 else if (strcasecmp(str, "RAID5E") == 0) 328 *level = G_RAID_VOLUME_RL_RAID5E; 329 else if (strcasecmp(str, "RAID5EE") == 0) 330 *level = 
G_RAID_VOLUME_RL_RAID5EE; 331 else 332 return (-1); 333 return (0); 334 } 335 336 const char * 337 g_raid_get_diskname(struct g_raid_disk *disk) 338 { 339 340 if (disk->d_consumer == NULL || disk->d_consumer->provider == NULL) 341 return ("[unknown]"); 342 return (disk->d_consumer->provider->name); 343 } 344 345 void 346 g_raid_report_disk_state(struct g_raid_disk *disk) 347 { 348 struct g_raid_subdisk *sd; 349 int len, state; 350 uint32_t s; 351 352 if (disk->d_consumer == NULL) 353 return; 354 if (disk->d_state == G_RAID_DISK_S_FAILED || 355 disk->d_state == G_RAID_DISK_S_STALE_FAILED) { 356 s = G_STATE_FAILED; 357 } else { 358 state = G_RAID_SUBDISK_S_ACTIVE; 359 TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) { 360 if (sd->sd_state < state) 361 state = sd->sd_state; 362 } 363 if (state == G_RAID_SUBDISK_S_FAILED) 364 s = G_STATE_FAILED; 365 else if (state == G_RAID_SUBDISK_S_NEW || 366 state == G_RAID_SUBDISK_S_REBUILD) 367 s = G_STATE_REBUILD; 368 else if (state == G_RAID_SUBDISK_S_STALE || 369 state == G_RAID_SUBDISK_S_RESYNC) 370 s = G_STATE_RESYNC; 371 else 372 s = G_STATE_ACTIVE; 373 } 374 len = sizeof(s); 375 g_io_getattr("GEOM::setstate", disk->d_consumer, &len, &s); 376 G_RAID_DEBUG1(2, disk->d_softc, "Disk %s state reported as %d.", 377 g_raid_get_diskname(disk), s); 378 } 379 380 void 381 g_raid_change_disk_state(struct g_raid_disk *disk, int state) 382 { 383 384 G_RAID_DEBUG1(0, disk->d_softc, "Disk %s state changed from %s to %s.", 385 g_raid_get_diskname(disk), 386 g_raid_disk_state2str(disk->d_state), 387 g_raid_disk_state2str(state)); 388 disk->d_state = state; 389 g_raid_report_disk_state(disk); 390 } 391 392 void 393 g_raid_change_subdisk_state(struct g_raid_subdisk *sd, int state) 394 { 395 396 G_RAID_DEBUG1(0, sd->sd_softc, 397 "Subdisk %s:%d-%s state changed from %s to %s.", 398 sd->sd_volume->v_name, sd->sd_pos, 399 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]", 400 g_raid_subdisk_state2str(sd->sd_state), 401 g_raid_subdisk_state2str(state)); 402 sd->sd_state = state; 403 if (sd->sd_disk) 404 g_raid_report_disk_state(sd->sd_disk); 405 } 406 407 void 408 g_raid_change_volume_state(struct g_raid_volume *vol, int state) 409 { 410 411 G_RAID_DEBUG1(0, vol->v_softc, 412 "Volume %s state changed from %s to %s.", 413 vol->v_name, 414 g_raid_volume_state2str(vol->v_state), 415 g_raid_volume_state2str(state)); 416 vol->v_state = state; 417 } 418 419 /* 420 * --- Events handling functions --- 421 * Events in geom_raid are used to maintain subdisks and volumes status 422 * from one thread to simplify locking. 423 */ 424 static void 425 g_raid_event_free(struct g_raid_event *ep) 426 { 427 428 free(ep, M_RAID); 429 } 430 431 int 432 g_raid_event_send(void *arg, int event, int flags) 433 { 434 struct g_raid_softc *sc; 435 struct g_raid_event *ep; 436 int error; 437 438 if ((flags & G_RAID_EVENT_VOLUME) != 0) { 439 sc = ((struct g_raid_volume *)arg)->v_softc; 440 } else if ((flags & G_RAID_EVENT_DISK) != 0) { 441 sc = ((struct g_raid_disk *)arg)->d_softc; 442 } else if ((flags & G_RAID_EVENT_SUBDISK) != 0) { 443 sc = ((struct g_raid_subdisk *)arg)->sd_softc; 444 } else { 445 sc = arg; 446 } 447 ep = malloc(sizeof(*ep), M_RAID, 448 sx_xlocked(&sc->sc_lock) ? M_WAITOK : M_NOWAIT); 449 if (ep == NULL) 450 return (ENOMEM); 451 ep->e_tgt = arg; 452 ep->e_event = event; 453 ep->e_flags = flags; 454 ep->e_error = 0; 455 G_RAID_DEBUG1(4, sc, "Sending event %p. 
Waking up %p.", ep, sc); 456 mtx_lock(&sc->sc_queue_mtx); 457 TAILQ_INSERT_TAIL(&sc->sc_events, ep, e_next); 458 mtx_unlock(&sc->sc_queue_mtx); 459 wakeup(sc); 460 461 if ((flags & G_RAID_EVENT_WAIT) == 0) 462 return (0); 463 464 sx_assert(&sc->sc_lock, SX_XLOCKED); 465 G_RAID_DEBUG1(4, sc, "Sleeping on %p.", ep); 466 sx_xunlock(&sc->sc_lock); 467 while ((ep->e_flags & G_RAID_EVENT_DONE) == 0) { 468 mtx_lock(&sc->sc_queue_mtx); 469 MSLEEP(error, ep, &sc->sc_queue_mtx, PRIBIO | PDROP, "m:event", 470 hz * 5); 471 } 472 error = ep->e_error; 473 g_raid_event_free(ep); 474 sx_xlock(&sc->sc_lock); 475 return (error); 476 } 477 478 static void 479 g_raid_event_cancel(struct g_raid_softc *sc, void *tgt) 480 { 481 struct g_raid_event *ep, *tmpep; 482 483 sx_assert(&sc->sc_lock, SX_XLOCKED); 484 485 mtx_lock(&sc->sc_queue_mtx); 486 TAILQ_FOREACH_SAFE(ep, &sc->sc_events, e_next, tmpep) { 487 if (ep->e_tgt != tgt) 488 continue; 489 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 490 if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) 491 g_raid_event_free(ep); 492 else { 493 ep->e_error = ECANCELED; 494 wakeup(ep); 495 } 496 } 497 mtx_unlock(&sc->sc_queue_mtx); 498 } 499 500 static int 501 g_raid_event_check(struct g_raid_softc *sc, void *tgt) 502 { 503 struct g_raid_event *ep; 504 int res = 0; 505 506 sx_assert(&sc->sc_lock, SX_XLOCKED); 507 508 mtx_lock(&sc->sc_queue_mtx); 509 TAILQ_FOREACH(ep, &sc->sc_events, e_next) { 510 if (ep->e_tgt != tgt) 511 continue; 512 res = 1; 513 break; 514 } 515 mtx_unlock(&sc->sc_queue_mtx); 516 return (res); 517 } 518 519 /* 520 * Return the number of disks in given state. 521 * If state is equal to -1, count all connected disks. 522 */ 523 u_int 524 g_raid_ndisks(struct g_raid_softc *sc, int state) 525 { 526 struct g_raid_disk *disk; 527 u_int n; 528 529 sx_assert(&sc->sc_lock, SX_LOCKED); 530 531 n = 0; 532 TAILQ_FOREACH(disk, &sc->sc_disks, d_next) { 533 if (disk->d_state == state || state == -1) 534 n++; 535 } 536 return (n); 537 } 538 539 /* 540 * Return the number of subdisks in given state. 541 * If state is equal to -1, count all connected disks. 542 */ 543 u_int 544 g_raid_nsubdisks(struct g_raid_volume *vol, int state) 545 { 546 struct g_raid_subdisk *subdisk; 547 struct g_raid_softc *sc; 548 u_int i, n ; 549 550 sc = vol->v_softc; 551 sx_assert(&sc->sc_lock, SX_LOCKED); 552 553 n = 0; 554 for (i = 0; i < vol->v_disks_count; i++) { 555 subdisk = &vol->v_subdisks[i]; 556 if ((state == -1 && 557 subdisk->sd_state != G_RAID_SUBDISK_S_NONE) || 558 subdisk->sd_state == state) 559 n++; 560 } 561 return (n); 562 } 563 564 /* 565 * Return the first subdisk in given state. 566 * If state is equal to -1, then the first connected disks. 
567 */ 568 struct g_raid_subdisk * 569 g_raid_get_subdisk(struct g_raid_volume *vol, int state) 570 { 571 struct g_raid_subdisk *sd; 572 struct g_raid_softc *sc; 573 u_int i; 574 575 sc = vol->v_softc; 576 sx_assert(&sc->sc_lock, SX_LOCKED); 577 578 for (i = 0; i < vol->v_disks_count; i++) { 579 sd = &vol->v_subdisks[i]; 580 if ((state == -1 && 581 sd->sd_state != G_RAID_SUBDISK_S_NONE) || 582 sd->sd_state == state) 583 return (sd); 584 } 585 return (NULL); 586 } 587 588 struct g_consumer * 589 g_raid_open_consumer(struct g_raid_softc *sc, const char *name) 590 { 591 struct g_consumer *cp; 592 struct g_provider *pp; 593 594 g_topology_assert(); 595 596 if (strncmp(name, "/dev/", 5) == 0) 597 name += 5; 598 pp = g_provider_by_name(name); 599 if (pp == NULL) 600 return (NULL); 601 cp = g_new_consumer(sc->sc_geom); 602 if (g_attach(cp, pp) != 0) { 603 g_destroy_consumer(cp); 604 return (NULL); 605 } 606 if (g_access(cp, 1, 1, 1) != 0) { 607 g_detach(cp); 608 g_destroy_consumer(cp); 609 return (NULL); 610 } 611 return (cp); 612 } 613 614 static u_int 615 g_raid_nrequests(struct g_raid_softc *sc, struct g_consumer *cp) 616 { 617 struct bio *bp; 618 u_int nreqs = 0; 619 620 mtx_lock(&sc->sc_queue_mtx); 621 TAILQ_FOREACH(bp, &sc->sc_queue.queue, bio_queue) { 622 if (bp->bio_from == cp) 623 nreqs++; 624 } 625 mtx_unlock(&sc->sc_queue_mtx); 626 return (nreqs); 627 } 628 629 u_int 630 g_raid_nopens(struct g_raid_softc *sc) 631 { 632 struct g_raid_volume *vol; 633 u_int opens; 634 635 opens = 0; 636 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 637 if (vol->v_provider_open != 0) 638 opens++; 639 } 640 return (opens); 641 } 642 643 static int 644 g_raid_consumer_is_busy(struct g_raid_softc *sc, struct g_consumer *cp) 645 { 646 647 if (cp->index > 0) { 648 G_RAID_DEBUG1(2, sc, 649 "I/O requests for %s exist, can't destroy it now.", 650 cp->provider->name); 651 return (1); 652 } 653 if (g_raid_nrequests(sc, cp) > 0) { 654 G_RAID_DEBUG1(2, sc, 655 "I/O requests for %s in queue, can't destroy it now.", 656 cp->provider->name); 657 return (1); 658 } 659 return (0); 660 } 661 662 static void 663 g_raid_destroy_consumer(void *arg, int flags __unused) 664 { 665 struct g_consumer *cp; 666 667 g_topology_assert(); 668 669 cp = arg; 670 G_RAID_DEBUG(1, "Consumer %s destroyed.", cp->provider->name); 671 g_detach(cp); 672 g_destroy_consumer(cp); 673 } 674 675 void 676 g_raid_kill_consumer(struct g_raid_softc *sc, struct g_consumer *cp) 677 { 678 struct g_provider *pp; 679 int retaste_wait; 680 681 g_topology_assert_not(); 682 683 g_topology_lock(); 684 cp->private = NULL; 685 if (g_raid_consumer_is_busy(sc, cp)) 686 goto out; 687 pp = cp->provider; 688 retaste_wait = 0; 689 if (cp->acw == 1) { 690 if ((pp->geom->flags & G_GEOM_WITHER) == 0) 691 retaste_wait = 1; 692 } 693 if (cp->acr > 0 || cp->acw > 0 || cp->ace > 0) 694 g_access(cp, -cp->acr, -cp->acw, -cp->ace); 695 if (retaste_wait) { 696 /* 697 * After retaste event was send (inside g_access()), we can send 698 * event to detach and destroy consumer. 699 * A class, which has consumer to the given provider connected 700 * will not receive retaste event for the provider. 701 * This is the way how I ignore retaste events when I close 702 * consumers opened for write: I detach and destroy consumer 703 * after retaste event is sent. 
704 */ 705 g_post_event(g_raid_destroy_consumer, cp, M_WAITOK, NULL); 706 goto out; 707 } 708 G_RAID_DEBUG(1, "Consumer %s destroyed.", pp->name); 709 g_detach(cp); 710 g_destroy_consumer(cp); 711 out: 712 g_topology_unlock(); 713 } 714 715 static void 716 g_raid_orphan(struct g_consumer *cp) 717 { 718 struct g_raid_disk *disk; 719 720 g_topology_assert(); 721 722 disk = cp->private; 723 if (disk == NULL) 724 return; 725 g_raid_event_send(disk, G_RAID_DISK_E_DISCONNECTED, 726 G_RAID_EVENT_DISK); 727 } 728 729 static int 730 g_raid_clean(struct g_raid_volume *vol, int acw) 731 { 732 struct g_raid_softc *sc; 733 int timeout; 734 735 sc = vol->v_softc; 736 g_topology_assert_not(); 737 sx_assert(&sc->sc_lock, SX_XLOCKED); 738 739 // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) 740 // return (0); 741 if (!vol->v_dirty) 742 return (0); 743 if (vol->v_writes > 0) 744 return (0); 745 if (acw > 0 || (acw == -1 && 746 vol->v_provider != NULL && vol->v_provider->acw > 0)) { 747 timeout = g_raid_clean_time - (time_uptime - vol->v_last_write); 748 if (timeout > 0) 749 return (timeout); 750 } 751 vol->v_dirty = 0; 752 G_RAID_DEBUG1(1, sc, "Volume %s marked as clean.", 753 vol->v_name); 754 g_raid_write_metadata(sc, vol, NULL, NULL); 755 return (0); 756 } 757 758 static void 759 g_raid_dirty(struct g_raid_volume *vol) 760 { 761 struct g_raid_softc *sc; 762 763 sc = vol->v_softc; 764 g_topology_assert_not(); 765 sx_assert(&sc->sc_lock, SX_XLOCKED); 766 767 // if ((sc->sc_flags & G_RAID_DEVICE_FLAG_NOFAILSYNC) != 0) 768 // return; 769 vol->v_dirty = 1; 770 G_RAID_DEBUG1(1, sc, "Volume %s marked as dirty.", 771 vol->v_name); 772 g_raid_write_metadata(sc, vol, NULL, NULL); 773 } 774 775 void 776 g_raid_tr_flush_common(struct g_raid_tr_object *tr, struct bio *bp) 777 { 778 struct g_raid_softc *sc; 779 struct g_raid_volume *vol; 780 struct g_raid_subdisk *sd; 781 struct bio_queue_head queue; 782 struct bio *cbp; 783 int i; 784 785 vol = tr->tro_volume; 786 sc = vol->v_softc; 787 788 /* 789 * Allocate all bios before sending any request, so we can return 790 * ENOMEM in nice and clean way. 
791 */ 792 bioq_init(&queue); 793 for (i = 0; i < vol->v_disks_count; i++) { 794 sd = &vol->v_subdisks[i]; 795 if (sd->sd_state == G_RAID_SUBDISK_S_NONE || 796 sd->sd_state == G_RAID_SUBDISK_S_FAILED) 797 continue; 798 cbp = g_clone_bio(bp); 799 if (cbp == NULL) 800 goto failure; 801 cbp->bio_caller1 = sd; 802 bioq_insert_tail(&queue, cbp); 803 } 804 for (cbp = bioq_first(&queue); cbp != NULL; 805 cbp = bioq_first(&queue)) { 806 bioq_remove(&queue, cbp); 807 sd = cbp->bio_caller1; 808 cbp->bio_caller1 = NULL; 809 g_raid_subdisk_iostart(sd, cbp); 810 } 811 return; 812 failure: 813 for (cbp = bioq_first(&queue); cbp != NULL; 814 cbp = bioq_first(&queue)) { 815 bioq_remove(&queue, cbp); 816 g_destroy_bio(cbp); 817 } 818 if (bp->bio_error == 0) 819 bp->bio_error = ENOMEM; 820 g_raid_iodone(bp, bp->bio_error); 821 } 822 823 static void 824 g_raid_tr_kerneldump_common_done(struct bio *bp) 825 { 826 827 bp->bio_flags |= BIO_DONE; 828 } 829 830 int 831 g_raid_tr_kerneldump_common(struct g_raid_tr_object *tr, 832 void *virtual, vm_offset_t physical, off_t offset, size_t length) 833 { 834 struct g_raid_softc *sc; 835 struct g_raid_volume *vol; 836 struct bio bp; 837 838 vol = tr->tro_volume; 839 sc = vol->v_softc; 840 841 bzero(&bp, sizeof(bp)); 842 bp.bio_cmd = BIO_WRITE; 843 bp.bio_done = g_raid_tr_kerneldump_common_done; 844 bp.bio_attribute = NULL; 845 bp.bio_offset = offset; 846 bp.bio_length = length; 847 bp.bio_data = virtual; 848 bp.bio_to = vol->v_provider; 849 850 g_raid_start(&bp); 851 while (!(bp.bio_flags & BIO_DONE)) { 852 G_RAID_DEBUG1(4, sc, "Poll..."); 853 g_raid_poll(sc); 854 DELAY(10); 855 } 856 857 return (bp.bio_error != 0 ? EIO : 0); 858 } 859 860 static int 861 g_raid_dump(void *arg, 862 void *virtual, vm_offset_t physical, off_t offset, size_t length) 863 { 864 struct g_raid_volume *vol; 865 int error; 866 867 vol = (struct g_raid_volume *)arg; 868 G_RAID_DEBUG1(3, vol->v_softc, "Dumping at off %llu len %llu.", 869 (long long unsigned)offset, (long long unsigned)length); 870 871 error = G_RAID_TR_KERNELDUMP(vol->v_tr, 872 virtual, physical, offset, length); 873 return (error); 874 } 875 876 static void 877 g_raid_kerneldump(struct g_raid_softc *sc, struct bio *bp) 878 { 879 struct g_kerneldump *gkd; 880 struct g_provider *pp; 881 struct g_raid_volume *vol; 882 883 gkd = (struct g_kerneldump*)bp->bio_data; 884 pp = bp->bio_to; 885 vol = pp->private; 886 g_trace(G_T_TOPOLOGY, "g_raid_kerneldump(%s, %jd, %jd)", 887 pp->name, (intmax_t)gkd->offset, (intmax_t)gkd->length); 888 gkd->di.dumper = g_raid_dump; 889 gkd->di.priv = vol; 890 gkd->di.blocksize = vol->v_sectorsize; 891 gkd->di.maxiosize = DFLTPHYS; 892 gkd->di.mediaoffset = gkd->offset; 893 if ((gkd->offset + gkd->length) > vol->v_mediasize) 894 gkd->length = vol->v_mediasize - gkd->offset; 895 gkd->di.mediasize = gkd->length; 896 g_io_deliver(bp, 0); 897 } 898 899 static void 900 g_raid_start(struct bio *bp) 901 { 902 struct g_raid_softc *sc; 903 904 sc = bp->bio_to->geom->softc; 905 /* 906 * If sc == NULL or there are no valid disks, provider's error 907 * should be set and g_raid_start() should not be called at all. 
908 */ 909 // KASSERT(sc != NULL && sc->sc_state == G_RAID_VOLUME_S_RUNNING, 910 // ("Provider's error should be set (error=%d)(mirror=%s).", 911 // bp->bio_to->error, bp->bio_to->name)); 912 G_RAID_LOGREQ(3, bp, "Request received."); 913 914 switch (bp->bio_cmd) { 915 case BIO_READ: 916 case BIO_WRITE: 917 case BIO_DELETE: 918 case BIO_FLUSH: 919 break; 920 case BIO_GETATTR: 921 if (!strcmp(bp->bio_attribute, "GEOM::kerneldump")) 922 g_raid_kerneldump(sc, bp); 923 else 924 g_io_deliver(bp, EOPNOTSUPP); 925 return; 926 default: 927 g_io_deliver(bp, EOPNOTSUPP); 928 return; 929 } 930 mtx_lock(&sc->sc_queue_mtx); 931 bioq_disksort(&sc->sc_queue, bp); 932 mtx_unlock(&sc->sc_queue_mtx); 933 if (!dumping) { 934 G_RAID_DEBUG1(4, sc, "Waking up %p.", sc); 935 wakeup(sc); 936 } 937 } 938 939 static int 940 g_raid_bio_overlaps(const struct bio *bp, off_t lstart, off_t len) 941 { 942 /* 943 * 5 cases: 944 * (1) bp entirely below NO 945 * (2) bp entirely above NO 946 * (3) bp start below, but end in range YES 947 * (4) bp entirely within YES 948 * (5) bp starts within, ends above YES 949 * 950 * lock range 10-19 (offset 10 length 10) 951 * (1) 1-5: first if kicks it out 952 * (2) 30-35: second if kicks it out 953 * (3) 5-15: passes both ifs 954 * (4) 12-14: passes both ifs 955 * (5) 19-20: passes both 956 */ 957 off_t lend = lstart + len - 1; 958 off_t bstart = bp->bio_offset; 959 off_t bend = bp->bio_offset + bp->bio_length - 1; 960 961 if (bend < lstart) 962 return (0); 963 if (lend < bstart) 964 return (0); 965 return (1); 966 } 967 968 static int 969 g_raid_is_in_locked_range(struct g_raid_volume *vol, const struct bio *bp) 970 { 971 struct g_raid_lock *lp; 972 973 sx_assert(&vol->v_softc->sc_lock, SX_LOCKED); 974 975 LIST_FOREACH(lp, &vol->v_locks, l_next) { 976 if (g_raid_bio_overlaps(bp, lp->l_offset, lp->l_length)) 977 return (1); 978 } 979 return (0); 980 } 981 982 static void 983 g_raid_start_request(struct bio *bp) 984 { 985 struct g_raid_softc *sc; 986 struct g_raid_volume *vol; 987 988 sc = bp->bio_to->geom->softc; 989 sx_assert(&sc->sc_lock, SX_LOCKED); 990 vol = bp->bio_to->private; 991 992 /* 993 * Check to see if this item is in a locked range. If so, 994 * queue it to our locked queue and return. We'll requeue 995 * it when the range is unlocked. Internal I/O for the 996 * rebuild/rescan/recovery process is excluded from this 997 * check so we can actually do the recovery. 998 */ 999 if (!(bp->bio_cflags & G_RAID_BIO_FLAG_SPECIAL) && 1000 g_raid_is_in_locked_range(vol, bp)) { 1001 G_RAID_LOGREQ(3, bp, "Defer request."); 1002 bioq_insert_tail(&vol->v_locked, bp); 1003 return; 1004 } 1005 1006 /* 1007 * If we're actually going to do the write/delete, then 1008 * update the idle stats for the volume. 1009 */ 1010 if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { 1011 if (!vol->v_dirty) 1012 g_raid_dirty(vol); 1013 vol->v_writes++; 1014 } 1015 1016 /* 1017 * Put request onto inflight queue, so we can check if new 1018 * synchronization requests don't collide with it. Then tell 1019 * the transformation layer to start the I/O. 
1020 */ 1021 bioq_insert_tail(&vol->v_inflight, bp); 1022 G_RAID_LOGREQ(4, bp, "Request started"); 1023 G_RAID_TR_IOSTART(vol->v_tr, bp); 1024 } 1025 1026 static void 1027 g_raid_finish_with_locked_ranges(struct g_raid_volume *vol, struct bio *bp) 1028 { 1029 off_t off, len; 1030 struct bio *nbp; 1031 struct g_raid_lock *lp; 1032 1033 vol->v_pending_lock = 0; 1034 LIST_FOREACH(lp, &vol->v_locks, l_next) { 1035 if (lp->l_pending) { 1036 off = lp->l_offset; 1037 len = lp->l_length; 1038 lp->l_pending = 0; 1039 TAILQ_FOREACH(nbp, &vol->v_inflight.queue, bio_queue) { 1040 if (g_raid_bio_overlaps(nbp, off, len)) 1041 lp->l_pending++; 1042 } 1043 if (lp->l_pending) { 1044 vol->v_pending_lock = 1; 1045 G_RAID_DEBUG1(4, vol->v_softc, 1046 "Deferred lock(%jd, %jd) has %d pending", 1047 (intmax_t)off, (intmax_t)(off + len), 1048 lp->l_pending); 1049 continue; 1050 } 1051 G_RAID_DEBUG1(4, vol->v_softc, 1052 "Deferred lock of %jd to %jd completed", 1053 (intmax_t)off, (intmax_t)(off + len)); 1054 G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); 1055 } 1056 } 1057 } 1058 1059 void 1060 g_raid_iodone(struct bio *bp, int error) 1061 { 1062 struct g_raid_softc *sc; 1063 struct g_raid_volume *vol; 1064 1065 sc = bp->bio_to->geom->softc; 1066 sx_assert(&sc->sc_lock, SX_LOCKED); 1067 vol = bp->bio_to->private; 1068 G_RAID_LOGREQ(3, bp, "Request done: %d.", error); 1069 1070 /* Update stats if we done write/delete. */ 1071 if (bp->bio_cmd == BIO_WRITE || bp->bio_cmd == BIO_DELETE) { 1072 vol->v_writes--; 1073 vol->v_last_write = time_uptime; 1074 } 1075 1076 bioq_remove(&vol->v_inflight, bp); 1077 if (vol->v_pending_lock && g_raid_is_in_locked_range(vol, bp)) 1078 g_raid_finish_with_locked_ranges(vol, bp); 1079 getmicrouptime(&vol->v_last_done); 1080 g_io_deliver(bp, error); 1081 } 1082 1083 int 1084 g_raid_lock_range(struct g_raid_volume *vol, off_t off, off_t len, 1085 struct bio *ignore, void *argp) 1086 { 1087 struct g_raid_softc *sc; 1088 struct g_raid_lock *lp; 1089 struct bio *bp; 1090 1091 sc = vol->v_softc; 1092 lp = malloc(sizeof(*lp), M_RAID, M_WAITOK | M_ZERO); 1093 LIST_INSERT_HEAD(&vol->v_locks, lp, l_next); 1094 lp->l_offset = off; 1095 lp->l_length = len; 1096 lp->l_callback_arg = argp; 1097 1098 lp->l_pending = 0; 1099 TAILQ_FOREACH(bp, &vol->v_inflight.queue, bio_queue) { 1100 if (bp != ignore && g_raid_bio_overlaps(bp, off, len)) 1101 lp->l_pending++; 1102 } 1103 1104 /* 1105 * If there are any writes that are pending, we return EBUSY. All 1106 * callers will have to wait until all pending writes clear. 1107 */ 1108 if (lp->l_pending > 0) { 1109 vol->v_pending_lock = 1; 1110 G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd deferred %d pend", 1111 (intmax_t)off, (intmax_t)(off+len), lp->l_pending); 1112 return (EBUSY); 1113 } 1114 G_RAID_DEBUG1(4, sc, "Locking range %jd to %jd", 1115 (intmax_t)off, (intmax_t)(off+len)); 1116 G_RAID_TR_LOCKED(vol->v_tr, lp->l_callback_arg); 1117 return (0); 1118 } 1119 1120 int 1121 g_raid_unlock_range(struct g_raid_volume *vol, off_t off, off_t len) 1122 { 1123 struct g_raid_lock *lp; 1124 struct g_raid_softc *sc; 1125 struct bio *bp; 1126 1127 sc = vol->v_softc; 1128 LIST_FOREACH(lp, &vol->v_locks, l_next) { 1129 if (lp->l_offset == off && lp->l_length == len) { 1130 LIST_REMOVE(lp, l_next); 1131 /* XXX 1132 * Right now we just put them all back on the queue 1133 * and hope for the best. We hope this because any 1134 * locked ranges will go right back on this list 1135 * when the worker thread runs. 
1136 * XXX 1137 */ 1138 G_RAID_DEBUG1(4, sc, "Unlocked %jd to %jd", 1139 (intmax_t)lp->l_offset, 1140 (intmax_t)(lp->l_offset+lp->l_length)); 1141 mtx_lock(&sc->sc_queue_mtx); 1142 while ((bp = bioq_takefirst(&vol->v_locked)) != NULL) 1143 bioq_disksort(&sc->sc_queue, bp); 1144 mtx_unlock(&sc->sc_queue_mtx); 1145 free(lp, M_RAID); 1146 return (0); 1147 } 1148 } 1149 return (EINVAL); 1150 } 1151 1152 void 1153 g_raid_subdisk_iostart(struct g_raid_subdisk *sd, struct bio *bp) 1154 { 1155 struct g_consumer *cp; 1156 struct g_raid_disk *disk, *tdisk; 1157 1158 bp->bio_caller1 = sd; 1159 1160 /* 1161 * Make sure that the disk is present. Generally it is a task of 1162 * transformation layers to not send requests to absent disks, but 1163 * it is better to be safe and report situation then sorry. 1164 */ 1165 if (sd->sd_disk == NULL) { 1166 G_RAID_LOGREQ(0, bp, "Warning! I/O request to an absent disk!"); 1167 nodisk: 1168 bp->bio_from = NULL; 1169 bp->bio_to = NULL; 1170 bp->bio_error = ENXIO; 1171 g_raid_disk_done(bp); 1172 return; 1173 } 1174 disk = sd->sd_disk; 1175 if (disk->d_state != G_RAID_DISK_S_ACTIVE && 1176 disk->d_state != G_RAID_DISK_S_FAILED) { 1177 G_RAID_LOGREQ(0, bp, "Warning! I/O request to a disk in a " 1178 "wrong state (%s)!", g_raid_disk_state2str(disk->d_state)); 1179 goto nodisk; 1180 } 1181 1182 cp = disk->d_consumer; 1183 bp->bio_from = cp; 1184 bp->bio_to = cp->provider; 1185 cp->index++; 1186 1187 /* Update average disks load. */ 1188 TAILQ_FOREACH(tdisk, &sd->sd_softc->sc_disks, d_next) { 1189 if (tdisk->d_consumer == NULL) 1190 tdisk->d_load = 0; 1191 else 1192 tdisk->d_load = (tdisk->d_consumer->index * 1193 G_RAID_SUBDISK_LOAD_SCALE + tdisk->d_load * 7) / 8; 1194 } 1195 1196 disk->d_last_offset = bp->bio_offset + bp->bio_length; 1197 if (dumping) { 1198 G_RAID_LOGREQ(3, bp, "Sending dumping request."); 1199 if (bp->bio_cmd == BIO_WRITE) { 1200 bp->bio_error = g_raid_subdisk_kerneldump(sd, 1201 bp->bio_data, 0, bp->bio_offset, bp->bio_length); 1202 } else 1203 bp->bio_error = EOPNOTSUPP; 1204 g_raid_disk_done(bp); 1205 } else { 1206 bp->bio_done = g_raid_disk_done; 1207 bp->bio_offset += sd->sd_offset; 1208 G_RAID_LOGREQ(3, bp, "Sending request."); 1209 g_io_request(bp, cp); 1210 } 1211 } 1212 1213 int 1214 g_raid_subdisk_kerneldump(struct g_raid_subdisk *sd, 1215 void *virtual, vm_offset_t physical, off_t offset, size_t length) 1216 { 1217 1218 if (sd->sd_disk == NULL) 1219 return (ENXIO); 1220 if (sd->sd_disk->d_kd.di.dumper == NULL) 1221 return (EOPNOTSUPP); 1222 return (dump_write(&sd->sd_disk->d_kd.di, 1223 virtual, physical, 1224 sd->sd_disk->d_kd.di.mediaoffset + sd->sd_offset + offset, 1225 length)); 1226 } 1227 1228 static void 1229 g_raid_disk_done(struct bio *bp) 1230 { 1231 struct g_raid_softc *sc; 1232 struct g_raid_subdisk *sd; 1233 1234 sd = bp->bio_caller1; 1235 sc = sd->sd_softc; 1236 mtx_lock(&sc->sc_queue_mtx); 1237 bioq_disksort(&sc->sc_queue, bp); 1238 mtx_unlock(&sc->sc_queue_mtx); 1239 if (!dumping) 1240 wakeup(sc); 1241 } 1242 1243 static void 1244 g_raid_disk_done_request(struct bio *bp) 1245 { 1246 struct g_raid_softc *sc; 1247 struct g_raid_disk *disk; 1248 struct g_raid_subdisk *sd; 1249 struct g_raid_volume *vol; 1250 1251 g_topology_assert_not(); 1252 1253 G_RAID_LOGREQ(3, bp, "Disk request done: %d.", bp->bio_error); 1254 sd = bp->bio_caller1; 1255 sc = sd->sd_softc; 1256 vol = sd->sd_volume; 1257 if (bp->bio_from != NULL) { 1258 bp->bio_from->index--; 1259 disk = bp->bio_from->private; 1260 if (disk == NULL) 1261 
g_raid_kill_consumer(sc, bp->bio_from); 1262 } 1263 bp->bio_offset -= sd->sd_offset; 1264 1265 G_RAID_TR_IODONE(vol->v_tr, sd, bp); 1266 } 1267 1268 static void 1269 g_raid_handle_event(struct g_raid_softc *sc, struct g_raid_event *ep) 1270 { 1271 1272 if ((ep->e_flags & G_RAID_EVENT_VOLUME) != 0) 1273 ep->e_error = g_raid_update_volume(ep->e_tgt, ep->e_event); 1274 else if ((ep->e_flags & G_RAID_EVENT_DISK) != 0) 1275 ep->e_error = g_raid_update_disk(ep->e_tgt, ep->e_event); 1276 else if ((ep->e_flags & G_RAID_EVENT_SUBDISK) != 0) 1277 ep->e_error = g_raid_update_subdisk(ep->e_tgt, ep->e_event); 1278 else 1279 ep->e_error = g_raid_update_node(ep->e_tgt, ep->e_event); 1280 if ((ep->e_flags & G_RAID_EVENT_WAIT) == 0) { 1281 KASSERT(ep->e_error == 0, 1282 ("Error cannot be handled.")); 1283 g_raid_event_free(ep); 1284 } else { 1285 ep->e_flags |= G_RAID_EVENT_DONE; 1286 G_RAID_DEBUG1(4, sc, "Waking up %p.", ep); 1287 mtx_lock(&sc->sc_queue_mtx); 1288 wakeup(ep); 1289 mtx_unlock(&sc->sc_queue_mtx); 1290 } 1291 } 1292 1293 /* 1294 * Worker thread. 1295 */ 1296 static void 1297 g_raid_worker(void *arg) 1298 { 1299 struct g_raid_softc *sc; 1300 struct g_raid_event *ep; 1301 struct g_raid_volume *vol; 1302 struct bio *bp; 1303 struct timeval now, t; 1304 int timeout, rv; 1305 1306 sc = arg; 1307 thread_lock(curthread); 1308 sched_prio(curthread, PRIBIO); 1309 thread_unlock(curthread); 1310 1311 sx_xlock(&sc->sc_lock); 1312 for (;;) { 1313 mtx_lock(&sc->sc_queue_mtx); 1314 /* 1315 * First take a look at events. 1316 * This is important to handle events before any I/O requests. 1317 */ 1318 bp = NULL; 1319 vol = NULL; 1320 rv = 0; 1321 ep = TAILQ_FIRST(&sc->sc_events); 1322 if (ep != NULL) 1323 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 1324 else if ((bp = bioq_takefirst(&sc->sc_queue)) != NULL) 1325 ; 1326 else { 1327 getmicrouptime(&now); 1328 t = now; 1329 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 1330 if (bioq_first(&vol->v_inflight) == NULL && 1331 vol->v_tr && 1332 timevalcmp(&vol->v_last_done, &t, < )) 1333 t = vol->v_last_done; 1334 } 1335 timevalsub(&t, &now); 1336 timeout = g_raid_idle_threshold + 1337 t.tv_sec * 1000000 + t.tv_usec; 1338 if (timeout > 0) { 1339 /* 1340 * Two steps to avoid overflows at HZ=1000 1341 * and idle timeouts > 2.1s. Some rounding 1342 * errors can occur, but they are < 1tick, 1343 * which is deemed to be close enough for 1344 * this purpose. 
1345 */ 1346 int micpertic = 1000000 / hz; 1347 timeout = (timeout + micpertic - 1) / micpertic; 1348 sx_xunlock(&sc->sc_lock); 1349 MSLEEP(rv, sc, &sc->sc_queue_mtx, 1350 PRIBIO | PDROP, "-", timeout); 1351 sx_xlock(&sc->sc_lock); 1352 goto process; 1353 } else 1354 rv = EWOULDBLOCK; 1355 } 1356 mtx_unlock(&sc->sc_queue_mtx); 1357 process: 1358 if (ep != NULL) { 1359 g_raid_handle_event(sc, ep); 1360 } else if (bp != NULL) { 1361 if (bp->bio_to != NULL && 1362 bp->bio_to->geom == sc->sc_geom) 1363 g_raid_start_request(bp); 1364 else 1365 g_raid_disk_done_request(bp); 1366 } else if (rv == EWOULDBLOCK) { 1367 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 1368 if (vol->v_writes == 0 && vol->v_dirty) 1369 g_raid_clean(vol, -1); 1370 if (bioq_first(&vol->v_inflight) == NULL && 1371 vol->v_tr) { 1372 t.tv_sec = g_raid_idle_threshold / 1000000; 1373 t.tv_usec = g_raid_idle_threshold % 1000000; 1374 timevaladd(&t, &vol->v_last_done); 1375 getmicrouptime(&now); 1376 if (timevalcmp(&t, &now, <= )) { 1377 G_RAID_TR_IDLE(vol->v_tr); 1378 vol->v_last_done = now; 1379 } 1380 } 1381 } 1382 } 1383 if (sc->sc_stopping == G_RAID_DESTROY_HARD) 1384 g_raid_destroy_node(sc, 1); /* May not return. */ 1385 } 1386 } 1387 1388 static void 1389 g_raid_poll(struct g_raid_softc *sc) 1390 { 1391 struct g_raid_event *ep; 1392 struct bio *bp; 1393 1394 sx_xlock(&sc->sc_lock); 1395 mtx_lock(&sc->sc_queue_mtx); 1396 /* 1397 * First take a look at events. 1398 * This is important to handle events before any I/O requests. 1399 */ 1400 ep = TAILQ_FIRST(&sc->sc_events); 1401 if (ep != NULL) { 1402 TAILQ_REMOVE(&sc->sc_events, ep, e_next); 1403 mtx_unlock(&sc->sc_queue_mtx); 1404 g_raid_handle_event(sc, ep); 1405 goto out; 1406 } 1407 bp = bioq_takefirst(&sc->sc_queue); 1408 if (bp != NULL) { 1409 mtx_unlock(&sc->sc_queue_mtx); 1410 if (bp->bio_from == NULL || 1411 bp->bio_from->geom != sc->sc_geom) 1412 g_raid_start_request(bp); 1413 else 1414 g_raid_disk_done_request(bp); 1415 } 1416 out: 1417 sx_xunlock(&sc->sc_lock); 1418 } 1419 1420 static void 1421 g_raid_launch_provider(struct g_raid_volume *vol) 1422 { 1423 struct g_raid_disk *disk; 1424 struct g_raid_softc *sc; 1425 struct g_provider *pp; 1426 char name[G_RAID_MAX_VOLUMENAME]; 1427 off_t off; 1428 1429 sc = vol->v_softc; 1430 sx_assert(&sc->sc_lock, SX_LOCKED); 1431 1432 g_topology_lock(); 1433 /* Try to name provider with volume name. */ 1434 snprintf(name, sizeof(name), "raid/%s", vol->v_name); 1435 if (g_raid_name_format == 0 || vol->v_name[0] == 0 || 1436 g_provider_by_name(name) != NULL) { 1437 /* Otherwise use sequential volume number. 
*/ 1438 snprintf(name, sizeof(name), "raid/r%d", vol->v_global_id); 1439 } 1440 pp = g_new_providerf(sc->sc_geom, "%s", name); 1441 pp->private = vol; 1442 pp->mediasize = vol->v_mediasize; 1443 pp->sectorsize = vol->v_sectorsize; 1444 pp->stripesize = 0; 1445 pp->stripeoffset = 0; 1446 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID1 || 1447 vol->v_raid_level == G_RAID_VOLUME_RL_RAID3 || 1448 vol->v_raid_level == G_RAID_VOLUME_RL_SINGLE || 1449 vol->v_raid_level == G_RAID_VOLUME_RL_CONCAT) { 1450 if ((disk = vol->v_subdisks[0].sd_disk) != NULL && 1451 disk->d_consumer != NULL && 1452 disk->d_consumer->provider != NULL) { 1453 pp->stripesize = disk->d_consumer->provider->stripesize; 1454 off = disk->d_consumer->provider->stripeoffset; 1455 pp->stripeoffset = off + vol->v_subdisks[0].sd_offset; 1456 if (off > 0) 1457 pp->stripeoffset %= off; 1458 } 1459 if (vol->v_raid_level == G_RAID_VOLUME_RL_RAID3) { 1460 pp->stripesize *= (vol->v_disks_count - 1); 1461 pp->stripeoffset *= (vol->v_disks_count - 1); 1462 } 1463 } else 1464 pp->stripesize = vol->v_strip_size; 1465 vol->v_provider = pp; 1466 g_error_provider(pp, 0); 1467 g_topology_unlock(); 1468 G_RAID_DEBUG1(0, sc, "Provider %s for volume %s created.", 1469 pp->name, vol->v_name); 1470 } 1471 1472 static void 1473 g_raid_destroy_provider(struct g_raid_volume *vol) 1474 { 1475 struct g_raid_softc *sc; 1476 struct g_provider *pp; 1477 struct bio *bp, *tmp; 1478 1479 g_topology_assert_not(); 1480 sc = vol->v_softc; 1481 pp = vol->v_provider; 1482 KASSERT(pp != NULL, ("NULL provider (volume=%s).", vol->v_name)); 1483 1484 g_topology_lock(); 1485 g_error_provider(pp, ENXIO); 1486 mtx_lock(&sc->sc_queue_mtx); 1487 TAILQ_FOREACH_SAFE(bp, &sc->sc_queue.queue, bio_queue, tmp) { 1488 if (bp->bio_to != pp) 1489 continue; 1490 bioq_remove(&sc->sc_queue, bp); 1491 g_io_deliver(bp, ENXIO); 1492 } 1493 mtx_unlock(&sc->sc_queue_mtx); 1494 G_RAID_DEBUG1(0, sc, "Provider %s for volume %s destroyed.", 1495 pp->name, vol->v_name); 1496 g_wither_provider(pp, ENXIO); 1497 g_topology_unlock(); 1498 vol->v_provider = NULL; 1499 } 1500 1501 /* 1502 * Update device state. 1503 */ 1504 static int 1505 g_raid_update_volume(struct g_raid_volume *vol, u_int event) 1506 { 1507 struct g_raid_softc *sc; 1508 1509 sc = vol->v_softc; 1510 sx_assert(&sc->sc_lock, SX_XLOCKED); 1511 1512 G_RAID_DEBUG1(2, sc, "Event %s for volume %s.", 1513 g_raid_volume_event2str(event), 1514 vol->v_name); 1515 switch (event) { 1516 case G_RAID_VOLUME_E_DOWN: 1517 if (vol->v_provider != NULL) 1518 g_raid_destroy_provider(vol); 1519 break; 1520 case G_RAID_VOLUME_E_UP: 1521 if (vol->v_provider == NULL) 1522 g_raid_launch_provider(vol); 1523 break; 1524 case G_RAID_VOLUME_E_START: 1525 if (vol->v_tr) 1526 G_RAID_TR_START(vol->v_tr); 1527 return (0); 1528 default: 1529 if (sc->sc_md) 1530 G_RAID_MD_VOLUME_EVENT(sc->sc_md, vol, event); 1531 return (0); 1532 } 1533 1534 /* Manage root mount release. */ 1535 if (vol->v_starting) { 1536 vol->v_starting = 0; 1537 G_RAID_DEBUG1(1, sc, "root_mount_rel %p", vol->v_rootmount); 1538 root_mount_rel(vol->v_rootmount); 1539 vol->v_rootmount = NULL; 1540 } 1541 if (vol->v_stopping && vol->v_provider_open == 0) 1542 g_raid_destroy_volume(vol); 1543 return (0); 1544 } 1545 1546 /* 1547 * Update subdisk state. 
1548 */ 1549 static int 1550 g_raid_update_subdisk(struct g_raid_subdisk *sd, u_int event) 1551 { 1552 struct g_raid_softc *sc; 1553 struct g_raid_volume *vol; 1554 1555 sc = sd->sd_softc; 1556 vol = sd->sd_volume; 1557 sx_assert(&sc->sc_lock, SX_XLOCKED); 1558 1559 G_RAID_DEBUG1(2, sc, "Event %s for subdisk %s:%d-%s.", 1560 g_raid_subdisk_event2str(event), 1561 vol->v_name, sd->sd_pos, 1562 sd->sd_disk ? g_raid_get_diskname(sd->sd_disk) : "[none]"); 1563 if (vol->v_tr) 1564 G_RAID_TR_EVENT(vol->v_tr, sd, event); 1565 1566 return (0); 1567 } 1568 1569 /* 1570 * Update disk state. 1571 */ 1572 static int 1573 g_raid_update_disk(struct g_raid_disk *disk, u_int event) 1574 { 1575 struct g_raid_softc *sc; 1576 1577 sc = disk->d_softc; 1578 sx_assert(&sc->sc_lock, SX_XLOCKED); 1579 1580 G_RAID_DEBUG1(2, sc, "Event %s for disk %s.", 1581 g_raid_disk_event2str(event), 1582 g_raid_get_diskname(disk)); 1583 1584 if (sc->sc_md) 1585 G_RAID_MD_EVENT(sc->sc_md, disk, event); 1586 return (0); 1587 } 1588 1589 /* 1590 * Node event. 1591 */ 1592 static int 1593 g_raid_update_node(struct g_raid_softc *sc, u_int event) 1594 { 1595 sx_assert(&sc->sc_lock, SX_XLOCKED); 1596 1597 G_RAID_DEBUG1(2, sc, "Event %s for the array.", 1598 g_raid_node_event2str(event)); 1599 1600 if (event == G_RAID_NODE_E_WAKE) 1601 return (0); 1602 if (sc->sc_md) 1603 G_RAID_MD_EVENT(sc->sc_md, NULL, event); 1604 return (0); 1605 } 1606 1607 static int 1608 g_raid_access(struct g_provider *pp, int acr, int acw, int ace) 1609 { 1610 struct g_raid_volume *vol; 1611 struct g_raid_softc *sc; 1612 int dcw, opens, error = 0; 1613 1614 g_topology_assert(); 1615 sc = pp->geom->softc; 1616 vol = pp->private; 1617 KASSERT(sc != NULL, ("NULL softc (provider=%s).", pp->name)); 1618 KASSERT(vol != NULL, ("NULL volume (provider=%s).", pp->name)); 1619 1620 G_RAID_DEBUG1(2, sc, "Access request for %s: r%dw%de%d.", pp->name, 1621 acr, acw, ace); 1622 dcw = pp->acw + acw; 1623 1624 g_topology_unlock(); 1625 sx_xlock(&sc->sc_lock); 1626 /* Deny new opens while dying. */ 1627 if (sc->sc_stopping != 0 && (acr > 0 || acw > 0 || ace > 0)) { 1628 error = ENXIO; 1629 goto out; 1630 } 1631 if (dcw == 0 && vol->v_dirty) 1632 g_raid_clean(vol, dcw); 1633 vol->v_provider_open += acr + acw + ace; 1634 /* Handle delayed node destruction. */ 1635 if (sc->sc_stopping == G_RAID_DESTROY_DELAYED && 1636 vol->v_provider_open == 0) { 1637 /* Count open volumes. */ 1638 opens = g_raid_nopens(sc); 1639 if (opens == 0) { 1640 sc->sc_stopping = G_RAID_DESTROY_HARD; 1641 /* Wake up worker to make it selfdestruct. */ 1642 g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); 1643 } 1644 } 1645 /* Handle open volume destruction. 
*/ 1646 if (vol->v_stopping && vol->v_provider_open == 0) 1647 g_raid_destroy_volume(vol); 1648 out: 1649 sx_xunlock(&sc->sc_lock); 1650 g_topology_lock(); 1651 return (error); 1652 } 1653 1654 struct g_raid_softc * 1655 g_raid_create_node(struct g_class *mp, 1656 const char *name, struct g_raid_md_object *md) 1657 { 1658 struct g_raid_softc *sc; 1659 struct g_geom *gp; 1660 int error; 1661 1662 g_topology_assert(); 1663 G_RAID_DEBUG(1, "Creating array %s.", name); 1664 1665 gp = g_new_geomf(mp, "%s", name); 1666 sc = malloc(sizeof(*sc), M_RAID, M_WAITOK | M_ZERO); 1667 gp->start = g_raid_start; 1668 gp->orphan = g_raid_orphan; 1669 gp->access = g_raid_access; 1670 gp->dumpconf = g_raid_dumpconf; 1671 1672 sc->sc_md = md; 1673 sc->sc_geom = gp; 1674 sc->sc_flags = 0; 1675 TAILQ_INIT(&sc->sc_volumes); 1676 TAILQ_INIT(&sc->sc_disks); 1677 sx_init(&sc->sc_lock, "gmirror:lock"); 1678 mtx_init(&sc->sc_queue_mtx, "gmirror:queue", NULL, MTX_DEF); 1679 TAILQ_INIT(&sc->sc_events); 1680 bioq_init(&sc->sc_queue); 1681 gp->softc = sc; 1682 error = kproc_create(g_raid_worker, sc, &sc->sc_worker, 0, 0, 1683 "g_raid %s", name); 1684 if (error != 0) { 1685 G_RAID_DEBUG(0, "Cannot create kernel thread for %s.", name); 1686 mtx_destroy(&sc->sc_queue_mtx); 1687 sx_destroy(&sc->sc_lock); 1688 g_destroy_geom(sc->sc_geom); 1689 free(sc, M_RAID); 1690 return (NULL); 1691 } 1692 1693 G_RAID_DEBUG1(0, sc, "Array %s created.", name); 1694 return (sc); 1695 } 1696 1697 struct g_raid_volume * 1698 g_raid_create_volume(struct g_raid_softc *sc, const char *name, int id) 1699 { 1700 struct g_raid_volume *vol, *vol1; 1701 int i; 1702 1703 G_RAID_DEBUG1(1, sc, "Creating volume %s.", name); 1704 vol = malloc(sizeof(*vol), M_RAID, M_WAITOK | M_ZERO); 1705 vol->v_softc = sc; 1706 strlcpy(vol->v_name, name, G_RAID_MAX_VOLUMENAME); 1707 vol->v_state = G_RAID_VOLUME_S_STARTING; 1708 vol->v_raid_level = G_RAID_VOLUME_RL_UNKNOWN; 1709 vol->v_raid_level_qualifier = G_RAID_VOLUME_RLQ_UNKNOWN; 1710 bioq_init(&vol->v_inflight); 1711 bioq_init(&vol->v_locked); 1712 LIST_INIT(&vol->v_locks); 1713 for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { 1714 vol->v_subdisks[i].sd_softc = sc; 1715 vol->v_subdisks[i].sd_volume = vol; 1716 vol->v_subdisks[i].sd_pos = i; 1717 vol->v_subdisks[i].sd_state = G_RAID_DISK_S_NONE; 1718 } 1719 1720 /* Find free ID for this volume. */ 1721 g_topology_lock(); 1722 vol1 = vol; 1723 if (id >= 0) { 1724 LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { 1725 if (vol1->v_global_id == id) 1726 break; 1727 } 1728 } 1729 if (vol1 != NULL) { 1730 for (id = 0; ; id++) { 1731 LIST_FOREACH(vol1, &g_raid_volumes, v_global_next) { 1732 if (vol1->v_global_id == id) 1733 break; 1734 } 1735 if (vol1 == NULL) 1736 break; 1737 } 1738 } 1739 vol->v_global_id = id; 1740 LIST_INSERT_HEAD(&g_raid_volumes, vol, v_global_next); 1741 g_topology_unlock(); 1742 1743 /* Delay root mounting. 
*/ 1744 vol->v_rootmount = root_mount_hold("GRAID"); 1745 G_RAID_DEBUG1(1, sc, "root_mount_hold %p", vol->v_rootmount); 1746 vol->v_starting = 1; 1747 TAILQ_INSERT_TAIL(&sc->sc_volumes, vol, v_next); 1748 return (vol); 1749 } 1750 1751 struct g_raid_disk * 1752 g_raid_create_disk(struct g_raid_softc *sc) 1753 { 1754 struct g_raid_disk *disk; 1755 1756 G_RAID_DEBUG1(1, sc, "Creating disk."); 1757 disk = malloc(sizeof(*disk), M_RAID, M_WAITOK | M_ZERO); 1758 disk->d_softc = sc; 1759 disk->d_state = G_RAID_DISK_S_NONE; 1760 TAILQ_INIT(&disk->d_subdisks); 1761 TAILQ_INSERT_TAIL(&sc->sc_disks, disk, d_next); 1762 return (disk); 1763 } 1764 1765 int g_raid_start_volume(struct g_raid_volume *vol) 1766 { 1767 struct g_raid_tr_class *class; 1768 struct g_raid_tr_object *obj; 1769 int status; 1770 1771 G_RAID_DEBUG1(2, vol->v_softc, "Starting volume %s.", vol->v_name); 1772 LIST_FOREACH(class, &g_raid_tr_classes, trc_list) { 1773 G_RAID_DEBUG1(2, vol->v_softc, 1774 "Tasting volume %s for %s transformation.", 1775 vol->v_name, class->name); 1776 obj = (void *)kobj_create((kobj_class_t)class, M_RAID, 1777 M_WAITOK); 1778 obj->tro_class = class; 1779 obj->tro_volume = vol; 1780 status = G_RAID_TR_TASTE(obj, vol); 1781 if (status != G_RAID_TR_TASTE_FAIL) 1782 break; 1783 kobj_delete((kobj_t)obj, M_RAID); 1784 } 1785 if (class == NULL) { 1786 G_RAID_DEBUG1(0, vol->v_softc, 1787 "No transformation module found for %s.", 1788 vol->v_name); 1789 vol->v_tr = NULL; 1790 g_raid_change_volume_state(vol, G_RAID_VOLUME_S_UNSUPPORTED); 1791 g_raid_event_send(vol, G_RAID_VOLUME_E_DOWN, 1792 G_RAID_EVENT_VOLUME); 1793 return (-1); 1794 } 1795 G_RAID_DEBUG1(2, vol->v_softc, 1796 "Transformation module %s chosen for %s.", 1797 class->name, vol->v_name); 1798 vol->v_tr = obj; 1799 return (0); 1800 } 1801 1802 int 1803 g_raid_destroy_node(struct g_raid_softc *sc, int worker) 1804 { 1805 struct g_raid_volume *vol, *tmpv; 1806 struct g_raid_disk *disk, *tmpd; 1807 int error = 0; 1808 1809 sc->sc_stopping = G_RAID_DESTROY_HARD; 1810 TAILQ_FOREACH_SAFE(vol, &sc->sc_volumes, v_next, tmpv) { 1811 if (g_raid_destroy_volume(vol)) 1812 error = EBUSY; 1813 } 1814 if (error) 1815 return (error); 1816 TAILQ_FOREACH_SAFE(disk, &sc->sc_disks, d_next, tmpd) { 1817 if (g_raid_destroy_disk(disk)) 1818 error = EBUSY; 1819 } 1820 if (error) 1821 return (error); 1822 if (sc->sc_md) { 1823 G_RAID_MD_FREE(sc->sc_md); 1824 kobj_delete((kobj_t)sc->sc_md, M_RAID); 1825 sc->sc_md = NULL; 1826 } 1827 if (sc->sc_geom != NULL) { 1828 G_RAID_DEBUG1(0, sc, "Array %s destroyed.", sc->sc_name); 1829 g_topology_lock(); 1830 sc->sc_geom->softc = NULL; 1831 g_wither_geom(sc->sc_geom, ENXIO); 1832 g_topology_unlock(); 1833 sc->sc_geom = NULL; 1834 } else 1835 G_RAID_DEBUG(1, "Array destroyed."); 1836 if (worker) { 1837 g_raid_event_cancel(sc, sc); 1838 mtx_destroy(&sc->sc_queue_mtx); 1839 sx_xunlock(&sc->sc_lock); 1840 sx_destroy(&sc->sc_lock); 1841 wakeup(&sc->sc_stopping); 1842 free(sc, M_RAID); 1843 curthread->td_pflags &= ~TDP_GEOM; 1844 G_RAID_DEBUG(1, "Thread exiting."); 1845 kproc_exit(0); 1846 } else { 1847 /* Wake up worker to make it selfdestruct. 
*/ 1848 g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); 1849 } 1850 return (0); 1851 } 1852 1853 int 1854 g_raid_destroy_volume(struct g_raid_volume *vol) 1855 { 1856 struct g_raid_softc *sc; 1857 struct g_raid_disk *disk; 1858 int i; 1859 1860 sc = vol->v_softc; 1861 G_RAID_DEBUG1(2, sc, "Destroying volume %s.", vol->v_name); 1862 vol->v_stopping = 1; 1863 if (vol->v_state != G_RAID_VOLUME_S_STOPPED) { 1864 if (vol->v_tr) { 1865 G_RAID_TR_STOP(vol->v_tr); 1866 return (EBUSY); 1867 } else 1868 vol->v_state = G_RAID_VOLUME_S_STOPPED; 1869 } 1870 if (g_raid_event_check(sc, vol) != 0) 1871 return (EBUSY); 1872 if (vol->v_provider != NULL) 1873 return (EBUSY); 1874 if (vol->v_provider_open != 0) 1875 return (EBUSY); 1876 if (vol->v_tr) { 1877 G_RAID_TR_FREE(vol->v_tr); 1878 kobj_delete((kobj_t)vol->v_tr, M_RAID); 1879 vol->v_tr = NULL; 1880 } 1881 if (vol->v_rootmount) 1882 root_mount_rel(vol->v_rootmount); 1883 g_topology_lock(); 1884 LIST_REMOVE(vol, v_global_next); 1885 g_topology_unlock(); 1886 TAILQ_REMOVE(&sc->sc_volumes, vol, v_next); 1887 for (i = 0; i < G_RAID_MAX_SUBDISKS; i++) { 1888 g_raid_event_cancel(sc, &vol->v_subdisks[i]); 1889 disk = vol->v_subdisks[i].sd_disk; 1890 if (disk == NULL) 1891 continue; 1892 TAILQ_REMOVE(&disk->d_subdisks, &vol->v_subdisks[i], sd_next); 1893 } 1894 G_RAID_DEBUG1(2, sc, "Volume %s destroyed.", vol->v_name); 1895 if (sc->sc_md) 1896 G_RAID_MD_FREE_VOLUME(sc->sc_md, vol); 1897 g_raid_event_cancel(sc, vol); 1898 free(vol, M_RAID); 1899 if (sc->sc_stopping == G_RAID_DESTROY_HARD) { 1900 /* Wake up worker to let it selfdestruct. */ 1901 g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); 1902 } 1903 return (0); 1904 } 1905 1906 int 1907 g_raid_destroy_disk(struct g_raid_disk *disk) 1908 { 1909 struct g_raid_softc *sc; 1910 struct g_raid_subdisk *sd, *tmp; 1911 1912 sc = disk->d_softc; 1913 G_RAID_DEBUG1(2, sc, "Destroying disk."); 1914 if (disk->d_consumer) { 1915 g_raid_kill_consumer(sc, disk->d_consumer); 1916 disk->d_consumer = NULL; 1917 } 1918 TAILQ_FOREACH_SAFE(sd, &disk->d_subdisks, sd_next, tmp) { 1919 g_raid_change_subdisk_state(sd, G_RAID_SUBDISK_S_NONE); 1920 g_raid_event_send(sd, G_RAID_SUBDISK_E_DISCONNECTED, 1921 G_RAID_EVENT_SUBDISK); 1922 TAILQ_REMOVE(&disk->d_subdisks, sd, sd_next); 1923 sd->sd_disk = NULL; 1924 } 1925 TAILQ_REMOVE(&sc->sc_disks, disk, d_next); 1926 if (sc->sc_md) 1927 G_RAID_MD_FREE_DISK(sc->sc_md, disk); 1928 g_raid_event_cancel(sc, disk); 1929 free(disk, M_RAID); 1930 return (0); 1931 } 1932 1933 int 1934 g_raid_destroy(struct g_raid_softc *sc, int how) 1935 { 1936 int opens; 1937 1938 g_topology_assert_not(); 1939 if (sc == NULL) 1940 return (ENXIO); 1941 sx_assert(&sc->sc_lock, SX_XLOCKED); 1942 1943 /* Count open volumes. */ 1944 opens = g_raid_nopens(sc); 1945 1946 /* React on some opened volumes. */ 1947 if (opens > 0) { 1948 switch (how) { 1949 case G_RAID_DESTROY_SOFT: 1950 G_RAID_DEBUG1(1, sc, 1951 "%d volumes are still open.", 1952 opens); 1953 return (EBUSY); 1954 case G_RAID_DESTROY_DELAYED: 1955 G_RAID_DEBUG1(1, sc, 1956 "Array will be destroyed on last close."); 1957 sc->sc_stopping = G_RAID_DESTROY_DELAYED; 1958 return (EBUSY); 1959 case G_RAID_DESTROY_HARD: 1960 G_RAID_DEBUG1(1, sc, 1961 "%d volumes are still open.", 1962 opens); 1963 } 1964 } 1965 1966 /* Mark node for destruction. */ 1967 sc->sc_stopping = G_RAID_DESTROY_HARD; 1968 /* Wake up worker to let it selfdestruct. */ 1969 g_raid_event_send(sc, G_RAID_NODE_E_WAKE, 0); 1970 /* Sleep until node destroyed. 
*/ 1971 sx_sleep(&sc->sc_stopping, &sc->sc_lock, 1972 PRIBIO | PDROP, "r:destroy", 0); 1973 return (0); 1974 } 1975 1976 static void 1977 g_raid_taste_orphan(struct g_consumer *cp) 1978 { 1979 1980 KASSERT(1 == 0, ("%s called while tasting %s.", __func__, 1981 cp->provider->name)); 1982 } 1983 1984 static struct g_geom * 1985 g_raid_taste(struct g_class *mp, struct g_provider *pp, int flags __unused) 1986 { 1987 struct g_consumer *cp; 1988 struct g_geom *gp, *geom; 1989 struct g_raid_md_class *class; 1990 struct g_raid_md_object *obj; 1991 int status; 1992 1993 g_topology_assert(); 1994 g_trace(G_T_TOPOLOGY, "%s(%s, %s)", __func__, mp->name, pp->name); 1995 G_RAID_DEBUG(2, "Tasting provider %s.", pp->name); 1996 1997 gp = g_new_geomf(mp, "mirror:taste"); 1998 /* 1999 * This orphan function should be never called. 2000 */ 2001 gp->orphan = g_raid_taste_orphan; 2002 cp = g_new_consumer(gp); 2003 g_attach(cp, pp); 2004 2005 geom = NULL; 2006 LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { 2007 G_RAID_DEBUG(2, "Tasting provider %s for %s metadata.", 2008 pp->name, class->name); 2009 obj = (void *)kobj_create((kobj_class_t)class, M_RAID, 2010 M_WAITOK); 2011 obj->mdo_class = class; 2012 status = G_RAID_MD_TASTE(obj, mp, cp, &geom); 2013 if (status != G_RAID_MD_TASTE_NEW) 2014 kobj_delete((kobj_t)obj, M_RAID); 2015 if (status != G_RAID_MD_TASTE_FAIL) 2016 break; 2017 } 2018 2019 g_detach(cp); 2020 g_destroy_consumer(cp); 2021 g_destroy_geom(gp); 2022 G_RAID_DEBUG(2, "Tasting provider %s done.", pp->name); 2023 return (geom); 2024 } 2025 2026 int 2027 g_raid_create_node_format(const char *format, struct g_geom **gp) 2028 { 2029 struct g_raid_md_class *class; 2030 struct g_raid_md_object *obj; 2031 int status; 2032 2033 G_RAID_DEBUG(2, "Creating array for %s metadata.", format); 2034 LIST_FOREACH(class, &g_raid_md_classes, mdc_list) { 2035 if (strcasecmp(class->name, format) == 0) 2036 break; 2037 } 2038 if (class == NULL) { 2039 G_RAID_DEBUG(1, "No support for %s metadata.", format); 2040 return (G_RAID_MD_TASTE_FAIL); 2041 } 2042 obj = (void *)kobj_create((kobj_class_t)class, M_RAID, 2043 M_WAITOK); 2044 obj->mdo_class = class; 2045 status = G_RAID_MD_CREATE(obj, &g_raid_class, gp); 2046 if (status != G_RAID_MD_TASTE_NEW) 2047 kobj_delete((kobj_t)obj, M_RAID); 2048 return (status); 2049 } 2050 2051 static int 2052 g_raid_destroy_geom(struct gctl_req *req __unused, 2053 struct g_class *mp __unused, struct g_geom *gp) 2054 { 2055 struct g_raid_softc *sc; 2056 int error; 2057 2058 g_topology_unlock(); 2059 sc = gp->softc; 2060 sx_xlock(&sc->sc_lock); 2061 g_cancel_event(sc); 2062 error = g_raid_destroy(gp->softc, G_RAID_DESTROY_SOFT); 2063 if (error != 0) 2064 sx_xunlock(&sc->sc_lock); 2065 g_topology_lock(); 2066 return (error); 2067 } 2068 2069 void g_raid_write_metadata(struct g_raid_softc *sc, struct g_raid_volume *vol, 2070 struct g_raid_subdisk *sd, struct g_raid_disk *disk) 2071 { 2072 2073 if (sc->sc_stopping == G_RAID_DESTROY_HARD) 2074 return; 2075 if (sc->sc_md) 2076 G_RAID_MD_WRITE(sc->sc_md, vol, sd, disk); 2077 } 2078 2079 void g_raid_fail_disk(struct g_raid_softc *sc, 2080 struct g_raid_subdisk *sd, struct g_raid_disk *disk) 2081 { 2082 2083 if (disk == NULL) 2084 disk = sd->sd_disk; 2085 if (disk == NULL) { 2086 G_RAID_DEBUG1(0, sc, "Warning! Fail request to an absent disk!"); 2087 return; 2088 } 2089 if (disk->d_state != G_RAID_DISK_S_ACTIVE) { 2090 G_RAID_DEBUG1(0, sc, "Warning! 
		    "wrong state (%s)!", g_raid_disk_state2str(disk->d_state));
		return;
	}
	if (sc->sc_md)
		G_RAID_MD_FAIL_DISK(sc->sc_md, sd, disk);
}

static void
g_raid_dumpconf(struct sbuf *sb, const char *indent, struct g_geom *gp,
    struct g_consumer *cp, struct g_provider *pp)
{
	struct g_raid_softc *sc;
	struct g_raid_volume *vol;
	struct g_raid_subdisk *sd;
	struct g_raid_disk *disk;
	int i, s;

	g_topology_assert();

	sc = gp->softc;
	if (sc == NULL)
		return;
	if (pp != NULL) {
		vol = pp->private;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		sbuf_printf(sb, "%s<Label>%s</Label>\n", indent,
		    vol->v_name);
		sbuf_printf(sb, "%s<RAIDLevel>%s</RAIDLevel>\n", indent,
		    g_raid_volume_level2str(vol->v_raid_level,
		    vol->v_raid_level_qualifier));
		sbuf_printf(sb,
		    "%s<Transformation>%s</Transformation>\n", indent,
		    vol->v_tr ? vol->v_tr->tro_class->name : "NONE");
		sbuf_printf(sb, "%s<Components>%u</Components>\n", indent,
		    vol->v_disks_count);
		sbuf_printf(sb, "%s<Strip>%u</Strip>\n", indent,
		    vol->v_strip_size);
		sbuf_printf(sb, "%s<State>%s</State>\n", indent,
		    g_raid_volume_state2str(vol->v_state));
		sbuf_printf(sb, "%s<Dirty>%s</Dirty>\n", indent,
		    vol->v_dirty ? "Yes" : "No");
		sbuf_printf(sb, "%s<Subdisks>", indent);
		for (i = 0; i < vol->v_disks_count; i++) {
			sd = &vol->v_subdisks[i];
			if (sd->sd_disk != NULL &&
			    sd->sd_disk->d_consumer != NULL) {
				sbuf_printf(sb, "%s ",
				    g_raid_get_diskname(sd->sd_disk));
			} else {
				sbuf_printf(sb, "NONE ");
			}
			sbuf_printf(sb, "(%s",
			    g_raid_subdisk_state2str(sd->sd_state));
			if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
			    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
				sbuf_printf(sb, " %d%%",
				    (int)(sd->sd_rebuild_pos * 100 /
				    sd->sd_size));
			}
			sbuf_printf(sb, ")");
			if (i + 1 < vol->v_disks_count)
				sbuf_printf(sb, ", ");
		}
		sbuf_printf(sb, "</Subdisks>\n");
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else if (cp != NULL) {
		disk = cp->private;
		if (disk == NULL)
			return;
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		sbuf_printf(sb, "%s<State>%s", indent,
		    g_raid_disk_state2str(disk->d_state));
		if (!TAILQ_EMPTY(&disk->d_subdisks)) {
			sbuf_printf(sb, " (");
			TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
				sbuf_printf(sb, "%s",
				    g_raid_subdisk_state2str(sd->sd_state));
				if (sd->sd_state == G_RAID_SUBDISK_S_REBUILD ||
				    sd->sd_state == G_RAID_SUBDISK_S_RESYNC) {
					sbuf_printf(sb, " %d%%",
					    (int)(sd->sd_rebuild_pos * 100 /
					    sd->sd_size));
				}
				if (TAILQ_NEXT(sd, sd_next))
					sbuf_printf(sb, ", ");
			}
			sbuf_printf(sb, ")");
		}
		sbuf_printf(sb, "</State>\n");
		sbuf_printf(sb, "%s<Subdisks>", indent);
		TAILQ_FOREACH(sd, &disk->d_subdisks, sd_next) {
			sbuf_printf(sb, "r%d(%s):%d@%ju",
			    sd->sd_volume->v_global_id,
			    sd->sd_volume->v_name,
			    sd->sd_pos, sd->sd_offset);
			if (TAILQ_NEXT(sd, sd_next))
				sbuf_printf(sb, ", ");
		}
		sbuf_printf(sb, "</Subdisks>\n");
		sbuf_printf(sb, "%s<ReadErrors>%d</ReadErrors>\n", indent,
		    disk->d_read_errs);
		sx_xunlock(&sc->sc_lock);
		g_topology_lock();
	} else {
		g_topology_unlock();
		sx_xlock(&sc->sc_lock);
		if (sc->sc_md) {
			sbuf_printf(sb, "%s<Metadata>%s</Metadata>\n", indent,
"%s<Metadata>%s</Metadata>\n", indent, 2202 sc->sc_md->mdo_class->name); 2203 } 2204 if (!TAILQ_EMPTY(&sc->sc_volumes)) { 2205 s = 0xff; 2206 TAILQ_FOREACH(vol, &sc->sc_volumes, v_next) { 2207 if (vol->v_state < s) 2208 s = vol->v_state; 2209 } 2210 sbuf_printf(sb, "%s<State>%s</State>\n", indent, 2211 g_raid_volume_state2str(s)); 2212 } 2213 sx_xunlock(&sc->sc_lock); 2214 g_topology_lock(); 2215 } 2216 } 2217 2218 static void 2219 g_raid_shutdown_pre_sync(void *arg, int howto) 2220 { 2221 struct g_class *mp; 2222 struct g_geom *gp, *gp2; 2223 struct g_raid_softc *sc; 2224 int error; 2225 2226 mp = arg; 2227 DROP_GIANT(); 2228 g_topology_lock(); 2229 LIST_FOREACH_SAFE(gp, &mp->geom, geom, gp2) { 2230 if ((sc = gp->softc) == NULL) 2231 continue; 2232 g_topology_unlock(); 2233 sx_xlock(&sc->sc_lock); 2234 g_cancel_event(sc); 2235 error = g_raid_destroy(sc, G_RAID_DESTROY_DELAYED); 2236 if (error != 0) 2237 sx_xunlock(&sc->sc_lock); 2238 g_topology_lock(); 2239 } 2240 g_topology_unlock(); 2241 PICKUP_GIANT(); 2242 } 2243 2244 static void 2245 g_raid_init(struct g_class *mp) 2246 { 2247 2248 g_raid_pre_sync = EVENTHANDLER_REGISTER(shutdown_pre_sync, 2249 g_raid_shutdown_pre_sync, mp, SHUTDOWN_PRI_FIRST); 2250 if (g_raid_pre_sync == NULL) 2251 G_RAID_DEBUG(0, "Warning! Cannot register shutdown event."); 2252 g_raid_started = 1; 2253 } 2254 2255 static void 2256 g_raid_fini(struct g_class *mp) 2257 { 2258 2259 if (g_raid_pre_sync != NULL) 2260 EVENTHANDLER_DEREGISTER(shutdown_pre_sync, g_raid_pre_sync); 2261 g_raid_started = 0; 2262 } 2263 2264 int 2265 g_raid_md_modevent(module_t mod, int type, void *arg) 2266 { 2267 struct g_raid_md_class *class, *c, *nc; 2268 int error; 2269 2270 error = 0; 2271 class = arg; 2272 switch (type) { 2273 case MOD_LOAD: 2274 c = LIST_FIRST(&g_raid_md_classes); 2275 if (c == NULL || c->mdc_priority > class->mdc_priority) 2276 LIST_INSERT_HEAD(&g_raid_md_classes, class, mdc_list); 2277 else { 2278 while ((nc = LIST_NEXT(c, mdc_list)) != NULL && 2279 nc->mdc_priority < class->mdc_priority) 2280 c = nc; 2281 LIST_INSERT_AFTER(c, class, mdc_list); 2282 } 2283 if (g_raid_started) 2284 g_retaste(&g_raid_class); 2285 break; 2286 case MOD_UNLOAD: 2287 LIST_REMOVE(class, mdc_list); 2288 break; 2289 default: 2290 error = EOPNOTSUPP; 2291 break; 2292 } 2293 2294 return (error); 2295 } 2296 2297 int 2298 g_raid_tr_modevent(module_t mod, int type, void *arg) 2299 { 2300 struct g_raid_tr_class *class, *c, *nc; 2301 int error; 2302 2303 error = 0; 2304 class = arg; 2305 switch (type) { 2306 case MOD_LOAD: 2307 c = LIST_FIRST(&g_raid_tr_classes); 2308 if (c == NULL || c->trc_priority > class->trc_priority) 2309 LIST_INSERT_HEAD(&g_raid_tr_classes, class, trc_list); 2310 else { 2311 while ((nc = LIST_NEXT(c, trc_list)) != NULL && 2312 nc->trc_priority < class->trc_priority) 2313 c = nc; 2314 LIST_INSERT_AFTER(c, class, trc_list); 2315 } 2316 break; 2317 case MOD_UNLOAD: 2318 LIST_REMOVE(class, trc_list); 2319 break; 2320 default: 2321 error = EOPNOTSUPP; 2322 break; 2323 } 2324 2325 return (error); 2326 } 2327 2328 /* 2329 * Use local implementation of DECLARE_GEOM_CLASS(g_raid_class, g_raid) 2330 * to reduce module priority, allowing submodules to register them first. 2331 */ 2332 static moduledata_t g_raid_mod = { 2333 "g_raid", 2334 g_modevent, 2335 &g_raid_class 2336 }; 2337 DECLARE_MODULE(g_raid, g_raid_mod, SI_SUB_DRIVERS, SI_ORDER_THIRD); 2338 MODULE_VERSION(geom_raid, 0); 2339