1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 26 #include <sys/param.h> 27 #include <sys/systm.h> 28 #include <sys/conf.h> 29 #include <sys/file.h> 30 #include <sys/user.h> 31 #include <sys/uio.h> 32 #include <sys/t_lock.h> 33 #include <sys/buf.h> 34 #include <sys/dkio.h> 35 #include <sys/vtoc.h> 36 #include <sys/kmem.h> 37 #include <vm/page.h> 38 #include <sys/cmn_err.h> 39 #include <sys/sysmacros.h> 40 #include <sys/types.h> 41 #include <sys/mkdev.h> 42 #include <sys/stat.h> 43 #include <sys/open.h> 44 #include <sys/lvm/mdio.h> 45 #include <sys/lvm/mdvar.h> 46 #include <sys/lvm/md_stripe.h> 47 #include <sys/lvm/md_convert.h> 48 #include <sys/lvm/md_notify.h> 49 #include <sys/modctl.h> 50 #include <sys/ddi.h> 51 #include <sys/sunddi.h> 52 #include <sys/debug.h> 53 #include <sys/sysevent/eventdefs.h> 54 #include <sys/sysevent/svm.h> 55 56 md_ops_t stripe_md_ops; 57 #ifndef lint 58 char _depends_on[] = "drv/md"; 59 md_ops_t *md_interface_ops = &stripe_md_ops; 60 #endif 61 62 extern unit_t md_nunits; 63 extern set_t md_nsets; 64 extern md_set_t md_set[]; 65 66 extern kmutex_t md_mx; 67 extern kcondvar_t md_cv; 68 69 extern int md_status; 70 extern major_t md_major; 71 extern mdq_anchor_t md_done_daemon; 72 73 static int md_stripe_mcs_buf_off; 74 static kmem_cache_t *stripe_parent_cache = NULL; 75 static kmem_cache_t *stripe_child_cache = NULL; 76 77 /*ARGSUSED1*/ 78 static int 79 stripe_parent_constructor(void *p, void *d1, int d2) 80 { 81 mutex_init(&((md_sps_t *)p)->ps_mx, 82 NULL, MUTEX_DEFAULT, NULL); 83 return (0); 84 } 85 86 static void 87 stripe_parent_init(void *ps) 88 { 89 bzero(ps, offsetof(md_sps_t, ps_mx)); 90 } 91 92 /*ARGSUSED1*/ 93 static void 94 stripe_parent_destructor(void *p, void *d) 95 { 96 mutex_destroy(&((md_sps_t *)p)->ps_mx); 97 } 98 99 /*ARGSUSED1*/ 100 static int 101 stripe_child_constructor(void *p, void *d1, int d2) 102 { 103 bioinit(&((md_scs_t *)p)->cs_buf); 104 return (0); 105 } 106 107 static void 108 stripe_child_init(md_scs_t *cs) 109 { 110 cs->cs_mdunit = 0; 111 cs->cs_ps = NULL; 112 cs->cs_comp = NULL; 113 md_bioreset(&cs->cs_buf); 114 } 115 116 /*ARGSUSED1*/ 117 static void 118 stripe_child_destructor(void *p, void *d) 119 { 120 biofini(&((md_scs_t *)p)->cs_buf); 121 } 122 123 /*ARGSUSED*/ 124 static void 125 stripe_run_queue(void *d) 126 { 127 if (!(md_status & MD_GBL_DAEMONS_LIVE)) 128 md_daemon(1, &md_done_daemon); 129 } 130 131 static void 132 stripe_close_all_devs(ms_unit_t *un, int md_cflags) 133 { 134 int row; 135 int i; 136 int c; 137 struct ms_comp *mdcomp; 138 139 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 140 for (row = 0; row < un->un_nrows; row++) { 141 struct ms_row *mdr = &un->un_row[row]; 142 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { 143 struct ms_comp *mdc; 144 mdc = &mdcomp[c++]; 145 if (md_cflags & MD_OFLG_PROBEDEV) { 146 147 /* 148 * It is possible that the md_layered_open 149 * failed because the stripe unit structure 150 * contained a NODEV. In such a case since 151 * there is nothing to open, there is nothing 152 * to close. 153 */ 154 if (mdc->un_dev == NODEV64) 155 continue; 156 } 157 if ((md_cflags & MD_OFLG_PROBEDEV) && 158 (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) { 159 md_layered_close(mdc->un_dev, 160 md_cflags); 161 mdc->un_mirror.ms_flags &= ~MDM_S_PROBEOPEN; 162 } else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) { 163 md_layered_close(mdc->un_dev, md_cflags); 164 mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN; 165 } 166 } 167 } 168 } 169 170 static int 171 stripe_open_all_devs(ms_unit_t *un, int md_oflags) 172 { 173 minor_t mnum = MD_SID(un); 174 int row; 175 int i; 176 int c; 177 struct ms_comp *mdcomp; 178 int err; 179 int cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS); 180 int probe_err_cnt = 0; 181 int total_comp_cnt = 0; 182 set_t setno = MD_MIN2SET(MD_SID(un)); 183 side_t side = mddb_getsidenum(setno); 184 mdkey_t key; 185 186 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 187 188 /* 189 * For a probe call, if any component of a stripe or a concat 190 * can be opened, it is considered to be a success. The total number 191 * of components in a stripe are computed prior to starting a probe. 192 * This number is then compared against the number of components 193 * that could be be successfully opened. If none of the components 194 * in a stripe can be opened, only then an ENXIO is returned for a 195 * probe type open. 196 */ 197 198 for (row = 0; row < un->un_nrows; row++) { 199 struct ms_row *mdr = &un->un_row[row]; 200 201 if (md_oflags & MD_OFLG_PROBEDEV) 202 total_comp_cnt += mdr->un_ncomp; 203 204 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { 205 struct ms_comp *mdc; 206 md_dev64_t tmpdev; 207 208 mdc = &mdcomp[c++]; 209 tmpdev = mdc->un_dev; 210 /* 211 * Do the open by device id 212 * Check if this comp is hotspared and 213 * if it is then use the key for hotspare. 214 * MN disksets don't use devids, so we better don't use 215 * md_devid_found/md_resolve_bydevid there. Rather do, 216 * what's done in stripe_build_incore() 217 */ 218 if (MD_MNSET_SETNO(setno)) { 219 if (mdc->un_mirror.ms_hs_id != 0) { 220 (void) md_hot_spare_ifc(HS_MKDEV, 0, 0, 221 0, &mdc->un_mirror.ms_hs_id, NULL, 222 &tmpdev, NULL); 223 } 224 } else { 225 key = mdc->un_mirror.ms_hs_id ? 226 mdc->un_mirror.ms_hs_key : mdc->un_key; 227 if ((md_getmajor(tmpdev) != md_major) && 228 md_devid_found(setno, side, key) == 1) { 229 tmpdev = md_resolve_bydevid(mnum, 230 tmpdev, key); 231 } 232 } 233 234 /* 235 * For a submirror, we only want to open those devices 236 * that are not errored. If the device is errored then 237 * then there is no reason to open it and leaving it 238 * closed allows the RCM/DR code to work so that the 239 * errored device can be replaced. 240 */ 241 if ((md_oflags & MD_OFLG_PROBEDEV) || 242 ! (mdc->un_mirror.ms_state & CS_ERRED)) { 243 244 err = md_layered_open(mnum, &tmpdev, md_oflags); 245 } else { 246 err = ENXIO; 247 } 248 249 /* 250 * Only set the un_dev if the tmpdev != NODEV64. If 251 * it is NODEV64 then the md_layered_open() will have 252 * failed in some manner. 253 */ 254 if (tmpdev != NODEV64) 255 mdc->un_dev = tmpdev; 256 257 if (err) { 258 if (!cont_on_errors) { 259 stripe_close_all_devs(un, md_oflags); 260 return (ENXIO); 261 } 262 263 if (md_oflags & MD_OFLG_PROBEDEV) 264 probe_err_cnt++; 265 } else { 266 if (md_oflags & MD_OFLG_PROBEDEV) { 267 mdc->un_mirror.ms_flags |= 268 MDM_S_PROBEOPEN; 269 } else 270 mdc->un_mirror.ms_flags |= MDM_S_ISOPEN; 271 } 272 } 273 } 274 275 /* If every component in a stripe could not be opened fail */ 276 if ((md_oflags & MD_OFLG_PROBEDEV) && 277 (probe_err_cnt == total_comp_cnt)) 278 return (ENXIO); 279 else 280 return (0); 281 } 282 283 int 284 stripe_build_incore(void *p, int snarfing) 285 { 286 ms_unit_t *un = (ms_unit_t *)p; 287 struct ms_comp *mdcomp; 288 minor_t mnum; 289 int row; 290 int i; 291 int c; 292 int ncomps; 293 294 mnum = MD_SID(un); 295 296 if (MD_UNIT(mnum) != NULL) 297 return (0); 298 299 MD_STATUS(un) = 0; 300 301 /* 302 * Reset all the is_open flags, these are probably set 303 * cause they just came out of the database. 304 */ 305 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 306 307 ncomps = 0; 308 for (row = 0; row < un->un_nrows; row++) { 309 struct ms_row *mdr = &un->un_row[row]; 310 ncomps += mdr->un_ncomp; 311 } 312 313 for (row = 0; row < un->un_nrows; row++) { 314 struct ms_row *mdr = &un->un_row[row]; 315 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { 316 struct ms_comp *mdc; 317 set_t setno; 318 md_dev64_t tmpdev; 319 320 mdc = &mdcomp[c++]; 321 mdc->un_mirror.ms_flags &= 322 ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED); 323 324 if (!snarfing) 325 continue; 326 327 setno = MD_MIN2SET(mnum); 328 329 tmpdev = md_getdevnum(setno, mddb_getsidenum(setno), 330 mdc->un_key, MD_NOTRUST_DEVT); 331 mdc->un_dev = tmpdev; 332 /* 333 * Check for hotspares. If the hotspares haven't been 334 * snarfed yet, stripe_open_all_devs() will do the 335 * remapping of the dev's later. 336 */ 337 if (mdc->un_mirror.ms_hs_id != 0) { 338 mdc->un_mirror.ms_orig_dev = mdc->un_dev; 339 (void) md_hot_spare_ifc(HS_MKDEV, 0, 0, 340 0, &mdc->un_mirror.ms_hs_id, NULL, 341 &tmpdev, NULL); 342 mdc->un_dev = tmpdev; 343 } 344 } 345 } 346 347 /* place various information in the in-core data structures */ 348 md_nblocks_set(mnum, un->c.un_total_blocks); 349 MD_UNIT(mnum) = un; 350 351 return (0); 352 } 353 354 void 355 reset_stripe(ms_unit_t *un, minor_t mnum, int removing) 356 { 357 ms_comp_t *mdcomp; 358 struct ms_row *mdr; 359 int i, c; 360 int row; 361 int nsv; 362 int isv; 363 sv_dev_t *sv; 364 mddb_recid_t *recids; 365 mddb_recid_t vtoc_id; 366 int rid = 0; 367 368 md_destroy_unit_incore(mnum, &stripe_md_ops); 369 370 md_nblocks_set(mnum, -1ULL); 371 MD_UNIT(mnum) = NULL; 372 373 /* 374 * Attempt release of its minor node 375 */ 376 md_remove_minor_node(mnum); 377 378 if (!removing) 379 return; 380 381 nsv = 0; 382 /* Count the number of devices */ 383 for (row = 0; row < un->un_nrows; row++) { 384 mdr = &un->un_row[row]; 385 nsv += mdr->un_ncomp; 386 } 387 sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP); 388 389 /* 390 * allocate recids array. since we may have to commit 391 * underlying soft partition records, we need an array 392 * of size: total number of components in stripe + 3 393 * (one for the stripe itself, one for the hotspare, one 394 * for the end marker). 395 */ 396 recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP); 397 398 /* 399 * Save the md_dev64_t's and driver nm indexes. 400 * Because after the mddb_deleterec() we will 401 * not be able to access the unit structure. 402 * 403 * NOTE: Deleting the names before deleting the 404 * unit structure would cause problems if 405 * the machine crashed in between the two. 406 */ 407 isv = 0; 408 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 409 410 for (row = 0; row < un->un_nrows; row++) { 411 mdr = &un->un_row[row]; 412 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { 413 struct ms_comp *mdc; 414 md_dev64_t child_dev; 415 md_unit_t *child_un; 416 417 mdc = &mdcomp[c++]; 418 if (mdc->un_mirror.ms_hs_id != 0) { 419 mdkey_t hs_key; 420 421 hs_key = mdc->un_mirror.ms_hs_key; 422 423 mdc->un_dev = mdc->un_mirror.ms_orig_dev; 424 mdc->un_start_block = 425 mdc->un_mirror.ms_orig_blk; 426 mdc->un_mirror.ms_hs_id = 0; 427 mdc->un_mirror.ms_hs_key = 0; 428 mdc->un_mirror.ms_orig_dev = 0; 429 recids[0] = 0; 430 recids[1] = 0; /* recids[1] filled in below */ 431 recids[2] = 0; 432 (void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id, 433 0, 0, &recids[0], &hs_key, NULL, NULL); 434 mddb_commitrecs_wrapper(recids); 435 } 436 437 /* 438 * check if we've got metadevice below us and 439 * deparent it if we do. 440 * NOTE: currently soft partitions are the 441 * the only metadevices stripes can be 442 * built on top of. 443 */ 444 child_dev = mdc->un_dev; 445 if (md_getmajor(child_dev) == md_major) { 446 child_un = MD_UNIT(md_getminor(child_dev)); 447 md_reset_parent(child_dev); 448 recids[rid++] = MD_RECID(child_un); 449 } 450 451 sv[isv].setno = MD_MIN2SET(mnum); 452 sv[isv++].key = mdc->un_key; 453 } 454 } 455 456 recids[rid++] = un->c.un_record_id; 457 recids[rid] = 0; /* filled in below */ 458 459 /* 460 * Decrement the HSP reference count and 461 * remove the knowledge of the HSP from the unit struct. 462 * This is done atomically to remove a window. 463 */ 464 if (un->un_hsp_id != -1) { 465 (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, 466 &recids[rid++], NULL, NULL, NULL); 467 un->un_hsp_id = -1; 468 } 469 470 /* set end marker and commit records */ 471 recids[rid] = 0; 472 mddb_commitrecs_wrapper(recids); 473 474 vtoc_id = un->c.un_vtoc_id; 475 476 /* 477 * Remove self from the namespace 478 */ 479 if (un->c.un_revision & MD_FN_META_DEV) { 480 (void) md_rem_selfname(un->c.un_self_id); 481 } 482 483 /* Remove the unit structure */ 484 mddb_deleterec_wrapper(un->c.un_record_id); 485 486 /* Remove the vtoc, if present */ 487 if (vtoc_id) 488 mddb_deleterec_wrapper(vtoc_id); 489 490 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, 491 MD_MIN2SET(mnum), MD_MIN2UNIT(mnum)); 492 md_rem_names(sv, nsv); 493 kmem_free(sv, sizeof (sv_dev_t) * nsv); 494 kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3)); 495 } 496 497 static void 498 stripe_error(md_sps_t *ps) 499 { 500 struct buf *pb = ps->ps_bp; 501 mdi_unit_t *ui = ps->ps_ui; 502 md_dev64_t dev = ps->ps_errcomp->un_dev; 503 md_dev64_t md_dev = md_expldev(pb->b_edev); 504 char *str; 505 506 if (pb->b_flags & B_READ) { 507 ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR; 508 str = "read"; 509 } else { 510 ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR; 511 str = "write"; 512 } 513 if (!(ps->ps_flags & MD_SPS_DONTFREE)) { 514 if (MUTEX_HELD(&ps->ps_mx)) { 515 mutex_exit(&ps->ps_mx); 516 } 517 } else { 518 ASSERT(panicstr); 519 } 520 SPS_FREE(stripe_parent_cache, ps); 521 pb->b_flags |= B_ERROR; 522 523 md_kstat_done(ui, pb, 0); 524 md_unit_readerexit(ui); 525 md_biodone(pb); 526 527 cmn_err(CE_WARN, "md: %s: %s error on %s", 528 md_shortname(md_getminor(md_dev)), str, 529 md_devname(MD_DEV2SET(md_dev), dev, NULL, 0)); 530 } 531 532 static int 533 stripe_done(struct buf *cb) 534 { 535 struct buf *pb; 536 mdi_unit_t *ui; 537 md_sps_t *ps; 538 md_scs_t *cs; 539 540 /*LINTED*/ 541 cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off); 542 ps = cs->cs_ps; 543 pb = ps->ps_bp; 544 545 mutex_enter(&ps->ps_mx); 546 if (cb->b_flags & B_ERROR) { 547 ps->ps_flags |= MD_SPS_ERROR; 548 pb->b_error = cb->b_error; 549 ps->ps_errcomp = cs->cs_comp; 550 } 551 552 if (cb->b_flags & B_REMAPPED) 553 bp_mapout(cb); 554 555 ps->ps_frags--; 556 if (ps->ps_frags != 0) { 557 mutex_exit(&ps->ps_mx); 558 kmem_cache_free(stripe_child_cache, cs); 559 return (1); 560 } 561 kmem_cache_free(stripe_child_cache, cs); 562 if (ps->ps_flags & MD_SPS_ERROR) { 563 stripe_error(ps); 564 return (1); 565 } 566 ui = ps->ps_ui; 567 if (!(ps->ps_flags & MD_SPS_DONTFREE)) { 568 mutex_exit(&ps->ps_mx); 569 } else { 570 ASSERT(panicstr); 571 } 572 SPS_FREE(stripe_parent_cache, ps); 573 md_kstat_done(ui, pb, 0); 574 md_unit_readerexit(ui); 575 md_biodone(pb); 576 return (0); 577 } 578 579 580 /* 581 * This routine does the mapping from virtual (dev, blkno) of a metapartition 582 * to the real (dev, blkno) of a real disk partition. 583 * It goes to the md_conf[] table to find out the correct real partition 584 * dev and block number for this buffer. 585 * 586 * A single buf request can not go across real disk partition boundary. 587 * When the virtual request specified by (dev, blkno) spans more than one 588 * real partition, md_mapbuf will return 1. Then the caller should prepare 589 * another real buf and continue calling md_mapbuf to do the mapping until 590 * it returns 0. 591 * 592 */ 593 594 static int 595 md_mapbuf( 596 ms_unit_t *un, 597 diskaddr_t blkno, 598 u_longlong_t bcount, 599 buf_t *bp, /* if bp==NULL, skip bp updates */ 600 ms_comp_t **mdc) /* if bp==NULL, skip mdc update */ 601 { 602 struct ms_row *mdr; 603 struct ms_comp *mdcomp; 604 diskaddr_t stripe_blk; 605 diskaddr_t fragment, blk_in_row, endblk; 606 offset_t interlace; 607 size_t dev_index; 608 int row_index, more; 609 extern unsigned md_maxphys; 610 /* Work var's when bp==NULL */ 611 u_longlong_t wb_bcount; 612 diskaddr_t wb_blkno; 613 md_dev64_t wb_edev; 614 ms_comp_t *wmdc; 615 616 /* 617 * Do a real calculation to derive the minor device of the 618 * Virtual Disk, which in turn will let us derive the 619 * device/minor of the underlying real device. 620 */ 621 622 623 for (row_index = 0; row_index < un->un_nrows; row_index++) { 624 mdr = &un->un_row[row_index]; 625 if (blkno < mdr->un_cum_blocks) 626 break; 627 } 628 ASSERT(row_index != un->un_nrows); 629 630 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 631 632 blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks; 633 endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE)); 634 if (mdr->un_ncomp == 1) { /* No striping */ 635 if (endblk > mdr->un_cum_blocks) { 636 wb_bcount = ldbtob(mdr->un_cum_blocks - blkno); 637 if ((row_index + 1) == un->un_nrows) 638 more = 0; 639 else 640 more = 1; 641 } else { 642 wb_bcount = bcount; 643 more = 0; 644 } 645 wmdc = &mdcomp[mdr->un_icomp]; 646 wb_blkno = blk_in_row; 647 } else { /* Have striping */ 648 interlace = mdr->un_interlace; 649 fragment = blk_in_row % interlace; 650 if (bcount > ldbtob(interlace - fragment)) { 651 more = 1; 652 wb_bcount = ldbtob(interlace - fragment); 653 } else { 654 more = 0; 655 wb_bcount = bcount; 656 } 657 658 stripe_blk = blk_in_row / interlace; 659 dev_index = (size_t)(stripe_blk % mdr->un_ncomp); 660 wmdc = &mdcomp[mdr->un_icomp + dev_index]; 661 wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp) * 662 interlace) + fragment); 663 } 664 665 wb_blkno += wmdc->un_start_block; 666 wb_edev = wmdc->un_dev; 667 668 /* only break up the I/O if we're not built on another metadevice */ 669 if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) { 670 wb_bcount = md_maxphys; 671 more = 1; 672 } 673 if (bp != (buf_t *)NULL) { 674 /* 675 * wb_bcount is limited by md_maxphys which is 'int' 676 */ 677 bp->b_bcount = (size_t)wb_bcount; 678 bp->b_lblkno = wb_blkno; 679 bp->b_edev = md_dev64_to_dev(wb_edev); 680 *mdc = wmdc; 681 } 682 return (more); 683 } 684 685 static void 686 md_stripe_strategy(buf_t *pb, int flag, void *private) 687 { 688 md_sps_t *ps; 689 md_scs_t *cs; 690 int doing_writes; 691 int more; 692 ms_unit_t *un; 693 mdi_unit_t *ui; 694 size_t current_count; 695 diskaddr_t current_blkno; 696 off_t current_offset; 697 buf_t *cb; /* child buf pointer */ 698 set_t setno; 699 700 setno = MD_MIN2SET(getminor(pb->b_edev)); 701 702 /* 703 * When doing IO to a multi owner meta device, check if set is halted. 704 * We do this check without the needed lock held, for performance 705 * reasons. 706 * If an IO just slips through while the set is locked via an 707 * MD_MN_SUSPEND_SET, we don't care about it. 708 * Only check for a suspended set if we are a top-level i/o request 709 * (MD_STR_NOTTOP is cleared in 'flag'). 710 */ 711 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == 712 (MD_SET_HALTED | MD_SET_MNSET)) { 713 if ((flag & MD_STR_NOTTOP) == 0) { 714 mutex_enter(&md_mx); 715 /* Here we loop until the set is no longer halted */ 716 while (md_set[setno].s_status & MD_SET_HALTED) { 717 cv_wait(&md_cv, &md_mx); 718 } 719 mutex_exit(&md_mx); 720 } 721 } 722 723 ui = MDI_UNIT(getminor(pb->b_edev)); 724 725 md_kstat_waitq_enter(ui); 726 727 un = (ms_unit_t *)md_unit_readerlock(ui); 728 729 if ((flag & MD_NOBLOCK) == 0) { 730 if (md_inc_iocount(setno) != 0) { 731 pb->b_flags |= B_ERROR; 732 pb->b_error = ENXIO; 733 pb->b_resid = pb->b_bcount; 734 md_kstat_waitq_exit(ui); 735 md_unit_readerexit(ui); 736 biodone(pb); 737 return; 738 } 739 } else { 740 md_inc_iocount_noblock(setno); 741 } 742 743 if (!(flag & MD_STR_NOTTOP)) { 744 if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) { 745 md_kstat_waitq_exit(ui); 746 return; 747 } 748 } 749 750 ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS); 751 stripe_parent_init(ps); 752 753 /* 754 * Save essential information from the original buffhdr 755 * in the md_save structure. 756 */ 757 ps->ps_un = un; 758 ps->ps_ui = ui; 759 ps->ps_bp = pb; 760 ps->ps_addr = pb->b_un.b_addr; 761 762 if ((pb->b_flags & B_READ) == 0) 763 doing_writes = 1; 764 else 765 doing_writes = 0; 766 767 768 current_count = pb->b_bcount; 769 current_blkno = pb->b_lblkno; 770 current_offset = 0; 771 772 if (!(flag & MD_STR_NOTTOP) && panicstr) 773 ps->ps_flags |= MD_SPS_DONTFREE; 774 775 md_kstat_waitq_to_runq(ui); 776 777 ps->ps_frags++; 778 do { 779 cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS); 780 stripe_child_init(cs); 781 cb = &cs->cs_buf; 782 cs->cs_ps = ps; 783 more = md_mapbuf(un, current_blkno, current_count, cb, 784 &cs->cs_comp); 785 786 cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev, 787 cb->b_lblkno, stripe_done, cb, KM_NOSLEEP); 788 /* 789 * Do these calculations now, 790 * so that we pickup a valid b_bcount from the chld_bp. 791 */ 792 current_offset += cb->b_bcount; 793 current_count -= cb->b_bcount; 794 current_blkno += (diskaddr_t)(lbtodb(cb->b_bcount)); 795 796 if (more) { 797 mutex_enter(&ps->ps_mx); 798 ps->ps_frags++; 799 mutex_exit(&ps->ps_mx); 800 } 801 802 if (doing_writes && 803 cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) { 804 (void) stripe_done(cb); 805 continue; 806 } 807 md_call_strategy(cb, flag, private); 808 } while (more); 809 810 if (!(flag & MD_STR_NOTTOP) && panicstr) { 811 while (!(ps->ps_flags & MD_SPS_DONE)) { 812 md_daemon(1, &md_done_daemon); 813 drv_usecwait(10); 814 } 815 kmem_cache_free(stripe_parent_cache, ps); 816 } 817 } 818 819 static int 820 stripe_snarf(md_snarfcmd_t cmd, set_t setno) 821 { 822 ms_unit_t *un; 823 mddb_recid_t recid; 824 int gotsomething; 825 int all_stripes_gotten; 826 mddb_type_t typ1; 827 mddb_de_ic_t *dep; 828 mddb_rb32_t *rbp; 829 size_t newreqsize; 830 ms_unit_t *big_un; 831 ms_unit32_od_t *small_un; 832 833 834 if (cmd == MD_SNARF_CLEANUP) 835 return (0); 836 837 all_stripes_gotten = 1; 838 gotsomething = 0; 839 840 typ1 = (mddb_type_t)md_getshared_key(setno, 841 stripe_md_ops.md_driver.md_drivername); 842 recid = mddb_makerecid(setno, 0); 843 844 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 845 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 846 continue; 847 848 dep = mddb_getrecdep(recid); 849 dep->de_flags = MDDB_F_STRIPE; 850 rbp = dep->de_rb; 851 852 switch (rbp->rb_revision) { 853 case MDDB_REV_RB: 854 case MDDB_REV_RBFN: 855 if ((rbp->rb_private & MD_PRV_CONVD) == 0) { 856 /* 857 * This means, we have an old and small record 858 * and this record hasn't already been 859 * converted. Before we create an incore 860 * metadevice from this we have to convert it to 861 * a big record. 862 */ 863 small_un = 864 (ms_unit32_od_t *)mddb_getrecaddr(recid); 865 newreqsize = get_big_stripe_req_size(small_un, 866 COMPLETE_STRUCTURE); 867 big_un = (ms_unit_t *)kmem_zalloc(newreqsize, 868 KM_SLEEP); 869 stripe_convert((caddr_t)small_un, 870 (caddr_t)big_un, SMALL_2_BIG); 871 kmem_free(small_un, dep->de_reqsize); 872 dep->de_rb_userdata = big_un; 873 dep->de_reqsize = newreqsize; 874 un = big_un; 875 rbp->rb_private |= MD_PRV_CONVD; 876 } else { 877 /* Small device had already been converted */ 878 un = (ms_unit_t *)mddb_getrecaddr(recid); 879 } 880 un->c.un_revision &= ~MD_64BIT_META_DEV; 881 break; 882 case MDDB_REV_RB64: 883 case MDDB_REV_RB64FN: 884 /* Big device */ 885 un = (ms_unit_t *)mddb_getrecaddr(recid); 886 un->c.un_revision |= MD_64BIT_META_DEV; 887 un->c.un_flag |= MD_EFILABEL; 888 break; 889 } 890 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision); 891 892 /* Create minor node for snarfed unit. */ 893 (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); 894 895 if (MD_UNIT(MD_SID(un)) != NULL) { 896 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 897 continue; 898 } 899 all_stripes_gotten = 0; 900 if (stripe_build_incore((void *)un, 1) == 0) { 901 mddb_setrecprivate(recid, MD_PRV_GOTIT); 902 md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0); 903 gotsomething = 1; 904 } 905 } 906 907 if (!all_stripes_gotten) 908 return (gotsomething); 909 910 recid = mddb_makerecid(setno, 0); 911 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) 912 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) 913 mddb_setrecprivate(recid, MD_PRV_PENDDEL); 914 915 return (0); 916 } 917 918 static int 919 stripe_halt(md_haltcmd_t cmd, set_t setno) 920 { 921 int i; 922 mdi_unit_t *ui; 923 minor_t mnum; 924 925 if (cmd == MD_HALT_CLOSE) 926 return (0); 927 928 if (cmd == MD_HALT_OPEN) 929 return (0); 930 931 if (cmd == MD_HALT_UNLOAD) 932 return (0); 933 934 if (cmd == MD_HALT_CHECK) { 935 for (i = 0; i < md_nunits; i++) { 936 mnum = MD_MKMIN(setno, i); 937 if ((ui = MDI_UNIT(mnum)) == NULL) 938 continue; 939 if (ui->ui_opsindex != stripe_md_ops.md_selfindex) 940 continue; 941 if (md_unit_isopen(ui)) 942 return (1); 943 } 944 return (0); 945 } 946 947 if (cmd != MD_HALT_DOIT) 948 return (1); 949 950 for (i = 0; i < md_nunits; i++) { 951 mnum = MD_MKMIN(setno, i); 952 if ((ui = MDI_UNIT(mnum)) == NULL) 953 continue; 954 if (ui->ui_opsindex != stripe_md_ops.md_selfindex) 955 continue; 956 reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0); 957 } 958 959 return (0); 960 } 961 962 /*ARGSUSED3*/ 963 static int 964 stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) 965 { 966 minor_t mnum = getminor(*dev); 967 mdi_unit_t *ui = MDI_UNIT(mnum); 968 ms_unit_t *un; 969 int err = 0; 970 set_t setno; 971 972 /* 973 * When doing an open of a multi owner metadevice, check to see if this 974 * node is a starting node and if a reconfig cycle is underway. 975 * If so, the system isn't sufficiently set up enough to handle the 976 * open (which involves I/O during sp_validate), so fail with ENXIO. 977 */ 978 setno = MD_MIN2SET(mnum); 979 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == 980 (MD_SET_MNSET | MD_SET_MN_START_RC)) { 981 return (ENXIO); 982 } 983 984 /* single thread */ 985 un = (ms_unit_t *)md_unit_openclose_enter(ui); 986 987 /* open devices, if necessary */ 988 if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) { 989 if ((err = stripe_open_all_devs(un, md_oflags)) != 0) { 990 goto out; 991 } 992 } 993 994 /* count open */ 995 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) 996 goto out; 997 998 /* unlock, return success */ 999 out: 1000 md_unit_openclose_exit(ui); 1001 return (err); 1002 } 1003 1004 /*ARGSUSED1*/ 1005 static int 1006 stripe_close( 1007 dev_t dev, 1008 int flag, 1009 int otyp, 1010 cred_t *cred_p, 1011 int md_cflags 1012 ) 1013 { 1014 minor_t mnum = getminor(dev); 1015 mdi_unit_t *ui = MDI_UNIT(mnum); 1016 ms_unit_t *un; 1017 int err = 0; 1018 1019 /* single thread */ 1020 un = (ms_unit_t *)md_unit_openclose_enter(ui); 1021 1022 /* count closed */ 1023 if ((err = md_unit_decopen(mnum, otyp)) != 0) 1024 goto out; 1025 1026 /* close devices, if necessary */ 1027 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { 1028 stripe_close_all_devs(un, md_cflags); 1029 } 1030 1031 /* unlock, return success */ 1032 out: 1033 md_unit_openclose_exit(ui); 1034 return (err); 1035 } 1036 1037 1038 static struct buf dumpbuf; 1039 1040 /* 1041 * This routine dumps memory to the disk. It assumes that the memory has 1042 * already been mapped into mainbus space. It is called at disk interrupt 1043 * priority when the system is in trouble. 1044 * 1045 */ 1046 static int 1047 stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) 1048 { 1049 ms_unit_t *un; 1050 buf_t *bp; 1051 ms_comp_t *mdc; 1052 u_longlong_t nb; 1053 diskaddr_t mapblk; 1054 int result; 1055 int more; 1056 int saveresult = 0; 1057 1058 /* 1059 * Don't need to grab the unit lock. 1060 * Cause nothing else is suppose to be happenning. 1061 * Also dump is not suppose to sleep. 1062 */ 1063 un = (ms_unit_t *)MD_UNIT(getminor(dev)); 1064 1065 if ((diskaddr_t)blkno >= un->c.un_total_blocks) 1066 return (EINVAL); 1067 1068 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) 1069 return (EINVAL); 1070 1071 bp = &dumpbuf; 1072 nb = ldbtob(nblk); 1073 do { 1074 bzero((caddr_t)bp, sizeof (*bp)); 1075 more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc); 1076 nblk = btodb(bp->b_bcount); 1077 mapblk = bp->b_lblkno; 1078 if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) { 1079 /* 1080 * bdev_dump() is currently only able to take 1081 * 32 bit wide blkno's. 1082 */ 1083 result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk, 1084 nblk); 1085 if (result) 1086 saveresult = result; 1087 } 1088 1089 nb -= bp->b_bcount; 1090 addr += bp->b_bcount; 1091 blkno += nblk; 1092 } while (more); 1093 1094 return (saveresult); 1095 } 1096 1097 /*ARGSUSED*/ 1098 static intptr_t 1099 stripe_shared_by_blk( 1100 md_dev64_t dev, 1101 void *junk, 1102 diskaddr_t blkno, 1103 u_longlong_t *cnt) 1104 { 1105 ms_unit_t *un; 1106 buf_t bp; 1107 ms_comp_t *comp; 1108 1109 un = MD_UNIT(md_getminor(dev)); 1110 (void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp); 1111 *cnt = (u_longlong_t)lbtodb(bp.b_bcount); 1112 return ((intptr_t)&comp->un_mirror); 1113 } 1114 1115 /* 1116 * stripe_block_count_skip_size() returns the following values 1117 * so that the logical to physical block mappings can 1118 * be calculated without intimate knowledge of the underpinnings. 1119 * 1120 * block - first logical block number of the device. 1121 * block = [ # of blocks before THE row ] + 1122 * [ # of blocks in THE row before the component ] 1123 * count - # of segments (interlaced size). 1124 * skip - # of logical blocks between segments, or delta to 1125 * get to next segment 1126 * size - interlace size used for the block, count, skip. 1127 */ 1128 /*ARGSUSED*/ 1129 static intptr_t 1130 stripe_block_count_skip_size( 1131 md_dev64_t dev, 1132 void *junk, 1133 int ci, 1134 diskaddr_t *block, 1135 size_t *count, 1136 u_longlong_t *skip, 1137 u_longlong_t *size) 1138 { 1139 ms_unit_t *un; 1140 int row; 1141 struct ms_row *mdr; 1142 int cmpcount = 0; 1143 1144 un = MD_UNIT(md_getminor(dev)); 1145 1146 for (row = 0; row < un->un_nrows; row++) { 1147 mdr = &un->un_row[row]; 1148 if ((mdr->un_ncomp + cmpcount) > ci) 1149 break; 1150 cmpcount += mdr->un_ncomp; 1151 } 1152 ASSERT(row != un->un_nrows); 1153 1154 /* 1155 * Concatenations are always contiguous blocks, 1156 * you cannot depend on the interlace being a usable 1157 * value (except for stripes). 1158 */ 1159 if (mdr->un_ncomp == 1) { /* Concats */ 1160 *block = mdr->un_cum_blocks - mdr->un_blocks; 1161 *count = 1; 1162 *skip = 0; 1163 *size = mdr->un_blocks; 1164 } else { /* Stripes */ 1165 *block = (mdr->un_cum_blocks - mdr->un_blocks) + 1166 ((ci - cmpcount) * mdr->un_interlace); 1167 *count = (size_t)(mdr->un_blocks / (mdr->un_interlace * 1168 mdr->un_ncomp)); 1169 *skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace; 1170 *size = mdr->un_interlace; 1171 } 1172 1173 return (0); 1174 } 1175 1176 /*ARGSUSED*/ 1177 static intptr_t 1178 stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx) 1179 { 1180 ms_unit_t *un; 1181 ms_comp_t *comp; 1182 1183 un = MD_UNIT(md_getminor(dev)); 1184 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 1185 comp += indx; 1186 return ((intptr_t)&comp->un_mirror); 1187 } 1188 1189 /*ARGSUSED*/ 1190 intptr_t 1191 stripe_component_count(md_dev64_t dev, void *junk) 1192 { 1193 /* 1194 * See comments for stripe_get_dev 1195 */ 1196 1197 ms_unit_t *un; 1198 int count = 0; 1199 int row; 1200 1201 un = MD_UNIT(md_getminor(dev)); 1202 for (row = 0; row < un->un_nrows; row++) 1203 count += un->un_row[row].un_ncomp; 1204 return (count); 1205 } 1206 1207 /*ARGSUSED*/ 1208 intptr_t 1209 stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd) 1210 { 1211 /* 1212 * It should be noted that stripe_replace in stripe_ioctl.c calls this 1213 * routine using makedevice(0, minor) for the first argument. 1214 * 1215 * If this routine at some point in the future needs to use the major 1216 * number stripe_replace must be changed. 1217 */ 1218 1219 ms_unit_t *un; 1220 ms_comp_t *comp; 1221 md_dev64_t tmpdev; 1222 1223 un = MD_UNIT(md_getminor(dev)); 1224 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 1225 comp += indx; 1226 tmpdev = comp->un_dev; 1227 /* 1228 * Try to resolve devt again if NODEV64 1229 * Check if this comp is hotspared and if it is 1230 * then use key for hotspare 1231 */ 1232 if (tmpdev == NODEV64) { 1233 tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev, 1234 comp->un_mirror.ms_hs_id ? 1235 comp->un_mirror.ms_hs_key : 1236 comp->un_key); 1237 comp->un_dev = tmpdev; 1238 } 1239 1240 cd->cd_dev = comp->un_dev; 1241 cd->cd_orig_dev = comp->un_mirror.ms_orig_dev; 1242 return (0); 1243 } 1244 1245 /*ARGSUSED*/ 1246 void 1247 stripe_replace_done(md_dev64_t dev, sv_dev_t *sv) 1248 { 1249 /* 1250 * See comments for stripe_get_dev 1251 */ 1252 1253 minor_t mnum = md_getminor(dev); 1254 1255 if (sv != NULL) { 1256 md_rem_names(sv, 1); 1257 kmem_free(sv, sizeof (sv_dev_t)); 1258 } 1259 1260 md_unit_writerexit(MDI_UNIT(mnum)); 1261 } 1262 1263 /*ARGSUSED*/ 1264 intptr_t 1265 stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd, 1266 mddb_recid_t *recids, int nrecids, void (**replace_done)(), 1267 void **replace_data) 1268 { 1269 minor_t mnum; 1270 ms_unit_t *un; 1271 mdi_unit_t *ui; 1272 ms_comp_t *comp; 1273 diskaddr_t dev_size; 1274 int row; 1275 int ncomps = 0; 1276 int cmpcount = 0; 1277 int rid = 0; 1278 struct ms_row *mdr; 1279 sv_dev_t *sv = NULL; 1280 mddb_recid_t hs_id = 0; 1281 set_t setno; 1282 side_t side; 1283 md_dev64_t this_dev; 1284 md_dev64_t old_dev; 1285 1286 mnum = md_getminor(dev); 1287 ui = MDI_UNIT(mnum); 1288 setno = MD_MIN2SET(mnum); 1289 side = mddb_getsidenum(setno); 1290 1291 un = md_unit_writerlock(ui); 1292 1293 *replace_data = NULL; 1294 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 1295 1296 comp += ci; 1297 old_dev = comp->un_dev; 1298 1299 /* 1300 * Count the number of components 1301 */ 1302 for (row = 0; row < un->un_nrows; row++) { 1303 struct ms_row *mdr = &un->un_row[row]; 1304 ncomps += mdr->un_ncomp; 1305 } 1306 1307 recids[0] = 0; 1308 /* 1309 * No need of checking size of new device, 1310 * when hotsparing (it has already been done), or 1311 * when enabling the device. 1312 */ 1313 if ((nd != NULL) && (nd->nd_hs_id == 0)) { 1314 for (row = 0; row < un->un_nrows; row++) { 1315 mdr = &un->un_row[row]; 1316 if ((mdr->un_ncomp + cmpcount) > ci) 1317 break; 1318 cmpcount += mdr->un_ncomp; 1319 } 1320 ASSERT(row != un->un_nrows); 1321 1322 /* Concatenations have a ncomp = 1 */ 1323 dev_size = mdr->un_blocks / mdr->un_ncomp; 1324 1325 /* 1326 * now check to see if new comp can be used in 1327 * place of old comp 1328 */ 1329 if ((un->c.un_flag & MD_LABELED) && (ci == 0) && 1330 nd->nd_labeled) 1331 nd->nd_start_blk = 0; 1332 else 1333 nd->nd_nblks -= nd->nd_start_blk; 1334 1335 if (dev_size > nd->nd_nblks) { 1336 md_unit_writerexit(ui); 1337 return (MDE_COMP_TOO_SMALL); 1338 } 1339 1340 sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP); 1341 sv->setno = MD_MIN2SET(mnum); 1342 sv->key = comp->un_key; 1343 } 1344 1345 /* 1346 * Close this component. 1347 */ 1348 if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) { 1349 md_layered_close(comp->un_dev, MD_OFLG_NULL); 1350 comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN; 1351 } 1352 1353 /* 1354 * If the component is hotspared, return to the pool. 1355 */ 1356 if (comp->un_mirror.ms_hs_id != 0) { 1357 hs_cmds_t cmd; 1358 mdkey_t hs_key; 1359 1360 hs_key = comp->un_mirror.ms_hs_key; 1361 comp->un_dev = comp->un_mirror.ms_orig_dev; 1362 comp->un_start_block = comp->un_mirror.ms_orig_blk; 1363 comp->un_mirror.ms_hs_key = 0; 1364 comp->un_mirror.ms_hs_id = 0; 1365 comp->un_mirror.ms_orig_dev = 0; 1366 1367 cmd = HS_FREE; 1368 if ((comp->un_mirror.ms_state != CS_OKAY) && 1369 (comp->un_mirror.ms_state != CS_RESYNC)) 1370 cmd = HS_BAD; 1371 (void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id, 1372 &hs_key, NULL, NULL); 1373 } 1374 1375 /* 1376 * Open by device id; for enable (indicated by a NULL 1377 * nd pointer), use the existing component info. For 1378 * replace, use the new device. 1379 */ 1380 if (nd == NULL) { 1381 this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key); 1382 /* 1383 * If someone replaced a new disk in the same slot 1384 * we get NODEV64 since old device id cannot be 1385 * resolved. The new devt is obtained from the 1386 * mddb since devt is going to be unchanged for the 1387 * enable case. No need to check for multiple 1388 * keys here because the caller (comp_replace) 1389 * has already sanity checked it for us. 1390 */ 1391 if (this_dev == NODEV64) { 1392 this_dev = md_getdevnum(setno, side, comp->un_key, 1393 MD_TRUST_DEVT); 1394 } 1395 } else { 1396 /* 1397 * If this is a hotspare, save the original dev_t for later 1398 * use. If this has occured during boot then the value of 1399 * comp->un_dev will be NODEV64 because of the failure to look 1400 * up the devid of the device. 1401 */ 1402 if (nd->nd_hs_id != 0) 1403 comp->un_mirror.ms_orig_dev = comp->un_dev; 1404 this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key); 1405 } 1406 1407 comp->un_dev = this_dev; 1408 1409 /* 1410 * Now open the new device if required. Note for a single component 1411 * stripe it will not be open - leave this for the mirror driver to 1412 * deal with. 1413 */ 1414 if (md_unit_isopen(ui)) { 1415 if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) { 1416 mddb_recid_t ids[3]; 1417 1418 ids[0] = un->c.un_record_id; 1419 ids[1] = hs_id; 1420 ids[2] = 0; 1421 mddb_commitrecs_wrapper(ids); 1422 if ((nd != NULL) && (nd->nd_hs_id != 0)) { 1423 /* 1424 * Revert back to the original device. 1425 */ 1426 comp->un_dev = comp->un_mirror.ms_orig_dev; 1427 1428 cmn_err(CE_WARN, 1429 "md: %s: open error of hotspare %s", 1430 md_shortname(mnum), 1431 md_devname(MD_MIN2SET(mnum), nd->nd_dev, 1432 NULL, 0)); 1433 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, 1434 SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev); 1435 } 1436 md_unit_writerexit(ui); 1437 return (MDE_COMP_OPEN_ERR); 1438 } 1439 if (nd != NULL) 1440 nd->nd_dev = this_dev; 1441 1442 comp->un_mirror.ms_flags |= MDM_S_ISOPEN; 1443 } 1444 1445 if (nd == NULL) { 1446 recids[0] = un->c.un_record_id; 1447 recids[1] = hs_id; 1448 recids[2] = 0; 1449 *replace_done = stripe_replace_done; 1450 return (0); 1451 } 1452 1453 /* if hot sparing this device */ 1454 if (nd->nd_hs_id != 0) { 1455 char devname[MD_MAX_CTDLEN]; 1456 char hs_devname[MD_MAX_CTDLEN]; 1457 set_t setno; 1458 1459 comp->un_mirror.ms_hs_id = nd->nd_hs_id; 1460 comp->un_mirror.ms_hs_key = nd->nd_key; 1461 1462 comp->un_mirror.ms_orig_blk = comp->un_start_block; 1463 1464 setno = MD_MIN2SET(mnum); 1465 1466 (void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname, 1467 sizeof (devname)); 1468 (void) md_devname(setno, nd->nd_dev, hs_devname, 1469 sizeof (hs_devname)); 1470 1471 cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s", 1472 md_shortname(mnum), devname, hs_devname); 1473 1474 } else { /* replacing the device */ 1475 comp->un_key = nd->nd_key; 1476 *replace_data = (void *)sv; 1477 1478 /* 1479 * For the old device, make sure to reset the parent 1480 * if it's a metadevice. 1481 */ 1482 if (md_getmajor(comp->un_dev) == md_major) { 1483 minor_t comp_mnum = md_getminor(old_dev); 1484 md_unit_t *comp_un = MD_UNIT(comp_mnum); 1485 1486 md_reset_parent(old_dev); 1487 recids[rid++] = MD_RECID(comp_un); 1488 } 1489 } 1490 1491 comp->un_dev = nd->nd_dev; 1492 comp->un_start_block = nd->nd_start_blk; 1493 1494 /* 1495 * For the new device, make sure to set the parent if it's a 1496 * metadevice. 1497 * 1498 * If we ever support using metadevices as hot spares, this 1499 * will need to be tested, and possibly moved into the 1500 * preceding "else" clause, immediately following the parent 1501 * reset block. For now, it's convenient to leave it here and 1502 * only compress nd->nd_dev once. 1503 */ 1504 if (md_getmajor(comp->un_dev) == md_major) { 1505 minor_t comp_mnum = md_getminor(comp->un_dev); 1506 md_unit_t *comp_un = MD_UNIT(comp_mnum); 1507 1508 md_set_parent(comp->un_dev, MD_SID(un)); 1509 recids[rid++] = MD_RECID(comp_un); 1510 } 1511 1512 recids[rid++] = un->c.un_record_id; 1513 recids[rid++] = hs_id; 1514 recids[rid] = 0; 1515 *replace_done = stripe_replace_done; 1516 return (0); 1517 } 1518 1519 /*ARGSUSED*/ 1520 static intptr_t 1521 stripe_hotspare_dev( 1522 md_dev64_t dev, 1523 void *junk, 1524 int ci, 1525 mddb_recid_t *recids, 1526 int nrecids, 1527 void (**replace_done)(), 1528 void **replace_data) 1529 { 1530 ms_unit_t *un; 1531 mdi_unit_t *ui; 1532 ms_comp_t *comp; 1533 int row; 1534 struct ms_row *mdr; 1535 ms_new_dev_t nd; 1536 int err; 1537 int i; 1538 minor_t mnum; 1539 set_t setno; 1540 int cmpcount = 0; 1541 1542 mnum = md_getminor(dev); 1543 ui = MDI_UNIT(mnum); 1544 un = MD_UNIT(mnum); 1545 setno = MD_MIN2SET(mnum); 1546 1547 if (md_get_setstatus(setno) & MD_SET_STALE) 1548 return (1); 1549 1550 if (un->un_hsp_id == -1) 1551 return (1); 1552 1553 for (row = 0; row < un->un_nrows; row++) { 1554 mdr = &un->un_row[row]; 1555 if ((mdr->un_ncomp + cmpcount) > ci) 1556 break; 1557 cmpcount += mdr->un_ncomp; 1558 } 1559 ASSERT(row != un->un_nrows); 1560 1561 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); 1562 comp += ci; 1563 /* Concatenations have a ncomp = 1 */ 1564 nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp; 1565 1566 if ((un->c.un_flag & MD_LABELED) && (ci == 0)) 1567 nd.nd_labeled = 1; 1568 else 1569 nd.nd_labeled = 0; 1570 1571 again: 1572 err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks, 1573 nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev, 1574 &nd.nd_start_blk); 1575 1576 if (err) { 1577 if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids, 1578 replace_done, replace_data)) { 1579 mddb_commitrecs_wrapper(recids); 1580 md_unit_writerexit(ui); 1581 } 1582 recids[0] = 0; 1583 return (1); 1584 } 1585 1586 if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids, 1587 replace_done, replace_data)) { 1588 1589 (void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0, 1590 &nd.nd_hs_id, &nd.nd_key, NULL, NULL); 1591 mddb_commitrec_wrapper(nd.nd_hs_id); 1592 goto again; 1593 } 1594 1595 /* Leave a slot for the null recid */ 1596 for (i = 0; i < (nrecids - 1); i++) { 1597 if (recids[i] == 0) { 1598 recids[i++] = nd.nd_hs_id; 1599 recids[i] = 0; 1600 } 1601 } 1602 return (0); 1603 } 1604 1605 static int 1606 stripe_imp_set( 1607 set_t setno 1608 ) 1609 { 1610 1611 mddb_recid_t recid; 1612 int i, row, c, gotsomething; 1613 mddb_type_t typ1; 1614 mddb_de_ic_t *dep; 1615 mddb_rb32_t *rbp; 1616 ms_unit32_od_t *un32; 1617 ms_unit_t *un64; 1618 md_dev64_t self_devt; 1619 minor_t *self_id; /* minor needs to be updated */ 1620 md_parent_t *parent_id; /* parent needs to be updated */ 1621 mddb_recid_t *record_id; /* record id needs to be updated */ 1622 mddb_recid_t *hsp_id; 1623 ms_comp32_od_t *comp32; 1624 ms_comp_t *comp64; 1625 1626 1627 gotsomething = 0; 1628 1629 typ1 = (mddb_type_t)md_getshared_key(setno, 1630 stripe_md_ops.md_driver.md_drivername); 1631 recid = mddb_makerecid(setno, 0); 1632 1633 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { 1634 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) 1635 continue; 1636 1637 dep = mddb_getrecdep(recid); 1638 rbp = dep->de_rb; 1639 1640 switch (rbp->rb_revision) { 1641 case MDDB_REV_RB: 1642 case MDDB_REV_RBFN: 1643 /* 1644 * Small device 1645 */ 1646 un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid); 1647 self_id = &(un32->c.un_self_id); 1648 parent_id = &(un32->c.un_parent); 1649 record_id = &(un32->c.un_record_id); 1650 hsp_id = &(un32->un_hsp_id); 1651 1652 comp32 = (ms_comp32_od_t *) 1653 ((void *)&((char *)un32)[un32->un_ocomp]); 1654 for (row = 0; row < un32->un_nrows; row++) { 1655 struct ms_row32_od *mdr = &un32->un_row[row]; 1656 for (i = 0, c = mdr->un_icomp; 1657 i < mdr->un_ncomp; i++) { 1658 ms_comp32_od_t *mdc; 1659 1660 mdc = &comp32[c++]; 1661 1662 if (!md_update_minor(setno, 1663 mddb_getsidenum(setno), 1664 mdc->un_key)) 1665 goto out; 1666 1667 if (mdc->un_mirror.ms_hs_id != 0) 1668 mdc->un_mirror.ms_hs_id = 1669 MAKERECID(setno, 1670 mdc->un_mirror.ms_hs_id); 1671 } 1672 } 1673 break; 1674 case MDDB_REV_RB64: 1675 case MDDB_REV_RB64FN: 1676 un64 = (ms_unit_t *)mddb_getrecaddr(recid); 1677 self_id = &(un64->c.un_self_id); 1678 parent_id = &(un64->c.un_parent); 1679 record_id = &(un64->c.un_record_id); 1680 hsp_id = &(un64->un_hsp_id); 1681 1682 comp64 = (ms_comp_t *) 1683 ((void *)&((char *)un64)[un64->un_ocomp]); 1684 for (row = 0; row < un64->un_nrows; row++) { 1685 struct ms_row *mdr = &un64->un_row[row]; 1686 1687 for (i = 0, c = mdr->un_icomp; 1688 i < mdr->un_ncomp; i++) { 1689 ms_comp_t *mdc; 1690 1691 mdc = &comp64[c++]; 1692 1693 if (!md_update_minor(setno, 1694 mddb_getsidenum(setno), 1695 mdc->un_key)) 1696 goto out; 1697 1698 if (mdc->un_mirror.ms_hs_id != 0) 1699 mdc->un_mirror.ms_hs_id = 1700 MAKERECID(setno, 1701 mdc->un_mirror.ms_hs_id); 1702 } 1703 } 1704 break; 1705 } 1706 1707 /* 1708 * If this is a top level and a friendly name metadevice, 1709 * update its minor in the namespace. 1710 */ 1711 if ((*parent_id == MD_NO_PARENT) && 1712 ((rbp->rb_revision == MDDB_REV_RBFN) || 1713 (rbp->rb_revision == MDDB_REV_RB64FN))) { 1714 1715 self_devt = md_makedevice(md_major, *self_id); 1716 if (!md_update_top_device_minor(setno, 1717 mddb_getsidenum(setno), self_devt)) 1718 goto out; 1719 } 1720 1721 /* 1722 * Update unit with the imported setno 1723 * 1724 */ 1725 mddb_setrecprivate(recid, MD_PRV_GOTIT); 1726 1727 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id)); 1728 1729 if (*hsp_id != -1) 1730 *hsp_id = MAKERECID(setno, DBID(*hsp_id)); 1731 1732 if (*parent_id != MD_NO_PARENT) 1733 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id)); 1734 *record_id = MAKERECID(setno, DBID(*record_id)); 1735 1736 gotsomething = 1; 1737 } 1738 1739 out: 1740 return (gotsomething); 1741 } 1742 1743 static md_named_services_t stripe_named_services[] = { 1744 {stripe_shared_by_blk, "shared by blk" }, 1745 {stripe_shared_by_indx, "shared by indx" }, 1746 {stripe_component_count, "get component count" }, 1747 {stripe_block_count_skip_size, "get block count skip size" }, 1748 {stripe_get_dev, "get device" }, 1749 {stripe_replace_dev, "replace device" }, 1750 {stripe_hotspare_dev, "hotspare device" }, 1751 {stripe_rename_check, MDRNM_CHECK }, 1752 {NULL, 0} 1753 }; 1754 1755 md_ops_t stripe_md_ops = { 1756 stripe_open, /* open */ 1757 stripe_close, /* close */ 1758 md_stripe_strategy, /* strategy */ 1759 NULL, /* print */ 1760 stripe_dump, /* dump */ 1761 NULL, /* read */ 1762 NULL, /* write */ 1763 md_stripe_ioctl, /* stripe_ioctl, */ 1764 stripe_snarf, /* stripe_snarf */ 1765 stripe_halt, /* stripe_halt */ 1766 NULL, /* aread */ 1767 NULL, /* awrite */ 1768 stripe_imp_set, /* import set */ 1769 stripe_named_services 1770 }; 1771 1772 static void 1773 init_init() 1774 { 1775 md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t); 1776 1777 stripe_parent_cache = kmem_cache_create("md_stripe_parent", 1778 sizeof (md_sps_t), 0, stripe_parent_constructor, 1779 stripe_parent_destructor, stripe_run_queue, NULL, NULL, 1780 0); 1781 stripe_child_cache = kmem_cache_create("md_stripe_child", 1782 sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0, 1783 stripe_child_constructor, stripe_child_destructor, 1784 stripe_run_queue, NULL, NULL, 0); 1785 } 1786 1787 static void 1788 fini_uninit() 1789 { 1790 kmem_cache_destroy(stripe_parent_cache); 1791 kmem_cache_destroy(stripe_child_cache); 1792 stripe_parent_cache = stripe_child_cache = NULL; 1793 } 1794 1795 /* define the module linkage */ 1796 MD_PLUGIN_MISC_MODULE("stripes module", init_init(), fini_uninit()) 1797