/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdio.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_stripe.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>
#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t stripe_md_ops;
#ifndef lint
char _depends_on[] = "drv/md";
md_ops_t *md_interface_ops = &stripe_md_ops;
#endif

extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];

extern kmutex_t md_mx;
extern kcondvar_t md_cv;

extern int md_status;
extern major_t md_major;
extern mdq_anchor_t md_done_daemon;

static int md_stripe_mcs_buf_off;
static kmem_cache_t *stripe_parent_cache = NULL;
static kmem_cache_t *stripe_child_cache = NULL;

/*ARGSUSED1*/
static int
stripe_parent_constructor(void *p, void *d1, int d2)
{
    mutex_init(&((md_sps_t *)p)->ps_mx,
        NULL, MUTEX_DEFAULT, NULL);
    return (0);
}

static void
stripe_parent_init(void *ps)
{
    bzero(ps, offsetof(md_sps_t, ps_mx));
}

/*ARGSUSED1*/
static void
stripe_parent_destructor(void *p, void *d)
{
    mutex_destroy(&((md_sps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
stripe_child_constructor(void *p, void *d1, int d2)
{
    bioinit(&((md_scs_t *)p)->cs_buf);
    return (0);
}

static void
stripe_child_init(md_scs_t *cs)
{
    cs->cs_mdunit = 0;
    cs->cs_ps = NULL;
    cs->cs_comp = NULL;
    md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
stripe_child_destructor(void *p, void *d)
{
    biofini(&((md_scs_t *)p)->cs_buf);
}

/*ARGSUSED*/
static void
stripe_run_queue(void *d)
{
    if (!(md_status & MD_GBL_DAEMONS_LIVE))
        md_daemon(1, &md_done_daemon);
}

static void
stripe_close_all_devs(ms_unit_t *un, int md_cflags)
{
    int row;
    int i;
    int c;
    struct ms_comp *mdcomp;

    mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    for (row = 0; row < un->un_nrows; row++) {
        struct ms_row *mdr = &un->un_row[row];
        for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
            struct ms_comp *mdc;

            mdc = &mdcomp[c++];
            if (md_cflags & MD_OFLG_PROBEDEV) {
                /*
                 * It is possible that the md_layered_open
                 * failed because the stripe unit structure
                 * contained a NODEV. In such a case, since
                 * there is nothing to open, there is nothing
                 * to close.
                 */
                if (mdc->un_dev == NODEV64)
                    continue;
            }
            if ((md_cflags & MD_OFLG_PROBEDEV) &&
                (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) {
                md_layered_close(mdc->un_dev,
                    md_cflags);
                mdc->un_mirror.ms_flags &=
                    ~MDM_S_PROBEOPEN;
            } else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) {
                md_layered_close(mdc->un_dev, md_cflags);
                mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
            }
        }
    }
}

static int
stripe_open_all_devs(ms_unit_t *un, int md_oflags)
{
    minor_t mnum = MD_SID(un);
    int row;
    int i;
    int c;
    struct ms_comp *mdcomp;
    int err;
    int cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS);
    int probe_err_cnt = 0;
    int total_comp_cnt = 0;
    set_t setno = MD_MIN2SET(MD_SID(un));
    side_t side = mddb_getsidenum(setno);
    mdkey_t key;

    mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);

    /*
     * For a probe call, if any component of a stripe or a concat
     * can be opened, it is considered to be a success. The total number
     * of components in a stripe is computed prior to starting a probe.
     * This number is then compared against the number of components
     * that could be successfully opened. Only if none of the components
     * in a stripe can be opened is ENXIO returned for a probe type open.
     */

    for (row = 0; row < un->un_nrows; row++) {
        struct ms_row *mdr = &un->un_row[row];

        if (md_oflags & MD_OFLG_PROBEDEV)
            total_comp_cnt += mdr->un_ncomp;

        for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
            struct ms_comp *mdc;
            md_dev64_t tmpdev;

            mdc = &mdcomp[c++];
            tmpdev = mdc->un_dev;
            /*
             * Do the open by device id.
             * Check if this comp is hotspared and,
             * if it is, then use the key for the hotspare.
             * MN disksets don't use devids, so we should not use
             * md_devid_found/md_resolve_bydevid there. Rather, do
             * what's done in stripe_build_incore().
             */
            if (MD_MNSET_SETNO(setno)) {
                if (mdc->un_mirror.ms_hs_id != 0) {
                    (void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
                        0, &mdc->un_mirror.ms_hs_id, NULL,
                        &tmpdev, NULL);
                }
            } else {
                key = mdc->un_mirror.ms_hs_id ?
                    mdc->un_mirror.ms_hs_key : mdc->un_key;
                if ((md_getmajor(tmpdev) != md_major) &&
                    md_devid_found(setno, side, key) == 1) {
                    tmpdev = md_resolve_bydevid(mnum,
                        tmpdev, key);
                }
            }

            /*
             * For a submirror, we only want to open those devices
             * that are not errored. If the device is errored, then
             * there is no reason to open it and leaving it
             * closed allows the RCM/DR code to work so that the
             * errored device can be replaced.
             */
            if ((md_oflags & MD_OFLG_PROBEDEV) ||
                !(mdc->un_mirror.ms_state & CS_ERRED)) {
                err = md_layered_open(mnum, &tmpdev, md_oflags);
            } else {
                err = ENXIO;
            }

            /*
             * Only set the un_dev if the tmpdev != NODEV64. If
             * it is NODEV64, then the md_layered_open() will have
             * failed in some manner.
             */
            if (tmpdev != NODEV64)
                mdc->un_dev = tmpdev;

            if (err) {
                if (!cont_on_errors) {
                    stripe_close_all_devs(un, md_oflags);
                    return (ENXIO);
                }

                if (md_oflags & MD_OFLG_PROBEDEV)
                    probe_err_cnt++;
            } else {
                if (md_oflags & MD_OFLG_PROBEDEV) {
                    mdc->un_mirror.ms_flags |=
                        MDM_S_PROBEOPEN;
                } else
                    mdc->un_mirror.ms_flags |= MDM_S_ISOPEN;
            }
        }
    }

    /* If no component of the stripe could be opened, fail */
    if ((md_oflags & MD_OFLG_PROBEDEV) &&
        (probe_err_cnt == total_comp_cnt))
        return (ENXIO);
    else
        return (0);
}

int
stripe_build_incore(void *p, int snarfing)
{
    ms_unit_t *un = (ms_unit_t *)p;
    struct ms_comp *mdcomp;
    minor_t mnum;
    int row;
    int i;
    int c;
    int ncomps;

    mnum = MD_SID(un);

    if (MD_UNIT(mnum) != NULL)
        return (0);

    MD_STATUS(un) = 0;

    /*
     * Reset all the is_open flags; these are probably set
     * because they just came out of the database.
     */
    mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);

    ncomps = 0;
    for (row = 0; row < un->un_nrows; row++) {
        struct ms_row *mdr = &un->un_row[row];
        ncomps += mdr->un_ncomp;
    }

    for (row = 0; row < un->un_nrows; row++) {
        struct ms_row *mdr = &un->un_row[row];
        for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
            struct ms_comp *mdc;
            set_t setno;
            md_dev64_t tmpdev;

            mdc = &mdcomp[c++];
            mdc->un_mirror.ms_flags &=
                ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED);

            if (!snarfing)
                continue;

            setno = MD_MIN2SET(mnum);

            tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
                mdc->un_key, MD_NOTRUST_DEVT);
            mdc->un_dev = tmpdev;
            /*
             * Check for hotspares. If the hotspares haven't been
             * snarfed yet, stripe_open_all_devs() will do the
             * remapping of the dev's later.
             */
            if (mdc->un_mirror.ms_hs_id != 0) {
                mdc->un_mirror.ms_orig_dev = mdc->un_dev;
                (void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
                    0, &mdc->un_mirror.ms_hs_id, NULL,
                    &tmpdev, NULL);
                mdc->un_dev = tmpdev;
            }
        }
    }

    MD_UNIT(mnum) = un;
    return (0);
}

void
reset_stripe(ms_unit_t *un, minor_t mnum, int removing)
{
    ms_comp_t *mdcomp;
    struct ms_row *mdr;
    int i, c;
    int row;
    int nsv;
    int isv;
    sv_dev_t *sv;
    mddb_recid_t *recids;
    mddb_recid_t vtoc_id;
    int rid = 0;

    md_destroy_unit_incore(mnum, &stripe_md_ops);

    MD_UNIT(mnum) = NULL;

    /*
     * Attempt release of its minor node
     */
    md_remove_minor_node(mnum);

    if (!removing)
        return;

    nsv = 0;
    /* Count the number of devices */
    for (row = 0; row < un->un_nrows; row++) {
        mdr = &un->un_row[row];
        nsv += mdr->un_ncomp;
    }
    sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP);

    /*
     * Allocate the recids array. Since we may have to commit
     * underlying soft partition records, we need an array
     * of size: total number of components in stripe + 3
     * (one for the stripe itself, one for the hotspare, one
     * for the end marker).
     */
    recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP);

    /*
     * Save the md_dev64_t's and driver nm indexes, because after
     * the mddb_deleterec() we will not be able to access the
     * unit structure.
     *
     * NOTE: Deleting the names before deleting the
     *       unit structure would cause problems if
     *       the machine crashed in between the two.
     */
    isv = 0;
    mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);

    for (row = 0; row < un->un_nrows; row++) {
        mdr = &un->un_row[row];
        for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
            struct ms_comp *mdc;
            md_dev64_t child_dev;
            md_unit_t *child_un;

            mdc = &mdcomp[c++];
            if (mdc->un_mirror.ms_hs_id != 0) {
                mdkey_t hs_key;

                hs_key = mdc->un_mirror.ms_hs_key;

                mdc->un_dev = mdc->un_mirror.ms_orig_dev;
                mdc->un_start_block =
                    mdc->un_mirror.ms_orig_blk;
                mdc->un_mirror.ms_hs_id = 0;
                mdc->un_mirror.ms_hs_key = 0;
                mdc->un_mirror.ms_orig_dev = 0;
                recids[0] = 0;
                recids[1] = 0;  /* recids[1] filled in below */
                recids[2] = 0;
                (void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id,
                    0, 0, &recids[0], &hs_key, NULL, NULL);
                mddb_commitrecs_wrapper(recids);
            }

            /*
             * Check if we've got a metadevice below us and
             * deparent it if we do.
             * NOTE: currently soft partitions are the
             * only metadevices stripes can be
             * built on top of.
             */
            child_dev = mdc->un_dev;
            if (md_getmajor(child_dev) == md_major) {
                child_un = MD_UNIT(md_getminor(child_dev));
                md_reset_parent(child_dev);
                recids[rid++] = MD_RECID(child_un);
            }

            sv[isv].setno = MD_MIN2SET(mnum);
            sv[isv++].key = mdc->un_key;
        }
    }

    recids[rid++] = un->c.un_record_id;
    recids[rid] = 0;    /* filled in below */

    /*
     * Decrement the HSP reference count and
     * remove the knowledge of the HSP from the unit struct.
     * This is done atomically to remove a window.
     */
    if (un->un_hsp_id != -1) {
        (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
            &recids[rid++], NULL, NULL, NULL);
        un->un_hsp_id = -1;
    }

    /* set end marker and commit records */
    recids[rid] = 0;
    mddb_commitrecs_wrapper(recids);

    vtoc_id = un->c.un_vtoc_id;

    /*
     * Remove self from the namespace
     */
    if (un->c.un_revision & MD_FN_META_DEV) {
        (void) md_rem_selfname(un->c.un_self_id);
    }

    /* Remove the unit structure */
    mddb_deleterec_wrapper(un->c.un_record_id);

    /* Remove the vtoc, if present */
    if (vtoc_id)
        mddb_deleterec_wrapper(vtoc_id);

    SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
        MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
    md_rem_names(sv, nsv);
    kmem_free(sv, sizeof (sv_dev_t) * nsv);
    kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3));
}

static void
stripe_error(md_sps_t *ps)
{
    struct buf *pb = ps->ps_bp;
    mdi_unit_t *ui = ps->ps_ui;
    md_dev64_t dev = ps->ps_errcomp->un_dev;
    md_dev64_t md_dev = md_expldev(pb->b_edev);
    char *str;

    if (pb->b_flags & B_READ) {
        ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR;
        str = "read";
    } else {
        ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR;
        str = "write";
    }
    if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
        if (MUTEX_HELD(&ps->ps_mx)) {
            mutex_exit(&ps->ps_mx);
        }
    } else {
        ASSERT(panicstr);
    }
    SPS_FREE(stripe_parent_cache, ps);
    pb->b_flags |= B_ERROR;

    md_kstat_done(ui, pb, 0);
    md_unit_readerexit(ui);
    md_biodone(pb);

    cmn_err(CE_WARN, "md: %s: %s error on %s",
        md_shortname(md_getminor(md_dev)), str,
        md_devname(MD_DEV2SET(md_dev), dev, NULL, 0));
}

static int
stripe_done(struct buf *cb)
{
    struct buf *pb;
    mdi_unit_t *ui;
    md_sps_t *ps;
    md_scs_t *cs;

    /*LINTED*/
    cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off);
    ps = cs->cs_ps;
    pb = ps->ps_bp;

    mutex_enter(&ps->ps_mx);
    if (cb->b_flags & B_ERROR) {
        ps->ps_flags |= MD_SPS_ERROR;
        pb->b_error = cb->b_error;
        ps->ps_errcomp = cs->cs_comp;
    }

    if (cb->b_flags & B_REMAPPED)
        bp_mapout(cb);

    ps->ps_frags--;
    if (ps->ps_frags != 0) {
        mutex_exit(&ps->ps_mx);
        kmem_cache_free(stripe_child_cache, cs);
        return (1);
    }
    kmem_cache_free(stripe_child_cache, cs);
    if (ps->ps_flags & MD_SPS_ERROR) {
        stripe_error(ps);
        return (1);
    }
    ui = ps->ps_ui;
    if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
        mutex_exit(&ps->ps_mx);
    } else {
        ASSERT(panicstr);
    }
    SPS_FREE(stripe_parent_cache, ps);
    md_kstat_done(ui, pb, 0);
    md_unit_readerexit(ui);
    md_biodone(pb);
    return (0);
}


/*
 * This routine does the mapping from the virtual (dev, blkno) of a
 * metapartition to the real (dev, blkno) of a real disk partition.
 * It goes to the md_conf[] table to find out the correct real partition
 * dev and block number for this buffer.
 *
 * A single buf request cannot go across a real disk partition boundary.
 * When the virtual request specified by (dev, blkno) spans more than one
 * real partition, md_mapbuf will return 1. The caller should then prepare
 * another real buf and continue calling md_mapbuf to do the mapping until
 * it returns 0.
 */

static int
md_mapbuf(
    ms_unit_t *un,
    diskaddr_t blkno,
    u_longlong_t bcount,
    buf_t *bp,          /* if bp==NULL, skip bp updates */
    ms_comp_t **mdc)    /* if bp==NULL, skip mdc update */
{
    struct ms_row *mdr;
    struct ms_comp *mdcomp;
    diskaddr_t stripe_blk;
    diskaddr_t fragment, blk_in_row, endblk;
    offset_t interlace;
    size_t dev_index;
    int row_index, more;
    extern unsigned md_maxphys;
    /* Work var's when bp==NULL */
    u_longlong_t wb_bcount;
    diskaddr_t wb_blkno;
    md_dev64_t wb_edev;
    ms_comp_t *wmdc;

    /*
     * Do a real calculation to derive the minor device of the
     * Virtual Disk, which in turn will let us derive the
     * device/minor of the underlying real device.
     */

    for (row_index = 0; row_index < un->un_nrows; row_index++) {
        mdr = &un->un_row[row_index];
        if (blkno < mdr->un_cum_blocks)
            break;
    }
    ASSERT(row_index != un->un_nrows);

    mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);

    blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks;
    endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE));
    if (mdr->un_ncomp == 1) {       /* No striping */
        if (endblk > mdr->un_cum_blocks) {
            wb_bcount = ldbtob(mdr->un_cum_blocks - blkno);
            if ((row_index + 1) == un->un_nrows)
                more = 0;
            else
                more = 1;
        } else {
            wb_bcount = bcount;
            more = 0;
        }
        wmdc = &mdcomp[mdr->un_icomp];
        wb_blkno = blk_in_row;
    } else {                        /* Have striping */
        interlace = mdr->un_interlace;
        fragment = blk_in_row % interlace;
        if (bcount > ldbtob(interlace - fragment)) {
            more = 1;
            wb_bcount = ldbtob(interlace - fragment);
        } else {
            more = 0;
            wb_bcount = bcount;
        }

        stripe_blk = blk_in_row / interlace;
        dev_index = (size_t)(stripe_blk % mdr->un_ncomp);
        wmdc = &mdcomp[mdr->un_icomp + dev_index];
        wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp)
            * interlace) + fragment);
    }

    wb_blkno += wmdc->un_start_block;
    wb_edev = wmdc->un_dev;

    /* only break up the I/O if we're not built on another metadevice */
    if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) {
        wb_bcount = md_maxphys;
        more = 1;
    }
    if (bp != (buf_t *)NULL) {
        /*
         * wb_bcount is limited by md_maxphys which is 'int'
         */
        bp->b_bcount = (size_t)wb_bcount;
        bp->b_lblkno = wb_blkno;
        bp->b_edev = md_dev64_to_dev(wb_edev);
        *mdc = wmdc;
    }
    return (more);
}
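
/*
 * Illustrative note (added): a hypothetical worked example of the striping
 * arithmetic above, not taken from the original source. Assume a striped
 * row with mdr->un_ncomp = 3 components, an interlace of 32 blocks, and a
 * request starting at blk_in_row = 100:
 *
 *     fragment   = 100 % 32 = 4
 *     stripe_blk = 100 / 32 = 3
 *     dev_index  = 3 % 3    = 0
 *     wb_blkno   = (3 / 3) * 32 + 4 = 36   (plus wmdc->un_start_block)
 *
 * So the request maps to block 36 of component 0. If bcount exceeds
 * ldbtob(32 - 4) bytes, md_mapbuf() truncates this fragment and returns 1,
 * and the caller calls back for the remainder.
 */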

static void
md_stripe_strategy(buf_t *pb, int flag, void *private)
{
    md_sps_t *ps;
    md_scs_t *cs;
    int doing_writes;
    int more;
    ms_unit_t *un;
    mdi_unit_t *ui;
    size_t current_count;
    diskaddr_t current_blkno;
    off_t current_offset;
    buf_t *cb;      /* child buf pointer */
    set_t setno;

    setno = MD_MIN2SET(getminor(pb->b_edev));

    /*
     * When doing IO to a multi owner meta device, check if the set is
     * halted. We do this check without the needed lock held, for
     * performance reasons.
     * If an IO just slips through while the set is locked via an
     * MD_MN_SUSPEND_SET, we don't care about it.
     * Only check for a suspended set if we are a top-level i/o request
     * (MD_STR_NOTTOP is cleared in 'flag').
     */
    if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
        (MD_SET_HALTED | MD_SET_MNSET)) {
        if ((flag & MD_STR_NOTTOP) == 0) {
            mutex_enter(&md_mx);
            /* Here we loop until the set is no longer halted */
            while (md_set[setno].s_status & MD_SET_HALTED) {
                cv_wait(&md_cv, &md_mx);
            }
            mutex_exit(&md_mx);
        }
    }

    ui = MDI_UNIT(getminor(pb->b_edev));

    md_kstat_waitq_enter(ui);

    un = (ms_unit_t *)md_unit_readerlock(ui);

    if ((flag & MD_NOBLOCK) == 0) {
        if (md_inc_iocount(setno) != 0) {
            pb->b_flags |= B_ERROR;
            pb->b_error = ENXIO;
            pb->b_resid = pb->b_bcount;
            md_kstat_waitq_exit(ui);
            md_unit_readerexit(ui);
            biodone(pb);
            return;
        }
    } else {
        md_inc_iocount_noblock(setno);
    }

    if (!(flag & MD_STR_NOTTOP)) {
        if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) {
            md_kstat_waitq_exit(ui);
            return;
        }
    }

    ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS);
    stripe_parent_init(ps);

    /*
     * Save essential information from the original buf header
     * in the md_save structure.
     */
    ps->ps_un = un;
    ps->ps_ui = ui;
    ps->ps_bp = pb;
    ps->ps_addr = pb->b_un.b_addr;

    if ((pb->b_flags & B_READ) == 0)
        doing_writes = 1;
    else
        doing_writes = 0;


    current_count = pb->b_bcount;
    current_blkno = pb->b_lblkno;
    current_offset = 0;

    if (!(flag & MD_STR_NOTTOP) && panicstr)
        ps->ps_flags |= MD_SPS_DONTFREE;

    md_kstat_waitq_to_runq(ui);

    ps->ps_frags++;
    do {
        cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS);
        stripe_child_init(cs);
        cb = &cs->cs_buf;
        cs->cs_ps = ps;
        more = md_mapbuf(un, current_blkno, current_count, cb,
            &cs->cs_comp);

        cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev,
            cb->b_lblkno, stripe_done, cb, KM_NOSLEEP);
        /*
         * Do these calculations now,
         * so that we pick up a valid b_bcount from the child bp.
         */
        current_offset += cb->b_bcount;
        current_count -= cb->b_bcount;
        current_blkno += (diskaddr_t)(lbtodb(cb->b_bcount));

        if (more) {
            mutex_enter(&ps->ps_mx);
            ps->ps_frags++;
            mutex_exit(&ps->ps_mx);
        }

        if (doing_writes &&
            cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) {
            (void) stripe_done(cb);
            continue;
        }
        md_call_strategy(cb, flag, private);
    } while (more);

    if (!(flag & MD_STR_NOTTOP) && panicstr) {
        while (!(ps->ps_flags & MD_SPS_DONE)) {
            md_daemon(1, &md_done_daemon);
            drv_usecwait(10);
        }
        kmem_cache_free(stripe_parent_cache, ps);
    }
}

static int
stripe_snarf(md_snarfcmd_t cmd, set_t setno)
{
    ms_unit_t *un;
    mddb_recid_t recid;
    int gotsomething;
    int all_stripes_gotten;
    mddb_type_t typ1;
    mddb_de_ic_t *dep;
    mddb_rb32_t *rbp;
    size_t newreqsize;
    ms_unit_t *big_un;
    ms_unit32_od_t *small_un;


    if (cmd == MD_SNARF_CLEANUP)
        return (0);

    all_stripes_gotten = 1;
    gotsomething = 0;

    typ1 = (mddb_type_t)md_getshared_key(setno,
        stripe_md_ops.md_driver.md_drivername);
    recid = mddb_makerecid(setno, 0);

    while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
        if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
            continue;

        dep = mddb_getrecdep(recid);
        dep->de_flags = MDDB_F_STRIPE;
        rbp = dep->de_rb;

        switch (rbp->rb_revision) {
        case MDDB_REV_RB:
        case MDDB_REV_RBFN:
            if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
                /*
                 * This means we have an old and small record
                 * and this record hasn't already been
                 * converted. Before we create an incore
                 * metadevice from this we have to convert it to
                 * a big record.
                 */
                small_un =
                    (ms_unit32_od_t *)mddb_getrecaddr(recid);
                newreqsize = get_big_stripe_req_size(small_un,
                    COMPLETE_STRUCTURE);
                big_un = (ms_unit_t *)kmem_zalloc(newreqsize,
                    KM_SLEEP);
                stripe_convert((caddr_t)small_un,
                    (caddr_t)big_un, SMALL_2_BIG);
                kmem_free(small_un, dep->de_reqsize);
                dep->de_rb_userdata = big_un;
                dep->de_reqsize = newreqsize;
                un = big_un;
                rbp->rb_private |= MD_PRV_CONVD;
            } else {
                /* Small device had already been converted */
                un = (ms_unit_t *)mddb_getrecaddr(recid);
            }
            un->c.un_revision &= ~MD_64BIT_META_DEV;
            break;
        case MDDB_REV_RB64:
        case MDDB_REV_RB64FN:
            /* Big device */
            un = (ms_unit_t *)mddb_getrecaddr(recid);
            un->c.un_revision |= MD_64BIT_META_DEV;
            un->c.un_flag |= MD_EFILABEL;
            break;
        }
        MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

        /* Create minor node for snarfed unit. */
        (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));

        if (MD_UNIT(MD_SID(un)) != NULL) {
            mddb_setrecprivate(recid, MD_PRV_PENDDEL);
            continue;
        }
        all_stripes_gotten = 0;
        if (stripe_build_incore((void *)un, 1) == 0) {
            mddb_setrecprivate(recid, MD_PRV_GOTIT);
            md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0);
            gotsomething = 1;
        }
    }

    if (!all_stripes_gotten)
        return (gotsomething);

    recid = mddb_makerecid(setno, 0);
    while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
        if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
            mddb_setrecprivate(recid, MD_PRV_PENDDEL);

    return (0);
}

static int
stripe_halt(md_haltcmd_t cmd, set_t setno)
{
    int i;
    mdi_unit_t *ui;
    minor_t mnum;

    if (cmd == MD_HALT_CLOSE)
        return (0);

    if (cmd == MD_HALT_OPEN)
        return (0);

    if (cmd == MD_HALT_UNLOAD)
        return (0);

    if (cmd == MD_HALT_CHECK) {
        for (i = 0; i < md_nunits; i++) {
            mnum = MD_MKMIN(setno, i);
            if ((ui = MDI_UNIT(mnum)) == NULL)
                continue;
            if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
                continue;
            if (md_unit_isopen(ui))
                return (1);
        }
        return (0);
    }

    if (cmd != MD_HALT_DOIT)
        return (1);

    for (i = 0; i < md_nunits; i++) {
        mnum = MD_MKMIN(setno, i);
        if ((ui = MDI_UNIT(mnum)) == NULL)
            continue;
        if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
            continue;
        reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0);
    }

    return (0);
}

/*ARGSUSED3*/
static int
stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
{
    minor_t mnum = getminor(*dev);
    mdi_unit_t *ui = MDI_UNIT(mnum);
    ms_unit_t *un;
    int err = 0;
    set_t setno;

    /*
     * When doing an open of a multi owner metadevice, check to see if this
     * node is a starting node and if a reconfig cycle is underway.
     * If so, the system isn't sufficiently set up to handle the
     * open (which involves I/O during sp_validate), so fail with ENXIO.
     */
    setno = MD_MIN2SET(mnum);
    if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
        (MD_SET_MNSET | MD_SET_MN_START_RC)) {
        return (ENXIO);
    }

    /* single thread */
    un = (ms_unit_t *)md_unit_openclose_enter(ui);

    /* open devices, if necessary */
    if (!md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
        if ((err = stripe_open_all_devs(un, md_oflags)) != 0) {
            goto out;
        }
    }

    /* count open */
    if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
        goto out;

    /* unlock, return success */
out:
    md_unit_openclose_exit(ui);
    return (err);
}

/*ARGSUSED1*/
static int
stripe_close(
    dev_t dev,
    int flag,
    int otyp,
    cred_t *cred_p,
    int md_cflags
)
{
    minor_t mnum = getminor(dev);
    mdi_unit_t *ui = MDI_UNIT(mnum);
    ms_unit_t *un;
    int err = 0;

    /* single thread */
    un = (ms_unit_t *)md_unit_openclose_enter(ui);

    /* count closed */
    if ((err = md_unit_decopen(mnum, otyp)) != 0)
        goto out;

    /* close devices, if necessary */
    if (!md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
        stripe_close_all_devs(un, md_cflags);
    }

    /* unlock, return success */
out:
    md_unit_openclose_exit(ui);
    return (err);
}


static struct buf dumpbuf;

/*
 * This routine dumps memory to the disk. It assumes that the memory has
 * already been mapped into mainbus space. It is called at disk interrupt
 * priority when the system is in trouble.
 */
static int
stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
    ms_unit_t *un;
    buf_t *bp;
    ms_comp_t *mdc;
    u_longlong_t nb;
    diskaddr_t mapblk;
    int result;
    int more;
    int saveresult = 0;

    /*
     * Don't need to grab the unit lock,
     * because nothing else is supposed to be happening.
     * Also, dump is not supposed to sleep.
     */
    un = (ms_unit_t *)MD_UNIT(getminor(dev));

    if ((diskaddr_t)blkno >= un->c.un_total_blocks)
        return (EINVAL);

    if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
        return (EINVAL);

    bp = &dumpbuf;
    nb = ldbtob(nblk);
    do {
        bzero((caddr_t)bp, sizeof (*bp));
        more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc);
        nblk = btodb(bp->b_bcount);
        mapblk = bp->b_lblkno;
        if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) {
            /*
             * bdev_dump() is currently only able to take
             * 32 bit wide blkno's.
             */
            result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk,
                nblk);
            if (result)
                saveresult = result;
        }

        nb -= bp->b_bcount;
        addr += bp->b_bcount;
        blkno += nblk;
    } while (more);

    return (saveresult);
}

/*ARGSUSED*/
static intptr_t
stripe_shared_by_blk(
    md_dev64_t dev,
    void *junk,
    diskaddr_t blkno,
    u_longlong_t *cnt)
{
    ms_unit_t *un;
    buf_t bp;
    ms_comp_t *comp;

    un = MD_UNIT(md_getminor(dev));
    (void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp);
    *cnt = (u_longlong_t)lbtodb(bp.b_bcount);
    return ((intptr_t)&comp->un_mirror);
}

/*
 * stripe_block_count_skip_size() returns the following values
 * so that the logical to physical block mappings can
 * be calculated without intimate knowledge of the underpinnings.
 *
 *     block - first logical block number of the device.
 *             block = [ # of blocks before THE row ] +
 *                     [ # of blocks in THE row before the component ]
 *     count - # of segments (interlaced size).
 *     skip  - # of logical blocks between segments, or delta to
 *             get to the next segment.
 *     size  - interlace size used for the block, count, skip.
 */
/*ARGSUSED*/
static intptr_t
stripe_block_count_skip_size(
    md_dev64_t dev,
    void *junk,
    int ci,
    diskaddr_t *block,
    size_t *count,
    u_longlong_t *skip,
    u_longlong_t *size)
{
    ms_unit_t *un;
    int row;
    struct ms_row *mdr;
    int cmpcount = 0;

    un = MD_UNIT(md_getminor(dev));

    for (row = 0; row < un->un_nrows; row++) {
        mdr = &un->un_row[row];
        if ((mdr->un_ncomp + cmpcount) > ci)
            break;
        cmpcount += mdr->un_ncomp;
    }
    ASSERT(row != un->un_nrows);

    /*
     * Concatenations are always contiguous blocks;
     * you cannot depend on the interlace being a usable
     * value (except for stripes).
     */
    if (mdr->un_ncomp == 1) {       /* Concats */
        *block = mdr->un_cum_blocks - mdr->un_blocks;
        *count = 1;
        *skip = 0;
        *size = mdr->un_blocks;
    } else {                        /* Stripes */
        *block = (mdr->un_cum_blocks - mdr->un_blocks) +
            ((ci - cmpcount) * mdr->un_interlace);
        *count = (size_t)(mdr->un_blocks / (mdr->un_interlace
            * mdr->un_ncomp));
        *skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace;
        *size = mdr->un_interlace;
    }

    return (0);
}
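
/*
 * Illustrative note (added): a hypothetical example of the values returned
 * above, not taken from the original source. For a striped row with
 * un_ncomp = 3, un_interlace = 32 and un_blocks = 960 (ten full stripe
 * passes), a component index ci with (ci - cmpcount) == 1 yields:
 *
 *     block = row start + 1 * 32
 *     count = 960 / (32 * 3) = 10 segments
 *     skip  = 32 * 3 - 32    = 64 blocks between segments
 *     size  = 32 blocks per segment
 */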

/*ARGSUSED*/
static intptr_t
stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx)
{
    ms_unit_t *un;
    ms_comp_t *comp;

    un = MD_UNIT(md_getminor(dev));
    comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    comp += indx;
    return ((intptr_t)&comp->un_mirror);
}

/*ARGSUSED*/
intptr_t
stripe_component_count(md_dev64_t dev, void *junk)
{
    /*
     * See comments for stripe_get_dev
     */

    ms_unit_t *un;
    int count = 0;
    int row;

    un = MD_UNIT(md_getminor(dev));
    for (row = 0; row < un->un_nrows; row++)
        count += un->un_row[row].un_ncomp;
    return (count);
}

/*ARGSUSED*/
intptr_t
stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd)
{
    /*
     * It should be noted that stripe_replace in stripe_ioctl.c calls this
     * routine using makedevice(0, minor) for the first argument.
     *
     * If this routine at some point in the future needs to use the major
     * number, stripe_replace must be changed.
     */

    ms_unit_t *un;
    ms_comp_t *comp;
    md_dev64_t tmpdev;

    un = MD_UNIT(md_getminor(dev));
    comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    comp += indx;
    tmpdev = comp->un_dev;
    /*
     * Try to resolve the devt again if it is NODEV64.
     * Check if this comp is hotspared and, if it is,
     * then use the key for the hotspare.
     */
    if (tmpdev == NODEV64) {
        tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev,
            comp->un_mirror.ms_hs_id ?
            comp->un_mirror.ms_hs_key :
            comp->un_key);
        comp->un_dev = tmpdev;
    }

    cd->cd_dev = comp->un_dev;
    cd->cd_orig_dev = comp->un_mirror.ms_orig_dev;
    return (0);
}

/*ARGSUSED*/
void
stripe_replace_done(md_dev64_t dev, sv_dev_t *sv)
{
    /*
     * See comments for stripe_get_dev
     */

    minor_t mnum = md_getminor(dev);

    if (sv != NULL) {
        md_rem_names(sv, 1);
        kmem_free(sv, sizeof (sv_dev_t));
    }

    md_unit_writerexit(MDI_UNIT(mnum));
}

/*ARGSUSED*/
intptr_t
stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd,
    mddb_recid_t *recids, int nrecids, void (**replace_done)(),
    void **replace_data)
{
    minor_t mnum;
    ms_unit_t *un;
    mdi_unit_t *ui;
    ms_comp_t *comp;
    diskaddr_t dev_size;
    int row;
    int ncomps = 0;
    int cmpcount = 0;
    int rid = 0;
    struct ms_row *mdr;
    sv_dev_t *sv = NULL;
    mddb_recid_t hs_id = 0;
    set_t setno;
    side_t side;
    md_dev64_t this_dev;

    mnum = md_getminor(dev);
    ui = MDI_UNIT(mnum);
    setno = MD_MIN2SET(mnum);
    side = mddb_getsidenum(setno);

    un = md_unit_writerlock(ui);

    *replace_data = NULL;
    comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);

    comp += ci;

    /*
     * Count the number of components
     */
    for (row = 0; row < un->un_nrows; row++) {
        struct ms_row *mdr = &un->un_row[row];
        ncomps += mdr->un_ncomp;
    }

    recids[0] = 0;
    /*
     * No need to check the size of the new device
     * when hotsparing (it has already been done), or
     * when enabling the device.
     */
    if ((nd != NULL) && (nd->nd_hs_id == 0)) {
        for (row = 0; row < un->un_nrows; row++) {
            mdr = &un->un_row[row];
            if ((mdr->un_ncomp + cmpcount) > ci)
                break;
            cmpcount += mdr->un_ncomp;
        }
        ASSERT(row != un->un_nrows);

        /* Concatenations have a ncomp = 1 */
        dev_size = mdr->un_blocks / mdr->un_ncomp;

        /*
         * Now check to see if the new comp can be used in
         * place of the old comp.
         */
        if ((un->c.un_flag & MD_LABELED) && (ci == 0) &&
            nd->nd_labeled)
            nd->nd_start_blk = 0;
        else
            nd->nd_nblks -= nd->nd_start_blk;

        if (dev_size > nd->nd_nblks) {
            md_unit_writerexit(ui);
            return (MDE_COMP_TOO_SMALL);
        }

        sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
        sv->setno = MD_MIN2SET(mnum);
        sv->key = comp->un_key;
    }

    /*
     * Close this component.
     */
    if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) {
        md_layered_close(comp->un_dev, MD_OFLG_NULL);
        comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
    }

    /*
     * If the component is hotspared, return it to the pool.
     */
    if (comp->un_mirror.ms_hs_id != 0) {
        hs_cmds_t cmd;
        mdkey_t hs_key;

        hs_key = comp->un_mirror.ms_hs_key;
        comp->un_dev = comp->un_mirror.ms_orig_dev;
        comp->un_start_block = comp->un_mirror.ms_orig_blk;
        comp->un_mirror.ms_hs_key = 0;
        comp->un_mirror.ms_hs_id = 0;
        comp->un_mirror.ms_orig_dev = 0;

        cmd = HS_FREE;
        if ((comp->un_mirror.ms_state != CS_OKAY) &&
            (comp->un_mirror.ms_state != CS_RESYNC))
            cmd = HS_BAD;
        (void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id,
            &hs_key, NULL, NULL);
    }

    /*
     * Open by device id; for enable (indicated by a NULL
     * nd pointer), use the existing component info. For
     * replace, use the new device.
     */
    if (nd == NULL) {
        this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key);
        /*
         * If someone replaced a new disk in the same slot,
         * we get NODEV64 since the old device id cannot be
         * resolved. The new devt is obtained from the
         * mddb since the devt is going to be unchanged for the
         * enable case. No need to check for multiple
         * keys here because the caller (comp_replace)
         * has already sanity checked it for us.
         */
        if (this_dev == NODEV64) {
            this_dev = md_getdevnum(setno, side, comp->un_key,
                MD_TRUST_DEVT);
        }
    } else {
        /*
         * If this is a hotspare, save the original dev_t for later
         * use. If this has occurred during boot, then the value of
         * comp->un_dev will be NODEV64 because of the failure to look
         * up the devid of the device.
         */
        if (nd->nd_hs_id != 0)
            comp->un_mirror.ms_orig_dev = comp->un_dev;
        this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key);
    }

    comp->un_dev = this_dev;

    /*
     * Now open the new device if required. Note that for a single
     * component stripe it will not be open - leave this for the mirror
     * driver to deal with.
     */
    if (md_unit_isopen(ui)) {
        if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) {
            mddb_recid_t ids[3];

            ids[0] = un->c.un_record_id;
            ids[1] = hs_id;
            ids[2] = 0;
            mddb_commitrecs_wrapper(ids);
            if ((nd != NULL) && (nd->nd_hs_id != 0)) {
                /*
                 * Revert back to the original device.
                 */
                comp->un_dev = comp->un_mirror.ms_orig_dev;

                cmn_err(CE_WARN,
                    "md: %s: open error of hotspare %s",
                    md_shortname(mnum),
                    md_devname(MD_MIN2SET(mnum), nd->nd_dev,
                    NULL, 0));
                SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
                    SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev);
            }
            md_unit_writerexit(ui);
            return (MDE_COMP_OPEN_ERR);
        }
        if (nd != NULL)
            nd->nd_dev = this_dev;

        comp->un_mirror.ms_flags |= MDM_S_ISOPEN;
    }

    if (nd == NULL) {
        recids[0] = un->c.un_record_id;
        recids[1] = hs_id;
        recids[2] = 0;
        *replace_done = stripe_replace_done;
        return (0);
    }

    /* if hot sparing this device */
    if (nd->nd_hs_id != 0) {
        char devname[MD_MAX_CTDLEN];
        char hs_devname[MD_MAX_CTDLEN];
        set_t setno;

        comp->un_mirror.ms_hs_id = nd->nd_hs_id;
        comp->un_mirror.ms_hs_key = nd->nd_key;

        comp->un_mirror.ms_orig_blk = comp->un_start_block;

        setno = MD_MIN2SET(mnum);

        (void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname,
            sizeof (devname));
        (void) md_devname(setno, nd->nd_dev, hs_devname,
            sizeof (hs_devname));

        cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s",
            md_shortname(mnum), devname, hs_devname);

    } else {    /* replacing the device */
        comp->un_key = nd->nd_key;
        *replace_data = (void *)sv;

        /*
         * For the old device, make sure to reset the parent
         * if it's a metadevice.
         */
        if (md_getmajor(comp->un_dev) == md_major) {
            minor_t comp_mnum = md_getminor(comp->un_dev);
            md_unit_t *comp_un = MD_UNIT(comp_mnum);

            md_reset_parent(comp->un_dev);
            recids[rid++] = MD_RECID(comp_un);
        }
    }

    comp->un_dev = nd->nd_dev;
    comp->un_start_block = nd->nd_start_blk;

    /*
     * For the new device, make sure to set the parent if it's a
     * metadevice.
     *
     * If we ever support using metadevices as hot spares, this
     * will need to be tested, and possibly moved into the
     * preceding "else" clause, immediately following the parent
     * reset block. For now, it's convenient to leave it here and
     * only compress nd->nd_dev once.
     */
    if (md_getmajor(comp->un_dev) == md_major) {
        minor_t comp_mnum = md_getminor(comp->un_dev);
        md_unit_t *comp_un = MD_UNIT(comp_mnum);

        md_set_parent(comp->un_dev, MD_SID(un));
        recids[rid++] = MD_RECID(comp_un);
    }

    recids[rid++] = un->c.un_record_id;
    recids[rid++] = hs_id;
    recids[rid] = 0;
    *replace_done = stripe_replace_done;
    return (0);
}

/*ARGSUSED*/
static intptr_t
stripe_hotspare_dev(
    md_dev64_t dev,
    void *junk,
    int ci,
    mddb_recid_t *recids,
    int nrecids,
    void (**replace_done)(),
    void **replace_data)
{
    ms_unit_t *un;
    mdi_unit_t *ui;
    ms_comp_t *comp;
    int row;
    struct ms_row *mdr;
    ms_new_dev_t nd;
    int err;
    int i;
    minor_t mnum;
    set_t setno;
    int cmpcount = 0;

    mnum = md_getminor(dev);
    ui = MDI_UNIT(mnum);
    un = MD_UNIT(mnum);
    setno = MD_MIN2SET(mnum);

    if (md_get_setstatus(setno) & MD_SET_STALE)
        return (1);

    if (un->un_hsp_id == -1)
        return (1);

    for (row = 0; row < un->un_nrows; row++) {
        mdr = &un->un_row[row];
        if ((mdr->un_ncomp + cmpcount) > ci)
            break;
        cmpcount += mdr->un_ncomp;
    }
    ASSERT(row != un->un_nrows);

    comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
    comp += ci;
    /* Concatenations have a ncomp = 1 */
    nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp;

    if ((un->c.un_flag & MD_LABELED) && (ci == 0))
        nd.nd_labeled = 1;
    else
        nd.nd_labeled = 0;

again:
    err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks,
        nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev,
        &nd.nd_start_blk);

    if (err) {
        if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids,
            replace_done, replace_data)) {
            mddb_commitrecs_wrapper(recids);
            md_unit_writerexit(ui);
        }
        recids[0] = 0;
        return (1);
    }

    if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids,
        replace_done, replace_data)) {

        (void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0,
            &nd.nd_hs_id, &nd.nd_key, NULL, NULL);
        mddb_commitrec_wrapper(nd.nd_hs_id);
        goto again;
    }

    /* Leave a slot for the null recid */
    for (i = 0; i < (nrecids - 1); i++) {
        if (recids[i] == 0) {
            recids[i++] = nd.nd_hs_id;
            recids[i] = 0;
        }
    }
    return (0);
}

static int
stripe_imp_set(
    set_t setno
)
{
    mddb_recid_t recid;
    int i, row, c, gotsomething;
    mddb_type_t typ1;
    mddb_de_ic_t *dep;
    mddb_rb32_t *rbp;
    ms_unit32_od_t *un32;
    ms_unit_t *un64;
    md_dev64_t self_devt;
    minor_t *self_id;           /* minor needs to be updated */
    md_parent_t *parent_id;     /* parent needs to be updated */
    mddb_recid_t *record_id;    /* record id needs to be updated */
    mddb_recid_t *hsp_id;
    ms_comp32_od_t *comp32;
    ms_comp_t *comp64;


    gotsomething = 0;

    typ1 = (mddb_type_t)md_getshared_key(setno,
        stripe_md_ops.md_driver.md_drivername);
    recid = mddb_makerecid(setno, 0);

    while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
        if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
            continue;

        dep = mddb_getrecdep(recid);
        rbp = dep->de_rb;

        switch (rbp->rb_revision) {
        case MDDB_REV_RB:
        case MDDB_REV_RBFN:
            /*
             * Small device
             */
            un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid);
            self_id = &(un32->c.un_self_id);
            parent_id = &(un32->c.un_parent);
            record_id = &(un32->c.un_record_id);
            hsp_id = &(un32->un_hsp_id);

            comp32 = (ms_comp32_od_t *)((void *)&((char *)un32)
                [un32->un_ocomp]);
            for (row = 0; row < un32->un_nrows; row++) {
                struct ms_row32_od *mdr = &un32->un_row[row];
                for (i = 0, c = mdr->un_icomp;
                    i < mdr->un_ncomp; i++) {
                    ms_comp32_od_t *mdc;

                    mdc = &comp32[c++];

                    if (!md_update_minor(setno, mddb_getsidenum
                        (setno), mdc->un_key))
                        goto out;

                    if (mdc->un_mirror.ms_hs_id != 0)
                        mdc->un_mirror.ms_hs_id = MAKERECID(
                            setno, mdc->un_mirror.ms_hs_id);
                }
            }
            break;
        case MDDB_REV_RB64:
        case MDDB_REV_RB64FN:
            un64 = (ms_unit_t *)mddb_getrecaddr(recid);
            self_id = &(un64->c.un_self_id);
            parent_id = &(un64->c.un_parent);
            record_id = &(un64->c.un_record_id);
            hsp_id = &(un64->un_hsp_id);

            comp64 = (ms_comp_t *)((void *)&((char *)un64)
                [un64->un_ocomp]);
            for (row = 0; row < un64->un_nrows; row++) {
                struct ms_row *mdr = &un64->un_row[row];
                for (i = 0, c = mdr->un_icomp;
                    i < mdr->un_ncomp; i++) {
                    ms_comp_t *mdc;

                    mdc = &comp64[c++];

                    if (!md_update_minor(setno, mddb_getsidenum
                        (setno), mdc->un_key))
                        goto out;

                    if (mdc->un_mirror.ms_hs_id != 0)
                        mdc->un_mirror.ms_hs_id = MAKERECID(
                            setno, mdc->un_mirror.ms_hs_id);
                }
            }
            break;
        }

        /*
         * If this is a top level and a friendly name metadevice,
         * update its minor in the namespace.
         */
        if ((*parent_id == MD_NO_PARENT) &&
            ((rbp->rb_revision == MDDB_REV_RBFN) ||
            (rbp->rb_revision == MDDB_REV_RB64FN))) {

            self_devt = md_makedevice(md_major, *self_id);
            if (!md_update_top_device_minor(setno,
                mddb_getsidenum(setno), self_devt))
                goto out;
        }

        /*
         * Update unit with the imported setno
         */
        mddb_setrecprivate(recid, MD_PRV_GOTIT);

        *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));

        if (*hsp_id != -1)
            *hsp_id = MAKERECID(setno, DBID(*hsp_id));

        if (*parent_id != MD_NO_PARENT)
            *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
        *record_id = MAKERECID(setno, DBID(*record_id));

        gotsomething = 1;
    }

out:
    return (gotsomething);
}

static md_named_services_t stripe_named_services[] = {
    {stripe_shared_by_blk,          "shared by blk" },
    {stripe_shared_by_indx,         "shared by indx" },
    {stripe_component_count,        "get component count" },
    {stripe_block_count_skip_size,  "get block count skip size" },
    {stripe_get_dev,                "get device" },
    {stripe_replace_dev,            "replace device" },
    {stripe_hotspare_dev,           "hotspare device" },
    {stripe_rename_check,           MDRNM_CHECK },
    {NULL,                          0}
};

md_ops_t stripe_md_ops = {
    stripe_open,            /* open */
    stripe_close,           /* close */
    md_stripe_strategy,     /* strategy */
    NULL,                   /* print */
    stripe_dump,            /* dump */
    NULL,                   /* read */
    NULL,                   /* write */
    md_stripe_ioctl,        /* stripe_ioctl, */
    stripe_snarf,           /* stripe_snarf */
    stripe_halt,            /* stripe_halt */
    NULL,                   /* aread */
    NULL,                   /* awrite */
    stripe_imp_set,         /* import set */
    stripe_named_services
};

static void
init_init()
{
    md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t);

    stripe_parent_cache = kmem_cache_create("md_stripe_parent",
        sizeof (md_sps_t), 0, stripe_parent_constructor,
        stripe_parent_destructor, stripe_run_queue, NULL, NULL,
        0);
    stripe_child_cache = kmem_cache_create("md_stripe_child",
        sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0,
        stripe_child_constructor, stripe_child_destructor,
        stripe_run_queue, NULL, NULL, 0);
}
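
/*
 * Illustrative note (added, an interpretation rather than original text):
 * the child cache objects are sized as
 * sizeof (md_scs_t) - sizeof (buf_t) + biosize(), leaving room at the end
 * for the system buf, which may be larger than the declared buf_t.
 * Assuming cs_buf is the final member of md_scs_t, md_stripe_mcs_buf_off
 * records its offset so stripe_done() can recover the md_scs_t from the
 * child buf it is handed:
 *
 *     cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off);
 */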

static void
fini_uninit()
{
    kmem_cache_destroy(stripe_parent_cache);
    kmem_cache_destroy(stripe_child_cache);
    stripe_parent_cache = stripe_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("stripes module %I%", init_init(), fini_uninit())