/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M% %I% %E% SMI"

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

md_ops_t		stripe_md_ops;

#ifndef	lint
static char		_depends_on[] = "drv/md";
static md_ops_t		*md_interface_ops = &stripe_md_ops;
#endif

extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];

extern kmutex_t		md_mx;
extern kcondvar_t	md_cv;

extern int		md_status;
extern major_t		md_major;
extern mdq_anchor_t	md_done_daemon;

static int		md_stripe_mcs_buf_off;
static kmem_cache_t	*stripe_parent_cache = NULL;
static kmem_cache_t	*stripe_child_cache = NULL;

/*ARGSUSED1*/
static int
stripe_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_sps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
stripe_parent_init(void *ps)
{
	bzero(ps, offsetof(md_sps_t, ps_mx));
}

/*ARGSUSED1*/
static void
stripe_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_sps_t *)p)->ps_mx);
}

/*ARGSUSED1*/
static int
stripe_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_scs_t *)p)->cs_buf);
	return (0);
}

static void
stripe_child_init(md_scs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	cs->cs_comp = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
stripe_child_destructor(void *p, void *d)
{
	biofini(&((md_scs_t *)p)->cs_buf);
}

/*ARGSUSED*/
static void
stripe_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}

static void
stripe_close_all_devs(ms_unit_t *un, int md_cflags)
{
	int		row;
	int		i;
	int		c;
	struct ms_comp	*mdcomp;

	mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
	for (row = 0; row < un->un_nrows; row++) {
		struct ms_row *mdr = &un->un_row[row];
		for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
			struct ms_comp	*mdc;

			mdc = &mdcomp[c++];
			if (md_cflags & MD_OFLG_PROBEDEV) {
				/*
				 * It is possible that the md_layered_open
				 * failed because the stripe unit structure
				 * contained a NODEV. In such a case since
				 * there is nothing to open, there is nothing
				 * to close.
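				 * Below we only close components whose flags
				 * show they were actually opened
				 * (MDM_S_PROBEOPEN for a probe open,
				 * MDM_S_ISOPEN otherwise); a component whose
				 * open failed is simply skipped.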
*/ if (mdc->un_dev == NODEV64) continue; } if ((md_cflags & MD_OFLG_PROBEDEV) && (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) { md_layered_close(mdc->un_dev, md_cflags); mdc->un_mirror.ms_flags &= ~MDM_S_PROBEOPEN; } else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) { md_layered_close(mdc->un_dev, md_cflags); mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN; } } } } static int stripe_open_all_devs(ms_unit_t *un, int md_oflags) { minor_t mnum = MD_SID(un); int row; int i; int c; struct ms_comp *mdcomp; int err; int cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS); int probe_err_cnt = 0; int total_comp_cnt = 0; set_t setno = MD_MIN2SET(MD_SID(un)); side_t side = mddb_getsidenum(setno); mdkey_t key; mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); /* * For a probe call, if any component of a stripe or a concat * can be opened, it is considered to be a success. The total number * of components in a stripe are computed prior to starting a probe. * This number is then compared against the number of components * that could be be successfully opened. If none of the components * in a stripe can be opened, only then an ENXIO is returned for a * probe type open. */ for (row = 0; row < un->un_nrows; row++) { struct ms_row *mdr = &un->un_row[row]; if (md_oflags & MD_OFLG_PROBEDEV) total_comp_cnt += mdr->un_ncomp; for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { struct ms_comp *mdc; md_dev64_t tmpdev; mdc = &mdcomp[c++]; tmpdev = mdc->un_dev; /* * Do the open by device id * Check if this comp is hotspared and * if it is then use the key for hotspare. * MN disksets don't use devids, so we better don't use * md_devid_found/md_resolve_bydevid there. Rather do, * what's done in stripe_build_incore() */ if (MD_MNSET_SETNO(setno)) { if (mdc->un_mirror.ms_hs_id != 0) { (void) md_hot_spare_ifc(HS_MKDEV, 0, 0, 0, &mdc->un_mirror.ms_hs_id, NULL, &tmpdev, NULL); } } else { key = mdc->un_mirror.ms_hs_id ? mdc->un_mirror.ms_hs_key : mdc->un_key; if ((md_getmajor(tmpdev) != md_major) && md_devid_found(setno, side, key) == 1) { tmpdev = md_resolve_bydevid(mnum, tmpdev, key); } } /* * For a submirror, we only want to open those devices * that are not errored. If the device is errored then * then there is no reason to open it and leaving it * closed allows the RCM/DR code to work so that the * errored device can be replaced. */ if ((md_oflags & MD_OFLG_PROBEDEV) || ! (mdc->un_mirror.ms_state & CS_ERRED)) { err = md_layered_open(mnum, &tmpdev, md_oflags); } else { err = ENXIO; } /* * Only set the un_dev if the tmpdev != NODEV64. If * it is NODEV64 then the md_layered_open() will have * failed in some manner. */ if (tmpdev != NODEV64) mdc->un_dev = tmpdev; if (err) { if (!cont_on_errors) { stripe_close_all_devs(un, md_oflags); return (ENXIO); } if (md_oflags & MD_OFLG_PROBEDEV) probe_err_cnt++; } else { if (md_oflags & MD_OFLG_PROBEDEV) { mdc->un_mirror.ms_flags |= MDM_S_PROBEOPEN; } else mdc->un_mirror.ms_flags |= MDM_S_ISOPEN; } } } /* If every component in a stripe could not be opened fail */ if ((md_oflags & MD_OFLG_PROBEDEV) && (probe_err_cnt == total_comp_cnt)) return (ENXIO); else return (0); } int stripe_build_incore(void *p, int snarfing) { ms_unit_t *un = (ms_unit_t *)p; struct ms_comp *mdcomp; minor_t mnum; int row; int i; int c; int ncomps; mnum = MD_SID(un); if (MD_UNIT(mnum) != NULL) return (0); MD_STATUS(un) = 0; /* * Reset all the is_open flags, these are probably set * cause they just came out of the database. 
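	 * (The loop below clears MDM_S_ISOPEN, MDM_S_IOERR and
	 * MDM_S_RS_TRIED on every component; when snarfing, it also
	 * re-resolves each component's dev_t from its nm key and, for
	 * hotspared components, from the hot spare id.)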
*/ mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); ncomps = 0; for (row = 0; row < un->un_nrows; row++) { struct ms_row *mdr = &un->un_row[row]; ncomps += mdr->un_ncomp; } for (row = 0; row < un->un_nrows; row++) { struct ms_row *mdr = &un->un_row[row]; for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { struct ms_comp *mdc; set_t setno; md_dev64_t tmpdev; mdc = &mdcomp[c++]; mdc->un_mirror.ms_flags &= ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED); if (!snarfing) continue; setno = MD_MIN2SET(mnum); tmpdev = md_getdevnum(setno, mddb_getsidenum(setno), mdc->un_key, MD_NOTRUST_DEVT); mdc->un_dev = tmpdev; /* * Check for hotspares. If the hotspares haven't been * snarfed yet, stripe_open_all_devs() will do the * remapping of the dev's later. */ if (mdc->un_mirror.ms_hs_id != 0) { mdc->un_mirror.ms_orig_dev = mdc->un_dev; (void) md_hot_spare_ifc(HS_MKDEV, 0, 0, 0, &mdc->un_mirror.ms_hs_id, NULL, &tmpdev, NULL); mdc->un_dev = tmpdev; } } } MD_UNIT(mnum) = un; return (0); } void reset_stripe(ms_unit_t *un, minor_t mnum, int removing) { ms_comp_t *mdcomp; struct ms_row *mdr; int i, c; int row; int nsv; int isv; sv_dev_t *sv; mddb_recid_t *recids; mddb_recid_t vtoc_id; int rid = 0; md_destroy_unit_incore(mnum, &stripe_md_ops); MD_UNIT(mnum) = NULL; if (!removing) return; nsv = 0; /* Count the number of devices */ for (row = 0; row < un->un_nrows; row++) { mdr = &un->un_row[row]; nsv += mdr->un_ncomp; } sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP); /* * allocate recids array. since we may have to commit * underlying soft partition records, we need an array * of size: total number of components in stripe + 3 * (one for the stripe itself, one for the hotspare, one * for the end marker). */ recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP); /* * Save the md_dev64_t's and driver nm indexes. * Because after the mddb_deleterec() we will * not be able to access the unit structure. * * NOTE: Deleting the names before deleting the * unit structure would cause problems if * the machine crashed in between the two. */ isv = 0; mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); for (row = 0; row < un->un_nrows; row++) { mdr = &un->un_row[row]; for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { struct ms_comp *mdc; md_dev64_t child_dev; md_unit_t *child_un; mdc = &mdcomp[c++]; if (mdc->un_mirror.ms_hs_id != 0) { mdkey_t hs_key; hs_key = mdc->un_mirror.ms_hs_key; mdc->un_dev = mdc->un_mirror.ms_orig_dev; mdc->un_start_block = mdc->un_mirror.ms_orig_blk; mdc->un_mirror.ms_hs_id = 0; mdc->un_mirror.ms_hs_key = 0; mdc->un_mirror.ms_orig_dev = 0; recids[0] = 0; recids[1] = 0; /* recids[1] filled in below */ recids[2] = 0; (void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id, 0, 0, &recids[0], &hs_key, NULL, NULL); mddb_commitrecs_wrapper(recids); } /* * check if we've got metadevice below us and * deparent it if we do. * NOTE: currently soft partitions are the * the only metadevices stripes can be * built on top of. */ child_dev = mdc->un_dev; if (md_getmajor(child_dev) == md_major) { child_un = MD_UNIT(md_getminor(child_dev)); md_reset_parent(child_dev); recids[rid++] = MD_RECID(child_un); } sv[isv].setno = MD_MIN2SET(mnum); sv[isv++].key = mdc->un_key; } } recids[rid++] = un->c.un_record_id; recids[rid] = 0; /* filled in below */ /* * Decrement the HSP reference count and * remove the knowledge of the HSP from the unit struct. * This is done atomically to remove a window. 
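	 * (The record id returned by the HSP_DECREF call is placed in
	 * the same recids[] array as the unit record, so both are
	 * committed by the single mddb_commitrecs_wrapper() call below.)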
*/ if (un->un_hsp_id != -1) { (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0, &recids[rid++], NULL, NULL, NULL); un->un_hsp_id = -1; } /* set end marker and commit records */ recids[rid] = 0; mddb_commitrecs_wrapper(recids); vtoc_id = un->c.un_vtoc_id; /* Remove the unit structure */ mddb_deleterec_wrapper(un->c.un_record_id); /* Remove the vtoc, if present */ if (vtoc_id) mddb_deleterec_wrapper(vtoc_id); SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE, MD_MIN2SET(mnum), MD_MIN2UNIT(mnum)); md_rem_names(sv, nsv); kmem_free(sv, sizeof (sv_dev_t) * nsv); kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3)); } static void stripe_error(md_sps_t *ps) { struct buf *pb = ps->ps_bp; mdi_unit_t *ui = ps->ps_ui; md_dev64_t dev = ps->ps_errcomp->un_dev; md_dev64_t md_dev = md_expldev(pb->b_edev); char *str; if (pb->b_flags & B_READ) { ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR; str = "read"; } else { ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR; str = "write"; } if (!(ps->ps_flags & MD_SPS_DONTFREE)) { if (MUTEX_HELD(&ps->ps_mx)) { mutex_exit(&ps->ps_mx); } } else { ASSERT(panicstr); } SPS_FREE(stripe_parent_cache, ps); pb->b_flags |= B_ERROR; md_kstat_done(ui, pb, 0); md_unit_readerexit(ui); md_biodone(pb); cmn_err(CE_WARN, "md: %s: %s error on %s", md_shortname(md_getminor(md_dev)), str, md_devname(MD_DEV2SET(md_dev), dev, NULL, 0)); } static int stripe_done(struct buf *cb) { struct buf *pb; mdi_unit_t *ui; md_sps_t *ps; md_scs_t *cs; /*LINTED*/ cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off); ps = cs->cs_ps; pb = ps->ps_bp; mutex_enter(&ps->ps_mx); if (cb->b_flags & B_ERROR) { ps->ps_flags |= MD_SPS_ERROR; pb->b_error = cb->b_error; ps->ps_errcomp = cs->cs_comp; } if (cb->b_flags & B_REMAPPED) bp_mapout(cb); ps->ps_frags--; if (ps->ps_frags != 0) { mutex_exit(&ps->ps_mx); kmem_cache_free(stripe_child_cache, cs); return (1); } kmem_cache_free(stripe_child_cache, cs); if (ps->ps_flags & MD_SPS_ERROR) { stripe_error(ps); return (1); } ui = ps->ps_ui; if (!(ps->ps_flags & MD_SPS_DONTFREE)) { mutex_exit(&ps->ps_mx); } else { ASSERT(panicstr); } SPS_FREE(stripe_parent_cache, ps); md_kstat_done(ui, pb, 0); md_unit_readerexit(ui); md_biodone(pb); return (0); } /* * This routine does the mapping from virtual (dev, blkno) of a metapartition * to the real (dev, blkno) of a real disk partition. * It goes to the md_conf[] table to find out the correct real partition * dev and block number for this buffer. * * A single buf request can not go across real disk partition boundary. * When the virtual request specified by (dev, blkno) spans more than one * real partition, md_mapbuf will return 1. Then the caller should prepare * another real buf and continue calling md_mapbuf to do the mapping until * it returns 0. * */ static int md_mapbuf( ms_unit_t *un, diskaddr_t blkno, u_longlong_t bcount, buf_t *bp, /* if bp==NULL, skip bp updates */ ms_comp_t **mdc) /* if bp==NULL, skip mdc update */ { struct ms_row *mdr; struct ms_comp *mdcomp; diskaddr_t stripe_blk; diskaddr_t fragment, blk_in_row, endblk; offset_t interlace; size_t dev_index; int row_index, more; extern unsigned md_maxphys; /* Work var's when bp==NULL */ u_longlong_t wb_bcount; diskaddr_t wb_blkno; md_dev64_t wb_edev; ms_comp_t *wmdc; /* * Do a real calculation to derive the minor device of the * Virtual Disk, which in turn will let us derive the * device/minor of the underlying real device. 
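	 * As an illustration of the striped case below (hypothetical
	 * numbers): for a row of three components with a 32-block
	 * interlace, logical block 100 within the row gives
	 * fragment = 100 % 32 = 4, stripe_blk = 100 / 32 = 3,
	 * dev_index = 3 % 3 = 0, and a physical block offset of
	 * (3 / 3) * 32 + 4 = 36 past that component's un_start_block.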
*/ for (row_index = 0; row_index < un->un_nrows; row_index++) { mdr = &un->un_row[row_index]; if (blkno < mdr->un_cum_blocks) break; } ASSERT(row_index != un->un_nrows); mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks; endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE)); if (mdr->un_ncomp == 1) { /* No striping */ if (endblk > mdr->un_cum_blocks) { wb_bcount = ldbtob(mdr->un_cum_blocks - blkno); if ((row_index + 1) == un->un_nrows) more = 0; else more = 1; } else { wb_bcount = bcount; more = 0; } wmdc = &mdcomp[mdr->un_icomp]; wb_blkno = blk_in_row; } else { /* Have striping */ interlace = mdr->un_interlace; fragment = blk_in_row % interlace; if (bcount > ldbtob(interlace - fragment)) { more = 1; wb_bcount = ldbtob(interlace - fragment); } else { more = 0; wb_bcount = bcount; } stripe_blk = blk_in_row / interlace; dev_index = (size_t)(stripe_blk % mdr->un_ncomp); wmdc = &mdcomp[mdr->un_icomp + dev_index]; wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp) * interlace) + fragment); } wb_blkno += wmdc->un_start_block; wb_edev = wmdc->un_dev; /* only break up the I/O if we're not built on another metadevice */ if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) { wb_bcount = md_maxphys; more = 1; } if (bp != (buf_t *)NULL) { /* * wb_bcount is limited by md_maxphys which is 'int' */ bp->b_bcount = (size_t)wb_bcount; bp->b_lblkno = wb_blkno; bp->b_edev = md_dev64_to_dev(wb_edev); *mdc = wmdc; } return (more); } static void md_stripe_strategy(buf_t *pb, int flag, void *private) { md_sps_t *ps; md_scs_t *cs; int doing_writes; int more; ms_unit_t *un; mdi_unit_t *ui; size_t current_count; diskaddr_t current_blkno; off_t current_offset; buf_t *cb; /* child buf pointer */ set_t setno; setno = MD_MIN2SET(getminor(pb->b_edev)); /* * When doing IO to a multi owner meta device, check if set is halted. * We do this check without the needed lock held, for performance * reasons. * If an IO just slips through while the set is locked via an * MD_MN_SUSPEND_SET, we don't care about it. * Only check for a suspended set if we are a top-level i/o request * (MD_STR_NOTTOP is cleared in 'flag'). */ if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) == (MD_SET_HALTED | MD_SET_MNSET)) { if ((flag & MD_STR_NOTTOP) == 0) { mutex_enter(&md_mx); /* Here we loop until the set is no longer halted */ while (md_set[setno].s_status & MD_SET_HALTED) { cv_wait(&md_cv, &md_mx); } mutex_exit(&md_mx); } } ui = MDI_UNIT(getminor(pb->b_edev)); md_kstat_waitq_enter(ui); un = (ms_unit_t *)md_unit_readerlock(ui); if ((flag & MD_NOBLOCK) == 0) { if (md_inc_iocount(setno) != 0) { pb->b_flags |= B_ERROR; pb->b_error = ENXIO; pb->b_resid = pb->b_bcount; md_unit_readerexit(ui); biodone(pb); return; } } else { md_inc_iocount_noblock(setno); } if (!(flag & MD_STR_NOTTOP)) { if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) { md_kstat_waitq_exit(ui); return; } } ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS); stripe_parent_init(ps); /* * Save essential information from the original buffhdr * in the md_save structure. 
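	 * (ps_frags counts the child buffers still outstanding; each
	 * completion in stripe_done() decrements it, and the parent
	 * buffer is only passed to md_biodone() once the count drops
	 * to zero.)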
*/ ps->ps_un = un; ps->ps_ui = ui; ps->ps_bp = pb; ps->ps_addr = pb->b_un.b_addr; if ((pb->b_flags & B_READ) == 0) doing_writes = 1; else doing_writes = 0; current_count = pb->b_bcount; current_blkno = pb->b_lblkno; current_offset = 0; if (!(flag & MD_STR_NOTTOP) && panicstr) ps->ps_flags |= MD_SPS_DONTFREE; md_kstat_waitq_to_runq(ui); ps->ps_frags++; do { cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS); stripe_child_init(cs); cb = &cs->cs_buf; cs->cs_ps = ps; more = md_mapbuf(un, current_blkno, current_count, cb, &cs->cs_comp); cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev, cb->b_lblkno, stripe_done, cb, KM_NOSLEEP); /* * Do these calculations now, * so that we pickup a valid b_bcount from the chld_bp. */ current_offset += cb->b_bcount; current_count -= cb->b_bcount; current_blkno += (diskaddr_t)(lbtodb(cb->b_bcount)); if (more) { mutex_enter(&ps->ps_mx); ps->ps_frags++; mutex_exit(&ps->ps_mx); } if (doing_writes && cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) { (void) stripe_done(cb); continue; } md_call_strategy(cb, flag, private); } while (more); if (!(flag & MD_STR_NOTTOP) && panicstr) { while (!(ps->ps_flags & MD_SPS_DONE)) { md_daemon(1, &md_done_daemon); drv_usecwait(10); } kmem_cache_free(stripe_parent_cache, ps); } } static int stripe_snarf(md_snarfcmd_t cmd, set_t setno) { ms_unit_t *un; mddb_recid_t recid; int gotsomething; int all_stripes_gotten; mddb_type_t typ1; mddb_de_ic_t *dep; mddb_rb32_t *rbp; size_t newreqsize; ms_unit_t *big_un; ms_unit32_od_t *small_un; if (cmd == MD_SNARF_CLEANUP) return (0); all_stripes_gotten = 1; gotsomething = 0; typ1 = (mddb_type_t)md_getshared_key(setno, stripe_md_ops.md_driver.md_drivername); recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) continue; dep = mddb_getrecdep(recid); dep->de_flags = MDDB_F_STRIPE; rbp = dep->de_rb; if ((rbp->rb_revision == MDDB_REV_RB) && ((rbp->rb_private & MD_PRV_CONVD) == 0)) { /* * This means, we have an old and small record * and this record hasn't already been converted. * Before we create an incore metadevice from this * we have to convert it to a big record. */ small_un = (ms_unit32_od_t *)mddb_getrecaddr(recid); newreqsize = get_big_stripe_req_size(small_un, COMPLETE_STRUCTURE); big_un = (ms_unit_t *)kmem_zalloc(newreqsize, KM_SLEEP); stripe_convert((caddr_t)small_un, (caddr_t)big_un, SMALL_2_BIG); kmem_free(small_un, dep->de_reqsize); dep->de_rb_userdata = big_un; dep->de_reqsize = newreqsize; un = big_un; rbp->rb_private |= MD_PRV_CONVD; } else { /* Big device */ un = (ms_unit_t *)mddb_getrecaddr(recid); } /* Set revision and flag accordingly */ if (rbp->rb_revision == MDDB_REV_RB) { un->c.un_revision = MD_32BIT_META_DEV; } else { un->c.un_revision = MD_64BIT_META_DEV; un->c.un_flag |= MD_EFILABEL; } /* Create minor node for snarfed unit. 
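		 * If a unit with this minor number is already incore,
		 * the record is a stale duplicate and is marked
		 * MD_PRV_PENDDEL for cleanup instead of being snarfed
		 * again.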
*/ (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un)); if (MD_UNIT(MD_SID(un)) != NULL) { mddb_setrecprivate(recid, MD_PRV_PENDDEL); continue; } all_stripes_gotten = 0; if (stripe_build_incore((void *)un, 1) == 0) { mddb_setrecprivate(recid, MD_PRV_GOTIT); md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0); gotsomething = 1; } } if (!all_stripes_gotten) return (gotsomething); recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT)) mddb_setrecprivate(recid, MD_PRV_PENDDEL); return (0); } static int stripe_halt(md_haltcmd_t cmd, set_t setno) { int i; mdi_unit_t *ui; minor_t mnum; if (cmd == MD_HALT_CLOSE) return (0); if (cmd == MD_HALT_OPEN) return (0); if (cmd == MD_HALT_UNLOAD) return (0); if (cmd == MD_HALT_CHECK) { for (i = 0; i < md_nunits; i++) { mnum = MD_MKMIN(setno, i); if ((ui = MDI_UNIT(mnum)) == NULL) continue; if (ui->ui_opsindex != stripe_md_ops.md_selfindex) continue; if (md_unit_isopen(ui)) return (1); } return (0); } if (cmd != MD_HALT_DOIT) return (1); for (i = 0; i < md_nunits; i++) { mnum = MD_MKMIN(setno, i); if ((ui = MDI_UNIT(mnum)) == NULL) continue; if (ui->ui_opsindex != stripe_md_ops.md_selfindex) continue; reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0); } return (0); } /*ARGSUSED3*/ static int stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags) { minor_t mnum = getminor(*dev); mdi_unit_t *ui = MDI_UNIT(mnum); ms_unit_t *un; int err = 0; set_t setno; /* * When doing an open of a multi owner metadevice, check to see if this * node is a starting node and if a reconfig cycle is underway. * If so, the system isn't sufficiently set up enough to handle the * open (which involves I/O during sp_validate), so fail with ENXIO. */ setno = MD_MIN2SET(mnum); if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) == (MD_SET_MNSET | MD_SET_MN_START_RC)) { return (ENXIO); } /* single thread */ un = (ms_unit_t *)md_unit_openclose_enter(ui); /* open devices, if necessary */ if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) { if ((err = stripe_open_all_devs(un, md_oflags)) != 0) { goto out; } } /* count open */ if ((err = md_unit_incopen(mnum, flag, otyp)) != 0) goto out; /* unlock, return success */ out: md_unit_openclose_exit(ui); return (err); } /*ARGSUSED1*/ static int stripe_close( dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags ) { minor_t mnum = getminor(dev); mdi_unit_t *ui = MDI_UNIT(mnum); ms_unit_t *un; int err = 0; /* single thread */ un = (ms_unit_t *)md_unit_openclose_enter(ui); /* count closed */ if ((err = md_unit_decopen(mnum, otyp)) != 0) goto out; /* close devices, if necessary */ if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) { stripe_close_all_devs(un, md_cflags); } /* unlock, return success */ out: md_unit_openclose_exit(ui); return (err); } static struct buf dumpbuf; /* * This routine dumps memory to the disk. It assumes that the memory has * already been mapped into mainbus space. It is called at disk interrupt * priority when the system is in trouble. * */ static int stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk) { ms_unit_t *un; buf_t *bp; ms_comp_t *mdc; u_longlong_t nb; diskaddr_t mapblk; int result; int more; int saveresult = 0; /* * Don't need to grab the unit lock. * Cause nothing else is suppose to be happenning. * Also dump is not suppose to sleep. 
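	 * (The single static dumpbuf is reused for every fragment;
	 * md_mapbuf() is called in a loop, as in md_stripe_strategy(),
	 * until it reports that no fragments remain.)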
*/ un = (ms_unit_t *)MD_UNIT(getminor(dev)); if ((diskaddr_t)blkno >= un->c.un_total_blocks) return (EINVAL); if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks) return (EINVAL); bp = &dumpbuf; nb = ldbtob(nblk); do { bzero((caddr_t)bp, sizeof (*bp)); more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc); nblk = btodb(bp->b_bcount); mapblk = bp->b_lblkno; if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) { /* * bdev_dump() is currently only able to take * 32 bit wide blkno's. */ result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk, nblk); if (result) saveresult = result; } nb -= bp->b_bcount; addr += bp->b_bcount; blkno += nblk; } while (more); return (saveresult); } /*ARGSUSED*/ static intptr_t stripe_shared_by_blk( md_dev64_t dev, void *junk, diskaddr_t blkno, u_longlong_t *cnt) { ms_unit_t *un; buf_t bp; ms_comp_t *comp; un = MD_UNIT(md_getminor(dev)); (void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp); *cnt = (u_longlong_t)lbtodb(bp.b_bcount); return ((intptr_t)&comp->un_mirror); } /* * stripe_block_count_skip_size() returns the following values * so that the logical to physical block mappings can * be calculated without intimate knowledge of the underpinnings. * * block - first logical block number of the device. * block = [ # of blocks before THE row ] + * [ # of blocks in THE row before the component ] * count - # of segments (interlaced size). * skip - # of logical blocks between segments, or delta to * get to next segment * size - interlace size used for the block, count, skip. */ /*ARGSUSED*/ static intptr_t stripe_block_count_skip_size( md_dev64_t dev, void *junk, int ci, diskaddr_t *block, size_t *count, u_longlong_t *skip, u_longlong_t *size) { ms_unit_t *un; int row; struct ms_row *mdr; int cmpcount = 0; un = MD_UNIT(md_getminor(dev)); for (row = 0; row < un->un_nrows; row++) { mdr = &un->un_row[row]; if ((mdr->un_ncomp + cmpcount) > ci) break; cmpcount += mdr->un_ncomp; } ASSERT(row != un->un_nrows); /* * Concatenations are always contiguous blocks, * you cannot depend on the interlace being a usable * value (except for stripes). */ if (mdr->un_ncomp == 1) { /* Concats */ *block = mdr->un_cum_blocks - mdr->un_blocks; *count = 1; *skip = 0; *size = mdr->un_blocks; } else { /* Stripes */ *block = (mdr->un_cum_blocks - mdr->un_blocks) + ((ci - cmpcount) * mdr->un_interlace); *count = (size_t)(mdr->un_blocks / (mdr->un_interlace * mdr->un_ncomp)); *skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace; *size = mdr->un_interlace; } return (0); } /*ARGSUSED*/ static intptr_t stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx) { ms_unit_t *un; ms_comp_t *comp; un = MD_UNIT(md_getminor(dev)); comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); comp += indx; return ((intptr_t)&comp->un_mirror); } /*ARGSUSED*/ intptr_t stripe_component_count(md_dev64_t dev, void *junk) { /* * See comments for stripe_get_dev */ ms_unit_t *un; int count = 0; int row; un = MD_UNIT(md_getminor(dev)); for (row = 0; row < un->un_nrows; row++) count += un->un_row[row].un_ncomp; return (count); } /*ARGSUSED*/ intptr_t stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd) { /* * It should be noted that stripe_replace in stripe_ioctl.c calls this * routine using makedevice(0, minor) for the first argument. * * If this routine at some point in the future needs to use the major * number stripe_replace must be changed. 
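	 * (Only the minor number is used today, via md_getminor(dev)
	 * below, so the zero major passed by stripe_replace is
	 * harmless.)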
*/ ms_unit_t *un; ms_comp_t *comp; md_dev64_t tmpdev; un = MD_UNIT(md_getminor(dev)); comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); comp += indx; tmpdev = comp->un_dev; /* * Try to resolve devt again if NODEV64 * Check if this comp is hotspared and if it is * then use key for hotspare */ if (tmpdev == NODEV64) { tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev, comp->un_mirror.ms_hs_id ? comp->un_mirror.ms_hs_key : comp->un_key); comp->un_dev = tmpdev; } cd->cd_dev = comp->un_dev; cd->cd_orig_dev = comp->un_mirror.ms_orig_dev; return (0); } /*ARGSUSED*/ void stripe_replace_done(md_dev64_t dev, sv_dev_t *sv) { /* * See comments for stripe_get_dev */ minor_t mnum = md_getminor(dev); if (sv != NULL) { md_rem_names(sv, 1); kmem_free(sv, sizeof (sv_dev_t)); } md_unit_writerexit(MDI_UNIT(mnum)); } /*ARGSUSED*/ intptr_t stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd, mddb_recid_t *recids, int nrecids, void (**replace_done)(), void **replace_data) { minor_t mnum; ms_unit_t *un; mdi_unit_t *ui; ms_comp_t *comp; diskaddr_t dev_size; int row; int ncomps = 0; int cmpcount = 0; int rid = 0; struct ms_row *mdr; sv_dev_t *sv = NULL; mddb_recid_t hs_id = 0; set_t setno; side_t side; md_dev64_t this_dev; mnum = md_getminor(dev); ui = MDI_UNIT(mnum); setno = MD_MIN2SET(mnum); side = mddb_getsidenum(setno); un = md_unit_writerlock(ui); *replace_data = NULL; comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); comp += ci; /* * Count the number of components */ for (row = 0; row < un->un_nrows; row++) { struct ms_row *mdr = &un->un_row[row]; ncomps += mdr->un_ncomp; } recids[0] = 0; /* * No need of checking size of new device, * when hotsparing (it has already been done), or * when enabling the device. */ if ((nd != NULL) && (nd->nd_hs_id == 0)) { for (row = 0; row < un->un_nrows; row++) { mdr = &un->un_row[row]; if ((mdr->un_ncomp + cmpcount) > ci) break; cmpcount += mdr->un_ncomp; } ASSERT(row != un->un_nrows); /* Concatenations have a ncomp = 1 */ dev_size = mdr->un_blocks / mdr->un_ncomp; /* * now check to see if new comp can be used in * place of old comp */ if ((un->c.un_flag & MD_LABELED) && (ci == 0) && nd->nd_labeled) nd->nd_start_blk = 0; else nd->nd_nblks -= nd->nd_start_blk; if (dev_size > nd->nd_nblks) { md_unit_writerexit(ui); return (MDE_COMP_TOO_SMALL); } sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP); sv->setno = MD_MIN2SET(mnum); sv->key = comp->un_key; } /* * Close this component. */ if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) { md_layered_close(comp->un_dev, MD_OFLG_NULL); comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN; } /* * If the component is hotspared, return to the pool. */ if (comp->un_mirror.ms_hs_id != 0) { hs_cmds_t cmd; mdkey_t hs_key; hs_key = comp->un_mirror.ms_hs_key; comp->un_dev = comp->un_mirror.ms_orig_dev; comp->un_start_block = comp->un_mirror.ms_orig_blk; comp->un_mirror.ms_hs_key = 0; comp->un_mirror.ms_hs_id = 0; comp->un_mirror.ms_orig_dev = 0; cmd = HS_FREE; if ((comp->un_mirror.ms_state != CS_OKAY) && (comp->un_mirror.ms_state != CS_RESYNC)) cmd = HS_BAD; (void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id, &hs_key, NULL, NULL); } /* * Open by device id; for enable (indicated by a NULL * nd pointer), use the existing component info. For * replace, use the new device. */ if (nd == NULL) { this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key); /* * If someone replaced a new disk in the same slot * we get NODEV64 since old device id cannot be * resolved. 
The new devt is obtained from the * mddb since devt is going to be unchanged for the * enable case. No need to check for multiple * keys here because the caller (comp_replace) * has already sanity checked it for us. */ if (this_dev == NODEV64) { this_dev = md_getdevnum(setno, side, comp->un_key, MD_TRUST_DEVT); } } else { /* * If this is a hotspare, save the original dev_t for later * use. If this has occured during boot then the value of * comp->un_dev will be NODEV64 because of the failure to look * up the devid of the device. */ if (nd->nd_hs_id != 0) comp->un_mirror.ms_orig_dev = comp->un_dev; this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key); } comp->un_dev = this_dev; /* * Now open the new device if required. Note for a single component * stripe it will not be open - leave this for the mirror driver to * deal with. */ if (md_unit_isopen(ui)) { if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) { mddb_recid_t ids[3]; ids[0] = un->c.un_record_id; ids[1] = hs_id; ids[2] = 0; mddb_commitrecs_wrapper(ids); if ((nd != NULL) && (nd->nd_hs_id != 0)) { /* * Revert back to the original device. */ comp->un_dev = comp->un_mirror.ms_orig_dev; cmn_err(CE_WARN, "md: %s: open error of hotspare %s", md_shortname(mnum), md_devname(MD_MIN2SET(mnum), nd->nd_dev, NULL, 0)); SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev); } md_unit_writerexit(ui); return (MDE_COMP_OPEN_ERR); } if (nd != NULL) nd->nd_dev = this_dev; comp->un_mirror.ms_flags |= MDM_S_ISOPEN; } if (nd == NULL) { recids[0] = un->c.un_record_id; recids[1] = hs_id; recids[2] = 0; *replace_done = stripe_replace_done; return (0); } /* if hot sparing this device */ if (nd->nd_hs_id != 0) { char devname[MD_MAX_CTDLEN]; char hs_devname[MD_MAX_CTDLEN]; set_t setno; comp->un_mirror.ms_hs_id = nd->nd_hs_id; comp->un_mirror.ms_hs_key = nd->nd_key; comp->un_mirror.ms_orig_blk = comp->un_start_block; setno = MD_MIN2SET(mnum); (void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname, sizeof (devname)); (void) md_devname(setno, nd->nd_dev, hs_devname, sizeof (hs_devname)); cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s", md_shortname(mnum), devname, hs_devname); } else { /* replacing the device */ comp->un_key = nd->nd_key; *replace_data = (void *)sv; /* * For the old device, make sure to reset the parent * if it's a metadevice. */ if (md_getmajor(comp->un_dev) == md_major) { minor_t comp_mnum = md_getminor(comp->un_dev); md_unit_t *comp_un = MD_UNIT(comp_mnum); md_reset_parent(comp->un_dev); recids[rid++] = MD_RECID(comp_un); } } comp->un_dev = nd->nd_dev; comp->un_start_block = nd->nd_start_blk; /* * For the new device, make sure to set the parent if it's a * metadevice. * * If we ever support using metadevices as hot spares, this * will need to be tested, and possibly moved into the * preceding "else" clause, immediately following the parent * reset block. For now, it's convenient to leave it here and * only compress nd->nd_dev once. 
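	 * (At this point comp->un_dev has just been set to nd->nd_dev,
	 * so the check below operates on the new hot spare or
	 * replacement device.)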
*/ if (md_getmajor(comp->un_dev) == md_major) { minor_t comp_mnum = md_getminor(comp->un_dev); md_unit_t *comp_un = MD_UNIT(comp_mnum); md_set_parent(comp->un_dev, MD_SID(un)); recids[rid++] = MD_RECID(comp_un); } recids[rid++] = un->c.un_record_id; recids[rid++] = hs_id; recids[rid] = 0; *replace_done = stripe_replace_done; return (0); } /*ARGSUSED*/ static intptr_t stripe_hotspare_dev( md_dev64_t dev, void *junk, int ci, mddb_recid_t *recids, int nrecids, void (**replace_done)(), void **replace_data) { ms_unit_t *un; mdi_unit_t *ui; ms_comp_t *comp; int row; struct ms_row *mdr; ms_new_dev_t nd; int err; int i; minor_t mnum; set_t setno; int cmpcount = 0; mnum = md_getminor(dev); ui = MDI_UNIT(mnum); un = MD_UNIT(mnum); setno = MD_MIN2SET(mnum); if (md_get_setstatus(setno) & MD_SET_STALE) return (1); if (un->un_hsp_id == -1) return (1); for (row = 0; row < un->un_nrows; row++) { mdr = &un->un_row[row]; if ((mdr->un_ncomp + cmpcount) > ci) break; cmpcount += mdr->un_ncomp; } ASSERT(row != un->un_nrows); comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]); comp += ci; /* Concatenations have a ncomp = 1 */ nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp; if ((un->c.un_flag & MD_LABELED) && (ci == 0)) nd.nd_labeled = 1; else nd.nd_labeled = 0; again: err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks, nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev, &nd.nd_start_blk); if (err) { if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids, replace_done, replace_data)) { mddb_commitrecs_wrapper(recids); md_unit_writerexit(ui); } recids[0] = 0; return (1); } if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids, replace_done, replace_data)) { (void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0, &nd.nd_hs_id, &nd.nd_key, NULL, NULL); mddb_commitrec_wrapper(nd.nd_hs_id); goto again; } /* Leave a slot for the null recid */ for (i = 0; i < (nrecids - 1); i++) { if (recids[i] == 0) { recids[i++] = nd.nd_hs_id; recids[i] = 0; } } return (0); } static int stripe_imp_set( set_t setno ) { mddb_recid_t recid; int i, row, c, gotsomething; mddb_type_t typ1; mddb_de_ic_t *dep; mddb_rb32_t *rbp; ms_unit32_od_t *un32; ms_unit_t *un64; minor_t *self_id; /* minor needs to be updated */ md_parent_t *parent_id; /* parent needs to be updated */ mddb_recid_t *record_id; /* record id needs to be updated */ mddb_recid_t *hsp_id; ms_comp32_od_t *comp32; ms_comp_t *comp64; gotsomething = 0; typ1 = (mddb_type_t)md_getshared_key(setno, stripe_md_ops.md_driver.md_drivername); recid = mddb_makerecid(setno, 0); while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) { if (mddb_getrecprivate(recid) & MD_PRV_GOTIT) continue; dep = mddb_getrecdep(recid); rbp = dep->de_rb; if (rbp->rb_revision == MDDB_REV_RB) { /* * Small device */ un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid); self_id = &(un32->c.un_self_id); parent_id = &(un32->c.un_parent); record_id = &(un32->c.un_record_id); hsp_id = &(un32->un_hsp_id); comp32 = (ms_comp32_od_t *)((void *)&((char *)un32) [un32->un_ocomp]); for (row = 0; row < un32->un_nrows; row++) { struct ms_row32_od *mdr = &un32->un_row[row]; for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) { ms_comp32_od_t *mdc; mdc = &comp32[c++]; if (!md_update_minor(setno, mddb_getsidenum (setno), mdc->un_key)) goto out; if (mdc->un_mirror.ms_hs_id != 0) mdc->un_mirror.ms_hs_id = MAKERECID( setno, mdc->un_mirror.ms_hs_id); } } } else { un64 = (ms_unit_t *)mddb_getrecaddr(recid); self_id = &(un64->c.un_self_id); parent_id = &(un64->c.un_parent); record_id = &(un64->c.un_record_id); 
			hsp_id = &(un64->un_hsp_id);
			comp64 = (ms_comp_t *)((void *)&((char *)un64)
			    [un64->un_ocomp]);
			for (row = 0; row < un64->un_nrows; row++) {
				struct ms_row *mdr = &un64->un_row[row];
				for (i = 0, c = mdr->un_icomp;
				    i < mdr->un_ncomp; i++) {
					ms_comp_t *mdc;

					mdc = &comp64[c++];
					if (!md_update_minor(setno,
					    mddb_getsidenum(setno),
					    mdc->un_key))
						goto out;
					if (mdc->un_mirror.ms_hs_id != 0)
						mdc->un_mirror.ms_hs_id =
						    MAKERECID(setno,
						    mdc->un_mirror.ms_hs_id);
				}
			}
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*hsp_id != -1)
			*hsp_id = MAKERECID(setno, DBID(*hsp_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));
		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t stripe_named_services[] = {
	{stripe_shared_by_blk,		"shared by blk"			},
	{stripe_shared_by_indx,		"shared by indx"		},
	{stripe_component_count,	"get component count"		},
	{stripe_block_count_skip_size,	"get block count skip size"	},
	{stripe_get_dev,		"get device"			},
	{stripe_replace_dev,		"replace device"		},
	{stripe_hotspare_dev,		"hotspare device"		},
	{stripe_rename_check,		MDRNM_CHECK			},
	{NULL,				0				}
};

md_ops_t stripe_md_ops = {
	stripe_open,		/* open */
	stripe_close,		/* close */
	md_stripe_strategy,	/* strategy */
	NULL,			/* print */
	stripe_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_stripe_ioctl,	/* stripe_ioctl, */
	stripe_snarf,		/* stripe_snarf */
	stripe_halt,		/* stripe_halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	stripe_imp_set,		/* import set */
	stripe_named_services
};

static void
init_init()
{
	md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t);

	stripe_parent_cache = kmem_cache_create("md_stripe_parent",
	    sizeof (md_sps_t), 0, stripe_parent_constructor,
	    stripe_parent_destructor, stripe_run_queue, NULL, NULL, 0);
	stripe_child_cache = kmem_cache_create("md_stripe_child",
	    sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0,
	    stripe_child_constructor, stripe_child_destructor,
	    stripe_run_queue, NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(stripe_parent_cache);
	kmem_cache_destroy(stripe_child_cache);
	stripe_parent_cache = stripe_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("stripes module %I%", init_init(), fini_uninit())