1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
25 */
26
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/file.h>
31 #include <sys/user.h>
32 #include <sys/uio.h>
33 #include <sys/t_lock.h>
34 #include <sys/buf.h>
35 #include <sys/dkio.h>
36 #include <sys/vtoc.h>
37 #include <sys/kmem.h>
38 #include <vm/page.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
43 #include <sys/stat.h>
44 #include <sys/open.h>
45 #include <sys/lvm/mdio.h>
46 #include <sys/lvm/mdvar.h>
47 #include <sys/lvm/md_stripe.h>
48 #include <sys/lvm/md_convert.h>
49 #include <sys/lvm/md_notify.h>
50 #include <sys/modctl.h>
51 #include <sys/ddi.h>
52 #include <sys/sunddi.h>
53 #include <sys/debug.h>
54 #include <sys/sysevent/eventdefs.h>
55 #include <sys/sysevent/svm.h>
56
57 md_ops_t stripe_md_ops;
58 #ifndef lint
59 md_ops_t *md_interface_ops = &stripe_md_ops;
60 #endif
61
62 extern unit_t md_nunits;
63 extern set_t md_nsets;
64 extern md_set_t md_set[];
65
66 extern kmutex_t md_mx;
67 extern kcondvar_t md_cv;
68
69 extern int md_status;
70 extern major_t md_major;
71 extern mdq_anchor_t md_done_daemon;
72
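/*
 * md_stripe_mcs_buf_off is the byte offset of the child buf within
 * md_scs_t (set up in init_init()); the caches below hold the
 * per-request parent (md_sps_t) and child (md_scs_t) structures.
 */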
73 static int md_stripe_mcs_buf_off;
74 static kmem_cache_t *stripe_parent_cache = NULL;
75 static kmem_cache_t *stripe_child_cache = NULL;
76
77 /*ARGSUSED1*/
78 static int
79 stripe_parent_constructor(void *p, void *d1, int d2)
80 {
81 mutex_init(&((md_sps_t *)p)->ps_mx,
82 NULL, MUTEX_DEFAULT, NULL);
83 return (0);
84 }
85
86 static void
87 stripe_parent_init(void *ps)
88 {
89 bzero(ps, offsetof(md_sps_t, ps_mx));
90 }
91
92 /*ARGSUSED1*/
93 static void
94 stripe_parent_destructor(void *p, void *d)
95 {
96 mutex_destroy(&((md_sps_t *)p)->ps_mx);
97 }
98
99 /*ARGSUSED1*/
100 static int
101 stripe_child_constructor(void *p, void *d1, int d2)
102 {
103 bioinit(&((md_scs_t *)p)->cs_buf);
104 return (0);
105 }
106
107 static void
108 stripe_child_init(md_scs_t *cs)
109 {
110 cs->cs_mdunit = 0;
111 cs->cs_ps = NULL;
112 cs->cs_comp = NULL;
113 md_bioreset(&cs->cs_buf);
114 }
115
116 /*ARGSUSED1*/
117 static void
118 stripe_child_destructor(void *p, void *d)
119 {
120 biofini(&((md_scs_t *)p)->cs_buf);
121 }
122
123 /*ARGSUSED*/
124 static void
125 stripe_run_queue(void *d)
126 {
127 if (!(md_status & MD_GBL_DAEMONS_LIVE))
128 md_daemon(1, &md_done_daemon);
129 }
130
131 static void
132 stripe_close_all_devs(ms_unit_t *un, int md_cflags)
133 {
134 int row;
135 int i;
136 int c;
137 struct ms_comp *mdcomp;
138
139 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
140 for (row = 0; row < un->un_nrows; row++) {
141 struct ms_row *mdr = &un->un_row[row];
142 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
143 struct ms_comp *mdc;
144 mdc = &mdcomp[c++];
145 if (md_cflags & MD_OFLG_PROBEDEV) {
146
147 /*
148 * It is possible that the md_layered_open
149 * failed because the stripe unit structure
150 * contained a NODEV. In such a case, since
151 * there is nothing to open, there is nothing
152 * to close.
153 */
154 if (mdc->un_dev == NODEV64)
155 continue;
156 }
157 if ((md_cflags & MD_OFLG_PROBEDEV) &&
158 (mdc->un_mirror.ms_flags & MDM_S_PROBEOPEN)) {
159 md_layered_close(mdc->un_dev,
160 md_cflags);
161 mdc->un_mirror.ms_flags &= ~MDM_S_PROBEOPEN;
162 } else if (mdc->un_mirror.ms_flags & MDM_S_ISOPEN) {
163 md_layered_close(mdc->un_dev, md_cflags);
164 mdc->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
165 }
166 }
167 }
168 }
169
170 static int
171 stripe_open_all_devs(ms_unit_t *un, int md_oflags)
172 {
173 minor_t mnum = MD_SID(un);
174 int row;
175 int i;
176 int c;
177 struct ms_comp *mdcomp;
178 int err;
179 int cont_on_errors = (md_oflags & MD_OFLG_CONT_ERRS);
180 int probe_err_cnt = 0;
181 int total_comp_cnt = 0;
182 set_t setno = MD_MIN2SET(MD_SID(un));
183 side_t side = mddb_getsidenum(setno);
184 mdkey_t key;
185
186 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
187
188 /*
189 * For a probe call, if any component of a stripe or a concat
190 * can be opened, it is considered to be a success. The total number
191 * of components in a stripe is computed prior to starting a probe.
192 * This number is then compared against the number of components
193 * that could be successfully opened. If none of the components
194 * in a stripe can be opened, only then is ENXIO returned for a
195 * probe-type open.
196 */
197
198 for (row = 0; row < un->un_nrows; row++) {
199 struct ms_row *mdr = &un->un_row[row];
200
201 if (md_oflags & MD_OFLG_PROBEDEV)
202 total_comp_cnt += mdr->un_ncomp;
203
204 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
205 struct ms_comp *mdc;
206 md_dev64_t tmpdev;
207
208 mdc = &mdcomp[c++];
209 tmpdev = mdc->un_dev;
210 /*
211 * Do the open by device id
212 * Check if this comp is hotspared and
213 * if it is then use the key for hotspare.
214 * MN disksets don't use devids, so don't use
215 * md_devid_found/md_resolve_bydevid here. Instead, do
216 * what's done in stripe_build_incore().
217 */
218 if (MD_MNSET_SETNO(setno)) {
219 if (mdc->un_mirror.ms_hs_id != 0) {
220 (void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
221 0, &mdc->un_mirror.ms_hs_id, NULL,
222 &tmpdev, NULL);
223 }
224 } else {
225 key = mdc->un_mirror.ms_hs_id ?
226 mdc->un_mirror.ms_hs_key : mdc->un_key;
227 if ((md_getmajor(tmpdev) != md_major) &&
228 md_devid_found(setno, side, key) == 1) {
229 tmpdev = md_resolve_bydevid(mnum,
230 tmpdev, key);
231 }
232 }
233
234 /*
235 * For a submirror, we only want to open those devices
236 * that are not errored. If the device is errored, then
237 * there is no reason to open it, and leaving it
238 * closed allows the RCM/DR code to work so that the
239 * errored device can be replaced.
240 */
241 if ((md_oflags & MD_OFLG_PROBEDEV) ||
242 ! (mdc->un_mirror.ms_state & CS_ERRED)) {
243
244 err = md_layered_open(mnum, &tmpdev, md_oflags);
245 } else {
246 err = ENXIO;
247 }
248
249 /*
250 * Only set the un_dev if the tmpdev != NODEV64. If
251 * it is NODEV64 then the md_layered_open() will have
252 * failed in some manner.
253 */
254 if (tmpdev != NODEV64)
255 mdc->un_dev = tmpdev;
256
257 if (err) {
258 if (!cont_on_errors) {
259 stripe_close_all_devs(un, md_oflags);
260 return (ENXIO);
261 }
262
263 if (md_oflags & MD_OFLG_PROBEDEV)
264 probe_err_cnt++;
265 } else {
266 if (md_oflags & MD_OFLG_PROBEDEV) {
267 mdc->un_mirror.ms_flags |=
268 MDM_S_PROBEOPEN;
269 } else
270 mdc->un_mirror.ms_flags |= MDM_S_ISOPEN;
271 }
272 }
273 }
274
275 /* If every component in a stripe could not be opened fail */
276 if ((md_oflags & MD_OFLG_PROBEDEV) &&
277 (probe_err_cnt == total_comp_cnt))
278 return (ENXIO);
279 else
280 return (0);
281 }
282
283 int
284 stripe_build_incore(void *p, int snarfing)
285 {
286 ms_unit_t *un = (ms_unit_t *)p;
287 struct ms_comp *mdcomp;
288 minor_t mnum;
289 int row;
290 int i;
291 int c;
292 int ncomps;
293
294 mnum = MD_SID(un);
295
296 if (MD_UNIT(mnum) != NULL)
297 return (0);
298
299 MD_STATUS(un) = 0;
300
301 /*
302 * Reset all the is_open flags; these are probably set
303 * because they just came out of the database.
304 */
305 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
306
307 ncomps = 0;
308 for (row = 0; row < un->un_nrows; row++) {
309 struct ms_row *mdr = &un->un_row[row];
310 ncomps += mdr->un_ncomp;
311 }
312
313 for (row = 0; row < un->un_nrows; row++) {
314 struct ms_row *mdr = &un->un_row[row];
315 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
316 struct ms_comp *mdc;
317 set_t setno;
318 md_dev64_t tmpdev;
319
320 mdc = &mdcomp[c++];
321 mdc->un_mirror.ms_flags &=
322 ~(MDM_S_ISOPEN | MDM_S_IOERR | MDM_S_RS_TRIED);
323
324 if (!snarfing)
325 continue;
326
327 setno = MD_MIN2SET(mnum);
328
329 tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
330 mdc->un_key, MD_NOTRUST_DEVT);
331 mdc->un_dev = tmpdev;
332 /*
333 * Check for hotspares. If the hotspares haven't been
334 * snarfed yet, stripe_open_all_devs() will do the
335 * remapping of the dev's later.
336 */
337 if (mdc->un_mirror.ms_hs_id != 0) {
338 mdc->un_mirror.ms_orig_dev = mdc->un_dev;
339 (void) md_hot_spare_ifc(HS_MKDEV, 0, 0,
340 0, &mdc->un_mirror.ms_hs_id, NULL,
341 &tmpdev, NULL);
342 mdc->un_dev = tmpdev;
343 }
344 }
345 }
346
347 /* place various information in the in-core data structures */
348 md_nblocks_set(mnum, un->c.un_total_blocks);
349 MD_UNIT(mnum) = un;
350
351 return (0);
352 }
353
354 void
355 reset_stripe(ms_unit_t *un, minor_t mnum, int removing)
356 {
357 ms_comp_t *mdcomp;
358 struct ms_row *mdr;
359 int i, c;
360 int row;
361 int nsv;
362 int isv;
363 sv_dev_t *sv;
364 mddb_recid_t *recids;
365 mddb_recid_t vtoc_id;
366 int rid = 0;
367
368 md_destroy_unit_incore(mnum, &stripe_md_ops);
369
370 md_nblocks_set(mnum, -1ULL);
371 MD_UNIT(mnum) = NULL;
372
373 /*
374 * Attempt release of its minor node
375 */
376 md_remove_minor_node(mnum);
377
378 if (!removing)
379 return;
380
381 nsv = 0;
382 /* Count the number of devices */
383 for (row = 0; row < un->un_nrows; row++) {
384 mdr = &un->un_row[row];
385 nsv += mdr->un_ncomp;
386 }
387 sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t) * nsv, KM_SLEEP);
388
389 /*
390 * allocate recids array. since we may have to commit
391 * underlying soft partition records, we need an array
392 * of size: total number of components in stripe + 3
393 * (one for the stripe itself, one for the hotspare, one
394 * for the end marker).
395 */
396 recids = kmem_alloc(sizeof (mddb_recid_t) * (nsv + 3), KM_SLEEP);
397
398 /*
399 * Save the md_dev64_t's and driver nm indexes,
400 * because after the mddb_deleterec() we will
401 * not be able to access the unit structure.
402 *
403 * NOTE: Deleting the names before deleting the
404 * unit structure would cause problems if
405 * the machine crashed in between the two.
406 */
407 isv = 0;
408 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
409
410 for (row = 0; row < un->un_nrows; row++) {
411 mdr = &un->un_row[row];
412 for (i = 0, c = mdr->un_icomp; i < mdr->un_ncomp; i++) {
413 struct ms_comp *mdc;
414 md_dev64_t child_dev;
415 md_unit_t *child_un;
416
417 mdc = &mdcomp[c++];
418 if (mdc->un_mirror.ms_hs_id != 0) {
419 mdkey_t hs_key;
420
421 hs_key = mdc->un_mirror.ms_hs_key;
422
423 mdc->un_dev = mdc->un_mirror.ms_orig_dev;
424 mdc->un_start_block =
425 mdc->un_mirror.ms_orig_blk;
426 mdc->un_mirror.ms_hs_id = 0;
427 mdc->un_mirror.ms_hs_key = 0;
428 mdc->un_mirror.ms_orig_dev = 0;
429 recids[0] = 0;
430 recids[1] = 0; /* recids[1] filled in below */
431 recids[2] = 0;
432 (void) md_hot_spare_ifc(HS_FREE, un->un_hsp_id,
433 0, 0, &recids[0], &hs_key, NULL, NULL);
434 mddb_commitrecs_wrapper(recids);
435 }
436
437 /*
438 * check if we've got metadevice below us and
439 * deparent it if we do.
440 * NOTE: currently soft partitions are
441 * the only metadevices stripes can be
442 * built on top of.
443 */
444 child_dev = mdc->un_dev;
445 if (md_getmajor(child_dev) == md_major) {
446 child_un = MD_UNIT(md_getminor(child_dev));
447 md_reset_parent(child_dev);
448 recids[rid++] = MD_RECID(child_un);
449 }
450
451 sv[isv].setno = MD_MIN2SET(mnum);
452 sv[isv++].key = mdc->un_key;
453 }
454 }
455
456 recids[rid++] = un->c.un_record_id;
457 recids[rid] = 0; /* filled in below */
458
459 /*
460 * Decrement the HSP reference count and
461 * remove the knowledge of the HSP from the unit struct.
462 * This is done atomically to remove a window.
463 */
464 if (un->un_hsp_id != -1) {
465 (void) md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
466 &recids[rid++], NULL, NULL, NULL);
467 un->un_hsp_id = -1;
468 }
469
470 /* set end marker and commit records */
471 recids[rid] = 0;
472 mddb_commitrecs_wrapper(recids);
473
474 vtoc_id = un->c.un_vtoc_id;
475
476 /*
477 * Remove self from the namespace
478 */
479 if (un->c.un_revision & MD_FN_META_DEV) {
480 (void) md_rem_selfname(un->c.un_self_id);
481 }
482
483 /* Remove the unit structure */
484 mddb_deleterec_wrapper(un->c.un_record_id);
485
486 /* Remove the vtoc, if present */
487 if (vtoc_id)
488 mddb_deleterec_wrapper(vtoc_id);
489
490 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
491 MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));
492 md_rem_names(sv, nsv);
493 kmem_free(sv, sizeof (sv_dev_t) * nsv);
494 kmem_free(recids, sizeof (mddb_recid_t) * (nsv + 3));
495 }
496
497 static void
498 stripe_error(md_sps_t *ps)
499 {
500 struct buf *pb = ps->ps_bp;
501 mdi_unit_t *ui = ps->ps_ui;
502 md_dev64_t dev = ps->ps_errcomp->un_dev;
503 md_dev64_t md_dev = md_expldev(pb->b_edev);
504 char *str;
505
506 if (pb->b_flags & B_READ) {
507 ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_READERR;
508 str = "read";
509 } else {
510 ps->ps_errcomp->un_mirror.ms_flags |= MDM_S_WRTERR;
511 str = "write";
512 }
513 if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
514 if (MUTEX_HELD(&ps->ps_mx)) {
515 mutex_exit(&ps->ps_mx);
516 }
517 } else {
518 ASSERT(panicstr);
519 }
520 SPS_FREE(stripe_parent_cache, ps);
521 pb->b_flags |= B_ERROR;
522
523 md_kstat_done(ui, pb, 0);
524 md_unit_readerexit(ui);
525 md_biodone(pb);
526
527 cmn_err(CE_WARN, "md: %s: %s error on %s",
528 md_shortname(md_getminor(md_dev)), str,
529 md_devname(MD_DEV2SET(md_dev), dev, NULL, 0));
530 }
531
532 static int
533 stripe_done(struct buf *cb)
534 {
535 struct buf *pb;
536 mdi_unit_t *ui;
537 md_sps_t *ps;
538 md_scs_t *cs;
539
540 /*LINTED*/
541 cs = (md_scs_t *)((caddr_t)cb - md_stripe_mcs_buf_off);
542 ps = cs->cs_ps;
543 pb = ps->ps_bp;
544
545 mutex_enter(&ps->ps_mx);
546 if (cb->b_flags & B_ERROR) {
547 ps->ps_flags |= MD_SPS_ERROR;
548 pb->b_error = cb->b_error;
549 ps->ps_errcomp = cs->cs_comp;
550 }
551
552 if (cb->b_flags & B_REMAPPED)
553 bp_mapout(cb);
554
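/* The last child buf to complete finishes off the parent request. */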
555 ps->ps_frags--;
556 if (ps->ps_frags != 0) {
557 mutex_exit(&ps->ps_mx);
558 kmem_cache_free(stripe_child_cache, cs);
559 return (1);
560 }
561 kmem_cache_free(stripe_child_cache, cs);
562 if (ps->ps_flags & MD_SPS_ERROR) {
563 stripe_error(ps);
564 return (1);
565 }
566 ui = ps->ps_ui;
567 if (!(ps->ps_flags & MD_SPS_DONTFREE)) {
568 mutex_exit(&ps->ps_mx);
569 } else {
570 ASSERT(panicstr);
571 }
572 SPS_FREE(stripe_parent_cache, ps);
573 md_kstat_done(ui, pb, 0);
574 md_unit_readerexit(ui);
575 md_biodone(pb);
576 return (0);
577 }
578
579
580 /*
581 * This routine does the mapping from virtual (dev, blkno) of a metapartition
582 * to the real (dev, blkno) of a real disk partition.
583 * It goes to the md_conf[] table to find out the correct real partition
584 * dev and block number for this buffer.
585 *
586 * A single buf request cannot go across a real disk partition boundary.
587 * When the virtual request specified by (dev, blkno) spans more than one
588 * real partition, md_mapbuf will return 1. Then the caller should prepare
589 * another real buf and continue calling md_mapbuf to do the mapping until
590 * it returns 0.
591 *
592 */
593
594 static int
595 md_mapbuf(
596 ms_unit_t *un,
597 diskaddr_t blkno,
598 u_longlong_t bcount,
599 buf_t *bp, /* if bp==NULL, skip bp updates */
600 ms_comp_t **mdc) /* if bp==NULL, skip mdc update */
601 {
602 struct ms_row *mdr;
603 struct ms_comp *mdcomp;
604 diskaddr_t stripe_blk;
605 diskaddr_t fragment, blk_in_row, endblk;
606 offset_t interlace;
607 size_t dev_index;
608 int row_index, more;
609 extern unsigned md_maxphys;
610 /* Work var's when bp==NULL */
611 u_longlong_t wb_bcount;
612 diskaddr_t wb_blkno;
613 md_dev64_t wb_edev;
614 ms_comp_t *wmdc;
615
616 /*
617 * Do a real calculation to derive the minor device of the
618 * Virtual Disk, which in turn will let us derive the
619 * device/minor of the underlying real device.
620 */
621
622
623 for (row_index = 0; row_index < un->un_nrows; row_index++) {
624 mdr = &un->un_row[row_index];
625 if (blkno < mdr->un_cum_blocks)
626 break;
627 }
628 ASSERT(row_index != un->un_nrows);
629
630 mdcomp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
631
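/*
 * un_cum_blocks is the running total of blocks up to and including
 * this row, so subtracting (un_cum_blocks - un_blocks) yields the
 * offset of blkno within the row.
 */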
632 blk_in_row = blkno - mdr->un_cum_blocks + mdr->un_blocks;
633 endblk = (diskaddr_t)(blkno + howmany(bcount, DEV_BSIZE));
634 if (mdr->un_ncomp == 1) { /* No striping */
635 if (endblk > mdr->un_cum_blocks) {
636 wb_bcount = ldbtob(mdr->un_cum_blocks - blkno);
637 if ((row_index + 1) == un->un_nrows)
638 more = 0;
639 else
640 more = 1;
641 } else {
642 wb_bcount = bcount;
643 more = 0;
644 }
645 wmdc = &mdcomp[mdr->un_icomp];
646 wb_blkno = blk_in_row;
647 } else { /* Have striping */
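/*
 * Striping math: within the row, fragment is the offset into the
 * current interlace chunk, stripe_blk is the chunk index, dev_index
 * selects the component (round robin), and the target block is the
 * chunk's position on that component plus the fragment.  For example,
 * with interlace = 32 and 3 components, blk_in_row = 100 gives
 * fragment = 4, stripe_blk = 3, dev_index = 0, and a component
 * offset of (3 / 3) * 32 + 4 = 36.
 */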
648 interlace = mdr->un_interlace;
649 fragment = blk_in_row % interlace;
650 if (bcount > ldbtob(interlace - fragment)) {
651 more = 1;
652 wb_bcount = ldbtob(interlace - fragment);
653 } else {
654 more = 0;
655 wb_bcount = bcount;
656 }
657
658 stripe_blk = blk_in_row / interlace;
659 dev_index = (size_t)(stripe_blk % mdr->un_ncomp);
660 wmdc = &mdcomp[mdr->un_icomp + dev_index];
661 wb_blkno = (diskaddr_t)(((stripe_blk / mdr->un_ncomp) *
662 interlace) + fragment);
663 }
664
665 wb_blkno += wmdc->un_start_block;
666 wb_edev = wmdc->un_dev;
667
668 /* only break up the I/O if we're not built on another metadevice */
669 if ((md_getmajor(wb_edev) != md_major) && (wb_bcount > md_maxphys)) {
670 wb_bcount = md_maxphys;
671 more = 1;
672 }
673 if (bp != (buf_t *)NULL) {
674 /*
675 * wb_bcount is limited by md_maxphys which is 'int'
676 */
677 bp->b_bcount = (size_t)wb_bcount;
678 bp->b_lblkno = wb_blkno;
679 bp->b_edev = md_dev64_to_dev(wb_edev);
680 *mdc = wmdc;
681 }
682 return (more);
683 }
684
685 static void
686 md_stripe_strategy(buf_t *pb, int flag, void *private)
687 {
688 md_sps_t *ps;
689 md_scs_t *cs;
690 int doing_writes;
691 int more;
692 ms_unit_t *un;
693 mdi_unit_t *ui;
694 size_t current_count;
695 diskaddr_t current_blkno;
696 off_t current_offset;
697 buf_t *cb; /* child buf pointer */
698 set_t setno;
699
700 setno = MD_MIN2SET(getminor(pb->b_edev));
701
702 /*
703 * When doing IO to a multi owner meta device, check if set is halted.
704 * We do this check without the needed lock held, for performance
705 * reasons.
706 * If an IO just slips through while the set is locked via an
707 * MD_MN_SUSPEND_SET, we don't care about it.
708 * Only check for a suspended set if we are a top-level i/o request
709 * (MD_STR_NOTTOP is cleared in 'flag').
710 */
711 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
712 (MD_SET_HALTED | MD_SET_MNSET)) {
713 if ((flag & MD_STR_NOTTOP) == 0) {
714 mutex_enter(&md_mx);
715 /* Here we loop until the set is no longer halted */
716 while (md_set[setno].s_status & MD_SET_HALTED) {
717 cv_wait(&md_cv, &md_mx);
718 }
719 mutex_exit(&md_mx);
720 }
721 }
722
723 ui = MDI_UNIT(getminor(pb->b_edev));
724
725 md_kstat_waitq_enter(ui);
726
727 un = (ms_unit_t *)md_unit_readerlock(ui);
728
729 if ((flag & MD_NOBLOCK) == 0) {
730 if (md_inc_iocount(setno) != 0) {
731 pb->b_flags |= B_ERROR;
732 pb->b_error = ENXIO;
733 pb->b_resid = pb->b_bcount;
734 md_kstat_waitq_exit(ui);
735 md_unit_readerexit(ui);
736 biodone(pb);
737 return;
738 }
739 } else {
740 md_inc_iocount_noblock(setno);
741 }
742
743 if (!(flag & MD_STR_NOTTOP)) {
744 if (md_checkbuf(ui, (md_unit_t *)un, pb) != 0) {
745 md_kstat_waitq_exit(ui);
746 return;
747 }
748 }
749
750 ps = kmem_cache_alloc(stripe_parent_cache, MD_ALLOCFLAGS);
751 stripe_parent_init(ps);
752
753 /*
754 * Save essential information from the original buffhdr
755 * in the md_save structure.
756 */
757 ps->ps_un = un;
758 ps->ps_ui = ui;
759 ps->ps_bp = pb;
760 ps->ps_addr = pb->b_un.b_addr;
761
762 if ((pb->b_flags & B_READ) == 0)
763 doing_writes = 1;
764 else
765 doing_writes = 0;
766
767
768 current_count = pb->b_bcount;
769 current_blkno = pb->b_lblkno;
770 current_offset = 0;
771
772 if (!(flag & MD_STR_NOTTOP) && panicstr)
773 ps->ps_flags |= MD_SPS_DONTFREE;
774
775 md_kstat_waitq_to_runq(ui);
776
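/*
 * Break the request into child bufs, one per contiguous chunk on a
 * single component.  md_mapbuf() returns 1 while more of the request
 * remains to be mapped; ps_frags counts the outstanding children so
 * stripe_done() knows when the whole parent request is complete.
 */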
777 ps->ps_frags++;
778 do {
779 cs = kmem_cache_alloc(stripe_child_cache, MD_ALLOCFLAGS);
780 stripe_child_init(cs);
781 cb = &cs->cs_buf;
782 cs->cs_ps = ps;
783 more = md_mapbuf(un, current_blkno, current_count, cb,
784 &cs->cs_comp);
785
786 cb = md_bioclone(pb, current_offset, cb->b_bcount, cb->b_edev,
787 cb->b_lblkno, stripe_done, cb, KM_NOSLEEP);
788 /*
789 * Do these calculations now,
790 * so that we pick up a valid b_bcount from the child buf.
791 */
792 current_offset += cb->b_bcount;
793 current_count -= cb->b_bcount;
794 current_blkno += (diskaddr_t)(lbtodb(cb->b_bcount));
795
796 if (more) {
797 mutex_enter(&ps->ps_mx);
798 ps->ps_frags++;
799 mutex_exit(&ps->ps_mx);
800 }
801
802 if (doing_writes &&
803 cs->cs_comp->un_mirror.ms_flags & MDM_S_NOWRITE) {
804 (void) stripe_done(cb);
805 continue;
806 }
807 md_call_strategy(cb, flag, private);
808 } while (more);
809
810 if (!(flag & MD_STR_NOTTOP) && panicstr) {
811 while (!(ps->ps_flags & MD_SPS_DONE)) {
812 md_daemon(1, &md_done_daemon);
813 drv_usecwait(10);
814 }
815 kmem_cache_free(stripe_parent_cache, ps);
816 }
817 }
818
819 static int
820 stripe_snarf(md_snarfcmd_t cmd, set_t setno)
821 {
822 ms_unit_t *un;
823 mddb_recid_t recid;
824 int gotsomething;
825 int all_stripes_gotten;
826 mddb_type_t typ1;
827 mddb_de_ic_t *dep;
828 mddb_rb32_t *rbp;
829 size_t newreqsize;
830 ms_unit_t *big_un;
831 ms_unit32_od_t *small_un;
832
833
834 if (cmd == MD_SNARF_CLEANUP)
835 return (0);
836
837 all_stripes_gotten = 1;
838 gotsomething = 0;
839
840 typ1 = (mddb_type_t)md_getshared_key(setno,
841 stripe_md_ops.md_driver.md_drivername);
842 recid = mddb_makerecid(setno, 0);
843
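/*
 * Walk every stripe record in this set; old small (32-bit) records
 * are converted to the big in-core form before the in-core unit is
 * built.
 */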
844 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
845 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
846 continue;
847
848 dep = mddb_getrecdep(recid);
849 dep->de_flags = MDDB_F_STRIPE;
850 rbp = dep->de_rb;
851
852 switch (rbp->rb_revision) {
853 case MDDB_REV_RB:
854 case MDDB_REV_RBFN:
855 if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
856 /*
857 * This means we have an old and small record
858 * and this record hasn't already been
859 * converted. Before we create an incore
860 * metadevice from this we have to convert it to
861 * a big record.
862 */
863 small_un =
864 (ms_unit32_od_t *)mddb_getrecaddr(recid);
865 newreqsize = get_big_stripe_req_size(small_un,
866 COMPLETE_STRUCTURE);
867 big_un = (ms_unit_t *)kmem_zalloc(newreqsize,
868 KM_SLEEP);
869 stripe_convert((caddr_t)small_un,
870 (caddr_t)big_un, SMALL_2_BIG);
871 kmem_free(small_un, dep->de_reqsize);
872 dep->de_rb_userdata = big_un;
873 dep->de_reqsize = newreqsize;
874 un = big_un;
875 rbp->rb_private |= MD_PRV_CONVD;
876 } else {
877 /* Small device had already been converted */
878 un = (ms_unit_t *)mddb_getrecaddr(recid);
879 }
880 un->c.un_revision &= ~MD_64BIT_META_DEV;
881 break;
882 case MDDB_REV_RB64:
883 case MDDB_REV_RB64FN:
884 /* Big device */
885 un = (ms_unit_t *)mddb_getrecaddr(recid);
886 un->c.un_revision |= MD_64BIT_META_DEV;
887 un->c.un_flag |= MD_EFILABEL;
888 break;
889 }
890 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
891
892 /* Create minor node for snarfed unit. */
893 (void) md_create_minor_node(MD_MIN2SET(MD_SID(un)), MD_SID(un));
894
895 if (MD_UNIT(MD_SID(un)) != NULL) {
896 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
897 continue;
898 }
899 all_stripes_gotten = 0;
900 if (stripe_build_incore((void *)un, 1) == 0) {
901 mddb_setrecprivate(recid, MD_PRV_GOTIT);
902 md_create_unit_incore(MD_SID(un), &stripe_md_ops, 0);
903 gotsomething = 1;
904 }
905 }
906
907 if (!all_stripes_gotten)
908 return (gotsomething);
909
910 recid = mddb_makerecid(setno, 0);
911 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0)
912 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
913 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
914
915 return (0);
916 }
917
918 static int
919 stripe_halt(md_haltcmd_t cmd, set_t setno)
920 {
921 int i;
922 mdi_unit_t *ui;
923 minor_t mnum;
924
925 if (cmd == MD_HALT_CLOSE)
926 return (0);
927
928 if (cmd == MD_HALT_OPEN)
929 return (0);
930
931 if (cmd == MD_HALT_UNLOAD)
932 return (0);
933
934 if (cmd == MD_HALT_CHECK) {
935 for (i = 0; i < md_nunits; i++) {
936 mnum = MD_MKMIN(setno, i);
937 if ((ui = MDI_UNIT(mnum)) == NULL)
938 continue;
939 if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
940 continue;
941 if (md_unit_isopen(ui))
942 return (1);
943 }
944 return (0);
945 }
946
947 if (cmd != MD_HALT_DOIT)
948 return (1);
949
950 for (i = 0; i < md_nunits; i++) {
951 mnum = MD_MKMIN(setno, i);
952 if ((ui = MDI_UNIT(mnum)) == NULL)
953 continue;
954 if (ui->ui_opsindex != stripe_md_ops.md_selfindex)
955 continue;
956 reset_stripe((ms_unit_t *)MD_UNIT(mnum), mnum, 0);
957 }
958
959 return (0);
960 }
961
962 /*ARGSUSED3*/
963 static int
964 stripe_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
965 {
966 minor_t mnum = getminor(*dev);
967 mdi_unit_t *ui = MDI_UNIT(mnum);
968 ms_unit_t *un;
969 int err = 0;
970 set_t setno;
971
972 /*
973 * When doing an open of a multi owner metadevice, check to see if this
974 * node is a starting node and if a reconfig cycle is underway.
975 * If so, the system isn't sufficiently set up to handle the
976 * open (which involves I/O during sp_validate), so fail with ENXIO.
977 */
978 setno = MD_MIN2SET(mnum);
979 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
980 (MD_SET_MNSET | MD_SET_MN_START_RC)) {
981 return (ENXIO);
982 }
983
984 /* single thread */
985 un = (ms_unit_t *)md_unit_openclose_enter(ui);
986
987 /* open devices, if necessary */
988 if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
989 if ((err = stripe_open_all_devs(un, md_oflags)) != 0) {
990 goto out;
991 }
992 }
993
994 /* count open */
995 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
996 goto out;
997
998 /* unlock, return success */
999 out:
1000 md_unit_openclose_exit(ui);
1001 return (err);
1002 }
1003
1004 /*ARGSUSED1*/
1005 static int
1006 stripe_close(
1007 dev_t dev,
1008 int flag,
1009 int otyp,
1010 cred_t *cred_p,
1011 int md_cflags
1012 )
1013 {
1014 minor_t mnum = getminor(dev);
1015 mdi_unit_t *ui = MDI_UNIT(mnum);
1016 ms_unit_t *un;
1017 int err = 0;
1018
1019 /* single thread */
1020 un = (ms_unit_t *)md_unit_openclose_enter(ui);
1021
1022 /* count closed */
1023 if ((err = md_unit_decopen(mnum, otyp)) != 0)
1024 goto out;
1025
1026 /* close devices, if necessary */
1027 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
1028 stripe_close_all_devs(un, md_cflags);
1029 }
1030
1031 /* unlock, return success */
1032 out:
1033 md_unit_openclose_exit(ui);
1034 return (err);
1035 }
1036
1037
1038 static struct buf dumpbuf;
1039
1040 /*
1041 * This routine dumps memory to the disk. It assumes that the memory has
1042 * already been mapped into mainbus space. It is called at disk interrupt
1043 * priority when the system is in trouble.
1044 *
1045 */
1046 static int
1047 stripe_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
1048 {
1049 ms_unit_t *un;
1050 buf_t *bp;
1051 ms_comp_t *mdc;
1052 u_longlong_t nb;
1053 diskaddr_t mapblk;
1054 int result;
1055 int more;
1056 int saveresult = 0;
1057
1058 /*
1059 * We don't need to grab the unit lock,
1060 * because nothing else is supposed to be happening.
1061 * Also, dump is not supposed to sleep.
1062 */
1063 un = (ms_unit_t *)MD_UNIT(getminor(dev));
1064
1065 if ((diskaddr_t)blkno >= un->c.un_total_blocks)
1066 return (EINVAL);
1067
1068 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
1069 return (EINVAL);
1070
1071 bp = &dumpbuf;
1072 nb = ldbtob(nblk);
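/*
 * Walk the request a piece at a time: md_mapbuf() maps each
 * contiguous chunk onto its underlying component and sets 'more'
 * while chunks remain; each chunk is written out via bdev_dump().
 */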
1073 do {
1074 bzero((caddr_t)bp, sizeof (*bp));
1075 more = md_mapbuf(un, (diskaddr_t)blkno, nb, bp, &mdc);
1076 nblk = btodb(bp->b_bcount);
1077 mapblk = bp->b_lblkno;
1078 if (!(mdc->un_mirror.ms_flags & MDM_S_NOWRITE)) {
1079 /*
1080 * bdev_dump() is currently only able to take
1081 * 32 bit wide blkno's.
1082 */
1083 result = bdev_dump(bp->b_edev, addr, (daddr_t)mapblk,
1084 nblk);
1085 if (result)
1086 saveresult = result;
1087 }
1088
1089 nb -= bp->b_bcount;
1090 addr += bp->b_bcount;
1091 blkno += nblk;
1092 } while (more);
1093
1094 return (saveresult);
1095 }
1096
1097 /*ARGSUSED*/
1098 static intptr_t
1099 stripe_shared_by_blk(
1100 md_dev64_t dev,
1101 void *junk,
1102 diskaddr_t blkno,
1103 u_longlong_t *cnt)
1104 {
1105 ms_unit_t *un;
1106 buf_t bp;
1107 ms_comp_t *comp;
1108
1109 un = MD_UNIT(md_getminor(dev));
1110 (void) md_mapbuf(un, blkno, ldbtob(*cnt), &bp, &comp);
1111 *cnt = (u_longlong_t)lbtodb(bp.b_bcount);
1112 return ((intptr_t)&comp->un_mirror);
1113 }
1114
1115 /*
1116 * stripe_block_count_skip_size() returns the following values
1117 * so that the logical to physical block mappings can
1118 * be calculated without intimate knowledge of the underpinnings.
1119 *
1120 * block - first logical block number of the device.
1121 * block = [ # of blocks before THE row ] +
1122 * [ # of blocks in THE row before the component ]
1123 * count - # of segments (interlaced size).
1124 * skip - # of logical blocks between segments, or delta to
1125 * get to next segment
1126 * size - interlace size used for the block, count, skip.
1127 */
1128 /*ARGSUSED*/
1129 static intptr_t
1130 stripe_block_count_skip_size(
1131 md_dev64_t dev,
1132 void *junk,
1133 int ci,
1134 diskaddr_t *block,
1135 size_t *count,
1136 u_longlong_t *skip,
1137 u_longlong_t *size)
1138 {
1139 ms_unit_t *un;
1140 int row;
1141 struct ms_row *mdr;
1142 int cmpcount = 0;
1143
1144 un = MD_UNIT(md_getminor(dev));
1145
1146 for (row = 0; row < un->un_nrows; row++) {
1147 mdr = &un->un_row[row];
1148 if ((mdr->un_ncomp + cmpcount) > ci)
1149 break;
1150 cmpcount += mdr->un_ncomp;
1151 }
1152 ASSERT(row != un->un_nrows);
1153
1154 /*
1155 * Concatenations are always contiguous blocks,
1156 * you cannot depend on the interlace being a usable
1157 * value (except for stripes).
1158 */
1159 if (mdr->un_ncomp == 1) { /* Concats */
1160 *block = mdr->un_cum_blocks - mdr->un_blocks;
1161 *count = 1;
1162 *skip = 0;
1163 *size = mdr->un_blocks;
1164 } else { /* Stripes */
1165 *block = (mdr->un_cum_blocks - mdr->un_blocks) +
1166 ((ci - cmpcount) * mdr->un_interlace);
1167 *count = (size_t)(mdr->un_blocks / (mdr->un_interlace *
1168 mdr->un_ncomp));
1169 *skip = (mdr->un_interlace * mdr->un_ncomp) - mdr->un_interlace;
1170 *size = mdr->un_interlace;
1171 }
1172
1173 return (0);
1174 }
1175
1176 /*ARGSUSED*/
1177 static intptr_t
1178 stripe_shared_by_indx(md_dev64_t dev, void *junk, int indx)
1179 {
1180 ms_unit_t *un;
1181 ms_comp_t *comp;
1182
1183 un = MD_UNIT(md_getminor(dev));
1184 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1185 comp += indx;
1186 return ((intptr_t)&comp->un_mirror);
1187 }
1188
1189 /*ARGSUSED*/
1190 intptr_t
1191 stripe_component_count(md_dev64_t dev, void *junk)
1192 {
1193 /*
1194 * See comments for stripe_get_dev
1195 */
1196
1197 ms_unit_t *un;
1198 int count = 0;
1199 int row;
1200
1201 un = MD_UNIT(md_getminor(dev));
1202 for (row = 0; row < un->un_nrows; row++)
1203 count += un->un_row[row].un_ncomp;
1204 return (count);
1205 }
1206
1207 /*ARGSUSED*/
1208 intptr_t
1209 stripe_get_dev(md_dev64_t dev, void *junk, int indx, ms_cd_info_t *cd)
1210 {
1211 /*
1212 * It should be noted that stripe_replace in stripe_ioctl.c calls this
1213 * routine using makedevice(0, minor) for the first argument.
1214 *
1215 * If this routine at some point in the future needs to use the major
1216 * number, stripe_replace must be changed.
1217 */
1218
1219 ms_unit_t *un;
1220 ms_comp_t *comp;
1221 md_dev64_t tmpdev;
1222
1223 un = MD_UNIT(md_getminor(dev));
1224 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1225 comp += indx;
1226 tmpdev = comp->un_dev;
1227 /*
1228 * Try to resolve devt again if NODEV64
1229 * Check if this comp is hotspared and if it is
1230 * then use key for hotspare
1231 */
1232 if (tmpdev == NODEV64) {
1233 tmpdev = md_resolve_bydevid(md_getminor(dev), tmpdev,
1234 comp->un_mirror.ms_hs_id ?
1235 comp->un_mirror.ms_hs_key :
1236 comp->un_key);
1237 comp->un_dev = tmpdev;
1238 }
1239
1240 cd->cd_dev = comp->un_dev;
1241 cd->cd_orig_dev = comp->un_mirror.ms_orig_dev;
1242 return (0);
1243 }
1244
1245 /*ARGSUSED*/
1246 void
1247 stripe_replace_done(md_dev64_t dev, sv_dev_t *sv)
1248 {
1249 /*
1250 * See comments for stripe_get_dev
1251 */
1252
1253 minor_t mnum = md_getminor(dev);
1254
1255 if (sv != NULL) {
1256 md_rem_names(sv, 1);
1257 kmem_free(sv, sizeof (sv_dev_t));
1258 }
1259
1260 md_unit_writerexit(MDI_UNIT(mnum));
1261 }
1262
1263 /*ARGSUSED*/
1264 intptr_t
1265 stripe_replace_dev(md_dev64_t dev, void *junk, int ci, ms_new_dev_t *nd,
1266 mddb_recid_t *recids, int nrecids, void (**replace_done)(),
1267 void **replace_data)
1268 {
1269 minor_t mnum;
1270 ms_unit_t *un;
1271 mdi_unit_t *ui;
1272 ms_comp_t *comp;
1273 diskaddr_t dev_size;
1274 int row;
1275 int ncomps = 0;
1276 int cmpcount = 0;
1277 int rid = 0;
1278 struct ms_row *mdr;
1279 sv_dev_t *sv = NULL;
1280 mddb_recid_t hs_id = 0;
1281 set_t setno;
1282 side_t side;
1283 md_dev64_t this_dev;
1284 md_dev64_t old_dev;
1285
1286 mnum = md_getminor(dev);
1287 ui = MDI_UNIT(mnum);
1288 setno = MD_MIN2SET(mnum);
1289 side = mddb_getsidenum(setno);
1290
1291 un = md_unit_writerlock(ui);
1292
1293 *replace_data = NULL;
1294 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1295
1296 comp += ci;
1297 old_dev = comp->un_dev;
1298
1299 /*
1300 * Count the number of components
1301 */
1302 for (row = 0; row < un->un_nrows; row++) {
1303 struct ms_row *mdr = &un->un_row[row];
1304 ncomps += mdr->un_ncomp;
1305 }
1306
1307 recids[0] = 0;
1308 /*
1309 * No need of checking size of new device,
1310 * when hotsparing (it has already been done), or
1311 * when enabling the device.
1312 */
1313 if ((nd != NULL) && (nd->nd_hs_id == 0)) {
1314 for (row = 0; row < un->un_nrows; row++) {
1315 mdr = &un->un_row[row];
1316 if ((mdr->un_ncomp + cmpcount) > ci)
1317 break;
1318 cmpcount += mdr->un_ncomp;
1319 }
1320 ASSERT(row != un->un_nrows);
1321
1322 /* Concatenations have a ncomp = 1 */
1323 dev_size = mdr->un_blocks / mdr->un_ncomp;
1324
1325 /*
1326 * now check to see if new comp can be used in
1327 * place of old comp
1328 */
1329 if ((un->c.un_flag & MD_LABELED) && (ci == 0) &&
1330 nd->nd_labeled)
1331 nd->nd_start_blk = 0;
1332 else
1333 nd->nd_nblks -= nd->nd_start_blk;
1334
1335 if (dev_size > nd->nd_nblks) {
1336 md_unit_writerexit(ui);
1337 return (MDE_COMP_TOO_SMALL);
1338 }
1339
1340 sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
1341 sv->setno = MD_MIN2SET(mnum);
1342 sv->key = comp->un_key;
1343 }
1344
1345 /*
1346 * Close this component.
1347 */
1348 if (comp->un_mirror.ms_flags & MDM_S_ISOPEN) {
1349 md_layered_close(comp->un_dev, MD_OFLG_NULL);
1350 comp->un_mirror.ms_flags &= ~MDM_S_ISOPEN;
1351 }
1352
1353 /*
1354 * If the component is hotspared, return to the pool.
1355 */
1356 if (comp->un_mirror.ms_hs_id != 0) {
1357 hs_cmds_t cmd;
1358 mdkey_t hs_key;
1359
1360 hs_key = comp->un_mirror.ms_hs_key;
1361 comp->un_dev = comp->un_mirror.ms_orig_dev;
1362 comp->un_start_block = comp->un_mirror.ms_orig_blk;
1363 comp->un_mirror.ms_hs_key = 0;
1364 comp->un_mirror.ms_hs_id = 0;
1365 comp->un_mirror.ms_orig_dev = 0;
1366
1367 cmd = HS_FREE;
1368 if ((comp->un_mirror.ms_state != CS_OKAY) &&
1369 (comp->un_mirror.ms_state != CS_RESYNC))
1370 cmd = HS_BAD;
1371 (void) md_hot_spare_ifc(cmd, un->un_hsp_id, 0, 0, &hs_id,
1372 &hs_key, NULL, NULL);
1373 }
1374
1375 /*
1376 * Open by device id; for enable (indicated by a NULL
1377 * nd pointer), use the existing component info. For
1378 * replace, use the new device.
1379 */
1380 if (nd == NULL) {
1381 this_dev = md_resolve_bydevid(mnum, comp->un_dev, comp->un_key);
1382 /*
1383 * If someone replaced a new disk in the same slot
1384 * we get NODEV64 since old device id cannot be
1385 * resolved. The new devt is obtained from the
1386 * mddb since devt is going to be unchanged for the
1387 * enable case. No need to check for multiple
1388 * keys here because the caller (comp_replace)
1389 * has already sanity checked it for us.
1390 */
1391 if (this_dev == NODEV64) {
1392 this_dev = md_getdevnum(setno, side, comp->un_key,
1393 MD_TRUST_DEVT);
1394 }
1395 } else {
1396 /*
1397 * If this is a hotspare, save the original dev_t for later
1398 * use. If this has occurred during boot then the value of
1399 * comp->un_dev will be NODEV64 because of the failure to look
1400 * up the devid of the device.
1401 */
1402 if (nd->nd_hs_id != 0)
1403 comp->un_mirror.ms_orig_dev = comp->un_dev;
1404 this_dev = md_resolve_bydevid(mnum, nd->nd_dev, nd->nd_key);
1405 }
1406
1407 comp->un_dev = this_dev;
1408
1409 /*
1410 * Now open the new device if required. Note for a single component
1411 * stripe it will not be open - leave this for the mirror driver to
1412 * deal with.
1413 */
1414 if (md_unit_isopen(ui)) {
1415 if (md_layered_open(mnum, &this_dev, MD_OFLG_NULL)) {
1416 mddb_recid_t ids[3];
1417
1418 ids[0] = un->c.un_record_id;
1419 ids[1] = hs_id;
1420 ids[2] = 0;
1421 mddb_commitrecs_wrapper(ids);
1422 if ((nd != NULL) && (nd->nd_hs_id != 0)) {
1423 /*
1424 * Revert back to the original device.
1425 */
1426 comp->un_dev = comp->un_mirror.ms_orig_dev;
1427
1428 cmn_err(CE_WARN,
1429 "md: %s: open error of hotspare %s",
1430 md_shortname(mnum),
1431 md_devname(MD_MIN2SET(mnum), nd->nd_dev,
1432 NULL, 0));
1433 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1434 SVM_TAG_HS, MD_MIN2SET(mnum), nd->nd_dev);
1435 }
1436 md_unit_writerexit(ui);
1437 return (MDE_COMP_OPEN_ERR);
1438 }
1439 if (nd != NULL)
1440 nd->nd_dev = this_dev;
1441
1442 comp->un_mirror.ms_flags |= MDM_S_ISOPEN;
1443 }
1444
1445 if (nd == NULL) {
1446 recids[0] = un->c.un_record_id;
1447 recids[1] = hs_id;
1448 recids[2] = 0;
1449 *replace_done = stripe_replace_done;
1450 return (0);
1451 }
1452
1453 /* if hot sparing this device */
1454 if (nd->nd_hs_id != 0) {
1455 char devname[MD_MAX_CTDLEN];
1456 char hs_devname[MD_MAX_CTDLEN];
1457 set_t setno;
1458
1459 comp->un_mirror.ms_hs_id = nd->nd_hs_id;
1460 comp->un_mirror.ms_hs_key = nd->nd_key;
1461
1462 comp->un_mirror.ms_orig_blk = comp->un_start_block;
1463
1464 setno = MD_MIN2SET(mnum);
1465
1466 (void) md_devname(setno, comp->un_mirror.ms_orig_dev, devname,
1467 sizeof (devname));
1468 (void) md_devname(setno, nd->nd_dev, hs_devname,
1469 sizeof (hs_devname));
1470
1471 cmn_err(CE_NOTE, "md: %s: hotspared device %s with %s",
1472 md_shortname(mnum), devname, hs_devname);
1473
1474 } else { /* replacing the device */
1475 comp->un_key = nd->nd_key;
1476 *replace_data = (void *)sv;
1477
1478 /*
1479 * For the old device, make sure to reset the parent
1480 * if it's a metadevice.
1481 */
1482 if (md_getmajor(comp->un_dev) == md_major) {
1483 minor_t comp_mnum = md_getminor(old_dev);
1484 md_unit_t *comp_un = MD_UNIT(comp_mnum);
1485
1486 md_reset_parent(old_dev);
1487 recids[rid++] = MD_RECID(comp_un);
1488 }
1489 }
1490
1491 comp->un_dev = nd->nd_dev;
1492 comp->un_start_block = nd->nd_start_blk;
1493
1494 /*
1495 * For the new device, make sure to set the parent if it's a
1496 * metadevice.
1497 *
1498 * If we ever support using metadevices as hot spares, this
1499 * will need to be tested, and possibly moved into the
1500 * preceding "else" clause, immediately following the parent
1501 * reset block. For now, it's convenient to leave it here and
1502 * only compress nd->nd_dev once.
1503 */
1504 if (md_getmajor(comp->un_dev) == md_major) {
1505 minor_t comp_mnum = md_getminor(comp->un_dev);
1506 md_unit_t *comp_un = MD_UNIT(comp_mnum);
1507
1508 md_set_parent(comp->un_dev, MD_SID(un));
1509 recids[rid++] = MD_RECID(comp_un);
1510 }
1511
1512 recids[rid++] = un->c.un_record_id;
1513 recids[rid++] = hs_id;
1514 recids[rid] = 0;
1515 *replace_done = stripe_replace_done;
1516 return (0);
1517 }
1518
1519 /*ARGSUSED*/
1520 static intptr_t
1521 stripe_hotspare_dev(
1522 md_dev64_t dev,
1523 void *junk,
1524 int ci,
1525 mddb_recid_t *recids,
1526 int nrecids,
1527 void (**replace_done)(),
1528 void **replace_data)
1529 {
1530 ms_unit_t *un;
1531 mdi_unit_t *ui;
1532 ms_comp_t *comp;
1533 int row;
1534 struct ms_row *mdr;
1535 ms_new_dev_t nd;
1536 int err;
1537 int i;
1538 minor_t mnum;
1539 set_t setno;
1540 int cmpcount = 0;
1541
1542 mnum = md_getminor(dev);
1543 ui = MDI_UNIT(mnum);
1544 un = MD_UNIT(mnum);
1545 setno = MD_MIN2SET(mnum);
1546
1547 if (md_get_setstatus(setno) & MD_SET_STALE)
1548 return (1);
1549
1550 if (un->un_hsp_id == -1)
1551 return (1);
1552
1553 for (row = 0; row < un->un_nrows; row++) {
1554 mdr = &un->un_row[row];
1555 if ((mdr->un_ncomp + cmpcount) > ci)
1556 break;
1557 cmpcount += mdr->un_ncomp;
1558 }
1559 ASSERT(row != un->un_nrows);
1560
1561 comp = (struct ms_comp *)((void *)&((char *)un)[un->un_ocomp]);
1562 comp += ci;
1563 /* Concatenations have a ncomp = 1 */
1564 nd.nd_nblks = mdr->un_blocks / mdr->un_ncomp;
1565
1566 if ((un->c.un_flag & MD_LABELED) && (ci == 0))
1567 nd.nd_labeled = 1;
1568 else
1569 nd.nd_labeled = 0;
1570
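/*
 * Get a hot spare from the pool and try to swap it in; if the
 * replace fails, mark that spare bad and retry with another one.
 */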
1571 again:
1572 err = md_hot_spare_ifc(HS_GET, un->un_hsp_id, nd.nd_nblks,
1573 nd.nd_labeled, &nd.nd_hs_id, &nd.nd_key, &nd.nd_dev,
1574 &nd.nd_start_blk);
1575
1576 if (err) {
1577 if (!stripe_replace_dev(dev, junk, ci, NULL, recids, nrecids,
1578 replace_done, replace_data)) {
1579 mddb_commitrecs_wrapper(recids);
1580 md_unit_writerexit(ui);
1581 }
1582 recids[0] = 0;
1583 return (1);
1584 }
1585
1586 if (stripe_replace_dev(dev, junk, ci, &nd, recids, nrecids,
1587 replace_done, replace_data)) {
1588
1589 (void) md_hot_spare_ifc(HS_BAD, un->un_hsp_id, 0, 0,
1590 &nd.nd_hs_id, &nd.nd_key, NULL, NULL);
1591 mddb_commitrec_wrapper(nd.nd_hs_id);
1592 goto again;
1593 }
1594
1595 /* Leave a slot for the null recid */
1596 for (i = 0; i < (nrecids - 1); i++) {
1597 if (recids[i] == 0) {
1598 recids[i++] = nd.nd_hs_id;
1599 recids[i] = 0;
1600 }
1601 }
1602 return (0);
1603 }
1604
1605 static int
1606 stripe_imp_set(
1607 set_t setno
1608 )
1609 {
1610
1611 mddb_recid_t recid;
1612 int i, row, c, gotsomething;
1613 mddb_type_t typ1;
1614 mddb_de_ic_t *dep;
1615 mddb_rb32_t *rbp;
1616 ms_unit32_od_t *un32;
1617 ms_unit_t *un64;
1618 md_dev64_t self_devt;
1619 minor_t *self_id; /* minor needs to be updated */
1620 md_parent_t *parent_id; /* parent needs to be updated */
1621 mddb_recid_t *record_id; /* record id needs to be updated */
1622 mddb_recid_t *hsp_id;
1623 ms_comp32_od_t *comp32;
1624 ms_comp_t *comp64;
1625
1626
1627 gotsomething = 0;
1628
1629 typ1 = (mddb_type_t)md_getshared_key(setno,
1630 stripe_md_ops.md_driver.md_drivername);
1631 recid = mddb_makerecid(setno, 0);
1632
1633 while ((recid = mddb_getnextrec(recid, typ1, 0)) > 0) {
1634 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
1635 continue;
1636
1637 dep = mddb_getrecdep(recid);
1638 rbp = dep->de_rb;
1639
1640 switch (rbp->rb_revision) {
1641 case MDDB_REV_RB:
1642 case MDDB_REV_RBFN:
1643 /*
1644 * Small device
1645 */
1646 un32 = (ms_unit32_od_t *)mddb_getrecaddr(recid);
1647 self_id = &(un32->c.un_self_id);
1648 parent_id = &(un32->c.un_parent);
1649 record_id = &(un32->c.un_record_id);
1650 hsp_id = &(un32->un_hsp_id);
1651
1652 comp32 = (ms_comp32_od_t *)
1653 ((void *)&((char *)un32)[un32->un_ocomp]);
1654 for (row = 0; row < un32->un_nrows; row++) {
1655 struct ms_row32_od *mdr = &un32->un_row[row];
1656 for (i = 0, c = mdr->un_icomp;
1657 i < mdr->un_ncomp; i++) {
1658 ms_comp32_od_t *mdc;
1659
1660 mdc = &comp32[c++];
1661
1662 if (!md_update_minor(setno,
1663 mddb_getsidenum(setno),
1664 mdc->un_key))
1665 goto out;
1666
1667 if (mdc->un_mirror.ms_hs_id != 0)
1668 mdc->un_mirror.ms_hs_id =
1669 MAKERECID(setno,
1670 mdc->un_mirror.ms_hs_id);
1671 }
1672 }
1673 break;
1674 case MDDB_REV_RB64:
1675 case MDDB_REV_RB64FN:
1676 un64 = (ms_unit_t *)mddb_getrecaddr(recid);
1677 self_id = &(un64->c.un_self_id);
1678 parent_id = &(un64->c.un_parent);
1679 record_id = &(un64->c.un_record_id);
1680 hsp_id = &(un64->un_hsp_id);
1681
1682 comp64 = (ms_comp_t *)
1683 ((void *)&((char *)un64)[un64->un_ocomp]);
1684 for (row = 0; row < un64->un_nrows; row++) {
1685 struct ms_row *mdr = &un64->un_row[row];
1686
1687 for (i = 0, c = mdr->un_icomp;
1688 i < mdr->un_ncomp; i++) {
1689 ms_comp_t *mdc;
1690
1691 mdc = &comp64[c++];
1692
1693 if (!md_update_minor(setno,
1694 mddb_getsidenum(setno),
1695 mdc->un_key))
1696 goto out;
1697
1698 if (mdc->un_mirror.ms_hs_id != 0)
1699 mdc->un_mirror.ms_hs_id =
1700 MAKERECID(setno,
1701 mdc->un_mirror.ms_hs_id);
1702 }
1703 }
1704 break;
1705 }
1706
1707 /*
1708 * If this is a top level and a friendly name metadevice,
1709 * update its minor in the namespace.
1710 */
1711 if ((*parent_id == MD_NO_PARENT) &&
1712 ((rbp->rb_revision == MDDB_REV_RBFN) ||
1713 (rbp->rb_revision == MDDB_REV_RB64FN))) {
1714
1715 self_devt = md_makedevice(md_major, *self_id);
1716 if (!md_update_top_device_minor(setno,
1717 mddb_getsidenum(setno), self_devt))
1718 goto out;
1719 }
1720
1721 /*
1722 * Update unit with the imported setno
1723 *
1724 */
1725 mddb_setrecprivate(recid, MD_PRV_GOTIT);
1726
1727 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
1728
1729 if (*hsp_id != -1)
1730 *hsp_id = MAKERECID(setno, DBID(*hsp_id));
1731
1732 if (*parent_id != MD_NO_PARENT)
1733 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
1734 *record_id = MAKERECID(setno, DBID(*record_id));
1735
1736 gotsomething = 1;
1737 }
1738
1739 out:
1740 return (gotsomething);
1741 }
1742
1743 static md_named_services_t stripe_named_services[] = {
1744 {stripe_shared_by_blk, "shared by blk" },
1745 {stripe_shared_by_indx, "shared by indx" },
1746 {stripe_component_count, "get component count" },
1747 {stripe_block_count_skip_size, "get block count skip size" },
1748 {stripe_get_dev, "get device" },
1749 {stripe_replace_dev, "replace device" },
1750 {stripe_hotspare_dev, "hotspare device" },
1751 {stripe_rename_check, MDRNM_CHECK },
1752 {NULL, 0}
1753 };
1754
1755 md_ops_t stripe_md_ops = {
1756 stripe_open, /* open */
1757 stripe_close, /* close */
1758 md_stripe_strategy, /* strategy */
1759 NULL, /* print */
1760 stripe_dump, /* dump */
1761 NULL, /* read */
1762 NULL, /* write */
1763 md_stripe_ioctl, /* stripe_ioctl, */
1764 stripe_snarf, /* stripe_snarf */
1765 stripe_halt, /* stripe_halt */
1766 NULL, /* aread */
1767 NULL, /* awrite */
1768 stripe_imp_set, /* import set */
1769 stripe_named_services
1770 };
1771
1772 static void
1773 init_init()
1774 {
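/*
 * Record the byte offset of the child buf within md_scs_t;
 * stripe_done() subtracts this from its buf pointer to recover
 * the enclosing md_scs_t.
 */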
1775 md_stripe_mcs_buf_off = sizeof (md_scs_t) - sizeof (buf_t);
1776
1777 stripe_parent_cache = kmem_cache_create("md_stripe_parent",
1778 sizeof (md_sps_t), 0, stripe_parent_constructor,
1779 stripe_parent_destructor, stripe_run_queue, NULL, NULL,
1780 0);
1781 stripe_child_cache = kmem_cache_create("md_stripe_child",
1782 sizeof (md_scs_t) - sizeof (buf_t) + biosize(), 0,
1783 stripe_child_constructor, stripe_child_destructor,
1784 stripe_run_queue, NULL, NULL, 0);
1785 }
1786
1787 static void
1788 fini_uninit()
1789 {
1790 kmem_cache_destroy(stripe_parent_cache);
1791 kmem_cache_destroy(stripe_child_cache);
1792 stripe_parent_cache = stripe_child_cache = NULL;
1793 }
1794
1795 /* define the module linkage */
1796 MD_PLUGIN_MISC_MODULE("stripes module", init_init(), fini_uninit())
1797