1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 1995, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25
26 /*
27 * Metadevice diskset interfaces
28 */
29
30 #include "meta_set_prv.h"
31 #include <sys/lvm/md_crc.h>
32 #include <strings.h>
33 #include <sys/bitmap.h>
34
35 extern char *blkname(char *);
36
37 static int
upd_dr_dbinfo(mdsetname_t * sp,md_set_desc * sd,md_drive_desc * dd,md_replicalist_t * rlp,int forceflg,md_error_t * ep)38 upd_dr_dbinfo(
39 mdsetname_t *sp,
40 md_set_desc *sd,
41 md_drive_desc *dd,
42 md_replicalist_t *rlp,
43 int forceflg,
44 md_error_t *ep
45 )
46 {
47 md_drive_desc *p;
48 md_replica_t *r;
49 md_replicalist_t *rl;
50 int i;
51 int dbcnt;
52 int rval = 0;
53 daddr_t nblks = 0;
54 md_setkey_t *cl_sk;
55 md_error_t xep = mdnullerror;
56 md_mnnode_desc *nd;
57 ddi_devid_t devid;
58
59 /* find the smallest existing replica */
60 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
61 r = rl->rl_repp;
62 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
63 }
64
65 if (nblks <= 0)
66 nblks = (MD_MNSET_DESC(sd)) ? MD_MN_DBSIZE : MD_DBSIZE;
67
68 for (p = dd; p != NULL; p = p->dd_next) {
69 dbcnt = 0;
70 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
71 r = rl->rl_repp;
72
73 /*
74 * Before we bump up the dbcnt, if we're
75 * running with device ids in disksets, let's
76 * compare the device ids otherwise we compare
77 * the ctd names.
78 *
79 * There is a possibility the device ids might
80 * have changed. To account for that case, we
81 * fallback to comparing the ctd names if the
82 * device id comparison fails. If we aren't running
83 * in device id mode and a disk has moved, the ctd's
84 * won't match.
85 */
86 if ((p->dd_dnp->devid != NULL) &&
87 (r->r_devid != NULL) && (!MD_MNSET_DESC(sd))) {
88 (void) devid_str_decode(p->dd_dnp->devid,
89 &devid, NULL);
90 if ((devid_compare(devid, r->r_devid) == 0) ||
91 (strcmp(r->r_namep->drivenamep->cname,
92 p->dd_dnp->cname) == 0))
93 dbcnt++;
94 devid_free(devid);
95 } else {
96 if (strcmp(r->r_namep->drivenamep->cname,
97 p->dd_dnp->cname) == 0)
98 dbcnt++;
99 }
100 }
101 p->dd_dbcnt = dbcnt;
102 p->dd_dbsize = dbcnt > 0 ? nblks : 0;
103 }
104
105 /* Lock the set on current set members */
106 if (MD_MNSET_DESC(sd)) {
107 nd = sd->sd_nodelist;
108 while (nd) {
109 /* If this is forced, don't lock other sides */
110 if (forceflg && strcmp(mynode(), nd->nd_nodename)
111 != 0) {
112 nd = nd->nd_next;
113 continue;
114 }
115
116 /* We already locked this side in the caller */
117 if (strcmp(mynode(), nd->nd_nodename) == 0) {
118 nd = nd->nd_next;
119 continue;
120 }
121
122 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
123 nd = nd->nd_next;
124 continue;
125 }
126
127 if (clnt_lock_set(nd->nd_nodename, sp, ep)) {
128 rval = -1;
129 goto out;
130 }
131 nd = nd->nd_next;
132 }
133 } else {
134 for (i = 0; i < MD_MAXSIDES; i++) {
135 /* Skip empty slots */
136 if (sd->sd_nodes[i][0] == '\0')
137 continue;
138
139 /* If this is forced, don't lock other sides */
140 if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0)
141 continue;
142
143 /* We already locked this side in the caller */
144 if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
145 continue;
146
147 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
148 rval = -1;
149 goto out;
150 }
151 }
152 }
153
154 if (MD_MNSET_DESC(sd)) {
155 nd = sd->sd_nodelist;
156 while (nd) {
157 /* If this is forced, then only care about this node */
158 if (forceflg && strcmp(mynode(), nd->nd_nodename)
159 != 0) {
160 nd = nd->nd_next;
161 continue;
162 }
163
164 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
165 nd = nd->nd_next;
166 continue;
167 }
168
169 if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd,
170 ep) == -1) {
171 if (! mdiserror(ep, MDE_NO_SET) &&
172 ! mdismddberror(ep, MDE_DB_NODB)) {
173 rval = -1;
174 break;
175 }
176 mdclrerror(ep);
177 }
178 nd = nd->nd_next;
179 }
180 } else {
181 for (i = 0; i < MD_MAXSIDES; i++) {
182 /* Skip empty slots */
183 if (sd->sd_nodes[i][0] == '\0')
184 continue;
185
186 /* If this is forced, then only care about this node */
187 if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0)
188 continue;
189
190 if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd,
191 ep) == -1) {
192 if (! mdiserror(ep, MDE_NO_SET) &&
193 ! mdismddberror(ep, MDE_DB_NODB)) {
194 rval = -1;
195 break;
196 }
197 mdclrerror(ep);
198 }
199 }
200 }
201
202 out:
203 cl_sk = cl_get_setkey(sp->setno, sp->setname);
204 if (MD_MNSET_DESC(sd)) {
205 nd = sd->sd_nodelist;
206 while (nd) {
207 /* If this is forced, don't unlock other sides */
208 if (forceflg && strcmp(mynode(), nd->nd_nodename)
209 != 0) {
210 nd = nd->nd_next;
211 continue;
212 }
213
214 /* We will unlocked this side in the caller */
215 if (strcmp(mynode(), nd->nd_nodename) == 0) {
216 nd = nd->nd_next;
217 continue;
218 }
219
220 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
221 nd = nd->nd_next;
222 continue;
223 }
224
225 if (clnt_unlock_set(nd->nd_nodename, cl_sk, &xep)) {
226 if (rval == 0)
227 (void) mdstealerror(ep, &xep);
228 rval = -1;
229 }
230 nd = nd->nd_next;
231 }
232 } else {
233 for (i = 0; i < MD_MAXSIDES; i++) {
234 /* Skip empty slots */
235 if (sd->sd_nodes[i][0] == '\0')
236 continue;
237
238 /* If this is forced, don't unlock other sides */
239 if (forceflg && strcmp(mynode(), sd->sd_nodes[i]) != 0)
240 continue;
241
242 /* We will unlocked this side in the caller */
243 if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
244 continue;
245
246 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
247 if (rval == 0)
248 (void) mdstealerror(ep, &xep);
249 rval = -1;
250 }
251 }
252 }
253 /* Do not clear the key, via cl_set_setkey(NULL) this is nested */
254
255 return (rval);
256 }
257
258 static int
usetag_take(set_t setno,int usetag,md_error_t * ep)259 usetag_take(set_t setno, int usetag, md_error_t *ep)
260 {
261 mddb_dtag_use_parm_t dtup;
262
263 (void) memset(&dtup, '\0', sizeof (mddb_dtag_use_parm_t));
264 dtup.dtup_id = usetag;
265 dtup.dtup_setno = setno;
266
267 if (metaioctl(MD_MED_USE_TAG, &dtup, &dtup.dtup_mde, NULL) != 0)
268 return (mdstealerror(ep, &dtup.dtup_mde));
269
270 return (0);
271 }
272
273 static int
useit_take(set_t setno,md_error_t * ep)274 useit_take(set_t setno, md_error_t *ep)
275 {
276 mddb_accept_parm_t accp;
277
278 (void) memset(&accp, '\0', sizeof (mddb_accept_parm_t));
279 accp.accp_setno = setno;
280
281 if (metaioctl(MD_MED_ACCEPT, &accp, &accp.accp_mde, NULL) != 0)
282 return (mdstealerror(ep, &accp.accp_mde));
283
284 return (0);
285 }
286
287 /*
288 * Update the master block with the device id information for the disks
289 * in the diskset. The device id information will be consumed by the
290 * diskset import code in case of remotely replicated disksets.
291 *
292 * For the drives that have a valid diskset mddb on them, we add the
293 * device id for the drive to the unused portion of the mddb.
294 *
295 * For the drives that don't have a diskset mddb on them, we add a dummy
296 * master block that contains the device id for the drive. A dummy master
297 * block is signified by changing the master block magic number, mb_magic,
298 * to MDDB_MAGIC_DU.
299 *
300 * This code is responsible primarily for adding the appropriate device id
301 * information to diskset disks that didn't have the information. This would
302 * typically occur when the OS has been upgraded from an OS release prior to
303 * Solaris 10
304 *
305 * The error path in this routine is defined as - if an error occurs while
306 * updating the mddb for one disk in the diskset, don't bother updating *any*
307 * of the mddbs because it's game over anyways as far as disaster recovery for
308 * that diskset is concerned.
309 *
310 * This code will need to be revisited if and when support for importing
311 * partial disksets is added.
312 *
313 * NOTE: This code relies heavily on the meta_repartition() working correctly
314 * and reformatting a drive, so that there's enough room for a dummy master
315 * block, every time a drive is added to a diskset. Should
316 * the meta_repartition() code change in future, this code will have to be
317 * revisited.
318 *
319 * Returns 0 on success and -1 on failure
320 */
321 int
meta_update_mb(mdsetname_t * sp,md_drive_desc * drivedesc,md_error_t * ep)322 meta_update_mb(mdsetname_t *sp, md_drive_desc *drivedesc, md_error_t *ep)
323 {
324 uint_t sliceno, offset;
325 void *mb;
326 mddb_mb_t *mbp;
327 int fd = -1;
328 ddi_devid_t devid = NULL;
329 md_drive_desc *dd;
330 mddrivename_t *dnp;
331 mdname_t *rsp;
332 int dbcnt;
333 int dbsize;
334 size_t len;
335 md_set_desc *sd;
336
337 /*
338 * Don't do anything for MN diskset for now.
339 */
340 if (! metaislocalset(sp)) {
341 if ((sd = metaget_setdesc(sp, ep)) == NULL)
342 return (-1);
343
344 if (MD_MNSET_DESC(sd))
345 return (0);
346 }
347
348 mb = Malloc(DEV_BSIZE);
349 mbp = (mddb_mb_t *)mb;
350
351 /*
352 * For every drive in the drive descriptor, iterate through all
353 * the mddbs present on it and check to see if mb_devid_magic is
354 * set. If it isn't, then update the master block with the correct
355 * device id information
356 */
357 for (dd = drivedesc; dd != NULL; dd = dd->dd_next) {
358 int i = 0;
359
360 dnp = dd->dd_dnp;
361 dbcnt = dd->dd_dbcnt;
362 dbsize = dd->dd_dbsize;
363
364 /*
365 * When the import support for remotely replicated
366 * disksets gets implemented, we probably want to
367 * inform the user that the disks won't be self
368 * identifying if any of these calls fails
369 */
370 if (meta_replicaslice(dnp, &sliceno, ep) != 0)
371 return (-1);
372
373 if ((rsp = metaslicename(dnp, sliceno, ep)) == NULL)
374 return (-1);
375
376 if ((fd = open(rsp->rname, O_RDWR)) < 0)
377 goto cleanup;
378
379 /* if devid_str_decode fails, make sure devid is null */
380 if (devid_str_decode(dnp->devid, &devid, NULL) != 0) {
381 devid = NULL;
382 }
383
384 do {
385 int push = 0;
386
387 offset = (i * dbsize + 16);
388 ++i;
389
390 if (lseek(fd, (off_t)dbtob(offset), SEEK_SET) < 0)
391 goto cleanup;
392
393 if (read(fd, mbp, DEV_BSIZE) != DEV_BSIZE)
394 goto cleanup;
395
396 if (crcchk((uchar_t *)mbp, (uint_t *)&mbp->mb_checksum,
397 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
398 goto cleanup;
399
400 /*
401 * If the disk is one of the ones that doesn't
402 * have a shared mddb on it, we put a dummy
403 * master block on it.
404 */
405 if (mbp->mb_devid_magic != MDDB_MAGIC_DE) {
406 if (dbcnt == 0) {
407 meta_mkdummymaster(sp, fd, 16);
408 break;
409 }
410 }
411
412 /*
413 * if mb_setcreatetime is 0, this field was never
414 * filled in so do it now.
415 */
416 if ((mbp->mb_setcreatetime.tv_sec == 0) &&
417 (mbp->mb_setcreatetime.tv_usec == 0)) {
418 mbp->mb_setcreatetime =
419 meta_get_lb_inittime(sp, ep);
420 push = 1;
421 }
422
423 /*
424 * If MDDB_MAGIC_DE is set in the
425 * mb_devid_magic field then we know we
426 * have a valid device id and we don't
427 * need to add it to the master block.
428 *
429 * This would have to be revisited if device
430 * ids change as a result of device id
431 * algorithms changing or somesuch.
432 */
433 if (mbp->mb_devid_magic != MDDB_MAGIC_DE) {
434 if (devid != NULL) {
435 len = devid_sizeof(devid);
436 if (len <= (DEV_BSIZE -
437 sizeof (mddb_mb_t))) {
438 /*
439 * there's enough space to
440 * store the devid
441 */
442 mbp->mb_devid_magic =
443 MDDB_MAGIC_DE;
444 mbp->mb_devid_len = len;
445 (void) memcpy(mbp->mb_devid,
446 (char *)devid, len);
447 push = 1;
448 }
449 }
450 }
451
452 /*
453 * write out (push) any changes we have to the mb
454 */
455 if (push) {
456 crcgen((uchar_t *)mbp,
457 (uint_t *)&mbp->mb_checksum,
458 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL);
459
460 if (lseek(fd, (off_t)dbtob(offset), SEEK_SET)
461 < 0)
462 goto cleanup;
463
464 if (write(fd, mbp, DEV_BSIZE) != DEV_BSIZE)
465 goto cleanup;
466 }
467 if (devid)
468 devid_free(devid);
469 } while (i < dbcnt);
470 (void) close(fd);
471 }
472 /* success */
473 return (0);
474
475 cleanup:
476 if (fd != -1)
477 (void) close(fd);
478 if (devid)
479 devid_free(devid);
480 return (-1);
481 }
482
483 extern int *replicated_disk_list_built;
484 extern int replicated_disk_list_built_pass1;
485 /*
486 * Exported Entry Points
487 */
488 int
meta_set_take(mdsetname_t * sp,mhd_mhiargs_t * mhiargsp,int flags,int usetag,md_error_t * ep)489 meta_set_take(
490 mdsetname_t *sp,
491 mhd_mhiargs_t *mhiargsp,
492 int flags,
493 int usetag,
494 md_error_t *ep
495 )
496 {
497 md_set_desc *sd;
498 md_drive_desc *dd;
499 md_drive_desc *d = NULL;
500 char *owner = NULL;
501 int rval = 0;
502 int pathname_return = 0;
503 int i;
504 int has_set;
505 int matches = 0;
506 int numsides = 0;
507 md_replicalist_t *rlp = NULL;
508 sigset_t oldsigs;
509 md_setkey_t *cl_sk;
510 int rb_level = 0;
511 md_error_t xep = mdnullerror;
512 mdsetname_t *local_sp = NULL;
513 side_t side;
514 int ret = 0;
515 char *newname = NULL;
516 mdkey_t side_names_key;
517 int unrslv_replicated = 0;
518 mddrivenamelist_t *dnlp = NULL;
519 int retake_flag = 0;
520 unsigned long node_active[BT_BITOUL(MD_MAXSIDES)];
521 mdnamelist_t *nlp = NULL;
522
523 bzero(node_active, sizeof (unsigned long) * BT_BITOUL(MD_MAXSIDES));
524
525 if ((flags & TAKE_USETAG) || (flags & TAKE_USEIT)) {
526 if (flags & TAKE_USETAG) {
527 if (usetag_take(sp->setno, usetag, ep))
528 return (-1);
529 } else {
530 if (useit_take(sp->setno, ep))
531 return (-1);
532 }
533
534 if (meta_resync_all(sp, MD_DEF_RESYNC_BUF_SIZE, ep) != 0)
535 mdclrerror(ep);
536 }
537
538 /* Do we own the set? */
539 i = own_set(sp, &owner, (flags & TAKE_FORCE), ep);
540 if (! mdisok(ep)) {
541 if (owner != NULL)
542 Free(owner);
543 return (-1);
544 }
545
546 if (i == MD_SETOWNER_NO) {
547 (void) mddserror(ep, MDE_DS_NOTOWNER, sp->setno, owner, NULL,
548 sp->setname);
549 if (owner != NULL)
550 Free(owner);
551 return (-1);
552 }
553
554 if (owner != NULL) {
555 Free(owner);
556 owner = NULL;
557 }
558
559 /* We already own it, we are done. */
560 if (i == MD_SETOWNER_YES)
561 return (0);
562
563 if ((sd = metaget_setdesc(sp, &xep)) == NULL)
564 return (-1);
565
566 /* You can not take ownership of a set that has no drives */
567 if (sd->sd_flags & MD_SR_MB_DEVID)
568 dd = metaget_drivedesc(sp, MD_BASICNAME_OK | PRINT_FAST, ep);
569 else
570 dd = metaget_drivedesc(sp, MD_BASICNAME_OK, ep);
571
572 if (dd == NULL) {
573 if (! mdisok(ep))
574 return (-1);
575 return (0);
576 }
577
578 /* END CHECK CODE */
579
580 md_rb_sig_handling_on();
581
582 /* Lock the set on our side */
583 if (clnt_lock_set(mynode(), sp, ep)) {
584 rval = -1;
585 goto out;
586 }
587
588 /*
589 * Find the "side" value so that it can be used to deal with
590 * the devids.
591 */
592 side = getnodeside(mynode(), sd);
593
594 if (side == MD_SIDEWILD) {
595 (void) mddserror(ep, MDE_DS_HOSTNOSIDE, sp->setno, mynode(),
596 NULL, mynode());
597 rval = -1;
598 goto out;
599 }
600
601 /*
602 * A local sets' side 0 references records associated with
603 * that node's local set. As this is a non-local set, "side"
604 * must be modified (by adding a SKEW) before we reference
605 * records in the local set [setno = 0] for the non-local set
606 * [setno = 1..n].
607 */
608 side += SKEW;
609
610 /*
611 * If this set had been previously imported as a partial replicated
612 * diskset, then must attempt to updated any unresolved drive
613 * records in diskset with new devid information. Must set
614 * flags in drivedesc list before loading up set so that the
615 * md driver will fix up names and devids correctly in the
616 * locator block.
617 */
618 if (sd->sd_flags & MD_SR_UNRSLV_REPLICATED) {
619 md_im_names_t cnames = { 0, NULL};
620 ddi_devid_t old_devid, new_devid;
621 char *search_path = "/dev";
622 devid_nmlist_t *nmlist;
623 int indx;
624 mddrivenamelist_t **dnlpp = &dnlp;
625
626 if (meta_list_disks(ep, &cnames) != 0) {
627 rval = -1;
628 goto out;
629 }
630
631 for (indx = 0; indx < cnames.min_count; ++indx) {
632 mddrivename_t *dnp;
633 mdsetname_t *sp = metasetname(MD_LOCAL_NAME, ep);
634 int fd = -1;
635 ddi_devid_t devid1;
636 char *cdevidp;
637 int len;
638 char *fp;
639
640 /*
641 * We may have name collision here so we need to get
642 * the dnp using the devid and not the name.
643 */
644 len = strlen(cnames.min_names[indx]) + strlen("s0");
645 if ((fp = (char *)Malloc(len+1)) == NULL) {
646 (void) mdsyserror(ep, ENOMEM, NULL);
647 rval = -1;
648 goto out;
649 }
650 (void) snprintf(fp, len + 1, "%ss0",
651 cnames.min_names[indx]);
652 if ((fd = open(fp, O_RDONLY|O_NDELAY)) < 0) {
653 (void) mdsyserror(ep, EIO, fp);
654 rval = -1;
655 goto out;
656 }
657 Free(fp);
658 /* if no device id, what error?) */
659 if (devid_get(fd, &devid1) != 0) {
660 (void) mdsyserror(ep, EIO, fp);
661 rval = -1;
662 goto out;
663 }
664 if (close(fd) < 0) {
665 (void) mdsyserror(ep, EIO, fp);
666 rval = -1;
667 goto out;
668 }
669 cdevidp = devid_str_encode(devid1, NULL);
670 if (cdevidp == NULL) {
671 (void) mdsyserror(ep, EIO, fp);
672 rval = -1;
673 goto out;
674 }
675 devid_free(devid1);
676 dnp = metadrivenamebydevid(&sp, cdevidp,
677 cnames.min_names[indx], ep);
678 devid_str_free(cdevidp);
679 if (dnp == NULL) {
680 /*
681 * Assuming we're interested in knowing about
682 * whatever error occurred, but not in stopping.
683 */
684 mde_perror(ep, cnames.min_names[indx]);
685 mdclrerror(ep);
686 continue;
687 }
688
689 dnlpp = meta_drivenamelist_append_wrapper(dnlpp, dnp);
690 }
691 /* Reget sd and dd since freed by meta_prune_cnames. */
692 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
693 rval = -1;
694 goto out;
695 }
696
697 if (sd->sd_flags & MD_SR_MB_DEVID)
698 dd = metaget_drivedesc(sp,
699 MD_BASICNAME_OK | PRINT_FAST, ep);
700 else
701 dd = metaget_drivedesc(sp,
702 MD_BASICNAME_OK, ep);
703 /* If ep has error, then there was a failure, set rval */
704 if (!mdisok(ep)) {
705 rval = -1;
706 goto out;
707 }
708
709 /* Builds global replicated disk list */
710 replicated_disk_list_built = &replicated_disk_list_built_pass1;
711
712 /* If success, then clear error structure */
713 if (build_replicated_disks_list(ep, dnlp) == 1)
714 mdclrerror(ep);
715 /* If ep has error, then there was a failure, set rval */
716 if (! mdisok(ep)) {
717 rval = -1;
718 goto out;
719 }
720
721 for (d = dd; d != NULL; d = d->dd_next) {
722 if (d->dd_flags & MD_DR_UNRSLV_REPLICATED) {
723 /* Get old devid from drive record */
724 (void) devid_str_decode(d->dd_dnp->devid,
725 &old_devid, NULL);
726
727 /*
728 * If the devid stored in the drive record
729 * (old_devid) matches a devid known by
730 * the system, then this disk has already
731 * been partially resolved. This situation
732 * could occur if a panic happened during a
733 * previous take of this diskset.
734 * Set flag to later handle fixing the master
735 * block on disk and turning off the unresolved
736 * replicated flag.
737 */
738 if (meta_deviceid_to_nmlist(search_path,
739 (ddi_devid_t)old_devid,
740 DEVID_MINOR_NAME_ALL,
741 &nmlist) == 0) {
742 d->dd_flags |= MD_DR_FIX_MB_DID;
743 retake_flag = 1;
744 continue;
745 }
746
747 /*
748 * If the devid stored in the drive record
749 * is on the list of replicated disks found
750 * during a system scan then set both flags
751 * so that the locator block, namespaces
752 * (diskset and local set), master block
753 * and unresolved replicated flag are updated.
754 */
755 new_devid = replicated_list_lookup(
756 devid_sizeof((ddi_devid_t)old_devid),
757 old_devid);
758 devid_free(old_devid);
759
760 /*
761 * If devid stored in the drive record is
762 * not found then set flag to mark
763 * that set is still unresolved and
764 * continue to next drive record.
765 */
766 if (new_devid == NULL) {
767 unrslv_replicated = 1;
768 continue;
769 }
770
771 /*
772 * Set flags to fix up the master block,
773 * locator block of the diskset, diskset
774 * namespace and the local set namespace.
775 */
776 d->dd_flags |= (MD_DR_FIX_MB_DID |
777 MD_DR_FIX_LB_NM_DID);
778 retake_flag = 1;
779 }
780 }
781
782 }
783
784 /*
785 * Check the local devid namespace to see if the disks
786 * have been moved. Use the local set first of all as this contains
787 * entries for the disks in the set.
788 *
789 * This is being done before the tk_own_bydd because the disks
790 * in the dd list could be wrong! But it should be done with the lock
791 * held for the set.
792 */
793 local_sp = metasetname(MD_LOCAL_NAME, ep);
794 for (d = dd; d != NULL; d = d->dd_next) {
795 /*
796 * Actually do the check of the disks.
797 */
798 ret = meta_upd_ctdnames(&local_sp, 0, side, d->dd_dnp, &newname,
799 ep);
800
801 if ((ret == METADEVADM_ERR) ||
802 (ret == METADEVADM_DSKNAME_ERR)) {
803 /* check failed in some unknown manner */
804 rval = -1;
805 goto out;
806 } else if (ret == METADEVADM_DISKMOVE) {
807
808 /*
809 * Update the dd namelist so that the rpc.metamhd
810 * gets the correct disks to reserve - it is the rname
811 * we are interested in.
812 */
813 if (newname != NULL) {
814 char *save_devid;
815 /*
816 * Need to save the side names key as this
817 * points to the namespace entry that will
818 * need to be updated. In addition the call
819 * to meta_make_sidenmlist does not actually
820 * set the namespace key.
821 */
822 side_names_key = d->dd_dnp->side_names_key;
823
824 /*
825 * There is the possibility that there
826 * will be multiple disks with the same
827 * name but different devids in the
828 * drivelist. Because of this, we need
829 * to look for a new dnp based on devid
830 * and not name.
831 */
832 save_devid = Strdup(d->dd_dnp->devid);
833 metafreedrivename(d->dd_dnp);
834 d->dd_dnp = metadrivenamebydevid(&sp,
835 save_devid, newname, ep);
836 Free(save_devid);
837 Free(newname);
838 /*
839 * null newname so we are reset for next time
840 * through
841 */
842 newname = NULL;
843 ret = meta_make_sidenmlist(sp,
844 d->dd_dnp, 0, NULL, ep);
845 d->dd_dnp->side_names_key = side_names_key;
846 if (ret == -1) {
847 rval = -1;
848 goto out;
849 }
850 }
851 }
852 }
853
854
855 RB_TEST(1, "take", ep)
856
857 RB_PREEMPT;
858 rb_level = 1; /* level 1 */
859
860 RB_TEST(2, "take", ep)
861
862 if (!MD_ATSET_DESC(sd)) {
863 if (tk_own_bydd(sp, dd, mhiargsp,
864 flags & MD_IM_PARTIAL_DISKSET, ep))
865 goto rollback;
866 }
867
868 RB_TEST(3, "take", ep)
869
870 RB_PREEMPT;
871 rb_level = 2; /* level 2 */
872
873 RB_TEST(4, "take", ep)
874
875 if (clnt_stimeout(mynode(), sp, mhiargsp, ep) == -1)
876 goto rollback;
877
878 if (setup_db_bydd(sp, dd, (flags & TAKE_FORCE), ep) == -1) {
879 if (! mdismddberror(ep, MDE_DB_ACCOK) &&
880 ! mdismddberror(ep, MDE_DB_TAGDATA))
881 goto rollback;
882 mdclrerror(ep);
883 }
884
885 RB_TEST(5, "take", ep)
886
887 RB_PREEMPT;
888 rb_level = 3; /* level 3 */
889
890 RB_TEST(6, "take", ep)
891
892 /* Snarf set of traditional diskset doesn't use stale information */
893 if (snarf_set(sp, FALSE, ep)) {
894 if (mdismddberror(ep, MDE_DB_STALE) ||
895 mdismddberror(ep, MDE_DB_ACCOK) ||
896 mdismddberror(ep, MDE_DB_TAGDATA)) {
897 rval = -1;
898 goto out;
899 }
900
901 if (! mdismddberror(ep, MDE_DB_NODB) &&
902 ! mdismddberror(ep, MDE_DB_NOTOWNER))
903 goto rollback;
904
905 /*
906 * Look at the set on all other hosts, if every other host
907 * has the same set with a larger genid, then we destroy this
908 * copy.
909 */
910 for (i = 0; i < MD_MAXSIDES; i++) {
911 /* Skip empty slots */
912 if (sd->sd_nodes[i][0] == '\0')
913 continue;
914
915 /* Skip this node */
916 if (strcmp(sd->sd_nodes[i], mynode()) == 0)
917 continue;
918
919 numsides++;
920
921 has_set = nodehasset(sp, sd->sd_nodes[i],
922 NHS_NST_EQ_G_GT, &xep);
923
924 if (has_set < 0) {
925 if (! mdiserror(&xep, MDE_NO_SET) &&
926 ! mdismddberror(&xep, MDE_DB_NODB))
927 goto rollback;
928 matches++;
929 mdclrerror(&xep);
930 continue;
931 }
932
933 if (has_set)
934 matches++;
935 }
936
937 /* Destroy the set */
938 if (numsides > 0 && (numsides - matches) == 0) {
939 if (meta_set_destroy(sp, FALSE, &xep))
940 mdclrerror(&xep);
941 (void) mddserror(ep, MDE_DS_SETCLEANUP, sp->setno,
942 sp->setname, NULL, mynode());
943 rval = -1;
944 }
945 goto rollback;
946 }
947
948 /*
949 * If an unresolved replicated diskset, fix up diskset
950 * and local namespaces, master block and drive record
951 * with the new devid. If all drives in diskset are
952 * now resolved, then clear set unresolved replicated flag.
953 * If an error is encountered, don't fail the take, but
954 * don't proceed any further in resolving the replicated disks.
955 */
956 if (sd->sd_flags & MD_SR_UNRSLV_REPLICATED) {
957 /* Fix up diskset and local namespaces with new devids */
958 meta_unrslv_replicated_nm(sp, dd, dnlp, ep);
959 if (mdisok(ep)) {
960 /* Fix up master block with new devids */
961 meta_unrslv_replicated_mb(sp, dd, dnlp, ep);
962 }
963
964 /* If all drives are resolved, set OK flag in set record. */
965 if (mdisok(ep) && (unrslv_replicated == 0)) {
966 /* Ignore failure since no bad effect. */
967 (void) clnt_upd_sr_flags(mynode(), sp, MD_SR_OK, ep);
968 }
969 mdclrerror(ep);
970
971 }
972
973 /*
974 * meta_getalldevs() will ultimately force devfsadmd to create
975 * the /dev links for all the configured metadevices if they
976 * do not already exist. This ensures that once the set is
977 * owned all the metadevices are accessible as opposed to waiting
978 * for devfsadmd to create them.
979 */
980 if (meta_getalldevs(sp, &nlp, FALSE, ep) != 0) {
981 metafreenamelist(nlp);
982 goto rollback;
983 }
984
985 metafreenamelist(nlp);
986
987 pathname_return = pathname_reload(&sp, sp->setno, ep);
988 if ((pathname_return == METADEVADM_ERR) ||
989 (pathname_return == METADEVADM_DSKNAME_ERR)) {
990 goto rollback;
991 }
992
993
994 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
995 goto rollback;
996
997 if (upd_dr_dbinfo(sp, sd, dd, rlp, (flags & TAKE_FORCE), ep) < 0) {
998 metafreereplicalist(rlp);
999 goto rollback;
1000 }
1001
1002 metafreereplicalist(rlp);
1003
1004 /*
1005 * If the set doesn't have the MD_SR_MB_DEVID bit set, i.e
1006 * the drives in the set don't have the device id information,
1007 * then stick it in if possible.
1008 *
1009 * If updating the master block fails for whatever reason, it's
1010 * okay. It just means the disk(s) in the diskset won't be self
1011 * identifying.
1012 */
1013 if (!(sd->sd_flags & MD_SR_MB_DEVID)) {
1014 /* Lock the set on current set members */
1015 for (i = 0; i < MD_MAXSIDES; i++) {
1016 /* Skip empty slots */
1017 if (sd->sd_nodes[i][0] == '\0')
1018 continue;
1019
1020 /* We already locked this side */
1021 if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1022 continue;
1023
1024 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) {
1025 /*
1026 * Ignore any RPC errors on a force
1027 * take. The set will have been taken
1028 * above and we still need to continue.
1029 */
1030 if (flags & TAKE_FORCE)
1031 continue;
1032 rval = -1;
1033 goto out;
1034 }
1035 BT_SET(node_active, i);
1036 }
1037 rb_level = 4; /* level 4 */
1038
1039 if (meta_update_mb(sp, dd, ep) == 0)
1040 /* update the sr_flags on all hosts */
1041 for (i = 0; i < MD_MAXSIDES; i++) {
1042 /* Skip empty slots */
1043 if (sd->sd_nodes[i][0] == '\0')
1044 continue;
1045
1046 /*
1047 * Only update those nodes that
1048 * are active (ie those that the
1049 * set is locked on).
1050 */
1051 if (!BT_TEST(node_active, i))
1052 continue;
1053
1054 if (clnt_upd_sr_flags(sd->sd_nodes[i],
1055 sp, (sd->sd_flags | MD_SR_MB_DEVID), ep))
1056 goto rollback;
1057 }
1058
1059 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1060 for (i = 0; i < MD_MAXSIDES; i++) {
1061 /* Skip empty slots */
1062 if (sd->sd_nodes[i][0] == '\0')
1063 continue;
1064
1065 /* Unlocked of this side is done later */
1066 if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1067 continue;
1068
1069 /* no point calling dead nodes */
1070 if (!BT_TEST(node_active, i))
1071 continue;
1072
1073 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1074 if (rval == 0)
1075 (void) mdstealerror(ep, &xep);
1076 rval = -1;
1077 }
1078 }
1079 }
1080
1081 /*
1082 * If we get here, we need to unlock the set before the resync
1083 * gets called, otherwise the "daemon" will hold the set lock
1084 * until the resync is done!
1085 */
1086
1087 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1088 if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1089 if (rval == 0)
1090 (void) mdstealerror(ep, &xep);
1091 rval = -1;
1092 }
1093 cl_set_setkey(NULL);
1094
1095 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1096
1097 /* We try to get things resync'ed, but this can fail */
1098 mdclrerror(&xep);
1099 if (meta_resync_all(sp, MD_DEF_RESYNC_BUF_SIZE, &xep) != 0) {
1100 if (rval == 0)
1101 (void) mdstealerror(ep, &xep);
1102 rval = -1;
1103 }
1104
1105 RB_TEST(7, "take", ep)
1106
1107 /*
1108 * In order to resolve the namespace major driver names and
1109 * to have the subdrivers attempt to re-associate devts from
1110 * the newly resolved replicated device ids, return a '2'.
1111 * This instructs metaset to release the diskset and re-take.
1112 *
1113 * Return a 2 if
1114 * - no error was detected on the take
1115 * - a replicated unresolved devid was resolved during take
1116 * - take isn't being called during an import
1117 * - this isn't already a re-take situation
1118 */
1119 if ((rval == 0) && (retake_flag == 1) &&
1120 ((flags & (TAKE_RETAKE | TAKE_IMP)) == 0)) {
1121 rval = 2;
1122 }
1123
1124 return (rval);
1125
1126 out:
1127 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1128 if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1129 if (rval == 0)
1130 (void) mdstealerror(ep, &xep);
1131 rval = -1;
1132 }
1133 if (!(sd->sd_flags & MD_SR_MB_DEVID) && (rb_level > 2)) {
1134 for (i = 0; i < MD_MAXSIDES; i++) {
1135 /* Skip empty slots */
1136 if (sd->sd_nodes[i][0] == '\0')
1137 continue;
1138
1139 /* We already unlocked this side */
1140 if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1141 continue;
1142
1143 /* no point calling dead nodes */
1144 if (!BT_TEST(node_active, i))
1145 continue;
1146
1147 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep)) {
1148 if (rval == 0)
1149 (void) mdstealerror(ep, &xep);
1150 rval = -1;
1151 }
1152 }
1153 }
1154 cl_set_setkey(NULL);
1155
1156 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1157
1158 return (rval);
1159
1160 rollback:
1161 /* Make sure we are blocking all signals */
1162 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1163 mdclrerror(&xep);
1164
1165 rval = -1;
1166
1167 /* level 4 */
1168 if (rb_level > 3) {
1169 if (sd->sd_flags & MD_SR_MB_DEVID) {
1170 /* update the sr_flags on all hosts */
1171 for (i = 0; i < MD_MAXSIDES; i++) {
1172 /* Skip empty slots */
1173 if (sd->sd_nodes[i][0] == '\0')
1174 continue;
1175
1176 /* no point calling dead nodes */
1177 if (!BT_TEST(node_active, i))
1178 continue;
1179
1180 if (clnt_upd_sr_flags(sd->sd_nodes[i], sp,
1181 (sd->sd_flags & ~MD_SR_MB_DEVID), &xep))
1182 mdclrerror(&xep);
1183 }
1184 }
1185
1186 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1187 for (i = 0; i < MD_MAXSIDES; i++) {
1188 /* Skip empty slots */
1189 if (sd->sd_nodes[i][0] == '\0')
1190 continue;
1191
1192 /* We will unlocked this side below */
1193 if (strcmp(mynode(), sd->sd_nodes[i]) == 0)
1194 continue;
1195
1196 /* no point calling dead nodes */
1197 if (!BT_TEST(node_active, i))
1198 continue;
1199
1200 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, &xep))
1201 mdclrerror(&xep);
1202 }
1203 }
1204
1205 /* level 3 */
1206 if (rb_level > 2) {
1207 if (halt_set(sp, &xep))
1208 mdclrerror(&xep);
1209 }
1210
1211 /* level 2 */
1212 if (rb_level > 1) {
1213 if (clnt_stimeout(mynode(), sp, &defmhiargs, &xep) == -1)
1214 mdclrerror(&xep);
1215 }
1216
1217 /* level 1 */
1218 if (rb_level > 0) {
1219 if (!MD_ATSET_DESC(sd)) {
1220 if (rel_own_bydd(sp, dd, FALSE, &xep))
1221 mdclrerror(&xep);
1222 }
1223 }
1224
1225 /* level 0 */
1226 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1227 if (clnt_unlock_set(mynode(), cl_sk, &xep))
1228 mdclrerror(&xep);
1229 cl_set_setkey(NULL);
1230
1231 /* release signals back to what they were on entry */
1232 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1233 mdclrerror(&xep);
1234
1235 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1236
1237 return (rval);
1238 }
1239
1240 int
meta_set_release(mdsetname_t * sp,md_error_t * ep)1241 meta_set_release(
1242 mdsetname_t *sp,
1243 md_error_t *ep
1244 )
1245 {
1246 int rval = 0;
1247 md_drive_desc *dd;
1248 mhd_mhiargs_t mhiargs;
1249 sigset_t oldsigs;
1250 md_setkey_t *cl_sk;
1251 int rb_level = 0;
1252 md_error_t xep = mdnullerror;
1253
1254 /* Make sure we own the set */
1255 if (meta_check_ownership(sp, ep) != 0)
1256 return (-1);
1257
1258 /* Get the drive descriptors */
1259 if ((dd = metaget_drivedesc(sp, (MD_BASICNAME_OK | PRINT_FAST),
1260 ep)) == NULL)
1261 if (! mdisok(ep))
1262 return (-1);
1263
1264 /* Get timeout values in case we need to roll back this release */
1265 (void) memset(&mhiargs, '\0', sizeof (mhiargs));
1266 if (clnt_gtimeout(mynode(), sp, &mhiargs, ep) != 0)
1267 return (-1);
1268
1269 /* END CHECK CODE */
1270
1271 md_rb_sig_handling_on();
1272
1273 /* Lock the set on our side */
1274 if (clnt_lock_set(mynode(), sp, ep)) {
1275 rval = -1;
1276 goto out;
1277 }
1278
1279 RB_TEST(1, "release", ep)
1280
1281 RB_PREEMPT;
1282 rb_level = 1; /* level 1 */
1283
1284 RB_TEST(2, "release", ep)
1285
1286 if (halt_set(sp, ep))
1287 goto rollback;
1288
1289 RB_TEST(3, "release", ep)
1290
1291 RB_PREEMPT;
1292 rb_level = 2; /* level 2 */
1293
1294 RB_TEST(4, "release", ep)
1295
1296 if (rel_own_bydd(sp, dd, FALSE, ep))
1297 goto rollback;
1298
1299 RB_TEST(5, "release", ep)
1300
1301 RB_PREEMPT;
1302 rb_level = 3; /* level 3 */
1303
1304 RB_TEST(6, "release", ep)
1305
1306 if (clnt_stimeout(mynode(), sp, &defmhiargs, ep) == -1)
1307 goto rollback;
1308
1309 RB_TEST(7, "release", ep)
1310
1311 out:
1312 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1313 if (clnt_unlock_set(mynode(), cl_sk, &xep)) {
1314 if (rval == 0)
1315 (void) mdstealerror(ep, &xep);
1316 rval = -1;
1317 }
1318 cl_set_setkey(NULL);
1319
1320 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1321
1322 return (rval);
1323
1324 rollback:
1325 /* Make sure we are blocking all signals */
1326 if (procsigs(TRUE, &oldsigs, &xep) < 0)
1327 mdclrerror(&xep);
1328
1329 rval = -1;
1330
1331 /* level 3 */
1332 if (rb_level > 2) {
1333 if (clnt_stimeout(mynode(), sp, &mhiargs, &xep) == -1)
1334 mdclrerror(&xep);
1335 }
1336
1337 /* level 2 */
1338 if (rb_level > 1) {
1339 if (tk_own_bydd(sp, dd, &mhiargs, FALSE, &xep))
1340 mdclrerror(&xep);
1341 }
1342
1343 /* level 1 */
1344 if (rb_level > 0) {
1345 if (setup_db_bydd(sp, dd, TRUE, &xep) == -1)
1346 mdclrerror(&xep);
1347
1348 /* Snarf set of trad diskset doesn't use stale information */
1349 if (snarf_set(sp, FALSE, &xep))
1350 mdclrerror(&xep);
1351 }
1352
1353 /* level 0 */
1354 cl_sk = cl_get_setkey(sp->setno, sp->setname);
1355 if (clnt_unlock_set(mynode(), cl_sk, &xep))
1356 mdclrerror(&xep);
1357 cl_set_setkey(NULL);
1358
1359 /* release signals back to what they were on entry */
1360 if (procsigs(FALSE, &oldsigs, &xep) < 0)
1361 mdclrerror(&xep);
1362
1363 md_rb_sig_handling_off(md_got_sig(), md_which_sig());
1364
1365 return (rval);
1366 }
1367