1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Just in case we're not in a build environment, make sure that
29 * TEXT_DOMAIN gets set to something.
30 */
31 #if !defined(TEXT_DOMAIN)
32 #define TEXT_DOMAIN "SYS_TEST"
33 #endif
34
35 /*
36 * Metadevice database interfaces.
37 */
38
39 #define MDDB
40
41 #include <meta.h>
42 #include <sys/lvm/md_mddb.h>
43 #include <sys/lvm/md_crc.h>
44 #include <sys/lvm/mdio.h>
45 #include <string.h>
46 #include <strings.h>
47 #include <ctype.h>
48
/*
 * One entry of the daemon kill list: the daemon's name paired with the
 * signal name (e.g. "HUP", "KILL") recorded for it.
 */
struct svm_daemon {
	char	*svmd_name;	/* daemon name */
	char	*svmd_kill_val;	/* signal name associated with the daemon */
};

/*
 * Daemons that the SVM smf(5) services do not stop themselves.
 * mdmonitord is started via svc:/system/mdmonitor:default, but because no
 * contract(4) is constructed for it, smf(5) does not stop it.
 */
struct svm_daemon svmd_kill_list[] = {
	{ "mdmonitord",	"HUP" },
	{ "mddoors",	"KILL" },
};

/* Number of entries in svmd_kill_list. */
#define	DAEMON_COUNT	(sizeof (svmd_kill_list) / sizeof (svmd_kill_list[0]))
65
/*
 * Defined outside this file.  The one call visible in this file passes TRUE
 * as "block" in order to block signals; "oldsigs" presumably receives the
 * previous signal mask (confirm against the definition).
 */
extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep);

/*
 * Are the locator blocks for the replicas using devids?
 *
 * When this is FALSE, meta_db_addsidenms() rejects a disk name that
 * splitname() reports as METASPLIT_LONGDISKNAME.
 */
static int devid_in_use = FALSE;
72
73 static char *
getlongname(struct mddb_config * c,md_error_t * ep)74 getlongname(
75 struct mddb_config *c,
76 md_error_t *ep
77 )
78 {
79 char *diskname = NULL;
80 char *devid_str;
81 devid_nmlist_t *disklist = NULL;
82
83 c->c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
84 if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) {
85 (void) mdstealerror(ep, &c->c_mde);
86 return (NULL);
87 }
88
89 if (c->c_locator.l_devid_flags & MDDB_DEVID_SZ) {
90 c->c_locator.l_devid = (uintptr_t)
91 Malloc(c->c_locator.l_devid_sz);
92 c->c_locator.l_devid_flags =
93 MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
94 } else {
95 (void) mderror(ep, MDE_NODEVID, "");
96 goto out;
97 }
98
99 if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) {
100 (void) mdstealerror(ep, &c->c_mde);
101 goto out;
102 }
103
104 if (c->c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
105 (void) mderror(ep, MDE_NODEVID, "");
106 goto out;
107 }
108
109 if (metaioctl(MD_DB_GETDEV, c, &c->c_mde, NULL) != 0) {
110 (void) mdstealerror(ep, &c->c_mde);
111 goto out;
112 }
113
114 if (c->c_locator.l_devid != NULL) {
115 if (meta_deviceid_to_nmlist("/dev/dsk",
116 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid,
117 c->c_locator.l_minor_name, &disklist) != 0) {
118 devid_str = devid_str_encode(
119 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, NULL);
120 (void) mderror(ep, MDE_MISSING_DEVID_DISK, "");
121 mderrorextra(ep, devid_str);
122 if (devid_str != NULL)
123 devid_str_free(devid_str);
124 goto out;
125 }
126 diskname = Strdup(disklist[0].devname);
127 }
128
129 out:
130 if (disklist != NULL)
131 devid_free_nmlist(disklist);
132
133 if (c->c_locator.l_devid != NULL)
134 Free((void *)(uintptr_t)c->c_locator.l_devid);
135
136 return (diskname);
137 }
138
139 /*
140 * meta_get_lb_inittime sends a request for the lb_inittime to the kernel
141 */
142 md_timeval32_t
meta_get_lb_inittime(mdsetname_t * sp,md_error_t * ep)143 meta_get_lb_inittime(
144 mdsetname_t *sp,
145 md_error_t *ep
146 )
147 {
148 mddb_config_t c;
149
150 (void) memset(&c, 0, sizeof (c));
151
152 /* Fill in setno, setname, and sideno */
153 c.c_setno = sp->setno;
154
155 if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) {
156 (void) mdstealerror(ep, &c.c_mde);
157 }
158
159 return (c.c_timestamp);
160 }
161
162 /*
163 * mkmasterblks writes out the master blocks of the mddb to the replica.
164 *
165 * In a MN diskset, this is called by the node that is adding this replica
166 * to the diskset.
167 */
168
169 #define MDDB_VERIFY_SIZE 8192
170
171 static int
mkmasterblks(mdsetname_t * sp,mdname_t * np,int fd,daddr_t firstblk,int dbsize,md_timeval32_t inittime,md_error_t * ep)172 mkmasterblks(
173 mdsetname_t *sp,
174 mdname_t *np,
175 int fd,
176 daddr_t firstblk,
177 int dbsize,
178 md_timeval32_t inittime,
179 md_error_t *ep
180 )
181 {
182 int consecutive;
183 md_timeval32_t tp;
184 struct mddb_mb *mb;
185 char *buffer;
186 int iosize;
187 md_set_desc *sd;
188 int mn_set = 0;
189 daddr_t startblk;
190 int cnt;
191 ddi_devid_t devid;
192
193 if (! metaislocalset(sp)) {
194 if ((sd = metaget_setdesc(sp, ep)) == NULL)
195 return (-1);
196
197 if (MD_MNSET_DESC(sd)) {
198 mn_set = 1; /* Used later */
199 }
200 }
201
202 /*
203 * Loop to verify the entire mddb region on disk is read/writable.
204 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block
205 * chunks.
206 *
207 * A side-effect of this loop is to zero out the entire mddb region
208 */
209 if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL)
210 return (mdsyserror(ep, ENOMEM, np->rname));
211
212 startblk = firstblk;
213 for (cnt = dbsize; cnt > 0; cnt -= consecutive) {
214
215 if (cnt > MDDB_VERIFY_SIZE)
216 consecutive = MDDB_VERIFY_SIZE;
217 else
218 consecutive = cnt;
219
220 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
221 Free(buffer);
222 return (mdsyserror(ep, errno, np->rname));
223 }
224
225 iosize = DEV_BSIZE * consecutive;
226 if (write(fd, buffer, iosize) != iosize) {
227 Free(buffer);
228 return (mdsyserror(ep, errno, np->rname));
229 }
230
231 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
232 Free(buffer);
233 return (mdsyserror(ep, errno, np->rname));
234 }
235
236 if (read(fd, buffer, iosize) != iosize) {
237 Free(buffer);
238 return (mdsyserror(ep, errno, np->rname));
239 }
240
241 startblk += consecutive;
242 }
243
244 Free(buffer);
245 if ((mb = Zalloc(DEV_BSIZE)) == NULL)
246 return (mdsyserror(ep, ENOMEM, np->rname));
247
248 if (meta_gettimeofday(&tp) == -1) {
249 Free(mb);
250 return (mdsyserror(ep, errno, np->rname));
251 }
252
253 mb->mb_magic = MDDB_MAGIC_MB;
254 /*
255 * If a MN diskset, set master block revision for a MN set.
256 * Even though the master block structure is no different
257 * for a MN set, setting the revision field to a different
258 * number keeps any pre-MN_diskset code from accessing
259 * this diskset. It also allows for an early determination
260 * of a MN diskset when reading in from disk so that the
261 * proper size locator block and locator names structure
262 * can be read in thus saving time on diskset startup.
263 */
264 if (mn_set)
265 mb->mb_revision = MDDB_REV_MNMB;
266 else
267 mb->mb_revision = MDDB_REV_MB;
268 mb->mb_timestamp = tp;
269 mb->mb_setno = sp->setno;
270 mb->mb_blkcnt = dbsize - 1;
271 mb->mb_blkno = firstblk;
272 mb->mb_nextblk = 0;
273
274 mb->mb_blkmap.m_firstblk = firstblk + 1;
275 mb->mb_blkmap.m_consecutive = dbsize - 1;
276 if (! metaislocalset(sp)) {
277 mb->mb_setcreatetime = inittime;
278 }
279
280 /*
281 * We try to save the disks device ID into the remaining bytes in
282 * the master block. The saved devid is used to provide a mapping
283 * between this disk's devid and the devid stored into the master
284 * block. This allows the disk image to be self-identifying
285 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used
286 * when we try to import these disks on the remote copied image.
287 * If we cannot save the disks device ID onto the master block that is
288 * ok. The disk is just not self-identifying and won't be importable
289 * in the remote copy scenario.
290 */
291 if (devid_get(fd, &devid) == 0) {
292 size_t len;
293
294 len = devid_sizeof(devid);
295 if (len <= DEV_BSIZE - sizeof (*mb)) {
296 /* there is enough space to store the devid */
297 mb->mb_devid_magic = MDDB_MAGIC_DE;
298 mb->mb_devid_len = len;
299 (void) memcpy(mb->mb_devid, devid, len);
300 }
301 devid_free(devid);
302 }
303
304 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
305 (crc_skip_t *)NULL);
306
307 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
308 Free(mb);
309 return (mdsyserror(ep, errno, np->rname));
310 }
311
312 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
313 Free(mb);
314 return (mdsyserror(ep, errno, np->rname));
315 }
316
317 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
318 Free(mb);
319 return (mdsyserror(ep, errno, np->rname));
320 }
321
322 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
323 Free(mb);
324 return (mdsyserror(ep, errno, np->rname));
325 }
326
327 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
328 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) {
329 Free(mb);
330 return (mdmddberror(ep, MDE_NOTVERIFIED,
331 meta_getminor(np->dev), sp->setno, 0, np->rname));
332 }
333
334 Free(mb);
335 return (0);
336 }
337
338 void
meta_mkdummymaster(mdsetname_t * sp,int fd,daddr_t firstblk)339 meta_mkdummymaster(
340 mdsetname_t *sp,
341 int fd,
342 daddr_t firstblk
343 )
344 {
345 md_timeval32_t tp;
346 struct mddb_mb *mb;
347 ddi_devid_t devid;
348 md_set_desc *sd;
349 md_error_t ep = mdnullerror;
350 md_timeval32_t inittime;
351
352 /*
353 * No dummy master blocks are written for a MN diskset since devids
354 * are not supported in MN disksets.
355 */
356 if (! metaislocalset(sp)) {
357 if ((sd = metaget_setdesc(sp, &ep)) == NULL)
358 return;
359
360 if (MD_MNSET_DESC(sd))
361 return;
362 }
363
364 if ((mb = Zalloc(DEV_BSIZE)) == NULL)
365 return;
366
367 mb->mb_magic = MDDB_MAGIC_DU;
368 mb->mb_revision = MDDB_REV_MB;
369 mb->mb_setno = sp->setno;
370 inittime = meta_get_lb_inittime(sp, &ep);
371 mb->mb_setcreatetime = inittime;
372
373 if (meta_gettimeofday(&tp) != -1)
374 mb->mb_timestamp = tp;
375
376 /*
377 * We try to save the disks device ID into the remaining bytes in
378 * the master block. This allows the disk image to be self-identifying
379 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used
380 * when we try to import these disks on the remote copied image.
381 * If we cannot save the disks device ID onto the master block that is
382 * ok. The disk is just not self-identifying and won't be importable
383 * in the remote copy scenario.
384 */
385 if (devid_get(fd, &devid) == 0) {
386 int len;
387
388 len = devid_sizeof(devid);
389 if (len <= DEV_BSIZE - sizeof (*mb)) {
390 /* there is enough space to store the devid */
391 mb->mb_devid_magic = MDDB_MAGIC_DE;
392 mb->mb_devid_len = len;
393 (void) memcpy(mb->mb_devid, (char *)devid, len);
394 }
395 devid_free(devid);
396 }
397
398 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
399 (crc_skip_t *)NULL);
400
401 /*
402 * If any of these operations fail, we need to inform the
403 * user that the disk won't be self identifying. When support
404 * for importing remotely replicated disksets is added, we
405 * want to add the error messages here.
406 */
407 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
408 goto out;
409
410 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE)
411 goto out;
412
413 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
414 goto out;
415
416 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE)
417 goto out;
418
419 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
420 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
421 goto out;
422
423 out:
424 Free(mb);
425 }
426
427 static int
buildconf(mdsetname_t * sp,md_error_t * ep)428 buildconf(mdsetname_t *sp, md_error_t *ep)
429 {
430 md_replicalist_t *rlp = NULL;
431 md_replicalist_t *rl;
432 FILE *cfp = NULL;
433 FILE *mfp = NULL;
434 struct stat sbuf;
435 int rval = 0;
436 int in_miniroot = 0;
437 char line[MDDB_BOOTLIST_MAX_LEN];
438 char *tname = NULL;
439
440 /* get list of local replicas */
441 if (! metaislocalset(sp))
442 return (0);
443
444 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
445 return (-1);
446
447 /* open tempfile, copy permissions of original file */
448 if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) {
449 /*
450 * On the miniroot tmp files must be created in /var/tmp.
451 * If we get a EROFS error, we assume that we are in the
452 * miniroot.
453 */
454 if (errno != EROFS)
455 goto error;
456 in_miniroot = 1;
457 errno = 0;
458 tname = tempnam("/var/tmp", "slvm_");
459 if (tname == NULL && errno == EROFS) {
460 /*
461 * If we are booted on a read-only root because
462 * of mddb quorum problems we don't want to emit
463 * any scary error messages.
464 */
465 errno = 0;
466 goto out;
467 }
468
469 /* open tempfile, copy permissions of original file */
470 if ((cfp = fopen(tname, "w+")) == NULL)
471 goto error;
472 }
473 if (stat(META_DBCONF, &sbuf) == 0) {
474 if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0)
475 goto error;
476 if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0)
477 goto error;
478 }
479
480 /* print header */
481 if (fprintf(cfp, "#metadevice database location file ") == EOF)
482 goto error;
483 if (fprintf(cfp, "do not hand edit\n") < 0)
484 goto error;
485 if (fprintf(cfp,
486 "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0)
487 goto error;
488
489 /* dump replicas */
490 for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
491 md_replica_t *r = rl->rl_repp;
492 int checksum = 42;
493 int i;
494 char *devidp;
495 minor_t min;
496
497 devidp = devid_str_encode(r->r_devid, r->r_minor_name);
498 /* If devid code can't encode devidp - skip entry */
499 if (devidp == NULL) {
500 continue;
501 }
502
503 /* compute checksum */
504 for (i = 0; ((r->r_driver_name[i] != '\0') &&
505 (i < sizeof (r->r_driver_name))); i++) {
506 checksum -= r->r_driver_name[i];
507 }
508 min = meta_getminor(r->r_namep->dev);
509 checksum -= min;
510 checksum -= r->r_blkno;
511
512 for (i = 0; i < strlen(devidp); i++) {
513 checksum -= devidp[i];
514 }
515 /* print info */
516 if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n",
517 r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) {
518 goto error;
519 }
520
521 devid_str_free(devidp);
522 }
523
524 /* close and rename to real file */
525 if (fflush(cfp) != 0)
526 goto error;
527 if (fsync(fileno(cfp)) != 0)
528 goto error;
529 if (fclose(cfp) != 0) {
530 cfp = NULL;
531 goto error;
532 }
533 cfp = NULL;
534
535 /*
536 * Renames don't work in the miniroot since tmpfiles are
537 * created in /var/tmp. Hence we copy the data out.
538 */
539
540 if (! in_miniroot) {
541 if (rename(META_DBCONFTMP, META_DBCONF) != 0)
542 goto error;
543 } else {
544 if ((cfp = fopen(tname, "r")) == NULL)
545 goto error;
546 if ((mfp = fopen(META_DBCONF, "w+")) == NULL)
547 goto error;
548 while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) {
549 if (fputs(line, mfp) == NULL)
550 goto error;
551 }
552 (void) fclose(cfp);
553 cfp = NULL;
554 if (fflush(mfp) != 0)
555 goto error;
556 if (fsync(fileno(mfp)) != 0)
557 goto error;
558 if (fclose(mfp) != 0) {
559 mfp = NULL;
560 goto error;
561 }
562 /* delete the tempfile */
563 (void) unlink(tname);
564 }
565 /* success */
566 rval = 0;
567 goto out;
568
569 /* tempfile error */
570 error:
571 rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
572 mdsyserror(ep, errno, META_DBCONFTMP);
573
574
575 /* cleanup, return success */
576 out:
577 if (rlp != NULL)
578 metafreereplicalist(rlp);
579 if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) {
580 rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
581 mdsyserror(ep, errno, META_DBCONFTMP);
582 }
583 free(tname);
584 return (rval);
585 }
586
587 /*
588 * check replica for dev
589 */
590 static int
in_replica(mdsetname_t * sp,md_replica_t * rp,mdname_t * np,diskaddr_t slblk,diskaddr_t nblks,md_error_t * ep)591 in_replica(
592 mdsetname_t *sp,
593 md_replica_t *rp,
594 mdname_t *np,
595 diskaddr_t slblk,
596 diskaddr_t nblks,
597 md_error_t *ep
598 )
599 {
600 mdname_t *repnp = rp->r_namep;
601 diskaddr_t rep_sblk = rp->r_blkno;
602 diskaddr_t rep_nblks = rp->r_nblk;
603
604 /* should be in the same set */
605 assert(sp != NULL);
606
607 /* if error in master block, assume whole partition */
608 if ((rep_sblk == MD_DISKADDR_ERROR) ||
609 (rep_nblks == MD_DISKADDR_ERROR)) {
610 rep_sblk = 0;
611 rep_nblks = MD_DISKADDR_ERROR;
612 }
613
614 /* check overlap */
615 if (meta_check_overlap(
616 MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) {
617 return (-1);
618 }
619
620 /* return success */
621 return (0);
622 }
623
624 /*
625 * check to see if we're in a replica
626 */
627 int
meta_check_inreplica(mdsetname_t * sp,mdname_t * np,diskaddr_t slblk,diskaddr_t nblks,md_error_t * ep)628 meta_check_inreplica(
629 mdsetname_t *sp,
630 mdname_t *np,
631 diskaddr_t slblk,
632 diskaddr_t nblks,
633 md_error_t *ep
634 )
635 {
636 md_replicalist_t *rlp = NULL;
637 md_replicalist_t *rl;
638 int rval = 0;
639
640 /* should have a set */
641 assert(sp != NULL);
642
643 /* for each replica */
644 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
645 return (-1);
646 for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
647 md_replica_t *rp = rl->rl_repp;
648
649 /* check replica */
650 if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) {
651 rval = -1;
652 break;
653 }
654 }
655
656 /* cleanup, return success */
657 metafreereplicalist(rlp);
658 return (rval);
659 }
660
661 /*
662 * check replica
663 */
664 int
meta_check_replica(mdsetname_t * sp,mdname_t * np,mdchkopts_t options,diskaddr_t slblk,diskaddr_t nblks,md_error_t * ep)665 meta_check_replica(
666 mdsetname_t *sp, /* set to check against */
667 mdname_t *np, /* component to check against */
668 mdchkopts_t options, /* option flags */
669 diskaddr_t slblk, /* start logical block */
670 diskaddr_t nblks, /* number of blocks (-1,rest of them) */
671 md_error_t *ep /* error packet */
672 )
673 {
674 mdchkopts_t chkoptions = MDCHK_ALLOW_REPSLICE;
675
676 /* make sure we have a disk */
677 if (metachkcomp(np, ep) != 0)
678 return (-1);
679
680 /* check to ensure that it is not already in use */
681 if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) {
682 return (-1);
683 }
684
685 if (options & MDCHK_ALLOW_NODBS)
686 return (0);
687
688 if (options & MDCHK_DRVINSET)
689 return (0);
690
691 /* make sure it is in the set */
692 if (meta_check_inset(sp, np, ep) != 0)
693 return (-1);
694
695 /* make sure its not in a metadevice */
696 if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0)
697 return (-1);
698
699 /* return success */
700 return (0);
701 }
702
703 static int
update_dbinfo_on_drives(mdsetname_t * sp,md_drive_desc * dd,int set_locked,int force,md_error_t * ep)704 update_dbinfo_on_drives(
705 mdsetname_t *sp,
706 md_drive_desc *dd,
707 int set_locked,
708 int force,
709 md_error_t *ep
710 )
711 {
712 md_set_desc *sd;
713 int i;
714 md_setkey_t *cl_sk;
715 int rval = 0;
716 md_mnnode_desc *nd;
717
718 if ((sd = metaget_setdesc(sp, ep)) == NULL)
719 return (-1);
720
721 if (! set_locked) {
722 if (MD_MNSET_DESC(sd)) {
723 md_error_t xep = mdnullerror;
724 sigset_t sigs;
725 /* Make sure we are blocking all signals */
726 if (procsigs(TRUE, &sigs, &xep) < 0)
727 mdclrerror(&xep);
728
729 nd = sd->sd_nodelist;
730 while (nd) {
731 if (force && strcmp(nd->nd_nodename,
732 mynode()) != 0) {
733 nd = nd->nd_next;
734 continue;
735 }
736
737 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
738 nd = nd->nd_next;
739 continue;
740 }
741
742 if (clnt_lock_set(nd->nd_nodename, sp, ep))
743 return (-1);
744 nd = nd->nd_next;
745 }
746 } else {
747 for (i = 0; i < MD_MAXSIDES; i++) {
748 /* Skip empty slots */
749 if (sd->sd_nodes[i][0] == '\0')
750 continue;
751
752 if (force && strcmp(sd->sd_nodes[i],
753 mynode()) != 0)
754 continue;
755
756 if (clnt_lock_set(sd->sd_nodes[i], sp, ep))
757 return (-1);
758 }
759 }
760 }
761
762 if (MD_MNSET_DESC(sd)) {
763 nd = sd->sd_nodelist;
764 while (nd) {
765 if (force && strcmp(nd->nd_nodename, mynode()) != 0) {
766 nd = nd->nd_next;
767 continue;
768 }
769
770 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
771 nd = nd->nd_next;
772 continue;
773 }
774
775 if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep)
776 == -1) {
777 rval = -1;
778 break;
779 }
780 nd = nd->nd_next;
781 }
782 } else {
783 for (i = 0; i < MD_MAXSIDES; i++) {
784 /* Skip empty slots */
785 if (sd->sd_nodes[i][0] == '\0')
786 continue;
787
788 if (force && strcmp(sd->sd_nodes[i], mynode()) != 0)
789 continue;
790
791 if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep)
792 == -1) {
793 rval = -1;
794 break;
795 }
796 }
797 }
798
799 if (! set_locked) {
800 cl_sk = cl_get_setkey(sp->setno, sp->setname);
801 if (MD_MNSET_DESC(sd)) {
802 nd = sd->sd_nodelist;
803 while (nd) {
804 if (force &&
805 strcmp(nd->nd_nodename, mynode()) != 0) {
806 nd = nd->nd_next;
807 continue;
808 }
809
810 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
811 nd = nd->nd_next;
812 continue;
813 }
814
815 if (clnt_unlock_set(nd->nd_nodename, cl_sk,
816 ep)) {
817 rval = -1;
818 break;
819 }
820 nd = nd->nd_next;
821 }
822 } else {
823 for (i = 0; i < MD_MAXSIDES; i++) {
824 /* Skip empty slots */
825 if (sd->sd_nodes[i][0] == '\0')
826 continue;
827
828 if (force &&
829 strcmp(sd->sd_nodes[i], mynode()) != 0)
830 continue;
831
832 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk,
833 ep)) {
834 rval = -1;
835 break;
836 }
837 }
838
839 }
840 cl_set_setkey(NULL);
841 }
842
843 return (rval);
844 }
845
846 int
meta_db_addsidenms(mdsetname_t * sp,mdname_t * np,daddr_t blkno,int bcast,md_error_t * ep)847 meta_db_addsidenms(
848 mdsetname_t *sp,
849 mdname_t *np,
850 daddr_t blkno,
851 int bcast,
852 md_error_t *ep
853 )
854 {
855 side_t sideno;
856 char *bname = NULL;
857 char *dname = NULL;
858 minor_t mnum;
859 mddb_config_t c;
860 int done;
861 int rval = 0;
862 md_set_desc *sd;
863
864 sideno = MD_SIDEWILD;
865 /*CONSTCOND*/
866 while (1) {
867 if (bname != NULL) {
868 Free(bname);
869 bname = NULL;
870 }
871 if (dname != NULL) {
872 Free(dname);
873 dname = NULL;
874 }
875 if ((done = meta_getnextside_devinfo(sp, np->bname,
876 &sideno, &bname, &dname, &mnum, ep)) == -1) {
877 rval = -1;
878 break;
879 }
880
881 if (done == 0)
882 break;
883
884 if (! metaislocalset(sp)) {
885 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
886 rval = -1;
887 break;
888 }
889 }
890
891 /*
892 * Send addsidenms to all nodes using rpc.mdcommd if
893 * sidename is being added to MN diskset.
894 *
895 * It's ok to broadcast this call to other nodes.
896 *
897 * Note: The broadcast to other nodes isn't needed during
898 * the addition of the first mddbs to the set since the
899 * other nodes haven't been joined to the set yet. All
900 * nodes in a MN diskset are (implicitly) joined to the set
901 * on the addition of the first mddb.
902 */
903 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
904 (bcast == DB_ADDSIDENMS_BCAST)) {
905 md_mn_result_t *resultp = NULL;
906 md_mn_msg_meta_db_newside_t db_ns;
907 int send_rval;
908
909 db_ns.msg_l_dev = np->dev;
910 db_ns.msg_sideno = sideno;
911 db_ns.msg_blkno = blkno;
912 (void) strncpy(db_ns.msg_dname, dname,
913 sizeof (db_ns.msg_dname));
914 (void) splitname(np->bname, &db_ns.msg_splitname);
915 db_ns.msg_mnum = mnum;
916
917 /* Set devid to NULL until devids are supported */
918 db_ns.msg_devid[0] = NULL;
919
920 /*
921 * If reconfig cycle has been started, this node is
922 * stuck in in the return step until this command has
923 * completed. If mdcommd is suspended, ask
924 * send_message to fail (instead of retrying)
925 * so that metaset can finish allowing the reconfig
926 * cycle to proceed.
927 */
928 send_rval = mdmn_send_message(sp->setno,
929 MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
930 MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ns,
931 sizeof (md_mn_msg_meta_db_newside_t),
932 &resultp, ep);
933 if (send_rval != 0) {
934 rval = -1;
935 if (resultp == NULL)
936 (void) mddserror(ep,
937 MDE_DS_COMMD_SEND_FAIL,
938 sp->setno, NULL, NULL,
939 sp->setname);
940 else {
941 (void) mdstealerror(ep,
942 &(resultp->mmr_ep));
943 if (mdisok(ep)) {
944 (void) mddserror(ep,
945 MDE_DS_COMMD_SEND_FAIL,
946 sp->setno, NULL, NULL,
947 sp->setname);
948 }
949 free_result(resultp);
950 }
951 break;
952 }
953 if (resultp)
954 free_result(resultp);
955 } else {
956 /*
957 * Let this side's device name, minor # and driver name
958 * be known to the database replica.
959 */
960 (void) memset(&c, 0, sizeof (c));
961
962 /* Fill in device/replica info */
963 c.c_locator.l_dev = meta_cmpldev(np->dev);
964 c.c_locator.l_blkno = blkno;
965 (void) strncpy(c.c_locator.l_driver, dname,
966 sizeof (c.c_locator.l_driver));
967 if (splitname(np->bname, &c.c_devname) ==
968 METASPLIT_LONGDISKNAME && devid_in_use == FALSE) {
969 rval = mddeverror(ep, MDE_DISKNAMETOOLONG,
970 NODEV64, np->rname);
971 break;
972 }
973
974 c.c_locator.l_mnum = mnum;
975
976 /* Fill in setno, setname, and sideno */
977 c.c_setno = sp->setno;
978 (void) strncpy(c.c_setname, sp->setname,
979 sizeof (c.c_setname));
980 c.c_sideno = sideno;
981
982 /*
983 * Don't need device id information from this ioctl
984 * Kernel determines device id from dev_t, which
985 * is just what this code would do.
986 */
987 c.c_locator.l_devid = (uint64_t)0;
988 c.c_locator.l_devid_flags = 0;
989
990 if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) {
991 rval = mdstealerror(ep, &c.c_mde);
992 break;
993 }
994 }
995 }
996
997 /* cleanup, return success */
998 if (bname != NULL) {
999 Free(bname);
1000 bname = NULL;
1001 }
1002 if (dname != NULL) {
1003 Free(dname);
1004 dname = NULL;
1005 }
1006 return (rval);
1007 }
1008
1009
1010 int
meta_db_delsidenm(mdsetname_t * sp,side_t sideno,mdname_t * np,daddr_t blkno,md_error_t * ep)1011 meta_db_delsidenm(
1012 mdsetname_t *sp,
1013 side_t sideno,
1014 mdname_t *np,
1015 daddr_t blkno,
1016 md_error_t *ep
1017 )
1018 {
1019 mddb_config_t c;
1020 md_set_desc *sd;
1021
1022 if (! metaislocalset(sp)) {
1023 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1024 return (-1);
1025 }
1026 /* Use rpc.mdcommd to delete mddb side from all nodes */
1027 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1028 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1029 md_mn_result_t *resultp = NULL;
1030 md_mn_msg_meta_db_delside_t db_ds;
1031 int send_rval;
1032
1033 db_ds.msg_l_dev = np->dev;
1034 db_ds.msg_blkno = blkno;
1035 db_ds.msg_sideno = sideno;
1036
1037 /* Set devid to NULL until devids are supported */
1038 db_ds.msg_devid[0] = NULL;
1039
1040 /*
1041 * If reconfig cycle has been started, this node is
1042 * stuck in in the return step until this command has
1043 * completed. If mdcommd is suspended, ask
1044 * send_message to fail (instead of retrying)
1045 * so that metaset can finish allowing the reconfig
1046 * cycle to proceed.
1047 */
1048 send_rval = mdmn_send_message(sp->setno,
1049 MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
1050 MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ds,
1051 sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
1052 if (send_rval != 0) {
1053 if (resultp == NULL)
1054 (void) mddserror(ep,
1055 MDE_DS_COMMD_SEND_FAIL,
1056 sp->setno, NULL, NULL,
1057 sp->setname);
1058 else {
1059 (void) mdstealerror(ep, &(resultp->mmr_ep));
1060 if (mdisok(ep)) {
1061 (void) mddserror(ep,
1062 MDE_DS_COMMD_SEND_FAIL,
1063 sp->setno, NULL, NULL,
1064 sp->setname);
1065 }
1066 free_result(resultp);
1067 }
1068 return (-1);
1069 }
1070 if (resultp)
1071 free_result(resultp);
1072
1073 } else {
1074 /*
1075 * Let this side's device name, minor # and driver name
1076 * be known to the database replica.
1077 */
1078 (void) memset(&c, 0, sizeof (c));
1079
1080 /* Fill in device/replica info */
1081 c.c_locator.l_dev = meta_cmpldev(np->dev);
1082 c.c_locator.l_blkno = blkno;
1083
1084 /* Fill in setno, setname, and sideno */
1085 c.c_setno = sp->setno;
1086 (void) strcpy(c.c_setname, sp->setname);
1087 c.c_sideno = sideno;
1088
1089 /*
1090 * Don't need device id information from this ioctl
1091 * Kernel determines device id from dev_t, which
1092 * is just what this code would do.
1093 */
1094 c.c_locator.l_devid = (uint64_t)0;
1095 c.c_locator.l_devid_flags = 0;
1096
1097 if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0)
1098 return (mdstealerror(ep, &c.c_mde));
1099 }
1100 return (0);
1101 }
1102
1103
1104 static int
mdnamesareunique(mdnamelist_t * nlp,md_error_t * ep)1105 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep)
1106 {
1107 mdnamelist_t *dnp1, *dnp2;
1108
1109 for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) {
1110 for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) {
1111 if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0)
1112 return (mderror(ep, MDE_DUPDRIVE,
1113 dnp1->namep->cname));
1114 }
1115 }
1116 return (0);
1117 }
1118
1119
/*
 * Read exactly "len" bytes from the start of the file "path" into "buf",
 * retrying after short reads.
 * Returns 0 on success, -1 if the file cannot be opened or fully read.
 */
static int
filediff_slurp(char *path, char *buf, size_t len)
{
	size_t	total = 0;
	ssize_t	got;
	int	fd;

	if ((fd = open(path, O_RDONLY)) == -1)
		return (-1);

	while (total < len) {
		got = read(fd, buf + total, len - total);
		if (got <= 0)
			break;		/* error or premature end of file */
		total += (size_t)got;
	}
	(void) close(fd);

	return ((total == len) ? 0 : -1);
}

/*
 * Return 1 if files are different, else return 0
 *
 * The files also count as different when either of them cannot be
 * stat'ed, opened or read completely, or when memory for the comparison
 * cannot be allocated.  Two empty files are identical.
 */
static int
filediff(char *tsname, char *sname)
{
	int		ret = 1;
	size_t		sz;
	struct stat	tsbuf;
	struct stat	sbuf;
	char		*tbuf;
	char		*buf;

	if ((stat(tsname, &tsbuf) != 0) || (stat(sname, &sbuf) != 0))
		return (1);
	if (tsbuf.st_size != sbuf.st_size)
		return (1);

	/* nothing to compare; also avoids depending on malloc(0) */
	sz = (size_t)sbuf.st_size;
	if (sz == 0)
		return (0);

	/* allocate memory, read both files into buffers and compare */
	tbuf = malloc(sz);
	buf = malloc(sz);
	if ((tbuf != NULL) && (buf != NULL) &&
	    (filediff_slurp(tsname, tbuf, sz) == 0) &&
	    (filediff_slurp(sname, buf, sz) == 0)) {
		/* normalize to the documented 0/1 result */
		ret = (memcmp(tbuf, buf, sz) != 0) ? 1 : 0;
	}

	free(tbuf);
	free(buf);
	return (ret);
}
1171
1172 /*
1173 * patch md.conf file with mddb locations
1174 */
1175 int
meta_db_patch(char * sname,char * cname,int patch,md_error_t * ep)1176 meta_db_patch(
1177 char *sname, /* system file name */
1178 char *cname, /* mddb.cf file name */
1179 int patch, /* patching locally */
1180 md_error_t *ep
1181 )
1182 {
1183 char *tsname = NULL;
1184 char line[MDDB_BOOTLIST_MAX_LEN];
1185 FILE *tsfp = NULL;
1186 FILE *mfp = NULL;
1187 int rval = -1;
1188
1189 /* check names */
1190 if (sname == NULL) {
1191 if (patch)
1192 sname = "md.conf";
1193 else
1194 sname = "/kernel/drv/md.conf";
1195 }
1196 if (cname == NULL)
1197 cname = META_DBCONF;
1198
1199 /*
1200 * edit file
1201 */
1202 if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) {
1203 if (mdissyserror(ep, EROFS)) {
1204 /*
1205 * If we are booted on a read-only root because
1206 * of mddb quorum problems we don't want to emit
1207 * any scary error messages.
1208 */
1209 mdclrerror(ep);
1210 rval = 0;
1211 }
1212 goto out;
1213 }
1214
1215 if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0,
1216 ep) != 0)
1217 goto out;
1218
1219 /* if file content is identical, skip rename */
1220 if (filediff(tsname, sname) == 0) {
1221 rval = 0;
1222 goto out;
1223 }
1224
1225 if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) ||
1226 (fclose(tsfp) != 0)) {
1227 (void) mdsyserror(ep, errno, tsname);
1228 goto out;
1229 }
1230
1231 tsfp = NULL;
1232
1233 /*
1234 * rename file. If we get a Cross Device error then it
1235 * is because we are in the miniroot.
1236 */
1237 if (rename(tsname, sname) != 0 && errno != EXDEV) {
1238 (void) mdsyserror(ep, errno, sname);
1239 goto out;
1240 }
1241
1242 if (errno == EXDEV) {
1243 if ((tsfp = fopen(tsname, "r")) == NULL)
1244 goto out;
1245 if ((mfp = fopen(sname, "w+")) == NULL)
1246 goto out;
1247 while (fgets(line, sizeof (line), tsfp) != NULL) {
1248 if (fputs(line, mfp) == NULL)
1249 goto out;
1250 }
1251 (void) fclose(tsfp);
1252 tsfp = NULL;
1253 if (fflush(mfp) != 0)
1254 goto out;
1255 if (fsync(fileno(mfp)) != 0)
1256 goto out;
1257 if (fclose(mfp) != 0) {
1258 mfp = NULL;
1259 goto out;
1260 }
1261 }
1262
1263 Free(tsname);
1264 tsname = NULL;
1265 rval = 0;
1266
1267 /* cleanup, return error */
1268 out:
1269 if (tsfp != NULL)
1270 (void) fclose(tsfp);
1271 if (tsname != NULL) {
1272 (void) unlink(tsname);
1273 Free(tsname);
1274 }
1275 return (rval);
1276 }
1277
1278 /*
1279 * Add replicas to set. This happens as a result of:
1280 * - metadb [-s set_name] -a
1281 * - metaset -s set_name -a disk
1282 * - metaset -s set_name -d disk (causes a rebalance of mddbs)
1283 * - metaset -s set_name -b
1284 *
1285 * For a local set, this routine is run on the local set host.
1286 *
1287 * For a traditional diskset, this routine is run on the node that
1288 * is running the metaset command.
1289 *
1290 * For a multinode diskset, this routine is run by the node that is
1291 * running the metaset command. If this is the first mddb added to
1292 * the MN diskset, then no communication is made to other nodes via commd
1293 * since the other nodes will be in-sync with respect to the mddbs when
1294 * those other nodes join the set and snarf in the newly created mddb.
1295 * If this is not the first mddb added to the MN diskset, then this
1296 * attach command is sent to all of the nodes using commd. This keeps
1297 * the nodes in-sync.
1298 */
1299 int
meta_db_attach(mdsetname_t * sp,mdnamelist_t * db_nlp,mdchkopts_t options,md_timeval32_t * timeval,int dbcnt,int dbsize,char * sysfilename,md_error_t * ep)1300 meta_db_attach(
1301 mdsetname_t *sp,
1302 mdnamelist_t *db_nlp,
1303 mdchkopts_t options,
1304 md_timeval32_t *timeval,
1305 int dbcnt,
1306 int dbsize,
1307 char *sysfilename,
1308 md_error_t *ep
1309 )
1310 {
1311 struct mddb_config c;
1312 mdnamelist_t *nlp;
1313 mdname_t *np;
1314 md_drive_desc *dd = NULL;
1315 md_drive_desc *p;
1316 int i;
1317 int fd;
1318 side_t sideno;
1319 daddr_t blkno;
1320 int replicacount = 0;
1321 int start_svmdaemons = 0;
1322 int rval = 0;
1323 md_error_t status = mdnullerror;
1324 md_set_desc *sd;
1325 int stale_bool = FALSE;
1326 int flags;
1327 int firstmddb = 1;
1328 md_timeval32_t inittime = {0, 0};
1329
1330 /*
1331 * Error if we don't get some work to do.
1332 */
1333 if (db_nlp == NULL)
1334 return (mdsyserror(ep, EINVAL, NULL));
1335
1336 if (mdnamesareunique(db_nlp, ep) != 0)
1337 return (-1);
1338 (void) memset(&c, 0, sizeof (c));
1339 c.c_id = 0;
1340 c.c_setno = sp->setno;
1341
1342 /* Don't need device id information from this ioctl */
1343 c.c_locator.l_devid = (uint64_t)0;
1344 c.c_locator.l_devid_flags = 0;
1345 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1346 if (metaislocalset(sp)) {
1347 if (mdismddberror(&c.c_mde, MDE_DB_INVALID))
1348 mdclrerror(&c.c_mde);
1349 else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) ||
1350 (! (options & MDCHK_ALLOW_NODBS)))
1351 return (mdstealerror(ep, &c.c_mde));
1352 } else {
1353 if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER))
1354 return (mdstealerror(ep, &c.c_mde));
1355 }
1356 mdclrerror(&c.c_mde);
1357 }
1358 /*
1359 * Is current set STALE?
1360 */
1361 if (c.c_flags & MDDB_C_STALE) {
1362 stale_bool = TRUE;
1363 }
1364
1365 assert(db_nlp != NULL);
1366
1367 /* if these are the first replicas then the SVM daemons need to run */
1368 if (c.c_dbcnt == 0)
1369 start_svmdaemons = 1;
1370
1371 /*
1372 * check to see if we will go over the total possible number
1373 * of data bases
1374 */
1375 nlp = db_nlp;
1376 while (nlp) {
1377 replicacount += dbcnt;
1378 nlp = nlp->next;
1379 }
1380
1381 if ((replicacount + c.c_dbcnt) > c.c_dbmax)
1382 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
1383 sp->setno, c.c_dbcnt + replicacount, NULL));
1384
1385 /*
1386 * go through and check to make sure all locations specified
1387 * are legal also pick out driver name;
1388 */
1389 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1390 diskaddr_t devsize;
1391
1392 np = nlp->namep;
1393
1394 if (! metaislocalset(sp)) {
1395 uint_t partno;
1396 uint_t rep_partno;
1397 mddrivename_t *dnp = np->drivenamep;
1398
1399 /*
1400 * make sure that non-local database replicas
1401 * are always on the replica slice.
1402 */
1403 if (meta_replicaslice(dnp,
1404 &rep_partno, ep) != 0)
1405 return (-1);
1406 if (metagetvtoc(np, FALSE, &partno, ep) == NULL)
1407 return (-1);
1408 if (partno != rep_partno)
1409 return (mddeverror(ep, MDE_REPCOMP_ONLY,
1410 np->dev, sp->setname));
1411 }
1412
1413 if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize),
1414 ep)) {
1415 return (-1);
1416 }
1417
1418 if ((devsize = metagetsize(np, ep)) == -1)
1419 return (-1);
1420
1421 if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16))
1422 return (mdmddberror(ep, MDE_REPLICA_TOOSMALL,
1423 meta_getminor(np->dev), sp->setno, devsize,
1424 np->cname));
1425 }
1426
1427 /*
1428 * If first disk in set we don't have lb_inittime yet for use as
1429 * mb_setcreatetime so don't go looking for it. WE'll come back
1430 * later and update after the locator block has been created.
1431 * If this isn't the first disk in the set, we have a locator
1432 * block and thus we have lb_inittime. Set mb_setcreatetime to
1433 * lb_inittime.
1434 */
1435 if (! metaislocalset(sp)) {
1436 if (c.c_dbcnt != 0) {
1437 firstmddb = 0;
1438 inittime = meta_get_lb_inittime(sp, ep);
1439 }
1440 }
1441
1442 /*
1443 * go through and write all master blocks
1444 */
1445
1446 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1447 np = nlp->namep;
1448
1449 if ((fd = open(np->rname, O_RDWR)) < 0)
1450 return (mdsyserror(ep, errno, np->rname));
1451
1452 for (i = 0; i < dbcnt; i++) {
1453 if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize,
1454 inittime, ep)) {
1455 (void) close(fd);
1456 return (-1);
1457 }
1458 }
1459 (void) close(fd);
1460 }
1461
1462 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1463 return (-1);
1464
1465 if (! metaislocalset(sp)) {
1466 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1467 if (! mdisok(ep))
1468 return (-1);
1469 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1470 return (-1);
1471
1472 }
1473
1474 /*
1475 * go through and tell kernel to add them
1476 */
1477 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1478 mdcinfo_t *cinfo;
1479
1480 np = nlp->namep;
1481
1482 if ((cinfo = metagetcinfo(np, ep)) == NULL) {
1483 rval = -1;
1484 goto out;
1485 }
1486
1487 /*
1488 * If mddb is being added to MN diskset and there already
1489 * exists a valid mddb in the set (which equates to this
1490 * node being an owner of the set) then use rpc.mdcommd
1491 * mechanism to add mddb(s) so that all nodes stay in sync.
1492 * If set is stale, don't log the message since rpc.mdcommd
1493 * can't write the message to the mddb.
1494 *
1495 * Otherwise, just add mddb to this node.
1496 */
1497 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1498 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1499 md_mn_result_t *resultp = NULL;
1500 md_mn_msg_meta_db_attach_t attach;
1501 int send_rval;
1502
1503 /*
1504 * In a scenario where new replicas had been added on
1505 * the master, and then all of the old replicas failed
1506 * before the slaves had knowledge of the new replicas,
1507 * the slaves are unable to re-parse in the mddb
1508 * from the new replicas since the slaves have no
1509 * knowledge of the new replicas. The following
1510 * algorithm solves this problem:
1511 * - META_DB_ATTACH message generates submsgs
1512 * - BLOCK parse (master)
1513 * - MDDB_ATTACH new replicas
1514 * - UNBLOCK parse (master) causing parse
1515 * information to be sent from master
1516 * to slaves at a higher class than the
1517 * unblock so the parse message will
1518 * reach slaves before unblock message.
1519 */
1520 attach.msg_l_dev = np->dev;
1521 attach.msg_cnt = dbcnt;
1522 attach.msg_dbsize = dbsize;
1523 (void) strncpy(attach.msg_dname, cinfo->dname,
1524 sizeof (attach.msg_dname));
1525 (void) splitname(np->bname, &attach.msg_splitname);
1526 attach.msg_options = options;
1527
1528 /* Set devid to NULL until devids are supported */
1529 attach.msg_devid[0] = NULL;
1530
1531 /*
1532 * If reconfig cycle has been started, this node is
1533 * stuck in in the return step until this command has
1534 * completed. If mdcommd is suspended, ask
1535 * send_message to fail (instead of retrying)
1536 * so that metaset can finish allowing the reconfig
1537 * cycle to proceed.
1538 */
1539 flags = MD_MSGF_FAIL_ON_SUSPEND;
1540 if (stale_bool == TRUE)
1541 flags |= MD_MSGF_NO_LOG;
1542 send_rval = mdmn_send_message(sp->setno,
1543 MD_MN_MSG_META_DB_ATTACH,
1544 flags, 0, (char *)&attach,
1545 sizeof (md_mn_msg_meta_db_attach_t),
1546 &resultp, ep);
1547 if (send_rval != 0) {
1548 rval = -1;
1549 if (resultp == NULL)
1550 (void) mddserror(ep,
1551 MDE_DS_COMMD_SEND_FAIL,
1552 sp->setno, NULL, NULL,
1553 sp->setname);
1554 else {
1555 (void) mdstealerror(ep,
1556 &(resultp->mmr_ep));
1557 if (mdisok(ep)) {
1558 (void) mddserror(ep,
1559 MDE_DS_COMMD_SEND_FAIL,
1560 sp->setno, NULL, NULL,
1561 sp->setname);
1562 }
1563 free_result(resultp);
1564 }
1565 goto out;
1566 }
1567 if (resultp)
1568 free_result(resultp);
1569 } else {
1570 /* Adding mddb(s) to just this node */
1571 for (i = 0; i < dbcnt; i++) {
1572 (void) memset(&c, 0, sizeof (c));
1573 /* Fill in device/replica info */
1574 c.c_locator.l_dev = meta_cmpldev(np->dev);
1575 c.c_locator.l_blkno = i * dbsize + 16;
1576 blkno = c.c_locator.l_blkno;
1577 (void) strncpy(c.c_locator.l_driver,
1578 cinfo->dname,
1579 sizeof (c.c_locator.l_driver));
1580
1581 if (splitname(np->bname, &c.c_devname) ==
1582 METASPLIT_LONGDISKNAME && devid_in_use ==
1583 FALSE) {
1584 rval = mddeverror(ep,
1585 MDE_DISKNAMETOOLONG,
1586 NODEV64, np->rname);
1587 goto out;
1588 }
1589
1590 c.c_locator.l_mnum = meta_getminor(np->dev);
1591
1592 /* Fill in setno, setname, and sideno */
1593 c.c_setno = sp->setno;
1594 if (! metaislocalset(sp)) {
1595 if (MD_MNSET_DESC(sd)) {
1596 c.c_multi_node = 1;
1597 }
1598 }
1599 (void) strcpy(c.c_setname, sp->setname);
1600 c.c_sideno = sideno;
1601
1602 /*
1603 * Don't need device id information from this
1604 * ioctl Kernel determines device id from
1605 * dev_t, which is just what this code would do.
1606 */
1607 c.c_locator.l_devid = (uint64_t)0;
1608 c.c_locator.l_devid_flags = 0;
1609
1610 if (timeval != NULL)
1611 c.c_timestamp = *timeval;
1612
1613 if (setup_med_cfg(sp, &c,
1614 (options & MDCHK_SET_FORCE), ep)) {
1615 rval = -1;
1616 goto out;
1617 }
1618
1619 if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde,
1620 NULL) != 0) {
1621 rval = mdstealerror(ep, &c.c_mde);
1622 goto out;
1623 }
1624 /*
1625 * This is either a traditional diskset OR this
1626 * is the first replica added to a MN diskset.
1627 * In either case, set broadcast to NO_BCAST so
1628 * that message won't go through rpc.mdcommd.
1629 * If this is a traditional diskset, the bcast
1630 * flag is ignored since traditional disksets
1631 * don't use the rpc.mdcommd.
1632 */
1633 if (meta_db_addsidenms(sp, np, blkno,
1634 DB_ADDSIDENMS_NO_BCAST, ep))
1635 goto out;
1636 }
1637 }
1638 if (! metaislocalset(sp)) {
1639 /* update the dbcnt and size in dd */
1640 for (p = dd; p != NULL; p = p->dd_next)
1641 if (p->dd_dnp == np->drivenamep) {
1642 p->dd_dbcnt = dbcnt;
1643 p->dd_dbsize = dbsize;
1644 break;
1645 }
1646 }
1647
1648 /*
1649 * If this was the first addition of disks to the
1650 * diskset you now need to update the mb_setcreatetime
1651 * which needed lb_inittime which wasn't there until now.
1652 */
1653 if (firstmddb) {
1654 if (meta_update_mb(sp, dd, ep) != 0) {
1655 return (-1);
1656 }
1657 }
1658 (void) close(fd);
1659 }
1660
1661 out:
1662 if (metaislocalset(sp)) {
1663
1664 /* everything looks fine. Start mdmonitord */
1665 if (rval == 0 && start_svmdaemons == 1) {
1666 if (meta_smf_enable(META_SMF_CORE, &status) == -1) {
1667 mde_perror(&status, "");
1668 mdclrerror(&status);
1669 }
1670 }
1671
1672 if (buildconf(sp, &status)) {
1673 /* Don't mask any previous errors */
1674 if (rval == 0)
1675 rval = mdstealerror(ep, &status);
1676 return (rval);
1677 }
1678
1679 if (meta_db_patch(sysfilename, NULL, 0, &status)) {
1680 /* Don't mask any previous errors */
1681 if (rval == 0)
1682 rval = mdstealerror(ep, &status);
1683 }
1684 } else {
1685 if (update_dbinfo_on_drives(sp, dd,
1686 (options & MDCHK_SET_LOCKED),
1687 (options & MDCHK_SET_FORCE),
1688 &status)) {
1689 /* Don't mask any previous errors */
1690 if (rval == 0)
1691 rval = mdstealerror(ep, &status);
1692 else
1693 mdclrerror(&status);
1694 }
1695 metafreedrivedesc(&dd);
1696 }
1697 /*
1698 * For MN disksets that already had already had nodes joined
1699 * before the attach of this mddb(s), the name invalidation is
1700 * done by the commd handler routine. Otherwise, if this
1701 * is the first attach of a MN diskset mddb, the invalidation
1702 * must be done here since the first attach cannot be sent
1703 * via the commd since there are no nodes joined to the set yet.
1704 */
1705 if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) ||
1706 (MD_MNSET_DESC(sd) &&
1707 (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) {
1708 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
1709 meta_invalidate_name(nlp->namep);
1710 }
1711 }
1712 return (rval);
1713 }
1714
1715 /*
1716 * deletelist_length
1717 *
1718 * return the number of slices that have been specified for deletion
1719 * on the metadb command line. This does not calculate the number
1720 * of replicas because there may be multiple replicas per slice.
1721 */
1722 static int
deletelist_length(mdnamelist_t * db_nlp)1723 deletelist_length(mdnamelist_t *db_nlp)
1724 {
1725
1726 mdnamelist_t *nlp;
1727 int list_length = 0;
1728
1729 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1730 list_length++;
1731 }
1732
1733 return (list_length);
1734 }
1735
1736 static int
in_deletelist(char * devname,mdnamelist_t * db_nlp)1737 in_deletelist(char *devname, mdnamelist_t *db_nlp)
1738 {
1739
1740 mdnamelist_t *nlp;
1741 mdname_t *np;
1742 int index = 0;
1743
1744 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1745 np = nlp->namep;
1746
1747 if (strcmp(devname, np->bname) == 0)
1748 return (index);
1749 index++;
1750 }
1751
1752 return (-1);
1753 }
1754
1755 /*
1756 * Delete replicas from set. This happens as a result of:
1757 * - metadb [-s set_name] -d
1758 * - metaset -s set_name -a disk (causes a rebalance of mddbs)
1759 * - metaset -s set_name -d disk
1760 * - metaset -s set_name -b
1761 *
1762 * For a local set, this routine is run on the local set host.
1763 *
1764 * For a traditional diskset, this routine is run on the node that
1765 * is running the metaset command.
1766 *
1767 * For a multinode diskset, this routine is run by the node that is
1768 * running the metaset command. This detach routine is sent to all
1769 * of the joined nodes in the diskset using commd. This keeps
1770 * the nodes in-sync.
1771 */
1772 int
meta_db_detach(mdsetname_t * sp,mdnamelist_t * db_nlp,mdforceopts_t force_option,char * sysfilename,md_error_t * ep)1773 meta_db_detach(
1774 mdsetname_t *sp,
1775 mdnamelist_t *db_nlp,
1776 mdforceopts_t force_option,
1777 char *sysfilename,
1778 md_error_t *ep
1779 )
1780 {
1781 struct mddb_config c;
1782 mdnamelist_t *nlp;
1783 mdname_t *np;
1784 md_drive_desc *dd = NULL;
1785 md_drive_desc *p;
1786 int replicacount;
1787 int replica_delete_count;
1788 int nr_replica_slices;
1789 int i;
1790 int stop_svmdaemons = 0;
1791 int rval = 0;
1792 int index;
1793 int valid_replicas_nottodelete = 0;
1794 int invalid_replicas_nottodelete = 0;
1795 int invalid_replicas_todelete = 0;
1796 int errored = 0;
1797 int *tag_array;
1798 int fd = -1;
1799 md_error_t status = mdnullerror;
1800 md_set_desc *sd;
1801 int stale_bool = FALSE;
1802 int flags;
1803
1804 /*
1805 * Error if we don't get some work to do.
1806 */
1807 if (db_nlp == NULL)
1808 return (mdsyserror(ep, EINVAL, NULL));
1809
1810 if (mdnamesareunique(db_nlp, ep) != 0)
1811 return (-1);
1812
1813 (void) memset(&c, 0, sizeof (c));
1814 c.c_id = 0;
1815 c.c_setno = sp->setno;
1816
1817 /* Don't need device id information from this ioctl */
1818 c.c_locator.l_devid = (uint64_t)0;
1819 c.c_locator.l_devid_flags = 0;
1820
1821 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1822 return (mdstealerror(ep, &c.c_mde));
1823
1824 /*
1825 * Is current set STALE?
1826 */
1827 if (c.c_flags & MDDB_C_STALE) {
1828 stale_bool = TRUE;
1829 }
1830
1831 replicacount = c.c_dbcnt;
1832
1833 assert(db_nlp != NULL);
1834
1835 /*
1836 * go through and gather how many data bases are on each
1837 * device specified.
1838 */
1839
1840 nr_replica_slices = deletelist_length(db_nlp);
1841 tag_array = (int *)calloc(nr_replica_slices, sizeof (int));
1842
1843 replica_delete_count = 0;
1844 for (i = 0; i < replicacount; i++) {
1845 char *devname;
1846 int found = 0;
1847
1848 c.c_id = i;
1849
1850 /* Don't need device id information from this ioctl */
1851 c.c_locator.l_devid = (uint64_t)0;
1852 c.c_locator.l_devid_flags = 0;
1853
1854 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1855 return (mdstealerror(ep, &c.c_mde));
1856
1857 devname = splicename(&c.c_devname);
1858
1859 if (strstr(devname, META_LONGDISKNAME_STR) != NULL) {
1860 Free(devname);
1861 devname = getlongname(&c, ep);
1862 if (devname == NULL) {
1863 return (-1);
1864 }
1865 }
1866
1867 if ((index = in_deletelist(devname, db_nlp)) != -1) {
1868 found = 1;
1869 tag_array[index] = 1;
1870 replica_delete_count++;
1871 }
1872
1873 errored = c.c_locator.l_flags & (MDDB_F_EREAD |
1874 MDDB_F_EWRITE | MDDB_F_TOOSMALL | MDDB_F_EFMT |
1875 MDDB_F_EDATA | MDDB_F_EMASTER);
1876
1877 /*
1878 * There are four combinations of "errored" and "found"
1879 * and they are used to find the number of
1880 * (a) valid/invalid replicas that are not in the delete
1881 * list and are available in the system.
1882 * (b) valid/invalid replicas that are to be deleted.
1883 */
1884
1885 if (errored && !found) /* errored and !found */
1886 invalid_replicas_nottodelete++;
1887 else if (!found) /* !errored and !found */
1888 valid_replicas_nottodelete++;
1889 else if (errored) /* errored and found */
1890 invalid_replicas_todelete++;
1891 /*
1892 * else it is !errored and found. This means
1893 * valid_replicas_todelete++; But this variable will not
1894 * be used anywhere
1895 */
1896
1897 Free(devname);
1898 }
1899
1900 index = 0;
1901 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1902 np = nlp->namep;
1903 if (tag_array[index++] != 1) {
1904 Free(tag_array);
1905 return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname));
1906 }
1907 }
1908
1909 Free(tag_array);
1910
1911
1912 /* if all replicas are deleted stop mdmonitord */
1913 if ((replicacount - replica_delete_count) == 0)
1914 stop_svmdaemons = 1;
1915
1916 if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) {
1917 if (force_option & MDFORCE_NONE)
1918 return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname));
1919 if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS))
1920 return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname));
1921 }
1922
1923 /*
1924 * The following algorithms are followed to check for deletion:
1925 * (a) If the delete list(db_nlp) has all invalid replicas and no valid
1926 * replicas, then deletion should be allowed.
1927 * (b) Deletion should be allowed only if valid replicas that are "not"
1928 * to be deleted is always greater than the invalid replicas that
1929 * are "not" to be deleted.
1930 * (c) If the user uses -f option, then deletion should be allowed.
1931 */
1932
1933 if ((invalid_replicas_todelete != replica_delete_count) &&
1934 (invalid_replicas_nottodelete > valid_replicas_nottodelete) &&
1935 (force_option != MDFORCE_LOCAL))
1936 return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname));
1937
1938 /*
1939 * go through and tell kernel to delete them
1940 */
1941
1942 /* Don't need device id information from this ioctl */
1943 c.c_locator.l_devid = (uint64_t)0;
1944 c.c_locator.l_devid_flags = 0;
1945
1946 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1947 return (mdstealerror(ep, &c.c_mde));
1948
1949 if (! metaislocalset(sp)) {
1950 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1951 if (! mdisok(ep))
1952 return (-1);
1953 if ((sd = metaget_setdesc(sp, ep)) == NULL)
1954 return (-1);
1955 }
1956
1957 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1958 np = nlp->namep;
1959
1960 /*
1961 * If mddb is being deleted from MN diskset and node is
1962 * an owner of the diskset then use rpc.mdcommd
1963 * mechanism to add mddb(s) so that all nodes stay in sync.
1964 * If set is stale, don't log the message since rpc.mdcommd
1965 * can't write the message to the mddb.
1966 *
1967 * When mddbs are first being added to set, a detach can
1968 * be called before any node has joined the diskset, so
1969 * must check to see if node is an owner of the diskset.
1970 *
1971 * Otherwise, just delete mddb from this node.
1972 */
1973
1974 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1975 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1976 md_mn_result_t *resultp;
1977 md_mn_msg_meta_db_detach_t detach;
1978 int send_rval;
1979
1980 /*
1981 * The following algorithm is used to detach replicas.
1982 * - META_DB_DETACH message generates submsgs
1983 * - BLOCK parse (master)
1984 * - MDDB_DETACH replicas
1985 * - UNBLOCK parse (master) causing parse
1986 * information to be sent from master
1987 * to slaves at a higher class than the
1988 * unblock so the parse message will
1989 * reach slaves before unblock message.
1990 */
1991 (void) splitname(np->bname, &detach.msg_splitname);
1992
1993 /* Set devid to NULL until devids are supported */
1994 detach.msg_devid[0] = NULL;
1995
1996 /*
1997 * If reconfig cycle has been started, this node is
1998 * stuck in in the return step until this command has
1999 * completed. If mdcommd is suspended, ask
2000 * send_message to fail (instead of retrying)
2001 * so that metaset can finish allowing the reconfig
2002 * cycle to proceed.
2003 */
2004 flags = MD_MSGF_FAIL_ON_SUSPEND;
2005 if (stale_bool == TRUE)
2006 flags |= MD_MSGF_NO_LOG;
2007 send_rval = mdmn_send_message(sp->setno,
2008 MD_MN_MSG_META_DB_DETACH,
2009 flags, 0, (char *)&detach,
2010 sizeof (md_mn_msg_meta_db_detach_t),
2011 &resultp, ep);
2012 if (send_rval != 0) {
2013 rval = -1;
2014 if (resultp == NULL)
2015 (void) mddserror(ep,
2016 MDE_DS_COMMD_SEND_FAIL,
2017 sp->setno, NULL, NULL,
2018 sp->setname);
2019 else {
2020 (void) mdstealerror(ep,
2021 &(resultp->mmr_ep));
2022 if (mdisok(ep)) {
2023 (void) mddserror(ep,
2024 MDE_DS_COMMD_SEND_FAIL,
2025 sp->setno, NULL, NULL,
2026 sp->setname);
2027 }
2028 free_result(resultp);
2029 }
2030 goto out;
2031 }
2032 if (resultp)
2033 free_result(resultp);
2034 } else {
2035 i = 0;
2036 while (i < c.c_dbcnt) {
2037 char *devname;
2038
2039 c.c_id = i;
2040
2041 /* Don't need devid info from this ioctl */
2042 c.c_locator.l_devid = (uint64_t)0;
2043 c.c_locator.l_devid_flags = 0;
2044
2045 if (metaioctl(MD_DB_GETDEV, &c,
2046 &c.c_mde, NULL)) {
2047 rval = mdstealerror(ep, &c.c_mde);
2048 goto out;
2049 }
2050
2051 devname = splicename(&c.c_devname);
2052
2053 if (strstr(devname, META_LONGDISKNAME_STR)
2054 != NULL) {
2055 Free(devname);
2056 devname = getlongname(&c, ep);
2057 if (devname == NULL) {
2058 return (-1);
2059 }
2060 }
2061
2062 if (strcmp(devname, np->bname) != 0) {
2063 Free(devname);
2064 i++;
2065 continue;
2066 }
2067 Free(devname);
2068
2069 /* Don't need devid info from this ioctl */
2070 c.c_locator.l_devid = (uint64_t)0;
2071 c.c_locator.l_devid_flags = 0;
2072
2073 if (metaioctl(MD_DB_DELDEV, &c,
2074 &c.c_mde, NULL) != 0) {
2075 rval = mdstealerror(ep, &c.c_mde);
2076 goto out;
2077 }
2078
2079 /* Not incrementing "i" intentionally */
2080 }
2081 }
2082 if (! metaislocalset(sp)) {
2083 /* update the dbcnt and size in dd */
2084 for (p = dd; p != NULL; p = p->dd_next) {
2085 if (p->dd_dnp == np->drivenamep) {
2086 p->dd_dbcnt = 0;
2087 p->dd_dbsize = 0;
2088 break;
2089 }
2090 }
2091
2092 /*
2093 * Slam a dummy master block and make it self
2094 * identifying
2095 */
2096 if ((fd = open(np->rname, O_RDWR)) >= 0) {
2097 meta_mkdummymaster(sp, fd, 16);
2098 (void) close(fd);
2099 }
2100 }
2101 }
2102 out:
2103 if (metaislocalset(sp)) {
2104 /*
2105 * Stop all the daemons if there are
2106 * no more replicas so that the module can be
2107 * unloaded.
2108 */
2109 if (rval == 0 && stop_svmdaemons == 1) {
2110 char buf[MAXPATHLEN];
2111 int i;
2112
2113 for (i = 0; i < DAEMON_COUNT; i++) {
2114 (void) snprintf(buf, MAXPATHLEN,
2115 "/usr/bin/pkill -%s -x %s",
2116 svmd_kill_list[i].svmd_kill_val,
2117 svmd_kill_list[i].svmd_name);
2118 if (pclose(popen(buf, "w")) == -1)
2119 md_perror(buf);
2120 }
2121
2122 if (meta_smf_disable(META_SMF_ALL, &status) == -1) {
2123 mde_perror(&status, "");
2124 mdclrerror(&status);
2125 }
2126 }
2127 if (buildconf(sp, &status)) {
2128 /* Don't mask any previous errors */
2129 if (rval == 0)
2130 rval = mdstealerror(ep, &status);
2131 else
2132 mdclrerror(&status);
2133 return (rval);
2134 }
2135
2136 if (meta_db_patch(sysfilename, NULL, 0, &status)) {
2137 /* Don't mask any previous errors */
2138 if (rval == 0)
2139 rval = mdstealerror(ep, &status);
2140 else
2141 mdclrerror(&status);
2142 }
2143 } else {
2144 if (update_dbinfo_on_drives(sp, dd,
2145 (force_option & MDFORCE_SET_LOCKED),
2146 ((force_option & MDFORCE_LOCAL) |
2147 (force_option & MDFORCE_DS)), &status)) {
2148 /* Don't mask any previous errors */
2149 if (rval == 0)
2150 rval = mdstealerror(ep, &status);
2151 else
2152 mdclrerror(&status);
2153 }
2154 metafreedrivedesc(&dd);
2155 }
2156 if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) {
2157 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
2158 meta_invalidate_name(nlp->namep);
2159 }
2160 }
2161 return (rval);
2162 }
2163
2164 static md_replica_t *
metareplicaname(mdsetname_t * sp,int flags,struct mddb_config * c,md_error_t * ep)2165 metareplicaname(
2166 mdsetname_t *sp,
2167 int flags,
2168 struct mddb_config *c,
2169 md_error_t *ep
2170 )
2171 {
2172 md_replica_t *rp;
2173 char *devname;
2174 size_t sz;
2175 devid_nmlist_t *disklist = NULL;
2176 char *devid_str;
2177
2178 /* allocate replicaname */
2179 rp = Zalloc(sizeof (*rp));
2180
2181 /* get device name */
2182 devname = splicename(&c->c_devname);
2183
2184 /*
2185 * Check if the device has a long name (>40 characters) and
2186 * if so then we have to use devids to get the device name.
2187 * If this cannot be done then we have to fail the request.
2188 */
2189 if (strstr(devname, META_LONGDISKNAME_STR) != NULL) {
2190 if (c->c_locator.l_devid != NULL) {
2191 if (meta_deviceid_to_nmlist("/dev/dsk",
2192 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid,
2193 c->c_locator.l_minor_name, &disklist) != 0) {
2194 devid_str = devid_str_encode(
2195 (ddi_devid_t)(uintptr_t)
2196 c->c_locator.l_devid, NULL);
2197 (void) mderror(ep, MDE_MISSING_DEVID_DISK, "");
2198 mderrorextra(ep, devid_str);
2199 if (devid_str != NULL)
2200 devid_str_free(devid_str);
2201 Free(rp);
2202 Free(devname);
2203 return (NULL);
2204 }
2205 } else {
2206 (void) mderror(ep, MDE_NODEVID, "");
2207 Free(rp);
2208 Free(devname);
2209 return (NULL);
2210 }
2211 Free(devname);
2212 devname = disklist[0].devname;
2213 }
2214
2215 if (flags & PRINT_FAST) {
2216 if ((rp->r_namep = metaname_fast(&sp, devname,
2217 LOGICAL_DEVICE, ep)) == NULL) {
2218 Free(devname);
2219 Free(rp);
2220 return (NULL);
2221 }
2222 } else {
2223 if ((rp->r_namep = metaname(&sp, devname,
2224 LOGICAL_DEVICE, ep)) == NULL) {
2225 Free(devname);
2226 Free(rp);
2227 return (NULL);
2228 }
2229 }
2230 Free(devname);
2231
2232 /* make sure it's OK */
2233 if ((! (flags & MD_BASICNAME_OK)) &&
2234 (metachkcomp(rp->r_namep, ep) != 0)) {
2235 Free(rp);
2236 return (NULL);
2237 }
2238
2239 rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR;
2240 rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR;
2241 rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID;
2242 if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) {
2243 sz = devid_sizeof((ddi_devid_t)(uintptr_t)
2244 (c->c_locator.l_devid));
2245 if ((rp->r_devid = (ddi_devid_t)malloc(sz)) ==
2246 (ddi_devid_t)NULL) {
2247 Free(rp);
2248 return (NULL);
2249 }
2250 (void) memcpy((void *)rp->r_devid,
2251 (void *)(uintptr_t)c->c_locator.l_devid, sz);
2252 (void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name);
2253 rp->r_flags &= ~MDDB_F_NODEVID;
2254 /* Overwrite dev derived from name with dev from devid */
2255 rp->r_namep->dev = meta_expldev(c->c_locator.l_dev);
2256 }
2257 (void) strcpy(rp->r_driver_name, c->c_locator.l_driver);
2258
2259 rp->r_blkno = c->c_locator.l_blkno;
2260 if (c->c_dbend != 0)
2261 rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1;
2262
2263 /* return replica */
2264 return (rp);
2265 }
2266
2267 /*
2268 * free replica list
2269 */
2270 void
metafreereplicalist(md_replicalist_t * rlp)2271 metafreereplicalist(
2272 md_replicalist_t *rlp
2273 )
2274 {
2275 md_replicalist_t *rl = NULL;
2276
2277 for (/* void */; (rlp != NULL); rlp = rl) {
2278 rl = rlp->rl_next;
2279 if (rlp->rl_repp->r_devid != (ddi_devid_t)0) {
2280 free(rlp->rl_repp->r_devid);
2281 }
2282 Free(rlp->rl_repp);
2283 Free(rlp);
2284 }
2285 }
2286
2287 /*
2288 * return list of all replicas in set
2289 */
2290 int
metareplicalist(mdsetname_t * sp,int flags,md_replicalist_t ** rlpp,md_error_t * ep)2291 metareplicalist(
2292 mdsetname_t *sp,
2293 int flags,
2294 md_replicalist_t **rlpp,
2295 md_error_t *ep
2296 )
2297 {
2298 md_replicalist_t **tail = rlpp;
2299 int count = 0;
2300 struct mddb_config c;
2301 int i;
2302 char *devid;
2303
2304 /* for each replica */
2305 i = 0;
2306 do {
2307 md_replica_t *rp;
2308
2309 /* get next replica */
2310 (void) memset(&c, 0, sizeof (c));
2311 c.c_id = i;
2312 c.c_setno = sp->setno;
2313
2314 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2315 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2316 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2317 mdclrerror(&c.c_mde);
2318 break; /* handle none at all */
2319 }
2320 (void) mdstealerror(ep, &c.c_mde);
2321 goto out;
2322 }
2323
2324 if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) {
2325 if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) {
2326 (void) mdsyserror(ep, ENOMEM, META_DBCONF);
2327 goto out;
2328 }
2329 c.c_locator.l_devid = (uintptr_t)devid;
2330 /*
2331 * Turn on space and sz flags since 'sz' amount of
2332 * space has been alloc'd.
2333 */
2334 c.c_locator.l_devid_flags =
2335 MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2336 }
2337
2338 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2339 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2340 mdclrerror(&c.c_mde);
2341 break; /* handle none at all */
2342 }
2343 (void) mdstealerror(ep, &c.c_mde);
2344 goto out;
2345 }
2346
2347 /*
2348 * Paranoid check - shouldn't happen, but is left as
2349 * a place holder for changes that will be needed after
2350 * dynamic reconfiguration changes are added to SVM (to
2351 * support movement of disks at any point in time).
2352 */
2353 if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
2354 (void) fprintf(stderr,
2355 dgettext(TEXT_DOMAIN,
2356 "Error: Relocation Information "
2357 "(drvnm=%s, mnum=0x%lx) \n"
2358 "relocation information size changed - \n"
2359 "rerun command\n"),
2360 c.c_locator.l_driver, c.c_locator.l_mnum);
2361 (void) mderror(ep, MDE_DEVID_TOOBIG, NULL);
2362 goto out;
2363 }
2364
2365 if (c.c_dbcnt == 0)
2366 break; /* handle none at all */
2367
2368 /* get info */
2369 if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL)
2370 goto out;
2371
2372 /* append to list */
2373 *tail = Zalloc(sizeof (**tail));
2374 (*tail)->rl_repp = rp;
2375 tail = &(*tail)->rl_next;
2376 ++count;
2377
2378 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2379 free(devid);
2380 c.c_locator.l_devid_flags = 0;
2381 }
2382
2383 } while (++i < c.c_dbcnt);
2384
2385 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2386 free(devid);
2387 }
2388
2389 /* return count */
2390 return (count);
2391
2392 /* cleanup, return error */
2393 out:
2394 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2395 free(devid);
2396 }
2397 metafreereplicalist(*rlpp);
2398 *rlpp = NULL;
2399 return (-1);
2400 }
2401
2402 /*
2403 * meta_sync_db_locations - get list of replicas from kernel and write
2404 * out to mddb.cf and md.conf. 'Syncs up' the replica list in
2405 * the kernel with the replica list in the conf files.
2406 *
2407 */
2408 void
meta_sync_db_locations(mdsetname_t * sp,md_error_t * ep)2409 meta_sync_db_locations(
2410 mdsetname_t *sp,
2411 md_error_t *ep
2412 )
2413 {
2414 char *sname = 0; /* system file name */
2415 char *cname = 0; /* config file name */
2416
2417 if (!metaislocalset(sp))
2418 return;
2419
2420 /* Updates backup of configuration file (aka mddb.cf) */
2421 if (buildconf(sp, ep) != 0)
2422 return;
2423
2424 /* Updates system configuration file (aka md.conf) */
2425 (void) meta_db_patch(sname, cname, 0, ep);
2426 }
2427
2428 /*
2429 * setup_db_locations - parse the mddb.cf file and
2430 * tells the driver which db locations to use.
2431 */
2432 int
meta_setup_db_locations(md_error_t * ep)2433 meta_setup_db_locations(
2434 md_error_t *ep
2435 )
2436 {
2437 mddb_config_t c;
2438 FILE *fp;
2439 char inbuff[1024];
2440 char *buff;
2441 uint_t i;
2442 size_t sz;
2443 int rval = 0;
2444 char *devidp;
2445 uint_t devid_size;
2446 char *minor_name = NULL;
2447 ddi_devid_t devid_decode;
2448 int checksum;
2449
2450 /* do mddb.cf file */
2451 (void) memset(&c, '\0', sizeof (c));
2452 if ((fp = fopen(META_DBCONF, "r")) == NULL) {
2453 if (errno != ENOENT)
2454 return (mdsyserror(ep, errno, META_DBCONF));
2455 }
2456 while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1),
2457 fp)) != NULL)) {
2458
2459 /* ignore comments */
2460 if (*buff == '#')
2461 continue;
2462
2463 /* parse locator */
2464 (void) memset(&c, 0, sizeof (c));
2465 c.c_setno = MD_LOCAL_SET;
2466 i = strcspn(buff, " \t");
2467 if (i > sizeof (c.c_locator.l_driver))
2468 i = sizeof (c.c_locator.l_driver);
2469 (void) strncpy(c.c_locator.l_driver, buff, i);
2470 buff += i;
2471 c.c_locator.l_dev =
2472 makedev((major_t)0, (minor_t)strtol(buff, &buff, 10));
2473 c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10);
2474 c.c_locator.l_mnum = minor(c.c_locator.l_dev);
2475
2476 /* parse out devid */
2477 while (isspace((int)(*buff)))
2478 buff += 1;
2479 i = strcspn(buff, " \t");
2480 if ((devidp = (char *)malloc(i+1)) == NULL)
2481 return (mdsyserror(ep, ENOMEM, META_DBCONF));
2482
2483 (void) strncpy(devidp, buff, i);
2484 devidp[i] = '\0';
2485 if (devid_str_decode(devidp, &devid_decode,
2486 &minor_name) == -1) {
2487 free(devidp);
2488 continue;
2489 }
2490
2491 /* Conf file must have minor name associated with devid */
2492 if (minor_name == NULL) {
2493 free(devidp);
2494 devid_free(devid_decode);
2495 continue;
2496 }
2497
2498 sz = devid_sizeof(devid_decode);
2499 /* Copy to devid size buffer that ioctl expects */
2500 if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) {
2501 devid_free(devid_decode);
2502 free(minor_name);
2503 free(devidp);
2504 return (mdsyserror(ep, ENOMEM, META_DBCONF));
2505 }
2506
2507 (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid,
2508 (void *)devid_decode, sz);
2509
2510 devid_free(devid_decode);
2511
2512 if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) {
2513 free(minor_name);
2514 free(devidp);
2515 free((void *)(uintptr_t)c.c_locator.l_devid);
2516 return (mdsyserror(ep, ENOMEM, META_DBCONF));
2517 }
2518 (void) strcpy(c.c_locator.l_minor_name, minor_name);
2519 free(minor_name);
2520 c.c_locator.l_devid_flags = MDDB_DEVID_VALID |
2521 MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2522 c.c_locator.l_devid_sz = sz;
2523
2524 devid_size = strlen(devidp);
2525 buff += devid_size;
2526
2527 checksum = strtol(buff, &buff, 10);
2528 for (i = 0; c.c_locator.l_driver[i] != 0; i++)
2529 checksum += c.c_locator.l_driver[i];
2530 for (i = 0; i < devid_size; i++) {
2531 checksum += devidp[i];
2532 }
2533 free(devidp);
2534
2535 checksum += minor(c.c_locator.l_dev);
2536 checksum += c.c_locator.l_blkno;
2537 if (checksum != 42) {
2538 /* overwritten later for more serious problems */
2539 rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF);
2540 free((void *)(uintptr_t)c.c_locator.l_devid);
2541 continue;
2542 }
2543 c.c_locator.l_flags = 0;
2544
2545 /* use db location */
2546 if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) {
2547 free((void *)(uintptr_t)c.c_locator.l_devid);
2548 return (mdstealerror(ep, &c.c_mde));
2549 }
2550
2551 /* free up devid if in use */
2552 free((void *)(uintptr_t)c.c_locator.l_devid);
2553 c.c_locator.l_devid = (uint64_t)0;
2554 c.c_locator.l_devid_flags = 0;
2555 }
2556 if ((fp) && (fclose(fp) != 0))
2557 return (mdsyserror(ep, errno, META_DBCONF));
2558
2559 /* check for stale database */
2560 (void) memset((char *)&c, 0, sizeof (struct mddb_config));
2561 c.c_id = 0;
2562 c.c_setno = MD_LOCAL_SET;
2563
2564 /*
2565 * While we do not need the devid here we may need to
2566 * know if devid's are being used by the kernel for
2567 * the replicas. This is because under some circumstances
2568 * we can only manipulate the SVM configuration if the
2569 * kernel is using devid's.
2570 */
2571 c.c_locator.l_devid = (uint64_t)0;
2572 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2573 c.c_locator.l_devid_sz = 0;
2574
2575 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2576 if (! mdismddberror(&c.c_mde, MDE_DB_INVALID))
2577 return (mdstealerror(ep, &c.c_mde));
2578 mdclrerror(&c.c_mde);
2579 }
2580
2581 if (c.c_flags & MDDB_C_STALE)
2582 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET,
2583 0, NULL));
2584
2585 if (c.c_locator.l_devid_sz != 0) {
2586 /*
2587 * Devid's are being used to track the replicas because
2588 * there is space for a devid.
2589 */
2590 devid_in_use = TRUE;
2591 }
2592
2593 /* success */
2594 return (rval);
2595 }
2596
2597 /*
2598 * meta_db_minreplica - returns the minimum size replica currently in use.
2599 */
2600 daddr_t
meta_db_minreplica(mdsetname_t * sp,md_error_t * ep)2601 meta_db_minreplica(
2602 mdsetname_t *sp,
2603 md_error_t *ep
2604 )
2605 {
2606 md_replica_t *r;
2607 md_replicalist_t *rl, *rlp = NULL;
2608 daddr_t nblks = 0;
2609
2610 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
2611 return (-1);
2612
2613 if (rlp == NULL)
2614 return (-1);
2615
2616 /* find the smallest existing replica */
2617 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2618 r = rl->rl_repp;
2619 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2620 }
2621
2622 metafreereplicalist(rlp);
2623 return (nblks);
2624 }
2625
2626 /*
2627 * meta_get_replica_names
2628 * returns an mdnamelist_t of replica slices
2629 */
2630 /*ARGSUSED*/
2631 int
meta_get_replica_names(mdsetname_t * sp,mdnamelist_t ** nlpp,int options,md_error_t * ep)2632 meta_get_replica_names(
2633 mdsetname_t *sp,
2634 mdnamelist_t **nlpp,
2635 int options,
2636 md_error_t *ep
2637 )
2638 {
2639 md_replicalist_t *rlp = NULL;
2640 md_replicalist_t *rl;
2641 mdnamelist_t **tailpp = nlpp;
2642 int cnt = 0;
2643
2644 assert(nlpp != NULL);
2645
2646 if (!metaislocalset(sp))
2647 goto out;
2648
2649 /* get replicas */
2650 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
2651 cnt = -1;
2652 goto out;
2653 }
2654
2655 /* build name list */
2656 for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
2657 /*
2658 * Add the name struct to the end of the
2659 * namelist but keep a pointer to the last
2660 * element so that we don't incur the overhead
2661 * of traversing the list each time
2662 */
2663 tailpp = meta_namelist_append_wrapper(
2664 tailpp, rl->rl_repp->r_namep);
2665 ++cnt;
2666 }
2667
2668 /* cleanup, return count or error */
2669 out:
2670 metafreereplicalist(rlp);
2671 return (cnt);
2672 }
2673