1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 /* 37 * Metadevice database interfaces. 38 */ 39 40 #define MDDB 41 42 #include <meta.h> 43 #include <sys/lvm/md_mddb.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/lvm/mdio.h> 46 #include <string.h> 47 #include <strings.h> 48 #include <ctype.h> 49 50 struct svm_daemon { 51 char *svmd_name; 52 char *svmd_kill_val; 53 }; 54 55 struct svm_daemon svmd_kill_list[] = { 56 {"mdmonitord", "HUP"}, 57 {"mddoors", "KILL"}, 58 }; 59 60 #define DAEMON_COUNT (sizeof (svmd_kill_list)/ sizeof (struct svm_daemon)) 61 #define MDMONITORD "/usr/sbin/mdmonitord" 62 63 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep); 64 65 /* 66 * meta_get_lb_inittime sends a request for the lb_inittime to the kernel 67 */ 68 md_timeval32_t 69 meta_get_lb_inittime( 70 mdsetname_t *sp, 71 md_error_t *ep 72 ) 73 { 74 mddb_config_t c; 75 76 (void) memset(&c, 0, sizeof (c)); 77 78 /* Fill in setno, setname, and sideno */ 79 c.c_setno = sp->setno; 80 81 if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) { 82 (void) mdstealerror(ep, &c.c_mde); 83 } 84 85 return (c.c_timestamp); 86 } 87 88 /* 89 * mkmasterblks writes out the master blocks of the mddb to the replica. 90 * 91 * In a MN diskset, this is called by the node that is adding this replica 92 * to the diskset. 93 */ 94 95 #define MDDB_VERIFY_SIZE 8192 96 97 static int 98 mkmasterblks( 99 mdsetname_t *sp, 100 mdname_t *np, 101 int fd, 102 daddr_t firstblk, 103 int dbsize, 104 md_timeval32_t inittime, 105 md_error_t *ep 106 ) 107 { 108 int consecutive; 109 md_timeval32_t tp; 110 struct mddb_mb *mb; 111 char *buffer; 112 int iosize; 113 md_set_desc *sd; 114 int mn_set = 0; 115 daddr_t startblk; 116 int cnt; 117 ddi_devid_t devid; 118 119 if (! metaislocalset(sp)) { 120 if ((sd = metaget_setdesc(sp, ep)) == NULL) 121 return (-1); 122 123 if (MD_MNSET_DESC(sd)) { 124 mn_set = 1; /* Used later */ 125 } 126 } 127 128 /* 129 * Loop to verify the entire mddb region on disk is read/writable. 130 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block 131 * chunks. 132 * 133 * A side-effect of this loop is to zero out the entire mddb region 134 */ 135 if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL) 136 return (mdsyserror(ep, ENOMEM, np->rname)); 137 138 startblk = firstblk; 139 for (cnt = dbsize; cnt > 0; cnt -= consecutive) { 140 141 if (cnt > MDDB_VERIFY_SIZE) 142 consecutive = MDDB_VERIFY_SIZE; 143 else 144 consecutive = cnt; 145 146 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 147 Free(buffer); 148 return (mdsyserror(ep, errno, np->rname)); 149 } 150 151 iosize = DEV_BSIZE * consecutive; 152 if (write(fd, buffer, iosize) != iosize) { 153 Free(buffer); 154 return (mdsyserror(ep, errno, np->rname)); 155 } 156 157 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 158 Free(buffer); 159 return (mdsyserror(ep, errno, np->rname)); 160 } 161 162 if (read(fd, buffer, iosize) != iosize) { 163 Free(buffer); 164 return (mdsyserror(ep, errno, np->rname)); 165 } 166 167 startblk += consecutive; 168 } 169 170 Free(buffer); 171 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 172 return (mdsyserror(ep, ENOMEM, np->rname)); 173 174 if (meta_gettimeofday(&tp) == -1) { 175 Free(mb); 176 return (mdsyserror(ep, errno, np->rname)); 177 } 178 179 mb->mb_magic = MDDB_MAGIC_MB; 180 /* 181 * If a MN diskset, set master block revision for a MN set. 182 * Even though the master block structure is no different 183 * for a MN set, setting the revision field to a different 184 * number keeps any pre-MN_diskset code from accessing 185 * this diskset. It also allows for an early determination 186 * of a MN diskset when reading in from disk so that the 187 * proper size locator block and locator names structure 188 * can be read in thus saving time on diskset startup. 189 */ 190 if (mn_set) 191 mb->mb_revision = MDDB_REV_MNMB; 192 else 193 mb->mb_revision = MDDB_REV_MB; 194 mb->mb_timestamp = tp; 195 mb->mb_setno = sp->setno; 196 mb->mb_blkcnt = dbsize - 1; 197 mb->mb_blkno = firstblk; 198 mb->mb_nextblk = 0; 199 200 mb->mb_blkmap.m_firstblk = firstblk + 1; 201 mb->mb_blkmap.m_consecutive = dbsize - 1; 202 if (! metaislocalset(sp)) { 203 mb->mb_setcreatetime = inittime; 204 } 205 206 /* 207 * We try to save the disks device ID into the remaining bytes in 208 * the master block. The saved devid is used to provide a mapping 209 * between this disk's devid and the devid stored into the master 210 * block. This allows the disk image to be self-identifying 211 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 212 * when we try to import these disks on the remote copied image. 213 * If we cannot save the disks device ID onto the master block that is 214 * ok. The disk is just not self-identifying and won't be importable 215 * in the remote copy scenario. 216 */ 217 if (devid_get(fd, &devid) == 0) { 218 size_t len; 219 220 len = devid_sizeof(devid); 221 if (len <= DEV_BSIZE - sizeof (*mb)) { 222 /* there is enough space to store the devid */ 223 mb->mb_devid_magic = MDDB_MAGIC_DE; 224 mb->mb_devid_len = len; 225 (void) memcpy(mb->mb_devid, devid, len); 226 } 227 devid_free(devid); 228 } 229 230 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 231 (crc_skip_t *)NULL); 232 233 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 234 Free(mb); 235 return (mdsyserror(ep, errno, np->rname)); 236 } 237 238 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 239 Free(mb); 240 return (mdsyserror(ep, errno, np->rname)); 241 } 242 243 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 244 Free(mb); 245 return (mdsyserror(ep, errno, np->rname)); 246 } 247 248 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 249 Free(mb); 250 return (mdsyserror(ep, errno, np->rname)); 251 } 252 253 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 254 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) { 255 Free(mb); 256 return (mdmddberror(ep, MDE_NOTVERIFIED, 257 meta_getminor(np->dev), sp->setno, 0, np->rname)); 258 } 259 260 Free(mb); 261 return (0); 262 } 263 264 void 265 meta_mkdummymaster( 266 mdsetname_t *sp, 267 int fd, 268 daddr_t firstblk 269 ) 270 { 271 md_timeval32_t tp; 272 struct mddb_mb *mb; 273 ddi_devid_t devid; 274 md_set_desc *sd; 275 md_error_t ep = mdnullerror; 276 md_timeval32_t inittime; 277 278 /* 279 * No dummy master blocks are written for a MN diskset since devids 280 * are not supported in MN disksets. 281 */ 282 if (! metaislocalset(sp)) { 283 if ((sd = metaget_setdesc(sp, &ep)) == NULL) 284 return; 285 286 if (MD_MNSET_DESC(sd)) 287 return; 288 } 289 290 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 291 return; 292 293 mb->mb_magic = MDDB_MAGIC_DU; 294 mb->mb_revision = MDDB_REV_MB; 295 mb->mb_setno = sp->setno; 296 inittime = meta_get_lb_inittime(sp, &ep); 297 mb->mb_setcreatetime = inittime; 298 299 if (meta_gettimeofday(&tp) != -1) 300 mb->mb_timestamp = tp; 301 302 /* 303 * We try to save the disks device ID into the remaining bytes in 304 * the master block. This allows the disk image to be self-identifying 305 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 306 * when we try to import these disks on the remote copied image. 307 * If we cannot save the disks device ID onto the master block that is 308 * ok. The disk is just not self-identifying and won't be importable 309 * in the remote copy scenario. 310 */ 311 if (devid_get(fd, &devid) == 0) { 312 int len; 313 314 len = devid_sizeof(devid); 315 if (len <= DEV_BSIZE - sizeof (*mb)) { 316 /* there is enough space to store the devid */ 317 mb->mb_devid_magic = MDDB_MAGIC_DE; 318 mb->mb_devid_len = len; 319 (void) memcpy(mb->mb_devid, (char *)devid, len); 320 } 321 devid_free(devid); 322 } 323 324 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 325 (crc_skip_t *)NULL); 326 327 /* 328 * If any of these operations fail, we need to inform the 329 * user that the disk won't be self identifying. When support 330 * for importing remotely replicated disksets is added, we 331 * want to add the error messages here. 332 */ 333 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 334 goto out; 335 336 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) 337 goto out; 338 339 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 340 goto out; 341 342 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) 343 goto out; 344 345 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 346 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) 347 goto out; 348 349 out: 350 Free(mb); 351 } 352 353 static int 354 buildconf(mdsetname_t *sp, md_error_t *ep) 355 { 356 md_replicalist_t *rlp = NULL; 357 md_replicalist_t *rl; 358 FILE *cfp = NULL; 359 FILE *mfp = NULL; 360 struct stat sbuf; 361 int rval = 0; 362 int in_miniroot = 0; 363 char line[MDDB_BOOTLIST_MAX_LEN]; 364 char *tname = NULL; 365 366 /* get list of local replicas */ 367 if (! metaislocalset(sp)) 368 return (0); 369 370 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 371 return (-1); 372 373 /* open tempfile, copy permissions of original file */ 374 if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) { 375 /* 376 * On the miniroot tmp files must be created in /var/tmp. 377 * If we get a EROFS error, we assume that we are in the 378 * miniroot. 379 */ 380 if (errno != EROFS) 381 goto error; 382 in_miniroot = 1; 383 errno = 0; 384 tname = tempnam("/var/tmp", "slvm_"); 385 if (tname == NULL && errno == EROFS) { 386 /* 387 * If we are booted on a read-only root because 388 * of mddb quorum problems we don't want to emit 389 * any scary error messages. 390 */ 391 errno = 0; 392 goto out; 393 } 394 395 /* open tempfile, copy permissions of original file */ 396 if ((cfp = fopen(tname, "w+")) == NULL) 397 goto error; 398 } 399 if (stat(META_DBCONF, &sbuf) == 0) { 400 if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0) 401 goto error; 402 if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0) 403 goto error; 404 } 405 406 /* print header */ 407 if (fprintf(cfp, "#metadevice database location file ") == EOF) 408 goto error; 409 if (fprintf(cfp, "do not hand edit\n") < 0) 410 goto error; 411 if (fprintf(cfp, 412 "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0) 413 goto error; 414 415 /* dump replicas */ 416 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 417 md_replica_t *r = rl->rl_repp; 418 int checksum = 42; 419 int i; 420 char *devidp; 421 minor_t min; 422 423 devidp = devid_str_encode(r->r_devid, r->r_minor_name); 424 /* If devid code can't encode devidp - skip entry */ 425 if (devidp == NULL) { 426 continue; 427 } 428 429 /* compute checksum */ 430 for (i = 0; ((r->r_driver_name[i] != '\0') && 431 (i < sizeof (r->r_driver_name))); i++) { 432 checksum -= r->r_driver_name[i]; 433 } 434 min = meta_getminor(r->r_namep->dev); 435 checksum -= min; 436 checksum -= r->r_blkno; 437 438 for (i = 0; i < strlen(devidp); i++) { 439 checksum -= devidp[i]; 440 } 441 /* print info */ 442 if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n", 443 r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) { 444 goto error; 445 } 446 447 devid_str_free(devidp); 448 } 449 450 /* close and rename to real file */ 451 if (fflush(cfp) != 0) 452 goto error; 453 if (fsync(fileno(cfp)) != 0) 454 goto error; 455 if (fclose(cfp) != 0) { 456 cfp = NULL; 457 goto error; 458 } 459 cfp = NULL; 460 461 /* 462 * Renames don't work in the miniroot since tmpfiles are 463 * created in /var/tmp. Hence we copy the data out. 464 */ 465 466 if (! in_miniroot) { 467 if (rename(META_DBCONFTMP, META_DBCONF) != 0) 468 goto error; 469 } else { 470 if ((cfp = fopen(tname, "r")) == NULL) 471 goto error; 472 if ((mfp = fopen(META_DBCONF, "w+")) == NULL) 473 goto error; 474 while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) { 475 if (fputs(line, mfp) == NULL) 476 goto error; 477 } 478 (void) fclose(cfp); 479 cfp = NULL; 480 if (fflush(mfp) != 0) 481 goto error; 482 if (fsync(fileno(mfp)) != 0) 483 goto error; 484 if (fclose(mfp) != 0) { 485 mfp = NULL; 486 goto error; 487 } 488 /* delete the tempfile */ 489 (void) unlink(tname); 490 } 491 /* success */ 492 rval = 0; 493 goto out; 494 495 /* tempfile error */ 496 error: 497 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 498 mdsyserror(ep, errno, META_DBCONFTMP); 499 500 501 /* cleanup, return success */ 502 out: 503 if (rlp != NULL) 504 metafreereplicalist(rlp); 505 if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) { 506 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 507 mdsyserror(ep, errno, META_DBCONFTMP); 508 } 509 free(tname); 510 return (rval); 511 } 512 513 /* 514 * check replica for dev 515 */ 516 static int 517 in_replica( 518 mdsetname_t *sp, 519 md_replica_t *rp, 520 mdname_t *np, 521 diskaddr_t slblk, 522 diskaddr_t nblks, 523 md_error_t *ep 524 ) 525 { 526 mdname_t *repnp = rp->r_namep; 527 diskaddr_t rep_sblk = rp->r_blkno; 528 diskaddr_t rep_nblks = rp->r_nblk; 529 530 /* should be in the same set */ 531 assert(sp != NULL); 532 533 /* if error in master block, assume whole partition */ 534 if ((rep_sblk == MD_DISKADDR_ERROR) || 535 (rep_nblks == MD_DISKADDR_ERROR)) { 536 rep_sblk = 0; 537 rep_nblks = MD_DISKADDR_ERROR; 538 } 539 540 /* check overlap */ 541 if (meta_check_overlap( 542 MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) { 543 return (-1); 544 } 545 546 /* return success */ 547 return (0); 548 } 549 550 /* 551 * check to see if we're in a replica 552 */ 553 int 554 meta_check_inreplica( 555 mdsetname_t *sp, 556 mdname_t *np, 557 diskaddr_t slblk, 558 diskaddr_t nblks, 559 md_error_t *ep 560 ) 561 { 562 md_replicalist_t *rlp = NULL; 563 md_replicalist_t *rl; 564 int rval = 0; 565 566 /* should have a set */ 567 assert(sp != NULL); 568 569 /* for each replica */ 570 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 571 return (-1); 572 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 573 md_replica_t *rp = rl->rl_repp; 574 575 /* check replica */ 576 if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) { 577 rval = -1; 578 break; 579 } 580 } 581 582 /* cleanup, return success */ 583 metafreereplicalist(rlp); 584 return (rval); 585 } 586 587 /* 588 * check replica 589 */ 590 int 591 meta_check_replica( 592 mdsetname_t *sp, /* set to check against */ 593 mdname_t *np, /* component to check against */ 594 mdchkopts_t options, /* option flags */ 595 diskaddr_t slblk, /* start logical block */ 596 diskaddr_t nblks, /* number of blocks (-1,rest of them) */ 597 md_error_t *ep /* error packet */ 598 ) 599 { 600 mdchkopts_t chkoptions = MDCHK_ALLOW_REPSLICE; 601 602 /* make sure we have a disk */ 603 if (metachkcomp(np, ep) != 0) 604 return (-1); 605 606 /* check to ensure that it is not already in use */ 607 if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { 608 return (-1); 609 } 610 611 if (options & MDCHK_ALLOW_NODBS) 612 return (0); 613 614 if (options & MDCHK_DRVINSET) 615 return (0); 616 617 /* make sure it is in the set */ 618 if (meta_check_inset(sp, np, ep) != 0) 619 return (-1); 620 621 /* make sure its not in a metadevice */ 622 if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0) 623 return (-1); 624 625 /* return success */ 626 return (0); 627 } 628 629 static int 630 update_dbinfo_on_drives( 631 mdsetname_t *sp, 632 md_drive_desc *dd, 633 int set_locked, 634 int force, 635 md_error_t *ep 636 ) 637 { 638 md_set_desc *sd; 639 int i; 640 md_setkey_t *cl_sk; 641 int rval = 0; 642 md_mnnode_desc *nd; 643 644 if ((sd = metaget_setdesc(sp, ep)) == NULL) 645 return (-1); 646 647 if (! set_locked) { 648 if (MD_MNSET_DESC(sd)) { 649 md_error_t xep = mdnullerror; 650 sigset_t sigs; 651 /* Make sure we are blocking all signals */ 652 if (procsigs(TRUE, &sigs, &xep) < 0) 653 mdclrerror(&xep); 654 655 nd = sd->sd_nodelist; 656 while (nd) { 657 if (force && strcmp(nd->nd_nodename, 658 mynode()) != 0) { 659 nd = nd->nd_next; 660 continue; 661 } 662 663 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 664 nd = nd->nd_next; 665 continue; 666 } 667 668 if (clnt_lock_set(nd->nd_nodename, sp, ep)) 669 return (-1); 670 nd = nd->nd_next; 671 } 672 } else { 673 for (i = 0; i < MD_MAXSIDES; i++) { 674 /* Skip empty slots */ 675 if (sd->sd_nodes[i][0] == '\0') 676 continue; 677 678 if (force && strcmp(sd->sd_nodes[i], 679 mynode()) != 0) 680 continue; 681 682 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) 683 return (-1); 684 } 685 } 686 } 687 688 if (MD_MNSET_DESC(sd)) { 689 nd = sd->sd_nodelist; 690 while (nd) { 691 if (force && strcmp(nd->nd_nodename, mynode()) != 0) { 692 nd = nd->nd_next; 693 continue; 694 } 695 696 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 697 nd = nd->nd_next; 698 continue; 699 } 700 701 if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep) 702 == -1) { 703 rval = -1; 704 break; 705 } 706 nd = nd->nd_next; 707 } 708 } else { 709 for (i = 0; i < MD_MAXSIDES; i++) { 710 /* Skip empty slots */ 711 if (sd->sd_nodes[i][0] == '\0') 712 continue; 713 714 if (force && strcmp(sd->sd_nodes[i], mynode()) != 0) 715 continue; 716 717 if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep) 718 == -1) { 719 rval = -1; 720 break; 721 } 722 } 723 } 724 725 if (! set_locked) { 726 cl_sk = cl_get_setkey(sp->setno, sp->setname); 727 if (MD_MNSET_DESC(sd)) { 728 nd = sd->sd_nodelist; 729 while (nd) { 730 if (force && 731 strcmp(nd->nd_nodename, mynode()) != 0) { 732 nd = nd->nd_next; 733 continue; 734 } 735 736 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 737 nd = nd->nd_next; 738 continue; 739 } 740 741 if (clnt_unlock_set(nd->nd_nodename, cl_sk, 742 ep)) { 743 rval = -1; 744 break; 745 } 746 nd = nd->nd_next; 747 } 748 } else { 749 for (i = 0; i < MD_MAXSIDES; i++) { 750 /* Skip empty slots */ 751 if (sd->sd_nodes[i][0] == '\0') 752 continue; 753 754 if (force && 755 strcmp(sd->sd_nodes[i], mynode()) != 0) 756 continue; 757 758 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, 759 ep)) { 760 rval = -1; 761 break; 762 } 763 } 764 765 } 766 cl_set_setkey(NULL); 767 } 768 769 return (rval); 770 } 771 772 int 773 meta_db_addsidenms( 774 mdsetname_t *sp, 775 mdname_t *np, 776 daddr_t blkno, 777 int bcast, 778 md_error_t *ep 779 ) 780 { 781 side_t sideno; 782 char *bname = NULL; 783 char *dname = NULL; 784 minor_t mnum; 785 mddb_config_t c; 786 int done; 787 int rval = 0; 788 md_set_desc *sd; 789 790 sideno = MD_SIDEWILD; 791 /*CONSTCOND*/ 792 while (1) { 793 if (bname != NULL) { 794 Free(bname); 795 bname = NULL; 796 } 797 if (dname != NULL) { 798 Free(dname); 799 dname = NULL; 800 } 801 if ((done = meta_getnextside_devinfo(sp, np->bname, 802 &sideno, &bname, &dname, &mnum, ep)) == -1) { 803 rval = -1; 804 break; 805 } 806 807 if (done == 0) 808 break; 809 810 if (! metaislocalset(sp)) { 811 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 812 rval = -1; 813 break; 814 } 815 } 816 817 /* 818 * Send addsidenms to all nodes using rpc.mdcommd if 819 * sidename is being added to MN diskset. 820 * 821 * It's ok to broadcast this call to other nodes. 822 * 823 * Note: The broadcast to other nodes isn't needed during 824 * the addition of the first mddbs to the set since the 825 * other nodes haven't been joined to the set yet. All 826 * nodes in a MN diskset are (implicitly) joined to the set 827 * on the addition of the first mddb. 828 */ 829 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 830 (bcast == DB_ADDSIDENMS_BCAST)) { 831 md_mn_result_t *resultp = NULL; 832 md_mn_msg_meta_db_newside_t db_ns; 833 int send_rval; 834 835 db_ns.msg_l_dev = np->dev; 836 db_ns.msg_sideno = sideno; 837 db_ns.msg_blkno = blkno; 838 (void) strncpy(db_ns.msg_dname, dname, 839 sizeof (db_ns.msg_dname)); 840 (void) splitname(np->bname, &db_ns.msg_splitname); 841 db_ns.msg_mnum = mnum; 842 843 /* Set devid to NULL until devids are supported */ 844 db_ns.msg_devid[0] = NULL; 845 846 /* 847 * If reconfig cycle has been started, this node is 848 * stuck in in the return step until this command has 849 * completed. If mdcommd is suspended, ask 850 * send_message to fail (instead of retrying) 851 * so that metaset can finish allowing the reconfig 852 * cycle to proceed. 853 */ 854 send_rval = mdmn_send_message(sp->setno, 855 MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND | 856 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns, 857 sizeof (md_mn_msg_meta_db_newside_t), 858 &resultp, ep); 859 if (send_rval != 0) { 860 rval = -1; 861 if (resultp == NULL) 862 (void) mddserror(ep, 863 MDE_DS_COMMD_SEND_FAIL, 864 sp->setno, NULL, NULL, 865 sp->setname); 866 else { 867 (void) mdstealerror(ep, 868 &(resultp->mmr_ep)); 869 if (mdisok(ep)) { 870 (void) mddserror(ep, 871 MDE_DS_COMMD_SEND_FAIL, 872 sp->setno, NULL, NULL, 873 sp->setname); 874 } 875 free_result(resultp); 876 } 877 break; 878 } 879 if (resultp) 880 free_result(resultp); 881 } else { 882 /* 883 * Let this side's device name, minor # and driver name 884 * be known to the database replica. 885 */ 886 (void) memset(&c, 0, sizeof (c)); 887 888 /* Fill in device/replica info */ 889 c.c_locator.l_dev = meta_cmpldev(np->dev); 890 c.c_locator.l_blkno = blkno; 891 (void) strncpy(c.c_locator.l_driver, dname, 892 sizeof (c.c_locator.l_driver)); 893 (void) splitname(bname, &c.c_devname); 894 c.c_locator.l_mnum = mnum; 895 896 /* Fill in setno, setname, and sideno */ 897 c.c_setno = sp->setno; 898 (void) strncpy(c.c_setname, sp->setname, 899 sizeof (c.c_setname)); 900 c.c_sideno = sideno; 901 902 /* 903 * Don't need device id information from this ioctl 904 * Kernel determines device id from dev_t, which 905 * is just what this code would do. 906 */ 907 c.c_locator.l_devid = (uint64_t)0; 908 c.c_locator.l_devid_flags = 0; 909 910 if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) { 911 rval = mdstealerror(ep, &c.c_mde); 912 break; 913 } 914 } 915 } 916 917 /* cleanup, return success */ 918 if (bname != NULL) { 919 Free(bname); 920 bname = NULL; 921 } 922 if (dname != NULL) { 923 Free(dname); 924 dname = NULL; 925 } 926 return (rval); 927 } 928 929 930 int 931 meta_db_delsidenm( 932 mdsetname_t *sp, 933 side_t sideno, 934 mdname_t *np, 935 daddr_t blkno, 936 md_error_t *ep 937 ) 938 { 939 mddb_config_t c; 940 md_set_desc *sd; 941 942 if (! metaislocalset(sp)) { 943 if ((sd = metaget_setdesc(sp, ep)) == NULL) 944 return (-1); 945 } 946 /* Use rpc.mdcommd to delete mddb side from all nodes */ 947 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 948 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 949 md_mn_result_t *resultp = NULL; 950 md_mn_msg_meta_db_delside_t db_ds; 951 int send_rval; 952 953 db_ds.msg_l_dev = np->dev; 954 db_ds.msg_blkno = blkno; 955 db_ds.msg_sideno = sideno; 956 957 /* Set devid to NULL until devids are supported */ 958 db_ds.msg_devid[0] = NULL; 959 960 /* 961 * If reconfig cycle has been started, this node is 962 * stuck in in the return step until this command has 963 * completed. If mdcommd is suspended, ask 964 * send_message to fail (instead of retrying) 965 * so that metaset can finish allowing the reconfig 966 * cycle to proceed. 967 */ 968 send_rval = mdmn_send_message(sp->setno, 969 MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | 970 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds, 971 sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep); 972 if (send_rval != 0) { 973 if (resultp == NULL) 974 (void) mddserror(ep, 975 MDE_DS_COMMD_SEND_FAIL, 976 sp->setno, NULL, NULL, 977 sp->setname); 978 else { 979 (void) mdstealerror(ep, &(resultp->mmr_ep)); 980 if (mdisok(ep)) { 981 (void) mddserror(ep, 982 MDE_DS_COMMD_SEND_FAIL, 983 sp->setno, NULL, NULL, 984 sp->setname); 985 } 986 free_result(resultp); 987 } 988 return (-1); 989 } 990 if (resultp) 991 free_result(resultp); 992 993 } else { 994 /* 995 * Let this side's device name, minor # and driver name 996 * be known to the database replica. 997 */ 998 (void) memset(&c, 0, sizeof (c)); 999 1000 /* Fill in device/replica info */ 1001 c.c_locator.l_dev = meta_cmpldev(np->dev); 1002 c.c_locator.l_blkno = blkno; 1003 1004 /* Fill in setno, setname, and sideno */ 1005 c.c_setno = sp->setno; 1006 (void) strcpy(c.c_setname, sp->setname); 1007 c.c_sideno = sideno; 1008 1009 /* 1010 * Don't need device id information from this ioctl 1011 * Kernel determines device id from dev_t, which 1012 * is just what this code would do. 1013 */ 1014 c.c_locator.l_devid = (uint64_t)0; 1015 c.c_locator.l_devid_flags = 0; 1016 1017 if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0) 1018 return (mdstealerror(ep, &c.c_mde)); 1019 } 1020 return (0); 1021 } 1022 1023 1024 static int 1025 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep) 1026 { 1027 mdnamelist_t *dnp1, *dnp2; 1028 1029 for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) { 1030 for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) { 1031 if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0) 1032 return (mderror(ep, MDE_DUPDRIVE, 1033 dnp1->namep->cname)); 1034 } 1035 } 1036 return (0); 1037 } 1038 1039 1040 /* 1041 * Return 1 if files are different, else return 0 1042 */ 1043 static int 1044 filediff(char *tsname, char *sname) 1045 { 1046 int ret = 1, fd; 1047 size_t tsz, sz; 1048 struct stat sbuf; 1049 char *tbuf, *buf; 1050 1051 if (stat(tsname, &sbuf) != 0) 1052 return (1); 1053 tsz = sbuf.st_size; 1054 if (stat(sname, &sbuf) != 0) 1055 return (1); 1056 sz = sbuf.st_size; 1057 if (tsz != sz) 1058 return (1); 1059 1060 /* allocate memory and read both files into buffer */ 1061 tbuf = malloc(tsz); 1062 buf = malloc(sz); 1063 if (tbuf == NULL || buf == NULL) 1064 goto out; 1065 1066 fd = open(tsname, O_RDONLY); 1067 if (fd == -1) 1068 goto out; 1069 sz = read(fd, tbuf, tsz); 1070 (void) close(fd); 1071 if (sz != tsz) 1072 goto out; 1073 1074 fd = open(sname, O_RDONLY); 1075 if (fd == -1) 1076 goto out; 1077 sz = read(fd, buf, tsz); 1078 (void) close(fd); 1079 if (sz != tsz) 1080 goto out; 1081 1082 /* compare content */ 1083 ret = bcmp(tbuf, buf, tsz); 1084 out: 1085 if (tbuf) 1086 free(tbuf); 1087 if (buf) 1088 free(buf); 1089 return (ret); 1090 } 1091 1092 /* 1093 * patch md.conf file with mddb locations 1094 */ 1095 int 1096 meta_db_patch( 1097 char *sname, /* system file name */ 1098 char *cname, /* mddb.cf file name */ 1099 int patch, /* patching locally */ 1100 md_error_t *ep 1101 ) 1102 { 1103 char *tsname = NULL; 1104 char line[MDDB_BOOTLIST_MAX_LEN]; 1105 FILE *tsfp = NULL; 1106 FILE *mfp = NULL; 1107 int rval = -1; 1108 1109 /* check names */ 1110 if (sname == NULL) { 1111 if (patch) 1112 sname = "md.conf"; 1113 else 1114 sname = "/kernel/drv/md.conf"; 1115 } 1116 if (cname == NULL) 1117 cname = META_DBCONF; 1118 1119 /* 1120 * edit file 1121 */ 1122 if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) { 1123 if (mdissyserror(ep, EROFS)) { 1124 /* 1125 * If we are booted on a read-only root because 1126 * of mddb quorum problems we don't want to emit 1127 * any scary error messages. 1128 */ 1129 mdclrerror(ep); 1130 rval = 0; 1131 } 1132 goto out; 1133 } 1134 1135 if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 1136 ep) != 0) 1137 goto out; 1138 1139 /* if file content is identical, skip rename */ 1140 if (filediff(tsname, sname) == 0) { 1141 rval = 0; 1142 goto out; 1143 } 1144 1145 if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) || 1146 (fclose(tsfp) != 0)) { 1147 (void) mdsyserror(ep, errno, tsname); 1148 goto out; 1149 } 1150 1151 tsfp = NULL; 1152 1153 /* 1154 * rename file. If we get a Cross Device error then it 1155 * is because we are in the miniroot. 1156 */ 1157 if (rename(tsname, sname) != 0 && errno != EXDEV) { 1158 (void) mdsyserror(ep, errno, sname); 1159 goto out; 1160 } 1161 1162 if (errno == EXDEV) { 1163 if ((tsfp = fopen(tsname, "r")) == NULL) 1164 goto out; 1165 if ((mfp = fopen(sname, "w+")) == NULL) 1166 goto out; 1167 while (fgets(line, sizeof (line), tsfp) != NULL) { 1168 if (fputs(line, mfp) == NULL) 1169 goto out; 1170 } 1171 (void) fclose(tsfp); 1172 tsfp = NULL; 1173 if (fflush(mfp) != 0) 1174 goto out; 1175 if (fsync(fileno(mfp)) != 0) 1176 goto out; 1177 if (fclose(mfp) != 0) { 1178 mfp = NULL; 1179 goto out; 1180 } 1181 } 1182 1183 Free(tsname); 1184 tsname = NULL; 1185 rval = 0; 1186 1187 /* cleanup, return error */ 1188 out: 1189 if (tsfp != NULL) 1190 (void) fclose(tsfp); 1191 if (tsname != NULL) { 1192 (void) unlink(tsname); 1193 Free(tsname); 1194 } 1195 return (rval); 1196 } 1197 1198 /* 1199 * Add replicas to set. This happens as a result of: 1200 * - metadb [-s set_name] -a 1201 * - metaset -s set_name -a disk 1202 * - metaset -s set_name -d disk (causes a rebalance of mddbs) 1203 * - metaset -s set_name -b 1204 * 1205 * For a local set, this routine is run on the local set host. 1206 * 1207 * For a traditional diskset, this routine is run on the node that 1208 * is running the metaset command. 1209 * 1210 * For a multinode diskset, this routine is run by the node that is 1211 * running the metaset command. If this is the first mddb added to 1212 * the MN diskset, then no communication is made to other nodes via commd 1213 * since the other nodes will be in-sync with respect to the mddbs when 1214 * those other nodes join the set and snarf in the newly created mddb. 1215 * If this is not the first mddb added to the MN diskset, then this 1216 * attach command is sent to all of the nodes using commd. This keeps 1217 * the nodes in-sync. 1218 */ 1219 int 1220 meta_db_attach( 1221 mdsetname_t *sp, 1222 mdnamelist_t *db_nlp, 1223 mdchkopts_t options, 1224 md_timeval32_t *timeval, 1225 int dbcnt, 1226 int dbsize, 1227 char *sysfilename, 1228 md_error_t *ep 1229 ) 1230 { 1231 struct mddb_config c; 1232 mdnamelist_t *nlp; 1233 mdname_t *np; 1234 md_drive_desc *dd = NULL; 1235 md_drive_desc *p; 1236 int i; 1237 int fd; 1238 side_t sideno; 1239 daddr_t blkno; 1240 int replicacount = 0; 1241 int start_mdmonitord = 0; 1242 int rval = 0; 1243 md_error_t status = mdnullerror; 1244 md_set_desc *sd; 1245 int stale_bool = FALSE; 1246 int flags; 1247 int firstmddb = 1; 1248 md_timeval32_t inittime = {0, 0}; 1249 1250 /* 1251 * Error if we don't get some work to do. 1252 */ 1253 if (db_nlp == NULL) 1254 return (mdsyserror(ep, EINVAL, NULL)); 1255 1256 if (mdnamesareunique(db_nlp, ep) != 0) 1257 return (-1); 1258 (void) memset(&c, 0, sizeof (c)); 1259 c.c_id = 0; 1260 c.c_setno = sp->setno; 1261 1262 /* Don't need device id information from this ioctl */ 1263 c.c_locator.l_devid = (uint64_t)0; 1264 c.c_locator.l_devid_flags = 0; 1265 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 1266 if (metaislocalset(sp)) { 1267 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) 1268 mdclrerror(&c.c_mde); 1269 else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) || 1270 (! (options & MDCHK_ALLOW_NODBS))) 1271 return (mdstealerror(ep, &c.c_mde)); 1272 } else { 1273 if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER)) 1274 return (mdstealerror(ep, &c.c_mde)); 1275 } 1276 mdclrerror(&c.c_mde); 1277 } 1278 /* 1279 * Is current set STALE? 1280 */ 1281 if (c.c_flags & MDDB_C_STALE) { 1282 stale_bool = TRUE; 1283 } 1284 1285 assert(db_nlp != NULL); 1286 1287 /* if creating the metadbs for the first time start mdmonitord */ 1288 if (c.c_dbcnt == 0) 1289 start_mdmonitord = 1; 1290 1291 /* 1292 * check to see if we will go over the total possible number 1293 * of data bases 1294 */ 1295 nlp = db_nlp; 1296 while (nlp) { 1297 replicacount += dbcnt; 1298 nlp = nlp->next; 1299 } 1300 1301 if ((replicacount + c.c_dbcnt) > c.c_dbmax) 1302 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 1303 sp->setno, c.c_dbcnt + replicacount, NULL)); 1304 1305 /* 1306 * go through and check to make sure all locations specified 1307 * are legal also pick out driver name; 1308 */ 1309 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1310 diskaddr_t devsize; 1311 1312 np = nlp->namep; 1313 1314 if (! metaislocalset(sp)) { 1315 uint_t partno; 1316 uint_t rep_partno; 1317 mddrivename_t *dnp = np->drivenamep; 1318 1319 /* 1320 * make sure that non-local database replicas 1321 * are always on the replica slice. 1322 */ 1323 if (meta_replicaslice(dnp, 1324 &rep_partno, ep) != 0) 1325 return (-1); 1326 if (metagetvtoc(np, FALSE, &partno, ep) == NULL) 1327 return (-1); 1328 if (partno != rep_partno) 1329 return (mddeverror(ep, MDE_REPCOMP_ONLY, 1330 np->dev, sp->setname)); 1331 } 1332 1333 if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize), 1334 ep)) { 1335 return (-1); 1336 } 1337 1338 if ((devsize = metagetsize(np, ep)) == -1) 1339 return (-1); 1340 1341 if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16)) 1342 return (mdmddberror(ep, MDE_REPLICA_TOOSMALL, 1343 meta_getminor(np->dev), sp->setno, devsize, 1344 np->cname)); 1345 } 1346 1347 /* 1348 * If first disk in set we don't have lb_inittime yet for use as 1349 * mb_setcreatetime so don't go looking for it. WE'll come back 1350 * later and update after the locator block has been created. 1351 * If this isn't the first disk in the set, we have a locator 1352 * block and thus we have lb_inittime. Set mb_setcreatetime to 1353 * lb_inittime. 1354 */ 1355 if (! metaislocalset(sp)) { 1356 if (c.c_dbcnt != 0) { 1357 firstmddb = 0; 1358 inittime = meta_get_lb_inittime(sp, ep); 1359 } 1360 } 1361 1362 /* 1363 * go through and write all master blocks 1364 */ 1365 1366 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1367 np = nlp->namep; 1368 1369 if ((fd = open(np->rname, O_RDWR)) < 0) 1370 return (mdsyserror(ep, errno, np->rname)); 1371 1372 for (i = 0; i < dbcnt; i++) { 1373 if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize, 1374 inittime, ep)) { 1375 (void) close(fd); 1376 return (-1); 1377 } 1378 } 1379 (void) close(fd); 1380 } 1381 1382 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1383 return (-1); 1384 1385 if (! metaislocalset(sp)) { 1386 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1387 if (! mdisok(ep)) 1388 return (-1); 1389 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1390 return (-1); 1391 1392 } 1393 1394 /* 1395 * go through and tell kernel to add them 1396 */ 1397 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1398 mdcinfo_t *cinfo; 1399 1400 np = nlp->namep; 1401 1402 if ((cinfo = metagetcinfo(np, ep)) == NULL) { 1403 rval = -1; 1404 goto out; 1405 } 1406 1407 /* 1408 * If mddb is being added to MN diskset and there already 1409 * exists a valid mddb in the set (which equates to this 1410 * node being an owner of the set) then use rpc.mdcommd 1411 * mechanism to add mddb(s) so that all nodes stay in sync. 1412 * If set is stale, don't log the message since rpc.mdcommd 1413 * can't write the message to the mddb. 1414 * 1415 * Otherwise, just add mddb to this node. 1416 */ 1417 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1418 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1419 md_mn_result_t *resultp = NULL; 1420 md_mn_msg_meta_db_attach_t attach; 1421 int send_rval; 1422 1423 /* 1424 * In a scenario where new replicas had been added on 1425 * the master, and then all of the old replicas failed 1426 * before the slaves had knowledge of the new replicas, 1427 * the slaves are unable to re-parse in the mddb 1428 * from the new replicas since the slaves have no 1429 * knowledge of the new replicas. The following 1430 * algorithm solves this problem: 1431 * - META_DB_ATTACH message generates submsgs 1432 * - BLOCK parse (master) 1433 * - MDDB_ATTACH new replicas 1434 * - UNBLOCK parse (master) causing parse 1435 * information to be sent from master 1436 * to slaves at a higher class than the 1437 * unblock so the parse message will 1438 * reach slaves before unblock message. 1439 */ 1440 attach.msg_l_dev = np->dev; 1441 attach.msg_cnt = dbcnt; 1442 attach.msg_dbsize = dbsize; 1443 (void) strncpy(attach.msg_dname, cinfo->dname, 1444 sizeof (attach.msg_dname)); 1445 (void) splitname(np->bname, &attach.msg_splitname); 1446 attach.msg_options = options; 1447 1448 /* Set devid to NULL until devids are supported */ 1449 attach.msg_devid[0] = NULL; 1450 1451 /* 1452 * If reconfig cycle has been started, this node is 1453 * stuck in in the return step until this command has 1454 * completed. If mdcommd is suspended, ask 1455 * send_message to fail (instead of retrying) 1456 * so that metaset can finish allowing the reconfig 1457 * cycle to proceed. 1458 */ 1459 flags = MD_MSGF_FAIL_ON_SUSPEND; 1460 if (stale_bool == TRUE) 1461 flags |= MD_MSGF_NO_LOG; 1462 send_rval = mdmn_send_message(sp->setno, 1463 MD_MN_MSG_META_DB_ATTACH, 1464 flags, (char *)&attach, 1465 sizeof (md_mn_msg_meta_db_attach_t), 1466 &resultp, ep); 1467 if (send_rval != 0) { 1468 rval = -1; 1469 if (resultp == NULL) 1470 (void) mddserror(ep, 1471 MDE_DS_COMMD_SEND_FAIL, 1472 sp->setno, NULL, NULL, 1473 sp->setname); 1474 else { 1475 (void) mdstealerror(ep, 1476 &(resultp->mmr_ep)); 1477 if (mdisok(ep)) { 1478 (void) mddserror(ep, 1479 MDE_DS_COMMD_SEND_FAIL, 1480 sp->setno, NULL, NULL, 1481 sp->setname); 1482 } 1483 free_result(resultp); 1484 } 1485 goto out; 1486 } 1487 if (resultp) 1488 free_result(resultp); 1489 } else { 1490 /* Adding mddb(s) to just this node */ 1491 for (i = 0; i < dbcnt; i++) { 1492 (void) memset(&c, 0, sizeof (c)); 1493 /* Fill in device/replica info */ 1494 c.c_locator.l_dev = meta_cmpldev(np->dev); 1495 c.c_locator.l_blkno = i * dbsize + 16; 1496 blkno = c.c_locator.l_blkno; 1497 (void) strncpy(c.c_locator.l_driver, cinfo->dname, 1498 sizeof (c.c_locator.l_driver)); 1499 (void) splitname(np->bname, &c.c_devname); 1500 c.c_locator.l_mnum = meta_getminor(np->dev); 1501 1502 /* Fill in setno, setname, and sideno */ 1503 c.c_setno = sp->setno; 1504 if (! metaislocalset(sp)) { 1505 if (MD_MNSET_DESC(sd)) { 1506 c.c_multi_node = 1; 1507 } 1508 } 1509 (void) strcpy(c.c_setname, sp->setname); 1510 c.c_sideno = sideno; 1511 1512 /* 1513 * Don't need device id information from this ioctl 1514 * Kernel determines device id from dev_t, which 1515 * is just what this code would do. 1516 */ 1517 c.c_locator.l_devid = (uint64_t)0; 1518 c.c_locator.l_devid_flags = 0; 1519 1520 if (timeval != NULL) 1521 c.c_timestamp = *timeval; 1522 1523 if (setup_med_cfg(sp, &c, (options & MDCHK_SET_FORCE), 1524 ep)) { 1525 rval = -1; 1526 goto out; 1527 } 1528 1529 if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL) != 0) { 1530 rval = mdstealerror(ep, &c.c_mde); 1531 goto out; 1532 } 1533 /* 1534 * This is either a traditional diskset OR this 1535 * is the first replica added to a MN diskset. 1536 * In either case, set broadcast to NO_BCAST so 1537 * that message won't go through rpc.mdcommd. 1538 * If this is a traditional diskset, the bcast 1539 * flag is ignored since traditional disksets 1540 * don't use the rpc.mdcommd. 1541 */ 1542 if (meta_db_addsidenms(sp, np, blkno, 1543 DB_ADDSIDENMS_NO_BCAST, ep)) 1544 goto out; 1545 } 1546 } 1547 if (! metaislocalset(sp)) { 1548 /* update the dbcnt and size in dd */ 1549 for (p = dd; p != NULL; p = p->dd_next) 1550 if (p->dd_dnp == np->drivenamep) { 1551 p->dd_dbcnt = dbcnt; 1552 p->dd_dbsize = dbsize; 1553 break; 1554 } 1555 } 1556 1557 /* 1558 * If this was the first addition of disks to the 1559 * diskset you now need to update the mb_setcreatetime 1560 * which needed lb_inittime which wasn't there until now. 1561 */ 1562 if (firstmddb) { 1563 if (meta_update_mb(sp, dd, ep) != 0) { 1564 return (-1); 1565 } 1566 } 1567 (void) close(fd); 1568 } 1569 1570 out: 1571 if (metaislocalset(sp)) { 1572 1573 /* everything looks fine. Start mdmonitord */ 1574 /* Note: popen/pclose is the MT-safe replacement for system */ 1575 if (rval == 0 && start_mdmonitord == 1) { 1576 if (pclose(popen(MDMONITORD, "w")) == -1) 1577 md_perror(MDMONITORD); 1578 1579 if (meta_smf_enable(META_SMF_CORE, &status) == -1) { 1580 mde_perror(&status, ""); 1581 mdclrerror(&status); 1582 } 1583 } 1584 1585 if (buildconf(sp, &status)) { 1586 /* Don't mask any previous errors */ 1587 if (rval == 0) 1588 rval = mdstealerror(ep, &status); 1589 return (rval); 1590 } 1591 1592 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 1593 /* Don't mask any previous errors */ 1594 if (rval == 0) 1595 rval = mdstealerror(ep, &status); 1596 } 1597 } else { 1598 if (update_dbinfo_on_drives(sp, dd, 1599 (options & MDCHK_SET_LOCKED), 1600 (options & MDCHK_SET_FORCE), 1601 &status)) { 1602 /* Don't mask any previous errors */ 1603 if (rval == 0) 1604 rval = mdstealerror(ep, &status); 1605 else 1606 mdclrerror(&status); 1607 } 1608 metafreedrivedesc(&dd); 1609 } 1610 /* 1611 * For MN disksets that already had already had nodes joined 1612 * before the attach of this mddb(s), the name invalidation is 1613 * done by the commd handler routine. Otherwise, if this 1614 * is the first attach of a MN diskset mddb, the invalidation 1615 * must be done here since the first attach cannot be sent 1616 * via the commd since there are no nodes joined to the set yet. 1617 */ 1618 if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) || 1619 (MD_MNSET_DESC(sd) && 1620 (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) { 1621 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 1622 meta_invalidate_name(nlp->namep); 1623 } 1624 } 1625 return (rval); 1626 } 1627 1628 /* 1629 * deletelist_length 1630 * 1631 * return the number of slices that have been specified for deletion 1632 * on the metadb command line. This does not calculate the number 1633 * of replicas because there may be multiple replicas per slice. 1634 */ 1635 static int 1636 deletelist_length(mdnamelist_t *db_nlp) 1637 { 1638 1639 mdnamelist_t *nlp; 1640 int list_length = 0; 1641 1642 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1643 list_length++; 1644 } 1645 1646 return (list_length); 1647 } 1648 1649 static int 1650 in_deletelist(char *devname, mdnamelist_t *db_nlp) 1651 { 1652 1653 mdnamelist_t *nlp; 1654 mdname_t *np; 1655 int index = 0; 1656 1657 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1658 np = nlp->namep; 1659 1660 if (strcmp(devname, np->bname) == 0) 1661 return (index); 1662 index++; 1663 } 1664 1665 return (-1); 1666 } 1667 1668 /* 1669 * Delete replicas from set. This happens as a result of: 1670 * - metadb [-s set_name] -d 1671 * - metaset -s set_name -a disk (causes a rebalance of mddbs) 1672 * - metaset -s set_name -d disk 1673 * - metaset -s set_name -b 1674 * 1675 * For a local set, this routine is run on the local set host. 1676 * 1677 * For a traditional diskset, this routine is run on the node that 1678 * is running the metaset command. 1679 * 1680 * For a multinode diskset, this routine is run by the node that is 1681 * running the metaset command. This detach routine is sent to all 1682 * of the joined nodes in the diskset using commd. This keeps 1683 * the nodes in-sync. 1684 */ 1685 int 1686 meta_db_detach( 1687 mdsetname_t *sp, 1688 mdnamelist_t *db_nlp, 1689 mdforceopts_t force_option, 1690 char *sysfilename, 1691 md_error_t *ep 1692 ) 1693 { 1694 struct mddb_config c; 1695 mdnamelist_t *nlp; 1696 mdname_t *np; 1697 md_drive_desc *dd = NULL; 1698 md_drive_desc *p; 1699 int replicacount; 1700 int replica_delete_count; 1701 int nr_replica_slices; 1702 int i; 1703 int stop_svmdaemons = 0; 1704 int rval = 0; 1705 int index; 1706 int valid_replicas_nottodelete = 0; 1707 int invalid_replicas_nottodelete = 0; 1708 int invalid_replicas_todelete = 0; 1709 int errored = 0; 1710 int *tag_array; 1711 int fd = -1; 1712 md_error_t status = mdnullerror; 1713 md_set_desc *sd; 1714 int stale_bool = FALSE; 1715 int flags; 1716 1717 /* 1718 * Error if we don't get some work to do. 1719 */ 1720 if (db_nlp == NULL) 1721 return (mdsyserror(ep, EINVAL, NULL)); 1722 1723 if (mdnamesareunique(db_nlp, ep) != 0) 1724 return (-1); 1725 1726 (void) memset(&c, 0, sizeof (c)); 1727 c.c_id = 0; 1728 c.c_setno = sp->setno; 1729 1730 /* Don't need device id information from this ioctl */ 1731 c.c_locator.l_devid = (uint64_t)0; 1732 c.c_locator.l_devid_flags = 0; 1733 1734 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1735 return (mdstealerror(ep, &c.c_mde)); 1736 1737 /* 1738 * Is current set STALE? 1739 */ 1740 if (c.c_flags & MDDB_C_STALE) { 1741 stale_bool = TRUE; 1742 } 1743 1744 replicacount = c.c_dbcnt; 1745 1746 assert(db_nlp != NULL); 1747 1748 /* 1749 * go through and gather how many data bases are on each 1750 * device specified. 1751 */ 1752 1753 nr_replica_slices = deletelist_length(db_nlp); 1754 tag_array = (int *)calloc(nr_replica_slices, sizeof (int)); 1755 1756 replica_delete_count = 0; 1757 for (i = 0; i < replicacount; i++) { 1758 char *devname; 1759 int found = 0; 1760 1761 c.c_id = i; 1762 1763 /* Don't need device id information from this ioctl */ 1764 c.c_locator.l_devid = (uint64_t)0; 1765 c.c_locator.l_devid_flags = 0; 1766 1767 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1768 return (mdstealerror(ep, &c.c_mde)); 1769 1770 devname = splicename(&c.c_devname); 1771 1772 if ((index = in_deletelist(devname, db_nlp)) != -1) { 1773 found = 1; 1774 tag_array[index] = 1; 1775 replica_delete_count++; 1776 } 1777 1778 errored = c.c_locator.l_flags & (MDDB_F_EREAD | 1779 MDDB_F_EWRITE | MDDB_F_TOOSMALL | 1780 MDDB_F_EFMT | MDDB_F_EDATA | 1781 MDDB_F_EMASTER); 1782 1783 /* 1784 * There are four combinations of "errored" and "found" 1785 * and they are used to find the number of 1786 * (a) valid/invalid replicas that are not in the delete 1787 * list and are available in the system. 1788 * (b) valid/invalid replicas that are to be deleted. 1789 */ 1790 1791 if (errored && !found) /* errored and !found */ 1792 invalid_replicas_nottodelete++; 1793 else if (!found) /* !errored and !found */ 1794 valid_replicas_nottodelete++; 1795 else if (errored) /* errored and found */ 1796 invalid_replicas_todelete++; 1797 /* 1798 * else it is !errored and found. This means 1799 * valid_replicas_todelete++; But this variable will not 1800 * be used anywhere 1801 */ 1802 1803 Free(devname); 1804 } 1805 1806 index = 0; 1807 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1808 np = nlp->namep; 1809 if (tag_array[index++] != 1) { 1810 Free(tag_array); 1811 return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname)); 1812 } 1813 } 1814 1815 Free(tag_array); 1816 1817 1818 /* if all replicas are deleted stop mdmonitord */ 1819 if ((replicacount - replica_delete_count) == 0) 1820 stop_svmdaemons = 1; 1821 1822 if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) { 1823 if (force_option & MDFORCE_NONE) 1824 return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname)); 1825 if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS)) 1826 return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname)); 1827 } 1828 1829 /* 1830 * The following algorithms are followed to check for deletion: 1831 * (a) If the delete list(db_nlp) has all invalid replicas and no valid 1832 * replicas, then deletion should be allowed. 1833 * (b) Deletion should be allowed only if valid replicas that are "not" 1834 * to be deleted is always greater than the invalid replicas that 1835 * are "not" to be deleted. 1836 * (c) If the user uses -f option, then deletion should be allowed. 1837 */ 1838 1839 if ((invalid_replicas_todelete != replica_delete_count) && 1840 (invalid_replicas_nottodelete > valid_replicas_nottodelete) && 1841 (force_option != MDFORCE_LOCAL)) 1842 return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname)); 1843 1844 /* 1845 * go through and tell kernel to delete them 1846 */ 1847 1848 /* Don't need device id information from this ioctl */ 1849 c.c_locator.l_devid = (uint64_t)0; 1850 c.c_locator.l_devid_flags = 0; 1851 1852 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1853 return (mdstealerror(ep, &c.c_mde)); 1854 1855 if (! metaislocalset(sp)) { 1856 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1857 if (! mdisok(ep)) 1858 return (-1); 1859 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1860 return (-1); 1861 } 1862 1863 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1864 np = nlp->namep; 1865 1866 /* 1867 * If mddb is being deleted from MN diskset and node is 1868 * an owner of the diskset then use rpc.mdcommd 1869 * mechanism to add mddb(s) so that all nodes stay in sync. 1870 * If set is stale, don't log the message since rpc.mdcommd 1871 * can't write the message to the mddb. 1872 * 1873 * When mddbs are first being added to set, a detach can 1874 * be called before any node has joined the diskset, so 1875 * must check to see if node is an owner of the diskset. 1876 * 1877 * Otherwise, just delete mddb from this node. 1878 */ 1879 1880 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1881 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1882 md_mn_result_t *resultp; 1883 md_mn_msg_meta_db_detach_t detach; 1884 int send_rval; 1885 1886 /* 1887 * The following algorithm is used to detach replicas. 1888 * - META_DB_DETACH message generates submsgs 1889 * - BLOCK parse (master) 1890 * - MDDB_DETACH replicas 1891 * - UNBLOCK parse (master) causing parse 1892 * information to be sent from master 1893 * to slaves at a higher class than the 1894 * unblock so the parse message will 1895 * reach slaves before unblock message. 1896 */ 1897 (void) splitname(np->bname, &detach.msg_splitname); 1898 1899 /* Set devid to NULL until devids are supported */ 1900 detach.msg_devid[0] = NULL; 1901 1902 /* 1903 * If reconfig cycle has been started, this node is 1904 * stuck in in the return step until this command has 1905 * completed. If mdcommd is suspended, ask 1906 * send_message to fail (instead of retrying) 1907 * so that metaset can finish allowing the reconfig 1908 * cycle to proceed. 1909 */ 1910 flags = MD_MSGF_FAIL_ON_SUSPEND; 1911 if (stale_bool == TRUE) 1912 flags |= MD_MSGF_NO_LOG; 1913 send_rval = mdmn_send_message(sp->setno, 1914 MD_MN_MSG_META_DB_DETACH, 1915 flags, (char *)&detach, 1916 sizeof (md_mn_msg_meta_db_detach_t), 1917 &resultp, ep); 1918 if (send_rval != 0) { 1919 rval = -1; 1920 if (resultp == NULL) 1921 (void) mddserror(ep, 1922 MDE_DS_COMMD_SEND_FAIL, 1923 sp->setno, NULL, NULL, 1924 sp->setname); 1925 else { 1926 (void) mdstealerror(ep, 1927 &(resultp->mmr_ep)); 1928 if (mdisok(ep)) { 1929 (void) mddserror(ep, 1930 MDE_DS_COMMD_SEND_FAIL, 1931 sp->setno, NULL, NULL, 1932 sp->setname); 1933 } 1934 free_result(resultp); 1935 } 1936 goto out; 1937 } 1938 if (resultp) 1939 free_result(resultp); 1940 } else { 1941 i = 0; 1942 while (i < c.c_dbcnt) { 1943 char *devname; 1944 1945 c.c_id = i; 1946 1947 /* Don't need devid info from this ioctl */ 1948 c.c_locator.l_devid = (uint64_t)0; 1949 c.c_locator.l_devid_flags = 0; 1950 1951 if (metaioctl(MD_DB_GETDEV, &c, 1952 &c.c_mde, NULL)) { 1953 rval = mdstealerror(ep, &c.c_mde); 1954 goto out; 1955 } 1956 1957 devname = splicename(&c.c_devname); 1958 if (strcmp(devname, np->bname) != 0) { 1959 Free(devname); 1960 i++; 1961 continue; 1962 } 1963 Free(devname); 1964 1965 /* Don't need devid info from this ioctl */ 1966 c.c_locator.l_devid = (uint64_t)0; 1967 c.c_locator.l_devid_flags = 0; 1968 1969 if (metaioctl(MD_DB_DELDEV, &c, 1970 &c.c_mde, NULL) != 0) { 1971 rval = mdstealerror(ep, &c.c_mde); 1972 goto out; 1973 } 1974 1975 /* Not incrementing "i" intentionally */ 1976 } 1977 } 1978 if (! metaislocalset(sp)) { 1979 /* update the dbcnt and size in dd */ 1980 for (p = dd; p != NULL; p = p->dd_next) { 1981 if (p->dd_dnp == np->drivenamep) { 1982 p->dd_dbcnt = 0; 1983 p->dd_dbsize = 0; 1984 break; 1985 } 1986 } 1987 1988 /* 1989 * Slam a dummy master block and make it self 1990 * identifying 1991 */ 1992 if ((fd = open(np->rname, O_RDWR)) >= 0) { 1993 meta_mkdummymaster(sp, fd, 16); 1994 (void) close(fd); 1995 } 1996 } 1997 } 1998 out: 1999 if (metaislocalset(sp)) { 2000 /* 2001 * Stop all the daemons if there are 2002 * no more replicas so that the module can be 2003 * unloaded. 2004 */ 2005 if (rval == 0 && stop_svmdaemons == 1) { 2006 char buf[MAXPATHLEN]; 2007 int i; 2008 2009 for (i = 0; i < DAEMON_COUNT; i++) { 2010 (void) snprintf(buf, MAXPATHLEN, 2011 "/usr/bin/pkill -%s -x %s", 2012 svmd_kill_list[i].svmd_kill_val, 2013 svmd_kill_list[i].svmd_name); 2014 if (pclose(popen(buf, "w")) == -1) 2015 md_perror(buf); 2016 } 2017 2018 if (meta_smf_disable(META_SMF_ALL, &status) == -1) { 2019 mde_perror(&status, ""); 2020 mdclrerror(&status); 2021 } 2022 } 2023 if (buildconf(sp, &status)) { 2024 /* Don't mask any previous errors */ 2025 if (rval == 0) 2026 rval = mdstealerror(ep, &status); 2027 else 2028 mdclrerror(&status); 2029 return (rval); 2030 } 2031 2032 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 2033 /* Don't mask any previous errors */ 2034 if (rval == 0) 2035 rval = mdstealerror(ep, &status); 2036 else 2037 mdclrerror(&status); 2038 } 2039 } else { 2040 if (update_dbinfo_on_drives(sp, dd, 2041 (force_option & MDFORCE_SET_LOCKED), 2042 ((force_option & MDFORCE_LOCAL) | 2043 (force_option & MDFORCE_DS)), &status)) { 2044 /* Don't mask any previous errors */ 2045 if (rval == 0) 2046 rval = mdstealerror(ep, &status); 2047 else 2048 mdclrerror(&status); 2049 } 2050 metafreedrivedesc(&dd); 2051 } 2052 if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) { 2053 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 2054 meta_invalidate_name(nlp->namep); 2055 } 2056 } 2057 return (rval); 2058 } 2059 2060 static md_replica_t * 2061 metareplicaname( 2062 mdsetname_t *sp, 2063 int flags, 2064 struct mddb_config *c, 2065 md_error_t *ep 2066 ) 2067 { 2068 md_replica_t *rp; 2069 char *devname; 2070 size_t sz; 2071 2072 /* allocate replicaname */ 2073 rp = Zalloc(sizeof (*rp)); 2074 2075 /* get device name */ 2076 devname = splicename(&c->c_devname); 2077 if (flags & PRINT_FAST) { 2078 if ((rp->r_namep = metaname_fast(&sp, devname, 2079 LOGICAL_DEVICE, ep)) == NULL) { 2080 Free(devname); 2081 Free(rp); 2082 return (NULL); 2083 } 2084 } else { 2085 if ((rp->r_namep = metaname(&sp, devname, 2086 LOGICAL_DEVICE, ep)) == NULL) { 2087 Free(devname); 2088 Free(rp); 2089 return (NULL); 2090 } 2091 } 2092 Free(devname); 2093 2094 /* make sure it's OK */ 2095 if ((! (flags & MD_BASICNAME_OK)) && 2096 (metachkcomp(rp->r_namep, ep) != 0)) { 2097 Free(rp); 2098 return (NULL); 2099 } 2100 2101 rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR; 2102 rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR; 2103 rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID; 2104 if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) { 2105 sz = devid_sizeof((ddi_devid_t)(uintptr_t) 2106 (c->c_locator.l_devid)); 2107 if ((rp->r_devid = (ddi_devid_t)malloc(sz)) == 2108 (ddi_devid_t)NULL) { 2109 Free(rp); 2110 return (NULL); 2111 } 2112 (void) memcpy((void *)rp->r_devid, 2113 (void *)(uintptr_t)c->c_locator.l_devid, sz); 2114 (void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name); 2115 rp->r_flags &= ~MDDB_F_NODEVID; 2116 /* Overwrite dev derived from name with dev from devid */ 2117 rp->r_namep->dev = meta_expldev(c->c_locator.l_dev); 2118 } 2119 (void) strcpy(rp->r_driver_name, c->c_locator.l_driver); 2120 2121 rp->r_blkno = c->c_locator.l_blkno; 2122 if (c->c_dbend != 0) 2123 rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1; 2124 2125 /* return replica */ 2126 return (rp); 2127 } 2128 2129 /* 2130 * free replica list 2131 */ 2132 void 2133 metafreereplicalist( 2134 md_replicalist_t *rlp 2135 ) 2136 { 2137 md_replicalist_t *rl = NULL; 2138 2139 for (/* void */; (rlp != NULL); rlp = rl) { 2140 rl = rlp->rl_next; 2141 if (rlp->rl_repp->r_devid != (ddi_devid_t)0) { 2142 free(rlp->rl_repp->r_devid); 2143 } 2144 Free(rlp->rl_repp); 2145 Free(rlp); 2146 } 2147 } 2148 2149 /* 2150 * return list of all replicas in set 2151 */ 2152 int 2153 metareplicalist( 2154 mdsetname_t *sp, 2155 int flags, 2156 md_replicalist_t **rlpp, 2157 md_error_t *ep 2158 ) 2159 { 2160 md_replicalist_t **tail = rlpp; 2161 int count = 0; 2162 struct mddb_config c; 2163 int i; 2164 char *devid; 2165 2166 /* for each replica */ 2167 i = 0; 2168 do { 2169 md_replica_t *rp; 2170 2171 /* get next replica */ 2172 (void) memset(&c, 0, sizeof (c)); 2173 c.c_id = i; 2174 c.c_setno = sp->setno; 2175 2176 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 2177 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2178 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2179 mdclrerror(&c.c_mde); 2180 break; /* handle none at all */ 2181 } 2182 (void) mdstealerror(ep, &c.c_mde); 2183 goto out; 2184 } 2185 2186 if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) { 2187 if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) { 2188 (void) mdsyserror(ep, ENOMEM, META_DBCONF); 2189 goto out; 2190 } 2191 c.c_locator.l_devid = (uintptr_t)devid; 2192 /* 2193 * Turn on space and sz flags since 'sz' amount of 2194 * space has been alloc'd. 2195 */ 2196 c.c_locator.l_devid_flags = 2197 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2198 } 2199 2200 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2201 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2202 mdclrerror(&c.c_mde); 2203 break; /* handle none at all */ 2204 } 2205 (void) mdstealerror(ep, &c.c_mde); 2206 goto out; 2207 } 2208 2209 /* 2210 * Paranoid check - shouldn't happen, but is left as 2211 * a place holder for changes that will be needed after 2212 * dynamic reconfiguration changes are added to SVM (to 2213 * support movement of disks at any point in time). 2214 */ 2215 if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { 2216 (void) fprintf(stderr, 2217 dgettext(TEXT_DOMAIN, 2218 "Error: Relocation Information " 2219 "(drvnm=%s, mnum=0x%lx) \n" 2220 "relocation information size changed - \n" 2221 "rerun command\n"), 2222 c.c_locator.l_driver, c.c_locator.l_mnum); 2223 (void) mderror(ep, MDE_DEVID_TOOBIG, NULL); 2224 goto out; 2225 } 2226 2227 if (c.c_dbcnt == 0) 2228 break; /* handle none at all */ 2229 2230 /* get info */ 2231 if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL) 2232 goto out; 2233 2234 /* append to list */ 2235 *tail = Zalloc(sizeof (**tail)); 2236 (*tail)->rl_repp = rp; 2237 tail = &(*tail)->rl_next; 2238 ++count; 2239 2240 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2241 free(devid); 2242 c.c_locator.l_devid_flags = 0; 2243 } 2244 2245 } while (++i < c.c_dbcnt); 2246 2247 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2248 free(devid); 2249 } 2250 2251 /* return count */ 2252 return (count); 2253 2254 /* cleanup, return error */ 2255 out: 2256 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2257 free(devid); 2258 } 2259 metafreereplicalist(*rlpp); 2260 *rlpp = NULL; 2261 return (-1); 2262 } 2263 2264 /* 2265 * meta_sync_db_locations - get list of replicas from kernel and write 2266 * out to mddb.cf and md.conf. 'Syncs up' the replica list in 2267 * the kernel with the replica list in the conf files. 2268 * 2269 */ 2270 void 2271 meta_sync_db_locations( 2272 mdsetname_t *sp, 2273 md_error_t *ep 2274 ) 2275 { 2276 char *sname = 0; /* system file name */ 2277 char *cname = 0; /* config file name */ 2278 2279 if (!metaislocalset(sp)) 2280 return; 2281 2282 /* Updates backup of configuration file (aka mddb.cf) */ 2283 if (buildconf(sp, ep) != 0) 2284 return; 2285 2286 /* Updates system configuration file (aka md.conf) */ 2287 (void) meta_db_patch(sname, cname, 0, ep); 2288 } 2289 2290 /* 2291 * setup_db_locations - parse the mddb.cf file and 2292 * tells the driver which db locations to use. 2293 */ 2294 int 2295 meta_setup_db_locations( 2296 md_error_t *ep 2297 ) 2298 { 2299 mddb_config_t c; 2300 FILE *fp; 2301 char inbuff[1024]; 2302 char *buff; 2303 uint_t i; 2304 size_t sz; 2305 int rval = 0; 2306 char *devidp; 2307 uint_t devid_size; 2308 char *minor_name = NULL; 2309 ddi_devid_t devid_decode; 2310 int checksum; 2311 2312 /* do mddb.cf file */ 2313 (void) memset(&c, '\0', sizeof (c)); 2314 if ((fp = fopen(META_DBCONF, "r")) == NULL) { 2315 if (errno != ENOENT) 2316 return (mdsyserror(ep, errno, META_DBCONF)); 2317 } 2318 while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1), 2319 fp)) != NULL)) { 2320 2321 /* ignore comments */ 2322 if (*buff == '#') 2323 continue; 2324 2325 /* parse locator */ 2326 (void) memset(&c, 0, sizeof (c)); 2327 c.c_setno = MD_LOCAL_SET; 2328 i = strcspn(buff, " \t"); 2329 if (i > sizeof (c.c_locator.l_driver)) 2330 i = sizeof (c.c_locator.l_driver); 2331 (void) strncpy(c.c_locator.l_driver, buff, i); 2332 buff += i; 2333 c.c_locator.l_dev = 2334 makedev((major_t)0, (minor_t)strtol(buff, &buff, 10)); 2335 c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10); 2336 c.c_locator.l_mnum = minor(c.c_locator.l_dev); 2337 2338 /* parse out devid */ 2339 while (isspace((int)(*buff))) 2340 buff += 1; 2341 i = strcspn(buff, " \t"); 2342 if ((devidp = (char *)malloc(i+1)) == NULL) 2343 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2344 2345 (void) strncpy(devidp, buff, i); 2346 devidp[i] = '\0'; 2347 if (devid_str_decode(devidp, &devid_decode, 2348 &minor_name) == -1) { 2349 free(devidp); 2350 continue; 2351 } 2352 2353 /* Conf file must have minor name associated with devid */ 2354 if (minor_name == NULL) { 2355 free(devidp); 2356 devid_free(devid_decode); 2357 continue; 2358 } 2359 2360 sz = devid_sizeof(devid_decode); 2361 /* Copy to devid size buffer that ioctl expects */ 2362 if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) { 2363 devid_free(devid_decode); 2364 free(minor_name); 2365 free(devidp); 2366 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2367 } 2368 2369 (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, 2370 (void *)devid_decode, sz); 2371 2372 devid_free(devid_decode); 2373 2374 if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) { 2375 free(minor_name); 2376 free(devidp); 2377 free((void *)(uintptr_t)c.c_locator.l_devid); 2378 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2379 } 2380 (void) strcpy(c.c_locator.l_minor_name, minor_name); 2381 free(minor_name); 2382 c.c_locator.l_devid_flags = MDDB_DEVID_VALID | 2383 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2384 c.c_locator.l_devid_sz = sz; 2385 2386 devid_size = strlen(devidp); 2387 buff += devid_size; 2388 2389 checksum = strtol(buff, &buff, 10); 2390 for (i = 0; c.c_locator.l_driver[i] != 0; i++) 2391 checksum += c.c_locator.l_driver[i]; 2392 for (i = 0; i < devid_size; i++) { 2393 checksum += devidp[i]; 2394 } 2395 free(devidp); 2396 2397 checksum += minor(c.c_locator.l_dev); 2398 checksum += c.c_locator.l_blkno; 2399 if (checksum != 42) { 2400 /* overwritten later for more serious problems */ 2401 rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF); 2402 free((void *)(uintptr_t)c.c_locator.l_devid); 2403 continue; 2404 } 2405 c.c_locator.l_flags = 0; 2406 2407 /* use db location */ 2408 if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { 2409 free((void *)(uintptr_t)c.c_locator.l_devid); 2410 return (mdstealerror(ep, &c.c_mde)); 2411 } 2412 2413 /* free up devid if in use */ 2414 free((void *)(uintptr_t)c.c_locator.l_devid); 2415 c.c_locator.l_devid = (uint64_t)0; 2416 c.c_locator.l_devid_flags = 0; 2417 } 2418 if ((fp) && (fclose(fp) != 0)) 2419 return (mdsyserror(ep, errno, META_DBCONF)); 2420 2421 /* check for stale database */ 2422 (void) memset((char *)&c, 0, sizeof (struct mddb_config)); 2423 c.c_id = 0; 2424 c.c_setno = MD_LOCAL_SET; 2425 2426 /* Don't need device id information from this ioctl */ 2427 c.c_locator.l_devid = (uint64_t)0; 2428 c.c_locator.l_devid_flags = 0; 2429 2430 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2431 if (! mdismddberror(&c.c_mde, MDE_DB_INVALID)) 2432 return (mdstealerror(ep, &c.c_mde)); 2433 mdclrerror(&c.c_mde); 2434 } 2435 2436 if (c.c_flags & MDDB_C_STALE) 2437 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET, 2438 0, NULL)); 2439 2440 /* success */ 2441 return (rval); 2442 } 2443 2444 /* 2445 * meta_db_minreplica - returns the minimum size replica currently in use. 2446 */ 2447 daddr_t 2448 meta_db_minreplica( 2449 mdsetname_t *sp, 2450 md_error_t *ep 2451 ) 2452 { 2453 md_replica_t *r; 2454 md_replicalist_t *rl, *rlp = NULL; 2455 daddr_t nblks = 0; 2456 2457 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) 2458 return (-1); 2459 2460 if (rlp == NULL) 2461 return (-1); 2462 2463 /* find the smallest existing replica */ 2464 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 2465 r = rl->rl_repp; 2466 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 2467 } 2468 2469 metafreereplicalist(rlp); 2470 return (nblks); 2471 } 2472 2473 /* 2474 * meta_get_replica_names 2475 * returns an mdnamelist_t of replica slices 2476 */ 2477 /*ARGSUSED*/ 2478 int 2479 meta_get_replica_names( 2480 mdsetname_t *sp, 2481 mdnamelist_t **nlpp, 2482 int options, 2483 md_error_t *ep 2484 ) 2485 { 2486 md_replicalist_t *rlp = NULL; 2487 md_replicalist_t *rl; 2488 mdnamelist_t **tailpp = nlpp; 2489 int cnt = 0; 2490 2491 assert(nlpp != NULL); 2492 2493 if (!metaislocalset(sp)) 2494 goto out; 2495 2496 /* get replicas */ 2497 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { 2498 cnt = -1; 2499 goto out; 2500 } 2501 2502 /* build name list */ 2503 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 2504 /* 2505 * Add the name struct to the end of the 2506 * namelist but keep a pointer to the last 2507 * element so that we don't incur the overhead 2508 * of traversing the list each time 2509 */ 2510 tailpp = meta_namelist_append_wrapper( 2511 tailpp, rl->rl_repp->r_namep); 2512 ++cnt; 2513 } 2514 2515 /* cleanup, return count or error */ 2516 out: 2517 metafreereplicalist(rlp); 2518 return (cnt); 2519 } 2520