1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 /* 30 * Just in case we're not in a build environment, make sure that 31 * TEXT_DOMAIN gets set to something. 32 */ 33 #if !defined(TEXT_DOMAIN) 34 #define TEXT_DOMAIN "SYS_TEST" 35 #endif 36 37 /* 38 * Metadevice database interfaces. 39 */ 40 41 #define MDDB 42 43 #include <meta.h> 44 #include <sys/lvm/md_mddb.h> 45 #include <sys/lvm/md_crc.h> 46 #include <sys/lvm/mdio.h> 47 #include <string.h> 48 #include <strings.h> 49 #include <ctype.h> 50 51 struct svm_daemon { 52 char *svmd_name; 53 char *svmd_kill_val; 54 }; 55 56 struct svm_daemon svmd_kill_list[] = { 57 {"mdmonitord", "HUP"}, 58 {"mddoors", "KILL"}, 59 }; 60 61 #define DAEMON_COUNT (sizeof (svmd_kill_list)/ sizeof (struct svm_daemon)) 62 #define MDMONITORD "/usr/sbin/mdmonitord" 63 64 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep); 65 66 /* 67 * meta_get_lb_inittime sends a request for the lb_inittime to the kernel 68 */ 69 md_timeval32_t 70 meta_get_lb_inittime( 71 mdsetname_t *sp, 72 md_error_t *ep 73 ) 74 { 75 mddb_config_t c; 76 77 (void) memset(&c, 0, sizeof (c)); 78 79 /* Fill in setno, setname, and sideno */ 80 c.c_setno = sp->setno; 81 82 if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) { 83 (void) mdstealerror(ep, &c.c_mde); 84 } 85 86 return (c.c_timestamp); 87 } 88 89 /* 90 * mkmasterblks writes out the master blocks of the mddb to the replica. 91 * 92 * In a MN diskset, this is called by the node that is adding this replica 93 * to the diskset. 94 */ 95 96 #define MDDB_VERIFY_SIZE 8192 97 98 static int 99 mkmasterblks( 100 mdsetname_t *sp, 101 mdname_t *np, 102 int fd, 103 daddr_t firstblk, 104 int dbsize, 105 md_timeval32_t inittime, 106 md_error_t *ep 107 ) 108 { 109 int consecutive; 110 md_timeval32_t tp; 111 struct mddb_mb *mb; 112 char *buffer; 113 int iosize; 114 md_set_desc *sd; 115 int mn_set = 0; 116 daddr_t startblk; 117 int cnt; 118 ddi_devid_t devid; 119 120 if (! metaislocalset(sp)) { 121 if ((sd = metaget_setdesc(sp, ep)) == NULL) 122 return (-1); 123 124 if (MD_MNSET_DESC(sd)) { 125 mn_set = 1; /* Used later */ 126 } 127 } 128 129 /* 130 * Loop to verify the entire mddb region on disk is read/writable. 131 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block 132 * chunks. 133 * 134 * A side-effect of this loop is to zero out the entire mddb region 135 */ 136 if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL) 137 return (mdsyserror(ep, ENOMEM, np->rname)); 138 139 startblk = firstblk; 140 for (cnt = dbsize; cnt > 0; cnt -= consecutive) { 141 142 if (cnt > MDDB_VERIFY_SIZE) 143 consecutive = MDDB_VERIFY_SIZE; 144 else 145 consecutive = cnt; 146 147 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 148 Free(buffer); 149 return (mdsyserror(ep, errno, np->rname)); 150 } 151 152 iosize = DEV_BSIZE * consecutive; 153 if (write(fd, buffer, iosize) != iosize) { 154 Free(buffer); 155 return (mdsyserror(ep, errno, np->rname)); 156 } 157 158 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 159 Free(buffer); 160 return (mdsyserror(ep, errno, np->rname)); 161 } 162 163 if (read(fd, buffer, iosize) != iosize) { 164 Free(buffer); 165 return (mdsyserror(ep, errno, np->rname)); 166 } 167 168 startblk += consecutive; 169 } 170 171 Free(buffer); 172 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 173 return (mdsyserror(ep, ENOMEM, np->rname)); 174 175 if (meta_gettimeofday(&tp) == -1) { 176 Free(mb); 177 return (mdsyserror(ep, errno, np->rname)); 178 } 179 180 mb->mb_magic = MDDB_MAGIC_MB; 181 /* 182 * If a MN diskset, set master block revision for a MN set. 183 * Even though the master block structure is no different 184 * for a MN set, setting the revision field to a different 185 * number keeps any pre-MN_diskset code from accessing 186 * this diskset. It also allows for an early determination 187 * of a MN diskset when reading in from disk so that the 188 * proper size locator block and locator names structure 189 * can be read in thus saving time on diskset startup. 190 */ 191 if (mn_set) 192 mb->mb_revision = MDDB_REV_MNMB; 193 else 194 mb->mb_revision = MDDB_REV_MB; 195 mb->mb_timestamp = tp; 196 mb->mb_setno = sp->setno; 197 mb->mb_blkcnt = dbsize - 1; 198 mb->mb_blkno = firstblk; 199 mb->mb_nextblk = 0; 200 201 mb->mb_blkmap.m_firstblk = firstblk + 1; 202 mb->mb_blkmap.m_consecutive = dbsize - 1; 203 if (! metaislocalset(sp)) { 204 mb->mb_setcreatetime = inittime; 205 } 206 207 /* 208 * We try to save the disks device ID into the remaining bytes in 209 * the master block. The saved devid is used to provide a mapping 210 * between this disk's devid and the devid stored into the master 211 * block. This allows the disk image to be self-identifying 212 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 213 * when we try to import these disks on the remote copied image. 214 * If we cannot save the disks device ID onto the master block that is 215 * ok. The disk is just not self-identifying and won't be importable 216 * in the remote copy scenario. 217 */ 218 if (devid_get(fd, &devid) == 0) { 219 size_t len; 220 221 len = devid_sizeof(devid); 222 if (len <= DEV_BSIZE - sizeof (*mb)) { 223 /* there is enough space to store the devid */ 224 mb->mb_devid_magic = MDDB_MAGIC_DE; 225 mb->mb_devid_len = len; 226 (void) memcpy(mb->mb_devid, devid, len); 227 } 228 devid_free(devid); 229 } 230 231 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 232 (crc_skip_t *)NULL); 233 234 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 235 Free(mb); 236 return (mdsyserror(ep, errno, np->rname)); 237 } 238 239 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 240 Free(mb); 241 return (mdsyserror(ep, errno, np->rname)); 242 } 243 244 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 245 Free(mb); 246 return (mdsyserror(ep, errno, np->rname)); 247 } 248 249 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 250 Free(mb); 251 return (mdsyserror(ep, errno, np->rname)); 252 } 253 254 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 255 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) { 256 Free(mb); 257 return (mdmddberror(ep, MDE_NOTVERIFIED, 258 meta_getminor(np->dev), sp->setno, 0, np->rname)); 259 } 260 261 Free(mb); 262 return (0); 263 } 264 265 void 266 meta_mkdummymaster( 267 mdsetname_t *sp, 268 int fd, 269 daddr_t firstblk 270 ) 271 { 272 md_timeval32_t tp; 273 struct mddb_mb *mb; 274 ddi_devid_t devid; 275 md_set_desc *sd; 276 md_error_t ep = mdnullerror; 277 md_timeval32_t inittime; 278 279 /* 280 * No dummy master blocks are written for a MN diskset since devids 281 * are not supported in MN disksets. 282 */ 283 if (! metaislocalset(sp)) { 284 if ((sd = metaget_setdesc(sp, &ep)) == NULL) 285 return; 286 287 if (MD_MNSET_DESC(sd)) 288 return; 289 } 290 291 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 292 return; 293 294 mb->mb_magic = MDDB_MAGIC_DU; 295 mb->mb_revision = MDDB_REV_MB; 296 mb->mb_setno = sp->setno; 297 inittime = meta_get_lb_inittime(sp, &ep); 298 mb->mb_setcreatetime = inittime; 299 300 if (meta_gettimeofday(&tp) != -1) 301 mb->mb_timestamp = tp; 302 303 /* 304 * We try to save the disks device ID into the remaining bytes in 305 * the master block. This allows the disk image to be self-identifying 306 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 307 * when we try to import these disks on the remote copied image. 308 * If we cannot save the disks device ID onto the master block that is 309 * ok. The disk is just not self-identifying and won't be importable 310 * in the remote copy scenario. 311 */ 312 if (devid_get(fd, &devid) == 0) { 313 int len; 314 315 len = devid_sizeof(devid); 316 if (len <= DEV_BSIZE - sizeof (*mb)) { 317 /* there is enough space to store the devid */ 318 mb->mb_devid_magic = MDDB_MAGIC_DE; 319 mb->mb_devid_len = len; 320 (void) memcpy(mb->mb_devid, (char *)devid, len); 321 } 322 devid_free(devid); 323 } 324 325 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 326 (crc_skip_t *)NULL); 327 328 /* 329 * If any of these operations fail, we need to inform the 330 * user that the disk won't be self identifying. When support 331 * for importing remotely replicated disksets is added, we 332 * want to add the error messages here. 333 */ 334 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 335 goto out; 336 337 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) 338 goto out; 339 340 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 341 goto out; 342 343 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) 344 goto out; 345 346 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 347 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) 348 goto out; 349 350 out: 351 Free(mb); 352 } 353 354 static int 355 buildconf(mdsetname_t *sp, md_error_t *ep) 356 { 357 md_replicalist_t *rlp = NULL; 358 md_replicalist_t *rl; 359 FILE *cfp = NULL; 360 FILE *mfp = NULL; 361 struct stat sbuf; 362 int rval = 0; 363 int in_miniroot = 0; 364 char line[MDDB_BOOTLIST_MAX_LEN]; 365 char *tname = NULL; 366 367 /* get list of local replicas */ 368 if (! metaislocalset(sp)) 369 return (0); 370 371 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 372 return (-1); 373 374 /* open tempfile, copy permissions of original file */ 375 if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) { 376 /* 377 * On the miniroot tmp files must be created in /var/tmp. 378 * If we get a EROFS error, we assume that we are in the 379 * miniroot. 380 */ 381 if (errno != EROFS) 382 goto error; 383 in_miniroot = 1; 384 errno = 0; 385 tname = tempnam("/var/tmp", "slvm_"); 386 if (tname == NULL && errno == EROFS) { 387 /* 388 * If we are booted on a read-only root because 389 * of mddb quorum problems we don't want to emit 390 * any scary error messages. 391 */ 392 errno = 0; 393 goto out; 394 } 395 396 /* open tempfile, copy permissions of original file */ 397 if ((cfp = fopen(tname, "w+")) == NULL) 398 goto error; 399 } 400 if (stat(META_DBCONF, &sbuf) == 0) { 401 if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0) 402 goto error; 403 if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0) 404 goto error; 405 } 406 407 /* print header */ 408 if (fprintf(cfp, "#metadevice database location file ") == EOF) 409 goto error; 410 if (fprintf(cfp, "do not hand edit\n") < 0) 411 goto error; 412 if (fprintf(cfp, 413 "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0) 414 goto error; 415 416 /* dump replicas */ 417 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 418 md_replica_t *r = rl->rl_repp; 419 int checksum = 42; 420 int i; 421 char *devidp; 422 minor_t min; 423 424 devidp = devid_str_encode(r->r_devid, r->r_minor_name); 425 /* If devid code can't encode devidp - skip entry */ 426 if (devidp == NULL) { 427 continue; 428 } 429 430 /* compute checksum */ 431 for (i = 0; ((r->r_driver_name[i] != '\0') && 432 (i < sizeof (r->r_driver_name))); i++) { 433 checksum -= r->r_driver_name[i]; 434 } 435 min = meta_getminor(r->r_namep->dev); 436 checksum -= min; 437 checksum -= r->r_blkno; 438 439 for (i = 0; i < strlen(devidp); i++) { 440 checksum -= devidp[i]; 441 } 442 /* print info */ 443 if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n", 444 r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) { 445 goto error; 446 } 447 448 devid_str_free(devidp); 449 } 450 451 /* close and rename to real file */ 452 if (fflush(cfp) != 0) 453 goto error; 454 if (fsync(fileno(cfp)) != 0) 455 goto error; 456 if (fclose(cfp) != 0) { 457 cfp = NULL; 458 goto error; 459 } 460 cfp = NULL; 461 462 /* 463 * Renames don't work in the miniroot since tmpfiles are 464 * created in /var/tmp. Hence we copy the data out. 465 */ 466 467 if (! in_miniroot) { 468 if (rename(META_DBCONFTMP, META_DBCONF) != 0) 469 goto error; 470 } else { 471 if ((cfp = fopen(tname, "r")) == NULL) 472 goto error; 473 if ((mfp = fopen(META_DBCONF, "w+")) == NULL) 474 goto error; 475 while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) { 476 if (fputs(line, mfp) == NULL) 477 goto error; 478 } 479 (void) fclose(cfp); 480 cfp = NULL; 481 if (fflush(mfp) != 0) 482 goto error; 483 if (fsync(fileno(mfp)) != 0) 484 goto error; 485 if (fclose(mfp) != 0) { 486 mfp = NULL; 487 goto error; 488 } 489 /* delete the tempfile */ 490 (void) unlink(tname); 491 } 492 /* success */ 493 rval = 0; 494 goto out; 495 496 /* tempfile error */ 497 error: 498 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 499 mdsyserror(ep, errno, META_DBCONFTMP); 500 501 502 /* cleanup, return success */ 503 out: 504 if (rlp != NULL) 505 metafreereplicalist(rlp); 506 if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) { 507 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 508 mdsyserror(ep, errno, META_DBCONFTMP); 509 } 510 free(tname); 511 return (rval); 512 } 513 514 /* 515 * check replica for dev 516 */ 517 static int 518 in_replica( 519 mdsetname_t *sp, 520 md_replica_t *rp, 521 mdname_t *np, 522 diskaddr_t slblk, 523 diskaddr_t nblks, 524 md_error_t *ep 525 ) 526 { 527 mdname_t *repnp = rp->r_namep; 528 diskaddr_t rep_sblk = rp->r_blkno; 529 diskaddr_t rep_nblks = rp->r_nblk; 530 531 /* should be in the same set */ 532 assert(sp != NULL); 533 534 /* if error in master block, assume whole partition */ 535 if ((rep_sblk == MD_DISKADDR_ERROR) || 536 (rep_nblks == MD_DISKADDR_ERROR)) { 537 rep_sblk = 0; 538 rep_nblks = MD_DISKADDR_ERROR; 539 } 540 541 /* check overlap */ 542 if (meta_check_overlap( 543 MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) { 544 return (-1); 545 } 546 547 /* return success */ 548 return (0); 549 } 550 551 /* 552 * check to see if we're in a replica 553 */ 554 int 555 meta_check_inreplica( 556 mdsetname_t *sp, 557 mdname_t *np, 558 diskaddr_t slblk, 559 diskaddr_t nblks, 560 md_error_t *ep 561 ) 562 { 563 md_replicalist_t *rlp = NULL; 564 md_replicalist_t *rl; 565 int rval = 0; 566 567 /* should have a set */ 568 assert(sp != NULL); 569 570 /* for each replica */ 571 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 572 return (-1); 573 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 574 md_replica_t *rp = rl->rl_repp; 575 576 /* check replica */ 577 if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) { 578 rval = -1; 579 break; 580 } 581 } 582 583 /* cleanup, return success */ 584 metafreereplicalist(rlp); 585 return (rval); 586 } 587 588 /* 589 * check replica 590 */ 591 int 592 meta_check_replica( 593 mdsetname_t *sp, /* set to check against */ 594 mdname_t *np, /* component to check against */ 595 mdchkopts_t options, /* option flags */ 596 diskaddr_t slblk, /* start logical block */ 597 diskaddr_t nblks, /* number of blocks (-1,rest of them) */ 598 md_error_t *ep /* error packet */ 599 ) 600 { 601 mdchkopts_t chkoptions = MDCHK_ALLOW_REPSLICE; 602 603 /* make sure we have a disk */ 604 if (metachkcomp(np, ep) != 0) 605 return (-1); 606 607 /* check to ensure that it is not already in use */ 608 if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { 609 return (-1); 610 } 611 612 if (options & MDCHK_ALLOW_NODBS) 613 return (0); 614 615 if (options & MDCHK_DRVINSET) 616 return (0); 617 618 /* make sure it is in the set */ 619 if (meta_check_inset(sp, np, ep) != 0) 620 return (-1); 621 622 /* make sure its not in a metadevice */ 623 if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0) 624 return (-1); 625 626 /* return success */ 627 return (0); 628 } 629 630 static int 631 update_dbinfo_on_drives( 632 mdsetname_t *sp, 633 md_drive_desc *dd, 634 int set_locked, 635 int force, 636 md_error_t *ep 637 ) 638 { 639 md_set_desc *sd; 640 int i; 641 md_setkey_t *cl_sk; 642 int rval = 0; 643 md_mnnode_desc *nd; 644 645 if ((sd = metaget_setdesc(sp, ep)) == NULL) 646 return (-1); 647 648 if (! set_locked) { 649 if (MD_MNSET_DESC(sd)) { 650 md_error_t xep = mdnullerror; 651 sigset_t sigs; 652 /* Make sure we are blocking all signals */ 653 if (procsigs(TRUE, &sigs, &xep) < 0) 654 mdclrerror(&xep); 655 656 nd = sd->sd_nodelist; 657 while (nd) { 658 if (force && strcmp(nd->nd_nodename, 659 mynode()) != 0) { 660 nd = nd->nd_next; 661 continue; 662 } 663 664 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 665 nd = nd->nd_next; 666 continue; 667 } 668 669 if (clnt_lock_set(nd->nd_nodename, sp, ep)) 670 return (-1); 671 nd = nd->nd_next; 672 } 673 } else { 674 for (i = 0; i < MD_MAXSIDES; i++) { 675 /* Skip empty slots */ 676 if (sd->sd_nodes[i][0] == '\0') 677 continue; 678 679 if (force && strcmp(sd->sd_nodes[i], 680 mynode()) != 0) 681 continue; 682 683 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) 684 return (-1); 685 } 686 } 687 } 688 689 if (MD_MNSET_DESC(sd)) { 690 nd = sd->sd_nodelist; 691 while (nd) { 692 if (force && strcmp(nd->nd_nodename, mynode()) != 0) { 693 nd = nd->nd_next; 694 continue; 695 } 696 697 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 698 nd = nd->nd_next; 699 continue; 700 } 701 702 if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep) 703 == -1) { 704 rval = -1; 705 break; 706 } 707 nd = nd->nd_next; 708 } 709 } else { 710 for (i = 0; i < MD_MAXSIDES; i++) { 711 /* Skip empty slots */ 712 if (sd->sd_nodes[i][0] == '\0') 713 continue; 714 715 if (force && strcmp(sd->sd_nodes[i], mynode()) != 0) 716 continue; 717 718 if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep) 719 == -1) { 720 rval = -1; 721 break; 722 } 723 } 724 } 725 726 if (! set_locked) { 727 cl_sk = cl_get_setkey(sp->setno, sp->setname); 728 if (MD_MNSET_DESC(sd)) { 729 nd = sd->sd_nodelist; 730 while (nd) { 731 if (force && 732 strcmp(nd->nd_nodename, mynode()) != 0) { 733 nd = nd->nd_next; 734 continue; 735 } 736 737 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 738 nd = nd->nd_next; 739 continue; 740 } 741 742 if (clnt_unlock_set(nd->nd_nodename, cl_sk, 743 ep)) { 744 rval = -1; 745 break; 746 } 747 nd = nd->nd_next; 748 } 749 } else { 750 for (i = 0; i < MD_MAXSIDES; i++) { 751 /* Skip empty slots */ 752 if (sd->sd_nodes[i][0] == '\0') 753 continue; 754 755 if (force && 756 strcmp(sd->sd_nodes[i], mynode()) != 0) 757 continue; 758 759 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, 760 ep)) { 761 rval = -1; 762 break; 763 } 764 } 765 766 } 767 cl_set_setkey(NULL); 768 } 769 770 return (rval); 771 } 772 773 int 774 meta_db_addsidenms( 775 mdsetname_t *sp, 776 mdname_t *np, 777 daddr_t blkno, 778 int bcast, 779 md_error_t *ep 780 ) 781 { 782 side_t sideno; 783 char *bname = NULL; 784 char *dname = NULL; 785 minor_t mnum; 786 mddb_config_t c; 787 int done; 788 int rval = 0; 789 md_set_desc *sd; 790 791 sideno = MD_SIDEWILD; 792 /*CONSTCOND*/ 793 while (1) { 794 if (bname != NULL) { 795 Free(bname); 796 bname = NULL; 797 } 798 if (dname != NULL) { 799 Free(dname); 800 dname = NULL; 801 } 802 if ((done = meta_getnextside_devinfo(sp, np->bname, 803 &sideno, &bname, &dname, &mnum, ep)) == -1) { 804 rval = -1; 805 break; 806 } 807 808 if (done == 0) 809 break; 810 811 if (! metaislocalset(sp)) { 812 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 813 rval = -1; 814 break; 815 } 816 } 817 818 /* 819 * Send addsidenms to all nodes using rpc.mdcommd if 820 * sidename is being added to MN diskset. 821 * 822 * It's ok to broadcast this call to other nodes. 823 * 824 * Note: The broadcast to other nodes isn't needed during 825 * the addition of the first mddbs to the set since the 826 * other nodes haven't been joined to the set yet. All 827 * nodes in a MN diskset are (implicitly) joined to the set 828 * on the addition of the first mddb. 829 */ 830 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 831 (bcast == DB_ADDSIDENMS_BCAST)) { 832 md_mn_result_t *resultp = NULL; 833 md_mn_msg_meta_db_newside_t db_ns; 834 int send_rval; 835 836 db_ns.msg_l_dev = np->dev; 837 db_ns.msg_sideno = sideno; 838 db_ns.msg_blkno = blkno; 839 (void) strncpy(db_ns.msg_dname, dname, 840 sizeof (db_ns.msg_dname)); 841 (void) splitname(np->bname, &db_ns.msg_splitname); 842 db_ns.msg_mnum = mnum; 843 844 /* Set devid to NULL until devids are supported */ 845 db_ns.msg_devid[0] = NULL; 846 847 /* 848 * If reconfig cycle has been started, this node is 849 * stuck in in the return step until this command has 850 * completed. If mdcommd is suspended, ask 851 * send_message to fail (instead of retrying) 852 * so that metaset can finish allowing the reconfig 853 * cycle to proceed. 854 */ 855 send_rval = mdmn_send_message(sp->setno, 856 MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND | 857 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns, 858 sizeof (md_mn_msg_meta_db_newside_t), 859 &resultp, ep); 860 if (send_rval != 0) { 861 rval = -1; 862 if (resultp == NULL) 863 (void) mddserror(ep, 864 MDE_DS_COMMD_SEND_FAIL, 865 sp->setno, NULL, NULL, 866 sp->setname); 867 else { 868 (void) mdstealerror(ep, 869 &(resultp->mmr_ep)); 870 if (mdisok(ep)) { 871 (void) mddserror(ep, 872 MDE_DS_COMMD_SEND_FAIL, 873 sp->setno, NULL, NULL, 874 sp->setname); 875 } 876 free_result(resultp); 877 } 878 break; 879 } 880 if (resultp) 881 free_result(resultp); 882 } else { 883 /* 884 * Let this side's device name, minor # and driver name 885 * be known to the database replica. 886 */ 887 (void) memset(&c, 0, sizeof (c)); 888 889 /* Fill in device/replica info */ 890 c.c_locator.l_dev = meta_cmpldev(np->dev); 891 c.c_locator.l_blkno = blkno; 892 (void) strncpy(c.c_locator.l_driver, dname, 893 sizeof (c.c_locator.l_driver)); 894 (void) splitname(bname, &c.c_devname); 895 c.c_locator.l_mnum = mnum; 896 897 /* Fill in setno, setname, and sideno */ 898 c.c_setno = sp->setno; 899 (void) strncpy(c.c_setname, sp->setname, 900 sizeof (c.c_setname)); 901 c.c_sideno = sideno; 902 903 /* 904 * Don't need device id information from this ioctl 905 * Kernel determines device id from dev_t, which 906 * is just what this code would do. 907 */ 908 c.c_locator.l_devid = (uint64_t)0; 909 c.c_locator.l_devid_flags = 0; 910 911 if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) { 912 rval = mdstealerror(ep, &c.c_mde); 913 break; 914 } 915 } 916 } 917 918 /* cleanup, return success */ 919 if (bname != NULL) { 920 Free(bname); 921 bname = NULL; 922 } 923 if (dname != NULL) { 924 Free(dname); 925 dname = NULL; 926 } 927 return (rval); 928 } 929 930 931 int 932 meta_db_delsidenm( 933 mdsetname_t *sp, 934 side_t sideno, 935 mdname_t *np, 936 daddr_t blkno, 937 md_error_t *ep 938 ) 939 { 940 mddb_config_t c; 941 md_set_desc *sd; 942 943 if (! metaislocalset(sp)) { 944 if ((sd = metaget_setdesc(sp, ep)) == NULL) 945 return (-1); 946 } 947 /* Use rpc.mdcommd to delete mddb side from all nodes */ 948 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 949 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 950 md_mn_result_t *resultp = NULL; 951 md_mn_msg_meta_db_delside_t db_ds; 952 int send_rval; 953 954 db_ds.msg_l_dev = np->dev; 955 db_ds.msg_blkno = blkno; 956 db_ds.msg_sideno = sideno; 957 958 /* Set devid to NULL until devids are supported */ 959 db_ds.msg_devid[0] = NULL; 960 961 /* 962 * If reconfig cycle has been started, this node is 963 * stuck in in the return step until this command has 964 * completed. If mdcommd is suspended, ask 965 * send_message to fail (instead of retrying) 966 * so that metaset can finish allowing the reconfig 967 * cycle to proceed. 968 */ 969 send_rval = mdmn_send_message(sp->setno, 970 MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | 971 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds, 972 sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep); 973 if (send_rval != 0) { 974 if (resultp == NULL) 975 (void) mddserror(ep, 976 MDE_DS_COMMD_SEND_FAIL, 977 sp->setno, NULL, NULL, 978 sp->setname); 979 else { 980 (void) mdstealerror(ep, &(resultp->mmr_ep)); 981 if (mdisok(ep)) { 982 (void) mddserror(ep, 983 MDE_DS_COMMD_SEND_FAIL, 984 sp->setno, NULL, NULL, 985 sp->setname); 986 } 987 free_result(resultp); 988 } 989 return (-1); 990 } 991 if (resultp) 992 free_result(resultp); 993 994 } else { 995 /* 996 * Let this side's device name, minor # and driver name 997 * be known to the database replica. 998 */ 999 (void) memset(&c, 0, sizeof (c)); 1000 1001 /* Fill in device/replica info */ 1002 c.c_locator.l_dev = meta_cmpldev(np->dev); 1003 c.c_locator.l_blkno = blkno; 1004 1005 /* Fill in setno, setname, and sideno */ 1006 c.c_setno = sp->setno; 1007 (void) strcpy(c.c_setname, sp->setname); 1008 c.c_sideno = sideno; 1009 1010 /* 1011 * Don't need device id information from this ioctl 1012 * Kernel determines device id from dev_t, which 1013 * is just what this code would do. 1014 */ 1015 c.c_locator.l_devid = (uint64_t)0; 1016 c.c_locator.l_devid_flags = 0; 1017 1018 if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0) 1019 return (mdstealerror(ep, &c.c_mde)); 1020 } 1021 return (0); 1022 } 1023 1024 1025 static int 1026 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep) 1027 { 1028 mdnamelist_t *dnp1, *dnp2; 1029 1030 for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) { 1031 for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) { 1032 if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0) 1033 return (mderror(ep, MDE_DUPDRIVE, 1034 dnp1->namep->cname)); 1035 } 1036 } 1037 return (0); 1038 } 1039 1040 1041 /* 1042 * Return 1 if files are different, else return 0 1043 */ 1044 static int 1045 filediff(char *tsname, char *sname) 1046 { 1047 int ret = 1, fd; 1048 size_t tsz, sz; 1049 struct stat sbuf; 1050 char *tbuf, *buf; 1051 1052 if (stat(tsname, &sbuf) != 0) 1053 return (1); 1054 tsz = sbuf.st_size; 1055 if (stat(sname, &sbuf) != 0) 1056 return (1); 1057 sz = sbuf.st_size; 1058 if (tsz != sz) 1059 return (1); 1060 1061 /* allocate memory and read both files into buffer */ 1062 tbuf = malloc(tsz); 1063 buf = malloc(sz); 1064 if (tbuf == NULL || buf == NULL) 1065 goto out; 1066 1067 fd = open(tsname, O_RDONLY); 1068 if (fd == -1) 1069 goto out; 1070 sz = read(fd, tbuf, tsz); 1071 (void) close(fd); 1072 if (sz != tsz) 1073 goto out; 1074 1075 fd = open(sname, O_RDONLY); 1076 if (fd == -1) 1077 goto out; 1078 sz = read(fd, buf, tsz); 1079 (void) close(fd); 1080 if (sz != tsz) 1081 goto out; 1082 1083 /* compare content */ 1084 ret = bcmp(tbuf, buf, tsz); 1085 out: 1086 if (tbuf) 1087 free(tbuf); 1088 if (buf) 1089 free(buf); 1090 return (ret); 1091 } 1092 1093 /* 1094 * patch md.conf file with mddb locations 1095 */ 1096 int 1097 meta_db_patch( 1098 char *sname, /* system file name */ 1099 char *cname, /* mddb.cf file name */ 1100 int patch, /* patching locally */ 1101 md_error_t *ep 1102 ) 1103 { 1104 char *tsname = NULL; 1105 char line[MDDB_BOOTLIST_MAX_LEN]; 1106 FILE *tsfp = NULL; 1107 FILE *mfp = NULL; 1108 int rval = -1; 1109 1110 /* check names */ 1111 if (sname == NULL) { 1112 if (patch) 1113 sname = "md.conf"; 1114 else 1115 sname = "/kernel/drv/md.conf"; 1116 } 1117 if (cname == NULL) 1118 cname = META_DBCONF; 1119 1120 /* 1121 * edit file 1122 */ 1123 if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) { 1124 if (mdissyserror(ep, EROFS)) { 1125 /* 1126 * If we are booted on a read-only root because 1127 * of mddb quorum problems we don't want to emit 1128 * any scary error messages. 1129 */ 1130 mdclrerror(ep); 1131 rval = 0; 1132 } 1133 goto out; 1134 } 1135 1136 if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 1137 ep) != 0) 1138 goto out; 1139 1140 /* if file content is identical, skip rename */ 1141 if (filediff(tsname, sname) == 0) { 1142 rval = 0; 1143 goto out; 1144 } 1145 1146 if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) || 1147 (fclose(tsfp) != 0)) { 1148 (void) mdsyserror(ep, errno, tsname); 1149 goto out; 1150 } 1151 1152 tsfp = NULL; 1153 1154 /* 1155 * rename file. If we get a Cross Device error then it 1156 * is because we are in the miniroot. 1157 */ 1158 if (rename(tsname, sname) != 0 && errno != EXDEV) { 1159 (void) mdsyserror(ep, errno, sname); 1160 goto out; 1161 } 1162 1163 if (errno == EXDEV) { 1164 if ((tsfp = fopen(tsname, "r")) == NULL) 1165 goto out; 1166 if ((mfp = fopen(sname, "w+")) == NULL) 1167 goto out; 1168 while (fgets(line, sizeof (line), tsfp) != NULL) { 1169 if (fputs(line, mfp) == NULL) 1170 goto out; 1171 } 1172 (void) fclose(tsfp); 1173 tsfp = NULL; 1174 if (fflush(mfp) != 0) 1175 goto out; 1176 if (fsync(fileno(mfp)) != 0) 1177 goto out; 1178 if (fclose(mfp) != 0) { 1179 mfp = NULL; 1180 goto out; 1181 } 1182 } 1183 1184 Free(tsname); 1185 tsname = NULL; 1186 rval = 0; 1187 1188 /* cleanup, return error */ 1189 out: 1190 if (tsfp != NULL) 1191 (void) fclose(tsfp); 1192 if (tsname != NULL) { 1193 (void) unlink(tsname); 1194 Free(tsname); 1195 } 1196 return (rval); 1197 } 1198 1199 /* 1200 * Add replicas to set. This happens as a result of: 1201 * - metadb [-s set_name] -a 1202 * - metaset -s set_name -a disk 1203 * - metaset -s set_name -d disk (causes a rebalance of mddbs) 1204 * - metaset -s set_name -b 1205 * 1206 * For a local set, this routine is run on the local set host. 1207 * 1208 * For a traditional diskset, this routine is run on the node that 1209 * is running the metaset command. 1210 * 1211 * For a multinode diskset, this routine is run by the node that is 1212 * running the metaset command. If this is the first mddb added to 1213 * the MN diskset, then no communication is made to other nodes via commd 1214 * since the other nodes will be in-sync with respect to the mddbs when 1215 * those other nodes join the set and snarf in the newly created mddb. 1216 * If this is not the first mddb added to the MN diskset, then this 1217 * attach command is sent to all of the nodes using commd. This keeps 1218 * the nodes in-sync. 1219 */ 1220 int 1221 meta_db_attach( 1222 mdsetname_t *sp, 1223 mdnamelist_t *db_nlp, 1224 mdchkopts_t options, 1225 md_timeval32_t *timeval, 1226 int dbcnt, 1227 int dbsize, 1228 char *sysfilename, 1229 md_error_t *ep 1230 ) 1231 { 1232 struct mddb_config c; 1233 mdnamelist_t *nlp; 1234 mdname_t *np; 1235 md_drive_desc *dd = NULL; 1236 md_drive_desc *p; 1237 int i; 1238 int fd; 1239 side_t sideno; 1240 daddr_t blkno; 1241 int replicacount = 0; 1242 int start_mdmonitord = 0; 1243 int rval = 0; 1244 md_error_t status = mdnullerror; 1245 md_set_desc *sd; 1246 int stale_bool = FALSE; 1247 int flags; 1248 int firstmddb = 1; 1249 md_timeval32_t inittime = {0, 0}; 1250 1251 /* 1252 * Error if we don't get some work to do. 1253 */ 1254 if (db_nlp == NULL) 1255 return (mdsyserror(ep, EINVAL, NULL)); 1256 1257 if (mdnamesareunique(db_nlp, ep) != 0) 1258 return (-1); 1259 (void) memset(&c, 0, sizeof (c)); 1260 c.c_id = 0; 1261 c.c_setno = sp->setno; 1262 1263 /* Don't need device id information from this ioctl */ 1264 c.c_locator.l_devid = (uint64_t)0; 1265 c.c_locator.l_devid_flags = 0; 1266 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 1267 if (metaislocalset(sp)) { 1268 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) 1269 mdclrerror(&c.c_mde); 1270 else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) || 1271 (! (options & MDCHK_ALLOW_NODBS))) 1272 return (mdstealerror(ep, &c.c_mde)); 1273 } else { 1274 if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER)) 1275 return (mdstealerror(ep, &c.c_mde)); 1276 } 1277 mdclrerror(&c.c_mde); 1278 } 1279 /* 1280 * Is current set STALE? 1281 */ 1282 if (c.c_flags & MDDB_C_STALE) { 1283 stale_bool = TRUE; 1284 } 1285 1286 assert(db_nlp != NULL); 1287 1288 /* if creating the metadbs for the first time start mdmonitord */ 1289 if (c.c_dbcnt == 0) 1290 start_mdmonitord = 1; 1291 1292 /* 1293 * check to see if we will go over the total possible number 1294 * of data bases 1295 */ 1296 nlp = db_nlp; 1297 while (nlp) { 1298 replicacount += dbcnt; 1299 nlp = nlp->next; 1300 } 1301 1302 if ((replicacount + c.c_dbcnt) > c.c_dbmax) 1303 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 1304 sp->setno, c.c_dbcnt + replicacount, NULL)); 1305 1306 /* 1307 * go through and check to make sure all locations specified 1308 * are legal also pick out driver name; 1309 */ 1310 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1311 diskaddr_t devsize; 1312 1313 np = nlp->namep; 1314 1315 if (! metaislocalset(sp)) { 1316 uint_t partno; 1317 uint_t rep_partno; 1318 mddrivename_t *dnp = np->drivenamep; 1319 1320 /* 1321 * make sure that non-local database replicas 1322 * are always on the replica slice. 1323 */ 1324 if (meta_replicaslice(dnp, 1325 &rep_partno, ep) != 0) 1326 return (-1); 1327 if (metagetvtoc(np, FALSE, &partno, ep) == NULL) 1328 return (-1); 1329 if (partno != rep_partno) 1330 return (mddeverror(ep, MDE_REPCOMP_ONLY, 1331 np->dev, sp->setname)); 1332 } 1333 1334 if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize), 1335 ep)) { 1336 return (-1); 1337 } 1338 1339 if ((devsize = metagetsize(np, ep)) == -1) 1340 return (-1); 1341 1342 if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16)) 1343 return (mdmddberror(ep, MDE_REPLICA_TOOSMALL, 1344 meta_getminor(np->dev), sp->setno, devsize, 1345 np->cname)); 1346 } 1347 1348 /* 1349 * If first disk in set we don't have lb_inittime yet for use as 1350 * mb_setcreatetime so don't go looking for it. WE'll come back 1351 * later and update after the locator block has been created. 1352 * If this isn't the first disk in the set, we have a locator 1353 * block and thus we have lb_inittime. Set mb_setcreatetime to 1354 * lb_inittime. 1355 */ 1356 if (! metaislocalset(sp)) { 1357 if (c.c_dbcnt != 0) { 1358 firstmddb = 0; 1359 inittime = meta_get_lb_inittime(sp, ep); 1360 } 1361 } 1362 1363 /* 1364 * go through and write all master blocks 1365 */ 1366 1367 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1368 np = nlp->namep; 1369 1370 if ((fd = open(np->rname, O_RDWR)) < 0) 1371 return (mdsyserror(ep, errno, np->rname)); 1372 1373 for (i = 0; i < dbcnt; i++) { 1374 if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize, 1375 inittime, ep)) { 1376 (void) close(fd); 1377 return (-1); 1378 } 1379 } 1380 (void) close(fd); 1381 } 1382 1383 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1384 return (-1); 1385 1386 if (! metaislocalset(sp)) { 1387 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1388 if (! mdisok(ep)) 1389 return (-1); 1390 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1391 return (-1); 1392 1393 } 1394 1395 /* 1396 * go through and tell kernel to add them 1397 */ 1398 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1399 mdcinfo_t *cinfo; 1400 1401 np = nlp->namep; 1402 1403 if ((cinfo = metagetcinfo(np, ep)) == NULL) { 1404 rval = -1; 1405 goto out; 1406 } 1407 1408 /* 1409 * If mddb is being added to MN diskset and there already 1410 * exists a valid mddb in the set (which equates to this 1411 * node being an owner of the set) then use rpc.mdcommd 1412 * mechanism to add mddb(s) so that all nodes stay in sync. 1413 * If set is stale, don't log the message since rpc.mdcommd 1414 * can't write the message to the mddb. 1415 * 1416 * Otherwise, just add mddb to this node. 1417 */ 1418 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1419 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1420 md_mn_result_t *resultp = NULL; 1421 md_mn_msg_meta_db_attach_t attach; 1422 int send_rval; 1423 1424 /* 1425 * In a scenario where new replicas had been added on 1426 * the master, and then all of the old replicas failed 1427 * before the slaves had knowledge of the new replicas, 1428 * the slaves are unable to re-parse in the mddb 1429 * from the new replicas since the slaves have no 1430 * knowledge of the new replicas. The following 1431 * algorithm solves this problem: 1432 * - META_DB_ATTACH message generates submsgs 1433 * - BLOCK parse (master) 1434 * - MDDB_ATTACH new replicas 1435 * - UNBLOCK parse (master) causing parse 1436 * information to be sent from master 1437 * to slaves at a higher class than the 1438 * unblock so the parse message will 1439 * reach slaves before unblock message. 1440 */ 1441 attach.msg_l_dev = np->dev; 1442 attach.msg_cnt = dbcnt; 1443 attach.msg_dbsize = dbsize; 1444 (void) strncpy(attach.msg_dname, cinfo->dname, 1445 sizeof (attach.msg_dname)); 1446 (void) splitname(np->bname, &attach.msg_splitname); 1447 attach.msg_options = options; 1448 1449 /* Set devid to NULL until devids are supported */ 1450 attach.msg_devid[0] = NULL; 1451 1452 /* 1453 * If reconfig cycle has been started, this node is 1454 * stuck in in the return step until this command has 1455 * completed. If mdcommd is suspended, ask 1456 * send_message to fail (instead of retrying) 1457 * so that metaset can finish allowing the reconfig 1458 * cycle to proceed. 1459 */ 1460 flags = MD_MSGF_FAIL_ON_SUSPEND; 1461 if (stale_bool == TRUE) 1462 flags |= MD_MSGF_NO_LOG; 1463 send_rval = mdmn_send_message(sp->setno, 1464 MD_MN_MSG_META_DB_ATTACH, 1465 flags, (char *)&attach, 1466 sizeof (md_mn_msg_meta_db_attach_t), 1467 &resultp, ep); 1468 if (send_rval != 0) { 1469 rval = -1; 1470 if (resultp == NULL) 1471 (void) mddserror(ep, 1472 MDE_DS_COMMD_SEND_FAIL, 1473 sp->setno, NULL, NULL, 1474 sp->setname); 1475 else { 1476 (void) mdstealerror(ep, 1477 &(resultp->mmr_ep)); 1478 if (mdisok(ep)) { 1479 (void) mddserror(ep, 1480 MDE_DS_COMMD_SEND_FAIL, 1481 sp->setno, NULL, NULL, 1482 sp->setname); 1483 } 1484 free_result(resultp); 1485 } 1486 goto out; 1487 } 1488 if (resultp) 1489 free_result(resultp); 1490 } else { 1491 /* Adding mddb(s) to just this node */ 1492 for (i = 0; i < dbcnt; i++) { 1493 (void) memset(&c, 0, sizeof (c)); 1494 /* Fill in device/replica info */ 1495 c.c_locator.l_dev = meta_cmpldev(np->dev); 1496 c.c_locator.l_blkno = i * dbsize + 16; 1497 blkno = c.c_locator.l_blkno; 1498 (void) strncpy(c.c_locator.l_driver, cinfo->dname, 1499 sizeof (c.c_locator.l_driver)); 1500 (void) splitname(np->bname, &c.c_devname); 1501 c.c_locator.l_mnum = meta_getminor(np->dev); 1502 1503 /* Fill in setno, setname, and sideno */ 1504 c.c_setno = sp->setno; 1505 if (! metaislocalset(sp)) { 1506 if (MD_MNSET_DESC(sd)) { 1507 c.c_multi_node = 1; 1508 } 1509 } 1510 (void) strcpy(c.c_setname, sp->setname); 1511 c.c_sideno = sideno; 1512 1513 /* 1514 * Don't need device id information from this ioctl 1515 * Kernel determines device id from dev_t, which 1516 * is just what this code would do. 1517 */ 1518 c.c_locator.l_devid = (uint64_t)0; 1519 c.c_locator.l_devid_flags = 0; 1520 1521 if (timeval != NULL) 1522 c.c_timestamp = *timeval; 1523 1524 if (setup_med_cfg(sp, &c, (options & MDCHK_SET_FORCE), 1525 ep)) { 1526 rval = -1; 1527 goto out; 1528 } 1529 1530 if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL) != 0) { 1531 rval = mdstealerror(ep, &c.c_mde); 1532 goto out; 1533 } 1534 /* 1535 * This is either a traditional diskset OR this 1536 * is the first replica added to a MN diskset. 1537 * In either case, set broadcast to NO_BCAST so 1538 * that message won't go through rpc.mdcommd. 1539 * If this is a traditional diskset, the bcast 1540 * flag is ignored since traditional disksets 1541 * don't use the rpc.mdcommd. 1542 */ 1543 if (meta_db_addsidenms(sp, np, blkno, 1544 DB_ADDSIDENMS_NO_BCAST, ep)) 1545 goto out; 1546 } 1547 } 1548 if (! metaislocalset(sp)) { 1549 /* update the dbcnt and size in dd */ 1550 for (p = dd; p != NULL; p = p->dd_next) 1551 if (p->dd_dnp == np->drivenamep) { 1552 p->dd_dbcnt = dbcnt; 1553 p->dd_dbsize = dbsize; 1554 break; 1555 } 1556 } 1557 1558 /* 1559 * If this was the first addition of disks to the 1560 * diskset you now need to update the mb_setcreatetime 1561 * which needed lb_inittime which wasn't there until now. 1562 */ 1563 if (firstmddb) { 1564 if (meta_update_mb(sp, dd, ep) != 0) { 1565 return (-1); 1566 } 1567 } 1568 (void) close(fd); 1569 } 1570 1571 out: 1572 if (metaislocalset(sp)) { 1573 1574 /* everything looks fine. Start mdmonitord */ 1575 /* Note: popen/pclose is the MT-safe replacement for system */ 1576 if (rval == 0 && start_mdmonitord == 1) { 1577 if (pclose(popen(MDMONITORD, "w")) == -1) 1578 md_perror(MDMONITORD); 1579 1580 if (meta_smf_enable(META_SMF_CORE, &status) == -1) { 1581 mde_perror(&status, ""); 1582 mdclrerror(&status); 1583 } 1584 } 1585 1586 if (buildconf(sp, &status)) { 1587 /* Don't mask any previous errors */ 1588 if (rval == 0) 1589 rval = mdstealerror(ep, &status); 1590 return (rval); 1591 } 1592 1593 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 1594 /* Don't mask any previous errors */ 1595 if (rval == 0) 1596 rval = mdstealerror(ep, &status); 1597 } 1598 } else { 1599 if (update_dbinfo_on_drives(sp, dd, 1600 (options & MDCHK_SET_LOCKED), 1601 (options & MDCHK_SET_FORCE), 1602 &status)) { 1603 /* Don't mask any previous errors */ 1604 if (rval == 0) 1605 rval = mdstealerror(ep, &status); 1606 else 1607 mdclrerror(&status); 1608 } 1609 metafreedrivedesc(&dd); 1610 } 1611 /* 1612 * For MN disksets that already had already had nodes joined 1613 * before the attach of this mddb(s), the name invalidation is 1614 * done by the commd handler routine. Otherwise, if this 1615 * is the first attach of a MN diskset mddb, the invalidation 1616 * must be done here since the first attach cannot be sent 1617 * via the commd since there are no nodes joined to the set yet. 1618 */ 1619 if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) || 1620 (MD_MNSET_DESC(sd) && 1621 (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) { 1622 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 1623 meta_invalidate_name(nlp->namep); 1624 } 1625 } 1626 return (rval); 1627 } 1628 1629 /* 1630 * deletelist_length 1631 * 1632 * return the number of slices that have been specified for deletion 1633 * on the metadb command line. This does not calculate the number 1634 * of replicas because there may be multiple replicas per slice. 1635 */ 1636 static int 1637 deletelist_length(mdnamelist_t *db_nlp) 1638 { 1639 1640 mdnamelist_t *nlp; 1641 int list_length = 0; 1642 1643 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1644 list_length++; 1645 } 1646 1647 return (list_length); 1648 } 1649 1650 static int 1651 in_deletelist(char *devname, mdnamelist_t *db_nlp) 1652 { 1653 1654 mdnamelist_t *nlp; 1655 mdname_t *np; 1656 int index = 0; 1657 1658 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1659 np = nlp->namep; 1660 1661 if (strcmp(devname, np->bname) == 0) 1662 return (index); 1663 index++; 1664 } 1665 1666 return (-1); 1667 } 1668 1669 /* 1670 * Delete replicas from set. This happens as a result of: 1671 * - metadb [-s set_name] -d 1672 * - metaset -s set_name -a disk (causes a rebalance of mddbs) 1673 * - metaset -s set_name -d disk 1674 * - metaset -s set_name -b 1675 * 1676 * For a local set, this routine is run on the local set host. 1677 * 1678 * For a traditional diskset, this routine is run on the node that 1679 * is running the metaset command. 1680 * 1681 * For a multinode diskset, this routine is run by the node that is 1682 * running the metaset command. This detach routine is sent to all 1683 * of the joined nodes in the diskset using commd. This keeps 1684 * the nodes in-sync. 1685 */ 1686 int 1687 meta_db_detach( 1688 mdsetname_t *sp, 1689 mdnamelist_t *db_nlp, 1690 mdforceopts_t force_option, 1691 char *sysfilename, 1692 md_error_t *ep 1693 ) 1694 { 1695 struct mddb_config c; 1696 mdnamelist_t *nlp; 1697 mdname_t *np; 1698 md_drive_desc *dd = NULL; 1699 md_drive_desc *p; 1700 int replicacount; 1701 int replica_delete_count; 1702 int nr_replica_slices; 1703 int i; 1704 int stop_svmdaemons = 0; 1705 int rval = 0; 1706 int index; 1707 int valid_replicas_nottodelete = 0; 1708 int invalid_replicas_nottodelete = 0; 1709 int invalid_replicas_todelete = 0; 1710 int errored = 0; 1711 int *tag_array; 1712 int fd = -1; 1713 md_error_t status = mdnullerror; 1714 md_set_desc *sd; 1715 int stale_bool = FALSE; 1716 int flags; 1717 1718 /* 1719 * Error if we don't get some work to do. 1720 */ 1721 if (db_nlp == NULL) 1722 return (mdsyserror(ep, EINVAL, NULL)); 1723 1724 if (mdnamesareunique(db_nlp, ep) != 0) 1725 return (-1); 1726 1727 (void) memset(&c, 0, sizeof (c)); 1728 c.c_id = 0; 1729 c.c_setno = sp->setno; 1730 1731 /* Don't need device id information from this ioctl */ 1732 c.c_locator.l_devid = (uint64_t)0; 1733 c.c_locator.l_devid_flags = 0; 1734 1735 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1736 return (mdstealerror(ep, &c.c_mde)); 1737 1738 /* 1739 * Is current set STALE? 1740 */ 1741 if (c.c_flags & MDDB_C_STALE) { 1742 stale_bool = TRUE; 1743 } 1744 1745 replicacount = c.c_dbcnt; 1746 1747 assert(db_nlp != NULL); 1748 1749 /* 1750 * go through and gather how many data bases are on each 1751 * device specified. 1752 */ 1753 1754 nr_replica_slices = deletelist_length(db_nlp); 1755 tag_array = (int *)calloc(nr_replica_slices, sizeof (int)); 1756 1757 replica_delete_count = 0; 1758 for (i = 0; i < replicacount; i++) { 1759 char *devname; 1760 int found = 0; 1761 1762 c.c_id = i; 1763 1764 /* Don't need device id information from this ioctl */ 1765 c.c_locator.l_devid = (uint64_t)0; 1766 c.c_locator.l_devid_flags = 0; 1767 1768 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1769 return (mdstealerror(ep, &c.c_mde)); 1770 1771 devname = splicename(&c.c_devname); 1772 1773 if ((index = in_deletelist(devname, db_nlp)) != -1) { 1774 found = 1; 1775 tag_array[index] = 1; 1776 replica_delete_count++; 1777 } 1778 1779 errored = c.c_locator.l_flags & (MDDB_F_EREAD | 1780 MDDB_F_EWRITE | MDDB_F_TOOSMALL | 1781 MDDB_F_EFMT | MDDB_F_EDATA | 1782 MDDB_F_EMASTER); 1783 1784 /* 1785 * There are four combinations of "errored" and "found" 1786 * and they are used to find the number of 1787 * (a) valid/invalid replicas that are not in the delete 1788 * list and are available in the system. 1789 * (b) valid/invalid replicas that are to be deleted. 1790 */ 1791 1792 if (errored && !found) /* errored and !found */ 1793 invalid_replicas_nottodelete++; 1794 else if (!found) /* !errored and !found */ 1795 valid_replicas_nottodelete++; 1796 else if (errored) /* errored and found */ 1797 invalid_replicas_todelete++; 1798 /* 1799 * else it is !errored and found. This means 1800 * valid_replicas_todelete++; But this variable will not 1801 * be used anywhere 1802 */ 1803 1804 Free(devname); 1805 } 1806 1807 index = 0; 1808 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1809 np = nlp->namep; 1810 if (tag_array[index++] != 1) { 1811 Free(tag_array); 1812 return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname)); 1813 } 1814 } 1815 1816 Free(tag_array); 1817 1818 1819 /* if all replicas are deleted stop mdmonitord */ 1820 if ((replicacount - replica_delete_count) == 0) 1821 stop_svmdaemons = 1; 1822 1823 if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) { 1824 if (force_option & MDFORCE_NONE) 1825 return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname)); 1826 if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS)) 1827 return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname)); 1828 } 1829 1830 /* 1831 * The following algorithms are followed to check for deletion: 1832 * (a) If the delete list(db_nlp) has all invalid replicas and no valid 1833 * replicas, then deletion should be allowed. 1834 * (b) Deletion should be allowed only if valid replicas that are "not" 1835 * to be deleted is always greater than the invalid replicas that 1836 * are "not" to be deleted. 1837 * (c) If the user uses -f option, then deletion should be allowed. 1838 */ 1839 1840 if ((invalid_replicas_todelete != replica_delete_count) && 1841 (invalid_replicas_nottodelete > valid_replicas_nottodelete) && 1842 (force_option != MDFORCE_LOCAL)) 1843 return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname)); 1844 1845 /* 1846 * go through and tell kernel to delete them 1847 */ 1848 1849 /* Don't need device id information from this ioctl */ 1850 c.c_locator.l_devid = (uint64_t)0; 1851 c.c_locator.l_devid_flags = 0; 1852 1853 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1854 return (mdstealerror(ep, &c.c_mde)); 1855 1856 if (! metaislocalset(sp)) { 1857 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1858 if (! mdisok(ep)) 1859 return (-1); 1860 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1861 return (-1); 1862 } 1863 1864 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1865 np = nlp->namep; 1866 1867 /* 1868 * If mddb is being deleted from MN diskset and node is 1869 * an owner of the diskset then use rpc.mdcommd 1870 * mechanism to add mddb(s) so that all nodes stay in sync. 1871 * If set is stale, don't log the message since rpc.mdcommd 1872 * can't write the message to the mddb. 1873 * 1874 * When mddbs are first being added to set, a detach can 1875 * be called before any node has joined the diskset, so 1876 * must check to see if node is an owner of the diskset. 1877 * 1878 * Otherwise, just delete mddb from this node. 1879 */ 1880 1881 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1882 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1883 md_mn_result_t *resultp; 1884 md_mn_msg_meta_db_detach_t detach; 1885 int send_rval; 1886 1887 /* 1888 * The following algorithm is used to detach replicas. 1889 * - META_DB_DETACH message generates submsgs 1890 * - BLOCK parse (master) 1891 * - MDDB_DETACH replicas 1892 * - UNBLOCK parse (master) causing parse 1893 * information to be sent from master 1894 * to slaves at a higher class than the 1895 * unblock so the parse message will 1896 * reach slaves before unblock message. 1897 */ 1898 (void) splitname(np->bname, &detach.msg_splitname); 1899 1900 /* Set devid to NULL until devids are supported */ 1901 detach.msg_devid[0] = NULL; 1902 1903 /* 1904 * If reconfig cycle has been started, this node is 1905 * stuck in in the return step until this command has 1906 * completed. If mdcommd is suspended, ask 1907 * send_message to fail (instead of retrying) 1908 * so that metaset can finish allowing the reconfig 1909 * cycle to proceed. 1910 */ 1911 flags = MD_MSGF_FAIL_ON_SUSPEND; 1912 if (stale_bool == TRUE) 1913 flags |= MD_MSGF_NO_LOG; 1914 send_rval = mdmn_send_message(sp->setno, 1915 MD_MN_MSG_META_DB_DETACH, 1916 flags, (char *)&detach, 1917 sizeof (md_mn_msg_meta_db_detach_t), 1918 &resultp, ep); 1919 if (send_rval != 0) { 1920 rval = -1; 1921 if (resultp == NULL) 1922 (void) mddserror(ep, 1923 MDE_DS_COMMD_SEND_FAIL, 1924 sp->setno, NULL, NULL, 1925 sp->setname); 1926 else { 1927 (void) mdstealerror(ep, 1928 &(resultp->mmr_ep)); 1929 if (mdisok(ep)) { 1930 (void) mddserror(ep, 1931 MDE_DS_COMMD_SEND_FAIL, 1932 sp->setno, NULL, NULL, 1933 sp->setname); 1934 } 1935 free_result(resultp); 1936 } 1937 goto out; 1938 } 1939 if (resultp) 1940 free_result(resultp); 1941 } else { 1942 i = 0; 1943 while (i < c.c_dbcnt) { 1944 char *devname; 1945 1946 c.c_id = i; 1947 1948 /* Don't need devid info from this ioctl */ 1949 c.c_locator.l_devid = (uint64_t)0; 1950 c.c_locator.l_devid_flags = 0; 1951 1952 if (metaioctl(MD_DB_GETDEV, &c, 1953 &c.c_mde, NULL)) { 1954 rval = mdstealerror(ep, &c.c_mde); 1955 goto out; 1956 } 1957 1958 devname = splicename(&c.c_devname); 1959 if (strcmp(devname, np->bname) != 0) { 1960 Free(devname); 1961 i++; 1962 continue; 1963 } 1964 Free(devname); 1965 1966 /* Don't need devid info from this ioctl */ 1967 c.c_locator.l_devid = (uint64_t)0; 1968 c.c_locator.l_devid_flags = 0; 1969 1970 if (metaioctl(MD_DB_DELDEV, &c, 1971 &c.c_mde, NULL) != 0) { 1972 rval = mdstealerror(ep, &c.c_mde); 1973 goto out; 1974 } 1975 1976 /* Not incrementing "i" intentionally */ 1977 } 1978 } 1979 if (! metaislocalset(sp)) { 1980 /* update the dbcnt and size in dd */ 1981 for (p = dd; p != NULL; p = p->dd_next) { 1982 if (p->dd_dnp == np->drivenamep) { 1983 p->dd_dbcnt = 0; 1984 p->dd_dbsize = 0; 1985 break; 1986 } 1987 } 1988 1989 /* 1990 * Slam a dummy master block and make it self 1991 * identifying 1992 */ 1993 if ((fd = open(np->rname, O_RDWR)) >= 0) { 1994 meta_mkdummymaster(sp, fd, 16); 1995 (void) close(fd); 1996 } 1997 } 1998 } 1999 out: 2000 if (metaislocalset(sp)) { 2001 /* 2002 * Stop all the daemons if there are 2003 * no more replicas so that the module can be 2004 * unloaded. 2005 */ 2006 if (rval == 0 && stop_svmdaemons == 1) { 2007 char buf[MAXPATHLEN]; 2008 int i; 2009 2010 for (i = 0; i < DAEMON_COUNT; i++) { 2011 (void) snprintf(buf, MAXPATHLEN, 2012 "/usr/bin/pkill -%s -x %s", 2013 svmd_kill_list[i].svmd_kill_val, 2014 svmd_kill_list[i].svmd_name); 2015 if (pclose(popen(buf, "w")) == -1) 2016 md_perror(buf); 2017 } 2018 2019 if (meta_smf_disable(META_SMF_ALL, &status) == -1) { 2020 mde_perror(&status, ""); 2021 mdclrerror(&status); 2022 } 2023 } 2024 if (buildconf(sp, &status)) { 2025 /* Don't mask any previous errors */ 2026 if (rval == 0) 2027 rval = mdstealerror(ep, &status); 2028 else 2029 mdclrerror(&status); 2030 return (rval); 2031 } 2032 2033 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 2034 /* Don't mask any previous errors */ 2035 if (rval == 0) 2036 rval = mdstealerror(ep, &status); 2037 else 2038 mdclrerror(&status); 2039 } 2040 } else { 2041 if (update_dbinfo_on_drives(sp, dd, 2042 (force_option & MDFORCE_SET_LOCKED), 2043 ((force_option & MDFORCE_LOCAL) | 2044 (force_option & MDFORCE_DS)), &status)) { 2045 /* Don't mask any previous errors */ 2046 if (rval == 0) 2047 rval = mdstealerror(ep, &status); 2048 else 2049 mdclrerror(&status); 2050 } 2051 metafreedrivedesc(&dd); 2052 } 2053 if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) { 2054 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 2055 meta_invalidate_name(nlp->namep); 2056 } 2057 } 2058 return (rval); 2059 } 2060 2061 static md_replica_t * 2062 metareplicaname( 2063 mdsetname_t *sp, 2064 int flags, 2065 struct mddb_config *c, 2066 md_error_t *ep 2067 ) 2068 { 2069 md_replica_t *rp; 2070 char *devname; 2071 size_t sz; 2072 2073 /* allocate replicaname */ 2074 rp = Zalloc(sizeof (*rp)); 2075 2076 /* get device name */ 2077 devname = splicename(&c->c_devname); 2078 if (flags & PRINT_FAST) { 2079 if ((rp->r_namep = metaname_fast(&sp, devname, ep)) == NULL) { 2080 Free(devname); 2081 Free(rp); 2082 return (NULL); 2083 } 2084 } else { 2085 if ((rp->r_namep = metaname(&sp, devname, ep)) == NULL) { 2086 Free(devname); 2087 Free(rp); 2088 return (NULL); 2089 } 2090 } 2091 Free(devname); 2092 2093 /* make sure it's OK */ 2094 if ((! (flags & MD_BASICNAME_OK)) && 2095 (metachkcomp(rp->r_namep, ep) != 0)) { 2096 Free(rp); 2097 return (NULL); 2098 } 2099 2100 rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR; 2101 rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR; 2102 rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID; 2103 if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) { 2104 sz = devid_sizeof((ddi_devid_t)(uintptr_t) 2105 (c->c_locator.l_devid)); 2106 if ((rp->r_devid = (ddi_devid_t)malloc(sz)) == 2107 (ddi_devid_t)NULL) { 2108 Free(rp); 2109 return (NULL); 2110 } 2111 (void) memcpy((void *)rp->r_devid, 2112 (void *)(uintptr_t)c->c_locator.l_devid, sz); 2113 (void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name); 2114 rp->r_flags &= ~MDDB_F_NODEVID; 2115 /* Overwrite dev derived from name with dev from devid */ 2116 rp->r_namep->dev = meta_expldev(c->c_locator.l_dev); 2117 } 2118 (void) strcpy(rp->r_driver_name, c->c_locator.l_driver); 2119 2120 rp->r_blkno = c->c_locator.l_blkno; 2121 if (c->c_dbend != 0) 2122 rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1; 2123 2124 /* return replica */ 2125 return (rp); 2126 } 2127 2128 /* 2129 * free replica list 2130 */ 2131 void 2132 metafreereplicalist( 2133 md_replicalist_t *rlp 2134 ) 2135 { 2136 md_replicalist_t *rl = NULL; 2137 2138 for (/* void */; (rlp != NULL); rlp = rl) { 2139 rl = rlp->rl_next; 2140 if (rlp->rl_repp->r_devid != (ddi_devid_t)0) { 2141 free(rlp->rl_repp->r_devid); 2142 } 2143 Free(rlp->rl_repp); 2144 Free(rlp); 2145 } 2146 } 2147 2148 /* 2149 * return list of all replicas in set 2150 */ 2151 int 2152 metareplicalist( 2153 mdsetname_t *sp, 2154 int flags, 2155 md_replicalist_t **rlpp, 2156 md_error_t *ep 2157 ) 2158 { 2159 md_replicalist_t **tail = rlpp; 2160 int count = 0; 2161 struct mddb_config c; 2162 int i; 2163 char *devid; 2164 2165 /* for each replica */ 2166 i = 0; 2167 do { 2168 md_replica_t *rp; 2169 2170 /* get next replica */ 2171 (void) memset(&c, 0, sizeof (c)); 2172 c.c_id = i; 2173 c.c_setno = sp->setno; 2174 2175 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 2176 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2177 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2178 mdclrerror(&c.c_mde); 2179 break; /* handle none at all */ 2180 } 2181 (void) mdstealerror(ep, &c.c_mde); 2182 goto out; 2183 } 2184 2185 if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) { 2186 if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) { 2187 (void) mdsyserror(ep, ENOMEM, META_DBCONF); 2188 goto out; 2189 } 2190 c.c_locator.l_devid = (uintptr_t)devid; 2191 /* 2192 * Turn on space and sz flags since 'sz' amount of 2193 * space has been alloc'd. 2194 */ 2195 c.c_locator.l_devid_flags = 2196 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2197 } 2198 2199 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2200 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2201 mdclrerror(&c.c_mde); 2202 break; /* handle none at all */ 2203 } 2204 (void) mdstealerror(ep, &c.c_mde); 2205 goto out; 2206 } 2207 2208 /* 2209 * Paranoid check - shouldn't happen, but is left as 2210 * a place holder for changes that will be needed after 2211 * dynamic reconfiguration changes are added to SVM (to 2212 * support movement of disks at any point in time). 2213 */ 2214 if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { 2215 (void) fprintf(stderr, 2216 dgettext(TEXT_DOMAIN, 2217 "Error: Relocation Information " 2218 "(drvnm=%s, mnum=0x%lx) \n" 2219 "relocation information size changed - \n" 2220 "rerun command\n"), 2221 c.c_locator.l_driver, c.c_locator.l_mnum); 2222 (void) mderror(ep, MDE_DEVID_TOOBIG, NULL); 2223 goto out; 2224 } 2225 2226 if (c.c_dbcnt == 0) 2227 break; /* handle none at all */ 2228 2229 /* get info */ 2230 if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL) 2231 goto out; 2232 2233 /* append to list */ 2234 *tail = Zalloc(sizeof (**tail)); 2235 (*tail)->rl_repp = rp; 2236 tail = &(*tail)->rl_next; 2237 ++count; 2238 2239 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2240 free(devid); 2241 c.c_locator.l_devid_flags = 0; 2242 } 2243 2244 } while (++i < c.c_dbcnt); 2245 2246 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2247 free(devid); 2248 } 2249 2250 /* return count */ 2251 return (count); 2252 2253 /* cleanup, return error */ 2254 out: 2255 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2256 free(devid); 2257 } 2258 metafreereplicalist(*rlpp); 2259 *rlpp = NULL; 2260 return (-1); 2261 } 2262 2263 /* 2264 * meta_sync_db_locations - get list of replicas from kernel and write 2265 * out to mddb.cf and md.conf. 'Syncs up' the replica list in 2266 * the kernel with the replica list in the conf files. 2267 * 2268 */ 2269 void 2270 meta_sync_db_locations( 2271 mdsetname_t *sp, 2272 md_error_t *ep 2273 ) 2274 { 2275 char *sname = 0; /* system file name */ 2276 char *cname = 0; /* config file name */ 2277 2278 if (!metaislocalset(sp)) 2279 return; 2280 2281 /* Updates backup of configuration file (aka mddb.cf) */ 2282 if (buildconf(sp, ep) != 0) 2283 return; 2284 2285 /* Updates system configuration file (aka md.conf) */ 2286 (void) meta_db_patch(sname, cname, 0, ep); 2287 } 2288 2289 /* 2290 * setup_db_locations - parse the mddb.cf file and 2291 * tells the driver which db locations to use. 2292 */ 2293 int 2294 meta_setup_db_locations( 2295 md_error_t *ep 2296 ) 2297 { 2298 mddb_config_t c; 2299 FILE *fp; 2300 char inbuff[1024]; 2301 char *buff; 2302 uint_t i; 2303 size_t sz; 2304 int rval = 0; 2305 char *devidp; 2306 uint_t devid_size; 2307 char *minor_name = NULL; 2308 ddi_devid_t devid_decode; 2309 int checksum; 2310 2311 /* do mddb.cf file */ 2312 (void) memset(&c, '\0', sizeof (c)); 2313 if ((fp = fopen(META_DBCONF, "r")) == NULL) { 2314 if (errno != ENOENT) 2315 return (mdsyserror(ep, errno, META_DBCONF)); 2316 } 2317 while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1), 2318 fp)) != NULL)) { 2319 2320 /* ignore comments */ 2321 if (*buff == '#') 2322 continue; 2323 2324 /* parse locator */ 2325 (void) memset(&c, 0, sizeof (c)); 2326 c.c_setno = MD_LOCAL_SET; 2327 i = strcspn(buff, " \t"); 2328 if (i > sizeof (c.c_locator.l_driver)) 2329 i = sizeof (c.c_locator.l_driver); 2330 (void) strncpy(c.c_locator.l_driver, buff, i); 2331 buff += i; 2332 c.c_locator.l_dev = 2333 makedev((major_t)0, (minor_t)strtol(buff, &buff, 10)); 2334 c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10); 2335 c.c_locator.l_mnum = minor(c.c_locator.l_dev); 2336 2337 /* parse out devid */ 2338 while (isspace((int)(*buff))) 2339 buff += 1; 2340 i = strcspn(buff, " \t"); 2341 if ((devidp = (char *)malloc(i+1)) == NULL) 2342 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2343 2344 (void) strncpy(devidp, buff, i); 2345 devidp[i] = '\0'; 2346 if (devid_str_decode(devidp, &devid_decode, 2347 &minor_name) == -1) { 2348 free(devidp); 2349 continue; 2350 } 2351 2352 /* Conf file must have minor name associated with devid */ 2353 if (minor_name == NULL) { 2354 free(devidp); 2355 devid_free(devid_decode); 2356 continue; 2357 } 2358 2359 sz = devid_sizeof(devid_decode); 2360 /* Copy to devid size buffer that ioctl expects */ 2361 if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) { 2362 devid_free(devid_decode); 2363 free(minor_name); 2364 free(devidp); 2365 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2366 } 2367 2368 (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, 2369 (void *)devid_decode, sz); 2370 2371 devid_free(devid_decode); 2372 2373 if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) { 2374 free(minor_name); 2375 free(devidp); 2376 free((void *)(uintptr_t)c.c_locator.l_devid); 2377 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2378 } 2379 (void) strcpy(c.c_locator.l_minor_name, minor_name); 2380 free(minor_name); 2381 c.c_locator.l_devid_flags = MDDB_DEVID_VALID | 2382 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2383 c.c_locator.l_devid_sz = sz; 2384 2385 devid_size = strlen(devidp); 2386 buff += devid_size; 2387 2388 checksum = strtol(buff, &buff, 10); 2389 for (i = 0; c.c_locator.l_driver[i] != 0; i++) 2390 checksum += c.c_locator.l_driver[i]; 2391 for (i = 0; i < devid_size; i++) { 2392 checksum += devidp[i]; 2393 } 2394 free(devidp); 2395 2396 checksum += minor(c.c_locator.l_dev); 2397 checksum += c.c_locator.l_blkno; 2398 if (checksum != 42) { 2399 /* overwritten later for more serious problems */ 2400 rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF); 2401 free((void *)(uintptr_t)c.c_locator.l_devid); 2402 continue; 2403 } 2404 c.c_locator.l_flags = 0; 2405 2406 /* use db location */ 2407 if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { 2408 free((void *)(uintptr_t)c.c_locator.l_devid); 2409 return (mdstealerror(ep, &c.c_mde)); 2410 } 2411 2412 /* free up devid if in use */ 2413 free((void *)(uintptr_t)c.c_locator.l_devid); 2414 c.c_locator.l_devid = (uint64_t)0; 2415 c.c_locator.l_devid_flags = 0; 2416 } 2417 if ((fp) && (fclose(fp) != 0)) 2418 return (mdsyserror(ep, errno, META_DBCONF)); 2419 2420 /* check for stale database */ 2421 (void) memset((char *)&c, 0, sizeof (struct mddb_config)); 2422 c.c_id = 0; 2423 c.c_setno = MD_LOCAL_SET; 2424 2425 /* Don't need device id information from this ioctl */ 2426 c.c_locator.l_devid = (uint64_t)0; 2427 c.c_locator.l_devid_flags = 0; 2428 2429 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2430 if (! mdismddberror(&c.c_mde, MDE_DB_INVALID)) 2431 return (mdstealerror(ep, &c.c_mde)); 2432 mdclrerror(&c.c_mde); 2433 } 2434 2435 if (c.c_flags & MDDB_C_STALE) 2436 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET, 2437 0, NULL)); 2438 2439 /* success */ 2440 return (rval); 2441 } 2442 2443 /* 2444 * meta_db_minreplica - returns the minimum size replica currently in use. 2445 */ 2446 daddr_t 2447 meta_db_minreplica( 2448 mdsetname_t *sp, 2449 md_error_t *ep 2450 ) 2451 { 2452 md_replica_t *r; 2453 md_replicalist_t *rl, *rlp = NULL; 2454 daddr_t nblks = 0; 2455 2456 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) 2457 return (-1); 2458 2459 if (rlp == NULL) 2460 return (-1); 2461 2462 /* find the smallest existing replica */ 2463 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 2464 r = rl->rl_repp; 2465 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 2466 } 2467 2468 metafreereplicalist(rlp); 2469 return (nblks); 2470 } 2471 2472 /* 2473 * meta_get_replica_names 2474 * returns an mdnamelist_t of replica slices 2475 */ 2476 /*ARGSUSED*/ 2477 int 2478 meta_get_replica_names( 2479 mdsetname_t *sp, 2480 mdnamelist_t **nlpp, 2481 int options, 2482 md_error_t *ep 2483 ) 2484 { 2485 md_replicalist_t *rlp = NULL; 2486 md_replicalist_t *rl; 2487 mdnamelist_t **tailpp = nlpp; 2488 int cnt = 0; 2489 2490 assert(nlpp != NULL); 2491 2492 if (!metaislocalset(sp)) 2493 goto out; 2494 2495 /* get replicas */ 2496 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { 2497 cnt = -1; 2498 goto out; 2499 } 2500 2501 /* build name list */ 2502 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 2503 /* 2504 * Add the name struct to the end of the 2505 * namelist but keep a pointer to the last 2506 * element so that we don't incur the overhead 2507 * of traversing the list each time 2508 */ 2509 tailpp = meta_namelist_append_wrapper( 2510 tailpp, rl->rl_repp->r_namep); 2511 ++cnt; 2512 } 2513 2514 /* cleanup, return count or error */ 2515 out: 2516 metafreereplicalist(rlp); 2517 return (cnt); 2518 } 2519