1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 /* 37 * Metadevice database interfaces. 38 */ 39 40 #define MDDB 41 42 #include <meta.h> 43 #include <sys/lvm/md_mddb.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/lvm/mdio.h> 46 #include <string.h> 47 #include <strings.h> 48 #include <ctype.h> 49 50 struct svm_daemon { 51 char *svmd_name; 52 char *svmd_kill_val; 53 }; 54 55 /* 56 * This is a list of the daemons that are not stopped by the SVM smf(5) 57 * services. The mdmonitord is started via svc:/system/mdmonitor:default 58 * but no contract(4) is constructed and so it is not stopped by smf(5). 59 */ 60 struct svm_daemon svmd_kill_list[] = { 61 {"mdmonitord", "HUP"}, 62 {"mddoors", "KILL"}, 63 }; 64 65 #define DAEMON_COUNT (sizeof (svmd_kill_list)/ sizeof (struct svm_daemon)) 66 67 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep); 68 69 /* 70 * meta_get_lb_inittime sends a request for the lb_inittime to the kernel 71 */ 72 md_timeval32_t 73 meta_get_lb_inittime( 74 mdsetname_t *sp, 75 md_error_t *ep 76 ) 77 { 78 mddb_config_t c; 79 80 (void) memset(&c, 0, sizeof (c)); 81 82 /* Fill in setno, setname, and sideno */ 83 c.c_setno = sp->setno; 84 85 if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) { 86 (void) mdstealerror(ep, &c.c_mde); 87 } 88 89 return (c.c_timestamp); 90 } 91 92 /* 93 * mkmasterblks writes out the master blocks of the mddb to the replica. 94 * 95 * In a MN diskset, this is called by the node that is adding this replica 96 * to the diskset. 97 */ 98 99 #define MDDB_VERIFY_SIZE 8192 100 101 static int 102 mkmasterblks( 103 mdsetname_t *sp, 104 mdname_t *np, 105 int fd, 106 daddr_t firstblk, 107 int dbsize, 108 md_timeval32_t inittime, 109 md_error_t *ep 110 ) 111 { 112 int consecutive; 113 md_timeval32_t tp; 114 struct mddb_mb *mb; 115 char *buffer; 116 int iosize; 117 md_set_desc *sd; 118 int mn_set = 0; 119 daddr_t startblk; 120 int cnt; 121 ddi_devid_t devid; 122 123 if (! metaislocalset(sp)) { 124 if ((sd = metaget_setdesc(sp, ep)) == NULL) 125 return (-1); 126 127 if (MD_MNSET_DESC(sd)) { 128 mn_set = 1; /* Used later */ 129 } 130 } 131 132 /* 133 * Loop to verify the entire mddb region on disk is read/writable. 134 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block 135 * chunks. 136 * 137 * A side-effect of this loop is to zero out the entire mddb region 138 */ 139 if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL) 140 return (mdsyserror(ep, ENOMEM, np->rname)); 141 142 startblk = firstblk; 143 for (cnt = dbsize; cnt > 0; cnt -= consecutive) { 144 145 if (cnt > MDDB_VERIFY_SIZE) 146 consecutive = MDDB_VERIFY_SIZE; 147 else 148 consecutive = cnt; 149 150 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 151 Free(buffer); 152 return (mdsyserror(ep, errno, np->rname)); 153 } 154 155 iosize = DEV_BSIZE * consecutive; 156 if (write(fd, buffer, iosize) != iosize) { 157 Free(buffer); 158 return (mdsyserror(ep, errno, np->rname)); 159 } 160 161 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 162 Free(buffer); 163 return (mdsyserror(ep, errno, np->rname)); 164 } 165 166 if (read(fd, buffer, iosize) != iosize) { 167 Free(buffer); 168 return (mdsyserror(ep, errno, np->rname)); 169 } 170 171 startblk += consecutive; 172 } 173 174 Free(buffer); 175 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 176 return (mdsyserror(ep, ENOMEM, np->rname)); 177 178 if (meta_gettimeofday(&tp) == -1) { 179 Free(mb); 180 return (mdsyserror(ep, errno, np->rname)); 181 } 182 183 mb->mb_magic = MDDB_MAGIC_MB; 184 /* 185 * If a MN diskset, set master block revision for a MN set. 186 * Even though the master block structure is no different 187 * for a MN set, setting the revision field to a different 188 * number keeps any pre-MN_diskset code from accessing 189 * this diskset. It also allows for an early determination 190 * of a MN diskset when reading in from disk so that the 191 * proper size locator block and locator names structure 192 * can be read in thus saving time on diskset startup. 193 */ 194 if (mn_set) 195 mb->mb_revision = MDDB_REV_MNMB; 196 else 197 mb->mb_revision = MDDB_REV_MB; 198 mb->mb_timestamp = tp; 199 mb->mb_setno = sp->setno; 200 mb->mb_blkcnt = dbsize - 1; 201 mb->mb_blkno = firstblk; 202 mb->mb_nextblk = 0; 203 204 mb->mb_blkmap.m_firstblk = firstblk + 1; 205 mb->mb_blkmap.m_consecutive = dbsize - 1; 206 if (! metaislocalset(sp)) { 207 mb->mb_setcreatetime = inittime; 208 } 209 210 /* 211 * We try to save the disks device ID into the remaining bytes in 212 * the master block. The saved devid is used to provide a mapping 213 * between this disk's devid and the devid stored into the master 214 * block. This allows the disk image to be self-identifying 215 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 216 * when we try to import these disks on the remote copied image. 217 * If we cannot save the disks device ID onto the master block that is 218 * ok. The disk is just not self-identifying and won't be importable 219 * in the remote copy scenario. 220 */ 221 if (devid_get(fd, &devid) == 0) { 222 size_t len; 223 224 len = devid_sizeof(devid); 225 if (len <= DEV_BSIZE - sizeof (*mb)) { 226 /* there is enough space to store the devid */ 227 mb->mb_devid_magic = MDDB_MAGIC_DE; 228 mb->mb_devid_len = len; 229 (void) memcpy(mb->mb_devid, devid, len); 230 } 231 devid_free(devid); 232 } 233 234 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 235 (crc_skip_t *)NULL); 236 237 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 238 Free(mb); 239 return (mdsyserror(ep, errno, np->rname)); 240 } 241 242 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 243 Free(mb); 244 return (mdsyserror(ep, errno, np->rname)); 245 } 246 247 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 248 Free(mb); 249 return (mdsyserror(ep, errno, np->rname)); 250 } 251 252 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 253 Free(mb); 254 return (mdsyserror(ep, errno, np->rname)); 255 } 256 257 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 258 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) { 259 Free(mb); 260 return (mdmddberror(ep, MDE_NOTVERIFIED, 261 meta_getminor(np->dev), sp->setno, 0, np->rname)); 262 } 263 264 Free(mb); 265 return (0); 266 } 267 268 void 269 meta_mkdummymaster( 270 mdsetname_t *sp, 271 int fd, 272 daddr_t firstblk 273 ) 274 { 275 md_timeval32_t tp; 276 struct mddb_mb *mb; 277 ddi_devid_t devid; 278 md_set_desc *sd; 279 md_error_t ep = mdnullerror; 280 md_timeval32_t inittime; 281 282 /* 283 * No dummy master blocks are written for a MN diskset since devids 284 * are not supported in MN disksets. 285 */ 286 if (! metaislocalset(sp)) { 287 if ((sd = metaget_setdesc(sp, &ep)) == NULL) 288 return; 289 290 if (MD_MNSET_DESC(sd)) 291 return; 292 } 293 294 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 295 return; 296 297 mb->mb_magic = MDDB_MAGIC_DU; 298 mb->mb_revision = MDDB_REV_MB; 299 mb->mb_setno = sp->setno; 300 inittime = meta_get_lb_inittime(sp, &ep); 301 mb->mb_setcreatetime = inittime; 302 303 if (meta_gettimeofday(&tp) != -1) 304 mb->mb_timestamp = tp; 305 306 /* 307 * We try to save the disks device ID into the remaining bytes in 308 * the master block. This allows the disk image to be self-identifying 309 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 310 * when we try to import these disks on the remote copied image. 311 * If we cannot save the disks device ID onto the master block that is 312 * ok. The disk is just not self-identifying and won't be importable 313 * in the remote copy scenario. 314 */ 315 if (devid_get(fd, &devid) == 0) { 316 int len; 317 318 len = devid_sizeof(devid); 319 if (len <= DEV_BSIZE - sizeof (*mb)) { 320 /* there is enough space to store the devid */ 321 mb->mb_devid_magic = MDDB_MAGIC_DE; 322 mb->mb_devid_len = len; 323 (void) memcpy(mb->mb_devid, (char *)devid, len); 324 } 325 devid_free(devid); 326 } 327 328 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 329 (crc_skip_t *)NULL); 330 331 /* 332 * If any of these operations fail, we need to inform the 333 * user that the disk won't be self identifying. When support 334 * for importing remotely replicated disksets is added, we 335 * want to add the error messages here. 336 */ 337 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 338 goto out; 339 340 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) 341 goto out; 342 343 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 344 goto out; 345 346 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) 347 goto out; 348 349 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 350 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) 351 goto out; 352 353 out: 354 Free(mb); 355 } 356 357 static int 358 buildconf(mdsetname_t *sp, md_error_t *ep) 359 { 360 md_replicalist_t *rlp = NULL; 361 md_replicalist_t *rl; 362 FILE *cfp = NULL; 363 FILE *mfp = NULL; 364 struct stat sbuf; 365 int rval = 0; 366 int in_miniroot = 0; 367 char line[MDDB_BOOTLIST_MAX_LEN]; 368 char *tname = NULL; 369 370 /* get list of local replicas */ 371 if (! metaislocalset(sp)) 372 return (0); 373 374 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 375 return (-1); 376 377 /* open tempfile, copy permissions of original file */ 378 if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) { 379 /* 380 * On the miniroot tmp files must be created in /var/tmp. 381 * If we get a EROFS error, we assume that we are in the 382 * miniroot. 383 */ 384 if (errno != EROFS) 385 goto error; 386 in_miniroot = 1; 387 errno = 0; 388 tname = tempnam("/var/tmp", "slvm_"); 389 if (tname == NULL && errno == EROFS) { 390 /* 391 * If we are booted on a read-only root because 392 * of mddb quorum problems we don't want to emit 393 * any scary error messages. 394 */ 395 errno = 0; 396 goto out; 397 } 398 399 /* open tempfile, copy permissions of original file */ 400 if ((cfp = fopen(tname, "w+")) == NULL) 401 goto error; 402 } 403 if (stat(META_DBCONF, &sbuf) == 0) { 404 if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0) 405 goto error; 406 if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0) 407 goto error; 408 } 409 410 /* print header */ 411 if (fprintf(cfp, "#metadevice database location file ") == EOF) 412 goto error; 413 if (fprintf(cfp, "do not hand edit\n") < 0) 414 goto error; 415 if (fprintf(cfp, 416 "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0) 417 goto error; 418 419 /* dump replicas */ 420 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 421 md_replica_t *r = rl->rl_repp; 422 int checksum = 42; 423 int i; 424 char *devidp; 425 minor_t min; 426 427 devidp = devid_str_encode(r->r_devid, r->r_minor_name); 428 /* If devid code can't encode devidp - skip entry */ 429 if (devidp == NULL) { 430 continue; 431 } 432 433 /* compute checksum */ 434 for (i = 0; ((r->r_driver_name[i] != '\0') && 435 (i < sizeof (r->r_driver_name))); i++) { 436 checksum -= r->r_driver_name[i]; 437 } 438 min = meta_getminor(r->r_namep->dev); 439 checksum -= min; 440 checksum -= r->r_blkno; 441 442 for (i = 0; i < strlen(devidp); i++) { 443 checksum -= devidp[i]; 444 } 445 /* print info */ 446 if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n", 447 r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) { 448 goto error; 449 } 450 451 devid_str_free(devidp); 452 } 453 454 /* close and rename to real file */ 455 if (fflush(cfp) != 0) 456 goto error; 457 if (fsync(fileno(cfp)) != 0) 458 goto error; 459 if (fclose(cfp) != 0) { 460 cfp = NULL; 461 goto error; 462 } 463 cfp = NULL; 464 465 /* 466 * Renames don't work in the miniroot since tmpfiles are 467 * created in /var/tmp. Hence we copy the data out. 468 */ 469 470 if (! in_miniroot) { 471 if (rename(META_DBCONFTMP, META_DBCONF) != 0) 472 goto error; 473 } else { 474 if ((cfp = fopen(tname, "r")) == NULL) 475 goto error; 476 if ((mfp = fopen(META_DBCONF, "w+")) == NULL) 477 goto error; 478 while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) { 479 if (fputs(line, mfp) == NULL) 480 goto error; 481 } 482 (void) fclose(cfp); 483 cfp = NULL; 484 if (fflush(mfp) != 0) 485 goto error; 486 if (fsync(fileno(mfp)) != 0) 487 goto error; 488 if (fclose(mfp) != 0) { 489 mfp = NULL; 490 goto error; 491 } 492 /* delete the tempfile */ 493 (void) unlink(tname); 494 } 495 /* success */ 496 rval = 0; 497 goto out; 498 499 /* tempfile error */ 500 error: 501 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 502 mdsyserror(ep, errno, META_DBCONFTMP); 503 504 505 /* cleanup, return success */ 506 out: 507 if (rlp != NULL) 508 metafreereplicalist(rlp); 509 if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) { 510 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 511 mdsyserror(ep, errno, META_DBCONFTMP); 512 } 513 free(tname); 514 return (rval); 515 } 516 517 /* 518 * check replica for dev 519 */ 520 static int 521 in_replica( 522 mdsetname_t *sp, 523 md_replica_t *rp, 524 mdname_t *np, 525 diskaddr_t slblk, 526 diskaddr_t nblks, 527 md_error_t *ep 528 ) 529 { 530 mdname_t *repnp = rp->r_namep; 531 diskaddr_t rep_sblk = rp->r_blkno; 532 diskaddr_t rep_nblks = rp->r_nblk; 533 534 /* should be in the same set */ 535 assert(sp != NULL); 536 537 /* if error in master block, assume whole partition */ 538 if ((rep_sblk == MD_DISKADDR_ERROR) || 539 (rep_nblks == MD_DISKADDR_ERROR)) { 540 rep_sblk = 0; 541 rep_nblks = MD_DISKADDR_ERROR; 542 } 543 544 /* check overlap */ 545 if (meta_check_overlap( 546 MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) { 547 return (-1); 548 } 549 550 /* return success */ 551 return (0); 552 } 553 554 /* 555 * check to see if we're in a replica 556 */ 557 int 558 meta_check_inreplica( 559 mdsetname_t *sp, 560 mdname_t *np, 561 diskaddr_t slblk, 562 diskaddr_t nblks, 563 md_error_t *ep 564 ) 565 { 566 md_replicalist_t *rlp = NULL; 567 md_replicalist_t *rl; 568 int rval = 0; 569 570 /* should have a set */ 571 assert(sp != NULL); 572 573 /* for each replica */ 574 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 575 return (-1); 576 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 577 md_replica_t *rp = rl->rl_repp; 578 579 /* check replica */ 580 if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) { 581 rval = -1; 582 break; 583 } 584 } 585 586 /* cleanup, return success */ 587 metafreereplicalist(rlp); 588 return (rval); 589 } 590 591 /* 592 * check replica 593 */ 594 int 595 meta_check_replica( 596 mdsetname_t *sp, /* set to check against */ 597 mdname_t *np, /* component to check against */ 598 mdchkopts_t options, /* option flags */ 599 diskaddr_t slblk, /* start logical block */ 600 diskaddr_t nblks, /* number of blocks (-1,rest of them) */ 601 md_error_t *ep /* error packet */ 602 ) 603 { 604 mdchkopts_t chkoptions = MDCHK_ALLOW_REPSLICE; 605 606 /* make sure we have a disk */ 607 if (metachkcomp(np, ep) != 0) 608 return (-1); 609 610 /* check to ensure that it is not already in use */ 611 if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { 612 return (-1); 613 } 614 615 if (options & MDCHK_ALLOW_NODBS) 616 return (0); 617 618 if (options & MDCHK_DRVINSET) 619 return (0); 620 621 /* make sure it is in the set */ 622 if (meta_check_inset(sp, np, ep) != 0) 623 return (-1); 624 625 /* make sure its not in a metadevice */ 626 if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0) 627 return (-1); 628 629 /* return success */ 630 return (0); 631 } 632 633 static int 634 update_dbinfo_on_drives( 635 mdsetname_t *sp, 636 md_drive_desc *dd, 637 int set_locked, 638 int force, 639 md_error_t *ep 640 ) 641 { 642 md_set_desc *sd; 643 int i; 644 md_setkey_t *cl_sk; 645 int rval = 0; 646 md_mnnode_desc *nd; 647 648 if ((sd = metaget_setdesc(sp, ep)) == NULL) 649 return (-1); 650 651 if (! set_locked) { 652 if (MD_MNSET_DESC(sd)) { 653 md_error_t xep = mdnullerror; 654 sigset_t sigs; 655 /* Make sure we are blocking all signals */ 656 if (procsigs(TRUE, &sigs, &xep) < 0) 657 mdclrerror(&xep); 658 659 nd = sd->sd_nodelist; 660 while (nd) { 661 if (force && strcmp(nd->nd_nodename, 662 mynode()) != 0) { 663 nd = nd->nd_next; 664 continue; 665 } 666 667 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 668 nd = nd->nd_next; 669 continue; 670 } 671 672 if (clnt_lock_set(nd->nd_nodename, sp, ep)) 673 return (-1); 674 nd = nd->nd_next; 675 } 676 } else { 677 for (i = 0; i < MD_MAXSIDES; i++) { 678 /* Skip empty slots */ 679 if (sd->sd_nodes[i][0] == '\0') 680 continue; 681 682 if (force && strcmp(sd->sd_nodes[i], 683 mynode()) != 0) 684 continue; 685 686 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) 687 return (-1); 688 } 689 } 690 } 691 692 if (MD_MNSET_DESC(sd)) { 693 nd = sd->sd_nodelist; 694 while (nd) { 695 if (force && strcmp(nd->nd_nodename, mynode()) != 0) { 696 nd = nd->nd_next; 697 continue; 698 } 699 700 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 701 nd = nd->nd_next; 702 continue; 703 } 704 705 if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep) 706 == -1) { 707 rval = -1; 708 break; 709 } 710 nd = nd->nd_next; 711 } 712 } else { 713 for (i = 0; i < MD_MAXSIDES; i++) { 714 /* Skip empty slots */ 715 if (sd->sd_nodes[i][0] == '\0') 716 continue; 717 718 if (force && strcmp(sd->sd_nodes[i], mynode()) != 0) 719 continue; 720 721 if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep) 722 == -1) { 723 rval = -1; 724 break; 725 } 726 } 727 } 728 729 if (! set_locked) { 730 cl_sk = cl_get_setkey(sp->setno, sp->setname); 731 if (MD_MNSET_DESC(sd)) { 732 nd = sd->sd_nodelist; 733 while (nd) { 734 if (force && 735 strcmp(nd->nd_nodename, mynode()) != 0) { 736 nd = nd->nd_next; 737 continue; 738 } 739 740 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 741 nd = nd->nd_next; 742 continue; 743 } 744 745 if (clnt_unlock_set(nd->nd_nodename, cl_sk, 746 ep)) { 747 rval = -1; 748 break; 749 } 750 nd = nd->nd_next; 751 } 752 } else { 753 for (i = 0; i < MD_MAXSIDES; i++) { 754 /* Skip empty slots */ 755 if (sd->sd_nodes[i][0] == '\0') 756 continue; 757 758 if (force && 759 strcmp(sd->sd_nodes[i], mynode()) != 0) 760 continue; 761 762 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, 763 ep)) { 764 rval = -1; 765 break; 766 } 767 } 768 769 } 770 cl_set_setkey(NULL); 771 } 772 773 return (rval); 774 } 775 776 int 777 meta_db_addsidenms( 778 mdsetname_t *sp, 779 mdname_t *np, 780 daddr_t blkno, 781 int bcast, 782 md_error_t *ep 783 ) 784 { 785 side_t sideno; 786 char *bname = NULL; 787 char *dname = NULL; 788 minor_t mnum; 789 mddb_config_t c; 790 int done; 791 int rval = 0; 792 md_set_desc *sd; 793 794 sideno = MD_SIDEWILD; 795 /*CONSTCOND*/ 796 while (1) { 797 if (bname != NULL) { 798 Free(bname); 799 bname = NULL; 800 } 801 if (dname != NULL) { 802 Free(dname); 803 dname = NULL; 804 } 805 if ((done = meta_getnextside_devinfo(sp, np->bname, 806 &sideno, &bname, &dname, &mnum, ep)) == -1) { 807 rval = -1; 808 break; 809 } 810 811 if (done == 0) 812 break; 813 814 if (! metaislocalset(sp)) { 815 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 816 rval = -1; 817 break; 818 } 819 } 820 821 /* 822 * Send addsidenms to all nodes using rpc.mdcommd if 823 * sidename is being added to MN diskset. 824 * 825 * It's ok to broadcast this call to other nodes. 826 * 827 * Note: The broadcast to other nodes isn't needed during 828 * the addition of the first mddbs to the set since the 829 * other nodes haven't been joined to the set yet. All 830 * nodes in a MN diskset are (implicitly) joined to the set 831 * on the addition of the first mddb. 832 */ 833 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 834 (bcast == DB_ADDSIDENMS_BCAST)) { 835 md_mn_result_t *resultp = NULL; 836 md_mn_msg_meta_db_newside_t db_ns; 837 int send_rval; 838 839 db_ns.msg_l_dev = np->dev; 840 db_ns.msg_sideno = sideno; 841 db_ns.msg_blkno = blkno; 842 (void) strncpy(db_ns.msg_dname, dname, 843 sizeof (db_ns.msg_dname)); 844 (void) splitname(np->bname, &db_ns.msg_splitname); 845 db_ns.msg_mnum = mnum; 846 847 /* Set devid to NULL until devids are supported */ 848 db_ns.msg_devid[0] = NULL; 849 850 /* 851 * If reconfig cycle has been started, this node is 852 * stuck in in the return step until this command has 853 * completed. If mdcommd is suspended, ask 854 * send_message to fail (instead of retrying) 855 * so that metaset can finish allowing the reconfig 856 * cycle to proceed. 857 */ 858 send_rval = mdmn_send_message(sp->setno, 859 MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND | 860 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns, 861 sizeof (md_mn_msg_meta_db_newside_t), 862 &resultp, ep); 863 if (send_rval != 0) { 864 rval = -1; 865 if (resultp == NULL) 866 (void) mddserror(ep, 867 MDE_DS_COMMD_SEND_FAIL, 868 sp->setno, NULL, NULL, 869 sp->setname); 870 else { 871 (void) mdstealerror(ep, 872 &(resultp->mmr_ep)); 873 if (mdisok(ep)) { 874 (void) mddserror(ep, 875 MDE_DS_COMMD_SEND_FAIL, 876 sp->setno, NULL, NULL, 877 sp->setname); 878 } 879 free_result(resultp); 880 } 881 break; 882 } 883 if (resultp) 884 free_result(resultp); 885 } else { 886 /* 887 * Let this side's device name, minor # and driver name 888 * be known to the database replica. 889 */ 890 (void) memset(&c, 0, sizeof (c)); 891 892 /* Fill in device/replica info */ 893 c.c_locator.l_dev = meta_cmpldev(np->dev); 894 c.c_locator.l_blkno = blkno; 895 (void) strncpy(c.c_locator.l_driver, dname, 896 sizeof (c.c_locator.l_driver)); 897 (void) splitname(bname, &c.c_devname); 898 c.c_locator.l_mnum = mnum; 899 900 /* Fill in setno, setname, and sideno */ 901 c.c_setno = sp->setno; 902 (void) strncpy(c.c_setname, sp->setname, 903 sizeof (c.c_setname)); 904 c.c_sideno = sideno; 905 906 /* 907 * Don't need device id information from this ioctl 908 * Kernel determines device id from dev_t, which 909 * is just what this code would do. 910 */ 911 c.c_locator.l_devid = (uint64_t)0; 912 c.c_locator.l_devid_flags = 0; 913 914 if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) { 915 rval = mdstealerror(ep, &c.c_mde); 916 break; 917 } 918 } 919 } 920 921 /* cleanup, return success */ 922 if (bname != NULL) { 923 Free(bname); 924 bname = NULL; 925 } 926 if (dname != NULL) { 927 Free(dname); 928 dname = NULL; 929 } 930 return (rval); 931 } 932 933 934 int 935 meta_db_delsidenm( 936 mdsetname_t *sp, 937 side_t sideno, 938 mdname_t *np, 939 daddr_t blkno, 940 md_error_t *ep 941 ) 942 { 943 mddb_config_t c; 944 md_set_desc *sd; 945 946 if (! metaislocalset(sp)) { 947 if ((sd = metaget_setdesc(sp, ep)) == NULL) 948 return (-1); 949 } 950 /* Use rpc.mdcommd to delete mddb side from all nodes */ 951 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 952 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 953 md_mn_result_t *resultp = NULL; 954 md_mn_msg_meta_db_delside_t db_ds; 955 int send_rval; 956 957 db_ds.msg_l_dev = np->dev; 958 db_ds.msg_blkno = blkno; 959 db_ds.msg_sideno = sideno; 960 961 /* Set devid to NULL until devids are supported */ 962 db_ds.msg_devid[0] = NULL; 963 964 /* 965 * If reconfig cycle has been started, this node is 966 * stuck in in the return step until this command has 967 * completed. If mdcommd is suspended, ask 968 * send_message to fail (instead of retrying) 969 * so that metaset can finish allowing the reconfig 970 * cycle to proceed. 971 */ 972 send_rval = mdmn_send_message(sp->setno, 973 MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | 974 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds, 975 sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep); 976 if (send_rval != 0) { 977 if (resultp == NULL) 978 (void) mddserror(ep, 979 MDE_DS_COMMD_SEND_FAIL, 980 sp->setno, NULL, NULL, 981 sp->setname); 982 else { 983 (void) mdstealerror(ep, &(resultp->mmr_ep)); 984 if (mdisok(ep)) { 985 (void) mddserror(ep, 986 MDE_DS_COMMD_SEND_FAIL, 987 sp->setno, NULL, NULL, 988 sp->setname); 989 } 990 free_result(resultp); 991 } 992 return (-1); 993 } 994 if (resultp) 995 free_result(resultp); 996 997 } else { 998 /* 999 * Let this side's device name, minor # and driver name 1000 * be known to the database replica. 1001 */ 1002 (void) memset(&c, 0, sizeof (c)); 1003 1004 /* Fill in device/replica info */ 1005 c.c_locator.l_dev = meta_cmpldev(np->dev); 1006 c.c_locator.l_blkno = blkno; 1007 1008 /* Fill in setno, setname, and sideno */ 1009 c.c_setno = sp->setno; 1010 (void) strcpy(c.c_setname, sp->setname); 1011 c.c_sideno = sideno; 1012 1013 /* 1014 * Don't need device id information from this ioctl 1015 * Kernel determines device id from dev_t, which 1016 * is just what this code would do. 1017 */ 1018 c.c_locator.l_devid = (uint64_t)0; 1019 c.c_locator.l_devid_flags = 0; 1020 1021 if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0) 1022 return (mdstealerror(ep, &c.c_mde)); 1023 } 1024 return (0); 1025 } 1026 1027 1028 static int 1029 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep) 1030 { 1031 mdnamelist_t *dnp1, *dnp2; 1032 1033 for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) { 1034 for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) { 1035 if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0) 1036 return (mderror(ep, MDE_DUPDRIVE, 1037 dnp1->namep->cname)); 1038 } 1039 } 1040 return (0); 1041 } 1042 1043 1044 /* 1045 * Return 1 if files are different, else return 0 1046 */ 1047 static int 1048 filediff(char *tsname, char *sname) 1049 { 1050 int ret = 1, fd; 1051 size_t tsz, sz; 1052 struct stat sbuf; 1053 char *tbuf, *buf; 1054 1055 if (stat(tsname, &sbuf) != 0) 1056 return (1); 1057 tsz = sbuf.st_size; 1058 if (stat(sname, &sbuf) != 0) 1059 return (1); 1060 sz = sbuf.st_size; 1061 if (tsz != sz) 1062 return (1); 1063 1064 /* allocate memory and read both files into buffer */ 1065 tbuf = malloc(tsz); 1066 buf = malloc(sz); 1067 if (tbuf == NULL || buf == NULL) 1068 goto out; 1069 1070 fd = open(tsname, O_RDONLY); 1071 if (fd == -1) 1072 goto out; 1073 sz = read(fd, tbuf, tsz); 1074 (void) close(fd); 1075 if (sz != tsz) 1076 goto out; 1077 1078 fd = open(sname, O_RDONLY); 1079 if (fd == -1) 1080 goto out; 1081 sz = read(fd, buf, tsz); 1082 (void) close(fd); 1083 if (sz != tsz) 1084 goto out; 1085 1086 /* compare content */ 1087 ret = bcmp(tbuf, buf, tsz); 1088 out: 1089 if (tbuf) 1090 free(tbuf); 1091 if (buf) 1092 free(buf); 1093 return (ret); 1094 } 1095 1096 /* 1097 * patch md.conf file with mddb locations 1098 */ 1099 int 1100 meta_db_patch( 1101 char *sname, /* system file name */ 1102 char *cname, /* mddb.cf file name */ 1103 int patch, /* patching locally */ 1104 md_error_t *ep 1105 ) 1106 { 1107 char *tsname = NULL; 1108 char line[MDDB_BOOTLIST_MAX_LEN]; 1109 FILE *tsfp = NULL; 1110 FILE *mfp = NULL; 1111 int rval = -1; 1112 1113 /* check names */ 1114 if (sname == NULL) { 1115 if (patch) 1116 sname = "md.conf"; 1117 else 1118 sname = "/kernel/drv/md.conf"; 1119 } 1120 if (cname == NULL) 1121 cname = META_DBCONF; 1122 1123 /* 1124 * edit file 1125 */ 1126 if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) { 1127 if (mdissyserror(ep, EROFS)) { 1128 /* 1129 * If we are booted on a read-only root because 1130 * of mddb quorum problems we don't want to emit 1131 * any scary error messages. 1132 */ 1133 mdclrerror(ep); 1134 rval = 0; 1135 } 1136 goto out; 1137 } 1138 1139 if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0, 1140 ep) != 0) 1141 goto out; 1142 1143 /* if file content is identical, skip rename */ 1144 if (filediff(tsname, sname) == 0) { 1145 rval = 0; 1146 goto out; 1147 } 1148 1149 if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) || 1150 (fclose(tsfp) != 0)) { 1151 (void) mdsyserror(ep, errno, tsname); 1152 goto out; 1153 } 1154 1155 tsfp = NULL; 1156 1157 /* 1158 * rename file. If we get a Cross Device error then it 1159 * is because we are in the miniroot. 1160 */ 1161 if (rename(tsname, sname) != 0 && errno != EXDEV) { 1162 (void) mdsyserror(ep, errno, sname); 1163 goto out; 1164 } 1165 1166 if (errno == EXDEV) { 1167 if ((tsfp = fopen(tsname, "r")) == NULL) 1168 goto out; 1169 if ((mfp = fopen(sname, "w+")) == NULL) 1170 goto out; 1171 while (fgets(line, sizeof (line), tsfp) != NULL) { 1172 if (fputs(line, mfp) == NULL) 1173 goto out; 1174 } 1175 (void) fclose(tsfp); 1176 tsfp = NULL; 1177 if (fflush(mfp) != 0) 1178 goto out; 1179 if (fsync(fileno(mfp)) != 0) 1180 goto out; 1181 if (fclose(mfp) != 0) { 1182 mfp = NULL; 1183 goto out; 1184 } 1185 } 1186 1187 Free(tsname); 1188 tsname = NULL; 1189 rval = 0; 1190 1191 /* cleanup, return error */ 1192 out: 1193 if (tsfp != NULL) 1194 (void) fclose(tsfp); 1195 if (tsname != NULL) { 1196 (void) unlink(tsname); 1197 Free(tsname); 1198 } 1199 return (rval); 1200 } 1201 1202 /* 1203 * Add replicas to set. This happens as a result of: 1204 * - metadb [-s set_name] -a 1205 * - metaset -s set_name -a disk 1206 * - metaset -s set_name -d disk (causes a rebalance of mddbs) 1207 * - metaset -s set_name -b 1208 * 1209 * For a local set, this routine is run on the local set host. 1210 * 1211 * For a traditional diskset, this routine is run on the node that 1212 * is running the metaset command. 1213 * 1214 * For a multinode diskset, this routine is run by the node that is 1215 * running the metaset command. If this is the first mddb added to 1216 * the MN diskset, then no communication is made to other nodes via commd 1217 * since the other nodes will be in-sync with respect to the mddbs when 1218 * those other nodes join the set and snarf in the newly created mddb. 1219 * If this is not the first mddb added to the MN diskset, then this 1220 * attach command is sent to all of the nodes using commd. This keeps 1221 * the nodes in-sync. 1222 */ 1223 int 1224 meta_db_attach( 1225 mdsetname_t *sp, 1226 mdnamelist_t *db_nlp, 1227 mdchkopts_t options, 1228 md_timeval32_t *timeval, 1229 int dbcnt, 1230 int dbsize, 1231 char *sysfilename, 1232 md_error_t *ep 1233 ) 1234 { 1235 struct mddb_config c; 1236 mdnamelist_t *nlp; 1237 mdname_t *np; 1238 md_drive_desc *dd = NULL; 1239 md_drive_desc *p; 1240 int i; 1241 int fd; 1242 side_t sideno; 1243 daddr_t blkno; 1244 int replicacount = 0; 1245 int start_svmdaemons = 0; 1246 int rval = 0; 1247 md_error_t status = mdnullerror; 1248 md_set_desc *sd; 1249 int stale_bool = FALSE; 1250 int flags; 1251 int firstmddb = 1; 1252 md_timeval32_t inittime = {0, 0}; 1253 1254 /* 1255 * Error if we don't get some work to do. 1256 */ 1257 if (db_nlp == NULL) 1258 return (mdsyserror(ep, EINVAL, NULL)); 1259 1260 if (mdnamesareunique(db_nlp, ep) != 0) 1261 return (-1); 1262 (void) memset(&c, 0, sizeof (c)); 1263 c.c_id = 0; 1264 c.c_setno = sp->setno; 1265 1266 /* Don't need device id information from this ioctl */ 1267 c.c_locator.l_devid = (uint64_t)0; 1268 c.c_locator.l_devid_flags = 0; 1269 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 1270 if (metaislocalset(sp)) { 1271 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) 1272 mdclrerror(&c.c_mde); 1273 else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) || 1274 (! (options & MDCHK_ALLOW_NODBS))) 1275 return (mdstealerror(ep, &c.c_mde)); 1276 } else { 1277 if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER)) 1278 return (mdstealerror(ep, &c.c_mde)); 1279 } 1280 mdclrerror(&c.c_mde); 1281 } 1282 /* 1283 * Is current set STALE? 1284 */ 1285 if (c.c_flags & MDDB_C_STALE) { 1286 stale_bool = TRUE; 1287 } 1288 1289 assert(db_nlp != NULL); 1290 1291 /* if these are the first replicas then the SVM daemons need to run */ 1292 if (c.c_dbcnt == 0) 1293 start_svmdaemons = 1; 1294 1295 /* 1296 * check to see if we will go over the total possible number 1297 * of data bases 1298 */ 1299 nlp = db_nlp; 1300 while (nlp) { 1301 replicacount += dbcnt; 1302 nlp = nlp->next; 1303 } 1304 1305 if ((replicacount + c.c_dbcnt) > c.c_dbmax) 1306 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 1307 sp->setno, c.c_dbcnt + replicacount, NULL)); 1308 1309 /* 1310 * go through and check to make sure all locations specified 1311 * are legal also pick out driver name; 1312 */ 1313 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1314 diskaddr_t devsize; 1315 1316 np = nlp->namep; 1317 1318 if (! metaislocalset(sp)) { 1319 uint_t partno; 1320 uint_t rep_partno; 1321 mddrivename_t *dnp = np->drivenamep; 1322 1323 /* 1324 * make sure that non-local database replicas 1325 * are always on the replica slice. 1326 */ 1327 if (meta_replicaslice(dnp, 1328 &rep_partno, ep) != 0) 1329 return (-1); 1330 if (metagetvtoc(np, FALSE, &partno, ep) == NULL) 1331 return (-1); 1332 if (partno != rep_partno) 1333 return (mddeverror(ep, MDE_REPCOMP_ONLY, 1334 np->dev, sp->setname)); 1335 } 1336 1337 if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize), 1338 ep)) { 1339 return (-1); 1340 } 1341 1342 if ((devsize = metagetsize(np, ep)) == -1) 1343 return (-1); 1344 1345 if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16)) 1346 return (mdmddberror(ep, MDE_REPLICA_TOOSMALL, 1347 meta_getminor(np->dev), sp->setno, devsize, 1348 np->cname)); 1349 } 1350 1351 /* 1352 * If first disk in set we don't have lb_inittime yet for use as 1353 * mb_setcreatetime so don't go looking for it. WE'll come back 1354 * later and update after the locator block has been created. 1355 * If this isn't the first disk in the set, we have a locator 1356 * block and thus we have lb_inittime. Set mb_setcreatetime to 1357 * lb_inittime. 1358 */ 1359 if (! metaislocalset(sp)) { 1360 if (c.c_dbcnt != 0) { 1361 firstmddb = 0; 1362 inittime = meta_get_lb_inittime(sp, ep); 1363 } 1364 } 1365 1366 /* 1367 * go through and write all master blocks 1368 */ 1369 1370 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1371 np = nlp->namep; 1372 1373 if ((fd = open(np->rname, O_RDWR)) < 0) 1374 return (mdsyserror(ep, errno, np->rname)); 1375 1376 for (i = 0; i < dbcnt; i++) { 1377 if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize, 1378 inittime, ep)) { 1379 (void) close(fd); 1380 return (-1); 1381 } 1382 } 1383 (void) close(fd); 1384 } 1385 1386 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1387 return (-1); 1388 1389 if (! metaislocalset(sp)) { 1390 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1391 if (! mdisok(ep)) 1392 return (-1); 1393 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1394 return (-1); 1395 1396 } 1397 1398 /* 1399 * go through and tell kernel to add them 1400 */ 1401 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1402 mdcinfo_t *cinfo; 1403 1404 np = nlp->namep; 1405 1406 if ((cinfo = metagetcinfo(np, ep)) == NULL) { 1407 rval = -1; 1408 goto out; 1409 } 1410 1411 /* 1412 * If mddb is being added to MN diskset and there already 1413 * exists a valid mddb in the set (which equates to this 1414 * node being an owner of the set) then use rpc.mdcommd 1415 * mechanism to add mddb(s) so that all nodes stay in sync. 1416 * If set is stale, don't log the message since rpc.mdcommd 1417 * can't write the message to the mddb. 1418 * 1419 * Otherwise, just add mddb to this node. 1420 */ 1421 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1422 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1423 md_mn_result_t *resultp = NULL; 1424 md_mn_msg_meta_db_attach_t attach; 1425 int send_rval; 1426 1427 /* 1428 * In a scenario where new replicas had been added on 1429 * the master, and then all of the old replicas failed 1430 * before the slaves had knowledge of the new replicas, 1431 * the slaves are unable to re-parse in the mddb 1432 * from the new replicas since the slaves have no 1433 * knowledge of the new replicas. The following 1434 * algorithm solves this problem: 1435 * - META_DB_ATTACH message generates submsgs 1436 * - BLOCK parse (master) 1437 * - MDDB_ATTACH new replicas 1438 * - UNBLOCK parse (master) causing parse 1439 * information to be sent from master 1440 * to slaves at a higher class than the 1441 * unblock so the parse message will 1442 * reach slaves before unblock message. 1443 */ 1444 attach.msg_l_dev = np->dev; 1445 attach.msg_cnt = dbcnt; 1446 attach.msg_dbsize = dbsize; 1447 (void) strncpy(attach.msg_dname, cinfo->dname, 1448 sizeof (attach.msg_dname)); 1449 (void) splitname(np->bname, &attach.msg_splitname); 1450 attach.msg_options = options; 1451 1452 /* Set devid to NULL until devids are supported */ 1453 attach.msg_devid[0] = NULL; 1454 1455 /* 1456 * If reconfig cycle has been started, this node is 1457 * stuck in in the return step until this command has 1458 * completed. If mdcommd is suspended, ask 1459 * send_message to fail (instead of retrying) 1460 * so that metaset can finish allowing the reconfig 1461 * cycle to proceed. 1462 */ 1463 flags = MD_MSGF_FAIL_ON_SUSPEND; 1464 if (stale_bool == TRUE) 1465 flags |= MD_MSGF_NO_LOG; 1466 send_rval = mdmn_send_message(sp->setno, 1467 MD_MN_MSG_META_DB_ATTACH, 1468 flags, (char *)&attach, 1469 sizeof (md_mn_msg_meta_db_attach_t), 1470 &resultp, ep); 1471 if (send_rval != 0) { 1472 rval = -1; 1473 if (resultp == NULL) 1474 (void) mddserror(ep, 1475 MDE_DS_COMMD_SEND_FAIL, 1476 sp->setno, NULL, NULL, 1477 sp->setname); 1478 else { 1479 (void) mdstealerror(ep, 1480 &(resultp->mmr_ep)); 1481 if (mdisok(ep)) { 1482 (void) mddserror(ep, 1483 MDE_DS_COMMD_SEND_FAIL, 1484 sp->setno, NULL, NULL, 1485 sp->setname); 1486 } 1487 free_result(resultp); 1488 } 1489 goto out; 1490 } 1491 if (resultp) 1492 free_result(resultp); 1493 } else { 1494 /* Adding mddb(s) to just this node */ 1495 for (i = 0; i < dbcnt; i++) { 1496 (void) memset(&c, 0, sizeof (c)); 1497 /* Fill in device/replica info */ 1498 c.c_locator.l_dev = meta_cmpldev(np->dev); 1499 c.c_locator.l_blkno = i * dbsize + 16; 1500 blkno = c.c_locator.l_blkno; 1501 (void) strncpy(c.c_locator.l_driver, cinfo->dname, 1502 sizeof (c.c_locator.l_driver)); 1503 (void) splitname(np->bname, &c.c_devname); 1504 c.c_locator.l_mnum = meta_getminor(np->dev); 1505 1506 /* Fill in setno, setname, and sideno */ 1507 c.c_setno = sp->setno; 1508 if (! metaislocalset(sp)) { 1509 if (MD_MNSET_DESC(sd)) { 1510 c.c_multi_node = 1; 1511 } 1512 } 1513 (void) strcpy(c.c_setname, sp->setname); 1514 c.c_sideno = sideno; 1515 1516 /* 1517 * Don't need device id information from this ioctl 1518 * Kernel determines device id from dev_t, which 1519 * is just what this code would do. 1520 */ 1521 c.c_locator.l_devid = (uint64_t)0; 1522 c.c_locator.l_devid_flags = 0; 1523 1524 if (timeval != NULL) 1525 c.c_timestamp = *timeval; 1526 1527 if (setup_med_cfg(sp, &c, (options & MDCHK_SET_FORCE), 1528 ep)) { 1529 rval = -1; 1530 goto out; 1531 } 1532 1533 if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL) != 0) { 1534 rval = mdstealerror(ep, &c.c_mde); 1535 goto out; 1536 } 1537 /* 1538 * This is either a traditional diskset OR this 1539 * is the first replica added to a MN diskset. 1540 * In either case, set broadcast to NO_BCAST so 1541 * that message won't go through rpc.mdcommd. 1542 * If this is a traditional diskset, the bcast 1543 * flag is ignored since traditional disksets 1544 * don't use the rpc.mdcommd. 1545 */ 1546 if (meta_db_addsidenms(sp, np, blkno, 1547 DB_ADDSIDENMS_NO_BCAST, ep)) 1548 goto out; 1549 } 1550 } 1551 if (! metaislocalset(sp)) { 1552 /* update the dbcnt and size in dd */ 1553 for (p = dd; p != NULL; p = p->dd_next) 1554 if (p->dd_dnp == np->drivenamep) { 1555 p->dd_dbcnt = dbcnt; 1556 p->dd_dbsize = dbsize; 1557 break; 1558 } 1559 } 1560 1561 /* 1562 * If this was the first addition of disks to the 1563 * diskset you now need to update the mb_setcreatetime 1564 * which needed lb_inittime which wasn't there until now. 1565 */ 1566 if (firstmddb) { 1567 if (meta_update_mb(sp, dd, ep) != 0) { 1568 return (-1); 1569 } 1570 } 1571 (void) close(fd); 1572 } 1573 1574 out: 1575 if (metaislocalset(sp)) { 1576 1577 /* everything looks fine. Start mdmonitord */ 1578 if (rval == 0 && start_svmdaemons == 1) { 1579 if (meta_smf_enable(META_SMF_CORE, &status) == -1) { 1580 mde_perror(&status, ""); 1581 mdclrerror(&status); 1582 } 1583 } 1584 1585 if (buildconf(sp, &status)) { 1586 /* Don't mask any previous errors */ 1587 if (rval == 0) 1588 rval = mdstealerror(ep, &status); 1589 return (rval); 1590 } 1591 1592 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 1593 /* Don't mask any previous errors */ 1594 if (rval == 0) 1595 rval = mdstealerror(ep, &status); 1596 } 1597 } else { 1598 if (update_dbinfo_on_drives(sp, dd, 1599 (options & MDCHK_SET_LOCKED), 1600 (options & MDCHK_SET_FORCE), 1601 &status)) { 1602 /* Don't mask any previous errors */ 1603 if (rval == 0) 1604 rval = mdstealerror(ep, &status); 1605 else 1606 mdclrerror(&status); 1607 } 1608 metafreedrivedesc(&dd); 1609 } 1610 /* 1611 * For MN disksets that already had already had nodes joined 1612 * before the attach of this mddb(s), the name invalidation is 1613 * done by the commd handler routine. Otherwise, if this 1614 * is the first attach of a MN diskset mddb, the invalidation 1615 * must be done here since the first attach cannot be sent 1616 * via the commd since there are no nodes joined to the set yet. 1617 */ 1618 if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) || 1619 (MD_MNSET_DESC(sd) && 1620 (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) { 1621 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 1622 meta_invalidate_name(nlp->namep); 1623 } 1624 } 1625 return (rval); 1626 } 1627 1628 /* 1629 * deletelist_length 1630 * 1631 * return the number of slices that have been specified for deletion 1632 * on the metadb command line. This does not calculate the number 1633 * of replicas because there may be multiple replicas per slice. 1634 */ 1635 static int 1636 deletelist_length(mdnamelist_t *db_nlp) 1637 { 1638 1639 mdnamelist_t *nlp; 1640 int list_length = 0; 1641 1642 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1643 list_length++; 1644 } 1645 1646 return (list_length); 1647 } 1648 1649 static int 1650 in_deletelist(char *devname, mdnamelist_t *db_nlp) 1651 { 1652 1653 mdnamelist_t *nlp; 1654 mdname_t *np; 1655 int index = 0; 1656 1657 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1658 np = nlp->namep; 1659 1660 if (strcmp(devname, np->bname) == 0) 1661 return (index); 1662 index++; 1663 } 1664 1665 return (-1); 1666 } 1667 1668 /* 1669 * Delete replicas from set. This happens as a result of: 1670 * - metadb [-s set_name] -d 1671 * - metaset -s set_name -a disk (causes a rebalance of mddbs) 1672 * - metaset -s set_name -d disk 1673 * - metaset -s set_name -b 1674 * 1675 * For a local set, this routine is run on the local set host. 1676 * 1677 * For a traditional diskset, this routine is run on the node that 1678 * is running the metaset command. 1679 * 1680 * For a multinode diskset, this routine is run by the node that is 1681 * running the metaset command. This detach routine is sent to all 1682 * of the joined nodes in the diskset using commd. This keeps 1683 * the nodes in-sync. 1684 */ 1685 int 1686 meta_db_detach( 1687 mdsetname_t *sp, 1688 mdnamelist_t *db_nlp, 1689 mdforceopts_t force_option, 1690 char *sysfilename, 1691 md_error_t *ep 1692 ) 1693 { 1694 struct mddb_config c; 1695 mdnamelist_t *nlp; 1696 mdname_t *np; 1697 md_drive_desc *dd = NULL; 1698 md_drive_desc *p; 1699 int replicacount; 1700 int replica_delete_count; 1701 int nr_replica_slices; 1702 int i; 1703 int stop_svmdaemons = 0; 1704 int rval = 0; 1705 int index; 1706 int valid_replicas_nottodelete = 0; 1707 int invalid_replicas_nottodelete = 0; 1708 int invalid_replicas_todelete = 0; 1709 int errored = 0; 1710 int *tag_array; 1711 int fd = -1; 1712 md_error_t status = mdnullerror; 1713 md_set_desc *sd; 1714 int stale_bool = FALSE; 1715 int flags; 1716 1717 /* 1718 * Error if we don't get some work to do. 1719 */ 1720 if (db_nlp == NULL) 1721 return (mdsyserror(ep, EINVAL, NULL)); 1722 1723 if (mdnamesareunique(db_nlp, ep) != 0) 1724 return (-1); 1725 1726 (void) memset(&c, 0, sizeof (c)); 1727 c.c_id = 0; 1728 c.c_setno = sp->setno; 1729 1730 /* Don't need device id information from this ioctl */ 1731 c.c_locator.l_devid = (uint64_t)0; 1732 c.c_locator.l_devid_flags = 0; 1733 1734 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1735 return (mdstealerror(ep, &c.c_mde)); 1736 1737 /* 1738 * Is current set STALE? 1739 */ 1740 if (c.c_flags & MDDB_C_STALE) { 1741 stale_bool = TRUE; 1742 } 1743 1744 replicacount = c.c_dbcnt; 1745 1746 assert(db_nlp != NULL); 1747 1748 /* 1749 * go through and gather how many data bases are on each 1750 * device specified. 1751 */ 1752 1753 nr_replica_slices = deletelist_length(db_nlp); 1754 tag_array = (int *)calloc(nr_replica_slices, sizeof (int)); 1755 1756 replica_delete_count = 0; 1757 for (i = 0; i < replicacount; i++) { 1758 char *devname; 1759 int found = 0; 1760 1761 c.c_id = i; 1762 1763 /* Don't need device id information from this ioctl */ 1764 c.c_locator.l_devid = (uint64_t)0; 1765 c.c_locator.l_devid_flags = 0; 1766 1767 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1768 return (mdstealerror(ep, &c.c_mde)); 1769 1770 devname = splicename(&c.c_devname); 1771 1772 if ((index = in_deletelist(devname, db_nlp)) != -1) { 1773 found = 1; 1774 tag_array[index] = 1; 1775 replica_delete_count++; 1776 } 1777 1778 errored = c.c_locator.l_flags & (MDDB_F_EREAD | 1779 MDDB_F_EWRITE | MDDB_F_TOOSMALL | 1780 MDDB_F_EFMT | MDDB_F_EDATA | 1781 MDDB_F_EMASTER); 1782 1783 /* 1784 * There are four combinations of "errored" and "found" 1785 * and they are used to find the number of 1786 * (a) valid/invalid replicas that are not in the delete 1787 * list and are available in the system. 1788 * (b) valid/invalid replicas that are to be deleted. 1789 */ 1790 1791 if (errored && !found) /* errored and !found */ 1792 invalid_replicas_nottodelete++; 1793 else if (!found) /* !errored and !found */ 1794 valid_replicas_nottodelete++; 1795 else if (errored) /* errored and found */ 1796 invalid_replicas_todelete++; 1797 /* 1798 * else it is !errored and found. This means 1799 * valid_replicas_todelete++; But this variable will not 1800 * be used anywhere 1801 */ 1802 1803 Free(devname); 1804 } 1805 1806 index = 0; 1807 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1808 np = nlp->namep; 1809 if (tag_array[index++] != 1) { 1810 Free(tag_array); 1811 return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname)); 1812 } 1813 } 1814 1815 Free(tag_array); 1816 1817 1818 /* if all replicas are deleted stop mdmonitord */ 1819 if ((replicacount - replica_delete_count) == 0) 1820 stop_svmdaemons = 1; 1821 1822 if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) { 1823 if (force_option & MDFORCE_NONE) 1824 return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname)); 1825 if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS)) 1826 return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname)); 1827 } 1828 1829 /* 1830 * The following algorithms are followed to check for deletion: 1831 * (a) If the delete list(db_nlp) has all invalid replicas and no valid 1832 * replicas, then deletion should be allowed. 1833 * (b) Deletion should be allowed only if valid replicas that are "not" 1834 * to be deleted is always greater than the invalid replicas that 1835 * are "not" to be deleted. 1836 * (c) If the user uses -f option, then deletion should be allowed. 1837 */ 1838 1839 if ((invalid_replicas_todelete != replica_delete_count) && 1840 (invalid_replicas_nottodelete > valid_replicas_nottodelete) && 1841 (force_option != MDFORCE_LOCAL)) 1842 return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname)); 1843 1844 /* 1845 * go through and tell kernel to delete them 1846 */ 1847 1848 /* Don't need device id information from this ioctl */ 1849 c.c_locator.l_devid = (uint64_t)0; 1850 c.c_locator.l_devid_flags = 0; 1851 1852 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1853 return (mdstealerror(ep, &c.c_mde)); 1854 1855 if (! metaislocalset(sp)) { 1856 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1857 if (! mdisok(ep)) 1858 return (-1); 1859 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1860 return (-1); 1861 } 1862 1863 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1864 np = nlp->namep; 1865 1866 /* 1867 * If mddb is being deleted from MN diskset and node is 1868 * an owner of the diskset then use rpc.mdcommd 1869 * mechanism to add mddb(s) so that all nodes stay in sync. 1870 * If set is stale, don't log the message since rpc.mdcommd 1871 * can't write the message to the mddb. 1872 * 1873 * When mddbs are first being added to set, a detach can 1874 * be called before any node has joined the diskset, so 1875 * must check to see if node is an owner of the diskset. 1876 * 1877 * Otherwise, just delete mddb from this node. 1878 */ 1879 1880 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1881 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1882 md_mn_result_t *resultp; 1883 md_mn_msg_meta_db_detach_t detach; 1884 int send_rval; 1885 1886 /* 1887 * The following algorithm is used to detach replicas. 1888 * - META_DB_DETACH message generates submsgs 1889 * - BLOCK parse (master) 1890 * - MDDB_DETACH replicas 1891 * - UNBLOCK parse (master) causing parse 1892 * information to be sent from master 1893 * to slaves at a higher class than the 1894 * unblock so the parse message will 1895 * reach slaves before unblock message. 1896 */ 1897 (void) splitname(np->bname, &detach.msg_splitname); 1898 1899 /* Set devid to NULL until devids are supported */ 1900 detach.msg_devid[0] = NULL; 1901 1902 /* 1903 * If reconfig cycle has been started, this node is 1904 * stuck in in the return step until this command has 1905 * completed. If mdcommd is suspended, ask 1906 * send_message to fail (instead of retrying) 1907 * so that metaset can finish allowing the reconfig 1908 * cycle to proceed. 1909 */ 1910 flags = MD_MSGF_FAIL_ON_SUSPEND; 1911 if (stale_bool == TRUE) 1912 flags |= MD_MSGF_NO_LOG; 1913 send_rval = mdmn_send_message(sp->setno, 1914 MD_MN_MSG_META_DB_DETACH, 1915 flags, (char *)&detach, 1916 sizeof (md_mn_msg_meta_db_detach_t), 1917 &resultp, ep); 1918 if (send_rval != 0) { 1919 rval = -1; 1920 if (resultp == NULL) 1921 (void) mddserror(ep, 1922 MDE_DS_COMMD_SEND_FAIL, 1923 sp->setno, NULL, NULL, 1924 sp->setname); 1925 else { 1926 (void) mdstealerror(ep, 1927 &(resultp->mmr_ep)); 1928 if (mdisok(ep)) { 1929 (void) mddserror(ep, 1930 MDE_DS_COMMD_SEND_FAIL, 1931 sp->setno, NULL, NULL, 1932 sp->setname); 1933 } 1934 free_result(resultp); 1935 } 1936 goto out; 1937 } 1938 if (resultp) 1939 free_result(resultp); 1940 } else { 1941 i = 0; 1942 while (i < c.c_dbcnt) { 1943 char *devname; 1944 1945 c.c_id = i; 1946 1947 /* Don't need devid info from this ioctl */ 1948 c.c_locator.l_devid = (uint64_t)0; 1949 c.c_locator.l_devid_flags = 0; 1950 1951 if (metaioctl(MD_DB_GETDEV, &c, 1952 &c.c_mde, NULL)) { 1953 rval = mdstealerror(ep, &c.c_mde); 1954 goto out; 1955 } 1956 1957 devname = splicename(&c.c_devname); 1958 if (strcmp(devname, np->bname) != 0) { 1959 Free(devname); 1960 i++; 1961 continue; 1962 } 1963 Free(devname); 1964 1965 /* Don't need devid info from this ioctl */ 1966 c.c_locator.l_devid = (uint64_t)0; 1967 c.c_locator.l_devid_flags = 0; 1968 1969 if (metaioctl(MD_DB_DELDEV, &c, 1970 &c.c_mde, NULL) != 0) { 1971 rval = mdstealerror(ep, &c.c_mde); 1972 goto out; 1973 } 1974 1975 /* Not incrementing "i" intentionally */ 1976 } 1977 } 1978 if (! metaislocalset(sp)) { 1979 /* update the dbcnt and size in dd */ 1980 for (p = dd; p != NULL; p = p->dd_next) { 1981 if (p->dd_dnp == np->drivenamep) { 1982 p->dd_dbcnt = 0; 1983 p->dd_dbsize = 0; 1984 break; 1985 } 1986 } 1987 1988 /* 1989 * Slam a dummy master block and make it self 1990 * identifying 1991 */ 1992 if ((fd = open(np->rname, O_RDWR)) >= 0) { 1993 meta_mkdummymaster(sp, fd, 16); 1994 (void) close(fd); 1995 } 1996 } 1997 } 1998 out: 1999 if (metaislocalset(sp)) { 2000 /* 2001 * Stop all the daemons if there are 2002 * no more replicas so that the module can be 2003 * unloaded. 2004 */ 2005 if (rval == 0 && stop_svmdaemons == 1) { 2006 char buf[MAXPATHLEN]; 2007 int i; 2008 2009 for (i = 0; i < DAEMON_COUNT; i++) { 2010 (void) snprintf(buf, MAXPATHLEN, 2011 "/usr/bin/pkill -%s -x %s", 2012 svmd_kill_list[i].svmd_kill_val, 2013 svmd_kill_list[i].svmd_name); 2014 if (pclose(popen(buf, "w")) == -1) 2015 md_perror(buf); 2016 } 2017 2018 if (meta_smf_disable(META_SMF_ALL, &status) == -1) { 2019 mde_perror(&status, ""); 2020 mdclrerror(&status); 2021 } 2022 } 2023 if (buildconf(sp, &status)) { 2024 /* Don't mask any previous errors */ 2025 if (rval == 0) 2026 rval = mdstealerror(ep, &status); 2027 else 2028 mdclrerror(&status); 2029 return (rval); 2030 } 2031 2032 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 2033 /* Don't mask any previous errors */ 2034 if (rval == 0) 2035 rval = mdstealerror(ep, &status); 2036 else 2037 mdclrerror(&status); 2038 } 2039 } else { 2040 if (update_dbinfo_on_drives(sp, dd, 2041 (force_option & MDFORCE_SET_LOCKED), 2042 ((force_option & MDFORCE_LOCAL) | 2043 (force_option & MDFORCE_DS)), &status)) { 2044 /* Don't mask any previous errors */ 2045 if (rval == 0) 2046 rval = mdstealerror(ep, &status); 2047 else 2048 mdclrerror(&status); 2049 } 2050 metafreedrivedesc(&dd); 2051 } 2052 if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) { 2053 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 2054 meta_invalidate_name(nlp->namep); 2055 } 2056 } 2057 return (rval); 2058 } 2059 2060 static md_replica_t * 2061 metareplicaname( 2062 mdsetname_t *sp, 2063 int flags, 2064 struct mddb_config *c, 2065 md_error_t *ep 2066 ) 2067 { 2068 md_replica_t *rp; 2069 char *devname; 2070 size_t sz; 2071 2072 /* allocate replicaname */ 2073 rp = Zalloc(sizeof (*rp)); 2074 2075 /* get device name */ 2076 devname = splicename(&c->c_devname); 2077 if (flags & PRINT_FAST) { 2078 if ((rp->r_namep = metaname_fast(&sp, devname, 2079 LOGICAL_DEVICE, ep)) == NULL) { 2080 Free(devname); 2081 Free(rp); 2082 return (NULL); 2083 } 2084 } else { 2085 if ((rp->r_namep = metaname(&sp, devname, 2086 LOGICAL_DEVICE, ep)) == NULL) { 2087 Free(devname); 2088 Free(rp); 2089 return (NULL); 2090 } 2091 } 2092 Free(devname); 2093 2094 /* make sure it's OK */ 2095 if ((! (flags & MD_BASICNAME_OK)) && 2096 (metachkcomp(rp->r_namep, ep) != 0)) { 2097 Free(rp); 2098 return (NULL); 2099 } 2100 2101 rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR; 2102 rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR; 2103 rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID; 2104 if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) { 2105 sz = devid_sizeof((ddi_devid_t)(uintptr_t) 2106 (c->c_locator.l_devid)); 2107 if ((rp->r_devid = (ddi_devid_t)malloc(sz)) == 2108 (ddi_devid_t)NULL) { 2109 Free(rp); 2110 return (NULL); 2111 } 2112 (void) memcpy((void *)rp->r_devid, 2113 (void *)(uintptr_t)c->c_locator.l_devid, sz); 2114 (void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name); 2115 rp->r_flags &= ~MDDB_F_NODEVID; 2116 /* Overwrite dev derived from name with dev from devid */ 2117 rp->r_namep->dev = meta_expldev(c->c_locator.l_dev); 2118 } 2119 (void) strcpy(rp->r_driver_name, c->c_locator.l_driver); 2120 2121 rp->r_blkno = c->c_locator.l_blkno; 2122 if (c->c_dbend != 0) 2123 rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1; 2124 2125 /* return replica */ 2126 return (rp); 2127 } 2128 2129 /* 2130 * free replica list 2131 */ 2132 void 2133 metafreereplicalist( 2134 md_replicalist_t *rlp 2135 ) 2136 { 2137 md_replicalist_t *rl = NULL; 2138 2139 for (/* void */; (rlp != NULL); rlp = rl) { 2140 rl = rlp->rl_next; 2141 if (rlp->rl_repp->r_devid != (ddi_devid_t)0) { 2142 free(rlp->rl_repp->r_devid); 2143 } 2144 Free(rlp->rl_repp); 2145 Free(rlp); 2146 } 2147 } 2148 2149 /* 2150 * return list of all replicas in set 2151 */ 2152 int 2153 metareplicalist( 2154 mdsetname_t *sp, 2155 int flags, 2156 md_replicalist_t **rlpp, 2157 md_error_t *ep 2158 ) 2159 { 2160 md_replicalist_t **tail = rlpp; 2161 int count = 0; 2162 struct mddb_config c; 2163 int i; 2164 char *devid; 2165 2166 /* for each replica */ 2167 i = 0; 2168 do { 2169 md_replica_t *rp; 2170 2171 /* get next replica */ 2172 (void) memset(&c, 0, sizeof (c)); 2173 c.c_id = i; 2174 c.c_setno = sp->setno; 2175 2176 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 2177 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2178 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2179 mdclrerror(&c.c_mde); 2180 break; /* handle none at all */ 2181 } 2182 (void) mdstealerror(ep, &c.c_mde); 2183 goto out; 2184 } 2185 2186 if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) { 2187 if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) { 2188 (void) mdsyserror(ep, ENOMEM, META_DBCONF); 2189 goto out; 2190 } 2191 c.c_locator.l_devid = (uintptr_t)devid; 2192 /* 2193 * Turn on space and sz flags since 'sz' amount of 2194 * space has been alloc'd. 2195 */ 2196 c.c_locator.l_devid_flags = 2197 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2198 } 2199 2200 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2201 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2202 mdclrerror(&c.c_mde); 2203 break; /* handle none at all */ 2204 } 2205 (void) mdstealerror(ep, &c.c_mde); 2206 goto out; 2207 } 2208 2209 /* 2210 * Paranoid check - shouldn't happen, but is left as 2211 * a place holder for changes that will be needed after 2212 * dynamic reconfiguration changes are added to SVM (to 2213 * support movement of disks at any point in time). 2214 */ 2215 if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { 2216 (void) fprintf(stderr, 2217 dgettext(TEXT_DOMAIN, 2218 "Error: Relocation Information " 2219 "(drvnm=%s, mnum=0x%lx) \n" 2220 "relocation information size changed - \n" 2221 "rerun command\n"), 2222 c.c_locator.l_driver, c.c_locator.l_mnum); 2223 (void) mderror(ep, MDE_DEVID_TOOBIG, NULL); 2224 goto out; 2225 } 2226 2227 if (c.c_dbcnt == 0) 2228 break; /* handle none at all */ 2229 2230 /* get info */ 2231 if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL) 2232 goto out; 2233 2234 /* append to list */ 2235 *tail = Zalloc(sizeof (**tail)); 2236 (*tail)->rl_repp = rp; 2237 tail = &(*tail)->rl_next; 2238 ++count; 2239 2240 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2241 free(devid); 2242 c.c_locator.l_devid_flags = 0; 2243 } 2244 2245 } while (++i < c.c_dbcnt); 2246 2247 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2248 free(devid); 2249 } 2250 2251 /* return count */ 2252 return (count); 2253 2254 /* cleanup, return error */ 2255 out: 2256 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2257 free(devid); 2258 } 2259 metafreereplicalist(*rlpp); 2260 *rlpp = NULL; 2261 return (-1); 2262 } 2263 2264 /* 2265 * meta_sync_db_locations - get list of replicas from kernel and write 2266 * out to mddb.cf and md.conf. 'Syncs up' the replica list in 2267 * the kernel with the replica list in the conf files. 2268 * 2269 */ 2270 void 2271 meta_sync_db_locations( 2272 mdsetname_t *sp, 2273 md_error_t *ep 2274 ) 2275 { 2276 char *sname = 0; /* system file name */ 2277 char *cname = 0; /* config file name */ 2278 2279 if (!metaislocalset(sp)) 2280 return; 2281 2282 /* Updates backup of configuration file (aka mddb.cf) */ 2283 if (buildconf(sp, ep) != 0) 2284 return; 2285 2286 /* Updates system configuration file (aka md.conf) */ 2287 (void) meta_db_patch(sname, cname, 0, ep); 2288 } 2289 2290 /* 2291 * setup_db_locations - parse the mddb.cf file and 2292 * tells the driver which db locations to use. 2293 */ 2294 int 2295 meta_setup_db_locations( 2296 md_error_t *ep 2297 ) 2298 { 2299 mddb_config_t c; 2300 FILE *fp; 2301 char inbuff[1024]; 2302 char *buff; 2303 uint_t i; 2304 size_t sz; 2305 int rval = 0; 2306 char *devidp; 2307 uint_t devid_size; 2308 char *minor_name = NULL; 2309 ddi_devid_t devid_decode; 2310 int checksum; 2311 2312 /* do mddb.cf file */ 2313 (void) memset(&c, '\0', sizeof (c)); 2314 if ((fp = fopen(META_DBCONF, "r")) == NULL) { 2315 if (errno != ENOENT) 2316 return (mdsyserror(ep, errno, META_DBCONF)); 2317 } 2318 while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1), 2319 fp)) != NULL)) { 2320 2321 /* ignore comments */ 2322 if (*buff == '#') 2323 continue; 2324 2325 /* parse locator */ 2326 (void) memset(&c, 0, sizeof (c)); 2327 c.c_setno = MD_LOCAL_SET; 2328 i = strcspn(buff, " \t"); 2329 if (i > sizeof (c.c_locator.l_driver)) 2330 i = sizeof (c.c_locator.l_driver); 2331 (void) strncpy(c.c_locator.l_driver, buff, i); 2332 buff += i; 2333 c.c_locator.l_dev = 2334 makedev((major_t)0, (minor_t)strtol(buff, &buff, 10)); 2335 c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10); 2336 c.c_locator.l_mnum = minor(c.c_locator.l_dev); 2337 2338 /* parse out devid */ 2339 while (isspace((int)(*buff))) 2340 buff += 1; 2341 i = strcspn(buff, " \t"); 2342 if ((devidp = (char *)malloc(i+1)) == NULL) 2343 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2344 2345 (void) strncpy(devidp, buff, i); 2346 devidp[i] = '\0'; 2347 if (devid_str_decode(devidp, &devid_decode, 2348 &minor_name) == -1) { 2349 free(devidp); 2350 continue; 2351 } 2352 2353 /* Conf file must have minor name associated with devid */ 2354 if (minor_name == NULL) { 2355 free(devidp); 2356 devid_free(devid_decode); 2357 continue; 2358 } 2359 2360 sz = devid_sizeof(devid_decode); 2361 /* Copy to devid size buffer that ioctl expects */ 2362 if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) { 2363 devid_free(devid_decode); 2364 free(minor_name); 2365 free(devidp); 2366 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2367 } 2368 2369 (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, 2370 (void *)devid_decode, sz); 2371 2372 devid_free(devid_decode); 2373 2374 if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) { 2375 free(minor_name); 2376 free(devidp); 2377 free((void *)(uintptr_t)c.c_locator.l_devid); 2378 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2379 } 2380 (void) strcpy(c.c_locator.l_minor_name, minor_name); 2381 free(minor_name); 2382 c.c_locator.l_devid_flags = MDDB_DEVID_VALID | 2383 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2384 c.c_locator.l_devid_sz = sz; 2385 2386 devid_size = strlen(devidp); 2387 buff += devid_size; 2388 2389 checksum = strtol(buff, &buff, 10); 2390 for (i = 0; c.c_locator.l_driver[i] != 0; i++) 2391 checksum += c.c_locator.l_driver[i]; 2392 for (i = 0; i < devid_size; i++) { 2393 checksum += devidp[i]; 2394 } 2395 free(devidp); 2396 2397 checksum += minor(c.c_locator.l_dev); 2398 checksum += c.c_locator.l_blkno; 2399 if (checksum != 42) { 2400 /* overwritten later for more serious problems */ 2401 rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF); 2402 free((void *)(uintptr_t)c.c_locator.l_devid); 2403 continue; 2404 } 2405 c.c_locator.l_flags = 0; 2406 2407 /* use db location */ 2408 if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { 2409 free((void *)(uintptr_t)c.c_locator.l_devid); 2410 return (mdstealerror(ep, &c.c_mde)); 2411 } 2412 2413 /* free up devid if in use */ 2414 free((void *)(uintptr_t)c.c_locator.l_devid); 2415 c.c_locator.l_devid = (uint64_t)0; 2416 c.c_locator.l_devid_flags = 0; 2417 } 2418 if ((fp) && (fclose(fp) != 0)) 2419 return (mdsyserror(ep, errno, META_DBCONF)); 2420 2421 /* check for stale database */ 2422 (void) memset((char *)&c, 0, sizeof (struct mddb_config)); 2423 c.c_id = 0; 2424 c.c_setno = MD_LOCAL_SET; 2425 2426 /* Don't need device id information from this ioctl */ 2427 c.c_locator.l_devid = (uint64_t)0; 2428 c.c_locator.l_devid_flags = 0; 2429 2430 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2431 if (! mdismddberror(&c.c_mde, MDE_DB_INVALID)) 2432 return (mdstealerror(ep, &c.c_mde)); 2433 mdclrerror(&c.c_mde); 2434 } 2435 2436 if (c.c_flags & MDDB_C_STALE) 2437 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET, 2438 0, NULL)); 2439 2440 /* success */ 2441 return (rval); 2442 } 2443 2444 /* 2445 * meta_db_minreplica - returns the minimum size replica currently in use. 2446 */ 2447 daddr_t 2448 meta_db_minreplica( 2449 mdsetname_t *sp, 2450 md_error_t *ep 2451 ) 2452 { 2453 md_replica_t *r; 2454 md_replicalist_t *rl, *rlp = NULL; 2455 daddr_t nblks = 0; 2456 2457 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) 2458 return (-1); 2459 2460 if (rlp == NULL) 2461 return (-1); 2462 2463 /* find the smallest existing replica */ 2464 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 2465 r = rl->rl_repp; 2466 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 2467 } 2468 2469 metafreereplicalist(rlp); 2470 return (nblks); 2471 } 2472 2473 /* 2474 * meta_get_replica_names 2475 * returns an mdnamelist_t of replica slices 2476 */ 2477 /*ARGSUSED*/ 2478 int 2479 meta_get_replica_names( 2480 mdsetname_t *sp, 2481 mdnamelist_t **nlpp, 2482 int options, 2483 md_error_t *ep 2484 ) 2485 { 2486 md_replicalist_t *rlp = NULL; 2487 md_replicalist_t *rl; 2488 mdnamelist_t **tailpp = nlpp; 2489 int cnt = 0; 2490 2491 assert(nlpp != NULL); 2492 2493 if (!metaislocalset(sp)) 2494 goto out; 2495 2496 /* get replicas */ 2497 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { 2498 cnt = -1; 2499 goto out; 2500 } 2501 2502 /* build name list */ 2503 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 2504 /* 2505 * Add the name struct to the end of the 2506 * namelist but keep a pointer to the last 2507 * element so that we don't incur the overhead 2508 * of traversing the list each time 2509 */ 2510 tailpp = meta_namelist_append_wrapper( 2511 tailpp, rl->rl_repp->r_namep); 2512 ++cnt; 2513 } 2514 2515 /* cleanup, return count or error */ 2516 out: 2517 metafreereplicalist(rlp); 2518 return (cnt); 2519 } 2520