1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 /* 29 * Just in case we're not in a build environment, make sure that 30 * TEXT_DOMAIN gets set to something. 31 */ 32 #if !defined(TEXT_DOMAIN) 33 #define TEXT_DOMAIN "SYS_TEST" 34 #endif 35 36 /* 37 * Metadevice database interfaces. 38 */ 39 40 #define MDDB 41 42 #include <meta.h> 43 #include <sys/lvm/md_mddb.h> 44 #include <sys/lvm/md_crc.h> 45 #include <sys/lvm/mdio.h> 46 #include <string.h> 47 #include <strings.h> 48 #include <ctype.h> 49 50 struct svm_daemon { 51 char *svmd_name; 52 char *svmd_kill_val; 53 }; 54 55 /* 56 * This is a list of the daemons that are not stopped by the SVM smf(5) 57 * services. The mdmonitord is started via svc:/system/mdmonitor:default 58 * but no contract(4) is constructed and so it is not stopped by smf(5). 59 */ 60 struct svm_daemon svmd_kill_list[] = { 61 {"mdmonitord", "HUP"}, 62 {"mddoors", "KILL"}, 63 }; 64 65 #define DAEMON_COUNT (sizeof (svmd_kill_list)/ sizeof (struct svm_daemon)) 66 67 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep); 68 69 /* 70 * Are the locator blocks for the replicas using devids 71 */ 72 static int devid_in_use = FALSE; 73 74 static char * 75 getlongname( 76 struct mddb_config *c, 77 md_error_t *ep 78 ) 79 { 80 char *diskname = NULL; 81 char *devid_str; 82 devid_nmlist_t *disklist = NULL; 83 84 c->c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 85 if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) { 86 (void) mdstealerror(ep, &c->c_mde); 87 return (NULL); 88 } 89 90 if (c->c_locator.l_devid_flags & MDDB_DEVID_SZ) { 91 c->c_locator.l_devid = (uintptr_t) 92 Malloc(c->c_locator.l_devid_sz); 93 c->c_locator.l_devid_flags = 94 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 95 } else { 96 (void) mderror(ep, MDE_NODEVID, ""); 97 goto out; 98 } 99 100 if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) { 101 (void) mdstealerror(ep, &c->c_mde); 102 goto out; 103 } 104 105 if (c->c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { 106 (void) mderror(ep, MDE_NODEVID, ""); 107 goto out; 108 } 109 110 if (metaioctl(MD_DB_GETDEV, c, &c->c_mde, NULL) != 0) { 111 (void) mdstealerror(ep, &c->c_mde); 112 goto out; 113 } 114 115 if (c->c_locator.l_devid != NULL) { 116 if (meta_deviceid_to_nmlist("/dev/dsk", 117 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, 118 c->c_locator.l_minor_name, &disklist) != 0) { 119 devid_str = devid_str_encode( 120 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, NULL); 121 (void) mderror(ep, MDE_MISSING_DEVID_DISK, ""); 122 mderrorextra(ep, devid_str); 123 if (devid_str != NULL) 124 devid_str_free(devid_str); 125 goto out; 126 } 127 diskname = Strdup(disklist[0].devname); 128 } 129 130 out: 131 if (disklist != NULL) 132 devid_free_nmlist(disklist); 133 134 if (c->c_locator.l_devid != NULL) 135 Free((void *)(uintptr_t)c->c_locator.l_devid); 136 137 return (diskname); 138 } 139 140 /* 141 * meta_get_lb_inittime sends a request for the lb_inittime to the kernel 142 */ 143 md_timeval32_t 144 meta_get_lb_inittime( 145 mdsetname_t *sp, 146 md_error_t *ep 147 ) 148 { 149 mddb_config_t c; 150 151 (void) memset(&c, 0, sizeof (c)); 152 153 /* Fill in setno, setname, and sideno */ 154 c.c_setno = sp->setno; 155 156 if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) { 157 (void) mdstealerror(ep, &c.c_mde); 158 } 159 160 return (c.c_timestamp); 161 } 162 163 /* 164 * mkmasterblks writes out the master blocks of the mddb to the replica. 165 * 166 * In a MN diskset, this is called by the node that is adding this replica 167 * to the diskset. 168 */ 169 170 #define MDDB_VERIFY_SIZE 8192 171 172 static int 173 mkmasterblks( 174 mdsetname_t *sp, 175 mdname_t *np, 176 int fd, 177 daddr_t firstblk, 178 int dbsize, 179 md_timeval32_t inittime, 180 md_error_t *ep 181 ) 182 { 183 int consecutive; 184 md_timeval32_t tp; 185 struct mddb_mb *mb; 186 char *buffer; 187 int iosize; 188 md_set_desc *sd; 189 int mn_set = 0; 190 daddr_t startblk; 191 int cnt; 192 ddi_devid_t devid; 193 194 if (! metaislocalset(sp)) { 195 if ((sd = metaget_setdesc(sp, ep)) == NULL) 196 return (-1); 197 198 if (MD_MNSET_DESC(sd)) { 199 mn_set = 1; /* Used later */ 200 } 201 } 202 203 /* 204 * Loop to verify the entire mddb region on disk is read/writable. 205 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block 206 * chunks. 207 * 208 * A side-effect of this loop is to zero out the entire mddb region 209 */ 210 if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL) 211 return (mdsyserror(ep, ENOMEM, np->rname)); 212 213 startblk = firstblk; 214 for (cnt = dbsize; cnt > 0; cnt -= consecutive) { 215 216 if (cnt > MDDB_VERIFY_SIZE) 217 consecutive = MDDB_VERIFY_SIZE; 218 else 219 consecutive = cnt; 220 221 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 222 Free(buffer); 223 return (mdsyserror(ep, errno, np->rname)); 224 } 225 226 iosize = DEV_BSIZE * consecutive; 227 if (write(fd, buffer, iosize) != iosize) { 228 Free(buffer); 229 return (mdsyserror(ep, errno, np->rname)); 230 } 231 232 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 233 Free(buffer); 234 return (mdsyserror(ep, errno, np->rname)); 235 } 236 237 if (read(fd, buffer, iosize) != iosize) { 238 Free(buffer); 239 return (mdsyserror(ep, errno, np->rname)); 240 } 241 242 startblk += consecutive; 243 } 244 245 Free(buffer); 246 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 247 return (mdsyserror(ep, ENOMEM, np->rname)); 248 249 if (meta_gettimeofday(&tp) == -1) { 250 Free(mb); 251 return (mdsyserror(ep, errno, np->rname)); 252 } 253 254 mb->mb_magic = MDDB_MAGIC_MB; 255 /* 256 * If a MN diskset, set master block revision for a MN set. 257 * Even though the master block structure is no different 258 * for a MN set, setting the revision field to a different 259 * number keeps any pre-MN_diskset code from accessing 260 * this diskset. It also allows for an early determination 261 * of a MN diskset when reading in from disk so that the 262 * proper size locator block and locator names structure 263 * can be read in thus saving time on diskset startup. 264 */ 265 if (mn_set) 266 mb->mb_revision = MDDB_REV_MNMB; 267 else 268 mb->mb_revision = MDDB_REV_MB; 269 mb->mb_timestamp = tp; 270 mb->mb_setno = sp->setno; 271 mb->mb_blkcnt = dbsize - 1; 272 mb->mb_blkno = firstblk; 273 mb->mb_nextblk = 0; 274 275 mb->mb_blkmap.m_firstblk = firstblk + 1; 276 mb->mb_blkmap.m_consecutive = dbsize - 1; 277 if (! metaislocalset(sp)) { 278 mb->mb_setcreatetime = inittime; 279 } 280 281 /* 282 * We try to save the disks device ID into the remaining bytes in 283 * the master block. The saved devid is used to provide a mapping 284 * between this disk's devid and the devid stored into the master 285 * block. This allows the disk image to be self-identifying 286 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 287 * when we try to import these disks on the remote copied image. 288 * If we cannot save the disks device ID onto the master block that is 289 * ok. The disk is just not self-identifying and won't be importable 290 * in the remote copy scenario. 291 */ 292 if (devid_get(fd, &devid) == 0) { 293 size_t len; 294 295 len = devid_sizeof(devid); 296 if (len <= DEV_BSIZE - sizeof (*mb)) { 297 /* there is enough space to store the devid */ 298 mb->mb_devid_magic = MDDB_MAGIC_DE; 299 mb->mb_devid_len = len; 300 (void) memcpy(mb->mb_devid, devid, len); 301 } 302 devid_free(devid); 303 } 304 305 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 306 (crc_skip_t *)NULL); 307 308 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 309 Free(mb); 310 return (mdsyserror(ep, errno, np->rname)); 311 } 312 313 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 314 Free(mb); 315 return (mdsyserror(ep, errno, np->rname)); 316 } 317 318 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 319 Free(mb); 320 return (mdsyserror(ep, errno, np->rname)); 321 } 322 323 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 324 Free(mb); 325 return (mdsyserror(ep, errno, np->rname)); 326 } 327 328 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 329 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) { 330 Free(mb); 331 return (mdmddberror(ep, MDE_NOTVERIFIED, 332 meta_getminor(np->dev), sp->setno, 0, np->rname)); 333 } 334 335 Free(mb); 336 return (0); 337 } 338 339 void 340 meta_mkdummymaster( 341 mdsetname_t *sp, 342 int fd, 343 daddr_t firstblk 344 ) 345 { 346 md_timeval32_t tp; 347 struct mddb_mb *mb; 348 ddi_devid_t devid; 349 md_set_desc *sd; 350 md_error_t ep = mdnullerror; 351 md_timeval32_t inittime; 352 353 /* 354 * No dummy master blocks are written for a MN diskset since devids 355 * are not supported in MN disksets. 356 */ 357 if (! metaislocalset(sp)) { 358 if ((sd = metaget_setdesc(sp, &ep)) == NULL) 359 return; 360 361 if (MD_MNSET_DESC(sd)) 362 return; 363 } 364 365 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 366 return; 367 368 mb->mb_magic = MDDB_MAGIC_DU; 369 mb->mb_revision = MDDB_REV_MB; 370 mb->mb_setno = sp->setno; 371 inittime = meta_get_lb_inittime(sp, &ep); 372 mb->mb_setcreatetime = inittime; 373 374 if (meta_gettimeofday(&tp) != -1) 375 mb->mb_timestamp = tp; 376 377 /* 378 * We try to save the disks device ID into the remaining bytes in 379 * the master block. This allows the disk image to be self-identifying 380 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 381 * when we try to import these disks on the remote copied image. 382 * If we cannot save the disks device ID onto the master block that is 383 * ok. The disk is just not self-identifying and won't be importable 384 * in the remote copy scenario. 385 */ 386 if (devid_get(fd, &devid) == 0) { 387 int len; 388 389 len = devid_sizeof(devid); 390 if (len <= DEV_BSIZE - sizeof (*mb)) { 391 /* there is enough space to store the devid */ 392 mb->mb_devid_magic = MDDB_MAGIC_DE; 393 mb->mb_devid_len = len; 394 (void) memcpy(mb->mb_devid, (char *)devid, len); 395 } 396 devid_free(devid); 397 } 398 399 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 400 (crc_skip_t *)NULL); 401 402 /* 403 * If any of these operations fail, we need to inform the 404 * user that the disk won't be self identifying. When support 405 * for importing remotely replicated disksets is added, we 406 * want to add the error messages here. 407 */ 408 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 409 goto out; 410 411 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) 412 goto out; 413 414 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 415 goto out; 416 417 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) 418 goto out; 419 420 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 421 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) 422 goto out; 423 424 out: 425 Free(mb); 426 } 427 428 static int 429 buildconf(mdsetname_t *sp, md_error_t *ep) 430 { 431 md_replicalist_t *rlp = NULL; 432 md_replicalist_t *rl; 433 FILE *cfp = NULL; 434 FILE *mfp = NULL; 435 struct stat sbuf; 436 int rval = 0; 437 int in_miniroot = 0; 438 char line[MDDB_BOOTLIST_MAX_LEN]; 439 char *tname = NULL; 440 441 /* get list of local replicas */ 442 if (! metaislocalset(sp)) 443 return (0); 444 445 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 446 return (-1); 447 448 /* open tempfile, copy permissions of original file */ 449 if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) { 450 /* 451 * On the miniroot tmp files must be created in /var/tmp. 452 * If we get a EROFS error, we assume that we are in the 453 * miniroot. 454 */ 455 if (errno != EROFS) 456 goto error; 457 in_miniroot = 1; 458 errno = 0; 459 tname = tempnam("/var/tmp", "slvm_"); 460 if (tname == NULL && errno == EROFS) { 461 /* 462 * If we are booted on a read-only root because 463 * of mddb quorum problems we don't want to emit 464 * any scary error messages. 465 */ 466 errno = 0; 467 goto out; 468 } 469 470 /* open tempfile, copy permissions of original file */ 471 if ((cfp = fopen(tname, "w+")) == NULL) 472 goto error; 473 } 474 if (stat(META_DBCONF, &sbuf) == 0) { 475 if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0) 476 goto error; 477 if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0) 478 goto error; 479 } 480 481 /* print header */ 482 if (fprintf(cfp, "#metadevice database location file ") == EOF) 483 goto error; 484 if (fprintf(cfp, "do not hand edit\n") < 0) 485 goto error; 486 if (fprintf(cfp, 487 "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0) 488 goto error; 489 490 /* dump replicas */ 491 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 492 md_replica_t *r = rl->rl_repp; 493 int checksum = 42; 494 int i; 495 char *devidp; 496 minor_t min; 497 498 devidp = devid_str_encode(r->r_devid, r->r_minor_name); 499 /* If devid code can't encode devidp - skip entry */ 500 if (devidp == NULL) { 501 continue; 502 } 503 504 /* compute checksum */ 505 for (i = 0; ((r->r_driver_name[i] != '\0') && 506 (i < sizeof (r->r_driver_name))); i++) { 507 checksum -= r->r_driver_name[i]; 508 } 509 min = meta_getminor(r->r_namep->dev); 510 checksum -= min; 511 checksum -= r->r_blkno; 512 513 for (i = 0; i < strlen(devidp); i++) { 514 checksum -= devidp[i]; 515 } 516 /* print info */ 517 if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n", 518 r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) { 519 goto error; 520 } 521 522 devid_str_free(devidp); 523 } 524 525 /* close and rename to real file */ 526 if (fflush(cfp) != 0) 527 goto error; 528 if (fsync(fileno(cfp)) != 0) 529 goto error; 530 if (fclose(cfp) != 0) { 531 cfp = NULL; 532 goto error; 533 } 534 cfp = NULL; 535 536 /* 537 * Renames don't work in the miniroot since tmpfiles are 538 * created in /var/tmp. Hence we copy the data out. 539 */ 540 541 if (! in_miniroot) { 542 if (rename(META_DBCONFTMP, META_DBCONF) != 0) 543 goto error; 544 } else { 545 if ((cfp = fopen(tname, "r")) == NULL) 546 goto error; 547 if ((mfp = fopen(META_DBCONF, "w+")) == NULL) 548 goto error; 549 while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) { 550 if (fputs(line, mfp) == NULL) 551 goto error; 552 } 553 (void) fclose(cfp); 554 cfp = NULL; 555 if (fflush(mfp) != 0) 556 goto error; 557 if (fsync(fileno(mfp)) != 0) 558 goto error; 559 if (fclose(mfp) != 0) { 560 mfp = NULL; 561 goto error; 562 } 563 /* delete the tempfile */ 564 (void) unlink(tname); 565 } 566 /* success */ 567 rval = 0; 568 goto out; 569 570 /* tempfile error */ 571 error: 572 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 573 mdsyserror(ep, errno, META_DBCONFTMP); 574 575 576 /* cleanup, return success */ 577 out: 578 if (rlp != NULL) 579 metafreereplicalist(rlp); 580 if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) { 581 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 582 mdsyserror(ep, errno, META_DBCONFTMP); 583 } 584 free(tname); 585 return (rval); 586 } 587 588 /* 589 * check replica for dev 590 */ 591 static int 592 in_replica( 593 mdsetname_t *sp, 594 md_replica_t *rp, 595 mdname_t *np, 596 diskaddr_t slblk, 597 diskaddr_t nblks, 598 md_error_t *ep 599 ) 600 { 601 mdname_t *repnp = rp->r_namep; 602 diskaddr_t rep_sblk = rp->r_blkno; 603 diskaddr_t rep_nblks = rp->r_nblk; 604 605 /* should be in the same set */ 606 assert(sp != NULL); 607 608 /* if error in master block, assume whole partition */ 609 if ((rep_sblk == MD_DISKADDR_ERROR) || 610 (rep_nblks == MD_DISKADDR_ERROR)) { 611 rep_sblk = 0; 612 rep_nblks = MD_DISKADDR_ERROR; 613 } 614 615 /* check overlap */ 616 if (meta_check_overlap( 617 MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) { 618 return (-1); 619 } 620 621 /* return success */ 622 return (0); 623 } 624 625 /* 626 * check to see if we're in a replica 627 */ 628 int 629 meta_check_inreplica( 630 mdsetname_t *sp, 631 mdname_t *np, 632 diskaddr_t slblk, 633 diskaddr_t nblks, 634 md_error_t *ep 635 ) 636 { 637 md_replicalist_t *rlp = NULL; 638 md_replicalist_t *rl; 639 int rval = 0; 640 641 /* should have a set */ 642 assert(sp != NULL); 643 644 /* for each replica */ 645 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 646 return (-1); 647 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 648 md_replica_t *rp = rl->rl_repp; 649 650 /* check replica */ 651 if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) { 652 rval = -1; 653 break; 654 } 655 } 656 657 /* cleanup, return success */ 658 metafreereplicalist(rlp); 659 return (rval); 660 } 661 662 /* 663 * check replica 664 */ 665 int 666 meta_check_replica( 667 mdsetname_t *sp, /* set to check against */ 668 mdname_t *np, /* component to check against */ 669 mdchkopts_t options, /* option flags */ 670 diskaddr_t slblk, /* start logical block */ 671 diskaddr_t nblks, /* number of blocks (-1,rest of them) */ 672 md_error_t *ep /* error packet */ 673 ) 674 { 675 mdchkopts_t chkoptions = MDCHK_ALLOW_REPSLICE; 676 677 /* make sure we have a disk */ 678 if (metachkcomp(np, ep) != 0) 679 return (-1); 680 681 /* check to ensure that it is not already in use */ 682 if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { 683 return (-1); 684 } 685 686 if (options & MDCHK_ALLOW_NODBS) 687 return (0); 688 689 if (options & MDCHK_DRVINSET) 690 return (0); 691 692 /* make sure it is in the set */ 693 if (meta_check_inset(sp, np, ep) != 0) 694 return (-1); 695 696 /* make sure its not in a metadevice */ 697 if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0) 698 return (-1); 699 700 /* return success */ 701 return (0); 702 } 703 704 static int 705 update_dbinfo_on_drives( 706 mdsetname_t *sp, 707 md_drive_desc *dd, 708 int set_locked, 709 int force, 710 md_error_t *ep 711 ) 712 { 713 md_set_desc *sd; 714 int i; 715 md_setkey_t *cl_sk; 716 int rval = 0; 717 md_mnnode_desc *nd; 718 719 if ((sd = metaget_setdesc(sp, ep)) == NULL) 720 return (-1); 721 722 if (! set_locked) { 723 if (MD_MNSET_DESC(sd)) { 724 md_error_t xep = mdnullerror; 725 sigset_t sigs; 726 /* Make sure we are blocking all signals */ 727 if (procsigs(TRUE, &sigs, &xep) < 0) 728 mdclrerror(&xep); 729 730 nd = sd->sd_nodelist; 731 while (nd) { 732 if (force && strcmp(nd->nd_nodename, 733 mynode()) != 0) { 734 nd = nd->nd_next; 735 continue; 736 } 737 738 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 739 nd = nd->nd_next; 740 continue; 741 } 742 743 if (clnt_lock_set(nd->nd_nodename, sp, ep)) 744 return (-1); 745 nd = nd->nd_next; 746 } 747 } else { 748 for (i = 0; i < MD_MAXSIDES; i++) { 749 /* Skip empty slots */ 750 if (sd->sd_nodes[i][0] == '\0') 751 continue; 752 753 if (force && strcmp(sd->sd_nodes[i], 754 mynode()) != 0) 755 continue; 756 757 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) 758 return (-1); 759 } 760 } 761 } 762 763 if (MD_MNSET_DESC(sd)) { 764 nd = sd->sd_nodelist; 765 while (nd) { 766 if (force && strcmp(nd->nd_nodename, mynode()) != 0) { 767 nd = nd->nd_next; 768 continue; 769 } 770 771 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 772 nd = nd->nd_next; 773 continue; 774 } 775 776 if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep) 777 == -1) { 778 rval = -1; 779 break; 780 } 781 nd = nd->nd_next; 782 } 783 } else { 784 for (i = 0; i < MD_MAXSIDES; i++) { 785 /* Skip empty slots */ 786 if (sd->sd_nodes[i][0] == '\0') 787 continue; 788 789 if (force && strcmp(sd->sd_nodes[i], mynode()) != 0) 790 continue; 791 792 if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep) 793 == -1) { 794 rval = -1; 795 break; 796 } 797 } 798 } 799 800 if (! set_locked) { 801 cl_sk = cl_get_setkey(sp->setno, sp->setname); 802 if (MD_MNSET_DESC(sd)) { 803 nd = sd->sd_nodelist; 804 while (nd) { 805 if (force && 806 strcmp(nd->nd_nodename, mynode()) != 0) { 807 nd = nd->nd_next; 808 continue; 809 } 810 811 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 812 nd = nd->nd_next; 813 continue; 814 } 815 816 if (clnt_unlock_set(nd->nd_nodename, cl_sk, 817 ep)) { 818 rval = -1; 819 break; 820 } 821 nd = nd->nd_next; 822 } 823 } else { 824 for (i = 0; i < MD_MAXSIDES; i++) { 825 /* Skip empty slots */ 826 if (sd->sd_nodes[i][0] == '\0') 827 continue; 828 829 if (force && 830 strcmp(sd->sd_nodes[i], mynode()) != 0) 831 continue; 832 833 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, 834 ep)) { 835 rval = -1; 836 break; 837 } 838 } 839 840 } 841 cl_set_setkey(NULL); 842 } 843 844 return (rval); 845 } 846 847 int 848 meta_db_addsidenms( 849 mdsetname_t *sp, 850 mdname_t *np, 851 daddr_t blkno, 852 int bcast, 853 md_error_t *ep 854 ) 855 { 856 side_t sideno; 857 char *bname = NULL; 858 char *dname = NULL; 859 minor_t mnum; 860 mddb_config_t c; 861 int done; 862 int rval = 0; 863 md_set_desc *sd; 864 865 sideno = MD_SIDEWILD; 866 /*CONSTCOND*/ 867 while (1) { 868 if (bname != NULL) { 869 Free(bname); 870 bname = NULL; 871 } 872 if (dname != NULL) { 873 Free(dname); 874 dname = NULL; 875 } 876 if ((done = meta_getnextside_devinfo(sp, np->bname, 877 &sideno, &bname, &dname, &mnum, ep)) == -1) { 878 rval = -1; 879 break; 880 } 881 882 if (done == 0) 883 break; 884 885 if (! metaislocalset(sp)) { 886 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 887 rval = -1; 888 break; 889 } 890 } 891 892 /* 893 * Send addsidenms to all nodes using rpc.mdcommd if 894 * sidename is being added to MN diskset. 895 * 896 * It's ok to broadcast this call to other nodes. 897 * 898 * Note: The broadcast to other nodes isn't needed during 899 * the addition of the first mddbs to the set since the 900 * other nodes haven't been joined to the set yet. All 901 * nodes in a MN diskset are (implicitly) joined to the set 902 * on the addition of the first mddb. 903 */ 904 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 905 (bcast == DB_ADDSIDENMS_BCAST)) { 906 md_mn_result_t *resultp = NULL; 907 md_mn_msg_meta_db_newside_t db_ns; 908 int send_rval; 909 910 db_ns.msg_l_dev = np->dev; 911 db_ns.msg_sideno = sideno; 912 db_ns.msg_blkno = blkno; 913 (void) strncpy(db_ns.msg_dname, dname, 914 sizeof (db_ns.msg_dname)); 915 (void) splitname(np->bname, &db_ns.msg_splitname); 916 db_ns.msg_mnum = mnum; 917 918 /* Set devid to NULL until devids are supported */ 919 db_ns.msg_devid[0] = NULL; 920 921 /* 922 * If reconfig cycle has been started, this node is 923 * stuck in in the return step until this command has 924 * completed. If mdcommd is suspended, ask 925 * send_message to fail (instead of retrying) 926 * so that metaset can finish allowing the reconfig 927 * cycle to proceed. 928 */ 929 send_rval = mdmn_send_message(sp->setno, 930 MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND | 931 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns, 932 sizeof (md_mn_msg_meta_db_newside_t), 933 &resultp, ep); 934 if (send_rval != 0) { 935 rval = -1; 936 if (resultp == NULL) 937 (void) mddserror(ep, 938 MDE_DS_COMMD_SEND_FAIL, 939 sp->setno, NULL, NULL, 940 sp->setname); 941 else { 942 (void) mdstealerror(ep, 943 &(resultp->mmr_ep)); 944 if (mdisok(ep)) { 945 (void) mddserror(ep, 946 MDE_DS_COMMD_SEND_FAIL, 947 sp->setno, NULL, NULL, 948 sp->setname); 949 } 950 free_result(resultp); 951 } 952 break; 953 } 954 if (resultp) 955 free_result(resultp); 956 } else { 957 /* 958 * Let this side's device name, minor # and driver name 959 * be known to the database replica. 960 */ 961 (void) memset(&c, 0, sizeof (c)); 962 963 /* Fill in device/replica info */ 964 c.c_locator.l_dev = meta_cmpldev(np->dev); 965 c.c_locator.l_blkno = blkno; 966 (void) strncpy(c.c_locator.l_driver, dname, 967 sizeof (c.c_locator.l_driver)); 968 if (splitname(np->bname, &c.c_devname) == 969 METASPLIT_LONGDISKNAME && devid_in_use == FALSE) { 970 rval = mddeverror(ep, MDE_DISKNAMETOOLONG, 971 NODEV64, np->rname); 972 break; 973 } 974 975 c.c_locator.l_mnum = mnum; 976 977 /* Fill in setno, setname, and sideno */ 978 c.c_setno = sp->setno; 979 (void) strncpy(c.c_setname, sp->setname, 980 sizeof (c.c_setname)); 981 c.c_sideno = sideno; 982 983 /* 984 * Don't need device id information from this ioctl 985 * Kernel determines device id from dev_t, which 986 * is just what this code would do. 987 */ 988 c.c_locator.l_devid = (uint64_t)0; 989 c.c_locator.l_devid_flags = 0; 990 991 if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) { 992 rval = mdstealerror(ep, &c.c_mde); 993 break; 994 } 995 } 996 } 997 998 /* cleanup, return success */ 999 if (bname != NULL) { 1000 Free(bname); 1001 bname = NULL; 1002 } 1003 if (dname != NULL) { 1004 Free(dname); 1005 dname = NULL; 1006 } 1007 return (rval); 1008 } 1009 1010 1011 int 1012 meta_db_delsidenm( 1013 mdsetname_t *sp, 1014 side_t sideno, 1015 mdname_t *np, 1016 daddr_t blkno, 1017 md_error_t *ep 1018 ) 1019 { 1020 mddb_config_t c; 1021 md_set_desc *sd; 1022 1023 if (! metaislocalset(sp)) { 1024 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1025 return (-1); 1026 } 1027 /* Use rpc.mdcommd to delete mddb side from all nodes */ 1028 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1029 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1030 md_mn_result_t *resultp = NULL; 1031 md_mn_msg_meta_db_delside_t db_ds; 1032 int send_rval; 1033 1034 db_ds.msg_l_dev = np->dev; 1035 db_ds.msg_blkno = blkno; 1036 db_ds.msg_sideno = sideno; 1037 1038 /* Set devid to NULL until devids are supported */ 1039 db_ds.msg_devid[0] = NULL; 1040 1041 /* 1042 * If reconfig cycle has been started, this node is 1043 * stuck in in the return step until this command has 1044 * completed. If mdcommd is suspended, ask 1045 * send_message to fail (instead of retrying) 1046 * so that metaset can finish allowing the reconfig 1047 * cycle to proceed. 1048 */ 1049 send_rval = mdmn_send_message(sp->setno, 1050 MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | 1051 MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds, 1052 sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep); 1053 if (send_rval != 0) { 1054 if (resultp == NULL) 1055 (void) mddserror(ep, 1056 MDE_DS_COMMD_SEND_FAIL, 1057 sp->setno, NULL, NULL, 1058 sp->setname); 1059 else { 1060 (void) mdstealerror(ep, &(resultp->mmr_ep)); 1061 if (mdisok(ep)) { 1062 (void) mddserror(ep, 1063 MDE_DS_COMMD_SEND_FAIL, 1064 sp->setno, NULL, NULL, 1065 sp->setname); 1066 } 1067 free_result(resultp); 1068 } 1069 return (-1); 1070 } 1071 if (resultp) 1072 free_result(resultp); 1073 1074 } else { 1075 /* 1076 * Let this side's device name, minor # and driver name 1077 * be known to the database replica. 1078 */ 1079 (void) memset(&c, 0, sizeof (c)); 1080 1081 /* Fill in device/replica info */ 1082 c.c_locator.l_dev = meta_cmpldev(np->dev); 1083 c.c_locator.l_blkno = blkno; 1084 1085 /* Fill in setno, setname, and sideno */ 1086 c.c_setno = sp->setno; 1087 (void) strcpy(c.c_setname, sp->setname); 1088 c.c_sideno = sideno; 1089 1090 /* 1091 * Don't need device id information from this ioctl 1092 * Kernel determines device id from dev_t, which 1093 * is just what this code would do. 1094 */ 1095 c.c_locator.l_devid = (uint64_t)0; 1096 c.c_locator.l_devid_flags = 0; 1097 1098 if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0) 1099 return (mdstealerror(ep, &c.c_mde)); 1100 } 1101 return (0); 1102 } 1103 1104 1105 static int 1106 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep) 1107 { 1108 mdnamelist_t *dnp1, *dnp2; 1109 1110 for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) { 1111 for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) { 1112 if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0) 1113 return (mderror(ep, MDE_DUPDRIVE, 1114 dnp1->namep->cname)); 1115 } 1116 } 1117 return (0); 1118 } 1119 1120 1121 /* 1122 * Return 1 if files are different, else return 0 1123 */ 1124 static int 1125 filediff(char *tsname, char *sname) 1126 { 1127 int ret = 1, fd; 1128 size_t tsz, sz; 1129 struct stat sbuf; 1130 char *tbuf, *buf; 1131 1132 if (stat(tsname, &sbuf) != 0) 1133 return (1); 1134 tsz = sbuf.st_size; 1135 if (stat(sname, &sbuf) != 0) 1136 return (1); 1137 sz = sbuf.st_size; 1138 if (tsz != sz) 1139 return (1); 1140 1141 /* allocate memory and read both files into buffer */ 1142 tbuf = malloc(tsz); 1143 buf = malloc(sz); 1144 if (tbuf == NULL || buf == NULL) 1145 goto out; 1146 1147 fd = open(tsname, O_RDONLY); 1148 if (fd == -1) 1149 goto out; 1150 sz = read(fd, tbuf, tsz); 1151 (void) close(fd); 1152 if (sz != tsz) 1153 goto out; 1154 1155 fd = open(sname, O_RDONLY); 1156 if (fd == -1) 1157 goto out; 1158 sz = read(fd, buf, tsz); 1159 (void) close(fd); 1160 if (sz != tsz) 1161 goto out; 1162 1163 /* compare content */ 1164 ret = bcmp(tbuf, buf, tsz); 1165 out: 1166 if (tbuf) 1167 free(tbuf); 1168 if (buf) 1169 free(buf); 1170 return (ret); 1171 } 1172 1173 /* 1174 * patch md.conf file with mddb locations 1175 */ 1176 int 1177 meta_db_patch( 1178 char *sname, /* system file name */ 1179 char *cname, /* mddb.cf file name */ 1180 int patch, /* patching locally */ 1181 md_error_t *ep 1182 ) 1183 { 1184 char *tsname = NULL; 1185 char line[MDDB_BOOTLIST_MAX_LEN]; 1186 FILE *tsfp = NULL; 1187 FILE *mfp = NULL; 1188 int rval = -1; 1189 1190 /* check names */ 1191 if (sname == NULL) { 1192 if (patch) 1193 sname = "md.conf"; 1194 else 1195 sname = "/kernel/drv/md.conf"; 1196 } 1197 if (cname == NULL) 1198 cname = META_DBCONF; 1199 1200 /* 1201 * edit file 1202 */ 1203 if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) { 1204 if (mdissyserror(ep, EROFS)) { 1205 /* 1206 * If we are booted on a read-only root because 1207 * of mddb quorum problems we don't want to emit 1208 * any scary error messages. 1209 */ 1210 mdclrerror(ep); 1211 rval = 0; 1212 } 1213 goto out; 1214 } 1215 1216 if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0, 1217 ep) != 0) 1218 goto out; 1219 1220 /* if file content is identical, skip rename */ 1221 if (filediff(tsname, sname) == 0) { 1222 rval = 0; 1223 goto out; 1224 } 1225 1226 if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) || 1227 (fclose(tsfp) != 0)) { 1228 (void) mdsyserror(ep, errno, tsname); 1229 goto out; 1230 } 1231 1232 tsfp = NULL; 1233 1234 /* 1235 * rename file. If we get a Cross Device error then it 1236 * is because we are in the miniroot. 1237 */ 1238 if (rename(tsname, sname) != 0 && errno != EXDEV) { 1239 (void) mdsyserror(ep, errno, sname); 1240 goto out; 1241 } 1242 1243 if (errno == EXDEV) { 1244 if ((tsfp = fopen(tsname, "r")) == NULL) 1245 goto out; 1246 if ((mfp = fopen(sname, "w+")) == NULL) 1247 goto out; 1248 while (fgets(line, sizeof (line), tsfp) != NULL) { 1249 if (fputs(line, mfp) == NULL) 1250 goto out; 1251 } 1252 (void) fclose(tsfp); 1253 tsfp = NULL; 1254 if (fflush(mfp) != 0) 1255 goto out; 1256 if (fsync(fileno(mfp)) != 0) 1257 goto out; 1258 if (fclose(mfp) != 0) { 1259 mfp = NULL; 1260 goto out; 1261 } 1262 } 1263 1264 Free(tsname); 1265 tsname = NULL; 1266 rval = 0; 1267 1268 /* cleanup, return error */ 1269 out: 1270 if (tsfp != NULL) 1271 (void) fclose(tsfp); 1272 if (tsname != NULL) { 1273 (void) unlink(tsname); 1274 Free(tsname); 1275 } 1276 return (rval); 1277 } 1278 1279 /* 1280 * Add replicas to set. This happens as a result of: 1281 * - metadb [-s set_name] -a 1282 * - metaset -s set_name -a disk 1283 * - metaset -s set_name -d disk (causes a rebalance of mddbs) 1284 * - metaset -s set_name -b 1285 * 1286 * For a local set, this routine is run on the local set host. 1287 * 1288 * For a traditional diskset, this routine is run on the node that 1289 * is running the metaset command. 1290 * 1291 * For a multinode diskset, this routine is run by the node that is 1292 * running the metaset command. If this is the first mddb added to 1293 * the MN diskset, then no communication is made to other nodes via commd 1294 * since the other nodes will be in-sync with respect to the mddbs when 1295 * those other nodes join the set and snarf in the newly created mddb. 1296 * If this is not the first mddb added to the MN diskset, then this 1297 * attach command is sent to all of the nodes using commd. This keeps 1298 * the nodes in-sync. 1299 */ 1300 int 1301 meta_db_attach( 1302 mdsetname_t *sp, 1303 mdnamelist_t *db_nlp, 1304 mdchkopts_t options, 1305 md_timeval32_t *timeval, 1306 int dbcnt, 1307 int dbsize, 1308 char *sysfilename, 1309 md_error_t *ep 1310 ) 1311 { 1312 struct mddb_config c; 1313 mdnamelist_t *nlp; 1314 mdname_t *np; 1315 md_drive_desc *dd = NULL; 1316 md_drive_desc *p; 1317 int i; 1318 int fd; 1319 side_t sideno; 1320 daddr_t blkno; 1321 int replicacount = 0; 1322 int start_svmdaemons = 0; 1323 int rval = 0; 1324 md_error_t status = mdnullerror; 1325 md_set_desc *sd; 1326 int stale_bool = FALSE; 1327 int flags; 1328 int firstmddb = 1; 1329 md_timeval32_t inittime = {0, 0}; 1330 1331 /* 1332 * Error if we don't get some work to do. 1333 */ 1334 if (db_nlp == NULL) 1335 return (mdsyserror(ep, EINVAL, NULL)); 1336 1337 if (mdnamesareunique(db_nlp, ep) != 0) 1338 return (-1); 1339 (void) memset(&c, 0, sizeof (c)); 1340 c.c_id = 0; 1341 c.c_setno = sp->setno; 1342 1343 /* Don't need device id information from this ioctl */ 1344 c.c_locator.l_devid = (uint64_t)0; 1345 c.c_locator.l_devid_flags = 0; 1346 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 1347 if (metaislocalset(sp)) { 1348 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) 1349 mdclrerror(&c.c_mde); 1350 else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) || 1351 (! (options & MDCHK_ALLOW_NODBS))) 1352 return (mdstealerror(ep, &c.c_mde)); 1353 } else { 1354 if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER)) 1355 return (mdstealerror(ep, &c.c_mde)); 1356 } 1357 mdclrerror(&c.c_mde); 1358 } 1359 /* 1360 * Is current set STALE? 1361 */ 1362 if (c.c_flags & MDDB_C_STALE) { 1363 stale_bool = TRUE; 1364 } 1365 1366 assert(db_nlp != NULL); 1367 1368 /* if these are the first replicas then the SVM daemons need to run */ 1369 if (c.c_dbcnt == 0) 1370 start_svmdaemons = 1; 1371 1372 /* 1373 * check to see if we will go over the total possible number 1374 * of data bases 1375 */ 1376 nlp = db_nlp; 1377 while (nlp) { 1378 replicacount += dbcnt; 1379 nlp = nlp->next; 1380 } 1381 1382 if ((replicacount + c.c_dbcnt) > c.c_dbmax) 1383 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 1384 sp->setno, c.c_dbcnt + replicacount, NULL)); 1385 1386 /* 1387 * go through and check to make sure all locations specified 1388 * are legal also pick out driver name; 1389 */ 1390 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1391 diskaddr_t devsize; 1392 1393 np = nlp->namep; 1394 1395 if (! metaislocalset(sp)) { 1396 uint_t partno; 1397 uint_t rep_partno; 1398 mddrivename_t *dnp = np->drivenamep; 1399 1400 /* 1401 * make sure that non-local database replicas 1402 * are always on the replica slice. 1403 */ 1404 if (meta_replicaslice(dnp, 1405 &rep_partno, ep) != 0) 1406 return (-1); 1407 if (metagetvtoc(np, FALSE, &partno, ep) == NULL) 1408 return (-1); 1409 if (partno != rep_partno) 1410 return (mddeverror(ep, MDE_REPCOMP_ONLY, 1411 np->dev, sp->setname)); 1412 } 1413 1414 if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize), 1415 ep)) { 1416 return (-1); 1417 } 1418 1419 if ((devsize = metagetsize(np, ep)) == -1) 1420 return (-1); 1421 1422 if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16)) 1423 return (mdmddberror(ep, MDE_REPLICA_TOOSMALL, 1424 meta_getminor(np->dev), sp->setno, devsize, 1425 np->cname)); 1426 } 1427 1428 /* 1429 * If first disk in set we don't have lb_inittime yet for use as 1430 * mb_setcreatetime so don't go looking for it. WE'll come back 1431 * later and update after the locator block has been created. 1432 * If this isn't the first disk in the set, we have a locator 1433 * block and thus we have lb_inittime. Set mb_setcreatetime to 1434 * lb_inittime. 1435 */ 1436 if (! metaislocalset(sp)) { 1437 if (c.c_dbcnt != 0) { 1438 firstmddb = 0; 1439 inittime = meta_get_lb_inittime(sp, ep); 1440 } 1441 } 1442 1443 /* 1444 * go through and write all master blocks 1445 */ 1446 1447 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1448 np = nlp->namep; 1449 1450 if ((fd = open(np->rname, O_RDWR)) < 0) 1451 return (mdsyserror(ep, errno, np->rname)); 1452 1453 for (i = 0; i < dbcnt; i++) { 1454 if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize, 1455 inittime, ep)) { 1456 (void) close(fd); 1457 return (-1); 1458 } 1459 } 1460 (void) close(fd); 1461 } 1462 1463 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1464 return (-1); 1465 1466 if (! metaislocalset(sp)) { 1467 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1468 if (! mdisok(ep)) 1469 return (-1); 1470 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1471 return (-1); 1472 1473 } 1474 1475 /* 1476 * go through and tell kernel to add them 1477 */ 1478 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1479 mdcinfo_t *cinfo; 1480 1481 np = nlp->namep; 1482 1483 if ((cinfo = metagetcinfo(np, ep)) == NULL) { 1484 rval = -1; 1485 goto out; 1486 } 1487 1488 /* 1489 * If mddb is being added to MN diskset and there already 1490 * exists a valid mddb in the set (which equates to this 1491 * node being an owner of the set) then use rpc.mdcommd 1492 * mechanism to add mddb(s) so that all nodes stay in sync. 1493 * If set is stale, don't log the message since rpc.mdcommd 1494 * can't write the message to the mddb. 1495 * 1496 * Otherwise, just add mddb to this node. 1497 */ 1498 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1499 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1500 md_mn_result_t *resultp = NULL; 1501 md_mn_msg_meta_db_attach_t attach; 1502 int send_rval; 1503 1504 /* 1505 * In a scenario where new replicas had been added on 1506 * the master, and then all of the old replicas failed 1507 * before the slaves had knowledge of the new replicas, 1508 * the slaves are unable to re-parse in the mddb 1509 * from the new replicas since the slaves have no 1510 * knowledge of the new replicas. The following 1511 * algorithm solves this problem: 1512 * - META_DB_ATTACH message generates submsgs 1513 * - BLOCK parse (master) 1514 * - MDDB_ATTACH new replicas 1515 * - UNBLOCK parse (master) causing parse 1516 * information to be sent from master 1517 * to slaves at a higher class than the 1518 * unblock so the parse message will 1519 * reach slaves before unblock message. 1520 */ 1521 attach.msg_l_dev = np->dev; 1522 attach.msg_cnt = dbcnt; 1523 attach.msg_dbsize = dbsize; 1524 (void) strncpy(attach.msg_dname, cinfo->dname, 1525 sizeof (attach.msg_dname)); 1526 (void) splitname(np->bname, &attach.msg_splitname); 1527 attach.msg_options = options; 1528 1529 /* Set devid to NULL until devids are supported */ 1530 attach.msg_devid[0] = NULL; 1531 1532 /* 1533 * If reconfig cycle has been started, this node is 1534 * stuck in in the return step until this command has 1535 * completed. If mdcommd is suspended, ask 1536 * send_message to fail (instead of retrying) 1537 * so that metaset can finish allowing the reconfig 1538 * cycle to proceed. 1539 */ 1540 flags = MD_MSGF_FAIL_ON_SUSPEND; 1541 if (stale_bool == TRUE) 1542 flags |= MD_MSGF_NO_LOG; 1543 send_rval = mdmn_send_message(sp->setno, 1544 MD_MN_MSG_META_DB_ATTACH, 1545 flags, (char *)&attach, 1546 sizeof (md_mn_msg_meta_db_attach_t), 1547 &resultp, ep); 1548 if (send_rval != 0) { 1549 rval = -1; 1550 if (resultp == NULL) 1551 (void) mddserror(ep, 1552 MDE_DS_COMMD_SEND_FAIL, 1553 sp->setno, NULL, NULL, 1554 sp->setname); 1555 else { 1556 (void) mdstealerror(ep, 1557 &(resultp->mmr_ep)); 1558 if (mdisok(ep)) { 1559 (void) mddserror(ep, 1560 MDE_DS_COMMD_SEND_FAIL, 1561 sp->setno, NULL, NULL, 1562 sp->setname); 1563 } 1564 free_result(resultp); 1565 } 1566 goto out; 1567 } 1568 if (resultp) 1569 free_result(resultp); 1570 } else { 1571 /* Adding mddb(s) to just this node */ 1572 for (i = 0; i < dbcnt; i++) { 1573 (void) memset(&c, 0, sizeof (c)); 1574 /* Fill in device/replica info */ 1575 c.c_locator.l_dev = meta_cmpldev(np->dev); 1576 c.c_locator.l_blkno = i * dbsize + 16; 1577 blkno = c.c_locator.l_blkno; 1578 (void) strncpy(c.c_locator.l_driver, 1579 cinfo->dname, 1580 sizeof (c.c_locator.l_driver)); 1581 1582 if (splitname(np->bname, &c.c_devname) == 1583 METASPLIT_LONGDISKNAME && devid_in_use == 1584 FALSE) { 1585 rval = mddeverror(ep, 1586 MDE_DISKNAMETOOLONG, 1587 NODEV64, np->rname); 1588 goto out; 1589 } 1590 1591 c.c_locator.l_mnum = meta_getminor(np->dev); 1592 1593 /* Fill in setno, setname, and sideno */ 1594 c.c_setno = sp->setno; 1595 if (! metaislocalset(sp)) { 1596 if (MD_MNSET_DESC(sd)) { 1597 c.c_multi_node = 1; 1598 } 1599 } 1600 (void) strcpy(c.c_setname, sp->setname); 1601 c.c_sideno = sideno; 1602 1603 /* 1604 * Don't need device id information from this 1605 * ioctl Kernel determines device id from 1606 * dev_t, which is just what this code would do. 1607 */ 1608 c.c_locator.l_devid = (uint64_t)0; 1609 c.c_locator.l_devid_flags = 0; 1610 1611 if (timeval != NULL) 1612 c.c_timestamp = *timeval; 1613 1614 if (setup_med_cfg(sp, &c, 1615 (options & MDCHK_SET_FORCE), ep)) { 1616 rval = -1; 1617 goto out; 1618 } 1619 1620 if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, 1621 NULL) != 0) { 1622 rval = mdstealerror(ep, &c.c_mde); 1623 goto out; 1624 } 1625 /* 1626 * This is either a traditional diskset OR this 1627 * is the first replica added to a MN diskset. 1628 * In either case, set broadcast to NO_BCAST so 1629 * that message won't go through rpc.mdcommd. 1630 * If this is a traditional diskset, the bcast 1631 * flag is ignored since traditional disksets 1632 * don't use the rpc.mdcommd. 1633 */ 1634 if (meta_db_addsidenms(sp, np, blkno, 1635 DB_ADDSIDENMS_NO_BCAST, ep)) 1636 goto out; 1637 } 1638 } 1639 if (! metaislocalset(sp)) { 1640 /* update the dbcnt and size in dd */ 1641 for (p = dd; p != NULL; p = p->dd_next) 1642 if (p->dd_dnp == np->drivenamep) { 1643 p->dd_dbcnt = dbcnt; 1644 p->dd_dbsize = dbsize; 1645 break; 1646 } 1647 } 1648 1649 /* 1650 * If this was the first addition of disks to the 1651 * diskset you now need to update the mb_setcreatetime 1652 * which needed lb_inittime which wasn't there until now. 1653 */ 1654 if (firstmddb) { 1655 if (meta_update_mb(sp, dd, ep) != 0) { 1656 return (-1); 1657 } 1658 } 1659 (void) close(fd); 1660 } 1661 1662 out: 1663 if (metaislocalset(sp)) { 1664 1665 /* everything looks fine. Start mdmonitord */ 1666 if (rval == 0 && start_svmdaemons == 1) { 1667 if (meta_smf_enable(META_SMF_CORE, &status) == -1) { 1668 mde_perror(&status, ""); 1669 mdclrerror(&status); 1670 } 1671 } 1672 1673 if (buildconf(sp, &status)) { 1674 /* Don't mask any previous errors */ 1675 if (rval == 0) 1676 rval = mdstealerror(ep, &status); 1677 return (rval); 1678 } 1679 1680 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 1681 /* Don't mask any previous errors */ 1682 if (rval == 0) 1683 rval = mdstealerror(ep, &status); 1684 } 1685 } else { 1686 if (update_dbinfo_on_drives(sp, dd, 1687 (options & MDCHK_SET_LOCKED), 1688 (options & MDCHK_SET_FORCE), 1689 &status)) { 1690 /* Don't mask any previous errors */ 1691 if (rval == 0) 1692 rval = mdstealerror(ep, &status); 1693 else 1694 mdclrerror(&status); 1695 } 1696 metafreedrivedesc(&dd); 1697 } 1698 /* 1699 * For MN disksets that already had already had nodes joined 1700 * before the attach of this mddb(s), the name invalidation is 1701 * done by the commd handler routine. Otherwise, if this 1702 * is the first attach of a MN diskset mddb, the invalidation 1703 * must be done here since the first attach cannot be sent 1704 * via the commd since there are no nodes joined to the set yet. 1705 */ 1706 if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) || 1707 (MD_MNSET_DESC(sd) && 1708 (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) { 1709 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 1710 meta_invalidate_name(nlp->namep); 1711 } 1712 } 1713 return (rval); 1714 } 1715 1716 /* 1717 * deletelist_length 1718 * 1719 * return the number of slices that have been specified for deletion 1720 * on the metadb command line. This does not calculate the number 1721 * of replicas because there may be multiple replicas per slice. 1722 */ 1723 static int 1724 deletelist_length(mdnamelist_t *db_nlp) 1725 { 1726 1727 mdnamelist_t *nlp; 1728 int list_length = 0; 1729 1730 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1731 list_length++; 1732 } 1733 1734 return (list_length); 1735 } 1736 1737 static int 1738 in_deletelist(char *devname, mdnamelist_t *db_nlp) 1739 { 1740 1741 mdnamelist_t *nlp; 1742 mdname_t *np; 1743 int index = 0; 1744 1745 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1746 np = nlp->namep; 1747 1748 if (strcmp(devname, np->bname) == 0) 1749 return (index); 1750 index++; 1751 } 1752 1753 return (-1); 1754 } 1755 1756 /* 1757 * Delete replicas from set. This happens as a result of: 1758 * - metadb [-s set_name] -d 1759 * - metaset -s set_name -a disk (causes a rebalance of mddbs) 1760 * - metaset -s set_name -d disk 1761 * - metaset -s set_name -b 1762 * 1763 * For a local set, this routine is run on the local set host. 1764 * 1765 * For a traditional diskset, this routine is run on the node that 1766 * is running the metaset command. 1767 * 1768 * For a multinode diskset, this routine is run by the node that is 1769 * running the metaset command. This detach routine is sent to all 1770 * of the joined nodes in the diskset using commd. This keeps 1771 * the nodes in-sync. 1772 */ 1773 int 1774 meta_db_detach( 1775 mdsetname_t *sp, 1776 mdnamelist_t *db_nlp, 1777 mdforceopts_t force_option, 1778 char *sysfilename, 1779 md_error_t *ep 1780 ) 1781 { 1782 struct mddb_config c; 1783 mdnamelist_t *nlp; 1784 mdname_t *np; 1785 md_drive_desc *dd = NULL; 1786 md_drive_desc *p; 1787 int replicacount; 1788 int replica_delete_count; 1789 int nr_replica_slices; 1790 int i; 1791 int stop_svmdaemons = 0; 1792 int rval = 0; 1793 int index; 1794 int valid_replicas_nottodelete = 0; 1795 int invalid_replicas_nottodelete = 0; 1796 int invalid_replicas_todelete = 0; 1797 int errored = 0; 1798 int *tag_array; 1799 int fd = -1; 1800 md_error_t status = mdnullerror; 1801 md_set_desc *sd; 1802 int stale_bool = FALSE; 1803 int flags; 1804 1805 /* 1806 * Error if we don't get some work to do. 1807 */ 1808 if (db_nlp == NULL) 1809 return (mdsyserror(ep, EINVAL, NULL)); 1810 1811 if (mdnamesareunique(db_nlp, ep) != 0) 1812 return (-1); 1813 1814 (void) memset(&c, 0, sizeof (c)); 1815 c.c_id = 0; 1816 c.c_setno = sp->setno; 1817 1818 /* Don't need device id information from this ioctl */ 1819 c.c_locator.l_devid = (uint64_t)0; 1820 c.c_locator.l_devid_flags = 0; 1821 1822 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1823 return (mdstealerror(ep, &c.c_mde)); 1824 1825 /* 1826 * Is current set STALE? 1827 */ 1828 if (c.c_flags & MDDB_C_STALE) { 1829 stale_bool = TRUE; 1830 } 1831 1832 replicacount = c.c_dbcnt; 1833 1834 assert(db_nlp != NULL); 1835 1836 /* 1837 * go through and gather how many data bases are on each 1838 * device specified. 1839 */ 1840 1841 nr_replica_slices = deletelist_length(db_nlp); 1842 tag_array = (int *)calloc(nr_replica_slices, sizeof (int)); 1843 1844 replica_delete_count = 0; 1845 for (i = 0; i < replicacount; i++) { 1846 char *devname; 1847 int found = 0; 1848 1849 c.c_id = i; 1850 1851 /* Don't need device id information from this ioctl */ 1852 c.c_locator.l_devid = (uint64_t)0; 1853 c.c_locator.l_devid_flags = 0; 1854 1855 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1856 return (mdstealerror(ep, &c.c_mde)); 1857 1858 devname = splicename(&c.c_devname); 1859 1860 if (strstr(devname, META_LONGDISKNAME_STR) != NULL) { 1861 Free(devname); 1862 devname = getlongname(&c, ep); 1863 if (devname == NULL) { 1864 return (-1); 1865 } 1866 } 1867 1868 if ((index = in_deletelist(devname, db_nlp)) != -1) { 1869 found = 1; 1870 tag_array[index] = 1; 1871 replica_delete_count++; 1872 } 1873 1874 errored = c.c_locator.l_flags & (MDDB_F_EREAD | 1875 MDDB_F_EWRITE | MDDB_F_TOOSMALL | MDDB_F_EFMT | 1876 MDDB_F_EDATA | MDDB_F_EMASTER); 1877 1878 /* 1879 * There are four combinations of "errored" and "found" 1880 * and they are used to find the number of 1881 * (a) valid/invalid replicas that are not in the delete 1882 * list and are available in the system. 1883 * (b) valid/invalid replicas that are to be deleted. 1884 */ 1885 1886 if (errored && !found) /* errored and !found */ 1887 invalid_replicas_nottodelete++; 1888 else if (!found) /* !errored and !found */ 1889 valid_replicas_nottodelete++; 1890 else if (errored) /* errored and found */ 1891 invalid_replicas_todelete++; 1892 /* 1893 * else it is !errored and found. This means 1894 * valid_replicas_todelete++; But this variable will not 1895 * be used anywhere 1896 */ 1897 1898 Free(devname); 1899 } 1900 1901 index = 0; 1902 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1903 np = nlp->namep; 1904 if (tag_array[index++] != 1) { 1905 Free(tag_array); 1906 return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname)); 1907 } 1908 } 1909 1910 Free(tag_array); 1911 1912 1913 /* if all replicas are deleted stop mdmonitord */ 1914 if ((replicacount - replica_delete_count) == 0) 1915 stop_svmdaemons = 1; 1916 1917 if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) { 1918 if (force_option & MDFORCE_NONE) 1919 return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname)); 1920 if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS)) 1921 return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname)); 1922 } 1923 1924 /* 1925 * The following algorithms are followed to check for deletion: 1926 * (a) If the delete list(db_nlp) has all invalid replicas and no valid 1927 * replicas, then deletion should be allowed. 1928 * (b) Deletion should be allowed only if valid replicas that are "not" 1929 * to be deleted is always greater than the invalid replicas that 1930 * are "not" to be deleted. 1931 * (c) If the user uses -f option, then deletion should be allowed. 1932 */ 1933 1934 if ((invalid_replicas_todelete != replica_delete_count) && 1935 (invalid_replicas_nottodelete > valid_replicas_nottodelete) && 1936 (force_option != MDFORCE_LOCAL)) 1937 return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname)); 1938 1939 /* 1940 * go through and tell kernel to delete them 1941 */ 1942 1943 /* Don't need device id information from this ioctl */ 1944 c.c_locator.l_devid = (uint64_t)0; 1945 c.c_locator.l_devid_flags = 0; 1946 1947 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1948 return (mdstealerror(ep, &c.c_mde)); 1949 1950 if (! metaislocalset(sp)) { 1951 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1952 if (! mdisok(ep)) 1953 return (-1); 1954 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1955 return (-1); 1956 } 1957 1958 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1959 np = nlp->namep; 1960 1961 /* 1962 * If mddb is being deleted from MN diskset and node is 1963 * an owner of the diskset then use rpc.mdcommd 1964 * mechanism to add mddb(s) so that all nodes stay in sync. 1965 * If set is stale, don't log the message since rpc.mdcommd 1966 * can't write the message to the mddb. 1967 * 1968 * When mddbs are first being added to set, a detach can 1969 * be called before any node has joined the diskset, so 1970 * must check to see if node is an owner of the diskset. 1971 * 1972 * Otherwise, just delete mddb from this node. 1973 */ 1974 1975 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1976 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1977 md_mn_result_t *resultp; 1978 md_mn_msg_meta_db_detach_t detach; 1979 int send_rval; 1980 1981 /* 1982 * The following algorithm is used to detach replicas. 1983 * - META_DB_DETACH message generates submsgs 1984 * - BLOCK parse (master) 1985 * - MDDB_DETACH replicas 1986 * - UNBLOCK parse (master) causing parse 1987 * information to be sent from master 1988 * to slaves at a higher class than the 1989 * unblock so the parse message will 1990 * reach slaves before unblock message. 1991 */ 1992 (void) splitname(np->bname, &detach.msg_splitname); 1993 1994 /* Set devid to NULL until devids are supported */ 1995 detach.msg_devid[0] = NULL; 1996 1997 /* 1998 * If reconfig cycle has been started, this node is 1999 * stuck in in the return step until this command has 2000 * completed. If mdcommd is suspended, ask 2001 * send_message to fail (instead of retrying) 2002 * so that metaset can finish allowing the reconfig 2003 * cycle to proceed. 2004 */ 2005 flags = MD_MSGF_FAIL_ON_SUSPEND; 2006 if (stale_bool == TRUE) 2007 flags |= MD_MSGF_NO_LOG; 2008 send_rval = mdmn_send_message(sp->setno, 2009 MD_MN_MSG_META_DB_DETACH, 2010 flags, (char *)&detach, 2011 sizeof (md_mn_msg_meta_db_detach_t), 2012 &resultp, ep); 2013 if (send_rval != 0) { 2014 rval = -1; 2015 if (resultp == NULL) 2016 (void) mddserror(ep, 2017 MDE_DS_COMMD_SEND_FAIL, 2018 sp->setno, NULL, NULL, 2019 sp->setname); 2020 else { 2021 (void) mdstealerror(ep, 2022 &(resultp->mmr_ep)); 2023 if (mdisok(ep)) { 2024 (void) mddserror(ep, 2025 MDE_DS_COMMD_SEND_FAIL, 2026 sp->setno, NULL, NULL, 2027 sp->setname); 2028 } 2029 free_result(resultp); 2030 } 2031 goto out; 2032 } 2033 if (resultp) 2034 free_result(resultp); 2035 } else { 2036 i = 0; 2037 while (i < c.c_dbcnt) { 2038 char *devname; 2039 2040 c.c_id = i; 2041 2042 /* Don't need devid info from this ioctl */ 2043 c.c_locator.l_devid = (uint64_t)0; 2044 c.c_locator.l_devid_flags = 0; 2045 2046 if (metaioctl(MD_DB_GETDEV, &c, 2047 &c.c_mde, NULL)) { 2048 rval = mdstealerror(ep, &c.c_mde); 2049 goto out; 2050 } 2051 2052 devname = splicename(&c.c_devname); 2053 2054 if (strstr(devname, META_LONGDISKNAME_STR) 2055 != NULL) { 2056 Free(devname); 2057 devname = getlongname(&c, ep); 2058 if (devname == NULL) { 2059 return (-1); 2060 } 2061 } 2062 2063 if (strcmp(devname, np->bname) != 0) { 2064 Free(devname); 2065 i++; 2066 continue; 2067 } 2068 Free(devname); 2069 2070 /* Don't need devid info from this ioctl */ 2071 c.c_locator.l_devid = (uint64_t)0; 2072 c.c_locator.l_devid_flags = 0; 2073 2074 if (metaioctl(MD_DB_DELDEV, &c, 2075 &c.c_mde, NULL) != 0) { 2076 rval = mdstealerror(ep, &c.c_mde); 2077 goto out; 2078 } 2079 2080 /* Not incrementing "i" intentionally */ 2081 } 2082 } 2083 if (! metaislocalset(sp)) { 2084 /* update the dbcnt and size in dd */ 2085 for (p = dd; p != NULL; p = p->dd_next) { 2086 if (p->dd_dnp == np->drivenamep) { 2087 p->dd_dbcnt = 0; 2088 p->dd_dbsize = 0; 2089 break; 2090 } 2091 } 2092 2093 /* 2094 * Slam a dummy master block and make it self 2095 * identifying 2096 */ 2097 if ((fd = open(np->rname, O_RDWR)) >= 0) { 2098 meta_mkdummymaster(sp, fd, 16); 2099 (void) close(fd); 2100 } 2101 } 2102 } 2103 out: 2104 if (metaislocalset(sp)) { 2105 /* 2106 * Stop all the daemons if there are 2107 * no more replicas so that the module can be 2108 * unloaded. 2109 */ 2110 if (rval == 0 && stop_svmdaemons == 1) { 2111 char buf[MAXPATHLEN]; 2112 int i; 2113 2114 for (i = 0; i < DAEMON_COUNT; i++) { 2115 (void) snprintf(buf, MAXPATHLEN, 2116 "/usr/bin/pkill -%s -x %s", 2117 svmd_kill_list[i].svmd_kill_val, 2118 svmd_kill_list[i].svmd_name); 2119 if (pclose(popen(buf, "w")) == -1) 2120 md_perror(buf); 2121 } 2122 2123 if (meta_smf_disable(META_SMF_ALL, &status) == -1) { 2124 mde_perror(&status, ""); 2125 mdclrerror(&status); 2126 } 2127 } 2128 if (buildconf(sp, &status)) { 2129 /* Don't mask any previous errors */ 2130 if (rval == 0) 2131 rval = mdstealerror(ep, &status); 2132 else 2133 mdclrerror(&status); 2134 return (rval); 2135 } 2136 2137 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 2138 /* Don't mask any previous errors */ 2139 if (rval == 0) 2140 rval = mdstealerror(ep, &status); 2141 else 2142 mdclrerror(&status); 2143 } 2144 } else { 2145 if (update_dbinfo_on_drives(sp, dd, 2146 (force_option & MDFORCE_SET_LOCKED), 2147 ((force_option & MDFORCE_LOCAL) | 2148 (force_option & MDFORCE_DS)), &status)) { 2149 /* Don't mask any previous errors */ 2150 if (rval == 0) 2151 rval = mdstealerror(ep, &status); 2152 else 2153 mdclrerror(&status); 2154 } 2155 metafreedrivedesc(&dd); 2156 } 2157 if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) { 2158 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 2159 meta_invalidate_name(nlp->namep); 2160 } 2161 } 2162 return (rval); 2163 } 2164 2165 static md_replica_t * 2166 metareplicaname( 2167 mdsetname_t *sp, 2168 int flags, 2169 struct mddb_config *c, 2170 md_error_t *ep 2171 ) 2172 { 2173 md_replica_t *rp; 2174 char *devname; 2175 size_t sz; 2176 devid_nmlist_t *disklist = NULL; 2177 char *devid_str; 2178 2179 /* allocate replicaname */ 2180 rp = Zalloc(sizeof (*rp)); 2181 2182 /* get device name */ 2183 devname = splicename(&c->c_devname); 2184 2185 /* 2186 * Check if the device has a long name (>40 characters) and 2187 * if so then we have to use devids to get the device name. 2188 * If this cannot be done then we have to fail the request. 2189 */ 2190 if (strstr(devname, META_LONGDISKNAME_STR) != NULL) { 2191 if (c->c_locator.l_devid != NULL) { 2192 if (meta_deviceid_to_nmlist("/dev/dsk", 2193 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, 2194 c->c_locator.l_minor_name, &disklist) != 0) { 2195 devid_str = devid_str_encode( 2196 (ddi_devid_t)(uintptr_t) 2197 c->c_locator.l_devid, NULL); 2198 (void) mderror(ep, MDE_MISSING_DEVID_DISK, ""); 2199 mderrorextra(ep, devid_str); 2200 if (devid_str != NULL) 2201 devid_str_free(devid_str); 2202 Free(rp); 2203 Free(devname); 2204 return (NULL); 2205 } 2206 } else { 2207 (void) mderror(ep, MDE_NODEVID, ""); 2208 Free(rp); 2209 Free(devname); 2210 return (NULL); 2211 } 2212 Free(devname); 2213 devname = disklist[0].devname; 2214 } 2215 2216 if (flags & PRINT_FAST) { 2217 if ((rp->r_namep = metaname_fast(&sp, devname, 2218 LOGICAL_DEVICE, ep)) == NULL) { 2219 Free(devname); 2220 Free(rp); 2221 return (NULL); 2222 } 2223 } else { 2224 if ((rp->r_namep = metaname(&sp, devname, 2225 LOGICAL_DEVICE, ep)) == NULL) { 2226 Free(devname); 2227 Free(rp); 2228 return (NULL); 2229 } 2230 } 2231 Free(devname); 2232 2233 /* make sure it's OK */ 2234 if ((! (flags & MD_BASICNAME_OK)) && 2235 (metachkcomp(rp->r_namep, ep) != 0)) { 2236 Free(rp); 2237 return (NULL); 2238 } 2239 2240 rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR; 2241 rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR; 2242 rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID; 2243 if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) { 2244 sz = devid_sizeof((ddi_devid_t)(uintptr_t) 2245 (c->c_locator.l_devid)); 2246 if ((rp->r_devid = (ddi_devid_t)malloc(sz)) == 2247 (ddi_devid_t)NULL) { 2248 Free(rp); 2249 return (NULL); 2250 } 2251 (void) memcpy((void *)rp->r_devid, 2252 (void *)(uintptr_t)c->c_locator.l_devid, sz); 2253 (void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name); 2254 rp->r_flags &= ~MDDB_F_NODEVID; 2255 /* Overwrite dev derived from name with dev from devid */ 2256 rp->r_namep->dev = meta_expldev(c->c_locator.l_dev); 2257 } 2258 (void) strcpy(rp->r_driver_name, c->c_locator.l_driver); 2259 2260 rp->r_blkno = c->c_locator.l_blkno; 2261 if (c->c_dbend != 0) 2262 rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1; 2263 2264 /* return replica */ 2265 return (rp); 2266 } 2267 2268 /* 2269 * free replica list 2270 */ 2271 void 2272 metafreereplicalist( 2273 md_replicalist_t *rlp 2274 ) 2275 { 2276 md_replicalist_t *rl = NULL; 2277 2278 for (/* void */; (rlp != NULL); rlp = rl) { 2279 rl = rlp->rl_next; 2280 if (rlp->rl_repp->r_devid != (ddi_devid_t)0) { 2281 free(rlp->rl_repp->r_devid); 2282 } 2283 Free(rlp->rl_repp); 2284 Free(rlp); 2285 } 2286 } 2287 2288 /* 2289 * return list of all replicas in set 2290 */ 2291 int 2292 metareplicalist( 2293 mdsetname_t *sp, 2294 int flags, 2295 md_replicalist_t **rlpp, 2296 md_error_t *ep 2297 ) 2298 { 2299 md_replicalist_t **tail = rlpp; 2300 int count = 0; 2301 struct mddb_config c; 2302 int i; 2303 char *devid; 2304 2305 /* for each replica */ 2306 i = 0; 2307 do { 2308 md_replica_t *rp; 2309 2310 /* get next replica */ 2311 (void) memset(&c, 0, sizeof (c)); 2312 c.c_id = i; 2313 c.c_setno = sp->setno; 2314 2315 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 2316 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2317 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2318 mdclrerror(&c.c_mde); 2319 break; /* handle none at all */ 2320 } 2321 (void) mdstealerror(ep, &c.c_mde); 2322 goto out; 2323 } 2324 2325 if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) { 2326 if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) { 2327 (void) mdsyserror(ep, ENOMEM, META_DBCONF); 2328 goto out; 2329 } 2330 c.c_locator.l_devid = (uintptr_t)devid; 2331 /* 2332 * Turn on space and sz flags since 'sz' amount of 2333 * space has been alloc'd. 2334 */ 2335 c.c_locator.l_devid_flags = 2336 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2337 } 2338 2339 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2340 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2341 mdclrerror(&c.c_mde); 2342 break; /* handle none at all */ 2343 } 2344 (void) mdstealerror(ep, &c.c_mde); 2345 goto out; 2346 } 2347 2348 /* 2349 * Paranoid check - shouldn't happen, but is left as 2350 * a place holder for changes that will be needed after 2351 * dynamic reconfiguration changes are added to SVM (to 2352 * support movement of disks at any point in time). 2353 */ 2354 if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { 2355 (void) fprintf(stderr, 2356 dgettext(TEXT_DOMAIN, 2357 "Error: Relocation Information " 2358 "(drvnm=%s, mnum=0x%lx) \n" 2359 "relocation information size changed - \n" 2360 "rerun command\n"), 2361 c.c_locator.l_driver, c.c_locator.l_mnum); 2362 (void) mderror(ep, MDE_DEVID_TOOBIG, NULL); 2363 goto out; 2364 } 2365 2366 if (c.c_dbcnt == 0) 2367 break; /* handle none at all */ 2368 2369 /* get info */ 2370 if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL) 2371 goto out; 2372 2373 /* append to list */ 2374 *tail = Zalloc(sizeof (**tail)); 2375 (*tail)->rl_repp = rp; 2376 tail = &(*tail)->rl_next; 2377 ++count; 2378 2379 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2380 free(devid); 2381 c.c_locator.l_devid_flags = 0; 2382 } 2383 2384 } while (++i < c.c_dbcnt); 2385 2386 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2387 free(devid); 2388 } 2389 2390 /* return count */ 2391 return (count); 2392 2393 /* cleanup, return error */ 2394 out: 2395 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2396 free(devid); 2397 } 2398 metafreereplicalist(*rlpp); 2399 *rlpp = NULL; 2400 return (-1); 2401 } 2402 2403 /* 2404 * meta_sync_db_locations - get list of replicas from kernel and write 2405 * out to mddb.cf and md.conf. 'Syncs up' the replica list in 2406 * the kernel with the replica list in the conf files. 2407 * 2408 */ 2409 void 2410 meta_sync_db_locations( 2411 mdsetname_t *sp, 2412 md_error_t *ep 2413 ) 2414 { 2415 char *sname = 0; /* system file name */ 2416 char *cname = 0; /* config file name */ 2417 2418 if (!metaislocalset(sp)) 2419 return; 2420 2421 /* Updates backup of configuration file (aka mddb.cf) */ 2422 if (buildconf(sp, ep) != 0) 2423 return; 2424 2425 /* Updates system configuration file (aka md.conf) */ 2426 (void) meta_db_patch(sname, cname, 0, ep); 2427 } 2428 2429 /* 2430 * setup_db_locations - parse the mddb.cf file and 2431 * tells the driver which db locations to use. 2432 */ 2433 int 2434 meta_setup_db_locations( 2435 md_error_t *ep 2436 ) 2437 { 2438 mddb_config_t c; 2439 FILE *fp; 2440 char inbuff[1024]; 2441 char *buff; 2442 uint_t i; 2443 size_t sz; 2444 int rval = 0; 2445 char *devidp; 2446 uint_t devid_size; 2447 char *minor_name = NULL; 2448 ddi_devid_t devid_decode; 2449 int checksum; 2450 2451 /* do mddb.cf file */ 2452 (void) memset(&c, '\0', sizeof (c)); 2453 if ((fp = fopen(META_DBCONF, "r")) == NULL) { 2454 if (errno != ENOENT) 2455 return (mdsyserror(ep, errno, META_DBCONF)); 2456 } 2457 while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1), 2458 fp)) != NULL)) { 2459 2460 /* ignore comments */ 2461 if (*buff == '#') 2462 continue; 2463 2464 /* parse locator */ 2465 (void) memset(&c, 0, sizeof (c)); 2466 c.c_setno = MD_LOCAL_SET; 2467 i = strcspn(buff, " \t"); 2468 if (i > sizeof (c.c_locator.l_driver)) 2469 i = sizeof (c.c_locator.l_driver); 2470 (void) strncpy(c.c_locator.l_driver, buff, i); 2471 buff += i; 2472 c.c_locator.l_dev = 2473 makedev((major_t)0, (minor_t)strtol(buff, &buff, 10)); 2474 c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10); 2475 c.c_locator.l_mnum = minor(c.c_locator.l_dev); 2476 2477 /* parse out devid */ 2478 while (isspace((int)(*buff))) 2479 buff += 1; 2480 i = strcspn(buff, " \t"); 2481 if ((devidp = (char *)malloc(i+1)) == NULL) 2482 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2483 2484 (void) strncpy(devidp, buff, i); 2485 devidp[i] = '\0'; 2486 if (devid_str_decode(devidp, &devid_decode, 2487 &minor_name) == -1) { 2488 free(devidp); 2489 continue; 2490 } 2491 2492 /* Conf file must have minor name associated with devid */ 2493 if (minor_name == NULL) { 2494 free(devidp); 2495 devid_free(devid_decode); 2496 continue; 2497 } 2498 2499 sz = devid_sizeof(devid_decode); 2500 /* Copy to devid size buffer that ioctl expects */ 2501 if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) { 2502 devid_free(devid_decode); 2503 free(minor_name); 2504 free(devidp); 2505 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2506 } 2507 2508 (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, 2509 (void *)devid_decode, sz); 2510 2511 devid_free(devid_decode); 2512 2513 if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) { 2514 free(minor_name); 2515 free(devidp); 2516 free((void *)(uintptr_t)c.c_locator.l_devid); 2517 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2518 } 2519 (void) strcpy(c.c_locator.l_minor_name, minor_name); 2520 free(minor_name); 2521 c.c_locator.l_devid_flags = MDDB_DEVID_VALID | 2522 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2523 c.c_locator.l_devid_sz = sz; 2524 2525 devid_size = strlen(devidp); 2526 buff += devid_size; 2527 2528 checksum = strtol(buff, &buff, 10); 2529 for (i = 0; c.c_locator.l_driver[i] != 0; i++) 2530 checksum += c.c_locator.l_driver[i]; 2531 for (i = 0; i < devid_size; i++) { 2532 checksum += devidp[i]; 2533 } 2534 free(devidp); 2535 2536 checksum += minor(c.c_locator.l_dev); 2537 checksum += c.c_locator.l_blkno; 2538 if (checksum != 42) { 2539 /* overwritten later for more serious problems */ 2540 rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF); 2541 free((void *)(uintptr_t)c.c_locator.l_devid); 2542 continue; 2543 } 2544 c.c_locator.l_flags = 0; 2545 2546 /* use db location */ 2547 if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { 2548 free((void *)(uintptr_t)c.c_locator.l_devid); 2549 return (mdstealerror(ep, &c.c_mde)); 2550 } 2551 2552 /* free up devid if in use */ 2553 free((void *)(uintptr_t)c.c_locator.l_devid); 2554 c.c_locator.l_devid = (uint64_t)0; 2555 c.c_locator.l_devid_flags = 0; 2556 } 2557 if ((fp) && (fclose(fp) != 0)) 2558 return (mdsyserror(ep, errno, META_DBCONF)); 2559 2560 /* check for stale database */ 2561 (void) memset((char *)&c, 0, sizeof (struct mddb_config)); 2562 c.c_id = 0; 2563 c.c_setno = MD_LOCAL_SET; 2564 2565 /* 2566 * While we do not need the devid here we may need to 2567 * know if devid's are being used by the kernel for 2568 * the replicas. This is because under some circumstances 2569 * we can only manipulate the SVM configuration if the 2570 * kernel is using devid's. 2571 */ 2572 c.c_locator.l_devid = (uint64_t)0; 2573 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 2574 c.c_locator.l_devid_sz = 0; 2575 2576 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2577 if (! mdismddberror(&c.c_mde, MDE_DB_INVALID)) 2578 return (mdstealerror(ep, &c.c_mde)); 2579 mdclrerror(&c.c_mde); 2580 } 2581 2582 if (c.c_flags & MDDB_C_STALE) 2583 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET, 2584 0, NULL)); 2585 2586 if (c.c_locator.l_devid_sz != 0) { 2587 /* 2588 * Devid's are being used to track the replicas because 2589 * there is space for a devid. 2590 */ 2591 devid_in_use = TRUE; 2592 } 2593 2594 /* success */ 2595 return (rval); 2596 } 2597 2598 /* 2599 * meta_db_minreplica - returns the minimum size replica currently in use. 2600 */ 2601 daddr_t 2602 meta_db_minreplica( 2603 mdsetname_t *sp, 2604 md_error_t *ep 2605 ) 2606 { 2607 md_replica_t *r; 2608 md_replicalist_t *rl, *rlp = NULL; 2609 daddr_t nblks = 0; 2610 2611 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) 2612 return (-1); 2613 2614 if (rlp == NULL) 2615 return (-1); 2616 2617 /* find the smallest existing replica */ 2618 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 2619 r = rl->rl_repp; 2620 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 2621 } 2622 2623 metafreereplicalist(rlp); 2624 return (nblks); 2625 } 2626 2627 /* 2628 * meta_get_replica_names 2629 * returns an mdnamelist_t of replica slices 2630 */ 2631 /*ARGSUSED*/ 2632 int 2633 meta_get_replica_names( 2634 mdsetname_t *sp, 2635 mdnamelist_t **nlpp, 2636 int options, 2637 md_error_t *ep 2638 ) 2639 { 2640 md_replicalist_t *rlp = NULL; 2641 md_replicalist_t *rl; 2642 mdnamelist_t **tailpp = nlpp; 2643 int cnt = 0; 2644 2645 assert(nlpp != NULL); 2646 2647 if (!metaislocalset(sp)) 2648 goto out; 2649 2650 /* get replicas */ 2651 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { 2652 cnt = -1; 2653 goto out; 2654 } 2655 2656 /* build name list */ 2657 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 2658 /* 2659 * Add the name struct to the end of the 2660 * namelist but keep a pointer to the last 2661 * element so that we don't incur the overhead 2662 * of traversing the list each time 2663 */ 2664 tailpp = meta_namelist_append_wrapper( 2665 tailpp, rl->rl_repp->r_namep); 2666 ++cnt; 2667 } 2668 2669 /* cleanup, return count or error */ 2670 out: 2671 metafreereplicalist(rlp); 2672 return (cnt); 2673 } 2674