1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * Just in case we're not in a build environment, make sure that 29 * TEXT_DOMAIN gets set to something. 30 */ 31 #if !defined(TEXT_DOMAIN) 32 #define TEXT_DOMAIN "SYS_TEST" 33 #endif 34 35 /* 36 * Metadevice database interfaces. 37 */ 38 39 #define MDDB 40 41 #include <meta.h> 42 #include <sys/lvm/md_mddb.h> 43 #include <sys/lvm/md_crc.h> 44 #include <sys/lvm/mdio.h> 45 #include <string.h> 46 #include <strings.h> 47 #include <ctype.h> 48 49 struct svm_daemon { 50 char *svmd_name; 51 char *svmd_kill_val; 52 }; 53 54 /* 55 * This is a list of the daemons that are not stopped by the SVM smf(5) 56 * services. The mdmonitord is started via svc:/system/mdmonitor:default 57 * but no contract(4) is constructed and so it is not stopped by smf(5). 58 */ 59 struct svm_daemon svmd_kill_list[] = { 60 {"mdmonitord", "HUP"}, 61 {"mddoors", "KILL"}, 62 }; 63 64 #define DAEMON_COUNT (sizeof (svmd_kill_list)/ sizeof (struct svm_daemon)) 65 66 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep); 67 68 /* 69 * Are the locator blocks for the replicas using devids 70 */ 71 static int devid_in_use = FALSE; 72 73 static char * 74 getlongname( 75 struct mddb_config *c, 76 md_error_t *ep 77 ) 78 { 79 char *diskname = NULL; 80 char *devid_str; 81 devid_nmlist_t *disklist = NULL; 82 83 c->c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 84 if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) { 85 (void) mdstealerror(ep, &c->c_mde); 86 return (NULL); 87 } 88 89 if (c->c_locator.l_devid_flags & MDDB_DEVID_SZ) { 90 c->c_locator.l_devid = (uintptr_t) 91 Malloc(c->c_locator.l_devid_sz); 92 c->c_locator.l_devid_flags = 93 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 94 } else { 95 (void) mderror(ep, MDE_NODEVID, ""); 96 goto out; 97 } 98 99 if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) { 100 (void) mdstealerror(ep, &c->c_mde); 101 goto out; 102 } 103 104 if (c->c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { 105 (void) mderror(ep, MDE_NODEVID, ""); 106 goto out; 107 } 108 109 if (metaioctl(MD_DB_GETDEV, c, &c->c_mde, NULL) != 0) { 110 (void) mdstealerror(ep, &c->c_mde); 111 goto out; 112 } 113 114 if (c->c_locator.l_devid != NULL) { 115 if (meta_deviceid_to_nmlist("/dev/dsk", 116 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, 117 c->c_locator.l_minor_name, &disklist) != 0) { 118 devid_str = devid_str_encode( 119 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, NULL); 120 (void) mderror(ep, MDE_MISSING_DEVID_DISK, ""); 121 mderrorextra(ep, devid_str); 122 if (devid_str != NULL) 123 devid_str_free(devid_str); 124 goto out; 125 } 126 diskname = Strdup(disklist[0].devname); 127 } 128 129 out: 130 if (disklist != NULL) 131 devid_free_nmlist(disklist); 132 133 if (c->c_locator.l_devid != NULL) 134 Free((void *)(uintptr_t)c->c_locator.l_devid); 135 136 return (diskname); 137 } 138 139 /* 140 * meta_get_lb_inittime sends a request for the lb_inittime to the kernel 141 */ 142 md_timeval32_t 143 meta_get_lb_inittime( 144 mdsetname_t *sp, 145 md_error_t *ep 146 ) 147 { 148 mddb_config_t c; 149 150 (void) memset(&c, 0, sizeof (c)); 151 152 /* Fill in setno, setname, and sideno */ 153 c.c_setno = sp->setno; 154 155 if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) { 156 (void) mdstealerror(ep, &c.c_mde); 157 } 158 159 return (c.c_timestamp); 160 } 161 162 /* 163 * mkmasterblks writes out the master blocks of the mddb to the replica. 164 * 165 * In a MN diskset, this is called by the node that is adding this replica 166 * to the diskset. 167 */ 168 169 #define MDDB_VERIFY_SIZE 8192 170 171 static int 172 mkmasterblks( 173 mdsetname_t *sp, 174 mdname_t *np, 175 int fd, 176 daddr_t firstblk, 177 int dbsize, 178 md_timeval32_t inittime, 179 md_error_t *ep 180 ) 181 { 182 int consecutive; 183 md_timeval32_t tp; 184 struct mddb_mb *mb; 185 char *buffer; 186 int iosize; 187 md_set_desc *sd; 188 int mn_set = 0; 189 daddr_t startblk; 190 int cnt; 191 ddi_devid_t devid; 192 193 if (! metaislocalset(sp)) { 194 if ((sd = metaget_setdesc(sp, ep)) == NULL) 195 return (-1); 196 197 if (MD_MNSET_DESC(sd)) { 198 mn_set = 1; /* Used later */ 199 } 200 } 201 202 /* 203 * Loop to verify the entire mddb region on disk is read/writable. 204 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block 205 * chunks. 206 * 207 * A side-effect of this loop is to zero out the entire mddb region 208 */ 209 if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL) 210 return (mdsyserror(ep, ENOMEM, np->rname)); 211 212 startblk = firstblk; 213 for (cnt = dbsize; cnt > 0; cnt -= consecutive) { 214 215 if (cnt > MDDB_VERIFY_SIZE) 216 consecutive = MDDB_VERIFY_SIZE; 217 else 218 consecutive = cnt; 219 220 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 221 Free(buffer); 222 return (mdsyserror(ep, errno, np->rname)); 223 } 224 225 iosize = DEV_BSIZE * consecutive; 226 if (write(fd, buffer, iosize) != iosize) { 227 Free(buffer); 228 return (mdsyserror(ep, errno, np->rname)); 229 } 230 231 if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) { 232 Free(buffer); 233 return (mdsyserror(ep, errno, np->rname)); 234 } 235 236 if (read(fd, buffer, iosize) != iosize) { 237 Free(buffer); 238 return (mdsyserror(ep, errno, np->rname)); 239 } 240 241 startblk += consecutive; 242 } 243 244 Free(buffer); 245 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 246 return (mdsyserror(ep, ENOMEM, np->rname)); 247 248 if (meta_gettimeofday(&tp) == -1) { 249 Free(mb); 250 return (mdsyserror(ep, errno, np->rname)); 251 } 252 253 mb->mb_magic = MDDB_MAGIC_MB; 254 /* 255 * If a MN diskset, set master block revision for a MN set. 256 * Even though the master block structure is no different 257 * for a MN set, setting the revision field to a different 258 * number keeps any pre-MN_diskset code from accessing 259 * this diskset. It also allows for an early determination 260 * of a MN diskset when reading in from disk so that the 261 * proper size locator block and locator names structure 262 * can be read in thus saving time on diskset startup. 263 */ 264 if (mn_set) 265 mb->mb_revision = MDDB_REV_MNMB; 266 else 267 mb->mb_revision = MDDB_REV_MB; 268 mb->mb_timestamp = tp; 269 mb->mb_setno = sp->setno; 270 mb->mb_blkcnt = dbsize - 1; 271 mb->mb_blkno = firstblk; 272 mb->mb_nextblk = 0; 273 274 mb->mb_blkmap.m_firstblk = firstblk + 1; 275 mb->mb_blkmap.m_consecutive = dbsize - 1; 276 if (! metaislocalset(sp)) { 277 mb->mb_setcreatetime = inittime; 278 } 279 280 /* 281 * We try to save the disks device ID into the remaining bytes in 282 * the master block. The saved devid is used to provide a mapping 283 * between this disk's devid and the devid stored into the master 284 * block. This allows the disk image to be self-identifying 285 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 286 * when we try to import these disks on the remote copied image. 287 * If we cannot save the disks device ID onto the master block that is 288 * ok. The disk is just not self-identifying and won't be importable 289 * in the remote copy scenario. 290 */ 291 if (devid_get(fd, &devid) == 0) { 292 size_t len; 293 294 len = devid_sizeof(devid); 295 if (len <= DEV_BSIZE - sizeof (*mb)) { 296 /* there is enough space to store the devid */ 297 mb->mb_devid_magic = MDDB_MAGIC_DE; 298 mb->mb_devid_len = len; 299 (void) memcpy(mb->mb_devid, devid, len); 300 } 301 devid_free(devid); 302 } 303 304 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 305 (crc_skip_t *)NULL); 306 307 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 308 Free(mb); 309 return (mdsyserror(ep, errno, np->rname)); 310 } 311 312 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 313 Free(mb); 314 return (mdsyserror(ep, errno, np->rname)); 315 } 316 317 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) { 318 Free(mb); 319 return (mdsyserror(ep, errno, np->rname)); 320 } 321 322 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) { 323 Free(mb); 324 return (mdsyserror(ep, errno, np->rname)); 325 } 326 327 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 328 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) { 329 Free(mb); 330 return (mdmddberror(ep, MDE_NOTVERIFIED, 331 meta_getminor(np->dev), sp->setno, 0, np->rname)); 332 } 333 334 Free(mb); 335 return (0); 336 } 337 338 void 339 meta_mkdummymaster( 340 mdsetname_t *sp, 341 int fd, 342 daddr_t firstblk 343 ) 344 { 345 md_timeval32_t tp; 346 struct mddb_mb *mb; 347 ddi_devid_t devid; 348 md_set_desc *sd; 349 md_error_t ep = mdnullerror; 350 md_timeval32_t inittime; 351 352 /* 353 * No dummy master blocks are written for a MN diskset since devids 354 * are not supported in MN disksets. 355 */ 356 if (! metaislocalset(sp)) { 357 if ((sd = metaget_setdesc(sp, &ep)) == NULL) 358 return; 359 360 if (MD_MNSET_DESC(sd)) 361 return; 362 } 363 364 if ((mb = Zalloc(DEV_BSIZE)) == NULL) 365 return; 366 367 mb->mb_magic = MDDB_MAGIC_DU; 368 mb->mb_revision = MDDB_REV_MB; 369 mb->mb_setno = sp->setno; 370 inittime = meta_get_lb_inittime(sp, &ep); 371 mb->mb_setcreatetime = inittime; 372 373 if (meta_gettimeofday(&tp) != -1) 374 mb->mb_timestamp = tp; 375 376 /* 377 * We try to save the disks device ID into the remaining bytes in 378 * the master block. This allows the disk image to be self-identifying 379 * if it gets copied (e.g. SNDR, True Copy, etc.). This is used 380 * when we try to import these disks on the remote copied image. 381 * If we cannot save the disks device ID onto the master block that is 382 * ok. The disk is just not self-identifying and won't be importable 383 * in the remote copy scenario. 384 */ 385 if (devid_get(fd, &devid) == 0) { 386 int len; 387 388 len = devid_sizeof(devid); 389 if (len <= DEV_BSIZE - sizeof (*mb)) { 390 /* there is enough space to store the devid */ 391 mb->mb_devid_magic = MDDB_MAGIC_DE; 392 mb->mb_devid_len = len; 393 (void) memcpy(mb->mb_devid, (char *)devid, len); 394 } 395 devid_free(devid); 396 } 397 398 crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE, 399 (crc_skip_t *)NULL); 400 401 /* 402 * If any of these operations fail, we need to inform the 403 * user that the disk won't be self identifying. When support 404 * for importing remotely replicated disksets is added, we 405 * want to add the error messages here. 406 */ 407 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 408 goto out; 409 410 if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) 411 goto out; 412 413 if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) 414 goto out; 415 416 if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) 417 goto out; 418 419 if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum, 420 (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) 421 goto out; 422 423 out: 424 Free(mb); 425 } 426 427 static int 428 buildconf(mdsetname_t *sp, md_error_t *ep) 429 { 430 md_replicalist_t *rlp = NULL; 431 md_replicalist_t *rl; 432 FILE *cfp = NULL; 433 FILE *mfp = NULL; 434 struct stat sbuf; 435 int rval = 0; 436 int in_miniroot = 0; 437 char line[MDDB_BOOTLIST_MAX_LEN]; 438 char *tname = NULL; 439 440 /* get list of local replicas */ 441 if (! metaislocalset(sp)) 442 return (0); 443 444 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 445 return (-1); 446 447 /* open tempfile, copy permissions of original file */ 448 if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) { 449 /* 450 * On the miniroot tmp files must be created in /var/tmp. 451 * If we get a EROFS error, we assume that we are in the 452 * miniroot. 453 */ 454 if (errno != EROFS) 455 goto error; 456 in_miniroot = 1; 457 errno = 0; 458 tname = tempnam("/var/tmp", "slvm_"); 459 if (tname == NULL && errno == EROFS) { 460 /* 461 * If we are booted on a read-only root because 462 * of mddb quorum problems we don't want to emit 463 * any scary error messages. 464 */ 465 errno = 0; 466 goto out; 467 } 468 469 /* open tempfile, copy permissions of original file */ 470 if ((cfp = fopen(tname, "w+")) == NULL) 471 goto error; 472 } 473 if (stat(META_DBCONF, &sbuf) == 0) { 474 if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0) 475 goto error; 476 if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0) 477 goto error; 478 } 479 480 /* print header */ 481 if (fprintf(cfp, "#metadevice database location file ") == EOF) 482 goto error; 483 if (fprintf(cfp, "do not hand edit\n") < 0) 484 goto error; 485 if (fprintf(cfp, 486 "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0) 487 goto error; 488 489 /* dump replicas */ 490 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 491 md_replica_t *r = rl->rl_repp; 492 int checksum = 42; 493 int i; 494 char *devidp; 495 minor_t min; 496 497 devidp = devid_str_encode(r->r_devid, r->r_minor_name); 498 /* If devid code can't encode devidp - skip entry */ 499 if (devidp == NULL) { 500 continue; 501 } 502 503 /* compute checksum */ 504 for (i = 0; ((r->r_driver_name[i] != '\0') && 505 (i < sizeof (r->r_driver_name))); i++) { 506 checksum -= r->r_driver_name[i]; 507 } 508 min = meta_getminor(r->r_namep->dev); 509 checksum -= min; 510 checksum -= r->r_blkno; 511 512 for (i = 0; i < strlen(devidp); i++) { 513 checksum -= devidp[i]; 514 } 515 /* print info */ 516 if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n", 517 r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) { 518 goto error; 519 } 520 521 devid_str_free(devidp); 522 } 523 524 /* close and rename to real file */ 525 if (fflush(cfp) != 0) 526 goto error; 527 if (fsync(fileno(cfp)) != 0) 528 goto error; 529 if (fclose(cfp) != 0) { 530 cfp = NULL; 531 goto error; 532 } 533 cfp = NULL; 534 535 /* 536 * Renames don't work in the miniroot since tmpfiles are 537 * created in /var/tmp. Hence we copy the data out. 538 */ 539 540 if (! in_miniroot) { 541 if (rename(META_DBCONFTMP, META_DBCONF) != 0) 542 goto error; 543 } else { 544 if ((cfp = fopen(tname, "r")) == NULL) 545 goto error; 546 if ((mfp = fopen(META_DBCONF, "w+")) == NULL) 547 goto error; 548 while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) { 549 if (fputs(line, mfp) == NULL) 550 goto error; 551 } 552 (void) fclose(cfp); 553 cfp = NULL; 554 if (fflush(mfp) != 0) 555 goto error; 556 if (fsync(fileno(mfp)) != 0) 557 goto error; 558 if (fclose(mfp) != 0) { 559 mfp = NULL; 560 goto error; 561 } 562 /* delete the tempfile */ 563 (void) unlink(tname); 564 } 565 /* success */ 566 rval = 0; 567 goto out; 568 569 /* tempfile error */ 570 error: 571 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 572 mdsyserror(ep, errno, META_DBCONFTMP); 573 574 575 /* cleanup, return success */ 576 out: 577 if (rlp != NULL) 578 metafreereplicalist(rlp); 579 if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) { 580 rval = (in_miniroot) ? mdsyserror(ep, errno, tname): 581 mdsyserror(ep, errno, META_DBCONFTMP); 582 } 583 free(tname); 584 return (rval); 585 } 586 587 /* 588 * check replica for dev 589 */ 590 static int 591 in_replica( 592 mdsetname_t *sp, 593 md_replica_t *rp, 594 mdname_t *np, 595 diskaddr_t slblk, 596 diskaddr_t nblks, 597 md_error_t *ep 598 ) 599 { 600 mdname_t *repnp = rp->r_namep; 601 diskaddr_t rep_sblk = rp->r_blkno; 602 diskaddr_t rep_nblks = rp->r_nblk; 603 604 /* should be in the same set */ 605 assert(sp != NULL); 606 607 /* if error in master block, assume whole partition */ 608 if ((rep_sblk == MD_DISKADDR_ERROR) || 609 (rep_nblks == MD_DISKADDR_ERROR)) { 610 rep_sblk = 0; 611 rep_nblks = MD_DISKADDR_ERROR; 612 } 613 614 /* check overlap */ 615 if (meta_check_overlap( 616 MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) { 617 return (-1); 618 } 619 620 /* return success */ 621 return (0); 622 } 623 624 /* 625 * check to see if we're in a replica 626 */ 627 int 628 meta_check_inreplica( 629 mdsetname_t *sp, 630 mdname_t *np, 631 diskaddr_t slblk, 632 diskaddr_t nblks, 633 md_error_t *ep 634 ) 635 { 636 md_replicalist_t *rlp = NULL; 637 md_replicalist_t *rl; 638 int rval = 0; 639 640 /* should have a set */ 641 assert(sp != NULL); 642 643 /* for each replica */ 644 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) 645 return (-1); 646 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 647 md_replica_t *rp = rl->rl_repp; 648 649 /* check replica */ 650 if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) { 651 rval = -1; 652 break; 653 } 654 } 655 656 /* cleanup, return success */ 657 metafreereplicalist(rlp); 658 return (rval); 659 } 660 661 /* 662 * check replica 663 */ 664 int 665 meta_check_replica( 666 mdsetname_t *sp, /* set to check against */ 667 mdname_t *np, /* component to check against */ 668 mdchkopts_t options, /* option flags */ 669 diskaddr_t slblk, /* start logical block */ 670 diskaddr_t nblks, /* number of blocks (-1,rest of them) */ 671 md_error_t *ep /* error packet */ 672 ) 673 { 674 mdchkopts_t chkoptions = MDCHK_ALLOW_REPSLICE; 675 676 /* make sure we have a disk */ 677 if (metachkcomp(np, ep) != 0) 678 return (-1); 679 680 /* check to ensure that it is not already in use */ 681 if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) { 682 return (-1); 683 } 684 685 if (options & MDCHK_ALLOW_NODBS) 686 return (0); 687 688 if (options & MDCHK_DRVINSET) 689 return (0); 690 691 /* make sure it is in the set */ 692 if (meta_check_inset(sp, np, ep) != 0) 693 return (-1); 694 695 /* make sure its not in a metadevice */ 696 if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0) 697 return (-1); 698 699 /* return success */ 700 return (0); 701 } 702 703 static int 704 update_dbinfo_on_drives( 705 mdsetname_t *sp, 706 md_drive_desc *dd, 707 int set_locked, 708 int force, 709 md_error_t *ep 710 ) 711 { 712 md_set_desc *sd; 713 int i; 714 md_setkey_t *cl_sk; 715 int rval = 0; 716 md_mnnode_desc *nd; 717 718 if ((sd = metaget_setdesc(sp, ep)) == NULL) 719 return (-1); 720 721 if (! set_locked) { 722 if (MD_MNSET_DESC(sd)) { 723 md_error_t xep = mdnullerror; 724 sigset_t sigs; 725 /* Make sure we are blocking all signals */ 726 if (procsigs(TRUE, &sigs, &xep) < 0) 727 mdclrerror(&xep); 728 729 nd = sd->sd_nodelist; 730 while (nd) { 731 if (force && strcmp(nd->nd_nodename, 732 mynode()) != 0) { 733 nd = nd->nd_next; 734 continue; 735 } 736 737 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 738 nd = nd->nd_next; 739 continue; 740 } 741 742 if (clnt_lock_set(nd->nd_nodename, sp, ep)) 743 return (-1); 744 nd = nd->nd_next; 745 } 746 } else { 747 for (i = 0; i < MD_MAXSIDES; i++) { 748 /* Skip empty slots */ 749 if (sd->sd_nodes[i][0] == '\0') 750 continue; 751 752 if (force && strcmp(sd->sd_nodes[i], 753 mynode()) != 0) 754 continue; 755 756 if (clnt_lock_set(sd->sd_nodes[i], sp, ep)) 757 return (-1); 758 } 759 } 760 } 761 762 if (MD_MNSET_DESC(sd)) { 763 nd = sd->sd_nodelist; 764 while (nd) { 765 if (force && strcmp(nd->nd_nodename, mynode()) != 0) { 766 nd = nd->nd_next; 767 continue; 768 } 769 770 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 771 nd = nd->nd_next; 772 continue; 773 } 774 775 if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep) 776 == -1) { 777 rval = -1; 778 break; 779 } 780 nd = nd->nd_next; 781 } 782 } else { 783 for (i = 0; i < MD_MAXSIDES; i++) { 784 /* Skip empty slots */ 785 if (sd->sd_nodes[i][0] == '\0') 786 continue; 787 788 if (force && strcmp(sd->sd_nodes[i], mynode()) != 0) 789 continue; 790 791 if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep) 792 == -1) { 793 rval = -1; 794 break; 795 } 796 } 797 } 798 799 if (! set_locked) { 800 cl_sk = cl_get_setkey(sp->setno, sp->setname); 801 if (MD_MNSET_DESC(sd)) { 802 nd = sd->sd_nodelist; 803 while (nd) { 804 if (force && 805 strcmp(nd->nd_nodename, mynode()) != 0) { 806 nd = nd->nd_next; 807 continue; 808 } 809 810 if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) { 811 nd = nd->nd_next; 812 continue; 813 } 814 815 if (clnt_unlock_set(nd->nd_nodename, cl_sk, 816 ep)) { 817 rval = -1; 818 break; 819 } 820 nd = nd->nd_next; 821 } 822 } else { 823 for (i = 0; i < MD_MAXSIDES; i++) { 824 /* Skip empty slots */ 825 if (sd->sd_nodes[i][0] == '\0') 826 continue; 827 828 if (force && 829 strcmp(sd->sd_nodes[i], mynode()) != 0) 830 continue; 831 832 if (clnt_unlock_set(sd->sd_nodes[i], cl_sk, 833 ep)) { 834 rval = -1; 835 break; 836 } 837 } 838 839 } 840 cl_set_setkey(NULL); 841 } 842 843 return (rval); 844 } 845 846 int 847 meta_db_addsidenms( 848 mdsetname_t *sp, 849 mdname_t *np, 850 daddr_t blkno, 851 int bcast, 852 md_error_t *ep 853 ) 854 { 855 side_t sideno; 856 char *bname = NULL; 857 char *dname = NULL; 858 minor_t mnum; 859 mddb_config_t c; 860 int done; 861 int rval = 0; 862 md_set_desc *sd; 863 864 sideno = MD_SIDEWILD; 865 /*CONSTCOND*/ 866 while (1) { 867 if (bname != NULL) { 868 Free(bname); 869 bname = NULL; 870 } 871 if (dname != NULL) { 872 Free(dname); 873 dname = NULL; 874 } 875 if ((done = meta_getnextside_devinfo(sp, np->bname, 876 &sideno, &bname, &dname, &mnum, ep)) == -1) { 877 rval = -1; 878 break; 879 } 880 881 if (done == 0) 882 break; 883 884 if (! metaislocalset(sp)) { 885 if ((sd = metaget_setdesc(sp, ep)) == NULL) { 886 rval = -1; 887 break; 888 } 889 } 890 891 /* 892 * Send addsidenms to all nodes using rpc.mdcommd if 893 * sidename is being added to MN diskset. 894 * 895 * It's ok to broadcast this call to other nodes. 896 * 897 * Note: The broadcast to other nodes isn't needed during 898 * the addition of the first mddbs to the set since the 899 * other nodes haven't been joined to the set yet. All 900 * nodes in a MN diskset are (implicitly) joined to the set 901 * on the addition of the first mddb. 902 */ 903 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 904 (bcast == DB_ADDSIDENMS_BCAST)) { 905 md_mn_result_t *resultp = NULL; 906 md_mn_msg_meta_db_newside_t db_ns; 907 int send_rval; 908 909 db_ns.msg_l_dev = np->dev; 910 db_ns.msg_sideno = sideno; 911 db_ns.msg_blkno = blkno; 912 (void) strncpy(db_ns.msg_dname, dname, 913 sizeof (db_ns.msg_dname)); 914 (void) splitname(np->bname, &db_ns.msg_splitname); 915 db_ns.msg_mnum = mnum; 916 917 /* Set devid to NULL until devids are supported */ 918 db_ns.msg_devid[0] = NULL; 919 920 /* 921 * If reconfig cycle has been started, this node is 922 * stuck in in the return step until this command has 923 * completed. If mdcommd is suspended, ask 924 * send_message to fail (instead of retrying) 925 * so that metaset can finish allowing the reconfig 926 * cycle to proceed. 927 */ 928 send_rval = mdmn_send_message(sp->setno, 929 MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND | 930 MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ns, 931 sizeof (md_mn_msg_meta_db_newside_t), 932 &resultp, ep); 933 if (send_rval != 0) { 934 rval = -1; 935 if (resultp == NULL) 936 (void) mddserror(ep, 937 MDE_DS_COMMD_SEND_FAIL, 938 sp->setno, NULL, NULL, 939 sp->setname); 940 else { 941 (void) mdstealerror(ep, 942 &(resultp->mmr_ep)); 943 if (mdisok(ep)) { 944 (void) mddserror(ep, 945 MDE_DS_COMMD_SEND_FAIL, 946 sp->setno, NULL, NULL, 947 sp->setname); 948 } 949 free_result(resultp); 950 } 951 break; 952 } 953 if (resultp) 954 free_result(resultp); 955 } else { 956 /* 957 * Let this side's device name, minor # and driver name 958 * be known to the database replica. 959 */ 960 (void) memset(&c, 0, sizeof (c)); 961 962 /* Fill in device/replica info */ 963 c.c_locator.l_dev = meta_cmpldev(np->dev); 964 c.c_locator.l_blkno = blkno; 965 (void) strncpy(c.c_locator.l_driver, dname, 966 sizeof (c.c_locator.l_driver)); 967 if (splitname(np->bname, &c.c_devname) == 968 METASPLIT_LONGDISKNAME && devid_in_use == FALSE) { 969 rval = mddeverror(ep, MDE_DISKNAMETOOLONG, 970 NODEV64, np->rname); 971 break; 972 } 973 974 c.c_locator.l_mnum = mnum; 975 976 /* Fill in setno, setname, and sideno */ 977 c.c_setno = sp->setno; 978 (void) strncpy(c.c_setname, sp->setname, 979 sizeof (c.c_setname)); 980 c.c_sideno = sideno; 981 982 /* 983 * Don't need device id information from this ioctl 984 * Kernel determines device id from dev_t, which 985 * is just what this code would do. 986 */ 987 c.c_locator.l_devid = (uint64_t)0; 988 c.c_locator.l_devid_flags = 0; 989 990 if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) { 991 rval = mdstealerror(ep, &c.c_mde); 992 break; 993 } 994 } 995 } 996 997 /* cleanup, return success */ 998 if (bname != NULL) { 999 Free(bname); 1000 bname = NULL; 1001 } 1002 if (dname != NULL) { 1003 Free(dname); 1004 dname = NULL; 1005 } 1006 return (rval); 1007 } 1008 1009 1010 int 1011 meta_db_delsidenm( 1012 mdsetname_t *sp, 1013 side_t sideno, 1014 mdname_t *np, 1015 daddr_t blkno, 1016 md_error_t *ep 1017 ) 1018 { 1019 mddb_config_t c; 1020 md_set_desc *sd; 1021 1022 if (! metaislocalset(sp)) { 1023 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1024 return (-1); 1025 } 1026 /* Use rpc.mdcommd to delete mddb side from all nodes */ 1027 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1028 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1029 md_mn_result_t *resultp = NULL; 1030 md_mn_msg_meta_db_delside_t db_ds; 1031 int send_rval; 1032 1033 db_ds.msg_l_dev = np->dev; 1034 db_ds.msg_blkno = blkno; 1035 db_ds.msg_sideno = sideno; 1036 1037 /* Set devid to NULL until devids are supported */ 1038 db_ds.msg_devid[0] = NULL; 1039 1040 /* 1041 * If reconfig cycle has been started, this node is 1042 * stuck in in the return step until this command has 1043 * completed. If mdcommd is suspended, ask 1044 * send_message to fail (instead of retrying) 1045 * so that metaset can finish allowing the reconfig 1046 * cycle to proceed. 1047 */ 1048 send_rval = mdmn_send_message(sp->setno, 1049 MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND | 1050 MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ds, 1051 sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep); 1052 if (send_rval != 0) { 1053 if (resultp == NULL) 1054 (void) mddserror(ep, 1055 MDE_DS_COMMD_SEND_FAIL, 1056 sp->setno, NULL, NULL, 1057 sp->setname); 1058 else { 1059 (void) mdstealerror(ep, &(resultp->mmr_ep)); 1060 if (mdisok(ep)) { 1061 (void) mddserror(ep, 1062 MDE_DS_COMMD_SEND_FAIL, 1063 sp->setno, NULL, NULL, 1064 sp->setname); 1065 } 1066 free_result(resultp); 1067 } 1068 return (-1); 1069 } 1070 if (resultp) 1071 free_result(resultp); 1072 1073 } else { 1074 /* 1075 * Let this side's device name, minor # and driver name 1076 * be known to the database replica. 1077 */ 1078 (void) memset(&c, 0, sizeof (c)); 1079 1080 /* Fill in device/replica info */ 1081 c.c_locator.l_dev = meta_cmpldev(np->dev); 1082 c.c_locator.l_blkno = blkno; 1083 1084 /* Fill in setno, setname, and sideno */ 1085 c.c_setno = sp->setno; 1086 (void) strcpy(c.c_setname, sp->setname); 1087 c.c_sideno = sideno; 1088 1089 /* 1090 * Don't need device id information from this ioctl 1091 * Kernel determines device id from dev_t, which 1092 * is just what this code would do. 1093 */ 1094 c.c_locator.l_devid = (uint64_t)0; 1095 c.c_locator.l_devid_flags = 0; 1096 1097 if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0) 1098 return (mdstealerror(ep, &c.c_mde)); 1099 } 1100 return (0); 1101 } 1102 1103 1104 static int 1105 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep) 1106 { 1107 mdnamelist_t *dnp1, *dnp2; 1108 1109 for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) { 1110 for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) { 1111 if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0) 1112 return (mderror(ep, MDE_DUPDRIVE, 1113 dnp1->namep->cname)); 1114 } 1115 } 1116 return (0); 1117 } 1118 1119 1120 /* 1121 * Return 1 if files are different, else return 0 1122 */ 1123 static int 1124 filediff(char *tsname, char *sname) 1125 { 1126 int ret = 1, fd; 1127 size_t tsz, sz; 1128 struct stat sbuf; 1129 char *tbuf, *buf; 1130 1131 if (stat(tsname, &sbuf) != 0) 1132 return (1); 1133 tsz = sbuf.st_size; 1134 if (stat(sname, &sbuf) != 0) 1135 return (1); 1136 sz = sbuf.st_size; 1137 if (tsz != sz) 1138 return (1); 1139 1140 /* allocate memory and read both files into buffer */ 1141 tbuf = malloc(tsz); 1142 buf = malloc(sz); 1143 if (tbuf == NULL || buf == NULL) 1144 goto out; 1145 1146 fd = open(tsname, O_RDONLY); 1147 if (fd == -1) 1148 goto out; 1149 sz = read(fd, tbuf, tsz); 1150 (void) close(fd); 1151 if (sz != tsz) 1152 goto out; 1153 1154 fd = open(sname, O_RDONLY); 1155 if (fd == -1) 1156 goto out; 1157 sz = read(fd, buf, tsz); 1158 (void) close(fd); 1159 if (sz != tsz) 1160 goto out; 1161 1162 /* compare content */ 1163 ret = bcmp(tbuf, buf, tsz); 1164 out: 1165 if (tbuf) 1166 free(tbuf); 1167 if (buf) 1168 free(buf); 1169 return (ret); 1170 } 1171 1172 /* 1173 * patch md.conf file with mddb locations 1174 */ 1175 int 1176 meta_db_patch( 1177 char *sname, /* system file name */ 1178 char *cname, /* mddb.cf file name */ 1179 int patch, /* patching locally */ 1180 md_error_t *ep 1181 ) 1182 { 1183 char *tsname = NULL; 1184 char line[MDDB_BOOTLIST_MAX_LEN]; 1185 FILE *tsfp = NULL; 1186 FILE *mfp = NULL; 1187 int rval = -1; 1188 1189 /* check names */ 1190 if (sname == NULL) { 1191 if (patch) 1192 sname = "md.conf"; 1193 else 1194 sname = "/kernel/drv/md.conf"; 1195 } 1196 if (cname == NULL) 1197 cname = META_DBCONF; 1198 1199 /* 1200 * edit file 1201 */ 1202 if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) { 1203 if (mdissyserror(ep, EROFS)) { 1204 /* 1205 * If we are booted on a read-only root because 1206 * of mddb quorum problems we don't want to emit 1207 * any scary error messages. 1208 */ 1209 mdclrerror(ep); 1210 rval = 0; 1211 } 1212 goto out; 1213 } 1214 1215 if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0, 1216 ep) != 0) 1217 goto out; 1218 1219 /* if file content is identical, skip rename */ 1220 if (filediff(tsname, sname) == 0) { 1221 rval = 0; 1222 goto out; 1223 } 1224 1225 if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) || 1226 (fclose(tsfp) != 0)) { 1227 (void) mdsyserror(ep, errno, tsname); 1228 goto out; 1229 } 1230 1231 tsfp = NULL; 1232 1233 /* 1234 * rename file. If we get a Cross Device error then it 1235 * is because we are in the miniroot. 1236 */ 1237 if (rename(tsname, sname) != 0 && errno != EXDEV) { 1238 (void) mdsyserror(ep, errno, sname); 1239 goto out; 1240 } 1241 1242 if (errno == EXDEV) { 1243 if ((tsfp = fopen(tsname, "r")) == NULL) 1244 goto out; 1245 if ((mfp = fopen(sname, "w+")) == NULL) 1246 goto out; 1247 while (fgets(line, sizeof (line), tsfp) != NULL) { 1248 if (fputs(line, mfp) == NULL) 1249 goto out; 1250 } 1251 (void) fclose(tsfp); 1252 tsfp = NULL; 1253 if (fflush(mfp) != 0) 1254 goto out; 1255 if (fsync(fileno(mfp)) != 0) 1256 goto out; 1257 if (fclose(mfp) != 0) { 1258 mfp = NULL; 1259 goto out; 1260 } 1261 } 1262 1263 Free(tsname); 1264 tsname = NULL; 1265 rval = 0; 1266 1267 /* cleanup, return error */ 1268 out: 1269 if (tsfp != NULL) 1270 (void) fclose(tsfp); 1271 if (tsname != NULL) { 1272 (void) unlink(tsname); 1273 Free(tsname); 1274 } 1275 return (rval); 1276 } 1277 1278 /* 1279 * Add replicas to set. This happens as a result of: 1280 * - metadb [-s set_name] -a 1281 * - metaset -s set_name -a disk 1282 * - metaset -s set_name -d disk (causes a rebalance of mddbs) 1283 * - metaset -s set_name -b 1284 * 1285 * For a local set, this routine is run on the local set host. 1286 * 1287 * For a traditional diskset, this routine is run on the node that 1288 * is running the metaset command. 1289 * 1290 * For a multinode diskset, this routine is run by the node that is 1291 * running the metaset command. If this is the first mddb added to 1292 * the MN diskset, then no communication is made to other nodes via commd 1293 * since the other nodes will be in-sync with respect to the mddbs when 1294 * those other nodes join the set and snarf in the newly created mddb. 1295 * If this is not the first mddb added to the MN diskset, then this 1296 * attach command is sent to all of the nodes using commd. This keeps 1297 * the nodes in-sync. 1298 */ 1299 int 1300 meta_db_attach( 1301 mdsetname_t *sp, 1302 mdnamelist_t *db_nlp, 1303 mdchkopts_t options, 1304 md_timeval32_t *timeval, 1305 int dbcnt, 1306 int dbsize, 1307 char *sysfilename, 1308 md_error_t *ep 1309 ) 1310 { 1311 struct mddb_config c; 1312 mdnamelist_t *nlp; 1313 mdname_t *np; 1314 md_drive_desc *dd = NULL; 1315 md_drive_desc *p; 1316 int i; 1317 int fd; 1318 side_t sideno; 1319 daddr_t blkno; 1320 int replicacount = 0; 1321 int start_svmdaemons = 0; 1322 int rval = 0; 1323 md_error_t status = mdnullerror; 1324 md_set_desc *sd; 1325 int stale_bool = FALSE; 1326 int flags; 1327 int firstmddb = 1; 1328 md_timeval32_t inittime = {0, 0}; 1329 1330 /* 1331 * Error if we don't get some work to do. 1332 */ 1333 if (db_nlp == NULL) 1334 return (mdsyserror(ep, EINVAL, NULL)); 1335 1336 if (mdnamesareunique(db_nlp, ep) != 0) 1337 return (-1); 1338 (void) memset(&c, 0, sizeof (c)); 1339 c.c_id = 0; 1340 c.c_setno = sp->setno; 1341 1342 /* Don't need device id information from this ioctl */ 1343 c.c_locator.l_devid = (uint64_t)0; 1344 c.c_locator.l_devid_flags = 0; 1345 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 1346 if (metaislocalset(sp)) { 1347 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) 1348 mdclrerror(&c.c_mde); 1349 else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) || 1350 (! (options & MDCHK_ALLOW_NODBS))) 1351 return (mdstealerror(ep, &c.c_mde)); 1352 } else { 1353 if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER)) 1354 return (mdstealerror(ep, &c.c_mde)); 1355 } 1356 mdclrerror(&c.c_mde); 1357 } 1358 /* 1359 * Is current set STALE? 1360 */ 1361 if (c.c_flags & MDDB_C_STALE) { 1362 stale_bool = TRUE; 1363 } 1364 1365 assert(db_nlp != NULL); 1366 1367 /* if these are the first replicas then the SVM daemons need to run */ 1368 if (c.c_dbcnt == 0) 1369 start_svmdaemons = 1; 1370 1371 /* 1372 * check to see if we will go over the total possible number 1373 * of data bases 1374 */ 1375 nlp = db_nlp; 1376 while (nlp) { 1377 replicacount += dbcnt; 1378 nlp = nlp->next; 1379 } 1380 1381 if ((replicacount + c.c_dbcnt) > c.c_dbmax) 1382 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32, 1383 sp->setno, c.c_dbcnt + replicacount, NULL)); 1384 1385 /* 1386 * go through and check to make sure all locations specified 1387 * are legal also pick out driver name; 1388 */ 1389 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1390 diskaddr_t devsize; 1391 1392 np = nlp->namep; 1393 1394 if (! metaislocalset(sp)) { 1395 uint_t partno; 1396 uint_t rep_partno; 1397 mddrivename_t *dnp = np->drivenamep; 1398 1399 /* 1400 * make sure that non-local database replicas 1401 * are always on the replica slice. 1402 */ 1403 if (meta_replicaslice(dnp, 1404 &rep_partno, ep) != 0) 1405 return (-1); 1406 if (metagetvtoc(np, FALSE, &partno, ep) == NULL) 1407 return (-1); 1408 if (partno != rep_partno) 1409 return (mddeverror(ep, MDE_REPCOMP_ONLY, 1410 np->dev, sp->setname)); 1411 } 1412 1413 if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize), 1414 ep)) { 1415 return (-1); 1416 } 1417 1418 if ((devsize = metagetsize(np, ep)) == -1) 1419 return (-1); 1420 1421 if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16)) 1422 return (mdmddberror(ep, MDE_REPLICA_TOOSMALL, 1423 meta_getminor(np->dev), sp->setno, devsize, 1424 np->cname)); 1425 } 1426 1427 /* 1428 * If first disk in set we don't have lb_inittime yet for use as 1429 * mb_setcreatetime so don't go looking for it. WE'll come back 1430 * later and update after the locator block has been created. 1431 * If this isn't the first disk in the set, we have a locator 1432 * block and thus we have lb_inittime. Set mb_setcreatetime to 1433 * lb_inittime. 1434 */ 1435 if (! metaislocalset(sp)) { 1436 if (c.c_dbcnt != 0) { 1437 firstmddb = 0; 1438 inittime = meta_get_lb_inittime(sp, ep); 1439 } 1440 } 1441 1442 /* 1443 * go through and write all master blocks 1444 */ 1445 1446 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1447 np = nlp->namep; 1448 1449 if ((fd = open(np->rname, O_RDWR)) < 0) 1450 return (mdsyserror(ep, errno, np->rname)); 1451 1452 for (i = 0; i < dbcnt; i++) { 1453 if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize, 1454 inittime, ep)) { 1455 (void) close(fd); 1456 return (-1); 1457 } 1458 } 1459 (void) close(fd); 1460 } 1461 1462 if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD) 1463 return (-1); 1464 1465 if (! metaislocalset(sp)) { 1466 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1467 if (! mdisok(ep)) 1468 return (-1); 1469 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1470 return (-1); 1471 1472 } 1473 1474 /* 1475 * go through and tell kernel to add them 1476 */ 1477 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1478 mdcinfo_t *cinfo; 1479 1480 np = nlp->namep; 1481 1482 if ((cinfo = metagetcinfo(np, ep)) == NULL) { 1483 rval = -1; 1484 goto out; 1485 } 1486 1487 /* 1488 * If mddb is being added to MN diskset and there already 1489 * exists a valid mddb in the set (which equates to this 1490 * node being an owner of the set) then use rpc.mdcommd 1491 * mechanism to add mddb(s) so that all nodes stay in sync. 1492 * If set is stale, don't log the message since rpc.mdcommd 1493 * can't write the message to the mddb. 1494 * 1495 * Otherwise, just add mddb to this node. 1496 */ 1497 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1498 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1499 md_mn_result_t *resultp = NULL; 1500 md_mn_msg_meta_db_attach_t attach; 1501 int send_rval; 1502 1503 /* 1504 * In a scenario where new replicas had been added on 1505 * the master, and then all of the old replicas failed 1506 * before the slaves had knowledge of the new replicas, 1507 * the slaves are unable to re-parse in the mddb 1508 * from the new replicas since the slaves have no 1509 * knowledge of the new replicas. The following 1510 * algorithm solves this problem: 1511 * - META_DB_ATTACH message generates submsgs 1512 * - BLOCK parse (master) 1513 * - MDDB_ATTACH new replicas 1514 * - UNBLOCK parse (master) causing parse 1515 * information to be sent from master 1516 * to slaves at a higher class than the 1517 * unblock so the parse message will 1518 * reach slaves before unblock message. 1519 */ 1520 attach.msg_l_dev = np->dev; 1521 attach.msg_cnt = dbcnt; 1522 attach.msg_dbsize = dbsize; 1523 (void) strncpy(attach.msg_dname, cinfo->dname, 1524 sizeof (attach.msg_dname)); 1525 (void) splitname(np->bname, &attach.msg_splitname); 1526 attach.msg_options = options; 1527 1528 /* Set devid to NULL until devids are supported */ 1529 attach.msg_devid[0] = NULL; 1530 1531 /* 1532 * If reconfig cycle has been started, this node is 1533 * stuck in in the return step until this command has 1534 * completed. If mdcommd is suspended, ask 1535 * send_message to fail (instead of retrying) 1536 * so that metaset can finish allowing the reconfig 1537 * cycle to proceed. 1538 */ 1539 flags = MD_MSGF_FAIL_ON_SUSPEND; 1540 if (stale_bool == TRUE) 1541 flags |= MD_MSGF_NO_LOG; 1542 send_rval = mdmn_send_message(sp->setno, 1543 MD_MN_MSG_META_DB_ATTACH, 1544 flags, 0, (char *)&attach, 1545 sizeof (md_mn_msg_meta_db_attach_t), 1546 &resultp, ep); 1547 if (send_rval != 0) { 1548 rval = -1; 1549 if (resultp == NULL) 1550 (void) mddserror(ep, 1551 MDE_DS_COMMD_SEND_FAIL, 1552 sp->setno, NULL, NULL, 1553 sp->setname); 1554 else { 1555 (void) mdstealerror(ep, 1556 &(resultp->mmr_ep)); 1557 if (mdisok(ep)) { 1558 (void) mddserror(ep, 1559 MDE_DS_COMMD_SEND_FAIL, 1560 sp->setno, NULL, NULL, 1561 sp->setname); 1562 } 1563 free_result(resultp); 1564 } 1565 goto out; 1566 } 1567 if (resultp) 1568 free_result(resultp); 1569 } else { 1570 /* Adding mddb(s) to just this node */ 1571 for (i = 0; i < dbcnt; i++) { 1572 (void) memset(&c, 0, sizeof (c)); 1573 /* Fill in device/replica info */ 1574 c.c_locator.l_dev = meta_cmpldev(np->dev); 1575 c.c_locator.l_blkno = i * dbsize + 16; 1576 blkno = c.c_locator.l_blkno; 1577 (void) strncpy(c.c_locator.l_driver, 1578 cinfo->dname, 1579 sizeof (c.c_locator.l_driver)); 1580 1581 if (splitname(np->bname, &c.c_devname) == 1582 METASPLIT_LONGDISKNAME && devid_in_use == 1583 FALSE) { 1584 rval = mddeverror(ep, 1585 MDE_DISKNAMETOOLONG, 1586 NODEV64, np->rname); 1587 goto out; 1588 } 1589 1590 c.c_locator.l_mnum = meta_getminor(np->dev); 1591 1592 /* Fill in setno, setname, and sideno */ 1593 c.c_setno = sp->setno; 1594 if (! metaislocalset(sp)) { 1595 if (MD_MNSET_DESC(sd)) { 1596 c.c_multi_node = 1; 1597 } 1598 } 1599 (void) strcpy(c.c_setname, sp->setname); 1600 c.c_sideno = sideno; 1601 1602 /* 1603 * Don't need device id information from this 1604 * ioctl Kernel determines device id from 1605 * dev_t, which is just what this code would do. 1606 */ 1607 c.c_locator.l_devid = (uint64_t)0; 1608 c.c_locator.l_devid_flags = 0; 1609 1610 if (timeval != NULL) 1611 c.c_timestamp = *timeval; 1612 1613 if (setup_med_cfg(sp, &c, 1614 (options & MDCHK_SET_FORCE), ep)) { 1615 rval = -1; 1616 goto out; 1617 } 1618 1619 if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, 1620 NULL) != 0) { 1621 rval = mdstealerror(ep, &c.c_mde); 1622 goto out; 1623 } 1624 /* 1625 * This is either a traditional diskset OR this 1626 * is the first replica added to a MN diskset. 1627 * In either case, set broadcast to NO_BCAST so 1628 * that message won't go through rpc.mdcommd. 1629 * If this is a traditional diskset, the bcast 1630 * flag is ignored since traditional disksets 1631 * don't use the rpc.mdcommd. 1632 */ 1633 if (meta_db_addsidenms(sp, np, blkno, 1634 DB_ADDSIDENMS_NO_BCAST, ep)) 1635 goto out; 1636 } 1637 } 1638 if (! metaislocalset(sp)) { 1639 /* update the dbcnt and size in dd */ 1640 for (p = dd; p != NULL; p = p->dd_next) 1641 if (p->dd_dnp == np->drivenamep) { 1642 p->dd_dbcnt = dbcnt; 1643 p->dd_dbsize = dbsize; 1644 break; 1645 } 1646 } 1647 1648 /* 1649 * If this was the first addition of disks to the 1650 * diskset you now need to update the mb_setcreatetime 1651 * which needed lb_inittime which wasn't there until now. 1652 */ 1653 if (firstmddb) { 1654 if (meta_update_mb(sp, dd, ep) != 0) { 1655 return (-1); 1656 } 1657 } 1658 (void) close(fd); 1659 } 1660 1661 out: 1662 if (metaislocalset(sp)) { 1663 1664 /* everything looks fine. Start mdmonitord */ 1665 if (rval == 0 && start_svmdaemons == 1) { 1666 if (meta_smf_enable(META_SMF_CORE, &status) == -1) { 1667 mde_perror(&status, ""); 1668 mdclrerror(&status); 1669 } 1670 } 1671 1672 if (buildconf(sp, &status)) { 1673 /* Don't mask any previous errors */ 1674 if (rval == 0) 1675 rval = mdstealerror(ep, &status); 1676 return (rval); 1677 } 1678 1679 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 1680 /* Don't mask any previous errors */ 1681 if (rval == 0) 1682 rval = mdstealerror(ep, &status); 1683 } 1684 } else { 1685 if (update_dbinfo_on_drives(sp, dd, 1686 (options & MDCHK_SET_LOCKED), 1687 (options & MDCHK_SET_FORCE), 1688 &status)) { 1689 /* Don't mask any previous errors */ 1690 if (rval == 0) 1691 rval = mdstealerror(ep, &status); 1692 else 1693 mdclrerror(&status); 1694 } 1695 metafreedrivedesc(&dd); 1696 } 1697 /* 1698 * For MN disksets that already had already had nodes joined 1699 * before the attach of this mddb(s), the name invalidation is 1700 * done by the commd handler routine. Otherwise, if this 1701 * is the first attach of a MN diskset mddb, the invalidation 1702 * must be done here since the first attach cannot be sent 1703 * via the commd since there are no nodes joined to the set yet. 1704 */ 1705 if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) || 1706 (MD_MNSET_DESC(sd) && 1707 (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) { 1708 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 1709 meta_invalidate_name(nlp->namep); 1710 } 1711 } 1712 return (rval); 1713 } 1714 1715 /* 1716 * deletelist_length 1717 * 1718 * return the number of slices that have been specified for deletion 1719 * on the metadb command line. This does not calculate the number 1720 * of replicas because there may be multiple replicas per slice. 1721 */ 1722 static int 1723 deletelist_length(mdnamelist_t *db_nlp) 1724 { 1725 1726 mdnamelist_t *nlp; 1727 int list_length = 0; 1728 1729 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1730 list_length++; 1731 } 1732 1733 return (list_length); 1734 } 1735 1736 static int 1737 in_deletelist(char *devname, mdnamelist_t *db_nlp) 1738 { 1739 1740 mdnamelist_t *nlp; 1741 mdname_t *np; 1742 int index = 0; 1743 1744 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1745 np = nlp->namep; 1746 1747 if (strcmp(devname, np->bname) == 0) 1748 return (index); 1749 index++; 1750 } 1751 1752 return (-1); 1753 } 1754 1755 /* 1756 * Delete replicas from set. This happens as a result of: 1757 * - metadb [-s set_name] -d 1758 * - metaset -s set_name -a disk (causes a rebalance of mddbs) 1759 * - metaset -s set_name -d disk 1760 * - metaset -s set_name -b 1761 * 1762 * For a local set, this routine is run on the local set host. 1763 * 1764 * For a traditional diskset, this routine is run on the node that 1765 * is running the metaset command. 1766 * 1767 * For a multinode diskset, this routine is run by the node that is 1768 * running the metaset command. This detach routine is sent to all 1769 * of the joined nodes in the diskset using commd. This keeps 1770 * the nodes in-sync. 1771 */ 1772 int 1773 meta_db_detach( 1774 mdsetname_t *sp, 1775 mdnamelist_t *db_nlp, 1776 mdforceopts_t force_option, 1777 char *sysfilename, 1778 md_error_t *ep 1779 ) 1780 { 1781 struct mddb_config c; 1782 mdnamelist_t *nlp; 1783 mdname_t *np; 1784 md_drive_desc *dd = NULL; 1785 md_drive_desc *p; 1786 int replicacount; 1787 int replica_delete_count; 1788 int nr_replica_slices; 1789 int i; 1790 int stop_svmdaemons = 0; 1791 int rval = 0; 1792 int index; 1793 int valid_replicas_nottodelete = 0; 1794 int invalid_replicas_nottodelete = 0; 1795 int invalid_replicas_todelete = 0; 1796 int errored = 0; 1797 int *tag_array; 1798 int fd = -1; 1799 md_error_t status = mdnullerror; 1800 md_set_desc *sd; 1801 int stale_bool = FALSE; 1802 int flags; 1803 1804 /* 1805 * Error if we don't get some work to do. 1806 */ 1807 if (db_nlp == NULL) 1808 return (mdsyserror(ep, EINVAL, NULL)); 1809 1810 if (mdnamesareunique(db_nlp, ep) != 0) 1811 return (-1); 1812 1813 (void) memset(&c, 0, sizeof (c)); 1814 c.c_id = 0; 1815 c.c_setno = sp->setno; 1816 1817 /* Don't need device id information from this ioctl */ 1818 c.c_locator.l_devid = (uint64_t)0; 1819 c.c_locator.l_devid_flags = 0; 1820 1821 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1822 return (mdstealerror(ep, &c.c_mde)); 1823 1824 /* 1825 * Is current set STALE? 1826 */ 1827 if (c.c_flags & MDDB_C_STALE) { 1828 stale_bool = TRUE; 1829 } 1830 1831 replicacount = c.c_dbcnt; 1832 1833 assert(db_nlp != NULL); 1834 1835 /* 1836 * go through and gather how many data bases are on each 1837 * device specified. 1838 */ 1839 1840 nr_replica_slices = deletelist_length(db_nlp); 1841 tag_array = (int *)calloc(nr_replica_slices, sizeof (int)); 1842 1843 replica_delete_count = 0; 1844 for (i = 0; i < replicacount; i++) { 1845 char *devname; 1846 int found = 0; 1847 1848 c.c_id = i; 1849 1850 /* Don't need device id information from this ioctl */ 1851 c.c_locator.l_devid = (uint64_t)0; 1852 c.c_locator.l_devid_flags = 0; 1853 1854 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1855 return (mdstealerror(ep, &c.c_mde)); 1856 1857 devname = splicename(&c.c_devname); 1858 1859 if (strstr(devname, META_LONGDISKNAME_STR) != NULL) { 1860 Free(devname); 1861 devname = getlongname(&c, ep); 1862 if (devname == NULL) { 1863 return (-1); 1864 } 1865 } 1866 1867 if ((index = in_deletelist(devname, db_nlp)) != -1) { 1868 found = 1; 1869 tag_array[index] = 1; 1870 replica_delete_count++; 1871 } 1872 1873 errored = c.c_locator.l_flags & (MDDB_F_EREAD | 1874 MDDB_F_EWRITE | MDDB_F_TOOSMALL | MDDB_F_EFMT | 1875 MDDB_F_EDATA | MDDB_F_EMASTER); 1876 1877 /* 1878 * There are four combinations of "errored" and "found" 1879 * and they are used to find the number of 1880 * (a) valid/invalid replicas that are not in the delete 1881 * list and are available in the system. 1882 * (b) valid/invalid replicas that are to be deleted. 1883 */ 1884 1885 if (errored && !found) /* errored and !found */ 1886 invalid_replicas_nottodelete++; 1887 else if (!found) /* !errored and !found */ 1888 valid_replicas_nottodelete++; 1889 else if (errored) /* errored and found */ 1890 invalid_replicas_todelete++; 1891 /* 1892 * else it is !errored and found. This means 1893 * valid_replicas_todelete++; But this variable will not 1894 * be used anywhere 1895 */ 1896 1897 Free(devname); 1898 } 1899 1900 index = 0; 1901 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1902 np = nlp->namep; 1903 if (tag_array[index++] != 1) { 1904 Free(tag_array); 1905 return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname)); 1906 } 1907 } 1908 1909 Free(tag_array); 1910 1911 1912 /* if all replicas are deleted stop mdmonitord */ 1913 if ((replicacount - replica_delete_count) == 0) 1914 stop_svmdaemons = 1; 1915 1916 if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) { 1917 if (force_option & MDFORCE_NONE) 1918 return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname)); 1919 if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS)) 1920 return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname)); 1921 } 1922 1923 /* 1924 * The following algorithms are followed to check for deletion: 1925 * (a) If the delete list(db_nlp) has all invalid replicas and no valid 1926 * replicas, then deletion should be allowed. 1927 * (b) Deletion should be allowed only if valid replicas that are "not" 1928 * to be deleted is always greater than the invalid replicas that 1929 * are "not" to be deleted. 1930 * (c) If the user uses -f option, then deletion should be allowed. 1931 */ 1932 1933 if ((invalid_replicas_todelete != replica_delete_count) && 1934 (invalid_replicas_nottodelete > valid_replicas_nottodelete) && 1935 (force_option != MDFORCE_LOCAL)) 1936 return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname)); 1937 1938 /* 1939 * go through and tell kernel to delete them 1940 */ 1941 1942 /* Don't need device id information from this ioctl */ 1943 c.c_locator.l_devid = (uint64_t)0; 1944 c.c_locator.l_devid_flags = 0; 1945 1946 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) 1947 return (mdstealerror(ep, &c.c_mde)); 1948 1949 if (! metaislocalset(sp)) { 1950 dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep); 1951 if (! mdisok(ep)) 1952 return (-1); 1953 if ((sd = metaget_setdesc(sp, ep)) == NULL) 1954 return (-1); 1955 } 1956 1957 for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) { 1958 np = nlp->namep; 1959 1960 /* 1961 * If mddb is being deleted from MN diskset and node is 1962 * an owner of the diskset then use rpc.mdcommd 1963 * mechanism to add mddb(s) so that all nodes stay in sync. 1964 * If set is stale, don't log the message since rpc.mdcommd 1965 * can't write the message to the mddb. 1966 * 1967 * When mddbs are first being added to set, a detach can 1968 * be called before any node has joined the diskset, so 1969 * must check to see if node is an owner of the diskset. 1970 * 1971 * Otherwise, just delete mddb from this node. 1972 */ 1973 1974 if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) && 1975 (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) { 1976 md_mn_result_t *resultp; 1977 md_mn_msg_meta_db_detach_t detach; 1978 int send_rval; 1979 1980 /* 1981 * The following algorithm is used to detach replicas. 1982 * - META_DB_DETACH message generates submsgs 1983 * - BLOCK parse (master) 1984 * - MDDB_DETACH replicas 1985 * - UNBLOCK parse (master) causing parse 1986 * information to be sent from master 1987 * to slaves at a higher class than the 1988 * unblock so the parse message will 1989 * reach slaves before unblock message. 1990 */ 1991 (void) splitname(np->bname, &detach.msg_splitname); 1992 1993 /* Set devid to NULL until devids are supported */ 1994 detach.msg_devid[0] = NULL; 1995 1996 /* 1997 * If reconfig cycle has been started, this node is 1998 * stuck in in the return step until this command has 1999 * completed. If mdcommd is suspended, ask 2000 * send_message to fail (instead of retrying) 2001 * so that metaset can finish allowing the reconfig 2002 * cycle to proceed. 2003 */ 2004 flags = MD_MSGF_FAIL_ON_SUSPEND; 2005 if (stale_bool == TRUE) 2006 flags |= MD_MSGF_NO_LOG; 2007 send_rval = mdmn_send_message(sp->setno, 2008 MD_MN_MSG_META_DB_DETACH, 2009 flags, 0, (char *)&detach, 2010 sizeof (md_mn_msg_meta_db_detach_t), 2011 &resultp, ep); 2012 if (send_rval != 0) { 2013 rval = -1; 2014 if (resultp == NULL) 2015 (void) mddserror(ep, 2016 MDE_DS_COMMD_SEND_FAIL, 2017 sp->setno, NULL, NULL, 2018 sp->setname); 2019 else { 2020 (void) mdstealerror(ep, 2021 &(resultp->mmr_ep)); 2022 if (mdisok(ep)) { 2023 (void) mddserror(ep, 2024 MDE_DS_COMMD_SEND_FAIL, 2025 sp->setno, NULL, NULL, 2026 sp->setname); 2027 } 2028 free_result(resultp); 2029 } 2030 goto out; 2031 } 2032 if (resultp) 2033 free_result(resultp); 2034 } else { 2035 i = 0; 2036 while (i < c.c_dbcnt) { 2037 char *devname; 2038 2039 c.c_id = i; 2040 2041 /* Don't need devid info from this ioctl */ 2042 c.c_locator.l_devid = (uint64_t)0; 2043 c.c_locator.l_devid_flags = 0; 2044 2045 if (metaioctl(MD_DB_GETDEV, &c, 2046 &c.c_mde, NULL)) { 2047 rval = mdstealerror(ep, &c.c_mde); 2048 goto out; 2049 } 2050 2051 devname = splicename(&c.c_devname); 2052 2053 if (strstr(devname, META_LONGDISKNAME_STR) 2054 != NULL) { 2055 Free(devname); 2056 devname = getlongname(&c, ep); 2057 if (devname == NULL) { 2058 return (-1); 2059 } 2060 } 2061 2062 if (strcmp(devname, np->bname) != 0) { 2063 Free(devname); 2064 i++; 2065 continue; 2066 } 2067 Free(devname); 2068 2069 /* Don't need devid info from this ioctl */ 2070 c.c_locator.l_devid = (uint64_t)0; 2071 c.c_locator.l_devid_flags = 0; 2072 2073 if (metaioctl(MD_DB_DELDEV, &c, 2074 &c.c_mde, NULL) != 0) { 2075 rval = mdstealerror(ep, &c.c_mde); 2076 goto out; 2077 } 2078 2079 /* Not incrementing "i" intentionally */ 2080 } 2081 } 2082 if (! metaislocalset(sp)) { 2083 /* update the dbcnt and size in dd */ 2084 for (p = dd; p != NULL; p = p->dd_next) { 2085 if (p->dd_dnp == np->drivenamep) { 2086 p->dd_dbcnt = 0; 2087 p->dd_dbsize = 0; 2088 break; 2089 } 2090 } 2091 2092 /* 2093 * Slam a dummy master block and make it self 2094 * identifying 2095 */ 2096 if ((fd = open(np->rname, O_RDWR)) >= 0) { 2097 meta_mkdummymaster(sp, fd, 16); 2098 (void) close(fd); 2099 } 2100 } 2101 } 2102 out: 2103 if (metaislocalset(sp)) { 2104 /* 2105 * Stop all the daemons if there are 2106 * no more replicas so that the module can be 2107 * unloaded. 2108 */ 2109 if (rval == 0 && stop_svmdaemons == 1) { 2110 char buf[MAXPATHLEN]; 2111 int i; 2112 2113 for (i = 0; i < DAEMON_COUNT; i++) { 2114 (void) snprintf(buf, MAXPATHLEN, 2115 "/usr/bin/pkill -%s -x %s", 2116 svmd_kill_list[i].svmd_kill_val, 2117 svmd_kill_list[i].svmd_name); 2118 if (pclose(popen(buf, "w")) == -1) 2119 md_perror(buf); 2120 } 2121 2122 if (meta_smf_disable(META_SMF_ALL, &status) == -1) { 2123 mde_perror(&status, ""); 2124 mdclrerror(&status); 2125 } 2126 } 2127 if (buildconf(sp, &status)) { 2128 /* Don't mask any previous errors */ 2129 if (rval == 0) 2130 rval = mdstealerror(ep, &status); 2131 else 2132 mdclrerror(&status); 2133 return (rval); 2134 } 2135 2136 if (meta_db_patch(sysfilename, NULL, 0, &status)) { 2137 /* Don't mask any previous errors */ 2138 if (rval == 0) 2139 rval = mdstealerror(ep, &status); 2140 else 2141 mdclrerror(&status); 2142 } 2143 } else { 2144 if (update_dbinfo_on_drives(sp, dd, 2145 (force_option & MDFORCE_SET_LOCKED), 2146 ((force_option & MDFORCE_LOCAL) | 2147 (force_option & MDFORCE_DS)), &status)) { 2148 /* Don't mask any previous errors */ 2149 if (rval == 0) 2150 rval = mdstealerror(ep, &status); 2151 else 2152 mdclrerror(&status); 2153 } 2154 metafreedrivedesc(&dd); 2155 } 2156 if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) { 2157 for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) { 2158 meta_invalidate_name(nlp->namep); 2159 } 2160 } 2161 return (rval); 2162 } 2163 2164 static md_replica_t * 2165 metareplicaname( 2166 mdsetname_t *sp, 2167 int flags, 2168 struct mddb_config *c, 2169 md_error_t *ep 2170 ) 2171 { 2172 md_replica_t *rp; 2173 char *devname; 2174 size_t sz; 2175 devid_nmlist_t *disklist = NULL; 2176 char *devid_str; 2177 2178 /* allocate replicaname */ 2179 rp = Zalloc(sizeof (*rp)); 2180 2181 /* get device name */ 2182 devname = splicename(&c->c_devname); 2183 2184 /* 2185 * Check if the device has a long name (>40 characters) and 2186 * if so then we have to use devids to get the device name. 2187 * If this cannot be done then we have to fail the request. 2188 */ 2189 if (strstr(devname, META_LONGDISKNAME_STR) != NULL) { 2190 if (c->c_locator.l_devid != NULL) { 2191 if (meta_deviceid_to_nmlist("/dev/dsk", 2192 (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, 2193 c->c_locator.l_minor_name, &disklist) != 0) { 2194 devid_str = devid_str_encode( 2195 (ddi_devid_t)(uintptr_t) 2196 c->c_locator.l_devid, NULL); 2197 (void) mderror(ep, MDE_MISSING_DEVID_DISK, ""); 2198 mderrorextra(ep, devid_str); 2199 if (devid_str != NULL) 2200 devid_str_free(devid_str); 2201 Free(rp); 2202 Free(devname); 2203 return (NULL); 2204 } 2205 } else { 2206 (void) mderror(ep, MDE_NODEVID, ""); 2207 Free(rp); 2208 Free(devname); 2209 return (NULL); 2210 } 2211 Free(devname); 2212 devname = disklist[0].devname; 2213 } 2214 2215 if (flags & PRINT_FAST) { 2216 if ((rp->r_namep = metaname_fast(&sp, devname, 2217 LOGICAL_DEVICE, ep)) == NULL) { 2218 Free(devname); 2219 Free(rp); 2220 return (NULL); 2221 } 2222 } else { 2223 if ((rp->r_namep = metaname(&sp, devname, 2224 LOGICAL_DEVICE, ep)) == NULL) { 2225 Free(devname); 2226 Free(rp); 2227 return (NULL); 2228 } 2229 } 2230 Free(devname); 2231 2232 /* make sure it's OK */ 2233 if ((! (flags & MD_BASICNAME_OK)) && 2234 (metachkcomp(rp->r_namep, ep) != 0)) { 2235 Free(rp); 2236 return (NULL); 2237 } 2238 2239 rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR; 2240 rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR; 2241 rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID; 2242 if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) { 2243 sz = devid_sizeof((ddi_devid_t)(uintptr_t) 2244 (c->c_locator.l_devid)); 2245 if ((rp->r_devid = (ddi_devid_t)malloc(sz)) == 2246 (ddi_devid_t)NULL) { 2247 Free(rp); 2248 return (NULL); 2249 } 2250 (void) memcpy((void *)rp->r_devid, 2251 (void *)(uintptr_t)c->c_locator.l_devid, sz); 2252 (void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name); 2253 rp->r_flags &= ~MDDB_F_NODEVID; 2254 /* Overwrite dev derived from name with dev from devid */ 2255 rp->r_namep->dev = meta_expldev(c->c_locator.l_dev); 2256 } 2257 (void) strcpy(rp->r_driver_name, c->c_locator.l_driver); 2258 2259 rp->r_blkno = c->c_locator.l_blkno; 2260 if (c->c_dbend != 0) 2261 rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1; 2262 2263 /* return replica */ 2264 return (rp); 2265 } 2266 2267 /* 2268 * free replica list 2269 */ 2270 void 2271 metafreereplicalist( 2272 md_replicalist_t *rlp 2273 ) 2274 { 2275 md_replicalist_t *rl = NULL; 2276 2277 for (/* void */; (rlp != NULL); rlp = rl) { 2278 rl = rlp->rl_next; 2279 if (rlp->rl_repp->r_devid != (ddi_devid_t)0) { 2280 free(rlp->rl_repp->r_devid); 2281 } 2282 Free(rlp->rl_repp); 2283 Free(rlp); 2284 } 2285 } 2286 2287 /* 2288 * return list of all replicas in set 2289 */ 2290 int 2291 metareplicalist( 2292 mdsetname_t *sp, 2293 int flags, 2294 md_replicalist_t **rlpp, 2295 md_error_t *ep 2296 ) 2297 { 2298 md_replicalist_t **tail = rlpp; 2299 int count = 0; 2300 struct mddb_config c; 2301 int i; 2302 char *devid; 2303 2304 /* for each replica */ 2305 i = 0; 2306 do { 2307 md_replica_t *rp; 2308 2309 /* get next replica */ 2310 (void) memset(&c, 0, sizeof (c)); 2311 c.c_id = i; 2312 c.c_setno = sp->setno; 2313 2314 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 2315 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2316 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2317 mdclrerror(&c.c_mde); 2318 break; /* handle none at all */ 2319 } 2320 (void) mdstealerror(ep, &c.c_mde); 2321 goto out; 2322 } 2323 2324 if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) { 2325 if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) { 2326 (void) mdsyserror(ep, ENOMEM, META_DBCONF); 2327 goto out; 2328 } 2329 c.c_locator.l_devid = (uintptr_t)devid; 2330 /* 2331 * Turn on space and sz flags since 'sz' amount of 2332 * space has been alloc'd. 2333 */ 2334 c.c_locator.l_devid_flags = 2335 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2336 } 2337 2338 if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) { 2339 if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) { 2340 mdclrerror(&c.c_mde); 2341 break; /* handle none at all */ 2342 } 2343 (void) mdstealerror(ep, &c.c_mde); 2344 goto out; 2345 } 2346 2347 /* 2348 * Paranoid check - shouldn't happen, but is left as 2349 * a place holder for changes that will be needed after 2350 * dynamic reconfiguration changes are added to SVM (to 2351 * support movement of disks at any point in time). 2352 */ 2353 if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) { 2354 (void) fprintf(stderr, 2355 dgettext(TEXT_DOMAIN, 2356 "Error: Relocation Information " 2357 "(drvnm=%s, mnum=0x%lx) \n" 2358 "relocation information size changed - \n" 2359 "rerun command\n"), 2360 c.c_locator.l_driver, c.c_locator.l_mnum); 2361 (void) mderror(ep, MDE_DEVID_TOOBIG, NULL); 2362 goto out; 2363 } 2364 2365 if (c.c_dbcnt == 0) 2366 break; /* handle none at all */ 2367 2368 /* get info */ 2369 if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL) 2370 goto out; 2371 2372 /* append to list */ 2373 *tail = Zalloc(sizeof (**tail)); 2374 (*tail)->rl_repp = rp; 2375 tail = &(*tail)->rl_next; 2376 ++count; 2377 2378 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2379 free(devid); 2380 c.c_locator.l_devid_flags = 0; 2381 } 2382 2383 } while (++i < c.c_dbcnt); 2384 2385 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2386 free(devid); 2387 } 2388 2389 /* return count */ 2390 return (count); 2391 2392 /* cleanup, return error */ 2393 out: 2394 if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) { 2395 free(devid); 2396 } 2397 metafreereplicalist(*rlpp); 2398 *rlpp = NULL; 2399 return (-1); 2400 } 2401 2402 /* 2403 * meta_sync_db_locations - get list of replicas from kernel and write 2404 * out to mddb.cf and md.conf. 'Syncs up' the replica list in 2405 * the kernel with the replica list in the conf files. 2406 * 2407 */ 2408 void 2409 meta_sync_db_locations( 2410 mdsetname_t *sp, 2411 md_error_t *ep 2412 ) 2413 { 2414 char *sname = 0; /* system file name */ 2415 char *cname = 0; /* config file name */ 2416 2417 if (!metaislocalset(sp)) 2418 return; 2419 2420 /* Updates backup of configuration file (aka mddb.cf) */ 2421 if (buildconf(sp, ep) != 0) 2422 return; 2423 2424 /* Updates system configuration file (aka md.conf) */ 2425 (void) meta_db_patch(sname, cname, 0, ep); 2426 } 2427 2428 /* 2429 * setup_db_locations - parse the mddb.cf file and 2430 * tells the driver which db locations to use. 2431 */ 2432 int 2433 meta_setup_db_locations( 2434 md_error_t *ep 2435 ) 2436 { 2437 mddb_config_t c; 2438 FILE *fp; 2439 char inbuff[1024]; 2440 char *buff; 2441 uint_t i; 2442 size_t sz; 2443 int rval = 0; 2444 char *devidp; 2445 uint_t devid_size; 2446 char *minor_name = NULL; 2447 ddi_devid_t devid_decode; 2448 int checksum; 2449 2450 /* do mddb.cf file */ 2451 (void) memset(&c, '\0', sizeof (c)); 2452 if ((fp = fopen(META_DBCONF, "r")) == NULL) { 2453 if (errno != ENOENT) 2454 return (mdsyserror(ep, errno, META_DBCONF)); 2455 } 2456 while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1), 2457 fp)) != NULL)) { 2458 2459 /* ignore comments */ 2460 if (*buff == '#') 2461 continue; 2462 2463 /* parse locator */ 2464 (void) memset(&c, 0, sizeof (c)); 2465 c.c_setno = MD_LOCAL_SET; 2466 i = strcspn(buff, " \t"); 2467 if (i > sizeof (c.c_locator.l_driver)) 2468 i = sizeof (c.c_locator.l_driver); 2469 (void) strncpy(c.c_locator.l_driver, buff, i); 2470 buff += i; 2471 c.c_locator.l_dev = 2472 makedev((major_t)0, (minor_t)strtol(buff, &buff, 10)); 2473 c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10); 2474 c.c_locator.l_mnum = minor(c.c_locator.l_dev); 2475 2476 /* parse out devid */ 2477 while (isspace((int)(*buff))) 2478 buff += 1; 2479 i = strcspn(buff, " \t"); 2480 if ((devidp = (char *)malloc(i+1)) == NULL) 2481 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2482 2483 (void) strncpy(devidp, buff, i); 2484 devidp[i] = '\0'; 2485 if (devid_str_decode(devidp, &devid_decode, 2486 &minor_name) == -1) { 2487 free(devidp); 2488 continue; 2489 } 2490 2491 /* Conf file must have minor name associated with devid */ 2492 if (minor_name == NULL) { 2493 free(devidp); 2494 devid_free(devid_decode); 2495 continue; 2496 } 2497 2498 sz = devid_sizeof(devid_decode); 2499 /* Copy to devid size buffer that ioctl expects */ 2500 if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) { 2501 devid_free(devid_decode); 2502 free(minor_name); 2503 free(devidp); 2504 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2505 } 2506 2507 (void) memcpy((void *)(uintptr_t)c.c_locator.l_devid, 2508 (void *)devid_decode, sz); 2509 2510 devid_free(devid_decode); 2511 2512 if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) { 2513 free(minor_name); 2514 free(devidp); 2515 free((void *)(uintptr_t)c.c_locator.l_devid); 2516 return (mdsyserror(ep, ENOMEM, META_DBCONF)); 2517 } 2518 (void) strcpy(c.c_locator.l_minor_name, minor_name); 2519 free(minor_name); 2520 c.c_locator.l_devid_flags = MDDB_DEVID_VALID | 2521 MDDB_DEVID_SPACE | MDDB_DEVID_SZ; 2522 c.c_locator.l_devid_sz = sz; 2523 2524 devid_size = strlen(devidp); 2525 buff += devid_size; 2526 2527 checksum = strtol(buff, &buff, 10); 2528 for (i = 0; c.c_locator.l_driver[i] != 0; i++) 2529 checksum += c.c_locator.l_driver[i]; 2530 for (i = 0; i < devid_size; i++) { 2531 checksum += devidp[i]; 2532 } 2533 free(devidp); 2534 2535 checksum += minor(c.c_locator.l_dev); 2536 checksum += c.c_locator.l_blkno; 2537 if (checksum != 42) { 2538 /* overwritten later for more serious problems */ 2539 rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF); 2540 free((void *)(uintptr_t)c.c_locator.l_devid); 2541 continue; 2542 } 2543 c.c_locator.l_flags = 0; 2544 2545 /* use db location */ 2546 if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) { 2547 free((void *)(uintptr_t)c.c_locator.l_devid); 2548 return (mdstealerror(ep, &c.c_mde)); 2549 } 2550 2551 /* free up devid if in use */ 2552 free((void *)(uintptr_t)c.c_locator.l_devid); 2553 c.c_locator.l_devid = (uint64_t)0; 2554 c.c_locator.l_devid_flags = 0; 2555 } 2556 if ((fp) && (fclose(fp) != 0)) 2557 return (mdsyserror(ep, errno, META_DBCONF)); 2558 2559 /* check for stale database */ 2560 (void) memset((char *)&c, 0, sizeof (struct mddb_config)); 2561 c.c_id = 0; 2562 c.c_setno = MD_LOCAL_SET; 2563 2564 /* 2565 * While we do not need the devid here we may need to 2566 * know if devid's are being used by the kernel for 2567 * the replicas. This is because under some circumstances 2568 * we can only manipulate the SVM configuration if the 2569 * kernel is using devid's. 2570 */ 2571 c.c_locator.l_devid = (uint64_t)0; 2572 c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ; 2573 c.c_locator.l_devid_sz = 0; 2574 2575 if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) { 2576 if (! mdismddberror(&c.c_mde, MDE_DB_INVALID)) 2577 return (mdstealerror(ep, &c.c_mde)); 2578 mdclrerror(&c.c_mde); 2579 } 2580 2581 if (c.c_flags & MDDB_C_STALE) 2582 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET, 2583 0, NULL)); 2584 2585 if (c.c_locator.l_devid_sz != 0) { 2586 /* 2587 * Devid's are being used to track the replicas because 2588 * there is space for a devid. 2589 */ 2590 devid_in_use = TRUE; 2591 } 2592 2593 /* success */ 2594 return (rval); 2595 } 2596 2597 /* 2598 * meta_db_minreplica - returns the minimum size replica currently in use. 2599 */ 2600 daddr_t 2601 meta_db_minreplica( 2602 mdsetname_t *sp, 2603 md_error_t *ep 2604 ) 2605 { 2606 md_replica_t *r; 2607 md_replicalist_t *rl, *rlp = NULL; 2608 daddr_t nblks = 0; 2609 2610 if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0) 2611 return (-1); 2612 2613 if (rlp == NULL) 2614 return (-1); 2615 2616 /* find the smallest existing replica */ 2617 for (rl = rlp; rl != NULL; rl = rl->rl_next) { 2618 r = rl->rl_repp; 2619 nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks)); 2620 } 2621 2622 metafreereplicalist(rlp); 2623 return (nblks); 2624 } 2625 2626 /* 2627 * meta_get_replica_names 2628 * returns an mdnamelist_t of replica slices 2629 */ 2630 /*ARGSUSED*/ 2631 int 2632 meta_get_replica_names( 2633 mdsetname_t *sp, 2634 mdnamelist_t **nlpp, 2635 int options, 2636 md_error_t *ep 2637 ) 2638 { 2639 md_replicalist_t *rlp = NULL; 2640 md_replicalist_t *rl; 2641 mdnamelist_t **tailpp = nlpp; 2642 int cnt = 0; 2643 2644 assert(nlpp != NULL); 2645 2646 if (!metaislocalset(sp)) 2647 goto out; 2648 2649 /* get replicas */ 2650 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) { 2651 cnt = -1; 2652 goto out; 2653 } 2654 2655 /* build name list */ 2656 for (rl = rlp; (rl != NULL); rl = rl->rl_next) { 2657 /* 2658 * Add the name struct to the end of the 2659 * namelist but keep a pointer to the last 2660 * element so that we don't incur the overhead 2661 * of traversing the list each time 2662 */ 2663 tailpp = meta_namelist_append_wrapper( 2664 tailpp, rl->rl_repp->r_namep); 2665 ++cnt; 2666 } 2667 2668 /* cleanup, return count or error */ 2669 out: 2670 metafreereplicalist(rlp); 2671 return (cnt); 2672 } 2673