xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_db.c (revision 355b4669e025ff377602b6fc7caaf30dbc218371)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * Metadevice database interfaces.
38  */
39 
40 #define	MDDB
41 
42 #include <meta.h>
43 #include <sys/lvm/md_mddb.h>
44 #include <sys/lvm/md_crc.h>
45 #include <sys/lvm/mdio.h>
46 #include <string.h>
47 #include <strings.h>
48 #include <ctype.h>
49 
/*
 * One entry of svmd_kill_list: a daemon name paired with a short string.
 * NOTE(review): the strings look like signal names without the "SIG"
 * prefix -- confirm against the code that walks the table.
 */
struct svm_daemon {
	char	*svmd_name;
	char	*svmd_kill_val;
};

/* Table of SVM daemons; DAEMON_COUNT gives the number of entries. */
struct svm_daemon svmd_kill_list[] = {
	{ "mdmonitord",	"HUP" },
	{ "mddoors",	"KILL" },
};

#define	DAEMON_COUNT	\
	(sizeof (svmd_kill_list) / sizeof (svmd_kill_list[0]))
#define	MDMONITORD	"/usr/sbin/mdmonitord"
62 
63 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep);
64 
65 /*
66  * meta_get_lb_inittime sends a request for the lb_inittime to the kernel
67  */
68 md_timeval32_t
69 meta_get_lb_inittime(
70 	mdsetname_t	*sp,
71 	md_error_t	*ep
72 )
73 {
74 	mddb_config_t	c;
75 
76 	(void) memset(&c, 0, sizeof (c));
77 
78 	/* Fill in setno, setname, and sideno */
79 	c.c_setno = sp->setno;
80 
81 	if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) {
82 		(void) mdstealerror(ep, &c.c_mde);
83 	}
84 
85 	return (c.c_timestamp);
86 }
87 
88 /*
89  * mkmasterblks writes out the master blocks of the mddb to the replica.
90  *
91  * In a MN diskset, this is called by the node that is adding this replica
92  * to the diskset.
93  */
94 
95 #define	MDDB_VERIFY_SIZE	8192
96 
97 static int
98 mkmasterblks(
99 	mdsetname_t	*sp,
100 	mdname_t	*np,
101 	int		fd,
102 	daddr_t		firstblk,
103 	int		dbsize,
104 	md_timeval32_t	inittime,
105 	md_error_t	*ep
106 )
107 {
108 	int		consecutive;
109 	md_timeval32_t	tp;
110 	struct mddb_mb	*mb;
111 	char		*buffer;
112 	int		iosize;
113 	md_set_desc	*sd;
114 	int		mn_set = 0;
115 	daddr_t		startblk;
116 	int		cnt;
117 	ddi_devid_t	devid;
118 
119 	if (! metaislocalset(sp)) {
120 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
121 			return (-1);
122 
123 		if (MD_MNSET_DESC(sd)) {
124 			mn_set = 1;		/* Used later */
125 		}
126 	}
127 
128 	/*
129 	 * Loop to verify the entire mddb region on disk is read/writable.
130 	 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block
131 	 * chunks.
132 	 *
133 	 * A side-effect of this loop is to zero out the entire mddb region
134 	 */
135 	if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL)
136 		return (mdsyserror(ep, ENOMEM, np->rname));
137 
138 	startblk = firstblk;
139 	for (cnt = dbsize; cnt > 0; cnt -= consecutive) {
140 
141 		if (cnt > MDDB_VERIFY_SIZE)
142 			consecutive = MDDB_VERIFY_SIZE;
143 		else
144 			consecutive = cnt;
145 
146 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
147 			Free(buffer);
148 			return (mdsyserror(ep, errno, np->rname));
149 		}
150 
151 		iosize = DEV_BSIZE * consecutive;
152 		if (write(fd, buffer, iosize) != iosize) {
153 			Free(buffer);
154 			return (mdsyserror(ep, errno, np->rname));
155 		}
156 
157 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
158 			Free(buffer);
159 			return (mdsyserror(ep, errno, np->rname));
160 		}
161 
162 		if (read(fd, buffer, iosize) != iosize) {
163 			Free(buffer);
164 			return (mdsyserror(ep, errno, np->rname));
165 		}
166 
167 		startblk += consecutive;
168 	}
169 
170 	Free(buffer);
171 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
172 		return (mdsyserror(ep, ENOMEM, np->rname));
173 
174 	if (meta_gettimeofday(&tp) == -1) {
175 		Free(mb);
176 		return (mdsyserror(ep, errno, np->rname));
177 	}
178 
179 	mb->mb_magic = MDDB_MAGIC_MB;
180 	/*
181 	 * If a MN diskset, set master block revision for a MN set.
182 	 * Even though the master block structure is no different
183 	 * for a MN set, setting the revision field to a different
184 	 * number keeps any pre-MN_diskset code from accessing
185 	 * this diskset.  It also allows for an early determination
186 	 * of a MN diskset when reading in from disk so that the
187 	 * proper size locator block and locator names structure
188 	 * can be read in thus saving time on diskset startup.
189 	 */
190 	if (mn_set)
191 		mb->mb_revision = MDDB_REV_MNMB;
192 	else
193 		mb->mb_revision = MDDB_REV_MB;
194 	mb->mb_timestamp = tp;
195 	mb->mb_setno = sp->setno;
196 	mb->mb_blkcnt = dbsize - 1;
197 	mb->mb_blkno = firstblk;
198 	mb->mb_nextblk = 0;
199 
200 	mb->mb_blkmap.m_firstblk = firstblk + 1;
201 	mb->mb_blkmap.m_consecutive = dbsize - 1;
202 	if (! metaislocalset(sp)) {
203 		mb->mb_setcreatetime = inittime;
204 	}
205 
206 	/*
207 	 * We try to save the disks device ID into the remaining bytes in
208 	 * the master block. The saved devid is used to provide a mapping
209 	 * between this disk's devid and the devid stored into the master
210 	 * block. This allows the disk image to be self-identifying
211 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
212 	 * when we try to import these disks on the remote copied image.
213 	 * If we cannot save the disks device ID onto the master block that is
214 	 * ok.  The disk is just not self-identifying and won't be importable
215 	 * in the remote copy scenario.
216 	 */
217 	if (devid_get(fd, &devid) == 0) {
218 		size_t len;
219 
220 		len = devid_sizeof(devid);
221 		if (len <= DEV_BSIZE - sizeof (*mb)) {
222 			/* there is enough space to store the devid */
223 			mb->mb_devid_magic = MDDB_MAGIC_DE;
224 			mb->mb_devid_len = len;
225 			(void) memcpy(mb->mb_devid, devid, len);
226 		}
227 		devid_free(devid);
228 	}
229 
230 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
231 	    (crc_skip_t *)NULL);
232 
233 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
234 		Free(mb);
235 		return (mdsyserror(ep, errno, np->rname));
236 	}
237 
238 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
239 		Free(mb);
240 		return (mdsyserror(ep, errno, np->rname));
241 	}
242 
243 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
244 		Free(mb);
245 		return (mdsyserror(ep, errno, np->rname));
246 	}
247 
248 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
249 		Free(mb);
250 		return (mdsyserror(ep, errno, np->rname));
251 	}
252 
253 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
254 		(uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) {
255 		Free(mb);
256 		return (mdmddberror(ep, MDE_NOTVERIFIED,
257 			meta_getminor(np->dev), sp->setno, 0, np->rname));
258 	}
259 
260 	Free(mb);
261 	return (0);
262 }
263 
264 void
265 meta_mkdummymaster(
266 	mdsetname_t	*sp,
267 	int		fd,
268 	daddr_t		firstblk
269 )
270 {
271 	md_timeval32_t	tp;
272 	struct mddb_mb	*mb;
273 	ddi_devid_t	devid;
274 	md_set_desc	*sd;
275 	md_error_t	ep = mdnullerror;
276 	md_timeval32_t	inittime;
277 
278 	/*
279 	 * No dummy master blocks are written for a MN diskset since devids
280 	 * are not supported in MN disksets.
281 	 */
282 	if (! metaislocalset(sp)) {
283 		if ((sd = metaget_setdesc(sp, &ep)) == NULL)
284 			return;
285 
286 		if (MD_MNSET_DESC(sd))
287 			return;
288 	}
289 
290 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
291 		return;
292 
293 	mb->mb_magic = MDDB_MAGIC_DU;
294 	mb->mb_revision = MDDB_REV_MB;
295 	mb->mb_setno = sp->setno;
296 	inittime = meta_get_lb_inittime(sp, &ep);
297 	mb->mb_setcreatetime = inittime;
298 
299 	if (meta_gettimeofday(&tp) != -1)
300 		mb->mb_timestamp = tp;
301 
302 	/*
303 	 * We try to save the disks device ID into the remaining bytes in
304 	 * the master block.  This allows the disk image to be self-identifying
305 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
306 	 * when we try to import these disks on the remote copied image.
307 	 * If we cannot save the disks device ID onto the master block that is
308 	 * ok.  The disk is just not self-identifying and won't be importable
309 	 * in the remote copy scenario.
310 	 */
311 	if (devid_get(fd, &devid) == 0) {
312 		int len;
313 
314 		len = devid_sizeof(devid);
315 		if (len <= DEV_BSIZE - sizeof (*mb)) {
316 			/* there is enough space to store the devid */
317 			mb->mb_devid_magic = MDDB_MAGIC_DE;
318 			mb->mb_devid_len = len;
319 			(void) memcpy(mb->mb_devid, (char *)devid, len);
320 		}
321 		devid_free(devid);
322 	}
323 
324 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
325 	    (crc_skip_t *)NULL);
326 
327 	/*
328 	 * If any of these operations fail, we need to inform the
329 	 * user that the disk won't be self identifying. When support
330 	 * for importing remotely replicated disksets is added, we
331 	 * want to add the error messages here.
332 	 */
333 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
334 		goto out;
335 
336 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE)
337 		goto out;
338 
339 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
340 		goto out;
341 
342 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE)
343 		goto out;
344 
345 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
346 	    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
347 		goto out;
348 
349 out:
350 	Free(mb);
351 }
352 
353 static int
354 buildconf(mdsetname_t *sp, md_error_t *ep)
355 {
356 	md_replicalist_t	*rlp = NULL;
357 	md_replicalist_t	*rl;
358 	FILE			*cfp = NULL;
359 	FILE			*mfp = NULL;
360 	struct stat		sbuf;
361 	int			rval = 0;
362 	int			in_miniroot = 0;
363 	char			line[MDDB_BOOTLIST_MAX_LEN];
364 	char			*tname = NULL;
365 
366 	/* get list of local replicas */
367 	if (! metaislocalset(sp))
368 		return (0);
369 
370 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
371 		return (-1);
372 
373 	/* open tempfile, copy permissions of original file */
374 	if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) {
375 		/*
376 		 * On the miniroot tmp files must be created in /var/tmp.
377 		 * If we get a EROFS error, we assume that we are in the
378 		 * miniroot.
379 		 */
380 		if (errno != EROFS)
381 			goto error;
382 		in_miniroot = 1;
383 		errno = 0;
384 		tname = tempnam("/var/tmp", "slvm_");
385 		if (tname == NULL && errno == EROFS) {
386 			/*
387 			 * If we are booted on a read-only root because
388 			 * of mddb quorum problems we don't want to emit
389 			 * any scary error messages.
390 			 */
391 			errno = 0;
392 			goto out;
393 		}
394 
395 		/* open tempfile, copy permissions of original file */
396 		if ((cfp = fopen(tname, "w+")) == NULL)
397 			goto error;
398 	}
399 	if (stat(META_DBCONF, &sbuf) == 0) {
400 		if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0)
401 			goto error;
402 		if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0)
403 			goto error;
404 	}
405 
406 	/* print header */
407 	if (fprintf(cfp, "#metadevice database location file ") == EOF)
408 		goto error;
409 	if (fprintf(cfp, "do not hand edit\n") < 0)
410 		goto error;
411 	if (fprintf(cfp,
412 		"#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0)
413 		goto error;
414 
415 	/* dump replicas */
416 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
417 		md_replica_t	*r = rl->rl_repp;
418 		int		checksum = 42;
419 		int		i;
420 		char		*devidp;
421 		minor_t		min;
422 
423 		devidp = devid_str_encode(r->r_devid, r->r_minor_name);
424 		/* If devid code can't encode devidp - skip entry */
425 		if (devidp == NULL) {
426 			continue;
427 		}
428 
429 		/* compute checksum */
430 		for (i = 0; ((r->r_driver_name[i] != '\0') &&
431 		    (i < sizeof (r->r_driver_name))); i++) {
432 			checksum -= r->r_driver_name[i];
433 		}
434 		min = meta_getminor(r->r_namep->dev);
435 		checksum -= min;
436 		checksum -= r->r_blkno;
437 
438 		for (i = 0; i < strlen(devidp); i++) {
439 			checksum -= devidp[i];
440 		}
441 		/* print info */
442 		if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n",
443 		    r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) {
444 			goto error;
445 		}
446 
447 		devid_str_free(devidp);
448 	}
449 
450 	/* close and rename to real file */
451 	if (fflush(cfp) != 0)
452 		goto error;
453 	if (fsync(fileno(cfp)) != 0)
454 		goto error;
455 	if (fclose(cfp) != 0) {
456 		cfp = NULL;
457 		goto error;
458 	}
459 	cfp = NULL;
460 
461 	/*
462 	 * Renames don't work in the miniroot since tmpfiles are
463 	 * created in /var/tmp. Hence we copy the data out.
464 	 */
465 
466 	if (! in_miniroot) {
467 		if (rename(META_DBCONFTMP, META_DBCONF) != 0)
468 			goto error;
469 	} else {
470 		if ((cfp = fopen(tname, "r")) == NULL)
471 			goto error;
472 		if ((mfp = fopen(META_DBCONF, "w+")) == NULL)
473 			goto error;
474 		while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) {
475 			if (fputs(line, mfp) == NULL)
476 				goto error;
477 		}
478 		(void) fclose(cfp);
479 		cfp = NULL;
480 		if (fflush(mfp) != 0)
481 			goto error;
482 		if (fsync(fileno(mfp)) != 0)
483 			goto error;
484 		if (fclose(mfp) != 0) {
485 			mfp = NULL;
486 			goto error;
487 		}
488 		/* delete the tempfile */
489 		(void) unlink(tname);
490 	}
491 	/* success */
492 	rval = 0;
493 	goto out;
494 
495 	/* tempfile error */
496 error:
497 	rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
498 				mdsyserror(ep, errno, META_DBCONFTMP);
499 
500 
501 	/* cleanup, return success */
502 out:
503 	if (rlp != NULL)
504 		metafreereplicalist(rlp);
505 	if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) {
506 		rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
507 					mdsyserror(ep, errno, META_DBCONFTMP);
508 	}
509 	free(tname);
510 	return (rval);
511 }
512 
513 /*
514  * check replica for dev
515  */
516 static int
517 in_replica(
518 	mdsetname_t	*sp,
519 	md_replica_t	*rp,
520 	mdname_t	*np,
521 	diskaddr_t	slblk,
522 	diskaddr_t	nblks,
523 	md_error_t	*ep
524 )
525 {
526 	mdname_t	*repnp = rp->r_namep;
527 	diskaddr_t	rep_sblk = rp->r_blkno;
528 	diskaddr_t	rep_nblks = rp->r_nblk;
529 
530 	/* should be in the same set */
531 	assert(sp != NULL);
532 
533 	/* if error in master block, assume whole partition */
534 	if ((rep_sblk == MD_DISKADDR_ERROR) ||
535 	    (rep_nblks == MD_DISKADDR_ERROR)) {
536 		rep_sblk = 0;
537 		rep_nblks = MD_DISKADDR_ERROR;
538 	}
539 
540 	/* check overlap */
541 	if (meta_check_overlap(
542 	    MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) {
543 		return (-1);
544 	}
545 
546 	/* return success */
547 	return (0);
548 }
549 
550 /*
551  * check to see if we're in a replica
552  */
553 int
554 meta_check_inreplica(
555 	mdsetname_t		*sp,
556 	mdname_t		*np,
557 	diskaddr_t		slblk,
558 	diskaddr_t		nblks,
559 	md_error_t		*ep
560 )
561 {
562 	md_replicalist_t	*rlp = NULL;
563 	md_replicalist_t	*rl;
564 	int			rval = 0;
565 
566 	/* should have a set */
567 	assert(sp != NULL);
568 
569 	/* for each replica */
570 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
571 		return (-1);
572 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
573 		md_replica_t	*rp = rl->rl_repp;
574 
575 		/* check replica */
576 		if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) {
577 			rval = -1;
578 			break;
579 		}
580 	}
581 
582 	/* cleanup, return success */
583 	metafreereplicalist(rlp);
584 	return (rval);
585 }
586 
587 /*
588  * check replica
589  */
590 int
591 meta_check_replica(
592 	mdsetname_t	*sp,		/* set to check against */
593 	mdname_t	*np,		/* component to check against */
594 	mdchkopts_t	options,	/* option flags */
595 	diskaddr_t	slblk,		/* start logical block */
596 	diskaddr_t	nblks,		/* number of blocks (-1,rest of them) */
597 	md_error_t	*ep		/* error packet */
598 )
599 {
600 	mdchkopts_t	chkoptions = MDCHK_ALLOW_REPSLICE;
601 
602 	/* make sure we have a disk */
603 	if (metachkcomp(np, ep) != 0)
604 		return (-1);
605 
606 	/* check to ensure that it is not already in use */
607 	if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) {
608 		return (-1);
609 	}
610 
611 	if (options & MDCHK_ALLOW_NODBS)
612 		return (0);
613 
614 	if (options & MDCHK_DRVINSET)
615 		return (0);
616 
617 	/* make sure it is in the set */
618 	if (meta_check_inset(sp, np, ep) != 0)
619 		return (-1);
620 
621 	/* make sure its not in a metadevice */
622 	if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0)
623 		return (-1);
624 
625 	/* return success */
626 	return (0);
627 }
628 
629 static int
630 update_dbinfo_on_drives(
631 	mdsetname_t	*sp,
632 	md_drive_desc	*dd,
633 	int		set_locked,
634 	int		force,
635 	md_error_t	*ep
636 )
637 {
638 	md_set_desc		*sd;
639 	int			i;
640 	md_setkey_t		*cl_sk;
641 	int			rval = 0;
642 	md_mnnode_desc		*nd;
643 
644 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
645 		return (-1);
646 
647 	if (! set_locked) {
648 		if (MD_MNSET_DESC(sd)) {
649 			md_error_t xep = mdnullerror;
650 			sigset_t sigs;
651 			/* Make sure we are blocking all signals */
652 			if (procsigs(TRUE, &sigs, &xep) < 0)
653 				mdclrerror(&xep);
654 
655 			nd = sd->sd_nodelist;
656 			while (nd) {
657 				if (force && strcmp(nd->nd_nodename,
658 				    mynode()) != 0) {
659 					nd = nd->nd_next;
660 					continue;
661 				}
662 
663 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
664 					nd = nd->nd_next;
665 					continue;
666 				}
667 
668 				if (clnt_lock_set(nd->nd_nodename, sp, ep))
669 					return (-1);
670 				nd = nd->nd_next;
671 			}
672 		} else {
673 			for (i = 0; i < MD_MAXSIDES; i++) {
674 				/* Skip empty slots */
675 				if (sd->sd_nodes[i][0] == '\0')
676 					continue;
677 
678 				if (force && strcmp(sd->sd_nodes[i],
679 				    mynode()) != 0)
680 					continue;
681 
682 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep))
683 					return (-1);
684 			}
685 		}
686 	}
687 
688 	if (MD_MNSET_DESC(sd)) {
689 		nd = sd->sd_nodelist;
690 		while (nd) {
691 			if (force && strcmp(nd->nd_nodename, mynode()) != 0) {
692 				nd = nd->nd_next;
693 				continue;
694 			}
695 
696 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
697 				nd = nd->nd_next;
698 				continue;
699 			}
700 
701 			if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep)
702 			    == -1) {
703 				rval = -1;
704 				break;
705 			}
706 			nd = nd->nd_next;
707 		}
708 	} else {
709 		for (i = 0; i < MD_MAXSIDES; i++) {
710 			/* Skip empty slots */
711 			if (sd->sd_nodes[i][0] == '\0')
712 				continue;
713 
714 			if (force && strcmp(sd->sd_nodes[i], mynode()) != 0)
715 				continue;
716 
717 			if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep)
718 			    == -1) {
719 				rval = -1;
720 				break;
721 			}
722 		}
723 	}
724 
725 	if (! set_locked) {
726 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
727 		if (MD_MNSET_DESC(sd)) {
728 			nd = sd->sd_nodelist;
729 			while (nd) {
730 				if (force &&
731 				    strcmp(nd->nd_nodename, mynode()) != 0) {
732 					nd = nd->nd_next;
733 					continue;
734 				}
735 
736 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
737 					nd = nd->nd_next;
738 					continue;
739 				}
740 
741 				if (clnt_unlock_set(nd->nd_nodename, cl_sk,
742 				    ep)) {
743 					rval = -1;
744 					break;
745 				}
746 				nd = nd->nd_next;
747 			}
748 		} else {
749 			for (i = 0; i < MD_MAXSIDES; i++) {
750 				/* Skip empty slots */
751 				if (sd->sd_nodes[i][0] == '\0')
752 					continue;
753 
754 				if (force &&
755 				    strcmp(sd->sd_nodes[i], mynode()) != 0)
756 					continue;
757 
758 				if (clnt_unlock_set(sd->sd_nodes[i], cl_sk,
759 				    ep)) {
760 					rval = -1;
761 					break;
762 				}
763 			}
764 
765 		}
766 		cl_set_setkey(NULL);
767 	}
768 
769 	return (rval);
770 }
771 
772 int
773 meta_db_addsidenms(
774 	mdsetname_t	*sp,
775 	mdname_t	*np,
776 	daddr_t		blkno,
777 	int		bcast,
778 	md_error_t	*ep
779 )
780 {
781 	side_t		sideno;
782 	char		*bname = NULL;
783 	char		*dname = NULL;
784 	minor_t		mnum;
785 	mddb_config_t	c;
786 	int		done;
787 	int		rval = 0;
788 	md_set_desc	*sd;
789 
790 	sideno = MD_SIDEWILD;
791 	/*CONSTCOND*/
792 	while (1) {
793 		if (bname != NULL) {
794 			Free(bname);
795 			bname = NULL;
796 		}
797 		if (dname != NULL) {
798 			Free(dname);
799 			dname = NULL;
800 		}
801 		if ((done = meta_getnextside_devinfo(sp, np->bname,
802 		    &sideno, &bname, &dname, &mnum, ep)) == -1) {
803 			rval = -1;
804 			break;
805 		}
806 
807 		if (done == 0)
808 			break;
809 
810 		if (! metaislocalset(sp)) {
811 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
812 				rval = -1;
813 				break;
814 			}
815 		}
816 
817 		/*
818 		 * Send addsidenms to all nodes using rpc.mdcommd if
819 		 * sidename is being added to MN diskset.
820 		 *
821 		 *   It's ok to broadcast this call to other nodes.
822 		 *
823 		 *   Note: The broadcast to other nodes isn't needed during
824 		 *   the addition of the first mddbs to the set since the
825 		 *   other nodes haven't been joined to the set yet.  All
826 		 *   nodes in a MN diskset are (implicitly) joined to the set
827 		 *   on the addition of the first mddb.
828 		 */
829 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
830 		    (bcast == DB_ADDSIDENMS_BCAST)) {
831 			md_mn_result_t			*resultp = NULL;
832 			md_mn_msg_meta_db_newside_t	db_ns;
833 			int				send_rval;
834 
835 			db_ns.msg_l_dev = np->dev;
836 			db_ns.msg_sideno = sideno;
837 			db_ns.msg_blkno = blkno;
838 			(void) strncpy(db_ns.msg_dname, dname,
839 			    sizeof (db_ns.msg_dname));
840 			(void) splitname(np->bname, &db_ns.msg_splitname);
841 			db_ns.msg_mnum = mnum;
842 
843 			/* Set devid to NULL until devids are supported */
844 			db_ns.msg_devid[0] = NULL;
845 
846 			/*
847 			 * If reconfig cycle has been started, this node is
848 			 * stuck in in the return step until this command has
849 			 * completed.  If mdcommd is suspended, ask
850 			 * send_message to fail (instead of retrying)
851 			 * so that metaset can finish allowing the reconfig
852 			 * cycle to proceed.
853 			 */
854 			send_rval = mdmn_send_message(sp->setno,
855 			    MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
856 			    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns,
857 			    sizeof (md_mn_msg_meta_db_newside_t),
858 			    &resultp, ep);
859 			if (send_rval != 0) {
860 				rval = -1;
861 				if (resultp == NULL)
862 					(void) mddserror(ep,
863 					    MDE_DS_COMMD_SEND_FAIL,
864 					    sp->setno, NULL, NULL,
865 					    sp->setname);
866 				else {
867 					(void) mdstealerror(ep,
868 					    &(resultp->mmr_ep));
869 					if (mdisok(ep)) {
870 						(void) mddserror(ep,
871 						    MDE_DS_COMMD_SEND_FAIL,
872 						    sp->setno, NULL, NULL,
873 						    sp->setname);
874 					}
875 					free_result(resultp);
876 				}
877 				break;
878 			}
879 			if (resultp)
880 				free_result(resultp);
881 		} else {
882 			/*
883 			 * Let this side's  device name, minor # and driver name
884 			 * be known to the database replica.
885 			 */
886 			(void) memset(&c, 0, sizeof (c));
887 
888 			/* Fill in device/replica info */
889 			c.c_locator.l_dev = meta_cmpldev(np->dev);
890 			c.c_locator.l_blkno = blkno;
891 			(void) strncpy(c.c_locator.l_driver, dname,
892 			    sizeof (c.c_locator.l_driver));
893 			(void) splitname(bname, &c.c_devname);
894 			c.c_locator.l_mnum = mnum;
895 
896 			/* Fill in setno, setname, and sideno */
897 			c.c_setno = sp->setno;
898 			(void) strncpy(c.c_setname, sp->setname,
899 				sizeof (c.c_setname));
900 			c.c_sideno = sideno;
901 
902 			/*
903 			 * Don't need device id information from this ioctl
904 			 * Kernel determines device id from dev_t, which
905 			 * is just what this code would do.
906 			 */
907 			c.c_locator.l_devid = (uint64_t)0;
908 			c.c_locator.l_devid_flags = 0;
909 
910 			if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) {
911 				rval = mdstealerror(ep, &c.c_mde);
912 				break;
913 			}
914 		}
915 	}
916 
917 	/* cleanup, return success */
918 	if (bname != NULL) {
919 		Free(bname);
920 		bname = NULL;
921 	}
922 	if (dname != NULL) {
923 		Free(dname);
924 		dname = NULL;
925 	}
926 	return (rval);
927 }
928 
929 
930 int
931 meta_db_delsidenm(
932 	mdsetname_t	*sp,
933 	side_t		sideno,
934 	mdname_t	*np,
935 	daddr_t		blkno,
936 	md_error_t	*ep
937 )
938 {
939 	mddb_config_t	c;
940 	md_set_desc	*sd;
941 
942 	if (! metaislocalset(sp)) {
943 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
944 			return (-1);
945 	}
946 	/* Use rpc.mdcommd to delete mddb side from all nodes */
947 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
948 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
949 		md_mn_result_t			*resultp = NULL;
950 		md_mn_msg_meta_db_delside_t	db_ds;
951 		int				send_rval;
952 
953 		db_ds.msg_l_dev = np->dev;
954 		db_ds.msg_blkno = blkno;
955 		db_ds.msg_sideno = sideno;
956 
957 		/* Set devid to NULL until devids are supported */
958 		db_ds.msg_devid[0] = NULL;
959 
960 		/*
961 		 * If reconfig cycle has been started, this node is
962 		 * stuck in in the return step until this command has
963 		 * completed.  If mdcommd is suspended, ask
964 		 * send_message to fail (instead of retrying)
965 		 * so that metaset can finish allowing the reconfig
966 		 * cycle to proceed.
967 		 */
968 		send_rval = mdmn_send_message(sp->setno,
969 		    MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
970 		    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds,
971 		    sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
972 		if (send_rval != 0) {
973 			if (resultp == NULL)
974 				(void) mddserror(ep,
975 				    MDE_DS_COMMD_SEND_FAIL,
976 				    sp->setno, NULL, NULL,
977 				    sp->setname);
978 			else {
979 				(void) mdstealerror(ep, &(resultp->mmr_ep));
980 				if (mdisok(ep)) {
981 					(void) mddserror(ep,
982 					    MDE_DS_COMMD_SEND_FAIL,
983 					    sp->setno, NULL, NULL,
984 					    sp->setname);
985 				}
986 				free_result(resultp);
987 			}
988 			return (-1);
989 		}
990 		if (resultp)
991 			free_result(resultp);
992 
993 	} else {
994 		/*
995 		 * Let this side's  device name, minor # and driver name
996 		 * be known to the database replica.
997 		 */
998 		(void) memset(&c, 0, sizeof (c));
999 
1000 		/* Fill in device/replica info */
1001 		c.c_locator.l_dev = meta_cmpldev(np->dev);
1002 		c.c_locator.l_blkno = blkno;
1003 
1004 		/* Fill in setno, setname, and sideno */
1005 		c.c_setno = sp->setno;
1006 		(void) strcpy(c.c_setname, sp->setname);
1007 		c.c_sideno = sideno;
1008 
1009 		/*
1010 		 * Don't need device id information from this ioctl
1011 		 * Kernel determines device id from dev_t, which
1012 		 * is just what this code would do.
1013 		 */
1014 		c.c_locator.l_devid = (uint64_t)0;
1015 		c.c_locator.l_devid_flags = 0;
1016 
1017 		if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0)
1018 			return (mdstealerror(ep, &c.c_mde));
1019 	}
1020 	return (0);
1021 }
1022 
1023 
1024 static int
1025 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep)
1026 {
1027 	mdnamelist_t		*dnp1, *dnp2;
1028 
1029 	for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) {
1030 		for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) {
1031 			if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0)
1032 				return (mderror(ep, MDE_DUPDRIVE,
1033 				    dnp1->namep->cname));
1034 		}
1035 	}
1036 	return (0);
1037 }
1038 
1039 
/*
 * Compare the contents of the files tsname and sname.
 *
 * Returns 0 only if both files could be read completely and contain the
 * same bytes.  Returns 1 if the contents differ or if either file cannot
 * be stat'ed, opened or fully read, or memory cannot be allocated (any
 * failure is treated as "different").
 */
static int
filediff(char *tsname, char *sname)
{
	int		ret = 1;
	int		fd;
	size_t		fsize;
	ssize_t		nread;
	struct stat	sbuf;
	char		*tbuf = NULL;
	char		*buf = NULL;

	/* files of different size cannot be identical */
	if (stat(tsname, &sbuf) != 0)
		return (1);
	fsize = (size_t)sbuf.st_size;
	if (stat(sname, &sbuf) != 0)
		return (1);
	if (fsize != (size_t)sbuf.st_size)
		return (1);

	/*
	 * Two empty files are identical.  Deciding that here also avoids
	 * malloc(0), which may return NULL and would otherwise be taken
	 * for an allocation failure.
	 */
	if (fsize == 0)
		return (0);

	/* allocate memory and read both files into buffer */
	tbuf = malloc(fsize);
	buf = malloc(fsize);
	if (tbuf == NULL || buf == NULL)
		goto out;

	if ((fd = open(tsname, O_RDONLY)) == -1)
		goto out;
	nread = read(fd, tbuf, fsize);
	(void) close(fd);
	if (nread < 0 || (size_t)nread != fsize)
		goto out;

	if ((fd = open(sname, O_RDONLY)) == -1)
		goto out;
	nread = read(fd, buf, fsize);
	(void) close(fd);
	if (nread < 0 || (size_t)nread != fsize)
		goto out;

	/* compare content; normalize the result to exactly 0 or 1 */
	ret = (memcmp(tbuf, buf, fsize) != 0);
out:
	free(tbuf);
	free(buf);
	return (ret);
}
1091 
1092 /*
1093  * patch md.conf file with mddb locations
1094  */
1095 int
1096 meta_db_patch(
1097 	char		*sname,		/* system file name */
1098 	char		*cname,		/* mddb.cf file name */
1099 	int		patch,		/* patching locally */
1100 	md_error_t	*ep
1101 )
1102 {
1103 	char		*tsname = NULL;
1104 	char		line[MDDB_BOOTLIST_MAX_LEN];
1105 	FILE		*tsfp = NULL;
1106 	FILE		*mfp = NULL;
1107 	int		rval = -1;
1108 
1109 	/* check names */
1110 	if (sname == NULL) {
1111 		if (patch)
1112 			sname = "md.conf";
1113 		else
1114 			sname = "/kernel/drv/md.conf";
1115 	}
1116 	if (cname == NULL)
1117 		cname = META_DBCONF;
1118 
1119 	/*
1120 	 * edit file
1121 	 */
1122 	if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) {
1123 		if (mdissyserror(ep, EROFS)) {
1124 			/*
1125 			 * If we are booted on a read-only root because
1126 			 * of mddb quorum problems we don't want to emit
1127 			 * any scary error messages.
1128 			 */
1129 			mdclrerror(ep);
1130 			rval = 0;
1131 		}
1132 		goto out;
1133 	}
1134 
1135 	if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0,
1136 	    ep) != 0)
1137 		goto out;
1138 
1139 	/* if file content is identical, skip rename */
1140 	if (filediff(tsname, sname) == 0) {
1141 		rval = 0;
1142 		goto out;
1143 	}
1144 
1145 	if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) ||
1146 					    (fclose(tsfp) != 0)) {
1147 		(void) mdsyserror(ep, errno, tsname);
1148 		goto out;
1149 	}
1150 
1151 	tsfp = NULL;
1152 
1153 	/*
1154 	 * rename file. If we get a Cross Device error then it
1155 	 * is because we are in the miniroot.
1156 	 */
1157 	if (rename(tsname, sname) != 0 && errno != EXDEV) {
1158 		(void) mdsyserror(ep, errno, sname);
1159 		goto out;
1160 	}
1161 
1162 	if (errno == EXDEV) {
1163 		if ((tsfp = fopen(tsname, "r")) == NULL)
1164 			goto out;
1165 		if ((mfp = fopen(sname, "w+")) == NULL)
1166 			goto out;
1167 		while (fgets(line, sizeof (line), tsfp) != NULL) {
1168 			if (fputs(line, mfp) == NULL)
1169 				goto out;
1170 		}
1171 		(void) fclose(tsfp);
1172 		tsfp = NULL;
1173 		if (fflush(mfp) != 0)
1174 			goto out;
1175 		if (fsync(fileno(mfp)) != 0)
1176 			goto out;
1177 		if (fclose(mfp) != 0) {
1178 			mfp = NULL;
1179 			goto out;
1180 		}
1181 	}
1182 
1183 	Free(tsname);
1184 	tsname = NULL;
1185 	rval = 0;
1186 
1187 	/* cleanup, return error */
1188 out:
1189 	if (tsfp != NULL)
1190 		(void) fclose(tsfp);
1191 	if (tsname != NULL) {
1192 		(void) unlink(tsname);
1193 		Free(tsname);
1194 	}
1195 	return (rval);
1196 }
1197 
1198 /*
1199  * Add replicas to set.  This happens as a result of:
1200  *	- metadb [-s set_name] -a
1201  *	- metaset -s set_name -a disk
1202  *	- metaset -s set_name -d disk	 (causes a rebalance of mddbs)
1203  *	- metaset -s set_name -b
1204  *
1205  * For a local set, this routine is run on the local set host.
1206  *
1207  * For a traditional diskset, this routine is run on the node that
1208  * is running the metaset command.
1209  *
1210  * For a multinode diskset, this routine is run by the node that is
1211  * running the metaset command.  If this is the first mddb added to
1212  * the MN diskset, then no communication is made to other nodes via commd
1213  * since the other nodes will be in-sync with respect to the mddbs when
1214  * those other nodes join the set and snarf in the newly created mddb.
1215  * If this is not the first mddb added to the MN diskset, then this
1216  * attach command is sent to all of the nodes using commd.  This keeps
1217  * the nodes in-sync.
1218  */
1219 int
1220 meta_db_attach(
1221 	mdsetname_t		*sp,
1222 	mdnamelist_t		*db_nlp,
1223 	mdchkopts_t		options,
1224 	md_timeval32_t		*timeval,
1225 	int			dbcnt,
1226 	int			dbsize,
1227 	char			*sysfilename,
1228 	md_error_t		*ep
1229 )
1230 {
1231 	struct mddb_config	c;
1232 	mdnamelist_t		*nlp;
1233 	mdname_t		*np;
1234 	md_drive_desc		*dd = NULL;
1235 	md_drive_desc		*p;
1236 	int			i;
1237 	int			fd;
1238 	side_t			sideno;
1239 	daddr_t			blkno;
1240 	int			replicacount = 0;
1241 	int			start_mdmonitord = 0;
1242 	int			rval = 0;
1243 	md_error_t		status = mdnullerror;
1244 	md_set_desc		*sd;
1245 	int			stale_bool = FALSE;
1246 	int			flags;
1247 	int			firstmddb = 1;
1248 	md_timeval32_t		inittime = {0, 0};
1249 
1250 	/*
1251 	 * Error if we don't get some work to do.
1252 	 */
1253 	if (db_nlp == NULL)
1254 		return (mdsyserror(ep, EINVAL, NULL));
1255 
1256 	if (mdnamesareunique(db_nlp, ep) != 0)
1257 		return (-1);
1258 	(void) memset(&c, 0, sizeof (c));
1259 	c.c_id = 0;
1260 	c.c_setno = sp->setno;
1261 
1262 	/* Don't need device id information from this ioctl */
1263 	c.c_locator.l_devid = (uint64_t)0;
1264 	c.c_locator.l_devid_flags = 0;
1265 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1266 		if (metaislocalset(sp)) {
1267 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID))
1268 				mdclrerror(&c.c_mde);
1269 			else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) ||
1270 			    (! (options & MDCHK_ALLOW_NODBS)))
1271 				return (mdstealerror(ep, &c.c_mde));
1272 		} else {
1273 			if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER))
1274 				return (mdstealerror(ep, &c.c_mde));
1275 		}
1276 		mdclrerror(&c.c_mde);
1277 	}
1278 	/*
1279 	 * Is current set STALE?
1280 	 */
1281 	if (c.c_flags & MDDB_C_STALE) {
1282 		stale_bool = TRUE;
1283 	}
1284 
1285 	assert(db_nlp != NULL);
1286 
1287 	/* if creating the metadbs for the first time start mdmonitord */
1288 	if (c.c_dbcnt == 0)
1289 		start_mdmonitord = 1;
1290 
1291 	/*
1292 	 * check to see if we will go over the total possible number
1293 	 * of data bases
1294 	 */
1295 	nlp = db_nlp;
1296 	while (nlp) {
1297 		replicacount += dbcnt;
1298 		nlp = nlp->next;
1299 	}
1300 
1301 	if ((replicacount + c.c_dbcnt) > c.c_dbmax)
1302 		return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
1303 		    sp->setno, c.c_dbcnt + replicacount, NULL));
1304 
1305 	/*
1306 	 * go through and check to make sure all locations specified
1307 	 * are legal also pick out driver name;
1308 	 */
1309 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1310 		diskaddr_t devsize;
1311 
1312 		np = nlp->namep;
1313 
1314 		if (! metaislocalset(sp)) {
1315 			uint_t	partno;
1316 			uint_t	rep_partno;
1317 			mddrivename_t	*dnp = np->drivenamep;
1318 
1319 			/*
1320 			 * make sure that non-local database replicas
1321 			 * are always on the replica slice.
1322 			 */
1323 			if (meta_replicaslice(dnp,
1324 			    &rep_partno, ep) != 0)
1325 				return (-1);
1326 			if (metagetvtoc(np, FALSE, &partno, ep) == NULL)
1327 				return (-1);
1328 			if (partno != rep_partno)
1329 				return (mddeverror(ep, MDE_REPCOMP_ONLY,
1330 				    np->dev, sp->setname));
1331 		}
1332 
1333 		if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize),
1334 		    ep)) {
1335 			return (-1);
1336 		}
1337 
1338 		if ((devsize = metagetsize(np, ep)) == -1)
1339 			return (-1);
1340 
1341 		if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16))
1342 			return (mdmddberror(ep, MDE_REPLICA_TOOSMALL,
1343 			    meta_getminor(np->dev), sp->setno, devsize,
1344 			    np->cname));
1345 	}
1346 
1347 	/*
1348 	 * If first disk in set we don't have lb_inittime yet for use as
1349 	 * mb_setcreatetime so don't go looking for it. WE'll come back
1350 	 * later and update after the locator block has been created.
1351 	 * If this isn't the first disk in the set, we have a locator
1352 	 * block and thus we have lb_inittime. Set mb_setcreatetime to
1353 	 * lb_inittime.
1354 	 */
1355 	if (! metaislocalset(sp)) {
1356 		if (c.c_dbcnt != 0) {
1357 			firstmddb = 0;
1358 			inittime = meta_get_lb_inittime(sp, ep);
1359 		}
1360 	}
1361 
1362 	/*
1363 	 * go through and write all master blocks
1364 	 */
1365 
1366 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1367 		np = nlp->namep;
1368 
1369 		if ((fd = open(np->rname, O_RDWR)) < 0)
1370 			return (mdsyserror(ep, errno, np->rname));
1371 
1372 		for (i = 0; i < dbcnt; i++) {
1373 			if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize,
1374 			    inittime, ep)) {
1375 				(void) close(fd);
1376 				return (-1);
1377 			}
1378 		}
1379 		(void) close(fd);
1380 	}
1381 
1382 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1383 		return (-1);
1384 
1385 	if (! metaislocalset(sp)) {
1386 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1387 		if (! mdisok(ep))
1388 			return (-1);
1389 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1390 			return (-1);
1391 
1392 	}
1393 
1394 	/*
1395 	 * go through and tell kernel to add them
1396 	 */
1397 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1398 		mdcinfo_t	*cinfo;
1399 
1400 		np = nlp->namep;
1401 
1402 		if ((cinfo = metagetcinfo(np, ep)) == NULL) {
1403 			rval = -1;
1404 			goto out;
1405 		}
1406 
1407 		/*
1408 		 * If mddb is being added to MN diskset and there already
1409 		 * exists a valid mddb in the set (which equates to this
1410 		 * node being an owner of the set) then use rpc.mdcommd
1411 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1412 		 * If set is stale, don't log the message since rpc.mdcommd
1413 		 * can't write the message to the mddb.
1414 		 *
1415 		 * Otherwise, just add mddb to this node.
1416 		 */
1417 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1418 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1419 			md_mn_result_t			*resultp = NULL;
1420 			md_mn_msg_meta_db_attach_t	attach;
1421 			int 				send_rval;
1422 
1423 			/*
1424 			 * In a scenario where new replicas had been added on
1425 			 * the master, and then all of the old replicas failed
1426 			 * before the slaves had knowledge of the new replicas,
1427 			 * the slaves are unable to re-parse in the mddb
1428 			 * from the new replicas since the slaves have no
1429 			 * knowledge of the new replicas.  The following
1430 			 * algorithm solves this problem:
1431 			 * 	- META_DB_ATTACH message generates submsgs
1432 			 * 		- BLOCK parse (master)
1433 			 * 		- MDDB_ATTACH new replicas
1434 			 * 		- UNBLOCK parse (master) causing parse
1435 			 *		information to be sent from master
1436 			 *		to slaves at a higher class than the
1437 			 *		unblock so the parse message will
1438 			 *		reach slaves before unblock message.
1439 			 */
1440 			attach.msg_l_dev = np->dev;
1441 			attach.msg_cnt = dbcnt;
1442 			attach.msg_dbsize = dbsize;
1443 			(void) strncpy(attach.msg_dname, cinfo->dname,
1444 			    sizeof (attach.msg_dname));
1445 			(void) splitname(np->bname, &attach.msg_splitname);
1446 			attach.msg_options = options;
1447 
1448 			/* Set devid to NULL until devids are supported */
1449 			attach.msg_devid[0] = NULL;
1450 
1451 			/*
1452 			 * If reconfig cycle has been started, this node is
1453 			 * stuck in in the return step until this command has
1454 			 * completed.  If mdcommd is suspended, ask
1455 			 * send_message to fail (instead of retrying)
1456 			 * so that metaset can finish allowing the reconfig
1457 			 * cycle to proceed.
1458 			 */
1459 			flags = MD_MSGF_FAIL_ON_SUSPEND;
1460 			if (stale_bool == TRUE)
1461 				flags |= MD_MSGF_NO_LOG;
1462 			send_rval = mdmn_send_message(sp->setno,
1463 				MD_MN_MSG_META_DB_ATTACH,
1464 				flags, (char *)&attach,
1465 				sizeof (md_mn_msg_meta_db_attach_t),
1466 				&resultp, ep);
1467 			if (send_rval != 0) {
1468 				rval = -1;
1469 				if (resultp == NULL)
1470 					(void) mddserror(ep,
1471 					    MDE_DS_COMMD_SEND_FAIL,
1472 					    sp->setno, NULL, NULL,
1473 					    sp->setname);
1474 				else {
1475 					(void) mdstealerror(ep,
1476 					    &(resultp->mmr_ep));
1477 					if (mdisok(ep)) {
1478 						(void) mddserror(ep,
1479 						    MDE_DS_COMMD_SEND_FAIL,
1480 						    sp->setno, NULL, NULL,
1481 						    sp->setname);
1482 					}
1483 					free_result(resultp);
1484 				}
1485 				goto out;
1486 			}
1487 			if (resultp)
1488 				free_result(resultp);
1489 		} else {
1490 		    /* Adding mddb(s) to just this node */
1491 		    for (i = 0; i < dbcnt; i++) {
1492 			(void) memset(&c, 0, sizeof (c));
1493 			/* Fill in device/replica info */
1494 			c.c_locator.l_dev = meta_cmpldev(np->dev);
1495 			c.c_locator.l_blkno = i * dbsize + 16;
1496 			blkno = c.c_locator.l_blkno;
1497 			(void) strncpy(c.c_locator.l_driver, cinfo->dname,
1498 			    sizeof (c.c_locator.l_driver));
1499 			(void) splitname(np->bname, &c.c_devname);
1500 			c.c_locator.l_mnum = meta_getminor(np->dev);
1501 
1502 			/* Fill in setno, setname, and sideno */
1503 			c.c_setno = sp->setno;
1504 			if (! metaislocalset(sp)) {
1505 				if (MD_MNSET_DESC(sd)) {
1506 					c.c_multi_node = 1;
1507 				}
1508 			}
1509 			(void) strcpy(c.c_setname, sp->setname);
1510 			c.c_sideno = sideno;
1511 
1512 			/*
1513 			 * Don't need device id information from this ioctl
1514 			 * Kernel determines device id from dev_t, which
1515 			 * is just what this code would do.
1516 			 */
1517 			c.c_locator.l_devid = (uint64_t)0;
1518 			c.c_locator.l_devid_flags = 0;
1519 
1520 			if (timeval != NULL)
1521 				c.c_timestamp = *timeval;
1522 
1523 			if (setup_med_cfg(sp, &c, (options & MDCHK_SET_FORCE),
1524 			    ep)) {
1525 				rval = -1;
1526 				goto out;
1527 			}
1528 
1529 			if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL) != 0) {
1530 				rval = mdstealerror(ep, &c.c_mde);
1531 				goto out;
1532 			}
1533 			/*
1534 			 * This is either a traditional diskset OR this
1535 			 * is the first replica added to a MN diskset.
1536 			 * In either case, set broadcast to NO_BCAST so
1537 			 * that message won't go through rpc.mdcommd.
1538 			 * If this is a traditional diskset, the bcast
1539 			 * flag is ignored since traditional disksets
1540 			 * don't use the rpc.mdcommd.
1541 			 */
1542 			if (meta_db_addsidenms(sp, np, blkno,
1543 			    DB_ADDSIDENMS_NO_BCAST, ep))
1544 				goto out;
1545 		    }
1546 		}
1547 		if (! metaislocalset(sp)) {
1548 			/* update the dbcnt and size in dd */
1549 			for (p = dd; p != NULL; p = p->dd_next)
1550 				if (p->dd_dnp == np->drivenamep) {
1551 					p->dd_dbcnt = dbcnt;
1552 					p->dd_dbsize  = dbsize;
1553 					break;
1554 				}
1555 		}
1556 
1557 		/*
1558 		 * If this was the first addition of disks to the
1559 		 * diskset you now need to update the mb_setcreatetime
1560 		 * which needed lb_inittime which wasn't there until now.
1561 		 */
1562 		if (firstmddb) {
1563 			if (meta_update_mb(sp, dd, ep) != 0) {
1564 				return (-1);
1565 			}
1566 		}
1567 		(void) close(fd);
1568 	}
1569 
1570 out:
1571 	if (metaislocalset(sp)) {
1572 
1573 		/* everything looks fine. Start mdmonitord */
1574 		/* Note: popen/pclose is the MT-safe replacement for system */
1575 		if (rval == 0 && start_mdmonitord  == 1) {
1576 			if (pclose(popen(MDMONITORD, "w")) == -1)
1577 				md_perror(MDMONITORD);
1578 
1579 			if (meta_smf_enable(META_SMF_CORE, &status) == -1) {
1580 				mde_perror(&status, "");
1581 				mdclrerror(&status);
1582 			}
1583 		}
1584 
1585 		if (buildconf(sp, &status)) {
1586 			/* Don't mask any previous errors */
1587 			if (rval == 0)
1588 				rval = mdstealerror(ep, &status);
1589 			return (rval);
1590 		}
1591 
1592 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
1593 			/* Don't mask any previous errors */
1594 			if (rval == 0)
1595 				rval = mdstealerror(ep, &status);
1596 		}
1597 	} else {
1598 		if (update_dbinfo_on_drives(sp, dd,
1599 		    (options & MDCHK_SET_LOCKED),
1600 		    (options & MDCHK_SET_FORCE),
1601 		    &status)) {
1602 			/* Don't mask any previous errors */
1603 			if (rval == 0)
1604 				rval = mdstealerror(ep, &status);
1605 			else
1606 				mdclrerror(&status);
1607 		}
1608 		metafreedrivedesc(&dd);
1609 	}
1610 	/*
1611 	 * For MN disksets that already had already had nodes joined
1612 	 * before the attach of this mddb(s), the name invalidation is
1613 	 * done by the commd handler routine.  Otherwise, if this
1614 	 * is the first attach of a MN diskset mddb, the invalidation
1615 	 * must be done here since the first attach cannot be sent
1616 	 * via the commd since there are no nodes joined to the set yet.
1617 	 */
1618 	if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) ||
1619 	    (MD_MNSET_DESC(sd) &&
1620 	    (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) {
1621 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
1622 			meta_invalidate_name(nlp->namep);
1623 		}
1624 	}
1625 	return (rval);
1626 }
1627 
1628 /*
1629  * deletelist_length
1630  *
1631  *	return the number of slices that have been specified for deletion
1632  *	on the metadb command line.  This does not calculate the number
1633  *	of replicas because there may be multiple replicas per slice.
1634  */
1635 static int
1636 deletelist_length(mdnamelist_t *db_nlp)
1637 {
1638 
1639 	mdnamelist_t		*nlp;
1640 	int			list_length = 0;
1641 
1642 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1643 		list_length++;
1644 	}
1645 
1646 	return (list_length);
1647 }
1648 
1649 static int
1650 in_deletelist(char *devname, mdnamelist_t *db_nlp)
1651 {
1652 
1653 	mdnamelist_t		*nlp;
1654 	mdname_t		*np;
1655 	int			index = 0;
1656 
1657 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1658 		np = nlp->namep;
1659 
1660 		if (strcmp(devname, np->bname) == 0)
1661 			return (index);
1662 		index++;
1663 	}
1664 
1665 	return (-1);
1666 }
1667 
1668 /*
1669  * Delete replicas from set.  This happens as a result of:
1670  *	- metadb [-s set_name] -d
1671  *	- metaset -s set_name -a disk	(causes a rebalance of mddbs)
1672  *	- metaset -s set_name -d disk
1673  *	- metaset -s set_name -b
1674  *
1675  * For a local set, this routine is run on the local set host.
1676  *
1677  * For a traditional diskset, this routine is run on the node that
1678  * is running the metaset command.
1679  *
1680  * For a multinode diskset, this routine is run by the node that is
1681  * running the metaset command.  This detach routine is sent to all
1682  * of the joined nodes in the diskset using commd.  This keeps
1683  * the nodes in-sync.
1684  */
1685 int
1686 meta_db_detach(
1687 	mdsetname_t		*sp,
1688 	mdnamelist_t		*db_nlp,
1689 	mdforceopts_t		force_option,
1690 	char			*sysfilename,
1691 	md_error_t		*ep
1692 )
1693 {
1694 	struct mddb_config	c;
1695 	mdnamelist_t		*nlp;
1696 	mdname_t		*np;
1697 	md_drive_desc		*dd = NULL;
1698 	md_drive_desc		*p;
1699 	int			replicacount;
1700 	int			replica_delete_count;
1701 	int			nr_replica_slices;
1702 	int			i;
1703 	int			stop_svmdaemons = 0;
1704 	int			rval = 0;
1705 	int			index;
1706 	int			valid_replicas_nottodelete = 0;
1707 	int			invalid_replicas_nottodelete = 0;
1708 	int			invalid_replicas_todelete = 0;
1709 	int			errored = 0;
1710 	int			*tag_array;
1711 	int			fd = -1;
1712 	md_error_t		status = mdnullerror;
1713 	md_set_desc		*sd;
1714 	int			stale_bool = FALSE;
1715 	int			flags;
1716 
1717 	/*
1718 	 * Error if we don't get some work to do.
1719 	 */
1720 	if (db_nlp == NULL)
1721 		return (mdsyserror(ep, EINVAL, NULL));
1722 
1723 	if (mdnamesareunique(db_nlp, ep) != 0)
1724 		return (-1);
1725 
1726 	(void) memset(&c, 0, sizeof (c));
1727 	c.c_id = 0;
1728 	c.c_setno = sp->setno;
1729 
1730 	/* Don't need device id information from this ioctl */
1731 	c.c_locator.l_devid = (uint64_t)0;
1732 	c.c_locator.l_devid_flags = 0;
1733 
1734 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1735 		return (mdstealerror(ep, &c.c_mde));
1736 
1737 	/*
1738 	 * Is current set STALE?
1739 	 */
1740 	if (c.c_flags & MDDB_C_STALE) {
1741 		stale_bool = TRUE;
1742 	}
1743 
1744 	replicacount = c.c_dbcnt;
1745 
1746 	assert(db_nlp != NULL);
1747 
1748 	/*
1749 	 * go through and gather how many data bases are on each
1750 	 * device specified.
1751 	 */
1752 
1753 	nr_replica_slices = deletelist_length(db_nlp);
1754 	tag_array = (int *)calloc(nr_replica_slices, sizeof (int));
1755 
1756 	replica_delete_count = 0;
1757 	for (i = 0; i < replicacount; i++) {
1758 		char	*devname;
1759 		int	found = 0;
1760 
1761 		c.c_id = i;
1762 
1763 		/* Don't need device id information from this ioctl */
1764 		c.c_locator.l_devid = (uint64_t)0;
1765 		c.c_locator.l_devid_flags = 0;
1766 
1767 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1768 			return (mdstealerror(ep, &c.c_mde));
1769 
1770 		devname = splicename(&c.c_devname);
1771 
1772 		if ((index = in_deletelist(devname, db_nlp)) != -1) {
1773 			found = 1;
1774 			tag_array[index] = 1;
1775 			replica_delete_count++;
1776 		}
1777 
1778 		errored = c.c_locator.l_flags & (MDDB_F_EREAD |
1779 				MDDB_F_EWRITE | MDDB_F_TOOSMALL |
1780 				MDDB_F_EFMT | MDDB_F_EDATA |
1781 				MDDB_F_EMASTER);
1782 
1783 		/*
1784 		 * There are four combinations of "errored" and "found"
1785 		 * and they are used to find the number of
1786 		 * (a) valid/invalid replicas that are not in the delete
1787 		 * list and are available in the system.
1788 		 * (b) valid/invalid replicas that are to be deleted.
1789 		 */
1790 
1791 		if (errored && !found)		/* errored and !found */
1792 			invalid_replicas_nottodelete++;
1793 		else if (!found)		/* !errored and !found */
1794 			valid_replicas_nottodelete++;
1795 		else if (errored)		/* errored and found */
1796 			invalid_replicas_todelete++;
1797 		/*
1798 		 * else it is !errored and found. This means
1799 		 * valid_replicas_todelete++; But this variable will not
1800 		 * be used anywhere
1801 		 */
1802 
1803 		Free(devname);
1804 	}
1805 
1806 	index = 0;
1807 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1808 		np = nlp->namep;
1809 		if (tag_array[index++] != 1) {
1810 			Free(tag_array);
1811 			return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname));
1812 		}
1813 	}
1814 
1815 	Free(tag_array);
1816 
1817 
1818 	/* if all replicas are deleted stop mdmonitord */
1819 	if ((replicacount - replica_delete_count) == 0)
1820 		stop_svmdaemons = 1;
1821 
1822 	if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) {
1823 		if (force_option & MDFORCE_NONE)
1824 			return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname));
1825 		if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS))
1826 			return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname));
1827 	}
1828 
1829 	/*
1830 	 * The following algorithms are followed to check for deletion:
1831 	 * (a) If the delete list(db_nlp) has all invalid replicas and no valid
1832 	 * replicas, then deletion should be allowed.
1833 	 * (b) Deletion should be allowed only if valid replicas that are "not"
1834 	 * to be deleted is always greater than the invalid replicas that
1835 	 * are "not" to be deleted.
1836 	 * (c) If the user uses -f option, then deletion should be allowed.
1837 	 */
1838 
1839 	if ((invalid_replicas_todelete != replica_delete_count) &&
1840 		(invalid_replicas_nottodelete > valid_replicas_nottodelete) &&
1841 				(force_option != MDFORCE_LOCAL))
1842 		return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname));
1843 
1844 	/*
1845 	 * go through and tell kernel to delete them
1846 	 */
1847 
1848 	/* Don't need device id information from this ioctl */
1849 	c.c_locator.l_devid = (uint64_t)0;
1850 	c.c_locator.l_devid_flags = 0;
1851 
1852 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1853 		return (mdstealerror(ep, &c.c_mde));
1854 
1855 	if (! metaislocalset(sp)) {
1856 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1857 		if (! mdisok(ep))
1858 			return (-1);
1859 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1860 			return (-1);
1861 	}
1862 
1863 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1864 		np = nlp->namep;
1865 
1866 		/*
1867 		 * If mddb is being deleted from MN diskset and node is
1868 		 * an owner of the diskset then use rpc.mdcommd
1869 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1870 		 * If set is stale, don't log the message since rpc.mdcommd
1871 		 * can't write the message to the mddb.
1872 		 *
1873 		 * When mddbs are first being added to set, a detach can
1874 		 * be called before any node has joined the diskset, so
1875 		 * must check to see if node is an owner of the diskset.
1876 		 *
1877 		 * Otherwise, just delete mddb from this node.
1878 		 */
1879 
1880 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1881 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1882 			md_mn_result_t			*resultp;
1883 			md_mn_msg_meta_db_detach_t	detach;
1884 			int				send_rval;
1885 
1886 			/*
1887 			 * The following algorithm is used to detach replicas.
1888 			 * 	- META_DB_DETACH message generates submsgs
1889 			 * 		- BLOCK parse (master)
1890 			 * 		- MDDB_DETACH replicas
1891 			 * 		- UNBLOCK parse (master) causing parse
1892 			 *		information to be sent from master
1893 			 *		to slaves at a higher class than the
1894 			 *		unblock so the parse message will
1895 			 *		reach slaves before unblock message.
1896 			 */
1897 			(void) splitname(np->bname, &detach.msg_splitname);
1898 
1899 			/* Set devid to NULL until devids are supported */
1900 			detach.msg_devid[0] = NULL;
1901 
1902 			/*
1903 			 * If reconfig cycle has been started, this node is
1904 			 * stuck in in the return step until this command has
1905 			 * completed.  If mdcommd is suspended, ask
1906 			 * send_message to fail (instead of retrying)
1907 			 * so that metaset can finish allowing the reconfig
1908 			 * cycle to proceed.
1909 			 */
1910 			flags = MD_MSGF_FAIL_ON_SUSPEND;
1911 			if (stale_bool == TRUE)
1912 				flags |= MD_MSGF_NO_LOG;
1913 			send_rval = mdmn_send_message(sp->setno,
1914 				MD_MN_MSG_META_DB_DETACH,
1915 				flags, (char *)&detach,
1916 				sizeof (md_mn_msg_meta_db_detach_t),
1917 				&resultp, ep);
1918 			if (send_rval != 0) {
1919 				rval = -1;
1920 				if (resultp == NULL)
1921 					(void) mddserror(ep,
1922 					    MDE_DS_COMMD_SEND_FAIL,
1923 					    sp->setno, NULL, NULL,
1924 					    sp->setname);
1925 				else {
1926 					(void) mdstealerror(ep,
1927 					    &(resultp->mmr_ep));
1928 					if (mdisok(ep)) {
1929 						(void) mddserror(ep,
1930 						    MDE_DS_COMMD_SEND_FAIL,
1931 						    sp->setno, NULL, NULL,
1932 						    sp->setname);
1933 					}
1934 					free_result(resultp);
1935 				}
1936 				goto out;
1937 			}
1938 			if (resultp)
1939 				free_result(resultp);
1940 		} else {
1941 			i = 0;
1942 			while (i < c.c_dbcnt) {
1943 				char	*devname;
1944 
1945 				c.c_id = i;
1946 
1947 				/* Don't need devid info from this ioctl */
1948 				c.c_locator.l_devid = (uint64_t)0;
1949 				c.c_locator.l_devid_flags = 0;
1950 
1951 				if (metaioctl(MD_DB_GETDEV, &c,
1952 				    &c.c_mde, NULL)) {
1953 					rval = mdstealerror(ep, &c.c_mde);
1954 					goto out;
1955 				}
1956 
1957 				devname = splicename(&c.c_devname);
1958 				if (strcmp(devname, np->bname) != 0) {
1959 					Free(devname);
1960 					i++;
1961 					continue;
1962 				}
1963 				Free(devname);
1964 
1965 				/* Don't need devid info from this ioctl */
1966 				c.c_locator.l_devid = (uint64_t)0;
1967 				c.c_locator.l_devid_flags = 0;
1968 
1969 				if (metaioctl(MD_DB_DELDEV, &c,
1970 				    &c.c_mde, NULL) != 0) {
1971 					rval = mdstealerror(ep, &c.c_mde);
1972 					goto out;
1973 				}
1974 
1975 				/* Not incrementing "i" intentionally */
1976 			}
1977 		}
1978 		if (! metaislocalset(sp)) {
1979 			/* update the dbcnt and size in dd */
1980 			for (p = dd; p != NULL; p = p->dd_next) {
1981 				if (p->dd_dnp == np->drivenamep) {
1982 					p->dd_dbcnt = 0;
1983 					p->dd_dbsize  = 0;
1984 					break;
1985 				}
1986 			}
1987 
1988 			/*
1989 			 * Slam a dummy master block and make it self
1990 			 * identifying
1991 			 */
1992 			if ((fd = open(np->rname, O_RDWR)) >= 0) {
1993 				meta_mkdummymaster(sp, fd, 16);
1994 				(void) close(fd);
1995 			}
1996 		}
1997 	}
1998 out:
1999 	if (metaislocalset(sp)) {
2000 		/*
2001 		 * Stop all the daemons if there are
2002 		 * no more replicas so that the module can be
2003 		 * unloaded.
2004 		 */
2005 		if (rval == 0 && stop_svmdaemons == 1) {
2006 			char buf[MAXPATHLEN];
2007 			int i;
2008 
2009 			for (i = 0; i < DAEMON_COUNT; i++) {
2010 				(void) snprintf(buf, MAXPATHLEN,
2011 					"/usr/bin/pkill -%s -x %s",
2012 					svmd_kill_list[i].svmd_kill_val,
2013 					svmd_kill_list[i].svmd_name);
2014 				if (pclose(popen(buf, "w")) == -1)
2015 					md_perror(buf);
2016 			}
2017 
2018 			if (meta_smf_disable(META_SMF_ALL, &status) == -1) {
2019 				mde_perror(&status, "");
2020 				mdclrerror(&status);
2021 			}
2022 		}
2023 		if (buildconf(sp, &status)) {
2024 			/* Don't mask any previous errors */
2025 			if (rval == 0)
2026 				rval = mdstealerror(ep, &status);
2027 			else
2028 				mdclrerror(&status);
2029 			return (rval);
2030 		}
2031 
2032 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
2033 			/* Don't mask any previous errors */
2034 			if (rval == 0)
2035 				rval = mdstealerror(ep, &status);
2036 			else
2037 				mdclrerror(&status);
2038 		}
2039 	} else {
2040 		if (update_dbinfo_on_drives(sp, dd,
2041 		    (force_option & MDFORCE_SET_LOCKED),
2042 		    ((force_option & MDFORCE_LOCAL) |
2043 		    (force_option & MDFORCE_DS)), &status)) {
2044 			/* Don't mask any previous errors */
2045 			if (rval == 0)
2046 				rval = mdstealerror(ep, &status);
2047 			else
2048 				mdclrerror(&status);
2049 		}
2050 		metafreedrivedesc(&dd);
2051 	}
2052 	if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) {
2053 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
2054 			meta_invalidate_name(nlp->namep);
2055 		}
2056 	}
2057 	return (rval);
2058 }
2059 
2060 static md_replica_t *
2061 metareplicaname(
2062 	mdsetname_t		*sp,
2063 	int			flags,
2064 	struct mddb_config	*c,
2065 	md_error_t		*ep
2066 )
2067 {
2068 	md_replica_t	*rp;
2069 	char		*devname;
2070 	size_t		sz;
2071 
2072 	/* allocate replicaname */
2073 	rp = Zalloc(sizeof (*rp));
2074 
2075 	/* get device name */
2076 	devname = splicename(&c->c_devname);
2077 	if (flags & PRINT_FAST) {
2078 		if ((rp->r_namep = metaname_fast(&sp, devname,
2079 		    LOGICAL_DEVICE, ep)) == NULL) {
2080 			Free(devname);
2081 			Free(rp);
2082 			return (NULL);
2083 		}
2084 	} else {
2085 		if ((rp->r_namep = metaname(&sp, devname,
2086 		    LOGICAL_DEVICE, ep)) == NULL) {
2087 			Free(devname);
2088 			Free(rp);
2089 			return (NULL);
2090 		}
2091 	}
2092 	Free(devname);
2093 
2094 	/* make sure it's OK */
2095 	if ((! (flags & MD_BASICNAME_OK)) &&
2096 	    (metachkcomp(rp->r_namep, ep) != 0)) {
2097 		Free(rp);
2098 		return (NULL);
2099 	}
2100 
2101 	rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR;
2102 	rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR;
2103 	rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID;
2104 	if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) {
2105 		sz = devid_sizeof((ddi_devid_t)(uintptr_t)
2106 		    (c->c_locator.l_devid));
2107 		if ((rp->r_devid = (ddi_devid_t)malloc(sz)) ==
2108 		    (ddi_devid_t)NULL) {
2109 			Free(rp);
2110 			return (NULL);
2111 		}
2112 		(void) memcpy((void *)rp->r_devid,
2113 		    (void *)(uintptr_t)c->c_locator.l_devid, sz);
2114 		(void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name);
2115 		rp->r_flags &= ~MDDB_F_NODEVID;
2116 		/* Overwrite dev derived from name with dev from devid */
2117 		rp->r_namep->dev = meta_expldev(c->c_locator.l_dev);
2118 	}
2119 	(void) strcpy(rp->r_driver_name, c->c_locator.l_driver);
2120 
2121 	rp->r_blkno = c->c_locator.l_blkno;
2122 	if (c->c_dbend != 0)
2123 		rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1;
2124 
2125 	/* return replica */
2126 	return (rp);
2127 }
2128 
2129 /*
2130  * free replica list
2131  */
2132 void
2133 metafreereplicalist(
2134 	md_replicalist_t	*rlp
2135 )
2136 {
2137 	md_replicalist_t	*rl = NULL;
2138 
2139 	for (/* void */; (rlp != NULL); rlp = rl) {
2140 		rl = rlp->rl_next;
2141 		if (rlp->rl_repp->r_devid != (ddi_devid_t)0) {
2142 			free(rlp->rl_repp->r_devid);
2143 		}
2144 		Free(rlp->rl_repp);
2145 		Free(rlp);
2146 	}
2147 }
2148 
2149 /*
2150  * return list of all replicas in set
2151  */
2152 int
2153 metareplicalist(
2154 	mdsetname_t		*sp,
2155 	int			flags,
2156 	md_replicalist_t	**rlpp,
2157 	md_error_t		*ep
2158 )
2159 {
2160 	md_replicalist_t	**tail = rlpp;
2161 	int			count = 0;
2162 	struct mddb_config	c;
2163 	int			i;
2164 	char			*devid;
2165 
2166 	/* for each replica */
2167 	i = 0;
2168 	do {
2169 		md_replica_t	*rp;
2170 
2171 		/* get next replica */
2172 		(void) memset(&c, 0, sizeof (c));
2173 		c.c_id = i;
2174 		c.c_setno = sp->setno;
2175 
2176 		c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2177 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2178 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2179 				mdclrerror(&c.c_mde);
2180 				break;	/* handle none at all */
2181 			}
2182 			(void) mdstealerror(ep, &c.c_mde);
2183 			goto out;
2184 		}
2185 
2186 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) {
2187 			if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) {
2188 				(void) mdsyserror(ep, ENOMEM, META_DBCONF);
2189 				goto out;
2190 			}
2191 			c.c_locator.l_devid = (uintptr_t)devid;
2192 			/*
2193 			 * Turn on space and sz flags since 'sz' amount of
2194 			 * space has been alloc'd.
2195 			 */
2196 			c.c_locator.l_devid_flags =
2197 				MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2198 		}
2199 
2200 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2201 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2202 				mdclrerror(&c.c_mde);
2203 				break;	/* handle none at all */
2204 			}
2205 			(void) mdstealerror(ep, &c.c_mde);
2206 			goto out;
2207 		}
2208 
2209 		/*
2210 		 * Paranoid check - shouldn't happen, but is left as
2211 		 * a place holder for changes that will be needed after
2212 		 * dynamic reconfiguration changes are added to SVM (to
2213 		 * support movement of disks at any point in time).
2214 		 */
2215 		if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
2216 			(void) fprintf(stderr,
2217 			    dgettext(TEXT_DOMAIN,
2218 				"Error: Relocation Information "
2219 				"(drvnm=%s, mnum=0x%lx) \n"
2220 				"relocation information size changed - \n"
2221 				"rerun command\n"),
2222 			    c.c_locator.l_driver, c.c_locator.l_mnum);
2223 			(void) mderror(ep, MDE_DEVID_TOOBIG, NULL);
2224 			goto out;
2225 		}
2226 
2227 		if (c.c_dbcnt == 0)
2228 			break;		/* handle none at all */
2229 
2230 		/* get info */
2231 		if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL)
2232 			goto out;
2233 
2234 		/* append to list */
2235 		*tail = Zalloc(sizeof (**tail));
2236 		(*tail)->rl_repp = rp;
2237 		tail = &(*tail)->rl_next;
2238 		++count;
2239 
2240 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2241 			free(devid);
2242 			c.c_locator.l_devid_flags = 0;
2243 		}
2244 
2245 	} while (++i < c.c_dbcnt);
2246 
2247 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2248 		free(devid);
2249 	}
2250 
2251 	/* return count */
2252 	return (count);
2253 
2254 	/* cleanup, return error */
2255 out:
2256 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2257 		free(devid);
2258 	}
2259 	metafreereplicalist(*rlpp);
2260 	*rlpp = NULL;
2261 	return (-1);
2262 }
2263 
2264 /*
2265  * meta_sync_db_locations - get list of replicas from kernel and write
2266  * 	out to mddb.cf and md.conf.  'Syncs up' the replica list in
2267  * 	the kernel with the replica list in the conf files.
2268  *
2269  */
2270 void
2271 meta_sync_db_locations(
2272 	mdsetname_t	*sp,
2273 	md_error_t	*ep
2274 )
2275 {
2276 	char		*sname = 0;		/* system file name */
2277 	char 		*cname = 0;		/* config file name */
2278 
2279 	if (!metaislocalset(sp))
2280 		return;
2281 
2282 	/* Updates backup of configuration file (aka mddb.cf) */
2283 	if (buildconf(sp, ep) != 0)
2284 		return;
2285 
2286 	/* Updates system configuration file (aka md.conf) */
2287 	(void) meta_db_patch(sname, cname, 0, ep);
2288 }
2289 
2290 /*
2291  * setup_db_locations - parse the mddb.cf file and
2292  *			tells the driver which db locations to use.
2293  */
2294 int
2295 meta_setup_db_locations(
2296 	md_error_t	*ep
2297 )
2298 {
2299 	mddb_config_t	c;
2300 	FILE		*fp;
2301 	char		inbuff[1024];
2302 	char		*buff;
2303 	uint_t		i;
2304 	size_t		sz;
2305 	int		rval = 0;
2306 	char		*devidp;
2307 	uint_t		devid_size;
2308 	char		*minor_name = NULL;
2309 	ddi_devid_t	devid_decode;
2310 	int		checksum;
2311 
2312 	/* do mddb.cf file */
2313 	(void) memset(&c, '\0', sizeof (c));
2314 	if ((fp = fopen(META_DBCONF, "r")) == NULL) {
2315 		if (errno != ENOENT)
2316 			return (mdsyserror(ep, errno, META_DBCONF));
2317 	}
2318 	while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1),
2319 	    fp)) != NULL)) {
2320 
2321 		/* ignore comments */
2322 		if (*buff == '#')
2323 			continue;
2324 
2325 		/* parse locator */
2326 		(void) memset(&c, 0, sizeof (c));
2327 		c.c_setno = MD_LOCAL_SET;
2328 		i = strcspn(buff, " \t");
2329 		if (i > sizeof (c.c_locator.l_driver))
2330 			i = sizeof (c.c_locator.l_driver);
2331 		(void) strncpy(c.c_locator.l_driver, buff, i);
2332 		buff += i;
2333 		c.c_locator.l_dev =
2334 		    makedev((major_t)0, (minor_t)strtol(buff, &buff, 10));
2335 		c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10);
2336 		c.c_locator.l_mnum = minor(c.c_locator.l_dev);
2337 
2338 		/* parse out devid */
2339 		while (isspace((int)(*buff)))
2340 			buff += 1;
2341 		i = strcspn(buff, " \t");
2342 		if ((devidp = (char *)malloc(i+1)) == NULL)
2343 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2344 
2345 		(void) strncpy(devidp, buff, i);
2346 		devidp[i] = '\0';
2347 		if (devid_str_decode(devidp, &devid_decode,
2348 		    &minor_name) == -1) {
2349 			free(devidp);
2350 			continue;
2351 		}
2352 
2353 		/* Conf file must have minor name associated with devid */
2354 		if (minor_name == NULL) {
2355 			free(devidp);
2356 			devid_free(devid_decode);
2357 			continue;
2358 		}
2359 
2360 		sz = devid_sizeof(devid_decode);
2361 		/* Copy to devid size buffer that ioctl expects */
2362 		if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) {
2363 			devid_free(devid_decode);
2364 			free(minor_name);
2365 			free(devidp);
2366 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2367 		}
2368 
2369 		(void) memcpy((void *)(uintptr_t)c.c_locator.l_devid,
2370 		    (void *)devid_decode, sz);
2371 
2372 		devid_free(devid_decode);
2373 
2374 		if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) {
2375 			free(minor_name);
2376 			free(devidp);
2377 			free((void *)(uintptr_t)c.c_locator.l_devid);
2378 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2379 		}
2380 		(void) strcpy(c.c_locator.l_minor_name, minor_name);
2381 		free(minor_name);
2382 		c.c_locator.l_devid_flags = MDDB_DEVID_VALID |
2383 			MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2384 		c.c_locator.l_devid_sz = sz;
2385 
2386 		devid_size = strlen(devidp);
2387 		buff += devid_size;
2388 
2389 		checksum = strtol(buff, &buff, 10);
2390 		for (i = 0; c.c_locator.l_driver[i] != 0; i++)
2391 			checksum += c.c_locator.l_driver[i];
2392 		for (i = 0; i < devid_size; i++) {
2393 			checksum += devidp[i];
2394 		}
2395 		free(devidp);
2396 
2397 		checksum += minor(c.c_locator.l_dev);
2398 		checksum += c.c_locator.l_blkno;
2399 		if (checksum != 42) {
2400 			/* overwritten later for more serious problems */
2401 			rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF);
2402 			free((void *)(uintptr_t)c.c_locator.l_devid);
2403 			continue;
2404 		}
2405 		c.c_locator.l_flags = 0;
2406 
2407 		/* use db location */
2408 		if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) {
2409 			free((void *)(uintptr_t)c.c_locator.l_devid);
2410 			return (mdstealerror(ep, &c.c_mde));
2411 		}
2412 
2413 		/* free up devid if in use */
2414 		free((void *)(uintptr_t)c.c_locator.l_devid);
2415 		c.c_locator.l_devid = (uint64_t)0;
2416 		c.c_locator.l_devid_flags = 0;
2417 	}
2418 	if ((fp) && (fclose(fp) != 0))
2419 		return (mdsyserror(ep, errno, META_DBCONF));
2420 
2421 	/* check for stale database */
2422 	(void) memset((char *)&c, 0, sizeof (struct mddb_config));
2423 	c.c_id = 0;
2424 	c.c_setno = MD_LOCAL_SET;
2425 
2426 	/* Don't need device id information from this ioctl */
2427 	c.c_locator.l_devid = (uint64_t)0;
2428 	c.c_locator.l_devid_flags = 0;
2429 
2430 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2431 		if (! mdismddberror(&c.c_mde, MDE_DB_INVALID))
2432 			return (mdstealerror(ep, &c.c_mde));
2433 		mdclrerror(&c.c_mde);
2434 	}
2435 
2436 	if (c.c_flags & MDDB_C_STALE)
2437 		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET,
2438 		    0, NULL));
2439 
2440 	/* success */
2441 	return (rval);
2442 }
2443 
2444 /*
2445  * meta_db_minreplica - returns the minimum size replica currently in use.
2446  */
2447 daddr_t
2448 meta_db_minreplica(
2449 	mdsetname_t	*sp,
2450 	md_error_t	*ep
2451 )
2452 {
2453 	md_replica_t		*r;
2454 	md_replicalist_t	*rl, *rlp = NULL;
2455 	daddr_t			nblks = 0;
2456 
2457 	if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
2458 		return (-1);
2459 
2460 	if (rlp == NULL)
2461 		return (-1);
2462 
2463 	/* find the smallest existing replica */
2464 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2465 		r = rl->rl_repp;
2466 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2467 	}
2468 
2469 	metafreereplicalist(rlp);
2470 	return (nblks);
2471 }
2472 
2473 /*
2474  * meta_get_replica_names
2475  *  returns an mdnamelist_t of replica slices
2476  */
2477 /*ARGSUSED*/
2478 int
2479 meta_get_replica_names(
2480 	mdsetname_t	*sp,
2481 	mdnamelist_t	**nlpp,
2482 	int		options,
2483 	md_error_t	*ep
2484 )
2485 {
2486 	md_replicalist_t	*rlp = NULL;
2487 	md_replicalist_t	*rl;
2488 	mdnamelist_t		**tailpp = nlpp;
2489 	int			cnt = 0;
2490 
2491 	assert(nlpp != NULL);
2492 
2493 	if (!metaislocalset(sp))
2494 		goto out;
2495 
2496 	/* get replicas */
2497 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
2498 		cnt = -1;
2499 		goto out;
2500 	}
2501 
2502 	/* build name list */
2503 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
2504 		/*
2505 		 * Add the name struct to the end of the
2506 		 * namelist but keep a pointer to the last
2507 		 * element so that we don't incur the overhead
2508 		 * of traversing the list each time
2509 		 */
2510 		tailpp = meta_namelist_append_wrapper(
2511 			tailpp, rl->rl_repp->r_namep);
2512 		++cnt;
2513 	}
2514 
2515 	/* cleanup, return count or error */
2516 out:
2517 	metafreereplicalist(rlp);
2518 	return (cnt);
2519 }
2520