xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_db.c (revision c54c769d4c1cde75dd28975fb0090a8f944651a6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * Just in case we're not in a build environment, make sure that
30  * TEXT_DOMAIN gets set to something.
31  */
32 #if !defined(TEXT_DOMAIN)
33 #define	TEXT_DOMAIN "SYS_TEST"
34 #endif
35 
36 /*
37  * Metadevice database interfaces.
38  */
39 
40 #define	MDDB
41 
42 #include <meta.h>
43 #include <sys/lvm/md_mddb.h>
44 #include <sys/lvm/md_crc.h>
45 #include <sys/lvm/mdio.h>
46 #include <string.h>
47 #include <strings.h>
48 #include <ctype.h>
49 
/*
 * One entry of the daemon kill list: the daemon's name together with the
 * value stored for killing it (presumably a signal name such as "HUP" --
 * the consumer of this table is not in this part of the file).
 */
struct svm_daemon {
	char	*svmd_name;
	char	*svmd_kill_val;
};

/*
 * Daemons that the SVM smf(5) services do not stop.  The mdmonitord is
 * started via svc:/system/mdmonitor:default, but no contract(4) is
 * constructed for it and so smf(5) does not stop it.
 */
struct svm_daemon svmd_kill_list[] = {
	{ "mdmonitord",	"HUP" },
	{ "mddoors",	"KILL" }
};

/* Number of entries in svmd_kill_list[]. */
#define	DAEMON_COUNT	\
	(sizeof (svmd_kill_list) / sizeof (svmd_kill_list[0]))
66 
67 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep);
68 
69 /*
70  * meta_get_lb_inittime sends a request for the lb_inittime to the kernel
71  */
72 md_timeval32_t
73 meta_get_lb_inittime(
74 	mdsetname_t	*sp,
75 	md_error_t	*ep
76 )
77 {
78 	mddb_config_t	c;
79 
80 	(void) memset(&c, 0, sizeof (c));
81 
82 	/* Fill in setno, setname, and sideno */
83 	c.c_setno = sp->setno;
84 
85 	if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) {
86 		(void) mdstealerror(ep, &c.c_mde);
87 	}
88 
89 	return (c.c_timestamp);
90 }
91 
92 /*
93  * mkmasterblks writes out the master blocks of the mddb to the replica.
94  *
95  * In a MN diskset, this is called by the node that is adding this replica
96  * to the diskset.
97  */
98 
99 #define	MDDB_VERIFY_SIZE	8192
100 
101 static int
102 mkmasterblks(
103 	mdsetname_t	*sp,
104 	mdname_t	*np,
105 	int		fd,
106 	daddr_t		firstblk,
107 	int		dbsize,
108 	md_timeval32_t	inittime,
109 	md_error_t	*ep
110 )
111 {
112 	int		consecutive;
113 	md_timeval32_t	tp;
114 	struct mddb_mb	*mb;
115 	char		*buffer;
116 	int		iosize;
117 	md_set_desc	*sd;
118 	int		mn_set = 0;
119 	daddr_t		startblk;
120 	int		cnt;
121 	ddi_devid_t	devid;
122 
123 	if (! metaislocalset(sp)) {
124 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
125 			return (-1);
126 
127 		if (MD_MNSET_DESC(sd)) {
128 			mn_set = 1;		/* Used later */
129 		}
130 	}
131 
132 	/*
133 	 * Loop to verify the entire mddb region on disk is read/writable.
134 	 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block
135 	 * chunks.
136 	 *
137 	 * A side-effect of this loop is to zero out the entire mddb region
138 	 */
139 	if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL)
140 		return (mdsyserror(ep, ENOMEM, np->rname));
141 
142 	startblk = firstblk;
143 	for (cnt = dbsize; cnt > 0; cnt -= consecutive) {
144 
145 		if (cnt > MDDB_VERIFY_SIZE)
146 			consecutive = MDDB_VERIFY_SIZE;
147 		else
148 			consecutive = cnt;
149 
150 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
151 			Free(buffer);
152 			return (mdsyserror(ep, errno, np->rname));
153 		}
154 
155 		iosize = DEV_BSIZE * consecutive;
156 		if (write(fd, buffer, iosize) != iosize) {
157 			Free(buffer);
158 			return (mdsyserror(ep, errno, np->rname));
159 		}
160 
161 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
162 			Free(buffer);
163 			return (mdsyserror(ep, errno, np->rname));
164 		}
165 
166 		if (read(fd, buffer, iosize) != iosize) {
167 			Free(buffer);
168 			return (mdsyserror(ep, errno, np->rname));
169 		}
170 
171 		startblk += consecutive;
172 	}
173 
174 	Free(buffer);
175 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
176 		return (mdsyserror(ep, ENOMEM, np->rname));
177 
178 	if (meta_gettimeofday(&tp) == -1) {
179 		Free(mb);
180 		return (mdsyserror(ep, errno, np->rname));
181 	}
182 
183 	mb->mb_magic = MDDB_MAGIC_MB;
184 	/*
185 	 * If a MN diskset, set master block revision for a MN set.
186 	 * Even though the master block structure is no different
187 	 * for a MN set, setting the revision field to a different
188 	 * number keeps any pre-MN_diskset code from accessing
189 	 * this diskset.  It also allows for an early determination
190 	 * of a MN diskset when reading in from disk so that the
191 	 * proper size locator block and locator names structure
192 	 * can be read in thus saving time on diskset startup.
193 	 */
194 	if (mn_set)
195 		mb->mb_revision = MDDB_REV_MNMB;
196 	else
197 		mb->mb_revision = MDDB_REV_MB;
198 	mb->mb_timestamp = tp;
199 	mb->mb_setno = sp->setno;
200 	mb->mb_blkcnt = dbsize - 1;
201 	mb->mb_blkno = firstblk;
202 	mb->mb_nextblk = 0;
203 
204 	mb->mb_blkmap.m_firstblk = firstblk + 1;
205 	mb->mb_blkmap.m_consecutive = dbsize - 1;
206 	if (! metaislocalset(sp)) {
207 		mb->mb_setcreatetime = inittime;
208 	}
209 
210 	/*
211 	 * We try to save the disks device ID into the remaining bytes in
212 	 * the master block. The saved devid is used to provide a mapping
213 	 * between this disk's devid and the devid stored into the master
214 	 * block. This allows the disk image to be self-identifying
215 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
216 	 * when we try to import these disks on the remote copied image.
217 	 * If we cannot save the disks device ID onto the master block that is
218 	 * ok.  The disk is just not self-identifying and won't be importable
219 	 * in the remote copy scenario.
220 	 */
221 	if (devid_get(fd, &devid) == 0) {
222 		size_t len;
223 
224 		len = devid_sizeof(devid);
225 		if (len <= DEV_BSIZE - sizeof (*mb)) {
226 			/* there is enough space to store the devid */
227 			mb->mb_devid_magic = MDDB_MAGIC_DE;
228 			mb->mb_devid_len = len;
229 			(void) memcpy(mb->mb_devid, devid, len);
230 		}
231 		devid_free(devid);
232 	}
233 
234 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
235 	    (crc_skip_t *)NULL);
236 
237 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
238 		Free(mb);
239 		return (mdsyserror(ep, errno, np->rname));
240 	}
241 
242 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
243 		Free(mb);
244 		return (mdsyserror(ep, errno, np->rname));
245 	}
246 
247 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
248 		Free(mb);
249 		return (mdsyserror(ep, errno, np->rname));
250 	}
251 
252 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
253 		Free(mb);
254 		return (mdsyserror(ep, errno, np->rname));
255 	}
256 
257 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
258 		(uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) {
259 		Free(mb);
260 		return (mdmddberror(ep, MDE_NOTVERIFIED,
261 			meta_getminor(np->dev), sp->setno, 0, np->rname));
262 	}
263 
264 	Free(mb);
265 	return (0);
266 }
267 
268 void
269 meta_mkdummymaster(
270 	mdsetname_t	*sp,
271 	int		fd,
272 	daddr_t		firstblk
273 )
274 {
275 	md_timeval32_t	tp;
276 	struct mddb_mb	*mb;
277 	ddi_devid_t	devid;
278 	md_set_desc	*sd;
279 	md_error_t	ep = mdnullerror;
280 	md_timeval32_t	inittime;
281 
282 	/*
283 	 * No dummy master blocks are written for a MN diskset since devids
284 	 * are not supported in MN disksets.
285 	 */
286 	if (! metaislocalset(sp)) {
287 		if ((sd = metaget_setdesc(sp, &ep)) == NULL)
288 			return;
289 
290 		if (MD_MNSET_DESC(sd))
291 			return;
292 	}
293 
294 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
295 		return;
296 
297 	mb->mb_magic = MDDB_MAGIC_DU;
298 	mb->mb_revision = MDDB_REV_MB;
299 	mb->mb_setno = sp->setno;
300 	inittime = meta_get_lb_inittime(sp, &ep);
301 	mb->mb_setcreatetime = inittime;
302 
303 	if (meta_gettimeofday(&tp) != -1)
304 		mb->mb_timestamp = tp;
305 
306 	/*
307 	 * We try to save the disks device ID into the remaining bytes in
308 	 * the master block.  This allows the disk image to be self-identifying
309 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
310 	 * when we try to import these disks on the remote copied image.
311 	 * If we cannot save the disks device ID onto the master block that is
312 	 * ok.  The disk is just not self-identifying and won't be importable
313 	 * in the remote copy scenario.
314 	 */
315 	if (devid_get(fd, &devid) == 0) {
316 		int len;
317 
318 		len = devid_sizeof(devid);
319 		if (len <= DEV_BSIZE - sizeof (*mb)) {
320 			/* there is enough space to store the devid */
321 			mb->mb_devid_magic = MDDB_MAGIC_DE;
322 			mb->mb_devid_len = len;
323 			(void) memcpy(mb->mb_devid, (char *)devid, len);
324 		}
325 		devid_free(devid);
326 	}
327 
328 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
329 	    (crc_skip_t *)NULL);
330 
331 	/*
332 	 * If any of these operations fail, we need to inform the
333 	 * user that the disk won't be self identifying. When support
334 	 * for importing remotely replicated disksets is added, we
335 	 * want to add the error messages here.
336 	 */
337 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
338 		goto out;
339 
340 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE)
341 		goto out;
342 
343 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
344 		goto out;
345 
346 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE)
347 		goto out;
348 
349 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
350 	    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
351 		goto out;
352 
353 out:
354 	Free(mb);
355 }
356 
357 static int
358 buildconf(mdsetname_t *sp, md_error_t *ep)
359 {
360 	md_replicalist_t	*rlp = NULL;
361 	md_replicalist_t	*rl;
362 	FILE			*cfp = NULL;
363 	FILE			*mfp = NULL;
364 	struct stat		sbuf;
365 	int			rval = 0;
366 	int			in_miniroot = 0;
367 	char			line[MDDB_BOOTLIST_MAX_LEN];
368 	char			*tname = NULL;
369 
370 	/* get list of local replicas */
371 	if (! metaislocalset(sp))
372 		return (0);
373 
374 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
375 		return (-1);
376 
377 	/* open tempfile, copy permissions of original file */
378 	if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) {
379 		/*
380 		 * On the miniroot tmp files must be created in /var/tmp.
381 		 * If we get a EROFS error, we assume that we are in the
382 		 * miniroot.
383 		 */
384 		if (errno != EROFS)
385 			goto error;
386 		in_miniroot = 1;
387 		errno = 0;
388 		tname = tempnam("/var/tmp", "slvm_");
389 		if (tname == NULL && errno == EROFS) {
390 			/*
391 			 * If we are booted on a read-only root because
392 			 * of mddb quorum problems we don't want to emit
393 			 * any scary error messages.
394 			 */
395 			errno = 0;
396 			goto out;
397 		}
398 
399 		/* open tempfile, copy permissions of original file */
400 		if ((cfp = fopen(tname, "w+")) == NULL)
401 			goto error;
402 	}
403 	if (stat(META_DBCONF, &sbuf) == 0) {
404 		if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0)
405 			goto error;
406 		if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0)
407 			goto error;
408 	}
409 
410 	/* print header */
411 	if (fprintf(cfp, "#metadevice database location file ") == EOF)
412 		goto error;
413 	if (fprintf(cfp, "do not hand edit\n") < 0)
414 		goto error;
415 	if (fprintf(cfp,
416 		"#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0)
417 		goto error;
418 
419 	/* dump replicas */
420 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
421 		md_replica_t	*r = rl->rl_repp;
422 		int		checksum = 42;
423 		int		i;
424 		char		*devidp;
425 		minor_t		min;
426 
427 		devidp = devid_str_encode(r->r_devid, r->r_minor_name);
428 		/* If devid code can't encode devidp - skip entry */
429 		if (devidp == NULL) {
430 			continue;
431 		}
432 
433 		/* compute checksum */
434 		for (i = 0; ((r->r_driver_name[i] != '\0') &&
435 		    (i < sizeof (r->r_driver_name))); i++) {
436 			checksum -= r->r_driver_name[i];
437 		}
438 		min = meta_getminor(r->r_namep->dev);
439 		checksum -= min;
440 		checksum -= r->r_blkno;
441 
442 		for (i = 0; i < strlen(devidp); i++) {
443 			checksum -= devidp[i];
444 		}
445 		/* print info */
446 		if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n",
447 		    r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) {
448 			goto error;
449 		}
450 
451 		devid_str_free(devidp);
452 	}
453 
454 	/* close and rename to real file */
455 	if (fflush(cfp) != 0)
456 		goto error;
457 	if (fsync(fileno(cfp)) != 0)
458 		goto error;
459 	if (fclose(cfp) != 0) {
460 		cfp = NULL;
461 		goto error;
462 	}
463 	cfp = NULL;
464 
465 	/*
466 	 * Renames don't work in the miniroot since tmpfiles are
467 	 * created in /var/tmp. Hence we copy the data out.
468 	 */
469 
470 	if (! in_miniroot) {
471 		if (rename(META_DBCONFTMP, META_DBCONF) != 0)
472 			goto error;
473 	} else {
474 		if ((cfp = fopen(tname, "r")) == NULL)
475 			goto error;
476 		if ((mfp = fopen(META_DBCONF, "w+")) == NULL)
477 			goto error;
478 		while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) {
479 			if (fputs(line, mfp) == NULL)
480 				goto error;
481 		}
482 		(void) fclose(cfp);
483 		cfp = NULL;
484 		if (fflush(mfp) != 0)
485 			goto error;
486 		if (fsync(fileno(mfp)) != 0)
487 			goto error;
488 		if (fclose(mfp) != 0) {
489 			mfp = NULL;
490 			goto error;
491 		}
492 		/* delete the tempfile */
493 		(void) unlink(tname);
494 	}
495 	/* success */
496 	rval = 0;
497 	goto out;
498 
499 	/* tempfile error */
500 error:
501 	rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
502 				mdsyserror(ep, errno, META_DBCONFTMP);
503 
504 
505 	/* cleanup, return success */
506 out:
507 	if (rlp != NULL)
508 		metafreereplicalist(rlp);
509 	if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) {
510 		rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
511 					mdsyserror(ep, errno, META_DBCONFTMP);
512 	}
513 	free(tname);
514 	return (rval);
515 }
516 
517 /*
518  * check replica for dev
519  */
520 static int
521 in_replica(
522 	mdsetname_t	*sp,
523 	md_replica_t	*rp,
524 	mdname_t	*np,
525 	diskaddr_t	slblk,
526 	diskaddr_t	nblks,
527 	md_error_t	*ep
528 )
529 {
530 	mdname_t	*repnp = rp->r_namep;
531 	diskaddr_t	rep_sblk = rp->r_blkno;
532 	diskaddr_t	rep_nblks = rp->r_nblk;
533 
534 	/* should be in the same set */
535 	assert(sp != NULL);
536 
537 	/* if error in master block, assume whole partition */
538 	if ((rep_sblk == MD_DISKADDR_ERROR) ||
539 	    (rep_nblks == MD_DISKADDR_ERROR)) {
540 		rep_sblk = 0;
541 		rep_nblks = MD_DISKADDR_ERROR;
542 	}
543 
544 	/* check overlap */
545 	if (meta_check_overlap(
546 	    MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) {
547 		return (-1);
548 	}
549 
550 	/* return success */
551 	return (0);
552 }
553 
554 /*
555  * check to see if we're in a replica
556  */
557 int
558 meta_check_inreplica(
559 	mdsetname_t		*sp,
560 	mdname_t		*np,
561 	diskaddr_t		slblk,
562 	diskaddr_t		nblks,
563 	md_error_t		*ep
564 )
565 {
566 	md_replicalist_t	*rlp = NULL;
567 	md_replicalist_t	*rl;
568 	int			rval = 0;
569 
570 	/* should have a set */
571 	assert(sp != NULL);
572 
573 	/* for each replica */
574 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
575 		return (-1);
576 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
577 		md_replica_t	*rp = rl->rl_repp;
578 
579 		/* check replica */
580 		if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) {
581 			rval = -1;
582 			break;
583 		}
584 	}
585 
586 	/* cleanup, return success */
587 	metafreereplicalist(rlp);
588 	return (rval);
589 }
590 
591 /*
592  * check replica
593  */
594 int
595 meta_check_replica(
596 	mdsetname_t	*sp,		/* set to check against */
597 	mdname_t	*np,		/* component to check against */
598 	mdchkopts_t	options,	/* option flags */
599 	diskaddr_t	slblk,		/* start logical block */
600 	diskaddr_t	nblks,		/* number of blocks (-1,rest of them) */
601 	md_error_t	*ep		/* error packet */
602 )
603 {
604 	mdchkopts_t	chkoptions = MDCHK_ALLOW_REPSLICE;
605 
606 	/* make sure we have a disk */
607 	if (metachkcomp(np, ep) != 0)
608 		return (-1);
609 
610 	/* check to ensure that it is not already in use */
611 	if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) {
612 		return (-1);
613 	}
614 
615 	if (options & MDCHK_ALLOW_NODBS)
616 		return (0);
617 
618 	if (options & MDCHK_DRVINSET)
619 		return (0);
620 
621 	/* make sure it is in the set */
622 	if (meta_check_inset(sp, np, ep) != 0)
623 		return (-1);
624 
625 	/* make sure its not in a metadevice */
626 	if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0)
627 		return (-1);
628 
629 	/* return success */
630 	return (0);
631 }
632 
633 static int
634 update_dbinfo_on_drives(
635 	mdsetname_t	*sp,
636 	md_drive_desc	*dd,
637 	int		set_locked,
638 	int		force,
639 	md_error_t	*ep
640 )
641 {
642 	md_set_desc		*sd;
643 	int			i;
644 	md_setkey_t		*cl_sk;
645 	int			rval = 0;
646 	md_mnnode_desc		*nd;
647 
648 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
649 		return (-1);
650 
651 	if (! set_locked) {
652 		if (MD_MNSET_DESC(sd)) {
653 			md_error_t xep = mdnullerror;
654 			sigset_t sigs;
655 			/* Make sure we are blocking all signals */
656 			if (procsigs(TRUE, &sigs, &xep) < 0)
657 				mdclrerror(&xep);
658 
659 			nd = sd->sd_nodelist;
660 			while (nd) {
661 				if (force && strcmp(nd->nd_nodename,
662 				    mynode()) != 0) {
663 					nd = nd->nd_next;
664 					continue;
665 				}
666 
667 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
668 					nd = nd->nd_next;
669 					continue;
670 				}
671 
672 				if (clnt_lock_set(nd->nd_nodename, sp, ep))
673 					return (-1);
674 				nd = nd->nd_next;
675 			}
676 		} else {
677 			for (i = 0; i < MD_MAXSIDES; i++) {
678 				/* Skip empty slots */
679 				if (sd->sd_nodes[i][0] == '\0')
680 					continue;
681 
682 				if (force && strcmp(sd->sd_nodes[i],
683 				    mynode()) != 0)
684 					continue;
685 
686 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep))
687 					return (-1);
688 			}
689 		}
690 	}
691 
692 	if (MD_MNSET_DESC(sd)) {
693 		nd = sd->sd_nodelist;
694 		while (nd) {
695 			if (force && strcmp(nd->nd_nodename, mynode()) != 0) {
696 				nd = nd->nd_next;
697 				continue;
698 			}
699 
700 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
701 				nd = nd->nd_next;
702 				continue;
703 			}
704 
705 			if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep)
706 			    == -1) {
707 				rval = -1;
708 				break;
709 			}
710 			nd = nd->nd_next;
711 		}
712 	} else {
713 		for (i = 0; i < MD_MAXSIDES; i++) {
714 			/* Skip empty slots */
715 			if (sd->sd_nodes[i][0] == '\0')
716 				continue;
717 
718 			if (force && strcmp(sd->sd_nodes[i], mynode()) != 0)
719 				continue;
720 
721 			if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep)
722 			    == -1) {
723 				rval = -1;
724 				break;
725 			}
726 		}
727 	}
728 
729 	if (! set_locked) {
730 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
731 		if (MD_MNSET_DESC(sd)) {
732 			nd = sd->sd_nodelist;
733 			while (nd) {
734 				if (force &&
735 				    strcmp(nd->nd_nodename, mynode()) != 0) {
736 					nd = nd->nd_next;
737 					continue;
738 				}
739 
740 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
741 					nd = nd->nd_next;
742 					continue;
743 				}
744 
745 				if (clnt_unlock_set(nd->nd_nodename, cl_sk,
746 				    ep)) {
747 					rval = -1;
748 					break;
749 				}
750 				nd = nd->nd_next;
751 			}
752 		} else {
753 			for (i = 0; i < MD_MAXSIDES; i++) {
754 				/* Skip empty slots */
755 				if (sd->sd_nodes[i][0] == '\0')
756 					continue;
757 
758 				if (force &&
759 				    strcmp(sd->sd_nodes[i], mynode()) != 0)
760 					continue;
761 
762 				if (clnt_unlock_set(sd->sd_nodes[i], cl_sk,
763 				    ep)) {
764 					rval = -1;
765 					break;
766 				}
767 			}
768 
769 		}
770 		cl_set_setkey(NULL);
771 	}
772 
773 	return (rval);
774 }
775 
776 int
777 meta_db_addsidenms(
778 	mdsetname_t	*sp,
779 	mdname_t	*np,
780 	daddr_t		blkno,
781 	int		bcast,
782 	md_error_t	*ep
783 )
784 {
785 	side_t		sideno;
786 	char		*bname = NULL;
787 	char		*dname = NULL;
788 	minor_t		mnum;
789 	mddb_config_t	c;
790 	int		done;
791 	int		rval = 0;
792 	md_set_desc	*sd;
793 
794 	sideno = MD_SIDEWILD;
795 	/*CONSTCOND*/
796 	while (1) {
797 		if (bname != NULL) {
798 			Free(bname);
799 			bname = NULL;
800 		}
801 		if (dname != NULL) {
802 			Free(dname);
803 			dname = NULL;
804 		}
805 		if ((done = meta_getnextside_devinfo(sp, np->bname,
806 		    &sideno, &bname, &dname, &mnum, ep)) == -1) {
807 			rval = -1;
808 			break;
809 		}
810 
811 		if (done == 0)
812 			break;
813 
814 		if (! metaislocalset(sp)) {
815 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
816 				rval = -1;
817 				break;
818 			}
819 		}
820 
821 		/*
822 		 * Send addsidenms to all nodes using rpc.mdcommd if
823 		 * sidename is being added to MN diskset.
824 		 *
825 		 *   It's ok to broadcast this call to other nodes.
826 		 *
827 		 *   Note: The broadcast to other nodes isn't needed during
828 		 *   the addition of the first mddbs to the set since the
829 		 *   other nodes haven't been joined to the set yet.  All
830 		 *   nodes in a MN diskset are (implicitly) joined to the set
831 		 *   on the addition of the first mddb.
832 		 */
833 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
834 		    (bcast == DB_ADDSIDENMS_BCAST)) {
835 			md_mn_result_t			*resultp = NULL;
836 			md_mn_msg_meta_db_newside_t	db_ns;
837 			int				send_rval;
838 
839 			db_ns.msg_l_dev = np->dev;
840 			db_ns.msg_sideno = sideno;
841 			db_ns.msg_blkno = blkno;
842 			(void) strncpy(db_ns.msg_dname, dname,
843 			    sizeof (db_ns.msg_dname));
844 			(void) splitname(np->bname, &db_ns.msg_splitname);
845 			db_ns.msg_mnum = mnum;
846 
847 			/* Set devid to NULL until devids are supported */
848 			db_ns.msg_devid[0] = NULL;
849 
850 			/*
851 			 * If reconfig cycle has been started, this node is
852 			 * stuck in in the return step until this command has
853 			 * completed.  If mdcommd is suspended, ask
854 			 * send_message to fail (instead of retrying)
855 			 * so that metaset can finish allowing the reconfig
856 			 * cycle to proceed.
857 			 */
858 			send_rval = mdmn_send_message(sp->setno,
859 			    MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
860 			    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns,
861 			    sizeof (md_mn_msg_meta_db_newside_t),
862 			    &resultp, ep);
863 			if (send_rval != 0) {
864 				rval = -1;
865 				if (resultp == NULL)
866 					(void) mddserror(ep,
867 					    MDE_DS_COMMD_SEND_FAIL,
868 					    sp->setno, NULL, NULL,
869 					    sp->setname);
870 				else {
871 					(void) mdstealerror(ep,
872 					    &(resultp->mmr_ep));
873 					if (mdisok(ep)) {
874 						(void) mddserror(ep,
875 						    MDE_DS_COMMD_SEND_FAIL,
876 						    sp->setno, NULL, NULL,
877 						    sp->setname);
878 					}
879 					free_result(resultp);
880 				}
881 				break;
882 			}
883 			if (resultp)
884 				free_result(resultp);
885 		} else {
886 			/*
887 			 * Let this side's  device name, minor # and driver name
888 			 * be known to the database replica.
889 			 */
890 			(void) memset(&c, 0, sizeof (c));
891 
892 			/* Fill in device/replica info */
893 			c.c_locator.l_dev = meta_cmpldev(np->dev);
894 			c.c_locator.l_blkno = blkno;
895 			(void) strncpy(c.c_locator.l_driver, dname,
896 			    sizeof (c.c_locator.l_driver));
897 			(void) splitname(bname, &c.c_devname);
898 			c.c_locator.l_mnum = mnum;
899 
900 			/* Fill in setno, setname, and sideno */
901 			c.c_setno = sp->setno;
902 			(void) strncpy(c.c_setname, sp->setname,
903 				sizeof (c.c_setname));
904 			c.c_sideno = sideno;
905 
906 			/*
907 			 * Don't need device id information from this ioctl
908 			 * Kernel determines device id from dev_t, which
909 			 * is just what this code would do.
910 			 */
911 			c.c_locator.l_devid = (uint64_t)0;
912 			c.c_locator.l_devid_flags = 0;
913 
914 			if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) {
915 				rval = mdstealerror(ep, &c.c_mde);
916 				break;
917 			}
918 		}
919 	}
920 
921 	/* cleanup, return success */
922 	if (bname != NULL) {
923 		Free(bname);
924 		bname = NULL;
925 	}
926 	if (dname != NULL) {
927 		Free(dname);
928 		dname = NULL;
929 	}
930 	return (rval);
931 }
932 
933 
934 int
935 meta_db_delsidenm(
936 	mdsetname_t	*sp,
937 	side_t		sideno,
938 	mdname_t	*np,
939 	daddr_t		blkno,
940 	md_error_t	*ep
941 )
942 {
943 	mddb_config_t	c;
944 	md_set_desc	*sd;
945 
946 	if (! metaislocalset(sp)) {
947 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
948 			return (-1);
949 	}
950 	/* Use rpc.mdcommd to delete mddb side from all nodes */
951 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
952 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
953 		md_mn_result_t			*resultp = NULL;
954 		md_mn_msg_meta_db_delside_t	db_ds;
955 		int				send_rval;
956 
957 		db_ds.msg_l_dev = np->dev;
958 		db_ds.msg_blkno = blkno;
959 		db_ds.msg_sideno = sideno;
960 
961 		/* Set devid to NULL until devids are supported */
962 		db_ds.msg_devid[0] = NULL;
963 
964 		/*
965 		 * If reconfig cycle has been started, this node is
966 		 * stuck in in the return step until this command has
967 		 * completed.  If mdcommd is suspended, ask
968 		 * send_message to fail (instead of retrying)
969 		 * so that metaset can finish allowing the reconfig
970 		 * cycle to proceed.
971 		 */
972 		send_rval = mdmn_send_message(sp->setno,
973 		    MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
974 		    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds,
975 		    sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
976 		if (send_rval != 0) {
977 			if (resultp == NULL)
978 				(void) mddserror(ep,
979 				    MDE_DS_COMMD_SEND_FAIL,
980 				    sp->setno, NULL, NULL,
981 				    sp->setname);
982 			else {
983 				(void) mdstealerror(ep, &(resultp->mmr_ep));
984 				if (mdisok(ep)) {
985 					(void) mddserror(ep,
986 					    MDE_DS_COMMD_SEND_FAIL,
987 					    sp->setno, NULL, NULL,
988 					    sp->setname);
989 				}
990 				free_result(resultp);
991 			}
992 			return (-1);
993 		}
994 		if (resultp)
995 			free_result(resultp);
996 
997 	} else {
998 		/*
999 		 * Let this side's  device name, minor # and driver name
1000 		 * be known to the database replica.
1001 		 */
1002 		(void) memset(&c, 0, sizeof (c));
1003 
1004 		/* Fill in device/replica info */
1005 		c.c_locator.l_dev = meta_cmpldev(np->dev);
1006 		c.c_locator.l_blkno = blkno;
1007 
1008 		/* Fill in setno, setname, and sideno */
1009 		c.c_setno = sp->setno;
1010 		(void) strcpy(c.c_setname, sp->setname);
1011 		c.c_sideno = sideno;
1012 
1013 		/*
1014 		 * Don't need device id information from this ioctl
1015 		 * Kernel determines device id from dev_t, which
1016 		 * is just what this code would do.
1017 		 */
1018 		c.c_locator.l_devid = (uint64_t)0;
1019 		c.c_locator.l_devid_flags = 0;
1020 
1021 		if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0)
1022 			return (mdstealerror(ep, &c.c_mde));
1023 	}
1024 	return (0);
1025 }
1026 
1027 
1028 static int
1029 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep)
1030 {
1031 	mdnamelist_t		*dnp1, *dnp2;
1032 
1033 	for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) {
1034 		for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) {
1035 			if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0)
1036 				return (mderror(ep, MDE_DUPDRIVE,
1037 				    dnp1->namep->cname));
1038 		}
1039 	}
1040 	return (0);
1041 }
1042 
1043 
/*
 * Return 1 if files are different, else return 0.
 *
 * The files are treated as different whenever they cannot be compared:
 * stat(), open() or malloc() failing, or a read() that does not deliver
 * the whole file in one call.  Two empty files compare equal.
 */
static int
filediff(char *tsname, char *sname)
{
	int		ret = 1;
	int		fd;
	size_t		tsz, sz;
	size_t		bufsz;
	ssize_t		nread;
	struct stat	sbuf;
	char		*tbuf = NULL;
	char		*buf = NULL;

	/* files of different size cannot have the same content */
	if (stat(tsname, &sbuf) != 0)
		return (1);
	tsz = sbuf.st_size;
	if (stat(sname, &sbuf) != 0)
		return (1);
	sz = sbuf.st_size;
	if (tsz != sz)
		return (1);

	/*
	 * Allocate memory and read both files into buffers.  Never ask for
	 * zero bytes: malloc(0) may return NULL, which would make two empty
	 * files look different.
	 */
	bufsz = (tsz == 0) ? 1 : tsz;
	tbuf = malloc(bufsz);
	buf = malloc(bufsz);
	if (tbuf == NULL || buf == NULL)
		goto out;

	fd = open(tsname, O_RDONLY);
	if (fd == -1)
		goto out;
	nread = read(fd, tbuf, tsz);
	(void) close(fd);
	if (nread < 0 || (size_t)nread != tsz)
		goto out;

	fd = open(sname, O_RDONLY);
	if (fd == -1)
		goto out;
	nread = read(fd, buf, tsz);
	(void) close(fd);
	if (nread < 0 || (size_t)nread != tsz)
		goto out;

	/* compare content; normalize the result to 0 or 1 */
	ret = (memcmp(tbuf, buf, tsz) != 0);
out:
	free(tbuf);
	free(buf);
	return (ret);
}
1095 
1096 /*
1097  * patch md.conf file with mddb locations
1098  */
1099 int
1100 meta_db_patch(
1101 	char		*sname,		/* system file name */
1102 	char		*cname,		/* mddb.cf file name */
1103 	int		patch,		/* patching locally */
1104 	md_error_t	*ep
1105 )
1106 {
1107 	char		*tsname = NULL;
1108 	char		line[MDDB_BOOTLIST_MAX_LEN];
1109 	FILE		*tsfp = NULL;
1110 	FILE		*mfp = NULL;
1111 	int		rval = -1;
1112 
1113 	/* check names */
1114 	if (sname == NULL) {
1115 		if (patch)
1116 			sname = "md.conf";
1117 		else
1118 			sname = "/kernel/drv/md.conf";
1119 	}
1120 	if (cname == NULL)
1121 		cname = META_DBCONF;
1122 
1123 	/*
1124 	 * edit file
1125 	 */
1126 	if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) {
1127 		if (mdissyserror(ep, EROFS)) {
1128 			/*
1129 			 * If we are booted on a read-only root because
1130 			 * of mddb quorum problems we don't want to emit
1131 			 * any scary error messages.
1132 			 */
1133 			mdclrerror(ep);
1134 			rval = 0;
1135 		}
1136 		goto out;
1137 	}
1138 
1139 	if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0,
1140 	    ep) != 0)
1141 		goto out;
1142 
1143 	/* if file content is identical, skip rename */
1144 	if (filediff(tsname, sname) == 0) {
1145 		rval = 0;
1146 		goto out;
1147 	}
1148 
1149 	if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) ||
1150 					    (fclose(tsfp) != 0)) {
1151 		(void) mdsyserror(ep, errno, tsname);
1152 		goto out;
1153 	}
1154 
1155 	tsfp = NULL;
1156 
1157 	/*
1158 	 * rename file. If we get a Cross Device error then it
1159 	 * is because we are in the miniroot.
1160 	 */
1161 	if (rename(tsname, sname) != 0 && errno != EXDEV) {
1162 		(void) mdsyserror(ep, errno, sname);
1163 		goto out;
1164 	}
1165 
1166 	if (errno == EXDEV) {
1167 		if ((tsfp = fopen(tsname, "r")) == NULL)
1168 			goto out;
1169 		if ((mfp = fopen(sname, "w+")) == NULL)
1170 			goto out;
1171 		while (fgets(line, sizeof (line), tsfp) != NULL) {
1172 			if (fputs(line, mfp) == NULL)
1173 				goto out;
1174 		}
1175 		(void) fclose(tsfp);
1176 		tsfp = NULL;
1177 		if (fflush(mfp) != 0)
1178 			goto out;
1179 		if (fsync(fileno(mfp)) != 0)
1180 			goto out;
1181 		if (fclose(mfp) != 0) {
1182 			mfp = NULL;
1183 			goto out;
1184 		}
1185 	}
1186 
1187 	Free(tsname);
1188 	tsname = NULL;
1189 	rval = 0;
1190 
1191 	/* cleanup, return error */
1192 out:
1193 	if (tsfp != NULL)
1194 		(void) fclose(tsfp);
1195 	if (tsname != NULL) {
1196 		(void) unlink(tsname);
1197 		Free(tsname);
1198 	}
1199 	return (rval);
1200 }
1201 
1202 /*
1203  * Add replicas to set.  This happens as a result of:
1204  *	- metadb [-s set_name] -a
1205  *	- metaset -s set_name -a disk
1206  *	- metaset -s set_name -d disk	 (causes a rebalance of mddbs)
1207  *	- metaset -s set_name -b
1208  *
1209  * For a local set, this routine is run on the local set host.
1210  *
1211  * For a traditional diskset, this routine is run on the node that
1212  * is running the metaset command.
1213  *
1214  * For a multinode diskset, this routine is run by the node that is
1215  * running the metaset command.  If this is the first mddb added to
1216  * the MN diskset, then no communication is made to other nodes via commd
1217  * since the other nodes will be in-sync with respect to the mddbs when
1218  * those other nodes join the set and snarf in the newly created mddb.
1219  * If this is not the first mddb added to the MN diskset, then this
1220  * attach command is sent to all of the nodes using commd.  This keeps
1221  * the nodes in-sync.
1222  */
1223 int
1224 meta_db_attach(
1225 	mdsetname_t		*sp,
1226 	mdnamelist_t		*db_nlp,
1227 	mdchkopts_t		options,
1228 	md_timeval32_t		*timeval,
1229 	int			dbcnt,
1230 	int			dbsize,
1231 	char			*sysfilename,
1232 	md_error_t		*ep
1233 )
1234 {
1235 	struct mddb_config	c;
1236 	mdnamelist_t		*nlp;
1237 	mdname_t		*np;
1238 	md_drive_desc		*dd = NULL;
1239 	md_drive_desc		*p;
1240 	int			i;
1241 	int			fd;
1242 	side_t			sideno;
1243 	daddr_t			blkno;
1244 	int			replicacount = 0;
1245 	int			start_svmdaemons = 0;
1246 	int			rval = 0;
1247 	md_error_t		status = mdnullerror;
1248 	md_set_desc		*sd;
1249 	int			stale_bool = FALSE;
1250 	int			flags;
1251 	int			firstmddb = 1;
1252 	md_timeval32_t		inittime = {0, 0};
1253 
1254 	/*
1255 	 * Error if we don't get some work to do.
1256 	 */
1257 	if (db_nlp == NULL)
1258 		return (mdsyserror(ep, EINVAL, NULL));
1259 
1260 	if (mdnamesareunique(db_nlp, ep) != 0)
1261 		return (-1);
1262 	(void) memset(&c, 0, sizeof (c));
1263 	c.c_id = 0;
1264 	c.c_setno = sp->setno;
1265 
1266 	/* Don't need device id information from this ioctl */
1267 	c.c_locator.l_devid = (uint64_t)0;
1268 	c.c_locator.l_devid_flags = 0;
1269 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1270 		if (metaislocalset(sp)) {
1271 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID))
1272 				mdclrerror(&c.c_mde);
1273 			else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) ||
1274 			    (! (options & MDCHK_ALLOW_NODBS)))
1275 				return (mdstealerror(ep, &c.c_mde));
1276 		} else {
1277 			if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER))
1278 				return (mdstealerror(ep, &c.c_mde));
1279 		}
1280 		mdclrerror(&c.c_mde);
1281 	}
1282 	/*
1283 	 * Is current set STALE?
1284 	 */
1285 	if (c.c_flags & MDDB_C_STALE) {
1286 		stale_bool = TRUE;
1287 	}
1288 
1289 	assert(db_nlp != NULL);
1290 
1291 	/* if these are the first replicas then the SVM daemons need to run */
1292 	if (c.c_dbcnt == 0)
1293 		start_svmdaemons = 1;
1294 
1295 	/*
1296 	 * check to see if we will go over the total possible number
1297 	 * of data bases
1298 	 */
1299 	nlp = db_nlp;
1300 	while (nlp) {
1301 		replicacount += dbcnt;
1302 		nlp = nlp->next;
1303 	}
1304 
1305 	if ((replicacount + c.c_dbcnt) > c.c_dbmax)
1306 		return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
1307 		    sp->setno, c.c_dbcnt + replicacount, NULL));
1308 
1309 	/*
1310 	 * go through and check to make sure all locations specified
1311 	 * are legal also pick out driver name;
1312 	 */
1313 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1314 		diskaddr_t devsize;
1315 
1316 		np = nlp->namep;
1317 
1318 		if (! metaislocalset(sp)) {
1319 			uint_t	partno;
1320 			uint_t	rep_partno;
1321 			mddrivename_t	*dnp = np->drivenamep;
1322 
1323 			/*
1324 			 * make sure that non-local database replicas
1325 			 * are always on the replica slice.
1326 			 */
1327 			if (meta_replicaslice(dnp,
1328 			    &rep_partno, ep) != 0)
1329 				return (-1);
1330 			if (metagetvtoc(np, FALSE, &partno, ep) == NULL)
1331 				return (-1);
1332 			if (partno != rep_partno)
1333 				return (mddeverror(ep, MDE_REPCOMP_ONLY,
1334 				    np->dev, sp->setname));
1335 		}
1336 
1337 		if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize),
1338 		    ep)) {
1339 			return (-1);
1340 		}
1341 
1342 		if ((devsize = metagetsize(np, ep)) == -1)
1343 			return (-1);
1344 
1345 		if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16))
1346 			return (mdmddberror(ep, MDE_REPLICA_TOOSMALL,
1347 			    meta_getminor(np->dev), sp->setno, devsize,
1348 			    np->cname));
1349 	}
1350 
1351 	/*
1352 	 * If first disk in set we don't have lb_inittime yet for use as
1353 	 * mb_setcreatetime so don't go looking for it. WE'll come back
1354 	 * later and update after the locator block has been created.
1355 	 * If this isn't the first disk in the set, we have a locator
1356 	 * block and thus we have lb_inittime. Set mb_setcreatetime to
1357 	 * lb_inittime.
1358 	 */
1359 	if (! metaislocalset(sp)) {
1360 		if (c.c_dbcnt != 0) {
1361 			firstmddb = 0;
1362 			inittime = meta_get_lb_inittime(sp, ep);
1363 		}
1364 	}
1365 
1366 	/*
1367 	 * go through and write all master blocks
1368 	 */
1369 
1370 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1371 		np = nlp->namep;
1372 
1373 		if ((fd = open(np->rname, O_RDWR)) < 0)
1374 			return (mdsyserror(ep, errno, np->rname));
1375 
1376 		for (i = 0; i < dbcnt; i++) {
1377 			if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize,
1378 			    inittime, ep)) {
1379 				(void) close(fd);
1380 				return (-1);
1381 			}
1382 		}
1383 		(void) close(fd);
1384 	}
1385 
1386 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1387 		return (-1);
1388 
1389 	if (! metaislocalset(sp)) {
1390 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1391 		if (! mdisok(ep))
1392 			return (-1);
1393 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1394 			return (-1);
1395 
1396 	}
1397 
1398 	/*
1399 	 * go through and tell kernel to add them
1400 	 */
1401 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1402 		mdcinfo_t	*cinfo;
1403 
1404 		np = nlp->namep;
1405 
1406 		if ((cinfo = metagetcinfo(np, ep)) == NULL) {
1407 			rval = -1;
1408 			goto out;
1409 		}
1410 
1411 		/*
1412 		 * If mddb is being added to MN diskset and there already
1413 		 * exists a valid mddb in the set (which equates to this
1414 		 * node being an owner of the set) then use rpc.mdcommd
1415 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1416 		 * If set is stale, don't log the message since rpc.mdcommd
1417 		 * can't write the message to the mddb.
1418 		 *
1419 		 * Otherwise, just add mddb to this node.
1420 		 */
1421 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1422 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1423 			md_mn_result_t			*resultp = NULL;
1424 			md_mn_msg_meta_db_attach_t	attach;
1425 			int 				send_rval;
1426 
1427 			/*
1428 			 * In a scenario where new replicas had been added on
1429 			 * the master, and then all of the old replicas failed
1430 			 * before the slaves had knowledge of the new replicas,
1431 			 * the slaves are unable to re-parse in the mddb
1432 			 * from the new replicas since the slaves have no
1433 			 * knowledge of the new replicas.  The following
1434 			 * algorithm solves this problem:
1435 			 * 	- META_DB_ATTACH message generates submsgs
1436 			 * 		- BLOCK parse (master)
1437 			 * 		- MDDB_ATTACH new replicas
1438 			 * 		- UNBLOCK parse (master) causing parse
1439 			 *		information to be sent from master
1440 			 *		to slaves at a higher class than the
1441 			 *		unblock so the parse message will
1442 			 *		reach slaves before unblock message.
1443 			 */
1444 			attach.msg_l_dev = np->dev;
1445 			attach.msg_cnt = dbcnt;
1446 			attach.msg_dbsize = dbsize;
1447 			(void) strncpy(attach.msg_dname, cinfo->dname,
1448 			    sizeof (attach.msg_dname));
1449 			(void) splitname(np->bname, &attach.msg_splitname);
1450 			attach.msg_options = options;
1451 
1452 			/* Set devid to NULL until devids are supported */
1453 			attach.msg_devid[0] = NULL;
1454 
1455 			/*
1456 			 * If reconfig cycle has been started, this node is
1457 			 * stuck in in the return step until this command has
1458 			 * completed.  If mdcommd is suspended, ask
1459 			 * send_message to fail (instead of retrying)
1460 			 * so that metaset can finish allowing the reconfig
1461 			 * cycle to proceed.
1462 			 */
1463 			flags = MD_MSGF_FAIL_ON_SUSPEND;
1464 			if (stale_bool == TRUE)
1465 				flags |= MD_MSGF_NO_LOG;
1466 			send_rval = mdmn_send_message(sp->setno,
1467 				MD_MN_MSG_META_DB_ATTACH,
1468 				flags, (char *)&attach,
1469 				sizeof (md_mn_msg_meta_db_attach_t),
1470 				&resultp, ep);
1471 			if (send_rval != 0) {
1472 				rval = -1;
1473 				if (resultp == NULL)
1474 					(void) mddserror(ep,
1475 					    MDE_DS_COMMD_SEND_FAIL,
1476 					    sp->setno, NULL, NULL,
1477 					    sp->setname);
1478 				else {
1479 					(void) mdstealerror(ep,
1480 					    &(resultp->mmr_ep));
1481 					if (mdisok(ep)) {
1482 						(void) mddserror(ep,
1483 						    MDE_DS_COMMD_SEND_FAIL,
1484 						    sp->setno, NULL, NULL,
1485 						    sp->setname);
1486 					}
1487 					free_result(resultp);
1488 				}
1489 				goto out;
1490 			}
1491 			if (resultp)
1492 				free_result(resultp);
1493 		} else {
1494 		    /* Adding mddb(s) to just this node */
1495 		    for (i = 0; i < dbcnt; i++) {
1496 			(void) memset(&c, 0, sizeof (c));
1497 			/* Fill in device/replica info */
1498 			c.c_locator.l_dev = meta_cmpldev(np->dev);
1499 			c.c_locator.l_blkno = i * dbsize + 16;
1500 			blkno = c.c_locator.l_blkno;
1501 			(void) strncpy(c.c_locator.l_driver, cinfo->dname,
1502 			    sizeof (c.c_locator.l_driver));
1503 			(void) splitname(np->bname, &c.c_devname);
1504 			c.c_locator.l_mnum = meta_getminor(np->dev);
1505 
1506 			/* Fill in setno, setname, and sideno */
1507 			c.c_setno = sp->setno;
1508 			if (! metaislocalset(sp)) {
1509 				if (MD_MNSET_DESC(sd)) {
1510 					c.c_multi_node = 1;
1511 				}
1512 			}
1513 			(void) strcpy(c.c_setname, sp->setname);
1514 			c.c_sideno = sideno;
1515 
1516 			/*
1517 			 * Don't need device id information from this ioctl
1518 			 * Kernel determines device id from dev_t, which
1519 			 * is just what this code would do.
1520 			 */
1521 			c.c_locator.l_devid = (uint64_t)0;
1522 			c.c_locator.l_devid_flags = 0;
1523 
1524 			if (timeval != NULL)
1525 				c.c_timestamp = *timeval;
1526 
1527 			if (setup_med_cfg(sp, &c, (options & MDCHK_SET_FORCE),
1528 			    ep)) {
1529 				rval = -1;
1530 				goto out;
1531 			}
1532 
1533 			if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL) != 0) {
1534 				rval = mdstealerror(ep, &c.c_mde);
1535 				goto out;
1536 			}
1537 			/*
1538 			 * This is either a traditional diskset OR this
1539 			 * is the first replica added to a MN diskset.
1540 			 * In either case, set broadcast to NO_BCAST so
1541 			 * that message won't go through rpc.mdcommd.
1542 			 * If this is a traditional diskset, the bcast
1543 			 * flag is ignored since traditional disksets
1544 			 * don't use the rpc.mdcommd.
1545 			 */
1546 			if (meta_db_addsidenms(sp, np, blkno,
1547 			    DB_ADDSIDENMS_NO_BCAST, ep))
1548 				goto out;
1549 		    }
1550 		}
1551 		if (! metaislocalset(sp)) {
1552 			/* update the dbcnt and size in dd */
1553 			for (p = dd; p != NULL; p = p->dd_next)
1554 				if (p->dd_dnp == np->drivenamep) {
1555 					p->dd_dbcnt = dbcnt;
1556 					p->dd_dbsize  = dbsize;
1557 					break;
1558 				}
1559 		}
1560 
1561 		/*
1562 		 * If this was the first addition of disks to the
1563 		 * diskset you now need to update the mb_setcreatetime
1564 		 * which needed lb_inittime which wasn't there until now.
1565 		 */
1566 		if (firstmddb) {
1567 			if (meta_update_mb(sp, dd, ep) != 0) {
1568 				return (-1);
1569 			}
1570 		}
1571 		(void) close(fd);
1572 	}
1573 
1574 out:
1575 	if (metaislocalset(sp)) {
1576 
1577 		/* everything looks fine. Start mdmonitord */
1578 		if (rval == 0 && start_svmdaemons == 1) {
1579 			if (meta_smf_enable(META_SMF_CORE, &status) == -1) {
1580 				mde_perror(&status, "");
1581 				mdclrerror(&status);
1582 			}
1583 		}
1584 
1585 		if (buildconf(sp, &status)) {
1586 			/* Don't mask any previous errors */
1587 			if (rval == 0)
1588 				rval = mdstealerror(ep, &status);
1589 			return (rval);
1590 		}
1591 
1592 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
1593 			/* Don't mask any previous errors */
1594 			if (rval == 0)
1595 				rval = mdstealerror(ep, &status);
1596 		}
1597 	} else {
1598 		if (update_dbinfo_on_drives(sp, dd,
1599 		    (options & MDCHK_SET_LOCKED),
1600 		    (options & MDCHK_SET_FORCE),
1601 		    &status)) {
1602 			/* Don't mask any previous errors */
1603 			if (rval == 0)
1604 				rval = mdstealerror(ep, &status);
1605 			else
1606 				mdclrerror(&status);
1607 		}
1608 		metafreedrivedesc(&dd);
1609 	}
1610 	/*
1611 	 * For MN disksets that already had already had nodes joined
1612 	 * before the attach of this mddb(s), the name invalidation is
1613 	 * done by the commd handler routine.  Otherwise, if this
1614 	 * is the first attach of a MN diskset mddb, the invalidation
1615 	 * must be done here since the first attach cannot be sent
1616 	 * via the commd since there are no nodes joined to the set yet.
1617 	 */
1618 	if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) ||
1619 	    (MD_MNSET_DESC(sd) &&
1620 	    (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) {
1621 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
1622 			meta_invalidate_name(nlp->namep);
1623 		}
1624 	}
1625 	return (rval);
1626 }
1627 
1628 /*
1629  * deletelist_length
1630  *
1631  *	return the number of slices that have been specified for deletion
1632  *	on the metadb command line.  This does not calculate the number
1633  *	of replicas because there may be multiple replicas per slice.
1634  */
1635 static int
1636 deletelist_length(mdnamelist_t *db_nlp)
1637 {
1638 
1639 	mdnamelist_t		*nlp;
1640 	int			list_length = 0;
1641 
1642 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1643 		list_length++;
1644 	}
1645 
1646 	return (list_length);
1647 }
1648 
1649 static int
1650 in_deletelist(char *devname, mdnamelist_t *db_nlp)
1651 {
1652 
1653 	mdnamelist_t		*nlp;
1654 	mdname_t		*np;
1655 	int			index = 0;
1656 
1657 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1658 		np = nlp->namep;
1659 
1660 		if (strcmp(devname, np->bname) == 0)
1661 			return (index);
1662 		index++;
1663 	}
1664 
1665 	return (-1);
1666 }
1667 
1668 /*
1669  * Delete replicas from set.  This happens as a result of:
1670  *	- metadb [-s set_name] -d
1671  *	- metaset -s set_name -a disk	(causes a rebalance of mddbs)
1672  *	- metaset -s set_name -d disk
1673  *	- metaset -s set_name -b
1674  *
1675  * For a local set, this routine is run on the local set host.
1676  *
1677  * For a traditional diskset, this routine is run on the node that
1678  * is running the metaset command.
1679  *
1680  * For a multinode diskset, this routine is run by the node that is
1681  * running the metaset command.  This detach routine is sent to all
1682  * of the joined nodes in the diskset using commd.  This keeps
1683  * the nodes in-sync.
1684  */
1685 int
1686 meta_db_detach(
1687 	mdsetname_t		*sp,
1688 	mdnamelist_t		*db_nlp,
1689 	mdforceopts_t		force_option,
1690 	char			*sysfilename,
1691 	md_error_t		*ep
1692 )
1693 {
1694 	struct mddb_config	c;
1695 	mdnamelist_t		*nlp;
1696 	mdname_t		*np;
1697 	md_drive_desc		*dd = NULL;
1698 	md_drive_desc		*p;
1699 	int			replicacount;
1700 	int			replica_delete_count;
1701 	int			nr_replica_slices;
1702 	int			i;
1703 	int			stop_svmdaemons = 0;
1704 	int			rval = 0;
1705 	int			index;
1706 	int			valid_replicas_nottodelete = 0;
1707 	int			invalid_replicas_nottodelete = 0;
1708 	int			invalid_replicas_todelete = 0;
1709 	int			errored = 0;
1710 	int			*tag_array;
1711 	int			fd = -1;
1712 	md_error_t		status = mdnullerror;
1713 	md_set_desc		*sd;
1714 	int			stale_bool = FALSE;
1715 	int			flags;
1716 
1717 	/*
1718 	 * Error if we don't get some work to do.
1719 	 */
1720 	if (db_nlp == NULL)
1721 		return (mdsyserror(ep, EINVAL, NULL));
1722 
1723 	if (mdnamesareunique(db_nlp, ep) != 0)
1724 		return (-1);
1725 
1726 	(void) memset(&c, 0, sizeof (c));
1727 	c.c_id = 0;
1728 	c.c_setno = sp->setno;
1729 
1730 	/* Don't need device id information from this ioctl */
1731 	c.c_locator.l_devid = (uint64_t)0;
1732 	c.c_locator.l_devid_flags = 0;
1733 
1734 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1735 		return (mdstealerror(ep, &c.c_mde));
1736 
1737 	/*
1738 	 * Is current set STALE?
1739 	 */
1740 	if (c.c_flags & MDDB_C_STALE) {
1741 		stale_bool = TRUE;
1742 	}
1743 
1744 	replicacount = c.c_dbcnt;
1745 
1746 	assert(db_nlp != NULL);
1747 
1748 	/*
1749 	 * go through and gather how many data bases are on each
1750 	 * device specified.
1751 	 */
1752 
1753 	nr_replica_slices = deletelist_length(db_nlp);
1754 	tag_array = (int *)calloc(nr_replica_slices, sizeof (int));
1755 
1756 	replica_delete_count = 0;
1757 	for (i = 0; i < replicacount; i++) {
1758 		char	*devname;
1759 		int	found = 0;
1760 
1761 		c.c_id = i;
1762 
1763 		/* Don't need device id information from this ioctl */
1764 		c.c_locator.l_devid = (uint64_t)0;
1765 		c.c_locator.l_devid_flags = 0;
1766 
1767 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1768 			return (mdstealerror(ep, &c.c_mde));
1769 
1770 		devname = splicename(&c.c_devname);
1771 
1772 		if ((index = in_deletelist(devname, db_nlp)) != -1) {
1773 			found = 1;
1774 			tag_array[index] = 1;
1775 			replica_delete_count++;
1776 		}
1777 
1778 		errored = c.c_locator.l_flags & (MDDB_F_EREAD |
1779 				MDDB_F_EWRITE | MDDB_F_TOOSMALL |
1780 				MDDB_F_EFMT | MDDB_F_EDATA |
1781 				MDDB_F_EMASTER);
1782 
1783 		/*
1784 		 * There are four combinations of "errored" and "found"
1785 		 * and they are used to find the number of
1786 		 * (a) valid/invalid replicas that are not in the delete
1787 		 * list and are available in the system.
1788 		 * (b) valid/invalid replicas that are to be deleted.
1789 		 */
1790 
1791 		if (errored && !found)		/* errored and !found */
1792 			invalid_replicas_nottodelete++;
1793 		else if (!found)		/* !errored and !found */
1794 			valid_replicas_nottodelete++;
1795 		else if (errored)		/* errored and found */
1796 			invalid_replicas_todelete++;
1797 		/*
1798 		 * else it is !errored and found. This means
1799 		 * valid_replicas_todelete++; But this variable will not
1800 		 * be used anywhere
1801 		 */
1802 
1803 		Free(devname);
1804 	}
1805 
1806 	index = 0;
1807 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1808 		np = nlp->namep;
1809 		if (tag_array[index++] != 1) {
1810 			Free(tag_array);
1811 			return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname));
1812 		}
1813 	}
1814 
1815 	Free(tag_array);
1816 
1817 
1818 	/* if all replicas are deleted stop mdmonitord */
1819 	if ((replicacount - replica_delete_count) == 0)
1820 		stop_svmdaemons = 1;
1821 
1822 	if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) {
1823 		if (force_option & MDFORCE_NONE)
1824 			return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname));
1825 		if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS))
1826 			return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname));
1827 	}
1828 
1829 	/*
1830 	 * The following algorithms are followed to check for deletion:
1831 	 * (a) If the delete list(db_nlp) has all invalid replicas and no valid
1832 	 * replicas, then deletion should be allowed.
1833 	 * (b) Deletion should be allowed only if valid replicas that are "not"
1834 	 * to be deleted is always greater than the invalid replicas that
1835 	 * are "not" to be deleted.
1836 	 * (c) If the user uses -f option, then deletion should be allowed.
1837 	 */
1838 
1839 	if ((invalid_replicas_todelete != replica_delete_count) &&
1840 		(invalid_replicas_nottodelete > valid_replicas_nottodelete) &&
1841 				(force_option != MDFORCE_LOCAL))
1842 		return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname));
1843 
1844 	/*
1845 	 * go through and tell kernel to delete them
1846 	 */
1847 
1848 	/* Don't need device id information from this ioctl */
1849 	c.c_locator.l_devid = (uint64_t)0;
1850 	c.c_locator.l_devid_flags = 0;
1851 
1852 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1853 		return (mdstealerror(ep, &c.c_mde));
1854 
1855 	if (! metaislocalset(sp)) {
1856 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1857 		if (! mdisok(ep))
1858 			return (-1);
1859 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1860 			return (-1);
1861 	}
1862 
1863 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1864 		np = nlp->namep;
1865 
1866 		/*
1867 		 * If mddb is being deleted from MN diskset and node is
1868 		 * an owner of the diskset then use rpc.mdcommd
1869 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1870 		 * If set is stale, don't log the message since rpc.mdcommd
1871 		 * can't write the message to the mddb.
1872 		 *
1873 		 * When mddbs are first being added to set, a detach can
1874 		 * be called before any node has joined the diskset, so
1875 		 * must check to see if node is an owner of the diskset.
1876 		 *
1877 		 * Otherwise, just delete mddb from this node.
1878 		 */
1879 
1880 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1881 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1882 			md_mn_result_t			*resultp;
1883 			md_mn_msg_meta_db_detach_t	detach;
1884 			int				send_rval;
1885 
1886 			/*
1887 			 * The following algorithm is used to detach replicas.
1888 			 * 	- META_DB_DETACH message generates submsgs
1889 			 * 		- BLOCK parse (master)
1890 			 * 		- MDDB_DETACH replicas
1891 			 * 		- UNBLOCK parse (master) causing parse
1892 			 *		information to be sent from master
1893 			 *		to slaves at a higher class than the
1894 			 *		unblock so the parse message will
1895 			 *		reach slaves before unblock message.
1896 			 */
1897 			(void) splitname(np->bname, &detach.msg_splitname);
1898 
1899 			/* Set devid to NULL until devids are supported */
1900 			detach.msg_devid[0] = NULL;
1901 
1902 			/*
1903 			 * If reconfig cycle has been started, this node is
1904 			 * stuck in in the return step until this command has
1905 			 * completed.  If mdcommd is suspended, ask
1906 			 * send_message to fail (instead of retrying)
1907 			 * so that metaset can finish allowing the reconfig
1908 			 * cycle to proceed.
1909 			 */
1910 			flags = MD_MSGF_FAIL_ON_SUSPEND;
1911 			if (stale_bool == TRUE)
1912 				flags |= MD_MSGF_NO_LOG;
1913 			send_rval = mdmn_send_message(sp->setno,
1914 				MD_MN_MSG_META_DB_DETACH,
1915 				flags, (char *)&detach,
1916 				sizeof (md_mn_msg_meta_db_detach_t),
1917 				&resultp, ep);
1918 			if (send_rval != 0) {
1919 				rval = -1;
1920 				if (resultp == NULL)
1921 					(void) mddserror(ep,
1922 					    MDE_DS_COMMD_SEND_FAIL,
1923 					    sp->setno, NULL, NULL,
1924 					    sp->setname);
1925 				else {
1926 					(void) mdstealerror(ep,
1927 					    &(resultp->mmr_ep));
1928 					if (mdisok(ep)) {
1929 						(void) mddserror(ep,
1930 						    MDE_DS_COMMD_SEND_FAIL,
1931 						    sp->setno, NULL, NULL,
1932 						    sp->setname);
1933 					}
1934 					free_result(resultp);
1935 				}
1936 				goto out;
1937 			}
1938 			if (resultp)
1939 				free_result(resultp);
1940 		} else {
1941 			i = 0;
1942 			while (i < c.c_dbcnt) {
1943 				char	*devname;
1944 
1945 				c.c_id = i;
1946 
1947 				/* Don't need devid info from this ioctl */
1948 				c.c_locator.l_devid = (uint64_t)0;
1949 				c.c_locator.l_devid_flags = 0;
1950 
1951 				if (metaioctl(MD_DB_GETDEV, &c,
1952 				    &c.c_mde, NULL)) {
1953 					rval = mdstealerror(ep, &c.c_mde);
1954 					goto out;
1955 				}
1956 
1957 				devname = splicename(&c.c_devname);
1958 				if (strcmp(devname, np->bname) != 0) {
1959 					Free(devname);
1960 					i++;
1961 					continue;
1962 				}
1963 				Free(devname);
1964 
1965 				/* Don't need devid info from this ioctl */
1966 				c.c_locator.l_devid = (uint64_t)0;
1967 				c.c_locator.l_devid_flags = 0;
1968 
1969 				if (metaioctl(MD_DB_DELDEV, &c,
1970 				    &c.c_mde, NULL) != 0) {
1971 					rval = mdstealerror(ep, &c.c_mde);
1972 					goto out;
1973 				}
1974 
1975 				/* Not incrementing "i" intentionally */
1976 			}
1977 		}
1978 		if (! metaislocalset(sp)) {
1979 			/* update the dbcnt and size in dd */
1980 			for (p = dd; p != NULL; p = p->dd_next) {
1981 				if (p->dd_dnp == np->drivenamep) {
1982 					p->dd_dbcnt = 0;
1983 					p->dd_dbsize  = 0;
1984 					break;
1985 				}
1986 			}
1987 
1988 			/*
1989 			 * Slam a dummy master block and make it self
1990 			 * identifying
1991 			 */
1992 			if ((fd = open(np->rname, O_RDWR)) >= 0) {
1993 				meta_mkdummymaster(sp, fd, 16);
1994 				(void) close(fd);
1995 			}
1996 		}
1997 	}
1998 out:
1999 	if (metaislocalset(sp)) {
2000 		/*
2001 		 * Stop all the daemons if there are
2002 		 * no more replicas so that the module can be
2003 		 * unloaded.
2004 		 */
2005 		if (rval == 0 && stop_svmdaemons == 1) {
2006 			char buf[MAXPATHLEN];
2007 			int i;
2008 
2009 			for (i = 0; i < DAEMON_COUNT; i++) {
2010 				(void) snprintf(buf, MAXPATHLEN,
2011 					"/usr/bin/pkill -%s -x %s",
2012 					svmd_kill_list[i].svmd_kill_val,
2013 					svmd_kill_list[i].svmd_name);
2014 				if (pclose(popen(buf, "w")) == -1)
2015 					md_perror(buf);
2016 			}
2017 
2018 			if (meta_smf_disable(META_SMF_ALL, &status) == -1) {
2019 				mde_perror(&status, "");
2020 				mdclrerror(&status);
2021 			}
2022 		}
2023 		if (buildconf(sp, &status)) {
2024 			/* Don't mask any previous errors */
2025 			if (rval == 0)
2026 				rval = mdstealerror(ep, &status);
2027 			else
2028 				mdclrerror(&status);
2029 			return (rval);
2030 		}
2031 
2032 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
2033 			/* Don't mask any previous errors */
2034 			if (rval == 0)
2035 				rval = mdstealerror(ep, &status);
2036 			else
2037 				mdclrerror(&status);
2038 		}
2039 	} else {
2040 		if (update_dbinfo_on_drives(sp, dd,
2041 		    (force_option & MDFORCE_SET_LOCKED),
2042 		    ((force_option & MDFORCE_LOCAL) |
2043 		    (force_option & MDFORCE_DS)), &status)) {
2044 			/* Don't mask any previous errors */
2045 			if (rval == 0)
2046 				rval = mdstealerror(ep, &status);
2047 			else
2048 				mdclrerror(&status);
2049 		}
2050 		metafreedrivedesc(&dd);
2051 	}
2052 	if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) {
2053 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
2054 			meta_invalidate_name(nlp->namep);
2055 		}
2056 	}
2057 	return (rval);
2058 }
2059 
2060 static md_replica_t *
2061 metareplicaname(
2062 	mdsetname_t		*sp,
2063 	int			flags,
2064 	struct mddb_config	*c,
2065 	md_error_t		*ep
2066 )
2067 {
2068 	md_replica_t	*rp;
2069 	char		*devname;
2070 	size_t		sz;
2071 
2072 	/* allocate replicaname */
2073 	rp = Zalloc(sizeof (*rp));
2074 
2075 	/* get device name */
2076 	devname = splicename(&c->c_devname);
2077 	if (flags & PRINT_FAST) {
2078 		if ((rp->r_namep = metaname_fast(&sp, devname,
2079 		    LOGICAL_DEVICE, ep)) == NULL) {
2080 			Free(devname);
2081 			Free(rp);
2082 			return (NULL);
2083 		}
2084 	} else {
2085 		if ((rp->r_namep = metaname(&sp, devname,
2086 		    LOGICAL_DEVICE, ep)) == NULL) {
2087 			Free(devname);
2088 			Free(rp);
2089 			return (NULL);
2090 		}
2091 	}
2092 	Free(devname);
2093 
2094 	/* make sure it's OK */
2095 	if ((! (flags & MD_BASICNAME_OK)) &&
2096 	    (metachkcomp(rp->r_namep, ep) != 0)) {
2097 		Free(rp);
2098 		return (NULL);
2099 	}
2100 
2101 	rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR;
2102 	rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR;
2103 	rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID;
2104 	if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) {
2105 		sz = devid_sizeof((ddi_devid_t)(uintptr_t)
2106 		    (c->c_locator.l_devid));
2107 		if ((rp->r_devid = (ddi_devid_t)malloc(sz)) ==
2108 		    (ddi_devid_t)NULL) {
2109 			Free(rp);
2110 			return (NULL);
2111 		}
2112 		(void) memcpy((void *)rp->r_devid,
2113 		    (void *)(uintptr_t)c->c_locator.l_devid, sz);
2114 		(void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name);
2115 		rp->r_flags &= ~MDDB_F_NODEVID;
2116 		/* Overwrite dev derived from name with dev from devid */
2117 		rp->r_namep->dev = meta_expldev(c->c_locator.l_dev);
2118 	}
2119 	(void) strcpy(rp->r_driver_name, c->c_locator.l_driver);
2120 
2121 	rp->r_blkno = c->c_locator.l_blkno;
2122 	if (c->c_dbend != 0)
2123 		rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1;
2124 
2125 	/* return replica */
2126 	return (rp);
2127 }
2128 
2129 /*
2130  * free replica list
2131  */
2132 void
2133 metafreereplicalist(
2134 	md_replicalist_t	*rlp
2135 )
2136 {
2137 	md_replicalist_t	*rl = NULL;
2138 
2139 	for (/* void */; (rlp != NULL); rlp = rl) {
2140 		rl = rlp->rl_next;
2141 		if (rlp->rl_repp->r_devid != (ddi_devid_t)0) {
2142 			free(rlp->rl_repp->r_devid);
2143 		}
2144 		Free(rlp->rl_repp);
2145 		Free(rlp);
2146 	}
2147 }
2148 
2149 /*
2150  * return list of all replicas in set
2151  */
2152 int
2153 metareplicalist(
2154 	mdsetname_t		*sp,
2155 	int			flags,
2156 	md_replicalist_t	**rlpp,
2157 	md_error_t		*ep
2158 )
2159 {
2160 	md_replicalist_t	**tail = rlpp;
2161 	int			count = 0;
2162 	struct mddb_config	c;
2163 	int			i;
2164 	char			*devid;
2165 
2166 	/* for each replica */
2167 	i = 0;
2168 	do {
2169 		md_replica_t	*rp;
2170 
2171 		/* get next replica */
2172 		(void) memset(&c, 0, sizeof (c));
2173 		c.c_id = i;
2174 		c.c_setno = sp->setno;
2175 
2176 		c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2177 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2178 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2179 				mdclrerror(&c.c_mde);
2180 				break;	/* handle none at all */
2181 			}
2182 			(void) mdstealerror(ep, &c.c_mde);
2183 			goto out;
2184 		}
2185 
2186 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) {
2187 			if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) {
2188 				(void) mdsyserror(ep, ENOMEM, META_DBCONF);
2189 				goto out;
2190 			}
2191 			c.c_locator.l_devid = (uintptr_t)devid;
2192 			/*
2193 			 * Turn on space and sz flags since 'sz' amount of
2194 			 * space has been alloc'd.
2195 			 */
2196 			c.c_locator.l_devid_flags =
2197 				MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2198 		}
2199 
2200 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2201 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2202 				mdclrerror(&c.c_mde);
2203 				break;	/* handle none at all */
2204 			}
2205 			(void) mdstealerror(ep, &c.c_mde);
2206 			goto out;
2207 		}
2208 
2209 		/*
2210 		 * Paranoid check - shouldn't happen, but is left as
2211 		 * a place holder for changes that will be needed after
2212 		 * dynamic reconfiguration changes are added to SVM (to
2213 		 * support movement of disks at any point in time).
2214 		 */
2215 		if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
2216 			(void) fprintf(stderr,
2217 			    dgettext(TEXT_DOMAIN,
2218 				"Error: Relocation Information "
2219 				"(drvnm=%s, mnum=0x%lx) \n"
2220 				"relocation information size changed - \n"
2221 				"rerun command\n"),
2222 			    c.c_locator.l_driver, c.c_locator.l_mnum);
2223 			(void) mderror(ep, MDE_DEVID_TOOBIG, NULL);
2224 			goto out;
2225 		}
2226 
2227 		if (c.c_dbcnt == 0)
2228 			break;		/* handle none at all */
2229 
2230 		/* get info */
2231 		if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL)
2232 			goto out;
2233 
2234 		/* append to list */
2235 		*tail = Zalloc(sizeof (**tail));
2236 		(*tail)->rl_repp = rp;
2237 		tail = &(*tail)->rl_next;
2238 		++count;
2239 
2240 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2241 			free(devid);
2242 			c.c_locator.l_devid_flags = 0;
2243 		}
2244 
2245 	} while (++i < c.c_dbcnt);
2246 
2247 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2248 		free(devid);
2249 	}
2250 
2251 	/* return count */
2252 	return (count);
2253 
2254 	/* cleanup, return error */
2255 out:
2256 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2257 		free(devid);
2258 	}
2259 	metafreereplicalist(*rlpp);
2260 	*rlpp = NULL;
2261 	return (-1);
2262 }
2263 
2264 /*
2265  * meta_sync_db_locations - get list of replicas from kernel and write
2266  * 	out to mddb.cf and md.conf.  'Syncs up' the replica list in
2267  * 	the kernel with the replica list in the conf files.
2268  *
2269  */
2270 void
2271 meta_sync_db_locations(
2272 	mdsetname_t	*sp,
2273 	md_error_t	*ep
2274 )
2275 {
2276 	char		*sname = 0;		/* system file name */
2277 	char 		*cname = 0;		/* config file name */
2278 
2279 	if (!metaislocalset(sp))
2280 		return;
2281 
2282 	/* Updates backup of configuration file (aka mddb.cf) */
2283 	if (buildconf(sp, ep) != 0)
2284 		return;
2285 
2286 	/* Updates system configuration file (aka md.conf) */
2287 	(void) meta_db_patch(sname, cname, 0, ep);
2288 }
2289 
2290 /*
2291  * setup_db_locations - parse the mddb.cf file and
2292  *			tells the driver which db locations to use.
2293  */
2294 int
2295 meta_setup_db_locations(
2296 	md_error_t	*ep
2297 )
2298 {
2299 	mddb_config_t	c;
2300 	FILE		*fp;
2301 	char		inbuff[1024];
2302 	char		*buff;
2303 	uint_t		i;
2304 	size_t		sz;
2305 	int		rval = 0;
2306 	char		*devidp;
2307 	uint_t		devid_size;
2308 	char		*minor_name = NULL;
2309 	ddi_devid_t	devid_decode;
2310 	int		checksum;
2311 
2312 	/* do mddb.cf file */
2313 	(void) memset(&c, '\0', sizeof (c));
2314 	if ((fp = fopen(META_DBCONF, "r")) == NULL) {
2315 		if (errno != ENOENT)
2316 			return (mdsyserror(ep, errno, META_DBCONF));
2317 	}
2318 	while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1),
2319 	    fp)) != NULL)) {
2320 
2321 		/* ignore comments */
2322 		if (*buff == '#')
2323 			continue;
2324 
2325 		/* parse locator */
2326 		(void) memset(&c, 0, sizeof (c));
2327 		c.c_setno = MD_LOCAL_SET;
2328 		i = strcspn(buff, " \t");
2329 		if (i > sizeof (c.c_locator.l_driver))
2330 			i = sizeof (c.c_locator.l_driver);
2331 		(void) strncpy(c.c_locator.l_driver, buff, i);
2332 		buff += i;
2333 		c.c_locator.l_dev =
2334 		    makedev((major_t)0, (minor_t)strtol(buff, &buff, 10));
2335 		c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10);
2336 		c.c_locator.l_mnum = minor(c.c_locator.l_dev);
2337 
2338 		/* parse out devid */
2339 		while (isspace((int)(*buff)))
2340 			buff += 1;
2341 		i = strcspn(buff, " \t");
2342 		if ((devidp = (char *)malloc(i+1)) == NULL)
2343 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2344 
2345 		(void) strncpy(devidp, buff, i);
2346 		devidp[i] = '\0';
2347 		if (devid_str_decode(devidp, &devid_decode,
2348 		    &minor_name) == -1) {
2349 			free(devidp);
2350 			continue;
2351 		}
2352 
2353 		/* Conf file must have minor name associated with devid */
2354 		if (minor_name == NULL) {
2355 			free(devidp);
2356 			devid_free(devid_decode);
2357 			continue;
2358 		}
2359 
2360 		sz = devid_sizeof(devid_decode);
2361 		/* Copy to devid size buffer that ioctl expects */
2362 		if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) {
2363 			devid_free(devid_decode);
2364 			free(minor_name);
2365 			free(devidp);
2366 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2367 		}
2368 
2369 		(void) memcpy((void *)(uintptr_t)c.c_locator.l_devid,
2370 		    (void *)devid_decode, sz);
2371 
2372 		devid_free(devid_decode);
2373 
2374 		if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) {
2375 			free(minor_name);
2376 			free(devidp);
2377 			free((void *)(uintptr_t)c.c_locator.l_devid);
2378 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2379 		}
2380 		(void) strcpy(c.c_locator.l_minor_name, minor_name);
2381 		free(minor_name);
2382 		c.c_locator.l_devid_flags = MDDB_DEVID_VALID |
2383 			MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2384 		c.c_locator.l_devid_sz = sz;
2385 
2386 		devid_size = strlen(devidp);
2387 		buff += devid_size;
2388 
2389 		checksum = strtol(buff, &buff, 10);
2390 		for (i = 0; c.c_locator.l_driver[i] != 0; i++)
2391 			checksum += c.c_locator.l_driver[i];
2392 		for (i = 0; i < devid_size; i++) {
2393 			checksum += devidp[i];
2394 		}
2395 		free(devidp);
2396 
2397 		checksum += minor(c.c_locator.l_dev);
2398 		checksum += c.c_locator.l_blkno;
2399 		if (checksum != 42) {
2400 			/* overwritten later for more serious problems */
2401 			rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF);
2402 			free((void *)(uintptr_t)c.c_locator.l_devid);
2403 			continue;
2404 		}
2405 		c.c_locator.l_flags = 0;
2406 
2407 		/* use db location */
2408 		if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) {
2409 			free((void *)(uintptr_t)c.c_locator.l_devid);
2410 			return (mdstealerror(ep, &c.c_mde));
2411 		}
2412 
2413 		/* free up devid if in use */
2414 		free((void *)(uintptr_t)c.c_locator.l_devid);
2415 		c.c_locator.l_devid = (uint64_t)0;
2416 		c.c_locator.l_devid_flags = 0;
2417 	}
2418 	if ((fp) && (fclose(fp) != 0))
2419 		return (mdsyserror(ep, errno, META_DBCONF));
2420 
2421 	/* check for stale database */
2422 	(void) memset((char *)&c, 0, sizeof (struct mddb_config));
2423 	c.c_id = 0;
2424 	c.c_setno = MD_LOCAL_SET;
2425 
2426 	/* Don't need device id information from this ioctl */
2427 	c.c_locator.l_devid = (uint64_t)0;
2428 	c.c_locator.l_devid_flags = 0;
2429 
2430 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2431 		if (! mdismddberror(&c.c_mde, MDE_DB_INVALID))
2432 			return (mdstealerror(ep, &c.c_mde));
2433 		mdclrerror(&c.c_mde);
2434 	}
2435 
2436 	if (c.c_flags & MDDB_C_STALE)
2437 		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET,
2438 		    0, NULL));
2439 
2440 	/* success */
2441 	return (rval);
2442 }
2443 
2444 /*
2445  * meta_db_minreplica - returns the minimum size replica currently in use.
2446  */
2447 daddr_t
2448 meta_db_minreplica(
2449 	mdsetname_t	*sp,
2450 	md_error_t	*ep
2451 )
2452 {
2453 	md_replica_t		*r;
2454 	md_replicalist_t	*rl, *rlp = NULL;
2455 	daddr_t			nblks = 0;
2456 
2457 	if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
2458 		return (-1);
2459 
2460 	if (rlp == NULL)
2461 		return (-1);
2462 
2463 	/* find the smallest existing replica */
2464 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2465 		r = rl->rl_repp;
2466 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2467 	}
2468 
2469 	metafreereplicalist(rlp);
2470 	return (nblks);
2471 }
2472 
2473 /*
2474  * meta_get_replica_names
2475  *  returns an mdnamelist_t of replica slices
2476  */
2477 /*ARGSUSED*/
2478 int
2479 meta_get_replica_names(
2480 	mdsetname_t	*sp,
2481 	mdnamelist_t	**nlpp,
2482 	int		options,
2483 	md_error_t	*ep
2484 )
2485 {
2486 	md_replicalist_t	*rlp = NULL;
2487 	md_replicalist_t	*rl;
2488 	mdnamelist_t		**tailpp = nlpp;
2489 	int			cnt = 0;
2490 
2491 	assert(nlpp != NULL);
2492 
2493 	if (!metaislocalset(sp))
2494 		goto out;
2495 
2496 	/* get replicas */
2497 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
2498 		cnt = -1;
2499 		goto out;
2500 	}
2501 
2502 	/* build name list */
2503 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
2504 		/*
2505 		 * Add the name struct to the end of the
2506 		 * namelist but keep a pointer to the last
2507 		 * element so that we don't incur the overhead
2508 		 * of traversing the list each time
2509 		 */
2510 		tailpp = meta_namelist_append_wrapper(
2511 			tailpp, rl->rl_repp->r_namep);
2512 		++cnt;
2513 	}
2514 
2515 	/* cleanup, return count or error */
2516 out:
2517 	metafreereplicalist(rlp);
2518 	return (cnt);
2519 }
2520