xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_db.c (revision 8461248208fabd3a8230615f8615e5bf1b4dcdcb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Just in case we're not in a build environment, make sure that
31  * TEXT_DOMAIN gets set to something.
32  */
33 #if !defined(TEXT_DOMAIN)
34 #define	TEXT_DOMAIN "SYS_TEST"
35 #endif
36 
37 /*
38  * Metadevice database interfaces.
39  */
40 
41 #define	MDDB
42 
43 #include <meta.h>
44 #include <sys/lvm/md_mddb.h>
45 #include <sys/lvm/md_crc.h>
46 #include <sys/lvm/mdio.h>
47 #include <string.h>
48 #include <strings.h>
49 #include <ctype.h>
50 
/*
 * Pairs the name of an SVM daemon with a string value associated with
 * stopping/signalling it.
 * NOTE(review): svmd_kill_val looks like a signal name ("HUP", "KILL");
 * the code consuming this table is not in this part of the file - confirm.
 */
struct svm_daemon {
	char *svmd_name;	/* daemon name */
	char *svmd_kill_val;	/* value used when signalling the daemon */
};

/* Daemons known to this file, with their per-daemon kill value. */
struct svm_daemon svmd_kill_list[] = {
		{"mdmonitord", "HUP"},
		{"mddoors", "KILL"},
	};

/* Number of entries in svmd_kill_list. */
#define	DAEMON_COUNT (sizeof (svmd_kill_list)/ sizeof (struct svm_daemon))
/* Full path of the mdmonitord binary. */
#define	MDMONITORD	"/usr/sbin/mdmonitord"

/* Defined elsewhere; called below with block == TRUE to block signals. */
extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep);
65 
66 /*
67  * meta_get_lb_inittime sends a request for the lb_inittime to the kernel
68  */
69 md_timeval32_t
70 meta_get_lb_inittime(
71 	mdsetname_t	*sp,
72 	md_error_t	*ep
73 )
74 {
75 	mddb_config_t	c;
76 
77 	(void) memset(&c, 0, sizeof (c));
78 
79 	/* Fill in setno, setname, and sideno */
80 	c.c_setno = sp->setno;
81 
82 	if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) {
83 		(void) mdstealerror(ep, &c.c_mde);
84 	}
85 
86 	return (c.c_timestamp);
87 }
88 
89 /*
90  * mkmasterblks writes out the master blocks of the mddb to the replica.
91  *
92  * In a MN diskset, this is called by the node that is adding this replica
93  * to the diskset.
94  */
95 
96 #define	MDDB_VERIFY_SIZE	8192
97 
98 static int
99 mkmasterblks(
100 	mdsetname_t	*sp,
101 	mdname_t	*np,
102 	int		fd,
103 	daddr_t		firstblk,
104 	int		dbsize,
105 	md_timeval32_t	inittime,
106 	md_error_t	*ep
107 )
108 {
109 	int		consecutive;
110 	md_timeval32_t	tp;
111 	struct mddb_mb	*mb;
112 	char		*buffer;
113 	int		iosize;
114 	md_set_desc	*sd;
115 	int		mn_set = 0;
116 	daddr_t		startblk;
117 	int		cnt;
118 	ddi_devid_t	devid;
119 
120 	if (! metaislocalset(sp)) {
121 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
122 			return (-1);
123 
124 		if (MD_MNSET_DESC(sd)) {
125 			mn_set = 1;		/* Used later */
126 		}
127 	}
128 
129 	/*
130 	 * Loop to verify the entire mddb region on disk is read/writable.
131 	 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block
132 	 * chunks.
133 	 *
134 	 * A side-effect of this loop is to zero out the entire mddb region
135 	 */
136 	if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL)
137 		return (mdsyserror(ep, ENOMEM, np->rname));
138 
139 	startblk = firstblk;
140 	for (cnt = dbsize; cnt > 0; cnt -= consecutive) {
141 
142 		if (cnt > MDDB_VERIFY_SIZE)
143 			consecutive = MDDB_VERIFY_SIZE;
144 		else
145 			consecutive = cnt;
146 
147 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
148 			Free(buffer);
149 			return (mdsyserror(ep, errno, np->rname));
150 		}
151 
152 		iosize = DEV_BSIZE * consecutive;
153 		if (write(fd, buffer, iosize) != iosize) {
154 			Free(buffer);
155 			return (mdsyserror(ep, errno, np->rname));
156 		}
157 
158 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
159 			Free(buffer);
160 			return (mdsyserror(ep, errno, np->rname));
161 		}
162 
163 		if (read(fd, buffer, iosize) != iosize) {
164 			Free(buffer);
165 			return (mdsyserror(ep, errno, np->rname));
166 		}
167 
168 		startblk += consecutive;
169 	}
170 
171 	Free(buffer);
172 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
173 		return (mdsyserror(ep, ENOMEM, np->rname));
174 
175 	if (meta_gettimeofday(&tp) == -1) {
176 		Free(mb);
177 		return (mdsyserror(ep, errno, np->rname));
178 	}
179 
180 	mb->mb_magic = MDDB_MAGIC_MB;
181 	/*
182 	 * If a MN diskset, set master block revision for a MN set.
183 	 * Even though the master block structure is no different
184 	 * for a MN set, setting the revision field to a different
185 	 * number keeps any pre-MN_diskset code from accessing
186 	 * this diskset.  It also allows for an early determination
187 	 * of a MN diskset when reading in from disk so that the
188 	 * proper size locator block and locator names structure
189 	 * can be read in thus saving time on diskset startup.
190 	 */
191 	if (mn_set)
192 		mb->mb_revision = MDDB_REV_MNMB;
193 	else
194 		mb->mb_revision = MDDB_REV_MB;
195 	mb->mb_timestamp = tp;
196 	mb->mb_setno = sp->setno;
197 	mb->mb_blkcnt = dbsize - 1;
198 	mb->mb_blkno = firstblk;
199 	mb->mb_nextblk = 0;
200 
201 	mb->mb_blkmap.m_firstblk = firstblk + 1;
202 	mb->mb_blkmap.m_consecutive = dbsize - 1;
203 	if (! metaislocalset(sp)) {
204 		mb->mb_setcreatetime = inittime;
205 	}
206 
207 	/*
208 	 * We try to save the disks device ID into the remaining bytes in
209 	 * the master block. The saved devid is used to provide a mapping
210 	 * between this disk's devid and the devid stored into the master
211 	 * block. This allows the disk image to be self-identifying
212 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
213 	 * when we try to import these disks on the remote copied image.
214 	 * If we cannot save the disks device ID onto the master block that is
215 	 * ok.  The disk is just not self-identifying and won't be importable
216 	 * in the remote copy scenario.
217 	 */
218 	if (devid_get(fd, &devid) == 0) {
219 		size_t len;
220 
221 		len = devid_sizeof(devid);
222 		if (len <= DEV_BSIZE - sizeof (*mb)) {
223 			/* there is enough space to store the devid */
224 			mb->mb_devid_magic = MDDB_MAGIC_DE;
225 			mb->mb_devid_len = len;
226 			(void) memcpy(mb->mb_devid, devid, len);
227 		}
228 		devid_free(devid);
229 	}
230 
231 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
232 	    (crc_skip_t *)NULL);
233 
234 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
235 		Free(mb);
236 		return (mdsyserror(ep, errno, np->rname));
237 	}
238 
239 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
240 		Free(mb);
241 		return (mdsyserror(ep, errno, np->rname));
242 	}
243 
244 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
245 		Free(mb);
246 		return (mdsyserror(ep, errno, np->rname));
247 	}
248 
249 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
250 		Free(mb);
251 		return (mdsyserror(ep, errno, np->rname));
252 	}
253 
254 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
255 		(uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) {
256 		Free(mb);
257 		return (mdmddberror(ep, MDE_NOTVERIFIED,
258 			meta_getminor(np->dev), sp->setno, 0, np->rname));
259 	}
260 
261 	Free(mb);
262 	return (0);
263 }
264 
265 void
266 meta_mkdummymaster(
267 	mdsetname_t	*sp,
268 	int		fd,
269 	daddr_t		firstblk
270 )
271 {
272 	md_timeval32_t	tp;
273 	struct mddb_mb	*mb;
274 	ddi_devid_t	devid;
275 	md_set_desc	*sd;
276 	md_error_t	ep = mdnullerror;
277 	md_timeval32_t	inittime;
278 
279 	/*
280 	 * No dummy master blocks are written for a MN diskset since devids
281 	 * are not supported in MN disksets.
282 	 */
283 	if (! metaislocalset(sp)) {
284 		if ((sd = metaget_setdesc(sp, &ep)) == NULL)
285 			return;
286 
287 		if (MD_MNSET_DESC(sd))
288 			return;
289 	}
290 
291 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
292 		return;
293 
294 	mb->mb_magic = MDDB_MAGIC_DU;
295 	mb->mb_revision = MDDB_REV_MB;
296 	mb->mb_setno = sp->setno;
297 	inittime = meta_get_lb_inittime(sp, &ep);
298 	mb->mb_setcreatetime = inittime;
299 
300 	if (meta_gettimeofday(&tp) != -1)
301 		mb->mb_timestamp = tp;
302 
303 	/*
304 	 * We try to save the disks device ID into the remaining bytes in
305 	 * the master block.  This allows the disk image to be self-identifying
306 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
307 	 * when we try to import these disks on the remote copied image.
308 	 * If we cannot save the disks device ID onto the master block that is
309 	 * ok.  The disk is just not self-identifying and won't be importable
310 	 * in the remote copy scenario.
311 	 */
312 	if (devid_get(fd, &devid) == 0) {
313 		int len;
314 
315 		len = devid_sizeof(devid);
316 		if (len <= DEV_BSIZE - sizeof (*mb)) {
317 			/* there is enough space to store the devid */
318 			mb->mb_devid_magic = MDDB_MAGIC_DE;
319 			mb->mb_devid_len = len;
320 			(void) memcpy(mb->mb_devid, (char *)devid, len);
321 		}
322 		devid_free(devid);
323 	}
324 
325 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
326 	    (crc_skip_t *)NULL);
327 
328 	/*
329 	 * If any of these operations fail, we need to inform the
330 	 * user that the disk won't be self identifying. When support
331 	 * for importing remotely replicated disksets is added, we
332 	 * want to add the error messages here.
333 	 */
334 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
335 		goto out;
336 
337 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE)
338 		goto out;
339 
340 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
341 		goto out;
342 
343 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE)
344 		goto out;
345 
346 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
347 	    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
348 		goto out;
349 
350 out:
351 	Free(mb);
352 }
353 
354 static int
355 buildconf(mdsetname_t *sp, md_error_t *ep)
356 {
357 	md_replicalist_t	*rlp = NULL;
358 	md_replicalist_t	*rl;
359 	FILE			*cfp = NULL;
360 	FILE			*mfp = NULL;
361 	struct stat		sbuf;
362 	int			rval = 0;
363 	int			in_miniroot = 0;
364 	char			line[MDDB_BOOTLIST_MAX_LEN];
365 	char			*tname = NULL;
366 
367 	/* get list of local replicas */
368 	if (! metaislocalset(sp))
369 		return (0);
370 
371 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
372 		return (-1);
373 
374 	/* open tempfile, copy permissions of original file */
375 	if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) {
376 		/*
377 		 * On the miniroot tmp files must be created in /var/tmp.
378 		 * If we get a EROFS error, we assume that we are in the
379 		 * miniroot.
380 		 */
381 		if (errno != EROFS)
382 			goto error;
383 		in_miniroot = 1;
384 		errno = 0;
385 		tname = tempnam("/var/tmp", "slvm_");
386 		if (tname == NULL && errno == EROFS) {
387 			/*
388 			 * If we are booted on a read-only root because
389 			 * of mddb quorum problems we don't want to emit
390 			 * any scary error messages.
391 			 */
392 			errno = 0;
393 			goto out;
394 		}
395 
396 		/* open tempfile, copy permissions of original file */
397 		if ((cfp = fopen(tname, "w+")) == NULL)
398 			goto error;
399 	}
400 	if (stat(META_DBCONF, &sbuf) == 0) {
401 		if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0)
402 			goto error;
403 		if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0)
404 			goto error;
405 	}
406 
407 	/* print header */
408 	if (fprintf(cfp, "#metadevice database location file ") == EOF)
409 		goto error;
410 	if (fprintf(cfp, "do not hand edit\n") < 0)
411 		goto error;
412 	if (fprintf(cfp,
413 		"#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0)
414 		goto error;
415 
416 	/* dump replicas */
417 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
418 		md_replica_t	*r = rl->rl_repp;
419 		int		checksum = 42;
420 		int		i;
421 		char		*devidp;
422 		minor_t		min;
423 
424 		devidp = devid_str_encode(r->r_devid, r->r_minor_name);
425 		/* If devid code can't encode devidp - skip entry */
426 		if (devidp == NULL) {
427 			continue;
428 		}
429 
430 		/* compute checksum */
431 		for (i = 0; ((r->r_driver_name[i] != '\0') &&
432 		    (i < sizeof (r->r_driver_name))); i++) {
433 			checksum -= r->r_driver_name[i];
434 		}
435 		min = meta_getminor(r->r_namep->dev);
436 		checksum -= min;
437 		checksum -= r->r_blkno;
438 
439 		for (i = 0; i < strlen(devidp); i++) {
440 			checksum -= devidp[i];
441 		}
442 		/* print info */
443 		if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n",
444 		    r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) {
445 			goto error;
446 		}
447 
448 		devid_str_free(devidp);
449 	}
450 
451 	/* close and rename to real file */
452 	if (fflush(cfp) != 0)
453 		goto error;
454 	if (fsync(fileno(cfp)) != 0)
455 		goto error;
456 	if (fclose(cfp) != 0) {
457 		cfp = NULL;
458 		goto error;
459 	}
460 	cfp = NULL;
461 
462 	/*
463 	 * Renames don't work in the miniroot since tmpfiles are
464 	 * created in /var/tmp. Hence we copy the data out.
465 	 */
466 
467 	if (! in_miniroot) {
468 		if (rename(META_DBCONFTMP, META_DBCONF) != 0)
469 			goto error;
470 	} else {
471 		if ((cfp = fopen(tname, "r")) == NULL)
472 			goto error;
473 		if ((mfp = fopen(META_DBCONF, "w+")) == NULL)
474 			goto error;
475 		while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) {
476 			if (fputs(line, mfp) == NULL)
477 				goto error;
478 		}
479 		(void) fclose(cfp);
480 		cfp = NULL;
481 		if (fflush(mfp) != 0)
482 			goto error;
483 		if (fsync(fileno(mfp)) != 0)
484 			goto error;
485 		if (fclose(mfp) != 0) {
486 			mfp = NULL;
487 			goto error;
488 		}
489 		/* delete the tempfile */
490 		(void) unlink(tname);
491 	}
492 	/* success */
493 	rval = 0;
494 	goto out;
495 
496 	/* tempfile error */
497 error:
498 	rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
499 				mdsyserror(ep, errno, META_DBCONFTMP);
500 
501 
502 	/* cleanup, return success */
503 out:
504 	if (rlp != NULL)
505 		metafreereplicalist(rlp);
506 	if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) {
507 		rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
508 					mdsyserror(ep, errno, META_DBCONFTMP);
509 	}
510 	free(tname);
511 	return (rval);
512 }
513 
514 /*
515  * check replica for dev
516  */
517 static int
518 in_replica(
519 	mdsetname_t	*sp,
520 	md_replica_t	*rp,
521 	mdname_t	*np,
522 	diskaddr_t	slblk,
523 	diskaddr_t	nblks,
524 	md_error_t	*ep
525 )
526 {
527 	mdname_t	*repnp = rp->r_namep;
528 	diskaddr_t	rep_sblk = rp->r_blkno;
529 	diskaddr_t	rep_nblks = rp->r_nblk;
530 
531 	/* should be in the same set */
532 	assert(sp != NULL);
533 
534 	/* if error in master block, assume whole partition */
535 	if ((rep_sblk == MD_DISKADDR_ERROR) ||
536 	    (rep_nblks == MD_DISKADDR_ERROR)) {
537 		rep_sblk = 0;
538 		rep_nblks = MD_DISKADDR_ERROR;
539 	}
540 
541 	/* check overlap */
542 	if (meta_check_overlap(
543 	    MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) {
544 		return (-1);
545 	}
546 
547 	/* return success */
548 	return (0);
549 }
550 
551 /*
552  * check to see if we're in a replica
553  */
554 int
555 meta_check_inreplica(
556 	mdsetname_t		*sp,
557 	mdname_t		*np,
558 	diskaddr_t		slblk,
559 	diskaddr_t		nblks,
560 	md_error_t		*ep
561 )
562 {
563 	md_replicalist_t	*rlp = NULL;
564 	md_replicalist_t	*rl;
565 	int			rval = 0;
566 
567 	/* should have a set */
568 	assert(sp != NULL);
569 
570 	/* for each replica */
571 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
572 		return (-1);
573 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
574 		md_replica_t	*rp = rl->rl_repp;
575 
576 		/* check replica */
577 		if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) {
578 			rval = -1;
579 			break;
580 		}
581 	}
582 
583 	/* cleanup, return success */
584 	metafreereplicalist(rlp);
585 	return (rval);
586 }
587 
588 /*
589  * check replica
590  */
591 int
592 meta_check_replica(
593 	mdsetname_t	*sp,		/* set to check against */
594 	mdname_t	*np,		/* component to check against */
595 	mdchkopts_t	options,	/* option flags */
596 	diskaddr_t	slblk,		/* start logical block */
597 	diskaddr_t	nblks,		/* number of blocks (-1,rest of them) */
598 	md_error_t	*ep		/* error packet */
599 )
600 {
601 	mdchkopts_t	chkoptions = MDCHK_ALLOW_REPSLICE;
602 
603 	/* make sure we have a disk */
604 	if (metachkcomp(np, ep) != 0)
605 		return (-1);
606 
607 	/* check to ensure that it is not already in use */
608 	if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) {
609 		return (-1);
610 	}
611 
612 	if (options & MDCHK_ALLOW_NODBS)
613 		return (0);
614 
615 	if (options & MDCHK_DRVINSET)
616 		return (0);
617 
618 	/* make sure it is in the set */
619 	if (meta_check_inset(sp, np, ep) != 0)
620 		return (-1);
621 
622 	/* make sure its not in a metadevice */
623 	if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0)
624 		return (-1);
625 
626 	/* return success */
627 	return (0);
628 }
629 
630 static int
631 update_dbinfo_on_drives(
632 	mdsetname_t	*sp,
633 	md_drive_desc	*dd,
634 	int		set_locked,
635 	int		force,
636 	md_error_t	*ep
637 )
638 {
639 	md_set_desc		*sd;
640 	int			i;
641 	md_setkey_t		*cl_sk;
642 	int			rval = 0;
643 	md_mnnode_desc		*nd;
644 
645 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
646 		return (-1);
647 
648 	if (! set_locked) {
649 		if (MD_MNSET_DESC(sd)) {
650 			md_error_t xep = mdnullerror;
651 			sigset_t sigs;
652 			/* Make sure we are blocking all signals */
653 			if (procsigs(TRUE, &sigs, &xep) < 0)
654 				mdclrerror(&xep);
655 
656 			nd = sd->sd_nodelist;
657 			while (nd) {
658 				if (force && strcmp(nd->nd_nodename,
659 				    mynode()) != 0) {
660 					nd = nd->nd_next;
661 					continue;
662 				}
663 
664 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
665 					nd = nd->nd_next;
666 					continue;
667 				}
668 
669 				if (clnt_lock_set(nd->nd_nodename, sp, ep))
670 					return (-1);
671 				nd = nd->nd_next;
672 			}
673 		} else {
674 			for (i = 0; i < MD_MAXSIDES; i++) {
675 				/* Skip empty slots */
676 				if (sd->sd_nodes[i][0] == '\0')
677 					continue;
678 
679 				if (force && strcmp(sd->sd_nodes[i],
680 				    mynode()) != 0)
681 					continue;
682 
683 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep))
684 					return (-1);
685 			}
686 		}
687 	}
688 
689 	if (MD_MNSET_DESC(sd)) {
690 		nd = sd->sd_nodelist;
691 		while (nd) {
692 			if (force && strcmp(nd->nd_nodename, mynode()) != 0) {
693 				nd = nd->nd_next;
694 				continue;
695 			}
696 
697 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
698 				nd = nd->nd_next;
699 				continue;
700 			}
701 
702 			if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep)
703 			    == -1) {
704 				rval = -1;
705 				break;
706 			}
707 			nd = nd->nd_next;
708 		}
709 	} else {
710 		for (i = 0; i < MD_MAXSIDES; i++) {
711 			/* Skip empty slots */
712 			if (sd->sd_nodes[i][0] == '\0')
713 				continue;
714 
715 			if (force && strcmp(sd->sd_nodes[i], mynode()) != 0)
716 				continue;
717 
718 			if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep)
719 			    == -1) {
720 				rval = -1;
721 				break;
722 			}
723 		}
724 	}
725 
726 	if (! set_locked) {
727 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
728 		if (MD_MNSET_DESC(sd)) {
729 			nd = sd->sd_nodelist;
730 			while (nd) {
731 				if (force &&
732 				    strcmp(nd->nd_nodename, mynode()) != 0) {
733 					nd = nd->nd_next;
734 					continue;
735 				}
736 
737 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
738 					nd = nd->nd_next;
739 					continue;
740 				}
741 
742 				if (clnt_unlock_set(nd->nd_nodename, cl_sk,
743 				    ep)) {
744 					rval = -1;
745 					break;
746 				}
747 				nd = nd->nd_next;
748 			}
749 		} else {
750 			for (i = 0; i < MD_MAXSIDES; i++) {
751 				/* Skip empty slots */
752 				if (sd->sd_nodes[i][0] == '\0')
753 					continue;
754 
755 				if (force &&
756 				    strcmp(sd->sd_nodes[i], mynode()) != 0)
757 					continue;
758 
759 				if (clnt_unlock_set(sd->sd_nodes[i], cl_sk,
760 				    ep)) {
761 					rval = -1;
762 					break;
763 				}
764 			}
765 
766 		}
767 		cl_set_setkey(NULL);
768 	}
769 
770 	return (rval);
771 }
772 
773 int
774 meta_db_addsidenms(
775 	mdsetname_t	*sp,
776 	mdname_t	*np,
777 	daddr_t		blkno,
778 	int		bcast,
779 	md_error_t	*ep
780 )
781 {
782 	side_t		sideno;
783 	char		*bname = NULL;
784 	char		*dname = NULL;
785 	minor_t		mnum;
786 	mddb_config_t	c;
787 	int		done;
788 	int		rval = 0;
789 	md_set_desc	*sd;
790 
791 	sideno = MD_SIDEWILD;
792 	/*CONSTCOND*/
793 	while (1) {
794 		if (bname != NULL) {
795 			Free(bname);
796 			bname = NULL;
797 		}
798 		if (dname != NULL) {
799 			Free(dname);
800 			dname = NULL;
801 		}
802 		if ((done = meta_getnextside_devinfo(sp, np->bname,
803 		    &sideno, &bname, &dname, &mnum, ep)) == -1) {
804 			rval = -1;
805 			break;
806 		}
807 
808 		if (done == 0)
809 			break;
810 
811 		if (! metaislocalset(sp)) {
812 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
813 				rval = -1;
814 				break;
815 			}
816 		}
817 
818 		/*
819 		 * Send addsidenms to all nodes using rpc.mdcommd if
820 		 * sidename is being added to MN diskset.
821 		 *
822 		 *   It's ok to broadcast this call to other nodes.
823 		 *
824 		 *   Note: The broadcast to other nodes isn't needed during
825 		 *   the addition of the first mddbs to the set since the
826 		 *   other nodes haven't been joined to the set yet.  All
827 		 *   nodes in a MN diskset are (implicitly) joined to the set
828 		 *   on the addition of the first mddb.
829 		 */
830 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
831 		    (bcast == DB_ADDSIDENMS_BCAST)) {
832 			md_mn_result_t			*resultp = NULL;
833 			md_mn_msg_meta_db_newside_t	db_ns;
834 			int				send_rval;
835 
836 			db_ns.msg_l_dev = np->dev;
837 			db_ns.msg_sideno = sideno;
838 			db_ns.msg_blkno = blkno;
839 			(void) strncpy(db_ns.msg_dname, dname,
840 			    sizeof (db_ns.msg_dname));
841 			(void) splitname(np->bname, &db_ns.msg_splitname);
842 			db_ns.msg_mnum = mnum;
843 
844 			/* Set devid to NULL until devids are supported */
845 			db_ns.msg_devid[0] = NULL;
846 
847 			/*
848 			 * If reconfig cycle has been started, this node is
849 			 * stuck in in the return step until this command has
850 			 * completed.  If mdcommd is suspended, ask
851 			 * send_message to fail (instead of retrying)
852 			 * so that metaset can finish allowing the reconfig
853 			 * cycle to proceed.
854 			 */
855 			send_rval = mdmn_send_message(sp->setno,
856 			    MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
857 			    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ns,
858 			    sizeof (md_mn_msg_meta_db_newside_t),
859 			    &resultp, ep);
860 			if (send_rval != 0) {
861 				rval = -1;
862 				if (resultp == NULL)
863 					(void) mddserror(ep,
864 					    MDE_DS_COMMD_SEND_FAIL,
865 					    sp->setno, NULL, NULL,
866 					    sp->setname);
867 				else {
868 					(void) mdstealerror(ep,
869 					    &(resultp->mmr_ep));
870 					if (mdisok(ep)) {
871 						(void) mddserror(ep,
872 						    MDE_DS_COMMD_SEND_FAIL,
873 						    sp->setno, NULL, NULL,
874 						    sp->setname);
875 					}
876 					free_result(resultp);
877 				}
878 				break;
879 			}
880 			if (resultp)
881 				free_result(resultp);
882 		} else {
883 			/*
884 			 * Let this side's  device name, minor # and driver name
885 			 * be known to the database replica.
886 			 */
887 			(void) memset(&c, 0, sizeof (c));
888 
889 			/* Fill in device/replica info */
890 			c.c_locator.l_dev = meta_cmpldev(np->dev);
891 			c.c_locator.l_blkno = blkno;
892 			(void) strncpy(c.c_locator.l_driver, dname,
893 			    sizeof (c.c_locator.l_driver));
894 			(void) splitname(bname, &c.c_devname);
895 			c.c_locator.l_mnum = mnum;
896 
897 			/* Fill in setno, setname, and sideno */
898 			c.c_setno = sp->setno;
899 			(void) strncpy(c.c_setname, sp->setname,
900 				sizeof (c.c_setname));
901 			c.c_sideno = sideno;
902 
903 			/*
904 			 * Don't need device id information from this ioctl
905 			 * Kernel determines device id from dev_t, which
906 			 * is just what this code would do.
907 			 */
908 			c.c_locator.l_devid = (uint64_t)0;
909 			c.c_locator.l_devid_flags = 0;
910 
911 			if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) {
912 				rval = mdstealerror(ep, &c.c_mde);
913 				break;
914 			}
915 		}
916 	}
917 
918 	/* cleanup, return success */
919 	if (bname != NULL) {
920 		Free(bname);
921 		bname = NULL;
922 	}
923 	if (dname != NULL) {
924 		Free(dname);
925 		dname = NULL;
926 	}
927 	return (rval);
928 }
929 
930 
931 int
932 meta_db_delsidenm(
933 	mdsetname_t	*sp,
934 	side_t		sideno,
935 	mdname_t	*np,
936 	daddr_t		blkno,
937 	md_error_t	*ep
938 )
939 {
940 	mddb_config_t	c;
941 	md_set_desc	*sd;
942 
943 	if (! metaislocalset(sp)) {
944 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
945 			return (-1);
946 	}
947 	/* Use rpc.mdcommd to delete mddb side from all nodes */
948 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
949 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
950 		md_mn_result_t			*resultp = NULL;
951 		md_mn_msg_meta_db_delside_t	db_ds;
952 		int				send_rval;
953 
954 		db_ds.msg_l_dev = np->dev;
955 		db_ds.msg_blkno = blkno;
956 		db_ds.msg_sideno = sideno;
957 
958 		/* Set devid to NULL until devids are supported */
959 		db_ds.msg_devid[0] = NULL;
960 
961 		/*
962 		 * If reconfig cycle has been started, this node is
963 		 * stuck in in the return step until this command has
964 		 * completed.  If mdcommd is suspended, ask
965 		 * send_message to fail (instead of retrying)
966 		 * so that metaset can finish allowing the reconfig
967 		 * cycle to proceed.
968 		 */
969 		send_rval = mdmn_send_message(sp->setno,
970 		    MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
971 		    MD_MSGF_PANIC_WHEN_INCONSISTENT, (char *)&db_ds,
972 		    sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
973 		if (send_rval != 0) {
974 			if (resultp == NULL)
975 				(void) mddserror(ep,
976 				    MDE_DS_COMMD_SEND_FAIL,
977 				    sp->setno, NULL, NULL,
978 				    sp->setname);
979 			else {
980 				(void) mdstealerror(ep, &(resultp->mmr_ep));
981 				if (mdisok(ep)) {
982 					(void) mddserror(ep,
983 					    MDE_DS_COMMD_SEND_FAIL,
984 					    sp->setno, NULL, NULL,
985 					    sp->setname);
986 				}
987 				free_result(resultp);
988 			}
989 			return (-1);
990 		}
991 		if (resultp)
992 			free_result(resultp);
993 
994 	} else {
995 		/*
996 		 * Let this side's  device name, minor # and driver name
997 		 * be known to the database replica.
998 		 */
999 		(void) memset(&c, 0, sizeof (c));
1000 
1001 		/* Fill in device/replica info */
1002 		c.c_locator.l_dev = meta_cmpldev(np->dev);
1003 		c.c_locator.l_blkno = blkno;
1004 
1005 		/* Fill in setno, setname, and sideno */
1006 		c.c_setno = sp->setno;
1007 		(void) strcpy(c.c_setname, sp->setname);
1008 		c.c_sideno = sideno;
1009 
1010 		/*
1011 		 * Don't need device id information from this ioctl
1012 		 * Kernel determines device id from dev_t, which
1013 		 * is just what this code would do.
1014 		 */
1015 		c.c_locator.l_devid = (uint64_t)0;
1016 		c.c_locator.l_devid_flags = 0;
1017 
1018 		if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0)
1019 			return (mdstealerror(ep, &c.c_mde));
1020 	}
1021 	return (0);
1022 }
1023 
1024 
1025 static int
1026 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep)
1027 {
1028 	mdnamelist_t		*dnp1, *dnp2;
1029 
1030 	for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) {
1031 		for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) {
1032 			if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0)
1033 				return (mderror(ep, MDE_DUPDRIVE,
1034 				    dnp1->namep->cname));
1035 		}
1036 	}
1037 	return (0);
1038 }
1039 
1040 
/*
 * Read exactly len bytes from the start of file path into buf.
 * Returns 0 on success, -1 if the file cannot be opened or delivers
 * fewer than len bytes.
 */
static int
filediff_slurp(char *path, char *buf, size_t len)
{
	size_t	got = 0;
	ssize_t	n;
	int	fd;

	if ((fd = open(path, O_RDONLY)) == -1)
		return (-1);

	/* read() may legitimately return less than asked for; keep going */
	while (got < len) {
		n = read(fd, buf + got, len - got);
		if (n <= 0)
			break;
		got += (size_t)n;
	}
	(void) close(fd);

	return ((got == len) ? 0 : -1);
}

/*
 * Return 1 if files are different, else return 0
 *
 * Files that cannot be examined (stat, open, read or memory allocation
 * failure) are reported as different.  Two empty files are identical.
 */
static int
filediff(char *tsname, char *sname)
{
	int		ret = 1;
	size_t		sz;
	struct stat	tsbuf;
	struct stat	sbuf;
	char		*tbuf = NULL;
	char		*buf = NULL;

	if (stat(tsname, &tsbuf) != 0)
		return (1);
	if (stat(sname, &sbuf) != 0)
		return (1);

	/* compare the sizes in their native type, before any narrowing */
	if (tsbuf.st_size != sbuf.st_size)
		return (1);
	sz = (size_t)tsbuf.st_size;

	/* nothing to compare; also avoids depending on malloc(0) */
	if (sz == 0)
		return (0);

	/* allocate memory and read both files into buffer */
	tbuf = malloc(sz);
	buf = malloc(sz);
	if (tbuf == NULL || buf == NULL)
		goto out;

	if (filediff_slurp(tsname, tbuf, sz) != 0)
		goto out;
	if (filediff_slurp(sname, buf, sz) != 0)
		goto out;

	/* compare content; normalize the result to the documented 0/1 */
	ret = (memcmp(tbuf, buf, sz) != 0) ? 1 : 0;
out:
	free(tbuf);
	free(buf);
	return (ret);
}
1092 
1093 /*
1094  * patch md.conf file with mddb locations
1095  */
1096 int
1097 meta_db_patch(
1098 	char		*sname,		/* system file name */
1099 	char		*cname,		/* mddb.cf file name */
1100 	int		patch,		/* patching locally */
1101 	md_error_t	*ep
1102 )
1103 {
1104 	char		*tsname = NULL;
1105 	char		line[MDDB_BOOTLIST_MAX_LEN];
1106 	FILE		*tsfp = NULL;
1107 	FILE		*mfp = NULL;
1108 	int		rval = -1;
1109 
1110 	/* check names */
1111 	if (sname == NULL) {
1112 		if (patch)
1113 			sname = "md.conf";
1114 		else
1115 			sname = "/kernel/drv/md.conf";
1116 	}
1117 	if (cname == NULL)
1118 		cname = META_DBCONF;
1119 
1120 	/*
1121 	 * edit file
1122 	 */
1123 	if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) {
1124 		if (mdissyserror(ep, EROFS)) {
1125 			/*
1126 			 * If we are booted on a read-only root because
1127 			 * of mddb quorum problems we don't want to emit
1128 			 * any scary error messages.
1129 			 */
1130 			mdclrerror(ep);
1131 			rval = 0;
1132 		}
1133 		goto out;
1134 	}
1135 
1136 	if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0,
1137 	    ep) != 0)
1138 		goto out;
1139 
1140 	/* if file content is identical, skip rename */
1141 	if (filediff(tsname, sname) == 0) {
1142 		rval = 0;
1143 		goto out;
1144 	}
1145 
1146 	if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) ||
1147 					    (fclose(tsfp) != 0)) {
1148 		(void) mdsyserror(ep, errno, tsname);
1149 		goto out;
1150 	}
1151 
1152 	tsfp = NULL;
1153 
1154 	/*
1155 	 * rename file. If we get a Cross Device error then it
1156 	 * is because we are in the miniroot.
1157 	 */
1158 	if (rename(tsname, sname) != 0 && errno != EXDEV) {
1159 		(void) mdsyserror(ep, errno, sname);
1160 		goto out;
1161 	}
1162 
1163 	if (errno == EXDEV) {
1164 		if ((tsfp = fopen(tsname, "r")) == NULL)
1165 			goto out;
1166 		if ((mfp = fopen(sname, "w+")) == NULL)
1167 			goto out;
1168 		while (fgets(line, sizeof (line), tsfp) != NULL) {
1169 			if (fputs(line, mfp) == NULL)
1170 				goto out;
1171 		}
1172 		(void) fclose(tsfp);
1173 		tsfp = NULL;
1174 		if (fflush(mfp) != 0)
1175 			goto out;
1176 		if (fsync(fileno(mfp)) != 0)
1177 			goto out;
1178 		if (fclose(mfp) != 0) {
1179 			mfp = NULL;
1180 			goto out;
1181 		}
1182 	}
1183 
1184 	Free(tsname);
1185 	tsname = NULL;
1186 	rval = 0;
1187 
1188 	/* cleanup, return error */
1189 out:
1190 	if (tsfp != NULL)
1191 		(void) fclose(tsfp);
1192 	if (tsname != NULL) {
1193 		(void) unlink(tsname);
1194 		Free(tsname);
1195 	}
1196 	return (rval);
1197 }
1198 
1199 /*
1200  * Add replicas to set.  This happens as a result of:
1201  *	- metadb [-s set_name] -a
1202  *	- metaset -s set_name -a disk
1203  *	- metaset -s set_name -d disk	 (causes a rebalance of mddbs)
1204  *	- metaset -s set_name -b
1205  *
1206  * For a local set, this routine is run on the local set host.
1207  *
1208  * For a traditional diskset, this routine is run on the node that
1209  * is running the metaset command.
1210  *
1211  * For a multinode diskset, this routine is run by the node that is
1212  * running the metaset command.  If this is the first mddb added to
1213  * the MN diskset, then no communication is made to other nodes via commd
1214  * since the other nodes will be in-sync with respect to the mddbs when
1215  * those other nodes join the set and snarf in the newly created mddb.
1216  * If this is not the first mddb added to the MN diskset, then this
1217  * attach command is sent to all of the nodes using commd.  This keeps
1218  * the nodes in-sync.
1219  */
1220 int
1221 meta_db_attach(
1222 	mdsetname_t		*sp,
1223 	mdnamelist_t		*db_nlp,
1224 	mdchkopts_t		options,
1225 	md_timeval32_t		*timeval,
1226 	int			dbcnt,
1227 	int			dbsize,
1228 	char			*sysfilename,
1229 	md_error_t		*ep
1230 )
1231 {
1232 	struct mddb_config	c;
1233 	mdnamelist_t		*nlp;
1234 	mdname_t		*np;
1235 	md_drive_desc		*dd = NULL;
1236 	md_drive_desc		*p;
1237 	int			i;
1238 	int			fd;
1239 	side_t			sideno;
1240 	daddr_t			blkno;
1241 	int			replicacount = 0;
1242 	int			start_mdmonitord = 0;
1243 	int			rval = 0;
1244 	md_error_t		status = mdnullerror;
1245 	md_set_desc		*sd;
1246 	int			stale_bool = FALSE;
1247 	int			flags;
1248 	int			firstmddb = 1;
1249 	md_timeval32_t		inittime = {0, 0};
1250 
1251 	/*
1252 	 * Error if we don't get some work to do.
1253 	 */
1254 	if (db_nlp == NULL)
1255 		return (mdsyserror(ep, EINVAL, NULL));
1256 
1257 	if (mdnamesareunique(db_nlp, ep) != 0)
1258 		return (-1);
1259 	(void) memset(&c, 0, sizeof (c));
1260 	c.c_id = 0;
1261 	c.c_setno = sp->setno;
1262 
1263 	/* Don't need device id information from this ioctl */
1264 	c.c_locator.l_devid = (uint64_t)0;
1265 	c.c_locator.l_devid_flags = 0;
1266 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1267 		if (metaislocalset(sp)) {
1268 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID))
1269 				mdclrerror(&c.c_mde);
1270 			else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) ||
1271 			    (! (options & MDCHK_ALLOW_NODBS)))
1272 				return (mdstealerror(ep, &c.c_mde));
1273 		} else {
1274 			if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER))
1275 				return (mdstealerror(ep, &c.c_mde));
1276 		}
1277 		mdclrerror(&c.c_mde);
1278 	}
1279 	/*
1280 	 * Is current set STALE?
1281 	 */
1282 	if (c.c_flags & MDDB_C_STALE) {
1283 		stale_bool = TRUE;
1284 	}
1285 
1286 	assert(db_nlp != NULL);
1287 
1288 	/* if creating the metadbs for the first time start mdmonitord */
1289 	if (c.c_dbcnt == 0)
1290 		start_mdmonitord = 1;
1291 
1292 	/*
1293 	 * check to see if we will go over the total possible number
1294 	 * of data bases
1295 	 */
1296 	nlp = db_nlp;
1297 	while (nlp) {
1298 		replicacount += dbcnt;
1299 		nlp = nlp->next;
1300 	}
1301 
1302 	if ((replicacount + c.c_dbcnt) > c.c_dbmax)
1303 		return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
1304 		    sp->setno, c.c_dbcnt + replicacount, NULL));
1305 
1306 	/*
1307 	 * go through and check to make sure all locations specified
1308 	 * are legal also pick out driver name;
1309 	 */
1310 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1311 		diskaddr_t devsize;
1312 
1313 		np = nlp->namep;
1314 
1315 		if (! metaislocalset(sp)) {
1316 			uint_t	partno;
1317 			uint_t	rep_partno;
1318 			mddrivename_t	*dnp = np->drivenamep;
1319 
1320 			/*
1321 			 * make sure that non-local database replicas
1322 			 * are always on the replica slice.
1323 			 */
1324 			if (meta_replicaslice(dnp,
1325 			    &rep_partno, ep) != 0)
1326 				return (-1);
1327 			if (metagetvtoc(np, FALSE, &partno, ep) == NULL)
1328 				return (-1);
1329 			if (partno != rep_partno)
1330 				return (mddeverror(ep, MDE_REPCOMP_ONLY,
1331 				    np->dev, sp->setname));
1332 		}
1333 
1334 		if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize),
1335 		    ep)) {
1336 			return (-1);
1337 		}
1338 
1339 		if ((devsize = metagetsize(np, ep)) == -1)
1340 			return (-1);
1341 
1342 		if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16))
1343 			return (mdmddberror(ep, MDE_REPLICA_TOOSMALL,
1344 			    meta_getminor(np->dev), sp->setno, devsize,
1345 			    np->cname));
1346 	}
1347 
1348 	/*
1349 	 * If first disk in set we don't have lb_inittime yet for use as
1350 	 * mb_setcreatetime so don't go looking for it. WE'll come back
1351 	 * later and update after the locator block has been created.
1352 	 * If this isn't the first disk in the set, we have a locator
1353 	 * block and thus we have lb_inittime. Set mb_setcreatetime to
1354 	 * lb_inittime.
1355 	 */
1356 	if (! metaislocalset(sp)) {
1357 		if (c.c_dbcnt != 0) {
1358 			firstmddb = 0;
1359 			inittime = meta_get_lb_inittime(sp, ep);
1360 		}
1361 	}
1362 
1363 	/*
1364 	 * go through and write all master blocks
1365 	 */
1366 
1367 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1368 		np = nlp->namep;
1369 
1370 		if ((fd = open(np->rname, O_RDWR)) < 0)
1371 			return (mdsyserror(ep, errno, np->rname));
1372 
1373 		for (i = 0; i < dbcnt; i++) {
1374 			if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize,
1375 			    inittime, ep)) {
1376 				(void) close(fd);
1377 				return (-1);
1378 			}
1379 		}
1380 		(void) close(fd);
1381 	}
1382 
1383 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1384 		return (-1);
1385 
1386 	if (! metaislocalset(sp)) {
1387 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1388 		if (! mdisok(ep))
1389 			return (-1);
1390 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1391 			return (-1);
1392 
1393 	}
1394 
1395 	/*
1396 	 * go through and tell kernel to add them
1397 	 */
1398 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1399 		mdcinfo_t	*cinfo;
1400 
1401 		np = nlp->namep;
1402 
1403 		if ((cinfo = metagetcinfo(np, ep)) == NULL) {
1404 			rval = -1;
1405 			goto out;
1406 		}
1407 
1408 		/*
1409 		 * If mddb is being added to MN diskset and there already
1410 		 * exists a valid mddb in the set (which equates to this
1411 		 * node being an owner of the set) then use rpc.mdcommd
1412 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1413 		 * If set is stale, don't log the message since rpc.mdcommd
1414 		 * can't write the message to the mddb.
1415 		 *
1416 		 * Otherwise, just add mddb to this node.
1417 		 */
1418 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1419 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1420 			md_mn_result_t			*resultp = NULL;
1421 			md_mn_msg_meta_db_attach_t	attach;
1422 			int 				send_rval;
1423 
1424 			/*
1425 			 * In a scenario where new replicas had been added on
1426 			 * the master, and then all of the old replicas failed
1427 			 * before the slaves had knowledge of the new replicas,
1428 			 * the slaves are unable to re-parse in the mddb
1429 			 * from the new replicas since the slaves have no
1430 			 * knowledge of the new replicas.  The following
1431 			 * algorithm solves this problem:
1432 			 * 	- META_DB_ATTACH message generates submsgs
1433 			 * 		- BLOCK parse (master)
1434 			 * 		- MDDB_ATTACH new replicas
1435 			 * 		- UNBLOCK parse (master) causing parse
1436 			 *		information to be sent from master
1437 			 *		to slaves at a higher class than the
1438 			 *		unblock so the parse message will
1439 			 *		reach slaves before unblock message.
1440 			 */
1441 			attach.msg_l_dev = np->dev;
1442 			attach.msg_cnt = dbcnt;
1443 			attach.msg_dbsize = dbsize;
1444 			(void) strncpy(attach.msg_dname, cinfo->dname,
1445 			    sizeof (attach.msg_dname));
1446 			(void) splitname(np->bname, &attach.msg_splitname);
1447 			attach.msg_options = options;
1448 
1449 			/* Set devid to NULL until devids are supported */
1450 			attach.msg_devid[0] = NULL;
1451 
1452 			/*
1453 			 * If reconfig cycle has been started, this node is
1454 			 * stuck in in the return step until this command has
1455 			 * completed.  If mdcommd is suspended, ask
1456 			 * send_message to fail (instead of retrying)
1457 			 * so that metaset can finish allowing the reconfig
1458 			 * cycle to proceed.
1459 			 */
1460 			flags = MD_MSGF_FAIL_ON_SUSPEND;
1461 			if (stale_bool == TRUE)
1462 				flags |= MD_MSGF_NO_LOG;
1463 			send_rval = mdmn_send_message(sp->setno,
1464 				MD_MN_MSG_META_DB_ATTACH,
1465 				flags, (char *)&attach,
1466 				sizeof (md_mn_msg_meta_db_attach_t),
1467 				&resultp, ep);
1468 			if (send_rval != 0) {
1469 				rval = -1;
1470 				if (resultp == NULL)
1471 					(void) mddserror(ep,
1472 					    MDE_DS_COMMD_SEND_FAIL,
1473 					    sp->setno, NULL, NULL,
1474 					    sp->setname);
1475 				else {
1476 					(void) mdstealerror(ep,
1477 					    &(resultp->mmr_ep));
1478 					if (mdisok(ep)) {
1479 						(void) mddserror(ep,
1480 						    MDE_DS_COMMD_SEND_FAIL,
1481 						    sp->setno, NULL, NULL,
1482 						    sp->setname);
1483 					}
1484 					free_result(resultp);
1485 				}
1486 				goto out;
1487 			}
1488 			if (resultp)
1489 				free_result(resultp);
1490 		} else {
1491 		    /* Adding mddb(s) to just this node */
1492 		    for (i = 0; i < dbcnt; i++) {
1493 			(void) memset(&c, 0, sizeof (c));
1494 			/* Fill in device/replica info */
1495 			c.c_locator.l_dev = meta_cmpldev(np->dev);
1496 			c.c_locator.l_blkno = i * dbsize + 16;
1497 			blkno = c.c_locator.l_blkno;
1498 			(void) strncpy(c.c_locator.l_driver, cinfo->dname,
1499 			    sizeof (c.c_locator.l_driver));
1500 			(void) splitname(np->bname, &c.c_devname);
1501 			c.c_locator.l_mnum = meta_getminor(np->dev);
1502 
1503 			/* Fill in setno, setname, and sideno */
1504 			c.c_setno = sp->setno;
1505 			if (! metaislocalset(sp)) {
1506 				if (MD_MNSET_DESC(sd)) {
1507 					c.c_multi_node = 1;
1508 				}
1509 			}
1510 			(void) strcpy(c.c_setname, sp->setname);
1511 			c.c_sideno = sideno;
1512 
1513 			/*
1514 			 * Don't need device id information from this ioctl
1515 			 * Kernel determines device id from dev_t, which
1516 			 * is just what this code would do.
1517 			 */
1518 			c.c_locator.l_devid = (uint64_t)0;
1519 			c.c_locator.l_devid_flags = 0;
1520 
1521 			if (timeval != NULL)
1522 				c.c_timestamp = *timeval;
1523 
1524 			if (setup_med_cfg(sp, &c, (options & MDCHK_SET_FORCE),
1525 			    ep)) {
1526 				rval = -1;
1527 				goto out;
1528 			}
1529 
1530 			if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde, NULL) != 0) {
1531 				rval = mdstealerror(ep, &c.c_mde);
1532 				goto out;
1533 			}
1534 			/*
1535 			 * This is either a traditional diskset OR this
1536 			 * is the first replica added to a MN diskset.
1537 			 * In either case, set broadcast to NO_BCAST so
1538 			 * that message won't go through rpc.mdcommd.
1539 			 * If this is a traditional diskset, the bcast
1540 			 * flag is ignored since traditional disksets
1541 			 * don't use the rpc.mdcommd.
1542 			 */
1543 			if (meta_db_addsidenms(sp, np, blkno,
1544 			    DB_ADDSIDENMS_NO_BCAST, ep))
1545 				goto out;
1546 		    }
1547 		}
1548 		if (! metaislocalset(sp)) {
1549 			/* update the dbcnt and size in dd */
1550 			for (p = dd; p != NULL; p = p->dd_next)
1551 				if (p->dd_dnp == np->drivenamep) {
1552 					p->dd_dbcnt = dbcnt;
1553 					p->dd_dbsize  = dbsize;
1554 					break;
1555 				}
1556 		}
1557 
1558 		/*
1559 		 * If this was the first addition of disks to the
1560 		 * diskset you now need to update the mb_setcreatetime
1561 		 * which needed lb_inittime which wasn't there until now.
1562 		 */
1563 		if (firstmddb) {
1564 			if (meta_update_mb(sp, dd, ep) != 0) {
1565 				return (-1);
1566 			}
1567 		}
1568 		(void) close(fd);
1569 	}
1570 
1571 out:
1572 	if (metaislocalset(sp)) {
1573 
1574 		/* everything looks fine. Start mdmonitord */
1575 		/* Note: popen/pclose is the MT-safe replacement for system */
1576 		if (rval == 0 && start_mdmonitord  == 1) {
1577 			if (pclose(popen(MDMONITORD, "w")) == -1)
1578 				md_perror(MDMONITORD);
1579 
1580 			if (meta_smf_enable(META_SMF_CORE, &status) == -1) {
1581 				mde_perror(&status, "");
1582 				mdclrerror(&status);
1583 			}
1584 		}
1585 
1586 		if (buildconf(sp, &status)) {
1587 			/* Don't mask any previous errors */
1588 			if (rval == 0)
1589 				rval = mdstealerror(ep, &status);
1590 			return (rval);
1591 		}
1592 
1593 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
1594 			/* Don't mask any previous errors */
1595 			if (rval == 0)
1596 				rval = mdstealerror(ep, &status);
1597 		}
1598 	} else {
1599 		if (update_dbinfo_on_drives(sp, dd,
1600 		    (options & MDCHK_SET_LOCKED),
1601 		    (options & MDCHK_SET_FORCE),
1602 		    &status)) {
1603 			/* Don't mask any previous errors */
1604 			if (rval == 0)
1605 				rval = mdstealerror(ep, &status);
1606 			else
1607 				mdclrerror(&status);
1608 		}
1609 		metafreedrivedesc(&dd);
1610 	}
1611 	/*
1612 	 * For MN disksets that already had already had nodes joined
1613 	 * before the attach of this mddb(s), the name invalidation is
1614 	 * done by the commd handler routine.  Otherwise, if this
1615 	 * is the first attach of a MN diskset mddb, the invalidation
1616 	 * must be done here since the first attach cannot be sent
1617 	 * via the commd since there are no nodes joined to the set yet.
1618 	 */
1619 	if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) ||
1620 	    (MD_MNSET_DESC(sd) &&
1621 	    (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) {
1622 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
1623 			meta_invalidate_name(nlp->namep);
1624 		}
1625 	}
1626 	return (rval);
1627 }
1628 
1629 /*
1630  * deletelist_length
1631  *
1632  *	return the number of slices that have been specified for deletion
1633  *	on the metadb command line.  This does not calculate the number
1634  *	of replicas because there may be multiple replicas per slice.
1635  */
1636 static int
1637 deletelist_length(mdnamelist_t *db_nlp)
1638 {
1639 
1640 	mdnamelist_t		*nlp;
1641 	int			list_length = 0;
1642 
1643 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1644 		list_length++;
1645 	}
1646 
1647 	return (list_length);
1648 }
1649 
1650 static int
1651 in_deletelist(char *devname, mdnamelist_t *db_nlp)
1652 {
1653 
1654 	mdnamelist_t		*nlp;
1655 	mdname_t		*np;
1656 	int			index = 0;
1657 
1658 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1659 		np = nlp->namep;
1660 
1661 		if (strcmp(devname, np->bname) == 0)
1662 			return (index);
1663 		index++;
1664 	}
1665 
1666 	return (-1);
1667 }
1668 
1669 /*
1670  * Delete replicas from set.  This happens as a result of:
1671  *	- metadb [-s set_name] -d
1672  *	- metaset -s set_name -a disk	(causes a rebalance of mddbs)
1673  *	- metaset -s set_name -d disk
1674  *	- metaset -s set_name -b
1675  *
1676  * For a local set, this routine is run on the local set host.
1677  *
1678  * For a traditional diskset, this routine is run on the node that
1679  * is running the metaset command.
1680  *
1681  * For a multinode diskset, this routine is run by the node that is
1682  * running the metaset command.  This detach routine is sent to all
1683  * of the joined nodes in the diskset using commd.  This keeps
1684  * the nodes in-sync.
1685  */
1686 int
1687 meta_db_detach(
1688 	mdsetname_t		*sp,
1689 	mdnamelist_t		*db_nlp,
1690 	mdforceopts_t		force_option,
1691 	char			*sysfilename,
1692 	md_error_t		*ep
1693 )
1694 {
1695 	struct mddb_config	c;
1696 	mdnamelist_t		*nlp;
1697 	mdname_t		*np;
1698 	md_drive_desc		*dd = NULL;
1699 	md_drive_desc		*p;
1700 	int			replicacount;
1701 	int			replica_delete_count;
1702 	int			nr_replica_slices;
1703 	int			i;
1704 	int			stop_svmdaemons = 0;
1705 	int			rval = 0;
1706 	int			index;
1707 	int			valid_replicas_nottodelete = 0;
1708 	int			invalid_replicas_nottodelete = 0;
1709 	int			invalid_replicas_todelete = 0;
1710 	int			errored = 0;
1711 	int			*tag_array;
1712 	int			fd = -1;
1713 	md_error_t		status = mdnullerror;
1714 	md_set_desc		*sd;
1715 	int			stale_bool = FALSE;
1716 	int			flags;
1717 
1718 	/*
1719 	 * Error if we don't get some work to do.
1720 	 */
1721 	if (db_nlp == NULL)
1722 		return (mdsyserror(ep, EINVAL, NULL));
1723 
1724 	if (mdnamesareunique(db_nlp, ep) != 0)
1725 		return (-1);
1726 
1727 	(void) memset(&c, 0, sizeof (c));
1728 	c.c_id = 0;
1729 	c.c_setno = sp->setno;
1730 
1731 	/* Don't need device id information from this ioctl */
1732 	c.c_locator.l_devid = (uint64_t)0;
1733 	c.c_locator.l_devid_flags = 0;
1734 
1735 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1736 		return (mdstealerror(ep, &c.c_mde));
1737 
1738 	/*
1739 	 * Is current set STALE?
1740 	 */
1741 	if (c.c_flags & MDDB_C_STALE) {
1742 		stale_bool = TRUE;
1743 	}
1744 
1745 	replicacount = c.c_dbcnt;
1746 
1747 	assert(db_nlp != NULL);
1748 
1749 	/*
1750 	 * go through and gather how many data bases are on each
1751 	 * device specified.
1752 	 */
1753 
1754 	nr_replica_slices = deletelist_length(db_nlp);
1755 	tag_array = (int *)calloc(nr_replica_slices, sizeof (int));
1756 
1757 	replica_delete_count = 0;
1758 	for (i = 0; i < replicacount; i++) {
1759 		char	*devname;
1760 		int	found = 0;
1761 
1762 		c.c_id = i;
1763 
1764 		/* Don't need device id information from this ioctl */
1765 		c.c_locator.l_devid = (uint64_t)0;
1766 		c.c_locator.l_devid_flags = 0;
1767 
1768 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1769 			return (mdstealerror(ep, &c.c_mde));
1770 
1771 		devname = splicename(&c.c_devname);
1772 
1773 		if ((index = in_deletelist(devname, db_nlp)) != -1) {
1774 			found = 1;
1775 			tag_array[index] = 1;
1776 			replica_delete_count++;
1777 		}
1778 
1779 		errored = c.c_locator.l_flags & (MDDB_F_EREAD |
1780 				MDDB_F_EWRITE | MDDB_F_TOOSMALL |
1781 				MDDB_F_EFMT | MDDB_F_EDATA |
1782 				MDDB_F_EMASTER);
1783 
1784 		/*
1785 		 * There are four combinations of "errored" and "found"
1786 		 * and they are used to find the number of
1787 		 * (a) valid/invalid replicas that are not in the delete
1788 		 * list and are available in the system.
1789 		 * (b) valid/invalid replicas that are to be deleted.
1790 		 */
1791 
1792 		if (errored && !found)		/* errored and !found */
1793 			invalid_replicas_nottodelete++;
1794 		else if (!found)		/* !errored and !found */
1795 			valid_replicas_nottodelete++;
1796 		else if (errored)		/* errored and found */
1797 			invalid_replicas_todelete++;
1798 		/*
1799 		 * else it is !errored and found. This means
1800 		 * valid_replicas_todelete++; But this variable will not
1801 		 * be used anywhere
1802 		 */
1803 
1804 		Free(devname);
1805 	}
1806 
1807 	index = 0;
1808 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1809 		np = nlp->namep;
1810 		if (tag_array[index++] != 1) {
1811 			Free(tag_array);
1812 			return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname));
1813 		}
1814 	}
1815 
1816 	Free(tag_array);
1817 
1818 
1819 	/* if all replicas are deleted stop mdmonitord */
1820 	if ((replicacount - replica_delete_count) == 0)
1821 		stop_svmdaemons = 1;
1822 
1823 	if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) {
1824 		if (force_option & MDFORCE_NONE)
1825 			return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname));
1826 		if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS))
1827 			return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname));
1828 	}
1829 
1830 	/*
1831 	 * The following algorithms are followed to check for deletion:
1832 	 * (a) If the delete list(db_nlp) has all invalid replicas and no valid
1833 	 * replicas, then deletion should be allowed.
1834 	 * (b) Deletion should be allowed only if valid replicas that are "not"
1835 	 * to be deleted is always greater than the invalid replicas that
1836 	 * are "not" to be deleted.
1837 	 * (c) If the user uses -f option, then deletion should be allowed.
1838 	 */
1839 
1840 	if ((invalid_replicas_todelete != replica_delete_count) &&
1841 		(invalid_replicas_nottodelete > valid_replicas_nottodelete) &&
1842 				(force_option != MDFORCE_LOCAL))
1843 		return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname));
1844 
1845 	/*
1846 	 * go through and tell kernel to delete them
1847 	 */
1848 
1849 	/* Don't need device id information from this ioctl */
1850 	c.c_locator.l_devid = (uint64_t)0;
1851 	c.c_locator.l_devid_flags = 0;
1852 
1853 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1854 		return (mdstealerror(ep, &c.c_mde));
1855 
1856 	if (! metaislocalset(sp)) {
1857 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1858 		if (! mdisok(ep))
1859 			return (-1);
1860 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1861 			return (-1);
1862 	}
1863 
1864 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1865 		np = nlp->namep;
1866 
1867 		/*
1868 		 * If mddb is being deleted from MN diskset and node is
1869 		 * an owner of the diskset then use rpc.mdcommd
1870 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1871 		 * If set is stale, don't log the message since rpc.mdcommd
1872 		 * can't write the message to the mddb.
1873 		 *
1874 		 * When mddbs are first being added to set, a detach can
1875 		 * be called before any node has joined the diskset, so
1876 		 * must check to see if node is an owner of the diskset.
1877 		 *
1878 		 * Otherwise, just delete mddb from this node.
1879 		 */
1880 
1881 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1882 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1883 			md_mn_result_t			*resultp;
1884 			md_mn_msg_meta_db_detach_t	detach;
1885 			int				send_rval;
1886 
1887 			/*
1888 			 * The following algorithm is used to detach replicas.
1889 			 * 	- META_DB_DETACH message generates submsgs
1890 			 * 		- BLOCK parse (master)
1891 			 * 		- MDDB_DETACH replicas
1892 			 * 		- UNBLOCK parse (master) causing parse
1893 			 *		information to be sent from master
1894 			 *		to slaves at a higher class than the
1895 			 *		unblock so the parse message will
1896 			 *		reach slaves before unblock message.
1897 			 */
1898 			(void) splitname(np->bname, &detach.msg_splitname);
1899 
1900 			/* Set devid to NULL until devids are supported */
1901 			detach.msg_devid[0] = NULL;
1902 
1903 			/*
1904 			 * If reconfig cycle has been started, this node is
1905 			 * stuck in in the return step until this command has
1906 			 * completed.  If mdcommd is suspended, ask
1907 			 * send_message to fail (instead of retrying)
1908 			 * so that metaset can finish allowing the reconfig
1909 			 * cycle to proceed.
1910 			 */
1911 			flags = MD_MSGF_FAIL_ON_SUSPEND;
1912 			if (stale_bool == TRUE)
1913 				flags |= MD_MSGF_NO_LOG;
1914 			send_rval = mdmn_send_message(sp->setno,
1915 				MD_MN_MSG_META_DB_DETACH,
1916 				flags, (char *)&detach,
1917 				sizeof (md_mn_msg_meta_db_detach_t),
1918 				&resultp, ep);
1919 			if (send_rval != 0) {
1920 				rval = -1;
1921 				if (resultp == NULL)
1922 					(void) mddserror(ep,
1923 					    MDE_DS_COMMD_SEND_FAIL,
1924 					    sp->setno, NULL, NULL,
1925 					    sp->setname);
1926 				else {
1927 					(void) mdstealerror(ep,
1928 					    &(resultp->mmr_ep));
1929 					if (mdisok(ep)) {
1930 						(void) mddserror(ep,
1931 						    MDE_DS_COMMD_SEND_FAIL,
1932 						    sp->setno, NULL, NULL,
1933 						    sp->setname);
1934 					}
1935 					free_result(resultp);
1936 				}
1937 				goto out;
1938 			}
1939 			if (resultp)
1940 				free_result(resultp);
1941 		} else {
1942 			i = 0;
1943 			while (i < c.c_dbcnt) {
1944 				char	*devname;
1945 
1946 				c.c_id = i;
1947 
1948 				/* Don't need devid info from this ioctl */
1949 				c.c_locator.l_devid = (uint64_t)0;
1950 				c.c_locator.l_devid_flags = 0;
1951 
1952 				if (metaioctl(MD_DB_GETDEV, &c,
1953 				    &c.c_mde, NULL)) {
1954 					rval = mdstealerror(ep, &c.c_mde);
1955 					goto out;
1956 				}
1957 
1958 				devname = splicename(&c.c_devname);
1959 				if (strcmp(devname, np->bname) != 0) {
1960 					Free(devname);
1961 					i++;
1962 					continue;
1963 				}
1964 				Free(devname);
1965 
1966 				/* Don't need devid info from this ioctl */
1967 				c.c_locator.l_devid = (uint64_t)0;
1968 				c.c_locator.l_devid_flags = 0;
1969 
1970 				if (metaioctl(MD_DB_DELDEV, &c,
1971 				    &c.c_mde, NULL) != 0) {
1972 					rval = mdstealerror(ep, &c.c_mde);
1973 					goto out;
1974 				}
1975 
1976 				/* Not incrementing "i" intentionally */
1977 			}
1978 		}
1979 		if (! metaislocalset(sp)) {
1980 			/* update the dbcnt and size in dd */
1981 			for (p = dd; p != NULL; p = p->dd_next) {
1982 				if (p->dd_dnp == np->drivenamep) {
1983 					p->dd_dbcnt = 0;
1984 					p->dd_dbsize  = 0;
1985 					break;
1986 				}
1987 			}
1988 
1989 			/*
1990 			 * Slam a dummy master block and make it self
1991 			 * identifying
1992 			 */
1993 			if ((fd = open(np->rname, O_RDWR)) >= 0) {
1994 				meta_mkdummymaster(sp, fd, 16);
1995 				(void) close(fd);
1996 			}
1997 		}
1998 	}
1999 out:
2000 	if (metaislocalset(sp)) {
2001 		/*
2002 		 * Stop all the daemons if there are
2003 		 * no more replicas so that the module can be
2004 		 * unloaded.
2005 		 */
2006 		if (rval == 0 && stop_svmdaemons == 1) {
2007 			char buf[MAXPATHLEN];
2008 			int i;
2009 
2010 			for (i = 0; i < DAEMON_COUNT; i++) {
2011 				(void) snprintf(buf, MAXPATHLEN,
2012 					"/usr/bin/pkill -%s -x %s",
2013 					svmd_kill_list[i].svmd_kill_val,
2014 					svmd_kill_list[i].svmd_name);
2015 				if (pclose(popen(buf, "w")) == -1)
2016 					md_perror(buf);
2017 			}
2018 
2019 			if (meta_smf_disable(META_SMF_ALL, &status) == -1) {
2020 				mde_perror(&status, "");
2021 				mdclrerror(&status);
2022 			}
2023 		}
2024 		if (buildconf(sp, &status)) {
2025 			/* Don't mask any previous errors */
2026 			if (rval == 0)
2027 				rval = mdstealerror(ep, &status);
2028 			else
2029 				mdclrerror(&status);
2030 			return (rval);
2031 		}
2032 
2033 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
2034 			/* Don't mask any previous errors */
2035 			if (rval == 0)
2036 				rval = mdstealerror(ep, &status);
2037 			else
2038 				mdclrerror(&status);
2039 		}
2040 	} else {
2041 		if (update_dbinfo_on_drives(sp, dd,
2042 		    (force_option & MDFORCE_SET_LOCKED),
2043 		    ((force_option & MDFORCE_LOCAL) |
2044 		    (force_option & MDFORCE_DS)), &status)) {
2045 			/* Don't mask any previous errors */
2046 			if (rval == 0)
2047 				rval = mdstealerror(ep, &status);
2048 			else
2049 				mdclrerror(&status);
2050 		}
2051 		metafreedrivedesc(&dd);
2052 	}
2053 	if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) {
2054 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
2055 			meta_invalidate_name(nlp->namep);
2056 		}
2057 	}
2058 	return (rval);
2059 }
2060 
2061 static md_replica_t *
2062 metareplicaname(
2063 	mdsetname_t		*sp,
2064 	int			flags,
2065 	struct mddb_config	*c,
2066 	md_error_t		*ep
2067 )
2068 {
2069 	md_replica_t	*rp;
2070 	char		*devname;
2071 	size_t		sz;
2072 
2073 	/* allocate replicaname */
2074 	rp = Zalloc(sizeof (*rp));
2075 
2076 	/* get device name */
2077 	devname = splicename(&c->c_devname);
2078 	if (flags & PRINT_FAST) {
2079 		if ((rp->r_namep = metaname_fast(&sp, devname, ep)) == NULL) {
2080 			Free(devname);
2081 			Free(rp);
2082 			return (NULL);
2083 		}
2084 	} else {
2085 		if ((rp->r_namep = metaname(&sp, devname, ep)) == NULL) {
2086 			Free(devname);
2087 			Free(rp);
2088 			return (NULL);
2089 		}
2090 	}
2091 	Free(devname);
2092 
2093 	/* make sure it's OK */
2094 	if ((! (flags & MD_BASICNAME_OK)) &&
2095 	    (metachkcomp(rp->r_namep, ep) != 0)) {
2096 		Free(rp);
2097 		return (NULL);
2098 	}
2099 
2100 	rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR;
2101 	rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR;
2102 	rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID;
2103 	if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) {
2104 		sz = devid_sizeof((ddi_devid_t)(uintptr_t)
2105 		    (c->c_locator.l_devid));
2106 		if ((rp->r_devid = (ddi_devid_t)malloc(sz)) ==
2107 		    (ddi_devid_t)NULL) {
2108 			Free(rp);
2109 			return (NULL);
2110 		}
2111 		(void) memcpy((void *)rp->r_devid,
2112 		    (void *)(uintptr_t)c->c_locator.l_devid, sz);
2113 		(void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name);
2114 		rp->r_flags &= ~MDDB_F_NODEVID;
2115 		/* Overwrite dev derived from name with dev from devid */
2116 		rp->r_namep->dev = meta_expldev(c->c_locator.l_dev);
2117 	}
2118 	(void) strcpy(rp->r_driver_name, c->c_locator.l_driver);
2119 
2120 	rp->r_blkno = c->c_locator.l_blkno;
2121 	if (c->c_dbend != 0)
2122 		rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1;
2123 
2124 	/* return replica */
2125 	return (rp);
2126 }
2127 
2128 /*
2129  * free replica list
2130  */
2131 void
2132 metafreereplicalist(
2133 	md_replicalist_t	*rlp
2134 )
2135 {
2136 	md_replicalist_t	*rl = NULL;
2137 
2138 	for (/* void */; (rlp != NULL); rlp = rl) {
2139 		rl = rlp->rl_next;
2140 		if (rlp->rl_repp->r_devid != (ddi_devid_t)0) {
2141 			free(rlp->rl_repp->r_devid);
2142 		}
2143 		Free(rlp->rl_repp);
2144 		Free(rlp);
2145 	}
2146 }
2147 
2148 /*
2149  * return list of all replicas in set
2150  */
2151 int
2152 metareplicalist(
2153 	mdsetname_t		*sp,
2154 	int			flags,
2155 	md_replicalist_t	**rlpp,
2156 	md_error_t		*ep
2157 )
2158 {
2159 	md_replicalist_t	**tail = rlpp;
2160 	int			count = 0;
2161 	struct mddb_config	c;
2162 	int			i;
2163 	char			*devid;
2164 
2165 	/* for each replica */
2166 	i = 0;
2167 	do {
2168 		md_replica_t	*rp;
2169 
2170 		/* get next replica */
2171 		(void) memset(&c, 0, sizeof (c));
2172 		c.c_id = i;
2173 		c.c_setno = sp->setno;
2174 
2175 		c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2176 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2177 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2178 				mdclrerror(&c.c_mde);
2179 				break;	/* handle none at all */
2180 			}
2181 			(void) mdstealerror(ep, &c.c_mde);
2182 			goto out;
2183 		}
2184 
2185 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) {
2186 			if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) {
2187 				(void) mdsyserror(ep, ENOMEM, META_DBCONF);
2188 				goto out;
2189 			}
2190 			c.c_locator.l_devid = (uintptr_t)devid;
2191 			/*
2192 			 * Turn on space and sz flags since 'sz' amount of
2193 			 * space has been alloc'd.
2194 			 */
2195 			c.c_locator.l_devid_flags =
2196 				MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2197 		}
2198 
2199 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2200 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2201 				mdclrerror(&c.c_mde);
2202 				break;	/* handle none at all */
2203 			}
2204 			(void) mdstealerror(ep, &c.c_mde);
2205 			goto out;
2206 		}
2207 
2208 		/*
2209 		 * Paranoid check - shouldn't happen, but is left as
2210 		 * a place holder for changes that will be needed after
2211 		 * dynamic reconfiguration changes are added to SVM (to
2212 		 * support movement of disks at any point in time).
2213 		 */
2214 		if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
2215 			(void) fprintf(stderr,
2216 			    dgettext(TEXT_DOMAIN,
2217 				"Error: Relocation Information "
2218 				"(drvnm=%s, mnum=0x%lx) \n"
2219 				"relocation information size changed - \n"
2220 				"rerun command\n"),
2221 			    c.c_locator.l_driver, c.c_locator.l_mnum);
2222 			(void) mderror(ep, MDE_DEVID_TOOBIG, NULL);
2223 			goto out;
2224 		}
2225 
2226 		if (c.c_dbcnt == 0)
2227 			break;		/* handle none at all */
2228 
2229 		/* get info */
2230 		if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL)
2231 			goto out;
2232 
2233 		/* append to list */
2234 		*tail = Zalloc(sizeof (**tail));
2235 		(*tail)->rl_repp = rp;
2236 		tail = &(*tail)->rl_next;
2237 		++count;
2238 
2239 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2240 			free(devid);
2241 			c.c_locator.l_devid_flags = 0;
2242 		}
2243 
2244 	} while (++i < c.c_dbcnt);
2245 
2246 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2247 		free(devid);
2248 	}
2249 
2250 	/* return count */
2251 	return (count);
2252 
2253 	/* cleanup, return error */
2254 out:
2255 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2256 		free(devid);
2257 	}
2258 	metafreereplicalist(*rlpp);
2259 	*rlpp = NULL;
2260 	return (-1);
2261 }
2262 
2263 /*
2264  * meta_sync_db_locations - get list of replicas from kernel and write
2265  * 	out to mddb.cf and md.conf.  'Syncs up' the replica list in
2266  * 	the kernel with the replica list in the conf files.
2267  *
2268  */
2269 void
2270 meta_sync_db_locations(
2271 	mdsetname_t	*sp,
2272 	md_error_t	*ep
2273 )
2274 {
2275 	char		*sname = 0;		/* system file name */
2276 	char 		*cname = 0;		/* config file name */
2277 
2278 	if (!metaislocalset(sp))
2279 		return;
2280 
2281 	/* Updates backup of configuration file (aka mddb.cf) */
2282 	if (buildconf(sp, ep) != 0)
2283 		return;
2284 
2285 	/* Updates system configuration file (aka md.conf) */
2286 	(void) meta_db_patch(sname, cname, 0, ep);
2287 }
2288 
2289 /*
2290  * setup_db_locations - parse the mddb.cf file and
2291  *			tells the driver which db locations to use.
2292  */
2293 int
2294 meta_setup_db_locations(
2295 	md_error_t	*ep
2296 )
2297 {
2298 	mddb_config_t	c;
2299 	FILE		*fp;
2300 	char		inbuff[1024];
2301 	char		*buff;
2302 	uint_t		i;
2303 	size_t		sz;
2304 	int		rval = 0;
2305 	char		*devidp;
2306 	uint_t		devid_size;
2307 	char		*minor_name = NULL;
2308 	ddi_devid_t	devid_decode;
2309 	int		checksum;
2310 
2311 	/* do mddb.cf file */
2312 	(void) memset(&c, '\0', sizeof (c));
2313 	if ((fp = fopen(META_DBCONF, "r")) == NULL) {
2314 		if (errno != ENOENT)
2315 			return (mdsyserror(ep, errno, META_DBCONF));
2316 	}
2317 	while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1),
2318 	    fp)) != NULL)) {
2319 
2320 		/* ignore comments */
2321 		if (*buff == '#')
2322 			continue;
2323 
2324 		/* parse locator */
2325 		(void) memset(&c, 0, sizeof (c));
2326 		c.c_setno = MD_LOCAL_SET;
2327 		i = strcspn(buff, " \t");
2328 		if (i > sizeof (c.c_locator.l_driver))
2329 			i = sizeof (c.c_locator.l_driver);
2330 		(void) strncpy(c.c_locator.l_driver, buff, i);
2331 		buff += i;
2332 		c.c_locator.l_dev =
2333 		    makedev((major_t)0, (minor_t)strtol(buff, &buff, 10));
2334 		c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10);
2335 		c.c_locator.l_mnum = minor(c.c_locator.l_dev);
2336 
2337 		/* parse out devid */
2338 		while (isspace((int)(*buff)))
2339 			buff += 1;
2340 		i = strcspn(buff, " \t");
2341 		if ((devidp = (char *)malloc(i+1)) == NULL)
2342 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2343 
2344 		(void) strncpy(devidp, buff, i);
2345 		devidp[i] = '\0';
2346 		if (devid_str_decode(devidp, &devid_decode,
2347 		    &minor_name) == -1) {
2348 			free(devidp);
2349 			continue;
2350 		}
2351 
2352 		/* Conf file must have minor name associated with devid */
2353 		if (minor_name == NULL) {
2354 			free(devidp);
2355 			devid_free(devid_decode);
2356 			continue;
2357 		}
2358 
2359 		sz = devid_sizeof(devid_decode);
2360 		/* Copy to devid size buffer that ioctl expects */
2361 		if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) {
2362 			devid_free(devid_decode);
2363 			free(minor_name);
2364 			free(devidp);
2365 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2366 		}
2367 
2368 		(void) memcpy((void *)(uintptr_t)c.c_locator.l_devid,
2369 		    (void *)devid_decode, sz);
2370 
2371 		devid_free(devid_decode);
2372 
2373 		if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) {
2374 			free(minor_name);
2375 			free(devidp);
2376 			free((void *)(uintptr_t)c.c_locator.l_devid);
2377 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2378 		}
2379 		(void) strcpy(c.c_locator.l_minor_name, minor_name);
2380 		free(minor_name);
2381 		c.c_locator.l_devid_flags = MDDB_DEVID_VALID |
2382 			MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2383 		c.c_locator.l_devid_sz = sz;
2384 
2385 		devid_size = strlen(devidp);
2386 		buff += devid_size;
2387 
2388 		checksum = strtol(buff, &buff, 10);
2389 		for (i = 0; c.c_locator.l_driver[i] != 0; i++)
2390 			checksum += c.c_locator.l_driver[i];
2391 		for (i = 0; i < devid_size; i++) {
2392 			checksum += devidp[i];
2393 		}
2394 		free(devidp);
2395 
2396 		checksum += minor(c.c_locator.l_dev);
2397 		checksum += c.c_locator.l_blkno;
2398 		if (checksum != 42) {
2399 			/* overwritten later for more serious problems */
2400 			rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF);
2401 			free((void *)(uintptr_t)c.c_locator.l_devid);
2402 			continue;
2403 		}
2404 		c.c_locator.l_flags = 0;
2405 
2406 		/* use db location */
2407 		if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) {
2408 			free((void *)(uintptr_t)c.c_locator.l_devid);
2409 			return (mdstealerror(ep, &c.c_mde));
2410 		}
2411 
2412 		/* free up devid if in use */
2413 		free((void *)(uintptr_t)c.c_locator.l_devid);
2414 		c.c_locator.l_devid = (uint64_t)0;
2415 		c.c_locator.l_devid_flags = 0;
2416 	}
2417 	if ((fp) && (fclose(fp) != 0))
2418 		return (mdsyserror(ep, errno, META_DBCONF));
2419 
2420 	/* check for stale database */
2421 	(void) memset((char *)&c, 0, sizeof (struct mddb_config));
2422 	c.c_id = 0;
2423 	c.c_setno = MD_LOCAL_SET;
2424 
2425 	/* Don't need device id information from this ioctl */
2426 	c.c_locator.l_devid = (uint64_t)0;
2427 	c.c_locator.l_devid_flags = 0;
2428 
2429 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2430 		if (! mdismddberror(&c.c_mde, MDE_DB_INVALID))
2431 			return (mdstealerror(ep, &c.c_mde));
2432 		mdclrerror(&c.c_mde);
2433 	}
2434 
2435 	if (c.c_flags & MDDB_C_STALE)
2436 		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET,
2437 		    0, NULL));
2438 
2439 	/* success */
2440 	return (rval);
2441 }
2442 
2443 /*
2444  * meta_db_minreplica - returns the minimum size replica currently in use.
2445  */
2446 daddr_t
2447 meta_db_minreplica(
2448 	mdsetname_t	*sp,
2449 	md_error_t	*ep
2450 )
2451 {
2452 	md_replica_t		*r;
2453 	md_replicalist_t	*rl, *rlp = NULL;
2454 	daddr_t			nblks = 0;
2455 
2456 	if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
2457 		return (-1);
2458 
2459 	if (rlp == NULL)
2460 		return (-1);
2461 
2462 	/* find the smallest existing replica */
2463 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2464 		r = rl->rl_repp;
2465 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2466 	}
2467 
2468 	metafreereplicalist(rlp);
2469 	return (nblks);
2470 }
2471 
2472 /*
2473  * meta_get_replica_names
2474  *  returns an mdnamelist_t of replica slices
2475  */
2476 /*ARGSUSED*/
2477 int
2478 meta_get_replica_names(
2479 	mdsetname_t	*sp,
2480 	mdnamelist_t	**nlpp,
2481 	int		options,
2482 	md_error_t	*ep
2483 )
2484 {
2485 	md_replicalist_t	*rlp = NULL;
2486 	md_replicalist_t	*rl;
2487 	mdnamelist_t		**tailpp = nlpp;
2488 	int			cnt = 0;
2489 
2490 	assert(nlpp != NULL);
2491 
2492 	if (!metaislocalset(sp))
2493 		goto out;
2494 
2495 	/* get replicas */
2496 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
2497 		cnt = -1;
2498 		goto out;
2499 	}
2500 
2501 	/* build name list */
2502 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
2503 		/*
2504 		 * Add the name struct to the end of the
2505 		 * namelist but keep a pointer to the last
2506 		 * element so that we don't incur the overhead
2507 		 * of traversing the list each time
2508 		 */
2509 		tailpp = meta_namelist_append_wrapper(
2510 			tailpp, rl->rl_repp->r_namep);
2511 		++cnt;
2512 	}
2513 
2514 	/* cleanup, return count or error */
2515 out:
2516 	metafreereplicalist(rlp);
2517 	return (cnt);
2518 }
2519