xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_db.c (revision bf85a12b7c81d0745d5a8aff65baeff50006cde9)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Just in case we're not in a build environment, make sure that
29  * TEXT_DOMAIN gets set to something.
30  */
31 #if !defined(TEXT_DOMAIN)
32 #define	TEXT_DOMAIN "SYS_TEST"
33 #endif
34 
35 /*
36  * Metadevice database interfaces.
37  */
38 
39 #define	MDDB
40 
41 #include <meta.h>
42 #include <sys/lvm/md_mddb.h>
43 #include <sys/lvm/md_crc.h>
44 #include <sys/lvm/mdio.h>
45 #include <string.h>
46 #include <strings.h>
47 #include <ctype.h>
48 
/*
 * One entry of the daemon kill list: the name of a daemon together with
 * the value used to kill it (given as a signal name such as "HUP").
 */
struct svm_daemon {
	char *svmd_name;	/* daemon name */
	char *svmd_kill_val;	/* kill value (signal name) */
};

/*
 * Daemons that the SVM smf(5) services do not stop on their own.
 * mdmonitord is started via svc:/system/mdmonitor:default, but since no
 * contract(4) is constructed for it, smf(5) does not stop it.
 */
struct svm_daemon svmd_kill_list[] = {
	{ "mdmonitord", "HUP" },
	{ "mddoors", "KILL" },
};

/* Number of entries in svmd_kill_list */
#define	DAEMON_COUNT (sizeof (svmd_kill_list) / sizeof (svmd_kill_list[0]))
65 
66 extern int procsigs(int block, sigset_t *oldsigs, md_error_t *ep);
67 
68 /*
69  * Are the locator blocks for the replicas using devids
70  */
71 static int	devid_in_use = FALSE;
72 
73 static char *
getlongname(struct mddb_config * c,md_error_t * ep)74 getlongname(
75 	struct mddb_config	*c,
76 	md_error_t		*ep
77 )
78 {
79 	char		*diskname = NULL;
80 	char		*devid_str;
81 	devid_nmlist_t	*disklist = NULL;
82 
83 	c->c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
84 	if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) {
85 		(void) mdstealerror(ep, &c->c_mde);
86 		return (NULL);
87 	}
88 
89 	if (c->c_locator.l_devid_flags & MDDB_DEVID_SZ) {
90 		c->c_locator.l_devid = (uintptr_t)
91 		    Malloc(c->c_locator.l_devid_sz);
92 		c->c_locator.l_devid_flags =
93 		    MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
94 	} else {
95 		(void) mderror(ep, MDE_NODEVID, "");
96 		goto out;
97 	}
98 
99 	if (metaioctl(MD_DB_ENDDEV, c, &c->c_mde, NULL) != 0) {
100 		(void) mdstealerror(ep, &c->c_mde);
101 		goto out;
102 	}
103 
104 	if (c->c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
105 		(void) mderror(ep, MDE_NODEVID, "");
106 		goto out;
107 	}
108 
109 	if (metaioctl(MD_DB_GETDEV, c, &c->c_mde, NULL) != 0) {
110 		(void) mdstealerror(ep, &c->c_mde);
111 		goto out;
112 	}
113 
114 	if (c->c_locator.l_devid != NULL) {
115 		if (meta_deviceid_to_nmlist("/dev/dsk",
116 		    (ddi_devid_t)(uintptr_t)c->c_locator.l_devid,
117 		    c->c_locator.l_minor_name, &disklist) != 0) {
118 			devid_str = devid_str_encode(
119 			    (ddi_devid_t)(uintptr_t)c->c_locator.l_devid, NULL);
120 			(void) mderror(ep, MDE_MISSING_DEVID_DISK, "");
121 			mderrorextra(ep, devid_str);
122 			if (devid_str != NULL)
123 				devid_str_free(devid_str);
124 			goto out;
125 		}
126 		diskname = Strdup(disklist[0].devname);
127 	}
128 
129 out:
130 	if (disklist != NULL)
131 		devid_free_nmlist(disklist);
132 
133 	if (c->c_locator.l_devid != NULL)
134 		Free((void *)(uintptr_t)c->c_locator.l_devid);
135 
136 	return (diskname);
137 }
138 
139 /*
140  * meta_get_lb_inittime sends a request for the lb_inittime to the kernel
141  */
142 md_timeval32_t
meta_get_lb_inittime(mdsetname_t * sp,md_error_t * ep)143 meta_get_lb_inittime(
144 	mdsetname_t	*sp,
145 	md_error_t	*ep
146 )
147 {
148 	mddb_config_t	c;
149 
150 	(void) memset(&c, 0, sizeof (c));
151 
152 	/* Fill in setno, setname, and sideno */
153 	c.c_setno = sp->setno;
154 
155 	if (metaioctl(MD_DB_LBINITTIME, &c, &c.c_mde, NULL) != 0) {
156 		(void) mdstealerror(ep, &c.c_mde);
157 	}
158 
159 	return (c.c_timestamp);
160 }
161 
162 /*
163  * mkmasterblks writes out the master blocks of the mddb to the replica.
164  *
165  * In a MN diskset, this is called by the node that is adding this replica
166  * to the diskset.
167  */
168 
169 #define	MDDB_VERIFY_SIZE	8192
170 
171 static int
mkmasterblks(mdsetname_t * sp,mdname_t * np,int fd,daddr_t firstblk,int dbsize,md_timeval32_t inittime,md_error_t * ep)172 mkmasterblks(
173 	mdsetname_t	*sp,
174 	mdname_t	*np,
175 	int		fd,
176 	daddr_t		firstblk,
177 	int		dbsize,
178 	md_timeval32_t	inittime,
179 	md_error_t	*ep
180 )
181 {
182 	int		consecutive;
183 	md_timeval32_t	tp;
184 	struct mddb_mb	*mb;
185 	char		*buffer;
186 	int		iosize;
187 	md_set_desc	*sd;
188 	int		mn_set = 0;
189 	daddr_t		startblk;
190 	int		cnt;
191 	ddi_devid_t	devid;
192 
193 	if (! metaislocalset(sp)) {
194 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
195 			return (-1);
196 
197 		if (MD_MNSET_DESC(sd)) {
198 			mn_set = 1;		/* Used later */
199 		}
200 	}
201 
202 	/*
203 	 * Loop to verify the entire mddb region on disk is read/writable.
204 	 * buffer is used to write/read in at most MDDB_VERIFY_SIZE block
205 	 * chunks.
206 	 *
207 	 * A side-effect of this loop is to zero out the entire mddb region
208 	 */
209 	if ((buffer = Zalloc(MDDB_VERIFY_SIZE * DEV_BSIZE)) == NULL)
210 		return (mdsyserror(ep, ENOMEM, np->rname));
211 
212 	startblk = firstblk;
213 	for (cnt = dbsize; cnt > 0; cnt -= consecutive) {
214 
215 		if (cnt > MDDB_VERIFY_SIZE)
216 			consecutive = MDDB_VERIFY_SIZE;
217 		else
218 			consecutive = cnt;
219 
220 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
221 			Free(buffer);
222 			return (mdsyserror(ep, errno, np->rname));
223 		}
224 
225 		iosize = DEV_BSIZE * consecutive;
226 		if (write(fd, buffer, iosize) != iosize) {
227 			Free(buffer);
228 			return (mdsyserror(ep, errno, np->rname));
229 		}
230 
231 		if (lseek(fd, (off_t)(startblk * DEV_BSIZE), SEEK_SET) < 0) {
232 			Free(buffer);
233 			return (mdsyserror(ep, errno, np->rname));
234 		}
235 
236 		if (read(fd, buffer, iosize) != iosize) {
237 			Free(buffer);
238 			return (mdsyserror(ep, errno, np->rname));
239 		}
240 
241 		startblk += consecutive;
242 	}
243 
244 	Free(buffer);
245 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
246 		return (mdsyserror(ep, ENOMEM, np->rname));
247 
248 	if (meta_gettimeofday(&tp) == -1) {
249 		Free(mb);
250 		return (mdsyserror(ep, errno, np->rname));
251 	}
252 
253 	mb->mb_magic = MDDB_MAGIC_MB;
254 	/*
255 	 * If a MN diskset, set master block revision for a MN set.
256 	 * Even though the master block structure is no different
257 	 * for a MN set, setting the revision field to a different
258 	 * number keeps any pre-MN_diskset code from accessing
259 	 * this diskset.  It also allows for an early determination
260 	 * of a MN diskset when reading in from disk so that the
261 	 * proper size locator block and locator names structure
262 	 * can be read in thus saving time on diskset startup.
263 	 */
264 	if (mn_set)
265 		mb->mb_revision = MDDB_REV_MNMB;
266 	else
267 		mb->mb_revision = MDDB_REV_MB;
268 	mb->mb_timestamp = tp;
269 	mb->mb_setno = sp->setno;
270 	mb->mb_blkcnt = dbsize - 1;
271 	mb->mb_blkno = firstblk;
272 	mb->mb_nextblk = 0;
273 
274 	mb->mb_blkmap.m_firstblk = firstblk + 1;
275 	mb->mb_blkmap.m_consecutive = dbsize - 1;
276 	if (! metaislocalset(sp)) {
277 		mb->mb_setcreatetime = inittime;
278 	}
279 
280 	/*
281 	 * We try to save the disks device ID into the remaining bytes in
282 	 * the master block. The saved devid is used to provide a mapping
283 	 * between this disk's devid and the devid stored into the master
284 	 * block. This allows the disk image to be self-identifying
285 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
286 	 * when we try to import these disks on the remote copied image.
287 	 * If we cannot save the disks device ID onto the master block that is
288 	 * ok.  The disk is just not self-identifying and won't be importable
289 	 * in the remote copy scenario.
290 	 */
291 	if (devid_get(fd, &devid) == 0) {
292 		size_t len;
293 
294 		len = devid_sizeof(devid);
295 		if (len <= DEV_BSIZE - sizeof (*mb)) {
296 			/* there is enough space to store the devid */
297 			mb->mb_devid_magic = MDDB_MAGIC_DE;
298 			mb->mb_devid_len = len;
299 			(void) memcpy(mb->mb_devid, devid, len);
300 		}
301 		devid_free(devid);
302 	}
303 
304 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
305 	    (crc_skip_t *)NULL);
306 
307 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
308 		Free(mb);
309 		return (mdsyserror(ep, errno, np->rname));
310 	}
311 
312 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
313 		Free(mb);
314 		return (mdsyserror(ep, errno, np->rname));
315 	}
316 
317 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0) {
318 		Free(mb);
319 		return (mdsyserror(ep, errno, np->rname));
320 	}
321 
322 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE) {
323 		Free(mb);
324 		return (mdsyserror(ep, errno, np->rname));
325 	}
326 
327 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
328 	    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL)) {
329 		Free(mb);
330 		return (mdmddberror(ep, MDE_NOTVERIFIED,
331 		    meta_getminor(np->dev), sp->setno, 0, np->rname));
332 	}
333 
334 	Free(mb);
335 	return (0);
336 }
337 
338 void
meta_mkdummymaster(mdsetname_t * sp,int fd,daddr_t firstblk)339 meta_mkdummymaster(
340 	mdsetname_t	*sp,
341 	int		fd,
342 	daddr_t		firstblk
343 )
344 {
345 	md_timeval32_t	tp;
346 	struct mddb_mb	*mb;
347 	ddi_devid_t	devid;
348 	md_set_desc	*sd;
349 	md_error_t	ep = mdnullerror;
350 	md_timeval32_t	inittime;
351 
352 	/*
353 	 * No dummy master blocks are written for a MN diskset since devids
354 	 * are not supported in MN disksets.
355 	 */
356 	if (! metaislocalset(sp)) {
357 		if ((sd = metaget_setdesc(sp, &ep)) == NULL)
358 			return;
359 
360 		if (MD_MNSET_DESC(sd))
361 			return;
362 	}
363 
364 	if ((mb = Zalloc(DEV_BSIZE)) == NULL)
365 		return;
366 
367 	mb->mb_magic = MDDB_MAGIC_DU;
368 	mb->mb_revision = MDDB_REV_MB;
369 	mb->mb_setno = sp->setno;
370 	inittime = meta_get_lb_inittime(sp, &ep);
371 	mb->mb_setcreatetime = inittime;
372 
373 	if (meta_gettimeofday(&tp) != -1)
374 		mb->mb_timestamp = tp;
375 
376 	/*
377 	 * We try to save the disks device ID into the remaining bytes in
378 	 * the master block.  This allows the disk image to be self-identifying
379 	 * if it gets copied (e.g. SNDR, True Copy, etc.).  This is used
380 	 * when we try to import these disks on the remote copied image.
381 	 * If we cannot save the disks device ID onto the master block that is
382 	 * ok.  The disk is just not self-identifying and won't be importable
383 	 * in the remote copy scenario.
384 	 */
385 	if (devid_get(fd, &devid) == 0) {
386 		int len;
387 
388 		len = devid_sizeof(devid);
389 		if (len <= DEV_BSIZE - sizeof (*mb)) {
390 			/* there is enough space to store the devid */
391 			mb->mb_devid_magic = MDDB_MAGIC_DE;
392 			mb->mb_devid_len = len;
393 			(void) memcpy(mb->mb_devid, (char *)devid, len);
394 		}
395 		devid_free(devid);
396 	}
397 
398 	crcgen((uchar_t *)mb, (uint_t *)&mb->mb_checksum, (uint_t)DEV_BSIZE,
399 	    (crc_skip_t *)NULL);
400 
401 	/*
402 	 * If any of these operations fail, we need to inform the
403 	 * user that the disk won't be self identifying. When support
404 	 * for importing remotely replicated disksets is added, we
405 	 * want to add the error messages here.
406 	 */
407 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
408 		goto out;
409 
410 	if (write(fd, mb, DEV_BSIZE) != DEV_BSIZE)
411 		goto out;
412 
413 	if (lseek(fd, (off_t)(firstblk * DEV_BSIZE), SEEK_SET) < 0)
414 		goto out;
415 
416 	if (read(fd, mb, DEV_BSIZE) != DEV_BSIZE)
417 		goto out;
418 
419 	if (crcchk((uchar_t *)mb, (uint_t *)&mb->mb_checksum,
420 	    (uint_t)DEV_BSIZE, (crc_skip_t *)NULL))
421 		goto out;
422 
423 out:
424 	Free(mb);
425 }
426 
427 static int
buildconf(mdsetname_t * sp,md_error_t * ep)428 buildconf(mdsetname_t *sp, md_error_t *ep)
429 {
430 	md_replicalist_t	*rlp = NULL;
431 	md_replicalist_t	*rl;
432 	FILE			*cfp = NULL;
433 	FILE			*mfp = NULL;
434 	struct stat		sbuf;
435 	int			rval = 0;
436 	int			in_miniroot = 0;
437 	char			line[MDDB_BOOTLIST_MAX_LEN];
438 	char			*tname = NULL;
439 
440 	/* get list of local replicas */
441 	if (! metaislocalset(sp))
442 		return (0);
443 
444 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
445 		return (-1);
446 
447 	/* open tempfile, copy permissions of original file */
448 	if ((cfp = fopen(META_DBCONFTMP, "w+")) == NULL) {
449 		/*
450 		 * On the miniroot tmp files must be created in /var/tmp.
451 		 * If we get a EROFS error, we assume that we are in the
452 		 * miniroot.
453 		 */
454 		if (errno != EROFS)
455 			goto error;
456 		in_miniroot = 1;
457 		errno = 0;
458 		tname = tempnam("/var/tmp", "slvm_");
459 		if (tname == NULL && errno == EROFS) {
460 			/*
461 			 * If we are booted on a read-only root because
462 			 * of mddb quorum problems we don't want to emit
463 			 * any scary error messages.
464 			 */
465 			errno = 0;
466 			goto out;
467 		}
468 
469 		/* open tempfile, copy permissions of original file */
470 		if ((cfp = fopen(tname, "w+")) == NULL)
471 			goto error;
472 	}
473 	if (stat(META_DBCONF, &sbuf) == 0) {
474 		if (fchmod(fileno(cfp), (sbuf.st_mode & 0666)) != 0)
475 			goto error;
476 		if (fchown(fileno(cfp), sbuf.st_uid, sbuf.st_gid) != 0)
477 			goto error;
478 	}
479 
480 	/* print header */
481 	if (fprintf(cfp, "#metadevice database location file ") == EOF)
482 		goto error;
483 	if (fprintf(cfp, "do not hand edit\n") < 0)
484 		goto error;
485 	if (fprintf(cfp,
486 	    "#driver\tminor_t\tdaddr_t\tdevice id\tchecksum\n") < 0)
487 		goto error;
488 
489 	/* dump replicas */
490 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
491 		md_replica_t	*r = rl->rl_repp;
492 		int		checksum = 42;
493 		int		i;
494 		char		*devidp;
495 		minor_t		min;
496 
497 		devidp = devid_str_encode(r->r_devid, r->r_minor_name);
498 		/* If devid code can't encode devidp - skip entry */
499 		if (devidp == NULL) {
500 			continue;
501 		}
502 
503 		/* compute checksum */
504 		for (i = 0; ((r->r_driver_name[i] != '\0') &&
505 		    (i < sizeof (r->r_driver_name))); i++) {
506 			checksum -= r->r_driver_name[i];
507 		}
508 		min = meta_getminor(r->r_namep->dev);
509 		checksum -= min;
510 		checksum -= r->r_blkno;
511 
512 		for (i = 0; i < strlen(devidp); i++) {
513 			checksum -= devidp[i];
514 		}
515 		/* print info */
516 		if (fprintf(cfp, "%s\t%lu\t%ld\t%s\t%d\n",
517 		    r->r_driver_name, min, r->r_blkno, devidp, checksum) < 0) {
518 			goto error;
519 		}
520 
521 		devid_str_free(devidp);
522 	}
523 
524 	/* close and rename to real file */
525 	if (fflush(cfp) != 0)
526 		goto error;
527 	if (fsync(fileno(cfp)) != 0)
528 		goto error;
529 	if (fclose(cfp) != 0) {
530 		cfp = NULL;
531 		goto error;
532 	}
533 	cfp = NULL;
534 
535 	/*
536 	 * Renames don't work in the miniroot since tmpfiles are
537 	 * created in /var/tmp. Hence we copy the data out.
538 	 */
539 
540 	if (! in_miniroot) {
541 		if (rename(META_DBCONFTMP, META_DBCONF) != 0)
542 			goto error;
543 	} else {
544 		if ((cfp = fopen(tname, "r")) == NULL)
545 			goto error;
546 		if ((mfp = fopen(META_DBCONF, "w+")) == NULL)
547 			goto error;
548 		while (fgets(line, MDDB_BOOTLIST_MAX_LEN, cfp) != NULL) {
549 			if (fputs(line, mfp) == NULL)
550 				goto error;
551 		}
552 		(void) fclose(cfp);
553 		cfp = NULL;
554 		if (fflush(mfp) != 0)
555 			goto error;
556 		if (fsync(fileno(mfp)) != 0)
557 			goto error;
558 		if (fclose(mfp) != 0) {
559 			mfp = NULL;
560 			goto error;
561 		}
562 		/* delete the tempfile */
563 		(void) unlink(tname);
564 	}
565 	/* success */
566 	rval = 0;
567 	goto out;
568 
569 	/* tempfile error */
570 error:
571 	rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
572 	    mdsyserror(ep, errno, META_DBCONFTMP);
573 
574 
575 	/* cleanup, return success */
576 out:
577 	if (rlp != NULL)
578 		metafreereplicalist(rlp);
579 	if ((cfp != NULL) && (fclose(cfp) != 0) && (rval == 0)) {
580 		rval = (in_miniroot) ? mdsyserror(ep, errno, tname):
581 		    mdsyserror(ep, errno, META_DBCONFTMP);
582 	}
583 	free(tname);
584 	return (rval);
585 }
586 
587 /*
588  * check replica for dev
589  */
590 static int
in_replica(mdsetname_t * sp,md_replica_t * rp,mdname_t * np,diskaddr_t slblk,diskaddr_t nblks,md_error_t * ep)591 in_replica(
592 	mdsetname_t	*sp,
593 	md_replica_t	*rp,
594 	mdname_t	*np,
595 	diskaddr_t	slblk,
596 	diskaddr_t	nblks,
597 	md_error_t	*ep
598 )
599 {
600 	mdname_t	*repnp = rp->r_namep;
601 	diskaddr_t	rep_sblk = rp->r_blkno;
602 	diskaddr_t	rep_nblks = rp->r_nblk;
603 
604 	/* should be in the same set */
605 	assert(sp != NULL);
606 
607 	/* if error in master block, assume whole partition */
608 	if ((rep_sblk == MD_DISKADDR_ERROR) ||
609 	    (rep_nblks == MD_DISKADDR_ERROR)) {
610 		rep_sblk = 0;
611 		rep_nblks = MD_DISKADDR_ERROR;
612 	}
613 
614 	/* check overlap */
615 	if (meta_check_overlap(
616 	    MDB_STR, np, slblk, nblks, repnp, rep_sblk, rep_nblks, ep) != 0) {
617 		return (-1);
618 	}
619 
620 	/* return success */
621 	return (0);
622 }
623 
624 /*
625  * check to see if we're in a replica
626  */
627 int
meta_check_inreplica(mdsetname_t * sp,mdname_t * np,diskaddr_t slblk,diskaddr_t nblks,md_error_t * ep)628 meta_check_inreplica(
629 	mdsetname_t		*sp,
630 	mdname_t		*np,
631 	diskaddr_t		slblk,
632 	diskaddr_t		nblks,
633 	md_error_t		*ep
634 )
635 {
636 	md_replicalist_t	*rlp = NULL;
637 	md_replicalist_t	*rl;
638 	int			rval = 0;
639 
640 	/* should have a set */
641 	assert(sp != NULL);
642 
643 	/* for each replica */
644 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0)
645 		return (-1);
646 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
647 		md_replica_t	*rp = rl->rl_repp;
648 
649 		/* check replica */
650 		if (in_replica(sp, rp, np, slblk, nblks, ep) != 0) {
651 			rval = -1;
652 			break;
653 		}
654 	}
655 
656 	/* cleanup, return success */
657 	metafreereplicalist(rlp);
658 	return (rval);
659 }
660 
661 /*
662  * check replica
663  */
664 int
meta_check_replica(mdsetname_t * sp,mdname_t * np,mdchkopts_t options,diskaddr_t slblk,diskaddr_t nblks,md_error_t * ep)665 meta_check_replica(
666 	mdsetname_t	*sp,		/* set to check against */
667 	mdname_t	*np,		/* component to check against */
668 	mdchkopts_t	options,	/* option flags */
669 	diskaddr_t	slblk,		/* start logical block */
670 	diskaddr_t	nblks,		/* number of blocks (-1,rest of them) */
671 	md_error_t	*ep		/* error packet */
672 )
673 {
674 	mdchkopts_t	chkoptions = MDCHK_ALLOW_REPSLICE;
675 
676 	/* make sure we have a disk */
677 	if (metachkcomp(np, ep) != 0)
678 		return (-1);
679 
680 	/* check to ensure that it is not already in use */
681 	if (meta_check_inuse(sp, np, MDCHK_INUSE, ep) != 0) {
682 		return (-1);
683 	}
684 
685 	if (options & MDCHK_ALLOW_NODBS)
686 		return (0);
687 
688 	if (options & MDCHK_DRVINSET)
689 		return (0);
690 
691 	/* make sure it is in the set */
692 	if (meta_check_inset(sp, np, ep) != 0)
693 		return (-1);
694 
695 	/* make sure its not in a metadevice */
696 	if (meta_check_inmeta(sp, np, chkoptions, slblk, nblks, ep) != 0)
697 		return (-1);
698 
699 	/* return success */
700 	return (0);
701 }
702 
703 static int
update_dbinfo_on_drives(mdsetname_t * sp,md_drive_desc * dd,int set_locked,int force,md_error_t * ep)704 update_dbinfo_on_drives(
705 	mdsetname_t	*sp,
706 	md_drive_desc	*dd,
707 	int		set_locked,
708 	int		force,
709 	md_error_t	*ep
710 )
711 {
712 	md_set_desc		*sd;
713 	int			i;
714 	md_setkey_t		*cl_sk;
715 	int			rval = 0;
716 	md_mnnode_desc		*nd;
717 
718 	if ((sd = metaget_setdesc(sp, ep)) == NULL)
719 		return (-1);
720 
721 	if (! set_locked) {
722 		if (MD_MNSET_DESC(sd)) {
723 			md_error_t xep = mdnullerror;
724 			sigset_t sigs;
725 			/* Make sure we are blocking all signals */
726 			if (procsigs(TRUE, &sigs, &xep) < 0)
727 				mdclrerror(&xep);
728 
729 			nd = sd->sd_nodelist;
730 			while (nd) {
731 				if (force && strcmp(nd->nd_nodename,
732 				    mynode()) != 0) {
733 					nd = nd->nd_next;
734 					continue;
735 				}
736 
737 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
738 					nd = nd->nd_next;
739 					continue;
740 				}
741 
742 				if (clnt_lock_set(nd->nd_nodename, sp, ep))
743 					return (-1);
744 				nd = nd->nd_next;
745 			}
746 		} else {
747 			for (i = 0; i < MD_MAXSIDES; i++) {
748 				/* Skip empty slots */
749 				if (sd->sd_nodes[i][0] == '\0')
750 					continue;
751 
752 				if (force && strcmp(sd->sd_nodes[i],
753 				    mynode()) != 0)
754 					continue;
755 
756 				if (clnt_lock_set(sd->sd_nodes[i], sp, ep))
757 					return (-1);
758 			}
759 		}
760 	}
761 
762 	if (MD_MNSET_DESC(sd)) {
763 		nd = sd->sd_nodelist;
764 		while (nd) {
765 			if (force && strcmp(nd->nd_nodename, mynode()) != 0) {
766 				nd = nd->nd_next;
767 				continue;
768 			}
769 
770 			if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
771 				nd = nd->nd_next;
772 				continue;
773 			}
774 
775 			if (clnt_upd_dr_dbinfo(nd->nd_nodename, sp, dd, ep)
776 			    == -1) {
777 				rval = -1;
778 				break;
779 			}
780 			nd = nd->nd_next;
781 		}
782 	} else {
783 		for (i = 0; i < MD_MAXSIDES; i++) {
784 			/* Skip empty slots */
785 			if (sd->sd_nodes[i][0] == '\0')
786 				continue;
787 
788 			if (force && strcmp(sd->sd_nodes[i], mynode()) != 0)
789 				continue;
790 
791 			if (clnt_upd_dr_dbinfo(sd->sd_nodes[i], sp, dd, ep)
792 			    == -1) {
793 				rval = -1;
794 				break;
795 			}
796 		}
797 	}
798 
799 	if (! set_locked) {
800 		cl_sk = cl_get_setkey(sp->setno, sp->setname);
801 		if (MD_MNSET_DESC(sd)) {
802 			nd = sd->sd_nodelist;
803 			while (nd) {
804 				if (force &&
805 				    strcmp(nd->nd_nodename, mynode()) != 0) {
806 					nd = nd->nd_next;
807 					continue;
808 				}
809 
810 				if (!(nd->nd_flags & MD_MN_NODE_ALIVE)) {
811 					nd = nd->nd_next;
812 					continue;
813 				}
814 
815 				if (clnt_unlock_set(nd->nd_nodename, cl_sk,
816 				    ep)) {
817 					rval = -1;
818 					break;
819 				}
820 				nd = nd->nd_next;
821 			}
822 		} else {
823 			for (i = 0; i < MD_MAXSIDES; i++) {
824 				/* Skip empty slots */
825 				if (sd->sd_nodes[i][0] == '\0')
826 					continue;
827 
828 				if (force &&
829 				    strcmp(sd->sd_nodes[i], mynode()) != 0)
830 					continue;
831 
832 				if (clnt_unlock_set(sd->sd_nodes[i], cl_sk,
833 				    ep)) {
834 					rval = -1;
835 					break;
836 				}
837 			}
838 
839 		}
840 		cl_set_setkey(NULL);
841 	}
842 
843 	return (rval);
844 }
845 
846 int
meta_db_addsidenms(mdsetname_t * sp,mdname_t * np,daddr_t blkno,int bcast,md_error_t * ep)847 meta_db_addsidenms(
848 	mdsetname_t	*sp,
849 	mdname_t	*np,
850 	daddr_t		blkno,
851 	int		bcast,
852 	md_error_t	*ep
853 )
854 {
855 	side_t		sideno;
856 	char		*bname = NULL;
857 	char		*dname = NULL;
858 	minor_t		mnum;
859 	mddb_config_t	c;
860 	int		done;
861 	int		rval = 0;
862 	md_set_desc	*sd;
863 
864 	sideno = MD_SIDEWILD;
865 	/*CONSTCOND*/
866 	while (1) {
867 		if (bname != NULL) {
868 			Free(bname);
869 			bname = NULL;
870 		}
871 		if (dname != NULL) {
872 			Free(dname);
873 			dname = NULL;
874 		}
875 		if ((done = meta_getnextside_devinfo(sp, np->bname,
876 		    &sideno, &bname, &dname, &mnum, ep)) == -1) {
877 			rval = -1;
878 			break;
879 		}
880 
881 		if (done == 0)
882 			break;
883 
884 		if (! metaislocalset(sp)) {
885 			if ((sd = metaget_setdesc(sp, ep)) == NULL) {
886 				rval = -1;
887 				break;
888 			}
889 		}
890 
891 		/*
892 		 * Send addsidenms to all nodes using rpc.mdcommd if
893 		 * sidename is being added to MN diskset.
894 		 *
895 		 *   It's ok to broadcast this call to other nodes.
896 		 *
897 		 *   Note: The broadcast to other nodes isn't needed during
898 		 *   the addition of the first mddbs to the set since the
899 		 *   other nodes haven't been joined to the set yet.  All
900 		 *   nodes in a MN diskset are (implicitly) joined to the set
901 		 *   on the addition of the first mddb.
902 		 */
903 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
904 		    (bcast == DB_ADDSIDENMS_BCAST)) {
905 			md_mn_result_t			*resultp = NULL;
906 			md_mn_msg_meta_db_newside_t	db_ns;
907 			int				send_rval;
908 
909 			db_ns.msg_l_dev = np->dev;
910 			db_ns.msg_sideno = sideno;
911 			db_ns.msg_blkno = blkno;
912 			(void) strncpy(db_ns.msg_dname, dname,
913 			    sizeof (db_ns.msg_dname));
914 			(void) splitname(np->bname, &db_ns.msg_splitname);
915 			db_ns.msg_mnum = mnum;
916 
917 			/* Set devid to NULL until devids are supported */
918 			db_ns.msg_devid[0] = NULL;
919 
920 			/*
921 			 * If reconfig cycle has been started, this node is
922 			 * stuck in in the return step until this command has
923 			 * completed.  If mdcommd is suspended, ask
924 			 * send_message to fail (instead of retrying)
925 			 * so that metaset can finish allowing the reconfig
926 			 * cycle to proceed.
927 			 */
928 			send_rval = mdmn_send_message(sp->setno,
929 			    MD_MN_MSG_META_DB_NEWSIDE, MD_MSGF_FAIL_ON_SUSPEND |
930 			    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ns,
931 			    sizeof (md_mn_msg_meta_db_newside_t),
932 			    &resultp, ep);
933 			if (send_rval != 0) {
934 				rval = -1;
935 				if (resultp == NULL)
936 					(void) mddserror(ep,
937 					    MDE_DS_COMMD_SEND_FAIL,
938 					    sp->setno, NULL, NULL,
939 					    sp->setname);
940 				else {
941 					(void) mdstealerror(ep,
942 					    &(resultp->mmr_ep));
943 					if (mdisok(ep)) {
944 						(void) mddserror(ep,
945 						    MDE_DS_COMMD_SEND_FAIL,
946 						    sp->setno, NULL, NULL,
947 						    sp->setname);
948 					}
949 					free_result(resultp);
950 				}
951 				break;
952 			}
953 			if (resultp)
954 				free_result(resultp);
955 		} else {
956 			/*
957 			 * Let this side's  device name, minor # and driver name
958 			 * be known to the database replica.
959 			 */
960 			(void) memset(&c, 0, sizeof (c));
961 
962 			/* Fill in device/replica info */
963 			c.c_locator.l_dev = meta_cmpldev(np->dev);
964 			c.c_locator.l_blkno = blkno;
965 			(void) strncpy(c.c_locator.l_driver, dname,
966 			    sizeof (c.c_locator.l_driver));
967 			if (splitname(np->bname, &c.c_devname) ==
968 			    METASPLIT_LONGDISKNAME && devid_in_use == FALSE) {
969 				rval = mddeverror(ep, MDE_DISKNAMETOOLONG,
970 				    NODEV64, np->rname);
971 				break;
972 			}
973 
974 			c.c_locator.l_mnum = mnum;
975 
976 			/* Fill in setno, setname, and sideno */
977 			c.c_setno = sp->setno;
978 			(void) strncpy(c.c_setname, sp->setname,
979 			    sizeof (c.c_setname));
980 			c.c_sideno = sideno;
981 
982 			/*
983 			 * Don't need device id information from this ioctl
984 			 * Kernel determines device id from dev_t, which
985 			 * is just what this code would do.
986 			 */
987 			c.c_locator.l_devid = (uint64_t)0;
988 			c.c_locator.l_devid_flags = 0;
989 
990 			if (metaioctl(MD_DB_NEWSIDE, &c, &c.c_mde, NULL) != 0) {
991 				rval = mdstealerror(ep, &c.c_mde);
992 				break;
993 			}
994 		}
995 	}
996 
997 	/* cleanup, return success */
998 	if (bname != NULL) {
999 		Free(bname);
1000 		bname = NULL;
1001 	}
1002 	if (dname != NULL) {
1003 		Free(dname);
1004 		dname = NULL;
1005 	}
1006 	return (rval);
1007 }
1008 
1009 
1010 int
meta_db_delsidenm(mdsetname_t * sp,side_t sideno,mdname_t * np,daddr_t blkno,md_error_t * ep)1011 meta_db_delsidenm(
1012 	mdsetname_t	*sp,
1013 	side_t		sideno,
1014 	mdname_t	*np,
1015 	daddr_t		blkno,
1016 	md_error_t	*ep
1017 )
1018 {
1019 	mddb_config_t	c;
1020 	md_set_desc	*sd;
1021 
1022 	if (! metaislocalset(sp)) {
1023 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1024 			return (-1);
1025 	}
1026 	/* Use rpc.mdcommd to delete mddb side from all nodes */
1027 	if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1028 	    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1029 		md_mn_result_t			*resultp = NULL;
1030 		md_mn_msg_meta_db_delside_t	db_ds;
1031 		int				send_rval;
1032 
1033 		db_ds.msg_l_dev = np->dev;
1034 		db_ds.msg_blkno = blkno;
1035 		db_ds.msg_sideno = sideno;
1036 
1037 		/* Set devid to NULL until devids are supported */
1038 		db_ds.msg_devid[0] = NULL;
1039 
1040 		/*
1041 		 * If reconfig cycle has been started, this node is
1042 		 * stuck in in the return step until this command has
1043 		 * completed.  If mdcommd is suspended, ask
1044 		 * send_message to fail (instead of retrying)
1045 		 * so that metaset can finish allowing the reconfig
1046 		 * cycle to proceed.
1047 		 */
1048 		send_rval = mdmn_send_message(sp->setno,
1049 		    MD_MN_MSG_META_DB_DELSIDE, MD_MSGF_FAIL_ON_SUSPEND |
1050 		    MD_MSGF_PANIC_WHEN_INCONSISTENT, 0, (char *)&db_ds,
1051 		    sizeof (md_mn_msg_meta_db_delside_t), &resultp, ep);
1052 		if (send_rval != 0) {
1053 			if (resultp == NULL)
1054 				(void) mddserror(ep,
1055 				    MDE_DS_COMMD_SEND_FAIL,
1056 				    sp->setno, NULL, NULL,
1057 				    sp->setname);
1058 			else {
1059 				(void) mdstealerror(ep, &(resultp->mmr_ep));
1060 				if (mdisok(ep)) {
1061 					(void) mddserror(ep,
1062 					    MDE_DS_COMMD_SEND_FAIL,
1063 					    sp->setno, NULL, NULL,
1064 					    sp->setname);
1065 				}
1066 				free_result(resultp);
1067 			}
1068 			return (-1);
1069 		}
1070 		if (resultp)
1071 			free_result(resultp);
1072 
1073 	} else {
1074 		/*
1075 		 * Let this side's  device name, minor # and driver name
1076 		 * be known to the database replica.
1077 		 */
1078 		(void) memset(&c, 0, sizeof (c));
1079 
1080 		/* Fill in device/replica info */
1081 		c.c_locator.l_dev = meta_cmpldev(np->dev);
1082 		c.c_locator.l_blkno = blkno;
1083 
1084 		/* Fill in setno, setname, and sideno */
1085 		c.c_setno = sp->setno;
1086 		(void) strcpy(c.c_setname, sp->setname);
1087 		c.c_sideno = sideno;
1088 
1089 		/*
1090 		 * Don't need device id information from this ioctl
1091 		 * Kernel determines device id from dev_t, which
1092 		 * is just what this code would do.
1093 		 */
1094 		c.c_locator.l_devid = (uint64_t)0;
1095 		c.c_locator.l_devid_flags = 0;
1096 
1097 		if (metaioctl(MD_DB_DELSIDE, &c, &c.c_mde, NULL) != 0)
1098 			return (mdstealerror(ep, &c.c_mde));
1099 	}
1100 	return (0);
1101 }
1102 
1103 
1104 static int
mdnamesareunique(mdnamelist_t * nlp,md_error_t * ep)1105 mdnamesareunique(mdnamelist_t *nlp, md_error_t *ep)
1106 {
1107 	mdnamelist_t		*dnp1, *dnp2;
1108 
1109 	for (dnp1 = nlp; dnp1 != NULL; dnp1 = dnp1->next) {
1110 		for (dnp2 = dnp1->next; dnp2 != NULL; dnp2 = dnp2->next) {
1111 			if (strcmp(dnp1->namep->cname, dnp2->namep->cname) == 0)
1112 				return (mderror(ep, MDE_DUPDRIVE,
1113 				    dnp1->namep->cname));
1114 		}
1115 	}
1116 	return (0);
1117 }
1118 
1119 
/*
 * Compare the contents of the files tsname and sname.
 *
 * Return 1 if files are different, else return 0.  The files are also
 * reported as different when either one cannot be examined, opened or
 * read completely, or when memory for the comparison cannot be
 * allocated.
 */
static int
filediff(char *tsname, char *sname)
{
	int		ret = 1;
	int		fd;
	size_t		tsz;
	ssize_t		got;
	struct stat	sbuf;
	char		*tbuf = NULL;
	char		*buf = NULL;

	if (stat(tsname, &sbuf) != 0)
		return (1);
	tsz = sbuf.st_size;
	if (stat(sname, &sbuf) != 0)
		return (1);
	if (tsz != (size_t)sbuf.st_size)
		return (1);

	/*
	 * Two empty files are identical.  Deciding this here also avoids
	 * malloc(0), which is allowed to return NULL and would otherwise
	 * be mistaken for an allocation failure.
	 */
	if (tsz == 0)
		return (0);

	/* allocate memory and read both files into buffer */
	tbuf = malloc(tsz);
	buf = malloc(tsz);
	if (tbuf == NULL || buf == NULL)
		goto out;

	fd = open(tsname, O_RDONLY);
	if (fd == -1)
		goto out;
	got = read(fd, tbuf, tsz);
	(void) close(fd);
	if (got < 0 || (size_t)got != tsz)
		goto out;

	fd = open(sname, O_RDONLY);
	if (fd == -1)
		goto out;
	got = read(fd, buf, tsz);
	(void) close(fd);
	if (got < 0 || (size_t)got != tsz)
		goto out;

	/* compare content, reporting strictly 0 or 1 */
	ret = (memcmp(tbuf, buf, tsz) != 0) ? 1 : 0;
out:
	free(tbuf);
	free(buf);
	return (ret);
}
1171 
1172 /*
1173  * patch md.conf file with mddb locations
1174  */
1175 int
meta_db_patch(char * sname,char * cname,int patch,md_error_t * ep)1176 meta_db_patch(
1177 	char		*sname,		/* system file name */
1178 	char		*cname,		/* mddb.cf file name */
1179 	int		patch,		/* patching locally */
1180 	md_error_t	*ep
1181 )
1182 {
1183 	char		*tsname = NULL;
1184 	char		line[MDDB_BOOTLIST_MAX_LEN];
1185 	FILE		*tsfp = NULL;
1186 	FILE		*mfp = NULL;
1187 	int		rval = -1;
1188 
1189 	/* check names */
1190 	if (sname == NULL) {
1191 		if (patch)
1192 			sname = "md.conf";
1193 		else
1194 			sname = "/kernel/drv/md.conf";
1195 	}
1196 	if (cname == NULL)
1197 		cname = META_DBCONF;
1198 
1199 	/*
1200 	 * edit file
1201 	 */
1202 	if (meta_systemfile_copy(sname, 0, 1, 1, 0, &tsname, &tsfp, ep) != 0) {
1203 		if (mdissyserror(ep, EROFS)) {
1204 			/*
1205 			 * If we are booted on a read-only root because
1206 			 * of mddb quorum problems we don't want to emit
1207 			 * any scary error messages.
1208 			 */
1209 			mdclrerror(ep);
1210 			rval = 0;
1211 		}
1212 		goto out;
1213 	}
1214 
1215 	if (meta_systemfile_append_mddb(cname, sname, tsname, tsfp, 1, 0, 0,
1216 	    ep) != 0)
1217 		goto out;
1218 
1219 	/* if file content is identical, skip rename */
1220 	if (filediff(tsname, sname) == 0) {
1221 		rval = 0;
1222 		goto out;
1223 	}
1224 
1225 	if ((fflush(tsfp) != 0) || (fsync(fileno(tsfp)) != 0) ||
1226 	    (fclose(tsfp) != 0)) {
1227 		(void) mdsyserror(ep, errno, tsname);
1228 		goto out;
1229 	}
1230 
1231 	tsfp = NULL;
1232 
1233 	/*
1234 	 * rename file. If we get a Cross Device error then it
1235 	 * is because we are in the miniroot.
1236 	 */
1237 	if (rename(tsname, sname) != 0 && errno != EXDEV) {
1238 		(void) mdsyserror(ep, errno, sname);
1239 		goto out;
1240 	}
1241 
1242 	if (errno == EXDEV) {
1243 		if ((tsfp = fopen(tsname, "r")) == NULL)
1244 			goto out;
1245 		if ((mfp = fopen(sname, "w+")) == NULL)
1246 			goto out;
1247 		while (fgets(line, sizeof (line), tsfp) != NULL) {
1248 			if (fputs(line, mfp) == NULL)
1249 				goto out;
1250 		}
1251 		(void) fclose(tsfp);
1252 		tsfp = NULL;
1253 		if (fflush(mfp) != 0)
1254 			goto out;
1255 		if (fsync(fileno(mfp)) != 0)
1256 			goto out;
1257 		if (fclose(mfp) != 0) {
1258 			mfp = NULL;
1259 			goto out;
1260 		}
1261 	}
1262 
1263 	Free(tsname);
1264 	tsname = NULL;
1265 	rval = 0;
1266 
1267 	/* cleanup, return error */
1268 out:
1269 	if (tsfp != NULL)
1270 		(void) fclose(tsfp);
1271 	if (tsname != NULL) {
1272 		(void) unlink(tsname);
1273 		Free(tsname);
1274 	}
1275 	return (rval);
1276 }
1277 
1278 /*
1279  * Add replicas to set.  This happens as a result of:
1280  *	- metadb [-s set_name] -a
1281  *	- metaset -s set_name -a disk
1282  *	- metaset -s set_name -d disk	 (causes a rebalance of mddbs)
1283  *	- metaset -s set_name -b
1284  *
1285  * For a local set, this routine is run on the local set host.
1286  *
1287  * For a traditional diskset, this routine is run on the node that
1288  * is running the metaset command.
1289  *
1290  * For a multinode diskset, this routine is run by the node that is
1291  * running the metaset command.  If this is the first mddb added to
1292  * the MN diskset, then no communication is made to other nodes via commd
1293  * since the other nodes will be in-sync with respect to the mddbs when
1294  * those other nodes join the set and snarf in the newly created mddb.
1295  * If this is not the first mddb added to the MN diskset, then this
1296  * attach command is sent to all of the nodes using commd.  This keeps
1297  * the nodes in-sync.
1298  */
1299 int
meta_db_attach(mdsetname_t * sp,mdnamelist_t * db_nlp,mdchkopts_t options,md_timeval32_t * timeval,int dbcnt,int dbsize,char * sysfilename,md_error_t * ep)1300 meta_db_attach(
1301 	mdsetname_t		*sp,
1302 	mdnamelist_t		*db_nlp,
1303 	mdchkopts_t		options,
1304 	md_timeval32_t		*timeval,
1305 	int			dbcnt,
1306 	int			dbsize,
1307 	char			*sysfilename,
1308 	md_error_t		*ep
1309 )
1310 {
1311 	struct mddb_config	c;
1312 	mdnamelist_t		*nlp;
1313 	mdname_t		*np;
1314 	md_drive_desc		*dd = NULL;
1315 	md_drive_desc		*p;
1316 	int			i;
1317 	int			fd;
1318 	side_t			sideno;
1319 	daddr_t			blkno;
1320 	int			replicacount = 0;
1321 	int			start_svmdaemons = 0;
1322 	int			rval = 0;
1323 	md_error_t		status = mdnullerror;
1324 	md_set_desc		*sd;
1325 	int			stale_bool = FALSE;
1326 	int			flags;
1327 	int			firstmddb = 1;
1328 	md_timeval32_t		inittime = {0, 0};
1329 
1330 	/*
1331 	 * Error if we don't get some work to do.
1332 	 */
1333 	if (db_nlp == NULL)
1334 		return (mdsyserror(ep, EINVAL, NULL));
1335 
1336 	if (mdnamesareunique(db_nlp, ep) != 0)
1337 		return (-1);
1338 	(void) memset(&c, 0, sizeof (c));
1339 	c.c_id = 0;
1340 	c.c_setno = sp->setno;
1341 
1342 	/* Don't need device id information from this ioctl */
1343 	c.c_locator.l_devid = (uint64_t)0;
1344 	c.c_locator.l_devid_flags = 0;
1345 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
1346 		if (metaislocalset(sp)) {
1347 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID))
1348 				mdclrerror(&c.c_mde);
1349 			else if (! mdismddberror(&c.c_mde, MDE_DB_NODB) ||
1350 			    (! (options & MDCHK_ALLOW_NODBS)))
1351 				return (mdstealerror(ep, &c.c_mde));
1352 		} else {
1353 			if (! mdismddberror(&c.c_mde, MDE_DB_NOTOWNER))
1354 				return (mdstealerror(ep, &c.c_mde));
1355 		}
1356 		mdclrerror(&c.c_mde);
1357 	}
1358 	/*
1359 	 * Is current set STALE?
1360 	 */
1361 	if (c.c_flags & MDDB_C_STALE) {
1362 		stale_bool = TRUE;
1363 	}
1364 
1365 	assert(db_nlp != NULL);
1366 
1367 	/* if these are the first replicas then the SVM daemons need to run */
1368 	if (c.c_dbcnt == 0)
1369 		start_svmdaemons = 1;
1370 
1371 	/*
1372 	 * check to see if we will go over the total possible number
1373 	 * of data bases
1374 	 */
1375 	nlp = db_nlp;
1376 	while (nlp) {
1377 		replicacount += dbcnt;
1378 		nlp = nlp->next;
1379 	}
1380 
1381 	if ((replicacount + c.c_dbcnt) > c.c_dbmax)
1382 		return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
1383 		    sp->setno, c.c_dbcnt + replicacount, NULL));
1384 
1385 	/*
1386 	 * go through and check to make sure all locations specified
1387 	 * are legal also pick out driver name;
1388 	 */
1389 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1390 		diskaddr_t devsize;
1391 
1392 		np = nlp->namep;
1393 
1394 		if (! metaislocalset(sp)) {
1395 			uint_t	partno;
1396 			uint_t	rep_partno;
1397 			mddrivename_t	*dnp = np->drivenamep;
1398 
1399 			/*
1400 			 * make sure that non-local database replicas
1401 			 * are always on the replica slice.
1402 			 */
1403 			if (meta_replicaslice(dnp,
1404 			    &rep_partno, ep) != 0)
1405 				return (-1);
1406 			if (metagetvtoc(np, FALSE, &partno, ep) == NULL)
1407 				return (-1);
1408 			if (partno != rep_partno)
1409 				return (mddeverror(ep, MDE_REPCOMP_ONLY,
1410 				    np->dev, sp->setname));
1411 		}
1412 
1413 		if (meta_check_replica(sp, np, options, 0, (dbcnt * dbsize),
1414 		    ep)) {
1415 			return (-1);
1416 		}
1417 
1418 		if ((devsize = metagetsize(np, ep)) == -1)
1419 			return (-1);
1420 
1421 		if (devsize < (diskaddr_t)((dbcnt * dbsize) + 16))
1422 			return (mdmddberror(ep, MDE_REPLICA_TOOSMALL,
1423 			    meta_getminor(np->dev), sp->setno, devsize,
1424 			    np->cname));
1425 	}
1426 
1427 	/*
1428 	 * If first disk in set we don't have lb_inittime yet for use as
1429 	 * mb_setcreatetime so don't go looking for it. WE'll come back
1430 	 * later and update after the locator block has been created.
1431 	 * If this isn't the first disk in the set, we have a locator
1432 	 * block and thus we have lb_inittime. Set mb_setcreatetime to
1433 	 * lb_inittime.
1434 	 */
1435 	if (! metaislocalset(sp)) {
1436 		if (c.c_dbcnt != 0) {
1437 			firstmddb = 0;
1438 			inittime = meta_get_lb_inittime(sp, ep);
1439 		}
1440 	}
1441 
1442 	/*
1443 	 * go through and write all master blocks
1444 	 */
1445 
1446 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1447 		np = nlp->namep;
1448 
1449 		if ((fd = open(np->rname, O_RDWR)) < 0)
1450 			return (mdsyserror(ep, errno, np->rname));
1451 
1452 		for (i = 0; i < dbcnt; i++) {
1453 			if (mkmasterblks(sp, np, fd, (i * dbsize + 16), dbsize,
1454 			    inittime, ep)) {
1455 				(void) close(fd);
1456 				return (-1);
1457 			}
1458 		}
1459 		(void) close(fd);
1460 	}
1461 
1462 	if ((sideno = getmyside(sp, ep)) == MD_SIDEWILD)
1463 		return (-1);
1464 
1465 	if (! metaislocalset(sp)) {
1466 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1467 		if (! mdisok(ep))
1468 			return (-1);
1469 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1470 			return (-1);
1471 
1472 	}
1473 
1474 	/*
1475 	 * go through and tell kernel to add them
1476 	 */
1477 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1478 		mdcinfo_t	*cinfo;
1479 
1480 		np = nlp->namep;
1481 
1482 		if ((cinfo = metagetcinfo(np, ep)) == NULL) {
1483 			rval = -1;
1484 			goto out;
1485 		}
1486 
1487 		/*
1488 		 * If mddb is being added to MN diskset and there already
1489 		 * exists a valid mddb in the set (which equates to this
1490 		 * node being an owner of the set) then use rpc.mdcommd
1491 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1492 		 * If set is stale, don't log the message since rpc.mdcommd
1493 		 * can't write the message to the mddb.
1494 		 *
1495 		 * Otherwise, just add mddb to this node.
1496 		 */
1497 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1498 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1499 			md_mn_result_t			*resultp = NULL;
1500 			md_mn_msg_meta_db_attach_t	attach;
1501 			int 				send_rval;
1502 
1503 			/*
1504 			 * In a scenario where new replicas had been added on
1505 			 * the master, and then all of the old replicas failed
1506 			 * before the slaves had knowledge of the new replicas,
1507 			 * the slaves are unable to re-parse in the mddb
1508 			 * from the new replicas since the slaves have no
1509 			 * knowledge of the new replicas.  The following
1510 			 * algorithm solves this problem:
1511 			 * 	- META_DB_ATTACH message generates submsgs
1512 			 * 		- BLOCK parse (master)
1513 			 * 		- MDDB_ATTACH new replicas
1514 			 * 		- UNBLOCK parse (master) causing parse
1515 			 *		information to be sent from master
1516 			 *		to slaves at a higher class than the
1517 			 *		unblock so the parse message will
1518 			 *		reach slaves before unblock message.
1519 			 */
1520 			attach.msg_l_dev = np->dev;
1521 			attach.msg_cnt = dbcnt;
1522 			attach.msg_dbsize = dbsize;
1523 			(void) strncpy(attach.msg_dname, cinfo->dname,
1524 			    sizeof (attach.msg_dname));
1525 			(void) splitname(np->bname, &attach.msg_splitname);
1526 			attach.msg_options = options;
1527 
1528 			/* Set devid to NULL until devids are supported */
1529 			attach.msg_devid[0] = NULL;
1530 
1531 			/*
1532 			 * If reconfig cycle has been started, this node is
1533 			 * stuck in in the return step until this command has
1534 			 * completed.  If mdcommd is suspended, ask
1535 			 * send_message to fail (instead of retrying)
1536 			 * so that metaset can finish allowing the reconfig
1537 			 * cycle to proceed.
1538 			 */
1539 			flags = MD_MSGF_FAIL_ON_SUSPEND;
1540 			if (stale_bool == TRUE)
1541 				flags |= MD_MSGF_NO_LOG;
1542 			send_rval = mdmn_send_message(sp->setno,
1543 			    MD_MN_MSG_META_DB_ATTACH,
1544 			    flags, 0, (char *)&attach,
1545 			    sizeof (md_mn_msg_meta_db_attach_t),
1546 			    &resultp, ep);
1547 			if (send_rval != 0) {
1548 				rval = -1;
1549 				if (resultp == NULL)
1550 					(void) mddserror(ep,
1551 					    MDE_DS_COMMD_SEND_FAIL,
1552 					    sp->setno, NULL, NULL,
1553 					    sp->setname);
1554 				else {
1555 					(void) mdstealerror(ep,
1556 					    &(resultp->mmr_ep));
1557 					if (mdisok(ep)) {
1558 						(void) mddserror(ep,
1559 						    MDE_DS_COMMD_SEND_FAIL,
1560 						    sp->setno, NULL, NULL,
1561 						    sp->setname);
1562 					}
1563 					free_result(resultp);
1564 				}
1565 				goto out;
1566 			}
1567 			if (resultp)
1568 				free_result(resultp);
1569 		} else {
1570 			/* Adding mddb(s) to just this node */
1571 			for (i = 0; i < dbcnt; i++) {
1572 				(void) memset(&c, 0, sizeof (c));
1573 				/* Fill in device/replica info */
1574 				c.c_locator.l_dev = meta_cmpldev(np->dev);
1575 				c.c_locator.l_blkno = i * dbsize + 16;
1576 				blkno = c.c_locator.l_blkno;
1577 				(void) strncpy(c.c_locator.l_driver,
1578 				    cinfo->dname,
1579 				    sizeof (c.c_locator.l_driver));
1580 
1581 				if (splitname(np->bname, &c.c_devname) ==
1582 				    METASPLIT_LONGDISKNAME && devid_in_use ==
1583 				    FALSE) {
1584 					rval = mddeverror(ep,
1585 					    MDE_DISKNAMETOOLONG,
1586 					    NODEV64, np->rname);
1587 					goto out;
1588 				}
1589 
1590 				c.c_locator.l_mnum = meta_getminor(np->dev);
1591 
1592 				/* Fill in setno, setname, and sideno */
1593 				c.c_setno = sp->setno;
1594 				if (! metaislocalset(sp)) {
1595 					if (MD_MNSET_DESC(sd)) {
1596 						c.c_multi_node = 1;
1597 					}
1598 				}
1599 				(void) strcpy(c.c_setname, sp->setname);
1600 				c.c_sideno = sideno;
1601 
1602 				/*
1603 				 * Don't need device id information from this
1604 				 * ioctl Kernel determines device id from
1605 				 * dev_t, which is just what this code would do.
1606 				 */
1607 				c.c_locator.l_devid = (uint64_t)0;
1608 				c.c_locator.l_devid_flags = 0;
1609 
1610 				if (timeval != NULL)
1611 					c.c_timestamp = *timeval;
1612 
1613 				if (setup_med_cfg(sp, &c,
1614 				    (options & MDCHK_SET_FORCE), ep)) {
1615 					rval = -1;
1616 					goto out;
1617 				}
1618 
1619 				if (metaioctl(MD_DB_NEWDEV, &c, &c.c_mde,
1620 				    NULL) != 0) {
1621 					rval = mdstealerror(ep, &c.c_mde);
1622 					goto out;
1623 				}
1624 				/*
1625 				 * This is either a traditional diskset OR this
1626 				 * is the first replica added to a MN diskset.
1627 				 * In either case, set broadcast to NO_BCAST so
1628 				 * that message won't go through rpc.mdcommd.
1629 				 * If this is a traditional diskset, the bcast
1630 				 * flag is ignored since traditional disksets
1631 				 * don't use the rpc.mdcommd.
1632 				 */
1633 				if (meta_db_addsidenms(sp, np, blkno,
1634 				    DB_ADDSIDENMS_NO_BCAST, ep))
1635 					goto out;
1636 			}
1637 		}
1638 		if (! metaislocalset(sp)) {
1639 			/* update the dbcnt and size in dd */
1640 			for (p = dd; p != NULL; p = p->dd_next)
1641 				if (p->dd_dnp == np->drivenamep) {
1642 					p->dd_dbcnt = dbcnt;
1643 					p->dd_dbsize  = dbsize;
1644 					break;
1645 				}
1646 		}
1647 
1648 		/*
1649 		 * If this was the first addition of disks to the
1650 		 * diskset you now need to update the mb_setcreatetime
1651 		 * which needed lb_inittime which wasn't there until now.
1652 		 */
1653 		if (firstmddb) {
1654 			if (meta_update_mb(sp, dd, ep) != 0) {
1655 				return (-1);
1656 			}
1657 		}
1658 		(void) close(fd);
1659 	}
1660 
1661 out:
1662 	if (metaislocalset(sp)) {
1663 
1664 		/* everything looks fine. Start mdmonitord */
1665 		if (rval == 0 && start_svmdaemons == 1) {
1666 			if (meta_smf_enable(META_SMF_CORE, &status) == -1) {
1667 				mde_perror(&status, "");
1668 				mdclrerror(&status);
1669 			}
1670 		}
1671 
1672 		if (buildconf(sp, &status)) {
1673 			/* Don't mask any previous errors */
1674 			if (rval == 0)
1675 				rval = mdstealerror(ep, &status);
1676 			return (rval);
1677 		}
1678 
1679 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
1680 			/* Don't mask any previous errors */
1681 			if (rval == 0)
1682 				rval = mdstealerror(ep, &status);
1683 		}
1684 	} else {
1685 		if (update_dbinfo_on_drives(sp, dd,
1686 		    (options & MDCHK_SET_LOCKED),
1687 		    (options & MDCHK_SET_FORCE),
1688 		    &status)) {
1689 			/* Don't mask any previous errors */
1690 			if (rval == 0)
1691 				rval = mdstealerror(ep, &status);
1692 			else
1693 				mdclrerror(&status);
1694 		}
1695 		metafreedrivedesc(&dd);
1696 	}
1697 	/*
1698 	 * For MN disksets that already had already had nodes joined
1699 	 * before the attach of this mddb(s), the name invalidation is
1700 	 * done by the commd handler routine.  Otherwise, if this
1701 	 * is the first attach of a MN diskset mddb, the invalidation
1702 	 * must be done here since the first attach cannot be sent
1703 	 * via the commd since there are no nodes joined to the set yet.
1704 	 */
1705 	if ((metaislocalset(sp)) || (!MD_MNSET_DESC(sd)) ||
1706 	    (MD_MNSET_DESC(sd) &&
1707 	    (!(sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)))) {
1708 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
1709 			meta_invalidate_name(nlp->namep);
1710 		}
1711 	}
1712 	return (rval);
1713 }
1714 
1715 /*
1716  * deletelist_length
1717  *
1718  *	return the number of slices that have been specified for deletion
1719  *	on the metadb command line.  This does not calculate the number
1720  *	of replicas because there may be multiple replicas per slice.
1721  */
1722 static int
deletelist_length(mdnamelist_t * db_nlp)1723 deletelist_length(mdnamelist_t *db_nlp)
1724 {
1725 
1726 	mdnamelist_t		*nlp;
1727 	int			list_length = 0;
1728 
1729 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1730 		list_length++;
1731 	}
1732 
1733 	return (list_length);
1734 }
1735 
1736 static int
in_deletelist(char * devname,mdnamelist_t * db_nlp)1737 in_deletelist(char *devname, mdnamelist_t *db_nlp)
1738 {
1739 
1740 	mdnamelist_t		*nlp;
1741 	mdname_t		*np;
1742 	int			index = 0;
1743 
1744 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1745 		np = nlp->namep;
1746 
1747 		if (strcmp(devname, np->bname) == 0)
1748 			return (index);
1749 		index++;
1750 	}
1751 
1752 	return (-1);
1753 }
1754 
1755 /*
1756  * Delete replicas from set.  This happens as a result of:
1757  *	- metadb [-s set_name] -d
1758  *	- metaset -s set_name -a disk	(causes a rebalance of mddbs)
1759  *	- metaset -s set_name -d disk
1760  *	- metaset -s set_name -b
1761  *
1762  * For a local set, this routine is run on the local set host.
1763  *
1764  * For a traditional diskset, this routine is run on the node that
1765  * is running the metaset command.
1766  *
1767  * For a multinode diskset, this routine is run by the node that is
1768  * running the metaset command.  This detach routine is sent to all
1769  * of the joined nodes in the diskset using commd.  This keeps
1770  * the nodes in-sync.
1771  */
1772 int
meta_db_detach(mdsetname_t * sp,mdnamelist_t * db_nlp,mdforceopts_t force_option,char * sysfilename,md_error_t * ep)1773 meta_db_detach(
1774 	mdsetname_t		*sp,
1775 	mdnamelist_t		*db_nlp,
1776 	mdforceopts_t		force_option,
1777 	char			*sysfilename,
1778 	md_error_t		*ep
1779 )
1780 {
1781 	struct mddb_config	c;
1782 	mdnamelist_t		*nlp;
1783 	mdname_t		*np;
1784 	md_drive_desc		*dd = NULL;
1785 	md_drive_desc		*p;
1786 	int			replicacount;
1787 	int			replica_delete_count;
1788 	int			nr_replica_slices;
1789 	int			i;
1790 	int			stop_svmdaemons = 0;
1791 	int			rval = 0;
1792 	int			index;
1793 	int			valid_replicas_nottodelete = 0;
1794 	int			invalid_replicas_nottodelete = 0;
1795 	int			invalid_replicas_todelete = 0;
1796 	int			errored = 0;
1797 	int			*tag_array;
1798 	int			fd = -1;
1799 	md_error_t		status = mdnullerror;
1800 	md_set_desc		*sd;
1801 	int			stale_bool = FALSE;
1802 	int			flags;
1803 
1804 	/*
1805 	 * Error if we don't get some work to do.
1806 	 */
1807 	if (db_nlp == NULL)
1808 		return (mdsyserror(ep, EINVAL, NULL));
1809 
1810 	if (mdnamesareunique(db_nlp, ep) != 0)
1811 		return (-1);
1812 
1813 	(void) memset(&c, 0, sizeof (c));
1814 	c.c_id = 0;
1815 	c.c_setno = sp->setno;
1816 
1817 	/* Don't need device id information from this ioctl */
1818 	c.c_locator.l_devid = (uint64_t)0;
1819 	c.c_locator.l_devid_flags = 0;
1820 
1821 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1822 		return (mdstealerror(ep, &c.c_mde));
1823 
1824 	/*
1825 	 * Is current set STALE?
1826 	 */
1827 	if (c.c_flags & MDDB_C_STALE) {
1828 		stale_bool = TRUE;
1829 	}
1830 
1831 	replicacount = c.c_dbcnt;
1832 
1833 	assert(db_nlp != NULL);
1834 
1835 	/*
1836 	 * go through and gather how many data bases are on each
1837 	 * device specified.
1838 	 */
1839 
1840 	nr_replica_slices = deletelist_length(db_nlp);
1841 	tag_array = (int *)calloc(nr_replica_slices, sizeof (int));
1842 
1843 	replica_delete_count = 0;
1844 	for (i = 0; i < replicacount; i++) {
1845 		char	*devname;
1846 		int	found = 0;
1847 
1848 		c.c_id = i;
1849 
1850 		/* Don't need device id information from this ioctl */
1851 		c.c_locator.l_devid = (uint64_t)0;
1852 		c.c_locator.l_devid_flags = 0;
1853 
1854 		if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1855 			return (mdstealerror(ep, &c.c_mde));
1856 
1857 		devname = splicename(&c.c_devname);
1858 
1859 		if (strstr(devname, META_LONGDISKNAME_STR) != NULL) {
1860 			Free(devname);
1861 			devname = getlongname(&c, ep);
1862 			if (devname == NULL) {
1863 				return (-1);
1864 			}
1865 		}
1866 
1867 		if ((index = in_deletelist(devname, db_nlp)) != -1) {
1868 			found = 1;
1869 			tag_array[index] = 1;
1870 			replica_delete_count++;
1871 		}
1872 
1873 		errored = c.c_locator.l_flags & (MDDB_F_EREAD |
1874 		    MDDB_F_EWRITE | MDDB_F_TOOSMALL | MDDB_F_EFMT |
1875 		    MDDB_F_EDATA | MDDB_F_EMASTER);
1876 
1877 		/*
1878 		 * There are four combinations of "errored" and "found"
1879 		 * and they are used to find the number of
1880 		 * (a) valid/invalid replicas that are not in the delete
1881 		 * list and are available in the system.
1882 		 * (b) valid/invalid replicas that are to be deleted.
1883 		 */
1884 
1885 		if (errored && !found)		/* errored and !found */
1886 			invalid_replicas_nottodelete++;
1887 		else if (!found)		/* !errored and !found */
1888 			valid_replicas_nottodelete++;
1889 		else if (errored)		/* errored and found */
1890 			invalid_replicas_todelete++;
1891 		/*
1892 		 * else it is !errored and found. This means
1893 		 * valid_replicas_todelete++; But this variable will not
1894 		 * be used anywhere
1895 		 */
1896 
1897 		Free(devname);
1898 	}
1899 
1900 	index = 0;
1901 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1902 		np = nlp->namep;
1903 		if (tag_array[index++] != 1) {
1904 			Free(tag_array);
1905 			return (mddeverror(ep, MDE_NO_DB, np->dev, np->cname));
1906 		}
1907 	}
1908 
1909 	Free(tag_array);
1910 
1911 
1912 	/* if all replicas are deleted stop mdmonitord */
1913 	if ((replicacount - replica_delete_count) == 0)
1914 		stop_svmdaemons = 1;
1915 
1916 	if (((replicacount - replica_delete_count) < MD_MINREPLICAS)) {
1917 		if (force_option & MDFORCE_NONE)
1918 			return (mderror(ep, MDE_NOTENOUGH_DB, sp->setname));
1919 		if (! metaislocalset(sp) && ! (force_option & MDFORCE_DS))
1920 			return (mderror(ep, MDE_DELDB_NOTALLOWED, sp->setname));
1921 	}
1922 
1923 	/*
1924 	 * The following algorithms are followed to check for deletion:
1925 	 * (a) If the delete list(db_nlp) has all invalid replicas and no valid
1926 	 * replicas, then deletion should be allowed.
1927 	 * (b) Deletion should be allowed only if valid replicas that are "not"
1928 	 * to be deleted is always greater than the invalid replicas that
1929 	 * are "not" to be deleted.
1930 	 * (c) If the user uses -f option, then deletion should be allowed.
1931 	 */
1932 
1933 	if ((invalid_replicas_todelete != replica_delete_count) &&
1934 	    (invalid_replicas_nottodelete > valid_replicas_nottodelete) &&
1935 	    (force_option != MDFORCE_LOCAL))
1936 		return (mderror(ep, MDE_DEL_VALIDDB_NOTALLOWED, sp->setname));
1937 
1938 	/*
1939 	 * go through and tell kernel to delete them
1940 	 */
1941 
1942 	/* Don't need device id information from this ioctl */
1943 	c.c_locator.l_devid = (uint64_t)0;
1944 	c.c_locator.l_devid_flags = 0;
1945 
1946 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0)
1947 		return (mdstealerror(ep, &c.c_mde));
1948 
1949 	if (! metaislocalset(sp)) {
1950 		dd = metaget_drivedesc_fromnamelist(sp, db_nlp, ep);
1951 		if (! mdisok(ep))
1952 			return (-1);
1953 		if ((sd = metaget_setdesc(sp, ep)) == NULL)
1954 			return (-1);
1955 	}
1956 
1957 	for (nlp = db_nlp; nlp != NULL; nlp = nlp->next) {
1958 		np = nlp->namep;
1959 
1960 		/*
1961 		 * If mddb is being deleted from MN diskset and node is
1962 		 * an owner of the diskset then use rpc.mdcommd
1963 		 * mechanism to add mddb(s) so that all nodes stay in sync.
1964 		 * If set is stale, don't log the message since rpc.mdcommd
1965 		 * can't write the message to the mddb.
1966 		 *
1967 		 * When mddbs are first being added to set, a detach can
1968 		 * be called before any node has joined the diskset, so
1969 		 * must check to see if node is an owner of the diskset.
1970 		 *
1971 		 * Otherwise, just delete mddb from this node.
1972 		 */
1973 
1974 		if ((! metaislocalset(sp)) && MD_MNSET_DESC(sd) &&
1975 		    (sd->sd_mn_mynode->nd_flags & MD_MN_NODE_OWN)) {
1976 			md_mn_result_t			*resultp;
1977 			md_mn_msg_meta_db_detach_t	detach;
1978 			int				send_rval;
1979 
1980 			/*
1981 			 * The following algorithm is used to detach replicas.
1982 			 * 	- META_DB_DETACH message generates submsgs
1983 			 * 		- BLOCK parse (master)
1984 			 * 		- MDDB_DETACH replicas
1985 			 * 		- UNBLOCK parse (master) causing parse
1986 			 *		information to be sent from master
1987 			 *		to slaves at a higher class than the
1988 			 *		unblock so the parse message will
1989 			 *		reach slaves before unblock message.
1990 			 */
1991 			(void) splitname(np->bname, &detach.msg_splitname);
1992 
1993 			/* Set devid to NULL until devids are supported */
1994 			detach.msg_devid[0] = NULL;
1995 
1996 			/*
1997 			 * If reconfig cycle has been started, this node is
1998 			 * stuck in in the return step until this command has
1999 			 * completed.  If mdcommd is suspended, ask
2000 			 * send_message to fail (instead of retrying)
2001 			 * so that metaset can finish allowing the reconfig
2002 			 * cycle to proceed.
2003 			 */
2004 			flags = MD_MSGF_FAIL_ON_SUSPEND;
2005 			if (stale_bool == TRUE)
2006 				flags |= MD_MSGF_NO_LOG;
2007 			send_rval = mdmn_send_message(sp->setno,
2008 			    MD_MN_MSG_META_DB_DETACH,
2009 			    flags, 0, (char *)&detach,
2010 			    sizeof (md_mn_msg_meta_db_detach_t),
2011 			    &resultp, ep);
2012 			if (send_rval != 0) {
2013 				rval = -1;
2014 				if (resultp == NULL)
2015 					(void) mddserror(ep,
2016 					    MDE_DS_COMMD_SEND_FAIL,
2017 					    sp->setno, NULL, NULL,
2018 					    sp->setname);
2019 				else {
2020 					(void) mdstealerror(ep,
2021 					    &(resultp->mmr_ep));
2022 					if (mdisok(ep)) {
2023 						(void) mddserror(ep,
2024 						    MDE_DS_COMMD_SEND_FAIL,
2025 						    sp->setno, NULL, NULL,
2026 						    sp->setname);
2027 					}
2028 					free_result(resultp);
2029 				}
2030 				goto out;
2031 			}
2032 			if (resultp)
2033 				free_result(resultp);
2034 		} else {
2035 			i = 0;
2036 			while (i < c.c_dbcnt) {
2037 				char	*devname;
2038 
2039 				c.c_id = i;
2040 
2041 				/* Don't need devid info from this ioctl */
2042 				c.c_locator.l_devid = (uint64_t)0;
2043 				c.c_locator.l_devid_flags = 0;
2044 
2045 				if (metaioctl(MD_DB_GETDEV, &c,
2046 				    &c.c_mde, NULL)) {
2047 					rval = mdstealerror(ep, &c.c_mde);
2048 					goto out;
2049 				}
2050 
2051 				devname = splicename(&c.c_devname);
2052 
2053 				if (strstr(devname, META_LONGDISKNAME_STR)
2054 				    != NULL) {
2055 					Free(devname);
2056 					devname = getlongname(&c, ep);
2057 					if (devname == NULL) {
2058 						return (-1);
2059 					}
2060 				}
2061 
2062 				if (strcmp(devname, np->bname) != 0) {
2063 					Free(devname);
2064 					i++;
2065 					continue;
2066 				}
2067 				Free(devname);
2068 
2069 				/* Don't need devid info from this ioctl */
2070 				c.c_locator.l_devid = (uint64_t)0;
2071 				c.c_locator.l_devid_flags = 0;
2072 
2073 				if (metaioctl(MD_DB_DELDEV, &c,
2074 				    &c.c_mde, NULL) != 0) {
2075 					rval = mdstealerror(ep, &c.c_mde);
2076 					goto out;
2077 				}
2078 
2079 				/* Not incrementing "i" intentionally */
2080 			}
2081 		}
2082 		if (! metaislocalset(sp)) {
2083 			/* update the dbcnt and size in dd */
2084 			for (p = dd; p != NULL; p = p->dd_next) {
2085 				if (p->dd_dnp == np->drivenamep) {
2086 					p->dd_dbcnt = 0;
2087 					p->dd_dbsize  = 0;
2088 					break;
2089 				}
2090 			}
2091 
2092 			/*
2093 			 * Slam a dummy master block and make it self
2094 			 * identifying
2095 			 */
2096 			if ((fd = open(np->rname, O_RDWR)) >= 0) {
2097 				meta_mkdummymaster(sp, fd, 16);
2098 				(void) close(fd);
2099 			}
2100 		}
2101 	}
2102 out:
2103 	if (metaislocalset(sp)) {
2104 		/*
2105 		 * Stop all the daemons if there are
2106 		 * no more replicas so that the module can be
2107 		 * unloaded.
2108 		 */
2109 		if (rval == 0 && stop_svmdaemons == 1) {
2110 			char buf[MAXPATHLEN];
2111 			int i;
2112 
2113 			for (i = 0; i < DAEMON_COUNT; i++) {
2114 				(void) snprintf(buf, MAXPATHLEN,
2115 				    "/usr/bin/pkill -%s -x %s",
2116 				    svmd_kill_list[i].svmd_kill_val,
2117 				    svmd_kill_list[i].svmd_name);
2118 				if (pclose(popen(buf, "w")) == -1)
2119 					md_perror(buf);
2120 			}
2121 
2122 			if (meta_smf_disable(META_SMF_ALL, &status) == -1) {
2123 				mde_perror(&status, "");
2124 				mdclrerror(&status);
2125 			}
2126 		}
2127 		if (buildconf(sp, &status)) {
2128 			/* Don't mask any previous errors */
2129 			if (rval == 0)
2130 				rval = mdstealerror(ep, &status);
2131 			else
2132 				mdclrerror(&status);
2133 			return (rval);
2134 		}
2135 
2136 		if (meta_db_patch(sysfilename, NULL, 0, &status)) {
2137 			/* Don't mask any previous errors */
2138 			if (rval == 0)
2139 				rval = mdstealerror(ep, &status);
2140 			else
2141 				mdclrerror(&status);
2142 		}
2143 	} else {
2144 		if (update_dbinfo_on_drives(sp, dd,
2145 		    (force_option & MDFORCE_SET_LOCKED),
2146 		    ((force_option & MDFORCE_LOCAL) |
2147 		    (force_option & MDFORCE_DS)), &status)) {
2148 			/* Don't mask any previous errors */
2149 			if (rval == 0)
2150 				rval = mdstealerror(ep, &status);
2151 			else
2152 				mdclrerror(&status);
2153 		}
2154 		metafreedrivedesc(&dd);
2155 	}
2156 	if ((metaislocalset(sp)) || (!(MD_MNSET_DESC(sd)))) {
2157 		for (nlp = db_nlp; (nlp != NULL); nlp = nlp->next) {
2158 			meta_invalidate_name(nlp->namep);
2159 		}
2160 	}
2161 	return (rval);
2162 }
2163 
2164 static md_replica_t *
metareplicaname(mdsetname_t * sp,int flags,struct mddb_config * c,md_error_t * ep)2165 metareplicaname(
2166 	mdsetname_t		*sp,
2167 	int			flags,
2168 	struct mddb_config	*c,
2169 	md_error_t		*ep
2170 )
2171 {
2172 	md_replica_t	*rp;
2173 	char		*devname;
2174 	size_t		sz;
2175 	devid_nmlist_t	*disklist = NULL;
2176 	char		*devid_str;
2177 
2178 	/* allocate replicaname */
2179 	rp = Zalloc(sizeof (*rp));
2180 
2181 	/* get device name */
2182 	devname = splicename(&c->c_devname);
2183 
2184 	/*
2185 	 * Check if the device has a long name (>40 characters) and
2186 	 * if so then we have to use devids to get the device name.
2187 	 * If this cannot be done then we have to fail the request.
2188 	 */
2189 	if (strstr(devname, META_LONGDISKNAME_STR) != NULL) {
2190 		if (c->c_locator.l_devid != NULL) {
2191 			if (meta_deviceid_to_nmlist("/dev/dsk",
2192 			    (ddi_devid_t)(uintptr_t)c->c_locator.l_devid,
2193 			    c->c_locator.l_minor_name, &disklist) != 0) {
2194 				devid_str = devid_str_encode(
2195 				    (ddi_devid_t)(uintptr_t)
2196 				    c->c_locator.l_devid, NULL);
2197 				(void) mderror(ep, MDE_MISSING_DEVID_DISK, "");
2198 				mderrorextra(ep, devid_str);
2199 				if (devid_str != NULL)
2200 					devid_str_free(devid_str);
2201 				Free(rp);
2202 				Free(devname);
2203 				return (NULL);
2204 			}
2205 		} else {
2206 			(void) mderror(ep, MDE_NODEVID, "");
2207 			Free(rp);
2208 			Free(devname);
2209 			return (NULL);
2210 		}
2211 		Free(devname);
2212 		devname = disklist[0].devname;
2213 	}
2214 
2215 	if (flags & PRINT_FAST) {
2216 		if ((rp->r_namep = metaname_fast(&sp, devname,
2217 		    LOGICAL_DEVICE, ep)) == NULL) {
2218 			Free(devname);
2219 			Free(rp);
2220 			return (NULL);
2221 		}
2222 	} else {
2223 		if ((rp->r_namep = metaname(&sp, devname,
2224 		    LOGICAL_DEVICE, ep)) == NULL) {
2225 			Free(devname);
2226 			Free(rp);
2227 			return (NULL);
2228 		}
2229 	}
2230 	Free(devname);
2231 
2232 	/* make sure it's OK */
2233 	if ((! (flags & MD_BASICNAME_OK)) &&
2234 	    (metachkcomp(rp->r_namep, ep) != 0)) {
2235 		Free(rp);
2236 		return (NULL);
2237 	}
2238 
2239 	rp->r_blkno = (daddr_t)MD_DISKADDR_ERROR;
2240 	rp->r_nblk = (daddr_t)MD_DISKADDR_ERROR;
2241 	rp->r_flags = c->c_locator.l_flags | MDDB_F_NODEVID;
2242 	if (c->c_locator.l_devid_flags & MDDB_DEVID_VALID) {
2243 		sz = devid_sizeof((ddi_devid_t)(uintptr_t)
2244 		    (c->c_locator.l_devid));
2245 		if ((rp->r_devid = (ddi_devid_t)malloc(sz)) ==
2246 		    (ddi_devid_t)NULL) {
2247 			Free(rp);
2248 			return (NULL);
2249 		}
2250 		(void) memcpy((void *)rp->r_devid,
2251 		    (void *)(uintptr_t)c->c_locator.l_devid, sz);
2252 		(void) strcpy(rp->r_minor_name, c->c_locator.l_minor_name);
2253 		rp->r_flags &= ~MDDB_F_NODEVID;
2254 		/* Overwrite dev derived from name with dev from devid */
2255 		rp->r_namep->dev = meta_expldev(c->c_locator.l_dev);
2256 	}
2257 	(void) strcpy(rp->r_driver_name, c->c_locator.l_driver);
2258 
2259 	rp->r_blkno = c->c_locator.l_blkno;
2260 	if (c->c_dbend != 0)
2261 		rp->r_nblk = c->c_dbend - c->c_locator.l_blkno + 1;
2262 
2263 	/* return replica */
2264 	return (rp);
2265 }
2266 
2267 /*
2268  * free replica list
2269  */
2270 void
metafreereplicalist(md_replicalist_t * rlp)2271 metafreereplicalist(
2272 	md_replicalist_t	*rlp
2273 )
2274 {
2275 	md_replicalist_t	*rl = NULL;
2276 
2277 	for (/* void */; (rlp != NULL); rlp = rl) {
2278 		rl = rlp->rl_next;
2279 		if (rlp->rl_repp->r_devid != (ddi_devid_t)0) {
2280 			free(rlp->rl_repp->r_devid);
2281 		}
2282 		Free(rlp->rl_repp);
2283 		Free(rlp);
2284 	}
2285 }
2286 
2287 /*
2288  * return list of all replicas in set
2289  */
2290 int
metareplicalist(mdsetname_t * sp,int flags,md_replicalist_t ** rlpp,md_error_t * ep)2291 metareplicalist(
2292 	mdsetname_t		*sp,
2293 	int			flags,
2294 	md_replicalist_t	**rlpp,
2295 	md_error_t		*ep
2296 )
2297 {
2298 	md_replicalist_t	**tail = rlpp;
2299 	int			count = 0;
2300 	struct mddb_config	c;
2301 	int			i;
2302 	char			*devid;
2303 
2304 	/* for each replica */
2305 	i = 0;
2306 	do {
2307 		md_replica_t	*rp;
2308 
2309 		/* get next replica */
2310 		(void) memset(&c, 0, sizeof (c));
2311 		c.c_id = i;
2312 		c.c_setno = sp->setno;
2313 
2314 		c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2315 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2316 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2317 				mdclrerror(&c.c_mde);
2318 				break;	/* handle none at all */
2319 			}
2320 			(void) mdstealerror(ep, &c.c_mde);
2321 			goto out;
2322 		}
2323 
2324 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SZ) {
2325 			if ((devid = malloc(c.c_locator.l_devid_sz)) == NULL) {
2326 				(void) mdsyserror(ep, ENOMEM, META_DBCONF);
2327 				goto out;
2328 			}
2329 			c.c_locator.l_devid = (uintptr_t)devid;
2330 			/*
2331 			 * Turn on space and sz flags since 'sz' amount of
2332 			 * space has been alloc'd.
2333 			 */
2334 			c.c_locator.l_devid_flags =
2335 			    MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2336 		}
2337 
2338 		if (metaioctl(MD_DB_ENDDEV, &c, &c.c_mde, NULL) != 0) {
2339 			if (mdismddberror(&c.c_mde, MDE_DB_INVALID)) {
2340 				mdclrerror(&c.c_mde);
2341 				break;	/* handle none at all */
2342 			}
2343 			(void) mdstealerror(ep, &c.c_mde);
2344 			goto out;
2345 		}
2346 
2347 		/*
2348 		 * Paranoid check - shouldn't happen, but is left as
2349 		 * a place holder for changes that will be needed after
2350 		 * dynamic reconfiguration changes are added to SVM (to
2351 		 * support movement of disks at any point in time).
2352 		 */
2353 		if (c.c_locator.l_devid_flags & MDDB_DEVID_NOSPACE) {
2354 			(void) fprintf(stderr,
2355 			    dgettext(TEXT_DOMAIN,
2356 			    "Error: Relocation Information "
2357 			    "(drvnm=%s, mnum=0x%lx) \n"
2358 			    "relocation information size changed - \n"
2359 			    "rerun command\n"),
2360 			    c.c_locator.l_driver, c.c_locator.l_mnum);
2361 			(void) mderror(ep, MDE_DEVID_TOOBIG, NULL);
2362 			goto out;
2363 		}
2364 
2365 		if (c.c_dbcnt == 0)
2366 			break;		/* handle none at all */
2367 
2368 		/* get info */
2369 		if ((rp = metareplicaname(sp, flags, &c, ep)) == NULL)
2370 			goto out;
2371 
2372 		/* append to list */
2373 		*tail = Zalloc(sizeof (**tail));
2374 		(*tail)->rl_repp = rp;
2375 		tail = &(*tail)->rl_next;
2376 		++count;
2377 
2378 		if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2379 			free(devid);
2380 			c.c_locator.l_devid_flags = 0;
2381 		}
2382 
2383 	} while (++i < c.c_dbcnt);
2384 
2385 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2386 		free(devid);
2387 	}
2388 
2389 	/* return count */
2390 	return (count);
2391 
2392 	/* cleanup, return error */
2393 out:
2394 	if (c.c_locator.l_devid_flags & MDDB_DEVID_SPACE) {
2395 		free(devid);
2396 	}
2397 	metafreereplicalist(*rlpp);
2398 	*rlpp = NULL;
2399 	return (-1);
2400 }
2401 
2402 /*
2403  * meta_sync_db_locations - get list of replicas from kernel and write
2404  * 	out to mddb.cf and md.conf.  'Syncs up' the replica list in
2405  * 	the kernel with the replica list in the conf files.
2406  *
2407  */
2408 void
meta_sync_db_locations(mdsetname_t * sp,md_error_t * ep)2409 meta_sync_db_locations(
2410 	mdsetname_t	*sp,
2411 	md_error_t	*ep
2412 )
2413 {
2414 	char		*sname = 0;		/* system file name */
2415 	char 		*cname = 0;		/* config file name */
2416 
2417 	if (!metaislocalset(sp))
2418 		return;
2419 
2420 	/* Updates backup of configuration file (aka mddb.cf) */
2421 	if (buildconf(sp, ep) != 0)
2422 		return;
2423 
2424 	/* Updates system configuration file (aka md.conf) */
2425 	(void) meta_db_patch(sname, cname, 0, ep);
2426 }
2427 
2428 /*
2429  * setup_db_locations - parse the mddb.cf file and
2430  *			tells the driver which db locations to use.
2431  */
2432 int
meta_setup_db_locations(md_error_t * ep)2433 meta_setup_db_locations(
2434 	md_error_t	*ep
2435 )
2436 {
2437 	mddb_config_t	c;
2438 	FILE		*fp;
2439 	char		inbuff[1024];
2440 	char		*buff;
2441 	uint_t		i;
2442 	size_t		sz;
2443 	int		rval = 0;
2444 	char		*devidp;
2445 	uint_t		devid_size;
2446 	char		*minor_name = NULL;
2447 	ddi_devid_t	devid_decode;
2448 	int		checksum;
2449 
2450 	/* do mddb.cf file */
2451 	(void) memset(&c, '\0', sizeof (c));
2452 	if ((fp = fopen(META_DBCONF, "r")) == NULL) {
2453 		if (errno != ENOENT)
2454 			return (mdsyserror(ep, errno, META_DBCONF));
2455 	}
2456 	while ((fp != NULL) && ((buff = fgets(inbuff, (sizeof (inbuff) - 1),
2457 	    fp)) != NULL)) {
2458 
2459 		/* ignore comments */
2460 		if (*buff == '#')
2461 			continue;
2462 
2463 		/* parse locator */
2464 		(void) memset(&c, 0, sizeof (c));
2465 		c.c_setno = MD_LOCAL_SET;
2466 		i = strcspn(buff, " \t");
2467 		if (i > sizeof (c.c_locator.l_driver))
2468 			i = sizeof (c.c_locator.l_driver);
2469 		(void) strncpy(c.c_locator.l_driver, buff, i);
2470 		buff += i;
2471 		c.c_locator.l_dev =
2472 		    makedev((major_t)0, (minor_t)strtol(buff, &buff, 10));
2473 		c.c_locator.l_blkno = (daddr_t)strtol(buff, &buff, 10);
2474 		c.c_locator.l_mnum = minor(c.c_locator.l_dev);
2475 
2476 		/* parse out devid */
2477 		while (isspace((int)(*buff)))
2478 			buff += 1;
2479 		i = strcspn(buff, " \t");
2480 		if ((devidp = (char *)malloc(i+1)) == NULL)
2481 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2482 
2483 		(void) strncpy(devidp, buff, i);
2484 		devidp[i] = '\0';
2485 		if (devid_str_decode(devidp, &devid_decode,
2486 		    &minor_name) == -1) {
2487 			free(devidp);
2488 			continue;
2489 		}
2490 
2491 		/* Conf file must have minor name associated with devid */
2492 		if (minor_name == NULL) {
2493 			free(devidp);
2494 			devid_free(devid_decode);
2495 			continue;
2496 		}
2497 
2498 		sz = devid_sizeof(devid_decode);
2499 		/* Copy to devid size buffer that ioctl expects */
2500 		if ((c.c_locator.l_devid = (uintptr_t)malloc(sz)) == NULL) {
2501 			devid_free(devid_decode);
2502 			free(minor_name);
2503 			free(devidp);
2504 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2505 		}
2506 
2507 		(void) memcpy((void *)(uintptr_t)c.c_locator.l_devid,
2508 		    (void *)devid_decode, sz);
2509 
2510 		devid_free(devid_decode);
2511 
2512 		if (strlen(minor_name) > MDDB_MINOR_NAME_MAX) {
2513 			free(minor_name);
2514 			free(devidp);
2515 			free((void *)(uintptr_t)c.c_locator.l_devid);
2516 			return (mdsyserror(ep, ENOMEM, META_DBCONF));
2517 		}
2518 		(void) strcpy(c.c_locator.l_minor_name, minor_name);
2519 		free(minor_name);
2520 		c.c_locator.l_devid_flags = MDDB_DEVID_VALID |
2521 		    MDDB_DEVID_SPACE | MDDB_DEVID_SZ;
2522 		c.c_locator.l_devid_sz = sz;
2523 
2524 		devid_size = strlen(devidp);
2525 		buff += devid_size;
2526 
2527 		checksum = strtol(buff, &buff, 10);
2528 		for (i = 0; c.c_locator.l_driver[i] != 0; i++)
2529 			checksum += c.c_locator.l_driver[i];
2530 		for (i = 0; i < devid_size; i++) {
2531 			checksum += devidp[i];
2532 		}
2533 		free(devidp);
2534 
2535 		checksum += minor(c.c_locator.l_dev);
2536 		checksum += c.c_locator.l_blkno;
2537 		if (checksum != 42) {
2538 			/* overwritten later for more serious problems */
2539 			rval = mderror(ep, MDE_MDDB_CKSUM, META_DBCONF);
2540 			free((void *)(uintptr_t)c.c_locator.l_devid);
2541 			continue;
2542 		}
2543 		c.c_locator.l_flags = 0;
2544 
2545 		/* use db location */
2546 		if (metaioctl(MD_DB_USEDEV, &c, &c.c_mde, NULL) != 0) {
2547 			free((void *)(uintptr_t)c.c_locator.l_devid);
2548 			return (mdstealerror(ep, &c.c_mde));
2549 		}
2550 
2551 		/* free up devid if in use */
2552 		free((void *)(uintptr_t)c.c_locator.l_devid);
2553 		c.c_locator.l_devid = (uint64_t)0;
2554 		c.c_locator.l_devid_flags = 0;
2555 	}
2556 	if ((fp) && (fclose(fp) != 0))
2557 		return (mdsyserror(ep, errno, META_DBCONF));
2558 
2559 	/* check for stale database */
2560 	(void) memset((char *)&c, 0, sizeof (struct mddb_config));
2561 	c.c_id = 0;
2562 	c.c_setno = MD_LOCAL_SET;
2563 
2564 	/*
2565 	 * While we do not need the devid here we may need to
2566 	 * know if devid's are being used by the kernel for
2567 	 * the replicas. This is because under some circumstances
2568 	 * we can only manipulate the SVM configuration if the
2569 	 * kernel is using devid's.
2570 	 */
2571 	c.c_locator.l_devid = (uint64_t)0;
2572 	c.c_locator.l_devid_flags = MDDB_DEVID_GETSZ;
2573 	c.c_locator.l_devid_sz = 0;
2574 
2575 	if (metaioctl(MD_DB_GETDEV, &c, &c.c_mde, NULL) != 0) {
2576 		if (! mdismddberror(&c.c_mde, MDE_DB_INVALID))
2577 			return (mdstealerror(ep, &c.c_mde));
2578 		mdclrerror(&c.c_mde);
2579 	}
2580 
2581 	if (c.c_flags & MDDB_C_STALE)
2582 		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, MD_LOCAL_SET,
2583 		    0, NULL));
2584 
2585 	if (c.c_locator.l_devid_sz != 0) {
2586 		/*
2587 		 * Devid's are being used to track the replicas because
2588 		 * there is space for a devid.
2589 		 */
2590 		devid_in_use = TRUE;
2591 	}
2592 
2593 	/* success */
2594 	return (rval);
2595 }
2596 
2597 /*
2598  * meta_db_minreplica - returns the minimum size replica currently in use.
2599  */
2600 daddr_t
meta_db_minreplica(mdsetname_t * sp,md_error_t * ep)2601 meta_db_minreplica(
2602 	mdsetname_t	*sp,
2603 	md_error_t	*ep
2604 )
2605 {
2606 	md_replica_t		*r;
2607 	md_replicalist_t	*rl, *rlp = NULL;
2608 	daddr_t			nblks = 0;
2609 
2610 	if (metareplicalist(sp, (MD_BASICNAME_OK | PRINT_FAST), &rlp, ep) < 0)
2611 		return (-1);
2612 
2613 	if (rlp == NULL)
2614 		return (-1);
2615 
2616 	/* find the smallest existing replica */
2617 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
2618 		r = rl->rl_repp;
2619 		nblks = ((nblks == 0) ? r->r_nblk : min(r->r_nblk, nblks));
2620 	}
2621 
2622 	metafreereplicalist(rlp);
2623 	return (nblks);
2624 }
2625 
2626 /*
2627  * meta_get_replica_names
2628  *  returns an mdnamelist_t of replica slices
2629  */
2630 /*ARGSUSED*/
2631 int
meta_get_replica_names(mdsetname_t * sp,mdnamelist_t ** nlpp,int options,md_error_t * ep)2632 meta_get_replica_names(
2633 	mdsetname_t	*sp,
2634 	mdnamelist_t	**nlpp,
2635 	int		options,
2636 	md_error_t	*ep
2637 )
2638 {
2639 	md_replicalist_t	*rlp = NULL;
2640 	md_replicalist_t	*rl;
2641 	mdnamelist_t		**tailpp = nlpp;
2642 	int			cnt = 0;
2643 
2644 	assert(nlpp != NULL);
2645 
2646 	if (!metaislocalset(sp))
2647 		goto out;
2648 
2649 	/* get replicas */
2650 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
2651 		cnt = -1;
2652 		goto out;
2653 	}
2654 
2655 	/* build name list */
2656 	for (rl = rlp; (rl != NULL); rl = rl->rl_next) {
2657 		/*
2658 		 * Add the name struct to the end of the
2659 		 * namelist but keep a pointer to the last
2660 		 * element so that we don't incur the overhead
2661 		 * of traversing the list each time
2662 		 */
2663 		tailpp = meta_namelist_append_wrapper(
2664 		    tailpp, rl->rl_repp->r_namep);
2665 		++cnt;
2666 	}
2667 
2668 	/* cleanup, return count or error */
2669 out:
2670 	metafreereplicalist(rlp);
2671 	return (cnt);
2672 }
2673