xref: /titanic_41/usr/src/uts/common/io/lvm/md/md_mddb.c (revision 32c22d57860198538fb6b8f261cb76ab26318d34)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/conf.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/systeminfo.h>
33 #include <sys/sysmacros.h>
34 #include <sys/buf.h>
35 #include <sys/kmem.h>
36 #include <sys/file.h>
37 #include <sys/open.h>
38 #include <sys/debug.h>
39 #include <sys/stat.h>
40 #include <sys/lvm/mdvar.h>
41 #include <sys/lvm/md_crc.h>
42 #include <sys/lvm/md_convert.h>
43 #include <sys/types.h>
44 #include <sys/kmem.h>
45 #include <sys/lvm/mdmn_commd.h>
46 #include <sys/cladm.h>
47 
48 mhd_mhiargs_t	defmhiargs = {
49 	1000,
50 	{ 6000, 6000, 30000 }
51 };
52 
53 #define	MDDB
54 
55 #include <sys/lvm/mdvar.h>
56 #include <sys/lvm/mdmed.h>
57 #include <sys/lvm/md_names.h>
58 #include <sys/cred.h>
59 #include <sys/ddi.h>
60 #include <sys/sunddi.h>
61 #include <sys/esunddi.h>
62 
63 #include <sys/sysevent/eventdefs.h>
64 #include <sys/sysevent/svm.h>
65 
66 extern char svm_bootpath[];
67 
68 int			md_maxbootlist = MAXBOOTLIST;
69 static ulong_t		mddb_maxblocks = 0;	/* tune for small records */
70 static int		mddb_maxbufheaders = 50;
71 static uint_t		mddb_maxcopies = MDDB_NLB;
72 
73 /*
74  * If this is set, more detailed messages about DB init will be given, instead
75  * of just the MDE_DB_NODB.
76  */
77 static int		mddb_db_err_detail = 0;
78 
79 /*
80  * This lock is used to single-thread load/unload of all sets
81  */
82 static kmutex_t		mddb_lock;
83 
84 /*
85  * You really do NOT want to change this boolean.
86  * It can be VERY dangerous to do so.  Loss of
87  * data may occur. USE AT YOUR OWN RISK!!!!
88  */
89 static int		mddb_allow_half = 0;
90 /*
91  * For mirrored root allow reboot with only half the replicas available
92  * Flag inserted for Santa Fe project.
93  */
94 int mirrored_root_flag;
95 
96 #define	ISWHITE(c)	(((c) == ' ') || ((c) == '\t') || \
97 			    ((c) == '\r') || ((c) == '\n'))
98 #define	ISNUM(c)	(((c) >= '0') && ((c) <= '9'))
99 
100 #define	SETMUTEX(setno)	(&md_set[setno].s_dbmx)
101 
102 extern md_krwlock_t	md_unit_array_rw;	/* md.c */
103 extern set_t		md_nsets;		/* md.c */
104 extern int		md_nmedh;		/* md.c */
105 extern md_set_t		md_set[];		/* md.c */
106 extern int		(*mdv_strategy_tstpnt)(buf_t *, int, void*);
107 extern dev_info_t	*md_devinfo;
108 extern int		md_init_debug;
109 extern int		md_status;
110 extern md_ops_t		*md_opslist;
111 extern md_krwlock_t	nm_lock;
112 
113 static int 		update_locatorblock(mddb_set_t *s, md_dev64_t dev,
114 				ddi_devid_t didptr, ddi_devid_t old_didptr);
115 
116 /*
117  * Defines for crc calculation for records
118  * rec_crcgen generates a crc checksum for a record block
119  * rec_crcchk checks the crc checksum for a record block
120  */
121 #define	REC_CRCGEN	0
122 #define	REC_CRCCHK	1
123 #define	rec_crcgen(s, dep, rbp) \
124 	(void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
125 #define	rec_crcchk(s, dep, rbp) \
126 	rec_crcfunc(s, dep, rbp, REC_CRCCHK)
127 
128 /*
129  * During upgrade, SVM basically runs with the devt from the target
130  * being upgraded.  Translations are made from the target devt to the
131  * miniroot devt when writing data out to the disk.  This is done by
132  * the following routines:
133  *	wrtblklst
134  *	writeblks
135  *	readblklst
136  *	readblks
137  *	dt_read
138  *
139  * The following routines are used by the routines listed above and
140  * expect a translated (aka miniroot) devt:
141  *	getblks
142  * 	getmasters
143  *
144  * Also, when calling any system routines, such as ddi_lyr_get_devid,
145  * the translated (aka miniroot) devt must be used.
146  *
147  * By the same token, the major number and major name conversion operations
148  * need to use the name_to_major file from the target system instead
149  * of the name_to_major file on the miniroot.  So, calls to
150  * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
151  * when running on an upgrade.  Same is true with calls to
152  * ddi_major_to_name.
153  */
154 
155 
156 #ifndef MDDB_FAKE
157 
158 static int
mddb_rwdata(mddb_set_t * s,int flag,buf_t * bp)159 mddb_rwdata(
160 	mddb_set_t	*s,	/* incore db set structure */
161 	int		flag,	/* B_ASYNC, B_FAILFAST or 0 passed in here */
162 	buf_t		*bp
163 )
164 {
165 	int		err = 0;
166 
167 	bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);
168 
169 	mutex_exit(SETMUTEX(s->s_setno));
170 	if (mdv_strategy_tstpnt == NULL ||
171 	    (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
172 		(void) bdev_strategy(bp);
173 
174 	if (flag & B_ASYNC) {
175 		mutex_enter(SETMUTEX(s->s_setno));
176 		return (0);
177 	}
178 
179 	err = biowait(bp);
180 	mutex_enter(SETMUTEX(s->s_setno));
181 	return (err);
182 }
183 
184 static void
setidentifier(mddb_set_t * s,identifier_t * ident)185 setidentifier(
186 	mddb_set_t	*s,
187 	identifier_t	*ident
188 )
189 {
190 	if (s->s_setno == MD_LOCAL_SET)
191 		(void) strcpy(&ident->serial[0], s->s_ident.serial);
192 	else
193 		ident->createtime = s->s_ident.createtime;
194 }
195 
196 static int
cmpidentifier(mddb_set_t * s,identifier_t * ident)197 cmpidentifier(
198 	mddb_set_t	*s,
199 	identifier_t	*ident
200 )
201 {
202 	if (s->s_setno == MD_LOCAL_SET)
203 		return (strcmp(ident->serial, s->s_ident.serial));
204 	else
205 		return (timercmp(&ident->createtime,
206 		    /*CSTYLED*/
207 		    &s->s_ident.createtime, !=));
208 }
209 
210 static int
mddb_devopen(md_dev64_t dev)211 mddb_devopen(
212 	md_dev64_t	dev
213 )
214 {
215 	dev_t		ddi_dev = md_dev64_to_dev(dev);
216 
217 	if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0)
218 		return (0);
219 	return (1);
220 }
221 
222 static void
mddb_devclose(md_dev64_t dev)223 mddb_devclose(
224 	md_dev64_t	dev
225 )
226 {
227 	(void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred);
228 }
229 
230 /*
231  * stripe_skip_ts
232  *
233  * Returns a list of fields to be skipped in the stripe record structure.
234  * These fields are ms_timestamp in the component structure.
235  * Used to skip these fields when calculating the checksum.
236  */
237 static crc_skip_t *
stripe_skip_ts(void * un,uint_t revision)238 stripe_skip_ts(void *un, uint_t revision)
239 {
240 	struct ms_row32_od	*small_mdr;
241 	struct ms_row		*big_mdr;
242 	uint_t			row, comp, ncomps, compoff;
243 	crc_skip_t		*skip;
244 	crc_skip_t		*skip_prev;
245 	crc_skip_t		skip_start = {0, 0, 0};
246 	ms_unit_t		*big_un;
247 	ms_unit32_od_t		*small_un;
248 	uint_t			rb_off = offsetof(mddb_rb32_t, rb_data[0]);
249 
250 	switch (revision) {
251 	case MDDB_REV_RB:
252 	case MDDB_REV_RBFN:
253 		small_un = (ms_unit32_od_t *)un;
254 		skip_prev = &skip_start;
255 
256 		if (small_un->un_nrows == 0)
257 			return (NULL);
258 		/*
259 		 * walk through all rows to find the total number
260 		 * of components
261 		 */
262 		small_mdr   = &small_un->un_row[0];
263 		ncomps = 0;
264 		for (row = 0; (row < small_un->un_nrows); row++) {
265 			ncomps += small_mdr[row].un_ncomp;
266 		}
267 
268 		/* Now walk through the components */
269 		compoff = small_un->un_ocomp + rb_off;
270 		for (comp = 0; (comp < ncomps); ++comp) {
271 			uint_t	mdcp = compoff +
272 			    (comp * sizeof (ms_comp32_od_t));
273 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
274 			    KM_SLEEP);
275 			skip->skip_offset = mdcp +
276 			    offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
277 			skip->skip_size = sizeof (md_timeval32_t);
278 			skip_prev->skip_next = skip;
279 			skip_prev = skip;
280 		}
281 		break;
282 	case MDDB_REV_RB64:
283 	case MDDB_REV_RB64FN:
284 		big_un = (ms_unit_t *)un;
285 		skip_prev = &skip_start;
286 
287 		if (big_un->un_nrows == 0)
288 			return (NULL);
289 		/*
290 		 * walk through all rows to find the total number
291 		 * of components
292 		 */
293 		big_mdr   = &big_un->un_row[0];
294 		ncomps = 0;
295 		for (row = 0; (row < big_un->un_nrows); row++) {
296 			ncomps += big_mdr[row].un_ncomp;
297 		}
298 
299 		/* Now walk through the components */
300 		compoff = big_un->un_ocomp + rb_off;
301 		for (comp = 0; (comp < ncomps); ++comp) {
302 			uint_t	mdcp = compoff +
303 			    (comp * sizeof (ms_comp_t));
304 			skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
305 			    KM_SLEEP);
306 			skip->skip_offset = mdcp +
307 			    offsetof(ms_comp_t, un_mirror.ms_timestamp);
308 			skip->skip_size = sizeof (md_timeval32_t);
309 			skip_prev->skip_next = skip;
310 			skip_prev = skip;
311 		}
312 		break;
313 	}
314 	/* Return the start of the list of fields to skip */
315 	return (skip_start.skip_next);
316 }
317 
318 /*
319  * mirror_skip_ts
320  *
321  * Returns a list of fields to be skipped in the mirror record structure.
322  * This includes un_last_read and sm_timestamp for each submirror
323  * Used to skip these fields when calculating the checksum.
324  */
325 static crc_skip_t *
mirror_skip_ts(uint_t revision)326 mirror_skip_ts(uint_t revision)
327 {
328 	int		i;
329 	crc_skip_t	*skip;
330 	crc_skip_t	*skip_prev;
331 	crc_skip_t	skip_start = {0, 0, 0};
332 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
333 
334 	skip_prev = &skip_start;
335 
336 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
337 	switch (revision) {
338 	case MDDB_REV_RB:
339 	case MDDB_REV_RBFN:
340 		skip->skip_offset = offsetof(mm_unit32_od_t,
341 		    un_last_read) + rb_off;
342 		break;
343 	case MDDB_REV_RB64:
344 	case MDDB_REV_RB64FN:
345 		skip->skip_offset = offsetof(mm_unit_t,
346 		    un_last_read) + rb_off;
347 		break;
348 	}
349 	skip->skip_size = sizeof (int);
350 	skip_prev->skip_next = skip;
351 	skip_prev = skip;
352 
353 	for (i = 0; i < NMIRROR; i++) {
354 		skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
355 		switch (revision) {
356 		case MDDB_REV_RB:
357 		case MDDB_REV_RBFN:
358 			skip->skip_offset = offsetof(mm_unit32_od_t,
359 			    un_sm[i].sm_timestamp) + rb_off;
360 			break;
361 		case MDDB_REV_RB64:
362 		case MDDB_REV_RB64FN:
363 			skip->skip_offset = offsetof(mm_unit_t,
364 			    un_sm[i].sm_timestamp) + rb_off;
365 			break;
366 		}
367 		skip->skip_size = sizeof (md_timeval32_t);
368 		skip_prev->skip_next = skip;
369 		skip_prev = skip;
370 	}
371 	/* Return the start of the list of fields to skip */
372 	return (skip_start.skip_next);
373 }
374 
375 /*
376  * hotspare_skip_ts
377  *
378  * Returns a list of the timestamp fields in the hotspare record structure.
379  * Used to skip these fields when calculating the checksum.
380  */
381 static crc_skip_t *
hotspare_skip_ts(uint_t revision)382 hotspare_skip_ts(uint_t revision)
383 {
384 	crc_skip_t	*skip;
385 	uint_t		rb_off = offsetof(mddb_rb32_t, rb_data[0]);
386 
387 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
388 	switch (revision) {
389 	case MDDB_REV_RB:
390 	case MDDB_REV_RBFN:
391 		skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
392 		    rb_off;
393 		break;
394 	case MDDB_REV_RB64:
395 	case MDDB_REV_RB64FN:
396 		skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
397 		    rb_off;
398 		break;
399 	}
400 	skip->skip_size = sizeof (md_timeval32_t);
401 	return (skip);
402 }
403 
404 /*
405  * rec_crcfunc
406  *
407  * Calculate or check the checksum for a record
408  * Calculate the crc if check == 0, Check the crc if check == 1
409  *
410  * Record block may be written by different nodes in a multi-owner diskset
411  * (in case of master change), the function rec_crcchk excludes timestamp
412  * fields in crc computation of record data.
413  * Otherwise, timestamp fields will cause each node to have a different
414  * checksum for same record block causing the exclusive-or of all record block
415  * checksums and data block record sums to be non-zero after new master writes
416  * at least one record block.
417  */
418 static uint_t
rec_crcfunc(mddb_set_t * s,mddb_de_ic_t * dep,mddb_rb32_t * rbp,int check)419 rec_crcfunc(
420 	mddb_set_t	*s,
421 	mddb_de_ic_t	*dep,
422 	mddb_rb32_t	*rbp,
423 	int		check
424 )
425 {
426 	crc_skip_t	*skip;
427 	crc_skip_t	*skip_tail;
428 	mddb_type_t	type = dep->de_type1;
429 	uint_t		ret;
430 
431 	/*
432 	 * Generate a list of the areas to be skipped when calculating
433 	 * the checksum.
434 	 * First skip rb_checksum, rb_private and rb_userdata.
435 	 */
436 	skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
437 	skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
438 	skip->skip_size = 3 * sizeof (uint_t);
439 	skip_tail = skip;
440 	if (MD_MNSET_SETNO(s->s_setno)) {
441 		/* For a MN set, skip rb_timestamp */
442 		skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
443 		    KM_SLEEP);
444 		skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
445 		skip_tail->skip_size = sizeof (md_timeval32_t);
446 		skip->skip_next = skip_tail;
447 
448 		/* Now add a list of timestamps to be skipped */
449 		if (type >= MDDB_FIRST_MODID) {
450 			switch (dep->de_flags) {
451 				case MDDB_F_STRIPE:
452 					skip_tail->skip_next =
453 					    stripe_skip_ts((void *)rbp->rb_data,
454 					    rbp->rb_revision);
455 					break;
456 				case MDDB_F_MIRROR:
457 					skip_tail->skip_next =
458 					    mirror_skip_ts(rbp->rb_revision);
459 					break;
460 				case MDDB_F_HOTSPARE:
461 					skip_tail->skip_next =
462 					    hotspare_skip_ts(rbp->rb_revision);
463 					break;
464 				default:
465 					break;
466 			}
467 		}
468 	}
469 
470 	if (check) {
471 		ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
472 	} else {
473 		crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
474 		ret = rbp->rb_checksum;
475 	}
476 	while (skip) {
477 		crc_skip_t	*skip_save = skip;
478 
479 		skip = skip->skip_next;
480 		kmem_free(skip_save, sizeof (crc_skip_t));
481 	}
482 	return (ret);
483 }
484 
485 static mddb_bf_t *
allocbuffer(mddb_set_t * s,int sleepflag)486 allocbuffer(
487 	mddb_set_t	*s,
488 	int		sleepflag
489 )
490 {
491 	mddb_bf_t	*bfp;
492 
493 	while ((bfp = s->s_freebufhead) == NULL) {
494 		if (sleepflag == MDDB_NOSLEEP)
495 			return ((mddb_bf_t *)NULL);
496 		++s->s_bufmisses;
497 #ifdef	DEBUG
498 		if (s->s_bufmisses == 1)
499 			cmn_err(CE_NOTE,
500 			    "md: mddb: set %u sleeping for buffer", s->s_setno);
501 #endif
502 		s->s_bufwakeup = 1;
503 		cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
504 	}
505 	s->s_freebufhead = bfp->bf_next;
506 	bzero((caddr_t)bfp, sizeof (*bfp));
507 	bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
508 	bfp->bf_buf.b_flags = B_BUSY;	/* initialize flags */
509 	return (bfp);
510 }
511 
512 static void
freebuffer(mddb_set_t * s,mddb_bf_t * bfp)513 freebuffer(
514 	mddb_set_t		*s,
515 	mddb_bf_t	*bfp
516 )
517 {
518 	bfp->bf_next = s->s_freebufhead;
519 	s->s_freebufhead = bfp;
520 	if (s->s_bufwakeup) {
521 		cv_broadcast(&s->s_buf_cv);
522 		s->s_bufwakeup = 0;
523 	}
524 }
525 
526 
527 static void
blkbusy(mddb_set_t * s,mddb_block_t blk)528 blkbusy(
529 	mddb_set_t	*s,
530 	mddb_block_t	blk
531 )
532 {
533 	int		bit, byte;
534 
535 	s->s_freeblkcnt--;
536 	byte = blk / 8;
537 	bit = 1 << (blk & 7);
538 	ASSERT(! (s->s_freebitmap[byte] & bit));
539 	s->s_freebitmap[byte] |= bit;
540 }
541 
542 static void
blkfree(mddb_set_t * s,mddb_block_t blk)543 blkfree(
544 	mddb_set_t	*s,
545 	mddb_block_t	blk
546 )
547 {
548 	int		bit, byte;
549 
550 	s->s_freeblkcnt++;
551 	byte = blk / 8;
552 	bit = 1 << (blk & 7);
553 	ASSERT(s->s_freebitmap[byte] & bit);
554 	s->s_freebitmap[byte] &= ~bit;
555 }
556 
557 static int
blkcheck(mddb_set_t * s,mddb_block_t blk)558 blkcheck(
559 	mddb_set_t	*s,
560 	mddb_block_t	blk
561 )
562 {
563 	int		bit, byte;
564 
565 	byte = blk / 8;
566 	bit = 1 << (blk & 7);
567 	return (s->s_freebitmap[byte] & bit);
568 }
569 
570 /*
571  * not fast but simple
572  */
573 static mddb_block_t
getfreeblks(mddb_set_t * s,size_t count)574 getfreeblks(
575 	mddb_set_t	*s,
576 	size_t		count
577 )
578 {
579 	int		i;
580 	size_t		contig;
581 
582 	contig = 0;
583 	for (i = 0; i < s->s_totalblkcnt; i++) {
584 		if (blkcheck(s, i)) {
585 			contig = 0;
586 		} else {
587 			contig++;
588 			if (contig == count) {
589 				contig = i - count + 1;
590 				for (i = (int)contig; i < contig + count; i++)
591 					blkbusy(s, i);
592 				return ((mddb_block_t)contig);
593 			}
594 		}
595 	}
596 	return (0);
597 }
598 
599 static void
computefreeblks(mddb_set_t * s)600 computefreeblks(
601 	mddb_set_t	*s
602 )
603 {
604 	mddb_db_t	*dbp;
605 	mddb_de_ic_t	*dep;
606 	int		i;
607 	int		minblks;
608 	int		freeblks;
609 	mddb_mb_ic_t	*mbip;
610 	mddb_lb_t	*lbp;
611 	mddb_block_t	maxblk;
612 	mddb_did_db_t	*did_dbp;
613 	int		nblks;
614 
615 	minblks = 0;
616 	lbp = s->s_lbp;
617 	maxblk = 0;
618 
619 	/*
620 	 * Determine the max number of blocks.
621 	 */
622 	nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
623 	/*
624 	 * go through and find highest logical block
625 	 */
626 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
627 		if (dbp->db_blknum > maxblk)
628 			maxblk = dbp->db_blknum;
629 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
630 			for (i = 0; i < dep->de_blkcount; i++)
631 				if (dep->de_blks[i] > maxblk)
632 					maxblk = dep->de_blks[i];
633 	}
634 
635 	for (i = 0; i < lbp->lb_loccnt; i++) {
636 		mddb_locator_t	*lp = &lbp->lb_locators[i];
637 
638 		if ((lp->l_flags & MDDB_F_DELETED) ||
639 		    (lp->l_flags & MDDB_F_EMASTER))
640 			continue;
641 
642 		freeblks = 0;
643 		for (mbip = s->s_mbiarray[i]; mbip != NULL;
644 		    mbip = mbip->mbi_next) {
645 			freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
646 		}
647 		if (freeblks == 0)	/* this happen when there is no */
648 			continue;	/*	master blk		*/
649 
650 		if (freeblks <= maxblk) {
651 			lp->l_flags |= MDDB_F_TOOSMALL;
652 			lp->l_flags &= ~MDDB_F_ACTIVE;
653 		}
654 
655 		if (freeblks < minblks || minblks == 0)
656 			minblks = freeblks;
657 	}
658 	/*
659 	 * set up reasonable freespace if no
660 	 * data bases exist
661 	 */
662 	if (minblks == 0)
663 		minblks = 100;
664 	if (minblks > nblks)
665 		minblks = nblks;
666 	s->s_freeblkcnt = minblks;
667 	s->s_totalblkcnt = minblks;
668 	if (! s->s_freebitmapsize) {
669 		s->s_freebitmapsize = nblks / 8;
670 		s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
671 		    KM_SLEEP);
672 	}
673 	bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
674 
675 	/* locator block sectors */
676 	for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
677 		blkbusy(s, i);
678 
679 	/* locator name sectors */
680 	for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
681 		blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));
682 
683 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
684 		/* locator block device id information */
685 		for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
686 			blkbusy(s, (s->s_lbp->lb_didfirstblk + i));
687 
688 		/* disk blocks containing actual device ids */
689 		did_dbp = s->s_did_icp->did_ic_dbp;
690 		while (did_dbp) {
691 			for (i = 0; i < did_dbp->db_blkcnt; i++) {
692 				blkbusy(s, did_dbp->db_firstblk + i);
693 			}
694 			did_dbp = did_dbp->db_next;
695 		}
696 	}
697 
698 	/* Only use data tags if not a MN set */
699 	if (!(lbp->lb_flags & MDDB_MNSET)) {
700 		/* Found a bad tag, do NOT mark the data tag blks busy here */
701 		if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
702 			for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
703 				blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
704 		}
705 	}
706 
707 	/* directory block/entry sectors */
708 	for (dbp = s->s_dbp; dbp != 0;	dbp = dbp->db_next) {
709 		blkbusy(s, dbp->db_blknum);
710 		for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
711 			for (i = 0; i < dep->de_blkcount; i++)
712 				blkbusy(s, dep->de_blks[i]);
713 	}
714 }
715 
716 /*
717  * Add free space to the device id incore free list.
718  * Called:
719  *    - During startup when all devid blocks are temporarily placed on the
720  *       free list
721  *    - After a devid has been deleted via the metadb command.
722  *    - When mddb_devid_free_get adds unused space from a disk block
723  *       to free list
724  */
725 static int
mddb_devid_free_add(mddb_set_t * s,uint_t firstblk,uint_t offset,uint_t length)726 mddb_devid_free_add(
727 	mddb_set_t *s,
728 	uint_t firstblk,
729 	uint_t offset,
730 	uint_t length
731 )
732 {
733 	mddb_did_free_t	*did_freep;
734 
735 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
736 		return (0);
737 	}
738 
739 	did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
740 	    KM_SLEEP);
741 	did_freep->free_blk = firstblk;
742 	did_freep->free_offset = offset;
743 	did_freep->free_length = length;
744 	did_freep->free_next = s->s_did_icp->did_ic_freep;
745 	s->s_did_icp->did_ic_freep = did_freep;
746 
747 	return (0);
748 }
749 
750 /*
751  * Remove specific free space from the device id incore free list.
752  * Called at startup (after all devid blocks have been placed on
753  * free list) in order to remove the free space from the list that
754  * contains actual devids.
755  * Returns 0 if area successfully removed.
756  * Returns 1 if no matching area is found - so nothing removed.
757  */
758 static int
mddb_devid_free_delete(mddb_set_t * s,uint_t firstblk,uint_t offset,uint_t length)759 mddb_devid_free_delete(
760 	mddb_set_t *s,
761 	uint_t firstblk,
762 	uint_t offset,
763 	uint_t length
764 )
765 {
766 	int		block_found = 0;
767 	mddb_did_free_t	*did_freep1;		/* next free block */
768 	mddb_did_free_t	*did_freep2 = 0;	/* previous free block */
769 	mddb_did_free_t *did_freep_before;	/* area before offset, len */
770 	mddb_did_free_t	*did_freep_after;	/* area after offset, len */
771 	uint_t		old_length;
772 
773 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
774 		return (1);
775 	}
776 
777 	/* find free block for this devid */
778 	did_freep1 = s->s_did_icp->did_ic_freep;
779 	while (did_freep1) {
780 		/*
781 		 * Look through free list of <block, offset, length> to
782 		 * find our entry in the free list.  Our entry should
783 		 * exist since the entire devid block was placed into
784 		 * this free list at startup.  This code is just removing
785 		 * the non-free (in-use) portions of the devid block so
786 		 * that the remaining linked list does indeed just
787 		 * contain a free list.
788 		 *
789 		 * Our entry has been found if
790 		 *   - the blocks match,
791 		 *   - the offset (starting address) in the free list is
792 		 *	less than the offset of our entry and
793 		 *   - the length+offset (ending address) in the free list is
794 		 *	greater than the length+offset of our entry.
795 		 */
796 		if ((did_freep1->free_blk == firstblk) &&
797 		    (did_freep1->free_offset <= offset) &&
798 		    ((did_freep1->free_length + did_freep1->free_offset) >=
799 		    (length + offset))) {
800 			/* Have found our entry - remove from list */
801 			block_found = 1;
802 			did_freep_before = did_freep1;
803 			old_length = did_freep1->free_length;
804 			/* did_freep1 - pts to next free block */
805 			did_freep1 = did_freep1->free_next;
806 			if (did_freep2) {
807 				did_freep2->free_next = did_freep1;
808 			} else {
809 				s->s_did_icp->did_ic_freep = did_freep1;
810 			}
811 
812 			/*
813 			 * did_freep_before points to area in block before
814 			 * offset, length.
815 			 */
816 			did_freep_before->free_length = offset -
817 			    did_freep_before->free_offset;
818 			/*
819 			 * did_freep_after points to area in block after
820 			 * offset, length.
821 			 */
822 			did_freep_after = (mddb_did_free_t *)kmem_zalloc
823 			    (sizeof (mddb_did_free_t), KM_SLEEP);
824 			did_freep_after->free_blk = did_freep_before->free_blk;
825 			did_freep_after->free_offset = offset + length;
826 			did_freep_after->free_length = old_length - length -
827 			    did_freep_before->free_length;
828 			/*
829 			 * Add before and after areas to free list
830 			 * If area before or after offset, length has length
831 			 * of 0, that entry is not added.
832 			 */
833 			if (did_freep_after->free_length) {
834 				did_freep_after->free_next = did_freep1;
835 				if (did_freep2) {
836 					did_freep2->free_next =
837 					    did_freep_after;
838 				} else {
839 					s->s_did_icp->did_ic_freep =
840 					    did_freep_after;
841 				}
842 				did_freep1 = did_freep_after;
843 			} else {
844 				kmem_free(did_freep_after,
845 				    sizeof (mddb_did_free_t));
846 			}
847 
848 			if (did_freep_before->free_length) {
849 				did_freep_before->free_next = did_freep1;
850 				if (did_freep2) {
851 					did_freep2->free_next =
852 					    did_freep_before;
853 				} else {
854 					s->s_did_icp->did_ic_freep =
855 					    did_freep_before;
856 				}
857 			} else {
858 				kmem_free(did_freep_before,
859 				    sizeof (mddb_did_free_t));
860 			}
861 			break;
862 		} else {
863 			did_freep2 = did_freep1;
864 			did_freep1 = did_freep1->free_next;
865 		}
866 	}
867 	if (block_found == 0) {
868 		return (1);
869 	} else {
870 		return (0);
871 	}
872 }
873 
874 /*
875  * Find free space of devid length and remove free space from list.
876  * Return a pointer to the previously free area.
877  *
878  * If there's not enough free space on the free list, get an empty
879  * disk block, put the empty disk block on the did_ic_dbp linked list,
880  * and add the disk block space not used for devid to the free list.
881  *
882  * Return pointer to address (inside disk block) of free area for devid.
883  * Return 0 if error.
884  */
885 static caddr_t
mddb_devid_free_get(mddb_set_t * s,uint_t len,uint_t * blk,uint_t * cnt,uint_t * offset)886 mddb_devid_free_get(
887 	mddb_set_t *s,
888 	uint_t len,
889 	uint_t *blk,
890 	uint_t *cnt,
891 	uint_t *offset
892 )
893 {
894 	mddb_did_free_t	*freep, *freep2;
895 	mddb_did_db_t	*dbp;
896 	uint_t		blk_cnt, blk_num;
897 	ddi_devid_t	devid_ptr = NULL;
898 
899 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
900 		return (0);
901 	}
902 
903 	freep = s->s_did_icp->did_ic_freep;
904 	freep2 = (mddb_did_free_t *)NULL;
905 	while (freep) {
906 		/* found a free area - remove from free list */
907 		if (len <= freep->free_length) {
908 			*blk = freep->free_blk;
909 			*offset = freep->free_offset;
910 			/* find disk block pointer that contains free area */
911 			dbp = s->s_did_icp->did_ic_dbp;
912 			while (dbp) {
913 				if (dbp->db_firstblk == *blk)
914 					break;
915 				else
916 					dbp = dbp->db_next;
917 			}
918 			/*
919 			 * If a disk block pointer can't be found - something
920 			 * is wrong, so don't use this free space.
921 			 */
922 			if (dbp == NULL) {
923 				freep2 = freep;
924 				freep = freep->free_next;
925 				continue;
926 			}
927 
928 			devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
929 			*cnt = dbp->db_blkcnt;
930 
931 			/* Update free list information */
932 			freep->free_offset += len;
933 			freep->free_length -= len;
934 			if (freep->free_length == 0) {
935 				if (freep2) {
936 					freep2->free_next =
937 					    freep->free_next;
938 				} else {
939 					s->s_did_icp->did_ic_freep =
940 					    freep->free_next;
941 				}
942 				kmem_free(freep, sizeof (mddb_did_free_t));
943 			}
944 			break;
945 		}
946 		freep2 = freep;
947 		freep = freep->free_next;
948 	}
949 
950 	/* Didn't find a free spot */
951 	if (freep == NULL) {
952 		/* get free logical disk blk in replica */
953 		blk_cnt = btodb(len + (MDDB_BSIZE - 1));
954 		blk_num = getfreeblks(s, blk_cnt);
955 		if (blk_num == 0)
956 			return (0);
957 
958 		/* Add disk block to disk block linked list */
959 		dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
960 		dbp->db_firstblk = blk_num;
961 		dbp->db_blkcnt = blk_cnt;
962 		dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
963 		dbp->db_next = s->s_did_icp->did_ic_dbp;
964 		s->s_did_icp->did_ic_dbp = dbp;
965 		devid_ptr = (ddi_devid_t)dbp->db_ptr;
966 
967 		/* Update return values */
968 		*blk = blk_num;
969 		*offset = 0;
970 		*cnt = blk_cnt;
971 
972 		/* Add unused part of block to free list */
973 		(void) mddb_devid_free_add(s, blk_num,
974 		    len, (dbtob(blk_cnt) - len));
975 	}
976 
977 	return ((caddr_t)devid_ptr);
978 }
979 
980 /*
981  * Add device id information for locator index to device id area in set.
982  * Get free area to store device id from free list.   Update checksum
983  * for mddb_did_blk.
984  *
985  * This routine does not write any data out to disk.
986  * After this routine has been called, the routine, writelocall, should
987  * be called to write both the locator block and device id area out
988  * to disk.
989  */
990 static int
mddb_devid_add(mddb_set_t * s,uint_t index,ddi_devid_t devid,char * minor_name)991 mddb_devid_add(
992 	mddb_set_t	*s,
993 	uint_t		index,
994 	ddi_devid_t	devid,
995 	char		*minor_name
996 )
997 {
998 	uint_t		devid_len;
999 	uint_t		blk, offset;
1000 	ddi_devid_t	devid_ptr;
1001 	mddb_did_info_t	*did_info;
1002 	uint_t		blkcnt, i;
1003 	mddb_did_blk_t	*did_blk;
1004 
1005 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1006 		return (1);
1007 	}
1008 	if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
1009 		return (1);
1010 
1011 	/* Check if device id has already been added */
1012 	did_blk = s->s_did_icp->did_ic_blkp;
1013 	did_info = &(did_blk->blk_info[index]);
1014 	if (did_info->info_flags & MDDB_DID_EXISTS)
1015 		return (0);
1016 
1017 	devid_len = ddi_devid_sizeof(devid);
1018 	devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
1019 	    devid_len, &blk, &blkcnt, &offset);
1020 
1021 	if (devid_ptr == NULL) {
1022 		return (1);
1023 	}
1024 
1025 	/* Copy devid into devid free area */
1026 	for (i = 0; i < devid_len; i++)
1027 		((char *)devid_ptr)[i] = ((char *)devid)[i];
1028 
1029 	/* Update mddb_did_info area for new device id */
1030 	did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;
1031 
1032 	/*
1033 	 * Only set UPDATED flag for non-replicated import cases.
1034 	 * This allows the side locator driver name index to get
1035 	 * updated in load_old_replicas.
1036 	 */
1037 	if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
1038 		did_info->info_flags |= MDDB_DID_UPDATED;
1039 
1040 	did_info->info_firstblk = blk;
1041 	did_info->info_blkcnt = blkcnt;
1042 	did_info->info_offset = offset;
1043 	did_info->info_length = devid_len;
1044 	(void) strcpy(did_info->info_minor_name, minor_name);
1045 	crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);
1046 
1047 	/* Add device id pointer to did_ic_devid array */
1048 	s->s_did_icp->did_ic_devid[index] = devid_ptr;
1049 
1050 	return (0);
1051 }
1052 
1053 
1054 /*
1055  * Delete device id information for locator index from device id area in set.
1056  * Add device id space to free area.
1057  *
1058  * This routine does not write any data out to disk.
1059  * After this routine has been called, the routine, writelocall, should
1060  * be called to write both the locator block and device id area out
1061  * to disk.
1062  */
1063 static int
mddb_devid_delete(mddb_set_t * s,uint_t index)1064 mddb_devid_delete(mddb_set_t *s, uint_t index)
1065 {
1066 	mddb_did_info_t	*did_info;
1067 	mddb_did_blk_t	*did_blk;
1068 
1069 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1070 		return (1);
1071 	}
1072 
1073 	/* Get device id information from mddb_did_blk */
1074 	did_blk = s->s_did_icp->did_ic_blkp;
1075 	did_info = &(did_blk->blk_info[index]);
1076 
1077 	/*
1078 	 * Ensure that the underlying device supports device ids
1079 	 * before arbitrarily removing them.
1080 	 */
1081 	if (!(did_info->info_flags & MDDB_DID_EXISTS)) {
1082 		return (1);
1083 	}
1084 
1085 	/* Remove device id information from mddb_did_blk */
1086 	did_info->info_flags = 0;
1087 
1088 	/* Remove device id from incore area */
1089 	s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;
1090 
1091 	/* Add new free space in disk block to free list */
1092 	(void) mddb_devid_free_add(s, did_info->info_firstblk,
1093 	    did_info->info_offset, did_info->info_length);
1094 
1095 	return (0);
1096 }
1097 
1098 /*
1099  * Check if there is a device id for a locator index.
1100  *
1101  * Caller of this routine should not free devid or minor_name since
1102  * these will point to internal data structures that should not
1103  * be freed.
1104  */
1105 static int
mddb_devid_get(mddb_set_t * s,uint_t index,ddi_devid_t * devid,char ** minor_name)1106 mddb_devid_get(
1107 	mddb_set_t *s,
1108 	uint_t index,
1109 	ddi_devid_t *devid,
1110 	char **minor_name
1111 )
1112 {
1113 	mddb_did_info_t	*did_info;
1114 
1115 	if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1116 		return (0);
1117 	}
1118 	did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
1119 
1120 	if (did_info->info_flags & MDDB_DID_EXISTS) {
1121 		*devid = s->s_did_icp->did_ic_devid[index];
1122 		*minor_name =
1123 		    s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name;
1124 		return (1);
1125 	} else
1126 		return (0);
1127 
1128 
1129 }
1130 
1131 /*
1132  * Check if device id is valid on current system.
1133  * Needs devid, previously known dev_t and current minor_name.
1134  *
1135  * Success:
1136  * 	Returns 0 if valid device id is found and updates
1137  * 	dev_t if the dev_t associated with the device id is
1138  *	different than dev_t.
1139  * Failure:
1140  * 	Returns 1 if device id not valid on current system.
1141  */
1142 static int
mddb_devid_validate(ddi_devid_t devid,md_dev64_t * dev,char * minor_name)1143 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
1144 {
1145 	int		retndevs;
1146 	dev_t		*ddi_devs;
1147 	int		devid_flag = 0;
1148 	int 		cnt;
1149 
1150 	if (dev == 0)
1151 		return (1);
1152 	/*
1153 	 * See if devid is valid in the current system.
1154 	 * If so, set dev to match the devid.
1155 	 */
1156 	if (ddi_lyr_devid_to_devlist(devid, minor_name,
1157 	    &retndevs, &ddi_devs) == DDI_SUCCESS) {
1158 		if (retndevs > 0) {
1159 			/* devid is valid to use */
1160 			devid_flag = 1;
1161 			/* does dev_t in list match dev */
1162 			cnt = 0;
1163 			while (cnt < retndevs) {
1164 				if (*dev == md_expldev(ddi_devs[cnt]))
1165 					break;
1166 				cnt++;
1167 			}
1168 			/*
1169 			 * If a different dev_t, then setup
1170 			 * new dev and new major name
1171 			 */
1172 			if (cnt == retndevs) {
1173 				*dev = md_expldev(ddi_devs[0]);
1174 			}
1175 			ddi_lyr_free_devlist(ddi_devs, retndevs);
1176 		}
1177 	}
1178 	if (devid_flag)
1179 		return (0);
1180 	else
1181 		return (1);
1182 }
1183 
1184 
1185 /*
1186  * Free the devid incore data areas
1187  */
1188 static void
mddb_devid_icp_free(mddb_did_ic_t ** did_icp,mddb_lb_t * lbp)1189 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
1190 {
1191 	mddb_did_free_t	*did_freep1, *did_freep2;
1192 	mddb_did_db_t	*did_dbp1, *did_dbp2;
1193 	mddb_did_ic_t	*icp = *did_icp;
1194 
1195 	if (icp) {
1196 		if (icp->did_ic_blkp) {
1197 			kmem_free((caddr_t)icp->did_ic_blkp,
1198 			    dbtob(lbp->lb_didblkcnt));
1199 			icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
1200 		}
1201 
1202 		if (icp->did_ic_dbp) {
1203 			did_dbp1 = icp->did_ic_dbp;
1204 			while (did_dbp1) {
1205 				did_dbp2 = did_dbp1->db_next;
1206 				kmem_free((caddr_t)did_dbp1->db_ptr,
1207 				    dbtob(did_dbp1->db_blkcnt));
1208 				kmem_free((caddr_t)did_dbp1,
1209 				    sizeof (mddb_did_db_t));
1210 				did_dbp1 = did_dbp2;
1211 			}
1212 		}
1213 
1214 		if (icp->did_ic_freep) {
1215 			did_freep1 = icp->did_ic_freep;
1216 			while (did_freep1) {
1217 				did_freep2 = did_freep1->free_next;
1218 				kmem_free((caddr_t)did_freep1,
1219 				    sizeof (mddb_did_free_t));
1220 				did_freep1 = did_freep2;
1221 			}
1222 		}
1223 
1224 		kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
1225 		*did_icp = (mddb_did_ic_t *)NULL;
1226 	}
1227 
1228 }
1229 
1230 static daddr_t
getphysblk(mddb_block_t blk,mddb_mb_ic_t * mbip)1231 getphysblk(
1232 	mddb_block_t		blk,
1233 	mddb_mb_ic_t		*mbip
1234 )
1235 {
1236 	mddb_mb_t	*mbp = &(mbip->mbi_mddb_mb);
1237 
1238 	while (blk >= mbp->mb_blkcnt) {
1239 		if (! mbip->mbi_next)
1240 			return ((daddr_t)-1);	/* no such block */
1241 		blk -= mbp->mb_blkcnt;
1242 		mbip = mbip->mbi_next;
1243 		mbp = &(mbip->mbi_mddb_mb);
1244 	}
1245 
1246 	if (blk >= mbp->mb_blkmap.m_consecutive)
1247 		return ((daddr_t)-1);	/* no such block */
1248 
1249 	return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
1250 }
1251 
1252 /*
1253  * when a buf header is passed in the new buffer must be
1254  * put on the front of the chain. writerec counts on it
1255  */
1256 static int
putblks(mddb_set_t * s,caddr_t buffer,daddr_t blk,int cnt,md_dev64_t device,mddb_bf_t ** bufhead)1257 putblks(
1258 	mddb_set_t	*s,		/* incore db set structure */
1259 	caddr_t		buffer,		/* adr of buffer to be written */
1260 	daddr_t		blk,		/* block number for first block */
1261 	int		cnt,		/* number of blocks to be written */
1262 	md_dev64_t	device,		/* device to be written to */
1263 	mddb_bf_t	**bufhead	/* if non-zero then ASYNC I/O */
1264 					/*    and put buf address here */
1265 )
1266 {
1267 	buf_t		*bp;
1268 	mddb_bf_t	*bfp;
1269 	int		err = 0;
1270 
1271 	bfp = allocbuffer(s, MDDB_SLEEPOK);
1272 	bp = &bfp->bf_buf;
1273 	bp->b_bcount = MDDB_BSIZE * cnt;
1274 	bp->b_un.b_addr = buffer;
1275 	bp->b_blkno = blk;
1276 	bp->b_edev = md_dev64_to_dev(device);
1277 	/*
1278 	 * if a header for a buf chain is passed in this is async io.
1279 	 * currently only done for optimize  records
1280 	 */
1281 	if (bufhead) {
1282 		bfp->bf_next = *bufhead;
1283 		*bufhead = bfp;
1284 		(void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
1285 		return (0);
1286 	}
1287 	err = mddb_rwdata(s, B_WRITE, bp);
1288 	freebuffer(s, bfp);
1289 	if (err) {
1290 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1291 		    s->s_setno, device);
1292 		return (MDDB_F_EWRITE);
1293 	}
1294 	return (0);
1295 }
1296 
1297 /*
1298  * wrtblklst - takes an array of logical block numbers
1299  *		and writes the buffer to those blocks (scatter).
1300  * If called during upgrade, this routine expects a
1301  * non-translated (aka target) dev.
1302  */
1303 static int
wrtblklst(mddb_set_t * s,caddr_t buffer,mddb_block_t blka[],daddr_t cnt,const int li,mddb_bf_t ** bufhead,int master_only)1304 wrtblklst(
1305 	mddb_set_t	*s,		/* incore set structure */
1306 	caddr_t		buffer,		/* buffer to be written (record blk) */
1307 	mddb_block_t	blka[],		/* list of logical blks for record */
1308 	daddr_t		cnt,		/* number of logical blks */
1309 	const int	li,		/* locator index */
1310 	mddb_bf_t	**bufhead,	/* if non-zero then ASYNC I/O */
1311 					/*    and put buf address here */
1312 	int		master_only	/* allow only master node to write */
1313 )
1314 {
1315 	daddr_t		blk;
1316 	daddr_t		blk1;
1317 	int		err = 0;
1318 	int		cons;
1319 	mddb_lb_t	*lbp = s->s_lbp;
1320 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1321 	md_dev64_t	dev;
1322 	mddb_mb_ic_t	*mbip = s->s_mbiarray[li];
1323 
1324 	/*
1325 	 * If a MN diskset and only the master can write,
1326 	 * then a non-master node will just return success.
1327 	 */
1328 	if (lbp->lb_flags & MDDB_MNSET) {
1329 		if (master_only == MDDB_WR_ONLY_MASTER) {
1330 			/* return successfully if we aren't the master */
1331 			if (!(md_set[s->s_setno].s_am_i_master)) {
1332 				return (0);
1333 			}
1334 		}
1335 		if (mbip == NULL)
1336 			return (MDDB_F_EWRITE);
1337 	}
1338 
1339 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1340 	if (dev == NODEV64) {
1341 		return (1);
1342 	}
1343 
1344 	blk = getphysblk(blka[0], mbip);
1345 	ASSERT(blk >= 0);
1346 
1347 	cons = 1;
1348 	while (cnt) {
1349 		if (cons != cnt) {
1350 			blk1 = getphysblk(blka[cons], mbip);
1351 			ASSERT(blk1 >= 0);
1352 			if ((blk + cons) == blk1) {
1353 				cons++;
1354 				continue;
1355 			}
1356 		}
1357 		if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
1358 			/*
1359 			 * If an MN diskset and any_node_can_write
1360 			 * then this request is coming from writeoptrecord
1361 			 * and l_flags field should not be updated.
1362 			 * l_flags will be updated as a result of sending
1363 			 * a class1 message to the master.  Setting l_flags
1364 			 * here will cause slave to be out of sync with
1365 			 * master.
1366 			 *
1367 			 * Otherwise, set the error in l_flags
1368 			 * (this occurs if this is not a MN diskset or
1369 			 * only_master_can_write is set).
1370 			 */
1371 			if ((!(lbp->lb_flags & MDDB_MNSET)) ||
1372 			    (master_only == MDDB_WR_ONLY_MASTER)) {
1373 				lp->l_flags |= MDDB_F_EWRITE;
1374 			}
1375 			return (err);
1376 		}
1377 		if (bufhead)
1378 			(*bufhead)->bf_locator = lp;
1379 
1380 		buffer += MDDB_BSIZE * cons;
1381 		cnt -= cons;
1382 		blka += cons;
1383 		if (cnt) {
1384 			blk = getphysblk(blka[0], mbip);
1385 			ASSERT(blk >= 0);
1386 		}
1387 		cons = 1;
1388 	}
1389 
1390 	return (0);
1391 }
1392 
1393 /*
1394  * writeblks - takes a logical block number/block count pair
1395  * 		and writes the buffer to those contiguous logical blocks.
1396  * If called during upgrade, this routine expects a non-translated
1397  * (aka target) dev.
1398  */
1399 static int
writeblks(mddb_set_t * s,caddr_t buffer,mddb_block_t blk,int cnt,const int li,int master_only)1400 writeblks(
1401 	mddb_set_t	*s,		/* incore set structure */
1402 	caddr_t		buffer,		/* buffer to be written */
1403 	mddb_block_t	blk,		/* starting logical block number */
1404 	int		cnt,		/* number of log blocks to be written */
1405 	const int	li,		/* locator index */
1406 	int		master_only	/* allow only master node to write */
1407 )
1408 {
1409 	daddr_t		physblk;
1410 	int		err = 0;
1411 	int		i;
1412 	mddb_lb_t	*lbp = s->s_lbp;
1413 	mddb_locator_t	*lp = &lbp->lb_locators[li];
1414 	md_dev64_t	dev;
1415 	mddb_block_t	*blkarray;
1416 	int		size;
1417 	int		ret;
1418 
1419 	/*
1420 	 * If a MN diskset and only the master can write,
1421 	 * then a non-master node will just return success.
1422 	 */
1423 	if ((lbp->lb_flags & MDDB_MNSET) &&
1424 	    (master_only == MDDB_WR_ONLY_MASTER)) {
1425 		/* return successfully if we aren't the master */
1426 		if (!(md_set[s->s_setno].s_am_i_master)) {
1427 			return (0);
1428 		}
1429 	}
1430 
1431 	dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1432 	if (dev == NODEV64) {
1433 		return (1);
1434 	}
1435 
1436 	if (cnt > 1) {
1437 		size = sizeof (mddb_block_t) * cnt;
1438 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1439 		for (i = 0; i < cnt; i++)
1440 			blkarray[i] = blk + i;
1441 		ret = wrtblklst(s, buffer, blkarray, cnt,
1442 		    li, 0, MDDB_WR_ONLY_MASTER);
1443 		kmem_free(blkarray, size);
1444 		return (ret);
1445 	}
1446 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1447 	ASSERT(physblk > 0);
1448 	if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
1449 		lp->l_flags |= MDDB_F_EWRITE;
1450 		return (err);
1451 	}
1452 	return (0);
1453 }
1454 
1455 /*
1456  * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
1457  */
1458 static int
writeall(mddb_set_t * s,caddr_t buffer,mddb_block_t block,int cnt,int master_only)1459 writeall(
1460 	mddb_set_t	*s,		/* incore set structure */
1461 	caddr_t		buffer,		/* buffer to be written */
1462 	mddb_block_t	block,		/* starting logical block number */
1463 	int		cnt,		/* number of log blocks to be written */
1464 	int		master_only	/* allow only master node to write */
1465 )
1466 {
1467 	int		li;
1468 	int		err = 0;
1469 	mddb_lb_t	*lbp = s->s_lbp;
1470 
1471 	for (li = 0; li < lbp->lb_loccnt; li++) {
1472 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1473 
1474 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1475 		    (lp->l_flags & MDDB_F_EWRITE))
1476 			continue;
1477 
1478 		err |= writeblks(s, buffer, block, cnt, li, master_only);
1479 	}
1480 
1481 	return (err);
1482 }
1483 
1484 /*
1485  * writelocall - write the locator block and device id information (if
1486  * replica is in device id format) to all ACTIVE/NON-ERRORER replicas.
1487  *
1488  * Increments the locator block's commitcnt.  Updates the device id area's
1489  * commitcnt if the replica is in device id format.  Regenerates the
1490  * checksums after updating the commitcnt(s).
1491  */
1492 static int
writelocall(mddb_set_t * s)1493 writelocall(
1494 	mddb_set_t	*s	/* incore set structure */
1495 )
1496 {
1497 	int		li;
1498 	int		err = 0;
1499 	mddb_lb_t	*lbp = s->s_lbp;
1500 	mddb_did_blk_t	*did_blk;
1501 	mddb_did_db_t	*did_dbp;
1502 
1503 	s->s_lbp->lb_commitcnt++;
1504 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1505 		did_blk = s->s_did_icp->did_ic_blkp;
1506 		did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
1507 		crcgen(did_blk, &did_blk->blk_checksum,
1508 		    dbtob(lbp->lb_didblkcnt), NULL);
1509 	}
1510 	crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
1511 
1512 	for (li = 0; li < lbp->lb_loccnt; li++) {
1513 		mddb_locator_t	*lp = &lbp->lb_locators[li];
1514 
1515 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1516 		    (lp->l_flags & MDDB_F_EWRITE))
1517 			continue;
1518 
1519 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1520 			/* write out blocks containing actual device ids */
1521 			did_dbp = s->s_did_icp->did_ic_dbp;
1522 			while (did_dbp) {
1523 				err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
1524 				    did_dbp->db_firstblk,
1525 				    did_dbp->db_blkcnt, li,
1526 				    MDDB_WR_ONLY_MASTER);
1527 				did_dbp = did_dbp->db_next;
1528 			}
1529 
1530 			/* write out device id area block */
1531 			err |= writeblks(s, (caddr_t)did_blk,
1532 			    lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
1533 			    MDDB_WR_ONLY_MASTER);
1534 		}
1535 		/* write out locator block */
1536 		err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
1537 		    MDDB_WR_ONLY_MASTER);
1538 	}
1539 
1540 	/*
1541 	 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
1542 	 * in the mddb_set structure to show that the locator block has
1543 	 * been changed.
1544 	 */
1545 
1546 	if ((lbp->lb_flags & MDDB_MNSET) &&
1547 	    (md_set[s->s_setno].s_am_i_master)) {
1548 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
1549 	}
1550 	return (err);
1551 }
1552 
1553 /*
1554  * If called during upgrade, this routine expects a translated
1555  * (aka miniroot) dev.
1556  */
1557 static int
getblks(mddb_set_t * s,caddr_t buffer,md_dev64_t device,daddr_t blk,int cnt,int flag)1558 getblks(
1559 	mddb_set_t	*s,	/* incore db set structure */
1560 	caddr_t		buffer,	/* buffer to read data into */
1561 	md_dev64_t	device,	/* device to read from */
1562 	daddr_t		blk,	/* physical block number to read */
1563 	int		cnt,	/* number of blocks to read */
1564 	int		flag	/* flags for I/O */
1565 )
1566 {
1567 	buf_t		*bp;
1568 	mddb_bf_t	*bfp;
1569 	int		err = 0;
1570 
1571 	bfp = allocbuffer(s, MDDB_SLEEPOK);	/* this will never sleep */
1572 	bp = &bfp->bf_buf;
1573 	bp->b_bcount = MDDB_BSIZE * cnt;
1574 	bp->b_un.b_addr = buffer;
1575 	bp->b_blkno = blk;
1576 	bp->b_edev = md_dev64_to_dev(device);
1577 	err = mddb_rwdata(s, (B_READ | flag), bp);
1578 	freebuffer(s, bfp);
1579 	if (err) {
1580 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1581 		    s->s_setno, device);
1582 		return (MDDB_F_EREAD);
1583 	}
1584 	return (0);
1585 }
1586 
1587 /*
1588  * readblklst - takes an array of logical block numbers
1589  * 		and reads those blocks (gather) into the buffer.
1590  * If called during upgrade, this routine expects a non-translated
1591  * (aka target) dev.
1592  */
1593 static int
readblklst(mddb_set_t * s,caddr_t buffer,mddb_block_t blka[],daddr_t cnt,int li,int flag)1594 readblklst(
1595 	mddb_set_t	*s,	/* incore set structure */
1596 	caddr_t		buffer,	/* buffer to be read (record block) */
1597 	mddb_block_t	blka[],	/* list of logical blocks to be read */
1598 	daddr_t		cnt,	/* number of logical blocks */
1599 	int		li,	/* locator index */
1600 	int		flag	/* flags for I/O */
1601 )
1602 {
1603 	daddr_t		blk;
1604 	daddr_t		blk1;
1605 	int		err = 0;
1606 	int		cons;
1607 	md_dev64_t	dev;
1608 	mddb_mb_ic_t	*mbip;
1609 
1610 	mbip = s->s_mbiarray[li];
1611 	dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1612 	dev = md_xlate_targ_2_mini(dev);
1613 	if (dev == NODEV64) {
1614 		return (1);
1615 	}
1616 
1617 	blk = getphysblk(blka[0], mbip);
1618 	ASSERT(blk >= 0);
1619 
1620 	cons = 1;
1621 	while (cnt) {
1622 		if (cons != cnt) {
1623 			blk1 = getphysblk(blka[cons], mbip);
1624 			ASSERT(blk1 >= 0);
1625 			if ((blk + cons) == blk1) {
1626 				cons++;
1627 				continue;
1628 			}
1629 		}
1630 		if (err = getblks(s, buffer, dev, blk, cons, flag))
1631 			return (err);
1632 		buffer += MDDB_BSIZE * cons;
1633 		cnt -= cons;
1634 		blka += cons;
1635 		if (cnt) {
1636 			blk = getphysblk(blka[0], mbip);
1637 			ASSERT(blk >= 0);
1638 		}
1639 		cons = 1;
1640 	}
1641 	return (0);
1642 }
1643 
1644 /*
1645  * readblks - takes a logical block number/block count pair
1646  * 		and reads those contiguous logical blocks into the buffer.
1647  * If called during upgrade, this routine expects a non-translated
1648  * (aka target) dev.
1649  */
1650 static int
readblks(mddb_set_t * s,caddr_t buffer,mddb_block_t blk,int cnt,int li)1651 readblks(
1652 	mddb_set_t	*s,	/* incore set structure */
1653 	caddr_t		buffer,	/* buffer to be read into */
1654 	mddb_block_t	blk,	/* logical block number to be read */
1655 	int		cnt,	/* number of logical blocks to be read */
1656 	int		li	/* locator index */
1657 )
1658 {
1659 	daddr_t		physblk;
1660 	md_dev64_t	device;
1661 	int		i;
1662 	mddb_block_t	*blkarray;
1663 	int		size;
1664 	int		ret;
1665 
1666 	if (cnt > 1) {
1667 		size = sizeof (mddb_block_t) * cnt;
1668 		blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1669 		for (i = 0; i < cnt; i++)
1670 			blkarray[i] = blk + i;
1671 		ret = readblklst(s, buffer, blkarray, cnt, li, 0);
1672 		kmem_free(blkarray, size);
1673 		return (ret);
1674 	}
1675 	physblk = getphysblk(blk, s->s_mbiarray[li]);
1676 	ASSERT(physblk > 0);
1677 	device = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1678 	device = md_xlate_targ_2_mini(device);
1679 	if (device == NODEV64) {
1680 		return (1);
1681 	}
1682 	return (getblks(s, buffer, device, physblk, 1, 0));
1683 }
1684 
1685 static void
single_thread_start(mddb_set_t * s)1686 single_thread_start(
1687 	mddb_set_t	*s
1688 )
1689 {
1690 	while (s->s_singlelockgotten) {
1691 		s->s_singlelockwanted++;
1692 		cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
1693 	}
1694 	s->s_singlelockgotten++;
1695 }
1696 
1697 static void
single_thread_end(mddb_set_t * s)1698 single_thread_end(
1699 	mddb_set_t	*s
1700 )
1701 {
1702 	ASSERT(s->s_singlelockgotten);
1703 	s->s_singlelockgotten = 0;
1704 	if (s->s_singlelockwanted) {
1705 		s->s_singlelockwanted = 0;
1706 		cv_broadcast(&s->s_single_thread_cv);
1707 	}
1708 }
1709 
1710 static size_t
sizeofde(mddb_de_ic_t * dep)1711 sizeofde(
1712 	mddb_de_ic_t	*dep
1713 )
1714 {
1715 	size_t		size;
1716 
1717 	size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
1718 	    sizeof (mddb_block_t) * dep->de_blkcount;
1719 	return (size);
1720 }
1721 
1722 static size_t
sizeofde32(mddb_de32_t * dep)1723 sizeofde32(
1724 	mddb_de32_t	*dep
1725 )
1726 {
1727 	size_t		size;
1728 
1729 	size = sizeof (*dep) - sizeof (dep->de32_blks) +
1730 	    sizeof (mddb_block_t) * dep->de32_blkcount;
1731 	return (size);
1732 }
1733 
1734 static mddb_de32_t *
nextentry(mddb_de32_t * dep)1735 nextentry(
1736 	mddb_de32_t	*dep
1737 )
1738 {
1739 	mddb_de32_t	*ret;
1740 
1741 	ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep)));
1742 	return (ret);
1743 }
1744 
1745 static void
create_db32rec(mddb_db32_t * db32p,mddb_db_t * dbp)1746 create_db32rec(
1747 	mddb_db32_t *db32p,
1748 	mddb_db_t *dbp
1749 )
1750 {
1751 	mddb_de_ic_t *dep;
1752 	mddb_de32_t *de32p;
1753 
1754 #if defined(_ILP32) && !defined(lint)
1755 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
1756 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
1757 #endif
1758 
1759 	dbtodb32(dbp, db32p);
1760 	if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
1761 		db32p->db32_firstentry = 0x4;
1762 	de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
1763 	    + sizeof (db32p->db32_firstentry)));
1764 	for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
1765 		detode32(dep, de32p);
1766 		if ((dep->de_next != NULL) && (de32p->de32_next == 0))
1767 			de32p->de32_next = 0x4;
1768 		de32p = nextentry(de32p);
1769 	}
1770 	ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE);
1771 }
1772 
1773 /*
1774  * If called during upgrade, this routine expects a translated
1775  * (aka miniroot) dev.
1776  * If master blocks are found, set the mn_set parameter to 1 if the
1777  * the master block revision number is MDDB_REV_MNMB; otherwise,
1778  * set it to 0.
1779  * If master blocks are not found, do not change the mnset parameter.
1780  */
1781 static mddb_mb_ic_t *
getmasters(mddb_set_t * s,md_dev64_t dev,daddr_t blkno,uint_t * flag,int * mn_set)1782 getmasters(
1783 	mddb_set_t	*s,
1784 	md_dev64_t	dev,
1785 	daddr_t		blkno,
1786 	uint_t		*flag,
1787 	int		*mn_set
1788 )
1789 {
1790 	mddb_mb_ic_t	*mbi = NULL;
1791 	mddb_mb_t	*mb;
1792 	int		error = 0;
1793 	ddi_devid_t	devid;
1794 
1795 
1796 	if (mddb_devopen(dev)) {
1797 		if (flag)
1798 			*flag |= MDDB_F_EMASTER;
1799 		return ((mddb_mb_ic_t *)NULL);
1800 	}
1801 
1802 
1803 	mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
1804 	mb = &(mbi->mbi_mddb_mb);
1805 	if (error = getblks(s, (caddr_t)mb, dev, blkno,
1806 	    btodb(MDDB_BSIZE), 0)) {
1807 		error |= MDDB_F_EMASTER;
1808 	}
1809 	if (mb->mb_magic != MDDB_MAGIC_MB) {
1810 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1811 	}
1812 	/* Check for MDDB_REV_MNMB and lower */
1813 	if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
1814 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1815 	}
1816 	if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
1817 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1818 	}
1819 
1820 	if (!(md_get_setstatus(s->s_setno) &
1821 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
1822 	    (mb->mb_setno != s->s_setno)) {
1823 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1824 	}
1825 	if (mb->mb_blkno != blkno) {
1826 		error = MDDB_F_EFMT | MDDB_F_EMASTER;
1827 	}
1828 	mb->mb_next = NULL;
1829 	mbi->mbi_next = NULL;
1830 
1831 	if (error)
1832 		goto out;
1833 
1834 	/*
1835 	 * Check the md_devid_destroy and md_keep_repl_state flags
1836 	 * to see if we need to regen the devid or not.
1837 	 *
1838 	 * Don't care about devid in local set since it is not used
1839 	 * and this should not be part of set importing
1840 	 */
1841 	if ((s->s_setno != MD_LOCAL_SET) &&
1842 	    !(md_get_setstatus(s->s_setno) &
1843 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
1844 		/*
1845 		 * Now check the destroy flag. We also need to handle
1846 		 * the case where the destroy flag is reset after the
1847 		 * destroy
1848 		 */
1849 		if (md_devid_destroy || (mb->mb_devid_len == 0)) {
1850 
1851 			if (md_devid_destroy) {
1852 				bzero(mb->mb_devid, mb->mb_devid_len);
1853 				mb->mb_devid_len = 0;
1854 			}
1855 
1856 			/*
1857 			 * Try to regenerate it if the 'keep' flag is not set
1858 			 */
1859 			if (!md_keep_repl_state) {
1860 				if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
1861 				    &devid) == DDI_SUCCESS) {
1862 					mb->mb_devid_len =
1863 					    ddi_devid_sizeof(devid);
1864 					bcopy(devid, mb->mb_devid,
1865 					    mb->mb_devid_len);
1866 					ddi_devid_free(devid);
1867 				} else {
1868 					error = MDDB_F_EFMT | MDDB_F_EMASTER;
1869 				}
1870 			}
1871 
1872 			crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
1873 
1874 			/*
1875 			 * Push
1876 			 */
1877 			if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
1878 				error = MDDB_F_EFMT | MDDB_F_EMASTER;
1879 			}
1880 		}
1881 	}
1882 
1883 	if (! error) {
1884 		/* Set mn_set parameter to 1 if a MN set */
1885 		if (mb->mb_revision == MDDB_REV_MNMB)
1886 			*mn_set = 1;
1887 		else
1888 			*mn_set = 0;
1889 		return (mbi);
1890 	}
1891 
1892 out:
1893 	/* Error Out */
1894 	if (flag)
1895 		*flag |= error;
1896 
1897 	kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
1898 	mddb_devclose(dev);
1899 	return ((mddb_mb_ic_t *)NULL);
1900 }
1901 
1902 static int
getrecord(mddb_set_t * s,mddb_de_ic_t * dep,int li)1903 getrecord(
1904 	mddb_set_t	*s,
1905 	mddb_de_ic_t	*dep,
1906 	int		li
1907 )
1908 {
1909 	int		err = 0;
1910 	mddb_rb32_t	*rbp;
1911 
1912 #if defined(_ILP32) && !defined(lint)
1913 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
1914 #endif
1915 
1916 
1917 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
1918 	rbp = dep->de_rb;
1919 
1920 	err = readblklst(s, (caddr_t)rbp, dep->de_blks,
1921 	    dep->de_blkcount, li, 0);
1922 	if (err) {
1923 		return (MDDB_F_EDATA | err);
1924 	}
1925 	if (rbp->rb_magic != MDDB_MAGIC_RB) {
1926 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1927 	}
1928 	if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
1929 	    (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) &&
1930 	    (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) &&
1931 	    (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) {
1932 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1933 	}
1934 	/* Check crc for this record */
1935 	if (rec_crcchk(s, dep, rbp)) {
1936 		return (MDDB_F_EFMT | MDDB_F_EDATA);
1937 	}
1938 	return (0);
1939 }
1940 
1941 /*
1942  * Code to read in the locator name information
1943  */
1944 static int
readlocnames(mddb_set_t * s,int li)1945 readlocnames(
1946 	mddb_set_t	*s,
1947 	int		li
1948 )
1949 {
1950 	mddb_ln_t	*lnp;
1951 	int		err = 0;
1952 	mddb_block_t	ln_blkcnt, ln_blkno;
1953 
1954 	/*
1955 	 * read in the locator name blocks
1956 	 */
1957 	s->s_lnp = NULL;
1958 
1959 	ln_blkno = s->s_lbp->lb_lnfirstblk;
1960 	ln_blkcnt = s->s_lbp->lb_lnblkcnt;
1961 	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);
1962 
1963 	err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
1964 	if (err) {
1965 		err |= MDDB_F_EDATA;
1966 		goto out;
1967 	}
1968 	if (lnp->ln_magic != MDDB_MAGIC_LN) {
1969 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1970 		goto out;
1971 	}
1972 	if (s->s_lbp->lb_flags & MDDB_MNSET) {
1973 		if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
1974 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1975 			goto out;
1976 		}
1977 	} else {
1978 		if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
1979 			err = MDDB_F_EDATA | MDDB_F_EFMT;
1980 			goto out;
1981 		}
1982 	}
1983 	if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
1984 		err = MDDB_F_EDATA | MDDB_F_EFMT;
1985 		goto out;
1986 	}
1987 out:
1988 	/*
1989 	 *	if error occurred in locator name blocks free them
1990 	 *	and return
1991 	 */
1992 	if (err) {
1993 		kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
1994 		return (err);
1995 	}
1996 	s->s_lnp = lnp;
1997 	return (0);
1998 }
1999 
2000 /*
2001  * code to read in a copy of the database.
2002  */
2003 
2004 static int
readcopy(mddb_set_t * s,int li)2005 readcopy(
2006 	mddb_set_t	*s,
2007 	int		li
2008 )
2009 {
2010 	uint_t		blk;
2011 	mddb_db_t	*dbp, *dbp1, *dbhp;
2012 	mddb_db32_t	*db32p;
2013 	mddb_de_ic_t	*dep, *dep2;
2014 	mddb_de32_t	*de32p, *de32p2;
2015 	int		err = 0;
2016 	uint_t		checksum;
2017 
2018 
2019 #if defined(_ILP32) && !defined(lint)
2020 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2021 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2022 #endif
2023 
2024 	dbp = NULL;
2025 	dbhp = NULL;
2026 	/*
2027 	 *	read in all the directory blocks
2028 	 */
2029 	blk = s->s_lbp->lb_dbfirstblk;
2030 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2031 
2032 	for (; blk != 0; blk = dbp->db_nextblk) {
2033 		dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
2034 		if (! dbhp) {
2035 			dbhp = dbp1;
2036 		} else {
2037 			dbp->db_next = dbp1;
2038 		}
2039 		dbp = dbp1;
2040 
2041 		err = readblks(s, (caddr_t)db32p, blk, 1, li);
2042 		if (err) {
2043 			err |= MDDB_F_EDATA;
2044 			break;
2045 		}
2046 		db32todb(db32p, dbp);
2047 		if (db32p->db32_magic != MDDB_MAGIC_DB) {
2048 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2049 			break;
2050 		}
2051 		if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
2052 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2053 			break;
2054 		}
2055 		if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
2056 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2057 			break;
2058 		}
2059 		/*
2060 		 * first go through and fix up all de_next pointers
2061 		 */
2062 		if (dbp->db_firstentry) {
2063 
2064 			de32p = (mddb_de32_t *)
2065 			    ((void *) ((caddr_t)(&db32p->db32_firstentry)
2066 			    + sizeof (db32p->db32_firstentry)));
2067 
2068 			dep = (mddb_de_ic_t *)
2069 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
2070 			    sizeof (mddb_block_t) +
2071 			    sizeof (mddb_block_t) * de32p->de32_blkcount,
2072 			    KM_SLEEP);
2073 			de32tode(de32p, dep);
2074 
2075 			dbp->db_firstentry = dep;
2076 			while (de32p && de32p->de32_next) {
2077 
2078 				de32p2 = nextentry(de32p);
2079 
2080 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
2081 				    sizeof (mddb_de_ic_t) -
2082 				    sizeof (mddb_block_t) +
2083 				    sizeof (mddb_block_t) *
2084 				    de32p2->de32_blkcount, KM_SLEEP);
2085 
2086 				de32tode(de32p2, dep2);
2087 
2088 				dep->de_next = dep2;
2089 				dep = dep2;
2090 				de32p = de32p2;
2091 			}
2092 		}
2093 		/*
2094 		 * go through and make all of the pointer to record blocks
2095 		 * are null;
2096 		 */
2097 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
2098 			dep->de_rb = NULL;
2099 	}
2100 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2101 	dbp->db_next = NULL;
2102 	/*
2103 	 *	if error occurred in directory blocks free them
2104 	 *	and return
2105 	 */
2106 	if (err) {
2107 		dbp = dbhp;
2108 		while (dbp) {
2109 			dep = dbp->db_firstentry;
2110 			while (dep) {
2111 				/* No mddb_rb32_t structures yet */
2112 				dep2 = dep->de_next;
2113 				kmem_free((caddr_t)dep, sizeofde(dep));
2114 				dep = dep2;
2115 			}
2116 			dbp1 = dbp->db_next;
2117 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2118 			dbp = dbp1;
2119 		}
2120 		s->s_dbp = NULL;
2121 		return (err);
2122 
2123 	}
2124 	/*
2125 	 */
2126 	err = 0;
2127 	checksum = MDDB_GLOBAL_XOR;
2128 	for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
2129 		checksum ^= dbp->db_recsum;
2130 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2131 			if (dep->de_flags & MDDB_F_OPT)
2132 				continue;
2133 			err = getrecord(s, dep, li);
2134 			if (err)
2135 				break;
2136 			/* Don't include CHANGELOG in big XOR */
2137 			if (dep->de_flags & MDDB_F_CHANGELOG)
2138 				continue;
2139 			checksum ^= dep->de_rb->rb_checksum;
2140 			checksum ^= dep->de_rb->rb_checksum_fiddle;
2141 		}
2142 		if (err)
2143 			break;
2144 	}
2145 	if (checksum) {
2146 		if (! err)
2147 			err = MDDB_F_EDATA | MDDB_F_EFMT;
2148 	}
2149 	if (err) {
2150 		dbp = dbhp;
2151 		dbhp = NULL;
2152 		while (dbp) {
2153 			dep = dbp->db_firstentry;
2154 			while (dep) {
2155 				if (dep->de_rb)
2156 					kmem_free((caddr_t)dep->de_rb,
2157 					    dep->de_recsize);
2158 				dep2 = dep->de_next;
2159 				kmem_free((caddr_t)dep, sizeofde(dep));
2160 				dep = dep2;
2161 			}
2162 			dbp1 = dbp->db_next;
2163 			kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2164 			dbp = dbp1;
2165 		}
2166 	}
2167 	s->s_dbp = dbhp;
2168 	return (err);
2169 }
2170 
2171 static int
getoptcnt(mddb_set_t * s,int li)2172 getoptcnt(
2173 	mddb_set_t	*s,
2174 	int		li)
2175 {
2176 	int		result;
2177 	mddb_de_ic_t	*dep;
2178 	mddb_db_t	*dbp;
2179 
2180 #if defined(_ILP32) && !defined(lint)
2181 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2182 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2183 #endif
2184 
2185 	result = 0;
2186 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2187 		dep = dbp->db_firstentry;
2188 		for (; dep != NULL; dep = dep->de_next) {
2189 			if (! (dep->de_flags & MDDB_F_OPT))
2190 				continue;
2191 			if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) &&
2192 			    (li == dep->de_optinfo[0].o_li)) ||
2193 			    ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) &&
2194 			    (li == dep->de_optinfo[1].o_li)))
2195 			result++;
2196 		}
2197 	}
2198 	return (result);
2199 }
2200 
2201 static void
getoptdev(mddb_set_t * s,mddb_de_ic_t * rdep,int opti)2202 getoptdev(
2203 	mddb_set_t	*s,
2204 	mddb_de_ic_t	*rdep,
2205 	int		opti
2206 )
2207 {
2208 	mddb_lb_t	*lbp;
2209 	mddb_locator_t	*lp;
2210 	mddb_optinfo_t	*otherop;
2211 	mddb_optinfo_t	*resultop;
2212 	int		li;
2213 	dev_t		otherdev;
2214 	int		blkonly = 0;
2215 	int		mincnt;
2216 	int		thiscnt;
2217 
2218 	lbp = s->s_lbp;
2219 
2220 	resultop = &rdep->de_optinfo[opti];
2221 	otherop = &rdep->de_optinfo[1-opti];
2222 
2223 	resultop->o_flags = 0;
2224 
2225 	/*
2226 	 * scan through and see if data bases have to vary by only device
2227 	 */
2228 
2229 	if (otherop->o_flags & MDDB_F_ACTIVE) {
2230 		blkonly = 1;
2231 		otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
2232 		for (li = 0; li < lbp->lb_loccnt; li++) {
2233 			lp = &lbp->lb_locators[li];
2234 			if (! (lp->l_flags & MDDB_F_ACTIVE))
2235 				continue;
2236 			if (expldev(lp->l_dev) != otherdev) {
2237 				blkonly = 0;
2238 				break;
2239 			}
2240 		}
2241 	}
2242 
2243 	mincnt = 999999;
2244 	for (li = 0; li < lbp->lb_loccnt; li++) {
2245 		dev_info_t	*devi;
2246 		int		removable = 0;
2247 
2248 		lp = &lbp->lb_locators[li];
2249 		if (! (lp->l_flags & MDDB_F_ACTIVE))
2250 			continue;
2251 		if (otherop->o_flags & MDDB_F_ACTIVE) {
2252 			if (blkonly) {
2253 				if (otherop->o_li == li)
2254 					continue;
2255 			} else {
2256 				if (otherdev == expldev(lp->l_dev))
2257 					continue;
2258 			}
2259 		}
2260 
2261 		/*
2262 		 * Check if this is a removable device.  If it is we
2263 		 * assume it is something like a USB flash disk, a zip disk
2264 		 * or even a floppy that is being used to help maintain
2265 		 * mddb quorum.  We don't want to put any optimized resync
2266 		 * records on these kinds of disks since they are usually
2267 		 * slower or don't have the same read/write lifetimes as
2268 		 * a regular fixed disk.
2269 		 */
2270 		if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
2271 			int		error;
2272 			struct cb_ops	*cb;
2273 			ddi_prop_op_t	prop_op = PROP_LEN_AND_VAL_BUF;
2274 			int		propvalue = 0;
2275 			int		proplength = sizeof (int);
2276 
2277 			if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
2278 			    != NULL) {
2279 				error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
2280 				    prop_op, DDI_PROP_NOTPROM |
2281 				    DDI_PROP_DONTPASS, "removable-media",
2282 				    (caddr_t)&propvalue, &proplength);
2283 
2284 				if (error == DDI_PROP_SUCCESS)
2285 					removable = 1;
2286 			}
2287 
2288 			ddi_release_devi(devi);
2289 		}
2290 
2291 		if (removable)
2292 			continue;
2293 
2294 		thiscnt = getoptcnt(s, li);
2295 		if (thiscnt < mincnt) {
2296 			resultop->o_li  = li;
2297 			mincnt = thiscnt;
2298 			resultop->o_flags = MDDB_F_ACTIVE;
2299 		}
2300 	}
2301 }
2302 
2303 static void
allocuserdata(mddb_de_ic_t * dep)2304 allocuserdata(
2305 	mddb_de_ic_t	*dep
2306 )
2307 {
2308 	mddb_rb32_t	*rbp;
2309 
2310 #if defined(_ILP32) && !defined(lint)
2311 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2312 #endif
2313 
2314 	rbp = dep->de_rb;
2315 	rbp->rb_private = 0;
2316 	dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);
2317 	rbp->rb_userdata = 0x4;	/* Make sure this is non-zero */
2318 	bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize);
2319 }
2320 
2321 
2322 static void
getuserdata(set_t setno,mddb_de_ic_t * dep)2323 getuserdata(
2324 	set_t		setno,
2325 	mddb_de_ic_t	*dep
2326 )
2327 {
2328 	mddb_rb32_t	 *rbp;
2329 
2330 
2331 	mddb_type_t	type = dep->de_type1;
2332 	caddr_t		data, udata;
2333 
2334 #if defined(_ILP32) && !defined(lint)
2335 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2336 #endif
2337 	rbp = dep->de_rb;
2338 	data = (caddr_t)rbp->rb_data;
2339 	udata = (caddr_t)dep->de_rb_userdata;
2340 
2341 	/*
2342 	 * If it's a driver record, and an old style record, and not a DRL
2343 	 * record, we must convert it because it was incore as a 64 bit
2344 	 * structure but its on disk layout has only 32 bit for block sizes
2345 	 */
2346 	if (!(md_get_setstatus(setno) &
2347 	    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
2348 	    (type >= MDDB_FIRST_MODID) &&
2349 	    ((rbp->rb_revision == MDDB_REV_RB) ||
2350 	    (rbp->rb_revision == MDDB_REV_RBFN))) {
2351 
2352 		switch (dep->de_flags) {
2353 
2354 			case MDDB_F_STRIPE:
2355 				stripe_convert(data, udata, BIG_2_SMALL);
2356 				break;
2357 
2358 			case MDDB_F_MIRROR:
2359 				mirror_convert(data, udata, BIG_2_SMALL);
2360 				break;
2361 
2362 			case MDDB_F_RAID:
2363 				raid_convert(data, udata, BIG_2_SMALL);
2364 				break;
2365 
2366 			case MDDB_F_SOFTPART:
2367 				softpart_convert(data, udata, BIG_2_SMALL);
2368 				break;
2369 
2370 			case MDDB_F_TRANS_MASTER:
2371 				trans_master_convert(data, udata, BIG_2_SMALL);
2372 				break;
2373 
2374 			case MDDB_F_TRANS_LOG:
2375 				trans_log_convert(data, udata, BIG_2_SMALL);
2376 				break;
2377 
2378 			case MDDB_F_HOTSPARE:
2379 				hs_convert(data, udata, BIG_2_SMALL);
2380 				break;
2381 
2382 			case MDDB_F_OPT:
2383 			default:
2384 				bcopy(udata, data, dep->de_reqsize);
2385 		}
2386 	} else {
2387 		bcopy(udata, data, dep->de_reqsize);
2388 	}
2389 }
2390 
2391 static void
getoptrecord(mddb_set_t * s,mddb_de_ic_t * dep)2392 getoptrecord(
2393 	mddb_set_t	*s,
2394 	mddb_de_ic_t	*dep
2395 )
2396 {
2397 	mddb_lb_t	*lbp;
2398 	mddb_locator_t	*lp;
2399 	mddb_rb32_t	*rbp, *crbp;
2400 	int		li;
2401 	int		i;
2402 	int		err = 0;
2403 	size_t		recsize;
2404 
2405 #if defined(_ILP32) && !defined(lint)
2406 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2407 #endif
2408 
2409 	lbp = s->s_lbp;
2410 
2411 	recsize = dep->de_recsize;
2412 	dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2413 	rbp = dep->de_rb;
2414 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2415 
2416 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
2417 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2418 
2419 	for (i = 0; i < 2; i++) {
2420 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2421 			continue;
2422 		li = dep->de_optinfo[i].o_li;
2423 		lp = &lbp->lb_locators[li];
2424 
2425 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
2426 		    (lp->l_flags & MDDB_F_EMASTER))
2427 			continue;
2428 
2429 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
2430 		    dep->de_blkcount, li, 0);
2431 
2432 		if (err)
2433 			continue;
2434 
2435 		if (rbp->rb_magic != MDDB_MAGIC_RB)
2436 			continue;
2437 
2438 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
2439 			continue;
2440 
2441 		/* Check the crc for this record */
2442 		if (rec_crcchk(s, dep, rbp)) {
2443 			continue;
2444 		}
2445 
2446 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
2447 
2448 		if (rbp == crbp) {
2449 			if (rbp->rb_checksum != crbp->rb_checksum)
2450 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2451 			break;
2452 		}
2453 		rbp = crbp;
2454 	}
2455 
2456 	if (rbp == crbp) {
2457 		rbp->rb_private = 0;
2458 		kmem_free((caddr_t)crbp, recsize);
2459 		return;
2460 	}
2461 	bzero((caddr_t)rbp, recsize);
2462 	rbp->rb_magic = MDDB_MAGIC_RB;
2463 	rbp->rb_revision = MDDB_REV_RB;
2464 	uniqtime32(&rbp->rb_timestamp);
2465 	/* Generate the crc for this record */
2466 	rec_crcgen(s, dep, rbp);
2467 	kmem_free((caddr_t)crbp, recsize);
2468 }
2469 
2470 /*
2471  * writeoptrecord writes out an optimized record.
2472  */
2473 static int
writeoptrecord(mddb_set_t * s,mddb_de_ic_t * dep)2474 writeoptrecord(
2475 	mddb_set_t	*s,
2476 	mddb_de_ic_t	*dep
2477 )
2478 {
2479 	mddb_rb32_t	*rbp;
2480 	int		li;
2481 	int		err = 0, wrt_err = 0;
2482 	mddb_bf_t	*bufhead, *bfp;
2483 	mddb_lb_t	*lbp = s->s_lbp;
2484 	mddb_locator_t	*lp;
2485 	int		i;
2486 
2487 #if defined(_ILP32) && !defined(lint)
2488 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2489 #endif
2490 
2491 	bufhead = NULL;
2492 	err = 0;
2493 
2494 	while (s->s_opthavequeuinglck) {
2495 		s->s_optwantqueuinglck++;
2496 		cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
2497 	}
2498 	s->s_opthavequeuinglck++;
2499 	rbp = dep->de_rb;
2500 	for (i = 0; i < 2; i++) {
2501 		/*
2502 		 * only possible error is xlate. This can
2503 		 * occur if a replica was off line and came
2504 		 * back. During the mean time the database grew
2505 		 * large than the now on line replica can store
2506 		 */
2507 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2508 			continue;
2509 		li = dep->de_optinfo[i].o_li;
2510 		/*
2511 		 * In a MN diskset, any node can write optimized record(s).
2512 		 */
2513 		wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
2514 		    dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
2515 		/*
2516 		 * For MN diskset, set error in optinfo structure so
2517 		 * that mddb_commitrec knows which replica failed.
2518 		 */
2519 		if ((MD_MNSET_SETNO(s->s_setno)) &&
2520 		    (wrt_err & MDDB_F_EWRITE)) {
2521 			dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
2522 		}
2523 		err |= wrt_err;
2524 	}
2525 	s->s_opthavequeuinglck = 0;
2526 	if (s->s_optwantqueuinglck) {
2527 		s->s_optwantqueuinglck = 0;
2528 		cv_broadcast(&s->s_optqueuing_cv);
2529 	}
2530 	for (bfp = bufhead; bfp; bfp = bufhead) {
2531 		mutex_exit(SETMUTEX(s->s_setno));
2532 		(void) biowait(&bfp->bf_buf);
2533 		mutex_enter(SETMUTEX(s->s_setno));
2534 		if (bfp->bf_buf.b_flags & B_ERROR) {
2535 			/*
2536 			 * If an MN diskset, don't set replica
2537 			 * in error since this hasn't been set in master.
2538 			 * Setting replica in error before master could
2539 			 * leave the nodes with different views of the
2540 			 * world since a class 1 configuration change
2541 			 * could occur in mddb_commitrec as soon as
2542 			 * all locks are dropped.  Must keep this
2543 			 * node the same as master and can't afford a
2544 			 * failure from the class 1 config change
2545 			 * if master succeeded.
2546 			 */
2547 			if (!(MD_MNSET_SETNO(s->s_setno))) {
2548 				bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
2549 			} else {
2550 				/*
2551 				 * Find which de_optinfo (which replica)
2552 				 * had a failure and set the failure in
2553 				 * the o_flags field.
2554 				 */
2555 				lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
2556 				if (lp == bfp->bf_locator) {
2557 					dep->de_optinfo[0].o_flags |=
2558 					    MDDB_F_EWRITE;
2559 				} else {
2560 					dep->de_optinfo[1].o_flags |=
2561 					    MDDB_F_EWRITE;
2562 				}
2563 			}
2564 			err |= MDDB_F_EWRITE;
2565 		}
2566 		bufhead = bfp->bf_next;
2567 		freebuffer(s, bfp);
2568 	}
2569 	return (err);
2570 }
2571 
2572 /*
2573  * Fix up the optimized resync record.  Used in the traditional and local
2574  * disksets to move an optimized record from a failed or deleted mddb
2575  * to an active one.
2576  *
2577  * In a MN diskset, the fixing of the optimized record is split between
2578  * the master and slave nodes.  If the master node moves the optimized
2579  * resync record, then the master node will send a MDDB_PARSE_OPTRECS
2580  * message to the slave nodes causing the slave nodes to reget the
2581  * directory entry containing the location of the optimized resync record.
2582  * After the record is reread from disk, then writeoptrecord is called
2583  * if the location of the optimized resync record or flags have changed.
2584  * When writeoptrecord is called, the node that is the owner of this record
2585  * will write the optimized record to the location specified in the directory
2586  * entry.  Since the master node uses the highest class message (PARSE)
2587  * the record owner node is guaranteed to already have an updated
2588  * directory entry incore.
2589  *
2590  * The other difference between the traditional/local set and MN diskset
2591  * is that the directory entry can be written to disk before the optimized
2592  * record in a MN diskset if the record is owned by a slave node.  So,
2593  * the users of an optimized record must handle the failure case when no
2594  * data is available from an optimized record since the master node could
2595  * have failed during the relocation of the optimized record to another mddb.
2596  */
2597 static int
fixoptrecord(mddb_set_t * s,mddb_de_ic_t * dep,mddb_db_t * dbp)2598 fixoptrecord(
2599 	mddb_set_t	*s,
2600 	mddb_de_ic_t	*dep,
2601 	mddb_db_t	*dbp
2602 )
2603 {
2604 	int		changed;
2605 	int		writedata;
2606 	int		err = 0;
2607 	int		i;
2608 	mddb_lb_t	*lbp;
2609 	mddb_optinfo_t	*op;
2610 	mddb_db32_t	*db32p;
2611 	int		rec_owner;	/* Is node owner of record? */
2612 
2613 #if defined(_ILP32) && !defined(lint)
2614 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2615 #endif
2616 
2617 	lbp = s->s_lbp;
2618 	changed = 0;
2619 	writedata = 0;
2620 	for (i = 0; i < 2; i++) {
2621 		op = &dep->de_optinfo[i];
2622 
2623 		if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
2624 			op->o_flags = 0;
2625 
2626 		/*
2627 		 * If optimized record has seen a replica failure,
2628 		 * assign new replica to record and re-write data
2629 		 * to new record.
2630 		 */
2631 		if (! (op->o_flags & MDDB_F_ACTIVE)) {
2632 			getoptdev(s, dep, i);
2633 			writedata++;
2634 			changed++;
2635 			/* Set flag for slaves to reread dep and write rec */
2636 			if (lbp->lb_flags & MDDB_MNSET) {
2637 				s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
2638 			}
2639 		}
2640 
2641 		/*
2642 		 * If just an error in the data was seen, set
2643 		 * the optimized record's replica flag to active (ok)
2644 		 * and try again.
2645 		 */
2646 		if (op->o_flags & MDDB_F_EDATA) {
2647 			dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
2648 			writedata++;
2649 		}
2650 	}
2651 
2652 	rec_owner = 0;
2653 	if (lbp->lb_flags & MDDB_MNSET) {
2654 		/*
2655 		 * If a MN diskset then check the owner of optimized record.
2656 		 * If the master node owns the record or if there is
2657 		 * no owner of the record, then the master can write the
2658 		 * optimized record to disk.
2659 		 * Master node can write the optimized record now, but
2660 		 * slave nodes write their records during handling of
2661 		 * the MDDB_PARSE_OPTRECS message.
2662 		 */
2663 		if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
2664 		    (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
2665 			rec_owner = 1;
2666 		}
2667 	} else {
2668 		/*
2669 		 * In traditional diskset and local set, this node
2670 		 * is always the record owner and always the master.
2671 		 */
2672 		rec_owner = 1;
2673 	}
2674 
2675 	/*
2676 	 * If this node is the record owner, write out record.
2677 	 */
2678 	if ((writedata) && (rec_owner)) {
2679 		if (err = writeoptrecord(s, dep)) {
2680 			return (err);
2681 		}
2682 	}
2683 	if (! changed)
2684 		return (0);
2685 	uniqtime32(&dbp->db_timestamp);
2686 	dbp->db_revision = MDDB_REV_DB;
2687 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2688 	create_db32rec(db32p, dbp);
2689 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
2690 	err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
2691 	    1, MDDB_WR_ONLY_MASTER);
2692 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
2693 	return (err);
2694 }
2695 
2696 static int
fixoptrecords(mddb_set_t * s)2697 fixoptrecords(
2698 	mddb_set_t		*s
2699 )
2700 {
2701 	mddb_de_ic_t	*dep;
2702 	mddb_db_t	*dbp;
2703 	int		err = 0;
2704 	set_t		setno;
2705 
2706 	/*
2707 	 * In a MN diskset, the master node is the only node that runs
2708 	 * fixoptrecords.  If the master node changes anything, then the
2709 	 * master node sends PARSE message to the slave nodes.  The slave
2710 	 * nodes will then re-read in the locator block or re-read in the
2711 	 * directory blocks and re-write the optimized resync records.
2712 	 */
2713 	setno = s->s_setno;
2714 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
2715 	    (md_set[setno].s_am_i_master == 0)) {
2716 		return (0);
2717 	}
2718 
2719 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2720 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2721 			if (! (dep->de_flags & MDDB_F_OPT))
2722 				continue;
2723 			err = fixoptrecord(s, dep, dbp);
2724 			if (err != 0)
2725 				return (err);
2726 		}
2727 	}
2728 	return (0);
2729 }
2730 
2731 /*
2732  * Checks incore version of mddb data to mddb data ondisk.
2733  *
2734  * Returns:
2735  *	- 0 if the data was successfully read and is good.
2736  *	- MDDB_F_EREAD if a read error occurred.
2737  *	- 1 if the data read is bad (checksum failed, etc)
2738  */
2739 static int
checkcopy(mddb_set_t * s,int li)2740 checkcopy
2741 (
2742 	mddb_set_t	*s,
2743 	int		li
2744 )
2745 {
2746 	mddb_db_t	*dbp;
2747 	mddb_db32_t	*cdb32p;
2748 	mddb_de_ic_t	*dep;
2749 	mddb_de32_t	*cde32p;
2750 	mddb_rb32_t	*rbp, *crbp;
2751 	size_t		size;
2752 	int		i;
2753 	int		retval = 1;
2754 
2755 #if defined(_ILP32) && !defined(lint)
2756 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2757 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2758 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2759 #endif
2760 
2761 	if (s->s_databuffer_size == 0) {
2762 		size_t maxrecsize = MDDB_BSIZE;
2763 
2764 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
2765 			for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
2766 				if (! (dep->de_flags & MDDB_F_OPT) &&
2767 				    dep->de_recsize > maxrecsize)
2768 					maxrecsize = dep->de_recsize;
2769 
2770 		s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
2771 		s->s_databuffer_size = maxrecsize;
2772 	}
2773 
2774 	cdb32p = (mddb_db32_t *)s->s_databuffer;
2775 
2776 	/*
2777 	 * first go through and make sure all directory stuff
2778 	 * is the same
2779 	 */
2780 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2781 		if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
2782 			retval = MDDB_F_EREAD;
2783 			goto err;
2784 		}
2785 		if (cdb32p->db32_magic != MDDB_MAGIC_DB)
2786 			goto err;
2787 		if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
2788 			goto err;
2789 		if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
2790 			goto err;
2791 		if (cdb32p->db32_nextblk != dbp->db_nextblk)
2792 			goto err;
2793 		if (cdb32p->db32_recsum != dbp->db_recsum)
2794 			goto err;
2795 		if (cdb32p->db32_firstentry) {
2796 			cde32p = (mddb_de32_t *)
2797 			    ((void *)((caddr_t)(&cdb32p->db32_firstentry)
2798 			    + sizeof (cdb32p->db32_firstentry)));
2799 		} else
2800 			cde32p = NULL;
2801 
2802 		dep = dbp->db_firstentry;
2803 		/*
2804 		 * check if all directory entries are identical
2805 		 */
2806 		while (dep && cde32p) {
2807 			if (dep->de_recid != cde32p->de32_recid)
2808 				goto err;
2809 			if (dep->de_type1 != cde32p->de32_type1)
2810 				goto err;
2811 			if (dep->de_type2 != cde32p->de32_type2)
2812 				goto err;
2813 			if (dep->de_reqsize != cde32p->de32_reqsize)
2814 				goto err;
2815 			if (dep->de_flags != cde32p->de32_flags)
2816 				goto err;
2817 
2818 			for (i = 0; i < 2; i++) {
2819 				if (dep->de_optinfo[i].o_li !=
2820 				    cde32p->de32_optinfo[i].o_li)
2821 					break;
2822 			}
2823 			if (i != 2)
2824 				goto err;
2825 			size = sizeof (mddb_block_t) * dep->de_blkcount;
2826 			if (bcmp((caddr_t)dep->de_blks,
2827 			    (caddr_t)cde32p->de32_blks, size))
2828 				goto err;
2829 			dep = dep->de_next;
2830 			if (cde32p->de32_next)
2831 				cde32p = nextentry(cde32p);
2832 			else
2833 				cde32p = NULL;
2834 		}
2835 		if (dep || cde32p)
2836 			goto err;
2837 	}
2838 	/*
2839 	 * If here, all directories are functionally identical
2840 	 * check to make sure all records are identical
2841 	 * the reason the records are not just bcmped is that the
2842 	 * lock flag does not want to be compared.
2843 	 */
2844 	crbp = (mddb_rb32_t *)cdb32p;
2845 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2846 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2847 			if ((dep->de_flags & MDDB_F_OPT) ||
2848 			    (dep->de_flags & MDDB_F_CHANGELOG))
2849 				continue;
2850 			rbp = (mddb_rb32_t *)dep->de_rb;
2851 			if (readblklst(s, (caddr_t)crbp, dep->de_blks,
2852 			    dep->de_blkcount, li, 0)) {
2853 				retval = MDDB_F_EREAD;
2854 				goto err;
2855 			}
2856 			/* Check the crc for this record */
2857 			if (rec_crcchk(s, dep, crbp))
2858 				goto err;
2859 
2860 			if (rbp->rb_checksum != crbp->rb_checksum ||
2861 			    rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
2862 				goto err;
2863 		}
2864 	}
2865 	return (0);
2866 err:
2867 	return (retval);
2868 }
2869 
2870 /*
2871  * Determine if the location information for two mddbs is the same.
2872  * The device slice and block offset should match.  If both have devids then
2873  * use that for the comparison, otherwise we compare the dev_ts.
2874  * Comparing with the devid allows us to handle the case where a mddb was
2875  * relocated to a dead mddbs dev_t.  The live mddb will have the dev_t of
2876  * the dead mddb but the devid comparison will catch this and not match.
2877  *
2878  * Return 1 if the location of the two mddbs match, 0 if not.
2879  */
2880 static int
match_mddb(mddb_ri_t * rip,ddi_devid_t devid,char * minor,md_dev64_t dev,daddr32_t blkno)2881 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
2882 	daddr32_t blkno)
2883 {
2884 	if (rip->ri_flags & MDDB_F_EMASTER) {
2885 		/*
2886 		 * If this element is errored then we don't try to match on it.
2887 		 * If we try to match we could erroneously match on the dev_t
2888 		 * of a relocated disk.
2889 		 */
2890 		return (0);
2891 	}
2892 
2893 	if (rip->ri_devid && devid && minor) {
2894 		/*
2895 		 * If old devid exists, then this is a replicated diskset
2896 		 * and both old and new devids must be checked.
2897 		 */
2898 		if (rip->ri_old_devid) {
2899 			if (((ddi_devid_compare(rip->ri_devid, devid) != 0) &&
2900 			    (ddi_devid_compare(rip->ri_old_devid,
2901 			    devid) != 0)) ||
2902 			    (strcmp(rip->ri_minor_name, minor) != 0))
2903 				return (0);
2904 		} else {
2905 			if (ddi_devid_compare(rip->ri_devid, devid) != 0 ||
2906 			    strcmp(rip->ri_minor_name, minor) != 0)
2907 				return (0);
2908 		}
2909 	} else {
2910 		if (rip->ri_dev != dev)
2911 			return (0);
2912 	}
2913 
2914 	if (rip->ri_blkno != blkno)
2915 		return (0);
2916 
2917 	return (1);
2918 }
2919 
2920 static int
ridev(mddb_ri_t ** rip,mddb_cfg_loc_t * clp,dev32_t * dev_2b_fixed,int flag)2921 ridev(
2922 	mddb_ri_t	**rip,
2923 	mddb_cfg_loc_t	*clp,
2924 	dev32_t		*dev_2b_fixed,
2925 	int		flag)
2926 {
2927 	mddb_ri_t	*r, *r1;
2928 	md_dev64_t	ldev, ndev;
2929 	major_t		majordev;
2930 	int		sz;
2931 
2932 	if (MD_UPGRADE) {
2933 		ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
2934 		    clp->l_mnum);
2935 	} else {
2936 		if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
2937 			return (EINVAL);
2938 
2939 		ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
2940 		    clp->l_mnum);
2941 	}
2942 
2943 	if (clp->l_devid != 0) {
2944 		/*
2945 		 * Get dev associated with device id and minor name.
2946 		 * Setup correct driver name if dev is now different.
2947 		 * Don't change driver name if during upgrade.
2948 		 */
2949 		ndev = ldev;
2950 		if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
2951 		    &ndev, clp->l_minor_name)) {
2952 			if ((ndev != ldev) && (!(MD_UPGRADE))) {
2953 				majordev = md_getmajor(ndev);
2954 				(void) strcpy(clp->l_driver,
2955 				    ddi_major_to_name(majordev));
2956 				clp->l_mnum = md_getminor(ndev);
2957 				clp->l_devid_flags |= MDDB_DEVID_VALID;
2958 				ldev = ndev;
2959 			}
2960 		} else {
2961 			/* Mark as invalid */
2962 			clp->l_devid_flags &= ~MDDB_DEVID_VALID;
2963 		}
2964 	}
2965 
2966 	clp->l_dev = md_cmpldev(ldev);
2967 	if (dev_2b_fixed)
2968 		*dev_2b_fixed = clp->l_dev;
2969 	r = *rip;
2970 
2971 	while (r) {
2972 		if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid,
2973 		    clp->l_minor_name, ldev, clp->l_blkno)) {
2974 			if ((clp->l_devid != 0) &&
2975 			    !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
2976 				r->ri_flags |= MDDB_F_EMASTER;
2977 			} else {
2978 				r->ri_flags |= flag;
2979 			}
2980 			return (0);	/* already entered return success */
2981 		}
2982 		r = r->ri_next;
2983 	}
2984 
2985 	/*
2986 	 * This replica not represented in the current rip list,
2987 	 * so add it to the list.
2988 	 */
2989 	r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP);
2990 	r->ri_dev = ldev;
2991 	r->ri_blkno = clp->l_blkno;
2992 	(void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM);
2993 	if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
2994 		r->ri_driver[(MD_MAXDRVNM -1)] = '\0';
2995 	}
2996 	if (clp->l_devname != NULL) {
2997 		(void) strcpy(r->ri_devname, clp->l_devname);
2998 	}
2999 	r->ri_flags |= flag;
3000 	if (clp->l_devid != 0) {
3001 		sz = clp->l_devid_sz;
3002 		r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
3003 		bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz);
3004 
3005 		if (clp->l_old_devid != NULL) {
3006 			sz = clp->l_old_devid_sz;
3007 			r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
3008 			    KM_SLEEP);
3009 			bcopy((char *)(uintptr_t)clp->l_old_devid,
3010 			    (char *)r->ri_old_devid, sz);
3011 		} else {
3012 			r->ri_old_devid = 0;
3013 		}
3014 		if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
3015 			(void) strcpy(r->ri_minor_name, clp->l_minor_name);
3016 
3017 		if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
3018 			/*
3019 			 * Devid is present, but not valid.  This could
3020 			 * happen if device has been powered off or if
3021 			 * the device has been removed.  Mark the device in
3022 			 * error.  Don't allow any writes to this device
3023 			 * based on the dev_t since another device could
3024 			 * have been placed in its spot and be responding to
3025 			 * the dev_t accesses.
3026 			 */
3027 			r->ri_flags |= MDDB_F_EMASTER;
3028 		}
3029 	} else {
3030 		r->ri_devid = 0;
3031 		r->ri_old_devid = 0;
3032 	}
3033 
3034 	/*
3035 	 * If the rip list is empty then this entry
3036 	 * is the list.
3037 	 */
3038 	if (*rip == NULL) {
3039 		*rip = r;
3040 		return (0);
3041 	}
3042 
3043 	/*
3044 	 * Add this entry to the end of the rip list
3045 	 */
3046 	r1 = *rip;
3047 	while (r1->ri_next)
3048 		r1 = r1->ri_next;
3049 	r1->ri_next = r;
3050 	return (0);
3051 }
3052 
3053 /*
3054  * writecopy writes the incore data blocks out to all of the replicas.
3055  * This is called from writestart
3056  *	- when a diskset is started or
3057  *	- when an error has been enountered during the write to a mddb.
3058  * and from newdev when a new mddb is being added.
3059  *
3060  * flag can be 2 values:
3061  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3062  *		always used for traditional and local disksets.
3063  *		For MN diskset:
3064  *			All nodes can call writecopy, but only the
3065  *			master node actually writes data to the disk
3066  *			except for optimized resync records.
3067  *			An optimized resync record can only be written to
3068  *			by the record owner.
3069  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3070  *		master has been chosen, the new master may need to
3071  * 		write its incore mddb to disk (this is the case where the
3072  *		old master had executed a message but hadn't relayed it
3073  *		to this slave yet).  New master should not write the
3074  *		change log records since new master would be overwriting
3075  *		valuable data.  Only used during a reconfig cycle.
3076  */
3077 static int
writecopy(mddb_set_t * s,int li,int flag)3078 writecopy(
3079 	mddb_set_t	*s,
3080 	int		li,
3081 	int		flag
3082 )
3083 {
3084 	mddb_db_t	*dbp;
3085 	mddb_db32_t	*db32p;
3086 	mddb_de_ic_t	*dep;
3087 	mddb_rb32_t	*rbp;
3088 	uint_t		checksum;
3089 	int		err = 0;
3090 
3091 #if defined(_ILP32) && !defined(lint)
3092 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
3093 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
3094 #endif
3095 
3096 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
3097 		db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
3098 		create_db32rec(db32p, dbp);
3099 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
3100 		err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
3101 		    MDDB_WR_ONLY_MASTER);
3102 		kmem_free((caddr_t)db32p, MDDB_BSIZE);
3103 		if (err)
3104 			return (err);
3105 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
3106 			/*
3107 			 * In a multinode diskset, when a new master is
3108 			 * chosen the new master may need to write its
3109 			 * incore copy of the mddb to disk.  In this case,
3110 			 * don't want to overwrite the change log records
3111 			 * so new master sets flag to MDDB_WRITECOPY_SYNC.
3112 			 */
3113 			if (flag == MDDB_WRITECOPY_SYNC) {
3114 				if (dep->de_flags & MDDB_F_CHANGELOG)
3115 					continue;
3116 			}
3117 			/*
3118 			 * In a multinode diskset, don't write out optimized
3119 			 * resync resyncs since only the mirror owner node
3120 			 * will have the correct data.  If writecopy is
3121 			 * being called from writestart as a result of
3122 			 * an mddb failure, then writestart will handle
3123 			 * the optimized records when it calls fixoptrecords.
3124 			 */
3125 			if ((MD_MNSET_SETNO(s->s_setno)) &&
3126 			    (dep->de_flags & MDDB_F_OPT)) {
3127 				continue;
3128 			}
3129 
3130 			rbp = dep->de_rb;
3131 			checksum = rbp->rb_checksum_fiddle;
3132 			checksum ^= rbp->rb_checksum;
3133 			/* Generate the crc for this record */
3134 			rec_crcgen(s, dep, rbp);
3135 			checksum ^= rbp->rb_checksum;
3136 			rbp->rb_checksum_fiddle = checksum;
3137 			if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
3138 			    dep->de_blkcount, li, (mddb_bf_t **)0,
3139 			    MDDB_WR_ONLY_MASTER))
3140 				return (err);
3141 		}
3142 	}
3143 	return (0);
3144 }
3145 
3146 static int
upd_med(mddb_set_t * s,char * tag)3147 upd_med(
3148 	mddb_set_t	*s,
3149 	char		*tag
3150 )
3151 {
3152 	med_data_t	meddb;
3153 	int		medok;
3154 	mddb_lb_t	*lbp = s->s_lbp;
3155 	set_t		setno = s->s_setno;
3156 	int		li;
3157 	int		alc;
3158 	int		lc;
3159 
3160 
3161 	/* If no mediator hosts, nothing to do */
3162 	if (s->s_med.n_cnt == 0)
3163 		return (0);
3164 
3165 	/*
3166 	 * If this is a MN set and we are not the master, then don't
3167 	 * update mediator hosts or mark mediator as golden since
3168 	 * only master node should do that.
3169 	 */
3170 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
3171 	    (md_set[setno].s_am_i_master == 0)) {
3172 		return (0);
3173 	}
3174 
3175 	bzero((char *)&meddb, sizeof (med_data_t));
3176 	meddb.med_dat_mag = MED_DATA_MAGIC;
3177 	meddb.med_dat_rev = MED_DATA_REV;
3178 	meddb.med_dat_fl = 0;
3179 	meddb.med_dat_sn = setno;
3180 	meddb.med_dat_cc = lbp->lb_commitcnt;
3181 	TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
3182 	crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3183 
3184 	/* count accessible mediators */
3185 	medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3186 
3187 	/* count accessible and existing replicas */
3188 	for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
3189 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3190 
3191 		if (lp->l_flags & MDDB_F_DELETED)
3192 			continue;
3193 
3194 		lc++;
3195 
3196 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
3197 		    (lp->l_flags & MDDB_F_EMASTER) ||
3198 		    (lp->l_flags & MDDB_F_EWRITE))
3199 			continue;
3200 
3201 		alc++;
3202 	}
3203 
3204 	/*
3205 	 * Mediator update quorum is >= 50%: check for less than
3206 	 * "mediator update" quorum.
3207 	 */
3208 	if ((medok * 2) < s->s_med.n_cnt) {
3209 		/* panic if <= 50% of all replicas are accessible */
3210 		if ((lc > 0) && ((alc * 2) <= lc)) {
3211 			cmn_err(CE_PANIC,
3212 			    "md: Update of 50%% of the mediator hosts failed");
3213 			/* NOTREACHED */
3214 		}
3215 
3216 		cmn_err(CE_WARN,
3217 		    "md: Update of 50%% of the mediator hosts failed");
3218 	}
3219 
3220 	/*
3221 	 * If we have mediator update quorum and exactly 50% of the replicas
3222 	 * are accessible then mark the mediator as golden.
3223 	 */
3224 	if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
3225 	    ((alc * 2) == lc)) {
3226 		meddb.med_dat_fl = MED_DFL_GOLDEN;
3227 		crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3228 		(void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3229 	}
3230 
3231 	return (0);
3232 }
3233 
3234 static int
push_lb(mddb_set_t * s)3235 push_lb(mddb_set_t *s)
3236 {
3237 	mddb_lb_t	*lbp = s->s_lbp;
3238 
3239 	/* push the change to all the replicas */
3240 	uniqtime32(&lbp->lb_timestamp);
3241 	if (MD_MNSET_SETNO(s->s_setno)) {
3242 		lbp->lb_revision = MDDB_REV_MNLB;
3243 	} else {
3244 		lbp->lb_revision = MDDB_REV_LB;
3245 	}
3246 	/*
3247 	 * The updates to the mediator hosts are done
3248 	 * by the callers of this function.
3249 	 */
3250 	return (writelocall(s));
3251 }
3252 
3253 /* Should not call for MN diskset since data tags are not supported */
3254 static int
dtl_cmp(const mddb_dtag_t * odtp,const mddb_dtag_t * ndtp)3255 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
3256 {
3257 	int 		diff = 0;
3258 
3259 	diff = (int)(odtp->dt_setno - ndtp->dt_setno);
3260 	if (diff)
3261 		return (diff);
3262 
3263 	diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
3264 	if (diff)
3265 		return (diff);
3266 
3267 	diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
3268 	if (diff)
3269 		return (diff);
3270 
3271 	/*CSTYLED*/
3272 	return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
3273 }
3274 
3275 /* Should not call for MN diskset since data tags are not supported */
3276 static int
dtl_addl(mddb_set_t * s,const mddb_dtag_t * ndtp)3277 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
3278 {
3279 	int		nextid = 0;
3280 	mddb_dtag_lst_t **dtlpp = &s->s_dtlp;
3281 
3282 	/* Run to the end of the list */
3283 	for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
3284 		if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
3285 			return (0);
3286 		nextid++;
3287 	}
3288 
3289 	/* Add the new member */
3290 	*dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);
3291 
3292 	/* Update the dtag portion of the list */
3293 	bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
3294 	    sizeof (mddb_dtag_t));
3295 
3296 	/* Fix up the id value */
3297 	(*dtlpp)->dtl_dt.dt_id = ++nextid;
3298 
3299 	return (0);
3300 }
3301 
3302 /*
3303  * Even though data tags are not supported in MN disksets, dt_cntl may
3304  * be called for a MN diskset since this routine is called even before
3305  * it is known the kind of diskset being read in from disk.
3306  * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
3307  */
3308 static int
dtl_cntl(mddb_set_t * s)3309 dtl_cntl(mddb_set_t *s)
3310 {
3311 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3312 	int		ndt = 0;
3313 
3314 	while (dtlp != NULL) {
3315 		ndt++;
3316 		dtlp = dtlp->dtl_nx;
3317 	}
3318 
3319 	return (ndt);
3320 }
3321 
3322 /*
3323  * Even though data tags are not supported in MN disksets, dt_cntl may
3324  * be called for a MN diskset since this routine is called even before
3325  * it is known the kind of diskset being read in from disk.
3326  * For a MNdiskset, s_dtlp is 0 so a 0 is returned.
3327  */
3328 static mddb_dtag_t *
dtl_findl(mddb_set_t * s,int id)3329 dtl_findl(mddb_set_t *s, int id)
3330 {
3331 	mddb_dtag_lst_t	*dtlp = s->s_dtlp;
3332 
3333 	while (dtlp != NULL) {
3334 		if (dtlp->dtl_dt.dt_id == id)
3335 			return (&dtlp->dtl_dt);
3336 		dtlp = dtlp->dtl_nx;
3337 	}
3338 	return ((mddb_dtag_t *)NULL);
3339 }
3340 
3341 /* Should not call for MN diskset since data tags are not supported */
3342 static void
dtl_freel(mddb_dtag_lst_t ** dtlpp)3343 dtl_freel(mddb_dtag_lst_t **dtlpp)
3344 {
3345 	mddb_dtag_lst_t	*dtlp;
3346 	mddb_dtag_lst_t	*tdtlp;
3347 
3348 
3349 	for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) {
3350 		dtlp = tdtlp->dtl_nx;
3351 		kmem_free(tdtlp, sizeof (mddb_dtag_lst_t));
3352 	}
3353 	*dtlpp = (mddb_dtag_lst_t *)NULL;
3354 }
3355 
3356 /*
3357  * Even though data tags are not supported in MN disksets, dt_setup will
3358  * be called for a MN diskset since this routine is called even before
3359  * it is known the kind of diskset being read in from disk.
3360  * Once this set is known as a MN diskset, the dtp area will be freed.
3361  */
3362 static void
dt_setup(mddb_set_t * s,const mddb_dtag_t * dtagp)3363 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
3364 {
3365 	mddb_dt_t	*dtp;
3366 	set_t		setno = s->s_setno;
3367 
3368 
3369 	if (md_set[setno].s_dtp == (mddb_dt_t *)NULL)
3370 		md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3371 	else if (dtagp == (mddb_dtag_t *)NULL)
3372 		bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
3373 
3374 	/* shorthand */
3375 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3376 
3377 	dtp->dt_mag = MDDB_MAGIC_DT;
3378 	dtp->dt_rev = MDDB_REV_DT;
3379 
3380 	if (dtagp != NULL)
3381 		dtp->dt_dtag = *dtagp;		/* structure assignment */
3382 
3383 	/* Initialize the setno */
3384 	dtp->dt_dtag.dt_setno = setno;
3385 
3386 	/* Clear the id and flags, this is only used in user land */
3387 	dtp->dt_dtag.dt_id = 0;
3388 
3389 	/* Checksum it */
3390 	crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
3391 }
3392 
3393 /* Should not call for MN diskset since data tags are not supported */
3394 static int
set_dtag(mddb_set_t * s,md_error_t * ep)3395 set_dtag(mddb_set_t *s, md_error_t *ep)
3396 {
3397 	mddb_lb_t	*lbp = s->s_lbp;
3398 	mddb_dtag_t	tag;
3399 
3400 	if (lbp->lb_dtblkcnt == 0) {
3401 		/* Data tags not used in a MN set - so no failure returned */
3402 		if (lbp->lb_flags & MDDB_MNSET)
3403 			return (0);
3404 
3405 		cmn_err(CE_WARN,
3406 		    "No tag record allocated, unable to tag data");
3407 		(void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
3408 		return (1);
3409 	}
3410 
3411 	/* Clear the stack variable */
3412 	bzero((caddr_t)&tag, sizeof (mddb_dtag_t));
3413 
3414 	/* Get the HW serial number for this host */
3415 	(void) snprintf(tag.dt_sn, MDDB_SN_LEN, "%u", zone_get_hostid(NULL));
3416 	tag.dt_sn[MDDB_SN_LEN - 1] = '\0';
3417 
3418 	/* Get the nodename that this host goes by */
3419 	(void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
3420 	tag.dt_hn[MD_MAX_NODENAME] = '\0';
3421 
3422 	/* Get a time stamp for NOW */
3423 	uniqtime32(&tag.dt_tv);
3424 
3425 	/* Setup the data tag record */
3426 	dt_setup(s, &tag);
3427 
3428 	/* Free any list of tags if they exist */
3429 	dtl_freel(&s->s_dtlp);
3430 
3431 	/* Put the new tag onto the tag list */
3432 	(void) dtl_addl(s, &tag);
3433 
3434 	return (0);
3435 }
3436 
3437 /*
3438  * If called during upgrade, this routine expects a non-translated
3439  * (aka target) dev.
3440  * Should not call for MN diskset since data tags are not supported.
3441  */
3442 static int
dt_read(mddb_set_t * s,mddb_lb_t * lbp,mddb_ri_t * rip)3443 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
3444 {
3445 	int		err = 0;
3446 	md_dev64_t	dev;
3447 	caddr_t		tbuf;
3448 	daddr_t		physblk;
3449 	mddb_block_t	blk;
3450 	mddb_dt_t	*dtp;
3451 	mddb_dtag_t	*dtagp;
3452 	set_t		setno = s->s_setno;
3453 
3454 	/* If have not allocated a data tag record, there is nothing to do */
3455 	if (lbp->lb_dtblkcnt == 0)
3456 		return (1);
3457 
3458 	dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3459 
3460 	if (dtp == (mddb_dt_t *)NULL)
3461 		return (1);
3462 
3463 	/* shorthand */
3464 	dev = md_xlate_targ_2_mini(rip->ri_dev);
3465 	if (dev == NODEV64) {
3466 		return (1);
3467 	}
3468 
3469 	tbuf = (caddr_t)rip->ri_dtp;
3470 
3471 	for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
3472 		physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
3473 		err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
3474 		/* error reading the tag */
3475 		if (err) {
3476 			err = 1;
3477 			goto out;
3478 		}
3479 		tbuf += MDDB_BSIZE;
3480 	}
3481 
3482 	/* magic is valid? */
3483 	if (dtp->dt_mag != MDDB_MAGIC_DT) {
3484 		err = 1;
3485 		goto out;
3486 	}
3487 
3488 	/* revision is valid? */
3489 	if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
3490 		err = 1;
3491 		goto out;
3492 	}
3493 
3494 	/* crc is valid? */
3495 	if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
3496 		err = 1;
3497 		goto out;
3498 	}
3499 
3500 	/* shorthand */
3501 	dtagp = &dtp->dt_dtag;
3502 
3503 	/* set number match? */
3504 	if (dtagp->dt_setno != setno) {
3505 		err = 1;
3506 		goto out;
3507 	}
3508 
3509 	/* tag is not empty? */
3510 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3511 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3512 	    dtagp->dt_id == 0) {
3513 		err = 2;
3514 		goto out;
3515 	}
3516 
3517 	/* Mark the locator as having tagged data */
3518 	rip->ri_flags |= MDDB_F_TAGDATA;
3519 
3520 out:
3521 	if (err) {
3522 		if (err == 1) {
3523 			md_set_setstatus(setno, MD_SET_BADTAG);
3524 			rip->ri_flags |= MDDB_F_BADTAG;
3525 		}
3526 		if (dtp != NULL) {
3527 			kmem_free(dtp, MDDB_DT_BYTES);
3528 			rip->ri_dtp = (mddb_dt_t *)NULL;
3529 		}
3530 	}
3531 
3532 	return (err);
3533 }
3534 
3535 /* Should not call for MN diskset since data tags are not supported */
3536 static int
dt_write(mddb_set_t * s)3537 dt_write(mddb_set_t *s)
3538 {
3539 	int		li;
3540 	int		err = 0;
3541 	int		werr;
3542 	int		empty_tag = 0;
3543 	mddb_dtag_t	*dtagp;
3544 	mddb_dt_t	*dtp;
3545 	mddb_lb_t	*lbp = s->s_lbp;
3546 	set_t		setno = s->s_setno;
3547 	uint_t		set_status = md_get_setstatus(setno);
3548 
3549 
3550 	ASSERT(md_set[setno].s_dtp != NULL);
3551 
3552 	/* Nowhere to write to */
3553 	if (lbp->lb_dtblkcnt == 0)
3554 		return (err);
3555 
3556 	if (set_status & MD_SET_BADTAG)
3557 		return (err);
3558 
3559 	/* shorthand */
3560 	dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3561 	dtagp = &dtp->dt_dtag;
3562 
3563 	/* See if the tag is empty. */
3564 	if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3565 	    (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3566 	    dtagp->dt_id == 0)
3567 		empty_tag = 1;
3568 
3569 	/* Write the tag to the locators and reset appropriate flags. */
3570 	for (li = 0; li < lbp->lb_loccnt; li++) {
3571 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3572 
3573 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3574 		    (lp->l_flags & MDDB_F_DELETED) ||
3575 		    (lp->l_flags & MDDB_F_EWRITE))
3576 			continue;
3577 
3578 		werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
3579 		    MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
3580 
3581 		if (werr) {
3582 			err |= werr;
3583 			continue;
3584 		}
3585 
3586 		if (empty_tag)
3587 			lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
3588 		else {
3589 			lp->l_flags |= MDDB_F_TAGDATA;
3590 			lp->l_flags &= ~MDDB_F_BADTAG;
3591 		}
3592 	}
3593 
3594 	if (err)
3595 		return (err);
3596 
3597 
3598 	/* If the tags were written, check to see if any tags remain. */
3599 	for (li = 0; li < lbp->lb_loccnt; li++) {
3600 		mddb_locator_t	*lp = &lbp->lb_locators[li];
3601 
3602 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3603 		    (lp->l_flags & MDDB_F_DELETED) ||
3604 		    (lp->l_flags & MDDB_F_EWRITE))
3605 			continue;
3606 
3607 		if (lp->l_flags & MDDB_F_TAGDATA)
3608 			break;
3609 	}
3610 
3611 	/* If there are no tags, then clear CLRTAG and TAGDATA */
3612 	if (li == lbp->lb_loccnt) {
3613 		md_clr_setstatus(setno, MD_SET_CLRTAG);
3614 		md_clr_setstatus(setno, MD_SET_TAGDATA);
3615 	}
3616 
3617 	return (err);
3618 }
3619 
3620 /* Should not call for MN diskset since data tags are not supported */
3621 static int
dt_alloc_if_needed(mddb_set_t * s)3622 dt_alloc_if_needed(mddb_set_t *s)
3623 {
3624 	int		i;
3625 	int		li;
3626 	int		moveit = 0;
3627 	mddb_lb_t	*lbp = s->s_lbp;
3628 	mddb_block_t	blkcnt = lbp->lb_dtblkcnt;
3629 	set_t		setno = s->s_setno;
3630 	uint_t		set_status = md_get_setstatus(setno);
3631 
3632 	/*
3633 	 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
3634 	 * not detected, there is nothing to do.
3635 	 */
3636 	if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
3637 		return (0);
3638 
3639 	/* Bitmap not setup, checks can't be done */
3640 	if (s->s_totalblkcnt == 0)
3641 		return (0);
3642 
3643 	/* While reading the tag(s) an invalid tag data record was seen */
3644 	if (set_status & MD_SET_BADTAG)
3645 		/* See if the invalid tag needs to be moved */
3646 		for (i = 0; i < MDDB_DT_BLOCKS; i++)
3647 			if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
3648 				moveit = 1;
3649 				break;
3650 			}
3651 
3652 	/* Need to move or allocate the tag data record */
3653 	if (moveit || blkcnt == 0) {
3654 		lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
3655 		if (lbp->lb_dtfirstblk == 0) {
3656 			cmn_err(CE_WARN,
3657 			    "Unable to allocate data tag record");
3658 			return (0);
3659 		}
3660 		lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;
3661 
3662 		/* Mark the locators so that they get written to disk. */
3663 		for (li = 0; li < lbp->lb_loccnt; li++) {
3664 			mddb_locator_t	*lp = &lbp->lb_locators[li];
3665 
3666 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3667 			    (lp->l_flags & MDDB_F_DELETED) ||
3668 			    (lp->l_flags & MDDB_F_EWRITE))
3669 				continue;
3670 
3671 			lp->l_flags |= MDDB_F_BADTAG;
3672 		}
3673 		return (1);
3674 	}
3675 
3676 	/*
3677 	 * Make sure the blocks are owned, since the calculation in
3678 	 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
3679 	 */
3680 	for (i = 0; i < MDDB_DT_BLOCKS; i++)
3681 		blkbusy(s, (i + lbp->lb_dtfirstblk));
3682 
3683 	return (1);
3684 }
3685 
3686 /*
3687  * Writestart writes the incore mddb out to all of the replicas.
3688  * This is called when a diskset is started and when an error has
3689  * been enountered during the write to a mddb.
3690  *
3691  * flag can be 2 values:
3692  *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
3693  *		always used for traditional and local disksets.
3694  *		This is the normal path for MN disksets since the slave
3695  *		nodes aren't actually allowed to write to disk.
3696  *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
3697  *		master has been chosen, the new master may need to
3698  * 		write its incore mddb to disk (this is the case where the
3699  *		old master had executed a message but hadn't relayed it
3700  *		to this slave yet).  New master should not write the
3701  *		change log records since new master would be overwriting
3702  *		valuable data.  Only used during a reconfig cycle.
3703  */
3704 static int
writestart(mddb_set_t * s,int flag)3705 writestart(
3706 	mddb_set_t	*s,
3707 	int		flag
3708 )
3709 {
3710 	int		li;
3711 	mddb_locator_t	*lp;
3712 	mddb_lb_t	*lbp;
3713 	mddb_ln_t	*lnp;
3714 	int		err = 0;
3715 	uint_t		set_status;
3716 
3717 	lbp = s->s_lbp;
3718 
3719 	for (li = 0; li < lbp->lb_loccnt; li++) {
3720 		lp = &lbp->lb_locators[li];
3721 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3722 			continue;
3723 		if (! (lp->l_flags & MDDB_F_SUSPECT))
3724 			continue;
3725 		if (writecopy(s, li, flag))
3726 			return (1);
3727 		lp->l_flags |= MDDB_F_UP2DATE;
3728 	}
3729 
3730 	for (li = 0; li < lbp->lb_loccnt; li++) {
3731 		lp = &lbp->lb_locators[li];
3732 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3733 			continue;
3734 		if ((lp->l_flags & MDDB_F_UP2DATE))
3735 			continue;
3736 		if (checkcopy(s, li))
3737 			if (err = writecopy(s, li, flag))
3738 				return (1);
3739 		lp->l_flags |= MDDB_F_UP2DATE;
3740 	}
3741 
3742 	/*
3743 	 * Call fixoptrecord even during a reconfig cycle since a replica
3744 	 * failure may force the master to re-assign the optimized
3745 	 * resync record to another replica.
3746 	 */
3747 	if (fixoptrecords(s))
3748 		return (1);
3749 
3750 	set_status = md_get_setstatus(s->s_setno);
3751 
3752 	/* See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT) */
3753 	for (li = 0; li < lbp->lb_loccnt; li++) {
3754 		lp = &lbp->lb_locators[li];
3755 
3756 		if (lp->l_flags & MDDB_F_DELETED)
3757 			continue;
3758 
3759 		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
3760 		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
3761 		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
3762 		    (lp->l_flags & MDDB_F_OLDACT) != 0))
3763 			break;
3764 
3765 		if ((set_status & MD_SET_TAGDATA) ||
3766 		    (set_status & MD_SET_CLRTAG))
3767 			if ((lp->l_flags & MDDB_F_TAGDATA) ||
3768 			    (lp->l_flags & MDDB_F_BADTAG))
3769 				break;
3770 	}
3771 
3772 	/*
3773 	 * If we found (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT)
3774 	 * the lbp identifier and the set identifier doesn't match.
3775 	 */
3776 	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {
3777 
3778 		/* Only call for traditional and local sets */
3779 		if (!(lbp->lb_flags & MDDB_MNSET))
3780 			(void) dt_write(s);
3781 
3782 		setidentifier(s, &lbp->lb_ident);
3783 
3784 		if (err = push_lb(s)) {
3785 			(void) upd_med(s, "writestart(0)");
3786 			return (err);
3787 		}
3788 
3789 		(void) upd_med(s, "writestart(0)");
3790 
3791 		if (err = push_lb(s)) {
3792 			(void) upd_med(s, "writestart(1)");
3793 			return (err);
3794 		}
3795 
3796 		(void) upd_med(s, "writestart(1)");
3797 
3798 		lnp = s->s_lnp;
3799 		uniqtime32(&lnp->ln_timestamp);
3800 		if (lbp->lb_flags & MDDB_MNSET)
3801 			lnp->ln_revision = MDDB_REV_MNLN;
3802 		else
3803 			lnp->ln_revision = MDDB_REV_LN;
3804 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
3805 		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
3806 		    lbp->lb_lnblkcnt, 0);
3807 		/*
3808 		 * If a MN diskset and this is the master, set the PARSE_LOCNM
3809 		 * flag in the mddb_set structure to show that the locator
3810 		 * names have changed.
3811 		 * Don't set parseflags as a result of a new master sync
3812 		 * during reconfig cycle since slaves nodes are already
3813 		 * in-sync with the new master.
3814 		 */
3815 
3816 		if ((lbp->lb_flags & MDDB_MNSET) &&
3817 		    (md_set[s->s_setno].s_am_i_master) &&
3818 		    (flag != MDDB_WRITECOPY_SYNC)) {
3819 			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
3820 		}
3821 
3822 		if (err)
3823 			return (err);
3824 	}
3825 
3826 	for (li = 0; li < lbp->lb_loccnt; li++) {
3827 		lp = &lbp->lb_locators[li];
3828 		if (lp->l_flags & MDDB_F_DELETED)
3829 			continue;
3830 		if (lp->l_flags & MDDB_F_ACTIVE) {
3831 			lp->l_flags |= MDDB_F_OLDACT;
3832 		} else {
3833 			lp->l_flags &= ~MDDB_F_OLDACT;
3834 		}
3835 	}
3836 
3837 	md_clr_setstatus(s->s_setno, MD_SET_STALE);
3838 
3839 	return (0);
3840 }
3841 
3842 /*
3843  * selectreplicas selects the working replicas and may write the incore
3844  * version of the mddb out to the replicas ondisk.
3845  *
3846  * flag can be 3 values:
3847  *	MDDB_RETRYSCAN - quick scan to see if there is an error.
3848  *			If no new error, returns without writing mddb
3849  *			to disks.  If a new error is seen, writes out
3850  *			mddb to disks.
3851  *	MDDB_SCANALL  - lengthy scan to check out mddbs and always writes
3852  *			out mddb to the replica ondisk.  Calls writecopy
3853  *			with MDDB_WRITECOPY_ALL flag which writes out
3854  *			all records to the replicas ondisk.
3855  *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
3856  *			and ondisk mddbs by writing incore values to disk.
3857  *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
3858  *			that change log records are not written out.
3859  *			Only used by MN disksets.
3860  *
3861  * Returns:
3862  *	0 - Successful
3863  *	1 - Unable to write incore mddb data to disk since < 50% replicas.
3864  */
3865 int
selectreplicas(mddb_set_t * s,int flag)3866 selectreplicas(
3867 	mddb_set_t	*s,
3868 	int		flag
3869 )
3870 {
3871 	int		li;
3872 	int		alc;
3873 	int		lc;
3874 	mddb_locator_t	*lp;
3875 	mddb_lb_t	*lbp = s->s_lbp;
3876 	set_t		setno = s->s_setno;
3877 	int		wc_flag;
3878 
3879 	/*
3880 	 * can never transition from stale to not stale
3881 	 */
3882 	if (md_get_setstatus(setno) & MD_SET_STALE) {
3883 		for (li = 0; li < lbp->lb_loccnt; li++) {
3884 			lp = &lbp->lb_locators[li];
3885 			if (lp->l_flags & MDDB_F_DELETED)
3886 				continue;
3887 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3888 				lp->l_flags |= MDDB_F_ACTIVE;
3889 			} else {
3890 				lp->l_flags &= ~MDDB_F_ACTIVE;
3891 			}
3892 		}
3893 		return (1);
3894 	}
3895 
3896 	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
3897 		for (li = 0; li < lbp->lb_loccnt; li++) {
3898 			lp = &lbp->lb_locators[li];
3899 			if (lp->l_flags & MDDB_F_DELETED)
3900 				continue;
3901 			if (lp->l_flags & MDDB_F_ACTIVE) {
3902 				lp->l_flags |= MDDB_F_OLDACT;
3903 				lp->l_flags &= ~MDDB_F_SUSPECT;
3904 			} else {
3905 				lp->l_flags |= MDDB_F_SUSPECT;
3906 				lp->l_flags &= ~MDDB_F_OLDACT;
3907 			}
3908 
3909 			if (! (lp->l_flags & MDDB_F_EMASTER)) {
3910 				lp->l_flags |= MDDB_F_ACTIVE;
3911 				lp->l_flags &= ~MDDB_F_EWRITE;
3912 				lp->l_flags &= ~MDDB_F_TOOSMALL;
3913 			} else {
3914 				lp->l_flags &= ~MDDB_F_ACTIVE;
3915 			}
3916 		}
3917 		computefreeblks(s); /* set up free block bits */
3918 	} else {
3919 		for (li = 0; li < lbp->lb_loccnt; li++) {
3920 			lp = &lbp->lb_locators[li];
3921 			if (! (lp->l_flags & MDDB_F_ACTIVE))
3922 				continue;
3923 			if (lp->l_flags & MDDB_F_EWRITE)
3924 				break;
3925 		}
3926 
3927 		/*
3928 		 * if there are no errors this is error has already
3929 		 * been processed return current state
3930 		 */
3931 		if (li == lbp->lb_loccnt)
3932 			return (md_get_setstatus(setno) & MD_SET_TOOFEW);
3933 
3934 		lp->l_flags &= ~MDDB_F_ACTIVE;
3935 		do {
3936 			lp = &lbp->lb_locators[li];
3937 			lp->l_flags &= ~MDDB_F_UP2DATE;
3938 		} while (++li < lbp->lb_loccnt);
3939 	}
3940 
3941 	alc = 0;
3942 	lc = 0;
3943 	for (li = 0; li < lbp->lb_loccnt; li++) {
3944 		lp = &lbp->lb_locators[li];
3945 		if (lp->l_flags & MDDB_F_DELETED)
3946 			continue;
3947 		lc++;
3948 		if (! (lp->l_flags & MDDB_F_ACTIVE))
3949 			continue;
3950 		alc++;
3951 	}
3952 
3953 	if (alc < ((lc + 1) / 2)) {
3954 		md_set_setstatus(setno, MD_SET_TOOFEW);
3955 		return (1);
3956 	}
3957 
3958 	/* Set wc_flag based on flag passed in. */
3959 	if (flag == MDDB_SCANALLSYNC)
3960 		wc_flag = MDDB_WRITECOPY_SYNC;
3961 	else
3962 		wc_flag = MDDB_WRITECOPY_ALL;
3963 
3964 	do {
3965 		if (! writestart(s, wc_flag)) {
3966 			md_clr_setstatus(setno, MD_SET_TOOFEW);
3967 			return (0);
3968 		}
3969 		alc  = 0;
3970 		for (li = 0; li < lbp->lb_loccnt; li++) {
3971 			lp = &lbp->lb_locators[li];
3972 			if ((lp->l_flags & MDDB_F_DELETED) ||
3973 			    (lp->l_flags & MDDB_F_EMASTER))
3974 				continue;
3975 
3976 			if (lp->l_flags & MDDB_F_EWRITE) {
3977 				lp->l_flags &= ~MDDB_F_ACTIVE;
3978 				lp->l_flags &= ~MDDB_F_UP2DATE;
3979 				continue;
3980 			}
3981 			alc++;
3982 		}
3983 	} while (alc >= ((lc + 1) / 2));
3984 	md_set_setstatus(setno, MD_SET_TOOFEW);
3985 	return (1);
3986 }
3987 
3988 static int
checkstate(mddb_set_t * s,int probe)3989 checkstate(
3990 	mddb_set_t	*s,
3991 	int		probe
3992 )
3993 {
3994 	int		error;
3995 	uint_t		set_status = md_get_setstatus(s->s_setno);
3996 
3997 	ASSERT(s != NULL);
3998 
3999 	if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
4000 		return (0);
4001 
4002 	if (probe == MDDB_NOPROBE)
4003 		return (1);
4004 
4005 	single_thread_start(s);
4006 	error = selectreplicas(s, MDDB_SCANALL);
4007 	single_thread_end(s);
4008 
4009 	if (error == 0 && s->s_zombie != 0) {
4010 		mutex_exit(SETMUTEX(s->s_setno));
4011 		error = mddb_deleterec(s->s_zombie);
4012 		mutex_enter(SETMUTEX(s->s_setno));
4013 		if (error == 0)
4014 			s->s_zombie = 0;
4015 	}
4016 	return (error);
4017 }
4018 
4019 static int
writeretry(mddb_set_t * s)4020 writeretry(
4021 	mddb_set_t	*s
4022 )
4023 {
4024 	if (selectreplicas(s, MDDB_RETRYSCAN))
4025 		if (selectreplicas(s, MDDB_SCANALL))
4026 			return (1);
4027 	return (0);
4028 }
4029 
4030 static void
free_mbipp(mddb_mb_ic_t ** mbipp)4031 free_mbipp(mddb_mb_ic_t **mbipp)
4032 {
4033 	mddb_mb_ic_t	*mbip1, *mbip2;
4034 
4035 	for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) {
4036 		mbip2 = mbip1->mbi_next;
4037 		kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE);
4038 	}
4039 	*mbipp = (mddb_mb_ic_t *)NULL;
4040 }
4041 
4042 static mddb_ri_t *
save_rip(mddb_set_t * s)4043 save_rip(mddb_set_t *s)
4044 {
4045 	mddb_ri_t	*trip = s->s_rip;
4046 	mddb_ri_t	*nrip = NULL;
4047 	mddb_ri_t	**nripp = &nrip;
4048 	mddb_ri_t	*rip;
4049 
4050 	while (trip) {
4051 		/* Run to the end of the list */
4052 		for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next)
4053 			/* void */;
4054 
4055 		/* Add the new member */
4056 		*nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP);
4057 
4058 		ASSERT(*nripp != NULL);
4059 
4060 		/* shorthand */
4061 		rip = *nripp;
4062 
4063 		*rip = *trip;			/* structure assignment */
4064 
4065 		/* Clear the stuff that is not needed for hints */
4066 		rip->ri_flags = 0;
4067 		rip->ri_commitcnt = 0;
4068 		rip->ri_transplant = 0;
4069 		rip->ri_mbip = (mddb_mb_ic_t *)NULL;
4070 		rip->ri_dtp = (mddb_dt_t *)NULL;
4071 		rip->ri_lbp = (mddb_lb_t *)NULL;
4072 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4073 		rip->ri_devid = (ddi_devid_t)NULL;
4074 		rip->ri_old_devid = (ddi_devid_t)NULL;
4075 		rip->ri_next = (mddb_ri_t *)NULL;
4076 
4077 		trip = trip->ri_next;
4078 	}
4079 	return (nrip);
4080 }
4081 
4082 static void
free_rip(mddb_ri_t ** ripp)4083 free_rip(mddb_ri_t **ripp)
4084 {
4085 	mddb_ri_t	*rip;
4086 	mddb_ri_t	*arip;
4087 
4088 	for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) {
4089 		arip = rip->ri_next;
4090 		if (rip->ri_devid != (ddi_devid_t)NULL) {
4091 			ddi_devid_free(rip->ri_devid);
4092 			rip->ri_devid = (ddi_devid_t)NULL;
4093 		}
4094 		if (rip->ri_old_devid != (ddi_devid_t)NULL) {
4095 			ddi_devid_free(rip->ri_old_devid);
4096 			rip->ri_old_devid = (ddi_devid_t)NULL;
4097 		}
4098 		kmem_free((caddr_t)rip, sizeof (*rip));
4099 	}
4100 	*ripp = (mddb_ri_t *)NULL;
4101 }
4102 
4103 /*
4104  * This routine selects the correct replica to use.
4105  * The rules are as follows:
4106  *	1.	if all replicas have same init time select highest commit count
4107  *	2.	if some but not all replicas are from another hostid discard
4108  *		them.
4109  *	3.	find which init time is present in most replicas
4110  *	4.	discard all replicas which do not match the most common init time
4111  *	5.	select replica with highest commit count
4112  */
4113 
4114 static mddb_lb_t *
selectlocator(mddb_set_t * s)4115 selectlocator(
4116 	mddb_set_t	*s
4117 )
4118 {
4119 	mddb_ri_t	*rip = s->s_rip;
4120 	mddb_ri_t	*r, *r1;
4121 	mddb_lb_t	*lbp;
4122 	struct timeval32 *tp = (struct timeval32 *)NULL;
4123 	int		different;
4124 	int		same;
4125 	int		count;
4126 	int		maxcount;
4127 	set_t		setno = s->s_setno;
4128 	size_t		sz;
4129 	int		mn_set = 0;
4130 
4131 	/* Clear the ri_transplant flag on all the rip entries. */
4132 	/* Set ri_commitcnt to locator's commitcnt - if available */
4133 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4134 		r->ri_transplant = 0;
4135 		if (r->ri_lbp != (mddb_lb_t *)NULL) {
4136 			r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
4137 			/* If any locators have MN bit set, set flag */
4138 			if (r->ri_lbp->lb_flags & MDDB_MNSET)
4139 				mn_set = 1;
4140 		}
4141 	}
4142 
4143 	/*
4144 	 * A data tag is being used, so use it to limit the selection first.
4145 	 * Data tags not used in MN diskset.
4146 	 */
4147 	if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
4148 		mddb_dt_t	*dtp = (mddb_dt_t *)md_set[setno].s_dtp;
4149 
4150 		/*
4151 		 * now toss any locators that have a different data tag
4152 		 */
4153 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4154 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4155 				continue;
4156 
4157 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4158 				/* If same tag, keep it */
4159 				if (dtl_cmp(&dtp->dt_dtag,
4160 				    &r->ri_dtp->dt_dtag) == 0)
4161 					continue;
4162 			}
4163 
4164 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4165 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4166 				r->ri_dtp = (mddb_dt_t *)NULL;
4167 			}
4168 
4169 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4170 			if (!(md_get_setstatus(setno) &
4171 			    MD_SET_REPLICATED_IMPORT)) {
4172 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4173 					sz = ddi_devid_sizeof(r->ri_old_devid);
4174 					kmem_free((caddr_t)r->ri_old_devid, sz);
4175 					r->ri_old_devid = (ddi_devid_t)NULL;
4176 				}
4177 			}
4178 
4179 			kmem_free((caddr_t)r->ri_lbp,
4180 			    dbtob(r->ri_lbp->lb_blkcnt));
4181 			r->ri_lbp = (mddb_lb_t *)NULL;
4182 
4183 			r->ri_transplant = 1;
4184 		}
4185 
4186 		/* Tag used, clear the bit */
4187 		md_clr_setstatus(s->s_setno, MD_SET_USETAG);
4188 
4189 		if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
4190 			/*
4191 			 * Get rid of the list of tags.
4192 			 */
4193 			dtl_freel(&s->s_dtlp);
4194 
4195 			/*
4196 			 * Re-create the list with the tag used.
4197 			 */
4198 			(void) dtl_addl(s, &dtp->dt_dtag);
4199 		}
4200 	}
4201 
4202 	/*
4203 	 * scan to see if all replicas have same time
4204 	 */
4205 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4206 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4207 			continue;
4208 		if (tp == NULL) {
4209 			tp = &r->ri_lbp->lb_inittime;
4210 			continue;
4211 		}
4212 		/* CSTYLED */
4213 		if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
4214 			break;
4215 	}
4216 
4217 	/*
4218 	 * if r == NULL then they were all them same. Choose highest
4219 	 * commit count
4220 	 */
4221 	if (r == (mddb_ri_t *)NULL)
4222 		goto out;
4223 
4224 	/*
4225 	 * If here, a bogus replica is present and at least 1 lb_inittime
4226 	 * did not match.
4227 	 */
4228 
4229 	/*
4230 	 * look and see if any but not all are from different id
4231 	 */
4232 
4233 	different = 0;
4234 	same = 0;
4235 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4236 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4237 			continue;
4238 		if (cmpidentifier(s, &r->ri_lbp->lb_ident))
4239 			different = 1;
4240 		else
4241 			same = 1;
4242 	}
4243 
4244 	/*
4245 	 * now go through and throw out different if there are some
4246 	 * that are the same
4247 	 */
4248 	if (different != 0 && same != 0) {
4249 		for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4250 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4251 				continue;
4252 
4253 			if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
4254 				continue;
4255 
4256 			if (r->ri_dtp != (mddb_dt_t *)NULL) {
4257 				kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4258 				r->ri_dtp = (mddb_dt_t *)NULL;
4259 			}
4260 
4261 			mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4262 			if (!(md_get_setstatus(setno) &
4263 			    MD_SET_REPLICATED_IMPORT)) {
4264 				if (r->ri_old_devid != (ddi_devid_t)NULL) {
4265 					sz = ddi_devid_sizeof(r->ri_old_devid);
4266 					kmem_free((caddr_t)r->ri_old_devid, sz);
4267 					r->ri_old_devid = (ddi_devid_t)NULL;
4268 				}
4269 			}
4270 
4271 			kmem_free((caddr_t)r->ri_lbp,
4272 			    dbtob(r->ri_lbp->lb_blkcnt));
4273 			r->ri_lbp = (mddb_lb_t *)NULL;
4274 
4275 			r->ri_transplant = 1;
4276 		}
4277 	}
4278 
4279 	/*
4280 	 * go through and pick highest. Use n square because it is
4281 	 * simple and 40 some is max possible
4282 	 */
4283 	maxcount = 0;
4284 	lbp = (mddb_lb_t *)NULL;
4285 	for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
4286 		if (r1->ri_lbp == (mddb_lb_t *)NULL)
4287 			continue;
4288 		count = 0;
4289 		for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4290 			if (r->ri_lbp == (mddb_lb_t *)NULL)
4291 				continue;
4292 			if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
4293 			    &r->ri_lbp->lb_inittime, ==))
4294 				count++;
4295 		}
4296 		if (count > maxcount) {
4297 			maxcount = count;
4298 			lbp = r1->ri_lbp;
4299 		}
4300 	}
4301 
4302 	/*
4303 	 * now go through and toss any that are of a different time stamp
4304 	 */
4305 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4306 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4307 			continue;
4308 		if (timercmp(&lbp->lb_inittime, /* CSTYLED */
4309 		    &r->ri_lbp->lb_inittime, ==))
4310 			continue;
4311 
4312 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4313 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4314 			r->ri_dtp = (mddb_dt_t *)NULL;
4315 		}
4316 
4317 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4318 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4319 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4320 				sz = ddi_devid_sizeof(r->ri_old_devid);
4321 				kmem_free((caddr_t)r->ri_old_devid, sz);
4322 				r->ri_old_devid = (ddi_devid_t)NULL;
4323 			}
4324 		}
4325 
4326 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4327 		r->ri_lbp = (mddb_lb_t *)NULL;
4328 
4329 		r->ri_transplant = 1;
4330 	}
4331 
4332 out:
4333 	/*
4334 	 * Find the locator with the highest commit count, and make it the
4335 	 * "chosen" one.
4336 	 */
4337 	lbp = (mddb_lb_t *)NULL;
4338 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4339 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4340 			continue;
4341 
4342 		if (lbp == NULL) {
4343 			lbp = r->ri_lbp;
4344 			continue;
4345 		}
4346 
4347 		if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
4348 			lbp = r->ri_lbp;
4349 	}
4350 
4351 	/* Toss all locator blocks, except the "chosen" one. */
4352 	for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4353 		if (r->ri_lbp == (mddb_lb_t *)NULL)
4354 			continue;
4355 
4356 		/* Get rid of all dtp's */
4357 		if (r->ri_dtp != (mddb_dt_t *)NULL) {
4358 			kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4359 			r->ri_dtp = (mddb_dt_t *)NULL;
4360 		}
4361 
4362 		if (r->ri_lbp == lbp)
4363 			continue;
4364 
4365 		/* Get rid of extra locator devid block info */
4366 		mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4367 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4368 			if (r->ri_old_devid != (ddi_devid_t)NULL) {
4369 				sz = ddi_devid_sizeof(r->ri_old_devid);
4370 				kmem_free((caddr_t)r->ri_old_devid, sz);
4371 				r->ri_old_devid = (ddi_devid_t)NULL;
4372 			}
4373 		}
4374 
4375 		/* Get rid of extra locators */
4376 		kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4377 		r->ri_lbp = (mddb_lb_t *)NULL;
4378 	}
4379 	return (lbp);
4380 }
4381 
4382 static void
locator2cfgloc(mddb_lb_t * lbp,mddb_cfg_loc_t * clp,int li,side_t sideno,mddb_did_ic_t * did_icp)4383 locator2cfgloc(
4384 	mddb_lb_t		*lbp,
4385 	mddb_cfg_loc_t		*clp,
4386 	int			li,
4387 	side_t			sideno,
4388 	mddb_did_ic_t		*did_icp
4389 )
4390 {
4391 	mddb_drvnm_t		*dn;
4392 	mddb_locator_t		*lp = &lbp->lb_locators[li];
4393 	mddb_sidelocator_t	*slp;
4394 	mddb_mnsidelocator_t	*mnslp;
4395 	mddb_did_info_t		*did_info;
4396 	int 			i, sz, szalloc;
4397 	int			mn_set = 0;
4398 	mddb_mnlb_t		*mnlbp;
4399 
4400 	if (lbp->lb_flags & MDDB_MNSET) {
4401 		mn_set = 1;
4402 		mnlbp = (mddb_mnlb_t *)lbp;
4403 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4404 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4405 			if (mnslp->mnl_sideno == sideno)
4406 				break;
4407 		}
4408 		if (i == MD_MNMAXSIDES)
4409 			return;
4410 	} else {
4411 		slp = &lbp->lb_sidelocators[sideno][li];
4412 	}
4413 
4414 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4415 		did_info = &(did_icp->did_ic_blkp->blk_info[li]);
4416 		if (did_info->info_flags & MDDB_DID_EXISTS) {
4417 			sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
4418 			if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
4419 				/*
4420 				 * copy device id from mddb to
4421 				 * cfg_loc structure
4422 				 */
4423 				szalloc = clp->l_devid_sz;
4424 				if (sz <= szalloc) {
4425 					for (i = 0; i < sz; i++) {
4426 						((char *)(uintptr_t)
4427 						    clp->l_devid)[i] =
4428 						    ((char *)did_icp->
4429 						    did_ic_devid[li])[i];
4430 					}
4431 					clp->l_devid_flags |= MDDB_DEVID_VALID;
4432 					(void) strcpy(clp->l_minor_name,
4433 					    did_info->info_minor_name);
4434 				} else {
4435 					clp->l_devid_flags |=
4436 					    MDDB_DEVID_NOSPACE;
4437 				}
4438 			} else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
4439 				clp->l_devid_flags = MDDB_DEVID_SZ;
4440 				clp->l_devid_sz = sz;
4441 			}
4442 		}
4443 	}
4444 
4445 	/*
4446 	 * Even if a devid exists, use the dev, drvnm and mnum in the locators
4447 	 * and sidelocators.  During startup, the dev, drvnm and mnum in
4448 	 * these structures may not match the devid (the locators and
4449 	 * sidelocators will be updated to match the devid by the routine
4450 	 * load_old_replicas).  Using out-of-sync values won't cause any
4451 	 * problems since ridev will re-derive these from the devid and mnum.
4452 	 * After startup, the dev, drvnm and mnum in these structures have
4453 	 * been updated and can be used.
4454 	 */
4455 
4456 	clp->l_blkno = lp->l_blkno;
4457 	clp->l_flags = lp->l_flags;
4458 	clp->l_dev = lp->l_dev;
4459 
4460 	if (mn_set) {
4461 		dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
4462 		clp->l_mnum = mnslp->mnl_mnum;
4463 	} else {
4464 		dn = &lbp->lb_drvnm[slp->l_drvnm_index];
4465 		clp->l_mnum = slp->l_mnum;
4466 	}
4467 	(void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM);
4468 }
4469 
4470 /*
4471  * Find the index into the mnsidelocator where entry will go.
4472  * Then index can be fed into both splitname2locatorblocks and
4473  * cfgloc2locator so that those entries can be kept in sync.
4474  *
4475  * Returns:
4476  *	-1 if failed to find unused slot or if a traditional diskset
4477  *	index, if successful  (0 <= index <= MD_MNMAXSIDES)
4478  */
4479 static int
checklocator(mddb_lb_t * lbp,int li,side_t sideno)4480 checklocator(
4481 	mddb_lb_t		*lbp,
4482 	int			li,
4483 	side_t			sideno
4484 )
4485 {
4486 	uchar_t			i;
4487 	mddb_mnsidelocator_t	*mnslp;
4488 	mddb_mnlb_t		*mnlbp;
4489 	int			index = -1;
4490 
4491 	if (lbp->lb_flags & MDDB_MNSET) {
4492 		/*
4493 		 * Checking side locator structure.  First, check if
4494 		 * there is already an entry for this side.  If so,
4495 		 * then use that entry.  Otherwise, find an entry
4496 		 * that has a sideno of 0.
4497 		 */
4498 		mnlbp = (mddb_mnlb_t *)lbp;
4499 		for (i = 0; i < MD_MNMAXSIDES; i++) {
4500 			mnslp = &mnlbp->lb_mnsidelocators[i][li];
4501 			if (mnslp->mnl_sideno == sideno) {
4502 				/* Found a match - stop looking */
4503 				index = i;
4504 				break;
4505 			} else if ((mnslp->mnl_sideno == 0) && (index == -1)) {
4506 				/* Set first empty slot, but keep looking */
4507 				index = i;
4508 			}
4509 		}
4510 		/* Didn't find empty slot or previously used slot */
4511 		if ((i == MD_MNMAXSIDES) && (index == -1)) {
4512 			return (-1);
4513 		}
4514 		return (index);
4515 	} else
4516 		return (0);
4517 }
4518 
4519 /*
4520  * Takes locator information (driver name, minor number, sideno) and
4521  * stores it in the locator block.
4522  * For traditional diskset, the sideno is the index into the sidelocator
4523  * array in the locator block.
4524  * For the MN diskset, the sideno is the nodeid which can be any number,
4525  * so the index passed in is the index into the mnsidelocator array
4526  * in the locator block.
4527  */
4528 static int
cfgloc2locator(mddb_lb_t * lbp,mddb_cfg_loc_t * clp,int li,side_t sideno,int index)4529 cfgloc2locator(
4530 	mddb_lb_t		*lbp,
4531 	mddb_cfg_loc_t		*clp,
4532 	int			li,
4533 	side_t			sideno,
4534 	int			index	/* Only useful in MNsets when > 1 */
4535 )
4536 {
4537 	uchar_t			i;
4538 	mddb_sidelocator_t	*slp;
4539 	mddb_mnsidelocator_t	*mnslp;
4540 	mddb_set_t		*s;
4541 	int			mn_set = 0;
4542 	mddb_mnlb_t		*mnlbp;
4543 
4544 	if (lbp->lb_flags & MDDB_MNSET) {
4545 		mnlbp = (mddb_mnlb_t *)lbp;
4546 		mn_set = 1;
4547 		/*
4548 		 * Index will be the slot that has the given sideno or
4549 		 * the first empty slot if no match is found.
4550 		 * This was pre-checked out in check locator.
4551 		 */
4552 		mnslp = &mnlbp->lb_mnsidelocators[index][li];
4553 	} else {
4554 		slp = &lbp->lb_sidelocators[sideno][li];
4555 	}
4556 
4557 	/*
4558 	 * Look for the driver name
4559 	 */
4560 	for (i = 0; i < MDDB_DRVNMCNT; i++) {
4561 		if (lbp->lb_drvnm[i].dn_len == 0)
4562 			continue;
4563 		if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4564 		    MD_MAXDRVNM) == 0)
4565 			break;
4566 	}
4567 
4568 	/*
4569 	 * Didn't find one, add a new one
4570 	 */
4571 	if (i == MDDB_DRVNMCNT) {
4572 		for (i = 0; i < MDDB_DRVNMCNT; i++) {
4573 			if (lbp->lb_drvnm[i].dn_len == 0)
4574 				break;
4575 		}
4576 		if (i == MDDB_DRVNMCNT)
4577 			return (1);
4578 		(void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4579 		    MD_MAXDRVNM);
4580 		lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
4581 	}
4582 
4583 	/* Fill in the drvnm index */
4584 	if (mn_set) {
4585 		mnslp->mnl_drvnm_index = i;
4586 		mnslp->mnl_mnum = clp->l_mnum;
4587 		mnslp->mnl_sideno = sideno;
4588 	} else {
4589 		slp->l_drvnm_index = i;
4590 		slp->l_mnum = clp->l_mnum;
4591 	}
4592 
4593 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4594 		/*
4595 		 * This device id could already be associated with this index
4596 		 * if this is not the first side added to the set.
4597 		 * If device id is 0, there is no device id for this device.
4598 		 */
4599 		if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
4600 			return (0);
4601 		s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
4602 		if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
4603 		    clp->l_minor_name)) {
4604 			return (1);
4605 		}
4606 	}
4607 
4608 	return (0);
4609 }
4610 
4611 /*
4612  * See if there are mediator hosts and try to use the data.
4613  */
4614 static int
mediate(mddb_set_t * s)4615 mediate(
4616 	mddb_set_t	*s
4617 )
4618 {
4619 	mddb_lb_t	*lbp = s->s_lbp;
4620 	med_data_lst_t	*meddlp = NULL;
4621 	med_data_lst_t	*tmeddlp = NULL;
4622 	med_data_t	*meddp;
4623 	int		medok = 0;
4624 	int		medacc = 0;
4625 	uint_t		maxcc;
4626 	int		golden = 0;
4627 	int		err = 1;
4628 	set_t		setno = s->s_setno;
4629 
4630 	/* Do not have a mediator, then the state is stale */
4631 	if (s->s_med.n_cnt == 0)
4632 		return (err);
4633 
4634 	/* Contact the mediator hosts for the data */
4635 	meddlp = get_med_host_data(&s->s_med, s->s_setname, setno);
4636 
4637 	/* No mediator data, stale */
4638 	if (meddlp == NULL)
4639 		return (err);
4640 
4641 	/* Mark all the mediator data that is not for this set as errored */
4642 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4643 		struct timeval32 tmptime;
4644 		meddp = tmeddlp->mdl_med;
4645 
4646 		/* Count the number of mediators contacted */
4647 		medacc++;
4648 
4649 		/* Paranoid check */
4650 		if (meddp->med_dat_sn != setno)
4651 			meddp->med_dat_fl |= MED_DFL_ERROR;
4652 
4653 		TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id);
4654 
4655 		/*CSTYLED*/
4656 		if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=))
4657 			meddp->med_dat_fl |= MED_DFL_ERROR;
4658 	}
4659 
4660 	/* Get the max commitcount */
4661 	maxcc = 0;
4662 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4663 		meddp = tmeddlp->mdl_med;
4664 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4665 			continue;
4666 		if (meddp->med_dat_cc > maxcc)
4667 			maxcc = meddp->med_dat_cc;
4668 	}
4669 
4670 	/* Now mark the records that don't have the highest cc as errored */
4671 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4672 		meddp = tmeddlp->mdl_med;
4673 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4674 			continue;
4675 		if (meddp->med_dat_cc != maxcc)
4676 			meddp->med_dat_fl |= MED_DFL_ERROR;
4677 	}
4678 
4679 	/* Now mark the records that don't match the lb commitcnt as errored */
4680 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4681 		meddp = tmeddlp->mdl_med;
4682 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4683 			continue;
4684 		if (meddp->med_dat_cc != lbp->lb_commitcnt)
4685 			meddp->med_dat_fl |= MED_DFL_ERROR;
4686 	}
4687 
4688 	/* Is there a "golden" copy and how many valid mediators */
4689 	for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4690 		meddp = tmeddlp->mdl_med;
4691 		if (meddp->med_dat_fl & MED_DFL_ERROR)
4692 			continue;
4693 
4694 		if (meddp->med_dat_fl & MED_DFL_GOLDEN)
4695 			golden++;
4696 
4697 		medok++;
4698 	}
4699 
4700 	/* No survivors, stale */
4701 	if (medok == 0)
4702 		goto out;
4703 
4704 	/* No mediator quorum and no golden copies, stale */
4705 	if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) {
4706 		/* Skip odd numbers, no exact 50% */
4707 		if (s->s_med.n_cnt & 1)
4708 			goto out;
4709 		/* Have 50%, allow an accept */
4710 		if (medacc == (s->s_med.n_cnt / 2))
4711 			md_set_setstatus(setno, MD_SET_ACCOK);
4712 		goto out;
4713 	}
4714 
4715 	/* We either have a quorum or a golden copy, or both */
4716 	err = 0;
4717 
4718 out:
4719 	if (meddlp) {
4720 		for (/* void */; meddlp != NULL; meddlp = tmeddlp) {
4721 			tmeddlp = meddlp->mdl_nx;
4722 			kmem_free(meddlp->mdl_med, sizeof (med_data_t));
4723 			kmem_free(meddlp, sizeof (med_data_lst_t));
4724 		}
4725 	}
4726 
4727 	return (err);
4728 }
4729 
4730 /*
4731  *	1. read masterblks and locator blocks for all known database locations
4732  *		a. keep track of which have good master blks
4733  *		b. keep track of which have good locators
4734  *
4735  */
4736 static int
get_mbs_n_lbs(mddb_set_t * s,int * write_lb)4737 get_mbs_n_lbs(
4738 	mddb_set_t	*s,
4739 	int		*write_lb
4740 )
4741 {
4742 	mddb_lb_t	*lbp = NULL;		/* pointer to locator block */
4743 						/* May be cast to mddb_mnlb_t */
4744 						/* if accessing sidenames in */
4745 						/* MN set */
4746 	mddb_did_ic_t	*did_icp = NULL;	/* ptr to Device ID incore */
4747 	mddb_did_blk_t	*did_blkp = 0;
4748 	int		did_blkp_sz = 0;
4749 	mddb_did_db_t	*did_dbp;
4750 	mddb_did_info_t	*did_info;
4751 	caddr_t		did_block;
4752 	mddb_ri_t	*rip;
4753 	mddb_dtag_lst_t	*dtlp;
4754 	mddb_locator_t	*lp;
4755 	daddr_t		physblk;
4756 	int		li;
4757 	uint_t		blk;
4758 	md_dev64_t	dev;
4759 	caddr_t		buffer;
4760 	uint_t		lb_blkcnt;
4761 	int		retval = 0;
4762 	int		err = 0;
4763 	int		lb_ok = 0;
4764 	int		lb_total = 0;
4765 	int		lb_tagged = 0;
4766 	int		lb_tags;
4767 	set_t		setno = s->s_setno;
4768 	int		cont_flag, i;
4769 	mddb_did_db_t	*did_dbp1, *did_dbp2;
4770 	int		mn_set = 0;
4771 	mddb_cfg_loc_t	*cl;
4772 
4773 	/*
4774 	 * read in master blocks and locator block for all known locators.
4775 	 * lb_blkcnt will be set correctly for MN set later once getmasters
4776 	 * has determined that the set is a MN set.
4777 	 */
4778 	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);
4779 
4780 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
4781 		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
4782 		    MDDB_F_EMASTER);
4783 		rip->ri_lbp = (mddb_lb_t *)NULL;
4784 		rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4785 
4786 		/*
4787 		 * Translated dev is only used in calls to getmasters and
4788 		 * getblks which expect a translated (aka miniroot) dev.
4789 		 */
4790 		dev = md_xlate_targ_2_mini(rip->ri_dev);
4791 		if (dev == NODEV64) {
4792 			/* Set error flag that getmasters would have set */
4793 			/* if getmasters had been allowed to fail */
4794 			rip->ri_flags |= MDDB_F_EMASTER;
4795 		}
4796 
4797 		/*
4798 		 * Invalid device id on system (due to failed or
4799 		 * removed device) or invalid devt during upgrade
4800 		 * (due to powered off device) will cause this
4801 		 * replica to be marked in error and not used.
4802 		 */
4803 		if (rip->ri_flags & MDDB_F_EMASTER)
4804 			continue;
4805 
4806 		/* get all master blocks, does mddb_devopen() */
4807 		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
4808 		    &rip->ri_flags, &mn_set);
4809 
4810 		/* if invalid master block - try next replica */
4811 		if (! rip->ri_mbip)
4812 			continue;
4813 
4814 		/*
4815 		 * If lbp alloc'd to wrong size - reset it.
4816 		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
4817 		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
4818 		 */
4819 		if (lbp) {
4820 			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
4821 			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
4822 				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
4823 				lbp = (mddb_lb_t *)NULL;
4824 			}
4825 		}
4826 
4827 		if (lbp == (mddb_lb_t *)NULL) {
4828 			/* If a MN set, set lb_blkcnt for MN loc blk size */
4829 			if (mn_set)
4830 				lb_blkcnt = MDDB_MNLBCNT;
4831 			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
4832 			    KM_SLEEP);
4833 		}
4834 
4835 		/*
4836 		 * Read in all the sectors for the locator block
4837 		 * NOTE: Need to use getblks, rather than readblklst.
4838 		 *	because it is too early and things are
4839 		 *	NOT set up yet for read*()'s
4840 		 */
4841 		buffer = (caddr_t)lbp;
4842 		for (blk = 0; blk < lb_blkcnt; blk++) {
4843 			physblk = getphysblk(blk, rip->ri_mbip);
4844 			err = getblks(s, buffer, dev, physblk,
4845 			    btodb(MDDB_BSIZE), 0);
4846 			if (err) {
4847 				rip->ri_flags |= err;
4848 				break;
4849 			}
4850 			buffer += MDDB_BSIZE;
4851 		}
4852 
4853 		if (err)
4854 			continue;
4855 
4856 		/* Verify the locator block */
4857 		if (blk != lb_blkcnt)
4858 			continue;
4859 		if (lbp->lb_magic != MDDB_MAGIC_LB)
4860 			continue;
4861 		if (lbp->lb_blkcnt != lb_blkcnt)
4862 			continue;
4863 		if (mn_set) {
4864 			/* If a MN set, check for MNLB revision in lb. */
4865 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
4866 				continue;
4867 		} else {
4868 			/* If not a MN set, check for LB revision in lb. */
4869 			if (revchk(MDDB_REV_LB, lbp->lb_revision))
4870 				continue;
4871 		}
4872 		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
4873 			continue;
4874 
4875 		/*
4876 		 * With the addition of MultiNode Disksets, we must make sure
4877 		 * to verify that this is the correct set.  A node could
4878 		 * have been out of the config for awhile and this disk could
4879 		 * have been moved to a different diskset and we don't want
4880 		 * to accidentally start the wrong set.
4881 		 *
4882 		 * We don't do this check if we're in the middle of
4883 		 * importing a set.
4884 		 */
4885 		if (!(md_get_setstatus(s->s_setno) &
4886 		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
4887 		    (lbp->lb_setno != s->s_setno))
4888 			continue;
4889 
4890 		rip->ri_flags |= MDDB_F_LOCACC;
4891 
4892 		/*
4893 		 * a commit count of zero means this locator has been deleted
4894 		 */
4895 		if (lbp->lb_commitcnt == 0)
4896 			continue;
4897 
4898 		/*
4899 		 * If replica is in the device ID style and md_devid_destroy
4900 		 * flag is set, turn off device id style.  This is only to be
4901 		 * used in a catastrophic failure case.  Examples would be
4902 		 * where the device id of all drives in the system
4903 		 * (especially the mirror'd root drives) had been changed
4904 		 * by firmware upgrade or by a patch to an existing disk
4905 		 * driver.  Another example would be in the case of non-unique
4906 		 * device ids due to a bug.  The device id would be valid on
4907 		 * the system, but would return the wrong dev_t.
4908 		 */
4909 		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
4910 			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
4911 			lbp->lb_didfirstblk = 0;
4912 			lbp->lb_didblkcnt = 0;
4913 			*write_lb = 1;
4914 		}
4915 
4916 
4917 		/*
4918 		 * If replica is in device ID style, read in device ID
4919 		 * block and verify device ID block information.
4920 		 */
4921 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4922 
4923 			/* Read in device ID block */
4924 			if (did_icp == NULL) {
4925 				did_icp = (mddb_did_ic_t *)
4926 				    kmem_zalloc(sizeof (mddb_did_ic_t),
4927 				    KM_SLEEP);
4928 			} else {
4929 				/* Reuse did_icp, but clear out data */
4930 				if (did_icp->did_ic_blkp !=
4931 				    (mddb_did_blk_t *)NULL) {
4932 					kmem_free((caddr_t)did_icp->did_ic_blkp,
4933 					    did_blkp_sz);
4934 					did_blkp = (mddb_did_blk_t *)NULL;
4935 					did_icp->did_ic_blkp =
4936 					    (mddb_did_blk_t *)NULL;
4937 				}
4938 				if (did_icp->did_ic_dbp !=
4939 				    (mddb_did_db_t *)NULL) {
4940 					did_dbp1 = did_icp->did_ic_dbp;
4941 					while (did_dbp1) {
4942 						did_dbp2 = did_dbp1->db_next;
4943 						kmem_free((caddr_t)
4944 						    did_dbp1->db_ptr,
4945 						    dbtob(did_dbp1->db_blkcnt));
4946 						kmem_free((caddr_t)did_dbp1,
4947 						    sizeof (mddb_did_db_t));
4948 						did_dbp1 = did_dbp2;
4949 					}
4950 					did_icp->did_ic_dbp =
4951 					    (mddb_did_db_t *)NULL;
4952 				}
4953 				for (i = 0; i < MDDB_NLB; i++) {
4954 					did_icp->did_ic_devid[i] =
4955 					    (ddi_devid_t)NULL;
4956 				}
4957 			}
4958 
4959 			/* Can't reuse blkp since size could be different */
4960 			if (did_blkp != (mddb_did_blk_t *)NULL) {
4961 				kmem_free(did_blkp, did_blkp_sz);
4962 			}
4963 			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
4964 			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
4965 			    KM_SLEEP);
4966 			did_icp->did_ic_blkp = did_blkp;
4967 			buffer = (caddr_t)did_blkp;
4968 			for (blk = lbp->lb_didfirstblk;
4969 			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
4970 			    blk++) {
4971 				physblk = getphysblk(blk, rip->ri_mbip);
4972 				err = getblks(s, buffer, dev, physblk,
4973 				    btodb(MDDB_BSIZE), 0);
4974 				if (err) {
4975 					rip->ri_flags |= err;
4976 					break;
4977 				}
4978 				buffer += MDDB_BSIZE;
4979 			}
4980 			if (err)
4981 				continue;
4982 
4983 			/* Verify the Device ID block */
4984 			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
4985 				continue;
4986 			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
4987 				continue;
4988 			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
4989 				continue;
4990 			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
4991 				continue;
4992 			if (crcchk(did_blkp, &did_blkp->blk_checksum,
4993 			    dbtob(lbp->lb_didblkcnt), NULL))
4994 				continue;
4995 
4996 			/*
4997 			 * Check if device ID block is out of sync with the
4998 			 * Locator Block by checking if the locator block
4999 			 * commitcnt does not match the device id block
5000 			 * commitcnt.  If an 'out of sync' condition
5001 			 * exists, discard this replica since it has
5002 			 * inconsistent data and can't be used in
5003 			 * determining the best replica.
5004 			 *
5005 			 * An 'out of sync' condition could happen if old
5006 			 * SDS code was running with new devid style replicas
5007 			 * or if a failure occurred between the writing of
5008 			 * the locator block's commitcnt and the device
5009 			 * id block's commitcnt.
5010 			 *
5011 			 * If old SDS code had been running, the upgrade
5012 			 * process should detect this situation and
5013 			 * have removed all of the device id information
5014 			 * via the md_devid_destroy flag in md.conf.
5015 			 */
5016 			if (did_blkp->blk_commitcnt !=
5017 			    lbp->lb_commitcnt) {
5018 				continue;
5019 			}
5020 		}
5021 
5022 
5023 		/*
5024 		 * If replica is still in device ID style, read in all
5025 		 * of the device IDs, verify the checksum of the device IDs.
5026 		 */
5027 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5028 			/*
5029 			 * Reset valid bit in device id info block flags. This
5030 			 * flag is stored on disk, but the valid bit is reset
5031 			 * when reading in the replica.  If the corresponding
5032 			 * device id is valid (aka meaning that the system
5033 			 * knows about this device id), the valid bit will
5034 			 * be set at a later time.  The valid bit for this
5035 			 * replica's device ID will be set in this routine.
5036 			 * The valid bits for the rest of the device id's
5037 			 * will be set after the 'best' replica has
5038 			 * been selected in routine load_old_replicas.
5039 			 * Reset updated bit in device id info block flags.
5040 			 * This flag is also stored on disk, reset when read
5041 			 * in and set when the locators and side locators
5042 			 * have been updated to match this valid device
5043 			 * id information.
5044 			 */
5045 			for (li = 0; li < lbp->lb_loccnt; li++) {
5046 				did_info = &did_blkp->blk_info[li];
5047 				if (did_info->info_flags & MDDB_DID_EXISTS)
5048 					did_info->info_flags &=
5049 					    ~(MDDB_DID_VALID |
5050 					    MDDB_DID_UPDATED);
5051 			}
5052 
5053 			cont_flag = 0;
5054 			for (li = 0; li < lbp->lb_loccnt; li++) {
5055 				did_info = &did_blkp->blk_info[li];
5056 				did_block = (caddr_t)NULL;
5057 				if (did_info->info_flags & MDDB_DID_EXISTS) {
5058 					/*
5059 					 * Check if block has
5060 					 * already been read in
5061 					 */
5062 					did_dbp = did_icp->did_ic_dbp;
5063 					while (did_dbp != 0) {
5064 						if (did_dbp->db_firstblk ==
5065 						    did_info->info_firstblk)
5066 							break;
5067 						else
5068 							did_dbp =
5069 							    did_dbp->db_next;
5070 					}
5071 					/* if block not found, read it in */
5072 					if (did_dbp == NULL) {
5073 						did_block = (caddr_t)
5074 						    (kmem_zalloc(dbtob(
5075 						    did_info->info_blkcnt),
5076 						    KM_SLEEP));
5077 						buffer = (caddr_t)did_block;
5078 						for (blk =
5079 						    did_info->info_firstblk;
5080 						    blk < (did_info->
5081 						    info_firstblk +
5082 						    did_info->info_blkcnt);
5083 						    blk++) {
5084 							physblk =
5085 							    getphysblk(blk,
5086 							    rip->ri_mbip);
5087 							err = getblks(s,
5088 							    buffer, dev,
5089 							    physblk, btodb(
5090 							    MDDB_BSIZE), 0);
5091 							if (err) {
5092 								rip->ri_flags |=
5093 								    err;
5094 								break;
5095 							}
5096 							buffer += MDDB_BSIZE;
5097 						}
5098 						if (err) {
5099 							kmem_free(did_block,
5100 							    dbtob(did_info->
5101 							    info_blkcnt));
5102 							did_block =
5103 							    (caddr_t)NULL;
5104 							cont_flag = 1;
5105 							break;
5106 						}
5107 
5108 						/*
5109 						 * Block read in -
5110 						 * alloc Disk Block area
5111 						 */
5112 						did_dbp = (mddb_did_db_t *)
5113 						    kmem_zalloc(
5114 						    sizeof (mddb_did_db_t),
5115 						    KM_SLEEP);
5116 						did_dbp->db_ptr = did_block;
5117 						did_dbp->db_firstblk =
5118 						    did_info->info_firstblk;
5119 						did_dbp->db_blkcnt =
5120 						    did_info->info_blkcnt;
5121 
5122 						/* Add to front of dbp list */
5123 						did_dbp->db_next =
5124 						    did_icp->did_ic_dbp;
5125 						did_icp->did_ic_dbp = did_dbp;
5126 					}
5127 					/* Check validity of devid in block */
5128 					if (crcchk(((char *)did_dbp->db_ptr +
5129 					    did_info->info_offset),
5130 					    &did_info->info_checksum,
5131 					    did_info->info_length, NULL)) {
5132 						cont_flag = 1;
5133 						break;
5134 					}
5135 
5136 					/* Block now pointed to by did_dbp */
5137 					did_icp->did_ic_devid[li] =
5138 					    (ddi_devid_t)((char *)
5139 					    did_dbp->db_ptr +
5140 					    did_info->info_offset);
5141 				}
5142 			}
5143 			if (cont_flag)
5144 				continue;
5145 		}
5146 
5147 		/*
5148 		 * All blocks containing devids are now in core.
5149 		 */
5150 
5151 		/*
5152 		 * If we're doing a replicated import (also known as
5153 		 * remote copy import), the device id in the locator
5154 		 * block is incorrect and we need to fix it up here
5155 		 * along with the l_dev otherwise we run into lots of
5156 		 * trouble later on.
5157 		 */
5158 		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5159 			mddb_ri_t	*trip;
5160 			for (li = 0; li < lbp->lb_loccnt; li++) {
5161 				did_info = &did_blkp->blk_info[li];
5162 				lp = &lbp->lb_locators[li];
5163 
5164 				if (lp->l_flags & MDDB_F_DELETED)
5165 					continue;
5166 
5167 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5168 					continue;
5169 
5170 				if (did_icp->did_ic_devid[li] == NULL)
5171 					continue;
5172 
5173 				for (trip = s->s_rip; trip != NULL;
5174 				    trip = trip->ri_next) {
5175 					if (trip->ri_old_devid == NULL)
5176 						continue;
5177 					if (ddi_devid_compare(
5178 					    trip->ri_old_devid,
5179 					    did_icp->did_ic_devid[li]) != 0) {
5180 						continue;
5181 					}
5182 
5183 					/* update l_dev and side mnum */
5184 					lp->l_dev = md_cmpldev(trip->ri_dev);
5185 					lbp->lb_sidelocators[0][li].l_mnum =
5186 					    md_getminor(trip->ri_dev);
5187 				}
5188 			}
5189 		}
5190 
5191 		/*
5192 		 * If there is a valid devid, verify that this locator
5193 		 * block has information about itself by checking the
5194 		 * device ID, minor_name and block
5195 		 * number from this replica's incore data structure
5196 		 * against the locator block information that has just
5197 		 * been read in from disk.
5198 		 *
5199 		 * If not a valid devid, verify that this locator block
5200 		 * has information about itself by checking the minor
5201 		 * number, block number and driver name from this
5202 		 * replica's incore data structure against the locator
5203 		 * block information that has just been read in from disk.
5204 		 */
5205 		if ((rip->ri_devid != NULL) &&
5206 		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
5207 			/*
5208 			 * This locator block MUST have locator (replica)
5209 			 * information about itself.  Check against devid,
5210 			 * slice part of minor number, and block number.
5211 			 */
5212 			for (li = 0; li < lbp->lb_loccnt; li++) {
5213 				did_info = &did_blkp->blk_info[li];
5214 				lp = &lbp->lb_locators[li];
5215 				if (lp->l_flags & MDDB_F_DELETED)
5216 					continue;
5217 
5218 				if (!(did_info->info_flags & MDDB_DID_EXISTS))
5219 					continue;
5220 
5221 				if (((md_get_setstatus(setno) &
5222 				    MD_SET_REPLICATED_IMPORT)) &&
5223 				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
5224 					if (ddi_devid_compare(rip->ri_old_devid,
5225 					    did_icp->did_ic_devid[li]) != 0)
5226 						continue;
5227 				} else {
5228 					if (ddi_devid_compare(rip->ri_devid,
5229 					    did_icp->did_ic_devid[li]) != 0)
5230 						continue;
5231 				}
5232 
5233 				if (strcmp(rip->ri_minor_name,
5234 				    did_info->info_minor_name) != 0)
5235 					continue;
5236 
5237 				if (lp->l_blkno == rip->ri_blkno)
5238 					break;
5239 			}
5240 		} else {
5241 			/*
5242 			 * This locator block MUST have locator (replica)
5243 			 * information about itself.
5244 			 */
5245 			if (!mn_set) {
5246 				for (li = 0; li < lbp->lb_loccnt; li++) {
5247 					mddb_drvnm_t		*dn;
5248 					mddb_sidelocator_t	*slp;
5249 
5250 					lp = &lbp->lb_locators[li];
5251 					slp = &lbp->
5252 					    lb_sidelocators[s->s_sideno][li];
5253 					if (lp->l_flags & MDDB_F_DELETED)
5254 						continue;
5255 					if (slp->l_mnum != md_getminor(
5256 					    rip->ri_dev))
5257 						continue;
5258 					if (lp->l_blkno != rip->ri_blkno)
5259 						continue;
5260 					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
5261 					if (strncmp(dn->dn_data,
5262 					    rip->ri_driver, MD_MAXDRVNM) == 0)
5263 						break;
5264 				}
5265 			} else {
5266 				for (li = 0; li < lbp->lb_loccnt; li++) {
5267 					mddb_drvnm_t		*dn;
5268 					mddb_mnsidelocator_t	*mnslp;
5269 					mddb_mnlb_t		*mnlbp;
5270 					int			i;
5271 
5272 					/*
5273 					 * Check all possible locators locking
5274 					 * for match to the currently read-in
5275 					 * locator, must match on:
5276 					 *	- blkno
5277 					 *	- side locator for this
5278 					 *	  node's side
5279 					 *	- side locator minor number
5280 					 *	- side locator driver name
5281 					 */
5282 
5283 					/*
5284 					 * Looking at sidelocs:
5285 					 * cast lbp -> mnlbp
5286 					 */
5287 					mnlbp = (mddb_mnlb_t *)lbp;
5288 					lp = &mnlbp->lb_locators[li];
5289 					if (lp->l_flags & MDDB_F_DELETED)
5290 						continue;
5291 					if (lp->l_blkno != rip->ri_blkno)
5292 						continue;
5293 
5294 					for (i = 0; i < MD_MNMAXSIDES; i++) {
5295 						mnslp = &mnlbp->
5296 						    lb_mnsidelocators[i][li];
5297 						if (mnslp->mnl_sideno ==
5298 						    s->s_sideno) {
5299 							break;
5300 						}
5301 					}
5302 					/* No matching side found */
5303 					if (i == MD_MNMAXSIDES)
5304 						continue;
5305 					if (mnslp->mnl_mnum !=
5306 					    md_getminor(rip->ri_dev))
5307 						continue;
5308 					dn = &lbp->
5309 					    lb_drvnm[mnslp->mnl_drvnm_index];
5310 					if (strncmp(dn->dn_data,
5311 					    rip->ri_driver, MD_MAXDRVNM) == 0)
5312 						break;
5313 				}
5314 			}
5315 		}
5316 
5317 		/*
5318 		 * Didn't find ourself in this locator block it means
5319 		 * the locator block is a stale transplant. Probably from
5320 		 * a user doing a dd.
5321 		 */
5322 		if (li == lbp->lb_loccnt)
5323 			continue;
5324 
5325 		/*
5326 		 * Keep track of the number of accessed and valid
5327 		 * locator blocks.
5328 		 */
5329 		lb_ok++;
5330 
5331 		/*
5332 		 * Read the tag in, skips invalid or blank tags.
5333 		 * Only valid tags allocate storage
5334 		 * Data tags are not used in MN disksets.
5335 		 */
5336 		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
5337 			/*
5338 			 * Keep track of the number of tagged
5339 			 * locator blocks.
5340 			 */
5341 			lb_tagged++;
5342 
5343 			/* Keep a list of unique tags. */
5344 			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
5345 		}
5346 
5347 		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5348 			/*
5349 			 * go through locator block and add any other
5350 			 * locations of the data base.
5351 			 * For the replicated import case, this was done earlier
5352 			 * and we really don't need or want to do so again
5353 			 */
5354 			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
5355 			for (li = 0; li < lbp->lb_loccnt; li++) {
5356 				lp = &lbp->lb_locators[li];
5357 				if (lp->l_flags & MDDB_F_DELETED)
5358 					continue;
5359 
5360 				cl->l_devid_flags = MDDB_DEVID_GETSZ;
5361 				cl->l_devid = (uint64_t)0;
5362 				cl->l_devid_sz = 0;
5363 				cl->l_old_devid = (uint64_t)0;
5364 				cl->l_old_devid_sz = 0;
5365 				cl->l_minor_name[0] = '\0';
5366 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5367 				    did_icp);
5368 
5369 				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
5370 					if ((cl->l_devid = (uintptr_t)kmem_alloc
5371 					    (cl->l_devid_sz, KM_SLEEP))
5372 					    == NULL) {
5373 						continue;
5374 					} else {
5375 						cl->l_devid_flags =
5376 						    MDDB_DEVID_SPACE;
5377 					}
5378 				}
5379 				locator2cfgloc(lbp, cl, li, s->s_sideno,
5380 				    did_icp);
5381 
5382 				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);
5383 
5384 				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
5385 					kmem_free((caddr_t)(uintptr_t)
5386 					    cl->l_devid, cl->l_devid_sz);
5387 			}
5388 			kmem_free(cl, sizeof (mddb_cfg_loc_t));
5389 		}
5390 
5391 		/* Save LB for later */
5392 		rip->ri_lbp = lbp;
5393 		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5394 			rip->ri_did_icp = did_icp;
5395 			did_icp = (mddb_did_ic_t *)NULL;
5396 			did_blkp = (mddb_did_blk_t *)NULL;
5397 		} else
5398 			rip->ri_did_icp = NULL;
5399 		lbp = (mddb_lb_t *)NULL;
5400 	}
5401 
5402 	if (lbp != (mddb_lb_t *)NULL)
5403 		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
5404 
5405 	if (did_icp != (mddb_did_ic_t *)NULL) {
5406 		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
5407 			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
5408 			did_blkp = (mddb_did_blk_t *)NULL;
5409 		}
5410 		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
5411 			mddb_did_db_t	*did_dbp1, *did_dbp2;
5412 
5413 			did_dbp1 = did_icp->did_ic_dbp;
5414 			while (did_dbp1) {
5415 				did_dbp2 = did_dbp1->db_next;
5416 				kmem_free((caddr_t)did_dbp1->db_ptr,
5417 				    dbtob(did_dbp1->db_blkcnt));
5418 				kmem_free((caddr_t)did_dbp1,
5419 				    sizeof (mddb_did_db_t));
5420 				did_dbp1 = did_dbp2;
5421 			}
5422 		}
5423 		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
5424 	}
5425 
5426 	if (did_blkp != (mddb_did_blk_t *)NULL) {
5427 		kmem_free((caddr_t)did_blkp, did_blkp_sz);
5428 	}
5429 
5430 	/* No locator blocks were ok */
5431 	if (lb_ok == 0)
5432 		goto out;
5433 
5434 	/* No tagged data was found - will be 0 for MN diskset */
5435 	if (lb_tagged == 0)
5436 		goto out;
5437 
5438 	/* Find the highest non-deleted replica count */
5439 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5440 		int		lb_tot = 0;
5441 
5442 		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
5443 			continue;
5444 
5445 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
5446 			continue;
5447 
5448 		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
5449 			lp = &rip->ri_lbp->lb_locators[li];
5450 			if (lp->l_flags & MDDB_F_DELETED)
5451 				continue;
5452 			lb_tot++;
5453 		}
5454 
5455 		if (lb_tot > lb_total)
5456 			lb_total = lb_tot;
5457 	}
5458 
5459 	/* Count the number of unique tags */
5460 	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
5461 		lb_tags++;
5462 
5463 	/* Should have at least one tag at this point */
5464 	ASSERT(lb_tags > 0);
5465 
5466 
5467 	/*
5468 	 * If the number of tagged locators is not the same as the number of
5469 	 * OK locators OR more than one tag exists, then make sure the
5470 	 * selected tag will be written out later.
5471 	 */
5472 	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
5473 		md_set_setstatus(setno, MD_SET_TAGDATA);
5474 
5475 	/* Only a single tag, take the tagged data */
5476 	if (lb_tags == 1) {
5477 		dt_setup(s, &s->s_dtlp->dtl_dt);
5478 		md_set_setstatus(setno, MD_SET_USETAG);
5479 		goto out;
5480 	}
5481 
5482 	/* Multiple tags, not selecting a tag, tag mode is on */
5483 	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
5484 		retval = MDDB_E_TAGDATA;
5485 
5486 out:
5487 
5488 	return (retval);
5489 }
5490 
5491 /*
5492  *	1. Select a locator.
5493  *	2. check if enough locators now have current copies
5494  *	3. read in database from one of latest
5495  *	4. if known to have latest make all database the same
5496  *	5. if configuration has changed rewrite locators
5497  *
5498  * Parameters:
5499  * 	s - pointer to mddb_set structure
5500  *	flag - used in MN disksets to tell if this node is being joined to
5501  *		a diskset that is in the STALE state.  If the flag is
5502  *		MDDB_MN_STALE, then this node should be marked in the STALE
5503  *		state even if > 50% mddbs are available.  (The diskset can
5504  *		only change from STALE->OK if all nodes withdraw from the
5505  *		MN diskset and then rejoin).
5506  */
5507 static int
load_old_replicas(mddb_set_t * s,int flag)5508 load_old_replicas(
5509 	mddb_set_t	*s,
5510 	int		flag
5511 )
5512 {
5513 	mddb_lb_t	*lbp = NULL;
5514 	mddb_mnlb_t	*mnlbp = NULL;
5515 	mddb_ri_t	*rip;
5516 	mddb_locator_t	*lp;
5517 	mddb_db_t	*dbp;
5518 	mddb_de_ic_t	*dep;
5519 	int		li;
5520 	int		alc;
5521 	int		lc;
5522 	int		tlc;
5523 	int		retval = 0;
5524 	caddr_t		p;
5525 	size_t		maxrecsize;
5526 	set_t		setno = s->s_setno;
5527 	mddb_did_db_t	*did_dbp1;
5528 	mddb_did_info_t	*did_info;
5529 	mddb_did_ic_t	*did_icp = NULL;
5530 	md_dev64_t	*newdev;
5531 	mddb_sidelocator_t	*slp = 0;
5532 	mddb_mnsidelocator_t	*mnslp = 0;
5533 	uchar_t		i;
5534 	char		*name;
5535 	ddi_devid_t	ret_devid;
5536 	md_dev64_t	dev;
5537 	uint_t		len, sz;
5538 	char		*minor_name;
5539 	int		write_lb = 0;
5540 	int		rval;
5541 	int		stale_rtn = 0;
5542 
5543 	/* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
5544 	if (retval = get_mbs_n_lbs(s, &write_lb))
5545 		goto errout;
5546 
5547 	if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
5548 		retval = MDDB_E_NOLOCBLK;
5549 		goto errout;
5550 	}
5551 
5552 	/* If a multi-node set, then set md_set.s_status flag */
5553 	if (lbp->lb_flags & MDDB_MNSET) {
5554 		md_set_setstatus(setno, MD_SET_MNSET);
5555 		/*
5556 		 * If data tag area had been allocated before set type was
5557 		 * known - free it now.
5558 		 */
5559 		if (md_set[setno].s_dtp) {
5560 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
5561 			md_set[setno].s_dtp = NULL;
5562 		}
5563 	}
5564 
5565 	/*
5566 	 * If the replica is in devid format, setup the devid incore ptr.
5567 	 */
5568 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5569 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5570 			if (rip->ri_lbp == s->s_lbp) {
5571 				did_icp = s->s_did_icp = rip->ri_did_icp;
5572 				break;
5573 			}
5574 		}
5575 		/*
5576 		 * If no devid incore info found - something has gone
5577 		 * wrong so errout.
5578 		 */
5579 		if (rip == NULL) {
5580 			retval = MDDB_E_NODEVID;
5581 			goto errout;
5582 		}
5583 
5584 		/*
5585 		 * Add all blocks containing devids to free list.
5586 		 * Then remove addresses that actually contain devids.
5587 		 */
5588 		did_dbp1 = did_icp->did_ic_dbp;
5589 		while (did_dbp1) {
5590 			if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
5591 			    0, dbtob(did_dbp1->db_blkcnt))) {
5592 				retval = MDDB_E_NOSPACE;
5593 				goto errout;
5594 			}
5595 
5596 			did_dbp1 = did_dbp1->db_next;
5597 		}
5598 		for (li = 0; li < lbp->lb_loccnt; li++) {
5599 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5600 			if (!(did_info->info_flags & MDDB_DID_EXISTS))
5601 				continue;
5602 
5603 			if (mddb_devid_free_delete(s, did_info->info_firstblk,
5604 			    did_info->info_offset, did_info->info_length)) {
5605 				/* unable to find disk block */
5606 				retval = MDDB_E_NODEVID;
5607 				goto errout;
5608 			}
5609 		}
5610 	}
5611 
5612 	/*
5613 	 * create mddb_mbaray, count all locators and active locators.
5614 	 */
5615 	alc = 0;
5616 	lc = 0;
5617 	for (li = 0; li < lbp->lb_loccnt; li++) {
5618 		ddi_devid_t	li_devid;
5619 
5620 		lp = &lbp->lb_locators[li];
5621 
5622 		if (lp->l_flags & MDDB_F_DELETED)
5623 			continue;
5624 
5625 		/* Count non-deleted replicas */
5626 		lc++;
5627 
5628 		/*
5629 		 * Use the devid of this locator to compare with the rip
5630 		 * list.  The scenario to watch out for here is that this
5631 		 * locator could be on a disk that is dead and there could
5632 		 * be a valid entry in the rip list for a different disk
5633 		 * that has been moved to the dead disks dev_t.  We don't
5634 		 * want to match with the moved disk.
5635 		 */
5636 		li_devid = NULL;
5637 		(void) mddb_devid_get(s, li, &li_devid, &minor_name);
5638 
5639 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5640 			if (match_mddb(rip, li_devid, minor_name,
5641 			    md_expldev(lp->l_dev), lp->l_blkno)) {
5642 				break;
5643 			}
5644 		}
5645 		if (rip == NULL) {
5646 			/*
5647 			 * If rip not found, then mark error in master block
5648 			 * so that no writes are later attempted to this
5649 			 * replica.  rip may not be setup if ridev
5650 			 * failed due to un-found driver name.
5651 			 */
5652 			lp->l_flags |= MDDB_F_EMASTER;
5653 			continue;
5654 		}
5655 
5656 		s->s_mbiarray[li] = rip->ri_mbip;
5657 
5658 		lp->l_flags &= MDDB_F_ACTIVE;
5659 		lp->l_flags |= (int)rip->ri_flags;
5660 
5661 		if (rip->ri_transplant)
5662 			lp->l_flags &= ~MDDB_F_ACTIVE;
5663 
5664 		if (lp->l_flags & MDDB_F_LOCACC)
5665 			alc++;
5666 	}
5667 
5668 	/* Save on a divide - calculate 50% + 1 up front */
5669 	tlc = ((lc + 1) / 2);
5670 
5671 	if (alc > tlc) {		/* alc > tlc		- OK */
5672 		md_clr_setstatus(setno, MD_SET_STALE);
5673 	} else if (alc < tlc) {		/* alc < tlc		- stale */
5674 		md_set_setstatus(setno, MD_SET_STALE);
5675 	} else if (lc & 1) {		/* alc == tlc && odd	- OK */
5676 		md_clr_setstatus(setno, MD_SET_STALE);
5677 	} else {			/* alc == tlc && even	- ? */
5678 		/* Can do an accept, and are */
5679 		if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
5680 			md_clr_setstatus(setno, MD_SET_STALE);
5681 		} else {		/* possibly has a mediator */
5682 			if (mediate(s)) {
5683 				md_set_setstatus(setno, MD_SET_STALE);
5684 			} else {
5685 				md_clr_setstatus(setno, MD_SET_STALE);
5686 			}
5687 		}
5688 
5689 		/*
5690 		 * The mirrored_root_flag allows the sysadmin to decide to
5691 		 * start the local set in a read/write (non-stale) mode
5692 		 * when there are only 50% available mddbs on the system and
5693 		 * when the root file system is on a mirror.  This is useful
5694 		 * in a 2 disk system where 1 disk failure would cause an mddb
5695 		 * quorum failure and subsequent boot failures since the root
5696 		 * filesystem would be in a read-only state.
5697 		 */
5698 		if (mirrored_root_flag == 1 && setno == 0 &&
5699 		    svm_bootpath[0] != 0) {
5700 			md_clr_setstatus(setno, MD_SET_STALE);
5701 		} else {
5702 			if (md_get_setstatus(setno) & MD_SET_STALE) {
5703 				/* Allow half mode - CAREFUL! */
5704 				if (mddb_allow_half)
5705 					md_clr_setstatus(setno, MD_SET_STALE);
5706 			}
5707 		}
5708 
5709 		/*
5710 		 * In a MN diskset,
5711 		 *	- if 50% mddbs are unavailable and this
5712 		 *		has been marked STALE above
5713 		 * 	- master node isn't in the STALE state
5714 		 *	- this node isn't the master node (this node
5715 		 *		isn't the first node to join the set)
5716 		 * then clear the STALE state and set TOOFEW.
5717 		 *
5718 		 * If this node is the master node and set was marked STALE,
5719 		 * then the set stays STALE.
5720 		 *
5721 		 * If this node is not the master and this node's state is
5722 		 * STALE and the master node is not marked STALE,
5723 		 * then master node must be in the TOOFEW state or the
5724 		 * master is panic'ing.  A MN diskset can only be placed into
5725 		 * the STALE state by having the first node join the set
5726 		 * with <= 50% mddbs.  There's no way for a MN diskset to
5727 		 * transition between STALE and not-STALE states unless all
5728 		 * nodes are withdrawn from the diskset or all nodes in the
5729 		 * diskset are rebooted at the same time.
5730 		 *
5731 		 * So, mark this node's state as TOOFEW instead of STALE.
5732 		 */
5733 		if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
5734 		    == (MD_SET_MNSET | MD_SET_STALE)) &&
5735 		    ((flag & MDDB_MN_STALE) == 0) &&
5736 		    (!(md_set[setno].s_am_i_master))) {
5737 			md_clr_setstatus(setno, MD_SET_STALE);
5738 			md_set_setstatus(setno, MD_SET_TOOFEW);
5739 		}
5740 	}
5741 
5742 	/*
5743 	 * If a MN set is marked STALE on the other nodes,
5744 	 * mark it stale here.  Override all other considerations
5745 	 * such as a mediator or > 50% mddbs available.
5746 	 */
5747 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
5748 		if (flag & MDDB_MN_STALE)
5749 			md_set_setstatus(setno, MD_SET_STALE);
5750 	}
5751 
5752 	/*
5753 	 * read a good copy of the locator names
5754 	 * if an error occurs reading what is suppose
5755 	 * to be a good copy continue looking for another
5756 	 * good copy
5757 	 */
5758 	s->s_lnp = NULL;
5759 	for (li = 0; li < lbp->lb_loccnt; li++) {
5760 		lp = &lbp->lb_locators[li];
5761 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5762 		    (lp->l_flags & MDDB_F_EMASTER))
5763 			continue;
5764 
5765 		/* Find rip entry for this locator if one exists */
5766 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5767 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5768 			    lp->l_blkno))
5769 				break;
5770 		}
5771 
5772 		if (rip == NULL) {
5773 			continue;
5774 		}
5775 
5776 		/*
5777 		 * Use the rip commitcnt since the commitcnt in lbp could
5778 		 * been cleared by selectlocator.  Looking for a replica with
5779 		 * the same commitcnt as the 'golden' copy in order to
5780 		 * get the same data.
5781 		 */
5782 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5783 			continue;
5784 		}
5785 
5786 		/*
5787 		 * Now have a copy of the database that is equivalent
5788 		 * to the chosen locator block with respect to
5789 		 * inittime, identifier and commitcnt.   Trying the
5790 		 * equivalent databases in the order that they were
5791 		 * written will provide the most up to date data.
5792 		 */
5793 		lp->l_flags |= readlocnames(s, li);
5794 		if (s->s_lnp)
5795 			break;
5796 	}
5797 
5798 	if (s->s_lnp == NULL) {
5799 		retval = MDDB_E_NOLOCNMS;
5800 		goto errout;
5801 	}
5802 
5803 	/*
5804 	 * read a good copy of the data base
5805 	 * if an error occurs reading what is suppose
5806 	 * to be a good copy continue looking for another
5807 	 * good copy
5808 	 */
5809 
5810 	s->s_dbp = NULL;
5811 	for (li = 0; li < lbp->lb_loccnt; li++) {
5812 		lp = &lbp->lb_locators[li];
5813 		if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5814 		    (lp->l_flags & MDDB_F_EMASTER))
5815 			continue;
5816 
5817 		/* Find rip entry for this locator if one exists */
5818 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5819 			if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5820 			    lp->l_blkno))
5821 				break;
5822 		}
5823 
5824 		if (rip == NULL) {
5825 			continue;
5826 		}
5827 
5828 		/*
5829 		 * Use the rip commitcnt since the commitcnt in lbp could
5830 		 * been cleared by selectlocator.  Looking for a replica with
5831 		 * the same commitcnt as the 'golden' copy in order to
5832 		 * get the same data.
5833 		 */
5834 		if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5835 			continue;
5836 		}
5837 
5838 		/*
5839 		 * Now have a copy of the database that is equivalent
5840 		 * to the chosen locator block with respect to
5841 		 * inittime, identifier and commitcnt.   Trying the
5842 		 * equivalent databases in the order that they were
5843 		 * written will provide the most up to date data.
5844 		 */
5845 		lp->l_flags |= readcopy(s, li);
5846 
5847 		if (s->s_dbp)
5848 			break;
5849 	}
5850 
5851 	if (s->s_dbp == NULL) {
5852 		retval = MDDB_E_NODIRBLK;
5853 		goto errout;
5854 	}
5855 
5856 	lp->l_flags |= MDDB_F_MASTER;
5857 	lp->l_flags |= MDDB_F_UP2DATE;
5858 
5859 	/*
5860 	 * go through and find largest record;
5861 	 * Also fixup the user data area's
5862 	 */
5863 	maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);
5864 
5865 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
5866 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
5867 			if (dep->de_flags & MDDB_F_OPT)
5868 				getoptrecord(s, dep);
5869 			else {
5870 				allocuserdata(dep);
5871 				maxrecsize = MAX(dep->de_recsize, maxrecsize);
5872 			}
5873 
5874 	if (maxrecsize > s->s_databuffer_size) {
5875 		p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
5876 		if (s->s_databuffer_size)
5877 			kmem_free(s->s_databuffer, s->s_databuffer_size);
5878 		s->s_databuffer = p;
5879 		s->s_databuffer_size = maxrecsize;
5880 	}
5881 
5882 	/* If we can clear the tag data record, do it now. */
5883 	/* Data tags not supported on MN sets */
5884 	if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
5885 	    (!(md_get_setstatus(setno) & MD_SET_MNSET)))
5886 		dt_setup(s, NULL);
5887 
5888 	/* This will return non-zero if STALE or TOOFEW */
5889 	/* This will write out chosen replica image to all replicas */
5890 	stale_rtn = selectreplicas(s, MDDB_SCANALL);
5891 
5892 	if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5893 		ddi_devid_t	devidptr;
5894 
5895 		/*
5896 		 * ignore the return value from selectreplicas because we
5897 		 * may have a STALE or TOOFEW set in the case of a partial
5898 		 * replicated diskset. We will fix that up later.
5899 		 */
5900 
5901 		lbp = s->s_lbp;
5902 		for (li = 0; li < lbp->lb_loccnt; li++) {
5903 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5904 
5905 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5906 				devidptr = s->s_did_icp->did_ic_devid[li];
5907 				lp = &lbp->lb_locators[li];
5908 				for (rip = s->s_rip; rip != NULL;
5909 				    rip = rip->ri_next) {
5910 					if (rip->ri_old_devid == 0)
5911 						continue;
5912 					if (ddi_devid_compare(rip->ri_old_devid,
5913 					    devidptr) != 0) {
5914 						continue;
5915 					}
5916 					if (update_locatorblock(s,
5917 					    md_expldev(lp->l_dev),
5918 					    rip->ri_devid, rip->ri_old_devid)) {
5919 						goto errout;
5920 					}
5921 				}
5922 			}
5923 		}
5924 	} else {
5925 		if (stale_rtn)
5926 			goto errout;
5927 	}
5928 
5929 	/*
5930 	 * If the replica is in device id style - validate the device id's,
5931 	 * if present, in the locator block devid area.
5932 	 */
5933 	newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
5934 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5935 		for (li = 0; li < lbp->lb_loccnt; li++) {
5936 			newdev[li] = 0;
5937 			lp = &lbp->lb_locators[li];
5938 			if (lp->l_flags & MDDB_F_DELETED)
5939 				continue;
5940 			did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5941 			dev = md_expldev(lp->l_dev);
5942 			if (did_info->info_flags & MDDB_DID_EXISTS) {
5943 				/* Validate device id on current system */
5944 				newdev[li] = dev;
5945 				if (mddb_devid_validate(
5946 				    did_icp->did_ic_devid[li],
5947 				    &(newdev[li]),
5948 				    did_info->info_minor_name) == 0) {
5949 					/* Set valid flag */
5950 					did_info->info_flags |= MDDB_DID_VALID;
5951 				} else {
5952 					lp->l_flags |= MDDB_F_EMASTER;
5953 				}
5954 			} else if (!(MD_UPGRADE)) {
5955 				/*
5956 				 * If a device doesn't have a device id,
5957 				 * check if there is now a device ID
5958 				 * associated with device.  If one exists,
5959 				 * add it to the locator block devid area.
5960 				 * If there's not enough space to add it,
5961 				 * print a warning.
5962 				 * Don't do this during upgrade.
5963 				 */
5964 				dev_t ddi_dev = md_dev64_to_dev(dev);
5965 				if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
5966 				    DDI_SUCCESS) {
5967 					if (ddi_lyr_get_minor_name(ddi_dev,
5968 					    S_IFBLK, &minor_name)
5969 					    == DDI_SUCCESS) {
5970 						if (mddb_devid_add(s, li,
5971 						    ret_devid, minor_name)) {
5972 							cmn_err(CE_WARN,
5973 							    "Not enough space"
5974 							    " in metadevice"
5975 							    " state"
5976 							    " database\n");
5977 							cmn_err(CE_WARN,
5978 							    "to add relocation"
5979 							    " information for"
5980 							    " device:\n");
5981 							cmn_err(CE_WARN,
5982 							    " major = %d, "
5983 							    " minor = %d\n",
5984 							    getmajor(ddi_dev),
5985 							    getminor(ddi_dev));
5986 						} else {
5987 							write_lb = 1;
5988 						}
5989 						kmem_free(minor_name,
5990 						    strlen(minor_name) + 1);
5991 					}
5992 					ddi_devid_free(ret_devid);
5993 				}
5994 			}
5995 		}
5996 
5997 		/*
5998 		 * If a device has a valid device id and if the dev_t
5999 		 * associated with the device id has changed, update the
6000 		 * driver name, minor num and dev_t in the local and side
6001 		 * locators to match the dev_t that the system currently
6002 		 * associates with the device id.
6003 		 *
6004 		 * Don't do this during upgrade.
6005 		 */
6006 		if (!(MD_UPGRADE)) {
6007 			for (li = 0; li < lbp->lb_loccnt; li++) {
6008 				lp = &lbp->lb_locators[li];
6009 				if (lp->l_flags & MDDB_F_DELETED)
6010 					continue;
6011 				did_info = &(did_icp->did_ic_blkp->blk_info
6012 				    [li]);
6013 				if ((did_info->info_flags & MDDB_DID_VALID) &&
6014 				    !(did_info->info_flags &
6015 				    MDDB_DID_UPDATED)) {
6016 					if (lbp->lb_flags & MDDB_MNSET) {
6017 						int j;
6018 						int index = -1;
6019 						mnlbp = (mddb_mnlb_t *)lbp;
6020 						for (j = 0; j < MD_MNMAXSIDES;
6021 						    j++) {
6022 							mnslp = &mnlbp->
6023 							    lb_mnsidelocators[j]
6024 							    [li];
6025 							if (mnslp->mnl_sideno ==
6026 							    s->s_sideno)
6027 								break;
6028 							if (mnslp->mnl_sideno ==
6029 							    0)
6030 								index = j;
6031 						}
6032 						if (j == MD_MNMAXSIDES) {
6033 							/*
6034 							 * No match found; take
6035 							 * empty
6036 							 */
6037 							mnslp = &mnlbp->
6038 							    lb_mnsidelocators
6039 							    [index][li];
6040 							write_lb = 1;
6041 							mnslp->mnl_mnum =
6042 							    md_getminor(newdev
6043 							    [li]);
6044 						} else if (mnslp->mnl_mnum !=
6045 						    md_getminor(newdev[li])) {
6046 							write_lb = 1;
6047 							mnslp->mnl_mnum =
6048 							    md_getminor(newdev
6049 							    [li]);
6050 						}
6051 					} else {
6052 						slp = &lbp->
6053 						    lb_sidelocators[s->s_sideno]
6054 						    [li];
6055 						if (slp->l_mnum !=
6056 						    md_getminor(newdev[li])) {
6057 							write_lb = 1;
6058 							slp->l_mnum =
6059 							    md_getminor(newdev
6060 							    [li]);
6061 						}
6062 					}
6063 					name = ddi_major_to_name(md_getmajor(
6064 					    newdev[li]));
6065 					if (lbp->lb_flags & MDDB_MNSET)
6066 						i = mnslp->mnl_drvnm_index;
6067 					else
6068 						i = slp->l_drvnm_index;
6069 					if (strncmp(lbp->lb_drvnm[i].dn_data,
6070 					    name, lbp->lb_drvnm[i].dn_len) !=
6071 					    0) {
6072 						/* Driver name has changed */
6073 						len = strlen(name);
6074 						/* Look for the driver name */
6075 						for (i = 0; i < MDDB_DRVNMCNT;
6076 						    i++) {
6077 							if (lbp->lb_drvnm[i].
6078 							    dn_len != len)
6079 								continue;
6080 							if (strncmp(lbp->
6081 							    lb_drvnm[i].dn_data,
6082 							    name, len) == 0)
6083 								break;
6084 						}
6085 						/* Didn't find one, add it */
6086 						if (i == MDDB_DRVNMCNT) {
6087 							for (i = 0; i <
6088 							    MDDB_DRVNMCNT;
6089 							    i++) {
6090 								if (lbp->
6091 								    lb_drvnm[i].
6092 								    dn_len == 0)
6093 									break;
6094 							}
6095 							if (i ==
6096 							    MDDB_DRVNMCNT) {
6097 								cmn_err(CE_WARN,
6098 								    "Unable to "
6099 								    " update "
6100 								    "driver "
6101 								    " name for "
6102 								    "dev:  "
6103 								    "major = %d"
6104 								    ", minor = "
6105 								    "%d\n",
6106 								    md_getmajor(
6107 								    newdev[li]),
6108 								    md_getminor(
6109 								    newdev
6110 								    [li]));
6111 								continue;
6112 							}
6113 							(void) strncpy(lbp->
6114 							    lb_drvnm[i].dn_data,
6115 							    name, MD_MAXDRVNM);
6116 							lbp->lb_drvnm[i].
6117 							    dn_len = (uchar_t)
6118 							    strlen(name);
6119 						}
6120 						/* Fill in the drvnm index */
6121 						if (lbp->lb_flags &
6122 						    MDDB_MNSET)
6123 							mnslp->mnl_drvnm_index =
6124 							    i;
6125 						else
6126 							slp->l_drvnm_index = i;
6127 						write_lb = 1;
6128 					}
6129 					did_info->info_flags |=
6130 					    MDDB_DID_UPDATED;
6131 				}
6132 			}
6133 		}
6134 	}
6135 	kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);
6136 
6137 	/*
6138 	 * If locator block has been changed by get_mbs_n_lbs,
6139 	 * by addition of new device id, by updated minor name or
6140 	 * by updated driver name - write out locator block.
6141 	 */
6142 	if (write_lb) {
6143 		rval = push_lb(s);
6144 		(void) upd_med(s, "load_old_replicas(0)");
6145 		if (rval)
6146 			goto errout;
6147 	}
6148 
6149 	/*
6150 	 * If the tag was moved, allocated, or a BADTAG was seen for some other
6151 	 * reason, then make sure tags are written to all the replicas.
6152 	 * Data tags not supported on MN sets.
6153 	 */
6154 	if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
6155 		if (! (lc = dt_alloc_if_needed(s))) {
6156 			for (li = 0; li < lbp->lb_loccnt; li++) {
6157 				lp = &lbp->lb_locators[li];
6158 
6159 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
6160 				    (lp->l_flags & MDDB_F_EMASTER))
6161 					continue;
6162 
6163 				if (lp->l_flags & MDDB_F_BADTAG) {
6164 					lc = 1;
6165 					break;
6166 				}
6167 			}
6168 		}
6169 
6170 		if (lc) {
6171 			md_set_setstatus(setno, MD_SET_TAGDATA);
6172 			md_clr_setstatus(setno, MD_SET_BADTAG);
6173 			(void) selectreplicas(s, MDDB_SCANALL);
6174 		}
6175 	}
6176 
6177 errout:
6178 
6179 	/* Free extraneous rip components. */
6180 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
6181 		/* Get rid of lbp's and dtp's */
6182 
6183 		if (rip->ri_lbp != lbp) {
6184 			if (rip->ri_dtp != (mddb_dt_t *)NULL) {
6185 				kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
6186 				rip->ri_dtp = (mddb_dt_t *)NULL;
6187 			}
6188 
6189 			if (rip->ri_devid != (ddi_devid_t)NULL) {
6190 				sz = (int)ddi_devid_sizeof(rip->ri_devid);
6191 				kmem_free((caddr_t)rip->ri_devid, sz);
6192 				rip->ri_devid = (ddi_devid_t)NULL;
6193 			}
6194 			if (rip->ri_old_devid != (ddi_devid_t)NULL) {
6195 				sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
6196 				kmem_free((caddr_t)rip->ri_old_devid, sz);
6197 				rip->ri_old_devid = (ddi_devid_t)NULL;
6198 			}
6199 
6200 			if (rip->ri_lbp != (mddb_lb_t *)NULL) {
6201 				mddb_devid_icp_free(&rip->ri_did_icp,
6202 				    rip->ri_lbp);
6203 
6204 				kmem_free((caddr_t)rip->ri_lbp,
6205 				    dbtob(rip->ri_lbp->lb_blkcnt));
6206 				rip->ri_lbp = (mddb_lb_t *)NULL;
6207 			}
6208 		}
6209 
6210 		if (lbp != NULL) {
6211 			for (li = 0; li < lbp->lb_loccnt; li++) {
6212 				lp = &lbp->lb_locators[li];
6213 				if (lp->l_flags & MDDB_F_DELETED)
6214 					continue;
6215 				if (rip->ri_dev == md_expldev(lp->l_dev) &&
6216 				    rip->ri_blkno == lp->l_blkno)
6217 					break;
6218 			}
6219 			if (li < lbp->lb_loccnt)
6220 				continue;
6221 		}
6222 
6223 		/*
6224 		 * Get rid of mbp's:
6225 		 *	if lbp, those out of lb_loccnt bounds
6226 		 *	if !lbp,  all of them.
6227 		 */
6228 		if (rip->ri_mbip) {
6229 			md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
6230 			if (dev64 != NODEV64)
6231 				mddb_devclose(dev64);
6232 
6233 			free_mbipp(&rip->ri_mbip);
6234 		}
6235 		/*
6236 		 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
6237 		 * code always ends up calling ridev for all replicas
6238 		 * before calling load_old_replicas.  ridev will reset
6239 		 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
6240 		 */
6241 		if (setno != MD_LOCAL_SET)
6242 			rip->ri_flags &= ~MDDB_F_EMASTER;
6243 	}
6244 	return (retval);
6245 }
6246 
6247 /*
6248  * Given the devt from the md.conf info, get the devid for the device.
6249  */
6250 static void
lookup_db_devid(mddb_cfg_loc_t * cl)6251 lookup_db_devid(mddb_cfg_loc_t *cl)
6252 {
6253 	dev_t		ldev;
6254 	ddi_devid_t	devid;
6255 	char		*minor;
6256 
6257 	if (ddi_name_to_major(cl->l_driver) == (major_t)-1) {
6258 		cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
6259 		return;
6260 	}
6261 
6262 	ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum);
6263 	if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
6264 		cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
6265 		    cl->l_driver, cl->l_mnum);
6266 		return;
6267 	}
6268 
6269 	if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
6270 		cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
6271 		    cl->l_mnum);
6272 		return;
6273 	}
6274 
6275 	cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
6276 	cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
6277 	cl->l_devid = (uint64_t)(uintptr_t)devid;
6278 	(void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);
6279 
6280 	kmem_free(minor, strlen(minor) + 1);
6281 }
6282 
6283 /*
6284  * grab driver name, minor, block and devid out of
6285  * strings like "driver:minor:block:devid"
6286  */
6287 static int
parse_db_loc(char * str,mddb_cfg_loc_t * clp)6288 parse_db_loc(
6289 	char		*str,
6290 	mddb_cfg_loc_t	*clp
6291 )
6292 {
6293 	char		*p, *e;
6294 	char		*minor_name;
6295 	ddi_devid_t	ret_devid;
6296 
6297 	clp->l_dev = 0;
6298 	p = clp->l_driver;
6299 	e = p + sizeof (clp->l_driver) - 1;
6300 	while ((*str != ':') && (*str != '\0') && (p < e))
6301 		*p++ = *str++;
6302 	*p = '\0';
6303 	if (*str++ != ':')
6304 		return (-1);
6305 	clp->l_mnum = 0;
6306 	while (ISNUM(*str)) {
6307 		clp->l_mnum *= 10;
6308 		clp->l_mnum += *str++ - '0';
6309 	}
6310 	if (*str++ != ':')
6311 		return (-1);
6312 	clp->l_blkno = 0;
6313 	while (ISNUM(*str)) {
6314 		clp->l_blkno *= 10;
6315 		clp->l_blkno += *str++ - '0';
6316 	}
6317 	if (*str++ != ':')
6318 		return (-1);
6319 
6320 	/*
6321 	 * If the md_devid_destroy flag is set, ignore the device ids.
6322 	 * This is only to used in a catastrophic failure case.  Examples
6323 	 * would be where the device id of all drives in the system
6324 	 * (especially the mirror'd root drives) had been changed
6325 	 * by firmware upgrade or by a patch to an existing disk
6326 	 * driver.  Another example would be in the case of non-unique
6327 	 * device ids due to a bug.  The device id would be valid on
6328 	 * the system, but would return the wrong dev_t.
6329 	 */
6330 	if (md_devid_destroy) {
6331 		clp->l_devid_flags = 0;
6332 		clp->l_devid = (uint64_t)NULL;
6333 		clp->l_devid_sz = 0;
6334 		clp->l_old_devid = (uint64_t)NULL;
6335 		clp->l_old_devid_sz = 0;
6336 		clp->l_minor_name[0] = '\0';
6337 		return (0);
6338 	}
6339 
6340 	if (ddi_devid_str_decode(str,
6341 	    (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
6342 		return (-1);
6343 
6344 	clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
6345 	clp->l_devid_flags = 0;
6346 	clp->l_old_devid = (uint64_t)NULL;
6347 	clp->l_old_devid_sz = 0;
6348 
6349 	/* If no device id associated with device, just return */
6350 	if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
6351 		clp->l_devid_sz = 0;
6352 		clp->l_minor_name[0] = '\0';
6353 		if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
6354 		    md_keep_repl_state == 0) {
6355 			/*
6356 			 * No devid in md.conf; we're in recovery mode so
6357 			 * lookup the devid for the device as specified by
6358 			 * the devt in md.conf.
6359 			 */
6360 			lookup_db_devid(clp);
6361 		}
6362 		return (0);
6363 	}
6364 
6365 	clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
6366 	    MDDB_DEVID_SZ;
6367 	clp->l_devid_sz = (int)ddi_devid_sizeof(
6368 	    (ddi_devid_t)(uintptr_t)clp->l_devid);
6369 	(void) strcpy(clp->l_minor_name, minor_name);
6370 	kmem_free(minor_name, strlen(minor_name) + 1);
6371 
6372 	return (0);
6373 }
6374 
6375 /*
6376  * grab driver name, minor, and block out of
6377  * strings like "driver:minor:block:devid driver:minor:block:devid ..."
6378  */
6379 static void
parse_db_string(char * str)6380 parse_db_string(
6381 	char		*str
6382 )
6383 {
6384 	char		*p, *e;
6385 	mddb_cfg_loc_t	*cl;
6386 	char		restore_space;
6387 
6388 	/* CSTYLED */
6389 	cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
6390 	for (p = str; (*p != '\0'); ) {
6391 		for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
6392 			;
6393 		if (*p == '\0')
6394 			break;
6395 		for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
6396 			;
6397 		/*
6398 		 * Only give parse_db_loc 1 entry, so stuff a null into
6399 		 * the string if we're not at the end.  We need to save this
6400 		 * char and restore it after call.
6401 		 */
6402 		restore_space = '\0';
6403 		if (*e != '\0') {
6404 			restore_space = *e;
6405 			*e = '\0';
6406 		}
6407 		if (parse_db_loc(p, cl) != 0) {
6408 			cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
6409 		} else {
6410 			(void) ridev(
6411 			    &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
6412 			    cl, NULL, MDDB_F_PTCHED);
6413 			if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
6414 				kmem_free((caddr_t)(uintptr_t)cl->l_devid,
6415 				    cl->l_devid_sz);
6416 			}
6417 		}
6418 		if (restore_space != '\0') {
6419 			*e = restore_space;
6420 		}
6421 		p = e;
6422 	}
6423 	kmem_free(cl, sizeof (mddb_cfg_loc_t));
6424 }
6425 
6426 /*
6427  * grab database locations supplied by md.conf as properties
6428  */
6429 static void
parse_db_strings(void)6430 parse_db_strings(void)
6431 {
6432 	int		bootlist_id;
6433 	int		proplen;
6434 	/*
6435 	 * size of _bootlist_name should match uses of line and entry in
6436 	 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
6437 	 */
6438 	char 		_bootlist_name[MDDB_BOOTLIST_MAX_LEN];
6439 	char		*bootlist_name;
6440 	caddr_t		prop;
6441 
6442 /*
6443  * Step through the bootlist properties one at a time by forming the
6444  * correct name, fetching the property, parsing the property and
6445  * then freeing the memory.  If a property does not exist or returns
6446  * some form of error just ignore it.  There is no guarantee that
6447  * the properties will always exist in sequence, for example
6448  * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
6449  * mddb_bootlist3 existing.
6450  */
6451 	bootlist_name = &_bootlist_name[0];
6452 	for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) {
6453 
6454 		proplen = 0;
6455 		(void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id);
6456 
6457 		if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
6458 		    DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop,
6459 		    &proplen) != DDI_PROP_SUCCESS)
6460 			continue;
6461 
6462 		if (proplen <= 0)
6463 			continue;
6464 
6465 		if (md_init_debug)
6466 			cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop);
6467 
6468 		parse_db_string(prop);
6469 		kmem_free(prop, proplen);
6470 	}
6471 }
6472 
/*
 * Initialize the in-core database state for set 'setno'.
 *
 * Runs with mddb_lock and the set mutex held (taken in that order) and
 * between single_thread_start()/single_thread_end(); all of them are
 * released again on every return path.
 *
 * Returns:
 *	MDDB_E_NOTNOW	md_set[setno].s_db is NULL
 *	0		s_lbp was already set, load_old_replicas()
 *			succeeded, or a new (empty) database was set up
 *			in core
 *	otherwise	load_old_replicas() failed and MDDB_MUSTEXIST was
 *			set in flag: MDDB_E_TAGDATA is passed through,
 *			the load_old_replicas() value is passed through
 *			when mddb_db_err_detail is set, else MDDB_E_NODB
 *
 * flag bits tested here are MDDB_MUSTEXIST and MDDB_MULTINODE; flag is
 * also passed unchanged to load_old_replicas().
 */
static int
initit(
	set_t		setno,
	int		flag
)
{
	int		i;
	mddb_set_t	*s;
	mddb_lb_t	*lbp;		/* pointer to locator block */
	mddb_ln_t	*lnp;		/* pointer to locator names */
	mddb_db_t	*dbp;		/* pointer to directory block */
	mddb_did_blk_t	*did_blkp;	/* pointer to Device ID block */
	mddb_did_ic_t	*did_icp;	/* pointer to Device ID incore area */
	mddb_bf_t	*bfp;
	side_t		sideno;
	side_t		maxsides;
	mddb_block_t	lb_blkcnt;
	int		retval = 0;
	md_dev64_t	dev;
	mddb_mnlb_t	*mnlbp;
	int		devid_flag;

	/* single-thread all loads/unloads of sets */
	mutex_enter(&mddb_lock);
	mutex_enter(SETMUTEX(setno));

	/* nothing to initialize if the set has no mddb_set_t attached */
	if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);
		return (MDDB_E_NOTNOW);
	}

	s = (mddb_set_t *)md_set[setno].s_db;

	single_thread_start(s);

	/*
	 * s_lbp is already set, so the locator block has been set up by
	 * an earlier call.  Nothing more to do here.  Return success.
	 */
	if (s->s_lbp) {
		single_thread_end(s);
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);
		return (0);
	}

	uniqtime32(&s->s_inittime);

	/* grab database locations patched by /etc/system */
	if (setno == MD_LOCAL_SET)
		parse_db_strings();

	/* one master block pointer slot per possible replica copy */
	s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
	    sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);

	s->s_zombie = 0;
	s->s_staledeletes = 0;
	s->s_optcmtcnt = 0;
	s->s_opthavelck = 0;
	s->s_optwantlck = 0;
	s->s_optwaiterr = 0;
	s->s_opthungerr = 0;

	/*
	 * KEEPTAG can never be set for a MN diskset since no tags are
	 * allowed to be stored in a MN diskset.  No way to check
	 * if this is a MN diskset or not at this point since the mddb
	 * hasn't been read in from disk yet.  (flag will only have
	 * MULTINODE bit set if a new set is being created.)
	 */
	if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
		dt_setup(s, NULL);

	md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);

	/*
	 * Allocate mddb_maxbufheaders buffer headers and hand each one
	 * to freebuffer().
	 */
	for (i = 0; i <	mddb_maxbufheaders; i++) {
		bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
		sema_init(&bfp->bf_buf.b_io, 0, NULL,
		    SEMA_DEFAULT, NULL);
		sema_init(&bfp->bf_buf.b_sem, 0, NULL,
		    SEMA_DEFAULT, NULL);
		bfp->bf_buf.b_offset = -1;
		freebuffer(s, bfp);
	}

	retval = load_old_replicas(s, flag);
	/* If 0 return value - success */
	if (! retval) {
		single_thread_end(s);
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);
		return (0);
	}

	/*
	 * If here, then the load_old_replicas() failed
	 */


	/*
	 * If the database was supposed to exist, undo everything that
	 * was set up above (and whatever load_old_replicas() left
	 * behind in s) and return an error.
	 */
	if (flag & MDDB_MUSTEXIST) {
		if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
			/* close and free every master block that was read */
			for (i = 0; i < mddb_maxcopies;	 i++) {
				if (! s->s_mbiarray[i])
					continue;
				dev = md_expldev(
				    s->s_lbp->lb_locators[i].l_dev);
				dev = md_xlate_targ_2_mini(dev);
				if (dev != NODEV64)
					mddb_devclose(dev);

				free_mbipp(&s->s_mbiarray[i]);
			}

			kmem_free((caddr_t)s->s_mbiarray,
			    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
			s->s_mbiarray = NULL;
		}

		/* s_lnp's size comes from s_lbp, so free it before s_lbp */
		if (s->s_lnp != (mddb_ln_t *)NULL) {
			kmem_free((caddr_t)s->s_lnp,
			    dbtob(s->s_lbp->lb_lnblkcnt));
			s->s_lnp = (mddb_ln_t *)NULL;
		}

		/* NOTE(review): s_lbp may be NULL here - confirm callee copes */
		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);

		if (s->s_lbp != (mddb_lb_t *)NULL) {
			kmem_free((caddr_t)s->s_lbp,
			    dbtob(s->s_lbp->lb_blkcnt));
			s->s_lbp = (mddb_lb_t *)NULL;
		}

		/* drain and free the buffer headers allocated above */
		while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
			kmem_free((caddr_t)bfp, sizeof (*bfp));

		single_thread_end(s);
		mutex_exit(SETMUTEX(setno));
		mutex_exit(&mddb_lock);

		if (retval == MDDB_E_TAGDATA)
			return (retval);

		/* Want a bit more detailed error messages */
		if (mddb_db_err_detail)
			return (retval);

		return (MDDB_E_NODB);
	}


	/*
	 * MDDB_MUSTEXIST is not set, so a missing database is
	 * acceptable - Creating a new database, so do
	 * more initialization.
	 */

	/* locator block size depends on the kind of set */
	lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
	    MDDB_LOCAL_LBCNT : MDDB_LBCNT);
	if (flag & MDDB_MULTINODE) {
		lb_blkcnt = MDDB_MNLBCNT;
	}

	if (s->s_lbp == NULL)
		s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
	lbp = s->s_lbp;

	bzero((caddr_t)lbp, dbtob(lb_blkcnt));
	lbp->lb_setno = setno;
	lbp->lb_magic = MDDB_MAGIC_LB;
	if (flag & MDDB_MULTINODE) {
		lbp->lb_revision = MDDB_REV_MNLB;
	} else {
		lbp->lb_revision = MDDB_REV_LB;
	}
	lbp->lb_inittime = s->s_inittime;
	/* mark every side locator slot as unused (minor = NODEV32) */
	if (flag & MDDB_MULTINODE) {
		mnlbp = (mddb_mnlb_t *)lbp;
		for (i = 0; i < MDDB_NLB; i++) {
			for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
				mddb_mnsidelocator_t	*mnslp;
				mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
				mnslp->mnl_mnum = NODEV32;
				mnslp->mnl_sideno = 0;
				mnslp->mnl_drvnm_index = 0;
			}
		}
	} else {
		maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
		for (i = 0; i < MDDB_NLB; i++) {
			for (sideno = 0; sideno < maxsides; sideno++) {
				mddb_sidelocator_t	*slp;
				slp = &lbp->lb_sidelocators[sideno][i];
				slp->l_mnum = NODEV32;
			}
		}
	}
	lbp->lb_blkcnt = lb_blkcnt;

	/* lb starts on block 0 */
	/* locator names starts after locator block */
	lbp->lb_lnfirstblk = lb_blkcnt;
	if (flag & MDDB_MULTINODE) {
		lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
	} else {
		lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
		    MDDB_LOCAL_LNCNT : MDDB_LNCNT);
	}

	if (flag & MDDB_MULTINODE) {
		/* Creating a multinode diskset */
		md_set_setstatus(setno, MD_SET_MNSET);
		lbp->lb_flags |= MDDB_MNSET;
	}

	/* Data portion of mddb located after locator names */
	lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;

	/* the btodb that follows is converting the directory block size */
	/* Data tag part of mddb located after first block of mddb data */
	lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
	    btodb(MDDB_BSIZE));
	/* Data tags are not used in MN diskset - so set count to 0 */
	if (flag & MDDB_MULTINODE)
		lbp->lb_dtblkcnt = (mddb_block_t)0;
	else
		lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;


	/* allocate and stamp an empty locator names area */
	lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
	lnp->ln_magic = MDDB_MAGIC_LN;
	if (flag & MDDB_MULTINODE) {
		lnp->ln_revision = MDDB_REV_MNLN;
	} else {
		lnp->ln_revision = MDDB_REV_LN;
	}
	s->s_lnp = lnp;

	/*
	 * Set up Device ID portion of Locator Block.
	 * Do not set locator to device id style if
	 * md_devid_destroy is 1 and md_keep_repl_state is 1
	 * (destroy all device id data and keep replica in
	 * non device id mode).
	 *
	 * This is logically equivalent to set locator to
	 * device id style if md_devid_destroy is 0 or
	 * md_keep_repl_state is 0.
	 *
	 * In SunCluster environment, device id mode is disabled
	 * which means diskset will be run in non-devid mode.  For
	 * localset, the behavior will remain intact and run in
	 * device id mode.
	 *
	 * In multinode diskset devids are turned off.
	 */
	devid_flag = 1;
	if (cluster_bootflags & CLUSTER_CONFIGURED)
		if (setno != MD_LOCAL_SET)
			devid_flag = 0;
	if (flag & MDDB_MULTINODE)
		devid_flag = 0;
	if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
		devid_flag = 0;
	/*
	 * if we weren't devid style before and md_keep_repl_state=1
	 * we need to stay non-devid
	 *
	 * NOTE(review): lbp was bzero'd above and MDDB_DEVID_STYLE has
	 * not been set since, so the lb_flags test looks always true
	 * at this point - confirm intent.
	 */
	if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
	    (md_keep_repl_state == 1))
		devid_flag = 0;
	if (devid_flag) {
		/* device id blocks are placed after the data tag blocks */
		lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
		    lbp->lb_dtblkcnt;
		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
		lbp->lb_flags |= MDDB_DEVID_STYLE;

		did_icp = (mddb_did_ic_t *)kmem_zalloc
		    (sizeof (mddb_did_ic_t), KM_SLEEP);
		did_blkp = (mddb_did_blk_t *)
		    kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
		did_blkp->blk_magic = MDDB_MAGIC_DI;
		did_blkp->blk_revision = MDDB_REV_DI;
		did_icp->did_ic_blkp = did_blkp;
		s->s_did_icp = did_icp;
	}

	setidentifier(s, &lbp->lb_ident);
	uniqtime32(&lbp->lb_timestamp);
	/* create the first (empty) directory block at lb_dbfirstblk */
	dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
	dbp->db_magic = MDDB_MAGIC_DB;
	dbp->db_revision = MDDB_REV_DB;
	uniqtime32(&dbp->db_timestamp);
	dbp->db_nextblk = 0;
	dbp->db_firstentry = NULL;
	dbp->db_blknum = lbp->lb_dbfirstblk;
	dbp->db_recsum = MDDB_GLOBAL_XOR;
	s->s_dbp = dbp;
	single_thread_end(s);
	mutex_exit(SETMUTEX(setno));
	mutex_exit(&mddb_lock);
	return (0);
}
6775 
6776 mddb_set_t *
mddb_setenter(set_t setno,int flag,int * errorcodep)6777 mddb_setenter(
6778 	set_t		setno,
6779 	int		flag,
6780 	int		*errorcodep
6781 )
6782 {
6783 	mddb_set_t	*s;
6784 	int		err = 0;
6785 	size_t		sz = sizeof (void *) * MD_MAXUNITS;
6786 
6787 	mutex_enter(SETMUTEX(setno));
6788 	if (! md_set[setno].s_db) {
6789 		mutex_exit(SETMUTEX(setno));
6790 		if (errorcodep != NULL)
6791 			*errorcodep = MDDB_E_NOTOWNER;
6792 		return (NULL);
6793 	}
6794 
6795 	/* Allocate s_un and s_ui arrays if not already present. */
6796 	if (md_set[setno].s_un == NULL) {
6797 		md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
6798 		if (md_set[setno].s_un == NULL) {
6799 			mutex_exit(SETMUTEX(setno));
6800 			if (errorcodep != NULL)
6801 				*errorcodep = MDDB_E_NOTOWNER;
6802 			return (NULL);
6803 		}
6804 	}
6805 	if (md_set[setno].s_ui == NULL) {
6806 		md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
6807 		if (md_set[setno].s_ui == NULL) {
6808 			mutex_exit(&md_set[setno].s_dbmx);
6809 			kmem_free(md_set[setno].s_un, sz);
6810 			md_set[setno].s_un = NULL;
6811 			if (errorcodep != NULL)
6812 				*errorcodep = MDDB_E_NOTOWNER;
6813 			return (NULL);
6814 		}
6815 	}
6816 	s = (mddb_set_t *)md_set[setno].s_db;
6817 	if (s->s_lbp)
6818 		return (s);
6819 
6820 	if (flag & MDDB_NOINIT)
6821 		return (s);
6822 
6823 	/*
6824 	 * Release the set mutex - it will be acquired and released in
6825 	 * initit after acquiring the mddb_lock.  This is done to assure
6826 	 * that mutexes are always acquired in the same order to prevent
6827 	 * possible deadlock
6828 	 */
6829 	mutex_exit(SETMUTEX(setno));
6830 
6831 	if ((err = initit(setno, flag)) != 0) {
6832 		if (errorcodep != NULL)
6833 			*errorcodep = err;
6834 		return (NULL);
6835 	}
6836 
6837 	mutex_enter(SETMUTEX(setno));
6838 	return ((mddb_set_t *)md_set[setno].s_db);
6839 }
6840 
6841 /*
6842  * Release the set lock for a given set.
6843  *
6844  * In a MN diskset, this routine may send messages to the rpc.mdcommd
6845  * in order to have the slave nodes re-parse parts of the mddb.
6846  * Messages are only sent if the global ioctl lock is not held.
6847  *
6848  * With the introduction of multi-threaded ioctls, there is no way
6849  * to determine which thread(s) are holding the ioctl lock.  So, if
6850  * the ioctl lock is held (by process X) process X will send the
6851  * messages to the slave nodes when process X releases the ioctl lock.
6852  */
6853 void
mddb_setexit(mddb_set_t * s)6854 mddb_setexit(
6855 	mddb_set_t	*s
6856 )
6857 {
6858 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
6859 	md_mn_kresult_t			*kresult;
6860 	mddb_lb_t			*lbp = s->s_lbp;
6861 	int				i;
6862 	int				rval = 1;
6863 
6864 	/*
6865 	 * If not a MN diskset OR
6866 	 * a MN diskset but this node isn't master,
6867 	 * then release the mutex.
6868 	 */
6869 	if (!(MD_MNSET_SETNO(s->s_setno)) ||
6870 	    ((MD_MNSET_SETNO(s->s_setno)) &&
6871 	    (!md_set[s->s_setno].s_am_i_master))) {
6872 		mutex_exit(SETMUTEX(s->s_setno));
6873 		return;
6874 	}
6875 
6876 	/*
6877 	 * If global ioctl lock is held, then send no messages,
6878 	 * just release mutex and return.
6879 	 *
6880 	 */
6881 	if (md_status & MD_GBL_IOCTL_LOCK) {
6882 		mutex_exit(SETMUTEX(s->s_setno));
6883 		return;
6884 	}
6885 
6886 	/*
6887 	 * This thread is not holding the ioctl lock, so drop the set
6888 	 * lock, send messages to slave nodes to reparse portions
6889 	 * of the mddb and return.
6890 	 *
6891 	 * If the block parse flag is set, do not send parse messages.
6892 	 * This flag is set when master is adding a new mddb that would
6893 	 * cause parse messages to be sent to the slaves, but the slaves
6894 	 * don't have knowledge of the new mddb yet since the mddb add
6895 	 * operation hasn't been run on the slave nodes yet.  When the
6896 	 * master unblocks the parse flag, the parse messages will be
6897 	 * generated.
6898 	 *
6899 	 * If s_mn_parseflags_sending is non-zero, then another thread
6900 	 * is already currently sending a parse message, so just release
6901 	 * the mutex and return.  If an mddb change occurred that results
6902 	 * in a parse message to be generated, the thread that is currently
6903 	 * sending a parse message would generate the additional parse message.
6904 	 *
6905 	 * If s_mn_parseflags_sending is zero and parsing is not blocked,
6906 	 * then loop until s_mn_parseflags is 0 (until there are no more
6907 	 * messages to send).
6908 	 * While s_mn_parseflags is non-zero,
6909 	 * 	put snapshot of parse_flags in s_mn_parseflags_sending
6910 	 * 	set s_mn_parseflags to zero
6911 	 *	release mutex
6912 	 *	send message
6913 	 *	re-grab mutex
6914 	 *	set s_mn_parseflags_sending to zero
6915 	 */
6916 	mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
6917 	while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
6918 	    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
6919 	    (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
6920 		/* Grab snapshot of parse flags */
6921 		s->s_mn_parseflags_sending = s->s_mn_parseflags;
6922 		s->s_mn_parseflags = 0;
6923 
6924 		mutex_exit(SETMUTEX(s->s_setno));
6925 
6926 		/*
6927 		 * Send the message to the slaves to re-parse
6928 		 * the indicated portions of the mddb. Send the status
6929 		 * of the 50 mddbs in this set so that slaves know which
6930 		 * mddbs that the master node thinks are 'good'.
6931 		 * Otherwise, slave may reparse, but from wrong replica.
6932 		 */
6933 		mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
6934 		for (i = 0; i < MDDB_NLB; i++) {
6935 			mddb_parse_msg->msg_lb_flags[i] =
6936 			    lbp->lb_locators[i].l_flags;
6937 		}
6938 		kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
6939 		while (rval != 0) {
6940 			rval = mdmn_ksend_message(s->s_setno,
6941 			    MD_MN_MSG_MDDB_PARSE, 0, 0,
6942 			    (char *)mddb_parse_msg,
6943 			    sizeof (md_mn_msg_mddb_parse_t), kresult);
6944 			if (rval != 0)
6945 				cmn_err(CE_WARN, "mddb_setexit: Unable to send "
6946 				    "mddb update message to other nodes in "
6947 				    "diskset %s\n", s->s_setname);
6948 		}
6949 		kmem_free(kresult, sizeof (md_mn_kresult_t));
6950 
6951 		/*
6952 		 * Re-grab mutex to clear sending field and to
6953 		 * see if another parse message needs to be generated.
6954 		 */
6955 		mutex_enter(SETMUTEX(s->s_setno));
6956 		s->s_mn_parseflags_sending = 0;
6957 	}
6958 	kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
6959 	mutex_exit(SETMUTEX(s->s_setno));
6960 }
6961 
6962 static void
mddb_setexit_no_parse(mddb_set_t * s)6963 mddb_setexit_no_parse(
6964 	mddb_set_t	*s
6965 )
6966 {
6967 	mutex_exit(SETMUTEX(s->s_setno));
6968 }
6969 
6970 uint_t
mddb_lb_did_convert(mddb_set_t * s,uint_t doit,uint_t * blk_cnt)6971 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
6972 {
6973 	uint_t			li;
6974 	mddb_lb_t		*lbp = s->s_lbp;
6975 	mddb_locator_t		*lp;
6976 	ddi_devid_t		ret_devid;
6977 	uint_t			devid_len;
6978 	dev_t			ddi_dev;
6979 	mddb_did_ic_t		*did_icp;
6980 	mddb_did_blk_t		*did_blkp;
6981 	char			*minor_name;
6982 	size_t			sz;
6983 	int			retval;
6984 	int			err;
6985 	md_dev64_t		dev64; /* tmp var to make code look better */
6986 
6987 
6988 	/* Need disk block(s) to hold mddb_did_blk_t */
6989 	*blk_cnt = MDDB_DID_BLOCKS;
6990 
6991 	if (doit) {
6992 		/*
6993 		 * Alloc mddb_did_blk_t disk block and fill in header area.
6994 		 * Don't fill in did magic number until end of routine so
6995 		 * if machine panics in the middle of conversion, the
6996 		 * device id information will be thrown away at the
6997 		 * next snarfing of this set.
6998 		 * Need to set DEVID_STYLE so that mddb_devid_add will
6999 		 * function properly.
7000 		 */
7001 		/* grab the mutex */
7002 		if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) {
7003 			return (1);
7004 		}
7005 		single_thread_start(s);
7006 		lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
7007 		if (lbp->lb_didfirstblk == 0) {
7008 			single_thread_end(s);
7009 			mddb_setexit(s);
7010 			return (1);
7011 		}
7012 		lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
7013 		did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
7014 		    KM_SLEEP);
7015 		did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
7016 		    KM_SLEEP);
7017 
7018 		did_blkp->blk_revision = MDDB_REV_DI;
7019 		did_icp->did_ic_blkp = did_blkp;
7020 		s->s_did_icp = did_icp;
7021 		lbp->lb_flags |= MDDB_DEVID_STYLE;
7022 	}
7023 
7024 	/* Fill in information in mddb_did_info_t array */
7025 	for (li = 0; li < lbp->lb_loccnt; li++) {
7026 		lp = &lbp->lb_locators[li];
7027 		if (lp->l_flags & MDDB_F_DELETED)
7028 			continue;
7029 
7030 		dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
7031 		ddi_dev = md_dev64_to_dev(dev64);
7032 		if (ddi_dev == NODEV) {
7033 			/*
7034 			 * No translation available for replica.
7035 			 * Could fail conversion to device id replica,
7036 			 * but instead will just continue with next
7037 			 * replica in list.
7038 			 */
7039 			continue;
7040 		}
7041 		if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
7042 			/*
7043 			 * Just count each devid as at least 1 block.  This
7044 			 * is conservative since several device id's may fit
7045 			 * into 1 disk block, but it's better to overestimate
7046 			 * the number of blocks needed than to underestimate.
7047 			 */
7048 			devid_len = (int)ddi_devid_sizeof(ret_devid);
7049 			*blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
7050 			if (doit) {
7051 				if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
7052 				    &minor_name) == DDI_SUCCESS) {
7053 					if (mddb_devid_add(s, li, ret_devid,
7054 					    minor_name)) {
7055 						cmn_err(CE_WARN,
7056 						    "Not enough space in metadb"
7057 						    " to add device id for"
7058 						    "  dev: major = %d, "
7059 						    "minor = %d\n",
7060 						    getmajor(ddi_dev),
7061 						    getminor(ddi_dev));
7062 					}
7063 					sz = strlen(minor_name) + 1;
7064 					kmem_free(minor_name, sz);
7065 				}
7066 			}
7067 			ddi_devid_free(ret_devid);
7068 		}
7069 	}
7070 
7071 	if (doit) {
7072 		did_blkp->blk_magic = MDDB_MAGIC_DI;
7073 		retval = push_lb(s);
7074 		(void) upd_med(s, "mddb_lb_did_convert(0)");
7075 		single_thread_end(s);
7076 		mddb_setexit(s);
7077 		if (retval != 0)
7078 			return (1);
7079 	}
7080 
7081 	return (0);
7082 }
7083 
7084 static mddb_set_t *
init_set(mddb_config_t * cp,int flag,int * errp)7085 init_set(
7086 	mddb_config_t	*cp,
7087 	int		flag,
7088 	int		*errp
7089 )
7090 {
7091 	mddb_set_t	*s;
7092 	char		*setname = NULL;
7093 	set_t		setno = MD_LOCAL_SET;
7094 	side_t		sideno = 0;
7095 	struct timeval32 *created = NULL;
7096 
7097 	if (cp != NULL) {
7098 		setname = cp->c_setname;
7099 		setno = cp->c_setno;
7100 		sideno = cp->c_sideno;
7101 		created = &cp->c_timestamp;
7102 	}
7103 
7104 	if (setno >= MD_MAXSETS)
7105 		return ((mddb_set_t *)NULL);
7106 
7107 	if (md_set[setno].s_db)
7108 		return (mddb_setenter(setno, flag, errp));
7109 
7110 	s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);
7111 
7112 	cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
7113 	cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
7114 	cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
7115 	cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
7116 	cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);
7117 
7118 	s->s_setno = setno;
7119 	s->s_sideno = sideno;
7120 	if (setno == MD_LOCAL_SET) {
7121 		(void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial),
7122 		    "%u", zone_get_hostid(NULL));
7123 	} else {
7124 		s->s_ident.createtime = *created;
7125 		s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
7126 		    KM_SLEEP);
7127 		(void) strcpy(s->s_setname, setname);
7128 	}
7129 
7130 	/* have a config struct,  copy mediator information */
7131 	if (cp != NULL)
7132 		s->s_med = cp->c_med;		/* structure assignment */
7133 
7134 	md_set[setno].s_db = (void *) s;
7135 
7136 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);
7137 
7138 	return (mddb_setenter(setno, flag, errp));
7139 }
7140 
7141 void
mddb_unload_set(set_t setno)7142 mddb_unload_set(
7143 	set_t		setno
7144 )
7145 {
7146 
7147 	mddb_set_t	*s;
7148 	mddb_db_t	*dbp, *adbp = NULL;
7149 	mddb_de_ic_t	*dep, *dep2;
7150 	mddb_bf_t	*bfp;
7151 	int		i;
7152 	md_dev64_t	dev;
7153 
7154 	if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
7155 		return;
7156 
7157 	single_thread_start(s);
7158 
7159 	s->s_opthavequeuinglck = 0;
7160 	s->s_optwantqueuinglck = 0;
7161 
7162 	for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
7163 		for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
7164 			if (dep->de_rb_userdata != NULL) {
7165 				if (dep->de_icreqsize)
7166 					kmem_free(dep->de_rb_userdata_ic,
7167 					    dep->de_icreqsize);
7168 				else
7169 					kmem_free(dep->de_rb_userdata,
7170 					    dep->de_reqsize);
7171 			}
7172 			kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
7173 			dep2 = dep->de_next;
7174 			kmem_free((caddr_t)dep, sizeofde(dep));
7175 		}
7176 		adbp = dbp->db_next;
7177 		kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
7178 	}
7179 	s->s_dbp = (mddb_db_t *)NULL;
7180 
7181 	free_rip(&s->s_rip);
7182 
7183 	for (i = 0; i < mddb_maxcopies;	 i++) {
7184 		if (! s->s_mbiarray)
7185 			break;
7186 
7187 		if (! s->s_mbiarray[i])
7188 			continue;
7189 
7190 		dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
7191 		dev = md_xlate_targ_2_mini(dev);
7192 		if (dev != NODEV64)
7193 			mddb_devclose(dev);
7194 
7195 		free_mbipp(&s->s_mbiarray[i]);
7196 	}
7197 
7198 	if (s->s_mbiarray) {
7199 		kmem_free((caddr_t)s->s_mbiarray,
7200 		    sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
7201 		s->s_mbiarray = (mddb_mb_ic_t **)NULL;
7202 	}
7203 
7204 	if (s->s_lnp) {
7205 		kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
7206 		s->s_lnp = (mddb_ln_t *)NULL;
7207 	}
7208 
7209 	if (s->s_lbp) {
7210 		mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
7211 		kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
7212 		s->s_lbp = (mddb_lb_t *)NULL;
7213 	}
7214 
7215 	if (s->s_freebitmap) {
7216 		kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
7217 		s->s_freebitmap = NULL;
7218 		s->s_freebitmapsize = 0;
7219 	}
7220 
7221 	while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
7222 		kmem_free((caddr_t)bfp, sizeof (*bfp));
7223 
7224 	if (s->s_databuffer_size) {
7225 		kmem_free(s->s_databuffer, s->s_databuffer_size);
7226 		s->s_databuffer_size = 0;
7227 	}
7228 
7229 	if (s->s_setname != NULL)
7230 		kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);
7231 
7232 	/* Data tags not supported on MN sets. */
7233 	if (!(md_get_setstatus(setno) & MD_SET_MNSET))
7234 		dtl_freel(&s->s_dtlp);
7235 
7236 	md_set[setno].s_db = NULL;
7237 	ASSERT(s->s_singlelockwanted == 0);
7238 	kmem_free(s, sizeof (mddb_set_t));
7239 
7240 	/* Take care of things setup in the md_set array */
7241 	if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
7242 		if (md_set[setno].s_dtp) {
7243 			kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
7244 			md_set[setno].s_dtp = NULL;
7245 		}
7246 	}
7247 
7248 	md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
7249 	    MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
7250 	    MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
7251 	    MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
7252 	    MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
7253 
7254 	mutex_exit(SETMUTEX(setno));
7255 }
7256 
7257 /*
7258  * returns 0 if name can be put into locator block
7259  * returns 1 if locator block prefixes are all used
7260  *
7261  * Takes splitname (suffix, prefix, sideno) and
7262  * stores it in the locator name structure.
7263  * For traditional diskset, the sideno is the index into the suffixes
7264  * array in the locator name structure.
7265  * For the MN diskset, the sideno is the nodeid which can be any number,
7266  * so the index passed in is the index into the mnsuffixes array
7267  * in the locator structure.  This index was computed by the
7268  * routine checklocator which basically checked the locator block
7269  * mnside locator structure.
7270  */
7271 static int
splitname2locatorblock(md_splitname * spn,mddb_ln_t * lnp,int li,side_t sideno,int index)7272 splitname2locatorblock(
7273 	md_splitname	*spn,
7274 	mddb_ln_t	*lnp,
7275 	int		li,
7276 	side_t		sideno,
7277 	int		index
7278 )
7279 {
7280 	uchar_t			i;
7281 	md_name_suffix		*sn;
7282 	md_mnname_suffix_t	*mnsn;
7283 	mddb_mnln_t		*mnlnp;
7284 
7285 	for (i = 0; i < MDDB_PREFIXCNT; i++) {
7286 		if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len)
7287 			continue;
7288 		if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data,
7289 		    SPN_PREFIX(spn).pre_len) == 0)
7290 			break;
7291 	}
7292 	if (i == MDDB_PREFIXCNT) {
7293 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7294 			if (lnp->ln_prefixes[i].pre_len == 0)
7295 				break;
7296 		}
7297 		if (i == MDDB_PREFIXCNT)
7298 			return (1);
7299 		bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data,
7300 		    SPN_PREFIX(spn).pre_len);
7301 		lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len;
7302 	}
7303 
7304 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7305 		/* If a MN diskset, use index */
7306 		mnlnp = (mddb_mnln_t *)lnp;
7307 		mnsn = &mnlnp->ln_mnsuffixes[index][li];
7308 		mnsn->mn_ln_sideno = sideno;
7309 		mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
7310 		mnsn->mn_ln_suffix.suf_prefix = i;
7311 		bcopy(SPN_SUFFIX(spn).suf_data,
7312 		    mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
7313 	} else {
7314 		sn = &lnp->ln_suffixes[sideno][li];
7315 		sn->suf_len = SPN_SUFFIX(spn).suf_len;
7316 		sn->suf_prefix = i;
7317 		bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
7318 		    SPN_SUFFIX(spn).suf_len);
7319 	}
7320 	return (0);
7321 }
7322 
7323 /*
7324  * Find the locator name for the given sideno and convert the locator name
7325  * information into a splitname structure.
7326  */
7327 void
mddb_locatorblock2splitname(mddb_ln_t * lnp,int li,side_t sideno,md_splitname * spn)7328 mddb_locatorblock2splitname(
7329 	mddb_ln_t	*lnp,
7330 	int		li,
7331 	side_t		sideno,
7332 	md_splitname	*spn
7333 )
7334 {
7335 	int			iprefix;
7336 	md_name_suffix		*sn;
7337 	md_mnname_suffix_t	*mnsn;
7338 	int			i;
7339 	mddb_mnln_t		*mnlnp;
7340 
7341 	if (lnp->ln_revision == MDDB_REV_MNLN) {
7342 		mnlnp = (mddb_mnln_t *)lnp;
7343 		for (i = 0; i < MD_MNMAXSIDES; i++) {
7344 			mnsn = &mnlnp->ln_mnsuffixes[i][li];
7345 			if (mnsn->mn_ln_sideno == sideno)
7346 				break;
7347 		}
7348 		if (i == MD_MNMAXSIDES)
7349 			return;
7350 
7351 		SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
7352 		bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
7353 		    SPN_SUFFIX(spn).suf_len);
7354 		iprefix = mnsn->mn_ln_suffix.suf_prefix;
7355 	} else {
7356 		sn = &lnp->ln_suffixes[sideno][li];
7357 		SPN_SUFFIX(spn).suf_len = sn->suf_len;
7358 		bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
7359 		    SPN_SUFFIX(spn).suf_len);
7360 		iprefix = sn->suf_prefix;
7361 	}
7362 	SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
7363 	bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
7364 	    SPN_PREFIX(spn).pre_len);
7365 }
7366 
7367 static int
getdeldev(mddb_config_t * cp,int command,md_error_t * ep)7368 getdeldev(
7369 	mddb_config_t	*cp,
7370 	int		command,
7371 	md_error_t	*ep
7372 )
7373 {
7374 	mddb_set_t	*s;
7375 	mddb_lb_t	*lbp;
7376 	mddb_locator_t	*locators;
7377 	uint_t		loccnt;
7378 	mddb_mb_ic_t	*mbip;
7379 	mddb_block_t	blk;
7380 	int		err = 0;
7381 	int		i, j;
7382 	int		li;
7383 	uint_t		commitcnt;
7384 	set_t		setno = cp->c_setno;
7385 	uint_t		set_status;
7386 	md_dev64_t	dev;
7387 	int		flags = MDDB_MUSTEXIST;
7388 	mddb_ri_t	*rip;
7389 
7390 	cp->c_dbmax = MDDB_NLB;
7391 
7392 	/*
7393 	 * Data checking
7394 	 */
7395 	if (setno >= md_nsets || cp->c_id < 0 ||
7396 	    cp->c_id > cp->c_dbmax) {
7397 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7398 	}
7399 
7400 	if (cp->c_flags & MDDB_C_STALE)
7401 		flags |= MDDB_MN_STALE;
7402 
7403 	if ((s = mddb_setenter(setno, flags, &err)) == NULL)
7404 		return (mddbstatus2error(ep, err, NODEV32, setno));
7405 
7406 	cp->c_flags = 0;
7407 
7408 	lbp = s->s_lbp;
7409 	loccnt = lbp->lb_loccnt;
7410 	locators = lbp->lb_locators;
7411 
7412 	/* shorthand */
7413 	set_status = md_get_setstatus(setno);
7414 
7415 	if (set_status & MD_SET_STALE)
7416 		cp->c_flags |= MDDB_C_STALE;
7417 
7418 	if (set_status & MD_SET_TOOFEW)
7419 		cp->c_flags |= MDDB_C_TOOFEW;
7420 
7421 	cp->c_sideno = s->s_sideno;
7422 
7423 	cp->c_dbcnt = 0;
7424 	/*
7425 	 * go through and count active entries
7426 	 */
7427 	for (i = 0; i < loccnt;	 i++) {
7428 		if (locators[i].l_flags & MDDB_F_DELETED)
7429 			continue;
7430 		cp->c_dbcnt++;
7431 	}
7432 
7433 	/*
7434 	 * add the ability to accept a locator block index
7435 	 * which is not relative to previously deleted replicas.  This
7436 	 * is for support of MD_DEBUG=STAT in metastat since it asks for
7437 	 * replica information specifically for each of the mirror resync
7438 	 * records.  MDDB_CONFIG_SUBCMD uses one of the pad spares in
7439 	 * the mddb_config_t type.
7440 	 */
7441 	if (cp->c_subcmd == MDDB_CONFIG_ABS) {
7442 		if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
7443 			mddb_setexit(s);
7444 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7445 			    setno));
7446 		}
7447 		li = cp->c_id;
7448 	} else {
7449 		if (cp->c_id >= cp->c_dbcnt) {
7450 			mddb_setexit(s);
7451 			return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7452 			    setno));
7453 		}
7454 
7455 		/* CSTYLED */
7456 		for (li = 0, j = 0; /* void */; li++) {
7457 			if (locators[li].l_flags & MDDB_F_DELETED)
7458 				continue;
7459 			j++;
7460 			if (j > cp->c_id)
7461 				break;
7462 		}
7463 	}
7464 
7465 	if (command == MDDB_ENDDEV) {
7466 		daddr_t ib = 0, jb;
7467 
7468 		blk = 0;
7469 		if ((s != NULL) && s->s_mbiarray[li]) {
7470 			mbip = s->s_mbiarray[li];
7471 			while ((jb = getphysblk(blk++, mbip)) > 0) {
7472 				if (jb > ib)
7473 					ib = jb;
7474 			}
7475 			cp->c_dbend = (int)ib;
7476 		} else {
7477 			cp->c_dbend = 0;
7478 		}
7479 	}
7480 
7481 	locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
7482 	mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);
7483 
7484 	if (command != MDDB_DELDEV) {
7485 		mddb_setexit(s);
7486 		return (0);
7487 	}
7488 
7489 	/* Currently don't allow addition/deletion of sides during upgrade */
7490 	if (MD_UPGRADE) {
7491 		cmn_err(CE_WARN,
7492 		    "Deletion of replica not allowed during upgrade.\n");
7493 		mddb_setexit(s);
7494 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7495 	}
7496 
7497 	/*
7498 	 * If here, replica delete in progress.
7499 	 */
7500 	single_thread_start(s);
7501 
7502 	if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
7503 	    (locators[li].l_flags & MDDB_F_ACTIVE)) {
7504 		commitcnt = lbp->lb_commitcnt;
7505 		lbp->lb_commitcnt = 0;
7506 		setidentifier(s, &lbp->lb_ident);
7507 		crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
7508 		/*
7509 		 * Don't need to write out device id area, since locator
7510 		 * block on this replica is being deleted by setting the
7511 		 * commitcnt to 0.
7512 		 */
7513 		(void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
7514 		    MDDB_WR_ONLY_MASTER);
7515 		lbp->lb_commitcnt = commitcnt;
7516 	}
7517 
7518 	if (s->s_mbiarray[li]) {
7519 		/* A freed mbi pointer still exists in the mddb_ri_t */
7520 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
7521 			if (rip->ri_mbip == s->s_mbiarray[li])
7522 				rip->ri_mbip = NULL;
7523 		}
7524 		free_mbipp(&s->s_mbiarray[li]);
7525 	}
7526 
7527 	if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
7528 		dev = md_expldev(locators[li].l_dev);
7529 		dev = md_xlate_targ_2_mini(dev);
7530 		if (dev != NODEV64)
7531 			mddb_devclose(dev);
7532 	}
7533 
7534 	s->s_mbiarray[li] = 0;
7535 	lbp->lb_locators[li].l_flags = MDDB_F_DELETED;
7536 
7537 	/* Only support data tags for traditional and local sets */
7538 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
7539 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
7540 	    setno != MD_LOCAL_SET)
7541 		if (set_dtag(s, ep))
7542 			mdclrerror(ep);
7543 
7544 	/* Write data tags to all accessible devices */
7545 	/* Only support data tags for traditional and local sets */
7546 	if (!(lbp->lb_flags & MDDB_MNSET)) {
7547 		(void) dt_write(s);
7548 	}
7549 
7550 	/* Delete device id of deleted replica */
7551 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7552 		(void) mddb_devid_delete(s, li);
7553 	}
7554 	/* write new locator to all devices */
7555 	err = writelocall(s);
7556 
7557 	(void) upd_med(s, "getdeldev(0)");
7558 
7559 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
7560 	    md_expldev(locators[li].l_dev));
7561 
7562 	computefreeblks(s); /* recompute always it may be larger */
7563 	cp->c_dbcnt--;
7564 	err |= fixoptrecords(s);
7565 	if (err) {
7566 		if (writeretry(s)) {
7567 			single_thread_end(s);
7568 			mddb_setexit(s);
7569 			return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno));
7570 		}
7571 	}
7572 
7573 	single_thread_end(s);
7574 	mddb_setexit(s);
7575 	return (0);
7576 }
7577 
7578 static int
getdriver(mddb_cfg_loc_t * clp)7579 getdriver(
7580 	mddb_cfg_loc_t	*clp
7581 )
7582 {
7583 	major_t		majordev;
7584 
7585 	/*
7586 	 * Data checking
7587 	 */
7588 	if (clp->l_dev <= 0)
7589 		return (EINVAL);
7590 
7591 	majordev = getmajor(expldev(clp->l_dev));
7592 
7593 	if (ddi_major_to_name(majordev) == (char *)NULL)
7594 		return (EINVAL);
7595 
7596 	if (MD_UPGRADE)
7597 		(void) strcpy(clp->l_driver, md_targ_major_to_name(majordev));
7598 	else
7599 		(void) strcpy(clp->l_driver, ddi_major_to_name(majordev));
7600 	return (0);
7601 }
7602 
7603 /*
7604  * update_valid_replica - updates the locator block namespace (prefix
7605  * 	and/or suffix) with new pathname and devname.
7606  *	RETURN
7607  *		1	Error
7608  *		0	Success
7609  */
7610 static int
update_valid_replica(side_t side,mddb_locator_t * lp,mddb_set_t * s,int li,char * devname,char * pathname,md_dev64_t devt)7611 update_valid_replica(
7612 	side_t		side,
7613 	mddb_locator_t	*lp,
7614 	mddb_set_t	*s,
7615 	int		li,
7616 	char		*devname,
7617 	char		*pathname,
7618 	md_dev64_t	devt
7619 )
7620 {
7621 	uchar_t		pre_len, suf_len;
7622 	md_name_suffix	*sn;
7623 	mddb_ln_t	*lnp;
7624 	uchar_t		pre_index;
7625 	uchar_t		i;
7626 
7627 	if (md_expldev(lp->l_dev) != devt) {
7628 		return (0);
7629 	}
7630 
7631 	if (pathname[strlen(pathname) - 1] == '/')
7632 		pathname[strlen(pathname) - 1] = '\0';
7633 
7634 	pre_len = (uchar_t)strlen(pathname);
7635 	suf_len = (uchar_t)strlen(devname);
7636 
7637 	if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX))
7638 		return (1);
7639 
7640 	lnp = s->s_lnp;
7641 
7642 	/*
7643 	 * Future note:  Need to do something here for the MN diskset case
7644 	 * when device ids are supported in disksets.
7645 	 * Can't add until merging devids_in_diskset code into code base
7646 	 * Currently only called with side of 0.
7647 	 */
7648 
7649 	sn = &lnp->ln_suffixes[side][li];
7650 
7651 	/*
7652 	 * Check if prefix (Ex: /dev/dsk) needs to be changed.
7653 	 * If new prefix is the same as the previous prefix - no change.
7654 	 *
7655 	 * If new prefix is not the same, check if new prefix
7656 	 * matches an existing one.  If so, use that one.
7657 	 *
7658 	 * If new prefix doesn't exist, add a new prefix.  If not enough
7659 	 * space, return failure.
7660 	 */
7661 	pre_index = sn->suf_prefix;
7662 	/* Check if new prefix is the same as the old prefix. */
7663 	if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
7664 	    (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
7665 	    pre_len) != 0)) {
7666 		/* Check if new prefix is an already known prefix. */
7667 		for (i = 0; i < MDDB_PREFIXCNT; i++) {
7668 			if (lnp->ln_prefixes[i].pre_len != pre_len) {
7669 				continue;
7670 			}
7671 			if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
7672 			    pre_len) == 0) {
7673 				break;
7674 			}
7675 		}
7676 		/* If no match found for new prefix - add the new prefix */
7677 		if (i == MDDB_PREFIXCNT) {
7678 			for (i = 0; i < MDDB_PREFIXCNT; i++) {
7679 				if (lnp->ln_prefixes[i].pre_len == 0)
7680 					break;
7681 			}
7682 			/* No space to add new prefix - return failure */
7683 			if (i == MDDB_PREFIXCNT) {
7684 				return (1);
7685 			}
7686 			bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
7687 			lnp->ln_prefixes[i].pre_len = pre_len;
7688 		}
7689 		sn->suf_prefix = i;
7690 	}
7691 
7692 	/* Now, update the suffix (Ex: c0t0d0s0) if needed */
7693 	if ((sn->suf_len != suf_len) ||
7694 	    (bcmp(sn->suf_data, devname, suf_len) != 0)) {
7695 		bcopy(devname, sn->suf_data, suf_len);
7696 		sn->suf_len = suf_len;
7697 	}
7698 	return (0);
7699 }
7700 
7701 
7702 /*
7703  * md_update_locator_namespace - If in devid style and active and the devid's
7704  *		exist and are valid update the locator namespace pathname
7705  *		and devname.
7706  *	RETURN
7707  *		1	Error
7708  *		0	Success
7709  */
7710 int
md_update_locator_namespace(set_t setno,side_t side,char * dname,char * pname,md_dev64_t devt)7711 md_update_locator_namespace(
7712 	set_t		setno,		/* which set to get name from */
7713 	side_t		side,
7714 	char		*dname,
7715 	char		*pname,
7716 	md_dev64_t	devt
7717 )
7718 {
7719 	mddb_set_t	*s;
7720 	mddb_lb_t	*lbp;
7721 	int		li;
7722 	uint_t		flg;
7723 	int		err = 0;
7724 	mddb_ln_t	*lnp;
7725 
7726 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7727 		return (1);
7728 	single_thread_start(s);
7729 	lbp = s->s_lbp;
7730 	/* must be DEVID_STYLE */
7731 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7732 		for (li = 0; li < lbp->lb_loccnt; li++) {
7733 			mddb_locator_t *lp = &lbp->lb_locators[li];
7734 
7735 			if (lp->l_flags & MDDB_F_DELETED) {
7736 				continue;
7737 			}
7738 
7739 			/* replica also must be active */
7740 			if (lp->l_flags & MDDB_F_ACTIVE) {
7741 				flg = s->s_did_icp->did_ic_blkp->
7742 				    blk_info[li].info_flags;
7743 				/* only update if did exists and is valid */
7744 				if ((flg & MDDB_DID_EXISTS) &&
7745 				    (flg & MDDB_DID_VALID)) {
7746 					if (update_valid_replica(side, lp, s,
7747 					    li, dname, pname, devt)) {
7748 						err = 1;
7749 						goto out;
7750 					}
7751 				}
7752 			}
7753 		}
7754 	}
7755 	lnp = s->s_lnp;
7756 	uniqtime32(&lnp->ln_timestamp);
7757 	if (lbp->lb_flags & MDDB_MNSET)
7758 		lnp->ln_revision = MDDB_REV_MNLN;
7759 	else
7760 		lnp->ln_revision = MDDB_REV_LN;
7761 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
7762 	err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
7763 	    lbp->lb_lnblkcnt, 0);
7764 	/*
7765 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
7766 	 * flag in the mddb_set structure to show that the locator
7767 	 * names have changed.
7768 	 */
7769 
7770 	if ((lbp->lb_flags & MDDB_MNSET) &&
7771 	    (md_set[s->s_setno].s_am_i_master)) {
7772 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
7773 	}
7774 out:
7775 	single_thread_end(s);
7776 	mddb_setexit(s);
7777 	if (err)
7778 		return (1);
7779 	return (0);
7780 }
7781 
7782 /*
7783  * update_locatorblock - for active entries in the locator block, check
7784  *		the devt to see if it matches the given devt. If so, and
7785  *		there is an associated device id which is not the same
7786  *		as the passed in devid, delete old devid and add a new one.
7787  *
7788  *		During import of replicated disksets, old_didptr contains
7789  *		the original disk's device id.  Use this device id in
7790  *		addition to the devt to determine if an entry is a match
7791  *		and should be updated with the new device id of the
7792  *		replicated disk.  Specifically, this is the case being handled:
7793  *
7794  *		Original_disk	Replicated_disk	Disk_Available_During_Import
7795  *		c1t1d0		c1t3d0		no - so old name c1t1d0 shown
7796  *		c1t2d0		c1t1d0		yes - name is c1t1d0
7797  *		c1t3d0		c1t2d0		yes - name is c1t2d0
7798  *
7799  *		Can't just match on devt since devt for the first and third
7800  *		disks will be the same, but the original disk's device id
7801  *		is known and can be used to distinguish which disk's
7802  *		replicated device id should be updated.
7803  *	RETURN
7804  *		MDDB_E_NODEVID
7805  *		MDDB_E_NOLOCBLK
7806  *		1	Error
7807  *		0	Success
7808  */
7809 static int
update_locatorblock(mddb_set_t * s,md_dev64_t dev,ddi_devid_t didptr,ddi_devid_t old_didptr)7810 update_locatorblock(
7811 	mddb_set_t	*s,
7812 	md_dev64_t	dev,
7813 	ddi_devid_t	didptr,
7814 	ddi_devid_t	old_didptr
7815 )
7816 {
7817 	mddb_lb_t	*lbp = NULL;
7818 	mddb_locator_t	*lp;
7819 	int		li;
7820 	uint_t		flg;
7821 	ddi_devid_t	devid_ptr;
7822 	int		retval = 0;
7823 	char		*minor_name;
7824 	int		repl_import_flag;
7825 
7826 	/* Set replicated flag if this is a replicated import */
7827 	repl_import_flag = md_get_setstatus(s->s_setno) &
7828 	    MD_SET_REPLICATED_IMPORT;
7829 
7830 	lbp = s->s_lbp;
7831 	/* find replicas that haven't been deleted */
7832 	for (li = 0; li < lbp->lb_loccnt; li++) {
7833 		lp = &lbp->lb_locators[li];
7834 
7835 		if ((lp->l_flags & MDDB_F_DELETED)) {
7836 			continue;
7837 		}
7838 		/*
7839 		 * check to see if locator devt matches given dev
7840 		 * and if there is a device ID associated with it
7841 		 */
7842 		flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags;
7843 		if ((md_expldev(lp->l_dev) == dev) &&
7844 		    (flg & MDDB_DID_EXISTS)) {
7845 			if (flg & MDDB_DID_VALID) {
7846 				continue; /* cont to nxt active entry */
7847 			}
7848 			devid_ptr = s->s_did_icp->did_ic_devid[li];
7849 			if (devid_ptr == NULL) {
7850 				return (MDDB_E_NODEVID);
7851 			}
7852 
7853 			/*
7854 			 * During a replicated import the old_didptr
7855 			 * must match the current devid before the
7856 			 * devid can be updated.
7857 			 */
7858 			if (repl_import_flag) {
7859 				if (ddi_devid_compare(devid_ptr,
7860 				    old_didptr) != 0)
7861 					continue;
7862 			}
7863 
7864 			if (ddi_devid_compare(devid_ptr, didptr) != 0) {
7865 				/*
7866 				 * devid's not equal so
7867 				 * delete and add
7868 				 */
7869 				if (ddi_lyr_get_minor_name(
7870 				    md_dev64_to_dev(dev),
7871 				    S_IFBLK, &minor_name) == DDI_SUCCESS) {
7872 					(void) mddb_devid_delete(s, li);
7873 					(void) mddb_devid_add(s, li, didptr,
7874 					    minor_name);
7875 					kmem_free(minor_name,
7876 					    strlen(minor_name)+1);
7877 					break;
7878 				} else {
7879 					retval = 1;
7880 					goto err_out;
7881 				}
7882 			}
7883 		}
7884 	} /* end for */
7885 	retval = push_lb(s);
7886 	(void) upd_med(s, "update_locatorblock(0)");
7887 err_out:
7888 	return (retval);
7889 }
7890 
7891 static int
update_mb_devid(mddb_set_t * s,mddb_ri_t * rip,ddi_devid_t devidptr)7892 update_mb_devid(
7893 	mddb_set_t	*s,
7894 	mddb_ri_t	*rip,
7895 	ddi_devid_t	devidptr
7896 )
7897 {
7898 	mddb_mb_ic_t	*mbip;
7899 	mddb_mb_t	*mb = NULL;
7900 	daddr_t		blkno;
7901 	md_dev64_t	device;
7902 	uint_t		sz;
7903 	int		mb2free = 0;
7904 	int		err = 0;
7905 
7906 
7907 	/*
7908 	 * There is case where a disk may not have mddb,
7909 	 * and only has dummy mddb which contains
7910 	 * a valid devid we like to update and in this
7911 	 * case, the rip_lbp will be NULL but we still
7912 	 * like to update the devid embedded in the
7913 	 * dummy mb block.
7914 	 *
7915 	 */
7916 	if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
7917 		mbip = rip->ri_mbip;
7918 		mb = &mbip->mbi_mddb_mb;
7919 	} else {
7920 		/*
7921 		 * Done if it is non-replicated set
7922 		 */
7923 		if (devidptr != (ddi_devid_t)NULL) {
7924 			mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
7925 			    KM_SLEEP);
7926 			mb->mb_magic = MDDB_MAGIC_DU;
7927 			mb->mb_revision = MDDB_REV_MB;
7928 			mb2free = 1;
7929 		} else {
7930 			goto out;
7931 		}
7932 	}
7933 
7934 	blkno = rip->ri_blkno;
7935 	device = rip->ri_dev;
7936 	/*
7937 	 * Replace the mb_devid with the new/valid one
7938 	 */
7939 	if (devidptr != (ddi_devid_t)NULL) {
7940 		/*
7941 		 * Zero out what we have previously
7942 		 */
7943 		if (mb->mb_devid_len)
7944 			bzero(mb->mb_devid, mb->mb_devid_len);
7945 		sz = ddi_devid_sizeof(devidptr);
7946 		bcopy((char *)devidptr, (char *)mb->mb_devid, sz);
7947 		mb->mb_devid_len = sz;
7948 	}
7949 
7950 	mb->mb_setno = s->s_setno;
7951 	uniqtime32(&mb->mb_timestamp);
7952 	crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
7953 	/*
7954 	 * putblks will
7955 	 *
7956 	 *	- drop the s_dbmx lock
7957 	 *	- biowait
7958 	 *	- regain the s_dbmx lock
7959 	 *
7960 	 * Need to update this if we wants to handle
7961 	 * mb_next != NULL which it is unlikely will happen
7962 	 */
7963 	err = putblks(s, (caddr_t)mb, blkno, 1, device, 0);
7964 
7965 	if (mb2free) {
7966 		kmem_free(mb, MDDB_BSIZE);
7967 	}
7968 out:
7969 	return (err);
7970 }
7971 
7972 static int
setdid(mddb_config_t * cp)7973 setdid(
7974 	mddb_config_t		*cp
7975 )
7976 {
7977 	ddi_devid_t		devidp;
7978 	dev_t			ddi_dev;
7979 	mddb_set_t		*s;
7980 	int			err = 0;
7981 	mddb_ri_t		*rip;
7982 
7983 	/*
7984 	 * Data integrity check
7985 	 */
7986 	if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
7987 		return (EINVAL);
7988 
7989 	if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
7990 		return (0);
7991 
7992 	ddi_dev = md_dev64_to_dev(cp->c_devt);
7993 	if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
7994 		return (-1);
7995 	}
7996 	if (devidp == NULL) {
7997 		return (-1);
7998 	}
7999 
8000 	if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL)
8001 		return (-1);
8002 	single_thread_start(s);
8003 
8004 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
8005 		if (rip->ri_lbp == (mddb_lb_t *)NULL)
8006 			continue;
8007 		/*
8008 		 * We only update what is asked
8009 		 */
8010 		if (rip->ri_dev == cp->c_devt) {
8011 			if (update_mb_devid(s, rip, devidp) != 0) {
8012 				err = -1;
8013 				goto out;
8014 			}
8015 		}
8016 	}
8017 
8018 	if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
8019 		err = -1;
8020 		goto out;
8021 	}
8022 
8023 out:
8024 	single_thread_end(s);
8025 	mddb_setexit(s);
8026 	ddi_devid_free(devidp);
8027 	return (err);
8028 }
8029 
8030 static int
delnewside(mddb_config_t * cp,int command,md_error_t * ep)8031 delnewside(
8032 	mddb_config_t		*cp,
8033 	int			command,
8034 	md_error_t		*ep
8035 )
8036 {
8037 	mddb_set_t		*s;
8038 	int			li;
8039 	mddb_lb_t		*lbp;		/* pointer to locator block */
8040 	mddb_ln_t		*lnp;		/* pointer to locator names */
8041 	mddb_mnln_t		*mnlnp;		/* pointer to locator names */
8042 	mddb_locator_t		*lp;
8043 	mddb_sidelocator_t	*slp;
8044 	mddb_cfg_loc_t		*clp;
8045 	int			err = 0;
8046 	set_t			setno = cp->c_setno;
8047 	ddi_devid_t		devid;
8048 	ddi_devid_t		ret_devid = NULL;
8049 	char			*minor_name;
8050 	uint_t			use_devid = 0;
8051 	dev_t			ddi_dev;
8052 	md_mnname_suffix_t	*mnsn;
8053 	mddb_mnlb_t		*mnlbp;
8054 	mddb_mnsidelocator_t	*mnslp;
8055 
8056 	/* Currently don't allow addition/deletion of sides during upgrade */
8057 	if (MD_UPGRADE) {
8058 		cmn_err(CE_WARN,
8059 		    "Addition and deletion of sides not allowed"
8060 		    " during upgrade. \n");
8061 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8062 	}
8063 
8064 	/*
8065 	 * Data integrity check
8066 	 */
8067 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8068 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8069 
8070 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8071 		return (mddbstatus2error(ep, err, NODEV32, setno));
8072 
8073 	single_thread_start(s);
8074 	clp = &cp->c_locator;
8075 
8076 	lbp = s->s_lbp;
8077 
8078 	if (lbp->lb_setno != setno) {
8079 		single_thread_end(s);
8080 		mddb_setexit(s);
8081 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8082 	}
8083 
8084 	/*
8085 	 * Find this device/blkno pair
8086 	 */
8087 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8088 		ddi_dev = md_dev64_to_dev(clp->l_dev);
8089 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8090 		    (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
8091 		    == DDI_SUCCESS)) {
8092 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8093 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8094 				use_devid = 1;
8095 				(void) strcpy(clp->l_minor_name, minor_name);
8096 			}
8097 			kmem_free(minor_name, strlen(minor_name)+1);
8098 		}
8099 		if (use_devid != 1 && ret_devid != NULL)
8100 			ddi_devid_free(ret_devid);
8101 	}
8102 	for (li = 0; li < lbp->lb_loccnt; li++) {
8103 		lp = &lbp->lb_locators[li];
8104 		if (lp->l_flags & MDDB_F_DELETED)
8105 			continue;
8106 		if (use_devid) {
8107 			if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
8108 				continue;
8109 			if ((ddi_devid_compare(devid,
8110 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8111 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8112 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8113 				break;
8114 			}
8115 		} else {
8116 			if (lp->l_dev == clp->l_dev &&
8117 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8118 				break;
8119 			}
8120 		}
8121 	}
8122 
8123 	if (li == lbp->lb_loccnt) {
8124 		if (use_devid)
8125 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8126 		single_thread_end(s);
8127 		mddb_setexit(s);
8128 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8129 	}
8130 
8131 	lnp = s->s_lnp;
8132 	if (command == MDDB_NEWSIDE) {
8133 		int 	index = 0;
8134 		/*
8135 		 * If a MN diskset, need to find the index where the new
8136 		 * locator information is to be stored in the mnsidelocator
8137 		 * field of the locator block so that the locator name can
8138 		 * be stored at the same array index in the mnsuffixes
8139 		 * field of the locator names structure.
8140 		 */
8141 		if (lbp->lb_flags & MDDB_MNSET) {
8142 			if ((index = checklocator(lbp, li,
8143 			    cp->c_sideno)) == -1) {
8144 				if (use_devid) {
8145 					ddi_devid_free((ddi_devid_t)
8146 					    (uintptr_t)clp->l_devid);
8147 				}
8148 				single_thread_end(s);
8149 				mddb_setexit(s);
8150 				return (mdmddberror(ep, MDE_DB_TOOSMALL,
8151 				    NODEV32, setno));
8152 			}
8153 		}
8154 
8155 		/*
8156 		 * Store the locator name before the sidelocator information
8157 		 * in case a panic occurs between these 2 steps.  Must have
8158 		 * the locator name information in order to print reasonable
8159 		 * error information.
8160 		 */
8161 		if (splitname2locatorblock(&cp->c_devname, lnp, li,
8162 		    cp->c_sideno, index)) {
8163 			if (use_devid)
8164 				ddi_devid_free(
8165 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8166 			single_thread_end(s);
8167 			mddb_setexit(s);
8168 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8169 			    setno));
8170 		}
8171 
8172 		if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
8173 			if (use_devid)
8174 				ddi_devid_free(
8175 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8176 			single_thread_end(s);
8177 			mddb_setexit(s);
8178 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8179 			    setno));
8180 		}
8181 	}
8182 
8183 	if (use_devid)
8184 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8185 
8186 	if (command == MDDB_DELSIDE) {
8187 		int i;
8188 		for (i = 0; i < lbp->lb_loccnt; i++) {
8189 			if (lbp->lb_flags & MDDB_MNSET) {
8190 				int	j;
8191 				mnlbp = (mddb_mnlb_t *)lbp;
8192 				for (j = 0; j < MD_MNMAXSIDES; j++) {
8193 					mnslp = &mnlbp->lb_mnsidelocators[j][i];
8194 					if (mnslp->mnl_sideno == cp->c_sideno)
8195 						break;
8196 				}
8197 				if (j < MD_MNMAXSIDES) {
8198 					mnslp->mnl_mnum = NODEV32;
8199 					mnslp->mnl_sideno = 0;
8200 					mnlnp = (mddb_mnln_t *)lnp;
8201 					mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
8202 					bzero((caddr_t)mnsn,
8203 					    sizeof (md_mnname_suffix_t));
8204 				}
8205 			} else {
8206 				slp = &lbp->lb_sidelocators[cp->c_sideno][i];
8207 				bzero((caddr_t)&lnp->ln_suffixes
8208 				    [cp->c_sideno][i], sizeof (md_name_suffix));
8209 				slp->l_mnum = NODEV32;
8210 			}
8211 		}
8212 	}
8213 
8214 	/* write new locator names to all devices */
8215 	uniqtime32(&lnp->ln_timestamp);
8216 	if (lbp->lb_flags & MDDB_MNSET)
8217 		lnp->ln_revision = MDDB_REV_MNLN;
8218 	else
8219 		lnp->ln_revision = MDDB_REV_LN;
8220 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8221 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8222 	    lbp->lb_lnblkcnt, 0);
8223 	/*
8224 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8225 	 * flag in the mddb_set structure to show that the locator
8226 	 * names have changed.
8227 	 */
8228 
8229 	if ((lbp->lb_flags & MDDB_MNSET) &&
8230 	    (md_set[s->s_setno].s_am_i_master)) {
8231 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8232 	}
8233 	if (err) {
8234 		if (writeretry(s)) {
8235 			single_thread_end(s);
8236 			mddb_setexit(s);
8237 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8238 		}
8239 	}
8240 
8241 	uniqtime32(&lbp->lb_timestamp);
8242 	/* write new locator to all devices */
8243 	err = writelocall(s);
8244 
8245 	(void) upd_med(s, "delnewside(0)");
8246 
8247 	computefreeblks(s); /* recompute always it may be larger */
8248 	if (err) {
8249 		if (writeretry(s)) {
8250 			single_thread_end(s);
8251 			mddb_setexit(s);
8252 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8253 		}
8254 	}
8255 
8256 	single_thread_end(s);
8257 	mddb_setexit(s);
8258 
8259 	return (0);
8260 }
8261 
8262 static int
newdev(mddb_config_t * cp,int command,md_error_t * ep)8263 newdev(
8264 	mddb_config_t	*cp,
8265 	int		command,
8266 	md_error_t	*ep
8267 )
8268 {
8269 	mddb_set_t	*s;
8270 	mddb_mb_ic_t	*mbip, *mbip1;
8271 	int		i, j;
8272 	int		li;
8273 	mddb_lb_t	*lbp;		/* pointer to locator block */
8274 	mddb_ln_t	*lnp;		/* pointer to locator names */
8275 	mddb_locator_t	*lp;
8276 	mddb_cfg_loc_t	*clp;
8277 	int		err = 0;
8278 	set_t		setno = cp->c_setno;
8279 	ddi_devid_t	devid2;
8280 	ddi_devid_t	ret_devid = NULL;
8281 	char		*minor_name;
8282 	uint_t		use_devid = 0;
8283 	dev_t		ddi_dev;
8284 	int		old_flags;
8285 	int		flags;
8286 	int		mn_set = 0;
8287 	int		index;
8288 	mddb_ri_t	*rip;
8289 	int		locator_deleted = 0;
8290 	dev32_t		locator_deleted_dev;
8291 	int		sz = 0;
8292 
8293 
8294 	/* Currently don't allow addition of new replica during upgrade */
8295 	if (MD_UPGRADE) {
8296 		cmn_err(CE_WARN,
8297 		    "Addition of new replica not allowed during upgrade.\n");
8298 		return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8299 	}
8300 
8301 	/*
8302 	 * Data integrity check
8303 	 */
8304 	if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8305 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8306 
8307 	/* Determine the flag settings for multinode sets */
8308 	flags = MDDB_NOOLDOK;
8309 	if (cp->c_multi_node)
8310 		flags |= MDDB_MULTINODE;
8311 
8312 	if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
8313 		if (err != MDDB_E_NOTOWNER)
8314 			return (mddbstatus2error(ep, err, NODEV32, setno));
8315 		s = init_set(cp, flags, &err);
8316 		if (s == NULL)
8317 			return (mddbstatus2error(ep, err, NODEV32, setno));
8318 	}
8319 
8320 	single_thread_start(s);
8321 
8322 	/* shorthand */
8323 	clp = &cp->c_locator;
8324 
8325 	/* shorthand */
8326 	lbp = s->s_lbp;
8327 
8328 	if (lbp->lb_setno != setno) {
8329 		single_thread_end(s);
8330 		mddb_setexit(s);
8331 		return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8332 	}
8333 
8334 	/*
8335 	 * See if this device/blkno pair is already a replica
8336 	 */
8337 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8338 		ddi_dev = expldev(clp->l_dev);
8339 		if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8340 		    (ddi_lyr_get_minor_name(ddi_dev,
8341 		    S_IFBLK, &minor_name) == DDI_SUCCESS)) {
8342 			if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8343 				clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8344 				use_devid = 1;
8345 				(void) strcpy(clp->l_minor_name, minor_name);
8346 			}
8347 			kmem_free(minor_name, strlen(minor_name)+1);
8348 		}
8349 		if (use_devid != 1 && ret_devid != NULL)
8350 			ddi_devid_free(ret_devid);
8351 	}
8352 
8353 	for (i = 0; i < lbp->lb_loccnt;	 i++) {
8354 		lp = &lbp->lb_locators[i];
8355 		if (lp->l_flags & MDDB_F_DELETED)
8356 			continue;
8357 		if (use_devid) {
8358 			if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
8359 				continue;
8360 			if ((ddi_devid_compare(devid2,
8361 			    (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8362 			    (strcmp(clp->l_minor_name, minor_name) == 0) &&
8363 			    ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8364 				if (command == MDDB_NEWDEV) {
8365 					ddi_devid_free((ddi_devid_t)(uintptr_t)
8366 					    clp->l_devid);
8367 					single_thread_end(s);
8368 					mddb_setexit(s);
8369 					return (mdmddberror(ep,
8370 					    MDE_DB_EXISTS, NODEV32, setno));
8371 				}
8372 			}
8373 		} else {
8374 			if (lp->l_dev == clp->l_dev &&
8375 			    (daddr_t)lp->l_blkno == clp->l_blkno) {
8376 				if (command == MDDB_NEWDEV) {
8377 					single_thread_end(s);
8378 					mddb_setexit(s);
8379 					return (mdmddberror(ep,
8380 					    MDE_DB_EXISTS, NODEV32, setno));
8381 				}
8382 			}
8383 		}
8384 	}
8385 
8386 	/*
8387 	 * Really is a new replica, go get the master blocks
8388 	 */
8389 	mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
8390 	    (uint_t *)0, &mn_set);
8391 	if (! mbip) {
8392 		if (use_devid)
8393 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8394 		single_thread_end(s);
8395 		mddb_setexit(s);
8396 		return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
8397 	}
8398 
8399 	/*
8400 	 * Compute free blocks in replica.
8401 	 */
8402 	computefreeblks(s);
8403 
8404 	/*
8405 	 * Check if this is large enough
8406 	 */
8407 	for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
8408 		i += mbip1->mbi_mddb_mb.mb_blkcnt;
8409 	for (j = i; j < s->s_totalblkcnt; j++) {
8410 		if (blkcheck(s, j)) {
8411 			while (mbip) {
8412 				mbip1 = mbip->mbi_next;
8413 				kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
8414 				mbip = mbip1;
8415 			}
8416 			if (use_devid)
8417 				ddi_devid_free(
8418 				    (ddi_devid_t)(uintptr_t)clp->l_devid);
8419 			mddb_devclose(md_expldev(clp->l_dev));
8420 			single_thread_end(s);
8421 			mddb_setexit(s);
8422 			return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8423 			    setno));
8424 		}
8425 	}
8426 
8427 	/* Look for a deleted slot */
8428 	for (li = 0; li < lbp->lb_loccnt; li++) {
8429 		lp = &lbp->lb_locators[li];
8430 		if (lp->l_flags & MDDB_F_DELETED) {
8431 			locator_deleted = 1;
8432 			locator_deleted_dev = lp->l_dev;
8433 			break;
8434 		}
8435 	}
8436 
8437 	/* If no deleted slots, add a new one */
8438 	if (li == lbp->lb_loccnt) {
8439 		/* Already have the max replicas, bail */
8440 		if (lbp->lb_loccnt == MDDB_NLB) {
8441 			if (use_devid)
8442 				ddi_devid_free((ddi_devid_t)(uintptr_t)
8443 				    clp->l_devid);
8444 			mddb_devclose(md_expldev(clp->l_dev));
8445 			single_thread_end(s);
8446 			mddb_setexit(s);
8447 			return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
8448 			    setno));
8449 		}
8450 		lbp->lb_loccnt++;
8451 		lp = &lbp->lb_locators[li];
8452 	}
8453 
8454 	/* Initialize the new or deleted slot */
8455 	old_flags = lp->l_flags;
8456 	lp->l_dev = clp->l_dev;
8457 	lp->l_blkno = (daddr32_t)clp->l_blkno;
8458 	lp->l_flags = clp->l_flags;
8459 
8460 	/* shorthand */
8461 	lnp = s->s_lnp;
8462 
8463 	index = 0;
8464 	if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
8465 		/*
8466 		 * If a MN diskset, need to find the index where the new
8467 		 * locator information is to be stored in the mnsidelocator
8468 		 * field of the locator block so that the locator name can
8469 		 * be stored at the same array index in the mnsuffixes
8470 		 * field of the locator names structure.
8471 		 */
8472 		lbp->lb_flags |= MDDB_MNSET;
8473 		if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
8474 			if (use_devid)
8475 				ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
8476 				    l_devid);
8477 			lp->l_flags = old_flags;
8478 			lbp->lb_loccnt--;
8479 			mddb_devclose(md_expldev(clp->l_dev));
8480 			single_thread_end(s);
8481 			mddb_setexit(s);
8482 			return (mdmddberror(ep, MDE_DB_TOOSMALL,
8483 			    NODEV32, setno));
8484 		}
8485 	}
8486 	/*
8487 	 * Store the locator name before the sidelocator information
8488 	 * in case a panic occurs between these 2 steps.  Must have
8489 	 * the locator name information in order to print reasonable
8490 	 * error information.
8491 	 */
8492 	if (splitname2locatorblock(&cp->c_devname, lnp, li,
8493 	    s->s_sideno, index)) {
8494 		if (use_devid)
8495 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8496 		lp->l_flags = old_flags;
8497 		lbp->lb_loccnt--;
8498 		mddb_devclose(md_expldev(clp->l_dev));
8499 		single_thread_end(s);
8500 		mddb_setexit(s);
8501 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8502 	}
8503 
8504 	/*
8505 	 * Compute free blocks in replica before calling cfgloc2locator
8506 	 * since cfgloc2locator may attempt to alloc an unused block
8507 	 * to store the device id.
8508 	 * mbiarray needs to be setup before calling computefreeblks.
8509 	 */
8510 	s->s_mbiarray[li] = mbip;
8511 	computefreeblks(s);
8512 
8513 	if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
8514 		if (use_devid)
8515 			ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8516 		lp->l_flags = old_flags;
8517 		lbp->lb_loccnt--;
8518 		s->s_mbiarray[li] = 0;
8519 		mddb_devclose(md_expldev(clp->l_dev));
8520 		single_thread_end(s);
8521 		mddb_setexit(s);
8522 		return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8523 	}
8524 
8525 	/*
8526 	 * Hijack a deleted rip master record and correct the contents
8527 	 */
8528 	if (locator_deleted) {
8529 		for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
8530 			if (rip->ri_lbp != NULL &&
8531 			    rip->ri_mbip == 0 &&
8532 			    (rip->ri_dev == md_expldev(locator_deleted_dev))) {
8533 				rip->ri_dev = md_expldev(clp->l_dev);
8534 				rip->ri_mbip = mbip;
8535 
8536 				if (use_devid && clp->l_devid != 0) {
8537 					sz = (int)ddi_devid_sizeof(
8538 					    (ddi_devid_t)(uintptr_t)
8539 					    clp->l_devid);
8540 					rip->ri_devid =
8541 					    (ddi_devid_t)kmem_zalloc(sz,
8542 					    KM_SLEEP);
8543 					bcopy((void *)(uintptr_t)clp->l_devid,
8544 					    (char *)rip->ri_devid, sz);
8545 				}
8546 
8547 				break;
8548 			}
8549 		}
8550 	}
8551 
8552 	if (use_devid)
8553 		ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8554 
8555 	uniqtime32(&lbp->lb_timestamp);
8556 	lp->l_flags = MDDB_F_ACTIVE;
8557 
8558 	/* write db copy to new device */
8559 	err = writecopy(s, li, MDDB_WRITECOPY_ALL);
8560 	lp->l_flags |= MDDB_F_UP2DATE;
8561 
8562 	/* write new locator names to all devices */
8563 	uniqtime32(&lnp->ln_timestamp);
8564 	if (lbp->lb_flags & MDDB_MNSET)
8565 		lnp->ln_revision = MDDB_REV_MNLN;
8566 	else
8567 		lnp->ln_revision = MDDB_REV_LN;
8568 	crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8569 	err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8570 	    lbp->lb_lnblkcnt, 0);
8571 	/*
8572 	 * If a MN diskset and this is the master, set the PARSE_LOCNM
8573 	 * flag in the mddb_set structure to show that the locator
8574 	 * names have changed.
8575 	 */
8576 
8577 	if ((lbp->lb_flags & MDDB_MNSET) &&
8578 	    (md_set[s->s_setno].s_am_i_master)) {
8579 		s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8580 	}
8581 	if (err) {
8582 		if (writeretry(s)) {
8583 			single_thread_end(s);
8584 			mddb_setexit(s);
8585 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8586 		}
8587 	}
8588 
8589 	/* Data tags not supported on MN sets */
8590 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
8591 	    (!(lbp->lb_flags & MDDB_MNSET)) &&
8592 	    setno != MD_LOCAL_SET)
8593 		if (set_dtag(s, ep))
8594 			mdclrerror(ep);
8595 
8596 	/* Write data tags to all accessible devices */
8597 	/* Data tags not supported on MN sets */
8598 	if (!(lbp->lb_flags & MDDB_MNSET)) {
8599 		(void) dt_write(s);
8600 	}
8601 
8602 	/* write new locator to all devices */
8603 	err = writelocall(s);
8604 
8605 	(void) upd_med(s, "newdev(0)");
8606 
8607 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
8608 	    md_expldev(clp->l_dev));
8609 
8610 	computefreeblks(s); /* recompute always it may be smaller */
8611 	if (err) {
8612 		if (writeretry(s)) {
8613 			single_thread_end(s);
8614 			mddb_setexit(s);
8615 			return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8616 		}
8617 	}
8618 
8619 	single_thread_end(s);
8620 	mddb_setexit(s);
8621 
8622 	return (0);
8623 }
8624 
8625 #ifdef DEBUG
8626 static void
mddb_check_set(set_t setno)8627 mddb_check_set(
8628 	set_t	setno
8629 )
8630 {
8631 	mddb_set_t	*s;
8632 	mddb_db_t	*dbp;
8633 	mddb_de_ic_t	*dep;
8634 	mddb_rb32_t	*rbp;
8635 
8636 	if (! md_set[setno].s_db)
8637 		return;
8638 
8639 	s = (mddb_set_t *)md_set[setno].s_db;
8640 
8641 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8642 		for (dep = dbp->db_firstentry;
8643 		    dep != NULL; dep = dep->de_next) {
8644 			rbp = dep->de_rb;
8645 			ASSERT(rbp->rb_magic == MDDB_MAGIC_RB);
8646 			if (dep->de_rb_userdata)
8647 				ASSERT((uintptr_t)dep->de_rb_userdata > 2000);
8648 		}
8649 	}
8650 }
8651 #endif /* DEBUG */
8652 
8653 /*
8654  * Exported Entry Points
8655  */
8656 #ifdef DEBUG
8657 void
mddb_check(void)8658 mddb_check(void)
8659 {
8660 	int	i;
8661 
8662 	for (i = 0; i < md_nsets; i++) {
8663 		if (! md_set[i].s_db)
8664 			return;
8665 
8666 		mddb_check_set(i);
8667 	}
8668 
8669 }
8670 #endif /* DEBUG */
8671 
8672 int
mddb_configure(mddb_cfgcmd_t command,mddb_config_t * cp)8673 mddb_configure(
8674 	mddb_cfgcmd_t	command,
8675 	mddb_config_t	*cp
8676 )
8677 {
8678 	mddb_set_t	*s;
8679 	md_error_t	*ep = &cp->c_mde;
8680 	int		flag = 0;
8681 	int		err = 0;
8682 	set_t		setno = cp->c_setno;
8683 
8684 	mdclrerror(ep);
8685 
8686 	switch (command) {
8687 		case MDDB_NEWDEV:
8688 			err = newdev(cp, command, ep);
8689 			break;
8690 
8691 		case MDDB_NEWSIDE:
8692 		case MDDB_DELSIDE:
8693 			err = delnewside(cp, command, ep);
8694 			break;
8695 
8696 		case MDDB_GETDEV:
8697 		case MDDB_DELDEV:
8698 		case MDDB_ENDDEV:
8699 			err = getdeldev(cp, command, ep);
8700 			break;
8701 
8702 		case MDDB_GETDRVRNAME:
8703 			err = getdriver(&cp->c_locator);
8704 			break;
8705 
8706 		case MDDB_USEDEV:
8707 			/*
8708 			 * Note: must allow USEDEV ioctl during upgrade to
8709 			 * support auto-take disksets.
8710 			 *
8711 			 * Also during the set import if the md_devid_destroy
8712 			 * flag is set then error out
8713 			 */
8714 
8715 			if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
8716 				return (mdmderror(ep, MDE_INVAL_UNIT,
8717 				    MD_ADM_MINOR));
8718 
8719 			if (setno >= md_nsets)
8720 				return (mdmderror(ep, MDE_INVAL_UNIT,
8721 				    MD_ADM_MINOR));
8722 
8723 			if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) ==
8724 			    NULL) {
8725 				if ((s = init_set(cp, MDDB_NOINIT, &err)) ==
8726 				    NULL) {
8727 					err = mddbstatus2error(ep, err,
8728 					    NODEV32, setno);
8729 					break;
8730 				}
8731 			}
8732 			if (setno == MD_LOCAL_SET)
8733 				flag = MDDB_F_IOCTL;
8734 			if (cp->c_locator.l_old_devid) {
8735 				md_set_setstatus(setno,
8736 				    MD_SET_REPLICATED_IMPORT);
8737 			}
8738 			err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
8739 			mddb_setexit(s);
8740 			break;
8741 
8742 		case MDDB_RELEASESET:
8743 			mutex_enter(&mddb_lock);
8744 			mddb_unload_set(cp->c_setno);
8745 			mutex_exit(&mddb_lock);
8746 			break;
8747 
8748 		case MDDB_SETDID:
8749 			err = setdid(cp);
8750 			break;
8751 
8752 		default:
8753 			err = mdmddberror(ep, MDE_DB_INVALID, NODEV32,
8754 			    cp->c_setno);
8755 	}
8756 
8757 	return (err);
8758 }
8759 
8760 int
mddb_getoptloc(mddb_optloc_t * ol)8761 mddb_getoptloc(
8762 	mddb_optloc_t		*ol
8763 )
8764 {
8765 	mddb_set_t		*s;
8766 	mddb_db_t		*dbp;
8767 	mddb_de_ic_t		*dep;
8768 	mddb_recid_t		id;
8769 	set_t			setno;
8770 
8771 	ol->li[0] = -1;
8772 	ol->li[1] = -1;
8773 
8774 	id = ol->recid;
8775 	setno = DBSET(id);
8776 	if (setno >= md_nsets)
8777 		return (EINVAL);
8778 
8779 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL)
8780 		return (0);
8781 
8782 	id = DBID(id);
8783 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8784 		for (dep = dbp->db_firstentry;
8785 		    dep != NULL; dep = dep->de_next) {
8786 			if (dep->de_recid != id)
8787 				continue;
8788 			ol->li[0] = dep->de_optinfo[0].o_li;
8789 			ol->li[1] = dep->de_optinfo[1].o_li;
8790 			mddb_setexit(s);
8791 			return (0);
8792 		}
8793 	}
8794 	mddb_setexit(s);
8795 	return (0);
8796 }
8797 
8798 void
mddb_init(void)8799 mddb_init(void)
8800 {
8801 	mddb_set_t	*s;
8802 
8803 	mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);
8804 	if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL)
8805 		mddb_setexit(s);
8806 }
8807 
8808 
8809 void
mddb_unload(void)8810 mddb_unload(void)
8811 {
8812 	int	i;
8813 
8814 	mutex_enter(&mddb_lock);
8815 
8816 	for (i = 0; i < md_nsets; i++) {
8817 		md_clr_setstatus(i, MD_SET_KEEPTAG);
8818 		mddb_unload_set(i);
8819 	}
8820 
8821 	crcfreetab();
8822 
8823 	mutex_exit(&mddb_lock);
8824 }
8825 
8826 mddb_recid_t
mddb_createrec(size_t usersize,mddb_type_t type,uint_t type2,md_create_rec_option_t options,set_t setno)8827 mddb_createrec(
8828 	size_t		usersize,	 /* size of db record */
8829 	mddb_type_t	type,		 /* type1 of db record */
8830 	uint_t		type2,		 /* type2 of db record */
8831 	md_create_rec_option_t	options, /* options for this creation  */
8832 	set_t		setno		 /* set number to create record in */
8833 )
8834 {
8835 	mddb_set_t	*s;
8836 	mddb_db_t	*dbp, *prevdbp, *newdbp;
8837 	mddb_db32_t	*db32p;
8838 	mddb_de_ic_t	*dep;
8839 	/* LINTED variable unused - used for sizeof calculations */
8840 	mddb_de32_t	*de32p;
8841 	mddb_rb32_t	*rbp;
8842 	size_t		recsize;
8843 	ulong_t		blkcnt;
8844 	ulong_t		maxblocks;
8845 	size_t		desize, desize_ic;
8846 	size_t		used;
8847 	mddb_recid_t	newid;
8848 	caddr_t		tmppnt;
8849 	int		i, err = 0;
8850 	void		*userdata;
8851 	uint_t		flag_type;
8852 
8853 #if defined(_ILP32) && !defined(lint)
8854 	ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
8855 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
8856 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
8857 #endif
8858 
8859 	/*
8860 	 * everyone is supposed to sepcify if it's a
8861 	 * 32 bit or a 64 bit record
8862 	 */
8863 	if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
8864 		return (MDDB_E_INVALID);
8865 	}
8866 
8867 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8868 		return (err);
8869 
8870 	if (checkstate(s, MDDB_PROBE)) {
8871 		mddb_setexit(s);
8872 		return (MDDB_E_NOTNOW);
8873 	}
8874 
8875 	recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
8876 	    usersize, MDDB_BSIZE);
8877 	blkcnt = btodb(recsize);
8878 
8879 	if (mddb_maxblocks)
8880 		maxblocks = mddb_maxblocks;
8881 	else
8882 		maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
8883 		    sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
8884 
8885 	if (blkcnt > maxblocks) {
8886 		mddb_setexit(s);
8887 		return (MDDB_E_INVALID);
8888 	}
8889 	/*
8890 	 * allocate record block
8891 	 * and new directory block so to avoid sleeping
8892 	 * after starting single_thread
8893 	 */
8894 	rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
8895 	if ((options & MD_CRO_OPTIMIZE) == 0)
8896 		userdata = kmem_zalloc(usersize, KM_SLEEP);
8897 	newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);
8898 
8899 	/*
8900 	 * if this is the largest record allocate new buffer for
8901 	 * checkcopy();
8902 	 */
8903 	if (recsize > s->s_databuffer_size) {
8904 		tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
8905 		/*
8906 		 * this test is incase when to sleep during kmem_alloc
8907 		 * and some other task bumped max record size
8908 		 */
8909 		if (recsize > s->s_databuffer_size) {
8910 			if (s->s_databuffer_size)
8911 				kmem_free(s->s_databuffer,
8912 				    s->s_databuffer_size);
8913 			s->s_databuffer = tmppnt;
8914 			s->s_databuffer_size = recsize;
8915 		} else {
8916 			kmem_free(tmppnt, recsize);
8917 		}
8918 	}
8919 
8920 	single_thread_start(s);
8921 
8922 	newid = 0;
8923 	do {
8924 		newid++;
8925 		if (DBID(newid) == 0) {
8926 			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8927 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8928 			if ((options & MD_CRO_OPTIMIZE) == 0)
8929 				kmem_free(userdata, usersize);
8930 			single_thread_end(s);
8931 			mddb_setexit(s);
8932 			return (MDDB_E_NOTNOW);
8933 		}
8934 
8935 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8936 			for (dep = dbp->db_firstentry; dep;
8937 			    dep = dep->de_next) {
8938 				if (dep->de_recid == newid)
8939 					break;
8940 			}
8941 			if (dep != NULL)
8942 				break;
8943 		}
8944 	} while (dbp);
8945 
8946 	desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
8947 	    (sizeof (mddb_block_t) * blkcnt);
8948 
8949 	/*
8950 	 * see if a directory block exists which will hold this entry
8951 	 */
8952 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8953 		used = sizeof (*db32p);
8954 		for (dep = dbp->db_firstentry;
8955 		    dep != NULL; dep = dep->de_next) {
8956 			used += sizeof (*de32p) - sizeof (de32p->de32_blks);
8957 			used += sizeof (mddb_block_t) * dep->de_blkcount;
8958 		}
8959 		if ((used + desize) < MDDB_BSIZE)
8960 			break;
8961 	}
8962 	if (dbp) {
8963 		kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8964 		if (blkcnt > s->s_freeblkcnt) {
8965 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8966 			if ((options & MD_CRO_OPTIMIZE) == 0)
8967 				kmem_free(userdata, usersize);
8968 			single_thread_end(s);
8969 			mddb_setexit(s);
8970 			return (MDDB_E_NOSPACE);
8971 		}
8972 		prevdbp = NULL;
8973 	} else {
8974 		/*
8975 		 * need to add directory block
8976 		 */
8977 		if ((blkcnt + 1) > s->s_freeblkcnt) {
8978 			kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8979 			kmem_free((caddr_t)rbp, ((size_t)recsize));
8980 			if ((options & MD_CRO_OPTIMIZE) == 0)
8981 				kmem_free(userdata, usersize);
8982 			single_thread_end(s);
8983 			mddb_setexit(s);
8984 			return (MDDB_E_NOSPACE);
8985 		}
8986 		for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
8987 			;
8988 		dbp->db_next = newdbp;
8989 		bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
8990 		dbp->db_nextblk = getfreeblks(s, 1);
8991 		dbp->db_next->db_blknum = dbp->db_nextblk;
8992 		prevdbp = dbp;
8993 		dbp = dbp->db_next;
8994 		dbp->db_nextblk = 0;
8995 		dbp->db_firstentry = NULL;
8996 		dbp->db_recsum = 0;
8997 		dbp->db_magic = MDDB_MAGIC_DB;
8998 	}
8999 	/*
9000 	 * ready to add record
9001 	 */
9002 	desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
9003 	    (sizeof (mddb_block_t) * blkcnt);
9004 	if (dbp->db_firstentry) {
9005 		for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
9006 			;
9007 		dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
9008 		dep = dep->de_next;
9009 	} else {
9010 		dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
9011 		dbp->db_firstentry = dep;
9012 	}
9013 	bzero((caddr_t)dep, desize_ic);
9014 	dep->de_recid = newid;
9015 	/*
9016 	 * Optimized records have an owner node associated with them in
9017 	 * a MN diskset.  The owner is only set on a node that is actively
9018 	 * writing to that record.  The other nodes will show that record
9019 	 * as having an invalid owner.  The owner for an optimized record
9020 	 * is used during fixoptrecord to determine which node should
9021 	 * write out the record when the replicas associated with that
9022 	 * optimized record have been changed.
9023 	 */
9024 	if (MD_MNSET_SETNO(s->s_setno)) {
9025 		dep->de_owner_nodeid = MD_MN_INVALID_NID;
9026 	}
9027 	dep->de_type1 =	type;
9028 	dep->de_type2 = type2;
9029 	dep->de_reqsize = usersize;
9030 	dep->de_recsize = recsize;
9031 	dep->de_blkcount = blkcnt;
9032 	flag_type = options &
9033 	    (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
9034 	    MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
9035 	    MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
9036 	switch (flag_type) {
9037 	case MD_CRO_OPTIMIZE:
9038 		dep->de_flags = MDDB_F_OPT;
9039 		getoptdev(s, dep, 0);
9040 		getoptdev(s, dep, 1);
9041 		break;
9042 	case MD_CRO_STRIPE:
9043 		dep->de_flags = MDDB_F_STRIPE;
9044 		break;
9045 	case MD_CRO_MIRROR:
9046 		dep->de_flags = MDDB_F_MIRROR;
9047 		break;
9048 	case MD_CRO_RAID:
9049 		dep->de_flags = MDDB_F_RAID;
9050 		break;
9051 	case MD_CRO_SOFTPART:
9052 		dep->de_flags = MDDB_F_SOFTPART;
9053 		break;
9054 	case MD_CRO_TRANS_MASTER:
9055 		dep->de_flags = MDDB_F_TRANS_MASTER;
9056 		break;
9057 	case MD_CRO_TRANS_LOG:
9058 		dep->de_flags = MDDB_F_TRANS_LOG;
9059 		break;
9060 	case MD_CRO_HOTSPARE:
9061 		dep->de_flags = MDDB_F_HOTSPARE;
9062 		break;
9063 	case MD_CRO_HOTSPARE_POOL:
9064 		dep->de_flags = MDDB_F_HOTSPARE_POOL;
9065 		break;
9066 	case MD_CRO_CHANGELOG:
9067 		dep->de_flags = MDDB_F_CHANGELOG;
9068 		break;
9069 	}
9070 	/*
9071 	 * try to get all blocks consecutive. If not possible
9072 	 * just get them one at a time
9073 	 */
9074 	dep->de_blks[0] = getfreeblks(s, blkcnt);
9075 	if (dep->de_blks[0]) {
9076 		for (i = 1; i < blkcnt; i++)
9077 			dep->de_blks[i] = dep->de_blks[0] + i;
9078 	} else {
9079 		for (i = 0; i < blkcnt;	 i++)
9080 			dep->de_blks[i] = getfreeblks(s, 1);
9081 	}
9082 	dep->de_rb = rbp;
9083 	bzero((caddr_t)rbp, recsize);
9084 	rbp->rb_magic = MDDB_MAGIC_RB;
9085 
9086 	/* Do we have to create an old style (32 bit) record?  */
9087 	if (options & MD_CRO_32BIT) {
9088 		if (options & MD_CRO_FN)
9089 			rbp->rb_revision = MDDB_REV_RBFN;
9090 		else
9091 			rbp->rb_revision = MDDB_REV_RB;
9092 	} else {
9093 		if (options & MD_CRO_FN)
9094 			rbp->rb_revision = MDDB_REV_RB64FN;
9095 		else
9096 			rbp->rb_revision = MDDB_REV_RB64;
9097 	}
9098 
9099 	/* set de_rb_userdata for non optimization records */
9100 	if ((options & MD_CRO_OPTIMIZE) == 0) {
9101 		dep->de_rb_userdata = userdata;
9102 	}
9103 
9104 	uniqtime32(&rbp->rb_timestamp);
9105 	/* Generate the crc for this record */
9106 	rec_crcgen(s, dep, rbp);
9107 	tmppnt = (caddr_t)rbp;
9108 	/*
9109 	 * the following code writes new records to all instances of
9110 	 * the data base. Writing one block at a time to each instance
9111 	 * is safe because they are not yet in a directory entry which
9112 	 * has been written to the data base
9113 	 */
9114 	err = 0;
9115 	if ((options & MD_CRO_OPTIMIZE) == 0) {
9116 		for (i = 0; i < blkcnt;	 i++) {
9117 			err |= writeall(s, (caddr_t)tmppnt,
9118 			    dep->de_blks[i], 1, 0);
9119 			tmppnt += MDDB_BSIZE;
9120 		}
9121 	} else {
9122 		if ((MD_MNSET_SETNO(s->s_setno)) &&
9123 		    md_set[s->s_setno].s_am_i_master) {
9124 		/*
9125 		 * If a MN diskset then only master writes out newly
9126 		 * created optimized record.
9127 		 */
9128 			err |= writeoptrecord(s, dep);
9129 		}
9130 	}
9131 	uniqtime32(&dbp->db_timestamp);
9132 	dbp->db_revision = MDDB_REV_DB;
9133 	/* Don't include opt resync and change log records in global XOR */
9134 	if (!(dep->de_flags & MDDB_F_OPT) &&
9135 	    !(dep->de_flags & MDDB_F_CHANGELOG))
9136 		dbp->db_recsum ^= rbp->rb_checksum;
9137 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9138 	create_db32rec(db32p, dbp);
9139 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9140 	err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9141 	if (prevdbp) {
9142 		dbp = prevdbp;
9143 		uniqtime32(&dbp->db_timestamp);
9144 		dbp->db_revision = MDDB_REV_DB;
9145 		create_db32rec(db32p, dbp);
9146 		crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9147 		err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9148 	}
9149 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9150 	if (err) {
9151 		if (writeretry(s)) {
9152 			s->s_zombie = newid;
9153 			single_thread_end(s);
9154 			mddb_setexit(s);
9155 			return (MDDB_E_NOTNOW);
9156 		}
9157 	}
9158 	single_thread_end(s);
9159 	mddb_setexit(s);
9160 
9161 	ASSERT((newid & MDDB_SETMASK) == 0);
9162 	return (MAKERECID(setno, newid));
9163 }
9164 
9165 int
mddb_deleterec(mddb_recid_t id)9166 mddb_deleterec(
9167 	mddb_recid_t	id
9168 )
9169 {
9170 	mddb_set_t	*s;
9171 	mddb_db_t	*dbp;
9172 	mddb_db32_t	*db32p;
9173 	mddb_de_ic_t	*dep, *dep1;
9174 	int		i;
9175 
9176 #if defined(_ILP32) && !defined(lint)
9177 	ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
9178 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
9179 #endif
9180 
9181 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9182 	ASSERT(s != NULL);
9183 
9184 	id = DBID(id);
9185 	if (checkstate(s, MDDB_PROBE)) {
9186 		mddb_setexit(s);
9187 		return (MDDB_E_NOTNOW);
9188 	}
9189 
9190 	ASSERT(s->s_lbp != NULL);
9191 	single_thread_start(s);
9192 
9193 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9194 		dep1 = NULL;
9195 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9196 			if (dep->de_recid == id)
9197 				break;
9198 			dep1 = dep;
9199 		}
9200 		if (dep != NULL)
9201 			break;
9202 	}
9203 	/*
9204 	 * no such record
9205 	 */
9206 	if (dep == NULL) {
9207 		single_thread_end(s);
9208 		ASSERT(s->s_staledeletes != 0);
9209 		s->s_staledeletes--;
9210 		mddb_setexit(s);
9211 		return (0);
9212 	}
9213 
9214 	if (!(dep->de_flags & MDDB_F_OPT) &&
9215 	    !(dep->de_flags & MDDB_F_CHANGELOG)) {
9216 		dbp->db_recsum ^= dep->de_rb->rb_checksum;
9217 		dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
9218 	}
9219 
9220 	if (dep->de_rb_userdata != NULL) {
9221 		if (dep->de_icreqsize)
9222 			kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
9223 		else
9224 			kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9225 	}
9226 
9227 	kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
9228 
9229 	for (i = 0; i < dep->de_blkcount; i++)
9230 		blkfree(s, dep->de_blks[i]);
9231 	if (dep1)
9232 		dep1->de_next = dep->de_next;
9233 	else
9234 		dbp->db_firstentry = dep->de_next;
9235 
9236 	kmem_free(dep, sizeofde(dep));
9237 
9238 	uniqtime32(&dbp->db_timestamp);
9239 	dbp->db_revision = MDDB_REV_DB;
9240 	db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9241 	create_db32rec(db32p, dbp);
9242 	crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9243 	if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
9244 		if (writeretry(s)) {
9245 			/*
9246 			 * staledelete is used to mark deletes which failed.
9247 			 * its only use is to not panic when the user retries
9248 			 * the delete once the database is active again
9249 			 */
9250 			single_thread_end(s);
9251 			s->s_staledeletes++;
9252 			kmem_free((caddr_t)db32p, MDDB_BSIZE);
9253 			mddb_setexit(s);
9254 			return (MDDB_E_NOTNOW);
9255 		}
9256 	}
9257 	single_thread_end(s);
9258 	kmem_free((caddr_t)db32p, MDDB_BSIZE);
9259 	mddb_setexit(s);
9260 	return (0);
9261 }
9262 
9263 mddb_recid_t
mddb_getnextrec(mddb_recid_t id,mddb_type_t typ,uint_t type2)9264 mddb_getnextrec(
9265 	mddb_recid_t		id,
9266 	mddb_type_t		typ,
9267 	uint_t			type2
9268 )
9269 {
9270 	mddb_set_t		*s;
9271 	mddb_db_t		*dbp;
9272 	mddb_de_ic_t		*dep;
9273 	int			searching, err;
9274 	set_t			setno;
9275 
9276 	setno = DBSET(id);
9277 	id = DBID(id);
9278 	searching = id;
9279 
9280 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9281 		return (err);
9282 
9283 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9284 		for (dep = dbp->db_firstentry;
9285 		    dep != NULL; dep = dep->de_next) {
9286 			if (searching) {
9287 				if (dep->de_recid == id)
9288 					searching = 0;
9289 			} else {
9290 				if ((typ == MDDB_ALL || dep->de_type1 == typ) &&
9291 				    (type2 == 0 || dep->de_type2 == type2)) {
9292 					id = dep->de_recid;
9293 					mddb_setexit(s);
9294 					ASSERT((id & MDDB_SETMASK) == 0);
9295 					return (MAKERECID(setno, id));
9296 				}
9297 			}
9298 		}
9299 	}
9300 
9301 	mddb_setexit(s);
9302 
9303 	if (searching)
9304 		return (MDDB_E_NORECORD);
9305 	return (0);
9306 }
9307 
9308 void *
mddb_getrecaddr(mddb_recid_t id)9309 mddb_getrecaddr(
9310 	mddb_recid_t		id
9311 )
9312 {
9313 	mddb_set_t		*s;
9314 	mddb_db_t		*dbp;
9315 	mddb_de_ic_t		*dep;
9316 	void			*rval;
9317 
9318 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9319 		return (NULL);
9320 
9321 	id = DBID(id);
9322 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9323 		for (dep = dbp->db_firstentry;
9324 		    dep != NULL; dep = dep->de_next) {
9325 			if (dep->de_recid != id)
9326 				continue;
9327 			if (dep->de_rb_userdata)
9328 				rval = (void *)dep->de_rb_userdata;
9329 			else
9330 				rval = (void *)dep->de_rb->rb_data;
9331 			mddb_setexit(s);
9332 			return (rval);
9333 		}
9334 	}
9335 
9336 	mddb_setexit(s);
9337 	return (NULL);
9338 }
9339 
9340 
9341 mddb_de_ic_t *
mddb_getrecdep(mddb_recid_t id)9342 mddb_getrecdep(
9343 	mddb_recid_t		id
9344 )
9345 {
9346 	mddb_set_t		*s;
9347 	mddb_db_t		*dbp;
9348 	mddb_de_ic_t		*dep;
9349 
9350 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9351 		return (NULL);
9352 
9353 	id = DBID(id);
9354 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9355 		for (dep = dbp->db_firstentry;
9356 		    dep != NULL; dep = dep->de_next) {
9357 			if (dep->de_recid != id)
9358 				continue;
9359 			mddb_setexit(s);
9360 			return (dep);
9361 		}
9362 	}
9363 
9364 	mddb_setexit(s);
9365 	return (NULL);
9366 }
9367 
9368 void *
mddb_getrecaddr_resize(mddb_recid_t id,size_t icsize,off_t off)9369 mddb_getrecaddr_resize(
9370 	mddb_recid_t		id,
9371 	size_t			icsize,
9372 	off_t			off
9373 )
9374 {
9375 	mddb_set_t		*s;
9376 	mddb_db_t		*dbp;
9377 	mddb_de_ic_t		*dep;
9378 	void			*rval = NULL;
9379 
9380 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9381 		return (NULL);
9382 
9383 	id = DBID(id);
9384 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9385 		for (dep = dbp->db_firstentry;
9386 		    dep != NULL; dep = dep->de_next) {
9387 			if (dep->de_recid != id)
9388 				continue;
9389 			if (dep->de_rb_userdata)
9390 				rval = (void *)dep->de_rb_userdata;
9391 			else
9392 				rval = (void *)dep->de_rb->rb_data;
9393 			break;
9394 		}
9395 		if (rval != NULL)
9396 			break;
9397 	}
9398 
9399 	if (rval == NULL) {
9400 		mddb_setexit(s);
9401 		return (NULL);
9402 	}
9403 
9404 	if (dep->de_rb_userdata) {
9405 		caddr_t nud;
9406 
9407 		if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) {
9408 			mddb_setexit(s);
9409 			return (rval);
9410 		}
9411 		ASSERT((dep->de_reqsize + off) <= icsize);
9412 		nud = kmem_zalloc(icsize, KM_SLEEP);
9413 		bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize);
9414 		kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9415 		dep->de_rb_userdata = nud + off;
9416 		dep->de_rb_userdata_ic = nud;
9417 		dep->de_icreqsize = icsize;
9418 		rval = nud;
9419 	} else {
9420 		size_t recsize;
9421 		/* LINTED variable unused - used for sizeof calculations */
9422 		mddb_rb32_t *nrbp;
9423 
9424 		recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
9425 		    icsize, MDDB_BSIZE);
9426 		if (dep->de_recsize < recsize)
9427 			cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
9428 			    "nonoptimized records can be resized\n");
9429 	}
9430 
9431 	mddb_setexit(s);
9432 	return (rval);
9433 }
9434 
9435 int
mddb_getrecprivate(mddb_recid_t id)9436 mddb_getrecprivate(
9437 	mddb_recid_t		id
9438 )
9439 {
9440 	mddb_set_t		*s;
9441 	mddb_db_t		*dbp;
9442 	mddb_de_ic_t		*dep;
9443 	int			err = 0;
9444 	int			private;
9445 
9446 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9447 		return (err);
9448 
9449 	id = DBID(id);
9450 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9451 		for (dep = dbp->db_firstentry;
9452 		    dep != NULL; dep = dep->de_next) {
9453 			if (dep->de_recid != id)
9454 				continue;
9455 			private = (int)dep->de_rb->rb_private;
9456 			mddb_setexit(s);
9457 			return (private);
9458 		}
9459 	}
9460 
9461 	mddb_setexit(s);
9462 	return (MDDB_E_NORECORD);
9463 }
9464 
9465 void
mddb_setrecprivate(mddb_recid_t id,uint_t private)9466 mddb_setrecprivate(
9467 	mddb_recid_t		id,
9468 	uint_t			private
9469 )
9470 {
9471 	mddb_set_t		*s;
9472 	mddb_db_t		*dbp;
9473 	mddb_de_ic_t		*dep;
9474 
9475 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) {
9476 		ASSERT(0);
9477 		return;
9478 	}
9479 
9480 	id = DBID(id);
9481 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9482 		for (dep = dbp->db_firstentry;
9483 		    dep != NULL; dep = dep->de_next) {
9484 			if (dep->de_recid != id)
9485 				continue;
9486 			dep->de_rb->rb_private = private;
9487 			mddb_setexit(s);
9488 			return;
9489 		}
9490 	}
9491 
9492 	mddb_setexit(s);
9493 	ASSERT(0);
9494 }
9495 
9496 mddb_type_t
mddb_getrectype1(mddb_recid_t id)9497 mddb_getrectype1(
9498 	mddb_recid_t		id
9499 )
9500 {
9501 	mddb_set_t		*s;
9502 	mddb_db_t		*dbp;
9503 	mddb_de_ic_t		*dep;
9504 	int			err = 0;
9505 	mddb_type_t		rval;
9506 
9507 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9508 		return (err);
9509 
9510 	id = DBID(id);
9511 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9512 		for (dep = dbp->db_firstentry;
9513 		    dep != NULL; dep = dep->de_next) {
9514 			if (dep->de_recid != id)
9515 				continue;
9516 			rval = dep->de_type1;
9517 			mddb_setexit(s);
9518 			return (rval);
9519 		}
9520 	}
9521 
9522 	mddb_setexit(s);
9523 	return (MDDB_E_NORECORD);
9524 }
9525 
9526 int
mddb_getrectype2(mddb_recid_t id)9527 mddb_getrectype2(
9528 	mddb_recid_t		id
9529 )
9530 {
9531 	mddb_set_t		*s;
9532 	mddb_db_t		*dbp;
9533 	mddb_de_ic_t		*dep;
9534 	int			err = 0;
9535 	int			rval;
9536 
9537 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9538 		return (err);
9539 
9540 	id = DBID(id);
9541 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9542 		for (dep = dbp->db_firstentry;
9543 		    dep != NULL; dep = dep->de_next) {
9544 			if (dep->de_recid != id)
9545 				continue;
9546 			rval = (int)dep->de_type2;
9547 			mddb_setexit(s);
9548 			return (rval);
9549 		}
9550 	}
9551 
9552 	mddb_setexit(s);
9553 	return (MDDB_E_NORECORD);
9554 }
9555 
9556 int
mddb_getrecsize(mddb_recid_t id)9557 mddb_getrecsize(
9558 	mddb_recid_t		id
9559 )
9560 {
9561 	mddb_set_t		*s;
9562 	mddb_db_t		*dbp;
9563 	mddb_de_ic_t		*dep;
9564 	int			err = 0;
9565 	int			rval;
9566 
9567 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9568 		return (err);
9569 
9570 	id = DBID(id);
9571 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9572 		for (dep = dbp->db_firstentry;
9573 		    dep != NULL; dep = dep->de_next) {
9574 			if (dep->de_recid != id)
9575 				continue;
9576 			rval = (int)dep->de_reqsize;
9577 			mddb_setexit(s);
9578 			return (rval);
9579 		}
9580 	}
9581 
9582 	mddb_setexit(s);
9583 	return (MDDB_E_NORECORD);
9584 }
9585 
9586 
9587 mddb_recstatus_t
mddb_getrecstatus(mddb_recid_t id)9588 mddb_getrecstatus(
9589 	mddb_recid_t		id
9590 )
9591 {
9592 	mddb_set_t		*s;
9593 	mddb_db_t		*dbp;
9594 	mddb_de_ic_t		*dep;
9595 	int			err = 0;
9596 	mddb_recstatus_t	e_err;
9597 
9598 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9599 		return ((mddb_recstatus_t)err);
9600 
9601 	id = DBID(id);
9602 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9603 		for (dep = dbp->db_firstentry;
9604 		    dep != NULL; dep = dep->de_next) {
9605 			if (dep->de_recid == id)
9606 				break;
9607 		}
9608 		if (dep)
9609 			break;
9610 	}
9611 
9612 	e_err = MDDB_OK;
9613 
9614 	if (! dep)
9615 		e_err = MDDB_NORECORD;
9616 	else if (! dep->de_rb->rb_commitcnt)
9617 		e_err = MDDB_NODATA;
9618 	else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
9619 		e_err = MDDB_STALE;
9620 
9621 	mddb_setexit(s);
9622 	return (e_err);
9623 }
9624 
9625 static int	mddb_commitrec_retries = 5;
9626 
9627 /*
9628  * Commit given record to disk.
9629  * If committing an optimized record, do not call
9630  * with md ioctl lock held.
9631  */
9632 int
mddb_commitrec(mddb_recid_t id)9633 mddb_commitrec(
9634 	mddb_recid_t	id
9635 )
9636 {
9637 	mddb_set_t			*s;
9638 	mddb_db_t			*dbp;
9639 	mddb_de_ic_t			*dep;
9640 	mddb_recid_t			ids[2];
9641 	mddb_rb32_t			*rbp;
9642 	static int			err = 0;
9643 	md_mn_msg_mddb_optrecerr_t	*msg_recerr;
9644 	md_mn_kresult_t			*kres;
9645 	mddb_lb_t			*lbp;
9646 	mddb_mnlb_t			*mnlbp;
9647 	mddb_locator_t			*lp;
9648 	mddb_mnsidelocator_t		*mnslp;
9649 	mddb_drvnm_t			*dn;
9650 	int				li;
9651 	md_replica_recerr_t		*recerr;
9652 	int				i, j;
9653 	int				rval;
9654 	int				hit_err = 0;
9655 	int				retry = mddb_commitrec_retries;
9656 	int				gave_up = 0;
9657 
9658 	s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9659 	ASSERT(s != NULL);
9660 
9661 	if (checkstate(s, MDDB_PROBE)) {
9662 		mddb_setexit(s);
9663 		return (MDDB_E_NOTNOW);
9664 	}
9665 
9666 	if (DBID(id) == 0) {
9667 		mddb_setexit(s);
9668 		return (0);
9669 	}
9670 
9671 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9672 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9673 			if (dep->de_recid == DBID(id))
9674 				break;
9675 		}
9676 		if (dep)
9677 			break;
9678 	}
9679 
9680 	if (dep == NULL) {
9681 		mddb_setexit(s);
9682 		return (MDDB_E_NORECORD);
9683 	}
9684 
9685 	if (! (dep->de_flags & MDDB_F_OPT)) {
9686 		ids[0] = id;
9687 		ids[1] = 0;
9688 		mddb_setexit(s);
9689 		return (mddb_commitrecs(ids));
9690 	}
9691 
9692 	/*
9693 	 * following code allows multiple processes to be doing
9694 	 * optimization commits in parallel.
9695 	 * NOTE: if lots of optimization commits then the lock
9696 	 * will not get released until it winds down
9697 	 */
9698 	if (s->s_optwaiterr) {
9699 		while (s->s_optwaiterr) {
9700 			s->s_opthungerr = 1;
9701 			cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
9702 		}
9703 		if (checkstate(s, MDDB_PROBE)) {
9704 			mddb_setexit(s);
9705 			return (MDDB_E_NOTNOW);
9706 		}
9707 	}
9708 	if (s->s_optcmtcnt++ == 0) {
9709 		single_thread_start(s);
9710 		s->s_opthavelck = 1;
9711 		if (s->s_optwantlck) {
9712 			cv_broadcast(&s->s_optwantlck_cv);
9713 			s->s_optwantlck = 0;
9714 		}
9715 	} else {
9716 		while (! s->s_opthavelck) {
9717 			s->s_optwantlck = 1;
9718 			cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
9719 		}
9720 	}
9721 
9722 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9723 		for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9724 			if (dep->de_recid == DBID(id))
9725 				break;
9726 		}
9727 		if (dep)
9728 			break;
9729 	}
9730 
9731 	if (dep == NULL) {
9732 		if (! (--s->s_optcmtcnt)) {
9733 			single_thread_end(s);
9734 			s->s_opthavelck = 0;
9735 		}
9736 		mddb_setexit(s);
9737 		return (MDDB_E_NORECORD);
9738 	}
9739 
9740 	rbp = dep->de_rb;
9741 	rbp->rb_commitcnt++;
9742 	uniqtime32(&rbp->rb_timestamp);
9743 	/* Generate the crc for this record */
9744 	rec_crcgen(s, dep, rbp);
9745 
9746 	if (writeoptrecord(s, dep)) {
9747 		if (MD_MNSET_SETNO(s->s_setno)) {
9748 			hit_err = 1;
9749 		}
9750 		s->s_optwaiterr++;
9751 	}
9752 	if (MD_MNSET_SETNO(s->s_setno)) {
9753 		/* If last thread out, release single_thread_start */
9754 		if (! (--s->s_optcmtcnt)) {
9755 			single_thread_end(s);
9756 			s->s_opthavelck = 0;
9757 		}
9758 		/*
9759 		 * If this thread had a writeoptrecords failure, then
9760 		 * need to send message to master.
9761 		 * But, multiple threads could all be running on the
9762 		 * same single_thread_start, so serialize the threads
9763 		 * by making each thread grab single_thread_start.
9764 		 *
9765 		 * After return from sending message to master message,
9766 		 * replicas associated with optimized record will havei
9767 		 * been changed (via a callback from the master to all
9768 		 * nodes), so retry call to writeoptrecord.
9769 		 * This code is replacing the call to writeretry that
9770 		 * occurs for the local and traditional disksets.
9771 		 */
9772 		if (hit_err) {
9773 			single_thread_start(s);
9774 			/*
9775 			 * If > 50% of replicas are alive then continue
9776 			 * to send message to master until writeoptrecord
9777 			 * succeeds.  For now, assume that minor name,
9778 			 * major number on this node is the same as on
9779 			 * the master node.  Once devids are turned on
9780 			 * for MN disksets, can send devid.
9781 			 */
9782 			kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
9783 			msg_recerr = kmem_zalloc(
9784 			    sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
9785 			while (!(md_get_setstatus(s->s_setno) &
9786 			    MD_SET_TOOFEW)) {
9787 				bzero((caddr_t)msg_recerr,
9788 				    sizeof (md_mn_msg_mddb_optrecerr_t));
9789 				lbp = s->s_lbp;
9790 				mnlbp = (mddb_mnlb_t *)lbp;
9791 				for (i = 0; i < 2; i++) {
9792 					li = dep->de_optinfo[i].o_li;
9793 					lp = &lbp->lb_locators[li];
9794 					for (j = 0; j < MD_MNMAXSIDES; j++) {
9795 						mnslp =
9796 						    &mnlbp->
9797 						    lb_mnsidelocators[j][li];
9798 						if (mnslp->mnl_sideno ==
9799 						    s->s_sideno)
9800 							break;
9801 					}
9802 					if (j == MD_MNMAXSIDES)
9803 						continue;
9804 
9805 					dn = &lbp->
9806 					    lb_drvnm[mnslp->mnl_drvnm_index];
9807 					recerr = &msg_recerr->msg_recerr[i];
9808 					recerr->r_li = li;
9809 					recerr->r_flags =
9810 					    dep->de_optinfo[i].o_flags;
9811 					recerr->r_blkno = lp->l_blkno;
9812 					recerr->r_mnum = md_getminor(lp->l_dev);
9813 					(void) strncpy(recerr->r_driver_name,
9814 					    dn->dn_data, MD_MAXDRVNM);
9815 				}
9816 
9817 				/* Release locks */
9818 				single_thread_end(s);
9819 				mutex_exit(SETMUTEX(s->s_setno));
9820 
9821 				/*
9822 				 * Send message to master about optimized
9823 				 * record failure.  After return, master
9824 				 * should have marked failed replicas
9825 				 * and sent parse message to slaves causing
9826 				 * slaves to have fixed up the optimized
9827 				 * record.
9828 				 * On return from ksend_message, retry
9829 				 * the write since this node should have fixed
9830 				 * the optimized resync records it owns.
9831 				 */
9832 				rval = mdmn_ksend_message(s->s_setno,
9833 				    MD_MN_MSG_MDDB_OPTRECERR,
9834 				    MD_MSGF_NO_BCAST, 0,
9835 				    (char *)msg_recerr,
9836 				    sizeof (md_mn_msg_mddb_optrecerr_t),
9837 				    kres);
9838 				if (!MDMN_KSEND_MSG_OK(rval, kres)) {
9839 					cmn_err(CE_WARN, "mddb_commitrec: "
9840 					    "Unable to send optimized "
9841 					    "resync record failure "
9842 					    "message to other nodes in "
9843 					    "diskset %s\n", s->s_setname);
9844 					mdmn_ksend_show_error(rval, kres,
9845 					    "MD_MN_MSG_MDDB_OPTRECERR");
9846 				}
9847 
9848 				/* Regrab locks */
9849 				mutex_enter(SETMUTEX(s->s_setno));
9850 				single_thread_start(s);
9851 
9852 				/* Start over in case mddb changed */
9853 				for (dbp = s->s_dbp; dbp != NULL;
9854 				    dbp = dbp->db_next) {
9855 					for (dep = dbp->db_firstentry; dep;
9856 					    dep = dep->de_next) {
9857 						if (dep->de_recid == DBID(id))
9858 							break;
9859 					}
9860 					if (dep)
9861 						break;
9862 				}
9863 				if (dep) {
9864 					rbp = dep->de_rb;
9865 					rbp->rb_commitcnt++;
9866 					uniqtime32(&rbp->rb_timestamp);
9867 					/* Generate the crc for this record */
9868 					rec_crcgen(s, dep, rbp);
9869 
9870 					/*
9871 					 * If writeoptrecord succeeds, then
9872 					 * break out.
9873 					 */
9874 					if (!(writeoptrecord(s, dep)))
9875 						break;
9876 				}
9877 				if (--retry == 0) {
9878 					cmn_err(CE_WARN, "mddb_commitrec: "
9879 					    "giving up writing optimized "
9880 					    "resync record for "
9881 					    "diskset %s, device %s,%d "
9882 					    "blkno 0x%x, flags 0x%x\n",
9883 					    s->s_setname, recerr->r_driver_name,
9884 					    recerr->r_mnum, recerr->r_blkno,
9885 					    recerr->r_flags);
9886 					gave_up++;
9887 					break;
9888 				}
9889 			}
9890 			kmem_free(kres, sizeof (md_mn_kresult_t));
9891 			kmem_free(msg_recerr,
9892 			    sizeof (md_mn_msg_mddb_optrecerr_t));
9893 
9894 			/* Resync record should be fixed - if possible */
9895 			s->s_optwaiterr--;
9896 			if (s->s_optwaiterr == 0) {
9897 				/* All errors have been handled */
9898 				if (s->s_opthungerr) {
9899 					s->s_opthungerr = 0;
9900 					cv_broadcast(&s->s_opthungerr_cv);
9901 				}
9902 			}
9903 			single_thread_end(s);
9904 			mddb_setexit(s);
9905 			if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
9906 				return (MDDB_E_NOTNOW);
9907 			} else if (gave_up) {
9908 				return (MDDB_E_STALE);
9909 			} else {
9910 				return (0);
9911 			}
9912 		}
9913 	} else {
9914 		/* If set is a traditional or local set */
9915 		if (! (--s->s_optcmtcnt)) {
9916 			err = 0;
9917 			if (s->s_optwaiterr) {
9918 				err = writeretry(s);
9919 				s->s_optwaiterr = 0;
9920 				if (s->s_opthungerr) {
9921 					s->s_opthungerr = 0;
9922 					cv_broadcast(&s->s_opthungerr_cv);
9923 				}
9924 			}
9925 			single_thread_end(s);
9926 			s->s_opthavelck = 0;
9927 			mddb_setexit(s);
9928 			if (err)
9929 				return (MDDB_E_NOTNOW);
9930 			return (0);
9931 		}
9932 		if (s->s_optwaiterr) {
9933 			while (s->s_optwaiterr) {
9934 				s->s_opthungerr = 1;
9935 				cv_wait(&s->s_opthungerr_cv,
9936 				    SETMUTEX(s->s_setno));
9937 			}
9938 			if (checkstate(s, MDDB_NOPROBE)) {
9939 				mddb_setexit(s);
9940 				return (MDDB_E_NOTNOW);
9941 			}
9942 		}
9943 	}
9944 
9945 	mddb_setexit(s);
9946 	return (0);
9947 }
9948 
9949 int
mddb_commitrecs(mddb_recid_t ids[])9950 mddb_commitrecs(
9951 	mddb_recid_t	ids[]
9952 )
9953 {
9954 	mddb_set_t	*s;
9955 	mddb_db_t	*dbp;
9956 	mddb_de_ic_t	*dep;
9957 	mddb_rb32_t	*rbp;
9958 	mddb_rb32_t	*saverbp;
9959 	mddb_lb_t	*lbp;
9960 	int		li;
9961 	uint_t		checksum;
9962 	mddb_recid_t	*idp;
9963 	int		err = 0;
9964 	set_t		setno;
9965 
9966 	if (panicstr)
9967 		cmn_err(CE_PANIC, "md: mddb: commit not allowed");
9968 
9969 	/*
9970 	 * scan through and make sure ids are from the same set
9971 	 */
9972 	setno = DBSET(ids[0]);
9973 	for (idp = ids; *idp != NULL; idp++)
9974 		ASSERT(DBSET(*idp) == setno);
9975 
9976 	s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
9977 
9978 	if (checkstate(s, MDDB_PROBE)) {
9979 		mddb_setexit(s);
9980 		return (MDDB_E_NOTNOW);
9981 	}
9982 
9983 	ASSERT(s->s_lbp != NULL);
9984 	err = 0;
9985 
9986 	if (! ids[0]) {
9987 		mddb_setexit(s);
9988 		return (0);
9989 	}
9990 
9991 	single_thread_start(s);
9992 	/*
9993 	 * scan through and make sure ids all exist
9994 	 */
9995 	for (idp = ids; *idp != NULL; idp++) {
9996 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9997 			for (dep = dbp->db_firstentry; dep;
9998 			    dep = dep->de_next) {
9999 				if (dep->de_recid == DBID(*idp))
10000 					break;
10001 			}
10002 			if (dep != NULL)
10003 				break;
10004 		}
10005 		if (dep == NULL) {
10006 			single_thread_end(s);
10007 			mddb_setexit(s);
10008 			return (MDDB_E_NORECORD);
10009 		}
10010 	}
10011 
10012 	/*
10013 	 * scan through records fix commit counts and
10014 	 * zero fiddles and update time stamp and rechecksum record
10015 	 */
10016 	checksum = 0;
10017 	idp = ids;
10018 	saverbp = NULL;
10019 	while (*idp) {
10020 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10021 			for (dep = dbp->db_firstentry; dep;
10022 			    dep = dep->de_next) {
10023 				if (dep->de_recid == DBID(*idp))
10024 					break;
10025 			}
10026 			if (dep != NULL)
10027 				break;
10028 		}
10029 		rbp = dep->de_rb;
10030 		ASSERT(! (dep->de_flags & MDDB_F_OPT));
10031 
10032 		getuserdata(setno, dep);
10033 		/* Don't do fiddles for CHANGE LOG records */
10034 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
10035 			checksum ^= rbp->rb_checksum_fiddle;
10036 			rbp->rb_checksum_fiddle = 0;
10037 			checksum ^= rbp->rb_checksum;
10038 			saverbp = rbp;
10039 		}
10040 		rbp->rb_commitcnt++;
10041 		uniqtime32(&rbp->rb_timestamp);
10042 		/* Generate the crc for this record */
10043 		rec_crcgen(s, dep, rbp);
10044 
10045 		/* Don't do fiddles for CHANGE LOG records */
10046 		if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
10047 			checksum ^= rbp->rb_checksum;
10048 		}
10049 		idp++;
10050 	}
10051 
10052 	if (saverbp)
10053 		saverbp->rb_checksum_fiddle = checksum;
10054 
10055 	/*
10056 	 * If this is a MN set but we are not the master, then we are not
10057 	 * supposed to update the mddb on disk. So we finish at this point.
10058 	 */
10059 	if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
10060 	    (md_set[setno].s_am_i_master == 0)) {
10061 		single_thread_end(s);
10062 		mddb_setexit(s);
10063 		return (0);
10064 	}
10065 
10066 	lbp = s->s_lbp;
10067 	for (li = 0; li < lbp->lb_loccnt; li++) {
10068 		if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
10069 			continue;
10070 
10071 		idp = ids;
10072 		while (*idp) {
10073 			for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10074 				dep = dbp->db_firstentry;
10075 				while (dep && (dep->de_recid != DBID(*idp)))
10076 					dep = dep->de_next;
10077 				if (dep != NULL)
10078 					break;
10079 			}
10080 			rbp = dep->de_rb;
10081 			err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
10082 			    dep->de_blkcount, li, (mddb_bf_t **)0,
10083 			    MDDB_WR_ONLY_MASTER);
10084 			if (err)
10085 				break;
10086 			idp++;
10087 		}
10088 		if (err)
10089 			break;
10090 	}
10091 	if (err) {
10092 		if (writeretry(s)) {
10093 			single_thread_end(s);
10094 			mddb_setexit(s);
10095 			return (MDDB_E_NOTNOW);
10096 		}
10097 	}
10098 	single_thread_end(s);
10099 	mddb_setexit(s);
10100 	return (0);
10101 }
10102 
10103 mddb_recid_t
mddb_makerecid(set_t setno,mddb_recid_t id)10104 mddb_makerecid(
10105 	set_t		setno,
10106 	mddb_recid_t	id
10107 )
10108 {
10109 	return (MAKERECID(setno, id));
10110 }
10111 
10112 set_t
mddb_getsetnum(mddb_recid_t id)10113 mddb_getsetnum(
10114 	mddb_recid_t	id
10115 )
10116 {
10117 	return (DBSET(id));
10118 }
10119 
10120 char *
mddb_getsetname(set_t setno)10121 mddb_getsetname(
10122 	set_t	setno
10123 )
10124 {
10125 	return (((mddb_set_t *)md_set[setno].s_db)->s_setname);
10126 }
10127 
10128 side_t
mddb_getsidenum(set_t setno)10129 mddb_getsidenum(
10130 	set_t	setno
10131 )
10132 {
10133 	if (md_set[setno].s_db)
10134 		return (((mddb_set_t *)md_set[setno].s_db)->s_sideno);
10135 	return (0);
10136 }
10137 
10138 int
mddb_ownset(set_t setno)10139 mddb_ownset(
10140 	set_t	setno
10141 )
10142 {
10143 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db)
10144 		return (1);
10145 
10146 	if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp)
10147 		return (1);
10148 
10149 	return (0);
10150 }
10151 
10152 /*ARGSUSED*/
10153 int
getmed_ioctl(mddb_med_parm_t * medpp,int mode)10154 getmed_ioctl(mddb_med_parm_t *medpp, int mode)
10155 {
10156 	mddb_set_t	*s;
10157 	int		err = 0;
10158 	set_t		setno = medpp->med_setno;
10159 	md_error_t	*ep = &medpp->med_mde;
10160 
10161 	mdclrerror(ep);
10162 
10163 	if (setno >= md_nsets)
10164 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10165 
10166 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10167 		return (0);
10168 
10169 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10170 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10171 
10172 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10173 		return (mddbstatus2error(ep, err, NODEV32, setno));
10174 
10175 	medpp->med = s->s_med;			/* structure assignment */
10176 
10177 	mddb_setexit(s);
10178 
10179 	return (0);
10180 }
10181 
10182 int
setmed_ioctl(mddb_med_parm_t * medpp,int mode)10183 setmed_ioctl(mddb_med_parm_t *medpp, int mode)
10184 {
10185 
10186 	mddb_set_t	*s;
10187 	int		err = 0;
10188 	set_t		setno = medpp->med_setno;
10189 	md_error_t	*ep = &medpp->med_mde;
10190 
10191 	mdclrerror(ep);
10192 
10193 	if ((mode & FWRITE) == 0)
10194 		return (mdsyserror(ep, EACCES));
10195 
10196 	/*
10197 	 * This should be the only thing that prevents LOCAL sets from having
10198 	 * mediators, at least in the kernel, userland needs to have some code
10199 	 * written.
10200 	 */
10201 	if (setno == MD_LOCAL_SET)
10202 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10203 
10204 	if (setno >= md_nsets)
10205 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10206 
10207 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10208 		return (0);
10209 
10210 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10211 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10212 
10213 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10214 		return (mddbstatus2error(ep, err, NODEV32, setno));
10215 
10216 	s->s_med = medpp->med;			/* structure assignment */
10217 
10218 	mddb_setexit(s);
10219 
10220 	return (0);
10221 }
10222 
10223 int
updmed_ioctl(mddb_med_upd_parm_t * medpp,int mode)10224 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
10225 {
10226 
10227 	mddb_set_t	*s;
10228 	int		err = 0;
10229 	set_t		setno = medpp->med_setno;
10230 	md_error_t	*ep = &medpp->med_mde;
10231 
10232 	mdclrerror(ep);
10233 
10234 	if ((mode & FWRITE) == 0)
10235 		return (mdsyserror(ep, EACCES));
10236 
10237 	if (setno >= md_nsets)
10238 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10239 
10240 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10241 		return (0);
10242 
10243 	if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10244 		return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10245 
10246 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10247 		return (mddbstatus2error(ep, err, NODEV32, setno));
10248 
10249 	single_thread_start(s);
10250 	(void) upd_med(s, "updmed_ioctl()");
10251 	single_thread_end(s);
10252 
10253 	mddb_setexit(s);
10254 
10255 	return (0);
10256 }
10257 
10258 int
take_set(mddb_config_t * cp,int mode)10259 take_set(mddb_config_t *cp, int mode)
10260 {
10261 	int			err = 0;
10262 	mddb_med_upd_parm_t	medup;
10263 	set_t			setno = cp->c_setno;
10264 	md_error_t		*ep = &cp->c_mde;
10265 	int			snarf_ok = 0;
10266 
10267 	if (md_get_setstatus(setno) & MD_SET_SNARFED)
10268 		return (0);
10269 
10270 	err = mddb_configure(MDDB_GETDEV, cp);
10271 	if (! err && mdisok(ep)) {
10272 		if (md_snarf_db_set(setno, ep) != 0)
10273 			goto out;
10274 		snarf_ok = 1;
10275 	}
10276 
10277 	/*
10278 	 * Clear replicated import flag since this is
10279 	 * used during the take of a diskset with
10280 	 * previously unresolved replicated disks.
10281 	 */
10282 	if (md_get_setstatus(setno) &
10283 	    MD_SET_REPLICATED_IMPORT) {
10284 		md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT);
10285 	}
10286 
10287 	if (! err && mdisok(ep)) {
10288 		if (! cp->c_flags) {
10289 			medup.med_setno = setno;
10290 			mdclrerror(&medup.med_mde);
10291 
10292 			err = updmed_ioctl(&medup, mode);
10293 			if (! mdisok(&medup.med_mde))
10294 				(void) mdstealerror(ep, &medup.med_mde);
10295 		}
10296 	}
10297 
10298 out:
10299 	/*
10300 	 * In the case that the snarf failed, the diskset is
10301 	 * left with s_db set, but s_lbp not set.  The node is not
10302 	 * an owner of the set and won't be allowed to release the
10303 	 * diskset in order to cleanup.  With s_db set, any call to the
10304 	 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist)
10305 	 * will cause the diskset to be loaded.  So, cleanup the diskset so
10306 	 * that an inadvertent start of the diskset doesn't happen later.
10307 	 */
10308 	if ((snarf_ok == 0) && md_set[setno].s_db &&
10309 	    (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) {
10310 		mutex_enter(&mddb_lock);
10311 		mddb_unload_set(setno);
10312 		mutex_exit(&mddb_lock);
10313 	}
10314 	return (err);
10315 }
10316 
10317 /*ARGSUSED*/
10318 int
release_set(mddb_config_t * cp,int mode)10319 release_set(mddb_config_t *cp, int mode)
10320 {
10321 	int			err = 0;
10322 	set_t			setno = cp->c_setno;
10323 	md_error_t		*ep = &cp->c_mde;
10324 
10325 	/*
10326 	 * Data integrity check
10327 	 */
10328 	if (setno >= md_nsets)
10329 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10330 
10331 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
10332 	md_haltsnarf_enter(setno);
10333 	/*
10334 	 * Attempt to mark set as HOLD. If it is marked as HOLD, this means
10335 	 * that the mirror code is currently searching all mirrors for a
10336 	 * errored component that needs a hotspare. While this search is in
10337 	 * progress, we cannot release the set and thgerefore we return EBUSY.
10338 	 * Once we have set HOLD, the mirror function (check_4_hotspares) will
10339 	 * block before the search until the set is released.
10340 	 */
10341 	if (md_holdset_testandenter(setno) != 0) {
10342 		md_haltsnarf_exit(setno);
10343 		rw_exit(&md_unit_array_rw.lock);
10344 		return (EBUSY);
10345 	}
10346 
10347 	if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0)
10348 		err = mddb_configure(MDDB_RELEASESET, cp);
10349 
10350 	md_holdset_exit(setno);
10351 	md_haltsnarf_exit(setno);
10352 	rw_exit(&md_unit_array_rw.lock);
10353 
10354 	if (! err && mdisok(ep)) {
10355 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
10356 		    NODEV64);
10357 	}
10358 
10359 	return (err);
10360 }
10361 
10362 int
gettag_ioctl(mddb_dtag_get_parm_t * dtgpp,int mode)10363 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
10364 {
10365 	mddb_set_t	*s;
10366 	int		err = 0;
10367 	mddb_dtag_lst_t	*dtlp;
10368 	set_t		setno = dtgpp->dtgp_setno;
10369 	md_error_t	*ep = &dtgpp->dtgp_mde;
10370 
10371 	mdclrerror(ep);
10372 
10373 	if ((mode & FREAD) == 0)
10374 		return (mdsyserror(ep, EACCES));
10375 
10376 	if (setno >= md_nsets)
10377 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10378 
10379 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10380 		return (0);
10381 
10382 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10383 		return (mddbstatus2error(ep, err, NODEV32, setno));
10384 
10385 	/*
10386 	 * Data tags not supported on MN sets so return invalid operation.
10387 	 * This ioctl could be called before the mddb has been read in so
10388 	 * the set status may not yet be set to MNSET, so code following
10389 	 * this check must handle a MN diskset properly.
10390 	 */
10391 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10392 		mddb_setexit(s);
10393 		return (mderror(ep, MDE_INVAL_MNOP));
10394 	}
10395 
10396 	/* s_dtlp is NULL for MN diskset */
10397 	dtlp = s->s_dtlp;
10398 	while (dtlp != NULL) {
10399 		if (dtgpp->dtgp_dt.dt_id == 0 ||
10400 		    dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) {
10401 			bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
10402 			    sizeof (mddb_dtag_t));
10403 			break;
10404 		}
10405 		dtlp = dtlp->dtl_nx;
10406 	}
10407 
10408 	/* Walked the whole list and id not found, return error */
10409 	if (dtlp == (mddb_dtag_lst_t *)NULL) {
10410 		mddb_setexit(s);
10411 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10412 	}
10413 
10414 	mddb_setexit(s);
10415 
10416 	return (0);
10417 }
10418 
10419 int
usetag_ioctl(mddb_dtag_use_parm_t * dtupp,int mode)10420 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
10421 {
10422 	mddb_set_t	*s;
10423 	int		err = 0;
10424 	mddb_config_t	*cp;
10425 	mddb_ri_t	*trip = NULL;
10426 	mddb_dtag_t	*dtagp = NULL;
10427 	set_t		setno = dtupp->dtup_setno;
10428 	md_error_t	*ep = &dtupp->dtup_mde;
10429 
10430 	mdclrerror(ep);
10431 
10432 	if ((mode & FWRITE) == 0)
10433 		return (mdsyserror(ep, EACCES));
10434 
10435 	if (setno >= md_nsets)
10436 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10437 
10438 	if (dtupp->dtup_id < 0)
10439 		return (mdsyserror(ep, EINVAL));
10440 	else if (dtupp->dtup_id == 0)
10441 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10442 
10443 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10444 		return (0);
10445 
10446 	if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
10447 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10448 
10449 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10450 		return (mddbstatus2error(ep, err, NODEV32, setno));
10451 
10452 	/*
10453 	 * Data tags not supported on MN sets so return invalid operation.
10454 	 * This ioctl could be called before the mddb has been read in so
10455 	 * the set status may not yet be set to MNSET, so code following
10456 	 * this check must handle a MN diskset properly.
10457 	 */
10458 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10459 		mddb_setexit(s);
10460 		return (mderror(ep, MDE_INVAL_MNOP));
10461 	}
10462 
10463 	/* Validate and find the id requested - nothing found if MN diskset */
10464 	if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
10465 		mddb_setexit(s);
10466 		return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10467 	}
10468 
10469 	/* Usetag is only valid when more than one tag exists */
10470 	if (dtl_cntl(s) < 2) {
10471 		mddb_setexit(s);
10472 		return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10473 	}
10474 
10475 	/* Put the selected tag in place */
10476 	dt_setup(s, dtagp);
10477 
10478 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10479 
10480 	/* Save the hint information */
10481 	trip = save_rip(s);
10482 
10483 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10484 	cp->c_setno = setno;
10485 	cp->c_sideno = s->s_sideno;
10486 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10487 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10488 	cp->c_med = s->s_med;				/* struct assignment */
10489 
10490 	mddb_setexit(s);
10491 
10492 	s = NULL;
10493 
10494 	/* shorthand */
10495 	setno = cp->c_setno;
10496 
10497 	/* Let unload know not to free the tag */
10498 	md_set_setstatus(setno, MD_SET_KEEPTAG);
10499 
10500 	/* Release the set */
10501 	if (err = release_set(cp, mode))
10502 		goto out;
10503 
10504 	if (! mdisok(&cp->c_mde)) {
10505 		(void) mdstealerror(ep, &cp->c_mde);
10506 		err = 1;
10507 		goto out;
10508 	}
10509 
10510 	/* Re-init set using the saved mddb_config_t structure */
10511 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10512 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10513 			err = mddbstatus2error(ep, err, NODEV32, setno);
10514 			goto out;
10515 		}
10516 	}
10517 
10518 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10519 
10520 	/* use the saved rip structure */
10521 	s->s_rip = trip;
10522 	trip = (mddb_ri_t *)NULL;
10523 
10524 	/* Let the take code know a tag is being used */
10525 	md_set_setstatus(setno, MD_SET_USETAG);
10526 
10527 	mddb_setexit(s);
10528 
10529 	s = NULL;
10530 
10531 	/* Take the set */
10532 	if (err = take_set(cp, mode))
10533 		goto out;
10534 
10535 	if (! mdisok(&cp->c_mde))
10536 		(void) mdstealerror(ep, &cp->c_mde);
10537 
10538 out:
10539 	md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));
10540 
10541 	kmem_free(cp, sizeof (mddb_config_t));
10542 
10543 	if (trip)
10544 		free_rip(&trip);
10545 
10546 	if (s)
10547 		mddb_setexit(s);
10548 
10549 	return (err);
10550 }
10551 
10552 int
accept_ioctl(mddb_accept_parm_t * accpp,int mode)10553 accept_ioctl(mddb_accept_parm_t *accpp, int mode)
10554 {
10555 	mddb_set_t	*s;
10556 	int		err = 0;
10557 	mddb_config_t	*cp;
10558 	mddb_ri_t	*trip = NULL;
10559 	set_t		setno = accpp->accp_setno;
10560 	md_error_t	*ep = &accpp->accp_mde;
10561 
10562 	mdclrerror(ep);
10563 
10564 	if ((mode & FWRITE) == 0)
10565 		return (mdsyserror(ep, EACCES));
10566 
10567 	if (setno >= md_nsets)
10568 		return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10569 
10570 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10571 		return (0);
10572 
10573 	if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0)
10574 		return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));
10575 
10576 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10577 		return (mddbstatus2error(ep, err, NODEV32, setno));
10578 
10579 	/*
10580 	 * Data tags not supported on MN sets so return invalid operation.
10581 	 * mddb is guaranteed to be incore at this point, so this
10582 	 * check will catch all MN disksets.
10583 	 */
10584 	if (md_get_setstatus(setno) & MD_SET_MNSET) {
10585 		mddb_setexit(s);
10586 		return (mderror(ep, MDE_INVAL_MNOP));
10587 	}
10588 
10589 	cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10590 
10591 	trip = save_rip(s);
10592 
10593 	cp->c_timestamp = s->s_ident.createtime;	/* struct assignment */
10594 	cp->c_setno = setno;
10595 	cp->c_sideno = s->s_sideno;
10596 	(void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10597 	cp->c_setname[MD_MAX_SETNAME] = '\0';
10598 	cp->c_med = s->s_med;				/* struct assignment */
10599 
10600 	/* Tag the data */
10601 	if (err = set_dtag(s, ep)) {
10602 		err = mdsyserror(ep, err);
10603 		goto out;
10604 	}
10605 
10606 	/* If we had a BADTAG, it will be re-written, so clear the bit. */
10607 	if (md_get_setstatus(setno) & MD_SET_BADTAG)
10608 		md_clr_setstatus(setno, MD_SET_BADTAG);
10609 
10610 	if (err = dt_write(s)) {
10611 		err = mdsyserror(ep, err);
10612 		goto out;
10613 	}
10614 
10615 	mddb_setexit(s);
10616 
10617 	s = NULL;
10618 
10619 	/* shorthand */
10620 	setno = cp->c_setno;
10621 
10622 	/* Clear the keeptag */
10623 	md_clr_setstatus(setno, MD_SET_KEEPTAG);
10624 
10625 	/* Release the set */
10626 	if (err = release_set(cp, mode))
10627 		goto out;
10628 
10629 	if (! mdisok(&cp->c_mde)) {
10630 		(void) mdstealerror(ep, &cp->c_mde);
10631 		goto out;
10632 	}
10633 
10634 	/* Re-init set using the saved mddb_config_t structure */
10635 	if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10636 		if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10637 			err = mddbstatus2error(ep, err, NODEV32, setno);
10638 			goto out;
10639 		}
10640 	}
10641 
10642 	ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10643 
10644 	/* Free the allocated rip structure */
10645 	if (s->s_rip != (mddb_ri_t *)NULL)
10646 		free_rip(&s->s_rip);
10647 
10648 	/* use the saved rip structure */
10649 	s->s_rip = trip;
10650 	trip = (mddb_ri_t *)NULL;
10651 
10652 	/* Let the set init code know an accept is in progress */
10653 	md_set_setstatus(setno, MD_SET_ACCEPT);
10654 
10655 	mddb_setexit(s);
10656 
10657 	s = NULL;
10658 
10659 	/* Take the set */
10660 	if (err = take_set(cp, mode))
10661 		goto out;
10662 
10663 	if (! mdisok(&cp->c_mde))
10664 		(void) mdstealerror(ep, &cp->c_mde);
10665 
10666 out:
10667 	md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));
10668 
10669 	kmem_free(cp, sizeof (mddb_config_t));
10670 
10671 	if (trip)
10672 		free_rip(&trip);
10673 
10674 	if (s)
10675 		mddb_setexit(s);
10676 
10677 	return (err);
10678 }
10679 
10680 /*
10681  * mddb_getinvlb_devid - cycles through the locator block and determines
10682  *		if the device id's for any of the replica disks are invalid.
10683  *		If so, it returns the diskname in the ctdptr.
10684  *	RETURN
10685  *		-1	Error
10686  *		cnt	number of invalid device id's
10687  */
10688 int
mddb_getinvlb_devid(set_t setno,int count,int size,char ** ctdptr)10689 mddb_getinvlb_devid(
10690 	set_t	setno,
10691 	int	count,
10692 	int	size,
10693 	char	**ctdptr
10694 )
10695 {
10696 	mddb_set_t	*s;
10697 	int		err = 0;
10698 	mddb_lb_t	*lbp;
10699 	int		li;
10700 	mddb_did_blk_t	*did_blk;
10701 	mddb_did_info_t	*did_info;
10702 	int		len;
10703 	int		cnt = 0;
10704 	char		*cptr;
10705 	md_name_suffix	*sn;
10706 	int		i, dont_add_it;
10707 	char		*tmpctd, *diskname;
10708 	char		*tmpname;
10709 
10710 	cptr = *ctdptr;
10711 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
10712 		return (-1);
10713 	}
10714 
10715 	single_thread_start(s);
10716 	lbp = s->s_lbp;
10717 
10718 	if (lbp->lb_setno != setno) {
10719 		single_thread_end(s);
10720 		mddb_setexit(s);
10721 		return (-1);
10722 	}
10723 
10724 	/* check for lb being devid style */
10725 	if (lbp->lb_flags & MDDB_DEVID_STYLE) {
10726 		did_blk = s->s_did_icp->did_ic_blkp;
10727 		for (li = 0; li < lbp->lb_loccnt; li++) {
10728 			did_info = &(did_blk->blk_info[li]);
10729 			/* Only if devid exists and isn't valid */
10730 			if ((did_info->info_flags & MDDB_DID_EXISTS) &&
10731 			    !(did_info->info_flags & MDDB_DID_VALID)) {
10732 				/*
10733 				 * if we count more invalid did's than
10734 				 * was passed in there's an error somewhere
10735 				 */
10736 				if (cnt++ > count) {
10737 					single_thread_end(s);
10738 					mddb_setexit(s);
10739 					return (-1);
10740 				}
10741 
10742 				/*
10743 				 * Future note: Need to do something here
10744 				 * for the MN diskset case when device ids
10745 				 * are supported in disksets.
10746 				 * Can't add until merging devids_in_diskset
10747 				 * code into code base.
10748 				 */
10749 
10750 				sn = &s->s_lnp->ln_suffixes[0][li];
10751 				/*
10752 				 * check to make sure length of device name is
10753 				 * not greater than computed first time through
10754 				 */
10755 				len = sn->suf_len;
10756 				if (len > size) {
10757 					single_thread_end(s);
10758 					mddb_setexit(s);
10759 					return (-1);
10760 				}
10761 				tmpctd = *ctdptr;
10762 				/* strip off slice part */
10763 				diskname = md_strdup(sn->suf_data);
10764 				tmpname = strrchr(diskname, 's');
10765 				*tmpname = '\0';
10766 				dont_add_it = 0;
10767 				/* look to see if diskname is already in list */
10768 				for (i = 0; i < (cnt-1); i++) {
10769 					if (strcmp(diskname, tmpctd) == 0) {
10770 						/* already there, don't add */
10771 						dont_add_it = 1;
10772 						break;
10773 					}
10774 					/* point to next diskname in list */
10775 					tmpctd += size;
10776 				}
10777 				if (dont_add_it == 0) {
10778 					/* add diskname to list */
10779 					(void) strcpy(cptr, diskname);
10780 					cptr += size;
10781 				}
10782 				kmem_free(diskname, strlen(sn->suf_data) + 1);
10783 			}
10784 		}
10785 	}
10786 	/* null terminate the list */
10787 	*cptr = '\0';
10788 	/*
10789 	 * need to save the new pointer so that calling routine can continue
10790 	 * to add information onto the end.
10791 	 */
10792 	*ctdptr = cptr;
10793 	single_thread_end(s);
10794 	mddb_setexit(s);
10795 	return (cnt);
10796 }
10797 
10798 /*
10799  * mddb_validate_lb - count the number of lb's with invalid device id's. Keep
10800  *		track of length of longest devicename.
10801  *	RETURN
10802  *		-1	error
10803  *		 cnt	number of lb's with invalid devid's
10804  */
10805 int
mddb_validate_lb(set_t setno,int * rmaxsz)10806 mddb_validate_lb(
10807 	set_t	setno,
10808 	int	*rmaxsz
10809 )
10810 {
10811 	mddb_set_t	*s;
10812 	int		err = 0;
10813 	mddb_lb_t	*lbp;
10814 	int		li;
10815 	mddb_did_blk_t	*did_blk;
10816 	mddb_did_info_t	*did_info;
10817 	int		len;
10818 	int		cnt = 0;
10819 
10820 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10821 		return (-1);
10822 
10823 	single_thread_start(s);
10824 	lbp = s->s_lbp;
10825 
10826 	if (lbp->lb_setno != setno) {
10827 		single_thread_end(s);
10828 		mddb_setexit(s);
10829 		return (-1);
10830 	}
10831 
10832 	/* lb must be in devid style */
10833 	if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0)
10834 		goto mvl_out;
10835 
10836 	did_blk = s->s_did_icp->did_ic_blkp;
10837 	for (li = 0; li < lbp->lb_loccnt; li++) {
10838 		char		*minor_name;
10839 		mddb_locator_t	*lp;
10840 		dev_t		ddi_dev;
10841 		ddi_devid_t	devid;
10842 		ddi_devid_t	rtn_devid = NULL;
10843 		int		get_rval;
10844 
10845 		did_info = &(did_blk->blk_info[li]);
10846 		if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) ||
10847 		    (did_info->info_flags & MDDB_DID_VALID))
10848 			continue;
10849 
10850 		/* Here we know, did exists but isn't valid */
10851 
10852 		lp = &lbp->lb_locators[li];
10853 		ddi_dev = expldev(lp->l_dev);
10854 		get_rval = mddb_devid_get(s, li, &devid, &minor_name);
10855 		ASSERT(get_rval == 1);
10856 		if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
10857 		    (ddi_devid_compare(rtn_devid, devid) == 0)) {
10858 			did_info->info_flags = MDDB_DID_VALID |
10859 			    MDDB_DID_EXISTS | MDDB_DID_UPDATED;
10860 		} else {
10861 			cnt++;
10862 			/*
10863 			 * Future note: Need to do something here
10864 			 * for the MN diskset case when device ids
10865 			 * are supported in disksets.
10866 			 * Can't add until merging devids_in_diskset
10867 			 * code into code base.
10868 			 */
10869 			len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len;
10870 			if (*rmaxsz < len)
10871 				*rmaxsz = len;
10872 		}
10873 		if (rtn_devid != NULL)
10874 			ddi_devid_free(rtn_devid);
10875 	}
10876 
10877 mvl_out:
10878 
10879 	if (push_lb(s) != 0)
10880 		cnt = -1;
10881 	(void) upd_med(s, "mddb_validate_lb(0)");
10882 	single_thread_end(s);
10883 	mddb_setexit(s);
10884 	return (cnt);
10885 }
10886 
10887 int
check_active_locators()10888 check_active_locators()
10889 {
10890 	mddb_set_t	*s;
10891 	mddb_lb_t	*lbp;
10892 	int		li;
10893 	int		active = 0;
10894 
10895 	mutex_enter(&mddb_lock);
10896 	/* there is nothing here..so we can unload */
10897 	if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) {
10898 		mutex_exit(&mddb_lock);
10899 		return (0);
10900 	}
10901 	s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
10902 	lbp = s->s_lbp;
10903 	if (lbp == NULL) {
10904 		mutex_exit(&mddb_lock);
10905 		return (0);
10906 	}
10907 
10908 	for (li = 0; li < lbp->lb_loccnt; li++) {
10909 		mddb_locator_t *lp = &lbp->lb_locators[li];
10910 		if (lp->l_flags & MDDB_F_ACTIVE) {
10911 			active = 1;
10912 			break;
10913 		}
10914 	}
10915 	mutex_exit(&mddb_lock);
10916 	return (active);
10917 }
10918 
10919 /*
10920  * regetoptrecord:
10921  * --------------
10922  *	Update the in-core optimized resync record contents by re-reading the
10923  *	record from the on-disk metadb.
10924  *	The contents of the resync record will be overwritten by calling this
10925  *	routine. This means that callers that require the previous contents to
10926  *	be preserved must save the data before calling this routine.
10927  *	Return values:
10928  *	0 - successfully read in resync record from a mddb
10929  *	1 - failure.  Unable to read resync record from either mddb.
10930  */
10931 static int
regetoptrecord(mddb_set_t * s,mddb_de_ic_t * dep)10932 regetoptrecord(
10933 	mddb_set_t	*s,
10934 	mddb_de_ic_t	*dep
10935 )
10936 {
10937 	mddb_lb_t	*lbp;
10938 	mddb_locator_t	*lp;
10939 	mddb_rb32_t	*rbp, *crbp;
10940 	int		li;
10941 	int		i;
10942 	int		err = 0;
10943 	size_t		recsize;
10944 
10945 #if defined(_ILP32) && !defined(lint)
10946 	ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
10947 #endif
10948 
10949 	recsize = dep->de_recsize;
10950 	crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
10951 
10952 	single_thread_start(s);
10953 	rbp = dep->de_rb;
10954 
10955 	dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
10956 	dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10957 
10958 	lbp = s->s_lbp;
10959 
10960 	for (i = 0; i < 2; i++) {
10961 		if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
10962 			continue;
10963 		li = dep->de_optinfo[i].o_li;
10964 		lp = &lbp->lb_locators[li];
10965 
10966 		if (! (lp->l_flags & MDDB_F_ACTIVE) ||
10967 		    (lp->l_flags & MDDB_F_EMASTER))
10968 			continue;
10969 
10970 		/*
10971 		 * re-read the optimized resync record with failfast set
10972 		 * since a failed disk could lead to a very long wait.
10973 		 */
10974 		err = readblklst(s, (caddr_t)rbp, dep->de_blks,
10975 		    dep->de_blkcount, li, B_FAILFAST);
10976 
10977 		if (err)
10978 			continue;
10979 
10980 		if (rbp->rb_magic != MDDB_MAGIC_RB)
10981 			continue;
10982 
10983 		if (revchk(MDDB_REV_RB, rbp->rb_revision))
10984 			continue;
10985 
10986 		/* Check the crc for this record */
10987 		if (rec_crcchk(s, dep, rbp)) {
10988 			continue;
10989 		}
10990 		dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
10991 
10992 		if (rbp == crbp) {
10993 			if (rbp->rb_checksum != crbp->rb_checksum)
10994 				dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10995 			break;
10996 		}
10997 		rbp = crbp;
10998 	}
10999 
11000 	single_thread_end(s);
11001 
11002 	if (rbp == crbp) {
11003 		rbp->rb_private = 0;
11004 		kmem_free((caddr_t)crbp, recsize);
11005 		return (0);
11006 	}
11007 	uniqtime32(&rbp->rb_timestamp);
11008 	/* Generate the crc for this record */
11009 	rec_crcgen(s, dep, rbp);
11010 	kmem_free((caddr_t)crbp, recsize);
11011 	return (1);
11012 }
11013 
11014 /*
11015  * mddb_reread_rr:
11016  *	Re-read the resync record from the on-disk copy. This is required for
11017  *	multi-node support so that a new mirror-owner can determine if a resync
11018  *	operation is required to guarantee data integrity.
11019  *
11020  * Arguments:
11021  *	setno	Associated set
11022  *	id	Resync record ID
11023  *
11024  * Return Value:
11025  *	0	successful reread
11026  *	-1	invalid set (not multi-node or non-existant)
11027  *	>0	metadb state invalid, failed to reread
11028  */
11029 int
mddb_reread_rr(set_t setno,mddb_recid_t id)11030 mddb_reread_rr(
11031 	set_t		setno,
11032 	mddb_recid_t	id
11033 )
11034 {
11035 	mddb_set_t	*s;
11036 	int		err = 0;
11037 	mddb_db_t	*dbp;
11038 	mddb_de_ic_t	*dep;
11039 
11040 	if (setno >= md_nsets)
11041 		return (-1);
11042 
11043 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
11044 		return (-1);
11045 
11046 	if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
11047 		mddb_setexit(s);
11048 		return (-1);
11049 	}
11050 
11051 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11052 		dep = dbp->db_firstentry;
11053 		while (dep && (dep->de_recid != DBID(id)))
11054 			dep = dep->de_next;
11055 		if (dep != NULL)
11056 			break;
11057 	}
11058 
11059 	if (dep != NULL) {
11060 		err = regetoptrecord(s, dep);
11061 	} else {
11062 		err = -1;
11063 	}
11064 	mddb_setexit(s);
11065 	return (err);
11066 }
11067 
11068 /*
11069  * Set owner associated with MN optimized resync record.
11070  *
11071  * Optimized records have an owner node associated with them in
11072  * a MN diskset.  The owner is only set on a node that is actively
11073  * writing to that record.  The other nodes will show that record
11074  * as having an invalid owner.  The owner for an optimized record
11075  * is used during fixoptrecord to determine which node should
11076  * write out the record when the replicas associated with that
11077  * optimized record have been changed.
11078  *
11079  * Called directly from mirror driver and not from an ioctl.
11080  *
11081  * Returns
11082  *	NULL if successful.
11083  *	MDDB_E_NORECORD if record not found.
11084  */
11085 int
mddb_setowner(mddb_recid_t id,md_mn_nodeid_t owner)11086 mddb_setowner(
11087 	mddb_recid_t		id,
11088 	md_mn_nodeid_t		owner
11089 )
11090 {
11091 	mddb_set_t		*s;
11092 	mddb_db_t		*dbp;
11093 	mddb_de_ic_t		*dep;
11094 	int			found = 0;
11095 
11096 
11097 	if (DBSET(id) >= md_nsets)
11098 		return (MDDB_E_NORECORD);
11099 
11100 	if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
11101 		return (MDDB_E_NORECORD);
11102 
11103 	id = DBID(id);
11104 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11105 		for (dep = dbp->db_firstentry;
11106 		    dep != NULL; dep = dep->de_next) {
11107 			if (dep->de_recid != id)
11108 				continue;
11109 			dep->de_owner_nodeid = owner;
11110 			found = 1;
11111 			break;
11112 		}
11113 		if (found)
11114 			break;
11115 	}
11116 
11117 	mddb_setexit(s);
11118 
11119 	if (!found) {
11120 		return (MDDB_E_NORECORD);
11121 	}
11122 
11123 	return (NULL);
11124 }
11125 
11126 /*
11127  * mddb_parse re-reads portions of the mddb from disk given a list
11128  * of good replicas to read from and flags describing
11129  * which portion of the mddb to read in.
11130  *
11131  * Used in a MN diskset when the master has made a change to some part
11132  * of the mddb and wants to relay this information to the slaves.
11133  */
11134 int
mddb_parse(mddb_parse_parm_t * mpp)11135 mddb_parse(mddb_parse_parm_t *mpp)
11136 {
11137 	mddb_set_t	*s;
11138 	int		err = 0;
11139 	mddb_locator_t	*lp, *old_lp;
11140 	mddb_lb_t	*lbp, *old_lbp;
11141 	int		rval = 0;
11142 	int		i, li;
11143 	int		found_good_one = 0;
11144 	mddb_ln_t	*lnp;
11145 	mddb_block_t	ln_blkcnt;
11146 	md_error_t	*ep = &mpp->c_mde;
11147 
11148 	if (mpp->c_setno >= md_nsets)
11149 		return (EINVAL);
11150 
11151 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11152 		return (0);
11153 
11154 	if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11155 		return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
11156 	}
11157 
11158 	if (!(MD_MNSET_SETNO(mpp->c_setno))) {
11159 		mddb_setexit_no_parse(s);
11160 		return (EINVAL);
11161 	}
11162 
11163 	/*
11164 	 * Master node initiated this request, so there's no work for
11165 	 * the master node to do.
11166 	 */
11167 	if (md_set[mpp->c_setno].s_am_i_master) {
11168 		mddb_setexit_no_parse(s);
11169 		return (rval);
11170 	}
11171 
11172 	single_thread_start(s);
11173 
11174 	if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
11175 		lbp = 0;
11176 		for (i = 0; i < MDDB_NLB; i++) {
11177 			/* Walk through master's active list */
11178 			if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
11179 				continue;
11180 			if (s->s_mbiarray[i] == NULL)
11181 				continue;
11182 
11183 			/* Assumes master blocks are already setup */
11184 			if (lbp == (mddb_lb_t *)NULL) {
11185 				lbp = (mddb_lb_t *)kmem_zalloc(
11186 				    dbtob(MDDB_MNLBCNT), KM_SLEEP);
11187 			}
11188 			err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);
11189 
11190 			if (err)
11191 				continue;
11192 
11193 			if (lbp->lb_magic != MDDB_MAGIC_LB)
11194 				continue;
11195 			if (lbp->lb_blkcnt != MDDB_MNLBCNT)
11196 				continue;
11197 			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
11198 				continue;
11199 			if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
11200 			    NULL))
11201 				continue;
11202 			if (lbp->lb_setno != s->s_setno)
11203 				continue;
11204 			/*
11205 			 * a commit count of zero means this locator has
11206 			 * been deleted
11207 			 */
11208 			if (lbp->lb_commitcnt == 0) {
11209 				continue;
11210 			}
11211 			/* Found a good locator - keep it */
11212 			found_good_one = 1;
11213 			break;
11214 		}
11215 
11216 		/*
11217 		 * If found a good copy of the mddb, then read it into
11218 		 * this node's locator block.  Fix up the set's s_mbiarray
11219 		 * pointer (master block incore array pointer) to be
11220 		 * in sync with the newly read in locator block.  If a
11221 		 * new mddb was added, read in the master blocks associated
11222 		 * with the new mddb.  If an mddb was deleted, free the
11223 		 * master blocks associated with deleted mddb.
11224 		 */
11225 		if (found_good_one)  {
11226 			/* Compare old and new view of mddb locator blocks */
11227 			old_lbp = s->s_lbp;
11228 			for (li = 0; li < lbp->lb_loccnt; li++) {
11229 				int	mn_set;
11230 
11231 				lp = &lbp->lb_locators[li];
11232 				old_lp = &old_lbp->lb_locators[li];
11233 
11234 				/* If old and new views match, continue */
11235 				if ((lp->l_flags & MDDB_F_ACTIVE) ==
11236 				    (old_lp->l_flags & MDDB_F_ACTIVE))
11237 					continue;
11238 
11239 				if (lp->l_flags & MDDB_F_ACTIVE) {
11240 					/*
11241 					 * If new mddb has been added - delete
11242 					 * old mbiarray and get new one.
11243 					 *
11244 					 * When devids are supported, will
11245 					 * need to get dev from devid.
11246 					 */
11247 					if (s->s_mbiarray[li]) {
11248 						free_mbipp(&s->s_mbiarray[li]);
11249 					}
11250 					/*
11251 					 * If getmasters fails, getmasters
11252 					 * will set appropriate error flags.
11253 					 */
11254 					s->s_mbiarray[li] = getmasters(s,
11255 					    md_expldev(lp->l_dev), lp->l_blkno,
11256 					    (uint_t *)&(lp->l_flags), &mn_set);
11257 				} else if (lp->l_flags & MDDB_F_DELETED) {
11258 					/*
11259 					 * If old one has been deleted -
11260 					 * delete old mbiarray.
11261 					 */
11262 					if (s->s_mbiarray[li]) {
11263 						free_mbipp(&s->s_mbiarray[li]);
11264 					}
11265 				}
11266 			}
11267 
11268 			/* Free this node's old view of mddb locator blocks */
11269 			kmem_free((caddr_t)s->s_lbp,
11270 			    dbtob(s->s_lbp->lb_blkcnt));
11271 			s->s_lbp = lbp;
11272 		} else {
11273 			if (lbp)
11274 				kmem_free(lbp, dbtob(MDDB_MNLBCNT));
11275 		}
11276 	}
11277 
11278 	if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
11279 		lnp = s->s_lnp;
11280 		lbp = s->s_lbp;
11281 		ln_blkcnt = lbp->lb_lnblkcnt;
11282 		s->s_lnp = NULL; /* readlocnames does this anyway */
11283 		for (li = 0; li < lbp->lb_loccnt; li++) {
11284 			lp = &lbp->lb_locators[li];
11285 
11286 			if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11287 			    (lp->l_flags & MDDB_F_EMASTER))
11288 				continue;
11289 
11290 			/* Successfully read the locator names */
11291 			if (readlocnames(s, li) == 0)
11292 				break;
11293 		}
11294 
11295 		if (li == lbp->lb_loccnt) {
11296 			/* Did not successfully read locnames; restore lnp */
11297 			s->s_lnp = lnp;
11298 		} else {
11299 			/* readlocnames successful, free old struct */
11300 			kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
11301 		}
11302 	}
11303 
11304 	if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
11305 		mddb_de_ic_t	*dep, *tdep, *first_dep, *dep2;
11306 		mddb_db_t	*dbp;
11307 		mddb_db32_t	*db32p;
11308 		mddb_de32_t	*de32p, *de32p2;
11309 		int		writeout;
11310 
11311 		lbp = s->s_lbp;
11312 		/*
11313 		 * Walk through directory block and directory entry incore
11314 		 * linked list looking for optimized resync records.
11315 		 * For each opt record found, re-read in directory block.
11316 		 * The directoy block consists of a number of directory
11317 		 * entries.  The directory entry for this opt record will
11318 		 * describe which 2 mddbs actually contain the resync record
11319 		 * since it could have been relocated by the master node
11320 		 * due to mddb failure or mddb deletion.  If this node
11321 		 * is the record owner for this opt record, then write out
11322 		 * the record to the 2 mddbs listed in the directory entry
11323 		 * if the mddbs locations are different than previously known.
11324 		 */
11325 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11326 			for (dep = dbp->db_firstentry; dep;
11327 			    dep = dep->de_next) {
11328 				/* Found an opt record */
11329 				if (dep->de_flags & MDDB_F_OPT)
11330 					break;
11331 			}
11332 			/* If no opt records found, go to next dbp */
11333 			if (dep == NULL)
11334 				continue;
11335 
11336 			/*
11337 			 * Reread directory block from disk since
11338 			 * master could have rewritten in during fixoptrecord.
11339 			 */
11340 			db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
11341 			    KM_SLEEP);
11342 			create_db32rec(db32p, dbp);
11343 			for (li = 0; li < lbp->lb_loccnt; li++) {
11344 				lp = &lbp->lb_locators[li];
11345 
11346 				if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11347 				    (lp->l_flags & MDDB_F_EMASTER))
11348 					continue;
11349 
11350 				err = readblks(s, (caddr_t)db32p,
11351 				    db32p->db32_blknum, 1, li);
11352 				if (err)
11353 					continue;
11354 
11355 				/* Reverify db; go to next mddb if bad */
11356 				if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
11357 				    (revchk(MDDB_REV_DB,
11358 				    db32p->db32_revision)) ||
11359 				    (crcchk(db32p, &db32p->db32_checksum,
11360 				    MDDB_BSIZE, NULL))) {
11361 					continue;
11362 				} else {
11363 					break;
11364 				}
11365 			}
11366 			/*
11367 			 * If all mddbs are unavailable then panic since
11368 			 * this slave cannot be allowed to continue out-of-sync
11369 			 * with the master node.  Since the optimized resync
11370 			 * records are written by all nodes, all nodes must
11371 			 * stay in sync with the master.
11372 			 *
11373 			 * This also handles the case when all storage
11374 			 * connectivity to a slave node has failed.  The
11375 			 * slave node will send an MDDB_OPTRECERR message to
11376 			 * the master node when the slave node has been unable
11377 			 * to write an optimized resync record to both
11378 			 * designated mddbs.  After the master has fixed the
11379 			 * optimized records to be on available mddbs, the
11380 			 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
11381 			 * is sent to all slave nodes.  If a slave node is
11382 			 * unable to access any mddb in order to read in the
11383 			 * relocated optimized resync record, then the slave
11384 			 * node must panic.
11385 			 */
11386 			if (li == lbp->lb_loccnt) {
11387 				kmem_free((caddr_t)db32p, MDDB_BSIZE);
11388 				cmn_err(CE_PANIC, "md: mddb: Node unable to "
11389 				    "access any SVM state database "
11390 				    "replicas for diskset %s\n", s->s_setname);
11391 			}
11392 			/*
11393 			 * Setup temp copy of linked list of de's.
11394 			 * Already have an incore copy, but need to walk
11395 			 * the directory entry list contained in the
11396 			 * new directory block that was just read in above.
11397 			 * After finding the directory entry of an opt record
11398 			 * by walking the incore list, find the corresponding
11399 			 * entry in the temporary list and then update
11400 			 * the incore directory entry record with
11401 			 * the (possibly changed) mddb location stored
11402 			 * for the optimized resync records.
11403 			 */
11404 			de32p = (mddb_de32_t *)
11405 			    ((void *) ((caddr_t)
11406 			    (&db32p->db32_firstentry)
11407 			    + sizeof (db32p->db32_firstentry)));
11408 			tdep = (mddb_de_ic_t *)
11409 			    kmem_zalloc(sizeof (mddb_de_ic_t) -
11410 			    sizeof (mddb_block_t) +
11411 			    sizeof (mddb_block_t) *
11412 			    de32p->de32_blkcount, KM_SLEEP);
11413 			de32tode(de32p, tdep);
11414 			first_dep = tdep;
11415 			while (de32p && de32p->de32_next) {
11416 				de32p2 = nextentry(de32p);
11417 				dep2 = (mddb_de_ic_t *)kmem_zalloc(
11418 				    sizeof (mddb_de_ic_t) -
11419 				    sizeof (mddb_block_t) +
11420 				    sizeof (mddb_block_t) *
11421 				    de32p2->de32_blkcount, KM_SLEEP);
11422 				de32tode(de32p2, dep2);
11423 				tdep->de_next = dep2;
11424 				tdep = dep2;
11425 				de32p = de32p2;
11426 			}
11427 
11428 			/* Now, walk the incore directory entry list */
11429 			for (dep = dbp->db_firstentry; dep;
11430 			    dep = dep->de_next) {
11431 				if (! (dep->de_flags & MDDB_F_OPT))
11432 					continue;
11433 				/*
11434 				 * Found an opt record in the incore copy.
11435 				 * Find the corresponding entry in the temp
11436 				 * list.  If anything has changed in the
11437 				 * opt record info between the incore copy
11438 				 * and the temp copy, update the incore copy
11439 				 * and set a flag to writeout the opt record
11440 				 * to the new mddb locations.
11441 				 */
11442 				for (tdep = first_dep; tdep;
11443 				    tdep = tdep->de_next) {
11444 					if (dep->de_recid == tdep->de_recid) {
11445 						writeout = 0;
11446 						/* Check first mddb location */
11447 						if ((dep->de_optinfo[0].o_li !=
11448 						    tdep->de_optinfo[0].o_li) ||
11449 						    (dep->de_optinfo[0].
11450 						    o_flags != tdep->de_optinfo
11451 						    [0].o_flags)) {
11452 							dep->de_optinfo[0] =
11453 							    tdep->de_optinfo[0];
11454 							writeout = 1;
11455 						}
11456 						/* Check second mddb location */
11457 						if ((dep->de_optinfo[1].o_li !=
11458 						    tdep->de_optinfo[1].o_li) ||
11459 						    (dep->de_optinfo[1].
11460 						    o_flags != tdep->de_optinfo
11461 						    [1].o_flags)) {
11462 							dep->de_optinfo[1] =
11463 							    tdep->de_optinfo[1];
11464 							writeout = 1;
11465 						}
11466 						/*
11467 						 * Record owner should rewrite
11468 						 * it
11469 						 */
11470 						if ((writeout) &&
11471 						    (dep->de_owner_nodeid ==
11472 						    md_set[mpp->c_setno].
11473 						    s_nodeid))
11474 							(void) writeoptrecord(s,
11475 							    dep);
11476 						break;
11477 					}
11478 				}
11479 			}
11480 			/*
11481 			 * Update the incore checksum information for this
11482 			 * directory block to match the newly read in checksum.
11483 			 * This should have only changed if the incore and
11484 			 * temp directory entries differed, but it takes
11485 			 * more code to do the check than to just update
11486 			 * the information everytime.
11487 			 */
11488 			dbp->db_checksum = db32p->db32_checksum;
11489 
11490 			/* Now free everything */
11491 			tdep = first_dep;
11492 			while (tdep) {
11493 				dep2 = tdep->de_next;
11494 				kmem_free((caddr_t)tdep,
11495 				    sizeofde(tdep));
11496 				tdep = dep2;
11497 			}
11498 			kmem_free((caddr_t)db32p, MDDB_BSIZE);
11499 		}
11500 		rval = 0;
11501 	}
11502 out:
11503 	single_thread_end(s);
11504 	mddb_setexit_no_parse(s);
11505 	return (rval);
11506 }
11507 
11508 int
mddb_block(mddb_block_parm_t * mbp)11509 mddb_block(mddb_block_parm_t *mbp)
11510 {
11511 	mddb_set_t	*s;
11512 	int		err = 0;
11513 	md_error_t	*ep = &mbp->c_mde;
11514 
11515 	if (mbp->c_setno >= md_nsets)
11516 		return (EINVAL);
11517 
11518 	/*
11519 	 * If the new_master flag is set for this setno we are in the middle
11520 	 * of a reconfig cycle, and blocking or unblocking is not needed.
11521 	 * Hence we can return success immediately
11522 	 */
11523 	if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) {
11524 		return (0);
11525 	}
11526 
11527 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11528 		return (0);
11529 
11530 	if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11531 		return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno));
11532 	}
11533 
11534 	if (!(MD_MNSET_SETNO(mbp->c_setno))) {
11535 		mddb_setexit_no_parse(s);
11536 		return (EINVAL);
11537 	}
11538 
11539 	single_thread_start(s);
11540 
11541 	if (mbp->c_blk_flags & MDDB_BLOCK_PARSE)
11542 		md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11543 
11544 	if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE)
11545 		md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11546 
11547 	single_thread_end(s);
11548 	mddb_setexit_no_parse(s);
11549 	return (err);
11550 }
11551 
11552 /*
11553  * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
11554  * to relocate any optimized resync records to available mddbs.
11555  * This routine is only called on the master node.
11556  *
11557  * Used in a MN diskset when a slave node has failed to write an optimized
11558  * resync record.  The failed mddb information is sent to the master node
11559  * so the master can relocate the optimized records, if possible.  If the
11560  * failed mddb information has a mddb marked as failed that was previously
11561  * marked active on the master, the master sets its incore mddb state to
11562  * EWRITE and sets the PARSE_LOCBLK flag.  The master node then attempts
11563  * to relocate any optimized records on the newly failed mddbs by calling
11564  * fixoptrecords.  (fixoptrecords will set the PARSE_OPTRECS flag if any
11565  * optimized records are relocated.)
11566  *
11567  * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
11568  * flags and will send a PARSE message to the slave nodes.  The PARSE_LOCBLK
11569  * flag causes the slave node to re-read in the locator block from disk.
11570  * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
11571  * blocks and write out any optimized resync records that have been
11572  * relocated to a different mddb.
11573  */
11574 int
mddb_optrecfix(mddb_optrec_parm_t * mop)11575 mddb_optrecfix(mddb_optrec_parm_t *mop)
11576 {
11577 	mddb_set_t		*s;
11578 	int			err = 0;
11579 	mddb_lb_t		*lbp;
11580 	mddb_mnlb_t		*mnlbp;
11581 	mddb_locator_t		*lp;
11582 	int			li;
11583 	mddb_mnsidelocator_t	*mnslp;
11584 	mddb_drvnm_t		*dn;
11585 	int			i, j;
11586 	md_replica_recerr_t	*recerr;
11587 	md_error_t		*ep = &mop->c_mde;
11588 	int			something_changed = 0;
11589 	int			alc, lc;
11590 	int			setno;
11591 
11592 	setno = mop->c_setno;
11593 	if (mop->c_setno >= md_nsets)
11594 		return (EINVAL);
11595 
11596 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11597 		return (0);
11598 
11599 	if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11600 		return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
11601 	}
11602 
11603 	if (!(MD_MNSET_SETNO(mop->c_setno))) {
11604 		mddb_setexit(s);
11605 		return (EINVAL);
11606 	}
11607 
11608 	single_thread_start(s);
11609 	lbp = s->s_lbp;
11610 	mnlbp = (mddb_mnlb_t *)lbp;
11611 
11612 	/*
11613 	 * If slave node has seen an mddb failure, but the master node
11614 	 * hasn't encountered this failure, mark the mddb as failed on
11615 	 * the master node and set the something_changed flag to 1.
11616 	 */
11617 	for (i = 0; i < 2; i++) {
11618 		recerr = &mop->c_recerr[i];
11619 		if (recerr->r_flags & MDDB_F_EWRITE) {
11620 			li = recerr->r_li;
11621 			lp = &lbp->lb_locators[li];
11622 			for (j = 0; j < MD_MNMAXSIDES; j++) {
11623 				mnslp = &mnlbp->lb_mnsidelocators[j][li];
11624 				if (mnslp->mnl_sideno == s->s_sideno)
11625 					break;
11626 			}
11627 			/* Do quick check using li */
11628 			if (j != MD_MNMAXSIDES)
11629 				dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11630 
11631 			if ((j != MD_MNMAXSIDES) &&
11632 			    (strncmp(dn->dn_data, recerr->r_driver_name,
11633 			    MD_MAXDRVNM) == 0) &&
11634 			    (recerr->r_blkno == lp->l_blkno) &&
11635 			    (recerr->r_mnum == mnslp->mnl_mnum)) {
11636 				if ((lp->l_flags & MDDB_F_ACTIVE) ||
11637 				    ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
11638 					something_changed = 1;
11639 					lp->l_flags |= MDDB_F_EWRITE;
11640 					lp->l_flags &= ~MDDB_F_ACTIVE;
11641 				}
11642 			} else {
11643 				/*
11644 				 * Passed in li from slave does not match
11645 				 * the replica in the master's structures.
11646 				 * This could have occurred if a delete
11647 				 * mddb command was running when the
11648 				 * optimized resync record had a failure.
11649 				 * Search all replicas for this entry.
11650 				 * If no match, just ignore.
11651 				 * If a match, set replica in error.
11652 				 */
11653 				for (li = 0; li < lbp->lb_loccnt; li++) {
11654 					lp = &lbp->lb_locators[li];
11655 					if (lp->l_flags & MDDB_F_DELETED)
11656 						continue;
11657 
11658 					for (j = 0; j < MD_MNMAXSIDES; j++) {
11659 						mnslp =
11660 						    &mnlbp->
11661 						    lb_mnsidelocators[j][li];
11662 						if (mnslp->mnl_sideno ==
11663 						    s->s_sideno)
11664 							break;
11665 					}
11666 					if (j == MD_MNMAXSIDES)
11667 						continue;
11668 
11669 					dn = &lbp->
11670 					    lb_drvnm[mnslp->mnl_drvnm_index];
11671 					if ((strncmp(dn->dn_data,
11672 					    recerr->r_driver_name,
11673 					    MD_MAXDRVNM) == 0) &&
11674 					    (recerr->r_blkno == lp->l_blkno) &&
11675 					    (recerr->r_mnum ==
11676 					    mnslp->mnl_mnum)) {
11677 						if ((lp->l_flags &
11678 						    MDDB_F_ACTIVE) ||
11679 						    ((lp->l_flags &
11680 						    MDDB_F_EWRITE) == 0)) {
11681 							something_changed = 1;
11682 							lp->l_flags |=
11683 							    MDDB_F_EWRITE;
11684 							lp->l_flags &=
11685 							    ~MDDB_F_ACTIVE;
11686 						}
11687 						break;
11688 					}
11689 				}
11690 			}
11691 		}
11692 	}
11693 
11694 	/*
11695 	 * If this message changed nothing, then we're done since this
11696 	 * failure has already been handled.
11697 	 * If some mddb state has been changed, send a parse message to
11698 	 * the slave nodes so that the slaves will re-read the locator
11699 	 * block from disk.
11700 	 */
11701 	if (something_changed == 0) {
11702 		single_thread_end(s);
11703 		mddb_setexit(s);
11704 		return (0);
11705 	} else {
11706 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
11707 	}
11708 
11709 	/*
11710 	 * Scan replicas setting MD_SET_TOOFEW if
11711 	 * 50% or more of the mddbs have seen errors.
11712 	 * Note: Don't call selectreplicas or writeretry
11713 	 * since these routines may end up setting the ACTIVE flag
11714 	 * on a failed mddb if the master is able to access the mddb
11715 	 * but the slave node couldn't.  Need to have the ACTIVE flag
11716 	 * turned off in order to relocate the optimized records to
11717 	 * mddbs that are (hopefully) available on all nodes.
11718 	 */
11719 	alc = 0;
11720 	lc = 0;
11721 	for (li = 0; li < lbp->lb_loccnt; li++) {
11722 		lp = &lbp->lb_locators[li];
11723 		if (lp->l_flags & MDDB_F_DELETED)
11724 			continue;
11725 		lc++;
11726 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11727 			continue;
11728 		alc++;
11729 	}
11730 
11731 	/*
11732 	 * If more than 50% mddbs have failed, then don't relocate opt recs.
11733 	 * The node sending the mddb failure information will detect TOOFEW
11734 	 * and will panic when it attempts to re-write the optimized record.
11735 	 */
11736 	if (alc < ((lc + 1) / 2)) {
11737 		md_set_setstatus(setno, MD_SET_TOOFEW);
11738 		(void) push_lb(s);
11739 		(void) upd_med(s, "mddb_optrecfix(0)");
11740 		single_thread_end(s);
11741 		mddb_setexit(s);
11742 		return (0);
11743 	}
11744 
11745 	/* Attempt to relocate optimized records that are on failed mddbs */
11746 	(void) fixoptrecords(s);
11747 
11748 	/* Push changed locator block out to disk */
11749 	(void) push_lb(s);
11750 	(void) upd_med(s, "mddb_optrecfix(1)");
11751 
11752 	/* Recheck for TOOFEW after writing out locator blocks */
11753 	alc = 0;
11754 	lc = 0;
11755 	for (li = 0; li < lbp->lb_loccnt; li++) {
11756 		lp = &lbp->lb_locators[li];
11757 		if (lp->l_flags & MDDB_F_DELETED)
11758 			continue;
11759 		lc++;
11760 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11761 			continue;
11762 		alc++;
11763 	}
11764 
11765 	/* If more than 50% mddbs have failed, then don't relocate opt recs */
11766 	if (alc < ((lc + 1) / 2)) {
11767 		md_set_setstatus(setno, MD_SET_TOOFEW);
11768 		single_thread_end(s);
11769 		mddb_setexit(s);
11770 		return (0);
11771 	}
11772 
11773 	single_thread_end(s);
11774 	mddb_setexit(s);
11775 	return (0);
11776 }
11777 
11778 /*
11779  * Check if incore mddb on master node matches ondisk mddb.
11780  * If not, master writes out incore view to all mddbs.
11781  * Have previously verified that master is an owner of the
11782  * diskset (master has snarfed diskset) and that diskset is
11783  * not stale.
11784  *
11785  * Meant to be called during reconfig cycle during change of master.
11786  * Previous master in diskset may have changed the mddb and
11787  * panic'd before relaying information to slave nodes.  New
11788  * master node just writes out its incore view of the mddb and
11789  * the replay of the change log will resync all the nodes.
11790  *
11791  * Only supported for MN disksets.
11792  *
11793  * Return values:
11794  *	0 - success
11795  *	non-zero - failure
11796  */
11797 int
mddb_check_write_ioctl(mddb_config_t * info)11798 mddb_check_write_ioctl(mddb_config_t *info)
11799 {
11800 	int			err = 0;
11801 	set_t			setno = info->c_setno;
11802 	mddb_set_t		*s;
11803 	int			li;
11804 	mddb_locator_t		*lp;
11805 	mddb_lb_t		*lbp;
11806 	mddb_mnlb_t		*mnlbp_od;
11807 	mddb_ln_t		*lnp;
11808 	mddb_mnln_t		*mnlnp_od;
11809 	mddb_db_t		*dbp;
11810 	mddb_de_ic_t		*dep;
11811 	int			write_out_mddb;
11812 	md_error_t		*ep = &info->c_mde;
11813 	int			mddb_err = 0;
11814 	int			prev_li = 0;
11815 	int			rval = 0;
11816 	int			alc, lc;
11817 	int			mddbs_present = 0;
11818 
11819 	/* Verify that setno is in valid range */
11820 	if (setno >= md_nsets)
11821 		return (EINVAL);
11822 
11823 	if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11824 		return (0);
11825 
11826 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
11827 		return (mddbstatus2error(ep, err, NODEV32, setno));
11828 	}
11829 
11830 	/* Calling diskset must be a MN diskset */
11831 	if (!(MD_MNSET_SETNO(setno))) {
11832 		mddb_setexit(s);
11833 		return (EINVAL);
11834 	}
11835 
11836 	/* Re-verify that set is not stale */
11837 	if (md_get_setstatus(setno) & MD_SET_STALE) {
11838 		mddb_setexit(s);
11839 		return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
11840 	}
11841 
11842 	lbp = s->s_lbp;
11843 	lnp = s->s_lnp;
11844 
11845 	/*
11846 	 * Previous master could have died during the write of data to
11847 	 * the mddbs so that the ondisk mddbs may not be consistent.
11848 	 * So, need to check the contents of the first and last active mddb
11849 	 * to see if the mddbs need to be rewritten.
11850 	 */
11851 	for (li = 0; li < lbp->lb_loccnt; li++) {
11852 		int	checkcopy_err;
11853 
11854 		lp = &lbp->lb_locators[li];
11855 		/* Find replica that is active */
11856 		if (lp->l_flags & MDDB_F_DELETED)
11857 			continue;
11858 		mddbs_present = 1;
11859 		if (! (lp->l_flags & MDDB_F_ACTIVE))
11860 			continue;
11861 		if (s->s_mbiarray[li] == NULL)
11862 			continue;
11863 		/* Check locator block */
11864 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11865 		    KM_SLEEP);
11866 		/* read in on-disk locator block */
11867 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11868 
11869 		/* If err, try next mddb */
11870 		if (err) {
11871 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11872 			continue;
11873 		}
11874 
11875 		/*
11876 		 * We resnarf all changelog entries for this set.
11877 		 * They may have been altered by the previous master
11878 		 */
11879 		for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11880 			for (dep = dbp->db_firstentry; dep; dep =
11881 			    dep->de_next) {
11882 				if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
11883 					continue;
11884 				}
11885 				/*
11886 				 * This has been alloc'ed while
11887 				 * joining the set
11888 				 */
11889 				if (dep->de_rb) {
11890 					kmem_free(dep->de_rb, dep->de_recsize);
11891 					dep->de_rb = (mddb_rb32_t *)NULL;
11892 				}
11893 				if (dep->de_rb_userdata) {
11894 					kmem_free(dep->de_rb_userdata,
11895 					    dep->de_reqsize);
11896 					dep->de_rb_userdata = (caddr_t)NULL;
11897 				}
11898 
11899 				err = getrecord(s, dep, li);
11900 				if (err) {
11901 					/*
11902 					 * When we see on error while reading
11903 					 * the changelog entries, we move on
11904 					 * to the next mddb
11905 					 */
11906 					err = 1;
11907 					break; /* out of inner for-loop */
11908 				}
11909 				allocuserdata(dep);
11910 			}
11911 			if (err)
11912 				break; /* out of outer for-loop */
11913 		}
11914 
11915 		/* If err, try next mddb */
11916 		if (err) {
11917 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11918 			continue;
11919 		}
11920 
11921 		/* Is incore locator block same as ondisk? */
11922 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11923 		    == 1) {
11924 			write_out_mddb = 1;
11925 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11926 			break;
11927 		}
11928 
11929 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11930 
11931 		/* If lb ok, check locator names */
11932 		mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
11933 		    KM_SLEEP);
11934 		/* read in on-disk locator names */
11935 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11936 		    lbp->lb_lnblkcnt, li);
11937 
11938 		/* If err, try next mddb */
11939 		if (err) {
11940 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11941 			continue;
11942 		}
11943 
11944 		/* Are incore locator names same as ondisk? */
11945 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11946 		    == 1) {
11947 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11948 			write_out_mddb = 1;
11949 			break;
11950 		}
11951 
11952 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11953 
11954 		/*
11955 		 * Check records in mddb.
11956 		 * If a read error is encountered, set the error flag and
11957 		 * continue to the next mddb.  Otherwise, if incore data is
11958 		 * different from ondisk, then set the flag to write out
11959 		 * the mddb and break out.
11960 		 */
11961 		checkcopy_err = checkcopy(s, li);
11962 		if (checkcopy_err == MDDB_F_EREAD) {
11963 			lp->l_flags |= MDDB_F_EREAD;
11964 			mddb_err = 1;
11965 			continue;
11966 		} else if (checkcopy_err == 1) {
11967 			write_out_mddb = 1;
11968 			break;
11969 		}
11970 		/*
11971 		 * Have found first active mddb and the data is the same as
11972 		 * incore - break out of loop
11973 		 */
11974 		write_out_mddb = 0;
11975 		break;
11976 	}
11977 
11978 	/*
11979 	 * Skip checking for last active mddb if:
11980 	 *	- already found a mismatch in the first active mddb
11981 	 *		(write_out_mddb is 1)  OR
11982 	 * 	- didn't find a readable mddb when looking for first
11983 	 *	  active mddb (there are mddbs present but all failed
11984 	 *	  when read was attempted).
11985 	 *
11986 	 * In either case, go to write_out_mddb label in order to attempt
11987 	 * to write out the data. If < 50% mddbs are available, panic.
11988 	 */
11989 	if ((write_out_mddb == 1) ||
11990 	    ((li == lbp->lb_loccnt) && mddbs_present)) {
11991 		write_out_mddb = 1;
11992 		goto write_out_mddb;
11993 	}
11994 
11995 	/*
11996 	 * Save which index was checked for the first active mddb.  If only 1
11997 	 * active mddb, don't want to recheck the same mddb when looking for
11998 	 * last active mddb.
11999 	 */
12000 	prev_li = li;
12001 
12002 	/*
12003 	 * Now, checking for last active mddb.  If found same index as before
12004 	 * (only 1 active mddb), then skip.
12005 	 */
12006 	for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
12007 		int	checkcopy_err;
12008 
12009 		lp = &lbp->lb_locators[li];
12010 		/* Find replica that is active */
12011 		if (! (lp->l_flags & MDDB_F_ACTIVE))
12012 			continue;
12013 		if (lp->l_flags & MDDB_F_DELETED)
12014 			continue;
12015 		if (s->s_mbiarray[li] == NULL)
12016 			continue;
12017 		/* If already checked mddb, bail out */
12018 		if (li == prev_li)
12019 			break;
12020 		/* Check locator block */
12021 		mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
12022 		    KM_SLEEP);
12023 		/* read in on-disk locator block */
12024 		err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
12025 
12026 		/* If err, try next mddb */
12027 		if (err) {
12028 			kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
12029 			continue;
12030 		}
12031 
12032 
12033 		/* Is incore locator block same as ondisk? */
12034 		if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
12035 		    == 1) {
12036 			kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
12037 			write_out_mddb = 1;
12038 			break;
12039 		}
12040 
12041 		kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
12042 
12043 		/* If lb ok, check locator names */
12044 		mnlnp_od = (mddb_mnln_t *)
12045 		    kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);
12046 
12047 		/* read in on-disk locator names */
12048 		err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
12049 		    lbp->lb_lnblkcnt, li);
12050 
12051 		/* If err, try next mddb */
12052 		if (err) {
12053 			kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
12054 			continue;
12055 		}
12056 
12057 		/* Are incore locator names same as ondisk? */
12058 		if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
12059 		    == 1) {
12060 			kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
12061 			write_out_mddb = 1;
12062 			break;
12063 		}
12064 
12065 		kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
12066 
12067 		/*
12068 		 * Check records in mddb.
12069 		 * If a read error is encountered, set the error flag and
12070 		 * continue to the next mddb.  Otherwise, if incore data is
12071 		 * different from ondisk, then set the flag to write out
12072 		 * the mddb and break out.
12073 		 */
12074 		checkcopy_err = checkcopy(s, li);
12075 		if (checkcopy_err == MDDB_F_EREAD) {
12076 			lp->l_flags |= MDDB_F_EREAD;
12077 			mddb_err = 1;
12078 			continue;
12079 		} else if (checkcopy_err == 1) {
12080 			write_out_mddb = 1;
12081 			break;
12082 		}
12083 		/*
12084 		 * Have found last active mddb and the data is the same as
12085 		 * incore - break out of loop
12086 		 */
12087 		write_out_mddb = 0;
12088 		break;
12089 	}
12090 
12091 	/*
12092 	 * If ondisk and incore versions of the mddb don't match, then
12093 	 * write out this node's incore version to disk.
12094 	 * Or, if unable to read a copy of the mddb, attempt to write
12095 	 * out a new one.
12096 	 */
12097 write_out_mddb:
12098 	if (write_out_mddb) {
12099 		/* Recompute free blocks based on incore information */
12100 		computefreeblks(s); /* set up free block bits */
12101 
12102 		/*
12103 		 * Write directory entries and record blocks.
12104 		 * Use flag MDDB_WRITECOPY_SYNC so that writecopy
12105 		 * routine won't write out change log records.
12106 		 */
12107 		for (li = 0; li < lbp->lb_loccnt; li++) {
12108 			lp = &lbp->lb_locators[li];
12109 			/* Don't write to inactive or deleted mddbs */
12110 			if (! (lp->l_flags & MDDB_F_ACTIVE))
12111 				continue;
12112 			if (lp->l_flags & MDDB_F_DELETED)
12113 				continue;
12114 			if (s->s_mbiarray[li] == NULL)
12115 				continue;
12116 			/* If encounter a write error, save it for later */
12117 			if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
12118 				lp->l_flags |= MDDB_F_EWRITE;
12119 				mddb_err = 1;
12120 			}
12121 		}
12122 
12123 		/*
12124 		 * Write out locator blocks to all replicas.
12125 		 * push_lb will set MDDB_F_EWRITE on replicas that fail.
12126 		 */
12127 		if (push_lb(s))
12128 			mddb_err = 1;
12129 		(void) upd_med(s, "mddb_check_write_ioctl(0)");
12130 
12131 		/* Write out locator names to all replicas */
12132 		lnp = s->s_lnp;
12133 		uniqtime32(&lnp->ln_timestamp);
12134 		lnp->ln_revision = MDDB_REV_MNLN;
12135 		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
12136 
12137 		/* writeall sets MDDB_F_EWRITE if writes fails to replica */
12138 		if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
12139 		    lbp->lb_lnblkcnt, 0))
12140 			mddb_err = 1;
12141 
12142 		/*
12143 		 * The writes to the replicas above would have set
12144 		 * the MDDB_F_EWRITE flags if any write error was
12145 		 * encountered.
12146 		 * If < 50% of the mddbs are available, panic.
12147 		 */
12148 		lc = alc = 0;
12149 		for (li = 0; li < lbp->lb_loccnt; li++) {
12150 			lp = &lbp->lb_locators[li];
12151 			if (lp->l_flags & MDDB_F_DELETED)
12152 				continue;
12153 			lc++;
12154 			/*
12155 			 * If mddb:
12156 			 *	- is not active (previously had an error)
12157 			 *	- had an error reading the master blocks  or
12158 			 *	- had an error in writing to the mddb
12159 			 * then don't count this mddb in the active count.
12160 			 */
12161 			if (! (lp->l_flags & MDDB_F_ACTIVE) ||
12162 			    (lp->l_flags & MDDB_F_EMASTER) ||
12163 			    (lp->l_flags & MDDB_F_EWRITE))
12164 				continue;
12165 			alc++;
12166 		}
12167 		if (alc < ((lc + 1) / 2)) {
12168 			cmn_err(CE_PANIC,
12169 			    "md: Panic due to lack of DiskSuite state\n"
12170 			    " database replicas. Fewer than 50%% of "
12171 			    "the total were available,\n so panic to "
12172 			    "ensure data integrity.");
12173 		}
12174 	}
12175 
12176 	/*
12177 	 * If encountered an error during checking or writing of
12178 	 * mddbs, call selectreplicas so that replica error can
12179 	 * be properly handled. This will involve another attempt
12180 	 * to write the mddb out to any mddb marked MDDB_F_EWRITE.
12181 	 * If mddb still fails, it will have the MDDB_F_ACTIVE bit
12182 	 * turned off. Set the MDDB_SCANALLSYNC flag so that
12183 	 * selectreplicas doesn't overwrite the change log entries.
12184 	 *
12185 	 * Set the PARSE_LOCBLK flag in the mddb_set structure to show
12186 	 * that the locator block has been changed.
12187 	 */
12188 	if (mddb_err) {
12189 		(void) selectreplicas(s, MDDB_SCANALLSYNC);
12190 		s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
12191 	}
12192 
12193 write_out_end:
12194 	mddb_setexit(s);
12195 	return (rval);
12196 }
12197 
12198 /*
12199  * Set/reset/get set flags in set structure.
12200  * Used during reconfig cycle
12201  * Only supported for MN disksets.
12202  *
12203  * Return values:
12204  *	0 - success
12205  *	non-zero - failure
12206  */
12207 int
mddb_setflags_ioctl(mddb_setflags_config_t * info)12208 mddb_setflags_ioctl(mddb_setflags_config_t *info)
12209 {
12210 	set_t			setno = info->sf_setno;
12211 
12212 	/* Verify that setno is in valid range */
12213 	if (setno >= md_nsets)
12214 		return (EINVAL);
12215 
12216 	/*
12217 	 * When setting the flags, the set may not
12218 	 * be snarfed yet. So, don't check for SNARFED or MNset
12219 	 * and don't call mddb_setenter.
12220 	 * In order to discourage bad ioctl calls,
12221 	 * verify that magic field in structure is set correctly.
12222 	 */
12223 	if (info->sf_magic != MDDB_SETFLAGS_MAGIC)
12224 		return (EINVAL);
12225 
12226 	switch (info->sf_flags) {
12227 	case MDDB_NM_SET:
12228 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12229 			md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12230 		if (info->sf_setflags & MD_SET_MN_START_RC)
12231 			md_set_setstatus(setno, MD_SET_MN_START_RC);
12232 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12233 			md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12234 		break;
12235 
12236 	case MDDB_NM_RESET:
12237 		if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12238 			md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12239 		if (info->sf_setflags & MD_SET_MN_START_RC)
12240 			md_clr_setstatus(setno, MD_SET_MN_START_RC);
12241 		if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12242 			md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12243 		break;
12244 
12245 	case MDDB_NM_GET:
12246 		info->sf_setflags = md_get_setstatus(setno) &
12247 		    (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC|
12248 		    MD_SET_MN_MIR_STATE_RC);
12249 		break;
12250 	}
12251 
12252 	return (0);
12253 }
12254 
12255 /*
12256  * md_update_minor
12257  *
12258  * This function updates the minor in the namespace entry for an
12259  * underlying metadevice.  The function is called in mod_imp_set
12260  * where mod is sp, stripe, mirror and raid.
12261  *
12262  */
12263 int
md_update_minor(set_t setno,side_t side,mdkey_t key)12264 md_update_minor(
12265 	set_t	setno,
12266 	side_t	side,
12267 	mdkey_t	key
12268 )
12269 {
12270 	struct nm_next_hdr	*nh;
12271 	struct nm_name		*n;
12272 	char			*shn;
12273 	int			retval = 1;
12274 	side_t			s;
12275 
12276 	/*
12277 	 * Load the devid name space if it exists
12278 	 */
12279 	(void) md_load_namespace(setno, NULL, NM_DEVID);
12280 	if (! md_load_namespace(setno, NULL, 0L)) {
12281 		/*
12282 		 * Unload the devid namespace
12283 		 */
12284 		(void) md_unload_namespace(setno, NM_DEVID);
12285 		return (0);
12286 	}
12287 
12288 	rw_enter(&nm_lock.lock, RW_READER);
12289 
12290 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12291 		retval = 0;
12292 		goto out;
12293 	}
12294 
12295 	/*
12296 	 * Look up the key
12297 	 */
12298 	for (s = 0; s < MD_MAXSIDES; s++) {
12299 		/*
12300 		 * For side other than the import 'side', cleanup its entry
12301 		 */
12302 		if ((n = lookup_entry(nh, setno, s, key, NODEV64, 0L)) !=
12303 		    NULL) {
12304 			if (n->n_side == side) {
12305 				/*
12306 				 * Update its n_minor if metadevice
12307 				 */
12308 				if (((shn = (char *)getshared_name(setno,
12309 				    n->n_drv_key, 0L)) != NULL) &&
12310 				    (strcmp(shn, "md") == 0)) {
12311 					n->n_minor = MD_MKMIN(setno,
12312 					    MD_MIN2UNIT(n->n_minor));
12313 				}
12314 			} else {
12315 				/* We are not the import side, cleanup */
12316 				(void) remove_entry(nh, n->n_side, key, 0L);
12317 			}
12318 		}
12319 	}
12320 
12321 out:
12322 	rw_exit(&nm_lock.lock);
12323 	return (retval);
12324 }
12325 
12326 /*
12327  * md_update_top_device_minor
12328  *
12329  * This function updates the minor in the namespace entry for a top
12330  * level metadevice.  The function is called in mod_imp_set where
12331  * mod is sp, stripe, mirror and raid.
12332  *
12333  */
12334 int
md_update_top_device_minor(set_t setno,side_t side,md_dev64_t dev)12335 md_update_top_device_minor(
12336 	set_t	setno,
12337 	side_t	side,
12338 	md_dev64_t dev
12339 )
12340 {
12341 	struct nm_next_hdr	*nh;
12342 	struct nm_name		*n;
12343 	char			*shn;
12344 	int			retval = 1;
12345 
12346 	/*
12347 	 * Load the devid name space if it exists
12348 	 */
12349 	(void) md_load_namespace(setno, NULL, NM_DEVID);
12350 	if (! md_load_namespace(setno, NULL, 0L)) {
12351 		/*
12352 		 * Unload the devid namespace
12353 		 */
12354 		(void) md_unload_namespace(setno, NM_DEVID);
12355 		return (0);
12356 	}
12357 
12358 	rw_enter(&nm_lock.lock, RW_READER);
12359 
12360 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12361 		retval = 0;
12362 		goto out;
12363 	}
12364 
12365 	/*
12366 	 * Look up the key
12367 	 */
12368 	if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) {
12369 		/*
12370 		 * Find the entry, update its n_minor if metadevice
12371 		 */
12372 		if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12373 		    == NULL) {
12374 			retval = 0;
12375 			goto out;
12376 		}
12377 
12378 		if (strcmp(shn, "md") == 0) {
12379 			n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12380 		}
12381 	}
12382 
12383 out:
12384 	rw_exit(&nm_lock.lock);
12385 	return (retval);
12386 }
12387 
12388 static void
md_imp_nm(mddb_set_t * s)12389 md_imp_nm(
12390 	mddb_set_t	*s
12391 )
12392 {
12393 	mddb_db_t		*dbp;
12394 	mddb_de_ic_t		*dep;
12395 	struct nm_rec_hdr	*hdr;
12396 	struct nm_header	*hhdr;
12397 	set_t			setno = s->s_setno;
12398 
12399 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12400 		for (dep = dbp->db_firstentry; dep != NULL;
12401 		    dep = dep->de_next) {
12402 			switch (dep->de_type1) {
12403 
12404 			case MDDB_NM_HDR:
12405 			case MDDB_DID_NM_HDR:
12406 
12407 				hhdr = (struct nm_header *)
12408 				    dep->de_rb_userdata;
12409 
12410 				hdr = &hhdr->h_names;
12411 				if (hdr->r_next_recid > 0) {
12412 					hdr->r_next_recid = MAKERECID(setno,
12413 					    DBID(hdr->r_next_recid));
12414 				}
12415 
12416 				hdr = &hhdr->h_shared;
12417 				if (hdr->r_next_recid > 0) {
12418 					hdr->r_next_recid = MAKERECID(setno,
12419 					    DBID(hdr->r_next_recid));
12420 				}
12421 				break;
12422 
12423 			case MDDB_NM:
12424 			case MDDB_DID_NM:
12425 			case MDDB_SHR_NM:
12426 			case MDDB_DID_SHR_NM:
12427 
12428 				hdr = (struct nm_rec_hdr *)
12429 				    dep->de_rb_userdata;
12430 
12431 				if (hdr->r_next_recid > 0) {
12432 					hdr->r_next_recid = MAKERECID
12433 					    (setno, DBID(hdr->r_next_recid));
12434 				}
12435 				break;
12436 
12437 			default:
12438 				break;
12439 			}
12440 		}
12441 	}
12442 }
12443 
12444 static int
update_db_rec(mddb_set_t * s)12445 update_db_rec(
12446 	mddb_set_t	*s
12447 )
12448 {
12449 	mddb_db_t	*dbp;
12450 	mddb_de_ic_t	*dep;
12451 	mddb_recid_t	ids[2];
12452 
12453 	for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12454 		for (dep = dbp->db_firstentry; dep != NULL;
12455 		    dep = dep->de_next) {
12456 			if (! (dep->de_flags & MDDB_F_OPT)) {
12457 				ids[0] = MAKERECID(s->s_setno, dep->de_recid);
12458 				ids[1] = 0;
12459 				if (mddb_commitrecs(ids)) {
12460 					return (MDDB_E_NORECORD);
12461 				}
12462 			}
12463 		}
12464 	}
12465 	return (0);
12466 }
12467 
12468 static int
update_mb(mddb_set_t * s)12469 update_mb(
12470 	mddb_set_t	*s
12471 )
12472 {
12473 	mddb_ri_t	*rip;
12474 	int	err = 0;
12475 
12476 	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12477 		if (rip->ri_flags & MDDB_F_EMASTER)
12478 			/* disk is powered off or not there */
12479 			continue;
12480 
12481 		if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12482 			/*
12483 			 * It is a replicated set
12484 			 */
12485 			if (rip->ri_devid == (ddi_devid_t)NULL) {
12486 				return (-1);
12487 			}
12488 			err = update_mb_devid(s, rip, rip->ri_devid);
12489 		} else {
12490 			/*
12491 			 * It is a non-replicated set
12492 			 * and there is no need to update
12493 			 * devid
12494 			 */
12495 			err = update_mb_devid(s, rip, NULL);
12496 		}
12497 
12498 		if (err)
12499 			return (err);
12500 	}
12501 
12502 	return (0);
12503 }
12504 
12505 static int
update_setname(set_t setno)12506 update_setname(
12507 	set_t	setno
12508 )
12509 {
12510 	struct nm_next_hdr	*nh;
12511 	struct nm_shared_name	*shn, *new_shn;
12512 	char			*prefix = "/dev/md/";
12513 	char			*shrname;
12514 	int			len;
12515 	mdkey_t			o_key;
12516 	uint32_t		o_count, o_data;
12517 	mddb_recid_t		recid, ids[3];
12518 	int			err = 0;
12519 	mddb_set_t		*dbp;
12520 
12521 	/* Import setname */
12522 	dbp = (mddb_set_t *)md_set[setno].s_db;
12523 	len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1;
12524 	shrname = kmem_zalloc(len, KM_SLEEP);
12525 	(void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/");
12526 
12527 	rw_enter(&nm_lock.lock, RW_WRITER);
12528 	if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) {
12529 		/*
12530 		 * No namespace is okay
12531 		 */
12532 		err = 0;
12533 		goto out;
12534 	}
12535 
12536 	if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh,
12537 	    0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) {
12538 		/*
12539 		 * No metadevice is okay
12540 		 */
12541 		err = 0;
12542 		goto out;
12543 	}
12544 
12545 	/*
12546 	 * We have it, go ahead and update the namespace.
12547 	 */
12548 	o_key = shn->sn_key;
12549 	o_count = shn->sn_count;
12550 	o_data = shn->sn_data;
12551 
12552 	if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED |
12553 	    NM_NOCOMMIT | NM_KEY_RECYCLE)) {
12554 		err = MDDB_E_NORECORD;
12555 		goto out;
12556 	}
12557 	if ((new_shn = (struct nm_shared_name *)alloc_entry(
12558 	    nh, md_set[setno].s_nmid, len, NM_SHARED |
12559 	    NM_NOCOMMIT, &recid)) == NULL) {
12560 		err = MDDB_E_NORECORD;
12561 		goto out;
12562 	}
12563 
12564 	new_shn->sn_key = o_key;
12565 	new_shn->sn_count = o_count;
12566 	new_shn->sn_data = o_data;
12567 	new_shn->sn_namlen = (ushort_t)len;
12568 	(void) strcpy(new_shn->sn_name, shrname);
12569 
12570 	ids[0] = recid;
12571 	ids[1] = md_set[setno].s_nmid;
12572 	ids[2] = 0;
12573 	err = mddb_commitrecs(ids);
12574 
12575 out:
12576 	if (shrname)
12577 		kmem_free(shrname, len);
12578 	rw_exit(&nm_lock.lock);
12579 	return (err);
12580 }
12581 
12582 /*
12583  * Returns 0 on success.
12584  * Returns -1 on failure with ep filled in.
12585  */
12586 static int
md_imp_db(set_t setno,int stale_flag,md_error_t * ep)12587 md_imp_db(
12588 	set_t		setno,
12589 	int		stale_flag,
12590 	md_error_t	*ep
12591 )
12592 {
12593 	mddb_set_t	*s;
12594 	int		err = 0;
12595 	mddb_dt_t	*dtp;
12596 	mddb_lb_t	*lbp;
12597 	int		i;
12598 	int		loccnt;
12599 
12600 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12601 		return (mddbstatus2error(ep, err, NODEV32, setno));
12602 	}
12603 
12604 	/* Update dt */
12605 	if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) {
12606 		crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
12607 	}
12608 
12609 	if ((err = dt_write(s)) != 0) {
12610 		err = mdsyserror(ep, err);
12611 		mddb_setexit(s);
12612 		return (err);
12613 	}
12614 
12615 	/*
12616 	 * Update lb, no need to update the mediator because
12617 	 * the diskset will only exist on the importing node
12618 	 * and as such a mediator adds no value.
12619 	 */
12620 
12621 	/* Update lb */
12622 	if (stale_flag & MD_IMP_STALE_SET) {
12623 		lbp = s->s_lbp;
12624 		loccnt = lbp->lb_loccnt;
12625 		for (i = 0; i < loccnt; i++) {
12626 			mddb_locator_t	*lp = &lbp->lb_locators[i];
12627 			md_dev64_t	ndev = md_expldev(lp->l_dev);
12628 			ddi_devid_t	devid_ptr;
12629 
12630 			devid_ptr = s->s_did_icp->did_ic_devid[i];
12631 			if (devid_ptr == NULL) {
12632 				/*
12633 				 * Already deleted, go to next one.
12634 				 */
12635 				continue;
12636 			}
12637 			if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev,
12638 			    NULL)) {
12639 				/* disk unavailable, mark deleted */
12640 				lp->l_flags = MDDB_F_DELETED;
12641 				/* then remove the device id from the list */
12642 				free_mbipp(&s->s_mbiarray[i]);
12643 				(void) mddb_devid_delete(s, i);
12644 			}
12645 		}
12646 		md_clr_setstatus(setno, MD_SET_STALE);
12647 	}
12648 
12649 	if ((err = writelocall(s)) != 0) {
12650 		err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno);
12651 		mddb_setexit(s);
12652 		return (err);
12653 	}
12654 
12655 	mddb_setexit(s);
12656 
12657 	/* Update db records */
12658 	if ((err = update_db_rec(s)) != 0) {
12659 		return (mddbstatus2error(ep, err, NODEV32, setno));
12660 	}
12661 
12662 	/* Update setname embedded in the namespace */
12663 	if ((err = update_setname(setno)) != 0)
12664 		return (mddbstatus2error(ep, err, NODEV32, setno));
12665 
12666 	return (err);
12667 }
12668 
12669 static void
md_dr_add(md_set_record * sr,md_drive_record * dr)12670 md_dr_add(
12671 	md_set_record	*sr,
12672 	md_drive_record	*dr
12673 )
12674 {
12675 	md_drive_record	*drv;
12676 
12677 	if (sr->sr_driverec == 0) {
12678 		sr->sr_driverec = dr->dr_selfid;
12679 		return;
12680 	}
12681 
12682 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12683 	    drv->dr_nextrec != 0;
12684 	    drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec))
12685 		;
12686 	drv->dr_nextrec = dr->dr_selfid;
12687 }
12688 
12689 static void
md_setup_recids(md_set_record * sr,mddb_recid_t ** ids,size_t size)12690 md_setup_recids(
12691 	md_set_record	*sr,
12692 	mddb_recid_t	**ids,
12693 	size_t		size
12694 )
12695 {
12696 	md_drive_record	*drv;
12697 	int		cnt;
12698 	mddb_recid_t	*recids;
12699 
12700 	recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t)
12701 	    * size, KM_SLEEP);
12702 	recids[0] = sr->sr_selfid;
12703 	cnt = 1;
12704 
12705 	for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12706 	    /* CSTYLED */
12707 	    drv != NULL;) {
12708 		recids[cnt++] = drv->dr_selfid;
12709 		if (drv->dr_nextrec != 0)
12710 			drv = (md_drive_record *)mddb_getrecaddr
12711 			    (drv->dr_nextrec);
12712 		else
12713 			drv = NULL;
12714 	}
12715 	recids[cnt] = 0;
12716 	*ids = &recids[0];
12717 }
12718 
12719 /*
12720  * The purpose of this function is to replace the old_devid with the
12721  * new_devid in the given namespace.   This is used for importing
12722  * remotely replicated drives.
12723  */
12724 int
md_update_namespace_rr_did(mddb_config_t * cp)12725 md_update_namespace_rr_did(
12726 	mddb_config_t	*cp
12727 )
12728 {
12729 	set_t			setno = cp->c_setno;
12730 	struct nm_next_hdr	*nh;
12731 	mdkey_t			key = MD_KEYWILD;
12732 	side_t			side = MD_SIDEWILD;
12733 	mddb_recid_t		recids[3];
12734 	struct did_min_name	*n;
12735 	struct nm_next_hdr	*did_shr_nh;
12736 	struct did_shr_name	*shr_n;
12737 	mdkey_t			ent_did_key;
12738 	uint32_t		ent_did_count;
12739 	uint32_t		ent_did_data;
12740 	ddi_devid_t		devid = NULL;
12741 	struct did_shr_name	*shn;
12742 	void			*old_devid, *new_devid;
12743 
12744 	if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED))
12745 		return (EIO);
12746 
12747 	old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid;
12748 	new_devid = (void *)(uintptr_t)cp->c_locator.l_devid;
12749 
12750 	/*
12751 	 * It is okay if we dont have any configuration
12752 	 */
12753 	if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED))
12754 	    == NULL) {
12755 		return (0);
12756 	}
12757 	while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) {
12758 		/* check out every entry in the namespace */
12759 		if ((n = (struct did_min_name *)lookup_entry(nh, setno,
12760 		    side, key, NODEV64, NM_DEVID)) == NULL) {
12761 			continue;
12762 		} else {
12763 			did_shr_nh = get_first_record(setno, 0, NM_DEVID |
12764 			    NM_SHARED);
12765 			if (did_shr_nh == NULL) {
12766 				return (ENOENT);
12767 			}
12768 
12769 			shr_n = (struct did_shr_name *)lookup_shared_entry(
12770 			    did_shr_nh, n->min_devid_key, (char *)0,
12771 			    &recids[0], NM_DEVID);
12772 			if (shr_n == NULL) {
12773 				return (ENOENT);
12774 			}
12775 			rw_enter(&nm_lock.lock, RW_WRITER);
12776 			devid = (ddi_devid_t)shr_n->did_devid;
12777 			/* find this devid in the incore replica  */
12778 			if (ddi_devid_compare(devid, old_devid) == 0) {
12779 				/*
12780 				 * found the corresponding entry
12781 				 * update with new devid
12782 				 */
12783 				/* first remove old devid info */
12784 				ent_did_key = shr_n ->did_key;
12785 				ent_did_count = shr_n->did_count;
12786 				ent_did_data = shr_n->did_data;
12787 				(void) remove_shared_entry(did_shr_nh,
12788 				    shr_n->did_key, NULL, NM_DEVID |
12789 				    NM_IMP_SHARED | NM_KEY_RECYCLE);
12790 
12791 				/* add in new devid info */
12792 				if ((shn = (struct did_shr_name *)
12793 				    alloc_entry(did_shr_nh,
12794 				    md_set[setno].s_did_nmid,
12795 				    cp->c_locator.l_devid_sz,
12796 				    NM_DEVID | NM_SHARED | NM_NOCOMMIT,
12797 				    &recids[0])) == NULL) {
12798 						rw_exit(&nm_lock.lock);
12799 						return (ENOMEM);
12800 					}
12801 					shn->did_key = ent_did_key;
12802 					shn->did_count = ent_did_count;
12803 					ent_did_data |= NM_DEVID_VALID;
12804 					shn->did_data = ent_did_data;
12805 					shn->did_size = ddi_devid_sizeof(
12806 					    new_devid);
12807 					bcopy((void *)new_devid, (void *)
12808 					    shn->did_devid, shn->did_size);
12809 					recids[1] = md_set[setno].s_nmid;
12810 					recids[2] = 0;
12811 					mddb_commitrecs_wrapper(recids);
12812 			}
12813 			rw_exit(&nm_lock.lock);
12814 		}
12815 	}
12816 
12817 	return (0);
12818 }
12819 
12820 /*
12821  * namespace is loaded before this is called.
12822  * This function is a wrapper for md_update_namespace_rr_did.
12823  *
12824  * md_update_namespace_rr_did may be called twice if attempting to
12825  * resolve a replicated device id during the take of a diskset - once
12826  * for the diskset namespace and a second time for the local namespace.
12827  * The local namespace would need to be updated when a drive has been
12828  * found during a take of the diskset that hadn't been resolved during
12829  * the import (aka partial replicated import).
12830  *
12831  * If being called during the import of the diskset (IMPORT flag set)
12832  * md_update_namespace_rr_did will only be called once with the disket
12833  * namespace.
12834  */
12835 int
md_update_nm_rr_did_ioctl(mddb_config_t * cp)12836 md_update_nm_rr_did_ioctl(
12837 	mddb_config_t	*cp
12838 )
12839 {
12840 	int	rval = 0;
12841 
12842 	/* If update of diskset namespace fails, stop and return failure */
12843 	if ((rval = md_update_namespace_rr_did(cp)) != 0)
12844 		return (rval);
12845 
12846 	if (cp->c_flags & MDDB_C_IMPORT)
12847 		return (0);
12848 
12849 	/* If update of local namespace fails, return failure */
12850 	cp->c_setno = MD_LOCAL_SET;
12851 	rval = md_update_namespace_rr_did(cp);
12852 	return (rval);
12853 }
12854 
12855 /*ARGSUSED*/
12856 int
md_imp_snarf_set(mddb_config_t * cp)12857 md_imp_snarf_set(
12858 	mddb_config_t	*cp
12859 )
12860 {
12861 	set_t		setno;
12862 	int		stale_flag;
12863 	mddb_set_t	*s;
12864 	int		i, err = 0;
12865 	md_ops_t	*ops;
12866 	md_error_t	*ep = &cp->c_mde;
12867 
12868 	setno = cp->c_setno;
12869 	stale_flag = cp->c_flags;
12870 
12871 	mdclrerror(ep);
12872 	if (setno >= md_nsets) {
12873 		return (mdsyserror(ep, EINVAL));
12874 	}
12875 
12876 	md_haltsnarf_enter(setno);
12877 	if (md_get_setstatus(setno) & MD_SET_IMPORT) {
12878 		goto out;
12879 	}
12880 
12881 	/* Set the bit first otherwise load_old_replicas can fail */
12882 	md_set_setstatus(setno, MD_SET_IMPORT);
12883 
12884 	if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12885 		err = mddbstatus2error(ep, err, NODEV32, setno);
12886 		goto out;
12887 	}
12888 
12889 	/*
12890 	 * Upon completion of load_old_replicas, the old setno is
12891 	 * restored from the disk so we need to reset
12892 	 */
12893 	s->s_lbp->lb_setno = setno;
12894 
12895 	/*
12896 	 * Fixup the NM records before loading namespace
12897 	 */
12898 	(void) md_imp_nm(s);
12899 	mddb_setexit(s);
12900 
12901 	/*
12902 	 * Load the devid name space if it exists
12903 	 * and ask each module to fixup unit records
12904 	 */
12905 	if (!md_load_namespace(setno, NULL, NM_DEVID)) {
12906 		err = mdsyserror(ep, ENOENT);
12907 		goto cleanup;
12908 	}
12909 	if (!md_load_namespace(setno, NULL, 0L)) {
12910 		(void) md_unload_namespace(setno, NM_DEVID);
12911 		err = mdsyserror(ep, ENOENT);
12912 		goto cleanup;
12913 	}
12914 
12915 	do {
12916 		i = 0;
12917 		for (ops = md_opslist; ops != NULL; ops = ops->md_next)
12918 			if (ops->md_imp_set != NULL)
12919 				i += ops->md_imp_set(setno);
12920 	} while (i);
12921 
12922 	/*
12923 	 * Fixup
12924 	 *	(1) locator block
12925 	 *	(2) locator name block if necessary
12926 	 *	(3) master block
12927 	 *	(4) directory block
12928 	 * calls appropriate writes to push changes out
12929 	 */
12930 	if ((err = md_imp_db(setno, stale_flag, ep)) != 0) {
12931 		goto cleanup;
12932 	}
12933 
12934 	/*
12935 	 * Don't unload namespace if importing a replicated diskset.
12936 	 * Namespace will be unloaded with an explicit RELEASE_SET ioctl.
12937 	 */
12938 	if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12939 		md_haltsnarf_exit(setno);
12940 		return (err);
12941 	}
12942 
12943 cleanup:
12944 	/*
12945 	 * Halt the set
12946 	 */
12947 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
12948 	(void) md_halt_set(setno, MD_HALT_ALL);
12949 	rw_exit(&md_unit_array_rw.lock);
12950 
12951 	/*
12952 	 * Unload the namespace for the imported set
12953 	 */
12954 	mutex_enter(&mddb_lock);
12955 	mddb_unload_set(setno);
12956 	mutex_exit(&mddb_lock);
12957 
12958 out:
12959 	md_haltsnarf_exit(setno);
12960 	md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
12961 	return (err);
12962 }
12963 #endif	/* MDDB_FAKE */
12964