1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 #include <sys/types.h>
27 #include <sys/conf.h>
28 #include <sys/time.h>
29 #include <sys/uio.h>
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/systeminfo.h>
33 #include <sys/sysmacros.h>
34 #include <sys/buf.h>
35 #include <sys/kmem.h>
36 #include <sys/file.h>
37 #include <sys/open.h>
38 #include <sys/debug.h>
39 #include <sys/stat.h>
40 #include <sys/lvm/mdvar.h>
41 #include <sys/lvm/md_crc.h>
42 #include <sys/lvm/md_convert.h>
43 #include <sys/types.h>
44 #include <sys/kmem.h>
45 #include <sys/lvm/mdmn_commd.h>
46 #include <sys/cladm.h>
47
48 mhd_mhiargs_t defmhiargs = {
49 1000,
50 { 6000, 6000, 30000 }
51 };
52
53 #define MDDB
54
55 #include <sys/lvm/mdvar.h>
56 #include <sys/lvm/mdmed.h>
57 #include <sys/lvm/md_names.h>
58 #include <sys/cred.h>
59 #include <sys/ddi.h>
60 #include <sys/sunddi.h>
61 #include <sys/esunddi.h>
62
63 #include <sys/sysevent/eventdefs.h>
64 #include <sys/sysevent/svm.h>
65
66 extern char svm_bootpath[];
67
68 int md_maxbootlist = MAXBOOTLIST;
69 static ulong_t mddb_maxblocks = 0; /* tune for small records */
70 static int mddb_maxbufheaders = 50;
71 static uint_t mddb_maxcopies = MDDB_NLB;
72
73 /*
74 * If this is set, more detailed messages about DB init will be given, instead
75 * of just the MDE_DB_NODB.
76 */
77 static int mddb_db_err_detail = 0;
78
79 /*
80 * This lock is used to single-thread load/unload of all sets
81 */
82 static kmutex_t mddb_lock;
83
84 /*
85 * You really do NOT want to change this boolean.
86 * It can be VERY dangerous to do so. Loss of
87 * data may occur. USE AT YOUR OWN RISK!!!!
88 */
89 static int mddb_allow_half = 0;
90 /*
91 * For mirrored root allow reboot with only half the replicas available
92 * Flag inserted for Santa Fe project.
93 */
94 int mirrored_root_flag;
95
96 #define ISWHITE(c) (((c) == ' ') || ((c) == '\t') || \
97 ((c) == '\r') || ((c) == '\n'))
98 #define ISNUM(c) (((c) >= '0') && ((c) <= '9'))
99
100 #define SETMUTEX(setno) (&md_set[setno].s_dbmx)
101
102 extern md_krwlock_t md_unit_array_rw; /* md.c */
103 extern set_t md_nsets; /* md.c */
104 extern int md_nmedh; /* md.c */
105 extern md_set_t md_set[]; /* md.c */
106 extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
107 extern dev_info_t *md_devinfo;
108 extern int md_init_debug;
109 extern int md_status;
110 extern md_ops_t *md_opslist;
111 extern md_krwlock_t nm_lock;
112
113 static int update_locatorblock(mddb_set_t *s, md_dev64_t dev,
114 ddi_devid_t didptr, ddi_devid_t old_didptr);
115
116 /*
117 * Defines for crc calculation for records
118 * rec_crcgen generates a crc checksum for a record block
119 * rec_crcchk checks the crc checksum for a record block
120 */
121 #define REC_CRCGEN 0
122 #define REC_CRCCHK 1
123 #define rec_crcgen(s, dep, rbp) \
124 (void) rec_crcfunc(s, dep, rbp, REC_CRCGEN)
125 #define rec_crcchk(s, dep, rbp) \
126 rec_crcfunc(s, dep, rbp, REC_CRCCHK)
127
128 /*
129 * During upgrade, SVM basically runs with the devt from the target
130 * being upgraded. Translations are made from the target devt to the
131 * miniroot devt when writing data out to the disk. This is done by
132 * the following routines:
133 * wrtblklst
134 * writeblks
135 * readblklst
136 * readblks
137 * dt_read
138 *
139 * The following routines are used by the routines listed above and
140 * expect a translated (aka miniroot) devt:
141 * getblks
142 * getmasters
143 *
144 * Also, when calling any system routines, such as ddi_lyr_get_devid,
145 * the translated (aka miniroot) devt must be used.
146 *
147 * By the same token, the major number and major name conversion operations
148 * need to use the name_to_major file from the target system instead
149 * of the name_to_major file on the miniroot. So, calls to
150 * ddi_name_to_major must be replaced with calls to md_targ_name_to_major
151 * when running on an upgrade. Same is true with calls to
152 * ddi_major_to_name.
153 */
154
155
156 #ifndef MDDB_FAKE
157
158 static int
mddb_rwdata(mddb_set_t * s,int flag,buf_t * bp)159 mddb_rwdata(
160 mddb_set_t *s, /* incore db set structure */
161 int flag, /* B_ASYNC, B_FAILFAST or 0 passed in here */
162 buf_t *bp
163 )
164 {
165 int err = 0;
166
167 bp->b_flags = (flag | B_BUSY) & (~B_ASYNC);
168
169 mutex_exit(SETMUTEX(s->s_setno));
170 if (mdv_strategy_tstpnt == NULL ||
171 (*mdv_strategy_tstpnt)(bp, 0, NULL) == 0)
172 (void) bdev_strategy(bp);
173
174 if (flag & B_ASYNC) {
175 mutex_enter(SETMUTEX(s->s_setno));
176 return (0);
177 }
178
179 err = biowait(bp);
180 mutex_enter(SETMUTEX(s->s_setno));
181 return (err);
182 }
183
184 static void
setidentifier(mddb_set_t * s,identifier_t * ident)185 setidentifier(
186 mddb_set_t *s,
187 identifier_t *ident
188 )
189 {
190 if (s->s_setno == MD_LOCAL_SET)
191 (void) strcpy(&ident->serial[0], s->s_ident.serial);
192 else
193 ident->createtime = s->s_ident.createtime;
194 }
195
196 static int
cmpidentifier(mddb_set_t * s,identifier_t * ident)197 cmpidentifier(
198 mddb_set_t *s,
199 identifier_t *ident
200 )
201 {
202 if (s->s_setno == MD_LOCAL_SET)
203 return (strcmp(ident->serial, s->s_ident.serial));
204 else
205 return (timercmp(&ident->createtime,
206 /*CSTYLED*/
207 &s->s_ident.createtime, !=));
208 }
209
210 static int
mddb_devopen(md_dev64_t dev)211 mddb_devopen(
212 md_dev64_t dev
213 )
214 {
215 dev_t ddi_dev = md_dev64_to_dev(dev);
216
217 if (dev_lopen(&ddi_dev, FREAD|FWRITE, OTYP_LYR, kcred) == 0)
218 return (0);
219 return (1);
220 }
221
222 static void
mddb_devclose(md_dev64_t dev)223 mddb_devclose(
224 md_dev64_t dev
225 )
226 {
227 (void) dev_lclose(md_dev64_to_dev(dev), FREAD|FWRITE, OTYP_LYR, kcred);
228 }
229
230 /*
231 * stripe_skip_ts
232 *
233 * Returns a list of fields to be skipped in the stripe record structure.
234 * These fields are ms_timestamp in the component structure.
235 * Used to skip these fields when calculating the checksum.
236 */
237 static crc_skip_t *
stripe_skip_ts(void * un,uint_t revision)238 stripe_skip_ts(void *un, uint_t revision)
239 {
240 struct ms_row32_od *small_mdr;
241 struct ms_row *big_mdr;
242 uint_t row, comp, ncomps, compoff;
243 crc_skip_t *skip;
244 crc_skip_t *skip_prev;
245 crc_skip_t skip_start = {0, 0, 0};
246 ms_unit_t *big_un;
247 ms_unit32_od_t *small_un;
248 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]);
249
250 switch (revision) {
251 case MDDB_REV_RB:
252 case MDDB_REV_RBFN:
253 small_un = (ms_unit32_od_t *)un;
254 skip_prev = &skip_start;
255
256 if (small_un->un_nrows == 0)
257 return (NULL);
258 /*
259 * walk through all rows to find the total number
260 * of components
261 */
262 small_mdr = &small_un->un_row[0];
263 ncomps = 0;
264 for (row = 0; (row < small_un->un_nrows); row++) {
265 ncomps += small_mdr[row].un_ncomp;
266 }
267
268 /* Now walk through the components */
269 compoff = small_un->un_ocomp + rb_off;
270 for (comp = 0; (comp < ncomps); ++comp) {
271 uint_t mdcp = compoff +
272 (comp * sizeof (ms_comp32_od_t));
273 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
274 KM_SLEEP);
275 skip->skip_offset = mdcp +
276 offsetof(ms_comp32_od_t, un_mirror.ms_timestamp);
277 skip->skip_size = sizeof (md_timeval32_t);
278 skip_prev->skip_next = skip;
279 skip_prev = skip;
280 }
281 break;
282 case MDDB_REV_RB64:
283 case MDDB_REV_RB64FN:
284 big_un = (ms_unit_t *)un;
285 skip_prev = &skip_start;
286
287 if (big_un->un_nrows == 0)
288 return (NULL);
289 /*
290 * walk through all rows to find the total number
291 * of components
292 */
293 big_mdr = &big_un->un_row[0];
294 ncomps = 0;
295 for (row = 0; (row < big_un->un_nrows); row++) {
296 ncomps += big_mdr[row].un_ncomp;
297 }
298
299 /* Now walk through the components */
300 compoff = big_un->un_ocomp + rb_off;
301 for (comp = 0; (comp < ncomps); ++comp) {
302 uint_t mdcp = compoff +
303 (comp * sizeof (ms_comp_t));
304 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
305 KM_SLEEP);
306 skip->skip_offset = mdcp +
307 offsetof(ms_comp_t, un_mirror.ms_timestamp);
308 skip->skip_size = sizeof (md_timeval32_t);
309 skip_prev->skip_next = skip;
310 skip_prev = skip;
311 }
312 break;
313 }
314 /* Return the start of the list of fields to skip */
315 return (skip_start.skip_next);
316 }
317
318 /*
319 * mirror_skip_ts
320 *
321 * Returns a list of fields to be skipped in the mirror record structure.
322 * This includes un_last_read and sm_timestamp for each submirror
323 * Used to skip these fields when calculating the checksum.
324 */
325 static crc_skip_t *
mirror_skip_ts(uint_t revision)326 mirror_skip_ts(uint_t revision)
327 {
328 int i;
329 crc_skip_t *skip;
330 crc_skip_t *skip_prev;
331 crc_skip_t skip_start = {0, 0, 0};
332 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]);
333
334 skip_prev = &skip_start;
335
336 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
337 switch (revision) {
338 case MDDB_REV_RB:
339 case MDDB_REV_RBFN:
340 skip->skip_offset = offsetof(mm_unit32_od_t,
341 un_last_read) + rb_off;
342 break;
343 case MDDB_REV_RB64:
344 case MDDB_REV_RB64FN:
345 skip->skip_offset = offsetof(mm_unit_t,
346 un_last_read) + rb_off;
347 break;
348 }
349 skip->skip_size = sizeof (int);
350 skip_prev->skip_next = skip;
351 skip_prev = skip;
352
353 for (i = 0; i < NMIRROR; i++) {
354 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
355 switch (revision) {
356 case MDDB_REV_RB:
357 case MDDB_REV_RBFN:
358 skip->skip_offset = offsetof(mm_unit32_od_t,
359 un_sm[i].sm_timestamp) + rb_off;
360 break;
361 case MDDB_REV_RB64:
362 case MDDB_REV_RB64FN:
363 skip->skip_offset = offsetof(mm_unit_t,
364 un_sm[i].sm_timestamp) + rb_off;
365 break;
366 }
367 skip->skip_size = sizeof (md_timeval32_t);
368 skip_prev->skip_next = skip;
369 skip_prev = skip;
370 }
371 /* Return the start of the list of fields to skip */
372 return (skip_start.skip_next);
373 }
374
375 /*
376 * hotspare_skip_ts
377 *
378 * Returns a list of the timestamp fields in the hotspare record structure.
379 * Used to skip these fields when calculating the checksum.
380 */
381 static crc_skip_t *
hotspare_skip_ts(uint_t revision)382 hotspare_skip_ts(uint_t revision)
383 {
384 crc_skip_t *skip;
385 uint_t rb_off = offsetof(mddb_rb32_t, rb_data[0]);
386
387 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
388 switch (revision) {
389 case MDDB_REV_RB:
390 case MDDB_REV_RBFN:
391 skip->skip_offset = offsetof(hot_spare32_od_t, hs_timestamp) +
392 rb_off;
393 break;
394 case MDDB_REV_RB64:
395 case MDDB_REV_RB64FN:
396 skip->skip_offset = offsetof(hot_spare_t, hs_timestamp) +
397 rb_off;
398 break;
399 }
400 skip->skip_size = sizeof (md_timeval32_t);
401 return (skip);
402 }
403
404 /*
405 * rec_crcfunc
406 *
407 * Calculate or check the checksum for a record
408 * Calculate the crc if check == 0, Check the crc if check == 1
409 *
410 * Record block may be written by different nodes in a multi-owner diskset
411 * (in case of master change), the function rec_crcchk excludes timestamp
412 * fields in crc computation of record data.
413 * Otherwise, timestamp fields will cause each node to have a different
414 * checksum for same record block causing the exclusive-or of all record block
415 * checksums and data block record sums to be non-zero after new master writes
416 * at least one record block.
417 */
418 static uint_t
rec_crcfunc(mddb_set_t * s,mddb_de_ic_t * dep,mddb_rb32_t * rbp,int check)419 rec_crcfunc(
420 mddb_set_t *s,
421 mddb_de_ic_t *dep,
422 mddb_rb32_t *rbp,
423 int check
424 )
425 {
426 crc_skip_t *skip;
427 crc_skip_t *skip_tail;
428 mddb_type_t type = dep->de_type1;
429 uint_t ret;
430
431 /*
432 * Generate a list of the areas to be skipped when calculating
433 * the checksum.
434 * First skip rb_checksum, rb_private and rb_userdata.
435 */
436 skip = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t), KM_SLEEP);
437 skip->skip_offset = offsetof(mddb_rb32_t, rb_checksum_fiddle);
438 skip->skip_size = 3 * sizeof (uint_t);
439 skip_tail = skip;
440 if (MD_MNSET_SETNO(s->s_setno)) {
441 /* For a MN set, skip rb_timestamp */
442 skip_tail = (crc_skip_t *)kmem_zalloc(sizeof (crc_skip_t),
443 KM_SLEEP);
444 skip_tail->skip_offset = offsetof(mddb_rb32_t, rb_timestamp);
445 skip_tail->skip_size = sizeof (md_timeval32_t);
446 skip->skip_next = skip_tail;
447
448 /* Now add a list of timestamps to be skipped */
449 if (type >= MDDB_FIRST_MODID) {
450 switch (dep->de_flags) {
451 case MDDB_F_STRIPE:
452 skip_tail->skip_next =
453 stripe_skip_ts((void *)rbp->rb_data,
454 rbp->rb_revision);
455 break;
456 case MDDB_F_MIRROR:
457 skip_tail->skip_next =
458 mirror_skip_ts(rbp->rb_revision);
459 break;
460 case MDDB_F_HOTSPARE:
461 skip_tail->skip_next =
462 hotspare_skip_ts(rbp->rb_revision);
463 break;
464 default:
465 break;
466 }
467 }
468 }
469
470 if (check) {
471 ret = crcchk(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
472 } else {
473 crcgen(rbp, &rbp->rb_checksum, dep->de_recsize, skip);
474 ret = rbp->rb_checksum;
475 }
476 while (skip) {
477 crc_skip_t *skip_save = skip;
478
479 skip = skip->skip_next;
480 kmem_free(skip_save, sizeof (crc_skip_t));
481 }
482 return (ret);
483 }
484
485 static mddb_bf_t *
allocbuffer(mddb_set_t * s,int sleepflag)486 allocbuffer(
487 mddb_set_t *s,
488 int sleepflag
489 )
490 {
491 mddb_bf_t *bfp;
492
493 while ((bfp = s->s_freebufhead) == NULL) {
494 if (sleepflag == MDDB_NOSLEEP)
495 return ((mddb_bf_t *)NULL);
496 ++s->s_bufmisses;
497 #ifdef DEBUG
498 if (s->s_bufmisses == 1)
499 cmn_err(CE_NOTE,
500 "md: mddb: set %u sleeping for buffer", s->s_setno);
501 #endif
502 s->s_bufwakeup = 1;
503 cv_wait(&s->s_buf_cv, SETMUTEX(s->s_setno));
504 }
505 s->s_freebufhead = bfp->bf_next;
506 bzero((caddr_t)bfp, sizeof (*bfp));
507 bfp->bf_buf.b_back = bfp->bf_buf.b_forw = &bfp->bf_buf;
508 bfp->bf_buf.b_flags = B_BUSY; /* initialize flags */
509 return (bfp);
510 }
511
512 static void
freebuffer(mddb_set_t * s,mddb_bf_t * bfp)513 freebuffer(
514 mddb_set_t *s,
515 mddb_bf_t *bfp
516 )
517 {
518 bfp->bf_next = s->s_freebufhead;
519 s->s_freebufhead = bfp;
520 if (s->s_bufwakeup) {
521 cv_broadcast(&s->s_buf_cv);
522 s->s_bufwakeup = 0;
523 }
524 }
525
526
527 static void
blkbusy(mddb_set_t * s,mddb_block_t blk)528 blkbusy(
529 mddb_set_t *s,
530 mddb_block_t blk
531 )
532 {
533 int bit, byte;
534
535 s->s_freeblkcnt--;
536 byte = blk / 8;
537 bit = 1 << (blk & 7);
538 ASSERT(! (s->s_freebitmap[byte] & bit));
539 s->s_freebitmap[byte] |= bit;
540 }
541
542 static void
blkfree(mddb_set_t * s,mddb_block_t blk)543 blkfree(
544 mddb_set_t *s,
545 mddb_block_t blk
546 )
547 {
548 int bit, byte;
549
550 s->s_freeblkcnt++;
551 byte = blk / 8;
552 bit = 1 << (blk & 7);
553 ASSERT(s->s_freebitmap[byte] & bit);
554 s->s_freebitmap[byte] &= ~bit;
555 }
556
557 static int
blkcheck(mddb_set_t * s,mddb_block_t blk)558 blkcheck(
559 mddb_set_t *s,
560 mddb_block_t blk
561 )
562 {
563 int bit, byte;
564
565 byte = blk / 8;
566 bit = 1 << (blk & 7);
567 return (s->s_freebitmap[byte] & bit);
568 }
569
570 /*
571 * not fast but simple
572 */
573 static mddb_block_t
getfreeblks(mddb_set_t * s,size_t count)574 getfreeblks(
575 mddb_set_t *s,
576 size_t count
577 )
578 {
579 int i;
580 size_t contig;
581
582 contig = 0;
583 for (i = 0; i < s->s_totalblkcnt; i++) {
584 if (blkcheck(s, i)) {
585 contig = 0;
586 } else {
587 contig++;
588 if (contig == count) {
589 contig = i - count + 1;
590 for (i = (int)contig; i < contig + count; i++)
591 blkbusy(s, i);
592 return ((mddb_block_t)contig);
593 }
594 }
595 }
596 return (0);
597 }
598
599 static void
computefreeblks(mddb_set_t * s)600 computefreeblks(
601 mddb_set_t *s
602 )
603 {
604 mddb_db_t *dbp;
605 mddb_de_ic_t *dep;
606 int i;
607 int minblks;
608 int freeblks;
609 mddb_mb_ic_t *mbip;
610 mddb_lb_t *lbp;
611 mddb_block_t maxblk;
612 mddb_did_db_t *did_dbp;
613 int nblks;
614
615 minblks = 0;
616 lbp = s->s_lbp;
617 maxblk = 0;
618
619 /*
620 * Determine the max number of blocks.
621 */
622 nblks = (lbp->lb_flags & MDDB_MNSET) ? MDDB_MN_MAXBLKS : MDDB_MAXBLKS;
623 /*
624 * go through and find highest logical block
625 */
626 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) {
627 if (dbp->db_blknum > maxblk)
628 maxblk = dbp->db_blknum;
629 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
630 for (i = 0; i < dep->de_blkcount; i++)
631 if (dep->de_blks[i] > maxblk)
632 maxblk = dep->de_blks[i];
633 }
634
635 for (i = 0; i < lbp->lb_loccnt; i++) {
636 mddb_locator_t *lp = &lbp->lb_locators[i];
637
638 if ((lp->l_flags & MDDB_F_DELETED) ||
639 (lp->l_flags & MDDB_F_EMASTER))
640 continue;
641
642 freeblks = 0;
643 for (mbip = s->s_mbiarray[i]; mbip != NULL;
644 mbip = mbip->mbi_next) {
645 freeblks += mbip->mbi_mddb_mb.mb_blkcnt;
646 }
647 if (freeblks == 0) /* this happen when there is no */
648 continue; /* master blk */
649
650 if (freeblks <= maxblk) {
651 lp->l_flags |= MDDB_F_TOOSMALL;
652 lp->l_flags &= ~MDDB_F_ACTIVE;
653 }
654
655 if (freeblks < minblks || minblks == 0)
656 minblks = freeblks;
657 }
658 /*
659 * set up reasonable freespace if no
660 * data bases exist
661 */
662 if (minblks == 0)
663 minblks = 100;
664 if (minblks > nblks)
665 minblks = nblks;
666 s->s_freeblkcnt = minblks;
667 s->s_totalblkcnt = minblks;
668 if (! s->s_freebitmapsize) {
669 s->s_freebitmapsize = nblks / 8;
670 s->s_freebitmap = (uchar_t *)kmem_zalloc(s->s_freebitmapsize,
671 KM_SLEEP);
672 }
673 bzero((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
674
675 /* locator block sectors */
676 for (i = 0; i < s->s_lbp->lb_blkcnt; i++)
677 blkbusy(s, i);
678
679 /* locator name sectors */
680 for (i = 0; i < s->s_lbp->lb_lnblkcnt; i++)
681 blkbusy(s, (s->s_lbp->lb_lnfirstblk + i));
682
683 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
684 /* locator block device id information */
685 for (i = 0; i < s->s_lbp->lb_didblkcnt; i++)
686 blkbusy(s, (s->s_lbp->lb_didfirstblk + i));
687
688 /* disk blocks containing actual device ids */
689 did_dbp = s->s_did_icp->did_ic_dbp;
690 while (did_dbp) {
691 for (i = 0; i < did_dbp->db_blkcnt; i++) {
692 blkbusy(s, did_dbp->db_firstblk + i);
693 }
694 did_dbp = did_dbp->db_next;
695 }
696 }
697
698 /* Only use data tags if not a MN set */
699 if (!(lbp->lb_flags & MDDB_MNSET)) {
700 /* Found a bad tag, do NOT mark the data tag blks busy here */
701 if (! (md_get_setstatus(s->s_setno) & MD_SET_BADTAG)) {
702 for (i = 0; i < s->s_lbp->lb_dtblkcnt; i++)
703 blkbusy(s, (s->s_lbp->lb_dtfirstblk + i));
704 }
705 }
706
707 /* directory block/entry sectors */
708 for (dbp = s->s_dbp; dbp != 0; dbp = dbp->db_next) {
709 blkbusy(s, dbp->db_blknum);
710 for (dep = dbp->db_firstentry; dep != 0; dep = dep->de_next)
711 for (i = 0; i < dep->de_blkcount; i++)
712 blkbusy(s, dep->de_blks[i]);
713 }
714 }
715
716 /*
717 * Add free space to the device id incore free list.
718 * Called:
719 * - During startup when all devid blocks are temporarily placed on the
720 * free list
721 * - After a devid has been deleted via the metadb command.
722 * - When mddb_devid_free_get adds unused space from a disk block
723 * to free list
724 */
725 static int
mddb_devid_free_add(mddb_set_t * s,uint_t firstblk,uint_t offset,uint_t length)726 mddb_devid_free_add(
727 mddb_set_t *s,
728 uint_t firstblk,
729 uint_t offset,
730 uint_t length
731 )
732 {
733 mddb_did_free_t *did_freep;
734
735 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
736 return (0);
737 }
738
739 did_freep = (mddb_did_free_t *)kmem_zalloc(sizeof (mddb_did_free_t),
740 KM_SLEEP);
741 did_freep->free_blk = firstblk;
742 did_freep->free_offset = offset;
743 did_freep->free_length = length;
744 did_freep->free_next = s->s_did_icp->did_ic_freep;
745 s->s_did_icp->did_ic_freep = did_freep;
746
747 return (0);
748 }
749
750 /*
751 * Remove specific free space from the device id incore free list.
752 * Called at startup (after all devid blocks have been placed on
753 * free list) in order to remove the free space from the list that
754 * contains actual devids.
755 * Returns 0 if area successfully removed.
756 * Returns 1 if no matching area is found - so nothing removed.
757 */
758 static int
mddb_devid_free_delete(mddb_set_t * s,uint_t firstblk,uint_t offset,uint_t length)759 mddb_devid_free_delete(
760 mddb_set_t *s,
761 uint_t firstblk,
762 uint_t offset,
763 uint_t length
764 )
765 {
766 int block_found = 0;
767 mddb_did_free_t *did_freep1; /* next free block */
768 mddb_did_free_t *did_freep2 = 0; /* previous free block */
769 mddb_did_free_t *did_freep_before; /* area before offset, len */
770 mddb_did_free_t *did_freep_after; /* area after offset, len */
771 uint_t old_length;
772
773 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
774 return (1);
775 }
776
777 /* find free block for this devid */
778 did_freep1 = s->s_did_icp->did_ic_freep;
779 while (did_freep1) {
780 /*
781 * Look through free list of <block, offset, length> to
782 * find our entry in the free list. Our entry should
783 * exist since the entire devid block was placed into
784 * this free list at startup. This code is just removing
785 * the non-free (in-use) portions of the devid block so
786 * that the remaining linked list does indeed just
787 * contain a free list.
788 *
789 * Our entry has been found if
790 * - the blocks match,
791 * - the offset (starting address) in the free list is
792 * less than the offset of our entry and
793 * - the length+offset (ending address) in the free list is
794 * greater than the length+offset of our entry.
795 */
796 if ((did_freep1->free_blk == firstblk) &&
797 (did_freep1->free_offset <= offset) &&
798 ((did_freep1->free_length + did_freep1->free_offset) >=
799 (length + offset))) {
800 /* Have found our entry - remove from list */
801 block_found = 1;
802 did_freep_before = did_freep1;
803 old_length = did_freep1->free_length;
804 /* did_freep1 - pts to next free block */
805 did_freep1 = did_freep1->free_next;
806 if (did_freep2) {
807 did_freep2->free_next = did_freep1;
808 } else {
809 s->s_did_icp->did_ic_freep = did_freep1;
810 }
811
812 /*
813 * did_freep_before points to area in block before
814 * offset, length.
815 */
816 did_freep_before->free_length = offset -
817 did_freep_before->free_offset;
818 /*
819 * did_freep_after points to area in block after
820 * offset, length.
821 */
822 did_freep_after = (mddb_did_free_t *)kmem_zalloc
823 (sizeof (mddb_did_free_t), KM_SLEEP);
824 did_freep_after->free_blk = did_freep_before->free_blk;
825 did_freep_after->free_offset = offset + length;
826 did_freep_after->free_length = old_length - length -
827 did_freep_before->free_length;
828 /*
829 * Add before and after areas to free list
830 * If area before or after offset, length has length
831 * of 0, that entry is not added.
832 */
833 if (did_freep_after->free_length) {
834 did_freep_after->free_next = did_freep1;
835 if (did_freep2) {
836 did_freep2->free_next =
837 did_freep_after;
838 } else {
839 s->s_did_icp->did_ic_freep =
840 did_freep_after;
841 }
842 did_freep1 = did_freep_after;
843 } else {
844 kmem_free(did_freep_after,
845 sizeof (mddb_did_free_t));
846 }
847
848 if (did_freep_before->free_length) {
849 did_freep_before->free_next = did_freep1;
850 if (did_freep2) {
851 did_freep2->free_next =
852 did_freep_before;
853 } else {
854 s->s_did_icp->did_ic_freep =
855 did_freep_before;
856 }
857 } else {
858 kmem_free(did_freep_before,
859 sizeof (mddb_did_free_t));
860 }
861 break;
862 } else {
863 did_freep2 = did_freep1;
864 did_freep1 = did_freep1->free_next;
865 }
866 }
867 if (block_found == 0) {
868 return (1);
869 } else {
870 return (0);
871 }
872 }
873
874 /*
875 * Find free space of devid length and remove free space from list.
876 * Return a pointer to the previously free area.
877 *
878 * If there's not enough free space on the free list, get an empty
879 * disk block, put the empty disk block on the did_ic_dbp linked list,
880 * and add the disk block space not used for devid to the free list.
881 *
882 * Return pointer to address (inside disk block) of free area for devid.
883 * Return 0 if error.
884 */
885 static caddr_t
mddb_devid_free_get(mddb_set_t * s,uint_t len,uint_t * blk,uint_t * cnt,uint_t * offset)886 mddb_devid_free_get(
887 mddb_set_t *s,
888 uint_t len,
889 uint_t *blk,
890 uint_t *cnt,
891 uint_t *offset
892 )
893 {
894 mddb_did_free_t *freep, *freep2;
895 mddb_did_db_t *dbp;
896 uint_t blk_cnt, blk_num;
897 ddi_devid_t devid_ptr = NULL;
898
899 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
900 return (0);
901 }
902
903 freep = s->s_did_icp->did_ic_freep;
904 freep2 = (mddb_did_free_t *)NULL;
905 while (freep) {
906 /* found a free area - remove from free list */
907 if (len <= freep->free_length) {
908 *blk = freep->free_blk;
909 *offset = freep->free_offset;
910 /* find disk block pointer that contains free area */
911 dbp = s->s_did_icp->did_ic_dbp;
912 while (dbp) {
913 if (dbp->db_firstblk == *blk)
914 break;
915 else
916 dbp = dbp->db_next;
917 }
918 /*
919 * If a disk block pointer can't be found - something
920 * is wrong, so don't use this free space.
921 */
922 if (dbp == NULL) {
923 freep2 = freep;
924 freep = freep->free_next;
925 continue;
926 }
927
928 devid_ptr = (ddi_devid_t)(dbp->db_ptr + *offset);
929 *cnt = dbp->db_blkcnt;
930
931 /* Update free list information */
932 freep->free_offset += len;
933 freep->free_length -= len;
934 if (freep->free_length == 0) {
935 if (freep2) {
936 freep2->free_next =
937 freep->free_next;
938 } else {
939 s->s_did_icp->did_ic_freep =
940 freep->free_next;
941 }
942 kmem_free(freep, sizeof (mddb_did_free_t));
943 }
944 break;
945 }
946 freep2 = freep;
947 freep = freep->free_next;
948 }
949
950 /* Didn't find a free spot */
951 if (freep == NULL) {
952 /* get free logical disk blk in replica */
953 blk_cnt = btodb(len + (MDDB_BSIZE - 1));
954 blk_num = getfreeblks(s, blk_cnt);
955 if (blk_num == 0)
956 return (0);
957
958 /* Add disk block to disk block linked list */
959 dbp = kmem_zalloc(sizeof (mddb_did_db_t), KM_SLEEP);
960 dbp->db_firstblk = blk_num;
961 dbp->db_blkcnt = blk_cnt;
962 dbp->db_ptr = (caddr_t)kmem_zalloc(dbtob(blk_cnt), KM_SLEEP);
963 dbp->db_next = s->s_did_icp->did_ic_dbp;
964 s->s_did_icp->did_ic_dbp = dbp;
965 devid_ptr = (ddi_devid_t)dbp->db_ptr;
966
967 /* Update return values */
968 *blk = blk_num;
969 *offset = 0;
970 *cnt = blk_cnt;
971
972 /* Add unused part of block to free list */
973 (void) mddb_devid_free_add(s, blk_num,
974 len, (dbtob(blk_cnt) - len));
975 }
976
977 return ((caddr_t)devid_ptr);
978 }
979
980 /*
981 * Add device id information for locator index to device id area in set.
982 * Get free area to store device id from free list. Update checksum
983 * for mddb_did_blk.
984 *
985 * This routine does not write any data out to disk.
986 * After this routine has been called, the routine, writelocall, should
987 * be called to write both the locator block and device id area out
988 * to disk.
989 */
990 static int
mddb_devid_add(mddb_set_t * s,uint_t index,ddi_devid_t devid,char * minor_name)991 mddb_devid_add(
992 mddb_set_t *s,
993 uint_t index,
994 ddi_devid_t devid,
995 char *minor_name
996 )
997 {
998 uint_t devid_len;
999 uint_t blk, offset;
1000 ddi_devid_t devid_ptr;
1001 mddb_did_info_t *did_info;
1002 uint_t blkcnt, i;
1003 mddb_did_blk_t *did_blk;
1004
1005 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1006 return (1);
1007 }
1008 if (strlen(minor_name) > (MDDB_MINOR_NAME_MAX - 1))
1009 return (1);
1010
1011 /* Check if device id has already been added */
1012 did_blk = s->s_did_icp->did_ic_blkp;
1013 did_info = &(did_blk->blk_info[index]);
1014 if (did_info->info_flags & MDDB_DID_EXISTS)
1015 return (0);
1016
1017 devid_len = ddi_devid_sizeof(devid);
1018 devid_ptr = (ddi_devid_t)mddb_devid_free_get(s,
1019 devid_len, &blk, &blkcnt, &offset);
1020
1021 if (devid_ptr == NULL) {
1022 return (1);
1023 }
1024
1025 /* Copy devid into devid free area */
1026 for (i = 0; i < devid_len; i++)
1027 ((char *)devid_ptr)[i] = ((char *)devid)[i];
1028
1029 /* Update mddb_did_info area for new device id */
1030 did_info->info_flags = MDDB_DID_EXISTS | MDDB_DID_VALID;
1031
1032 /*
1033 * Only set UPDATED flag for non-replicated import cases.
1034 * This allows the side locator driver name index to get
1035 * updated in load_old_replicas.
1036 */
1037 if (!(md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT))
1038 did_info->info_flags |= MDDB_DID_UPDATED;
1039
1040 did_info->info_firstblk = blk;
1041 did_info->info_blkcnt = blkcnt;
1042 did_info->info_offset = offset;
1043 did_info->info_length = devid_len;
1044 (void) strcpy(did_info->info_minor_name, minor_name);
1045 crcgen(devid_ptr, &did_info->info_checksum, devid_len, NULL);
1046
1047 /* Add device id pointer to did_ic_devid array */
1048 s->s_did_icp->did_ic_devid[index] = devid_ptr;
1049
1050 return (0);
1051 }
1052
1053
1054 /*
1055 * Delete device id information for locator index from device id area in set.
1056 * Add device id space to free area.
1057 *
1058 * This routine does not write any data out to disk.
1059 * After this routine has been called, the routine, writelocall, should
1060 * be called to write both the locator block and device id area out
1061 * to disk.
1062 */
1063 static int
mddb_devid_delete(mddb_set_t * s,uint_t index)1064 mddb_devid_delete(mddb_set_t *s, uint_t index)
1065 {
1066 mddb_did_info_t *did_info;
1067 mddb_did_blk_t *did_blk;
1068
1069 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1070 return (1);
1071 }
1072
1073 /* Get device id information from mddb_did_blk */
1074 did_blk = s->s_did_icp->did_ic_blkp;
1075 did_info = &(did_blk->blk_info[index]);
1076
1077 /*
1078 * Ensure that the underlying device supports device ids
1079 * before arbitrarily removing them.
1080 */
1081 if (!(did_info->info_flags & MDDB_DID_EXISTS)) {
1082 return (1);
1083 }
1084
1085 /* Remove device id information from mddb_did_blk */
1086 did_info->info_flags = 0;
1087
1088 /* Remove device id from incore area */
1089 s->s_did_icp->did_ic_devid[index] = (ddi_devid_t)NULL;
1090
1091 /* Add new free space in disk block to free list */
1092 (void) mddb_devid_free_add(s, did_info->info_firstblk,
1093 did_info->info_offset, did_info->info_length);
1094
1095 return (0);
1096 }
1097
1098 /*
1099 * Check if there is a device id for a locator index.
1100 *
1101 * Caller of this routine should not free devid or minor_name since
1102 * these will point to internal data structures that should not
1103 * be freed.
1104 */
1105 static int
mddb_devid_get(mddb_set_t * s,uint_t index,ddi_devid_t * devid,char ** minor_name)1106 mddb_devid_get(
1107 mddb_set_t *s,
1108 uint_t index,
1109 ddi_devid_t *devid,
1110 char **minor_name
1111 )
1112 {
1113 mddb_did_info_t *did_info;
1114
1115 if (!(s->s_lbp->lb_flags & MDDB_DEVID_STYLE)) {
1116 return (0);
1117 }
1118 did_info = &(s->s_did_icp->did_ic_blkp->blk_info[index]);
1119
1120 if (did_info->info_flags & MDDB_DID_EXISTS) {
1121 *devid = s->s_did_icp->did_ic_devid[index];
1122 *minor_name =
1123 s->s_did_icp->did_ic_blkp->blk_info[index].info_minor_name;
1124 return (1);
1125 } else
1126 return (0);
1127
1128
1129 }
1130
1131 /*
1132 * Check if device id is valid on current system.
1133 * Needs devid, previously known dev_t and current minor_name.
1134 *
1135 * Success:
1136 * Returns 0 if valid device id is found and updates
1137 * dev_t if the dev_t associated with the device id is
1138 * different than dev_t.
1139 * Failure:
1140 * Returns 1 if device id not valid on current system.
1141 */
1142 static int
mddb_devid_validate(ddi_devid_t devid,md_dev64_t * dev,char * minor_name)1143 mddb_devid_validate(ddi_devid_t devid, md_dev64_t *dev, char *minor_name)
1144 {
1145 int retndevs;
1146 dev_t *ddi_devs;
1147 int devid_flag = 0;
1148 int cnt;
1149
1150 if (dev == 0)
1151 return (1);
1152 /*
1153 * See if devid is valid in the current system.
1154 * If so, set dev to match the devid.
1155 */
1156 if (ddi_lyr_devid_to_devlist(devid, minor_name,
1157 &retndevs, &ddi_devs) == DDI_SUCCESS) {
1158 if (retndevs > 0) {
1159 /* devid is valid to use */
1160 devid_flag = 1;
1161 /* does dev_t in list match dev */
1162 cnt = 0;
1163 while (cnt < retndevs) {
1164 if (*dev == md_expldev(ddi_devs[cnt]))
1165 break;
1166 cnt++;
1167 }
1168 /*
1169 * If a different dev_t, then setup
1170 * new dev and new major name
1171 */
1172 if (cnt == retndevs) {
1173 *dev = md_expldev(ddi_devs[0]);
1174 }
1175 ddi_lyr_free_devlist(ddi_devs, retndevs);
1176 }
1177 }
1178 if (devid_flag)
1179 return (0);
1180 else
1181 return (1);
1182 }
1183
1184
1185 /*
1186 * Free the devid incore data areas
1187 */
1188 static void
mddb_devid_icp_free(mddb_did_ic_t ** did_icp,mddb_lb_t * lbp)1189 mddb_devid_icp_free(mddb_did_ic_t **did_icp, mddb_lb_t *lbp)
1190 {
1191 mddb_did_free_t *did_freep1, *did_freep2;
1192 mddb_did_db_t *did_dbp1, *did_dbp2;
1193 mddb_did_ic_t *icp = *did_icp;
1194
1195 if (icp) {
1196 if (icp->did_ic_blkp) {
1197 kmem_free((caddr_t)icp->did_ic_blkp,
1198 dbtob(lbp->lb_didblkcnt));
1199 icp->did_ic_blkp = (mddb_did_blk_t *)NULL;
1200 }
1201
1202 if (icp->did_ic_dbp) {
1203 did_dbp1 = icp->did_ic_dbp;
1204 while (did_dbp1) {
1205 did_dbp2 = did_dbp1->db_next;
1206 kmem_free((caddr_t)did_dbp1->db_ptr,
1207 dbtob(did_dbp1->db_blkcnt));
1208 kmem_free((caddr_t)did_dbp1,
1209 sizeof (mddb_did_db_t));
1210 did_dbp1 = did_dbp2;
1211 }
1212 }
1213
1214 if (icp->did_ic_freep) {
1215 did_freep1 = icp->did_ic_freep;
1216 while (did_freep1) {
1217 did_freep2 = did_freep1->free_next;
1218 kmem_free((caddr_t)did_freep1,
1219 sizeof (mddb_did_free_t));
1220 did_freep1 = did_freep2;
1221 }
1222 }
1223
1224 kmem_free((caddr_t)icp, sizeof (mddb_did_ic_t));
1225 *did_icp = (mddb_did_ic_t *)NULL;
1226 }
1227
1228 }
1229
1230 static daddr_t
getphysblk(mddb_block_t blk,mddb_mb_ic_t * mbip)1231 getphysblk(
1232 mddb_block_t blk,
1233 mddb_mb_ic_t *mbip
1234 )
1235 {
1236 mddb_mb_t *mbp = &(mbip->mbi_mddb_mb);
1237
1238 while (blk >= mbp->mb_blkcnt) {
1239 if (! mbip->mbi_next)
1240 return ((daddr_t)-1); /* no such block */
1241 blk -= mbp->mb_blkcnt;
1242 mbip = mbip->mbi_next;
1243 mbp = &(mbip->mbi_mddb_mb);
1244 }
1245
1246 if (blk >= mbp->mb_blkmap.m_consecutive)
1247 return ((daddr_t)-1); /* no such block */
1248
1249 return ((daddr_t)(mbp->mb_blkmap.m_firstblk + blk));
1250 }
1251
1252 /*
1253 * when a buf header is passed in the new buffer must be
1254 * put on the front of the chain. writerec counts on it
1255 */
1256 static int
putblks(mddb_set_t * s,caddr_t buffer,daddr_t blk,int cnt,md_dev64_t device,mddb_bf_t ** bufhead)1257 putblks(
1258 mddb_set_t *s, /* incore db set structure */
1259 caddr_t buffer, /* adr of buffer to be written */
1260 daddr_t blk, /* block number for first block */
1261 int cnt, /* number of blocks to be written */
1262 md_dev64_t device, /* device to be written to */
1263 mddb_bf_t **bufhead /* if non-zero then ASYNC I/O */
1264 /* and put buf address here */
1265 )
1266 {
1267 buf_t *bp;
1268 mddb_bf_t *bfp;
1269 int err = 0;
1270
1271 bfp = allocbuffer(s, MDDB_SLEEPOK);
1272 bp = &bfp->bf_buf;
1273 bp->b_bcount = MDDB_BSIZE * cnt;
1274 bp->b_un.b_addr = buffer;
1275 bp->b_blkno = blk;
1276 bp->b_edev = md_dev64_to_dev(device);
1277 /*
1278 * if a header for a buf chain is passed in this is async io.
1279 * currently only done for optimize records
1280 */
1281 if (bufhead) {
1282 bfp->bf_next = *bufhead;
1283 *bufhead = bfp;
1284 (void) mddb_rwdata(s, B_WRITE|B_ASYNC, bp);
1285 return (0);
1286 }
1287 err = mddb_rwdata(s, B_WRITE, bp);
1288 freebuffer(s, bfp);
1289 if (err) {
1290 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1291 s->s_setno, device);
1292 return (MDDB_F_EWRITE);
1293 }
1294 return (0);
1295 }
1296
1297 /*
1298 * wrtblklst - takes an array of logical block numbers
1299 * and writes the buffer to those blocks (scatter).
1300 * If called during upgrade, this routine expects a
1301 * non-translated (aka target) dev.
1302 */
1303 static int
wrtblklst(mddb_set_t * s,caddr_t buffer,mddb_block_t blka[],daddr_t cnt,const int li,mddb_bf_t ** bufhead,int master_only)1304 wrtblklst(
1305 mddb_set_t *s, /* incore set structure */
1306 caddr_t buffer, /* buffer to be written (record blk) */
1307 mddb_block_t blka[], /* list of logical blks for record */
1308 daddr_t cnt, /* number of logical blks */
1309 const int li, /* locator index */
1310 mddb_bf_t **bufhead, /* if non-zero then ASYNC I/O */
1311 /* and put buf address here */
1312 int master_only /* allow only master node to write */
1313 )
1314 {
1315 daddr_t blk;
1316 daddr_t blk1;
1317 int err = 0;
1318 int cons;
1319 mddb_lb_t *lbp = s->s_lbp;
1320 mddb_locator_t *lp = &lbp->lb_locators[li];
1321 md_dev64_t dev;
1322 mddb_mb_ic_t *mbip = s->s_mbiarray[li];
1323
1324 /*
1325 * If a MN diskset and only the master can write,
1326 * then a non-master node will just return success.
1327 */
1328 if (lbp->lb_flags & MDDB_MNSET) {
1329 if (master_only == MDDB_WR_ONLY_MASTER) {
1330 /* return successfully if we aren't the master */
1331 if (!(md_set[s->s_setno].s_am_i_master)) {
1332 return (0);
1333 }
1334 }
1335 if (mbip == NULL)
1336 return (MDDB_F_EWRITE);
1337 }
1338
1339 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1340 if (dev == NODEV64) {
1341 return (1);
1342 }
1343
1344 blk = getphysblk(blka[0], mbip);
1345 ASSERT(blk >= 0);
1346
1347 cons = 1;
1348 while (cnt) {
1349 if (cons != cnt) {
1350 blk1 = getphysblk(blka[cons], mbip);
1351 ASSERT(blk1 >= 0);
1352 if ((blk + cons) == blk1) {
1353 cons++;
1354 continue;
1355 }
1356 }
1357 if (err = putblks(s, buffer, blk, cons, dev, bufhead)) {
1358 /*
1359 * If an MN diskset and any_node_can_write
1360 * then this request is coming from writeoptrecord
1361 * and l_flags field should not be updated.
1362 * l_flags will be updated as a result of sending
1363 * a class1 message to the master. Setting l_flags
1364 * here will cause slave to be out of sync with
1365 * master.
1366 *
1367 * Otherwise, set the error in l_flags
1368 * (this occurs if this is not a MN diskset or
1369 * only_master_can_write is set).
1370 */
1371 if ((!(lbp->lb_flags & MDDB_MNSET)) ||
1372 (master_only == MDDB_WR_ONLY_MASTER)) {
1373 lp->l_flags |= MDDB_F_EWRITE;
1374 }
1375 return (err);
1376 }
1377 if (bufhead)
1378 (*bufhead)->bf_locator = lp;
1379
1380 buffer += MDDB_BSIZE * cons;
1381 cnt -= cons;
1382 blka += cons;
1383 if (cnt) {
1384 blk = getphysblk(blka[0], mbip);
1385 ASSERT(blk >= 0);
1386 }
1387 cons = 1;
1388 }
1389
1390 return (0);
1391 }
1392
1393 /*
1394 * writeblks - takes a logical block number/block count pair
1395 * and writes the buffer to those contiguous logical blocks.
1396 * If called during upgrade, this routine expects a non-translated
1397 * (aka target) dev.
1398 */
1399 static int
writeblks(mddb_set_t * s,caddr_t buffer,mddb_block_t blk,int cnt,const int li,int master_only)1400 writeblks(
1401 mddb_set_t *s, /* incore set structure */
1402 caddr_t buffer, /* buffer to be written */
1403 mddb_block_t blk, /* starting logical block number */
1404 int cnt, /* number of log blocks to be written */
1405 const int li, /* locator index */
1406 int master_only /* allow only master node to write */
1407 )
1408 {
1409 daddr_t physblk;
1410 int err = 0;
1411 int i;
1412 mddb_lb_t *lbp = s->s_lbp;
1413 mddb_locator_t *lp = &lbp->lb_locators[li];
1414 md_dev64_t dev;
1415 mddb_block_t *blkarray;
1416 int size;
1417 int ret;
1418
1419 /*
1420 * If a MN diskset and only the master can write,
1421 * then a non-master node will just return success.
1422 */
1423 if ((lbp->lb_flags & MDDB_MNSET) &&
1424 (master_only == MDDB_WR_ONLY_MASTER)) {
1425 /* return successfully if we aren't the master */
1426 if (!(md_set[s->s_setno].s_am_i_master)) {
1427 return (0);
1428 }
1429 }
1430
1431 dev = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
1432 if (dev == NODEV64) {
1433 return (1);
1434 }
1435
1436 if (cnt > 1) {
1437 size = sizeof (mddb_block_t) * cnt;
1438 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1439 for (i = 0; i < cnt; i++)
1440 blkarray[i] = blk + i;
1441 ret = wrtblklst(s, buffer, blkarray, cnt,
1442 li, 0, MDDB_WR_ONLY_MASTER);
1443 kmem_free(blkarray, size);
1444 return (ret);
1445 }
1446 physblk = getphysblk(blk, s->s_mbiarray[li]);
1447 ASSERT(physblk > 0);
1448 if (err = putblks(s, buffer, physblk, 1, dev, (mddb_bf_t **)0)) {
1449 lp->l_flags |= MDDB_F_EWRITE;
1450 return (err);
1451 }
1452 return (0);
1453 }
1454
1455 /*
1456 * writeall - will write the buffer to all ACTIVE/NON-ERRORED replicas.
1457 */
1458 static int
writeall(mddb_set_t * s,caddr_t buffer,mddb_block_t block,int cnt,int master_only)1459 writeall(
1460 mddb_set_t *s, /* incore set structure */
1461 caddr_t buffer, /* buffer to be written */
1462 mddb_block_t block, /* starting logical block number */
1463 int cnt, /* number of log blocks to be written */
1464 int master_only /* allow only master node to write */
1465 )
1466 {
1467 int li;
1468 int err = 0;
1469 mddb_lb_t *lbp = s->s_lbp;
1470
1471 for (li = 0; li < lbp->lb_loccnt; li++) {
1472 mddb_locator_t *lp = &lbp->lb_locators[li];
1473
1474 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1475 (lp->l_flags & MDDB_F_EWRITE))
1476 continue;
1477
1478 err |= writeblks(s, buffer, block, cnt, li, master_only);
1479 }
1480
1481 return (err);
1482 }
1483
1484 /*
1485 * writelocall - write the locator block and device id information (if
1486 * replica is in device id format) to all ACTIVE/NON-ERRORER replicas.
1487 *
1488 * Increments the locator block's commitcnt. Updates the device id area's
1489 * commitcnt if the replica is in device id format. Regenerates the
1490 * checksums after updating the commitcnt(s).
1491 */
1492 static int
writelocall(mddb_set_t * s)1493 writelocall(
1494 mddb_set_t *s /* incore set structure */
1495 )
1496 {
1497 int li;
1498 int err = 0;
1499 mddb_lb_t *lbp = s->s_lbp;
1500 mddb_did_blk_t *did_blk;
1501 mddb_did_db_t *did_dbp;
1502
1503 s->s_lbp->lb_commitcnt++;
1504 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1505 did_blk = s->s_did_icp->did_ic_blkp;
1506 did_blk->blk_commitcnt = s->s_lbp->lb_commitcnt;
1507 crcgen(did_blk, &did_blk->blk_checksum,
1508 dbtob(lbp->lb_didblkcnt), NULL);
1509 }
1510 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
1511
1512 for (li = 0; li < lbp->lb_loccnt; li++) {
1513 mddb_locator_t *lp = &lbp->lb_locators[li];
1514
1515 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
1516 (lp->l_flags & MDDB_F_EWRITE))
1517 continue;
1518
1519 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
1520 /* write out blocks containing actual device ids */
1521 did_dbp = s->s_did_icp->did_ic_dbp;
1522 while (did_dbp) {
1523 err |= writeblks(s, (caddr_t)did_dbp->db_ptr,
1524 did_dbp->db_firstblk,
1525 did_dbp->db_blkcnt, li,
1526 MDDB_WR_ONLY_MASTER);
1527 did_dbp = did_dbp->db_next;
1528 }
1529
1530 /* write out device id area block */
1531 err |= writeblks(s, (caddr_t)did_blk,
1532 lbp->lb_didfirstblk, lbp->lb_didblkcnt, li,
1533 MDDB_WR_ONLY_MASTER);
1534 }
1535 /* write out locator block */
1536 err |= writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
1537 MDDB_WR_ONLY_MASTER);
1538 }
1539
1540 /*
1541 * If a MN diskset and this is the master, set the PARSE_LOCBLK flag
1542 * in the mddb_set structure to show that the locator block has
1543 * been changed.
1544 */
1545
1546 if ((lbp->lb_flags & MDDB_MNSET) &&
1547 (md_set[s->s_setno].s_am_i_master)) {
1548 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
1549 }
1550 return (err);
1551 }
1552
1553 /*
1554 * If called during upgrade, this routine expects a translated
1555 * (aka miniroot) dev.
1556 */
1557 static int
getblks(mddb_set_t * s,caddr_t buffer,md_dev64_t device,daddr_t blk,int cnt,int flag)1558 getblks(
1559 mddb_set_t *s, /* incore db set structure */
1560 caddr_t buffer, /* buffer to read data into */
1561 md_dev64_t device, /* device to read from */
1562 daddr_t blk, /* physical block number to read */
1563 int cnt, /* number of blocks to read */
1564 int flag /* flags for I/O */
1565 )
1566 {
1567 buf_t *bp;
1568 mddb_bf_t *bfp;
1569 int err = 0;
1570
1571 bfp = allocbuffer(s, MDDB_SLEEPOK); /* this will never sleep */
1572 bp = &bfp->bf_buf;
1573 bp->b_bcount = MDDB_BSIZE * cnt;
1574 bp->b_un.b_addr = buffer;
1575 bp->b_blkno = blk;
1576 bp->b_edev = md_dev64_to_dev(device);
1577 err = mddb_rwdata(s, (B_READ | flag), bp);
1578 freebuffer(s, bfp);
1579 if (err) {
1580 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED, SVM_TAG_REPLICA,
1581 s->s_setno, device);
1582 return (MDDB_F_EREAD);
1583 }
1584 return (0);
1585 }
1586
1587 /*
1588 * readblklst - takes an array of logical block numbers
1589 * and reads those blocks (gather) into the buffer.
1590 * If called during upgrade, this routine expects a non-translated
1591 * (aka target) dev.
1592 */
1593 static int
readblklst(mddb_set_t * s,caddr_t buffer,mddb_block_t blka[],daddr_t cnt,int li,int flag)1594 readblklst(
1595 mddb_set_t *s, /* incore set structure */
1596 caddr_t buffer, /* buffer to be read (record block) */
1597 mddb_block_t blka[], /* list of logical blocks to be read */
1598 daddr_t cnt, /* number of logical blocks */
1599 int li, /* locator index */
1600 int flag /* flags for I/O */
1601 )
1602 {
1603 daddr_t blk;
1604 daddr_t blk1;
1605 int err = 0;
1606 int cons;
1607 md_dev64_t dev;
1608 mddb_mb_ic_t *mbip;
1609
1610 mbip = s->s_mbiarray[li];
1611 dev = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1612 dev = md_xlate_targ_2_mini(dev);
1613 if (dev == NODEV64) {
1614 return (1);
1615 }
1616
1617 blk = getphysblk(blka[0], mbip);
1618 ASSERT(blk >= 0);
1619
1620 cons = 1;
1621 while (cnt) {
1622 if (cons != cnt) {
1623 blk1 = getphysblk(blka[cons], mbip);
1624 ASSERT(blk1 >= 0);
1625 if ((blk + cons) == blk1) {
1626 cons++;
1627 continue;
1628 }
1629 }
1630 if (err = getblks(s, buffer, dev, blk, cons, flag))
1631 return (err);
1632 buffer += MDDB_BSIZE * cons;
1633 cnt -= cons;
1634 blka += cons;
1635 if (cnt) {
1636 blk = getphysblk(blka[0], mbip);
1637 ASSERT(blk >= 0);
1638 }
1639 cons = 1;
1640 }
1641 return (0);
1642 }
1643
1644 /*
1645 * readblks - takes a logical block number/block count pair
1646 * and reads those contiguous logical blocks into the buffer.
1647 * If called during upgrade, this routine expects a non-translated
1648 * (aka target) dev.
1649 */
1650 static int
readblks(mddb_set_t * s,caddr_t buffer,mddb_block_t blk,int cnt,int li)1651 readblks(
1652 mddb_set_t *s, /* incore set structure */
1653 caddr_t buffer, /* buffer to be read into */
1654 mddb_block_t blk, /* logical block number to be read */
1655 int cnt, /* number of logical blocks to be read */
1656 int li /* locator index */
1657 )
1658 {
1659 daddr_t physblk;
1660 md_dev64_t device;
1661 int i;
1662 mddb_block_t *blkarray;
1663 int size;
1664 int ret;
1665
1666 if (cnt > 1) {
1667 size = sizeof (mddb_block_t) * cnt;
1668 blkarray = (mddb_block_t *)kmem_alloc(size, KM_SLEEP);
1669 for (i = 0; i < cnt; i++)
1670 blkarray[i] = blk + i;
1671 ret = readblklst(s, buffer, blkarray, cnt, li, 0);
1672 kmem_free(blkarray, size);
1673 return (ret);
1674 }
1675 physblk = getphysblk(blk, s->s_mbiarray[li]);
1676 ASSERT(physblk > 0);
1677 device = md_expldev(s->s_lbp->lb_locators[li].l_dev);
1678 device = md_xlate_targ_2_mini(device);
1679 if (device == NODEV64) {
1680 return (1);
1681 }
1682 return (getblks(s, buffer, device, physblk, 1, 0));
1683 }
1684
1685 static void
single_thread_start(mddb_set_t * s)1686 single_thread_start(
1687 mddb_set_t *s
1688 )
1689 {
1690 while (s->s_singlelockgotten) {
1691 s->s_singlelockwanted++;
1692 cv_wait(&s->s_single_thread_cv, SETMUTEX(s->s_setno));
1693 }
1694 s->s_singlelockgotten++;
1695 }
1696
1697 static void
single_thread_end(mddb_set_t * s)1698 single_thread_end(
1699 mddb_set_t *s
1700 )
1701 {
1702 ASSERT(s->s_singlelockgotten);
1703 s->s_singlelockgotten = 0;
1704 if (s->s_singlelockwanted) {
1705 s->s_singlelockwanted = 0;
1706 cv_broadcast(&s->s_single_thread_cv);
1707 }
1708 }
1709
1710 static size_t
sizeofde(mddb_de_ic_t * dep)1711 sizeofde(
1712 mddb_de_ic_t *dep
1713 )
1714 {
1715 size_t size;
1716
1717 size = sizeof (mddb_de_ic_t) - sizeof (mddb_block_t) +
1718 sizeof (mddb_block_t) * dep->de_blkcount;
1719 return (size);
1720 }
1721
1722 static size_t
sizeofde32(mddb_de32_t * dep)1723 sizeofde32(
1724 mddb_de32_t *dep
1725 )
1726 {
1727 size_t size;
1728
1729 size = sizeof (*dep) - sizeof (dep->de32_blks) +
1730 sizeof (mddb_block_t) * dep->de32_blkcount;
1731 return (size);
1732 }
1733
1734 static mddb_de32_t *
nextentry(mddb_de32_t * dep)1735 nextentry(
1736 mddb_de32_t *dep
1737 )
1738 {
1739 mddb_de32_t *ret;
1740
1741 ret = (mddb_de32_t *)((void *)((caddr_t)dep + sizeofde32(dep)));
1742 return (ret);
1743 }
1744
1745 static void
create_db32rec(mddb_db32_t * db32p,mddb_db_t * dbp)1746 create_db32rec(
1747 mddb_db32_t *db32p,
1748 mddb_db_t *dbp
1749 )
1750 {
1751 mddb_de_ic_t *dep;
1752 mddb_de32_t *de32p;
1753
1754 #if defined(_ILP32) && !defined(lint)
1755 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
1756 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
1757 #endif
1758
1759 dbtodb32(dbp, db32p);
1760 if ((dbp->db_firstentry != NULL) && (db32p->db32_firstentry == 0))
1761 db32p->db32_firstentry = 0x4;
1762 de32p = (mddb_de32_t *)((void *) ((caddr_t)(&db32p->db32_firstentry)
1763 + sizeof (db32p->db32_firstentry)));
1764 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
1765 detode32(dep, de32p);
1766 if ((dep->de_next != NULL) && (de32p->de32_next == 0))
1767 de32p->de32_next = 0x4;
1768 de32p = nextentry(de32p);
1769 }
1770 ASSERT((uintptr_t)de32p <= (uintptr_t)de32p + MDDB_BSIZE);
1771 }
1772
1773 /*
1774 * If called during upgrade, this routine expects a translated
1775 * (aka miniroot) dev.
1776 * If master blocks are found, set the mn_set parameter to 1 if the
1777 * the master block revision number is MDDB_REV_MNMB; otherwise,
1778 * set it to 0.
1779 * If master blocks are not found, do not change the mnset parameter.
1780 */
1781 static mddb_mb_ic_t *
getmasters(mddb_set_t * s,md_dev64_t dev,daddr_t blkno,uint_t * flag,int * mn_set)1782 getmasters(
1783 mddb_set_t *s,
1784 md_dev64_t dev,
1785 daddr_t blkno,
1786 uint_t *flag,
1787 int *mn_set
1788 )
1789 {
1790 mddb_mb_ic_t *mbi = NULL;
1791 mddb_mb_t *mb;
1792 int error = 0;
1793 ddi_devid_t devid;
1794
1795
1796 if (mddb_devopen(dev)) {
1797 if (flag)
1798 *flag |= MDDB_F_EMASTER;
1799 return ((mddb_mb_ic_t *)NULL);
1800 }
1801
1802
1803 mbi = (mddb_mb_ic_t *)kmem_zalloc(MDDB_IC_BSIZE, KM_SLEEP);
1804 mb = &(mbi->mbi_mddb_mb);
1805 if (error = getblks(s, (caddr_t)mb, dev, blkno,
1806 btodb(MDDB_BSIZE), 0)) {
1807 error |= MDDB_F_EMASTER;
1808 }
1809 if (mb->mb_magic != MDDB_MAGIC_MB) {
1810 error = MDDB_F_EFMT | MDDB_F_EMASTER;
1811 }
1812 /* Check for MDDB_REV_MNMB and lower */
1813 if (revchk(MDDB_REV_MNMB, mb->mb_revision)) {
1814 error = MDDB_F_EFMT | MDDB_F_EMASTER;
1815 }
1816 if (crcchk(mb, &mb->mb_checksum, MDDB_BSIZE, NULL)) {
1817 error = MDDB_F_EFMT | MDDB_F_EMASTER;
1818 }
1819
1820 if (!(md_get_setstatus(s->s_setno) &
1821 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
1822 (mb->mb_setno != s->s_setno)) {
1823 error = MDDB_F_EFMT | MDDB_F_EMASTER;
1824 }
1825 if (mb->mb_blkno != blkno) {
1826 error = MDDB_F_EFMT | MDDB_F_EMASTER;
1827 }
1828 mb->mb_next = NULL;
1829 mbi->mbi_next = NULL;
1830
1831 if (error)
1832 goto out;
1833
1834 /*
1835 * Check the md_devid_destroy and md_keep_repl_state flags
1836 * to see if we need to regen the devid or not.
1837 *
1838 * Don't care about devid in local set since it is not used
1839 * and this should not be part of set importing
1840 */
1841 if ((s->s_setno != MD_LOCAL_SET) &&
1842 !(md_get_setstatus(s->s_setno) &
1843 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT))) {
1844 /*
1845 * Now check the destroy flag. We also need to handle
1846 * the case where the destroy flag is reset after the
1847 * destroy
1848 */
1849 if (md_devid_destroy || (mb->mb_devid_len == 0)) {
1850
1851 if (md_devid_destroy) {
1852 bzero(mb->mb_devid, mb->mb_devid_len);
1853 mb->mb_devid_len = 0;
1854 }
1855
1856 /*
1857 * Try to regenerate it if the 'keep' flag is not set
1858 */
1859 if (!md_keep_repl_state) {
1860 if (ddi_lyr_get_devid(md_dev64_to_dev(dev),
1861 &devid) == DDI_SUCCESS) {
1862 mb->mb_devid_len =
1863 ddi_devid_sizeof(devid);
1864 bcopy(devid, mb->mb_devid,
1865 mb->mb_devid_len);
1866 ddi_devid_free(devid);
1867 } else {
1868 error = MDDB_F_EFMT | MDDB_F_EMASTER;
1869 }
1870 }
1871
1872 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
1873
1874 /*
1875 * Push
1876 */
1877 if (putblks(s, (caddr_t)mb, blkno, 1, dev, 0) != 0) {
1878 error = MDDB_F_EFMT | MDDB_F_EMASTER;
1879 }
1880 }
1881 }
1882
1883 if (! error) {
1884 /* Set mn_set parameter to 1 if a MN set */
1885 if (mb->mb_revision == MDDB_REV_MNMB)
1886 *mn_set = 1;
1887 else
1888 *mn_set = 0;
1889 return (mbi);
1890 }
1891
1892 out:
1893 /* Error Out */
1894 if (flag)
1895 *flag |= error;
1896
1897 kmem_free((caddr_t)mbi, MDDB_IC_BSIZE);
1898 mddb_devclose(dev);
1899 return ((mddb_mb_ic_t *)NULL);
1900 }
1901
1902 static int
getrecord(mddb_set_t * s,mddb_de_ic_t * dep,int li)1903 getrecord(
1904 mddb_set_t *s,
1905 mddb_de_ic_t *dep,
1906 int li
1907 )
1908 {
1909 int err = 0;
1910 mddb_rb32_t *rbp;
1911
1912 #if defined(_ILP32) && !defined(lint)
1913 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
1914 #endif
1915
1916
1917 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(dep->de_recsize, KM_SLEEP);
1918 rbp = dep->de_rb;
1919
1920 err = readblklst(s, (caddr_t)rbp, dep->de_blks,
1921 dep->de_blkcount, li, 0);
1922 if (err) {
1923 return (MDDB_F_EDATA | err);
1924 }
1925 if (rbp->rb_magic != MDDB_MAGIC_RB) {
1926 return (MDDB_F_EFMT | MDDB_F_EDATA);
1927 }
1928 if ((revchk(MDDB_REV_RB, rbp->rb_revision) != 0) &&
1929 (revchk(MDDB_REV_RB64, rbp->rb_revision) != 0) &&
1930 (revchk(MDDB_REV_RBFN, rbp->rb_revision) != 0) &&
1931 (revchk(MDDB_REV_RB64FN, rbp->rb_revision) != 0)) {
1932 return (MDDB_F_EFMT | MDDB_F_EDATA);
1933 }
1934 /* Check crc for this record */
1935 if (rec_crcchk(s, dep, rbp)) {
1936 return (MDDB_F_EFMT | MDDB_F_EDATA);
1937 }
1938 return (0);
1939 }
1940
1941 /*
1942 * Code to read in the locator name information
1943 */
1944 static int
readlocnames(mddb_set_t * s,int li)1945 readlocnames(
1946 mddb_set_t *s,
1947 int li
1948 )
1949 {
1950 mddb_ln_t *lnp;
1951 int err = 0;
1952 mddb_block_t ln_blkcnt, ln_blkno;
1953
1954 /*
1955 * read in the locator name blocks
1956 */
1957 s->s_lnp = NULL;
1958
1959 ln_blkno = s->s_lbp->lb_lnfirstblk;
1960 ln_blkcnt = s->s_lbp->lb_lnblkcnt;
1961 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(ln_blkcnt), KM_SLEEP);
1962
1963 err = readblks(s, (caddr_t)lnp, ln_blkno, ln_blkcnt, li);
1964 if (err) {
1965 err |= MDDB_F_EDATA;
1966 goto out;
1967 }
1968 if (lnp->ln_magic != MDDB_MAGIC_LN) {
1969 err = MDDB_F_EDATA | MDDB_F_EFMT;
1970 goto out;
1971 }
1972 if (s->s_lbp->lb_flags & MDDB_MNSET) {
1973 if (revchk(MDDB_REV_MNLN, lnp->ln_revision)) {
1974 err = MDDB_F_EDATA | MDDB_F_EFMT;
1975 goto out;
1976 }
1977 } else {
1978 if (revchk(MDDB_REV_LN, lnp->ln_revision)) {
1979 err = MDDB_F_EDATA | MDDB_F_EFMT;
1980 goto out;
1981 }
1982 }
1983 if (crcchk(lnp, &lnp->ln_checksum, dbtob(ln_blkcnt), NULL)) {
1984 err = MDDB_F_EDATA | MDDB_F_EFMT;
1985 goto out;
1986 }
1987 out:
1988 /*
1989 * if error occurred in locator name blocks free them
1990 * and return
1991 */
1992 if (err) {
1993 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
1994 return (err);
1995 }
1996 s->s_lnp = lnp;
1997 return (0);
1998 }
1999
2000 /*
2001 * code to read in a copy of the database.
2002 */
2003
2004 static int
readcopy(mddb_set_t * s,int li)2005 readcopy(
2006 mddb_set_t *s,
2007 int li
2008 )
2009 {
2010 uint_t blk;
2011 mddb_db_t *dbp, *dbp1, *dbhp;
2012 mddb_db32_t *db32p;
2013 mddb_de_ic_t *dep, *dep2;
2014 mddb_de32_t *de32p, *de32p2;
2015 int err = 0;
2016 uint_t checksum;
2017
2018
2019 #if defined(_ILP32) && !defined(lint)
2020 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2021 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2022 #endif
2023
2024 dbp = NULL;
2025 dbhp = NULL;
2026 /*
2027 * read in all the directory blocks
2028 */
2029 blk = s->s_lbp->lb_dbfirstblk;
2030 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2031
2032 for (; blk != 0; blk = dbp->db_nextblk) {
2033 dbp1 = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
2034 if (! dbhp) {
2035 dbhp = dbp1;
2036 } else {
2037 dbp->db_next = dbp1;
2038 }
2039 dbp = dbp1;
2040
2041 err = readblks(s, (caddr_t)db32p, blk, 1, li);
2042 if (err) {
2043 err |= MDDB_F_EDATA;
2044 break;
2045 }
2046 db32todb(db32p, dbp);
2047 if (db32p->db32_magic != MDDB_MAGIC_DB) {
2048 err = MDDB_F_EDATA | MDDB_F_EFMT;
2049 break;
2050 }
2051 if (revchk(MDDB_REV_DB, db32p->db32_revision)) {
2052 err = MDDB_F_EDATA | MDDB_F_EFMT;
2053 break;
2054 }
2055 if (crcchk(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL)) {
2056 err = MDDB_F_EDATA | MDDB_F_EFMT;
2057 break;
2058 }
2059 /*
2060 * first go through and fix up all de_next pointers
2061 */
2062 if (dbp->db_firstentry) {
2063
2064 de32p = (mddb_de32_t *)
2065 ((void *) ((caddr_t)(&db32p->db32_firstentry)
2066 + sizeof (db32p->db32_firstentry)));
2067
2068 dep = (mddb_de_ic_t *)
2069 kmem_zalloc(sizeof (mddb_de_ic_t) -
2070 sizeof (mddb_block_t) +
2071 sizeof (mddb_block_t) * de32p->de32_blkcount,
2072 KM_SLEEP);
2073 de32tode(de32p, dep);
2074
2075 dbp->db_firstentry = dep;
2076 while (de32p && de32p->de32_next) {
2077
2078 de32p2 = nextentry(de32p);
2079
2080 dep2 = (mddb_de_ic_t *)kmem_zalloc(
2081 sizeof (mddb_de_ic_t) -
2082 sizeof (mddb_block_t) +
2083 sizeof (mddb_block_t) *
2084 de32p2->de32_blkcount, KM_SLEEP);
2085
2086 de32tode(de32p2, dep2);
2087
2088 dep->de_next = dep2;
2089 dep = dep2;
2090 de32p = de32p2;
2091 }
2092 }
2093 /*
2094 * go through and make all of the pointer to record blocks
2095 * are null;
2096 */
2097 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
2098 dep->de_rb = NULL;
2099 }
2100 kmem_free((caddr_t)db32p, MDDB_BSIZE);
2101 dbp->db_next = NULL;
2102 /*
2103 * if error occurred in directory blocks free them
2104 * and return
2105 */
2106 if (err) {
2107 dbp = dbhp;
2108 while (dbp) {
2109 dep = dbp->db_firstentry;
2110 while (dep) {
2111 /* No mddb_rb32_t structures yet */
2112 dep2 = dep->de_next;
2113 kmem_free((caddr_t)dep, sizeofde(dep));
2114 dep = dep2;
2115 }
2116 dbp1 = dbp->db_next;
2117 kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2118 dbp = dbp1;
2119 }
2120 s->s_dbp = NULL;
2121 return (err);
2122
2123 }
2124 /*
2125 */
2126 err = 0;
2127 checksum = MDDB_GLOBAL_XOR;
2128 for (dbp = dbhp; dbp != NULL; dbp = dbp->db_next) {
2129 checksum ^= dbp->db_recsum;
2130 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2131 if (dep->de_flags & MDDB_F_OPT)
2132 continue;
2133 err = getrecord(s, dep, li);
2134 if (err)
2135 break;
2136 /* Don't include CHANGELOG in big XOR */
2137 if (dep->de_flags & MDDB_F_CHANGELOG)
2138 continue;
2139 checksum ^= dep->de_rb->rb_checksum;
2140 checksum ^= dep->de_rb->rb_checksum_fiddle;
2141 }
2142 if (err)
2143 break;
2144 }
2145 if (checksum) {
2146 if (! err)
2147 err = MDDB_F_EDATA | MDDB_F_EFMT;
2148 }
2149 if (err) {
2150 dbp = dbhp;
2151 dbhp = NULL;
2152 while (dbp) {
2153 dep = dbp->db_firstentry;
2154 while (dep) {
2155 if (dep->de_rb)
2156 kmem_free((caddr_t)dep->de_rb,
2157 dep->de_recsize);
2158 dep2 = dep->de_next;
2159 kmem_free((caddr_t)dep, sizeofde(dep));
2160 dep = dep2;
2161 }
2162 dbp1 = dbp->db_next;
2163 kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
2164 dbp = dbp1;
2165 }
2166 }
2167 s->s_dbp = dbhp;
2168 return (err);
2169 }
2170
2171 static int
getoptcnt(mddb_set_t * s,int li)2172 getoptcnt(
2173 mddb_set_t *s,
2174 int li)
2175 {
2176 int result;
2177 mddb_de_ic_t *dep;
2178 mddb_db_t *dbp;
2179
2180 #if defined(_ILP32) && !defined(lint)
2181 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2182 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2183 #endif
2184
2185 result = 0;
2186 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2187 dep = dbp->db_firstentry;
2188 for (; dep != NULL; dep = dep->de_next) {
2189 if (! (dep->de_flags & MDDB_F_OPT))
2190 continue;
2191 if (((dep->de_optinfo[0].o_flags & MDDB_F_ACTIVE) &&
2192 (li == dep->de_optinfo[0].o_li)) ||
2193 ((dep->de_optinfo[1].o_flags & MDDB_F_ACTIVE) &&
2194 (li == dep->de_optinfo[1].o_li)))
2195 result++;
2196 }
2197 }
2198 return (result);
2199 }
2200
2201 static void
getoptdev(mddb_set_t * s,mddb_de_ic_t * rdep,int opti)2202 getoptdev(
2203 mddb_set_t *s,
2204 mddb_de_ic_t *rdep,
2205 int opti
2206 )
2207 {
2208 mddb_lb_t *lbp;
2209 mddb_locator_t *lp;
2210 mddb_optinfo_t *otherop;
2211 mddb_optinfo_t *resultop;
2212 int li;
2213 dev_t otherdev;
2214 int blkonly = 0;
2215 int mincnt;
2216 int thiscnt;
2217
2218 lbp = s->s_lbp;
2219
2220 resultop = &rdep->de_optinfo[opti];
2221 otherop = &rdep->de_optinfo[1-opti];
2222
2223 resultop->o_flags = 0;
2224
2225 /*
2226 * scan through and see if data bases have to vary by only device
2227 */
2228
2229 if (otherop->o_flags & MDDB_F_ACTIVE) {
2230 blkonly = 1;
2231 otherdev = expldev(lbp->lb_locators[otherop->o_li].l_dev);
2232 for (li = 0; li < lbp->lb_loccnt; li++) {
2233 lp = &lbp->lb_locators[li];
2234 if (! (lp->l_flags & MDDB_F_ACTIVE))
2235 continue;
2236 if (expldev(lp->l_dev) != otherdev) {
2237 blkonly = 0;
2238 break;
2239 }
2240 }
2241 }
2242
2243 mincnt = 999999;
2244 for (li = 0; li < lbp->lb_loccnt; li++) {
2245 dev_info_t *devi;
2246 int removable = 0;
2247
2248 lp = &lbp->lb_locators[li];
2249 if (! (lp->l_flags & MDDB_F_ACTIVE))
2250 continue;
2251 if (otherop->o_flags & MDDB_F_ACTIVE) {
2252 if (blkonly) {
2253 if (otherop->o_li == li)
2254 continue;
2255 } else {
2256 if (otherdev == expldev(lp->l_dev))
2257 continue;
2258 }
2259 }
2260
2261 /*
2262 * Check if this is a removable device. If it is we
2263 * assume it is something like a USB flash disk, a zip disk
2264 * or even a floppy that is being used to help maintain
2265 * mddb quorum. We don't want to put any optimized resync
2266 * records on these kinds of disks since they are usually
2267 * slower or don't have the same read/write lifetimes as
2268 * a regular fixed disk.
2269 */
2270 if ((devi = e_ddi_hold_devi_by_dev(lp->l_dev, 0)) != NULL) {
2271 int error;
2272 struct cb_ops *cb;
2273 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF;
2274 int propvalue = 0;
2275 int proplength = sizeof (int);
2276
2277 if ((cb = devopsp[getmajor(lp->l_dev)]->devo_cb_ops)
2278 != NULL) {
2279 error = (*cb->cb_prop_op)(DDI_DEV_T_ANY, devi,
2280 prop_op, DDI_PROP_NOTPROM |
2281 DDI_PROP_DONTPASS, "removable-media",
2282 (caddr_t)&propvalue, &proplength);
2283
2284 if (error == DDI_PROP_SUCCESS)
2285 removable = 1;
2286 }
2287
2288 ddi_release_devi(devi);
2289 }
2290
2291 if (removable)
2292 continue;
2293
2294 thiscnt = getoptcnt(s, li);
2295 if (thiscnt < mincnt) {
2296 resultop->o_li = li;
2297 mincnt = thiscnt;
2298 resultop->o_flags = MDDB_F_ACTIVE;
2299 }
2300 }
2301 }
2302
2303 static void
allocuserdata(mddb_de_ic_t * dep)2304 allocuserdata(
2305 mddb_de_ic_t *dep
2306 )
2307 {
2308 mddb_rb32_t *rbp;
2309
2310 #if defined(_ILP32) && !defined(lint)
2311 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2312 #endif
2313
2314 rbp = dep->de_rb;
2315 rbp->rb_private = 0;
2316 dep->de_rb_userdata = kmem_zalloc(dep->de_reqsize, KM_SLEEP);
2317 rbp->rb_userdata = 0x4; /* Make sure this is non-zero */
2318 bcopy((caddr_t)rbp->rb_data, dep->de_rb_userdata, dep->de_reqsize);
2319 }
2320
2321
2322 static void
getuserdata(set_t setno,mddb_de_ic_t * dep)2323 getuserdata(
2324 set_t setno,
2325 mddb_de_ic_t *dep
2326 )
2327 {
2328 mddb_rb32_t *rbp;
2329
2330
2331 mddb_type_t type = dep->de_type1;
2332 caddr_t data, udata;
2333
2334 #if defined(_ILP32) && !defined(lint)
2335 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2336 #endif
2337 rbp = dep->de_rb;
2338 data = (caddr_t)rbp->rb_data;
2339 udata = (caddr_t)dep->de_rb_userdata;
2340
2341 /*
2342 * If it's a driver record, and an old style record, and not a DRL
2343 * record, we must convert it because it was incore as a 64 bit
2344 * structure but its on disk layout has only 32 bit for block sizes
2345 */
2346 if (!(md_get_setstatus(setno) &
2347 (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
2348 (type >= MDDB_FIRST_MODID) &&
2349 ((rbp->rb_revision == MDDB_REV_RB) ||
2350 (rbp->rb_revision == MDDB_REV_RBFN))) {
2351
2352 switch (dep->de_flags) {
2353
2354 case MDDB_F_STRIPE:
2355 stripe_convert(data, udata, BIG_2_SMALL);
2356 break;
2357
2358 case MDDB_F_MIRROR:
2359 mirror_convert(data, udata, BIG_2_SMALL);
2360 break;
2361
2362 case MDDB_F_RAID:
2363 raid_convert(data, udata, BIG_2_SMALL);
2364 break;
2365
2366 case MDDB_F_SOFTPART:
2367 softpart_convert(data, udata, BIG_2_SMALL);
2368 break;
2369
2370 case MDDB_F_TRANS_MASTER:
2371 trans_master_convert(data, udata, BIG_2_SMALL);
2372 break;
2373
2374 case MDDB_F_TRANS_LOG:
2375 trans_log_convert(data, udata, BIG_2_SMALL);
2376 break;
2377
2378 case MDDB_F_HOTSPARE:
2379 hs_convert(data, udata, BIG_2_SMALL);
2380 break;
2381
2382 case MDDB_F_OPT:
2383 default:
2384 bcopy(udata, data, dep->de_reqsize);
2385 }
2386 } else {
2387 bcopy(udata, data, dep->de_reqsize);
2388 }
2389 }
2390
2391 static void
getoptrecord(mddb_set_t * s,mddb_de_ic_t * dep)2392 getoptrecord(
2393 mddb_set_t *s,
2394 mddb_de_ic_t *dep
2395 )
2396 {
2397 mddb_lb_t *lbp;
2398 mddb_locator_t *lp;
2399 mddb_rb32_t *rbp, *crbp;
2400 int li;
2401 int i;
2402 int err = 0;
2403 size_t recsize;
2404
2405 #if defined(_ILP32) && !defined(lint)
2406 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2407 #endif
2408
2409 lbp = s->s_lbp;
2410
2411 recsize = dep->de_recsize;
2412 dep->de_rb = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2413 rbp = dep->de_rb;
2414 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
2415
2416 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
2417 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2418
2419 for (i = 0; i < 2; i++) {
2420 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2421 continue;
2422 li = dep->de_optinfo[i].o_li;
2423 lp = &lbp->lb_locators[li];
2424
2425 if (! (lp->l_flags & MDDB_F_ACTIVE) ||
2426 (lp->l_flags & MDDB_F_EMASTER))
2427 continue;
2428
2429 err = readblklst(s, (caddr_t)rbp, dep->de_blks,
2430 dep->de_blkcount, li, 0);
2431
2432 if (err)
2433 continue;
2434
2435 if (rbp->rb_magic != MDDB_MAGIC_RB)
2436 continue;
2437
2438 if (revchk(MDDB_REV_RB, rbp->rb_revision))
2439 continue;
2440
2441 /* Check the crc for this record */
2442 if (rec_crcchk(s, dep, rbp)) {
2443 continue;
2444 }
2445
2446 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
2447
2448 if (rbp == crbp) {
2449 if (rbp->rb_checksum != crbp->rb_checksum)
2450 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
2451 break;
2452 }
2453 rbp = crbp;
2454 }
2455
2456 if (rbp == crbp) {
2457 rbp->rb_private = 0;
2458 kmem_free((caddr_t)crbp, recsize);
2459 return;
2460 }
2461 bzero((caddr_t)rbp, recsize);
2462 rbp->rb_magic = MDDB_MAGIC_RB;
2463 rbp->rb_revision = MDDB_REV_RB;
2464 uniqtime32(&rbp->rb_timestamp);
2465 /* Generate the crc for this record */
2466 rec_crcgen(s, dep, rbp);
2467 kmem_free((caddr_t)crbp, recsize);
2468 }
2469
2470 /*
2471 * writeoptrecord writes out an optimized record.
2472 */
2473 static int
writeoptrecord(mddb_set_t * s,mddb_de_ic_t * dep)2474 writeoptrecord(
2475 mddb_set_t *s,
2476 mddb_de_ic_t *dep
2477 )
2478 {
2479 mddb_rb32_t *rbp;
2480 int li;
2481 int err = 0, wrt_err = 0;
2482 mddb_bf_t *bufhead, *bfp;
2483 mddb_lb_t *lbp = s->s_lbp;
2484 mddb_locator_t *lp;
2485 int i;
2486
2487 #if defined(_ILP32) && !defined(lint)
2488 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2489 #endif
2490
2491 bufhead = NULL;
2492 err = 0;
2493
2494 while (s->s_opthavequeuinglck) {
2495 s->s_optwantqueuinglck++;
2496 cv_wait(&s->s_optqueuing_cv, SETMUTEX(s->s_setno));
2497 }
2498 s->s_opthavequeuinglck++;
2499 rbp = dep->de_rb;
2500 for (i = 0; i < 2; i++) {
2501 /*
2502 * only possible error is xlate. This can
2503 * occur if a replica was off line and came
2504 * back. During the mean time the database grew
2505 * large than the now on line replica can store
2506 */
2507 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
2508 continue;
2509 li = dep->de_optinfo[i].o_li;
2510 /*
2511 * In a MN diskset, any node can write optimized record(s).
2512 */
2513 wrt_err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
2514 dep->de_blkcount, li, &bufhead, MDDB_WR_ANY_NODE);
2515 /*
2516 * For MN diskset, set error in optinfo structure so
2517 * that mddb_commitrec knows which replica failed.
2518 */
2519 if ((MD_MNSET_SETNO(s->s_setno)) &&
2520 (wrt_err & MDDB_F_EWRITE)) {
2521 dep->de_optinfo[i].o_flags |= MDDB_F_EWRITE;
2522 }
2523 err |= wrt_err;
2524 }
2525 s->s_opthavequeuinglck = 0;
2526 if (s->s_optwantqueuinglck) {
2527 s->s_optwantqueuinglck = 0;
2528 cv_broadcast(&s->s_optqueuing_cv);
2529 }
2530 for (bfp = bufhead; bfp; bfp = bufhead) {
2531 mutex_exit(SETMUTEX(s->s_setno));
2532 (void) biowait(&bfp->bf_buf);
2533 mutex_enter(SETMUTEX(s->s_setno));
2534 if (bfp->bf_buf.b_flags & B_ERROR) {
2535 /*
2536 * If an MN diskset, don't set replica
2537 * in error since this hasn't been set in master.
2538 * Setting replica in error before master could
2539 * leave the nodes with different views of the
2540 * world since a class 1 configuration change
2541 * could occur in mddb_commitrec as soon as
2542 * all locks are dropped. Must keep this
2543 * node the same as master and can't afford a
2544 * failure from the class 1 config change
2545 * if master succeeded.
2546 */
2547 if (!(MD_MNSET_SETNO(s->s_setno))) {
2548 bfp->bf_locator->l_flags |= MDDB_F_EWRITE;
2549 } else {
2550 /*
2551 * Find which de_optinfo (which replica)
2552 * had a failure and set the failure in
2553 * the o_flags field.
2554 */
2555 lp = &lbp->lb_locators[dep->de_optinfo[0].o_li];
2556 if (lp == bfp->bf_locator) {
2557 dep->de_optinfo[0].o_flags |=
2558 MDDB_F_EWRITE;
2559 } else {
2560 dep->de_optinfo[1].o_flags |=
2561 MDDB_F_EWRITE;
2562 }
2563 }
2564 err |= MDDB_F_EWRITE;
2565 }
2566 bufhead = bfp->bf_next;
2567 freebuffer(s, bfp);
2568 }
2569 return (err);
2570 }
2571
2572 /*
2573 * Fix up the optimized resync record. Used in the traditional and local
2574 * disksets to move an optimized record from a failed or deleted mddb
2575 * to an active one.
2576 *
2577 * In a MN diskset, the fixing of the optimized record is split between
2578 * the master and slave nodes. If the master node moves the optimized
2579 * resync record, then the master node will send a MDDB_PARSE_OPTRECS
2580 * message to the slave nodes causing the slave nodes to reget the
2581 * directory entry containing the location of the optimized resync record.
2582 * After the record is reread from disk, then writeoptrecord is called
2583 * if the location of the optimized resync record or flags have changed.
2584 * When writeoptrecord is called, the node that is the owner of this record
2585 * will write the optimized record to the location specified in the directory
2586 * entry. Since the master node uses the highest class message (PARSE)
2587 * the record owner node is guaranteed to already have an updated
2588 * directory entry incore.
2589 *
2590 * The other difference between the traditional/local set and MN diskset
2591 * is that the directory entry can be written to disk before the optimized
2592 * record in a MN diskset if the record is owned by a slave node. So,
2593 * the users of an optimized record must handle the failure case when no
2594 * data is available from an optimized record since the master node could
2595 * have failed during the relocation of the optimized record to another mddb.
2596 */
2597 static int
fixoptrecord(mddb_set_t * s,mddb_de_ic_t * dep,mddb_db_t * dbp)2598 fixoptrecord(
2599 mddb_set_t *s,
2600 mddb_de_ic_t *dep,
2601 mddb_db_t *dbp
2602 )
2603 {
2604 int changed;
2605 int writedata;
2606 int err = 0;
2607 int i;
2608 mddb_lb_t *lbp;
2609 mddb_optinfo_t *op;
2610 mddb_db32_t *db32p;
2611 int rec_owner; /* Is node owner of record? */
2612
2613 #if defined(_ILP32) && !defined(lint)
2614 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2615 #endif
2616
2617 lbp = s->s_lbp;
2618 changed = 0;
2619 writedata = 0;
2620 for (i = 0; i < 2; i++) {
2621 op = &dep->de_optinfo[i];
2622
2623 if (! (lbp->lb_locators[op->o_li].l_flags & MDDB_F_ACTIVE))
2624 op->o_flags = 0;
2625
2626 /*
2627 * If optimized record has seen a replica failure,
2628 * assign new replica to record and re-write data
2629 * to new record.
2630 */
2631 if (! (op->o_flags & MDDB_F_ACTIVE)) {
2632 getoptdev(s, dep, i);
2633 writedata++;
2634 changed++;
2635 /* Set flag for slaves to reread dep and write rec */
2636 if (lbp->lb_flags & MDDB_MNSET) {
2637 s->s_mn_parseflags |= MDDB_PARSE_OPTRECS;
2638 }
2639 }
2640
2641 /*
2642 * If just an error in the data was seen, set
2643 * the optimized record's replica flag to active (ok)
2644 * and try again.
2645 */
2646 if (op->o_flags & MDDB_F_EDATA) {
2647 dep->de_optinfo[0].o_flags = MDDB_F_ACTIVE;
2648 writedata++;
2649 }
2650 }
2651
2652 rec_owner = 0;
2653 if (lbp->lb_flags & MDDB_MNSET) {
2654 /*
2655 * If a MN diskset then check the owner of optimized record.
2656 * If the master node owns the record or if there is
2657 * no owner of the record, then the master can write the
2658 * optimized record to disk.
2659 * Master node can write the optimized record now, but
2660 * slave nodes write their records during handling of
2661 * the MDDB_PARSE_OPTRECS message.
2662 */
2663 if ((dep->de_owner_nodeid == MD_MN_INVALID_NID) ||
2664 (dep->de_owner_nodeid == md_set[s->s_setno].s_nodeid)) {
2665 rec_owner = 1;
2666 }
2667 } else {
2668 /*
2669 * In traditional diskset and local set, this node
2670 * is always the record owner and always the master.
2671 */
2672 rec_owner = 1;
2673 }
2674
2675 /*
2676 * If this node is the record owner, write out record.
2677 */
2678 if ((writedata) && (rec_owner)) {
2679 if (err = writeoptrecord(s, dep)) {
2680 return (err);
2681 }
2682 }
2683 if (! changed)
2684 return (0);
2685 uniqtime32(&dbp->db_timestamp);
2686 dbp->db_revision = MDDB_REV_DB;
2687 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
2688 create_db32rec(db32p, dbp);
2689 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
2690 err = writeall(s, (caddr_t)db32p, db32p->db32_blknum,
2691 1, MDDB_WR_ONLY_MASTER);
2692 kmem_free((caddr_t)db32p, MDDB_BSIZE);
2693 return (err);
2694 }
2695
2696 static int
fixoptrecords(mddb_set_t * s)2697 fixoptrecords(
2698 mddb_set_t *s
2699 )
2700 {
2701 mddb_de_ic_t *dep;
2702 mddb_db_t *dbp;
2703 int err = 0;
2704 set_t setno;
2705
2706 /*
2707 * In a MN diskset, the master node is the only node that runs
2708 * fixoptrecords. If the master node changes anything, then the
2709 * master node sends PARSE message to the slave nodes. The slave
2710 * nodes will then re-read in the locator block or re-read in the
2711 * directory blocks and re-write the optimized resync records.
2712 */
2713 setno = s->s_setno;
2714 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
2715 (md_set[setno].s_am_i_master == 0)) {
2716 return (0);
2717 }
2718
2719 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2720 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2721 if (! (dep->de_flags & MDDB_F_OPT))
2722 continue;
2723 err = fixoptrecord(s, dep, dbp);
2724 if (err != 0)
2725 return (err);
2726 }
2727 }
2728 return (0);
2729 }
2730
2731 /*
2732 * Checks incore version of mddb data to mddb data ondisk.
2733 *
2734 * Returns:
2735 * - 0 if the data was successfully read and is good.
2736 * - MDDB_F_EREAD if a read error occurred.
2737 * - 1 if the data read is bad (checksum failed, etc)
2738 */
2739 static int
checkcopy(mddb_set_t * s,int li)2740 checkcopy
2741 (
2742 mddb_set_t *s,
2743 int li
2744 )
2745 {
2746 mddb_db_t *dbp;
2747 mddb_db32_t *cdb32p;
2748 mddb_de_ic_t *dep;
2749 mddb_de32_t *cde32p;
2750 mddb_rb32_t *rbp, *crbp;
2751 size_t size;
2752 int i;
2753 int retval = 1;
2754
2755 #if defined(_ILP32) && !defined(lint)
2756 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
2757 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
2758 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
2759 #endif
2760
2761 if (s->s_databuffer_size == 0) {
2762 size_t maxrecsize = MDDB_BSIZE;
2763
2764 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
2765 for (dep = dbp->db_firstentry; dep; dep = dep->de_next)
2766 if (! (dep->de_flags & MDDB_F_OPT) &&
2767 dep->de_recsize > maxrecsize)
2768 maxrecsize = dep->de_recsize;
2769
2770 s->s_databuffer = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
2771 s->s_databuffer_size = maxrecsize;
2772 }
2773
2774 cdb32p = (mddb_db32_t *)s->s_databuffer;
2775
2776 /*
2777 * first go through and make sure all directory stuff
2778 * is the same
2779 */
2780 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2781 if (readblks(s, (caddr_t)cdb32p, dbp->db_blknum, 1, li)) {
2782 retval = MDDB_F_EREAD;
2783 goto err;
2784 }
2785 if (cdb32p->db32_magic != MDDB_MAGIC_DB)
2786 goto err;
2787 if (revchk(MDDB_REV_DB, cdb32p->db32_revision))
2788 goto err;
2789 if (crcchk(cdb32p, &cdb32p->db32_checksum, MDDB_BSIZE, NULL))
2790 goto err;
2791 if (cdb32p->db32_nextblk != dbp->db_nextblk)
2792 goto err;
2793 if (cdb32p->db32_recsum != dbp->db_recsum)
2794 goto err;
2795 if (cdb32p->db32_firstentry) {
2796 cde32p = (mddb_de32_t *)
2797 ((void *)((caddr_t)(&cdb32p->db32_firstentry)
2798 + sizeof (cdb32p->db32_firstentry)));
2799 } else
2800 cde32p = NULL;
2801
2802 dep = dbp->db_firstentry;
2803 /*
2804 * check if all directory entries are identical
2805 */
2806 while (dep && cde32p) {
2807 if (dep->de_recid != cde32p->de32_recid)
2808 goto err;
2809 if (dep->de_type1 != cde32p->de32_type1)
2810 goto err;
2811 if (dep->de_type2 != cde32p->de32_type2)
2812 goto err;
2813 if (dep->de_reqsize != cde32p->de32_reqsize)
2814 goto err;
2815 if (dep->de_flags != cde32p->de32_flags)
2816 goto err;
2817
2818 for (i = 0; i < 2; i++) {
2819 if (dep->de_optinfo[i].o_li !=
2820 cde32p->de32_optinfo[i].o_li)
2821 break;
2822 }
2823 if (i != 2)
2824 goto err;
2825 size = sizeof (mddb_block_t) * dep->de_blkcount;
2826 if (bcmp((caddr_t)dep->de_blks,
2827 (caddr_t)cde32p->de32_blks, size))
2828 goto err;
2829 dep = dep->de_next;
2830 if (cde32p->de32_next)
2831 cde32p = nextentry(cde32p);
2832 else
2833 cde32p = NULL;
2834 }
2835 if (dep || cde32p)
2836 goto err;
2837 }
2838 /*
2839 * If here, all directories are functionally identical
2840 * check to make sure all records are identical
2841 * the reason the records are not just bcmped is that the
2842 * lock flag does not want to be compared.
2843 */
2844 crbp = (mddb_rb32_t *)cdb32p;
2845 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
2846 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
2847 if ((dep->de_flags & MDDB_F_OPT) ||
2848 (dep->de_flags & MDDB_F_CHANGELOG))
2849 continue;
2850 rbp = (mddb_rb32_t *)dep->de_rb;
2851 if (readblklst(s, (caddr_t)crbp, dep->de_blks,
2852 dep->de_blkcount, li, 0)) {
2853 retval = MDDB_F_EREAD;
2854 goto err;
2855 }
2856 /* Check the crc for this record */
2857 if (rec_crcchk(s, dep, crbp))
2858 goto err;
2859
2860 if (rbp->rb_checksum != crbp->rb_checksum ||
2861 rbp->rb_checksum_fiddle != crbp->rb_checksum_fiddle)
2862 goto err;
2863 }
2864 }
2865 return (0);
2866 err:
2867 return (retval);
2868 }
2869
2870 /*
2871 * Determine if the location information for two mddbs is the same.
2872 * The device slice and block offset should match. If both have devids then
2873 * use that for the comparison, otherwise we compare the dev_ts.
2874 * Comparing with the devid allows us to handle the case where a mddb was
2875 * relocated to a dead mddbs dev_t. The live mddb will have the dev_t of
2876 * the dead mddb but the devid comparison will catch this and not match.
2877 *
2878 * Return 1 if the location of the two mddbs match, 0 if not.
2879 */
2880 static int
match_mddb(mddb_ri_t * rip,ddi_devid_t devid,char * minor,md_dev64_t dev,daddr32_t blkno)2881 match_mddb(mddb_ri_t *rip, ddi_devid_t devid, char *minor, md_dev64_t dev,
2882 daddr32_t blkno)
2883 {
2884 if (rip->ri_flags & MDDB_F_EMASTER) {
2885 /*
2886 * If this element is errored then we don't try to match on it.
2887 * If we try to match we could erroneously match on the dev_t
2888 * of a relocated disk.
2889 */
2890 return (0);
2891 }
2892
2893 if (rip->ri_devid && devid && minor) {
2894 /*
2895 * If old devid exists, then this is a replicated diskset
2896 * and both old and new devids must be checked.
2897 */
2898 if (rip->ri_old_devid) {
2899 if (((ddi_devid_compare(rip->ri_devid, devid) != 0) &&
2900 (ddi_devid_compare(rip->ri_old_devid,
2901 devid) != 0)) ||
2902 (strcmp(rip->ri_minor_name, minor) != 0))
2903 return (0);
2904 } else {
2905 if (ddi_devid_compare(rip->ri_devid, devid) != 0 ||
2906 strcmp(rip->ri_minor_name, minor) != 0)
2907 return (0);
2908 }
2909 } else {
2910 if (rip->ri_dev != dev)
2911 return (0);
2912 }
2913
2914 if (rip->ri_blkno != blkno)
2915 return (0);
2916
2917 return (1);
2918 }
2919
2920 static int
ridev(mddb_ri_t ** rip,mddb_cfg_loc_t * clp,dev32_t * dev_2b_fixed,int flag)2921 ridev(
2922 mddb_ri_t **rip,
2923 mddb_cfg_loc_t *clp,
2924 dev32_t *dev_2b_fixed,
2925 int flag)
2926 {
2927 mddb_ri_t *r, *r1;
2928 md_dev64_t ldev, ndev;
2929 major_t majordev;
2930 int sz;
2931
2932 if (MD_UPGRADE) {
2933 ldev = md_makedevice(md_targ_name_to_major(clp->l_driver),
2934 clp->l_mnum);
2935 } else {
2936 if (ddi_name_to_major(clp->l_driver) == (major_t)-1)
2937 return (EINVAL);
2938
2939 ldev = md_makedevice(ddi_name_to_major(clp->l_driver),
2940 clp->l_mnum);
2941 }
2942
2943 if (clp->l_devid != 0) {
2944 /*
2945 * Get dev associated with device id and minor name.
2946 * Setup correct driver name if dev is now different.
2947 * Don't change driver name if during upgrade.
2948 */
2949 ndev = ldev;
2950 if (!mddb_devid_validate((ddi_devid_t)(uintptr_t)clp->l_devid,
2951 &ndev, clp->l_minor_name)) {
2952 if ((ndev != ldev) && (!(MD_UPGRADE))) {
2953 majordev = md_getmajor(ndev);
2954 (void) strcpy(clp->l_driver,
2955 ddi_major_to_name(majordev));
2956 clp->l_mnum = md_getminor(ndev);
2957 clp->l_devid_flags |= MDDB_DEVID_VALID;
2958 ldev = ndev;
2959 }
2960 } else {
2961 /* Mark as invalid */
2962 clp->l_devid_flags &= ~MDDB_DEVID_VALID;
2963 }
2964 }
2965
2966 clp->l_dev = md_cmpldev(ldev);
2967 if (dev_2b_fixed)
2968 *dev_2b_fixed = clp->l_dev;
2969 r = *rip;
2970
2971 while (r) {
2972 if (match_mddb(r, (ddi_devid_t)(uintptr_t)clp->l_devid,
2973 clp->l_minor_name, ldev, clp->l_blkno)) {
2974 if ((clp->l_devid != 0) &&
2975 !(clp->l_devid_flags & MDDB_DEVID_VALID)) {
2976 r->ri_flags |= MDDB_F_EMASTER;
2977 } else {
2978 r->ri_flags |= flag;
2979 }
2980 return (0); /* already entered return success */
2981 }
2982 r = r->ri_next;
2983 }
2984
2985 /*
2986 * This replica not represented in the current rip list,
2987 * so add it to the list.
2988 */
2989 r = (mddb_ri_t *)kmem_zalloc(sizeof (**rip), KM_SLEEP);
2990 r->ri_dev = ldev;
2991 r->ri_blkno = clp->l_blkno;
2992 (void) strncpy(r->ri_driver, clp->l_driver, MD_MAXDRVNM);
2993 if (strlen(clp->l_driver) >= MD_MAXDRVNM) {
2994 r->ri_driver[(MD_MAXDRVNM -1)] = '\0';
2995 }
2996 if (clp->l_devname != NULL) {
2997 (void) strcpy(r->ri_devname, clp->l_devname);
2998 }
2999 r->ri_flags |= flag;
3000 if (clp->l_devid != 0) {
3001 sz = clp->l_devid_sz;
3002 r->ri_devid = (ddi_devid_t)kmem_zalloc(sz, KM_SLEEP);
3003 bcopy((void *)(uintptr_t)clp->l_devid, (char *)r->ri_devid, sz);
3004
3005 if (clp->l_old_devid != NULL) {
3006 sz = clp->l_old_devid_sz;
3007 r->ri_old_devid = (ddi_devid_t)kmem_zalloc(sz,
3008 KM_SLEEP);
3009 bcopy((char *)(uintptr_t)clp->l_old_devid,
3010 (char *)r->ri_old_devid, sz);
3011 } else {
3012 r->ri_old_devid = 0;
3013 }
3014 if (strlen(clp->l_minor_name) < MDDB_MINOR_NAME_MAX)
3015 (void) strcpy(r->ri_minor_name, clp->l_minor_name);
3016
3017 if (!(clp->l_devid_flags & MDDB_DEVID_VALID)) {
3018 /*
3019 * Devid is present, but not valid. This could
3020 * happen if device has been powered off or if
3021 * the device has been removed. Mark the device in
3022 * error. Don't allow any writes to this device
3023 * based on the dev_t since another device could
3024 * have been placed in its spot and be responding to
3025 * the dev_t accesses.
3026 */
3027 r->ri_flags |= MDDB_F_EMASTER;
3028 }
3029 } else {
3030 r->ri_devid = 0;
3031 r->ri_old_devid = 0;
3032 }
3033
3034 /*
3035 * If the rip list is empty then this entry
3036 * is the list.
3037 */
3038 if (*rip == NULL) {
3039 *rip = r;
3040 return (0);
3041 }
3042
3043 /*
3044 * Add this entry to the end of the rip list
3045 */
3046 r1 = *rip;
3047 while (r1->ri_next)
3048 r1 = r1->ri_next;
3049 r1->ri_next = r;
3050 return (0);
3051 }
3052
3053 /*
3054 * writecopy writes the incore data blocks out to all of the replicas.
3055 * This is called from writestart
3056 * - when a diskset is started or
3057 * - when an error has been enountered during the write to a mddb.
3058 * and from newdev when a new mddb is being added.
3059 *
3060 * flag can be 2 values:
3061 * MDDB_WRITECOPY_ALL - write all records to all mddbs. This is
3062 * always used for traditional and local disksets.
3063 * For MN diskset:
3064 * All nodes can call writecopy, but only the
3065 * master node actually writes data to the disk
3066 * except for optimized resync records.
3067 * An optimized resync record can only be written to
3068 * by the record owner.
3069 * MDDB_WRITECOPY_SYNC - special case for MN diskset. When a new
3070 * master has been chosen, the new master may need to
3071 * write its incore mddb to disk (this is the case where the
3072 * old master had executed a message but hadn't relayed it
3073 * to this slave yet). New master should not write the
3074 * change log records since new master would be overwriting
3075 * valuable data. Only used during a reconfig cycle.
3076 */
3077 static int
writecopy(mddb_set_t * s,int li,int flag)3078 writecopy(
3079 mddb_set_t *s,
3080 int li,
3081 int flag
3082 )
3083 {
3084 mddb_db_t *dbp;
3085 mddb_db32_t *db32p;
3086 mddb_de_ic_t *dep;
3087 mddb_rb32_t *rbp;
3088 uint_t checksum;
3089 int err = 0;
3090
3091 #if defined(_ILP32) && !defined(lint)
3092 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
3093 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
3094 #endif
3095
3096 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
3097 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
3098 create_db32rec(db32p, dbp);
3099 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
3100 err = writeblks(s, (caddr_t)db32p, dbp->db_blknum, 1, li,
3101 MDDB_WR_ONLY_MASTER);
3102 kmem_free((caddr_t)db32p, MDDB_BSIZE);
3103 if (err)
3104 return (err);
3105 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
3106 /*
3107 * In a multinode diskset, when a new master is
3108 * chosen the new master may need to write its
3109 * incore copy of the mddb to disk. In this case,
3110 * don't want to overwrite the change log records
3111 * so new master sets flag to MDDB_WRITECOPY_SYNC.
3112 */
3113 if (flag == MDDB_WRITECOPY_SYNC) {
3114 if (dep->de_flags & MDDB_F_CHANGELOG)
3115 continue;
3116 }
3117 /*
3118 * In a multinode diskset, don't write out optimized
3119 * resync resyncs since only the mirror owner node
3120 * will have the correct data. If writecopy is
3121 * being called from writestart as a result of
3122 * an mddb failure, then writestart will handle
3123 * the optimized records when it calls fixoptrecords.
3124 */
3125 if ((MD_MNSET_SETNO(s->s_setno)) &&
3126 (dep->de_flags & MDDB_F_OPT)) {
3127 continue;
3128 }
3129
3130 rbp = dep->de_rb;
3131 checksum = rbp->rb_checksum_fiddle;
3132 checksum ^= rbp->rb_checksum;
3133 /* Generate the crc for this record */
3134 rec_crcgen(s, dep, rbp);
3135 checksum ^= rbp->rb_checksum;
3136 rbp->rb_checksum_fiddle = checksum;
3137 if (err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
3138 dep->de_blkcount, li, (mddb_bf_t **)0,
3139 MDDB_WR_ONLY_MASTER))
3140 return (err);
3141 }
3142 }
3143 return (0);
3144 }
3145
3146 static int
upd_med(mddb_set_t * s,char * tag)3147 upd_med(
3148 mddb_set_t *s,
3149 char *tag
3150 )
3151 {
3152 med_data_t meddb;
3153 int medok;
3154 mddb_lb_t *lbp = s->s_lbp;
3155 set_t setno = s->s_setno;
3156 int li;
3157 int alc;
3158 int lc;
3159
3160
3161 /* If no mediator hosts, nothing to do */
3162 if (s->s_med.n_cnt == 0)
3163 return (0);
3164
3165 /*
3166 * If this is a MN set and we are not the master, then don't
3167 * update mediator hosts or mark mediator as golden since
3168 * only master node should do that.
3169 */
3170 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
3171 (md_set[setno].s_am_i_master == 0)) {
3172 return (0);
3173 }
3174
3175 bzero((char *)&meddb, sizeof (med_data_t));
3176 meddb.med_dat_mag = MED_DATA_MAGIC;
3177 meddb.med_dat_rev = MED_DATA_REV;
3178 meddb.med_dat_fl = 0;
3179 meddb.med_dat_sn = setno;
3180 meddb.med_dat_cc = lbp->lb_commitcnt;
3181 TIMEVAL32_TO_TIMEVAL(&meddb.med_dat_id, &lbp->lb_ident.createtime);
3182 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3183
3184 /* count accessible mediators */
3185 medok = upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3186
3187 /* count accessible and existing replicas */
3188 for (li = 0, alc = 0, lc = 0; li < lbp->lb_loccnt; li++) {
3189 mddb_locator_t *lp = &lbp->lb_locators[li];
3190
3191 if (lp->l_flags & MDDB_F_DELETED)
3192 continue;
3193
3194 lc++;
3195
3196 if (! (lp->l_flags & MDDB_F_ACTIVE) ||
3197 (lp->l_flags & MDDB_F_EMASTER) ||
3198 (lp->l_flags & MDDB_F_EWRITE))
3199 continue;
3200
3201 alc++;
3202 }
3203
3204 /*
3205 * Mediator update quorum is >= 50%: check for less than
3206 * "mediator update" quorum.
3207 */
3208 if ((medok * 2) < s->s_med.n_cnt) {
3209 /* panic if <= 50% of all replicas are accessible */
3210 if ((lc > 0) && ((alc * 2) <= lc)) {
3211 cmn_err(CE_PANIC,
3212 "md: Update of 50%% of the mediator hosts failed");
3213 /* NOTREACHED */
3214 }
3215
3216 cmn_err(CE_WARN,
3217 "md: Update of 50%% of the mediator hosts failed");
3218 }
3219
3220 /*
3221 * If we have mediator update quorum and exactly 50% of the replicas
3222 * are accessible then mark the mediator as golden.
3223 */
3224 if (((medok * 2) >= (s->s_med.n_cnt + 1)) && (lc > 0) &&
3225 ((alc * 2) == lc)) {
3226 meddb.med_dat_fl = MED_DFL_GOLDEN;
3227 crcgen(&meddb, &meddb.med_dat_cks, sizeof (med_data_t), NULL);
3228 (void) upd_med_hosts(&s->s_med, s->s_setname, &meddb, tag);
3229 }
3230
3231 return (0);
3232 }
3233
3234 static int
push_lb(mddb_set_t * s)3235 push_lb(mddb_set_t *s)
3236 {
3237 mddb_lb_t *lbp = s->s_lbp;
3238
3239 /* push the change to all the replicas */
3240 uniqtime32(&lbp->lb_timestamp);
3241 if (MD_MNSET_SETNO(s->s_setno)) {
3242 lbp->lb_revision = MDDB_REV_MNLB;
3243 } else {
3244 lbp->lb_revision = MDDB_REV_LB;
3245 }
3246 /*
3247 * The updates to the mediator hosts are done
3248 * by the callers of this function.
3249 */
3250 return (writelocall(s));
3251 }
3252
3253 /* Should not call for MN diskset since data tags are not supported */
3254 static int
dtl_cmp(const mddb_dtag_t * odtp,const mddb_dtag_t * ndtp)3255 dtl_cmp(const mddb_dtag_t *odtp, const mddb_dtag_t *ndtp)
3256 {
3257 int diff = 0;
3258
3259 diff = (int)(odtp->dt_setno - ndtp->dt_setno);
3260 if (diff)
3261 return (diff);
3262
3263 diff = strncmp(odtp->dt_sn, ndtp->dt_sn, MDDB_SN_LEN);
3264 if (diff)
3265 return (diff);
3266
3267 diff = strncmp(odtp->dt_hn, ndtp->dt_hn, MD_MAX_NODENAME_PLUS_1);
3268 if (diff)
3269 return (diff);
3270
3271 /*CSTYLED*/
3272 return (timercmp(&odtp->dt_tv, &ndtp->dt_tv, !=));
3273 }
3274
3275 /* Should not call for MN diskset since data tags are not supported */
3276 static int
dtl_addl(mddb_set_t * s,const mddb_dtag_t * ndtp)3277 dtl_addl(mddb_set_t *s, const mddb_dtag_t *ndtp)
3278 {
3279 int nextid = 0;
3280 mddb_dtag_lst_t **dtlpp = &s->s_dtlp;
3281
3282 /* Run to the end of the list */
3283 for (/* void */; (*dtlpp != NULL); dtlpp = &(*dtlpp)->dtl_nx) {
3284 if (dtl_cmp(&(*dtlpp)->dtl_dt, ndtp) == 0)
3285 return (0);
3286 nextid++;
3287 }
3288
3289 /* Add the new member */
3290 *dtlpp = kmem_zalloc(sizeof (**dtlpp), KM_SLEEP);
3291
3292 /* Update the dtag portion of the list */
3293 bcopy((caddr_t)ndtp, (caddr_t)&((*dtlpp)->dtl_dt),
3294 sizeof (mddb_dtag_t));
3295
3296 /* Fix up the id value */
3297 (*dtlpp)->dtl_dt.dt_id = ++nextid;
3298
3299 return (0);
3300 }
3301
3302 /*
3303 * Even though data tags are not supported in MN disksets, dt_cntl may
3304 * be called for a MN diskset since this routine is called even before
3305 * it is known the kind of diskset being read in from disk.
3306 * For a MNdiskset, s_dtlp is 0 so a count of 0 is returned.
3307 */
3308 static int
dtl_cntl(mddb_set_t * s)3309 dtl_cntl(mddb_set_t *s)
3310 {
3311 mddb_dtag_lst_t *dtlp = s->s_dtlp;
3312 int ndt = 0;
3313
3314 while (dtlp != NULL) {
3315 ndt++;
3316 dtlp = dtlp->dtl_nx;
3317 }
3318
3319 return (ndt);
3320 }
3321
3322 /*
3323 * Even though data tags are not supported in MN disksets, dt_cntl may
3324 * be called for a MN diskset since this routine is called even before
3325 * it is known the kind of diskset being read in from disk.
3326 * For a MNdiskset, s_dtlp is 0 so a 0 is returned.
3327 */
3328 static mddb_dtag_t *
dtl_findl(mddb_set_t * s,int id)3329 dtl_findl(mddb_set_t *s, int id)
3330 {
3331 mddb_dtag_lst_t *dtlp = s->s_dtlp;
3332
3333 while (dtlp != NULL) {
3334 if (dtlp->dtl_dt.dt_id == id)
3335 return (&dtlp->dtl_dt);
3336 dtlp = dtlp->dtl_nx;
3337 }
3338 return ((mddb_dtag_t *)NULL);
3339 }
3340
3341 /* Should not call for MN diskset since data tags are not supported */
3342 static void
dtl_freel(mddb_dtag_lst_t ** dtlpp)3343 dtl_freel(mddb_dtag_lst_t **dtlpp)
3344 {
3345 mddb_dtag_lst_t *dtlp;
3346 mddb_dtag_lst_t *tdtlp;
3347
3348
3349 for (tdtlp = *dtlpp; tdtlp != NULL; tdtlp = dtlp) {
3350 dtlp = tdtlp->dtl_nx;
3351 kmem_free(tdtlp, sizeof (mddb_dtag_lst_t));
3352 }
3353 *dtlpp = (mddb_dtag_lst_t *)NULL;
3354 }
3355
3356 /*
3357 * Even though data tags are not supported in MN disksets, dt_setup will
3358 * be called for a MN diskset since this routine is called even before
3359 * it is known the kind of diskset being read in from disk.
3360 * Once this set is known as a MN diskset, the dtp area will be freed.
3361 */
3362 static void
dt_setup(mddb_set_t * s,const mddb_dtag_t * dtagp)3363 dt_setup(mddb_set_t *s, const mddb_dtag_t *dtagp)
3364 {
3365 mddb_dt_t *dtp;
3366 set_t setno = s->s_setno;
3367
3368
3369 if (md_set[setno].s_dtp == (mddb_dt_t *)NULL)
3370 md_set[setno].s_dtp = kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3371 else if (dtagp == (mddb_dtag_t *)NULL)
3372 bzero((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
3373
3374 /* shorthand */
3375 dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3376
3377 dtp->dt_mag = MDDB_MAGIC_DT;
3378 dtp->dt_rev = MDDB_REV_DT;
3379
3380 if (dtagp != NULL)
3381 dtp->dt_dtag = *dtagp; /* structure assignment */
3382
3383 /* Initialize the setno */
3384 dtp->dt_dtag.dt_setno = setno;
3385
3386 /* Clear the id and flags, this is only used in user land */
3387 dtp->dt_dtag.dt_id = 0;
3388
3389 /* Checksum it */
3390 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
3391 }
3392
3393 /* Should not call for MN diskset since data tags are not supported */
3394 static int
set_dtag(mddb_set_t * s,md_error_t * ep)3395 set_dtag(mddb_set_t *s, md_error_t *ep)
3396 {
3397 mddb_lb_t *lbp = s->s_lbp;
3398 mddb_dtag_t tag;
3399
3400 if (lbp->lb_dtblkcnt == 0) {
3401 /* Data tags not used in a MN set - so no failure returned */
3402 if (lbp->lb_flags & MDDB_MNSET)
3403 return (0);
3404
3405 cmn_err(CE_WARN,
3406 "No tag record allocated, unable to tag data");
3407 (void) mdmddberror(ep, MDE_DB_NOTAGREC, NODEV32, s->s_setno);
3408 return (1);
3409 }
3410
3411 /* Clear the stack variable */
3412 bzero((caddr_t)&tag, sizeof (mddb_dtag_t));
3413
3414 /* Get the HW serial number for this host */
3415 (void) snprintf(tag.dt_sn, MDDB_SN_LEN, "%u", zone_get_hostid(NULL));
3416 tag.dt_sn[MDDB_SN_LEN - 1] = '\0';
3417
3418 /* Get the nodename that this host goes by */
3419 (void) strncpy(tag.dt_hn, utsname.nodename, MD_MAX_NODENAME);
3420 tag.dt_hn[MD_MAX_NODENAME] = '\0';
3421
3422 /* Get a time stamp for NOW */
3423 uniqtime32(&tag.dt_tv);
3424
3425 /* Setup the data tag record */
3426 dt_setup(s, &tag);
3427
3428 /* Free any list of tags if they exist */
3429 dtl_freel(&s->s_dtlp);
3430
3431 /* Put the new tag onto the tag list */
3432 (void) dtl_addl(s, &tag);
3433
3434 return (0);
3435 }
3436
3437 /*
3438 * If called during upgrade, this routine expects a non-translated
3439 * (aka target) dev.
3440 * Should not call for MN diskset since data tags are not supported.
3441 */
3442 static int
dt_read(mddb_set_t * s,mddb_lb_t * lbp,mddb_ri_t * rip)3443 dt_read(mddb_set_t *s, mddb_lb_t *lbp, mddb_ri_t *rip)
3444 {
3445 int err = 0;
3446 md_dev64_t dev;
3447 caddr_t tbuf;
3448 daddr_t physblk;
3449 mddb_block_t blk;
3450 mddb_dt_t *dtp;
3451 mddb_dtag_t *dtagp;
3452 set_t setno = s->s_setno;
3453
3454 /* If have not allocated a data tag record, there is nothing to do */
3455 if (lbp->lb_dtblkcnt == 0)
3456 return (1);
3457
3458 dtp = rip->ri_dtp = (mddb_dt_t *)kmem_zalloc(MDDB_DT_BYTES, KM_SLEEP);
3459
3460 if (dtp == (mddb_dt_t *)NULL)
3461 return (1);
3462
3463 /* shorthand */
3464 dev = md_xlate_targ_2_mini(rip->ri_dev);
3465 if (dev == NODEV64) {
3466 return (1);
3467 }
3468
3469 tbuf = (caddr_t)rip->ri_dtp;
3470
3471 for (blk = 0; blk < lbp->lb_dtblkcnt; blk++) {
3472 physblk = getphysblk((blk + lbp->lb_dtfirstblk), rip->ri_mbip);
3473 err = getblks(s, tbuf, dev, physblk, btodb(MDDB_BSIZE), 0);
3474 /* error reading the tag */
3475 if (err) {
3476 err = 1;
3477 goto out;
3478 }
3479 tbuf += MDDB_BSIZE;
3480 }
3481
3482 /* magic is valid? */
3483 if (dtp->dt_mag != MDDB_MAGIC_DT) {
3484 err = 1;
3485 goto out;
3486 }
3487
3488 /* revision is valid? */
3489 if (revchk(MDDB_REV_DT, dtp->dt_rev)) {
3490 err = 1;
3491 goto out;
3492 }
3493
3494 /* crc is valid? */
3495 if (crcchk(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL)) {
3496 err = 1;
3497 goto out;
3498 }
3499
3500 /* shorthand */
3501 dtagp = &dtp->dt_dtag;
3502
3503 /* set number match? */
3504 if (dtagp->dt_setno != setno) {
3505 err = 1;
3506 goto out;
3507 }
3508
3509 /* tag is not empty? */
3510 if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3511 (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3512 dtagp->dt_id == 0) {
3513 err = 2;
3514 goto out;
3515 }
3516
3517 /* Mark the locator as having tagged data */
3518 rip->ri_flags |= MDDB_F_TAGDATA;
3519
3520 out:
3521 if (err) {
3522 if (err == 1) {
3523 md_set_setstatus(setno, MD_SET_BADTAG);
3524 rip->ri_flags |= MDDB_F_BADTAG;
3525 }
3526 if (dtp != NULL) {
3527 kmem_free(dtp, MDDB_DT_BYTES);
3528 rip->ri_dtp = (mddb_dt_t *)NULL;
3529 }
3530 }
3531
3532 return (err);
3533 }
3534
3535 /* Should not call for MN diskset since data tags are not supported */
3536 static int
dt_write(mddb_set_t * s)3537 dt_write(mddb_set_t *s)
3538 {
3539 int li;
3540 int err = 0;
3541 int werr;
3542 int empty_tag = 0;
3543 mddb_dtag_t *dtagp;
3544 mddb_dt_t *dtp;
3545 mddb_lb_t *lbp = s->s_lbp;
3546 set_t setno = s->s_setno;
3547 uint_t set_status = md_get_setstatus(setno);
3548
3549
3550 ASSERT(md_set[setno].s_dtp != NULL);
3551
3552 /* Nowhere to write to */
3553 if (lbp->lb_dtblkcnt == 0)
3554 return (err);
3555
3556 if (set_status & MD_SET_BADTAG)
3557 return (err);
3558
3559 /* shorthand */
3560 dtp = (mddb_dt_t *)md_set[setno].s_dtp;
3561 dtagp = &dtp->dt_dtag;
3562
3563 /* See if the tag is empty. */
3564 if (dtagp->dt_sn[0] == '\0' && dtagp->dt_hn[0] == '\0' &&
3565 (dtagp->dt_tv.tv_sec == 0 && dtagp->dt_tv.tv_usec == 0) &&
3566 dtagp->dt_id == 0)
3567 empty_tag = 1;
3568
3569 /* Write the tag to the locators and reset appropriate flags. */
3570 for (li = 0; li < lbp->lb_loccnt; li++) {
3571 mddb_locator_t *lp = &lbp->lb_locators[li];
3572
3573 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3574 (lp->l_flags & MDDB_F_DELETED) ||
3575 (lp->l_flags & MDDB_F_EWRITE))
3576 continue;
3577
3578 werr = writeblks(s, (caddr_t)dtp, lbp->lb_dtfirstblk,
3579 MDDB_DT_BLOCKS, li, MDDB_WR_ONLY_MASTER);
3580
3581 if (werr) {
3582 err |= werr;
3583 continue;
3584 }
3585
3586 if (empty_tag)
3587 lp->l_flags &= ~(MDDB_F_BADTAG | MDDB_F_TAGDATA);
3588 else {
3589 lp->l_flags |= MDDB_F_TAGDATA;
3590 lp->l_flags &= ~MDDB_F_BADTAG;
3591 }
3592 }
3593
3594 if (err)
3595 return (err);
3596
3597
3598 /* If the tags were written, check to see if any tags remain. */
3599 for (li = 0; li < lbp->lb_loccnt; li++) {
3600 mddb_locator_t *lp = &lbp->lb_locators[li];
3601
3602 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3603 (lp->l_flags & MDDB_F_DELETED) ||
3604 (lp->l_flags & MDDB_F_EWRITE))
3605 continue;
3606
3607 if (lp->l_flags & MDDB_F_TAGDATA)
3608 break;
3609 }
3610
3611 /* If there are no tags, then clear CLRTAG and TAGDATA */
3612 if (li == lbp->lb_loccnt) {
3613 md_clr_setstatus(setno, MD_SET_CLRTAG);
3614 md_clr_setstatus(setno, MD_SET_TAGDATA);
3615 }
3616
3617 return (err);
3618 }
3619
3620 /* Should not call for MN diskset since data tags are not supported */
3621 static int
dt_alloc_if_needed(mddb_set_t * s)3622 dt_alloc_if_needed(mddb_set_t *s)
3623 {
3624 int i;
3625 int li;
3626 int moveit = 0;
3627 mddb_lb_t *lbp = s->s_lbp;
3628 mddb_block_t blkcnt = lbp->lb_dtblkcnt;
3629 set_t setno = s->s_setno;
3630 uint_t set_status = md_get_setstatus(setno);
3631
3632 /*
3633 * If the data tag record is allocated (blkcnt != 0) and a bad tag was
3634 * not detected, there is nothing to do.
3635 */
3636 if (blkcnt != 0 && ! (set_status & MD_SET_BADTAG))
3637 return (0);
3638
3639 /* Bitmap not setup, checks can't be done */
3640 if (s->s_totalblkcnt == 0)
3641 return (0);
3642
3643 /* While reading the tag(s) an invalid tag data record was seen */
3644 if (set_status & MD_SET_BADTAG)
3645 /* See if the invalid tag needs to be moved */
3646 for (i = 0; i < MDDB_DT_BLOCKS; i++)
3647 if (blkcheck(s, (i + lbp->lb_dtfirstblk))) {
3648 moveit = 1;
3649 break;
3650 }
3651
3652 /* Need to move or allocate the tag data record */
3653 if (moveit || blkcnt == 0) {
3654 lbp->lb_dtfirstblk = getfreeblks(s, MDDB_DT_BLOCKS);
3655 if (lbp->lb_dtfirstblk == 0) {
3656 cmn_err(CE_WARN,
3657 "Unable to allocate data tag record");
3658 return (0);
3659 }
3660 lbp->lb_dtblkcnt = MDDB_DT_BLOCKS;
3661
3662 /* Mark the locators so that they get written to disk. */
3663 for (li = 0; li < lbp->lb_loccnt; li++) {
3664 mddb_locator_t *lp = &lbp->lb_locators[li];
3665
3666 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
3667 (lp->l_flags & MDDB_F_DELETED) ||
3668 (lp->l_flags & MDDB_F_EWRITE))
3669 continue;
3670
3671 lp->l_flags |= MDDB_F_BADTAG;
3672 }
3673 return (1);
3674 }
3675
3676 /*
3677 * Make sure the blocks are owned, since the calculation in
3678 * computefreeblks() is bypassed when MD_SET_BADTAG is set.
3679 */
3680 for (i = 0; i < MDDB_DT_BLOCKS; i++)
3681 blkbusy(s, (i + lbp->lb_dtfirstblk));
3682
3683 return (1);
3684 }
3685
/*
 * Writestart writes the incore mddb out to all of the replicas.
 * This is called when a diskset is started and when an error has
 * been encountered during the write to a mddb.
 *
 * flag can be 2 values:
 *	MDDB_WRITECOPY_ALL - write all records to all mddbs.  This is
 *		always used for traditional and local disksets.
 *		This is the normal path for MN disksets since the slave
 *		nodes aren't actually allowed to write to disk.
 *	MDDB_WRITECOPY_SYNC - special case for MN diskset.  When a new
 *		master has been chosen, the new master may need to
 *		write its incore mddb to disk (this is the case where the
 *		old master had executed a message but hadn't relayed it
 *		to this slave yet).  New master should not write the
 *		change log records since new master would be overwriting
 *		valuable data.  Only used during a reconfig cycle.
 *
 * Returns:
 *	0 - everything that needed writing was written; the OLDACT flag of
 *	    every undeleted locator now mirrors its ACTIVE flag and
 *	    MD_SET_STALE has been cleared.
 *	1 - writecopy() or fixoptrecords() failed.
 *	otherwise the non-zero value returned by push_lb() or writeall().
 */
static int
writestart(
	mddb_set_t	*s,
	int		flag
)
{
	int		li;
	mddb_locator_t	*lp;
	mddb_lb_t	*lbp;
	mddb_ln_t	*lnp;
	int		err = 0;
	uint_t		set_status;

	lbp = s->s_lbp;

	/*
	 * Pass 1: active replicas flagged SUSPECT are rewritten
	 * unconditionally.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if (! (lp->l_flags & MDDB_F_SUSPECT))
			continue;
		if (writecopy(s, li, flag))
			return (1);
		lp->l_flags |= MDDB_F_UP2DATE;
	}

	/*
	 * Pass 2: remaining active replicas that are not yet marked
	 * UP2DATE are rewritten only when checkcopy() returns non-zero.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		if ((lp->l_flags & MDDB_F_UP2DATE))
			continue;
		if (checkcopy(s, li))
			if (err = writecopy(s, li, flag))
				return (1);
		lp->l_flags |= MDDB_F_UP2DATE;
	}

	/*
	 * Call fixoptrecord even during a reconfig cycle since a replica
	 * failure may force the master to re-assign the optimized
	 * resync record to another replica.
	 */
	if (fixoptrecords(s))
		return (1);

	set_status = md_get_setstatus(s->s_setno);

	/*
	 * See if any (ACTIVE and not OLDACT) or (not ACTIVE and OLDACT).
	 * When the set has MD_SET_TAGDATA or MD_SET_CLRTAG set, a locator
	 * carrying TAGDATA or BADTAG also stops the scan.  The loop ends
	 * with li == lb_loccnt only if no such locator exists.
	 */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];

		if (lp->l_flags & MDDB_F_DELETED)
			continue;

		if (((lp->l_flags & MDDB_F_ACTIVE) != 0 &&
		    (lp->l_flags & MDDB_F_OLDACT) == 0) ||
		    ((lp->l_flags & MDDB_F_ACTIVE) == 0 &&
		    (lp->l_flags & MDDB_F_OLDACT) != 0))
			break;

		if ((set_status & MD_SET_TAGDATA) ||
		    (set_status & MD_SET_CLRTAG))
			if ((lp->l_flags & MDDB_F_TAGDATA) ||
			    (lp->l_flags & MDDB_F_BADTAG))
				break;
	}

	/*
	 * If the scan above stopped early, or the lbp identifier and the
	 * set identifier don't match, the locator block and locator names
	 * are pushed out to disk.
	 */
	if (li != lbp->lb_loccnt || cmpidentifier(s, &lbp->lb_ident)) {

		/* Only call for traditional and local sets */
		if (!(lbp->lb_flags & MDDB_MNSET))
			(void) dt_write(s);

		setidentifier(s, &lbp->lb_ident);

		/*
		 * NOTE(review): the locator block is pushed twice, each
		 * push followed by a mediator update whether or not the
		 * push succeeded.  Presumably intentional - confirm before
		 * changing.
		 */
		if (err = push_lb(s)) {
			(void) upd_med(s, "writestart(0)");
			return (err);
		}

		(void) upd_med(s, "writestart(0)");

		if (err = push_lb(s)) {
			(void) upd_med(s, "writestart(1)");
			return (err);
		}

		(void) upd_med(s, "writestart(1)");

		/* Re-stamp, re-checksum and write the locator names. */
		lnp = s->s_lnp;
		uniqtime32(&lnp->ln_timestamp);
		if (lbp->lb_flags & MDDB_MNSET)
			lnp->ln_revision = MDDB_REV_MNLN;
		else
			lnp->ln_revision = MDDB_REV_LN;
		crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
		err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
		    lbp->lb_lnblkcnt, 0);
		/*
		 * If a MN diskset and this is the master, set the PARSE_LOCNM
		 * flag in the mddb_set structure to show that the locator
		 * names have changed.
		 * Don't set parseflags as a result of a new master sync
		 * during reconfig cycle since slaves nodes are already
		 * in-sync with the new master.
		 * Note that the flag is set before the writeall() result is
		 * examined.
		 */

		if ((lbp->lb_flags & MDDB_MNSET) &&
		    (md_set[s->s_setno].s_am_i_master) &&
		    (flag != MDDB_WRITECOPY_SYNC)) {
			s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
		}

		if (err)
			return (err);
	}

	/* Record the current ACTIVE state as the "old active" state. */
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		if (lp->l_flags & MDDB_F_ACTIVE) {
			lp->l_flags |= MDDB_F_OLDACT;
		} else {
			lp->l_flags &= ~MDDB_F_OLDACT;
		}
	}

	md_clr_setstatus(s->s_setno, MD_SET_STALE);

	return (0);
}
3841
/*
 * selectreplicas selects the working replicas and may write the incore
 * version of the mddb out to the replicas ondisk.
 *
 * flag can be 3 values:
 *	MDDB_RETRYSCAN - quick scan to see if there is an error.
 *			If no new error, returns without writing mddb
 *			to disks.  If a new error is seen, writes out
 *			mddb to disks.
 *	MDDB_SCANALL - lengthy scan to check out mddbs and always writes
 *			out mddb to the replica ondisk.  Calls writecopy
 *			with MDDB_WRITECOPY_ALL flag which writes out
 *			all records to the replicas ondisk.
 *	MDDB_SCANALLSYNC - called during reconfig cycle to sync up incore
 *			and ondisk mddbs by writing incore values to disk.
 *			Calls writecopy with MDDB_WRITECOPY_SYNC flag so
 *			that change log records are not written out.
 *			Only used by MN disksets.
 *
 * Returns:
 *	0 - Successful
 *	1 - Unable to write incore mddb data to disk since < 50% replicas,
 *	    or the set is stale.
 *	For MDDB_RETRYSCAN with no write error pending, the current
 *	MD_SET_TOOFEW bit of the set status is returned (zero or non-zero).
 */
int
selectreplicas(
	mddb_set_t	*s,
	int		flag
)
{
	int		li;
	int		alc;
	int		lc;
	mddb_locator_t	*lp;
	mddb_lb_t	*lbp = s->s_lbp;
	set_t		setno = s->s_setno;
	int		wc_flag;

	/*
	 * can never transition from stale to not stale; just make the
	 * ACTIVE flag of each undeleted locator reflect whether it has a
	 * master block error, and report failure.
	 */
	if (md_get_setstatus(setno) & MD_SET_STALE) {
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			if (! (lp->l_flags & MDDB_F_EMASTER)) {
				lp->l_flags |= MDDB_F_ACTIVE;
			} else {
				lp->l_flags &= ~MDDB_F_ACTIVE;
			}
		}
		return (1);
	}

	if ((flag == MDDB_SCANALL) || (flag == MDDB_SCANALLSYNC)) {
		/*
		 * Full scan: remember the previous ACTIVE state in OLDACT
		 * (previously inactive replicas become SUSPECT), then
		 * reactivate every replica without a master block error,
		 * clearing its write-error and too-small flags.
		 */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			if (lp->l_flags & MDDB_F_ACTIVE) {
				lp->l_flags |= MDDB_F_OLDACT;
				lp->l_flags &= ~MDDB_F_SUSPECT;
			} else {
				lp->l_flags |= MDDB_F_SUSPECT;
				lp->l_flags &= ~MDDB_F_OLDACT;
			}

			if (! (lp->l_flags & MDDB_F_EMASTER)) {
				lp->l_flags |= MDDB_F_ACTIVE;
				lp->l_flags &= ~MDDB_F_EWRITE;
				lp->l_flags &= ~MDDB_F_TOOSMALL;
			} else {
				lp->l_flags &= ~MDDB_F_ACTIVE;
			}
		}
		computefreeblks(s); /* set up free block bits */
	} else {
		/* Quick scan: find the first active replica in write error */
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if (! (lp->l_flags & MDDB_F_ACTIVE))
				continue;
			if (lp->l_flags & MDDB_F_EWRITE)
				break;
		}

		/*
		 * If no active replica has a write error, any earlier
		 * error has already been processed; return the current
		 * state.
		 */
		if (li == lbp->lb_loccnt)
			return (md_get_setstatus(setno) & MD_SET_TOOFEW);

		/*
		 * Deactivate the failing replica (lp still points at it)
		 * and clear UP2DATE on it and on every locator after it.
		 */
		lp->l_flags &= ~MDDB_F_ACTIVE;
		do {
			lp = &lbp->lb_locators[li];
			lp->l_flags &= ~MDDB_F_UP2DATE;
		} while (++li < lbp->lb_loccnt);
	}

	/* Count undeleted locators (lc) and the active ones among them (alc) */
	alc = 0;
	lc = 0;
	for (li = 0; li < lbp->lb_loccnt; li++) {
		lp = &lbp->lb_locators[li];
		if (lp->l_flags & MDDB_F_DELETED)
			continue;
		lc++;
		if (! (lp->l_flags & MDDB_F_ACTIVE))
			continue;
		alc++;
	}

	/* Fewer than half (rounded up) active: too few replicas. */
	if (alc < ((lc + 1) / 2)) {
		md_set_setstatus(setno, MD_SET_TOOFEW);
		return (1);
	}

	/* Set wc_flag based on flag passed in. */
	if (flag == MDDB_SCANALLSYNC)
		wc_flag = MDDB_WRITECOPY_SYNC;
	else
		wc_flag = MDDB_WRITECOPY_ALL;

	/*
	 * Keep trying writestart().  After each failure, deactivate the
	 * replicas that now show a write error, recount the usable ones
	 * and retry for as long as at least half of the lc locators
	 * counted above remain.
	 */
	do {
		if (! writestart(s, wc_flag)) {
			md_clr_setstatus(setno, MD_SET_TOOFEW);
			return (0);
		}
		alc = 0;
		for (li = 0; li < lbp->lb_loccnt; li++) {
			lp = &lbp->lb_locators[li];
			if ((lp->l_flags & MDDB_F_DELETED) ||
			    (lp->l_flags & MDDB_F_EMASTER))
				continue;

			if (lp->l_flags & MDDB_F_EWRITE) {
				lp->l_flags &= ~MDDB_F_ACTIVE;
				lp->l_flags &= ~MDDB_F_UP2DATE;
				continue;
			}
			alc++;
		}
	} while (alc >= ((lc + 1) / 2));
	md_set_setstatus(setno, MD_SET_TOOFEW);
	return (1);
}
3987
3988 static int
checkstate(mddb_set_t * s,int probe)3989 checkstate(
3990 mddb_set_t *s,
3991 int probe
3992 )
3993 {
3994 int error;
3995 uint_t set_status = md_get_setstatus(s->s_setno);
3996
3997 ASSERT(s != NULL);
3998
3999 if (! (set_status & MD_SET_STALE) && ! (set_status & MD_SET_TOOFEW))
4000 return (0);
4001
4002 if (probe == MDDB_NOPROBE)
4003 return (1);
4004
4005 single_thread_start(s);
4006 error = selectreplicas(s, MDDB_SCANALL);
4007 single_thread_end(s);
4008
4009 if (error == 0 && s->s_zombie != 0) {
4010 mutex_exit(SETMUTEX(s->s_setno));
4011 error = mddb_deleterec(s->s_zombie);
4012 mutex_enter(SETMUTEX(s->s_setno));
4013 if (error == 0)
4014 s->s_zombie = 0;
4015 }
4016 return (error);
4017 }
4018
4019 static int
writeretry(mddb_set_t * s)4020 writeretry(
4021 mddb_set_t *s
4022 )
4023 {
4024 if (selectreplicas(s, MDDB_RETRYSCAN))
4025 if (selectreplicas(s, MDDB_SCANALL))
4026 return (1);
4027 return (0);
4028 }
4029
4030 static void
free_mbipp(mddb_mb_ic_t ** mbipp)4031 free_mbipp(mddb_mb_ic_t **mbipp)
4032 {
4033 mddb_mb_ic_t *mbip1, *mbip2;
4034
4035 for (mbip1 = *mbipp; mbip1 != NULL; mbip1 = mbip2) {
4036 mbip2 = mbip1->mbi_next;
4037 kmem_free((caddr_t)mbip1, MDDB_IC_BSIZE);
4038 }
4039 *mbipp = (mddb_mb_ic_t *)NULL;
4040 }
4041
4042 static mddb_ri_t *
save_rip(mddb_set_t * s)4043 save_rip(mddb_set_t *s)
4044 {
4045 mddb_ri_t *trip = s->s_rip;
4046 mddb_ri_t *nrip = NULL;
4047 mddb_ri_t **nripp = &nrip;
4048 mddb_ri_t *rip;
4049
4050 while (trip) {
4051 /* Run to the end of the list */
4052 for (/* void */; (*nripp != NULL); nripp = &(*nripp)->ri_next)
4053 /* void */;
4054
4055 /* Add the new member */
4056 *nripp = kmem_zalloc(sizeof (**nripp), KM_SLEEP);
4057
4058 ASSERT(*nripp != NULL);
4059
4060 /* shorthand */
4061 rip = *nripp;
4062
4063 *rip = *trip; /* structure assignment */
4064
4065 /* Clear the stuff that is not needed for hints */
4066 rip->ri_flags = 0;
4067 rip->ri_commitcnt = 0;
4068 rip->ri_transplant = 0;
4069 rip->ri_mbip = (mddb_mb_ic_t *)NULL;
4070 rip->ri_dtp = (mddb_dt_t *)NULL;
4071 rip->ri_lbp = (mddb_lb_t *)NULL;
4072 rip->ri_did_icp = (mddb_did_ic_t *)NULL;
4073 rip->ri_devid = (ddi_devid_t)NULL;
4074 rip->ri_old_devid = (ddi_devid_t)NULL;
4075 rip->ri_next = (mddb_ri_t *)NULL;
4076
4077 trip = trip->ri_next;
4078 }
4079 return (nrip);
4080 }
4081
4082 static void
free_rip(mddb_ri_t ** ripp)4083 free_rip(mddb_ri_t **ripp)
4084 {
4085 mddb_ri_t *rip;
4086 mddb_ri_t *arip;
4087
4088 for (rip = *ripp; rip != (mddb_ri_t *)NULL; rip = arip) {
4089 arip = rip->ri_next;
4090 if (rip->ri_devid != (ddi_devid_t)NULL) {
4091 ddi_devid_free(rip->ri_devid);
4092 rip->ri_devid = (ddi_devid_t)NULL;
4093 }
4094 if (rip->ri_old_devid != (ddi_devid_t)NULL) {
4095 ddi_devid_free(rip->ri_old_devid);
4096 rip->ri_old_devid = (ddi_devid_t)NULL;
4097 }
4098 kmem_free((caddr_t)rip, sizeof (*rip));
4099 }
4100 *ripp = (mddb_ri_t *)NULL;
4101 }
4102
4103 /*
4104 * this routine selects the correct replica to use
4105 * the rules are as follows
4106 * 1. if all replica has same init time select highest commit count
4107 * 2. if some but not all replicas are from another hostid discard
4108 * them.
4109 * 3. find which init time is present is most replicas
4110 * 4. discard all replicas which do not match most init times
4111 * 5. select replica with highest commit count
4112 */
4113
4114 static mddb_lb_t *
selectlocator(mddb_set_t * s)4115 selectlocator(
4116 mddb_set_t *s
4117 )
4118 {
4119 mddb_ri_t *rip = s->s_rip;
4120 mddb_ri_t *r, *r1;
4121 mddb_lb_t *lbp;
4122 struct timeval32 *tp = (struct timeval32 *)NULL;
4123 int different;
4124 int same;
4125 int count;
4126 int maxcount;
4127 set_t setno = s->s_setno;
4128 size_t sz;
4129 int mn_set = 0;
4130
4131 /* Clear the ri_transplant flag on all the rip entries. */
4132 /* Set ri_commitcnt to locator's commitcnt - if available */
4133 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4134 r->ri_transplant = 0;
4135 if (r->ri_lbp != (mddb_lb_t *)NULL) {
4136 r->ri_commitcnt = r->ri_lbp->lb_commitcnt;
4137 /* If any locators have MN bit set, set flag */
4138 if (r->ri_lbp->lb_flags & MDDB_MNSET)
4139 mn_set = 1;
4140 }
4141 }
4142
4143 /*
4144 * A data tag is being used, so use it to limit the selection first.
4145 * Data tags not used in MN diskset.
4146 */
4147 if ((mn_set == 0) && (md_get_setstatus(setno) & MD_SET_USETAG)) {
4148 mddb_dt_t *dtp = (mddb_dt_t *)md_set[setno].s_dtp;
4149
4150 /*
4151 * now toss any locators that have a different data tag
4152 */
4153 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4154 if (r->ri_lbp == (mddb_lb_t *)NULL)
4155 continue;
4156
4157 if (r->ri_dtp != (mddb_dt_t *)NULL) {
4158 /* If same tag, keep it */
4159 if (dtl_cmp(&dtp->dt_dtag,
4160 &r->ri_dtp->dt_dtag) == 0)
4161 continue;
4162 }
4163
4164 if (r->ri_dtp != (mddb_dt_t *)NULL) {
4165 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4166 r->ri_dtp = (mddb_dt_t *)NULL;
4167 }
4168
4169 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4170 if (!(md_get_setstatus(setno) &
4171 MD_SET_REPLICATED_IMPORT)) {
4172 if (r->ri_old_devid != (ddi_devid_t)NULL) {
4173 sz = ddi_devid_sizeof(r->ri_old_devid);
4174 kmem_free((caddr_t)r->ri_old_devid, sz);
4175 r->ri_old_devid = (ddi_devid_t)NULL;
4176 }
4177 }
4178
4179 kmem_free((caddr_t)r->ri_lbp,
4180 dbtob(r->ri_lbp->lb_blkcnt));
4181 r->ri_lbp = (mddb_lb_t *)NULL;
4182
4183 r->ri_transplant = 1;
4184 }
4185
4186 /* Tag used, clear the bit */
4187 md_clr_setstatus(s->s_setno, MD_SET_USETAG);
4188
4189 if (md_get_setstatus(s->s_setno) & MD_SET_TAGDATA) {
4190 /*
4191 * Get rid of the list of tags.
4192 */
4193 dtl_freel(&s->s_dtlp);
4194
4195 /*
4196 * Re-create the list with the tag used.
4197 */
4198 (void) dtl_addl(s, &dtp->dt_dtag);
4199 }
4200 }
4201
4202 /*
4203 * scan to see if all replicas have same time
4204 */
4205 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4206 if (r->ri_lbp == (mddb_lb_t *)NULL)
4207 continue;
4208 if (tp == NULL) {
4209 tp = &r->ri_lbp->lb_inittime;
4210 continue;
4211 }
4212 /* CSTYLED */
4213 if (timercmp(tp, &r->ri_lbp->lb_inittime, !=))
4214 break;
4215 }
4216
4217 /*
4218 * if r == NULL then they were all them same. Choose highest
4219 * commit count
4220 */
4221 if (r == (mddb_ri_t *)NULL)
4222 goto out;
4223
4224 /*
4225 * If here, a bogus replica is present and at least 1 lb_inittime
4226 * did not match.
4227 */
4228
4229 /*
4230 * look and see if any but not all are from different id
4231 */
4232
4233 different = 0;
4234 same = 0;
4235 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4236 if (r->ri_lbp == (mddb_lb_t *)NULL)
4237 continue;
4238 if (cmpidentifier(s, &r->ri_lbp->lb_ident))
4239 different = 1;
4240 else
4241 same = 1;
4242 }
4243
4244 /*
4245 * now go through and throw out different if there are some
4246 * that are the same
4247 */
4248 if (different != 0 && same != 0) {
4249 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4250 if (r->ri_lbp == (mddb_lb_t *)NULL)
4251 continue;
4252
4253 if (!cmpidentifier(s, &r->ri_lbp->lb_ident))
4254 continue;
4255
4256 if (r->ri_dtp != (mddb_dt_t *)NULL) {
4257 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4258 r->ri_dtp = (mddb_dt_t *)NULL;
4259 }
4260
4261 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4262 if (!(md_get_setstatus(setno) &
4263 MD_SET_REPLICATED_IMPORT)) {
4264 if (r->ri_old_devid != (ddi_devid_t)NULL) {
4265 sz = ddi_devid_sizeof(r->ri_old_devid);
4266 kmem_free((caddr_t)r->ri_old_devid, sz);
4267 r->ri_old_devid = (ddi_devid_t)NULL;
4268 }
4269 }
4270
4271 kmem_free((caddr_t)r->ri_lbp,
4272 dbtob(r->ri_lbp->lb_blkcnt));
4273 r->ri_lbp = (mddb_lb_t *)NULL;
4274
4275 r->ri_transplant = 1;
4276 }
4277 }
4278
4279 /*
4280 * go through and pick highest. Use n square because it is
4281 * simple and 40 some is max possible
4282 */
4283 maxcount = 0;
4284 lbp = (mddb_lb_t *)NULL;
4285 for (r1 = rip; r1 != (mddb_ri_t *)NULL; r1 = r1->ri_next) {
4286 if (r1->ri_lbp == (mddb_lb_t *)NULL)
4287 continue;
4288 count = 0;
4289 for (r = r1; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4290 if (r->ri_lbp == (mddb_lb_t *)NULL)
4291 continue;
4292 if (timercmp(&r1->ri_lbp->lb_inittime, /* CSTYLED */
4293 &r->ri_lbp->lb_inittime, ==))
4294 count++;
4295 }
4296 if (count > maxcount) {
4297 maxcount = count;
4298 lbp = r1->ri_lbp;
4299 }
4300 }
4301
4302 /*
4303 * now go though and toss any that are of a different time stamp
4304 */
4305 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4306 if (r->ri_lbp == (mddb_lb_t *)NULL)
4307 continue;
4308 if (timercmp(&lbp->lb_inittime, /* CSTYLED */
4309 &r->ri_lbp->lb_inittime, ==))
4310 continue;
4311
4312 if (r->ri_dtp != (mddb_dt_t *)NULL) {
4313 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4314 r->ri_dtp = (mddb_dt_t *)NULL;
4315 }
4316
4317 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4318 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4319 if (r->ri_old_devid != (ddi_devid_t)NULL) {
4320 sz = ddi_devid_sizeof(r->ri_old_devid);
4321 kmem_free((caddr_t)r->ri_old_devid, sz);
4322 r->ri_old_devid = (ddi_devid_t)NULL;
4323 }
4324 }
4325
4326 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4327 r->ri_lbp = (mddb_lb_t *)NULL;
4328
4329 r->ri_transplant = 1;
4330 }
4331
4332 out:
4333 /*
4334 * Find the locator with the highest commit count, and make it the
4335 * "chosen" one.
4336 */
4337 lbp = (mddb_lb_t *)NULL;
4338 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4339 if (r->ri_lbp == (mddb_lb_t *)NULL)
4340 continue;
4341
4342 if (lbp == NULL) {
4343 lbp = r->ri_lbp;
4344 continue;
4345 }
4346
4347 if (r->ri_lbp->lb_commitcnt > lbp->lb_commitcnt)
4348 lbp = r->ri_lbp;
4349 }
4350
4351 /* Toss all locator blocks, except the "chosen" one. */
4352 for (r = rip; r != (mddb_ri_t *)NULL; r = r->ri_next) {
4353 if (r->ri_lbp == (mddb_lb_t *)NULL)
4354 continue;
4355
4356 /* Get rid of all dtp's */
4357 if (r->ri_dtp != (mddb_dt_t *)NULL) {
4358 kmem_free((caddr_t)r->ri_dtp, MDDB_DT_BYTES);
4359 r->ri_dtp = (mddb_dt_t *)NULL;
4360 }
4361
4362 if (r->ri_lbp == lbp)
4363 continue;
4364
4365 /* Get rid of extra locator devid block info */
4366 mddb_devid_icp_free(&r->ri_did_icp, r->ri_lbp);
4367 if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
4368 if (r->ri_old_devid != (ddi_devid_t)NULL) {
4369 sz = ddi_devid_sizeof(r->ri_old_devid);
4370 kmem_free((caddr_t)r->ri_old_devid, sz);
4371 r->ri_old_devid = (ddi_devid_t)NULL;
4372 }
4373 }
4374
4375 /* Get rid of extra locators */
4376 kmem_free((caddr_t)r->ri_lbp, dbtob(r->ri_lbp->lb_blkcnt));
4377 r->ri_lbp = (mddb_lb_t *)NULL;
4378 }
4379 return (lbp);
4380 }
4381
4382 static void
locator2cfgloc(mddb_lb_t * lbp,mddb_cfg_loc_t * clp,int li,side_t sideno,mddb_did_ic_t * did_icp)4383 locator2cfgloc(
4384 mddb_lb_t *lbp,
4385 mddb_cfg_loc_t *clp,
4386 int li,
4387 side_t sideno,
4388 mddb_did_ic_t *did_icp
4389 )
4390 {
4391 mddb_drvnm_t *dn;
4392 mddb_locator_t *lp = &lbp->lb_locators[li];
4393 mddb_sidelocator_t *slp;
4394 mddb_mnsidelocator_t *mnslp;
4395 mddb_did_info_t *did_info;
4396 int i, sz, szalloc;
4397 int mn_set = 0;
4398 mddb_mnlb_t *mnlbp;
4399
4400 if (lbp->lb_flags & MDDB_MNSET) {
4401 mn_set = 1;
4402 mnlbp = (mddb_mnlb_t *)lbp;
4403 for (i = 0; i < MD_MNMAXSIDES; i++) {
4404 mnslp = &mnlbp->lb_mnsidelocators[i][li];
4405 if (mnslp->mnl_sideno == sideno)
4406 break;
4407 }
4408 if (i == MD_MNMAXSIDES)
4409 return;
4410 } else {
4411 slp = &lbp->lb_sidelocators[sideno][li];
4412 }
4413
4414 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4415 did_info = &(did_icp->did_ic_blkp->blk_info[li]);
4416 if (did_info->info_flags & MDDB_DID_EXISTS) {
4417 sz = (int)ddi_devid_sizeof(did_icp->did_ic_devid[li]);
4418 if (clp->l_devid_flags & MDDB_DEVID_SPACE) {
4419 /*
4420 * copy device id from mddb to
4421 * cfg_loc structure
4422 */
4423 szalloc = clp->l_devid_sz;
4424 if (sz <= szalloc) {
4425 for (i = 0; i < sz; i++) {
4426 ((char *)(uintptr_t)
4427 clp->l_devid)[i] =
4428 ((char *)did_icp->
4429 did_ic_devid[li])[i];
4430 }
4431 clp->l_devid_flags |= MDDB_DEVID_VALID;
4432 (void) strcpy(clp->l_minor_name,
4433 did_info->info_minor_name);
4434 } else {
4435 clp->l_devid_flags |=
4436 MDDB_DEVID_NOSPACE;
4437 }
4438 } else if (clp->l_devid_flags & MDDB_DEVID_GETSZ) {
4439 clp->l_devid_flags = MDDB_DEVID_SZ;
4440 clp->l_devid_sz = sz;
4441 }
4442 }
4443 }
4444
4445 /*
4446 * Even if a devid exists, use the dev, drvnm and mnum in the locators
4447 * and sidelocators. During startup, the dev, drvnm and mnum in
4448 * these structures may not match the devid (the locators and
4449 * sidelocators will be updated to match the devid by the routine
4450 * load_old_replicas). Using out-of-sync values won't cause any
4451 * problems since ridev will re-derive these from the devid and mnum.
4452 * After startup, the dev, drvnm and mnum in these structures have
4453 * been updated and can be used.
4454 */
4455
4456 clp->l_blkno = lp->l_blkno;
4457 clp->l_flags = lp->l_flags;
4458 clp->l_dev = lp->l_dev;
4459
4460 if (mn_set) {
4461 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
4462 clp->l_mnum = mnslp->mnl_mnum;
4463 } else {
4464 dn = &lbp->lb_drvnm[slp->l_drvnm_index];
4465 clp->l_mnum = slp->l_mnum;
4466 }
4467 (void) strncpy(clp->l_driver, dn->dn_data, MD_MAXDRVNM);
4468 }
4469
4470 /*
4471 * Find the index into the mnsidelocator where entry will go.
4472 * Then index can be fed into both splitname2locatorblocks and
4473 * cfgloc2locator so that those entries can be kept in sync.
4474 *
4475 * Returns:
4476 * -1 if failed to find unused slot or if a traditional diskset
4477 * index, if successful (0 <= index <= MD_MNMAXSIDES)
4478 */
4479 static int
checklocator(mddb_lb_t * lbp,int li,side_t sideno)4480 checklocator(
4481 mddb_lb_t *lbp,
4482 int li,
4483 side_t sideno
4484 )
4485 {
4486 uchar_t i;
4487 mddb_mnsidelocator_t *mnslp;
4488 mddb_mnlb_t *mnlbp;
4489 int index = -1;
4490
4491 if (lbp->lb_flags & MDDB_MNSET) {
4492 /*
4493 * Checking side locator structure. First, check if
4494 * there is already an entry for this side. If so,
4495 * then use that entry. Otherwise, find an entry
4496 * that has a sideno of 0.
4497 */
4498 mnlbp = (mddb_mnlb_t *)lbp;
4499 for (i = 0; i < MD_MNMAXSIDES; i++) {
4500 mnslp = &mnlbp->lb_mnsidelocators[i][li];
4501 if (mnslp->mnl_sideno == sideno) {
4502 /* Found a match - stop looking */
4503 index = i;
4504 break;
4505 } else if ((mnslp->mnl_sideno == 0) && (index == -1)) {
4506 /* Set first empty slot, but keep looking */
4507 index = i;
4508 }
4509 }
4510 /* Didn't find empty slot or previously used slot */
4511 if ((i == MD_MNMAXSIDES) && (index == -1)) {
4512 return (-1);
4513 }
4514 return (index);
4515 } else
4516 return (0);
4517 }
4518
4519 /*
4520 * Takes locator information (driver name, minor number, sideno) and
4521 * stores it in the locator block.
4522 * For traditional diskset, the sideno is the index into the sidelocator
4523 * array in the locator block.
4524 * For the MN diskset, the sideno is the nodeid which can be any number,
4525 * so the index passed in is the index into the mnsidelocator array
4526 * in the locator block.
4527 */
4528 static int
cfgloc2locator(mddb_lb_t * lbp,mddb_cfg_loc_t * clp,int li,side_t sideno,int index)4529 cfgloc2locator(
4530 mddb_lb_t *lbp,
4531 mddb_cfg_loc_t *clp,
4532 int li,
4533 side_t sideno,
4534 int index /* Only useful in MNsets when > 1 */
4535 )
4536 {
4537 uchar_t i;
4538 mddb_sidelocator_t *slp;
4539 mddb_mnsidelocator_t *mnslp;
4540 mddb_set_t *s;
4541 int mn_set = 0;
4542 mddb_mnlb_t *mnlbp;
4543
4544 if (lbp->lb_flags & MDDB_MNSET) {
4545 mnlbp = (mddb_mnlb_t *)lbp;
4546 mn_set = 1;
4547 /*
4548 * Index will be the slot that has the given sideno or
4549 * the first empty slot if no match is found.
4550 * This was pre-checked out in check locator.
4551 */
4552 mnslp = &mnlbp->lb_mnsidelocators[index][li];
4553 } else {
4554 slp = &lbp->lb_sidelocators[sideno][li];
4555 }
4556
4557 /*
4558 * Look for the driver name
4559 */
4560 for (i = 0; i < MDDB_DRVNMCNT; i++) {
4561 if (lbp->lb_drvnm[i].dn_len == 0)
4562 continue;
4563 if (strncmp(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4564 MD_MAXDRVNM) == 0)
4565 break;
4566 }
4567
4568 /*
4569 * Didn't find one, add a new one
4570 */
4571 if (i == MDDB_DRVNMCNT) {
4572 for (i = 0; i < MDDB_DRVNMCNT; i++) {
4573 if (lbp->lb_drvnm[i].dn_len == 0)
4574 break;
4575 }
4576 if (i == MDDB_DRVNMCNT)
4577 return (1);
4578 (void) strncpy(lbp->lb_drvnm[i].dn_data, clp->l_driver,
4579 MD_MAXDRVNM);
4580 lbp->lb_drvnm[i].dn_len = (uchar_t)strlen(clp->l_driver);
4581 }
4582
4583 /* Fill in the drvnm index */
4584 if (mn_set) {
4585 mnslp->mnl_drvnm_index = i;
4586 mnslp->mnl_mnum = clp->l_mnum;
4587 mnslp->mnl_sideno = sideno;
4588 } else {
4589 slp->l_drvnm_index = i;
4590 slp->l_mnum = clp->l_mnum;
4591 }
4592
4593 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
4594 /*
4595 * This device id could already be associated with this index
4596 * if this is not the first side added to the set.
4597 * If device id is 0, there is no device id for this device.
4598 */
4599 if ((ddi_devid_t)(uintptr_t)clp->l_devid == 0)
4600 return (0);
4601 s = (mddb_set_t *)md_set[lbp->lb_setno].s_db;
4602 if (mddb_devid_add(s, li, (ddi_devid_t)(uintptr_t)clp->l_devid,
4603 clp->l_minor_name)) {
4604 return (1);
4605 }
4606 }
4607
4608 return (0);
4609 }
4610
4611 /*
4612 * See if there are mediator hosts and try to use the data.
4613 */
4614 static int
mediate(mddb_set_t * s)4615 mediate(
4616 mddb_set_t *s
4617 )
4618 {
4619 mddb_lb_t *lbp = s->s_lbp;
4620 med_data_lst_t *meddlp = NULL;
4621 med_data_lst_t *tmeddlp = NULL;
4622 med_data_t *meddp;
4623 int medok = 0;
4624 int medacc = 0;
4625 uint_t maxcc;
4626 int golden = 0;
4627 int err = 1;
4628 set_t setno = s->s_setno;
4629
4630 /* Do not have a mediator, then the state is stale */
4631 if (s->s_med.n_cnt == 0)
4632 return (err);
4633
4634 /* Contact the mediator hosts for the data */
4635 meddlp = get_med_host_data(&s->s_med, s->s_setname, setno);
4636
4637 /* No mediator data, stale */
4638 if (meddlp == NULL)
4639 return (err);
4640
4641 /* Mark all the mediator data that is not for this set as errored */
4642 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4643 struct timeval32 tmptime;
4644 meddp = tmeddlp->mdl_med;
4645
4646 /* Count the number of mediators contacted */
4647 medacc++;
4648
4649 /* Paranoid check */
4650 if (meddp->med_dat_sn != setno)
4651 meddp->med_dat_fl |= MED_DFL_ERROR;
4652
4653 TIMEVAL_TO_TIMEVAL32(&tmptime, &meddp->med_dat_id);
4654
4655 /*CSTYLED*/
4656 if (timercmp(&tmptime, &lbp->lb_ident.createtime, !=))
4657 meddp->med_dat_fl |= MED_DFL_ERROR;
4658 }
4659
4660 /* Get the max commitcount */
4661 maxcc = 0;
4662 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4663 meddp = tmeddlp->mdl_med;
4664 if (meddp->med_dat_fl & MED_DFL_ERROR)
4665 continue;
4666 if (meddp->med_dat_cc > maxcc)
4667 maxcc = meddp->med_dat_cc;
4668 }
4669
4670 /* Now mark the records that don't have the highest cc as errored */
4671 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4672 meddp = tmeddlp->mdl_med;
4673 if (meddp->med_dat_fl & MED_DFL_ERROR)
4674 continue;
4675 if (meddp->med_dat_cc != maxcc)
4676 meddp->med_dat_fl |= MED_DFL_ERROR;
4677 }
4678
4679 /* Now mark the records that don't match the lb commitcnt as errored */
4680 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4681 meddp = tmeddlp->mdl_med;
4682 if (meddp->med_dat_fl & MED_DFL_ERROR)
4683 continue;
4684 if (meddp->med_dat_cc != lbp->lb_commitcnt)
4685 meddp->med_dat_fl |= MED_DFL_ERROR;
4686 }
4687
4688 /* Is there a "golden" copy and how many valid mediators */
4689 for (tmeddlp = meddlp; tmeddlp != NULL; tmeddlp = tmeddlp->mdl_nx) {
4690 meddp = tmeddlp->mdl_med;
4691 if (meddp->med_dat_fl & MED_DFL_ERROR)
4692 continue;
4693
4694 if (meddp->med_dat_fl & MED_DFL_GOLDEN)
4695 golden++;
4696
4697 medok++;
4698 }
4699
4700 /* No survivors, stale */
4701 if (medok == 0)
4702 goto out;
4703
4704 /* No mediator quorum and no golden copies, stale */
4705 if (medacc < ((s->s_med.n_cnt / 2) + 1) && ! golden) {
4706 /* Skip odd numbers, no exact 50% */
4707 if (s->s_med.n_cnt & 1)
4708 goto out;
4709 /* Have 50%, allow an accept */
4710 if (medacc == (s->s_med.n_cnt / 2))
4711 md_set_setstatus(setno, MD_SET_ACCOK);
4712 goto out;
4713 }
4714
4715 /* We either have a quorum or a golden copy, or both */
4716 err = 0;
4717
4718 out:
4719 if (meddlp) {
4720 for (/* void */; meddlp != NULL; meddlp = tmeddlp) {
4721 tmeddlp = meddlp->mdl_nx;
4722 kmem_free(meddlp->mdl_med, sizeof (med_data_t));
4723 kmem_free(meddlp, sizeof (med_data_lst_t));
4724 }
4725 }
4726
4727 return (err);
4728 }
4729
/*
 * get_mbs_n_lbs:
 *
 * 1. read masterblks and locator blocks for all known database locations
 *	a. keep track of which have good master blks
 *	b. keep track of which have good locators
 * 2. for replicas in device id style, also read in and verify the device
 *    id block and the blocks holding the device ids themselves
 * 3. for each replica whose locator block is accepted, hang the locator
 *    block (and device id incore structure) off of its mddb_ri_t
 * 4. for traditional (non-MN) sets, read the data tags and decide whether
 *    tagged data is to be used
 *
 * Parameters:
 *	s	 - set whose replica list (s->s_rip) is walked
 *	write_lb - set to 1 when device id style is turned off in a locator
 *		   block because md_devid_destroy is set; not written
 *		   otherwise
 *
 * Returns:
 *	0, or MDDB_E_TAGDATA when more than one unique data tag exists and
 *	MD_SET_USETAG is not set for the set.
 */
static int
get_mbs_n_lbs(
	mddb_set_t	*s,
	int		*write_lb
)
{
	mddb_lb_t		*lbp = NULL;	/* pointer to locator block */
						/* May be cast to mddb_mnlb_t */
						/* if accessing sidenames in */
						/* MN set */
	mddb_did_ic_t		*did_icp = NULL; /* ptr to Device ID incore */
	mddb_did_blk_t		*did_blkp = 0;
	int			did_blkp_sz = 0;
	mddb_did_db_t		*did_dbp;
	mddb_did_info_t		*did_info;
	caddr_t			did_block;
	mddb_ri_t		*rip;
	mddb_dtag_lst_t		*dtlp;
	mddb_locator_t		*lp;
	daddr_t			physblk;
	int			li;
	uint_t			blk;
	md_dev64_t		dev;
	caddr_t			buffer;
	uint_t			lb_blkcnt;
	int			retval = 0;
	int			err = 0;
	int			lb_ok = 0;
	int			lb_total = 0;
	int			lb_tagged = 0;
	int			lb_tags;
	set_t			setno = s->s_setno;
	int			cont_flag, i;
	mddb_did_db_t		*did_dbp1, *did_dbp2;
	int			mn_set = 0;
	mddb_cfg_loc_t		*cl;

	/*
	 * read in master blocks and locator block for all known locators.
	 * lb_blkcnt will be set correctly for MN set later once getmasters
	 * has determined that the set is a MN set.
	 */
	lb_blkcnt = ((setno == MD_LOCAL_SET) ? MDDB_LOCAL_LBCNT : MDDB_LBCNT);

	/*
	 * Note on buffer reuse: lbp, did_icp and did_blkp are working
	 * buffers.  When a replica is rejected (any "continue" below) they
	 * are kept and reused for the next replica; when a replica is
	 * accepted, ownership moves to the rip and the local pointers are
	 * reset to NULL at the bottom of the loop.
	 */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		/* Keep only these three flags from any previous pass */
		rip->ri_flags &= (MDDB_F_PTCHED | MDDB_F_IOCTL |
		    MDDB_F_EMASTER);
		rip->ri_lbp = (mddb_lb_t *)NULL;
		rip->ri_did_icp = (mddb_did_ic_t *)NULL;

		/*
		 * Translated dev is only used in calls to getmasters and
		 * getblks which expect a translated (aka miniroot) dev.
		 */
		dev = md_xlate_targ_2_mini(rip->ri_dev);
		if (dev == NODEV64) {
			/* Set error flag that getmasters would have set */
			/* if getmasters had been allowed to fail */
			rip->ri_flags |= MDDB_F_EMASTER;
		}

		/*
		 * Invalid device id on system (due to failed or
		 * removed device) or invalid devt during upgrade
		 * (due to powered off device) will cause this
		 * replica to be marked in error and not used.
		 */
		if (rip->ri_flags & MDDB_F_EMASTER)
			continue;

		/* get all master blocks, does mddb_devopen() */
		rip->ri_mbip = getmasters(s, dev, rip->ri_blkno,
		    &rip->ri_flags, &mn_set);

		/* if invalid master block - try next replica */
		if (! rip->ri_mbip)
			continue;

		/*
		 * If lbp alloc'd to wrong size - reset it.
		 * If MN set, lb_blkcnt must be MDDB_MNLBCNT.
		 * If a traditional set, lb_blkcnt must NOT be MDDB_MNLBCNT.
		 */
		if (lbp) {
			if (((mn_set) && (lb_blkcnt != MDDB_MNLBCNT)) ||
			    ((!mn_set) && (lb_blkcnt == MDDB_MNLBCNT))) {
				kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));
				lbp = (mddb_lb_t *)NULL;
			}
		}

		if (lbp == (mddb_lb_t *)NULL) {
			/* If a MN set, set lb_blkcnt for MN loc blk size */
			if (mn_set)
				lb_blkcnt = MDDB_MNLBCNT;
			lbp = (mddb_lb_t *)kmem_zalloc(dbtob(lb_blkcnt),
			    KM_SLEEP);
		}

		/*
		 * Read in all the sectors for the locator block
		 * NOTE: Need to use getblks, rather than readblklst.
		 *	because it is too early and things are
		 *	NOT set up yet for read*()'s
		 */
		buffer = (caddr_t)lbp;
		for (blk = 0; blk < lb_blkcnt; blk++) {
			physblk = getphysblk(blk, rip->ri_mbip);
			err = getblks(s, buffer, dev, physblk,
			    btodb(MDDB_BSIZE), 0);
			if (err) {
				/* getblks result is recorded in ri_flags */
				rip->ri_flags |= err;
				break;
			}
			buffer += MDDB_BSIZE;
		}

		if (err)
			continue;

		/* Verify the locator block */
		if (blk != lb_blkcnt)
			continue;
		if (lbp->lb_magic != MDDB_MAGIC_LB)
			continue;
		if (lbp->lb_blkcnt != lb_blkcnt)
			continue;
		if (mn_set) {
			/* If a MN set, check for MNLB revision in lb. */
			if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
				continue;
		} else {
			/* If not a MN set, check for LB revision in lb. */
			if (revchk(MDDB_REV_LB, lbp->lb_revision))
				continue;
		}
		if (crcchk(lbp, &lbp->lb_checksum, dbtob(lb_blkcnt), NULL))
			continue;

		/*
		 * With the addition of MultiNode Disksets, we must make sure
		 * to verify that this is the correct set.  A node could
		 * have been out of the config for awhile and this disk could
		 * have been moved to a different diskset and we don't want
		 * to accidentally start the wrong set.
		 *
		 * We don't do this check if we're in the middle of
		 * importing a set.
		 */
		if (!(md_get_setstatus(s->s_setno) &
		    (MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT)) &&
		    (lbp->lb_setno != s->s_setno))
			continue;

		/* Locator block was read and passed the checks above */
		rip->ri_flags |= MDDB_F_LOCACC;

		/*
		 * a commit count of zero means this locator has been deleted
		 */
		if (lbp->lb_commitcnt == 0)
			continue;

		/*
		 * If replica is in the device ID style and md_devid_destroy
		 * flag is set, turn off device id style.  This is only to be
		 * used in a catastrophic failure case.  Examples would be
		 * where the device id of all drives in the system
		 * (especially the mirror'd root drives) had been changed
		 * by firmware upgrade or by a patch to an existing disk
		 * driver.  Another example would be in the case of non-unique
		 * device ids due to a bug.  The device id would be valid on
		 * the system, but would return the wrong dev_t.
		 */
		if ((lbp->lb_flags & MDDB_DEVID_STYLE) && md_devid_destroy) {
			lbp->lb_flags &= ~MDDB_DEVID_STYLE;
			lbp->lb_didfirstblk = 0;
			lbp->lb_didblkcnt = 0;
			/* Tell the caller the locator block was modified */
			*write_lb = 1;
		}


		/*
		 * If replica is in device ID style, read in device ID
		 * block and verify device ID block information.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {

			/* Read in device ID block */
			if (did_icp == NULL) {
				did_icp = (mddb_did_ic_t *)
				    kmem_zalloc(sizeof (mddb_did_ic_t),
				    KM_SLEEP);
			} else {
				/* Reuse did_icp, but clear out data */
				if (did_icp->did_ic_blkp !=
				    (mddb_did_blk_t *)NULL) {
					kmem_free((caddr_t)did_icp->did_ic_blkp,
					    did_blkp_sz);
					did_blkp = (mddb_did_blk_t *)NULL;
					did_icp->did_ic_blkp =
					    (mddb_did_blk_t *)NULL;
				}
				if (did_icp->did_ic_dbp !=
				    (mddb_did_db_t *)NULL) {
					/* Free the list of devid disk blocks */
					did_dbp1 = did_icp->did_ic_dbp;
					while (did_dbp1) {
						did_dbp2 = did_dbp1->db_next;
						kmem_free((caddr_t)
						    did_dbp1->db_ptr,
						    dbtob(did_dbp1->db_blkcnt));
						kmem_free((caddr_t)did_dbp1,
						    sizeof (mddb_did_db_t));
						did_dbp1 = did_dbp2;
					}
					did_icp->did_ic_dbp =
					    (mddb_did_db_t *)NULL;
				}
				for (i = 0; i < MDDB_NLB; i++) {
					did_icp->did_ic_devid[i] =
					    (ddi_devid_t)NULL;
				}
			}

			/* Can't reuse blkp since size could be different */
			if (did_blkp != (mddb_did_blk_t *)NULL) {
				kmem_free(did_blkp, did_blkp_sz);
			}
			did_blkp_sz = (int)dbtob(lbp->lb_didblkcnt);
			did_blkp = (mddb_did_blk_t *)kmem_zalloc(did_blkp_sz,
			    KM_SLEEP);
			did_icp->did_ic_blkp = did_blkp;
			buffer = (caddr_t)did_blkp;
			for (blk = lbp->lb_didfirstblk;
			    blk < (lbp->lb_didblkcnt + lbp->lb_didfirstblk);
			    blk++) {
				physblk = getphysblk(blk, rip->ri_mbip);
				err = getblks(s, buffer, dev, physblk,
				    btodb(MDDB_BSIZE), 0);
				if (err) {
					rip->ri_flags |= err;
					break;
				}
				buffer += MDDB_BSIZE;
			}
			if (err)
				continue;

			/* Verify the Device ID block */
			if (blk != (lbp->lb_didblkcnt + lbp->lb_didfirstblk))
				continue;
			if (did_blkp->blk_magic != MDDB_MAGIC_DI)
				continue;
			if (lbp->lb_didblkcnt != MDDB_DID_BLOCKS)
				continue;
			if (revchk(MDDB_REV_DI, did_blkp->blk_revision))
				continue;
			if (crcchk(did_blkp, &did_blkp->blk_checksum,
			    dbtob(lbp->lb_didblkcnt), NULL))
				continue;

			/*
			 * Check if device ID block is out of sync with the
			 * Locator Block by checking if the locator block
			 * commitcnt does not match the device id block
			 * commitcnt.  If an 'out of sync' condition
			 * exists, discard this replica since it has
			 * inconsistent data and can't be used in
			 * determining the best replica.
			 *
			 * An 'out of sync' condition could happen if old
			 * SDS code was running with new devid style replicas
			 * or if a failure occurred between the writing of
			 * the locator block's commitcnt and the device
			 * id block's commitcnt.
			 *
			 * If old SDS code had been running, the upgrade
			 * process should detect this situation and
			 * have removed all of the device id information
			 * via the md_devid_destroy flag in md.conf.
			 */
			if (did_blkp->blk_commitcnt !=
			    lbp->lb_commitcnt) {
				continue;
			}
		}


		/*
		 * If replica is still in device ID style, read in all
		 * of the device IDs, verify the checksum of the device IDs.
		 */
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			/*
			 * Reset valid bit in device id info block flags. This
			 * flag is stored on disk, but the valid bit is reset
			 * when reading in the replica.  If the corresponding
			 * device id is valid (aka meaning that the system
			 * knows about this device id), the valid bit will
			 * be set at a later time.  The valid bit for this
			 * replica's device ID will be set in this routine.
			 * The valid bits for the rest of the device id's
			 * will be set after the 'best' replica has
			 * been selected in routine load_old_replicas.
			 * Reset updated bit in device id info block flags.
			 * This flag is also stored on disk, reset when read
			 * in and set when the locators and side locators
			 * have been updated to match this valid device
			 * id information.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				if (did_info->info_flags & MDDB_DID_EXISTS)
					did_info->info_flags &=
					    ~(MDDB_DID_VALID |
					    MDDB_DID_UPDATED);
			}

			/*
			 * cont_flag is set when a device id can not be read
			 * or fails its checksum, so that the "continue" to
			 * the next replica can be issued from the outer loop.
			 */
			cont_flag = 0;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				did_block = (caddr_t)NULL;
				if (did_info->info_flags & MDDB_DID_EXISTS) {
					/*
					 * Check if block has
					 * already been read in
					 */
					did_dbp = did_icp->did_ic_dbp;
					while (did_dbp != 0) {
						if (did_dbp->db_firstblk ==
						    did_info->info_firstblk)
							break;
						else
							did_dbp =
							    did_dbp->db_next;
					}
					/* if block not found, read it in */
					if (did_dbp == NULL) {
						did_block = (caddr_t)
						    (kmem_zalloc(dbtob(
						    did_info->info_blkcnt),
						    KM_SLEEP));
						buffer = (caddr_t)did_block;
						for (blk =
						    did_info->info_firstblk;
						    blk < (did_info->
						    info_firstblk +
						    did_info->info_blkcnt);
						    blk++) {
							physblk =
							    getphysblk(blk,
							    rip->ri_mbip);
							err = getblks(s,
							    buffer, dev,
							    physblk, btodb(
							    MDDB_BSIZE), 0);
							if (err) {
								rip->ri_flags |=
								    err;
								break;
							}
							buffer += MDDB_BSIZE;
						}
						if (err) {
							kmem_free(did_block,
							    dbtob(did_info->
							    info_blkcnt));
							did_block =
							    (caddr_t)NULL;
							cont_flag = 1;
							break;
						}

						/*
						 * Block read in -
						 * alloc Disk Block area
						 */
						did_dbp = (mddb_did_db_t *)
						    kmem_zalloc(
						    sizeof (mddb_did_db_t),
						    KM_SLEEP);
						did_dbp->db_ptr = did_block;
						did_dbp->db_firstblk =
						    did_info->info_firstblk;
						did_dbp->db_blkcnt =
						    did_info->info_blkcnt;

						/* Add to front of dbp list */
						did_dbp->db_next =
						    did_icp->did_ic_dbp;
						did_icp->did_ic_dbp = did_dbp;
					}
					/* Check validity of devid in block */
					if (crcchk(((char *)did_dbp->db_ptr +
					    did_info->info_offset),
					    &did_info->info_checksum,
					    did_info->info_length, NULL)) {
						cont_flag = 1;
						break;
					}

					/* Block now pointed to by did_dbp */
					did_icp->did_ic_devid[li] =
					    (ddi_devid_t)((char *)
					    did_dbp->db_ptr +
					    did_info->info_offset);
				}
			}
			if (cont_flag)
				continue;
		}

		/*
		 * All blocks containing devids are now in core.
		 */

		/*
		 * If we're doing a replicated import (also known as
		 * remote copy import), the device id in the locator
		 * block is incorrect and we need to fix it up here
		 * along with the l_dev otherwise we run into lots of
		 * trouble later on.
		 *
		 * NOTE(review): this block dereferences did_blkp and did_icp
		 * without checking MDDB_DEVID_STYLE; it appears to rely on
		 * replicated-import replicas always being in device id
		 * style - confirm against the import path.
		 */
		if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			mddb_ri_t	*trip;
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];

				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				if (did_icp->did_ic_devid[li] == NULL)
					continue;

				/*
				 * Find the replica(s) whose old device id
				 * matches the one stored for this locator.
				 */
				for (trip = s->s_rip; trip != NULL;
				    trip = trip->ri_next) {
					if (trip->ri_old_devid == NULL)
						continue;
					if (ddi_devid_compare(
					    trip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0) {
						continue;
					}

					/* update l_dev and side mnum */
					lp->l_dev = md_cmpldev(trip->ri_dev);
					lbp->lb_sidelocators[0][li].l_mnum =
					    md_getminor(trip->ri_dev);
				}
			}
		}

		/*
		 * If there is a valid devid, verify that this locator
		 * block has information about itself by checking the
		 * device ID, minor_name and block
		 * number from this replica's incore data structure
		 * against the locator block information that has just
		 * been read in from disk.
		 *
		 * If not a valid devid, verify that this locator block
		 * has information about itself by checking the minor
		 * number, block number and driver name from this
		 * replica's incore data structure against the locator
		 * block information that has just been read in from disk.
		 *
		 * In every variant below, the search loop breaks with
		 * li < lb_loccnt on a match and runs li up to lb_loccnt
		 * when no locator matches; that is tested after the loops.
		 */
		if ((rip->ri_devid != NULL) &&
		    (lbp->lb_flags & MDDB_DEVID_STYLE)) {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.  Check against devid,
			 * slice part of minor number, and block number.
			 */
			for (li = 0; li < lbp->lb_loccnt; li++) {
				did_info = &did_blkp->blk_info[li];
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				if (!(did_info->info_flags & MDDB_DID_EXISTS))
					continue;

				/*
				 * During a replicated import compare against
				 * the old device id when one is available.
				 */
				if (((md_get_setstatus(setno) &
				    MD_SET_REPLICATED_IMPORT)) &&
				    (rip->ri_old_devid != (ddi_devid_t)NULL)) {
					if (ddi_devid_compare(rip->ri_old_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				} else {
					if (ddi_devid_compare(rip->ri_devid,
					    did_icp->did_ic_devid[li]) != 0)
						continue;
				}

				if (strcmp(rip->ri_minor_name,
				    did_info->info_minor_name) != 0)
					continue;

				if (lp->l_blkno == rip->ri_blkno)
					break;
			}
		} else {
			/*
			 * This locator block MUST have locator (replica)
			 * information about itself.
			 */
			if (!mn_set) {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_sidelocator_t	*slp;

					lp = &lbp->lb_locators[li];
					slp = &lbp->
					    lb_sidelocators[s->s_sideno][li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (slp->l_mnum != md_getminor(
					    rip->ri_dev))
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;
					dn = &lbp->lb_drvnm[slp->l_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			} else {
				for (li = 0; li < lbp->lb_loccnt; li++) {
					mddb_drvnm_t		*dn;
					mddb_mnsidelocator_t	*mnslp;
					mddb_mnlb_t		*mnlbp;
					int			i;

					/*
					 * Check all possible locators looking
					 * for match to the currently read-in
					 * locator, must match on:
					 *	- blkno
					 *	- side locator for this
					 *	  node's side
					 *	- side locator minor number
					 *	- side locator driver name
					 */

					/*
					 * Looking at sidelocs:
					 * cast lbp -> mnlbp
					 */
					mnlbp = (mddb_mnlb_t *)lbp;
					lp = &mnlbp->lb_locators[li];
					if (lp->l_flags & MDDB_F_DELETED)
						continue;
					if (lp->l_blkno != rip->ri_blkno)
						continue;

					for (i = 0; i < MD_MNMAXSIDES; i++) {
						mnslp = &mnlbp->
						    lb_mnsidelocators[i][li];
						if (mnslp->mnl_sideno ==
						    s->s_sideno) {
							break;
						}
					}
					/* No matching side found */
					if (i == MD_MNMAXSIDES)
						continue;
					if (mnslp->mnl_mnum !=
					    md_getminor(rip->ri_dev))
						continue;
					dn = &lbp->
					    lb_drvnm[mnslp->mnl_drvnm_index];
					if (strncmp(dn->dn_data,
					    rip->ri_driver, MD_MAXDRVNM) == 0)
						break;
				}
			}
		}

		/*
		 * Didn't find ourself in this locator block it means
		 * the locator block is a stale transplant. Probably from
		 * a user doing a dd.
		 */
		if (li == lbp->lb_loccnt)
			continue;

		/*
		 * Keep track of the number of accessed and valid
		 * locator blocks.
		 */
		lb_ok++;

		/*
		 * Read the tag in, skips invalid or blank tags.
		 * Only valid tags allocate storage
		 * Data tags are not used in MN disksets.
		 */
		if ((!mn_set) && (! dt_read(s, lbp, rip))) {
			/*
			 * Keep track of the number of tagged
			 * locator blocks.
			 */
			lb_tagged++;

			/* Keep a list of unique tags. */
			(void) dtl_addl(s, &rip->ri_dtp->dt_dtag);
		}

		if (!(md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
			/*
			 * go through locator block and add any other
			 * locations of the data base.
			 * For the replicated import case, this was done earlier
			 * and we really don't need or want to do so again
			 */
			cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
			for (li = 0; li < lbp->lb_loccnt; li++) {
				lp = &lbp->lb_locators[li];
				if (lp->l_flags & MDDB_F_DELETED)
					continue;

				/*
				 * locator2cfgloc is called twice: first with
				 * MDDB_DEVID_GETSZ set, then, if it reported
				 * MDDB_DEVID_SZ, again with a buffer of
				 * l_devid_sz bytes and MDDB_DEVID_SPACE set.
				 */
				cl->l_devid_flags = MDDB_DEVID_GETSZ;
				cl->l_devid = (uint64_t)0;
				cl->l_devid_sz = 0;
				cl->l_old_devid = (uint64_t)0;
				cl->l_old_devid_sz = 0;
				cl->l_minor_name[0] = '\0';
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				if (cl->l_devid_flags & MDDB_DEVID_SZ) {
					if ((cl->l_devid = (uintptr_t)kmem_alloc
					    (cl->l_devid_sz, KM_SLEEP))
					    == NULL) {
						continue;
					} else {
						cl->l_devid_flags =
						    MDDB_DEVID_SPACE;
					}
				}
				locator2cfgloc(lbp, cl, li, s->s_sideno,
				    did_icp);

				/* Return value of ridev is ignored here */
				(void) ridev(&s->s_rip, cl, &lp->l_dev, 0);

				if (cl->l_devid_flags & MDDB_DEVID_SPACE)
					kmem_free((caddr_t)(uintptr_t)
					    cl->l_devid, cl->l_devid_sz);
			}
			kmem_free(cl, sizeof (mddb_cfg_loc_t));
		}

		/*
		 * Save LB for later.  The rip now owns lbp (and did_icp
		 * for device id style), so the working pointers are reset
		 * and fresh buffers get allocated for the next replica.
		 */
		rip->ri_lbp = lbp;
		if (lbp->lb_flags & MDDB_DEVID_STYLE) {
			rip->ri_did_icp = did_icp;
			did_icp = (mddb_did_ic_t *)NULL;
			did_blkp = (mddb_did_blk_t *)NULL;
		} else
			rip->ri_did_icp = NULL;
		lbp = (mddb_lb_t *)NULL;
	}

	/* Free whatever working buffers were left over from a rejected replica */
	if (lbp != (mddb_lb_t *)NULL)
		kmem_free((caddr_t)lbp, dbtob(lb_blkcnt));

	if (did_icp != (mddb_did_ic_t *)NULL) {
		if (did_icp->did_ic_blkp != (mddb_did_blk_t *)NULL) {
			kmem_free((caddr_t)did_icp->did_ic_blkp, did_blkp_sz);
			did_blkp = (mddb_did_blk_t *)NULL;
		}
		if (did_icp->did_ic_dbp != (mddb_did_db_t *)NULL) {
			mddb_did_db_t	*did_dbp1, *did_dbp2;

			did_dbp1 = did_icp->did_ic_dbp;
			while (did_dbp1) {
				did_dbp2 = did_dbp1->db_next;
				kmem_free((caddr_t)did_dbp1->db_ptr,
				    dbtob(did_dbp1->db_blkcnt));
				kmem_free((caddr_t)did_dbp1,
				    sizeof (mddb_did_db_t));
				did_dbp1 = did_dbp2;
			}
		}
		kmem_free((caddr_t)did_icp, sizeof (mddb_did_ic_t));
	}

	if (did_blkp != (mddb_did_blk_t *)NULL) {
		kmem_free((caddr_t)did_blkp, did_blkp_sz);
	}

	/* No locator blocks were ok */
	if (lb_ok == 0)
		goto out;

	/* No tagged data was found - will be 0 for MN diskset */
	if (lb_tagged == 0)
		goto out;

	/*
	 * Find the highest non-deleted replica count.
	 * NOTE(review): lb_total is computed here but is not read again
	 * in this function.
	 */
	for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
		int	lb_tot = 0;

		if (rip->ri_mbip == (mddb_mb_ic_t *)NULL)
			continue;

		if (rip->ri_lbp == (mddb_lb_t *)NULL)
			continue;

		for (li = 0; li < rip->ri_lbp->lb_loccnt; li++) {
			lp = &rip->ri_lbp->lb_locators[li];
			if (lp->l_flags & MDDB_F_DELETED)
				continue;
			lb_tot++;
		}

		if (lb_tot > lb_total)
			lb_total = lb_tot;
	}

	/* Count the number of unique tags */
	for (lb_tags = 0, dtlp = s->s_dtlp; dtlp != NULL; dtlp = dtlp->dtl_nx)
		lb_tags++;

	/* Should have at least one tag at this point */
	ASSERT(lb_tags > 0);


	/*
	 * If the number of tagged locators is not the same as the number of
	 * OK locators OR more than one tag exists, then make sure the
	 * selected tag will be written out later.
	 */
	if ((lb_tagged - lb_ok) != 0 || lb_tags > 1)
		md_set_setstatus(setno, MD_SET_TAGDATA);

	/* Only a single tag, take the tagged data */
	if (lb_tags == 1) {
		dt_setup(s, &s->s_dtlp->dtl_dt);
		md_set_setstatus(setno, MD_SET_USETAG);
		goto out;
	}

	/* Multiple tags, not selecting a tag, tag mode is on */
	if (! (md_get_setstatus(setno) & MD_SET_USETAG))
		retval = MDDB_E_TAGDATA;

out:

	return (retval);
}
5490
5491 /*
5492 * 1. Select a locator.
5493 * 2. check if enough locators now have current copies
5494 * 3. read in database from one of latest
5495 * 4. if known to have latest make all database the same
5496 * 5. if configuration has changed rewrite locators
5497 *
5498 * Parameters:
5499 * s - pointer to mddb_set structure
5500 * flag - used in MN disksets to tell if this node is being joined to
5501 * a diskset that is in the STALE state. If the flag is
5502 * MDDB_MN_STALE, then this node should be marked in the STALE
5503 * state even if > 50% mddbs are available. (The diskset can
5504 * only change from STALE->OK if all nodes withdraw from the
5505 * MN diskset and then rejoin).
5506 */
5507 static int
load_old_replicas(mddb_set_t * s,int flag)5508 load_old_replicas(
5509 mddb_set_t *s,
5510 int flag
5511 )
5512 {
5513 mddb_lb_t *lbp = NULL;
5514 mddb_mnlb_t *mnlbp = NULL;
5515 mddb_ri_t *rip;
5516 mddb_locator_t *lp;
5517 mddb_db_t *dbp;
5518 mddb_de_ic_t *dep;
5519 int li;
5520 int alc;
5521 int lc;
5522 int tlc;
5523 int retval = 0;
5524 caddr_t p;
5525 size_t maxrecsize;
5526 set_t setno = s->s_setno;
5527 mddb_did_db_t *did_dbp1;
5528 mddb_did_info_t *did_info;
5529 mddb_did_ic_t *did_icp = NULL;
5530 md_dev64_t *newdev;
5531 mddb_sidelocator_t *slp = 0;
5532 mddb_mnsidelocator_t *mnslp = 0;
5533 uchar_t i;
5534 char *name;
5535 ddi_devid_t ret_devid;
5536 md_dev64_t dev;
5537 uint_t len, sz;
5538 char *minor_name;
5539 int write_lb = 0;
5540 int rval;
5541 int stale_rtn = 0;
5542
5543 /* The only error path out of get_mbs_n_lbs() is MDDB_E_TAGDATA */
5544 if (retval = get_mbs_n_lbs(s, &write_lb))
5545 goto errout;
5546
5547 if ((lbp = s->s_lbp = selectlocator(s)) == NULL) {
5548 retval = MDDB_E_NOLOCBLK;
5549 goto errout;
5550 }
5551
5552 /* If a multi-node set, then set md_set.s_status flag */
5553 if (lbp->lb_flags & MDDB_MNSET) {
5554 md_set_setstatus(setno, MD_SET_MNSET);
5555 /*
5556 * If data tag area had been allocated before set type was
5557 * known - free it now.
5558 */
5559 if (md_set[setno].s_dtp) {
5560 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
5561 md_set[setno].s_dtp = NULL;
5562 }
5563 }
5564
5565 /*
5566 * If the replica is in devid format, setup the devid incore ptr.
5567 */
5568 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5569 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5570 if (rip->ri_lbp == s->s_lbp) {
5571 did_icp = s->s_did_icp = rip->ri_did_icp;
5572 break;
5573 }
5574 }
5575 /*
5576 * If no devid incore info found - something has gone
5577 * wrong so errout.
5578 */
5579 if (rip == NULL) {
5580 retval = MDDB_E_NODEVID;
5581 goto errout;
5582 }
5583
5584 /*
5585 * Add all blocks containing devids to free list.
5586 * Then remove addresses that actually contain devids.
5587 */
5588 did_dbp1 = did_icp->did_ic_dbp;
5589 while (did_dbp1) {
5590 if (mddb_devid_free_add(s, did_dbp1->db_firstblk,
5591 0, dbtob(did_dbp1->db_blkcnt))) {
5592 retval = MDDB_E_NOSPACE;
5593 goto errout;
5594 }
5595
5596 did_dbp1 = did_dbp1->db_next;
5597 }
5598 for (li = 0; li < lbp->lb_loccnt; li++) {
5599 did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5600 if (!(did_info->info_flags & MDDB_DID_EXISTS))
5601 continue;
5602
5603 if (mddb_devid_free_delete(s, did_info->info_firstblk,
5604 did_info->info_offset, did_info->info_length)) {
5605 /* unable to find disk block */
5606 retval = MDDB_E_NODEVID;
5607 goto errout;
5608 }
5609 }
5610 }
5611
5612 /*
5613 * create mddb_mbaray, count all locators and active locators.
5614 */
5615 alc = 0;
5616 lc = 0;
5617 for (li = 0; li < lbp->lb_loccnt; li++) {
5618 ddi_devid_t li_devid;
5619
5620 lp = &lbp->lb_locators[li];
5621
5622 if (lp->l_flags & MDDB_F_DELETED)
5623 continue;
5624
5625 /* Count non-deleted replicas */
5626 lc++;
5627
5628 /*
5629 * Use the devid of this locator to compare with the rip
5630 * list. The scenario to watch out for here is that this
5631 * locator could be on a disk that is dead and there could
5632 * be a valid entry in the rip list for a different disk
5633 * that has been moved to the dead disks dev_t. We don't
5634 * want to match with the moved disk.
5635 */
5636 li_devid = NULL;
5637 (void) mddb_devid_get(s, li, &li_devid, &minor_name);
5638
5639 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5640 if (match_mddb(rip, li_devid, minor_name,
5641 md_expldev(lp->l_dev), lp->l_blkno)) {
5642 break;
5643 }
5644 }
5645 if (rip == NULL) {
5646 /*
5647 * If rip not found, then mark error in master block
5648 * so that no writes are later attempted to this
5649 * replica. rip may not be setup if ridev
5650 * failed due to un-found driver name.
5651 */
5652 lp->l_flags |= MDDB_F_EMASTER;
5653 continue;
5654 }
5655
5656 s->s_mbiarray[li] = rip->ri_mbip;
5657
5658 lp->l_flags &= MDDB_F_ACTIVE;
5659 lp->l_flags |= (int)rip->ri_flags;
5660
5661 if (rip->ri_transplant)
5662 lp->l_flags &= ~MDDB_F_ACTIVE;
5663
5664 if (lp->l_flags & MDDB_F_LOCACC)
5665 alc++;
5666 }
5667
5668 /* Save on a divide - calculate 50% + 1 up front */
5669 tlc = ((lc + 1) / 2);
5670
5671 if (alc > tlc) { /* alc > tlc - OK */
5672 md_clr_setstatus(setno, MD_SET_STALE);
5673 } else if (alc < tlc) { /* alc < tlc - stale */
5674 md_set_setstatus(setno, MD_SET_STALE);
5675 } else if (lc & 1) { /* alc == tlc && odd - OK */
5676 md_clr_setstatus(setno, MD_SET_STALE);
5677 } else { /* alc == tlc && even - ? */
5678 /* Can do an accept, and are */
5679 if (md_get_setstatus(setno) & (MD_SET_ACCOK | MD_SET_ACCEPT)) {
5680 md_clr_setstatus(setno, MD_SET_STALE);
5681 } else { /* possibly has a mediator */
5682 if (mediate(s)) {
5683 md_set_setstatus(setno, MD_SET_STALE);
5684 } else {
5685 md_clr_setstatus(setno, MD_SET_STALE);
5686 }
5687 }
5688
5689 /*
5690 * The mirrored_root_flag allows the sysadmin to decide to
5691 * start the local set in a read/write (non-stale) mode
5692 * when there are only 50% available mddbs on the system and
5693 * when the root file system is on a mirror. This is useful
5694 * in a 2 disk system where 1 disk failure would cause an mddb
5695 * quorum failure and subsequent boot failures since the root
5696 * filesystem would be in a read-only state.
5697 */
5698 if (mirrored_root_flag == 1 && setno == 0 &&
5699 svm_bootpath[0] != 0) {
5700 md_clr_setstatus(setno, MD_SET_STALE);
5701 } else {
5702 if (md_get_setstatus(setno) & MD_SET_STALE) {
5703 /* Allow half mode - CAREFUL! */
5704 if (mddb_allow_half)
5705 md_clr_setstatus(setno, MD_SET_STALE);
5706 }
5707 }
5708
5709 /*
5710 * In a MN diskset,
5711 * - if 50% mddbs are unavailable and this
5712 * has been marked STALE above
5713 * - master node isn't in the STALE state
5714 * - this node isn't the master node (this node
5715 * isn't the first node to join the set)
5716 * then clear the STALE state and set TOOFEW.
5717 *
5718 * If this node is the master node and set was marked STALE,
5719 * then the set stays STALE.
5720 *
5721 * If this node is not the master and this node's state is
5722 * STALE and the master node is not marked STALE,
5723 * then master node must be in the TOOFEW state or the
5724 * master is panic'ing. A MN diskset can only be placed into
5725 * the STALE state by having the first node join the set
5726 * with <= 50% mddbs. There's no way for a MN diskset to
5727 * transition between STALE and not-STALE states unless all
5728 * nodes are withdrawn from the diskset or all nodes in the
5729 * diskset are rebooted at the same time.
5730 *
5731 * So, mark this node's state as TOOFEW instead of STALE.
5732 */
5733 if (((md_get_setstatus(setno) & (MD_SET_MNSET | MD_SET_STALE))
5734 == (MD_SET_MNSET | MD_SET_STALE)) &&
5735 ((flag & MDDB_MN_STALE) == 0) &&
5736 (!(md_set[setno].s_am_i_master))) {
5737 md_clr_setstatus(setno, MD_SET_STALE);
5738 md_set_setstatus(setno, MD_SET_TOOFEW);
5739 }
5740 }
5741
5742 /*
5743 * If a MN set is marked STALE on the other nodes,
5744 * mark it stale here. Override all other considerations
5745 * such as a mediator or > 50% mddbs available.
5746 */
5747 if (md_get_setstatus(setno) & MD_SET_MNSET) {
5748 if (flag & MDDB_MN_STALE)
5749 md_set_setstatus(setno, MD_SET_STALE);
5750 }
5751
5752 /*
5753 * read a good copy of the locator names
5754 * if an error occurs reading what is suppose
5755 * to be a good copy continue looking for another
5756 * good copy
5757 */
5758 s->s_lnp = NULL;
5759 for (li = 0; li < lbp->lb_loccnt; li++) {
5760 lp = &lbp->lb_locators[li];
5761 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5762 (lp->l_flags & MDDB_F_EMASTER))
5763 continue;
5764
5765 /* Find rip entry for this locator if one exists */
5766 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5767 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5768 lp->l_blkno))
5769 break;
5770 }
5771
5772 if (rip == NULL) {
5773 continue;
5774 }
5775
5776 /*
5777 * Use the rip commitcnt since the commitcnt in lbp could
5778 * been cleared by selectlocator. Looking for a replica with
5779 * the same commitcnt as the 'golden' copy in order to
5780 * get the same data.
5781 */
5782 if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5783 continue;
5784 }
5785
5786 /*
5787 * Now have a copy of the database that is equivalent
5788 * to the chosen locator block with respect to
5789 * inittime, identifier and commitcnt. Trying the
5790 * equivalent databases in the order that they were
5791 * written will provide the most up to date data.
5792 */
5793 lp->l_flags |= readlocnames(s, li);
5794 if (s->s_lnp)
5795 break;
5796 }
5797
5798 if (s->s_lnp == NULL) {
5799 retval = MDDB_E_NOLOCNMS;
5800 goto errout;
5801 }
5802
5803 /*
5804 * read a good copy of the data base
5805 * if an error occurs reading what is suppose
5806 * to be a good copy continue looking for another
5807 * good copy
5808 */
5809
5810 s->s_dbp = NULL;
5811 for (li = 0; li < lbp->lb_loccnt; li++) {
5812 lp = &lbp->lb_locators[li];
5813 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
5814 (lp->l_flags & MDDB_F_EMASTER))
5815 continue;
5816
5817 /* Find rip entry for this locator if one exists */
5818 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
5819 if (match_mddb(rip, NULL, NULL, md_expldev(lp->l_dev),
5820 lp->l_blkno))
5821 break;
5822 }
5823
5824 if (rip == NULL) {
5825 continue;
5826 }
5827
5828 /*
5829 * Use the rip commitcnt since the commitcnt in lbp could
5830 * been cleared by selectlocator. Looking for a replica with
5831 * the same commitcnt as the 'golden' copy in order to
5832 * get the same data.
5833 */
5834 if (rip->ri_commitcnt != lbp->lb_commitcnt) {
5835 continue;
5836 }
5837
5838 /*
5839 * Now have a copy of the database that is equivalent
5840 * to the chosen locator block with respect to
5841 * inittime, identifier and commitcnt. Trying the
5842 * equivalent databases in the order that they were
5843 * written will provide the most up to date data.
5844 */
5845 lp->l_flags |= readcopy(s, li);
5846
5847 if (s->s_dbp)
5848 break;
5849 }
5850
5851 if (s->s_dbp == NULL) {
5852 retval = MDDB_E_NODIRBLK;
5853 goto errout;
5854 }
5855
5856 lp->l_flags |= MDDB_F_MASTER;
5857 lp->l_flags |= MDDB_F_UP2DATE;
5858
5859 /*
5860 * go through and find largest record;
5861 * Also fixup the user data area's
5862 */
5863 maxrecsize = MAX(MDDB_BSIZE, s->s_databuffer_size);
5864
5865 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next)
5866 for (dep = dbp->db_firstentry; dep != NULL; dep = dep->de_next)
5867 if (dep->de_flags & MDDB_F_OPT)
5868 getoptrecord(s, dep);
5869 else {
5870 allocuserdata(dep);
5871 maxrecsize = MAX(dep->de_recsize, maxrecsize);
5872 }
5873
5874 if (maxrecsize > s->s_databuffer_size) {
5875 p = (caddr_t)kmem_zalloc(maxrecsize, KM_SLEEP);
5876 if (s->s_databuffer_size)
5877 kmem_free(s->s_databuffer, s->s_databuffer_size);
5878 s->s_databuffer = p;
5879 s->s_databuffer_size = maxrecsize;
5880 }
5881
5882 /* If we can clear the tag data record, do it now. */
5883 /* Data tags not supported on MN sets */
5884 if ((md_get_setstatus(setno) & MD_SET_CLRTAG) &&
5885 (!(md_get_setstatus(setno) & MD_SET_MNSET)))
5886 dt_setup(s, NULL);
5887
5888 /* This will return non-zero if STALE or TOOFEW */
5889 /* This will write out chosen replica image to all replicas */
5890 stale_rtn = selectreplicas(s, MDDB_SCANALL);
5891
5892 if ((md_get_setstatus(setno) & MD_SET_REPLICATED_IMPORT)) {
5893 ddi_devid_t devidptr;
5894
5895 /*
5896 * ignore the return value from selectreplicas because we
5897 * may have a STALE or TOOFEW set in the case of a partial
5898 * replicated diskset. We will fix that up later.
5899 */
5900
5901 lbp = s->s_lbp;
5902 for (li = 0; li < lbp->lb_loccnt; li++) {
5903 did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5904
5905 if (did_info->info_flags & MDDB_DID_EXISTS) {
5906 devidptr = s->s_did_icp->did_ic_devid[li];
5907 lp = &lbp->lb_locators[li];
5908 for (rip = s->s_rip; rip != NULL;
5909 rip = rip->ri_next) {
5910 if (rip->ri_old_devid == 0)
5911 continue;
5912 if (ddi_devid_compare(rip->ri_old_devid,
5913 devidptr) != 0) {
5914 continue;
5915 }
5916 if (update_locatorblock(s,
5917 md_expldev(lp->l_dev),
5918 rip->ri_devid, rip->ri_old_devid)) {
5919 goto errout;
5920 }
5921 }
5922 }
5923 }
5924 } else {
5925 if (stale_rtn)
5926 goto errout;
5927 }
5928
5929 /*
5930 * If the replica is in device id style - validate the device id's,
5931 * if present, in the locator block devid area.
5932 */
5933 newdev = kmem_zalloc(sizeof (md_dev64_t) * MDDB_NLB, KM_SLEEP);
5934 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
5935 for (li = 0; li < lbp->lb_loccnt; li++) {
5936 newdev[li] = 0;
5937 lp = &lbp->lb_locators[li];
5938 if (lp->l_flags & MDDB_F_DELETED)
5939 continue;
5940 did_info = &(did_icp->did_ic_blkp->blk_info[li]);
5941 dev = md_expldev(lp->l_dev);
5942 if (did_info->info_flags & MDDB_DID_EXISTS) {
5943 /* Validate device id on current system */
5944 newdev[li] = dev;
5945 if (mddb_devid_validate(
5946 did_icp->did_ic_devid[li],
5947 &(newdev[li]),
5948 did_info->info_minor_name) == 0) {
5949 /* Set valid flag */
5950 did_info->info_flags |= MDDB_DID_VALID;
5951 } else {
5952 lp->l_flags |= MDDB_F_EMASTER;
5953 }
5954 } else if (!(MD_UPGRADE)) {
5955 /*
5956 * If a device doesn't have a device id,
5957 * check if there is now a device ID
5958 * associated with device. If one exists,
5959 * add it to the locator block devid area.
5960 * If there's not enough space to add it,
5961 * print a warning.
5962 * Don't do this during upgrade.
5963 */
5964 dev_t ddi_dev = md_dev64_to_dev(dev);
5965 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) ==
5966 DDI_SUCCESS) {
5967 if (ddi_lyr_get_minor_name(ddi_dev,
5968 S_IFBLK, &minor_name)
5969 == DDI_SUCCESS) {
5970 if (mddb_devid_add(s, li,
5971 ret_devid, minor_name)) {
5972 cmn_err(CE_WARN,
5973 "Not enough space"
5974 " in metadevice"
5975 " state"
5976 " database\n");
5977 cmn_err(CE_WARN,
5978 "to add relocation"
5979 " information for"
5980 " device:\n");
5981 cmn_err(CE_WARN,
5982 " major = %d, "
5983 " minor = %d\n",
5984 getmajor(ddi_dev),
5985 getminor(ddi_dev));
5986 } else {
5987 write_lb = 1;
5988 }
5989 kmem_free(minor_name,
5990 strlen(minor_name) + 1);
5991 }
5992 ddi_devid_free(ret_devid);
5993 }
5994 }
5995 }
5996
5997 /*
5998 * If a device has a valid device id and if the dev_t
5999 * associated with the device id has changed, update the
6000 * driver name, minor num and dev_t in the local and side
6001 * locators to match the dev_t that the system currently
6002 * associates with the device id.
6003 *
6004 * Don't do this during upgrade.
6005 */
6006 if (!(MD_UPGRADE)) {
6007 for (li = 0; li < lbp->lb_loccnt; li++) {
6008 lp = &lbp->lb_locators[li];
6009 if (lp->l_flags & MDDB_F_DELETED)
6010 continue;
6011 did_info = &(did_icp->did_ic_blkp->blk_info
6012 [li]);
6013 if ((did_info->info_flags & MDDB_DID_VALID) &&
6014 !(did_info->info_flags &
6015 MDDB_DID_UPDATED)) {
6016 if (lbp->lb_flags & MDDB_MNSET) {
6017 int j;
6018 int index = -1;
6019 mnlbp = (mddb_mnlb_t *)lbp;
6020 for (j = 0; j < MD_MNMAXSIDES;
6021 j++) {
6022 mnslp = &mnlbp->
6023 lb_mnsidelocators[j]
6024 [li];
6025 if (mnslp->mnl_sideno ==
6026 s->s_sideno)
6027 break;
6028 if (mnslp->mnl_sideno ==
6029 0)
6030 index = j;
6031 }
6032 if (j == MD_MNMAXSIDES) {
6033 /*
6034 * No match found; take
6035 * empty
6036 */
6037 mnslp = &mnlbp->
6038 lb_mnsidelocators
6039 [index][li];
6040 write_lb = 1;
6041 mnslp->mnl_mnum =
6042 md_getminor(newdev
6043 [li]);
6044 } else if (mnslp->mnl_mnum !=
6045 md_getminor(newdev[li])) {
6046 write_lb = 1;
6047 mnslp->mnl_mnum =
6048 md_getminor(newdev
6049 [li]);
6050 }
6051 } else {
6052 slp = &lbp->
6053 lb_sidelocators[s->s_sideno]
6054 [li];
6055 if (slp->l_mnum !=
6056 md_getminor(newdev[li])) {
6057 write_lb = 1;
6058 slp->l_mnum =
6059 md_getminor(newdev
6060 [li]);
6061 }
6062 }
6063 name = ddi_major_to_name(md_getmajor(
6064 newdev[li]));
6065 if (lbp->lb_flags & MDDB_MNSET)
6066 i = mnslp->mnl_drvnm_index;
6067 else
6068 i = slp->l_drvnm_index;
6069 if (strncmp(lbp->lb_drvnm[i].dn_data,
6070 name, lbp->lb_drvnm[i].dn_len) !=
6071 0) {
6072 /* Driver name has changed */
6073 len = strlen(name);
6074 /* Look for the driver name */
6075 for (i = 0; i < MDDB_DRVNMCNT;
6076 i++) {
6077 if (lbp->lb_drvnm[i].
6078 dn_len != len)
6079 continue;
6080 if (strncmp(lbp->
6081 lb_drvnm[i].dn_data,
6082 name, len) == 0)
6083 break;
6084 }
6085 /* Didn't find one, add it */
6086 if (i == MDDB_DRVNMCNT) {
6087 for (i = 0; i <
6088 MDDB_DRVNMCNT;
6089 i++) {
6090 if (lbp->
6091 lb_drvnm[i].
6092 dn_len == 0)
6093 break;
6094 }
6095 if (i ==
6096 MDDB_DRVNMCNT) {
6097 cmn_err(CE_WARN,
6098 "Unable to "
6099 " update "
6100 "driver "
6101 " name for "
6102 "dev: "
6103 "major = %d"
6104 ", minor = "
6105 "%d\n",
6106 md_getmajor(
6107 newdev[li]),
6108 md_getminor(
6109 newdev
6110 [li]));
6111 continue;
6112 }
6113 (void) strncpy(lbp->
6114 lb_drvnm[i].dn_data,
6115 name, MD_MAXDRVNM);
6116 lbp->lb_drvnm[i].
6117 dn_len = (uchar_t)
6118 strlen(name);
6119 }
6120 /* Fill in the drvnm index */
6121 if (lbp->lb_flags &
6122 MDDB_MNSET)
6123 mnslp->mnl_drvnm_index =
6124 i;
6125 else
6126 slp->l_drvnm_index = i;
6127 write_lb = 1;
6128 }
6129 did_info->info_flags |=
6130 MDDB_DID_UPDATED;
6131 }
6132 }
6133 }
6134 }
6135 kmem_free(newdev, sizeof (md_dev64_t) * MDDB_NLB);
6136
6137 /*
6138 * If locator block has been changed by get_mbs_n_lbs,
6139 * by addition of new device id, by updated minor name or
6140 * by updated driver name - write out locator block.
6141 */
6142 if (write_lb) {
6143 rval = push_lb(s);
6144 (void) upd_med(s, "load_old_replicas(0)");
6145 if (rval)
6146 goto errout;
6147 }
6148
6149 /*
6150 * If the tag was moved, allocated, or a BADTAG was seen for some other
6151 * reason, then make sure tags are written to all the replicas.
6152 * Data tags not supported on MN sets.
6153 */
6154 if (!(md_get_setstatus(setno) & MD_SET_MNSET)) {
6155 if (! (lc = dt_alloc_if_needed(s))) {
6156 for (li = 0; li < lbp->lb_loccnt; li++) {
6157 lp = &lbp->lb_locators[li];
6158
6159 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
6160 (lp->l_flags & MDDB_F_EMASTER))
6161 continue;
6162
6163 if (lp->l_flags & MDDB_F_BADTAG) {
6164 lc = 1;
6165 break;
6166 }
6167 }
6168 }
6169
6170 if (lc) {
6171 md_set_setstatus(setno, MD_SET_TAGDATA);
6172 md_clr_setstatus(setno, MD_SET_BADTAG);
6173 (void) selectreplicas(s, MDDB_SCANALL);
6174 }
6175 }
6176
6177 errout:
6178
6179 /* Free extraneous rip components. */
6180 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
6181 /* Get rid of lbp's and dtp's */
6182
6183 if (rip->ri_lbp != lbp) {
6184 if (rip->ri_dtp != (mddb_dt_t *)NULL) {
6185 kmem_free((caddr_t)rip->ri_dtp, MDDB_DT_BYTES);
6186 rip->ri_dtp = (mddb_dt_t *)NULL;
6187 }
6188
6189 if (rip->ri_devid != (ddi_devid_t)NULL) {
6190 sz = (int)ddi_devid_sizeof(rip->ri_devid);
6191 kmem_free((caddr_t)rip->ri_devid, sz);
6192 rip->ri_devid = (ddi_devid_t)NULL;
6193 }
6194 if (rip->ri_old_devid != (ddi_devid_t)NULL) {
6195 sz = (int)ddi_devid_sizeof(rip->ri_old_devid);
6196 kmem_free((caddr_t)rip->ri_old_devid, sz);
6197 rip->ri_old_devid = (ddi_devid_t)NULL;
6198 }
6199
6200 if (rip->ri_lbp != (mddb_lb_t *)NULL) {
6201 mddb_devid_icp_free(&rip->ri_did_icp,
6202 rip->ri_lbp);
6203
6204 kmem_free((caddr_t)rip->ri_lbp,
6205 dbtob(rip->ri_lbp->lb_blkcnt));
6206 rip->ri_lbp = (mddb_lb_t *)NULL;
6207 }
6208 }
6209
6210 if (lbp != NULL) {
6211 for (li = 0; li < lbp->lb_loccnt; li++) {
6212 lp = &lbp->lb_locators[li];
6213 if (lp->l_flags & MDDB_F_DELETED)
6214 continue;
6215 if (rip->ri_dev == md_expldev(lp->l_dev) &&
6216 rip->ri_blkno == lp->l_blkno)
6217 break;
6218 }
6219 if (li < lbp->lb_loccnt)
6220 continue;
6221 }
6222
6223 /*
6224 * Get rid of mbp's:
6225 * if lbp, those out of lb_loccnt bounds
6226 * if !lbp, all of them.
6227 */
6228 if (rip->ri_mbip) {
6229 md_dev64_t dev64 = md_xlate_targ_2_mini(rip->ri_dev);
6230 if (dev64 != NODEV64)
6231 mddb_devclose(dev64);
6232
6233 free_mbipp(&rip->ri_mbip);
6234 }
6235 /*
6236 * Turn off MDDB_F_EMASTER flag in a diskset since diskset
6237 * code always ends up calling ridev for all replicas
6238 * before calling load_old_replicas. ridev will reset
6239 * MDDB_F_EMASTER flag if flag was due to unresolved devid.
6240 */
6241 if (setno != MD_LOCAL_SET)
6242 rip->ri_flags &= ~MDDB_F_EMASTER;
6243 }
6244 return (retval);
6245 }
6246
6247 /*
6248 * Given the devt from the md.conf info, get the devid for the device.
6249 */
6250 static void
lookup_db_devid(mddb_cfg_loc_t * cl)6251 lookup_db_devid(mddb_cfg_loc_t *cl)
6252 {
6253 dev_t ldev;
6254 ddi_devid_t devid;
6255 char *minor;
6256
6257 if (ddi_name_to_major(cl->l_driver) == (major_t)-1) {
6258 cmn_err(CE_NOTE, "mddb: unknown major name '%s'", cl->l_driver);
6259 return;
6260 }
6261
6262 ldev = makedevice(ddi_name_to_major(cl->l_driver), cl->l_mnum);
6263 if (ddi_lyr_get_devid(ldev, &devid) != DDI_SUCCESS) {
6264 cmn_err(CE_NOTE, "mddb: unable to get devid for '%s', 0x%x",
6265 cl->l_driver, cl->l_mnum);
6266 return;
6267 }
6268
6269 if (ddi_lyr_get_minor_name(ldev, S_IFBLK, &minor) != DDI_SUCCESS) {
6270 cmn_err(CE_NOTE, "mddb: unable to get minor name 0x%x",
6271 cl->l_mnum);
6272 return;
6273 }
6274
6275 cl->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID | MDDB_DEVID_SZ;
6276 cl->l_devid_sz = (int)ddi_devid_sizeof(devid);
6277 cl->l_devid = (uint64_t)(uintptr_t)devid;
6278 (void) strlcpy(cl->l_minor_name, minor, MDDB_MINOR_NAME_MAX);
6279
6280 kmem_free(minor, strlen(minor) + 1);
6281 }
6282
6283 /*
6284 * grab driver name, minor, block and devid out of
6285 * strings like "driver:minor:block:devid"
6286 */
6287 static int
parse_db_loc(char * str,mddb_cfg_loc_t * clp)6288 parse_db_loc(
6289 char *str,
6290 mddb_cfg_loc_t *clp
6291 )
6292 {
6293 char *p, *e;
6294 char *minor_name;
6295 ddi_devid_t ret_devid;
6296
6297 clp->l_dev = 0;
6298 p = clp->l_driver;
6299 e = p + sizeof (clp->l_driver) - 1;
6300 while ((*str != ':') && (*str != '\0') && (p < e))
6301 *p++ = *str++;
6302 *p = '\0';
6303 if (*str++ != ':')
6304 return (-1);
6305 clp->l_mnum = 0;
6306 while (ISNUM(*str)) {
6307 clp->l_mnum *= 10;
6308 clp->l_mnum += *str++ - '0';
6309 }
6310 if (*str++ != ':')
6311 return (-1);
6312 clp->l_blkno = 0;
6313 while (ISNUM(*str)) {
6314 clp->l_blkno *= 10;
6315 clp->l_blkno += *str++ - '0';
6316 }
6317 if (*str++ != ':')
6318 return (-1);
6319
6320 /*
6321 * If the md_devid_destroy flag is set, ignore the device ids.
6322 * This is only to used in a catastrophic failure case. Examples
6323 * would be where the device id of all drives in the system
6324 * (especially the mirror'd root drives) had been changed
6325 * by firmware upgrade or by a patch to an existing disk
6326 * driver. Another example would be in the case of non-unique
6327 * device ids due to a bug. The device id would be valid on
6328 * the system, but would return the wrong dev_t.
6329 */
6330 if (md_devid_destroy) {
6331 clp->l_devid_flags = 0;
6332 clp->l_devid = (uint64_t)NULL;
6333 clp->l_devid_sz = 0;
6334 clp->l_old_devid = (uint64_t)NULL;
6335 clp->l_old_devid_sz = 0;
6336 clp->l_minor_name[0] = '\0';
6337 return (0);
6338 }
6339
6340 if (ddi_devid_str_decode(str,
6341 (ddi_devid_t *)&ret_devid, &minor_name) == DDI_FAILURE)
6342 return (-1);
6343
6344 clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
6345 clp->l_devid_flags = 0;
6346 clp->l_old_devid = (uint64_t)NULL;
6347 clp->l_old_devid_sz = 0;
6348
6349 /* If no device id associated with device, just return */
6350 if ((ddi_devid_t)(uintptr_t)clp->l_devid == (ddi_devid_t)NULL) {
6351 clp->l_devid_sz = 0;
6352 clp->l_minor_name[0] = '\0';
6353 if (strcmp(str, "id0") == 0 && md_devid_destroy == 0 &&
6354 md_keep_repl_state == 0) {
6355 /*
6356 * No devid in md.conf; we're in recovery mode so
6357 * lookup the devid for the device as specified by
6358 * the devt in md.conf.
6359 */
6360 lookup_db_devid(clp);
6361 }
6362 return (0);
6363 }
6364
6365 clp->l_devid_flags = MDDB_DEVID_SPACE | MDDB_DEVID_VALID |
6366 MDDB_DEVID_SZ;
6367 clp->l_devid_sz = (int)ddi_devid_sizeof(
6368 (ddi_devid_t)(uintptr_t)clp->l_devid);
6369 (void) strcpy(clp->l_minor_name, minor_name);
6370 kmem_free(minor_name, strlen(minor_name) + 1);
6371
6372 return (0);
6373 }
6374
6375 /*
6376 * grab driver name, minor, and block out of
6377 * strings like "driver:minor:block:devid driver:minor:block:devid ..."
6378 */
6379 static void
parse_db_string(char * str)6380 parse_db_string(
6381 char *str
6382 )
6383 {
6384 char *p, *e;
6385 mddb_cfg_loc_t *cl;
6386 char restore_space;
6387
6388 /* CSTYLED */
6389 cl = kmem_zalloc(sizeof (mddb_cfg_loc_t), KM_SLEEP);
6390 for (p = str; (*p != '\0'); ) {
6391 for (; ((*p != '\0') && (ISWHITE(*p))); ++p)
6392 ;
6393 if (*p == '\0')
6394 break;
6395 for (e = p; ((*e != '\0') && (! ISWHITE(*e))); ++e)
6396 ;
6397 /*
6398 * Only give parse_db_loc 1 entry, so stuff a null into
6399 * the string if we're not at the end. We need to save this
6400 * char and restore it after call.
6401 */
6402 restore_space = '\0';
6403 if (*e != '\0') {
6404 restore_space = *e;
6405 *e = '\0';
6406 }
6407 if (parse_db_loc(p, cl) != 0) {
6408 cmn_err(CE_NOTE, "mddb: parsing error on '%s'", p);
6409 } else {
6410 (void) ridev(
6411 &((mddb_set_t *)md_set[MD_LOCAL_SET].s_db)->s_rip,
6412 cl, NULL, MDDB_F_PTCHED);
6413 if (cl->l_devid_flags & MDDB_DEVID_SPACE) {
6414 kmem_free((caddr_t)(uintptr_t)cl->l_devid,
6415 cl->l_devid_sz);
6416 }
6417 }
6418 if (restore_space != '\0') {
6419 *e = restore_space;
6420 }
6421 p = e;
6422 }
6423 kmem_free(cl, sizeof (mddb_cfg_loc_t));
6424 }
6425
6426 /*
6427 * grab database locations supplied by md.conf as properties
6428 */
6429 static void
parse_db_strings(void)6430 parse_db_strings(void)
6431 {
6432 int bootlist_id;
6433 int proplen;
6434 /*
6435 * size of _bootlist_name should match uses of line and entry in
6436 * libmeta meta_systemfile_append_mddb routine (meta_systemfile.c)
6437 */
6438 char _bootlist_name[MDDB_BOOTLIST_MAX_LEN];
6439 char *bootlist_name;
6440 caddr_t prop;
6441
6442 /*
6443 * Step through the bootlist properties one at a time by forming the
6444 * correct name, fetching the property, parsing the property and
6445 * then freeing the memory. If a property does not exist or returns
6446 * some form of error just ignore it. There is no guarantee that
6447 * the properties will always exist in sequence, for example
6448 * mddb_bootlist1 may exist and mddb_bootlist2 may not exist with
6449 * mddb_bootlist3 existing.
6450 */
6451 bootlist_name = &_bootlist_name[0];
6452 for (bootlist_id = 0; bootlist_id < md_maxbootlist; bootlist_id++) {
6453
6454 proplen = 0;
6455 (void) sprintf(bootlist_name, "mddb_bootlist%d", bootlist_id);
6456
6457 if (ddi_getlongprop(DDI_DEV_T_ANY, md_devinfo,
6458 DDI_PROP_CANSLEEP, bootlist_name, (caddr_t)&prop,
6459 &proplen) != DDI_PROP_SUCCESS)
6460 continue;
6461
6462 if (proplen <= 0)
6463 continue;
6464
6465 if (md_init_debug)
6466 cmn_err(CE_NOTE, "%s is %s", bootlist_name, prop);
6467
6468 parse_db_string(prop);
6469 kmem_free(prop, proplen);
6470 }
6471 }
6472
6473 static int
initit(set_t setno,int flag)6474 initit(
6475 set_t setno,
6476 int flag
6477 )
6478 {
6479 int i;
6480 mddb_set_t *s;
6481 mddb_lb_t *lbp; /* pointer to locator block */
6482 mddb_ln_t *lnp; /* pointer to locator names */
6483 mddb_db_t *dbp; /* pointer to directory block */
6484 mddb_did_blk_t *did_blkp; /* pointer to Device ID block */
6485 mddb_did_ic_t *did_icp; /* pointer to Device ID incore area */
6486 mddb_bf_t *bfp;
6487 side_t sideno;
6488 side_t maxsides;
6489 mddb_block_t lb_blkcnt;
6490 int retval = 0;
6491 md_dev64_t dev;
6492 mddb_mnlb_t *mnlbp;
6493 int devid_flag;
6494
6495 /* single thread's all loads/unloads of set's */
6496 mutex_enter(&mddb_lock);
6497 mutex_enter(SETMUTEX(setno));
6498
6499 if (((mddb_set_t *)md_set[setno].s_db) == NULL) {
6500 mutex_exit(SETMUTEX(setno));
6501 mutex_exit(&mddb_lock);
6502 return (MDDB_E_NOTNOW);
6503 }
6504
6505 s = (mddb_set_t *)md_set[setno].s_db;
6506
6507 single_thread_start(s);
6508
6509 /*
6510 * init is already underway, block. Return success.
6511 */
6512 if (s->s_lbp) {
6513 single_thread_end(s);
6514 mutex_exit(SETMUTEX(setno));
6515 mutex_exit(&mddb_lock);
6516 return (0);
6517 }
6518
6519 uniqtime32(&s->s_inittime);
6520
6521 /* grab database locations patched by /etc/system */
6522 if (setno == MD_LOCAL_SET)
6523 parse_db_strings();
6524
6525 s->s_mbiarray = (mddb_mb_ic_t **)kmem_zalloc(
6526 sizeof (mddb_mb_ic_t *) * mddb_maxcopies, KM_SLEEP);
6527
6528 s->s_zombie = 0;
6529 s->s_staledeletes = 0;
6530 s->s_optcmtcnt = 0;
6531 s->s_opthavelck = 0;
6532 s->s_optwantlck = 0;
6533 s->s_optwaiterr = 0;
6534 s->s_opthungerr = 0;
6535
6536 /*
6537 * KEEPTAG can never be set for a MN diskset since no tags are
6538 * allowed to be stored in a MN diskset. No way to check
6539 * if this is a MN diskset or not at this point since the mddb
6540 * hasn't been read in from disk yet. (flag will only have
6541 * MUTLINODE bit set if a new set is being created.)
6542 */
6543 if (! (md_get_setstatus(s->s_setno) & MD_SET_KEEPTAG))
6544 dt_setup(s, NULL);
6545
6546 md_clr_setstatus(s->s_setno, MD_SET_TOOFEW);
6547
6548 for (i = 0; i < mddb_maxbufheaders; i++) {
6549 bfp = (mddb_bf_t *)kmem_zalloc(sizeof (*bfp), KM_SLEEP);
6550 sema_init(&bfp->bf_buf.b_io, 0, NULL,
6551 SEMA_DEFAULT, NULL);
6552 sema_init(&bfp->bf_buf.b_sem, 0, NULL,
6553 SEMA_DEFAULT, NULL);
6554 bfp->bf_buf.b_offset = -1;
6555 freebuffer(s, bfp);
6556 }
6557
6558 retval = load_old_replicas(s, flag);
6559 /* If 0 return value - success */
6560 if (! retval) {
6561 single_thread_end(s);
6562 mutex_exit(SETMUTEX(setno));
6563 mutex_exit(&mddb_lock);
6564 return (0);
6565 }
6566
6567 /*
6568 * If here, then the load_old_replicas() failed
6569 */
6570
6571
6572 /* If the database was supposed to exist. */
6573 if (flag & MDDB_MUSTEXIST) {
6574 if (s->s_mbiarray != (mddb_mb_ic_t **)NULL) {
6575 for (i = 0; i < mddb_maxcopies; i++) {
6576 if (! s->s_mbiarray[i])
6577 continue;
6578 dev = md_expldev(
6579 s->s_lbp->lb_locators[i].l_dev);
6580 dev = md_xlate_targ_2_mini(dev);
6581 if (dev != NODEV64)
6582 mddb_devclose(dev);
6583
6584 free_mbipp(&s->s_mbiarray[i]);
6585 }
6586
6587 kmem_free((caddr_t)s->s_mbiarray,
6588 sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
6589 s->s_mbiarray = NULL;
6590 }
6591
6592 if (s->s_lnp != (mddb_ln_t *)NULL) {
6593 kmem_free((caddr_t)s->s_lnp,
6594 dbtob(s->s_lbp->lb_lnblkcnt));
6595 s->s_lnp = (mddb_ln_t *)NULL;
6596 }
6597
6598 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
6599
6600 if (s->s_lbp != (mddb_lb_t *)NULL) {
6601 kmem_free((caddr_t)s->s_lbp,
6602 dbtob(s->s_lbp->lb_blkcnt));
6603 s->s_lbp = (mddb_lb_t *)NULL;
6604 }
6605
6606 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
6607 kmem_free((caddr_t)bfp, sizeof (*bfp));
6608
6609 single_thread_end(s);
6610 mutex_exit(SETMUTEX(setno));
6611 mutex_exit(&mddb_lock);
6612
6613 if (retval == MDDB_E_TAGDATA)
6614 return (retval);
6615
6616 /* Want a bit more detailed error messages */
6617 if (mddb_db_err_detail)
6618 return (retval);
6619
6620 return (MDDB_E_NODB);
6621 }
6622
6623
6624 /*
6625 * MDDB_NOOLDOK set - Creating a new database, so do
6626 * more initialization.
6627 */
6628
6629 lb_blkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6630 MDDB_LOCAL_LBCNT : MDDB_LBCNT);
6631 if (flag & MDDB_MULTINODE) {
6632 lb_blkcnt = MDDB_MNLBCNT;
6633 }
6634
6635 if (s->s_lbp == NULL)
6636 s->s_lbp = (mddb_lb_t *)kmem_alloc(dbtob(lb_blkcnt), KM_SLEEP);
6637 lbp = s->s_lbp;
6638
6639 bzero((caddr_t)lbp, dbtob(lb_blkcnt));
6640 lbp->lb_setno = setno;
6641 lbp->lb_magic = MDDB_MAGIC_LB;
6642 if (flag & MDDB_MULTINODE) {
6643 lbp->lb_revision = MDDB_REV_MNLB;
6644 } else {
6645 lbp->lb_revision = MDDB_REV_LB;
6646 }
6647 lbp->lb_inittime = s->s_inittime;
6648 if (flag & MDDB_MULTINODE) {
6649 mnlbp = (mddb_mnlb_t *)lbp;
6650 for (i = 0; i < MDDB_NLB; i++) {
6651 for (sideno = 0; sideno < MD_MNMAXSIDES; sideno++) {
6652 mddb_mnsidelocator_t *mnslp;
6653 mnslp = &mnlbp->lb_mnsidelocators[sideno][i];
6654 mnslp->mnl_mnum = NODEV32;
6655 mnslp->mnl_sideno = 0;
6656 mnslp->mnl_drvnm_index = 0;
6657 }
6658 }
6659 } else {
6660 maxsides = ((setno == MD_LOCAL_SET) ? 1 : MD_MAXSIDES);
6661 for (i = 0; i < MDDB_NLB; i++) {
6662 for (sideno = 0; sideno < maxsides; sideno++) {
6663 mddb_sidelocator_t *slp;
6664 slp = &lbp->lb_sidelocators[sideno][i];
6665 slp->l_mnum = NODEV32;
6666 }
6667 }
6668 }
6669 lbp->lb_blkcnt = lb_blkcnt;
6670
6671 /* lb starts on block 0 */
6672 /* locator names starts after locator block */
6673 lbp->lb_lnfirstblk = lb_blkcnt;
6674 if (flag & MDDB_MULTINODE) {
6675 lbp->lb_lnblkcnt = (mddb_block_t)MDDB_MNLNCNT;
6676 } else {
6677 lbp->lb_lnblkcnt = (mddb_block_t)((setno == MD_LOCAL_SET) ?
6678 MDDB_LOCAL_LNCNT : MDDB_LNCNT);
6679 }
6680
6681 if (flag & MDDB_MULTINODE) {
6682 /* Creating a multinode diskset */
6683 md_set_setstatus(setno, MD_SET_MNSET);
6684 lbp->lb_flags |= MDDB_MNSET;
6685 }
6686
6687 /* Data portion of mddb located after locator names */
6688 lbp->lb_dbfirstblk = lbp->lb_lnfirstblk + lbp->lb_lnblkcnt;
6689
6690 /* the btodb that follows is converting the directory block size */
6691 /* Data tag part of mddb located after first block of mddb data */
6692 lbp->lb_dtfirstblk = (mddb_block_t)(lbp->lb_dbfirstblk +
6693 btodb(MDDB_BSIZE));
6694 /* Data tags are not used in MN diskset - so set count to 0 */
6695 if (flag & MDDB_MULTINODE)
6696 lbp->lb_dtblkcnt = (mddb_block_t)0;
6697 else
6698 lbp->lb_dtblkcnt = (mddb_block_t)MDDB_DT_BLOCKS;
6699
6700
6701 lnp = (mddb_ln_t *)kmem_zalloc(dbtob(lbp->lb_lnblkcnt), KM_SLEEP);
6702 lnp->ln_magic = MDDB_MAGIC_LN;
6703 if (flag & MDDB_MULTINODE) {
6704 lnp->ln_revision = MDDB_REV_MNLN;
6705 } else {
6706 lnp->ln_revision = MDDB_REV_LN;
6707 }
6708 s->s_lnp = lnp;
6709
6710 /*
6711 * Set up Device ID portion of Locator Block.
6712 * Do not set locator to device id style if
6713 * md_devid_destroy is 1 and md_keep_repl_state is 1
6714 * (destroy all device id data and keep replica in
6715 * non device id mode).
6716 *
6717 * This is logically equivalent to set locator to
6718 * device id style if md_devid_destroy is 0 or
6719 * md_keep_repl_state is 0.
6720 *
6721 * In SunCluster environment, device id mode is disabled
6722 * which means diskset will be run in non-devid mode. For
6723 * localset, the behavior will remain intact and run in
6724 * device id mode.
6725 *
6726 * In multinode diskset devids are turned off.
6727 */
6728 devid_flag = 1;
6729 if (cluster_bootflags & CLUSTER_CONFIGURED)
6730 if (setno != MD_LOCAL_SET)
6731 devid_flag = 0;
6732 if (flag & MDDB_MULTINODE)
6733 devid_flag = 0;
6734 if ((md_devid_destroy == 1) && (md_keep_repl_state == 1))
6735 devid_flag = 0;
6736 /*
6737 * if we weren't devid style before and md_keep_repl_state=1
6738 * we need to stay non-devid
6739 */
6740 if (((lbp->lb_flags & MDDB_DEVID_STYLE) == 0) &&
6741 (md_keep_repl_state == 1))
6742 devid_flag = 0;
6743 if (devid_flag) {
6744 lbp->lb_didfirstblk = lbp->lb_dtfirstblk +
6745 lbp->lb_dtblkcnt;
6746 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
6747 lbp->lb_flags |= MDDB_DEVID_STYLE;
6748
6749 did_icp = (mddb_did_ic_t *)kmem_zalloc
6750 (sizeof (mddb_did_ic_t), KM_SLEEP);
6751 did_blkp = (mddb_did_blk_t *)
6752 kmem_zalloc(dbtob(lbp->lb_didblkcnt), KM_SLEEP);
6753 did_blkp->blk_magic = MDDB_MAGIC_DI;
6754 did_blkp->blk_revision = MDDB_REV_DI;
6755 did_icp->did_ic_blkp = did_blkp;
6756 s->s_did_icp = did_icp;
6757 }
6758
6759 setidentifier(s, &lbp->lb_ident);
6760 uniqtime32(&lbp->lb_timestamp);
6761 dbp = (mddb_db_t *)kmem_zalloc(sizeof (mddb_db_t), KM_SLEEP);
6762 dbp->db_magic = MDDB_MAGIC_DB;
6763 dbp->db_revision = MDDB_REV_DB;
6764 uniqtime32(&dbp->db_timestamp);
6765 dbp->db_nextblk = 0;
6766 dbp->db_firstentry = NULL;
6767 dbp->db_blknum = lbp->lb_dbfirstblk;
6768 dbp->db_recsum = MDDB_GLOBAL_XOR;
6769 s->s_dbp = dbp;
6770 single_thread_end(s);
6771 mutex_exit(SETMUTEX(setno));
6772 mutex_exit(&mddb_lock);
6773 return (0);
6774 }
6775
6776 mddb_set_t *
mddb_setenter(set_t setno,int flag,int * errorcodep)6777 mddb_setenter(
6778 set_t setno,
6779 int flag,
6780 int *errorcodep
6781 )
6782 {
6783 mddb_set_t *s;
6784 int err = 0;
6785 size_t sz = sizeof (void *) * MD_MAXUNITS;
6786
6787 mutex_enter(SETMUTEX(setno));
6788 if (! md_set[setno].s_db) {
6789 mutex_exit(SETMUTEX(setno));
6790 if (errorcodep != NULL)
6791 *errorcodep = MDDB_E_NOTOWNER;
6792 return (NULL);
6793 }
6794
6795 /* Allocate s_un and s_ui arrays if not already present. */
6796 if (md_set[setno].s_un == NULL) {
6797 md_set[setno].s_un = kmem_zalloc(sz, KM_NOSLEEP);
6798 if (md_set[setno].s_un == NULL) {
6799 mutex_exit(SETMUTEX(setno));
6800 if (errorcodep != NULL)
6801 *errorcodep = MDDB_E_NOTOWNER;
6802 return (NULL);
6803 }
6804 }
6805 if (md_set[setno].s_ui == NULL) {
6806 md_set[setno].s_ui = kmem_zalloc(sz, KM_NOSLEEP);
6807 if (md_set[setno].s_ui == NULL) {
6808 mutex_exit(&md_set[setno].s_dbmx);
6809 kmem_free(md_set[setno].s_un, sz);
6810 md_set[setno].s_un = NULL;
6811 if (errorcodep != NULL)
6812 *errorcodep = MDDB_E_NOTOWNER;
6813 return (NULL);
6814 }
6815 }
6816 s = (mddb_set_t *)md_set[setno].s_db;
6817 if (s->s_lbp)
6818 return (s);
6819
6820 if (flag & MDDB_NOINIT)
6821 return (s);
6822
6823 /*
6824 * Release the set mutex - it will be acquired and released in
6825 * initit after acquiring the mddb_lock. This is done to assure
6826 * that mutexes are always acquired in the same order to prevent
6827 * possible deadlock
6828 */
6829 mutex_exit(SETMUTEX(setno));
6830
6831 if ((err = initit(setno, flag)) != 0) {
6832 if (errorcodep != NULL)
6833 *errorcodep = err;
6834 return (NULL);
6835 }
6836
6837 mutex_enter(SETMUTEX(setno));
6838 return ((mddb_set_t *)md_set[setno].s_db);
6839 }
6840
6841 /*
6842 * Release the set lock for a given set.
6843 *
6844 * In a MN diskset, this routine may send messages to the rpc.mdcommd
6845 * in order to have the slave nodes re-parse parts of the mddb.
6846 * Messages are only sent if the global ioctl lock is not held.
6847 *
6848 * With the introduction of multi-threaded ioctls, there is no way
6849 * to determine which thread(s) are holding the ioctl lock. So, if
6850 * the ioctl lock is held (by process X) process X will send the
6851 * messages to the slave nodes when process X releases the ioctl lock.
6852 */
6853 void
mddb_setexit(mddb_set_t * s)6854 mddb_setexit(
6855 mddb_set_t *s
6856 )
6857 {
6858 md_mn_msg_mddb_parse_t *mddb_parse_msg;
6859 md_mn_kresult_t *kresult;
6860 mddb_lb_t *lbp = s->s_lbp;
6861 int i;
6862 int rval = 1;
6863
6864 /*
6865 * If not a MN diskset OR
6866 * a MN diskset but this node isn't master,
6867 * then release the mutex.
6868 */
6869 if (!(MD_MNSET_SETNO(s->s_setno)) ||
6870 ((MD_MNSET_SETNO(s->s_setno)) &&
6871 (!md_set[s->s_setno].s_am_i_master))) {
6872 mutex_exit(SETMUTEX(s->s_setno));
6873 return;
6874 }
6875
6876 /*
6877 * If global ioctl lock is held, then send no messages,
6878 * just release mutex and return.
6879 *
6880 */
6881 if (md_status & MD_GBL_IOCTL_LOCK) {
6882 mutex_exit(SETMUTEX(s->s_setno));
6883 return;
6884 }
6885
6886 /*
6887 * This thread is not holding the ioctl lock, so drop the set
6888 * lock, send messages to slave nodes to reparse portions
6889 * of the mddb and return.
6890 *
6891 * If the block parse flag is set, do not send parse messages.
6892 * This flag is set when master is adding a new mddb that would
6893 * cause parse messages to be sent to the slaves, but the slaves
6894 * don't have knowledge of the new mddb yet since the mddb add
6895 * operation hasn't been run on the slave nodes yet. When the
6896 * master unblocks the parse flag, the parse messages will be
6897 * generated.
6898 *
6899 * If s_mn_parseflags_sending is non-zero, then another thread
6900 * is already currently sending a parse message, so just release
6901 * the mutex and return. If an mddb change occurred that results
6902 * in a parse message to be generated, the thread that is currently
6903 * sending a parse message would generate the additional parse message.
6904 *
6905 * If s_mn_parseflags_sending is zero and parsing is not blocked,
6906 * then loop until s_mn_parseflags is 0 (until there are no more
6907 * messages to send).
6908 * While s_mn_parseflags is non-zero,
6909 * put snapshot of parse_flags in s_mn_parseflags_sending
6910 * set s_mn_parseflags to zero
6911 * release mutex
6912 * send message
6913 * re-grab mutex
6914 * set s_mn_parseflags_sending to zero
6915 */
6916 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t), KM_SLEEP);
6917 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
6918 (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
6919 (!(md_get_setstatus(s->s_setno) & MD_SET_MNPARSE_BLK))) {
6920 /* Grab snapshot of parse flags */
6921 s->s_mn_parseflags_sending = s->s_mn_parseflags;
6922 s->s_mn_parseflags = 0;
6923
6924 mutex_exit(SETMUTEX(s->s_setno));
6925
6926 /*
6927 * Send the message to the slaves to re-parse
6928 * the indicated portions of the mddb. Send the status
6929 * of the 50 mddbs in this set so that slaves know which
6930 * mddbs that the master node thinks are 'good'.
6931 * Otherwise, slave may reparse, but from wrong replica.
6932 */
6933 mddb_parse_msg->msg_parse_flags = s->s_mn_parseflags_sending;
6934 for (i = 0; i < MDDB_NLB; i++) {
6935 mddb_parse_msg->msg_lb_flags[i] =
6936 lbp->lb_locators[i].l_flags;
6937 }
6938 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
6939 while (rval != 0) {
6940 rval = mdmn_ksend_message(s->s_setno,
6941 MD_MN_MSG_MDDB_PARSE, 0, 0,
6942 (char *)mddb_parse_msg,
6943 sizeof (md_mn_msg_mddb_parse_t), kresult);
6944 if (rval != 0)
6945 cmn_err(CE_WARN, "mddb_setexit: Unable to send "
6946 "mddb update message to other nodes in "
6947 "diskset %s\n", s->s_setname);
6948 }
6949 kmem_free(kresult, sizeof (md_mn_kresult_t));
6950
6951 /*
6952 * Re-grab mutex to clear sending field and to
6953 * see if another parse message needs to be generated.
6954 */
6955 mutex_enter(SETMUTEX(s->s_setno));
6956 s->s_mn_parseflags_sending = 0;
6957 }
6958 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
6959 mutex_exit(SETMUTEX(s->s_setno));
6960 }
6961
6962 static void
mddb_setexit_no_parse(mddb_set_t * s)6963 mddb_setexit_no_parse(
6964 mddb_set_t *s
6965 )
6966 {
6967 mutex_exit(SETMUTEX(s->s_setno));
6968 }
6969
6970 uint_t
mddb_lb_did_convert(mddb_set_t * s,uint_t doit,uint_t * blk_cnt)6971 mddb_lb_did_convert(mddb_set_t *s, uint_t doit, uint_t *blk_cnt)
6972 {
6973 uint_t li;
6974 mddb_lb_t *lbp = s->s_lbp;
6975 mddb_locator_t *lp;
6976 ddi_devid_t ret_devid;
6977 uint_t devid_len;
6978 dev_t ddi_dev;
6979 mddb_did_ic_t *did_icp;
6980 mddb_did_blk_t *did_blkp;
6981 char *minor_name;
6982 size_t sz;
6983 int retval;
6984 int err;
6985 md_dev64_t dev64; /* tmp var to make code look better */
6986
6987
6988 /* Need disk block(s) to hold mddb_did_blk_t */
6989 *blk_cnt = MDDB_DID_BLOCKS;
6990
6991 if (doit) {
6992 /*
6993 * Alloc mddb_did_blk_t disk block and fill in header area.
6994 * Don't fill in did magic number until end of routine so
6995 * if machine panics in the middle of conversion, the
6996 * device id information will be thrown away at the
6997 * next snarfing of this set.
6998 * Need to set DEVID_STYLE so that mddb_devid_add will
6999 * function properly.
7000 */
7001 /* grab the mutex */
7002 if ((mddb_setenter(s->s_setno, MDDB_NOINIT, &err)) == NULL) {
7003 return (1);
7004 }
7005 single_thread_start(s);
7006 lbp->lb_didfirstblk = getfreeblks(s, MDDB_DID_BLOCKS);
7007 if (lbp->lb_didfirstblk == 0) {
7008 single_thread_end(s);
7009 mddb_setexit(s);
7010 return (1);
7011 }
7012 lbp->lb_didblkcnt = (mddb_block_t)MDDB_DID_BLOCKS;
7013 did_icp = (mddb_did_ic_t *)kmem_zalloc(sizeof (mddb_did_ic_t),
7014 KM_SLEEP);
7015 did_blkp = (mddb_did_blk_t *)kmem_zalloc(MDDB_DID_BYTES,
7016 KM_SLEEP);
7017
7018 did_blkp->blk_revision = MDDB_REV_DI;
7019 did_icp->did_ic_blkp = did_blkp;
7020 s->s_did_icp = did_icp;
7021 lbp->lb_flags |= MDDB_DEVID_STYLE;
7022 }
7023
7024 /* Fill in information in mddb_did_info_t array */
7025 for (li = 0; li < lbp->lb_loccnt; li++) {
7026 lp = &lbp->lb_locators[li];
7027 if (lp->l_flags & MDDB_F_DELETED)
7028 continue;
7029
7030 dev64 = md_xlate_targ_2_mini(md_expldev(lp->l_dev));
7031 ddi_dev = md_dev64_to_dev(dev64);
7032 if (ddi_dev == NODEV) {
7033 /*
7034 * No translation available for replica.
7035 * Could fail conversion to device id replica,
7036 * but instead will just continue with next
7037 * replica in list.
7038 */
7039 continue;
7040 }
7041 if (ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) {
7042 /*
7043 * Just count each devid as at least 1 block. This
7044 * is conservative since several device id's may fit
7045 * into 1 disk block, but it's better to overestimate
7046 * the number of blocks needed than to underestimate.
7047 */
7048 devid_len = (int)ddi_devid_sizeof(ret_devid);
7049 *blk_cnt += btodb(devid_len + (MDDB_BSIZE - 1));
7050 if (doit) {
7051 if (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK,
7052 &minor_name) == DDI_SUCCESS) {
7053 if (mddb_devid_add(s, li, ret_devid,
7054 minor_name)) {
7055 cmn_err(CE_WARN,
7056 "Not enough space in metadb"
7057 " to add device id for"
7058 " dev: major = %d, "
7059 "minor = %d\n",
7060 getmajor(ddi_dev),
7061 getminor(ddi_dev));
7062 }
7063 sz = strlen(minor_name) + 1;
7064 kmem_free(minor_name, sz);
7065 }
7066 }
7067 ddi_devid_free(ret_devid);
7068 }
7069 }
7070
7071 if (doit) {
7072 did_blkp->blk_magic = MDDB_MAGIC_DI;
7073 retval = push_lb(s);
7074 (void) upd_med(s, "mddb_lb_did_convert(0)");
7075 single_thread_end(s);
7076 mddb_setexit(s);
7077 if (retval != 0)
7078 return (1);
7079 }
7080
7081 return (0);
7082 }
7083
7084 static mddb_set_t *
init_set(mddb_config_t * cp,int flag,int * errp)7085 init_set(
7086 mddb_config_t *cp,
7087 int flag,
7088 int *errp
7089 )
7090 {
7091 mddb_set_t *s;
7092 char *setname = NULL;
7093 set_t setno = MD_LOCAL_SET;
7094 side_t sideno = 0;
7095 struct timeval32 *created = NULL;
7096
7097 if (cp != NULL) {
7098 setname = cp->c_setname;
7099 setno = cp->c_setno;
7100 sideno = cp->c_sideno;
7101 created = &cp->c_timestamp;
7102 }
7103
7104 if (setno >= MD_MAXSETS)
7105 return ((mddb_set_t *)NULL);
7106
7107 if (md_set[setno].s_db)
7108 return (mddb_setenter(setno, flag, errp));
7109
7110 s = (mddb_set_t *)kmem_zalloc(sizeof (*s), KM_SLEEP);
7111
7112 cv_init(&s->s_buf_cv, NULL, CV_DEFAULT, NULL);
7113 cv_init(&s->s_single_thread_cv, NULL, CV_DEFAULT, NULL);
7114 cv_init(&s->s_optqueuing_cv, NULL, CV_DEFAULT, NULL);
7115 cv_init(&s->s_opthungerr_cv, NULL, CV_DEFAULT, NULL);
7116 cv_init(&s->s_optwantlck_cv, NULL, CV_DEFAULT, NULL);
7117
7118 s->s_setno = setno;
7119 s->s_sideno = sideno;
7120 if (setno == MD_LOCAL_SET) {
7121 (void) snprintf(s->s_ident.serial, sizeof (s->s_ident.serial),
7122 "%u", zone_get_hostid(NULL));
7123 } else {
7124 s->s_ident.createtime = *created;
7125 s->s_setname = (char *)kmem_alloc(strlen(setname) + 1,
7126 KM_SLEEP);
7127 (void) strcpy(s->s_setname, setname);
7128 }
7129
7130 /* have a config struct, copy mediator information */
7131 if (cp != NULL)
7132 s->s_med = cp->c_med; /* structure assignment */
7133
7134 md_set[setno].s_db = (void *) s;
7135
7136 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_TAKEOVER, SVM_TAG_SET, setno, NODEV64);
7137
7138 return (mddb_setenter(setno, flag, errp));
7139 }
7140
7141 void
mddb_unload_set(set_t setno)7142 mddb_unload_set(
7143 set_t setno
7144 )
7145 {
7146
7147 mddb_set_t *s;
7148 mddb_db_t *dbp, *adbp = NULL;
7149 mddb_de_ic_t *dep, *dep2;
7150 mddb_bf_t *bfp;
7151 int i;
7152 md_dev64_t dev;
7153
7154 if ((s = mddb_setenter(setno, MDDB_NOINIT, NULL)) == NULL)
7155 return;
7156
7157 single_thread_start(s);
7158
7159 s->s_opthavequeuinglck = 0;
7160 s->s_optwantqueuinglck = 0;
7161
7162 for (dbp = s->s_dbp; dbp != 0; dbp = adbp) {
7163 for (dep = dbp->db_firstentry; dep != NULL; dep = dep2) {
7164 if (dep->de_rb_userdata != NULL) {
7165 if (dep->de_icreqsize)
7166 kmem_free(dep->de_rb_userdata_ic,
7167 dep->de_icreqsize);
7168 else
7169 kmem_free(dep->de_rb_userdata,
7170 dep->de_reqsize);
7171 }
7172 kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
7173 dep2 = dep->de_next;
7174 kmem_free((caddr_t)dep, sizeofde(dep));
7175 }
7176 adbp = dbp->db_next;
7177 kmem_free((caddr_t)dbp, sizeof (mddb_db_t));
7178 }
7179 s->s_dbp = (mddb_db_t *)NULL;
7180
7181 free_rip(&s->s_rip);
7182
7183 for (i = 0; i < mddb_maxcopies; i++) {
7184 if (! s->s_mbiarray)
7185 break;
7186
7187 if (! s->s_mbiarray[i])
7188 continue;
7189
7190 dev = md_expldev(s->s_lbp->lb_locators[i].l_dev);
7191 dev = md_xlate_targ_2_mini(dev);
7192 if (dev != NODEV64)
7193 mddb_devclose(dev);
7194
7195 free_mbipp(&s->s_mbiarray[i]);
7196 }
7197
7198 if (s->s_mbiarray) {
7199 kmem_free((caddr_t)s->s_mbiarray,
7200 sizeof (mddb_mb_ic_t *) * mddb_maxcopies);
7201 s->s_mbiarray = (mddb_mb_ic_t **)NULL;
7202 }
7203
7204 if (s->s_lnp) {
7205 kmem_free((caddr_t)s->s_lnp, dbtob(s->s_lbp->lb_lnblkcnt));
7206 s->s_lnp = (mddb_ln_t *)NULL;
7207 }
7208
7209 if (s->s_lbp) {
7210 mddb_devid_icp_free(&s->s_did_icp, s->s_lbp);
7211 kmem_free((caddr_t)s->s_lbp, dbtob(s->s_lbp->lb_blkcnt));
7212 s->s_lbp = (mddb_lb_t *)NULL;
7213 }
7214
7215 if (s->s_freebitmap) {
7216 kmem_free((caddr_t)s->s_freebitmap, s->s_freebitmapsize);
7217 s->s_freebitmap = NULL;
7218 s->s_freebitmapsize = 0;
7219 }
7220
7221 while ((bfp = allocbuffer(s, MDDB_NOSLEEP)) != NULL)
7222 kmem_free((caddr_t)bfp, sizeof (*bfp));
7223
7224 if (s->s_databuffer_size) {
7225 kmem_free(s->s_databuffer, s->s_databuffer_size);
7226 s->s_databuffer_size = 0;
7227 }
7228
7229 if (s->s_setname != NULL)
7230 kmem_free((caddr_t)s->s_setname, strlen(s->s_setname)+1);
7231
7232 /* Data tags not supported on MN sets. */
7233 if (!(md_get_setstatus(setno) & MD_SET_MNSET))
7234 dtl_freel(&s->s_dtlp);
7235
7236 md_set[setno].s_db = NULL;
7237 ASSERT(s->s_singlelockwanted == 0);
7238 kmem_free(s, sizeof (mddb_set_t));
7239
7240 /* Take care of things setup in the md_set array */
7241 if (! (md_get_setstatus(setno) & MD_SET_KEEPTAG)) {
7242 if (md_set[setno].s_dtp) {
7243 kmem_free((caddr_t)md_set[setno].s_dtp, MDDB_DT_BYTES);
7244 md_set[setno].s_dtp = NULL;
7245 }
7246 }
7247
7248 md_clr_setstatus(setno, MD_SET_ACCOK | MD_SET_ACCEPT |
7249 MD_SET_TAGDATA | MD_SET_USETAG | MD_SET_TOOFEW | MD_SET_STALE |
7250 MD_SET_OWNERSHIP | MD_SET_BADTAG | MD_SET_CLRTAG | MD_SET_MNSET |
7251 MD_SET_DIDCLUP | MD_SET_MNPARSE_BLK | MD_SET_MN_MIR_STATE_RC |
7252 MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
7253
7254 mutex_exit(SETMUTEX(setno));
7255 }
7256
7257 /*
7258 * returns 0 if name can be put into locator block
7259 * returns 1 if locator block prefixes are all used
7260 *
7261 * Takes splitname (suffix, prefix, sideno) and
7262 * stores it in the locator name structure.
7263 * For traditional diskset, the sideno is the index into the suffixes
7264 * array in the locator name structure.
7265 * For the MN diskset, the sideno is the nodeid which can be any number,
7266 * so the index passed in is the index into the mnsuffixes array
7267 * in the locator structure. This index was computed by the
7268 * routine checklocator which basically checked the locator block
7269 * mnside locator structure.
7270 */
7271 static int
splitname2locatorblock(md_splitname * spn,mddb_ln_t * lnp,int li,side_t sideno,int index)7272 splitname2locatorblock(
7273 md_splitname *spn,
7274 mddb_ln_t *lnp,
7275 int li,
7276 side_t sideno,
7277 int index
7278 )
7279 {
7280 uchar_t i;
7281 md_name_suffix *sn;
7282 md_mnname_suffix_t *mnsn;
7283 mddb_mnln_t *mnlnp;
7284
7285 for (i = 0; i < MDDB_PREFIXCNT; i++) {
7286 if (lnp->ln_prefixes[i].pre_len != SPN_PREFIX(spn).pre_len)
7287 continue;
7288 if (bcmp(lnp->ln_prefixes[i].pre_data, SPN_PREFIX(spn).pre_data,
7289 SPN_PREFIX(spn).pre_len) == 0)
7290 break;
7291 }
7292 if (i == MDDB_PREFIXCNT) {
7293 for (i = 0; i < MDDB_PREFIXCNT; i++) {
7294 if (lnp->ln_prefixes[i].pre_len == 0)
7295 break;
7296 }
7297 if (i == MDDB_PREFIXCNT)
7298 return (1);
7299 bcopy(SPN_PREFIX(spn).pre_data, lnp->ln_prefixes[i].pre_data,
7300 SPN_PREFIX(spn).pre_len);
7301 lnp->ln_prefixes[i].pre_len = SPN_PREFIX(spn).pre_len;
7302 }
7303
7304 if (lnp->ln_revision == MDDB_REV_MNLN) {
7305 /* If a MN diskset, use index */
7306 mnlnp = (mddb_mnln_t *)lnp;
7307 mnsn = &mnlnp->ln_mnsuffixes[index][li];
7308 mnsn->mn_ln_sideno = sideno;
7309 mnsn->mn_ln_suffix.suf_len = SPN_SUFFIX(spn).suf_len;
7310 mnsn->mn_ln_suffix.suf_prefix = i;
7311 bcopy(SPN_SUFFIX(spn).suf_data,
7312 mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_len);
7313 } else {
7314 sn = &lnp->ln_suffixes[sideno][li];
7315 sn->suf_len = SPN_SUFFIX(spn).suf_len;
7316 sn->suf_prefix = i;
7317 bcopy(SPN_SUFFIX(spn).suf_data, sn->suf_data,
7318 SPN_SUFFIX(spn).suf_len);
7319 }
7320 return (0);
7321 }
7322
7323 /*
7324 * Find the locator name for the given sideno and convert the locator name
7325 * information into a splitname structure.
7326 */
7327 void
mddb_locatorblock2splitname(mddb_ln_t * lnp,int li,side_t sideno,md_splitname * spn)7328 mddb_locatorblock2splitname(
7329 mddb_ln_t *lnp,
7330 int li,
7331 side_t sideno,
7332 md_splitname *spn
7333 )
7334 {
7335 int iprefix;
7336 md_name_suffix *sn;
7337 md_mnname_suffix_t *mnsn;
7338 int i;
7339 mddb_mnln_t *mnlnp;
7340
7341 if (lnp->ln_revision == MDDB_REV_MNLN) {
7342 mnlnp = (mddb_mnln_t *)lnp;
7343 for (i = 0; i < MD_MNMAXSIDES; i++) {
7344 mnsn = &mnlnp->ln_mnsuffixes[i][li];
7345 if (mnsn->mn_ln_sideno == sideno)
7346 break;
7347 }
7348 if (i == MD_MNMAXSIDES)
7349 return;
7350
7351 SPN_SUFFIX(spn).suf_len = mnsn->mn_ln_suffix.suf_len;
7352 bcopy(mnsn->mn_ln_suffix.suf_data, SPN_SUFFIX(spn).suf_data,
7353 SPN_SUFFIX(spn).suf_len);
7354 iprefix = mnsn->mn_ln_suffix.suf_prefix;
7355 } else {
7356 sn = &lnp->ln_suffixes[sideno][li];
7357 SPN_SUFFIX(spn).suf_len = sn->suf_len;
7358 bcopy(sn->suf_data, SPN_SUFFIX(spn).suf_data,
7359 SPN_SUFFIX(spn).suf_len);
7360 iprefix = sn->suf_prefix;
7361 }
7362 SPN_PREFIX(spn).pre_len = lnp->ln_prefixes[iprefix].pre_len;
7363 bcopy(lnp->ln_prefixes[iprefix].pre_data, SPN_PREFIX(spn).pre_data,
7364 SPN_PREFIX(spn).pre_len);
7365 }
7366
7367 static int
getdeldev(mddb_config_t * cp,int command,md_error_t * ep)7368 getdeldev(
7369 mddb_config_t *cp,
7370 int command,
7371 md_error_t *ep
7372 )
7373 {
7374 mddb_set_t *s;
7375 mddb_lb_t *lbp;
7376 mddb_locator_t *locators;
7377 uint_t loccnt;
7378 mddb_mb_ic_t *mbip;
7379 mddb_block_t blk;
7380 int err = 0;
7381 int i, j;
7382 int li;
7383 uint_t commitcnt;
7384 set_t setno = cp->c_setno;
7385 uint_t set_status;
7386 md_dev64_t dev;
7387 int flags = MDDB_MUSTEXIST;
7388 mddb_ri_t *rip;
7389
7390 cp->c_dbmax = MDDB_NLB;
7391
7392 /*
7393 * Data checking
7394 */
7395 if (setno >= md_nsets || cp->c_id < 0 ||
7396 cp->c_id > cp->c_dbmax) {
7397 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
7398 }
7399
7400 if (cp->c_flags & MDDB_C_STALE)
7401 flags |= MDDB_MN_STALE;
7402
7403 if ((s = mddb_setenter(setno, flags, &err)) == NULL)
7404 return (mddbstatus2error(ep, err, NODEV32, setno));
7405
7406 cp->c_flags = 0;
7407
7408 lbp = s->s_lbp;
7409 loccnt = lbp->lb_loccnt;
7410 locators = lbp->lb_locators;
7411
7412 /* shorthand */
7413 set_status = md_get_setstatus(setno);
7414
7415 if (set_status & MD_SET_STALE)
7416 cp->c_flags |= MDDB_C_STALE;
7417
7418 if (set_status & MD_SET_TOOFEW)
7419 cp->c_flags |= MDDB_C_TOOFEW;
7420
7421 cp->c_sideno = s->s_sideno;
7422
7423 cp->c_dbcnt = 0;
7424 /*
7425 * go through and count active entries
7426 */
7427 for (i = 0; i < loccnt; i++) {
7428 if (locators[i].l_flags & MDDB_F_DELETED)
7429 continue;
7430 cp->c_dbcnt++;
7431 }
7432
7433 /*
7434 * add the ability to accept a locator block index
7435 * which is not relative to previously deleted replicas. This
7436 * is for support of MD_DEBUG=STAT in metastat since it asks for
7437 * replica information specifically for each of the mirror resync
7438 * records. MDDB_CONFIG_SUBCMD uses one of the pad spares in
7439 * the mddb_config_t type.
7440 */
7441 if (cp->c_subcmd == MDDB_CONFIG_ABS) {
7442 if (cp->c_id < 0 || cp->c_id > cp->c_dbmax) {
7443 mddb_setexit(s);
7444 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7445 setno));
7446 }
7447 li = cp->c_id;
7448 } else {
7449 if (cp->c_id >= cp->c_dbcnt) {
7450 mddb_setexit(s);
7451 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32,
7452 setno));
7453 }
7454
7455 /* CSTYLED */
7456 for (li = 0, j = 0; /* void */; li++) {
7457 if (locators[li].l_flags & MDDB_F_DELETED)
7458 continue;
7459 j++;
7460 if (j > cp->c_id)
7461 break;
7462 }
7463 }
7464
7465 if (command == MDDB_ENDDEV) {
7466 daddr_t ib = 0, jb;
7467
7468 blk = 0;
7469 if ((s != NULL) && s->s_mbiarray[li]) {
7470 mbip = s->s_mbiarray[li];
7471 while ((jb = getphysblk(blk++, mbip)) > 0) {
7472 if (jb > ib)
7473 ib = jb;
7474 }
7475 cp->c_dbend = (int)ib;
7476 } else {
7477 cp->c_dbend = 0;
7478 }
7479 }
7480
7481 locator2cfgloc(lbp, &cp->c_locator, li, s->s_sideno, s->s_did_icp);
7482 mddb_locatorblock2splitname(s->s_lnp, li, s->s_sideno, &cp->c_devname);
7483
7484 if (command != MDDB_DELDEV) {
7485 mddb_setexit(s);
7486 return (0);
7487 }
7488
7489 /* Currently don't allow addition/deletion of sides during upgrade */
7490 if (MD_UPGRADE) {
7491 cmn_err(CE_WARN,
7492 "Deletion of replica not allowed during upgrade.\n");
7493 mddb_setexit(s);
7494 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
7495 }
7496
7497 /*
7498 * If here, replica delete in progress.
7499 */
7500 single_thread_start(s);
7501
7502 if ((! (locators[li].l_flags & MDDB_F_EMASTER)) &&
7503 (locators[li].l_flags & MDDB_F_ACTIVE)) {
7504 commitcnt = lbp->lb_commitcnt;
7505 lbp->lb_commitcnt = 0;
7506 setidentifier(s, &lbp->lb_ident);
7507 crcgen(lbp, &lbp->lb_checksum, dbtob(lbp->lb_blkcnt), NULL);
7508 /*
7509 * Don't need to write out device id area, since locator
7510 * block on this replica is being deleted by setting the
7511 * commitcnt to 0.
7512 */
7513 (void) writeblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, li,
7514 MDDB_WR_ONLY_MASTER);
7515 lbp->lb_commitcnt = commitcnt;
7516 }
7517
7518 if (s->s_mbiarray[li]) {
7519 /* A freed mbi pointer still exists in the mddb_ri_t */
7520 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
7521 if (rip->ri_mbip == s->s_mbiarray[li])
7522 rip->ri_mbip = NULL;
7523 }
7524 free_mbipp(&s->s_mbiarray[li]);
7525 }
7526
7527 if (! (locators[li].l_flags & MDDB_F_EMASTER)) {
7528 dev = md_expldev(locators[li].l_dev);
7529 dev = md_xlate_targ_2_mini(dev);
7530 if (dev != NODEV64)
7531 mddb_devclose(dev);
7532 }
7533
7534 s->s_mbiarray[li] = 0;
7535 lbp->lb_locators[li].l_flags = MDDB_F_DELETED;
7536
7537 /* Only support data tags for traditional and local sets */
7538 if ((md_get_setstatus(setno) & MD_SET_STALE) &&
7539 (!(lbp->lb_flags & MDDB_MNSET)) &&
7540 setno != MD_LOCAL_SET)
7541 if (set_dtag(s, ep))
7542 mdclrerror(ep);
7543
7544 /* Write data tags to all accessible devices */
7545 /* Only support data tags for traditional and local sets */
7546 if (!(lbp->lb_flags & MDDB_MNSET)) {
7547 (void) dt_write(s);
7548 }
7549
7550 /* Delete device id of deleted replica */
7551 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7552 (void) mddb_devid_delete(s, li);
7553 }
7554 /* write new locator to all devices */
7555 err = writelocall(s);
7556
7557 (void) upd_med(s, "getdeldev(0)");
7558
7559 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_REPLICA, setno,
7560 md_expldev(locators[li].l_dev));
7561
7562 computefreeblks(s); /* recompute always it may be larger */
7563 cp->c_dbcnt--;
7564 err |= fixoptrecords(s);
7565 if (err) {
7566 if (writeretry(s)) {
7567 single_thread_end(s);
7568 mddb_setexit(s);
7569 return (mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno));
7570 }
7571 }
7572
7573 single_thread_end(s);
7574 mddb_setexit(s);
7575 return (0);
7576 }
7577
7578 static int
getdriver(mddb_cfg_loc_t * clp)7579 getdriver(
7580 mddb_cfg_loc_t *clp
7581 )
7582 {
7583 major_t majordev;
7584
7585 /*
7586 * Data checking
7587 */
7588 if (clp->l_dev <= 0)
7589 return (EINVAL);
7590
7591 majordev = getmajor(expldev(clp->l_dev));
7592
7593 if (ddi_major_to_name(majordev) == (char *)NULL)
7594 return (EINVAL);
7595
7596 if (MD_UPGRADE)
7597 (void) strcpy(clp->l_driver, md_targ_major_to_name(majordev));
7598 else
7599 (void) strcpy(clp->l_driver, ddi_major_to_name(majordev));
7600 return (0);
7601 }
7602
7603 /*
7604 * update_valid_replica - updates the locator block namespace (prefix
7605 * and/or suffix) with new pathname and devname.
7606 * RETURN
7607 * 1 Error
7608 * 0 Success
7609 */
7610 static int
update_valid_replica(side_t side,mddb_locator_t * lp,mddb_set_t * s,int li,char * devname,char * pathname,md_dev64_t devt)7611 update_valid_replica(
7612 side_t side,
7613 mddb_locator_t *lp,
7614 mddb_set_t *s,
7615 int li,
7616 char *devname,
7617 char *pathname,
7618 md_dev64_t devt
7619 )
7620 {
7621 uchar_t pre_len, suf_len;
7622 md_name_suffix *sn;
7623 mddb_ln_t *lnp;
7624 uchar_t pre_index;
7625 uchar_t i;
7626
7627 if (md_expldev(lp->l_dev) != devt) {
7628 return (0);
7629 }
7630
7631 if (pathname[strlen(pathname) - 1] == '/')
7632 pathname[strlen(pathname) - 1] = '\0';
7633
7634 pre_len = (uchar_t)strlen(pathname);
7635 suf_len = (uchar_t)strlen(devname);
7636
7637 if ((pre_len > MD_MAXPREFIX) || (suf_len > MD_MAXSUFFIX))
7638 return (1);
7639
7640 lnp = s->s_lnp;
7641
7642 /*
7643 * Future note: Need to do something here for the MN diskset case
7644 * when device ids are supported in disksets.
7645 * Can't add until merging devids_in_diskset code into code base
7646 * Currently only called with side of 0.
7647 */
7648
7649 sn = &lnp->ln_suffixes[side][li];
7650
7651 /*
7652 * Check if prefix (Ex: /dev/dsk) needs to be changed.
7653 * If new prefix is the same as the previous prefix - no change.
7654 *
7655 * If new prefix is not the same, check if new prefix
7656 * matches an existing one. If so, use that one.
7657 *
7658 * If new prefix doesn't exist, add a new prefix. If not enough
7659 * space, return failure.
7660 */
7661 pre_index = sn->suf_prefix;
7662 /* Check if new prefix is the same as the old prefix. */
7663 if ((lnp->ln_prefixes[pre_index].pre_len != pre_len) ||
7664 (bcmp(lnp->ln_prefixes[pre_index].pre_data, pathname,
7665 pre_len) != 0)) {
7666 /* Check if new prefix is an already known prefix. */
7667 for (i = 0; i < MDDB_PREFIXCNT; i++) {
7668 if (lnp->ln_prefixes[i].pre_len != pre_len) {
7669 continue;
7670 }
7671 if (bcmp(lnp->ln_prefixes[i].pre_data, pathname,
7672 pre_len) == 0) {
7673 break;
7674 }
7675 }
7676 /* If no match found for new prefix - add the new prefix */
7677 if (i == MDDB_PREFIXCNT) {
7678 for (i = 0; i < MDDB_PREFIXCNT; i++) {
7679 if (lnp->ln_prefixes[i].pre_len == 0)
7680 break;
7681 }
7682 /* No space to add new prefix - return failure */
7683 if (i == MDDB_PREFIXCNT) {
7684 return (1);
7685 }
7686 bcopy(pathname, lnp->ln_prefixes[i].pre_data, pre_len);
7687 lnp->ln_prefixes[i].pre_len = pre_len;
7688 }
7689 sn->suf_prefix = i;
7690 }
7691
7692 /* Now, update the suffix (Ex: c0t0d0s0) if needed */
7693 if ((sn->suf_len != suf_len) ||
7694 (bcmp(sn->suf_data, devname, suf_len) != 0)) {
7695 bcopy(devname, sn->suf_data, suf_len);
7696 sn->suf_len = suf_len;
7697 }
7698 return (0);
7699 }
7700
7701
7702 /*
7703 * md_update_locator_namespace - If in devid style and active and the devid's
7704 * exist and are valid update the locator namespace pathname
7705 * and devname.
7706 * RETURN
7707 * 1 Error
7708 * 0 Success
7709 */
7710 int
md_update_locator_namespace(set_t setno,side_t side,char * dname,char * pname,md_dev64_t devt)7711 md_update_locator_namespace(
7712 set_t setno, /* which set to get name from */
7713 side_t side,
7714 char *dname,
7715 char *pname,
7716 md_dev64_t devt
7717 )
7718 {
7719 mddb_set_t *s;
7720 mddb_lb_t *lbp;
7721 int li;
7722 uint_t flg;
7723 int err = 0;
7724 mddb_ln_t *lnp;
7725
7726 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
7727 return (1);
7728 single_thread_start(s);
7729 lbp = s->s_lbp;
7730 /* must be DEVID_STYLE */
7731 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
7732 for (li = 0; li < lbp->lb_loccnt; li++) {
7733 mddb_locator_t *lp = &lbp->lb_locators[li];
7734
7735 if (lp->l_flags & MDDB_F_DELETED) {
7736 continue;
7737 }
7738
7739 /* replica also must be active */
7740 if (lp->l_flags & MDDB_F_ACTIVE) {
7741 flg = s->s_did_icp->did_ic_blkp->
7742 blk_info[li].info_flags;
7743 /* only update if did exists and is valid */
7744 if ((flg & MDDB_DID_EXISTS) &&
7745 (flg & MDDB_DID_VALID)) {
7746 if (update_valid_replica(side, lp, s,
7747 li, dname, pname, devt)) {
7748 err = 1;
7749 goto out;
7750 }
7751 }
7752 }
7753 }
7754 }
7755 lnp = s->s_lnp;
7756 uniqtime32(&lnp->ln_timestamp);
7757 if (lbp->lb_flags & MDDB_MNSET)
7758 lnp->ln_revision = MDDB_REV_MNLN;
7759 else
7760 lnp->ln_revision = MDDB_REV_LN;
7761 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
7762 err = writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
7763 lbp->lb_lnblkcnt, 0);
7764 /*
7765 * If a MN diskset and this is the master, set the PARSE_LOCNM
7766 * flag in the mddb_set structure to show that the locator
7767 * names have changed.
7768 */
7769
7770 if ((lbp->lb_flags & MDDB_MNSET) &&
7771 (md_set[s->s_setno].s_am_i_master)) {
7772 s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
7773 }
7774 out:
7775 single_thread_end(s);
7776 mddb_setexit(s);
7777 if (err)
7778 return (1);
7779 return (0);
7780 }
7781
7782 /*
7783 * update_locatorblock - for active entries in the locator block, check
7784 * the devt to see if it matches the given devt. If so, and
7785 * there is an associated device id which is not the same
7786 * as the passed in devid, delete old devid and add a new one.
7787 *
7788 * During import of replicated disksets, old_didptr contains
7789 * the original disk's device id. Use this device id in
7790 * addition to the devt to determine if an entry is a match
7791 * and should be updated with the new device id of the
7792 * replicated disk. Specifically, this is the case being handled:
7793 *
7794 * Original_disk Replicated_disk Disk_Available_During_Import
7795 * c1t1d0 c1t3d0 no - so old name c1t1d0 shown
7796 * c1t2d0 c1t1d0 yes - name is c1t1d0
7797 * c1t3d0 c1t2d0 yes - name is c1t2d0
7798 *
7799 * Can't just match on devt since devt for the first and third
7800 * disks will be the same, but the original disk's device id
7801 * is known and can be used to distinguish which disk's
7802 * replicated device id should be updated.
7803 * RETURN
7804 * MDDB_E_NODEVID
7805 * MDDB_E_NOLOCBLK
7806 * 1 Error
7807 * 0 Success
7808 */
7809 static int
update_locatorblock(mddb_set_t * s,md_dev64_t dev,ddi_devid_t didptr,ddi_devid_t old_didptr)7810 update_locatorblock(
7811 mddb_set_t *s,
7812 md_dev64_t dev,
7813 ddi_devid_t didptr,
7814 ddi_devid_t old_didptr
7815 )
7816 {
7817 mddb_lb_t *lbp = NULL;
7818 mddb_locator_t *lp;
7819 int li;
7820 uint_t flg;
7821 ddi_devid_t devid_ptr;
7822 int retval = 0;
7823 char *minor_name;
7824 int repl_import_flag;
7825
7826 /* Set replicated flag if this is a replicated import */
7827 repl_import_flag = md_get_setstatus(s->s_setno) &
7828 MD_SET_REPLICATED_IMPORT;
7829
7830 lbp = s->s_lbp;
7831 /* find replicas that haven't been deleted */
7832 for (li = 0; li < lbp->lb_loccnt; li++) {
7833 lp = &lbp->lb_locators[li];
7834
7835 if ((lp->l_flags & MDDB_F_DELETED)) {
7836 continue;
7837 }
7838 /*
7839 * check to see if locator devt matches given dev
7840 * and if there is a device ID associated with it
7841 */
7842 flg = s->s_did_icp->did_ic_blkp-> blk_info[li].info_flags;
7843 if ((md_expldev(lp->l_dev) == dev) &&
7844 (flg & MDDB_DID_EXISTS)) {
7845 if (flg & MDDB_DID_VALID) {
7846 continue; /* cont to nxt active entry */
7847 }
7848 devid_ptr = s->s_did_icp->did_ic_devid[li];
7849 if (devid_ptr == NULL) {
7850 return (MDDB_E_NODEVID);
7851 }
7852
7853 /*
7854 * During a replicated import the old_didptr
7855 * must match the current devid before the
7856 * devid can be updated.
7857 */
7858 if (repl_import_flag) {
7859 if (ddi_devid_compare(devid_ptr,
7860 old_didptr) != 0)
7861 continue;
7862 }
7863
7864 if (ddi_devid_compare(devid_ptr, didptr) != 0) {
7865 /*
7866 * devid's not equal so
7867 * delete and add
7868 */
7869 if (ddi_lyr_get_minor_name(
7870 md_dev64_to_dev(dev),
7871 S_IFBLK, &minor_name) == DDI_SUCCESS) {
7872 (void) mddb_devid_delete(s, li);
7873 (void) mddb_devid_add(s, li, didptr,
7874 minor_name);
7875 kmem_free(minor_name,
7876 strlen(minor_name)+1);
7877 break;
7878 } else {
7879 retval = 1;
7880 goto err_out;
7881 }
7882 }
7883 }
7884 } /* end for */
7885 retval = push_lb(s);
7886 (void) upd_med(s, "update_locatorblock(0)");
7887 err_out:
7888 return (retval);
7889 }
7890
7891 static int
update_mb_devid(mddb_set_t * s,mddb_ri_t * rip,ddi_devid_t devidptr)7892 update_mb_devid(
7893 mddb_set_t *s,
7894 mddb_ri_t *rip,
7895 ddi_devid_t devidptr
7896 )
7897 {
7898 mddb_mb_ic_t *mbip;
7899 mddb_mb_t *mb = NULL;
7900 daddr_t blkno;
7901 md_dev64_t device;
7902 uint_t sz;
7903 int mb2free = 0;
7904 int err = 0;
7905
7906
7907 /*
7908 * There is case where a disk may not have mddb,
7909 * and only has dummy mddb which contains
7910 * a valid devid we like to update and in this
7911 * case, the rip_lbp will be NULL but we still
7912 * like to update the devid embedded in the
7913 * dummy mb block.
7914 *
7915 */
7916 if (rip->ri_mbip != (mddb_mb_ic_t *)NULL) {
7917 mbip = rip->ri_mbip;
7918 mb = &mbip->mbi_mddb_mb;
7919 } else {
7920 /*
7921 * Done if it is non-replicated set
7922 */
7923 if (devidptr != (ddi_devid_t)NULL) {
7924 mb = (mddb_mb_t *)kmem_zalloc(MDDB_BSIZE,
7925 KM_SLEEP);
7926 mb->mb_magic = MDDB_MAGIC_DU;
7927 mb->mb_revision = MDDB_REV_MB;
7928 mb2free = 1;
7929 } else {
7930 goto out;
7931 }
7932 }
7933
7934 blkno = rip->ri_blkno;
7935 device = rip->ri_dev;
7936 /*
7937 * Replace the mb_devid with the new/valid one
7938 */
7939 if (devidptr != (ddi_devid_t)NULL) {
7940 /*
7941 * Zero out what we have previously
7942 */
7943 if (mb->mb_devid_len)
7944 bzero(mb->mb_devid, mb->mb_devid_len);
7945 sz = ddi_devid_sizeof(devidptr);
7946 bcopy((char *)devidptr, (char *)mb->mb_devid, sz);
7947 mb->mb_devid_len = sz;
7948 }
7949
7950 mb->mb_setno = s->s_setno;
7951 uniqtime32(&mb->mb_timestamp);
7952 crcgen(mb, &mb->mb_checksum, MDDB_BSIZE, NULL);
7953 /*
7954 * putblks will
7955 *
7956 * - drop the s_dbmx lock
7957 * - biowait
7958 * - regain the s_dbmx lock
7959 *
7960 * Need to update this if we wants to handle
7961 * mb_next != NULL which it is unlikely will happen
7962 */
7963 err = putblks(s, (caddr_t)mb, blkno, 1, device, 0);
7964
7965 if (mb2free) {
7966 kmem_free(mb, MDDB_BSIZE);
7967 }
7968 out:
7969 return (err);
7970 }
7971
7972 static int
setdid(mddb_config_t * cp)7973 setdid(
7974 mddb_config_t *cp
7975 )
7976 {
7977 ddi_devid_t devidp;
7978 dev_t ddi_dev;
7979 mddb_set_t *s;
7980 int err = 0;
7981 mddb_ri_t *rip;
7982
7983 /*
7984 * Data integrity check
7985 */
7986 if (cp->c_setno >= md_nsets || cp->c_devt <= 0)
7987 return (EINVAL);
7988
7989 if ((md_get_setstatus(cp->c_setno) & MD_SET_STALE))
7990 return (0);
7991
7992 ddi_dev = md_dev64_to_dev(cp->c_devt);
7993 if (ddi_lyr_get_devid(ddi_dev, &devidp) != DDI_SUCCESS) {
7994 return (-1);
7995 }
7996 if (devidp == NULL) {
7997 return (-1);
7998 }
7999
8000 if ((s = mddb_setenter(cp->c_setno, MDDB_MUSTEXIST, &err)) == NULL)
8001 return (-1);
8002 single_thread_start(s);
8003
8004 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
8005 if (rip->ri_lbp == (mddb_lb_t *)NULL)
8006 continue;
8007 /*
8008 * We only update what is asked
8009 */
8010 if (rip->ri_dev == cp->c_devt) {
8011 if (update_mb_devid(s, rip, devidp) != 0) {
8012 err = -1;
8013 goto out;
8014 }
8015 }
8016 }
8017
8018 if (update_locatorblock(s, cp->c_devt, devidp, NULL)) {
8019 err = -1;
8020 goto out;
8021 }
8022
8023 out:
8024 single_thread_end(s);
8025 mddb_setexit(s);
8026 ddi_devid_free(devidp);
8027 return (err);
8028 }
8029
8030 static int
delnewside(mddb_config_t * cp,int command,md_error_t * ep)8031 delnewside(
8032 mddb_config_t *cp,
8033 int command,
8034 md_error_t *ep
8035 )
8036 {
8037 mddb_set_t *s;
8038 int li;
8039 mddb_lb_t *lbp; /* pointer to locator block */
8040 mddb_ln_t *lnp; /* pointer to locator names */
8041 mddb_mnln_t *mnlnp; /* pointer to locator names */
8042 mddb_locator_t *lp;
8043 mddb_sidelocator_t *slp;
8044 mddb_cfg_loc_t *clp;
8045 int err = 0;
8046 set_t setno = cp->c_setno;
8047 ddi_devid_t devid;
8048 ddi_devid_t ret_devid = NULL;
8049 char *minor_name;
8050 uint_t use_devid = 0;
8051 dev_t ddi_dev;
8052 md_mnname_suffix_t *mnsn;
8053 mddb_mnlb_t *mnlbp;
8054 mddb_mnsidelocator_t *mnslp;
8055
8056 /* Currently don't allow addition/deletion of sides during upgrade */
8057 if (MD_UPGRADE) {
8058 cmn_err(CE_WARN,
8059 "Addition and deletion of sides not allowed"
8060 " during upgrade. \n");
8061 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8062 }
8063
8064 /*
8065 * Data integrity check
8066 */
8067 if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8068 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8069
8070 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8071 return (mddbstatus2error(ep, err, NODEV32, setno));
8072
8073 single_thread_start(s);
8074 clp = &cp->c_locator;
8075
8076 lbp = s->s_lbp;
8077
8078 if (lbp->lb_setno != setno) {
8079 single_thread_end(s);
8080 mddb_setexit(s);
8081 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8082 }
8083
8084 /*
8085 * Find this device/blkno pair
8086 */
8087 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8088 ddi_dev = md_dev64_to_dev(clp->l_dev);
8089 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8090 (ddi_lyr_get_minor_name(ddi_dev, S_IFBLK, &minor_name)
8091 == DDI_SUCCESS)) {
8092 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8093 clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8094 use_devid = 1;
8095 (void) strcpy(clp->l_minor_name, minor_name);
8096 }
8097 kmem_free(minor_name, strlen(minor_name)+1);
8098 }
8099 if (use_devid != 1 && ret_devid != NULL)
8100 ddi_devid_free(ret_devid);
8101 }
8102 for (li = 0; li < lbp->lb_loccnt; li++) {
8103 lp = &lbp->lb_locators[li];
8104 if (lp->l_flags & MDDB_F_DELETED)
8105 continue;
8106 if (use_devid) {
8107 if ((mddb_devid_get(s, li, &devid, &minor_name)) == 0)
8108 continue;
8109 if ((ddi_devid_compare(devid,
8110 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8111 (strcmp(clp->l_minor_name, minor_name) == 0) &&
8112 ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8113 break;
8114 }
8115 } else {
8116 if (lp->l_dev == clp->l_dev &&
8117 (daddr_t)lp->l_blkno == clp->l_blkno) {
8118 break;
8119 }
8120 }
8121 }
8122
8123 if (li == lbp->lb_loccnt) {
8124 if (use_devid)
8125 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8126 single_thread_end(s);
8127 mddb_setexit(s);
8128 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8129 }
8130
8131 lnp = s->s_lnp;
8132 if (command == MDDB_NEWSIDE) {
8133 int index = 0;
8134 /*
8135 * If a MN diskset, need to find the index where the new
8136 * locator information is to be stored in the mnsidelocator
8137 * field of the locator block so that the locator name can
8138 * be stored at the same array index in the mnsuffixes
8139 * field of the locator names structure.
8140 */
8141 if (lbp->lb_flags & MDDB_MNSET) {
8142 if ((index = checklocator(lbp, li,
8143 cp->c_sideno)) == -1) {
8144 if (use_devid) {
8145 ddi_devid_free((ddi_devid_t)
8146 (uintptr_t)clp->l_devid);
8147 }
8148 single_thread_end(s);
8149 mddb_setexit(s);
8150 return (mdmddberror(ep, MDE_DB_TOOSMALL,
8151 NODEV32, setno));
8152 }
8153 }
8154
8155 /*
8156 * Store the locator name before the sidelocator information
8157 * in case a panic occurs between these 2 steps. Must have
8158 * the locator name information in order to print reasonable
8159 * error information.
8160 */
8161 if (splitname2locatorblock(&cp->c_devname, lnp, li,
8162 cp->c_sideno, index)) {
8163 if (use_devid)
8164 ddi_devid_free(
8165 (ddi_devid_t)(uintptr_t)clp->l_devid);
8166 single_thread_end(s);
8167 mddb_setexit(s);
8168 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8169 setno));
8170 }
8171
8172 if (cfgloc2locator(lbp, clp, li, cp->c_sideno, index)) {
8173 if (use_devid)
8174 ddi_devid_free(
8175 (ddi_devid_t)(uintptr_t)clp->l_devid);
8176 single_thread_end(s);
8177 mddb_setexit(s);
8178 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8179 setno));
8180 }
8181 }
8182
8183 if (use_devid)
8184 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8185
8186 if (command == MDDB_DELSIDE) {
8187 int i;
8188 for (i = 0; i < lbp->lb_loccnt; i++) {
8189 if (lbp->lb_flags & MDDB_MNSET) {
8190 int j;
8191 mnlbp = (mddb_mnlb_t *)lbp;
8192 for (j = 0; j < MD_MNMAXSIDES; j++) {
8193 mnslp = &mnlbp->lb_mnsidelocators[j][i];
8194 if (mnslp->mnl_sideno == cp->c_sideno)
8195 break;
8196 }
8197 if (j < MD_MNMAXSIDES) {
8198 mnslp->mnl_mnum = NODEV32;
8199 mnslp->mnl_sideno = 0;
8200 mnlnp = (mddb_mnln_t *)lnp;
8201 mnsn = &(mnlnp->ln_mnsuffixes[j][i]);
8202 bzero((caddr_t)mnsn,
8203 sizeof (md_mnname_suffix_t));
8204 }
8205 } else {
8206 slp = &lbp->lb_sidelocators[cp->c_sideno][i];
8207 bzero((caddr_t)&lnp->ln_suffixes
8208 [cp->c_sideno][i], sizeof (md_name_suffix));
8209 slp->l_mnum = NODEV32;
8210 }
8211 }
8212 }
8213
8214 /* write new locator names to all devices */
8215 uniqtime32(&lnp->ln_timestamp);
8216 if (lbp->lb_flags & MDDB_MNSET)
8217 lnp->ln_revision = MDDB_REV_MNLN;
8218 else
8219 lnp->ln_revision = MDDB_REV_LN;
8220 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8221 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8222 lbp->lb_lnblkcnt, 0);
8223 /*
8224 * If a MN diskset and this is the master, set the PARSE_LOCNM
8225 * flag in the mddb_set structure to show that the locator
8226 * names have changed.
8227 */
8228
8229 if ((lbp->lb_flags & MDDB_MNSET) &&
8230 (md_set[s->s_setno].s_am_i_master)) {
8231 s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8232 }
8233 if (err) {
8234 if (writeretry(s)) {
8235 single_thread_end(s);
8236 mddb_setexit(s);
8237 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8238 }
8239 }
8240
8241 uniqtime32(&lbp->lb_timestamp);
8242 /* write new locator to all devices */
8243 err = writelocall(s);
8244
8245 (void) upd_med(s, "delnewside(0)");
8246
8247 computefreeblks(s); /* recompute always it may be larger */
8248 if (err) {
8249 if (writeretry(s)) {
8250 single_thread_end(s);
8251 mddb_setexit(s);
8252 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8253 }
8254 }
8255
8256 single_thread_end(s);
8257 mddb_setexit(s);
8258
8259 return (0);
8260 }
8261
8262 static int
newdev(mddb_config_t * cp,int command,md_error_t * ep)8263 newdev(
8264 mddb_config_t *cp,
8265 int command,
8266 md_error_t *ep
8267 )
8268 {
8269 mddb_set_t *s;
8270 mddb_mb_ic_t *mbip, *mbip1;
8271 int i, j;
8272 int li;
8273 mddb_lb_t *lbp; /* pointer to locator block */
8274 mddb_ln_t *lnp; /* pointer to locator names */
8275 mddb_locator_t *lp;
8276 mddb_cfg_loc_t *clp;
8277 int err = 0;
8278 set_t setno = cp->c_setno;
8279 ddi_devid_t devid2;
8280 ddi_devid_t ret_devid = NULL;
8281 char *minor_name;
8282 uint_t use_devid = 0;
8283 dev_t ddi_dev;
8284 int old_flags;
8285 int flags;
8286 int mn_set = 0;
8287 int index;
8288 mddb_ri_t *rip;
8289 int locator_deleted = 0;
8290 dev32_t locator_deleted_dev;
8291 int sz = 0;
8292
8293
8294 /* Currently don't allow addition of new replica during upgrade */
8295 if (MD_UPGRADE) {
8296 cmn_err(CE_WARN,
8297 "Addition of new replica not allowed during upgrade.\n");
8298 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8299 }
8300
8301 /*
8302 * Data integrity check
8303 */
8304 if (setno >= md_nsets || cp->c_locator.l_dev <= 0)
8305 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
8306
8307 /* Determine the flag settings for multinode sets */
8308 flags = MDDB_NOOLDOK;
8309 if (cp->c_multi_node)
8310 flags |= MDDB_MULTINODE;
8311
8312 if ((s = mddb_setenter(setno, flags, &err)) == NULL) {
8313 if (err != MDDB_E_NOTOWNER)
8314 return (mddbstatus2error(ep, err, NODEV32, setno));
8315 s = init_set(cp, flags, &err);
8316 if (s == NULL)
8317 return (mddbstatus2error(ep, err, NODEV32, setno));
8318 }
8319
8320 single_thread_start(s);
8321
8322 /* shorthand */
8323 clp = &cp->c_locator;
8324
8325 /* shorthand */
8326 lbp = s->s_lbp;
8327
8328 if (lbp->lb_setno != setno) {
8329 single_thread_end(s);
8330 mddb_setexit(s);
8331 return (mdmddberror(ep, MDE_DB_INVALID, NODEV32, setno));
8332 }
8333
8334 /*
8335 * See if this device/blkno pair is already a replica
8336 */
8337 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
8338 ddi_dev = expldev(clp->l_dev);
8339 if ((ddi_lyr_get_devid(ddi_dev, &ret_devid) == DDI_SUCCESS) &&
8340 (ddi_lyr_get_minor_name(ddi_dev,
8341 S_IFBLK, &minor_name) == DDI_SUCCESS)) {
8342 if (strlen(minor_name) < MDDB_MINOR_NAME_MAX) {
8343 clp->l_devid = (uint64_t)(uintptr_t)ret_devid;
8344 use_devid = 1;
8345 (void) strcpy(clp->l_minor_name, minor_name);
8346 }
8347 kmem_free(minor_name, strlen(minor_name)+1);
8348 }
8349 if (use_devid != 1 && ret_devid != NULL)
8350 ddi_devid_free(ret_devid);
8351 }
8352
8353 for (i = 0; i < lbp->lb_loccnt; i++) {
8354 lp = &lbp->lb_locators[i];
8355 if (lp->l_flags & MDDB_F_DELETED)
8356 continue;
8357 if (use_devid) {
8358 if ((mddb_devid_get(s, i, &devid2, &minor_name)) == 0)
8359 continue;
8360 if ((ddi_devid_compare(devid2,
8361 (ddi_devid_t)(uintptr_t)clp->l_devid) == 0) &&
8362 (strcmp(clp->l_minor_name, minor_name) == 0) &&
8363 ((daddr_t)lp->l_blkno == clp->l_blkno)) {
8364 if (command == MDDB_NEWDEV) {
8365 ddi_devid_free((ddi_devid_t)(uintptr_t)
8366 clp->l_devid);
8367 single_thread_end(s);
8368 mddb_setexit(s);
8369 return (mdmddberror(ep,
8370 MDE_DB_EXISTS, NODEV32, setno));
8371 }
8372 }
8373 } else {
8374 if (lp->l_dev == clp->l_dev &&
8375 (daddr_t)lp->l_blkno == clp->l_blkno) {
8376 if (command == MDDB_NEWDEV) {
8377 single_thread_end(s);
8378 mddb_setexit(s);
8379 return (mdmddberror(ep,
8380 MDE_DB_EXISTS, NODEV32, setno));
8381 }
8382 }
8383 }
8384 }
8385
8386 /*
8387 * Really is a new replica, go get the master blocks
8388 */
8389 mbip = getmasters(s, md_expldev(clp->l_dev), clp->l_blkno,
8390 (uint_t *)0, &mn_set);
8391 if (! mbip) {
8392 if (use_devid)
8393 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8394 single_thread_end(s);
8395 mddb_setexit(s);
8396 return (mdmddberror(ep, MDE_DB_MASTER, NODEV32, setno));
8397 }
8398
8399 /*
8400 * Compute free blocks in replica.
8401 */
8402 computefreeblks(s);
8403
8404 /*
8405 * Check if this is large enough
8406 */
8407 for (mbip1 = mbip, i = 0; mbip1 != NULL; mbip1 = mbip1->mbi_next)
8408 i += mbip1->mbi_mddb_mb.mb_blkcnt;
8409 for (j = i; j < s->s_totalblkcnt; j++) {
8410 if (blkcheck(s, j)) {
8411 while (mbip) {
8412 mbip1 = mbip->mbi_next;
8413 kmem_free((caddr_t)mbip, MDDB_IC_BSIZE);
8414 mbip = mbip1;
8415 }
8416 if (use_devid)
8417 ddi_devid_free(
8418 (ddi_devid_t)(uintptr_t)clp->l_devid);
8419 mddb_devclose(md_expldev(clp->l_dev));
8420 single_thread_end(s);
8421 mddb_setexit(s);
8422 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32,
8423 setno));
8424 }
8425 }
8426
8427 /* Look for a deleted slot */
8428 for (li = 0; li < lbp->lb_loccnt; li++) {
8429 lp = &lbp->lb_locators[li];
8430 if (lp->l_flags & MDDB_F_DELETED) {
8431 locator_deleted = 1;
8432 locator_deleted_dev = lp->l_dev;
8433 break;
8434 }
8435 }
8436
8437 /* If no deleted slots, add a new one */
8438 if (li == lbp->lb_loccnt) {
8439 /* Already have the max replicas, bail */
8440 if (lbp->lb_loccnt == MDDB_NLB) {
8441 if (use_devid)
8442 ddi_devid_free((ddi_devid_t)(uintptr_t)
8443 clp->l_devid);
8444 mddb_devclose(md_expldev(clp->l_dev));
8445 single_thread_end(s);
8446 mddb_setexit(s);
8447 return (mdmddberror(ep, MDE_TOOMANY_REPLICAS, NODEV32,
8448 setno));
8449 }
8450 lbp->lb_loccnt++;
8451 lp = &lbp->lb_locators[li];
8452 }
8453
8454 /* Initialize the new or deleted slot */
8455 old_flags = lp->l_flags;
8456 lp->l_dev = clp->l_dev;
8457 lp->l_blkno = (daddr32_t)clp->l_blkno;
8458 lp->l_flags = clp->l_flags;
8459
8460 /* shorthand */
8461 lnp = s->s_lnp;
8462
8463 index = 0;
8464 if ((lbp->lb_flags & MDDB_MNSET) || (flags & MDDB_MULTINODE)) {
8465 /*
8466 * If a MN diskset, need to find the index where the new
8467 * locator information is to be stored in the mnsidelocator
8468 * field of the locator block so that the locator name can
8469 * be stored at the same array index in the mnsuffixes
8470 * field of the locator names structure.
8471 */
8472 lbp->lb_flags |= MDDB_MNSET;
8473 if ((index = checklocator(lbp, li, s->s_sideno)) == -1) {
8474 if (use_devid)
8475 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->
8476 l_devid);
8477 lp->l_flags = old_flags;
8478 lbp->lb_loccnt--;
8479 mddb_devclose(md_expldev(clp->l_dev));
8480 single_thread_end(s);
8481 mddb_setexit(s);
8482 return (mdmddberror(ep, MDE_DB_TOOSMALL,
8483 NODEV32, setno));
8484 }
8485 }
8486 /*
8487 * Store the locator name before the sidelocator information
8488 * in case a panic occurs between these 2 steps. Must have
8489 * the locator name information in order to print reasonable
8490 * error information.
8491 */
8492 if (splitname2locatorblock(&cp->c_devname, lnp, li,
8493 s->s_sideno, index)) {
8494 if (use_devid)
8495 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8496 lp->l_flags = old_flags;
8497 lbp->lb_loccnt--;
8498 mddb_devclose(md_expldev(clp->l_dev));
8499 single_thread_end(s);
8500 mddb_setexit(s);
8501 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8502 }
8503
8504 /*
8505 * Compute free blocks in replica before calling cfgloc2locator
8506 * since cfgloc2locator may attempt to alloc an unused block
8507 * to store the device id.
8508 * mbiarray needs to be setup before calling computefreeblks.
8509 */
8510 s->s_mbiarray[li] = mbip;
8511 computefreeblks(s);
8512
8513 if (cfgloc2locator(lbp, clp, li, s->s_sideno, index)) {
8514 if (use_devid)
8515 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8516 lp->l_flags = old_flags;
8517 lbp->lb_loccnt--;
8518 s->s_mbiarray[li] = 0;
8519 mddb_devclose(md_expldev(clp->l_dev));
8520 single_thread_end(s);
8521 mddb_setexit(s);
8522 return (mdmddberror(ep, MDE_DB_TOOSMALL, NODEV32, setno));
8523 }
8524
8525 /*
8526 * Hijack a deleted rip master record and correct the contents
8527 */
8528 if (locator_deleted) {
8529 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
8530 if (rip->ri_lbp != NULL &&
8531 rip->ri_mbip == 0 &&
8532 (rip->ri_dev == md_expldev(locator_deleted_dev))) {
8533 rip->ri_dev = md_expldev(clp->l_dev);
8534 rip->ri_mbip = mbip;
8535
8536 if (use_devid && clp->l_devid != 0) {
8537 sz = (int)ddi_devid_sizeof(
8538 (ddi_devid_t)(uintptr_t)
8539 clp->l_devid);
8540 rip->ri_devid =
8541 (ddi_devid_t)kmem_zalloc(sz,
8542 KM_SLEEP);
8543 bcopy((void *)(uintptr_t)clp->l_devid,
8544 (char *)rip->ri_devid, sz);
8545 }
8546
8547 break;
8548 }
8549 }
8550 }
8551
8552 if (use_devid)
8553 ddi_devid_free((ddi_devid_t)(uintptr_t)clp->l_devid);
8554
8555 uniqtime32(&lbp->lb_timestamp);
8556 lp->l_flags = MDDB_F_ACTIVE;
8557
8558 /* write db copy to new device */
8559 err = writecopy(s, li, MDDB_WRITECOPY_ALL);
8560 lp->l_flags |= MDDB_F_UP2DATE;
8561
8562 /* write new locator names to all devices */
8563 uniqtime32(&lnp->ln_timestamp);
8564 if (lbp->lb_flags & MDDB_MNSET)
8565 lnp->ln_revision = MDDB_REV_MNLN;
8566 else
8567 lnp->ln_revision = MDDB_REV_LN;
8568 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
8569 err |= writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
8570 lbp->lb_lnblkcnt, 0);
8571 /*
8572 * If a MN diskset and this is the master, set the PARSE_LOCNM
8573 * flag in the mddb_set structure to show that the locator
8574 * names have changed.
8575 */
8576
8577 if ((lbp->lb_flags & MDDB_MNSET) &&
8578 (md_set[s->s_setno].s_am_i_master)) {
8579 s->s_mn_parseflags |= MDDB_PARSE_LOCNM;
8580 }
8581 if (err) {
8582 if (writeretry(s)) {
8583 single_thread_end(s);
8584 mddb_setexit(s);
8585 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8586 }
8587 }
8588
8589 /* Data tags not supported on MN sets */
8590 if ((md_get_setstatus(setno) & MD_SET_STALE) &&
8591 (!(lbp->lb_flags & MDDB_MNSET)) &&
8592 setno != MD_LOCAL_SET)
8593 if (set_dtag(s, ep))
8594 mdclrerror(ep);
8595
8596 /* Write data tags to all accessible devices */
8597 /* Data tags not supported on MN sets */
8598 if (!(lbp->lb_flags & MDDB_MNSET)) {
8599 (void) dt_write(s);
8600 }
8601
8602 /* write new locator to all devices */
8603 err = writelocall(s);
8604
8605 (void) upd_med(s, "newdev(0)");
8606
8607 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_REPLICA, setno,
8608 md_expldev(clp->l_dev));
8609
8610 computefreeblks(s); /* recompute always it may be smaller */
8611 if (err) {
8612 if (writeretry(s)) {
8613 single_thread_end(s);
8614 mddb_setexit(s);
8615 return (mdmddberror(ep, MDE_DB_NOTNOW, NODEV32, setno));
8616 }
8617 }
8618
8619 single_thread_end(s);
8620 mddb_setexit(s);
8621
8622 return (0);
8623 }
8624
#ifdef DEBUG
/*
 * Debug-only consistency check of one set's in-core database: every
 * directory entry must point at a record block carrying MDDB_MAGIC_RB,
 * and any user data pointer must be greater than 2000.
 * Sets without a database are silently ignored.
 */
static void
mddb_check_set(
	set_t		setno
)
{
	mddb_set_t	*setp;
	mddb_db_t	*dirp;
	mddb_de_ic_t	*dep;
	mddb_rb32_t	*rbp;

	if (! md_set[setno].s_db)
		return;

	setp = (mddb_set_t *)md_set[setno].s_db;

	/* Walk each directory block, then each entry it holds */
	dirp = setp->s_dbp;
	while (dirp != NULL) {
		dep = dirp->db_firstentry;
		while (dep != NULL) {
			rbp = dep->de_rb;
			ASSERT(rbp->rb_magic == MDDB_MAGIC_RB);
			if (dep->de_rb_userdata)
				ASSERT((uintptr_t)dep->de_rb_userdata > 2000);
			dep = dep->de_next;
		}
		dirp = dirp->db_next;
	}
}
#endif /* DEBUG */
8652
8653 /*
8654 * Exported Entry Points
8655 */
#ifdef DEBUG
/*
 * Debug-only: run mddb_check_set() on every set that has a database.
 *
 * A set without a database is skipped rather than ending the walk, so
 * that sets with higher numbers are still checked.
 */
void
mddb_check(void)
{
	int	i;

	for (i = 0; i < md_nsets; i++) {
		if (! md_set[i].s_db)
			continue;

		mddb_check_set(i);
	}

}
#endif /* DEBUG */
8671
8672 int
mddb_configure(mddb_cfgcmd_t command,mddb_config_t * cp)8673 mddb_configure(
8674 mddb_cfgcmd_t command,
8675 mddb_config_t *cp
8676 )
8677 {
8678 mddb_set_t *s;
8679 md_error_t *ep = &cp->c_mde;
8680 int flag = 0;
8681 int err = 0;
8682 set_t setno = cp->c_setno;
8683
8684 mdclrerror(ep);
8685
8686 switch (command) {
8687 case MDDB_NEWDEV:
8688 err = newdev(cp, command, ep);
8689 break;
8690
8691 case MDDB_NEWSIDE:
8692 case MDDB_DELSIDE:
8693 err = delnewside(cp, command, ep);
8694 break;
8695
8696 case MDDB_GETDEV:
8697 case MDDB_DELDEV:
8698 case MDDB_ENDDEV:
8699 err = getdeldev(cp, command, ep);
8700 break;
8701
8702 case MDDB_GETDRVRNAME:
8703 err = getdriver(&cp->c_locator);
8704 break;
8705
8706 case MDDB_USEDEV:
8707 /*
8708 * Note: must allow USEDEV ioctl during upgrade to
8709 * support auto-take disksets.
8710 *
8711 * Also during the set import if the md_devid_destroy
8712 * flag is set then error out
8713 */
8714
8715 if ((cp->c_flags & MDDB_C_IMPORT) && md_devid_destroy)
8716 return (mdmderror(ep, MDE_INVAL_UNIT,
8717 MD_ADM_MINOR));
8718
8719 if (setno >= md_nsets)
8720 return (mdmderror(ep, MDE_INVAL_UNIT,
8721 MD_ADM_MINOR));
8722
8723 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) ==
8724 NULL) {
8725 if ((s = init_set(cp, MDDB_NOINIT, &err)) ==
8726 NULL) {
8727 err = mddbstatus2error(ep, err,
8728 NODEV32, setno);
8729 break;
8730 }
8731 }
8732 if (setno == MD_LOCAL_SET)
8733 flag = MDDB_F_IOCTL;
8734 if (cp->c_locator.l_old_devid) {
8735 md_set_setstatus(setno,
8736 MD_SET_REPLICATED_IMPORT);
8737 }
8738 err = ridev(&s->s_rip, &cp->c_locator, NULL, flag);
8739 mddb_setexit(s);
8740 break;
8741
8742 case MDDB_RELEASESET:
8743 mutex_enter(&mddb_lock);
8744 mddb_unload_set(cp->c_setno);
8745 mutex_exit(&mddb_lock);
8746 break;
8747
8748 case MDDB_SETDID:
8749 err = setdid(cp);
8750 break;
8751
8752 default:
8753 err = mdmddberror(ep, MDE_DB_INVALID, NODEV32,
8754 cp->c_setno);
8755 }
8756
8757 return (err);
8758 }
8759
8760 int
mddb_getoptloc(mddb_optloc_t * ol)8761 mddb_getoptloc(
8762 mddb_optloc_t *ol
8763 )
8764 {
8765 mddb_set_t *s;
8766 mddb_db_t *dbp;
8767 mddb_de_ic_t *dep;
8768 mddb_recid_t id;
8769 set_t setno;
8770
8771 ol->li[0] = -1;
8772 ol->li[1] = -1;
8773
8774 id = ol->recid;
8775 setno = DBSET(id);
8776 if (setno >= md_nsets)
8777 return (EINVAL);
8778
8779 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL)) == NULL)
8780 return (0);
8781
8782 id = DBID(id);
8783 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8784 for (dep = dbp->db_firstentry;
8785 dep != NULL; dep = dep->de_next) {
8786 if (dep->de_recid != id)
8787 continue;
8788 ol->li[0] = dep->de_optinfo[0].o_li;
8789 ol->li[1] = dep->de_optinfo[1].o_li;
8790 mddb_setexit(s);
8791 return (0);
8792 }
8793 }
8794 mddb_setexit(s);
8795 return (0);
8796 }
8797
8798 void
mddb_init(void)8799 mddb_init(void)
8800 {
8801 mddb_set_t *s;
8802
8803 mutex_init(&mddb_lock, NULL, MUTEX_DEFAULT, NULL);
8804 if ((s = init_set(NULL, MDDB_NOINIT, NULL)) != NULL)
8805 mddb_setexit(s);
8806 }
8807
8808
8809 void
mddb_unload(void)8810 mddb_unload(void)
8811 {
8812 int i;
8813
8814 mutex_enter(&mddb_lock);
8815
8816 for (i = 0; i < md_nsets; i++) {
8817 md_clr_setstatus(i, MD_SET_KEEPTAG);
8818 mddb_unload_set(i);
8819 }
8820
8821 crcfreetab();
8822
8823 mutex_exit(&mddb_lock);
8824 }
8825
8826 mddb_recid_t
mddb_createrec(size_t usersize,mddb_type_t type,uint_t type2,md_create_rec_option_t options,set_t setno)8827 mddb_createrec(
8828 size_t usersize, /* size of db record */
8829 mddb_type_t type, /* type1 of db record */
8830 uint_t type2, /* type2 of db record */
8831 md_create_rec_option_t options, /* options for this creation */
8832 set_t setno /* set number to create record in */
8833 )
8834 {
8835 mddb_set_t *s;
8836 mddb_db_t *dbp, *prevdbp, *newdbp;
8837 mddb_db32_t *db32p;
8838 mddb_de_ic_t *dep;
8839 /* LINTED variable unused - used for sizeof calculations */
8840 mddb_de32_t *de32p;
8841 mddb_rb32_t *rbp;
8842 size_t recsize;
8843 ulong_t blkcnt;
8844 ulong_t maxblocks;
8845 size_t desize, desize_ic;
8846 size_t used;
8847 mddb_recid_t newid;
8848 caddr_t tmppnt;
8849 int i, err = 0;
8850 void *userdata;
8851 uint_t flag_type;
8852
8853 #if defined(_ILP32) && !defined(lint)
8854 ASSERT(sizeof (mddb_de_t) == sizeof (mddb_de32_t));
8855 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
8856 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
8857 #endif
8858
8859 /*
8860 * everyone is supposed to sepcify if it's a
8861 * 32 bit or a 64 bit record
8862 */
8863 if ((options &(MD_CRO_32BIT|MD_CRO_64BIT)) == 0) {
8864 return (MDDB_E_INVALID);
8865 }
8866
8867 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
8868 return (err);
8869
8870 if (checkstate(s, MDDB_PROBE)) {
8871 mddb_setexit(s);
8872 return (MDDB_E_NOTNOW);
8873 }
8874
8875 recsize = roundup((sizeof (*rbp) - sizeof (rbp->rb_data)) +
8876 usersize, MDDB_BSIZE);
8877 blkcnt = btodb(recsize);
8878
8879 if (mddb_maxblocks)
8880 maxblocks = mddb_maxblocks;
8881 else
8882 maxblocks = (MDDB_BSIZE - (sizeof (*db32p) + sizeof (*de32p) -
8883 sizeof (de32p->de32_blks))) / sizeof (mddb_block_t);
8884
8885 if (blkcnt > maxblocks) {
8886 mddb_setexit(s);
8887 return (MDDB_E_INVALID);
8888 }
8889 /*
8890 * allocate record block
8891 * and new directory block so to avoid sleeping
8892 * after starting single_thread
8893 */
8894 rbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
8895 if ((options & MD_CRO_OPTIMIZE) == 0)
8896 userdata = kmem_zalloc(usersize, KM_SLEEP);
8897 newdbp = (mddb_db_t *)kmem_zalloc(sizeof (*newdbp), KM_SLEEP);
8898
8899 /*
8900 * if this is the largest record allocate new buffer for
8901 * checkcopy();
8902 */
8903 if (recsize > s->s_databuffer_size) {
8904 tmppnt = (caddr_t)kmem_zalloc(recsize, KM_SLEEP);
8905 /*
8906 * this test is incase when to sleep during kmem_alloc
8907 * and some other task bumped max record size
8908 */
8909 if (recsize > s->s_databuffer_size) {
8910 if (s->s_databuffer_size)
8911 kmem_free(s->s_databuffer,
8912 s->s_databuffer_size);
8913 s->s_databuffer = tmppnt;
8914 s->s_databuffer_size = recsize;
8915 } else {
8916 kmem_free(tmppnt, recsize);
8917 }
8918 }
8919
8920 single_thread_start(s);
8921
8922 newid = 0;
8923 do {
8924 newid++;
8925 if (DBID(newid) == 0) {
8926 kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8927 kmem_free((caddr_t)rbp, ((size_t)recsize));
8928 if ((options & MD_CRO_OPTIMIZE) == 0)
8929 kmem_free(userdata, usersize);
8930 single_thread_end(s);
8931 mddb_setexit(s);
8932 return (MDDB_E_NOTNOW);
8933 }
8934
8935 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8936 for (dep = dbp->db_firstentry; dep;
8937 dep = dep->de_next) {
8938 if (dep->de_recid == newid)
8939 break;
8940 }
8941 if (dep != NULL)
8942 break;
8943 }
8944 } while (dbp);
8945
8946 desize = (sizeof (*de32p) - sizeof (de32p->de32_blks)) +
8947 (sizeof (mddb_block_t) * blkcnt);
8948
8949 /*
8950 * see if a directory block exists which will hold this entry
8951 */
8952 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
8953 used = sizeof (*db32p);
8954 for (dep = dbp->db_firstentry;
8955 dep != NULL; dep = dep->de_next) {
8956 used += sizeof (*de32p) - sizeof (de32p->de32_blks);
8957 used += sizeof (mddb_block_t) * dep->de_blkcount;
8958 }
8959 if ((used + desize) < MDDB_BSIZE)
8960 break;
8961 }
8962 if (dbp) {
8963 kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8964 if (blkcnt > s->s_freeblkcnt) {
8965 kmem_free((caddr_t)rbp, ((size_t)recsize));
8966 if ((options & MD_CRO_OPTIMIZE) == 0)
8967 kmem_free(userdata, usersize);
8968 single_thread_end(s);
8969 mddb_setexit(s);
8970 return (MDDB_E_NOSPACE);
8971 }
8972 prevdbp = NULL;
8973 } else {
8974 /*
8975 * need to add directory block
8976 */
8977 if ((blkcnt + 1) > s->s_freeblkcnt) {
8978 kmem_free((caddr_t)newdbp, sizeof (*newdbp));
8979 kmem_free((caddr_t)rbp, ((size_t)recsize));
8980 if ((options & MD_CRO_OPTIMIZE) == 0)
8981 kmem_free(userdata, usersize);
8982 single_thread_end(s);
8983 mddb_setexit(s);
8984 return (MDDB_E_NOSPACE);
8985 }
8986 for (dbp = s->s_dbp; dbp->db_next; dbp = dbp->db_next)
8987 ;
8988 dbp->db_next = newdbp;
8989 bzero((caddr_t)dbp->db_next, sizeof (*newdbp));
8990 dbp->db_nextblk = getfreeblks(s, 1);
8991 dbp->db_next->db_blknum = dbp->db_nextblk;
8992 prevdbp = dbp;
8993 dbp = dbp->db_next;
8994 dbp->db_nextblk = 0;
8995 dbp->db_firstentry = NULL;
8996 dbp->db_recsum = 0;
8997 dbp->db_magic = MDDB_MAGIC_DB;
8998 }
8999 /*
9000 * ready to add record
9001 */
9002 desize_ic = (sizeof (*dep) - sizeof (dep->de_blks)) +
9003 (sizeof (mddb_block_t) * blkcnt);
9004 if (dbp->db_firstentry) {
9005 for (dep = dbp->db_firstentry; dep->de_next; dep = dep->de_next)
9006 ;
9007 dep->de_next = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
9008 dep = dep->de_next;
9009 } else {
9010 dep = (mddb_de_ic_t *)kmem_zalloc(desize_ic, KM_SLEEP);
9011 dbp->db_firstentry = dep;
9012 }
9013 bzero((caddr_t)dep, desize_ic);
9014 dep->de_recid = newid;
9015 /*
9016 * Optimized records have an owner node associated with them in
9017 * a MN diskset. The owner is only set on a node that is actively
9018 * writing to that record. The other nodes will show that record
9019 * as having an invalid owner. The owner for an optimized record
9020 * is used during fixoptrecord to determine which node should
9021 * write out the record when the replicas associated with that
9022 * optimized record have been changed.
9023 */
9024 if (MD_MNSET_SETNO(s->s_setno)) {
9025 dep->de_owner_nodeid = MD_MN_INVALID_NID;
9026 }
9027 dep->de_type1 = type;
9028 dep->de_type2 = type2;
9029 dep->de_reqsize = usersize;
9030 dep->de_recsize = recsize;
9031 dep->de_blkcount = blkcnt;
9032 flag_type = options &
9033 (MD_CRO_OPTIMIZE | MD_CRO_STRIPE | MD_CRO_MIRROR | MD_CRO_RAID |
9034 MD_CRO_SOFTPART | MD_CRO_TRANS_MASTER | MD_CRO_TRANS_LOG |
9035 MD_CRO_HOTSPARE | MD_CRO_HOTSPARE_POOL | MD_CRO_CHANGELOG);
9036 switch (flag_type) {
9037 case MD_CRO_OPTIMIZE:
9038 dep->de_flags = MDDB_F_OPT;
9039 getoptdev(s, dep, 0);
9040 getoptdev(s, dep, 1);
9041 break;
9042 case MD_CRO_STRIPE:
9043 dep->de_flags = MDDB_F_STRIPE;
9044 break;
9045 case MD_CRO_MIRROR:
9046 dep->de_flags = MDDB_F_MIRROR;
9047 break;
9048 case MD_CRO_RAID:
9049 dep->de_flags = MDDB_F_RAID;
9050 break;
9051 case MD_CRO_SOFTPART:
9052 dep->de_flags = MDDB_F_SOFTPART;
9053 break;
9054 case MD_CRO_TRANS_MASTER:
9055 dep->de_flags = MDDB_F_TRANS_MASTER;
9056 break;
9057 case MD_CRO_TRANS_LOG:
9058 dep->de_flags = MDDB_F_TRANS_LOG;
9059 break;
9060 case MD_CRO_HOTSPARE:
9061 dep->de_flags = MDDB_F_HOTSPARE;
9062 break;
9063 case MD_CRO_HOTSPARE_POOL:
9064 dep->de_flags = MDDB_F_HOTSPARE_POOL;
9065 break;
9066 case MD_CRO_CHANGELOG:
9067 dep->de_flags = MDDB_F_CHANGELOG;
9068 break;
9069 }
9070 /*
9071 * try to get all blocks consecutive. If not possible
9072 * just get them one at a time
9073 */
9074 dep->de_blks[0] = getfreeblks(s, blkcnt);
9075 if (dep->de_blks[0]) {
9076 for (i = 1; i < blkcnt; i++)
9077 dep->de_blks[i] = dep->de_blks[0] + i;
9078 } else {
9079 for (i = 0; i < blkcnt; i++)
9080 dep->de_blks[i] = getfreeblks(s, 1);
9081 }
9082 dep->de_rb = rbp;
9083 bzero((caddr_t)rbp, recsize);
9084 rbp->rb_magic = MDDB_MAGIC_RB;
9085
9086 /* Do we have to create an old style (32 bit) record? */
9087 if (options & MD_CRO_32BIT) {
9088 if (options & MD_CRO_FN)
9089 rbp->rb_revision = MDDB_REV_RBFN;
9090 else
9091 rbp->rb_revision = MDDB_REV_RB;
9092 } else {
9093 if (options & MD_CRO_FN)
9094 rbp->rb_revision = MDDB_REV_RB64FN;
9095 else
9096 rbp->rb_revision = MDDB_REV_RB64;
9097 }
9098
9099 /* set de_rb_userdata for non optimization records */
9100 if ((options & MD_CRO_OPTIMIZE) == 0) {
9101 dep->de_rb_userdata = userdata;
9102 }
9103
9104 uniqtime32(&rbp->rb_timestamp);
9105 /* Generate the crc for this record */
9106 rec_crcgen(s, dep, rbp);
9107 tmppnt = (caddr_t)rbp;
9108 /*
9109 * the following code writes new records to all instances of
9110 * the data base. Writing one block at a time to each instance
9111 * is safe because they are not yet in a directory entry which
9112 * has been written to the data base
9113 */
9114 err = 0;
9115 if ((options & MD_CRO_OPTIMIZE) == 0) {
9116 for (i = 0; i < blkcnt; i++) {
9117 err |= writeall(s, (caddr_t)tmppnt,
9118 dep->de_blks[i], 1, 0);
9119 tmppnt += MDDB_BSIZE;
9120 }
9121 } else {
9122 if ((MD_MNSET_SETNO(s->s_setno)) &&
9123 md_set[s->s_setno].s_am_i_master) {
9124 /*
9125 * If a MN diskset then only master writes out newly
9126 * created optimized record.
9127 */
9128 err |= writeoptrecord(s, dep);
9129 }
9130 }
9131 uniqtime32(&dbp->db_timestamp);
9132 dbp->db_revision = MDDB_REV_DB;
9133 /* Don't include opt resync and change log records in global XOR */
9134 if (!(dep->de_flags & MDDB_F_OPT) &&
9135 !(dep->de_flags & MDDB_F_CHANGELOG))
9136 dbp->db_recsum ^= rbp->rb_checksum;
9137 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9138 create_db32rec(db32p, dbp);
9139 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9140 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9141 if (prevdbp) {
9142 dbp = prevdbp;
9143 uniqtime32(&dbp->db_timestamp);
9144 dbp->db_revision = MDDB_REV_DB;
9145 create_db32rec(db32p, dbp);
9146 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9147 err |= writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0);
9148 }
9149 kmem_free((caddr_t)db32p, MDDB_BSIZE);
9150 if (err) {
9151 if (writeretry(s)) {
9152 s->s_zombie = newid;
9153 single_thread_end(s);
9154 mddb_setexit(s);
9155 return (MDDB_E_NOTNOW);
9156 }
9157 }
9158 single_thread_end(s);
9159 mddb_setexit(s);
9160
9161 ASSERT((newid & MDDB_SETMASK) == 0);
9162 return (MAKERECID(setno, newid));
9163 }
9164
9165 int
mddb_deleterec(mddb_recid_t id)9166 mddb_deleterec(
9167 mddb_recid_t id
9168 )
9169 {
9170 mddb_set_t *s;
9171 mddb_db_t *dbp;
9172 mddb_db32_t *db32p;
9173 mddb_de_ic_t *dep, *dep1;
9174 int i;
9175
9176 #if defined(_ILP32) && !defined(lint)
9177 ASSERT(sizeof (mddb_db_t) == sizeof (mddb_db32_t));
9178 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
9179 #endif
9180
9181 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9182 ASSERT(s != NULL);
9183
9184 id = DBID(id);
9185 if (checkstate(s, MDDB_PROBE)) {
9186 mddb_setexit(s);
9187 return (MDDB_E_NOTNOW);
9188 }
9189
9190 ASSERT(s->s_lbp != NULL);
9191 single_thread_start(s);
9192
9193 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9194 dep1 = NULL;
9195 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9196 if (dep->de_recid == id)
9197 break;
9198 dep1 = dep;
9199 }
9200 if (dep != NULL)
9201 break;
9202 }
9203 /*
9204 * no such record
9205 */
9206 if (dep == NULL) {
9207 single_thread_end(s);
9208 ASSERT(s->s_staledeletes != 0);
9209 s->s_staledeletes--;
9210 mddb_setexit(s);
9211 return (0);
9212 }
9213
9214 if (!(dep->de_flags & MDDB_F_OPT) &&
9215 !(dep->de_flags & MDDB_F_CHANGELOG)) {
9216 dbp->db_recsum ^= dep->de_rb->rb_checksum;
9217 dbp->db_recsum ^= dep->de_rb->rb_checksum_fiddle;
9218 }
9219
9220 if (dep->de_rb_userdata != NULL) {
9221 if (dep->de_icreqsize)
9222 kmem_free(dep->de_rb_userdata_ic, dep->de_icreqsize);
9223 else
9224 kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9225 }
9226
9227 kmem_free((caddr_t)dep->de_rb, dep->de_recsize);
9228
9229 for (i = 0; i < dep->de_blkcount; i++)
9230 blkfree(s, dep->de_blks[i]);
9231 if (dep1)
9232 dep1->de_next = dep->de_next;
9233 else
9234 dbp->db_firstentry = dep->de_next;
9235
9236 kmem_free(dep, sizeofde(dep));
9237
9238 uniqtime32(&dbp->db_timestamp);
9239 dbp->db_revision = MDDB_REV_DB;
9240 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE, KM_SLEEP);
9241 create_db32rec(db32p, dbp);
9242 crcgen(db32p, &db32p->db32_checksum, MDDB_BSIZE, NULL);
9243 if (writeall(s, (caddr_t)db32p, dbp->db_blknum, 1, 0)) {
9244 if (writeretry(s)) {
9245 /*
9246 * staledelete is used to mark deletes which failed.
9247 * its only use is to not panic when the user retries
9248 * the delete once the database is active again
9249 */
9250 single_thread_end(s);
9251 s->s_staledeletes++;
9252 kmem_free((caddr_t)db32p, MDDB_BSIZE);
9253 mddb_setexit(s);
9254 return (MDDB_E_NOTNOW);
9255 }
9256 }
9257 single_thread_end(s);
9258 kmem_free((caddr_t)db32p, MDDB_BSIZE);
9259 mddb_setexit(s);
9260 return (0);
9261 }
9262
9263 mddb_recid_t
mddb_getnextrec(mddb_recid_t id,mddb_type_t typ,uint_t type2)9264 mddb_getnextrec(
9265 mddb_recid_t id,
9266 mddb_type_t typ,
9267 uint_t type2
9268 )
9269 {
9270 mddb_set_t *s;
9271 mddb_db_t *dbp;
9272 mddb_de_ic_t *dep;
9273 int searching, err;
9274 set_t setno;
9275
9276 setno = DBSET(id);
9277 id = DBID(id);
9278 searching = id;
9279
9280 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
9281 return (err);
9282
9283 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9284 for (dep = dbp->db_firstentry;
9285 dep != NULL; dep = dep->de_next) {
9286 if (searching) {
9287 if (dep->de_recid == id)
9288 searching = 0;
9289 } else {
9290 if ((typ == MDDB_ALL || dep->de_type1 == typ) &&
9291 (type2 == 0 || dep->de_type2 == type2)) {
9292 id = dep->de_recid;
9293 mddb_setexit(s);
9294 ASSERT((id & MDDB_SETMASK) == 0);
9295 return (MAKERECID(setno, id));
9296 }
9297 }
9298 }
9299 }
9300
9301 mddb_setexit(s);
9302
9303 if (searching)
9304 return (MDDB_E_NORECORD);
9305 return (0);
9306 }
9307
9308 void *
mddb_getrecaddr(mddb_recid_t id)9309 mddb_getrecaddr(
9310 mddb_recid_t id
9311 )
9312 {
9313 mddb_set_t *s;
9314 mddb_db_t *dbp;
9315 mddb_de_ic_t *dep;
9316 void *rval;
9317
9318 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9319 return (NULL);
9320
9321 id = DBID(id);
9322 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9323 for (dep = dbp->db_firstentry;
9324 dep != NULL; dep = dep->de_next) {
9325 if (dep->de_recid != id)
9326 continue;
9327 if (dep->de_rb_userdata)
9328 rval = (void *)dep->de_rb_userdata;
9329 else
9330 rval = (void *)dep->de_rb->rb_data;
9331 mddb_setexit(s);
9332 return (rval);
9333 }
9334 }
9335
9336 mddb_setexit(s);
9337 return (NULL);
9338 }
9339
9340
9341 mddb_de_ic_t *
mddb_getrecdep(mddb_recid_t id)9342 mddb_getrecdep(
9343 mddb_recid_t id
9344 )
9345 {
9346 mddb_set_t *s;
9347 mddb_db_t *dbp;
9348 mddb_de_ic_t *dep;
9349
9350 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9351 return (NULL);
9352
9353 id = DBID(id);
9354 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9355 for (dep = dbp->db_firstentry;
9356 dep != NULL; dep = dep->de_next) {
9357 if (dep->de_recid != id)
9358 continue;
9359 mddb_setexit(s);
9360 return (dep);
9361 }
9362 }
9363
9364 mddb_setexit(s);
9365 return (NULL);
9366 }
9367
9368 void *
mddb_getrecaddr_resize(mddb_recid_t id,size_t icsize,off_t off)9369 mddb_getrecaddr_resize(
9370 mddb_recid_t id,
9371 size_t icsize,
9372 off_t off
9373 )
9374 {
9375 mddb_set_t *s;
9376 mddb_db_t *dbp;
9377 mddb_de_ic_t *dep;
9378 void *rval = NULL;
9379
9380 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
9381 return (NULL);
9382
9383 id = DBID(id);
9384 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9385 for (dep = dbp->db_firstentry;
9386 dep != NULL; dep = dep->de_next) {
9387 if (dep->de_recid != id)
9388 continue;
9389 if (dep->de_rb_userdata)
9390 rval = (void *)dep->de_rb_userdata;
9391 else
9392 rval = (void *)dep->de_rb->rb_data;
9393 break;
9394 }
9395 if (rval != NULL)
9396 break;
9397 }
9398
9399 if (rval == NULL) {
9400 mddb_setexit(s);
9401 return (NULL);
9402 }
9403
9404 if (dep->de_rb_userdata) {
9405 caddr_t nud;
9406
9407 if (dep->de_icreqsize || (dep->de_reqsize >= icsize)) {
9408 mddb_setexit(s);
9409 return (rval);
9410 }
9411 ASSERT((dep->de_reqsize + off) <= icsize);
9412 nud = kmem_zalloc(icsize, KM_SLEEP);
9413 bcopy(dep->de_rb_userdata, nud + off, dep->de_reqsize);
9414 kmem_free(dep->de_rb_userdata, dep->de_reqsize);
9415 dep->de_rb_userdata = nud + off;
9416 dep->de_rb_userdata_ic = nud;
9417 dep->de_icreqsize = icsize;
9418 rval = nud;
9419 } else {
9420 size_t recsize;
9421 /* LINTED variable unused - used for sizeof calculations */
9422 mddb_rb32_t *nrbp;
9423
9424 recsize = roundup((sizeof (*nrbp) - sizeof (nrbp->rb_data)) +
9425 icsize, MDDB_BSIZE);
9426 if (dep->de_recsize < recsize)
9427 cmn_err(CE_PANIC, "mddb_getrecaddr_resize: only "
9428 "nonoptimized records can be resized\n");
9429 }
9430
9431 mddb_setexit(s);
9432 return (rval);
9433 }
9434
9435 int
mddb_getrecprivate(mddb_recid_t id)9436 mddb_getrecprivate(
9437 mddb_recid_t id
9438 )
9439 {
9440 mddb_set_t *s;
9441 mddb_db_t *dbp;
9442 mddb_de_ic_t *dep;
9443 int err = 0;
9444 int private;
9445
9446 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9447 return (err);
9448
9449 id = DBID(id);
9450 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9451 for (dep = dbp->db_firstentry;
9452 dep != NULL; dep = dep->de_next) {
9453 if (dep->de_recid != id)
9454 continue;
9455 private = (int)dep->de_rb->rb_private;
9456 mddb_setexit(s);
9457 return (private);
9458 }
9459 }
9460
9461 mddb_setexit(s);
9462 return (MDDB_E_NORECORD);
9463 }
9464
9465 void
mddb_setrecprivate(mddb_recid_t id,uint_t private)9466 mddb_setrecprivate(
9467 mddb_recid_t id,
9468 uint_t private
9469 )
9470 {
9471 mddb_set_t *s;
9472 mddb_db_t *dbp;
9473 mddb_de_ic_t *dep;
9474
9475 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL) {
9476 ASSERT(0);
9477 return;
9478 }
9479
9480 id = DBID(id);
9481 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9482 for (dep = dbp->db_firstentry;
9483 dep != NULL; dep = dep->de_next) {
9484 if (dep->de_recid != id)
9485 continue;
9486 dep->de_rb->rb_private = private;
9487 mddb_setexit(s);
9488 return;
9489 }
9490 }
9491
9492 mddb_setexit(s);
9493 ASSERT(0);
9494 }
9495
9496 mddb_type_t
mddb_getrectype1(mddb_recid_t id)9497 mddb_getrectype1(
9498 mddb_recid_t id
9499 )
9500 {
9501 mddb_set_t *s;
9502 mddb_db_t *dbp;
9503 mddb_de_ic_t *dep;
9504 int err = 0;
9505 mddb_type_t rval;
9506
9507 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9508 return (err);
9509
9510 id = DBID(id);
9511 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9512 for (dep = dbp->db_firstentry;
9513 dep != NULL; dep = dep->de_next) {
9514 if (dep->de_recid != id)
9515 continue;
9516 rval = dep->de_type1;
9517 mddb_setexit(s);
9518 return (rval);
9519 }
9520 }
9521
9522 mddb_setexit(s);
9523 return (MDDB_E_NORECORD);
9524 }
9525
9526 int
mddb_getrectype2(mddb_recid_t id)9527 mddb_getrectype2(
9528 mddb_recid_t id
9529 )
9530 {
9531 mddb_set_t *s;
9532 mddb_db_t *dbp;
9533 mddb_de_ic_t *dep;
9534 int err = 0;
9535 int rval;
9536
9537 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9538 return (err);
9539
9540 id = DBID(id);
9541 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9542 for (dep = dbp->db_firstentry;
9543 dep != NULL; dep = dep->de_next) {
9544 if (dep->de_recid != id)
9545 continue;
9546 rval = (int)dep->de_type2;
9547 mddb_setexit(s);
9548 return (rval);
9549 }
9550 }
9551
9552 mddb_setexit(s);
9553 return (MDDB_E_NORECORD);
9554 }
9555
9556 int
mddb_getrecsize(mddb_recid_t id)9557 mddb_getrecsize(
9558 mddb_recid_t id
9559 )
9560 {
9561 mddb_set_t *s;
9562 mddb_db_t *dbp;
9563 mddb_de_ic_t *dep;
9564 int err = 0;
9565 int rval;
9566
9567 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9568 return (err);
9569
9570 id = DBID(id);
9571 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9572 for (dep = dbp->db_firstentry;
9573 dep != NULL; dep = dep->de_next) {
9574 if (dep->de_recid != id)
9575 continue;
9576 rval = (int)dep->de_reqsize;
9577 mddb_setexit(s);
9578 return (rval);
9579 }
9580 }
9581
9582 mddb_setexit(s);
9583 return (MDDB_E_NORECORD);
9584 }
9585
9586
9587 mddb_recstatus_t
mddb_getrecstatus(mddb_recid_t id)9588 mddb_getrecstatus(
9589 mddb_recid_t id
9590 )
9591 {
9592 mddb_set_t *s;
9593 mddb_db_t *dbp;
9594 mddb_de_ic_t *dep;
9595 int err = 0;
9596 mddb_recstatus_t e_err;
9597
9598 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, &err)) == NULL)
9599 return ((mddb_recstatus_t)err);
9600
9601 id = DBID(id);
9602 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9603 for (dep = dbp->db_firstentry;
9604 dep != NULL; dep = dep->de_next) {
9605 if (dep->de_recid == id)
9606 break;
9607 }
9608 if (dep)
9609 break;
9610 }
9611
9612 e_err = MDDB_OK;
9613
9614 if (! dep)
9615 e_err = MDDB_NORECORD;
9616 else if (! dep->de_rb->rb_commitcnt)
9617 e_err = MDDB_NODATA;
9618 else if (md_get_setstatus(s->s_setno) & MD_SET_STALE)
9619 e_err = MDDB_STALE;
9620
9621 mddb_setexit(s);
9622 return (e_err);
9623 }
9624
9625 static int mddb_commitrec_retries = 5;
9626
9627 /*
9628 * Commit given record to disk.
9629 * If committing an optimized record, do not call
9630 * with md ioctl lock held.
9631 */
9632 int
mddb_commitrec(mddb_recid_t id)9633 mddb_commitrec(
9634 mddb_recid_t id
9635 )
9636 {
9637 mddb_set_t *s;
9638 mddb_db_t *dbp;
9639 mddb_de_ic_t *dep;
9640 mddb_recid_t ids[2];
9641 mddb_rb32_t *rbp;
9642 static int err = 0;
9643 md_mn_msg_mddb_optrecerr_t *msg_recerr;
9644 md_mn_kresult_t *kres;
9645 mddb_lb_t *lbp;
9646 mddb_mnlb_t *mnlbp;
9647 mddb_locator_t *lp;
9648 mddb_mnsidelocator_t *mnslp;
9649 mddb_drvnm_t *dn;
9650 int li;
9651 md_replica_recerr_t *recerr;
9652 int i, j;
9653 int rval;
9654 int hit_err = 0;
9655 int retry = mddb_commitrec_retries;
9656 int gave_up = 0;
9657
9658 s = mddb_setenter(DBSET(id), MDDB_NOINIT, NULL);
9659 ASSERT(s != NULL);
9660
9661 if (checkstate(s, MDDB_PROBE)) {
9662 mddb_setexit(s);
9663 return (MDDB_E_NOTNOW);
9664 }
9665
9666 if (DBID(id) == 0) {
9667 mddb_setexit(s);
9668 return (0);
9669 }
9670
9671 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9672 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9673 if (dep->de_recid == DBID(id))
9674 break;
9675 }
9676 if (dep)
9677 break;
9678 }
9679
9680 if (dep == NULL) {
9681 mddb_setexit(s);
9682 return (MDDB_E_NORECORD);
9683 }
9684
9685 if (! (dep->de_flags & MDDB_F_OPT)) {
9686 ids[0] = id;
9687 ids[1] = 0;
9688 mddb_setexit(s);
9689 return (mddb_commitrecs(ids));
9690 }
9691
9692 /*
9693 * following code allows multiple processes to be doing
9694 * optimization commits in parallel.
9695 * NOTE: if lots of optimization commits then the lock
9696 * will not get released until it winds down
9697 */
9698 if (s->s_optwaiterr) {
9699 while (s->s_optwaiterr) {
9700 s->s_opthungerr = 1;
9701 cv_wait(&s->s_opthungerr_cv, SETMUTEX(s->s_setno));
9702 }
9703 if (checkstate(s, MDDB_PROBE)) {
9704 mddb_setexit(s);
9705 return (MDDB_E_NOTNOW);
9706 }
9707 }
9708 if (s->s_optcmtcnt++ == 0) {
9709 single_thread_start(s);
9710 s->s_opthavelck = 1;
9711 if (s->s_optwantlck) {
9712 cv_broadcast(&s->s_optwantlck_cv);
9713 s->s_optwantlck = 0;
9714 }
9715 } else {
9716 while (! s->s_opthavelck) {
9717 s->s_optwantlck = 1;
9718 cv_wait(&s->s_optwantlck_cv, SETMUTEX(s->s_setno));
9719 }
9720 }
9721
9722 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9723 for (dep = dbp->db_firstentry; dep; dep = dep->de_next) {
9724 if (dep->de_recid == DBID(id))
9725 break;
9726 }
9727 if (dep)
9728 break;
9729 }
9730
9731 if (dep == NULL) {
9732 if (! (--s->s_optcmtcnt)) {
9733 single_thread_end(s);
9734 s->s_opthavelck = 0;
9735 }
9736 mddb_setexit(s);
9737 return (MDDB_E_NORECORD);
9738 }
9739
9740 rbp = dep->de_rb;
9741 rbp->rb_commitcnt++;
9742 uniqtime32(&rbp->rb_timestamp);
9743 /* Generate the crc for this record */
9744 rec_crcgen(s, dep, rbp);
9745
9746 if (writeoptrecord(s, dep)) {
9747 if (MD_MNSET_SETNO(s->s_setno)) {
9748 hit_err = 1;
9749 }
9750 s->s_optwaiterr++;
9751 }
9752 if (MD_MNSET_SETNO(s->s_setno)) {
9753 /* If last thread out, release single_thread_start */
9754 if (! (--s->s_optcmtcnt)) {
9755 single_thread_end(s);
9756 s->s_opthavelck = 0;
9757 }
9758 /*
9759 * If this thread had a writeoptrecords failure, then
9760 * need to send message to master.
9761 * But, multiple threads could all be running on the
9762 * same single_thread_start, so serialize the threads
9763 * by making each thread grab single_thread_start.
9764 *
9765 * After return from sending message to master message,
9766 * replicas associated with optimized record will havei
9767 * been changed (via a callback from the master to all
9768 * nodes), so retry call to writeoptrecord.
9769 * This code is replacing the call to writeretry that
9770 * occurs for the local and traditional disksets.
9771 */
9772 if (hit_err) {
9773 single_thread_start(s);
9774 /*
9775 * If > 50% of replicas are alive then continue
9776 * to send message to master until writeoptrecord
9777 * succeeds. For now, assume that minor name,
9778 * major number on this node is the same as on
9779 * the master node. Once devids are turned on
9780 * for MN disksets, can send devid.
9781 */
9782 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
9783 msg_recerr = kmem_zalloc(
9784 sizeof (md_mn_msg_mddb_optrecerr_t), KM_SLEEP);
9785 while (!(md_get_setstatus(s->s_setno) &
9786 MD_SET_TOOFEW)) {
9787 bzero((caddr_t)msg_recerr,
9788 sizeof (md_mn_msg_mddb_optrecerr_t));
9789 lbp = s->s_lbp;
9790 mnlbp = (mddb_mnlb_t *)lbp;
9791 for (i = 0; i < 2; i++) {
9792 li = dep->de_optinfo[i].o_li;
9793 lp = &lbp->lb_locators[li];
9794 for (j = 0; j < MD_MNMAXSIDES; j++) {
9795 mnslp =
9796 &mnlbp->
9797 lb_mnsidelocators[j][li];
9798 if (mnslp->mnl_sideno ==
9799 s->s_sideno)
9800 break;
9801 }
9802 if (j == MD_MNMAXSIDES)
9803 continue;
9804
9805 dn = &lbp->
9806 lb_drvnm[mnslp->mnl_drvnm_index];
9807 recerr = &msg_recerr->msg_recerr[i];
9808 recerr->r_li = li;
9809 recerr->r_flags =
9810 dep->de_optinfo[i].o_flags;
9811 recerr->r_blkno = lp->l_blkno;
9812 recerr->r_mnum = md_getminor(lp->l_dev);
9813 (void) strncpy(recerr->r_driver_name,
9814 dn->dn_data, MD_MAXDRVNM);
9815 }
9816
9817 /* Release locks */
9818 single_thread_end(s);
9819 mutex_exit(SETMUTEX(s->s_setno));
9820
9821 /*
9822 * Send message to master about optimized
9823 * record failure. After return, master
9824 * should have marked failed replicas
9825 * and sent parse message to slaves causing
9826 * slaves to have fixed up the optimized
9827 * record.
9828 * On return from ksend_message, retry
9829 * the write since this node should have fixed
9830 * the optimized resync records it owns.
9831 */
9832 rval = mdmn_ksend_message(s->s_setno,
9833 MD_MN_MSG_MDDB_OPTRECERR,
9834 MD_MSGF_NO_BCAST, 0,
9835 (char *)msg_recerr,
9836 sizeof (md_mn_msg_mddb_optrecerr_t),
9837 kres);
9838 if (!MDMN_KSEND_MSG_OK(rval, kres)) {
9839 cmn_err(CE_WARN, "mddb_commitrec: "
9840 "Unable to send optimized "
9841 "resync record failure "
9842 "message to other nodes in "
9843 "diskset %s\n", s->s_setname);
9844 mdmn_ksend_show_error(rval, kres,
9845 "MD_MN_MSG_MDDB_OPTRECERR");
9846 }
9847
9848 /* Regrab locks */
9849 mutex_enter(SETMUTEX(s->s_setno));
9850 single_thread_start(s);
9851
9852 /* Start over in case mddb changed */
9853 for (dbp = s->s_dbp; dbp != NULL;
9854 dbp = dbp->db_next) {
9855 for (dep = dbp->db_firstentry; dep;
9856 dep = dep->de_next) {
9857 if (dep->de_recid == DBID(id))
9858 break;
9859 }
9860 if (dep)
9861 break;
9862 }
9863 if (dep) {
9864 rbp = dep->de_rb;
9865 rbp->rb_commitcnt++;
9866 uniqtime32(&rbp->rb_timestamp);
9867 /* Generate the crc for this record */
9868 rec_crcgen(s, dep, rbp);
9869
9870 /*
9871 * If writeoptrecord succeeds, then
9872 * break out.
9873 */
9874 if (!(writeoptrecord(s, dep)))
9875 break;
9876 }
9877 if (--retry == 0) {
9878 cmn_err(CE_WARN, "mddb_commitrec: "
9879 "giving up writing optimized "
9880 "resync record for "
9881 "diskset %s, device %s,%d "
9882 "blkno 0x%x, flags 0x%x\n",
9883 s->s_setname, recerr->r_driver_name,
9884 recerr->r_mnum, recerr->r_blkno,
9885 recerr->r_flags);
9886 gave_up++;
9887 break;
9888 }
9889 }
9890 kmem_free(kres, sizeof (md_mn_kresult_t));
9891 kmem_free(msg_recerr,
9892 sizeof (md_mn_msg_mddb_optrecerr_t));
9893
9894 /* Resync record should be fixed - if possible */
9895 s->s_optwaiterr--;
9896 if (s->s_optwaiterr == 0) {
9897 /* All errors have been handled */
9898 if (s->s_opthungerr) {
9899 s->s_opthungerr = 0;
9900 cv_broadcast(&s->s_opthungerr_cv);
9901 }
9902 }
9903 single_thread_end(s);
9904 mddb_setexit(s);
9905 if (md_get_setstatus(s->s_setno) & MD_SET_TOOFEW) {
9906 return (MDDB_E_NOTNOW);
9907 } else if (gave_up) {
9908 return (MDDB_E_STALE);
9909 } else {
9910 return (0);
9911 }
9912 }
9913 } else {
9914 /* If set is a traditional or local set */
9915 if (! (--s->s_optcmtcnt)) {
9916 err = 0;
9917 if (s->s_optwaiterr) {
9918 err = writeretry(s);
9919 s->s_optwaiterr = 0;
9920 if (s->s_opthungerr) {
9921 s->s_opthungerr = 0;
9922 cv_broadcast(&s->s_opthungerr_cv);
9923 }
9924 }
9925 single_thread_end(s);
9926 s->s_opthavelck = 0;
9927 mddb_setexit(s);
9928 if (err)
9929 return (MDDB_E_NOTNOW);
9930 return (0);
9931 }
9932 if (s->s_optwaiterr) {
9933 while (s->s_optwaiterr) {
9934 s->s_opthungerr = 1;
9935 cv_wait(&s->s_opthungerr_cv,
9936 SETMUTEX(s->s_setno));
9937 }
9938 if (checkstate(s, MDDB_NOPROBE)) {
9939 mddb_setexit(s);
9940 return (MDDB_E_NOTNOW);
9941 }
9942 }
9943 }
9944
9945 mddb_setexit(s);
9946 return (0);
9947 }
9948
9949 int
mddb_commitrecs(mddb_recid_t ids[])9950 mddb_commitrecs(
9951 mddb_recid_t ids[]
9952 )
9953 {
9954 mddb_set_t *s;
9955 mddb_db_t *dbp;
9956 mddb_de_ic_t *dep;
9957 mddb_rb32_t *rbp;
9958 mddb_rb32_t *saverbp;
9959 mddb_lb_t *lbp;
9960 int li;
9961 uint_t checksum;
9962 mddb_recid_t *idp;
9963 int err = 0;
9964 set_t setno;
9965
9966 if (panicstr)
9967 cmn_err(CE_PANIC, "md: mddb: commit not allowed");
9968
9969 /*
9970 * scan through and make sure ids are from the same set
9971 */
9972 setno = DBSET(ids[0]);
9973 for (idp = ids; *idp != NULL; idp++)
9974 ASSERT(DBSET(*idp) == setno);
9975
9976 s = mddb_setenter(setno, MDDB_MUSTEXIST, NULL);
9977
9978 if (checkstate(s, MDDB_PROBE)) {
9979 mddb_setexit(s);
9980 return (MDDB_E_NOTNOW);
9981 }
9982
9983 ASSERT(s->s_lbp != NULL);
9984 err = 0;
9985
9986 if (! ids[0]) {
9987 mddb_setexit(s);
9988 return (0);
9989 }
9990
9991 single_thread_start(s);
9992 /*
9993 * scan through and make sure ids all exist
9994 */
9995 for (idp = ids; *idp != NULL; idp++) {
9996 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
9997 for (dep = dbp->db_firstentry; dep;
9998 dep = dep->de_next) {
9999 if (dep->de_recid == DBID(*idp))
10000 break;
10001 }
10002 if (dep != NULL)
10003 break;
10004 }
10005 if (dep == NULL) {
10006 single_thread_end(s);
10007 mddb_setexit(s);
10008 return (MDDB_E_NORECORD);
10009 }
10010 }
10011
10012 /*
10013 * scan through records fix commit counts and
10014 * zero fiddles and update time stamp and rechecksum record
10015 */
10016 checksum = 0;
10017 idp = ids;
10018 saverbp = NULL;
10019 while (*idp) {
10020 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10021 for (dep = dbp->db_firstentry; dep;
10022 dep = dep->de_next) {
10023 if (dep->de_recid == DBID(*idp))
10024 break;
10025 }
10026 if (dep != NULL)
10027 break;
10028 }
10029 rbp = dep->de_rb;
10030 ASSERT(! (dep->de_flags & MDDB_F_OPT));
10031
10032 getuserdata(setno, dep);
10033 /* Don't do fiddles for CHANGE LOG records */
10034 if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
10035 checksum ^= rbp->rb_checksum_fiddle;
10036 rbp->rb_checksum_fiddle = 0;
10037 checksum ^= rbp->rb_checksum;
10038 saverbp = rbp;
10039 }
10040 rbp->rb_commitcnt++;
10041 uniqtime32(&rbp->rb_timestamp);
10042 /* Generate the crc for this record */
10043 rec_crcgen(s, dep, rbp);
10044
10045 /* Don't do fiddles for CHANGE LOG records */
10046 if (!(dep->de_flags & MDDB_F_CHANGELOG)) {
10047 checksum ^= rbp->rb_checksum;
10048 }
10049 idp++;
10050 }
10051
10052 if (saverbp)
10053 saverbp->rb_checksum_fiddle = checksum;
10054
10055 /*
10056 * If this is a MN set but we are not the master, then we are not
10057 * supposed to update the mddb on disk. So we finish at this point.
10058 */
10059 if ((setno != MD_LOCAL_SET) && (s->s_lbp->lb_flags & MDDB_MNSET) &&
10060 (md_set[setno].s_am_i_master == 0)) {
10061 single_thread_end(s);
10062 mddb_setexit(s);
10063 return (0);
10064 }
10065
10066 lbp = s->s_lbp;
10067 for (li = 0; li < lbp->lb_loccnt; li++) {
10068 if (! (lbp->lb_locators[li].l_flags & MDDB_F_ACTIVE))
10069 continue;
10070
10071 idp = ids;
10072 while (*idp) {
10073 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
10074 dep = dbp->db_firstentry;
10075 while (dep && (dep->de_recid != DBID(*idp)))
10076 dep = dep->de_next;
10077 if (dep != NULL)
10078 break;
10079 }
10080 rbp = dep->de_rb;
10081 err = wrtblklst(s, (caddr_t)rbp, dep->de_blks,
10082 dep->de_blkcount, li, (mddb_bf_t **)0,
10083 MDDB_WR_ONLY_MASTER);
10084 if (err)
10085 break;
10086 idp++;
10087 }
10088 if (err)
10089 break;
10090 }
10091 if (err) {
10092 if (writeretry(s)) {
10093 single_thread_end(s);
10094 mddb_setexit(s);
10095 return (MDDB_E_NOTNOW);
10096 }
10097 }
10098 single_thread_end(s);
10099 mddb_setexit(s);
10100 return (0);
10101 }
10102
10103 mddb_recid_t
mddb_makerecid(set_t setno,mddb_recid_t id)10104 mddb_makerecid(
10105 set_t setno,
10106 mddb_recid_t id
10107 )
10108 {
10109 return (MAKERECID(setno, id));
10110 }
10111
10112 set_t
mddb_getsetnum(mddb_recid_t id)10113 mddb_getsetnum(
10114 mddb_recid_t id
10115 )
10116 {
10117 return (DBSET(id));
10118 }
10119
10120 char *
mddb_getsetname(set_t setno)10121 mddb_getsetname(
10122 set_t setno
10123 )
10124 {
10125 return (((mddb_set_t *)md_set[setno].s_db)->s_setname);
10126 }
10127
10128 side_t
mddb_getsidenum(set_t setno)10129 mddb_getsidenum(
10130 set_t setno
10131 )
10132 {
10133 if (md_set[setno].s_db)
10134 return (((mddb_set_t *)md_set[setno].s_db)->s_sideno);
10135 return (0);
10136 }
10137
10138 int
mddb_ownset(set_t setno)10139 mddb_ownset(
10140 set_t setno
10141 )
10142 {
10143 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) && md_set[setno].s_db)
10144 return (1);
10145
10146 if (md_set[setno].s_db && ((mddb_set_t *)md_set[setno].s_db)->s_lbp)
10147 return (1);
10148
10149 return (0);
10150 }
10151
10152 /*ARGSUSED*/
10153 int
getmed_ioctl(mddb_med_parm_t * medpp,int mode)10154 getmed_ioctl(mddb_med_parm_t *medpp, int mode)
10155 {
10156 mddb_set_t *s;
10157 int err = 0;
10158 set_t setno = medpp->med_setno;
10159 md_error_t *ep = &medpp->med_mde;
10160
10161 mdclrerror(ep);
10162
10163 if (setno >= md_nsets)
10164 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10165
10166 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10167 return (0);
10168
10169 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10170 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10171
10172 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10173 return (mddbstatus2error(ep, err, NODEV32, setno));
10174
10175 medpp->med = s->s_med; /* structure assignment */
10176
10177 mddb_setexit(s);
10178
10179 return (0);
10180 }
10181
10182 int
setmed_ioctl(mddb_med_parm_t * medpp,int mode)10183 setmed_ioctl(mddb_med_parm_t *medpp, int mode)
10184 {
10185
10186 mddb_set_t *s;
10187 int err = 0;
10188 set_t setno = medpp->med_setno;
10189 md_error_t *ep = &medpp->med_mde;
10190
10191 mdclrerror(ep);
10192
10193 if ((mode & FWRITE) == 0)
10194 return (mdsyserror(ep, EACCES));
10195
10196 /*
10197 * This should be the only thing that prevents LOCAL sets from having
10198 * mediators, at least in the kernel, userland needs to have some code
10199 * written.
10200 */
10201 if (setno == MD_LOCAL_SET)
10202 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10203
10204 if (setno >= md_nsets)
10205 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10206
10207 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10208 return (0);
10209
10210 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10211 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10212
10213 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10214 return (mddbstatus2error(ep, err, NODEV32, setno));
10215
10216 s->s_med = medpp->med; /* structure assignment */
10217
10218 mddb_setexit(s);
10219
10220 return (0);
10221 }
10222
10223 int
updmed_ioctl(mddb_med_upd_parm_t * medpp,int mode)10224 updmed_ioctl(mddb_med_upd_parm_t *medpp, int mode)
10225 {
10226
10227 mddb_set_t *s;
10228 int err = 0;
10229 set_t setno = medpp->med_setno;
10230 md_error_t *ep = &medpp->med_mde;
10231
10232 mdclrerror(ep);
10233
10234 if ((mode & FWRITE) == 0)
10235 return (mdsyserror(ep, EACCES));
10236
10237 if (setno >= md_nsets)
10238 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10239
10240 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10241 return (0);
10242
10243 if ((md_get_setstatus(setno) & MD_SET_SNARFED) == 0)
10244 return (mdmddberror(ep, MDE_DB_NOTOWNER, NODEV32, setno));
10245
10246 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10247 return (mddbstatus2error(ep, err, NODEV32, setno));
10248
10249 single_thread_start(s);
10250 (void) upd_med(s, "updmed_ioctl()");
10251 single_thread_end(s);
10252
10253 mddb_setexit(s);
10254
10255 return (0);
10256 }
10257
10258 int
take_set(mddb_config_t * cp,int mode)10259 take_set(mddb_config_t *cp, int mode)
10260 {
10261 int err = 0;
10262 mddb_med_upd_parm_t medup;
10263 set_t setno = cp->c_setno;
10264 md_error_t *ep = &cp->c_mde;
10265 int snarf_ok = 0;
10266
10267 if (md_get_setstatus(setno) & MD_SET_SNARFED)
10268 return (0);
10269
10270 err = mddb_configure(MDDB_GETDEV, cp);
10271 if (! err && mdisok(ep)) {
10272 if (md_snarf_db_set(setno, ep) != 0)
10273 goto out;
10274 snarf_ok = 1;
10275 }
10276
10277 /*
10278 * Clear replicated import flag since this is
10279 * used during the take of a diskset with
10280 * previously unresolved replicated disks.
10281 */
10282 if (md_get_setstatus(setno) &
10283 MD_SET_REPLICATED_IMPORT) {
10284 md_clr_setstatus(setno, MD_SET_REPLICATED_IMPORT);
10285 }
10286
10287 if (! err && mdisok(ep)) {
10288 if (! cp->c_flags) {
10289 medup.med_setno = setno;
10290 mdclrerror(&medup.med_mde);
10291
10292 err = updmed_ioctl(&medup, mode);
10293 if (! mdisok(&medup.med_mde))
10294 (void) mdstealerror(ep, &medup.med_mde);
10295 }
10296 }
10297
10298 out:
10299 /*
10300 * In the case that the snarf failed, the diskset is
10301 * left with s_db set, but s_lbp not set. The node is not
10302 * an owner of the set and won't be allowed to release the
10303 * diskset in order to cleanup. With s_db set, any call to the
10304 * GETDEV or ENDDEV ioctl (done by libmeta routine metareplicalist)
10305 * will cause the diskset to be loaded. So, cleanup the diskset so
10306 * that an inadvertent start of the diskset doesn't happen later.
10307 */
10308 if ((snarf_ok == 0) && md_set[setno].s_db &&
10309 (((mddb_set_t *)md_set[setno].s_db)->s_lbp == 0)) {
10310 mutex_enter(&mddb_lock);
10311 mddb_unload_set(setno);
10312 mutex_exit(&mddb_lock);
10313 }
10314 return (err);
10315 }
10316
10317 /*ARGSUSED*/
10318 int
release_set(mddb_config_t * cp,int mode)10319 release_set(mddb_config_t *cp, int mode)
10320 {
10321 int err = 0;
10322 set_t setno = cp->c_setno;
10323 md_error_t *ep = &cp->c_mde;
10324
10325 /*
10326 * Data integrity check
10327 */
10328 if (setno >= md_nsets)
10329 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10330
10331 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
10332 md_haltsnarf_enter(setno);
10333 /*
10334 * Attempt to mark set as HOLD. If it is marked as HOLD, this means
10335 * that the mirror code is currently searching all mirrors for a
10336 * errored component that needs a hotspare. While this search is in
10337 * progress, we cannot release the set and thgerefore we return EBUSY.
10338 * Once we have set HOLD, the mirror function (check_4_hotspares) will
10339 * block before the search until the set is released.
10340 */
10341 if (md_holdset_testandenter(setno) != 0) {
10342 md_haltsnarf_exit(setno);
10343 rw_exit(&md_unit_array_rw.lock);
10344 return (EBUSY);
10345 }
10346
10347 if ((err = md_halt_set(setno, MD_HALT_ALL)) == 0)
10348 err = mddb_configure(MDDB_RELEASESET, cp);
10349
10350 md_holdset_exit(setno);
10351 md_haltsnarf_exit(setno);
10352 rw_exit(&md_unit_array_rw.lock);
10353
10354 if (! err && mdisok(ep)) {
10355 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RELEASE, SVM_TAG_SET, setno,
10356 NODEV64);
10357 }
10358
10359 return (err);
10360 }
10361
10362 int
gettag_ioctl(mddb_dtag_get_parm_t * dtgpp,int mode)10363 gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, int mode)
10364 {
10365 mddb_set_t *s;
10366 int err = 0;
10367 mddb_dtag_lst_t *dtlp;
10368 set_t setno = dtgpp->dtgp_setno;
10369 md_error_t *ep = &dtgpp->dtgp_mde;
10370
10371 mdclrerror(ep);
10372
10373 if ((mode & FREAD) == 0)
10374 return (mdsyserror(ep, EACCES));
10375
10376 if (setno >= md_nsets)
10377 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10378
10379 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10380 return (0);
10381
10382 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10383 return (mddbstatus2error(ep, err, NODEV32, setno));
10384
10385 /*
10386 * Data tags not supported on MN sets so return invalid operation.
10387 * This ioctl could be called before the mddb has been read in so
10388 * the set status may not yet be set to MNSET, so code following
10389 * this check must handle a MN diskset properly.
10390 */
10391 if (md_get_setstatus(setno) & MD_SET_MNSET) {
10392 mddb_setexit(s);
10393 return (mderror(ep, MDE_INVAL_MNOP));
10394 }
10395
10396 /* s_dtlp is NULL for MN diskset */
10397 dtlp = s->s_dtlp;
10398 while (dtlp != NULL) {
10399 if (dtgpp->dtgp_dt.dt_id == 0 ||
10400 dtgpp->dtgp_dt.dt_id == dtlp->dtl_dt.dt_id) {
10401 bcopy((caddr_t)&dtlp->dtl_dt, (caddr_t)&dtgpp->dtgp_dt,
10402 sizeof (mddb_dtag_t));
10403 break;
10404 }
10405 dtlp = dtlp->dtl_nx;
10406 }
10407
10408 /* Walked the whole list and id not found, return error */
10409 if (dtlp == (mddb_dtag_lst_t *)NULL) {
10410 mddb_setexit(s);
10411 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10412 }
10413
10414 mddb_setexit(s);
10415
10416 return (0);
10417 }
10418
10419 int
usetag_ioctl(mddb_dtag_use_parm_t * dtupp,int mode)10420 usetag_ioctl(mddb_dtag_use_parm_t *dtupp, int mode)
10421 {
10422 mddb_set_t *s;
10423 int err = 0;
10424 mddb_config_t *cp;
10425 mddb_ri_t *trip = NULL;
10426 mddb_dtag_t *dtagp = NULL;
10427 set_t setno = dtupp->dtup_setno;
10428 md_error_t *ep = &dtupp->dtup_mde;
10429
10430 mdclrerror(ep);
10431
10432 if ((mode & FWRITE) == 0)
10433 return (mdsyserror(ep, EACCES));
10434
10435 if (setno >= md_nsets)
10436 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10437
10438 if (dtupp->dtup_id < 0)
10439 return (mdsyserror(ep, EINVAL));
10440 else if (dtupp->dtup_id == 0)
10441 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10442
10443 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10444 return (0);
10445
10446 if ((md_get_setstatus(setno) & MD_SET_TAGDATA) == 0)
10447 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10448
10449 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL)
10450 return (mddbstatus2error(ep, err, NODEV32, setno));
10451
10452 /*
10453 * Data tags not supported on MN sets so return invalid operation.
10454 * This ioctl could be called before the mddb has been read in so
10455 * the set status may not yet be set to MNSET, so code following
10456 * this check must handle a MN diskset properly.
10457 */
10458 if (md_get_setstatus(setno) & MD_SET_MNSET) {
10459 mddb_setexit(s);
10460 return (mderror(ep, MDE_INVAL_MNOP));
10461 }
10462
10463 /* Validate and find the id requested - nothing found if MN diskset */
10464 if ((dtagp = dtl_findl(s, dtupp->dtup_id)) == NULL) {
10465 mddb_setexit(s);
10466 return (mdmddberror(ep, MDE_DB_NOTAG, NODEV32, setno));
10467 }
10468
10469 /* Usetag is only valid when more than one tag exists */
10470 if (dtl_cntl(s) < 2) {
10471 mddb_setexit(s);
10472 return (mdmddberror(ep, MDE_DB_NTAGDATA, NODEV32, setno));
10473 }
10474
10475 /* Put the selected tag in place */
10476 dt_setup(s, dtagp);
10477
10478 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10479
10480 /* Save the hint information */
10481 trip = save_rip(s);
10482
10483 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */
10484 cp->c_setno = setno;
10485 cp->c_sideno = s->s_sideno;
10486 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10487 cp->c_setname[MD_MAX_SETNAME] = '\0';
10488 cp->c_med = s->s_med; /* struct assignment */
10489
10490 mddb_setexit(s);
10491
10492 s = NULL;
10493
10494 /* shorthand */
10495 setno = cp->c_setno;
10496
10497 /* Let unload know not to free the tag */
10498 md_set_setstatus(setno, MD_SET_KEEPTAG);
10499
10500 /* Release the set */
10501 if (err = release_set(cp, mode))
10502 goto out;
10503
10504 if (! mdisok(&cp->c_mde)) {
10505 (void) mdstealerror(ep, &cp->c_mde);
10506 err = 1;
10507 goto out;
10508 }
10509
10510 /* Re-init set using the saved mddb_config_t structure */
10511 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10512 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10513 err = mddbstatus2error(ep, err, NODEV32, setno);
10514 goto out;
10515 }
10516 }
10517
10518 ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10519
10520 /* use the saved rip structure */
10521 s->s_rip = trip;
10522 trip = (mddb_ri_t *)NULL;
10523
10524 /* Let the take code know a tag is being used */
10525 md_set_setstatus(setno, MD_SET_USETAG);
10526
10527 mddb_setexit(s);
10528
10529 s = NULL;
10530
10531 /* Take the set */
10532 if (err = take_set(cp, mode))
10533 goto out;
10534
10535 if (! mdisok(&cp->c_mde))
10536 (void) mdstealerror(ep, &cp->c_mde);
10537
10538 out:
10539 md_clr_setstatus(setno, (MD_SET_USETAG | MD_SET_KEEPTAG));
10540
10541 kmem_free(cp, sizeof (mddb_config_t));
10542
10543 if (trip)
10544 free_rip(&trip);
10545
10546 if (s)
10547 mddb_setexit(s);
10548
10549 return (err);
10550 }
10551
10552 int
accept_ioctl(mddb_accept_parm_t * accpp,int mode)10553 accept_ioctl(mddb_accept_parm_t *accpp, int mode)
10554 {
10555 mddb_set_t *s;
10556 int err = 0;
10557 mddb_config_t *cp;
10558 mddb_ri_t *trip = NULL;
10559 set_t setno = accpp->accp_setno;
10560 md_error_t *ep = &accpp->accp_mde;
10561
10562 mdclrerror(ep);
10563
10564 if ((mode & FWRITE) == 0)
10565 return (mdsyserror(ep, EACCES));
10566
10567 if (setno >= md_nsets)
10568 return (mdmderror(ep, MDE_INVAL_UNIT, MD_ADM_MINOR));
10569
10570 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
10571 return (0);
10572
10573 if ((md_get_setstatus(setno) & MD_SET_ACCOK) == 0)
10574 return (mdmddberror(ep, MDE_DB_ACCNOTOK, NODEV32, setno));
10575
10576 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10577 return (mddbstatus2error(ep, err, NODEV32, setno));
10578
10579 /*
10580 * Data tags not supported on MN sets so return invalid operation.
10581 * mddb is guaranteed to be incore at this point, so this
10582 * check will catch all MN disksets.
10583 */
10584 if (md_get_setstatus(setno) & MD_SET_MNSET) {
10585 mddb_setexit(s);
10586 return (mderror(ep, MDE_INVAL_MNOP));
10587 }
10588
10589 cp = kmem_zalloc(sizeof (mddb_config_t), KM_SLEEP);
10590
10591 trip = save_rip(s);
10592
10593 cp->c_timestamp = s->s_ident.createtime; /* struct assignment */
10594 cp->c_setno = setno;
10595 cp->c_sideno = s->s_sideno;
10596 (void) strncpy(cp->c_setname, s->s_setname, MD_MAX_SETNAME);
10597 cp->c_setname[MD_MAX_SETNAME] = '\0';
10598 cp->c_med = s->s_med; /* struct assignment */
10599
10600 /* Tag the data */
10601 if (err = set_dtag(s, ep)) {
10602 err = mdsyserror(ep, err);
10603 goto out;
10604 }
10605
10606 /* If we had a BADTAG, it will be re-written, so clear the bit. */
10607 if (md_get_setstatus(setno) & MD_SET_BADTAG)
10608 md_clr_setstatus(setno, MD_SET_BADTAG);
10609
10610 if (err = dt_write(s)) {
10611 err = mdsyserror(ep, err);
10612 goto out;
10613 }
10614
10615 mddb_setexit(s);
10616
10617 s = NULL;
10618
10619 /* shorthand */
10620 setno = cp->c_setno;
10621
10622 /* Clear the keeptag */
10623 md_clr_setstatus(setno, MD_SET_KEEPTAG);
10624
10625 /* Release the set */
10626 if (err = release_set(cp, mode))
10627 goto out;
10628
10629 if (! mdisok(&cp->c_mde)) {
10630 (void) mdstealerror(ep, &cp->c_mde);
10631 goto out;
10632 }
10633
10634 /* Re-init set using the saved mddb_config_t structure */
10635 if ((s = mddb_setenter(setno, MDDB_NOINIT, &err)) == NULL) {
10636 if ((s = init_set(cp, MDDB_NOINIT, &err)) == NULL) {
10637 err = mddbstatus2error(ep, err, NODEV32, setno);
10638 goto out;
10639 }
10640 }
10641
10642 ASSERT(s->s_rip == (mddb_ri_t *)NULL);
10643
10644 /* Free the allocated rip structure */
10645 if (s->s_rip != (mddb_ri_t *)NULL)
10646 free_rip(&s->s_rip);
10647
10648 /* use the saved rip structure */
10649 s->s_rip = trip;
10650 trip = (mddb_ri_t *)NULL;
10651
10652 /* Let the set init code know an accept is in progress */
10653 md_set_setstatus(setno, MD_SET_ACCEPT);
10654
10655 mddb_setexit(s);
10656
10657 s = NULL;
10658
10659 /* Take the set */
10660 if (err = take_set(cp, mode))
10661 goto out;
10662
10663 if (! mdisok(&cp->c_mde))
10664 (void) mdstealerror(ep, &cp->c_mde);
10665
10666 out:
10667 md_clr_setstatus(setno, (MD_SET_ACCOK | MD_SET_ACCEPT));
10668
10669 kmem_free(cp, sizeof (mddb_config_t));
10670
10671 if (trip)
10672 free_rip(&trip);
10673
10674 if (s)
10675 mddb_setexit(s);
10676
10677 return (err);
10678 }
10679
10680 /*
10681 * mddb_getinvlb_devid - cycles through the locator block and determines
10682 * if the device id's for any of the replica disks are invalid.
10683 * If so, it returns the diskname in the ctdptr.
10684 * RETURN
10685 * -1 Error
10686 * cnt number of invalid device id's
10687 */
10688 int
mddb_getinvlb_devid(set_t setno,int count,int size,char ** ctdptr)10689 mddb_getinvlb_devid(
10690 set_t setno,
10691 int count,
10692 int size,
10693 char **ctdptr
10694 )
10695 {
10696 mddb_set_t *s;
10697 int err = 0;
10698 mddb_lb_t *lbp;
10699 int li;
10700 mddb_did_blk_t *did_blk;
10701 mddb_did_info_t *did_info;
10702 int len;
10703 int cnt = 0;
10704 char *cptr;
10705 md_name_suffix *sn;
10706 int i, dont_add_it;
10707 char *tmpctd, *diskname;
10708 char *tmpname;
10709
10710 cptr = *ctdptr;
10711 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
10712 return (-1);
10713 }
10714
10715 single_thread_start(s);
10716 lbp = s->s_lbp;
10717
10718 if (lbp->lb_setno != setno) {
10719 single_thread_end(s);
10720 mddb_setexit(s);
10721 return (-1);
10722 }
10723
10724 /* check for lb being devid style */
10725 if (lbp->lb_flags & MDDB_DEVID_STYLE) {
10726 did_blk = s->s_did_icp->did_ic_blkp;
10727 for (li = 0; li < lbp->lb_loccnt; li++) {
10728 did_info = &(did_blk->blk_info[li]);
10729 /* Only if devid exists and isn't valid */
10730 if ((did_info->info_flags & MDDB_DID_EXISTS) &&
10731 !(did_info->info_flags & MDDB_DID_VALID)) {
10732 /*
10733 * if we count more invalid did's than
10734 * was passed in there's an error somewhere
10735 */
10736 if (cnt++ > count) {
10737 single_thread_end(s);
10738 mddb_setexit(s);
10739 return (-1);
10740 }
10741
10742 /*
10743 * Future note: Need to do something here
10744 * for the MN diskset case when device ids
10745 * are supported in disksets.
10746 * Can't add until merging devids_in_diskset
10747 * code into code base.
10748 */
10749
10750 sn = &s->s_lnp->ln_suffixes[0][li];
10751 /*
10752 * check to make sure length of device name is
10753 * not greater than computed first time through
10754 */
10755 len = sn->suf_len;
10756 if (len > size) {
10757 single_thread_end(s);
10758 mddb_setexit(s);
10759 return (-1);
10760 }
10761 tmpctd = *ctdptr;
10762 /* strip off slice part */
10763 diskname = md_strdup(sn->suf_data);
10764 tmpname = strrchr(diskname, 's');
10765 *tmpname = '\0';
10766 dont_add_it = 0;
10767 /* look to see if diskname is already in list */
10768 for (i = 0; i < (cnt-1); i++) {
10769 if (strcmp(diskname, tmpctd) == 0) {
10770 /* already there, don't add */
10771 dont_add_it = 1;
10772 break;
10773 }
10774 /* point to next diskname in list */
10775 tmpctd += size;
10776 }
10777 if (dont_add_it == 0) {
10778 /* add diskname to list */
10779 (void) strcpy(cptr, diskname);
10780 cptr += size;
10781 }
10782 kmem_free(diskname, strlen(sn->suf_data) + 1);
10783 }
10784 }
10785 }
10786 /* null terminate the list */
10787 *cptr = '\0';
10788 /*
10789 * need to save the new pointer so that calling routine can continue
10790 * to add information onto the end.
10791 */
10792 *ctdptr = cptr;
10793 single_thread_end(s);
10794 mddb_setexit(s);
10795 return (cnt);
10796 }
10797
10798 /*
10799 * mddb_validate_lb - count the number of lb's with invalid device id's. Keep
10800 * track of length of longest devicename.
10801 * RETURN
10802 * -1 error
10803 * cnt number of lb's with invalid devid's
10804 */
10805 int
mddb_validate_lb(set_t setno,int * rmaxsz)10806 mddb_validate_lb(
10807 set_t setno,
10808 int *rmaxsz
10809 )
10810 {
10811 mddb_set_t *s;
10812 int err = 0;
10813 mddb_lb_t *lbp;
10814 int li;
10815 mddb_did_blk_t *did_blk;
10816 mddb_did_info_t *did_info;
10817 int len;
10818 int cnt = 0;
10819
10820 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
10821 return (-1);
10822
10823 single_thread_start(s);
10824 lbp = s->s_lbp;
10825
10826 if (lbp->lb_setno != setno) {
10827 single_thread_end(s);
10828 mddb_setexit(s);
10829 return (-1);
10830 }
10831
10832 /* lb must be in devid style */
10833 if ((lbp->lb_flags & MDDB_DEVID_STYLE) == 0)
10834 goto mvl_out;
10835
10836 did_blk = s->s_did_icp->did_ic_blkp;
10837 for (li = 0; li < lbp->lb_loccnt; li++) {
10838 char *minor_name;
10839 mddb_locator_t *lp;
10840 dev_t ddi_dev;
10841 ddi_devid_t devid;
10842 ddi_devid_t rtn_devid = NULL;
10843 int get_rval;
10844
10845 did_info = &(did_blk->blk_info[li]);
10846 if (((did_info->info_flags & MDDB_DID_EXISTS) == 0) ||
10847 (did_info->info_flags & MDDB_DID_VALID))
10848 continue;
10849
10850 /* Here we know, did exists but isn't valid */
10851
10852 lp = &lbp->lb_locators[li];
10853 ddi_dev = expldev(lp->l_dev);
10854 get_rval = mddb_devid_get(s, li, &devid, &minor_name);
10855 ASSERT(get_rval == 1);
10856 if ((ddi_lyr_get_devid(ddi_dev, &rtn_devid) == DDI_SUCCESS) &&
10857 (ddi_devid_compare(rtn_devid, devid) == 0)) {
10858 did_info->info_flags = MDDB_DID_VALID |
10859 MDDB_DID_EXISTS | MDDB_DID_UPDATED;
10860 } else {
10861 cnt++;
10862 /*
10863 * Future note: Need to do something here
10864 * for the MN diskset case when device ids
10865 * are supported in disksets.
10866 * Can't add until merging devids_in_diskset
10867 * code into code base.
10868 */
10869 len = (&s->s_lnp->ln_suffixes[0][li])-> suf_len;
10870 if (*rmaxsz < len)
10871 *rmaxsz = len;
10872 }
10873 if (rtn_devid != NULL)
10874 ddi_devid_free(rtn_devid);
10875 }
10876
10877 mvl_out:
10878
10879 if (push_lb(s) != 0)
10880 cnt = -1;
10881 (void) upd_med(s, "mddb_validate_lb(0)");
10882 single_thread_end(s);
10883 mddb_setexit(s);
10884 return (cnt);
10885 }
10886
10887 int
check_active_locators()10888 check_active_locators()
10889 {
10890 mddb_set_t *s;
10891 mddb_lb_t *lbp;
10892 int li;
10893 int active = 0;
10894
10895 mutex_enter(&mddb_lock);
10896 /* there is nothing here..so we can unload */
10897 if ((mddb_set_t *)md_set[MD_LOCAL_SET].s_db == NULL) {
10898 mutex_exit(&mddb_lock);
10899 return (0);
10900 }
10901 s = (mddb_set_t *)md_set[MD_LOCAL_SET].s_db;
10902 lbp = s->s_lbp;
10903 if (lbp == NULL) {
10904 mutex_exit(&mddb_lock);
10905 return (0);
10906 }
10907
10908 for (li = 0; li < lbp->lb_loccnt; li++) {
10909 mddb_locator_t *lp = &lbp->lb_locators[li];
10910 if (lp->l_flags & MDDB_F_ACTIVE) {
10911 active = 1;
10912 break;
10913 }
10914 }
10915 mutex_exit(&mddb_lock);
10916 return (active);
10917 }
10918
10919 /*
10920 * regetoptrecord:
10921 * --------------
10922 * Update the in-core optimized resync record contents by re-reading the
10923 * record from the on-disk metadb.
10924 * The contents of the resync record will be overwritten by calling this
10925 * routine. This means that callers that require the previous contents to
10926 * be preserved must save the data before calling this routine.
10927 * Return values:
10928 * 0 - successfully read in resync record from a mddb
10929 * 1 - failure. Unable to read resync record from either mddb.
10930 */
10931 static int
regetoptrecord(mddb_set_t * s,mddb_de_ic_t * dep)10932 regetoptrecord(
10933 mddb_set_t *s,
10934 mddb_de_ic_t *dep
10935 )
10936 {
10937 mddb_lb_t *lbp;
10938 mddb_locator_t *lp;
10939 mddb_rb32_t *rbp, *crbp;
10940 int li;
10941 int i;
10942 int err = 0;
10943 size_t recsize;
10944
10945 #if defined(_ILP32) && !defined(lint)
10946 ASSERT(sizeof (mddb_rb_t) == sizeof (mddb_rb32_t));
10947 #endif
10948
10949 recsize = dep->de_recsize;
10950 crbp = (mddb_rb32_t *)kmem_zalloc(recsize, KM_SLEEP);
10951
10952 single_thread_start(s);
10953 rbp = dep->de_rb;
10954
10955 dep->de_optinfo[0].o_flags |= MDDB_F_EDATA;
10956 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10957
10958 lbp = s->s_lbp;
10959
10960 for (i = 0; i < 2; i++) {
10961 if (! (dep->de_optinfo[i].o_flags & MDDB_F_ACTIVE))
10962 continue;
10963 li = dep->de_optinfo[i].o_li;
10964 lp = &lbp->lb_locators[li];
10965
10966 if (! (lp->l_flags & MDDB_F_ACTIVE) ||
10967 (lp->l_flags & MDDB_F_EMASTER))
10968 continue;
10969
10970 /*
10971 * re-read the optimized resync record with failfast set
10972 * since a failed disk could lead to a very long wait.
10973 */
10974 err = readblklst(s, (caddr_t)rbp, dep->de_blks,
10975 dep->de_blkcount, li, B_FAILFAST);
10976
10977 if (err)
10978 continue;
10979
10980 if (rbp->rb_magic != MDDB_MAGIC_RB)
10981 continue;
10982
10983 if (revchk(MDDB_REV_RB, rbp->rb_revision))
10984 continue;
10985
10986 /* Check the crc for this record */
10987 if (rec_crcchk(s, dep, rbp)) {
10988 continue;
10989 }
10990 dep->de_optinfo[i].o_flags = MDDB_F_ACTIVE;
10991
10992 if (rbp == crbp) {
10993 if (rbp->rb_checksum != crbp->rb_checksum)
10994 dep->de_optinfo[1].o_flags |= MDDB_F_EDATA;
10995 break;
10996 }
10997 rbp = crbp;
10998 }
10999
11000 single_thread_end(s);
11001
11002 if (rbp == crbp) {
11003 rbp->rb_private = 0;
11004 kmem_free((caddr_t)crbp, recsize);
11005 return (0);
11006 }
11007 uniqtime32(&rbp->rb_timestamp);
11008 /* Generate the crc for this record */
11009 rec_crcgen(s, dep, rbp);
11010 kmem_free((caddr_t)crbp, recsize);
11011 return (1);
11012 }
11013
11014 /*
11015 * mddb_reread_rr:
11016 * Re-read the resync record from the on-disk copy. This is required for
11017 * multi-node support so that a new mirror-owner can determine if a resync
11018 * operation is required to guarantee data integrity.
11019 *
11020 * Arguments:
11021 * setno Associated set
11022 * id Resync record ID
11023 *
11024 * Return Value:
11025 * 0 successful reread
11026 * -1 invalid set (not multi-node or non-existant)
11027 * >0 metadb state invalid, failed to reread
11028 */
11029 int
mddb_reread_rr(set_t setno,mddb_recid_t id)11030 mddb_reread_rr(
11031 set_t setno,
11032 mddb_recid_t id
11033 )
11034 {
11035 mddb_set_t *s;
11036 int err = 0;
11037 mddb_db_t *dbp;
11038 mddb_de_ic_t *dep;
11039
11040 if (setno >= md_nsets)
11041 return (-1);
11042
11043 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL)
11044 return (-1);
11045
11046 if ((setno == MD_LOCAL_SET) || !(s->s_lbp->lb_flags & MDDB_MNSET)) {
11047 mddb_setexit(s);
11048 return (-1);
11049 }
11050
11051 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11052 dep = dbp->db_firstentry;
11053 while (dep && (dep->de_recid != DBID(id)))
11054 dep = dep->de_next;
11055 if (dep != NULL)
11056 break;
11057 }
11058
11059 if (dep != NULL) {
11060 err = regetoptrecord(s, dep);
11061 } else {
11062 err = -1;
11063 }
11064 mddb_setexit(s);
11065 return (err);
11066 }
11067
11068 /*
11069 * Set owner associated with MN optimized resync record.
11070 *
11071 * Optimized records have an owner node associated with them in
11072 * a MN diskset. The owner is only set on a node that is actively
11073 * writing to that record. The other nodes will show that record
11074 * as having an invalid owner. The owner for an optimized record
11075 * is used during fixoptrecord to determine which node should
11076 * write out the record when the replicas associated with that
11077 * optimized record have been changed.
11078 *
11079 * Called directly from mirror driver and not from an ioctl.
11080 *
11081 * Returns
11082 * NULL if successful.
11083 * MDDB_E_NORECORD if record not found.
11084 */
11085 int
mddb_setowner(mddb_recid_t id,md_mn_nodeid_t owner)11086 mddb_setowner(
11087 mddb_recid_t id,
11088 md_mn_nodeid_t owner
11089 )
11090 {
11091 mddb_set_t *s;
11092 mddb_db_t *dbp;
11093 mddb_de_ic_t *dep;
11094 int found = 0;
11095
11096
11097 if (DBSET(id) >= md_nsets)
11098 return (MDDB_E_NORECORD);
11099
11100 if ((s = mddb_setenter(DBSET(id), MDDB_MUSTEXIST, NULL)) == NULL)
11101 return (MDDB_E_NORECORD);
11102
11103 id = DBID(id);
11104 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11105 for (dep = dbp->db_firstentry;
11106 dep != NULL; dep = dep->de_next) {
11107 if (dep->de_recid != id)
11108 continue;
11109 dep->de_owner_nodeid = owner;
11110 found = 1;
11111 break;
11112 }
11113 if (found)
11114 break;
11115 }
11116
11117 mddb_setexit(s);
11118
11119 if (!found) {
11120 return (MDDB_E_NORECORD);
11121 }
11122
11123 return (NULL);
11124 }
11125
11126 /*
11127 * mddb_parse re-reads portions of the mddb from disk given a list
11128 * of good replicas to read from and flags describing
11129 * which portion of the mddb to read in.
11130 *
11131 * Used in a MN diskset when the master has made a change to some part
11132 * of the mddb and wants to relay this information to the slaves.
11133 */
11134 int
mddb_parse(mddb_parse_parm_t * mpp)11135 mddb_parse(mddb_parse_parm_t *mpp)
11136 {
11137 mddb_set_t *s;
11138 int err = 0;
11139 mddb_locator_t *lp, *old_lp;
11140 mddb_lb_t *lbp, *old_lbp;
11141 int rval = 0;
11142 int i, li;
11143 int found_good_one = 0;
11144 mddb_ln_t *lnp;
11145 mddb_block_t ln_blkcnt;
11146 md_error_t *ep = &mpp->c_mde;
11147
11148 if (mpp->c_setno >= md_nsets)
11149 return (EINVAL);
11150
11151 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11152 return (0);
11153
11154 if ((s = mddb_setenter(mpp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11155 return (mddbstatus2error(ep, err, NODEV32, mpp->c_setno));
11156 }
11157
11158 if (!(MD_MNSET_SETNO(mpp->c_setno))) {
11159 mddb_setexit_no_parse(s);
11160 return (EINVAL);
11161 }
11162
11163 /*
11164 * Master node initiated this request, so there's no work for
11165 * the master node to do.
11166 */
11167 if (md_set[mpp->c_setno].s_am_i_master) {
11168 mddb_setexit_no_parse(s);
11169 return (rval);
11170 }
11171
11172 single_thread_start(s);
11173
11174 if (mpp->c_parse_flags & MDDB_PARSE_LOCBLK) {
11175 lbp = 0;
11176 for (i = 0; i < MDDB_NLB; i++) {
11177 /* Walk through master's active list */
11178 if (!(mpp->c_lb_flags[i] & MDDB_F_ACTIVE))
11179 continue;
11180 if (s->s_mbiarray[i] == NULL)
11181 continue;
11182
11183 /* Assumes master blocks are already setup */
11184 if (lbp == (mddb_lb_t *)NULL) {
11185 lbp = (mddb_lb_t *)kmem_zalloc(
11186 dbtob(MDDB_MNLBCNT), KM_SLEEP);
11187 }
11188 err |= readblks(s, (caddr_t)lbp, 0, lbp->lb_blkcnt, i);
11189
11190 if (err)
11191 continue;
11192
11193 if (lbp->lb_magic != MDDB_MAGIC_LB)
11194 continue;
11195 if (lbp->lb_blkcnt != MDDB_MNLBCNT)
11196 continue;
11197 if (revchk(MDDB_REV_MNLB, lbp->lb_revision))
11198 continue;
11199 if (crcchk(lbp, &lbp->lb_checksum, dbtob(MDDB_MNLBCNT),
11200 NULL))
11201 continue;
11202 if (lbp->lb_setno != s->s_setno)
11203 continue;
11204 /*
11205 * a commit count of zero means this locator has
11206 * been deleted
11207 */
11208 if (lbp->lb_commitcnt == 0) {
11209 continue;
11210 }
11211 /* Found a good locator - keep it */
11212 found_good_one = 1;
11213 break;
11214 }
11215
11216 /*
11217 * If found a good copy of the mddb, then read it into
11218 * this node's locator block. Fix up the set's s_mbiarray
11219 * pointer (master block incore array pointer) to be
11220 * in sync with the newly read in locator block. If a
11221 * new mddb was added, read in the master blocks associated
11222 * with the new mddb. If an mddb was deleted, free the
11223 * master blocks associated with deleted mddb.
11224 */
11225 if (found_good_one) {
11226 /* Compare old and new view of mddb locator blocks */
11227 old_lbp = s->s_lbp;
11228 for (li = 0; li < lbp->lb_loccnt; li++) {
11229 int mn_set;
11230
11231 lp = &lbp->lb_locators[li];
11232 old_lp = &old_lbp->lb_locators[li];
11233
11234 /* If old and new views match, continue */
11235 if ((lp->l_flags & MDDB_F_ACTIVE) ==
11236 (old_lp->l_flags & MDDB_F_ACTIVE))
11237 continue;
11238
11239 if (lp->l_flags & MDDB_F_ACTIVE) {
11240 /*
11241 * If new mddb has been added - delete
11242 * old mbiarray and get new one.
11243 *
11244 * When devids are supported, will
11245 * need to get dev from devid.
11246 */
11247 if (s->s_mbiarray[li]) {
11248 free_mbipp(&s->s_mbiarray[li]);
11249 }
11250 /*
11251 * If getmasters fails, getmasters
11252 * will set appropriate error flags.
11253 */
11254 s->s_mbiarray[li] = getmasters(s,
11255 md_expldev(lp->l_dev), lp->l_blkno,
11256 (uint_t *)&(lp->l_flags), &mn_set);
11257 } else if (lp->l_flags & MDDB_F_DELETED) {
11258 /*
11259 * If old one has been deleted -
11260 * delete old mbiarray.
11261 */
11262 if (s->s_mbiarray[li]) {
11263 free_mbipp(&s->s_mbiarray[li]);
11264 }
11265 }
11266 }
11267
11268 /* Free this node's old view of mddb locator blocks */
11269 kmem_free((caddr_t)s->s_lbp,
11270 dbtob(s->s_lbp->lb_blkcnt));
11271 s->s_lbp = lbp;
11272 } else {
11273 if (lbp)
11274 kmem_free(lbp, dbtob(MDDB_MNLBCNT));
11275 }
11276 }
11277
11278 if (mpp->c_parse_flags & MDDB_PARSE_LOCNM) {
11279 lnp = s->s_lnp;
11280 lbp = s->s_lbp;
11281 ln_blkcnt = lbp->lb_lnblkcnt;
11282 s->s_lnp = NULL; /* readlocnames does this anyway */
11283 for (li = 0; li < lbp->lb_loccnt; li++) {
11284 lp = &lbp->lb_locators[li];
11285
11286 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11287 (lp->l_flags & MDDB_F_EMASTER))
11288 continue;
11289
11290 /* Successfully read the locator names */
11291 if (readlocnames(s, li) == 0)
11292 break;
11293 }
11294
11295 if (li == lbp->lb_loccnt) {
11296 /* Did not successfully read locnames; restore lnp */
11297 s->s_lnp = lnp;
11298 } else {
11299 /* readlocnames successful, free old struct */
11300 kmem_free((caddr_t)lnp, dbtob(ln_blkcnt));
11301 }
11302 }
11303
11304 if (mpp->c_parse_flags & MDDB_PARSE_OPTRECS) {
11305 mddb_de_ic_t *dep, *tdep, *first_dep, *dep2;
11306 mddb_db_t *dbp;
11307 mddb_db32_t *db32p;
11308 mddb_de32_t *de32p, *de32p2;
11309 int writeout;
11310
11311 lbp = s->s_lbp;
11312 /*
11313 * Walk through directory block and directory entry incore
11314 * linked list looking for optimized resync records.
11315 * For each opt record found, re-read in directory block.
11316 * The directoy block consists of a number of directory
11317 * entries. The directory entry for this opt record will
11318 * describe which 2 mddbs actually contain the resync record
11319 * since it could have been relocated by the master node
11320 * due to mddb failure or mddb deletion. If this node
11321 * is the record owner for this opt record, then write out
11322 * the record to the 2 mddbs listed in the directory entry
11323 * if the mddbs locations are different than previously known.
11324 */
11325 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11326 for (dep = dbp->db_firstentry; dep;
11327 dep = dep->de_next) {
11328 /* Found an opt record */
11329 if (dep->de_flags & MDDB_F_OPT)
11330 break;
11331 }
11332 /* If no opt records found, go to next dbp */
11333 if (dep == NULL)
11334 continue;
11335
11336 /*
11337 * Reread directory block from disk since
11338 * master could have rewritten in during fixoptrecord.
11339 */
11340 db32p = (mddb_db32_t *)kmem_zalloc(MDDB_BSIZE,
11341 KM_SLEEP);
11342 create_db32rec(db32p, dbp);
11343 for (li = 0; li < lbp->lb_loccnt; li++) {
11344 lp = &lbp->lb_locators[li];
11345
11346 if ((! (lp->l_flags & MDDB_F_ACTIVE)) ||
11347 (lp->l_flags & MDDB_F_EMASTER))
11348 continue;
11349
11350 err = readblks(s, (caddr_t)db32p,
11351 db32p->db32_blknum, 1, li);
11352 if (err)
11353 continue;
11354
11355 /* Reverify db; go to next mddb if bad */
11356 if ((db32p->db32_magic != MDDB_MAGIC_DB) ||
11357 (revchk(MDDB_REV_DB,
11358 db32p->db32_revision)) ||
11359 (crcchk(db32p, &db32p->db32_checksum,
11360 MDDB_BSIZE, NULL))) {
11361 continue;
11362 } else {
11363 break;
11364 }
11365 }
11366 /*
11367 * If all mddbs are unavailable then panic since
11368 * this slave cannot be allowed to continue out-of-sync
11369 * with the master node. Since the optimized resync
11370 * records are written by all nodes, all nodes must
11371 * stay in sync with the master.
11372 *
11373 * This also handles the case when all storage
11374 * connectivity to a slave node has failed. The
11375 * slave node will send an MDDB_OPTRECERR message to
11376 * the master node when the slave node has been unable
11377 * to write an optimized resync record to both
11378 * designated mddbs. After the master has fixed the
11379 * optimized records to be on available mddbs, the
11380 * MDDB_PARSE message (with the flag MDDB_PARSE_OPTRECS)
11381 * is sent to all slave nodes. If a slave node is
11382 * unable to access any mddb in order to read in the
11383 * relocated optimized resync record, then the slave
11384 * node must panic.
11385 */
11386 if (li == lbp->lb_loccnt) {
11387 kmem_free((caddr_t)db32p, MDDB_BSIZE);
11388 cmn_err(CE_PANIC, "md: mddb: Node unable to "
11389 "access any SVM state database "
11390 "replicas for diskset %s\n", s->s_setname);
11391 }
11392 /*
11393 * Setup temp copy of linked list of de's.
11394 * Already have an incore copy, but need to walk
11395 * the directory entry list contained in the
11396 * new directory block that was just read in above.
11397 * After finding the directory entry of an opt record
11398 * by walking the incore list, find the corresponding
11399 * entry in the temporary list and then update
11400 * the incore directory entry record with
11401 * the (possibly changed) mddb location stored
11402 * for the optimized resync records.
11403 */
11404 de32p = (mddb_de32_t *)
11405 ((void *) ((caddr_t)
11406 (&db32p->db32_firstentry)
11407 + sizeof (db32p->db32_firstentry)));
11408 tdep = (mddb_de_ic_t *)
11409 kmem_zalloc(sizeof (mddb_de_ic_t) -
11410 sizeof (mddb_block_t) +
11411 sizeof (mddb_block_t) *
11412 de32p->de32_blkcount, KM_SLEEP);
11413 de32tode(de32p, tdep);
11414 first_dep = tdep;
11415 while (de32p && de32p->de32_next) {
11416 de32p2 = nextentry(de32p);
11417 dep2 = (mddb_de_ic_t *)kmem_zalloc(
11418 sizeof (mddb_de_ic_t) -
11419 sizeof (mddb_block_t) +
11420 sizeof (mddb_block_t) *
11421 de32p2->de32_blkcount, KM_SLEEP);
11422 de32tode(de32p2, dep2);
11423 tdep->de_next = dep2;
11424 tdep = dep2;
11425 de32p = de32p2;
11426 }
11427
11428 /* Now, walk the incore directory entry list */
11429 for (dep = dbp->db_firstentry; dep;
11430 dep = dep->de_next) {
11431 if (! (dep->de_flags & MDDB_F_OPT))
11432 continue;
11433 /*
11434 * Found an opt record in the incore copy.
11435 * Find the corresponding entry in the temp
11436 * list. If anything has changed in the
11437 * opt record info between the incore copy
11438 * and the temp copy, update the incore copy
11439 * and set a flag to writeout the opt record
11440 * to the new mddb locations.
11441 */
11442 for (tdep = first_dep; tdep;
11443 tdep = tdep->de_next) {
11444 if (dep->de_recid == tdep->de_recid) {
11445 writeout = 0;
11446 /* Check first mddb location */
11447 if ((dep->de_optinfo[0].o_li !=
11448 tdep->de_optinfo[0].o_li) ||
11449 (dep->de_optinfo[0].
11450 o_flags != tdep->de_optinfo
11451 [0].o_flags)) {
11452 dep->de_optinfo[0] =
11453 tdep->de_optinfo[0];
11454 writeout = 1;
11455 }
11456 /* Check second mddb location */
11457 if ((dep->de_optinfo[1].o_li !=
11458 tdep->de_optinfo[1].o_li) ||
11459 (dep->de_optinfo[1].
11460 o_flags != tdep->de_optinfo
11461 [1].o_flags)) {
11462 dep->de_optinfo[1] =
11463 tdep->de_optinfo[1];
11464 writeout = 1;
11465 }
11466 /*
11467 * Record owner should rewrite
11468 * it
11469 */
11470 if ((writeout) &&
11471 (dep->de_owner_nodeid ==
11472 md_set[mpp->c_setno].
11473 s_nodeid))
11474 (void) writeoptrecord(s,
11475 dep);
11476 break;
11477 }
11478 }
11479 }
11480 /*
11481 * Update the incore checksum information for this
11482 * directory block to match the newly read in checksum.
11483 * This should have only changed if the incore and
11484 * temp directory entries differed, but it takes
11485 * more code to do the check than to just update
11486 * the information everytime.
11487 */
11488 dbp->db_checksum = db32p->db32_checksum;
11489
11490 /* Now free everything */
11491 tdep = first_dep;
11492 while (tdep) {
11493 dep2 = tdep->de_next;
11494 kmem_free((caddr_t)tdep,
11495 sizeofde(tdep));
11496 tdep = dep2;
11497 }
11498 kmem_free((caddr_t)db32p, MDDB_BSIZE);
11499 }
11500 rval = 0;
11501 }
11502 out:
11503 single_thread_end(s);
11504 mddb_setexit_no_parse(s);
11505 return (rval);
11506 }
11507
11508 int
mddb_block(mddb_block_parm_t * mbp)11509 mddb_block(mddb_block_parm_t *mbp)
11510 {
11511 mddb_set_t *s;
11512 int err = 0;
11513 md_error_t *ep = &mbp->c_mde;
11514
11515 if (mbp->c_setno >= md_nsets)
11516 return (EINVAL);
11517
11518 /*
11519 * If the new_master flag is set for this setno we are in the middle
11520 * of a reconfig cycle, and blocking or unblocking is not needed.
11521 * Hence we can return success immediately
11522 */
11523 if (md_get_setstatus(mbp->c_setno) & MD_SET_MN_NEWMAS_RC) {
11524 return (0);
11525 }
11526
11527 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11528 return (0);
11529
11530 if ((s = mddb_setenter(mbp->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11531 return (mddbstatus2error(ep, err, NODEV32, mbp->c_setno));
11532 }
11533
11534 if (!(MD_MNSET_SETNO(mbp->c_setno))) {
11535 mddb_setexit_no_parse(s);
11536 return (EINVAL);
11537 }
11538
11539 single_thread_start(s);
11540
11541 if (mbp->c_blk_flags & MDDB_BLOCK_PARSE)
11542 md_set_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11543
11544 if (mbp->c_blk_flags & MDDB_UNBLOCK_PARSE)
11545 md_clr_setstatus(mbp->c_setno, MD_SET_MNPARSE_BLK);
11546
11547 single_thread_end(s);
11548 mddb_setexit_no_parse(s);
11549 return (err);
11550 }
11551
11552 /*
11553 * mddb_optrecfix marks up to 2 mddbs as failed and calls fixoptrecords
11554 * to relocate any optimized resync records to available mddbs.
11555 * This routine is only called on the master node.
11556 *
11557 * Used in a MN diskset when a slave node has failed to write an optimized
11558 * resync record. The failed mddb information is sent to the master node
11559 * so the master can relocate the optimized records, if possible. If the
11560 * failed mddb information has a mddb marked as failed that was previously
11561 * marked active on the master, the master sets its incore mddb state to
11562 * EWRITE and sets the PARSE_LOCBLK flag. The master node then attempts
11563 * to relocate any optimized records on the newly failed mddbs by calling
11564 * fixoptrecords. (fixoptrecords will set the PARSE_OPTRECS flag if any
11565 * optimized records are relocated.)
11566 *
11567 * When mddb_optrecfix is finished, the ioctl exit code will notice the PARSE
11568 * flags and will send a PARSE message to the slave nodes. The PARSE_LOCBLK
11569 * flag causes the slave node to re-read in the locator block from disk.
11570 * The PARSE_OPTRECS flag causes the slave node to re-read in the directory
11571 * blocks and write out any optimized resync records that have been
11572 * relocated to a different mddb.
11573 */
11574 int
mddb_optrecfix(mddb_optrec_parm_t * mop)11575 mddb_optrecfix(mddb_optrec_parm_t *mop)
11576 {
11577 mddb_set_t *s;
11578 int err = 0;
11579 mddb_lb_t *lbp;
11580 mddb_mnlb_t *mnlbp;
11581 mddb_locator_t *lp;
11582 int li;
11583 mddb_mnsidelocator_t *mnslp;
11584 mddb_drvnm_t *dn;
11585 int i, j;
11586 md_replica_recerr_t *recerr;
11587 md_error_t *ep = &mop->c_mde;
11588 int something_changed = 0;
11589 int alc, lc;
11590 int setno;
11591
11592 setno = mop->c_setno;
11593 if (mop->c_setno >= md_nsets)
11594 return (EINVAL);
11595
11596 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11597 return (0);
11598
11599 if ((s = mddb_setenter(mop->c_setno, MDDB_MUSTEXIST, &err)) == NULL) {
11600 return (mddbstatus2error(ep, err, NODEV32, mop->c_setno));
11601 }
11602
11603 if (!(MD_MNSET_SETNO(mop->c_setno))) {
11604 mddb_setexit(s);
11605 return (EINVAL);
11606 }
11607
11608 single_thread_start(s);
11609 lbp = s->s_lbp;
11610 mnlbp = (mddb_mnlb_t *)lbp;
11611
11612 /*
11613 * If slave node has seen an mddb failure, but the master node
11614 * hasn't encountered this failure, mark the mddb as failed on
11615 * the master node and set the something_changed flag to 1.
11616 */
11617 for (i = 0; i < 2; i++) {
11618 recerr = &mop->c_recerr[i];
11619 if (recerr->r_flags & MDDB_F_EWRITE) {
11620 li = recerr->r_li;
11621 lp = &lbp->lb_locators[li];
11622 for (j = 0; j < MD_MNMAXSIDES; j++) {
11623 mnslp = &mnlbp->lb_mnsidelocators[j][li];
11624 if (mnslp->mnl_sideno == s->s_sideno)
11625 break;
11626 }
11627 /* Do quick check using li */
11628 if (j != MD_MNMAXSIDES)
11629 dn = &lbp->lb_drvnm[mnslp->mnl_drvnm_index];
11630
11631 if ((j != MD_MNMAXSIDES) &&
11632 (strncmp(dn->dn_data, recerr->r_driver_name,
11633 MD_MAXDRVNM) == 0) &&
11634 (recerr->r_blkno == lp->l_blkno) &&
11635 (recerr->r_mnum == mnslp->mnl_mnum)) {
11636 if ((lp->l_flags & MDDB_F_ACTIVE) ||
11637 ((lp->l_flags & MDDB_F_EWRITE) == 0)) {
11638 something_changed = 1;
11639 lp->l_flags |= MDDB_F_EWRITE;
11640 lp->l_flags &= ~MDDB_F_ACTIVE;
11641 }
11642 } else {
11643 /*
11644 * Passed in li from slave does not match
11645 * the replica in the master's structures.
11646 * This could have occurred if a delete
11647 * mddb command was running when the
11648 * optimized resync record had a failure.
11649 * Search all replicas for this entry.
11650 * If no match, just ignore.
11651 * If a match, set replica in error.
11652 */
11653 for (li = 0; li < lbp->lb_loccnt; li++) {
11654 lp = &lbp->lb_locators[li];
11655 if (lp->l_flags & MDDB_F_DELETED)
11656 continue;
11657
11658 for (j = 0; j < MD_MNMAXSIDES; j++) {
11659 mnslp =
11660 &mnlbp->
11661 lb_mnsidelocators[j][li];
11662 if (mnslp->mnl_sideno ==
11663 s->s_sideno)
11664 break;
11665 }
11666 if (j == MD_MNMAXSIDES)
11667 continue;
11668
11669 dn = &lbp->
11670 lb_drvnm[mnslp->mnl_drvnm_index];
11671 if ((strncmp(dn->dn_data,
11672 recerr->r_driver_name,
11673 MD_MAXDRVNM) == 0) &&
11674 (recerr->r_blkno == lp->l_blkno) &&
11675 (recerr->r_mnum ==
11676 mnslp->mnl_mnum)) {
11677 if ((lp->l_flags &
11678 MDDB_F_ACTIVE) ||
11679 ((lp->l_flags &
11680 MDDB_F_EWRITE) == 0)) {
11681 something_changed = 1;
11682 lp->l_flags |=
11683 MDDB_F_EWRITE;
11684 lp->l_flags &=
11685 ~MDDB_F_ACTIVE;
11686 }
11687 break;
11688 }
11689 }
11690 }
11691 }
11692 }
11693
11694 /*
11695 * If this message changed nothing, then we're done since this
11696 * failure has already been handled.
11697 * If some mddb state has been changed, send a parse message to
11698 * the slave nodes so that the slaves will re-read the locator
11699 * block from disk.
11700 */
11701 if (something_changed == 0) {
11702 single_thread_end(s);
11703 mddb_setexit(s);
11704 return (0);
11705 } else {
11706 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
11707 }
11708
11709 /*
11710 * Scan replicas setting MD_SET_TOOFEW if
11711 * 50% or more of the mddbs have seen errors.
11712 * Note: Don't call selectreplicas or writeretry
11713 * since these routines may end up setting the ACTIVE flag
11714 * on a failed mddb if the master is able to access the mddb
11715 * but the slave node couldn't. Need to have the ACTIVE flag
11716 * turned off in order to relocate the optimized records to
11717 * mddbs that are (hopefully) available on all nodes.
11718 */
11719 alc = 0;
11720 lc = 0;
11721 for (li = 0; li < lbp->lb_loccnt; li++) {
11722 lp = &lbp->lb_locators[li];
11723 if (lp->l_flags & MDDB_F_DELETED)
11724 continue;
11725 lc++;
11726 if (! (lp->l_flags & MDDB_F_ACTIVE))
11727 continue;
11728 alc++;
11729 }
11730
11731 /*
11732 * If more than 50% mddbs have failed, then don't relocate opt recs.
11733 * The node sending the mddb failure information will detect TOOFEW
11734 * and will panic when it attempts to re-write the optimized record.
11735 */
11736 if (alc < ((lc + 1) / 2)) {
11737 md_set_setstatus(setno, MD_SET_TOOFEW);
11738 (void) push_lb(s);
11739 (void) upd_med(s, "mddb_optrecfix(0)");
11740 single_thread_end(s);
11741 mddb_setexit(s);
11742 return (0);
11743 }
11744
11745 /* Attempt to relocate optimized records that are on failed mddbs */
11746 (void) fixoptrecords(s);
11747
11748 /* Push changed locator block out to disk */
11749 (void) push_lb(s);
11750 (void) upd_med(s, "mddb_optrecfix(1)");
11751
11752 /* Recheck for TOOFEW after writing out locator blocks */
11753 alc = 0;
11754 lc = 0;
11755 for (li = 0; li < lbp->lb_loccnt; li++) {
11756 lp = &lbp->lb_locators[li];
11757 if (lp->l_flags & MDDB_F_DELETED)
11758 continue;
11759 lc++;
11760 if (! (lp->l_flags & MDDB_F_ACTIVE))
11761 continue;
11762 alc++;
11763 }
11764
11765 /* If more than 50% mddbs have failed, then don't relocate opt recs */
11766 if (alc < ((lc + 1) / 2)) {
11767 md_set_setstatus(setno, MD_SET_TOOFEW);
11768 single_thread_end(s);
11769 mddb_setexit(s);
11770 return (0);
11771 }
11772
11773 single_thread_end(s);
11774 mddb_setexit(s);
11775 return (0);
11776 }
11777
11778 /*
11779 * Check if incore mddb on master node matches ondisk mddb.
11780 * If not, master writes out incore view to all mddbs.
11781 * Have previously verified that master is an owner of the
11782 * diskset (master has snarfed diskset) and that diskset is
11783 * not stale.
11784 *
11785 * Meant to be called during reconfig cycle during change of master.
11786 * Previous master in diskset may have changed the mddb and
11787 * panic'd before relaying information to slave nodes. New
11788 * master node just writes out its incore view of the mddb and
11789 * the replay of the change log will resync all the nodes.
11790 *
11791 * Only supported for MN disksets.
11792 *
11793 * Return values:
11794 * 0 - success
11795 * non-zero - failure
11796 */
11797 int
mddb_check_write_ioctl(mddb_config_t * info)11798 mddb_check_write_ioctl(mddb_config_t *info)
11799 {
11800 int err = 0;
11801 set_t setno = info->c_setno;
11802 mddb_set_t *s;
11803 int li;
11804 mddb_locator_t *lp;
11805 mddb_lb_t *lbp;
11806 mddb_mnlb_t *mnlbp_od;
11807 mddb_ln_t *lnp;
11808 mddb_mnln_t *mnlnp_od;
11809 mddb_db_t *dbp;
11810 mddb_de_ic_t *dep;
11811 int write_out_mddb;
11812 md_error_t *ep = &info->c_mde;
11813 int mddb_err = 0;
11814 int prev_li = 0;
11815 int rval = 0;
11816 int alc, lc;
11817 int mddbs_present = 0;
11818
11819 /* Verify that setno is in valid range */
11820 if (setno >= md_nsets)
11821 return (EINVAL);
11822
11823 if (md_snarf_db_set(MD_LOCAL_SET, ep) != 0)
11824 return (0);
11825
11826 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
11827 return (mddbstatus2error(ep, err, NODEV32, setno));
11828 }
11829
11830 /* Calling diskset must be a MN diskset */
11831 if (!(MD_MNSET_SETNO(setno))) {
11832 mddb_setexit(s);
11833 return (EINVAL);
11834 }
11835
11836 /* Re-verify that set is not stale */
11837 if (md_get_setstatus(setno) & MD_SET_STALE) {
11838 mddb_setexit(s);
11839 return (mdmddberror(ep, MDE_DB_STALE, NODEV32, setno));
11840 }
11841
11842 lbp = s->s_lbp;
11843 lnp = s->s_lnp;
11844
11845 /*
11846 * Previous master could have died during the write of data to
11847 * the mddbs so that the ondisk mddbs may not be consistent.
11848 * So, need to check the contents of the first and last active mddb
11849 * to see if the mddbs need to be rewritten.
11850 */
11851 for (li = 0; li < lbp->lb_loccnt; li++) {
11852 int checkcopy_err;
11853
11854 lp = &lbp->lb_locators[li];
11855 /* Find replica that is active */
11856 if (lp->l_flags & MDDB_F_DELETED)
11857 continue;
11858 mddbs_present = 1;
11859 if (! (lp->l_flags & MDDB_F_ACTIVE))
11860 continue;
11861 if (s->s_mbiarray[li] == NULL)
11862 continue;
11863 /* Check locator block */
11864 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
11865 KM_SLEEP);
11866 /* read in on-disk locator block */
11867 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
11868
11869 /* If err, try next mddb */
11870 if (err) {
11871 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11872 continue;
11873 }
11874
11875 /*
11876 * We resnarf all changelog entries for this set.
11877 * They may have been altered by the previous master
11878 */
11879 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
11880 for (dep = dbp->db_firstentry; dep; dep =
11881 dep->de_next) {
11882 if ((dep->de_flags & MDDB_F_CHANGELOG) == 0) {
11883 continue;
11884 }
11885 /*
11886 * This has been alloc'ed while
11887 * joining the set
11888 */
11889 if (dep->de_rb) {
11890 kmem_free(dep->de_rb, dep->de_recsize);
11891 dep->de_rb = (mddb_rb32_t *)NULL;
11892 }
11893 if (dep->de_rb_userdata) {
11894 kmem_free(dep->de_rb_userdata,
11895 dep->de_reqsize);
11896 dep->de_rb_userdata = (caddr_t)NULL;
11897 }
11898
11899 err = getrecord(s, dep, li);
11900 if (err) {
11901 /*
11902 * When we see on error while reading
11903 * the changelog entries, we move on
11904 * to the next mddb
11905 */
11906 err = 1;
11907 break; /* out of inner for-loop */
11908 }
11909 allocuserdata(dep);
11910 }
11911 if (err)
11912 break; /* out of outer for-loop */
11913 }
11914
11915 /* If err, try next mddb */
11916 if (err) {
11917 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
11918 continue;
11919 }
11920
11921 /* Is incore locator block same as ondisk? */
11922 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
11923 == 1) {
11924 write_out_mddb = 1;
11925 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11926 break;
11927 }
11928
11929 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
11930
11931 /* If lb ok, check locator names */
11932 mnlnp_od = (mddb_mnln_t *)kmem_zalloc(dbtob(MDDB_MNLNCNT),
11933 KM_SLEEP);
11934 /* read in on-disk locator names */
11935 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
11936 lbp->lb_lnblkcnt, li);
11937
11938 /* If err, try next mddb */
11939 if (err) {
11940 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
11941 continue;
11942 }
11943
11944 /* Are incore locator names same as ondisk? */
11945 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
11946 == 1) {
11947 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11948 write_out_mddb = 1;
11949 break;
11950 }
11951
11952 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
11953
11954 /*
11955 * Check records in mddb.
11956 * If a read error is encountered, set the error flag and
11957 * continue to the next mddb. Otherwise, if incore data is
11958 * different from ondisk, then set the flag to write out
11959 * the mddb and break out.
11960 */
11961 checkcopy_err = checkcopy(s, li);
11962 if (checkcopy_err == MDDB_F_EREAD) {
11963 lp->l_flags |= MDDB_F_EREAD;
11964 mddb_err = 1;
11965 continue;
11966 } else if (checkcopy_err == 1) {
11967 write_out_mddb = 1;
11968 break;
11969 }
11970 /*
11971 * Have found first active mddb and the data is the same as
11972 * incore - break out of loop
11973 */
11974 write_out_mddb = 0;
11975 break;
11976 }
11977
11978 /*
11979 * Skip checking for last active mddb if:
11980 * - already found a mismatch in the first active mddb
11981 * (write_out_mddb is 1) OR
11982 * - didn't find a readable mddb when looking for first
11983 * active mddb (there are mddbs present but all failed
11984 * when read was attempted).
11985 *
11986 * In either case, go to write_out_mddb label in order to attempt
11987 * to write out the data. If < 50% mddbs are available, panic.
11988 */
11989 if ((write_out_mddb == 1) ||
11990 ((li == lbp->lb_loccnt) && mddbs_present)) {
11991 write_out_mddb = 1;
11992 goto write_out_mddb;
11993 }
11994
11995 /*
11996 * Save which index was checked for the first active mddb. If only 1
11997 * active mddb, don't want to recheck the same mddb when looking for
11998 * last active mddb.
11999 */
12000 prev_li = li;
12001
12002 /*
12003 * Now, checking for last active mddb. If found same index as before
12004 * (only 1 active mddb), then skip.
12005 */
12006 for (li = (lbp->lb_loccnt - 1); li >= 0; li--) {
12007 int checkcopy_err;
12008
12009 lp = &lbp->lb_locators[li];
12010 /* Find replica that is active */
12011 if (! (lp->l_flags & MDDB_F_ACTIVE))
12012 continue;
12013 if (lp->l_flags & MDDB_F_DELETED)
12014 continue;
12015 if (s->s_mbiarray[li] == NULL)
12016 continue;
12017 /* If already checked mddb, bail out */
12018 if (li == prev_li)
12019 break;
12020 /* Check locator block */
12021 mnlbp_od = (mddb_mnlb_t *)kmem_zalloc(dbtob(MDDB_MNLBCNT),
12022 KM_SLEEP);
12023 /* read in on-disk locator block */
12024 err = readblks(s, (caddr_t)mnlbp_od, 0, lbp->lb_blkcnt, li);
12025
12026 /* If err, try next mddb */
12027 if (err) {
12028 kmem_free(mnlbp_od, dbtob(MDDB_MNLBCNT));
12029 continue;
12030 }
12031
12032
12033 /* Is incore locator block same as ondisk? */
12034 if (bcmp((mddb_mnlb_t *)lbp, mnlbp_od, dbtob(MDDB_MNLBCNT))
12035 == 1) {
12036 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
12037 write_out_mddb = 1;
12038 break;
12039 }
12040
12041 kmem_free((caddr_t)mnlbp_od, dbtob(MDDB_MNLBCNT));
12042
12043 /* If lb ok, check locator names */
12044 mnlnp_od = (mddb_mnln_t *)
12045 kmem_zalloc(dbtob(MDDB_MNLNCNT), KM_SLEEP);
12046
12047 /* read in on-disk locator names */
12048 err = readblks(s, (caddr_t)mnlnp_od, lbp->lb_lnfirstblk,
12049 lbp->lb_lnblkcnt, li);
12050
12051 /* If err, try next mddb */
12052 if (err) {
12053 kmem_free(mnlnp_od, dbtob(MDDB_MNLNCNT));
12054 continue;
12055 }
12056
12057 /* Are incore locator names same as ondisk? */
12058 if (bcmp((mddb_mnln_t *)lnp, mnlnp_od, dbtob(MDDB_MNLNCNT))
12059 == 1) {
12060 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
12061 write_out_mddb = 1;
12062 break;
12063 }
12064
12065 kmem_free((caddr_t)mnlnp_od, dbtob(MDDB_MNLNCNT));
12066
12067 /*
12068 * Check records in mddb.
12069 * If a read error is encountered, set the error flag and
12070 * continue to the next mddb. Otherwise, if incore data is
12071 * different from ondisk, then set the flag to write out
12072 * the mddb and break out.
12073 */
12074 checkcopy_err = checkcopy(s, li);
12075 if (checkcopy_err == MDDB_F_EREAD) {
12076 lp->l_flags |= MDDB_F_EREAD;
12077 mddb_err = 1;
12078 continue;
12079 } else if (checkcopy_err == 1) {
12080 write_out_mddb = 1;
12081 break;
12082 }
12083 /*
12084 * Have found last active mddb and the data is the same as
12085 * incore - break out of loop
12086 */
12087 write_out_mddb = 0;
12088 break;
12089 }
12090
12091 /*
12092 * If ondisk and incore versions of the mddb don't match, then
12093 * write out this node's incore version to disk.
12094 * Or, if unable to read a copy of the mddb, attempt to write
12095 * out a new one.
12096 */
12097 write_out_mddb:
12098 if (write_out_mddb) {
12099 /* Recompute free blocks based on incore information */
12100 computefreeblks(s); /* set up free block bits */
12101
12102 /*
12103 * Write directory entries and record blocks.
12104 * Use flag MDDB_WRITECOPY_SYNC so that writecopy
12105 * routine won't write out change log records.
12106 */
12107 for (li = 0; li < lbp->lb_loccnt; li++) {
12108 lp = &lbp->lb_locators[li];
12109 /* Don't write to inactive or deleted mddbs */
12110 if (! (lp->l_flags & MDDB_F_ACTIVE))
12111 continue;
12112 if (lp->l_flags & MDDB_F_DELETED)
12113 continue;
12114 if (s->s_mbiarray[li] == NULL)
12115 continue;
12116 /* If encounter a write error, save it for later */
12117 if (writecopy(s, li, MDDB_WRITECOPY_SYNC)) {
12118 lp->l_flags |= MDDB_F_EWRITE;
12119 mddb_err = 1;
12120 }
12121 }
12122
12123 /*
12124 * Write out locator blocks to all replicas.
12125 * push_lb will set MDDB_F_EWRITE on replicas that fail.
12126 */
12127 if (push_lb(s))
12128 mddb_err = 1;
12129 (void) upd_med(s, "mddb_check_write_ioctl(0)");
12130
12131 /* Write out locator names to all replicas */
12132 lnp = s->s_lnp;
12133 uniqtime32(&lnp->ln_timestamp);
12134 lnp->ln_revision = MDDB_REV_MNLN;
12135 crcgen(lnp, &lnp->ln_checksum, dbtob(lbp->lb_lnblkcnt), NULL);
12136
12137 /* writeall sets MDDB_F_EWRITE if writes fails to replica */
12138 if (writeall(s, (caddr_t)lnp, lbp->lb_lnfirstblk,
12139 lbp->lb_lnblkcnt, 0))
12140 mddb_err = 1;
12141
12142 /*
12143 * The writes to the replicas above would have set
12144 * the MDDB_F_EWRITE flags if any write error was
12145 * encountered.
12146 * If < 50% of the mddbs are available, panic.
12147 */
12148 lc = alc = 0;
12149 for (li = 0; li < lbp->lb_loccnt; li++) {
12150 lp = &lbp->lb_locators[li];
12151 if (lp->l_flags & MDDB_F_DELETED)
12152 continue;
12153 lc++;
12154 /*
12155 * If mddb:
12156 * - is not active (previously had an error)
12157 * - had an error reading the master blocks or
12158 * - had an error in writing to the mddb
12159 * then don't count this mddb in the active count.
12160 */
12161 if (! (lp->l_flags & MDDB_F_ACTIVE) ||
12162 (lp->l_flags & MDDB_F_EMASTER) ||
12163 (lp->l_flags & MDDB_F_EWRITE))
12164 continue;
12165 alc++;
12166 }
12167 if (alc < ((lc + 1) / 2)) {
12168 cmn_err(CE_PANIC,
12169 "md: Panic due to lack of DiskSuite state\n"
12170 " database replicas. Fewer than 50%% of "
12171 "the total were available,\n so panic to "
12172 "ensure data integrity.");
12173 }
12174 }
12175
12176 /*
12177 * If encountered an error during checking or writing of
12178 * mddbs, call selectreplicas so that replica error can
12179 * be properly handled. This will involve another attempt
12180 * to write the mddb out to any mddb marked MDDB_F_EWRITE.
12181 * If mddb still fails, it will have the MDDB_F_ACTIVE bit
12182 * turned off. Set the MDDB_SCANALLSYNC flag so that
12183 * selectreplicas doesn't overwrite the change log entries.
12184 *
12185 * Set the PARSE_LOCBLK flag in the mddb_set structure to show
12186 * that the locator block has been changed.
12187 */
12188 if (mddb_err) {
12189 (void) selectreplicas(s, MDDB_SCANALLSYNC);
12190 s->s_mn_parseflags |= MDDB_PARSE_LOCBLK;
12191 }
12192
12193 write_out_end:
12194 mddb_setexit(s);
12195 return (rval);
12196 }
12197
12198 /*
12199 * Set/reset/get set flags in set structure.
12200 * Used during reconfig cycle
12201 * Only supported for MN disksets.
12202 *
12203 * Return values:
12204 * 0 - success
12205 * non-zero - failure
12206 */
12207 int
mddb_setflags_ioctl(mddb_setflags_config_t * info)12208 mddb_setflags_ioctl(mddb_setflags_config_t *info)
12209 {
12210 set_t setno = info->sf_setno;
12211
12212 /* Verify that setno is in valid range */
12213 if (setno >= md_nsets)
12214 return (EINVAL);
12215
12216 /*
12217 * When setting the flags, the set may not
12218 * be snarfed yet. So, don't check for SNARFED or MNset
12219 * and don't call mddb_setenter.
12220 * In order to discourage bad ioctl calls,
12221 * verify that magic field in structure is set correctly.
12222 */
12223 if (info->sf_magic != MDDB_SETFLAGS_MAGIC)
12224 return (EINVAL);
12225
12226 switch (info->sf_flags) {
12227 case MDDB_NM_SET:
12228 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12229 md_set_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12230 if (info->sf_setflags & MD_SET_MN_START_RC)
12231 md_set_setstatus(setno, MD_SET_MN_START_RC);
12232 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12233 md_set_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12234 break;
12235
12236 case MDDB_NM_RESET:
12237 if (info->sf_setflags & MD_SET_MN_NEWMAS_RC)
12238 md_clr_setstatus(setno, MD_SET_MN_NEWMAS_RC);
12239 if (info->sf_setflags & MD_SET_MN_START_RC)
12240 md_clr_setstatus(setno, MD_SET_MN_START_RC);
12241 if (info->sf_setflags & MD_SET_MN_MIR_STATE_RC)
12242 md_clr_setstatus(setno, MD_SET_MN_MIR_STATE_RC);
12243 break;
12244
12245 case MDDB_NM_GET:
12246 info->sf_setflags = md_get_setstatus(setno) &
12247 (MD_SET_MN_NEWMAS_RC|MD_SET_MN_START_RC|
12248 MD_SET_MN_MIR_STATE_RC);
12249 break;
12250 }
12251
12252 return (0);
12253 }
12254
12255 /*
12256 * md_update_minor
12257 *
12258 * This function updates the minor in the namespace entry for an
12259 * underlying metadevice. The function is called in mod_imp_set
12260 * where mod is sp, stripe, mirror and raid.
12261 *
12262 */
12263 int
md_update_minor(set_t setno,side_t side,mdkey_t key)12264 md_update_minor(
12265 set_t setno,
12266 side_t side,
12267 mdkey_t key
12268 )
12269 {
12270 struct nm_next_hdr *nh;
12271 struct nm_name *n;
12272 char *shn;
12273 int retval = 1;
12274 side_t s;
12275
12276 /*
12277 * Load the devid name space if it exists
12278 */
12279 (void) md_load_namespace(setno, NULL, NM_DEVID);
12280 if (! md_load_namespace(setno, NULL, 0L)) {
12281 /*
12282 * Unload the devid namespace
12283 */
12284 (void) md_unload_namespace(setno, NM_DEVID);
12285 return (0);
12286 }
12287
12288 rw_enter(&nm_lock.lock, RW_READER);
12289
12290 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12291 retval = 0;
12292 goto out;
12293 }
12294
12295 /*
12296 * Look up the key
12297 */
12298 for (s = 0; s < MD_MAXSIDES; s++) {
12299 /*
12300 * For side other than the import 'side', cleanup its entry
12301 */
12302 if ((n = lookup_entry(nh, setno, s, key, NODEV64, 0L)) !=
12303 NULL) {
12304 if (n->n_side == side) {
12305 /*
12306 * Update its n_minor if metadevice
12307 */
12308 if (((shn = (char *)getshared_name(setno,
12309 n->n_drv_key, 0L)) != NULL) &&
12310 (strcmp(shn, "md") == 0)) {
12311 n->n_minor = MD_MKMIN(setno,
12312 MD_MIN2UNIT(n->n_minor));
12313 }
12314 } else {
12315 /* We are not the import side, cleanup */
12316 (void) remove_entry(nh, n->n_side, key, 0L);
12317 }
12318 }
12319 }
12320
12321 out:
12322 rw_exit(&nm_lock.lock);
12323 return (retval);
12324 }
12325
12326 /*
12327 * md_update_top_device_minor
12328 *
12329 * This function updates the minor in the namespace entry for a top
12330 * level metadevice. The function is called in mod_imp_set where
12331 * mod is sp, stripe, mirror and raid.
12332 *
12333 */
12334 int
md_update_top_device_minor(set_t setno,side_t side,md_dev64_t dev)12335 md_update_top_device_minor(
12336 set_t setno,
12337 side_t side,
12338 md_dev64_t dev
12339 )
12340 {
12341 struct nm_next_hdr *nh;
12342 struct nm_name *n;
12343 char *shn;
12344 int retval = 1;
12345
12346 /*
12347 * Load the devid name space if it exists
12348 */
12349 (void) md_load_namespace(setno, NULL, NM_DEVID);
12350 if (! md_load_namespace(setno, NULL, 0L)) {
12351 /*
12352 * Unload the devid namespace
12353 */
12354 (void) md_unload_namespace(setno, NM_DEVID);
12355 return (0);
12356 }
12357
12358 rw_enter(&nm_lock.lock, RW_READER);
12359
12360 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
12361 retval = 0;
12362 goto out;
12363 }
12364
12365 /*
12366 * Look up the key
12367 */
12368 if ((n = lookup_entry(nh, setno, side, MD_KEYWILD, dev, 0L)) != NULL) {
12369 /*
12370 * Find the entry, update its n_minor if metadevice
12371 */
12372 if ((shn = (char *)getshared_name(setno, n->n_drv_key, 0L))
12373 == NULL) {
12374 retval = 0;
12375 goto out;
12376 }
12377
12378 if (strcmp(shn, "md") == 0) {
12379 n->n_minor = MD_MKMIN(setno, MD_MIN2UNIT(n->n_minor));
12380 }
12381 }
12382
12383 out:
12384 rw_exit(&nm_lock.lock);
12385 return (retval);
12386 }
12387
12388 static void
md_imp_nm(mddb_set_t * s)12389 md_imp_nm(
12390 mddb_set_t *s
12391 )
12392 {
12393 mddb_db_t *dbp;
12394 mddb_de_ic_t *dep;
12395 struct nm_rec_hdr *hdr;
12396 struct nm_header *hhdr;
12397 set_t setno = s->s_setno;
12398
12399 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12400 for (dep = dbp->db_firstentry; dep != NULL;
12401 dep = dep->de_next) {
12402 switch (dep->de_type1) {
12403
12404 case MDDB_NM_HDR:
12405 case MDDB_DID_NM_HDR:
12406
12407 hhdr = (struct nm_header *)
12408 dep->de_rb_userdata;
12409
12410 hdr = &hhdr->h_names;
12411 if (hdr->r_next_recid > 0) {
12412 hdr->r_next_recid = MAKERECID(setno,
12413 DBID(hdr->r_next_recid));
12414 }
12415
12416 hdr = &hhdr->h_shared;
12417 if (hdr->r_next_recid > 0) {
12418 hdr->r_next_recid = MAKERECID(setno,
12419 DBID(hdr->r_next_recid));
12420 }
12421 break;
12422
12423 case MDDB_NM:
12424 case MDDB_DID_NM:
12425 case MDDB_SHR_NM:
12426 case MDDB_DID_SHR_NM:
12427
12428 hdr = (struct nm_rec_hdr *)
12429 dep->de_rb_userdata;
12430
12431 if (hdr->r_next_recid > 0) {
12432 hdr->r_next_recid = MAKERECID
12433 (setno, DBID(hdr->r_next_recid));
12434 }
12435 break;
12436
12437 default:
12438 break;
12439 }
12440 }
12441 }
12442 }
12443
12444 static int
update_db_rec(mddb_set_t * s)12445 update_db_rec(
12446 mddb_set_t *s
12447 )
12448 {
12449 mddb_db_t *dbp;
12450 mddb_de_ic_t *dep;
12451 mddb_recid_t ids[2];
12452
12453 for (dbp = s->s_dbp; dbp != NULL; dbp = dbp->db_next) {
12454 for (dep = dbp->db_firstentry; dep != NULL;
12455 dep = dep->de_next) {
12456 if (! (dep->de_flags & MDDB_F_OPT)) {
12457 ids[0] = MAKERECID(s->s_setno, dep->de_recid);
12458 ids[1] = 0;
12459 if (mddb_commitrecs(ids)) {
12460 return (MDDB_E_NORECORD);
12461 }
12462 }
12463 }
12464 }
12465 return (0);
12466 }
12467
12468 static int
update_mb(mddb_set_t * s)12469 update_mb(
12470 mddb_set_t *s
12471 )
12472 {
12473 mddb_ri_t *rip;
12474 int err = 0;
12475
12476 for (rip = s->s_rip; rip != NULL; rip = rip->ri_next) {
12477 if (rip->ri_flags & MDDB_F_EMASTER)
12478 /* disk is powered off or not there */
12479 continue;
12480
12481 if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12482 /*
12483 * It is a replicated set
12484 */
12485 if (rip->ri_devid == (ddi_devid_t)NULL) {
12486 return (-1);
12487 }
12488 err = update_mb_devid(s, rip, rip->ri_devid);
12489 } else {
12490 /*
12491 * It is a non-replicated set
12492 * and there is no need to update
12493 * devid
12494 */
12495 err = update_mb_devid(s, rip, NULL);
12496 }
12497
12498 if (err)
12499 return (err);
12500 }
12501
12502 return (0);
12503 }
12504
12505 static int
update_setname(set_t setno)12506 update_setname(
12507 set_t setno
12508 )
12509 {
12510 struct nm_next_hdr *nh;
12511 struct nm_shared_name *shn, *new_shn;
12512 char *prefix = "/dev/md/";
12513 char *shrname;
12514 int len;
12515 mdkey_t o_key;
12516 uint32_t o_count, o_data;
12517 mddb_recid_t recid, ids[3];
12518 int err = 0;
12519 mddb_set_t *dbp;
12520
12521 /* Import setname */
12522 dbp = (mddb_set_t *)md_set[setno].s_db;
12523 len = strlen(prefix) + strlen(dbp->s_setname) + strlen("/dsk/") + 1;
12524 shrname = kmem_zalloc(len, KM_SLEEP);
12525 (void) sprintf(shrname, "%s%s%s", prefix, dbp->s_setname, "/dsk/");
12526
12527 rw_enter(&nm_lock.lock, RW_WRITER);
12528 if ((nh = get_first_record(setno, 0, NM_SHARED)) == NULL) {
12529 /*
12530 * No namespace is okay
12531 */
12532 err = 0;
12533 goto out;
12534 }
12535
12536 if ((shn = (struct nm_shared_name *)lookup_shared_entry(nh,
12537 0, prefix, NULL, NM_SHARED | NM_IMP_SHARED)) == NULL) {
12538 /*
12539 * No metadevice is okay
12540 */
12541 err = 0;
12542 goto out;
12543 }
12544
12545 /*
12546 * We have it, go ahead and update the namespace.
12547 */
12548 o_key = shn->sn_key;
12549 o_count = shn->sn_count;
12550 o_data = shn->sn_data;
12551
12552 if (remove_shared_entry(nh, o_key, NULL, 0L | NM_IMP_SHARED |
12553 NM_NOCOMMIT | NM_KEY_RECYCLE)) {
12554 err = MDDB_E_NORECORD;
12555 goto out;
12556 }
12557 if ((new_shn = (struct nm_shared_name *)alloc_entry(
12558 nh, md_set[setno].s_nmid, len, NM_SHARED |
12559 NM_NOCOMMIT, &recid)) == NULL) {
12560 err = MDDB_E_NORECORD;
12561 goto out;
12562 }
12563
12564 new_shn->sn_key = o_key;
12565 new_shn->sn_count = o_count;
12566 new_shn->sn_data = o_data;
12567 new_shn->sn_namlen = (ushort_t)len;
12568 (void) strcpy(new_shn->sn_name, shrname);
12569
12570 ids[0] = recid;
12571 ids[1] = md_set[setno].s_nmid;
12572 ids[2] = 0;
12573 err = mddb_commitrecs(ids);
12574
12575 out:
12576 if (shrname)
12577 kmem_free(shrname, len);
12578 rw_exit(&nm_lock.lock);
12579 return (err);
12580 }
12581
12582 /*
12583 * Returns 0 on success.
12584 * Returns -1 on failure with ep filled in.
12585 */
12586 static int
md_imp_db(set_t setno,int stale_flag,md_error_t * ep)12587 md_imp_db(
12588 set_t setno,
12589 int stale_flag,
12590 md_error_t *ep
12591 )
12592 {
12593 mddb_set_t *s;
12594 int err = 0;
12595 mddb_dt_t *dtp;
12596 mddb_lb_t *lbp;
12597 int i;
12598 int loccnt;
12599
12600 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12601 return (mddbstatus2error(ep, err, NODEV32, setno));
12602 }
12603
12604 /* Update dt */
12605 if ((dtp = (mddb_dt_t *)md_set[setno].s_dtp) != NULL) {
12606 crcgen(dtp, &dtp->dt_cks, MDDB_DT_BYTES, NULL);
12607 }
12608
12609 if ((err = dt_write(s)) != 0) {
12610 err = mdsyserror(ep, err);
12611 mddb_setexit(s);
12612 return (err);
12613 }
12614
12615 /*
12616 * Update lb, no need to update the mediator because
12617 * the diskset will only exist on the importing node
12618 * and as such a mediator adds no value.
12619 */
12620
12621 /* Update lb */
12622 if (stale_flag & MD_IMP_STALE_SET) {
12623 lbp = s->s_lbp;
12624 loccnt = lbp->lb_loccnt;
12625 for (i = 0; i < loccnt; i++) {
12626 mddb_locator_t *lp = &lbp->lb_locators[i];
12627 md_dev64_t ndev = md_expldev(lp->l_dev);
12628 ddi_devid_t devid_ptr;
12629
12630 devid_ptr = s->s_did_icp->did_ic_devid[i];
12631 if (devid_ptr == NULL) {
12632 /*
12633 * Already deleted, go to next one.
12634 */
12635 continue;
12636 }
12637 if (mddb_devid_validate((ddi_devid_t)devid_ptr, &ndev,
12638 NULL)) {
12639 /* disk unavailable, mark deleted */
12640 lp->l_flags = MDDB_F_DELETED;
12641 /* then remove the device id from the list */
12642 free_mbipp(&s->s_mbiarray[i]);
12643 (void) mddb_devid_delete(s, i);
12644 }
12645 }
12646 md_clr_setstatus(setno, MD_SET_STALE);
12647 }
12648
12649 if ((err = writelocall(s)) != 0) {
12650 err = mdmddberror(ep, MDDB_E_NOTNOW, NODEV32, setno);
12651 mddb_setexit(s);
12652 return (err);
12653 }
12654
12655 mddb_setexit(s);
12656
12657 /* Update db records */
12658 if ((err = update_db_rec(s)) != 0) {
12659 return (mddbstatus2error(ep, err, NODEV32, setno));
12660 }
12661
12662 /* Update setname embedded in the namespace */
12663 if ((err = update_setname(setno)) != 0)
12664 return (mddbstatus2error(ep, err, NODEV32, setno));
12665
12666 return (err);
12667 }
12668
12669 static void
md_dr_add(md_set_record * sr,md_drive_record * dr)12670 md_dr_add(
12671 md_set_record *sr,
12672 md_drive_record *dr
12673 )
12674 {
12675 md_drive_record *drv;
12676
12677 if (sr->sr_driverec == 0) {
12678 sr->sr_driverec = dr->dr_selfid;
12679 return;
12680 }
12681
12682 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12683 drv->dr_nextrec != 0;
12684 drv = (md_drive_record *)mddb_getrecaddr(drv->dr_nextrec))
12685 ;
12686 drv->dr_nextrec = dr->dr_selfid;
12687 }
12688
12689 static void
md_setup_recids(md_set_record * sr,mddb_recid_t ** ids,size_t size)12690 md_setup_recids(
12691 md_set_record *sr,
12692 mddb_recid_t **ids,
12693 size_t size
12694 )
12695 {
12696 md_drive_record *drv;
12697 int cnt;
12698 mddb_recid_t *recids;
12699
12700 recids = (mddb_recid_t *)kmem_zalloc(sizeof (mddb_recid_t)
12701 * size, KM_SLEEP);
12702 recids[0] = sr->sr_selfid;
12703 cnt = 1;
12704
12705 for (drv = (md_drive_record *)mddb_getrecaddr(sr->sr_driverec);
12706 /* CSTYLED */
12707 drv != NULL;) {
12708 recids[cnt++] = drv->dr_selfid;
12709 if (drv->dr_nextrec != 0)
12710 drv = (md_drive_record *)mddb_getrecaddr
12711 (drv->dr_nextrec);
12712 else
12713 drv = NULL;
12714 }
12715 recids[cnt] = 0;
12716 *ids = &recids[0];
12717 }
12718
12719 /*
12720 * The purpose of this function is to replace the old_devid with the
12721 * new_devid in the given namespace. This is used for importing
12722 * remotely replicated drives.
12723 */
12724 int
md_update_namespace_rr_did(mddb_config_t * cp)12725 md_update_namespace_rr_did(
12726 mddb_config_t *cp
12727 )
12728 {
12729 set_t setno = cp->c_setno;
12730 struct nm_next_hdr *nh;
12731 mdkey_t key = MD_KEYWILD;
12732 side_t side = MD_SIDEWILD;
12733 mddb_recid_t recids[3];
12734 struct did_min_name *n;
12735 struct nm_next_hdr *did_shr_nh;
12736 struct did_shr_name *shr_n;
12737 mdkey_t ent_did_key;
12738 uint32_t ent_did_count;
12739 uint32_t ent_did_data;
12740 ddi_devid_t devid = NULL;
12741 struct did_shr_name *shn;
12742 void *old_devid, *new_devid;
12743
12744 if (!(md_get_setstatus(setno) & MD_SET_NM_LOADED))
12745 return (EIO);
12746
12747 old_devid = (void *)(uintptr_t)cp->c_locator.l_old_devid;
12748 new_devid = (void *)(uintptr_t)cp->c_locator.l_devid;
12749
12750 /*
12751 * It is okay if we dont have any configuration
12752 */
12753 if ((nh = get_first_record(setno, 0, NM_DEVID | NM_NOTSHARED))
12754 == NULL) {
12755 return (0);
12756 }
12757 while ((key = md_getnextkey(setno, side, key, NULL)) != MD_KEYWILD) {
12758 /* check out every entry in the namespace */
12759 if ((n = (struct did_min_name *)lookup_entry(nh, setno,
12760 side, key, NODEV64, NM_DEVID)) == NULL) {
12761 continue;
12762 } else {
12763 did_shr_nh = get_first_record(setno, 0, NM_DEVID |
12764 NM_SHARED);
12765 if (did_shr_nh == NULL) {
12766 return (ENOENT);
12767 }
12768
12769 shr_n = (struct did_shr_name *)lookup_shared_entry(
12770 did_shr_nh, n->min_devid_key, (char *)0,
12771 &recids[0], NM_DEVID);
12772 if (shr_n == NULL) {
12773 return (ENOENT);
12774 }
12775 rw_enter(&nm_lock.lock, RW_WRITER);
12776 devid = (ddi_devid_t)shr_n->did_devid;
12777 /* find this devid in the incore replica */
12778 if (ddi_devid_compare(devid, old_devid) == 0) {
12779 /*
12780 * found the corresponding entry
12781 * update with new devid
12782 */
12783 /* first remove old devid info */
12784 ent_did_key = shr_n ->did_key;
12785 ent_did_count = shr_n->did_count;
12786 ent_did_data = shr_n->did_data;
12787 (void) remove_shared_entry(did_shr_nh,
12788 shr_n->did_key, NULL, NM_DEVID |
12789 NM_IMP_SHARED | NM_KEY_RECYCLE);
12790
12791 /* add in new devid info */
12792 if ((shn = (struct did_shr_name *)
12793 alloc_entry(did_shr_nh,
12794 md_set[setno].s_did_nmid,
12795 cp->c_locator.l_devid_sz,
12796 NM_DEVID | NM_SHARED | NM_NOCOMMIT,
12797 &recids[0])) == NULL) {
12798 rw_exit(&nm_lock.lock);
12799 return (ENOMEM);
12800 }
12801 shn->did_key = ent_did_key;
12802 shn->did_count = ent_did_count;
12803 ent_did_data |= NM_DEVID_VALID;
12804 shn->did_data = ent_did_data;
12805 shn->did_size = ddi_devid_sizeof(
12806 new_devid);
12807 bcopy((void *)new_devid, (void *)
12808 shn->did_devid, shn->did_size);
12809 recids[1] = md_set[setno].s_nmid;
12810 recids[2] = 0;
12811 mddb_commitrecs_wrapper(recids);
12812 }
12813 rw_exit(&nm_lock.lock);
12814 }
12815 }
12816
12817 return (0);
12818 }
12819
12820 /*
12821 * namespace is loaded before this is called.
12822 * This function is a wrapper for md_update_namespace_rr_did.
12823 *
12824 * md_update_namespace_rr_did may be called twice if attempting to
12825 * resolve a replicated device id during the take of a diskset - once
12826 * for the diskset namespace and a second time for the local namespace.
12827 * The local namespace would need to be updated when a drive has been
12828 * found during a take of the diskset that hadn't been resolved during
12829 * the import (aka partial replicated import).
12830 *
12831 * If being called during the import of the diskset (IMPORT flag set)
 * md_update_namespace_rr_did will only be called once with the diskset
12833 * namespace.
12834 */
12835 int
md_update_nm_rr_did_ioctl(mddb_config_t * cp)12836 md_update_nm_rr_did_ioctl(
12837 mddb_config_t *cp
12838 )
12839 {
12840 int rval = 0;
12841
12842 /* If update of diskset namespace fails, stop and return failure */
12843 if ((rval = md_update_namespace_rr_did(cp)) != 0)
12844 return (rval);
12845
12846 if (cp->c_flags & MDDB_C_IMPORT)
12847 return (0);
12848
12849 /* If update of local namespace fails, return failure */
12850 cp->c_setno = MD_LOCAL_SET;
12851 rval = md_update_namespace_rr_did(cp);
12852 return (rval);
12853 }
12854
12855 /*ARGSUSED*/
12856 int
md_imp_snarf_set(mddb_config_t * cp)12857 md_imp_snarf_set(
12858 mddb_config_t *cp
12859 )
12860 {
12861 set_t setno;
12862 int stale_flag;
12863 mddb_set_t *s;
12864 int i, err = 0;
12865 md_ops_t *ops;
12866 md_error_t *ep = &cp->c_mde;
12867
12868 setno = cp->c_setno;
12869 stale_flag = cp->c_flags;
12870
12871 mdclrerror(ep);
12872 if (setno >= md_nsets) {
12873 return (mdsyserror(ep, EINVAL));
12874 }
12875
12876 md_haltsnarf_enter(setno);
12877 if (md_get_setstatus(setno) & MD_SET_IMPORT) {
12878 goto out;
12879 }
12880
12881 /* Set the bit first otherwise load_old_replicas can fail */
12882 md_set_setstatus(setno, MD_SET_IMPORT);
12883
12884 if ((s = mddb_setenter(setno, MDDB_MUSTEXIST, &err)) == NULL) {
12885 err = mddbstatus2error(ep, err, NODEV32, setno);
12886 goto out;
12887 }
12888
12889 /*
12890 * Upon completion of load_old_replicas, the old setno is
12891 * restored from the disk so we need to reset
12892 */
12893 s->s_lbp->lb_setno = setno;
12894
12895 /*
12896 * Fixup the NM records before loading namespace
12897 */
12898 (void) md_imp_nm(s);
12899 mddb_setexit(s);
12900
12901 /*
12902 * Load the devid name space if it exists
12903 * and ask each module to fixup unit records
12904 */
12905 if (!md_load_namespace(setno, NULL, NM_DEVID)) {
12906 err = mdsyserror(ep, ENOENT);
12907 goto cleanup;
12908 }
12909 if (!md_load_namespace(setno, NULL, 0L)) {
12910 (void) md_unload_namespace(setno, NM_DEVID);
12911 err = mdsyserror(ep, ENOENT);
12912 goto cleanup;
12913 }
12914
12915 do {
12916 i = 0;
12917 for (ops = md_opslist; ops != NULL; ops = ops->md_next)
12918 if (ops->md_imp_set != NULL)
12919 i += ops->md_imp_set(setno);
12920 } while (i);
12921
12922 /*
12923 * Fixup
12924 * (1) locator block
12925 * (2) locator name block if necessary
12926 * (3) master block
12927 * (4) directory block
12928 * calls appropriate writes to push changes out
12929 */
12930 if ((err = md_imp_db(setno, stale_flag, ep)) != 0) {
12931 goto cleanup;
12932 }
12933
12934 /*
12935 * Don't unload namespace if importing a replicated diskset.
12936 * Namespace will be unloaded with an explicit RELEASE_SET ioctl.
12937 */
12938 if (md_get_setstatus(s->s_setno) & MD_SET_REPLICATED_IMPORT) {
12939 md_haltsnarf_exit(setno);
12940 return (err);
12941 }
12942
12943 cleanup:
12944 /*
12945 * Halt the set
12946 */
12947 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
12948 (void) md_halt_set(setno, MD_HALT_ALL);
12949 rw_exit(&md_unit_array_rw.lock);
12950
12951 /*
12952 * Unload the namespace for the imported set
12953 */
12954 mutex_enter(&mddb_lock);
12955 mddb_unload_set(setno);
12956 mutex_exit(&mddb_lock);
12957
12958 out:
12959 md_haltsnarf_exit(setno);
12960 md_clr_setstatus(setno, MD_SET_IMPORT | MD_SET_REPLICATED_IMPORT);
12961 return (err);
12962 }
12963 #endif /* MDDB_FAKE */
12964