1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
23 * Use is subject to license terms.
24 */
25
26 /*
27 * Database location balancing code.
28 */
29
30 #include <meta.h>
31 #include <sys/lvm/md_mddb.h>
32 #include <sdssc.h>
33
/*
 * Replica count per controller that balance_replicas() tries first (and
 * uses as a floor) before falling back to smaller counts.
 */
#define	MD_MINBALREP		2
35
36 /*
37 * Stuff for DB balancing.
38 */
/*
 * Operation recorded for a drive in the controller list; assigned by
 * add_drv_to_ctl_lst().
 */
enum md_ctlr_ops_t {
	DRV_NOP = 0,	/* drive was listed as already in the diskset */
	DRV_ADD = 1,	/* drive was listed as not yet in the diskset */
	DRV_DEL = 2	/* drive was listed a second time: being deleted */
};
typedef enum md_ctlr_ops_t md_ctlr_ops_t;
45
/*
 * drive flag fields (md_ctlr_drv_t.drv_flags)
 *
 * DRV_F_ERROR		an operation on the drive or its replicas failed
 * DRV_F_INDISKSET	the drive was listed as already being in the diskset
 */
#define	DRV_F_ERROR		0x1
#define	DRV_F_INDISKSET		0x2
49
/*
 * One drive hanging off a controller entry.
 */
struct md_ctlr_drv_t {
	md_ctlr_ops_t		drv_op;		/* DRV_NOP, DRV_ADD or DRV_DEL */
	int			drv_flags;	/* DRV_F_* bits */
	int			drv_dbcnt;	/* replicas on this drive */
	int			drv_new_dbcnt;	/* scratch copy of drv_dbcnt */
						/* used by count_replicas() */
	daddr_t			drv_dbsize;	/* size passed to add_replica() */
	mddrivename_t		*drv_dnp;	/* the drive itself */
	struct md_ctlr_drv_t	*drv_next;	/* next drive on this ctlr */
};
typedef struct md_ctlr_drv_t md_ctlr_drv_t;
60
/*
 * One controller in the list, with the drives found on it.
 */
struct md_ctlr_ctl_t {
	mdcinfo_t		*ctl_cinfop;	/* controller identity */
	int			ctl_dbcnt;	/* replicas on drives that */
						/* are not marked DRV_DEL */
	int			ctl_drcnt;	/* drives not marked DRV_DEL; */
						/* balance_replicas() may */
						/* clamp this value */
	md_ctlr_drv_t		*ctl_drvs;	/* drives on this controller */
	struct md_ctlr_ctl_t	*ctl_next;	/* next controller */
};
typedef struct md_ctlr_ctl_t md_ctlr_ctl_t;
69
70 static int
add_replica(mdsetname_t * sp,mddrivename_t * dnp,int dbcnt,daddr_t dbsize,md_error_t * ep)71 add_replica(
72 mdsetname_t *sp,
73 mddrivename_t *dnp,
74 int dbcnt,
75 daddr_t dbsize,
76 md_error_t *ep
77 )
78 {
79 mdnamelist_t *nlp = NULL;
80 mdname_t *np;
81 md_set_desc *sd;
82 uint_t rep_slice;
83
84 if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
85 return (-1);
86
87 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
88 return (-1);
89
90 (void) metanamelist_append(&nlp, np);
91
92 if ((sd = metaget_setdesc(sp, ep)) == NULL) {
93 metafreenamelist(nlp);
94 return (-1);
95 }
96
97 if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
98 (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) {
99 metafreenamelist(nlp);
100 return (-1);
101 }
102
103 metafreenamelist(nlp);
104 return (0);
105 }
106
107 static int
del_replica(mdsetname_t * sp,mddrivename_t * dnp,md_error_t * ep)108 del_replica(
109 mdsetname_t *sp,
110 mddrivename_t *dnp,
111 md_error_t *ep
112 )
113 {
114 mdnamelist_t *nlp = NULL;
115 mdname_t *np;
116 uint_t rep_slice;
117
118 if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
119 return (-1);
120
121 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
122 return (-1);
123
124 (void) metanamelist_append(&nlp, np);
125
126 if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED),
127 NULL, ep) == -1) {
128 metafreenamelist(nlp);
129 return (-1);
130 }
131
132 metafreenamelist(nlp);
133 return (0);
134 }
135
136 static int
rep_has_err(md_replicalist_t * rlp,mdname_t * np)137 rep_has_err(md_replicalist_t *rlp, mdname_t *np)
138 {
139 md_replicalist_t *rl;
140
141 for (rl = rlp; rl != NULL; rl = rl->rl_next) {
142 md_replica_t *r = rl->rl_repp;
143
144 if (strcmp(r->r_namep->cname, np->cname) != 0)
145 continue;
146
147 if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA |
148 MDDB_F_EMASTER | MDDB_F_EWRITE))
149 return (1);
150
151 }
152 return (0);
153 }
154
155 static int
add_drv_to_ctl_lst(md_ctlr_ctl_t ** clpp,md_replicalist_t * rlp,mddrivename_t * dnp,int dbcnt,daddr_t dbsize,mdcinfo_t * cinfop,int indiskset,int with_bus,int errored,md_error_t * ep)156 add_drv_to_ctl_lst(
157 md_ctlr_ctl_t **clpp,
158 md_replicalist_t *rlp,
159 mddrivename_t *dnp,
160 int dbcnt,
161 daddr_t dbsize,
162 mdcinfo_t *cinfop,
163 int indiskset,
164 int with_bus,
165 int errored,
166 md_error_t *ep
167 )
168 {
169 md_ctlr_drv_t **dpp;
170 mdname_t *np;
171 mdcinfo_t *tcinfop;
172 char *cmp_name_1, *cmp_name_2;
173 int not_found;
174
175 /*
176 * The user must pass in a list head.
177 */
178 assert(clpp != NULL);
179
180 if (cinfop == NULL) {
181 uint_t rep_slice;
182
183 if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
184 /*
185 * A failure to get the slice information can occur
186 * because the drive has failed, if this is the
187 * case then there is nothing that can be done
188 * with this drive, so do not include it in the
189 * list of drives. Clear the error and return.
190 */
191 mdclrerror(ep);
192 return (0);
193 }
194
195 if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
196 return (-1);
197
198 if ((tcinfop = metagetcinfo(np, ep)) == NULL)
199 return (-1);
200
201 if (metagetvtoc(np, FALSE, NULL, ep) == NULL)
202 errored = 1;
203
204 if (rep_has_err(rlp, np))
205 errored = 1;
206 } else
207 tcinfop = cinfop;
208
209 for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) {
210 /*
211 * Try to locate ctlr.
212 */
213 (void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1);
214 (void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname,
215 &cmp_name_2);
216
217 if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype ||
218 tcinfop->cnum != (*clpp)->ctl_cinfop->cnum ||
219 strncmp(cmp_name_1, cmp_name_2, 16) != 0 ||
220 (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) {
221 not_found = 1;
222 } else
223 not_found = 0;
224
225
226 sdssc_convert_path_free(cmp_name_1);
227 sdssc_convert_path_free(cmp_name_2);
228
229 if (not_found)
230 continue;
231
232 /*
233 * Found ctlr, try to locate the drive.
234 */
235 for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL;
236 dpp = &(*dpp)->drv_next) {
237 (void) sdssc_convert_cluster_path(
238 (*dpp)->drv_dnp->cname, &cmp_name_1);
239 (void) sdssc_convert_cluster_path(dnp->cname,
240 &cmp_name_2);
241
242 not_found = strcmp(cmp_name_1, cmp_name_2);
243
244 sdssc_convert_path_free(cmp_name_1);
245 sdssc_convert_path_free(cmp_name_2);
246
247 if (not_found)
248 continue;
249
250 /*
251 * Found drive, must be deleting.
252 */
253 (*dpp)->drv_op = DRV_DEL;
254 if (indiskset)
255 (*dpp)->drv_flags |= DRV_F_INDISKSET;
256 if (errored) {
257 mdclrerror(ep);
258 (*dpp)->drv_flags |= DRV_F_ERROR;
259 }
260 (*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt;
261 (*clpp)->ctl_drcnt--;
262 return (0);
263 }
264 /*
265 * The ctlr was found, but not the drive, so add
266 * the drive
267 */
268 (*dpp) = Zalloc(sizeof (**dpp));
269
270
271 if (indiskset) {
272 (*dpp)->drv_op = DRV_NOP;
273 (*dpp)->drv_flags |= DRV_F_INDISKSET;
274 if (errored) {
275 mdclrerror(ep);
276 (*dpp)->drv_flags |= DRV_F_ERROR;
277 }
278 } else {
279 (*dpp)->drv_op = DRV_ADD;
280 if (errored) {
281 (*dpp)->drv_flags |= DRV_F_ERROR;
282 return (-1);
283 }
284 assert(dbsize != 0);
285 }
286 (*dpp)->drv_dbcnt = dbcnt;
287 (*dpp)->drv_dbsize = dbsize;
288 (*dpp)->drv_dnp = dnp;
289 (*clpp)->ctl_dbcnt += dbcnt;
290 (*clpp)->ctl_drcnt++;
291 return (0);
292 }
293 /*
294 * No ctlr was located, so add the ctlr, then recurse to add the
295 * drive to the ctlr.
296 */
297 (*clpp) = Zalloc(sizeof (**clpp));
298
299 (*clpp)->ctl_cinfop = tcinfop;
300
301 return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop,
302 indiskset, with_bus, errored, ep));
303 }
304
305 static int
add_replica_to_ctl(mdsetname_t * sp,md_ctlr_ctl_t * c,int minimum_replicas,md_error_t * ep)306 add_replica_to_ctl(
307 mdsetname_t *sp,
308 md_ctlr_ctl_t *c,
309 int minimum_replicas,
310 md_error_t *ep
311 )
312 {
313 md_ctlr_drv_t *d;
314 int maxdb = 0;
315
316 /*
317 * If this ctrl has no "usable" drives, assert() or just return if
318 * assert()'s are turned off.
319 */
320 if (c->ctl_drcnt == 0) {
321 assert(0);
322 return (0);
323 }
324
325 /*
326 * Determine the largest DB count on a drive.
327 */
328 for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
329 if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
330 maxdb = d->drv_dbcnt;
331
332 /*
333 * Make sure we start at a reasonable number
334 */
335 if (maxdb == 0)
336 maxdb = 1;
337
338 /*
339 * Add a replica to a drive on this ctrl.
340 */
341 /*CONSTCOND*/
342 while (1) {
343 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
344 /*
345 * If this drive is being deleted, skip it.
346 */
347 if (d->drv_op == DRV_DEL)
348 continue;
349
350 if (d->drv_flags & DRV_F_ERROR)
351 continue;
352 /*
353 * Make sure that the replicas are distributed across
354 * the drives.
355 */
356 if (d->drv_dbcnt >= maxdb)
357 continue;
358 /*
359 * See if the drive already has replicas,
360 * if it does, then delete the exisiting
361 * replica(s) and re-add n+1 replicas to the drive.
362 */
363 /* ==== Vulnerability - no DB's start ==== */
364 if (d->drv_dbcnt > 0) {
365 if (del_replica(sp, d->drv_dnp, ep) == -1) {
366 d->drv_flags |= DRV_F_ERROR;
367 if (! (d->drv_flags & DRV_F_INDISKSET))
368 return (-1);
369 mdclrerror(ep);
370 continue;
371 }
372 }
373 if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1),
374 d->drv_dbsize, ep) == -1) {
375 md_error_t nep = mdnullerror;
376
377 if (d->drv_dbcnt) {
378 /*
379 * We have to to bring the replica
380 * in the drive to the previous
381 * status by adding the original no
382 * of replicas to the drive since
383 * the addition of (drv_dbcnt+1) no
384 * of replicas has failed. If we
385 * leave it at this state, we might
386 * end up having no replicas at
387 * all for the diskset.
388 */
389 if (add_replica(sp, d->drv_dnp,
390 d->drv_dbcnt, d->drv_dbsize,
391 &nep) == -1) {
392 c->ctl_dbcnt -= d->drv_dbcnt;
393 d->drv_dbcnt = 0;
394 mdclrerror(&nep);
395 }
396 }
397
398 if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
399 return (-1);
400
401 if (mdismddberror(ep, MDE_REPLICA_TOOSMALL))
402 continue;
403
404 d->drv_flags |= DRV_F_ERROR;
405 if (! (d->drv_flags & DRV_F_INDISKSET))
406 return (-1);
407 mdclrerror(ep);
408 continue;
409 }
410
411 d->drv_dbcnt++;
412 c->ctl_dbcnt++;
413 /* ==== Vulnerability - no DB's end ==== */
414 return (1);
415 }
416 maxdb++;
417 if (maxdb > minimum_replicas)
418 return (0);
419 }
420 /*NOTREACHED*/
421 }
422
423 static int
del_replica_from_ctl(mdsetname_t * sp,md_ctlr_ctl_t * c,md_error_t * ep)424 del_replica_from_ctl(
425 mdsetname_t *sp,
426 md_ctlr_ctl_t *c,
427 md_error_t *ep
428 )
429 {
430 md_ctlr_drv_t *d;
431 int maxdb = 0;
432
433 /*
434 * If this ctrl has no "usable" drives, assert() or just return if
435 * assert()'s are turned off.
436 */
437 if (c->ctl_drcnt == 0) {
438 assert(0);
439 return (0);
440 }
441
442 /*
443 * Determine the largest DB count on a drive.
444 */
445 for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
446 if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
447 maxdb = d->drv_dbcnt;
448
449 if (maxdb == 0)
450 return (0);
451
452 /*
453 * Delete a replica from a drive on this ctrl.
454 */
455 /*CONSTCOND*/
456 while (1) {
457 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
458 /*
459 * If this drive is being deleted, skip it.
460 */
461 if (d->drv_op == DRV_DEL)
462 continue;
463
464 /*
465 * Make sure that there are replicas on this drive to
466 * delete.
467 */
468 if (d->drv_dbcnt == 0)
469 continue;
470
471 if (d->drv_flags & DRV_F_ERROR)
472 continue;
473
474 /*
475 * We need to keep the DB's distributed across the
476 * drives.
477 */
478 if (d->drv_dbcnt < maxdb)
479 continue;
480
481 /*
482 * Delete all the replicas on the drive.
483 */
484 /* ==== Vulnerability - no DB's start ==== */
485 if (del_replica(sp, d->drv_dnp, ep) == -1) {
486 d->drv_flags |= DRV_F_ERROR;
487 if (! (d->drv_flags & DRV_F_INDISKSET))
488 return (-1);
489 mdclrerror(ep);
490 continue;
491 }
492 d->drv_dbcnt--;
493 c->ctl_dbcnt--;
494 /*
495 * If there is still a dbcnt for this drive, then add
496 * back the needed DB's.
497 */
498 if (d->drv_dbcnt > 0) {
499 if (add_replica(sp, d->drv_dnp, d->drv_dbcnt,
500 d->drv_dbsize, ep) == -1) {
501 c->ctl_dbcnt -= d->drv_dbcnt;
502 d->drv_dbcnt = 0;
503
504 if (mdismddberror(ep,
505 MDE_TOOMANY_REPLICAS))
506 return (-1);
507
508 d->drv_flags |= DRV_F_ERROR;
509 if (! (d->drv_flags & DRV_F_INDISKSET))
510 return (-1);
511 mdclrerror(ep);
512 continue;
513 }
514 }
515 /* ==== Vulnerability - no DB's end ==== */
516 return (1);
517 }
518 maxdb--;
519 if (maxdb <= 0)
520 return (0);
521 }
522 /*NOTREACHED*/
523 }
524
525 static int
del_replicas(mdsetname_t * sp,md_ctlr_ctl_t * clp,md_error_t * ep)526 del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep)
527 {
528 md_ctlr_ctl_t *c;
529 md_ctlr_drv_t *d;
530 mdnamelist_t *nlp;
531 mdname_t *np;
532
533 for (c = clp; c != NULL; c = c->ctl_next) {
534 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
535 uint_t rep_slice;
536
537 if (! (d->drv_flags & DRV_F_ERROR) &&
538 (d->drv_op != DRV_DEL))
539 continue;
540
541 if (d->drv_dbcnt == 0)
542 continue;
543
544 if (meta_replicaslice(d->drv_dnp,
545 &rep_slice, ep) != 0)
546 return (-1);
547
548 np = metaslicename(d->drv_dnp, rep_slice, ep);
549 if (np == NULL)
550 return (-1);
551
552 nlp = NULL;
553 (void) metanamelist_append(&nlp, np);
554
555 /*
556 * Delete the replicas listed.
557 */
558 if (meta_db_detach(sp, nlp,
559 (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
560 ep) == -1) {
561 metafreenamelist(nlp);
562 if (d->drv_flags & DRV_F_INDISKSET) {
563 mdclrerror(ep);
564 continue;
565 }
566 return (-1);
567 }
568 metafreenamelist(nlp);
569 }
570 }
571
572 return (0);
573 }
574
575 static void
free_ctlr_lst(md_ctlr_ctl_t ** clpp)576 free_ctlr_lst(md_ctlr_ctl_t **clpp)
577 {
578 md_ctlr_ctl_t *c, *tc = NULL;
579 md_ctlr_drv_t *d, *td = NULL;
580
581 for (c = *clpp; c != NULL; c = tc) {
582 tc = c->ctl_next;
583 for (d = c->ctl_drvs; d != NULL; d = td) {
584 td = d->drv_next;
585 Free(d);
586 }
587 Free(c);
588 }
589 *clpp = NULL;
590 }
591
592 static int
build_ctlr_lst(mdsetname_t * sp,md_ctlr_ctl_t ** clpp,md_drive_desc * opdd,md_drive_desc * curdd,int with_bus,daddr_t dbsize,md_error_t * ep)593 build_ctlr_lst(
594 mdsetname_t *sp,
595 md_ctlr_ctl_t **clpp,
596 md_drive_desc *opdd,
597 md_drive_desc *curdd,
598 int with_bus,
599 daddr_t dbsize,
600 md_error_t *ep
601 )
602 {
603 md_drive_desc *d;
604 md_set_desc *sd;
605 daddr_t nblks;
606 md_replicalist_t *rlp = NULL;
607 static daddr_t min_dbsize = 0;
608
609 if (min_dbsize == 0) {
610 if ((nblks = meta_db_minreplica(sp, ep)) < 0) {
611 min_dbsize = MD_DBSIZE;
612
613 if (! metaislocalset(sp)) {
614 if ((sd = metaget_setdesc(sp, ep)) == NULL)
615 return (-1);
616
617 if (MD_MNSET_DESC(sd))
618 min_dbsize = MD_MN_DBSIZE;
619 }
620 mdclrerror(ep);
621 } else
622 min_dbsize = nblks;
623 }
624
625 if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
626 if (! mdismddberror(ep, MDE_DB_NODB) &&
627 ! mdismddberror(ep, MDE_DB_NOTOWNER))
628 return (-1);
629 mdclrerror(ep);
630 }
631
632 /*
633 * Add drives currently in the set to the ctlr list.
634 */
635 for (d = curdd; d != NULL; d = d->dd_next) {
636 daddr_t this_dbsize = d->dd_dbsize;
637
638 if (this_dbsize == 0)
639 this_dbsize = min_dbsize;
640
641 if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt,
642 this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1)
643 return (-1);
644 }
645
646 /*
647 * Add the drives that are being operated on to the ctlr list.
648 */
649 for (d = opdd; d != NULL; d = d->dd_next)
650 if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL,
651 FALSE, with_bus, 0, ep) == -1)
652 return (-1);
653
654 metafreereplicalist(rlp);
655 return (0);
656 }
657
658 static int
count_replica_on_ctl(md_ctlr_ctl_t * c,int adding,int * db_cnt,int minimum_replicas)659 count_replica_on_ctl(
660 md_ctlr_ctl_t *c,
661 int adding,
662 int *db_cnt,
663 int minimum_replicas
664 )
665 {
666 md_ctlr_drv_t *d;
667 int maxdb = 0;
668
669 /*
670 * If this ctrl has no "usable" drives, nothing to do.
671 */
672 if (c->ctl_drcnt == 0)
673 return (0);
674
675 /*
676 * Determine the largest DB count on a drive.
677 */
678 for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
679 if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL)
680 maxdb = d->drv_new_dbcnt;
681
682 /*
683 * Make sure we start at a reasonable number
684 */
685 if (maxdb == 0) {
686 if (!adding)
687 return (0);
688 maxdb = 1;
689 }
690
691 /*
692 * Count or Un-Count replicas that would be
693 * added or deleted respectively.
694 */
695 /*CONSTCOND*/
696 while (1) {
697 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
698 /*
699 * If this drive is being deleted, skip it.
700 */
701 if (d->drv_op == DRV_DEL)
702 continue;
703
704 /*
705 * If the drive is errored and adding, skip it.
706 */
707 if (adding && (d->drv_flags & DRV_F_ERROR))
708 continue;
709
710 /*
711 * Make sure that the replicas are distributed across
712 * the drives.
713 */
714 if (adding) {
715 if (d->drv_new_dbcnt >= maxdb)
716 continue;
717 } else {
718 if (d->drv_new_dbcnt == 0)
719 continue;
720 if (d->drv_new_dbcnt < maxdb)
721 continue;
722 }
723
724 /*
725 * Count or Un-Count replicas here.
726 */
727 if (adding) {
728 mdpart_t *partp;
729 uint_t rep_slice;
730 md_error_t mde = mdnullerror;
731
732 if (meta_replicaslice(d->drv_dnp,
733 &rep_slice, &mde) != 0) {
734 mdclrerror(&mde);
735 continue;
736 }
737
738 partp = &d->drv_dnp->vtoc.parts[rep_slice];
739 if (! partp)
740 continue;
741
742 if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) >
743 (partp->size - 16))
744 continue;
745 (*db_cnt)++;
746 d->drv_new_dbcnt++;
747 } else {
748 (*db_cnt)--;
749 d->drv_new_dbcnt--;
750 }
751 return (0);
752 }
753
754 /*
755 * This should make sure they get spread
756 * around. This is to emulate the {add,del}_replica
757 * routines.
758 */
759 if (adding) {
760 maxdb++;
761 if (maxdb > minimum_replicas)
762 return (-1);
763 } else {
764 maxdb--;
765 if (maxdb <= 0)
766 return (-1);
767 }
768 }
769 /*NOTREACHED*/
770 }
771
772 static int
count_replicas(md_ctlr_ctl_t * clp,int min_reps)773 count_replicas(
774 md_ctlr_ctl_t *clp,
775 int min_reps
776 )
777 {
778 md_ctlr_ctl_t *c;
779 md_ctlr_drv_t *d;
780 int db_cnt;
781 int uctlrs = 0;
782 int total_cnt = 0;
783
784 /*
785 * Count the number of controllers,
786 * counting the replicas is slightly different based
787 * on the controller count.
788 */
789 for (c = clp; c != NULL; c = c->ctl_next)
790 if (c->ctl_drcnt > 0) {
791 uctlrs++;
792 for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
793 d->drv_new_dbcnt = d->drv_dbcnt;
794 }
795
796 if (uctlrs > 2) {
797 for (c = clp; c != NULL; c = c->ctl_next) {
798 if (c->ctl_drcnt == 0)
799 continue;
800
801 db_cnt = c->ctl_dbcnt;
802 /*
803 * Count the replicas that would be added.
804 */
805 while (db_cnt < min_reps)
806 if (count_replica_on_ctl(c, TRUE,
807 &db_cnt, min_reps))
808 return (-1);
809
810 /*
811 * Un-Count the replicas that would be deleted.
812 */
813 while (db_cnt > min_reps)
814 if (count_replica_on_ctl(c, FALSE,
815 &db_cnt, min_reps))
816 return (-1);
817 total_cnt += db_cnt;
818 }
819 } else {
820 for (c = clp; c != NULL; c = c->ctl_next) {
821 if (c->ctl_drcnt == 0)
822 continue;
823
824 db_cnt = c->ctl_dbcnt;
825 /*
826 * Count the replicas that woud be added.
827 */
828 while (db_cnt < (min_reps * c->ctl_drcnt))
829 if (count_replica_on_ctl(c, TRUE,
830 &db_cnt, min_reps))
831 return (-1);
832
833 total_cnt += db_cnt;
834 }
835 }
836
837 return (total_cnt);
838 }
839
840 static int
balance_replicas(mdsetname_t * sp,md_ctlr_ctl_t ** clpp,md_drive_desc * opdd,md_drive_desc * curdd,daddr_t dbsize,int * minimum_replicas,md_error_t * ep)841 balance_replicas(
842 mdsetname_t *sp,
843 md_ctlr_ctl_t **clpp,
844 md_drive_desc *opdd,
845 md_drive_desc *curdd,
846 daddr_t dbsize,
847 int *minimum_replicas,
848 md_error_t *ep
849 )
850 {
851 int n;
852 int rctlrs = 0;
853 int uctlrs;
854 int ructlrs;
855 int octlrs;
856 int save_done;
857 int prevcnt = 0, issame = 1;
858 uint_t drvcnt = ~0U;
859 uint_t save_cnum;
860 mhd_ctlrtype_t save_ctype;
861 char save_cname[16];
862 char *cmp_name_1, *cmp_name_2;
863 int reps;
864 md_ctlr_ctl_t *c;
865
866 /*
867 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
868 */
869 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
870 return (-1);
871
872 /*
873 * Determine what controllers are usable in the sense of being able to
874 * add a replica to a drive on the controller.
875 * Also find the minimum number of drives on a controller.
876 */
877 for (c = *clpp; c != NULL; c = c->ctl_next) {
878 if (c->ctl_drcnt > 0) {
879 rctlrs++;
880 drvcnt = min(drvcnt, c->ctl_drcnt);
881 if (prevcnt == 0)
882 prevcnt = c->ctl_drcnt;
883 else if (prevcnt != c->ctl_drcnt)
884 issame = 0;
885 }
886 }
887
888 if ((rctlrs <= 2) || (issame && (drvcnt >= 30)))
889 goto cont;
890
891 /*
892 * If here: Handling 3 or more controllers most
893 * likely with non-symmetrical number of
894 * disks. The number of replicas will be
895 * the minimum number of disks on a controller.
896 *
897 * The main point is to insure that a
898 * controller does not have more than half
899 * of the replicas.
900 */
901 drvcnt = min(drvcnt, 12);
902 drvcnt = max(drvcnt, MD_MINBALREP);
903
904 /*
905 * Can we find fewer than the maximum replicas by reducing the
906 * number of replicas per drive.
907 */
908 for (n = drvcnt; n > 0; n--) {
909 reps = count_replicas(*clpp, n);
910 if (reps > 0 && reps <= MDDB_NLB) {
911 *minimum_replicas = n;
912 return (0);
913 }
914 }
915
916 cont:
917 free_ctlr_lst(clpp);
918
919 /*
920 * Build a ctlr list with SSA-100 busses as separate controllers.
921 *
922 * If Here: Try to put 2 replicas per controller/bus
923 * If that doesn't work put 1 replica per controller/bus
924 */
925 if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1)
926 return (-1);
927
928 /*
929 * If the number of "real" controllers is 2, special handling may be
930 * needed.
931 */
932 if (rctlrs != 2) {
933 drvcnt = MD_MINBALREP;
934 goto other;
935 }
936
937 /*
938 * Determine what controllers are usable in the sense of being able to
939 * add a replica to a drive on the controller.
940 * Also find the minimum number of drives on a controller.
941 */
942 drvcnt = ~0U;
943 uctlrs = 0;
944 for (c = *clpp; c != NULL; c = c->ctl_next) {
945 if (c->ctl_drcnt > 0) {
946 uctlrs++;
947 drvcnt = min(drvcnt, c->ctl_drcnt);
948 }
949 }
950
951 /*
952 * If the number of controllers is not changed, continue with original
953 * strategy.
954 */
955 if (uctlrs == rctlrs) {
956 drvcnt = MD_MINBALREP;
957 goto other;
958 }
959
960 /*
961 * Check the distribution of bus ctlrs across real controllers.
962 */
963 ructlrs = 0;
964 octlrs = 0;
965 save_done = 0;
966 for (c = *clpp; c != NULL; c = c->ctl_next) {
967 if (c->ctl_drcnt == 0)
968 continue;
969
970 if (! save_done) {
971 save_cnum = c->ctl_cinfop->cnum;
972 save_ctype = c->ctl_cinfop->ctype;
973 (void) strncpy(save_cname, c->ctl_cinfop->cname, 16);
974 save_done = 1;
975 }
976
977 (void) sdssc_convert_cluster_path(c->ctl_cinfop->cname,
978 &cmp_name_1);
979 (void) sdssc_convert_cluster_path(save_cname, &cmp_name_2);
980
981 if (save_ctype != c->ctl_cinfop->ctype ||
982 save_cnum != c->ctl_cinfop->cnum ||
983 strncmp(cmp_name_1, cmp_name_2, 16) != 0)
984 octlrs++;
985 else
986 ructlrs++;
987
988 sdssc_convert_path_free(cmp_name_1);
989 sdssc_convert_path_free(cmp_name_2);
990 }
991
992 /*
993 * Take the largest of the counts
994 */
995 ructlrs = max(ructlrs, octlrs);
996
997 /*
998 * If the distribution of bus controlers is half of the total, then
999 * this layout strategy will work, doit.
1000 */
1001 if ((uctlrs / 2) == ructlrs) {
1002 drvcnt = MD_MINBALREP;
1003 goto other;
1004 }
1005
1006 /*
1007 * If here, there is a distribution of bus controllers that will cause
1008 * the real controller distribution to be unbalanced, so a different
1009 * strategy is used.
1010 */
1011 free_ctlr_lst(clpp);
1012
1013 /*
1014 * Build the ctlr list with SSA-100 busses NOT as separate controllers.
1015 */
1016 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
1017 return (-1);
1018
1019 /*
1020 * Make ctl_drcnt limit the number of replicas
1021 */
1022 for (c = *clpp; c != NULL; c = c->ctl_next)
1023 c->ctl_drcnt = min(drvcnt, c->ctl_drcnt);
1024
1025 /*
1026 * Try at least MD_MINBALREP's per controller after changing ctl_drcnt
1027 */
1028 drvcnt = MD_MINBALREP;
1029
1030 other:
1031 /*
1032 * Can we find fewer than the maximum replicas by reducing the number
1033 * of replicas per drive.
1034 */
1035 for (n = drvcnt; n > 0; n--) {
1036 reps = count_replicas(*clpp, n);
1037 if (reps > 0 && reps <= MDDB_NLB) {
1038 *minimum_replicas = n;
1039 return (0);
1040 }
1041 }
1042
1043 free_ctlr_lst(clpp);
1044
1045 /*
1046 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
1047 *
1048 * If Here: Try to put 2 replicas per controller (not on busses)
1049 * If that doesn't work put 1 replica per controller
1050 */
1051 if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
1052 return (-1);
1053
1054 /*
1055 * Can we find fewer than the maximum replicas by reducing the
1056 * number of replicas per drive.
1057 */
1058 for (n = MD_MINBALREP; n > 0; n--) {
1059 reps = count_replicas(*clpp, n);
1060 if (reps > 0 && reps <= MDDB_NLB) {
1061 *minimum_replicas = n;
1062 return (0);
1063 }
1064 }
1065
1066 /*
1067 * Return a ctrl list that does not include the SSA-100 buses as
1068 * separate controllers. This will create fewer separate controllers.
1069 */
1070 *minimum_replicas = 1;
1071 return (0);
1072 }
1073
1074 static int
morethan2_ctl_balance(mdsetname_t * sp,md_ctlr_ctl_t * clp,int min_reps,md_error_t * ep)1075 morethan2_ctl_balance(
1076 mdsetname_t *sp,
1077 md_ctlr_ctl_t *clp,
1078 int min_reps,
1079 md_error_t *ep
1080 )
1081 {
1082 md_ctlr_ctl_t *c;
1083 int err;
1084 int multiple_reps = 0;
1085 md_ctlr_drv_t *d;
1086
1087 for (c = clp; c != NULL; c = c->ctl_next) {
1088 if (c->ctl_drcnt == 0)
1089 continue;
1090
1091 /*
1092 * check for multiple databases on a disk and compensate
1093 */
1094 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1095 if (d->drv_dbcnt)
1096 multiple_reps += d->drv_dbcnt - 1;
1097 }
1098
1099 /*
1100 * remove the number of multiple databases count from the
1101 * total db count. This enables us to rebalance if one of
1102 * the disks has a large enough slice for 2 metadb's. If we
1103 * then add a disk with a smaller slice into the set, we want
1104 * that disk to get a replica on it. If we just compare to
1105 * ctl_dbcnt, it won't.
1106 */
1107 while ((c->ctl_dbcnt - multiple_reps) <
1108 min_reps) {
1109 if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
1110 return (-1);
1111 if (err == 0)
1112 break;
1113 }
1114
1115 while (c->ctl_dbcnt > min_reps) {
1116 if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
1117 return (-1);
1118 if (err == 0)
1119 break;
1120 }
1121 }
1122
1123 return (0);
1124 }
1125
1126 static int
lessthan3_ctl_balance(mdsetname_t * sp,md_ctlr_ctl_t * clp,int min_reps,md_error_t * ep)1127 lessthan3_ctl_balance(
1128 mdsetname_t *sp,
1129 md_ctlr_ctl_t *clp,
1130 int min_reps,
1131 md_error_t *ep
1132 )
1133 {
1134 md_ctlr_ctl_t *c;
1135 int err;
1136 int multiple_reps = 0;
1137 md_ctlr_drv_t *d;
1138
1139 for (c = clp; c != NULL; c = c->ctl_next) {
1140 if (c->ctl_drcnt == 0)
1141 continue;
1142
1143 /*
1144 * check for multiple databases on a disk and compensate
1145 */
1146 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1147 if (d->drv_dbcnt)
1148 multiple_reps += d->drv_dbcnt - 1;
1149 }
1150
1151 /*
1152 * remove the number of multiple databases count from the
1153 * total db count. This enables us to rebalance if one of
1154 * the disks has a large enough slice for 2 metadb's. If we
1155 * then add a disk with a smaller slice into the set, we want
1156 * that disk to get a replica on it. If we just compare to
1157 * ctl_dbcnt, it won't.
1158 */
1159 while ((c->ctl_dbcnt - multiple_reps) <
1160 (min_reps * c->ctl_drcnt)) {
1161 if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
1162 return (-1);
1163 if (err == 0)
1164 break;
1165 }
1166
1167 while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) {
1168 if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
1169 return (-1);
1170 if (err == 0)
1171 break;
1172 }
1173 }
1174
1175 return (0);
1176 }
1177
1178 static int
try_again(md_ctlr_ctl_t * clp,md_error_t * ep)1179 try_again(
1180 md_ctlr_ctl_t *clp,
1181 md_error_t *ep
1182 )
1183 {
1184 md_ctlr_ctl_t *c;
1185 md_ctlr_drv_t *d;
1186
1187 if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
1188 return (TRUE);
1189
1190 /*
1191 * retry if all the errored drives are already in the diskset.
1192 */
1193 for (c = clp; c != NULL; c = c->ctl_next) {
1194 for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1195 if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR))
1196 == DRV_F_ERROR)
1197 return (FALSE);
1198 }
1199 }
1200 return (TRUE);
1201 }
1202
1203 int
meta_db_balance(mdsetname_t * sp,md_drive_desc * opdd,md_drive_desc * curdd,daddr_t dbsize,md_error_t * ep)1204 meta_db_balance(
1205 mdsetname_t *sp,
1206 md_drive_desc *opdd,
1207 md_drive_desc *curdd,
1208 daddr_t dbsize,
1209 md_error_t *ep
1210 )
1211 {
1212 int min_reps;
1213 md_ctlr_ctl_t *c, *cl = NULL;
1214 int uctlrs = 0;
1215 int retry = 0;
1216 int rval = 0;
1217
1218 if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1)
1219 return (-1);
1220
1221 /*
1222 * Determine what controllers are usable in the sense of being able to
1223 * add a replica to a drive on the controller.
1224 */
1225 for (c = cl; c != NULL; c = c->ctl_next)
1226 if (c->ctl_drcnt > 0)
1227 uctlrs++;
1228
1229 /*
1230 * Add replicas to achieve a balance.
1231 */
1232 if (uctlrs > 2)
1233 rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
1234 else
1235 rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
1236
1237 if (rval) {
1238 if ((retry = try_again(cl, ep)) == TRUE) {
1239 mdclrerror(ep);
1240 rval = 0;
1241 }
1242 }
1243
1244 /*
1245 * Delete all the replicas from drives that are so marked.
1246 */
1247 if (! rval)
1248 rval = del_replicas(sp, cl, ep);
1249
1250 if (retry) {
1251 if (uctlrs > 2)
1252 rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
1253 else
1254 rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
1255
1256 if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) {
1257 mdclrerror(ep);
1258 rval = 0;
1259 }
1260 }
1261
1262 /*
1263 * Free up the ctlr list.
1264 */
1265 free_ctlr_lst(&cl);
1266
1267 return (rval);
1268 }
1269