/*
 * xref: /titanic_41/usr/src/lib/lvm/libmeta/common/meta_db_balance.c
 * (revision 2791f8b95893f7d64b6f89703e7af240aa84a33f)
 */
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Database location balancing code.
 */

30 #include <meta.h>
31 #include <sys/lvm/md_mddb.h>
32 #include <sdssc.h>
33 
34 #define	MD_MINBALREP	2
35 
/*
 * Stuff for DB balancing.
 */

/*
 * Operation recorded against a drive while the controller list is built
 * (see add_drv_to_ctl_lst()).
 */
typedef enum md_ctlr_ops_t {
	DRV_NOP = 0,	/* drive is already in the set, nothing pending */
	DRV_ADD = 1,	/* drive is being added to the set */
	DRV_DEL = 2	/* drive is being deleted from the set */
} md_ctlr_ops_t;

/* drive flag fields (bits kept in md_ctlr_drv_t.drv_flags) */
#define	DRV_F_ERROR	0x1	/* drive, or a replica on it, is in error */
#define	DRV_F_INDISKSET	0x2	/* drive is already part of the diskset */

50 struct md_ctlr_drv_t {
51 	md_ctlr_ops_t drv_op;
52 	int drv_flags;
53 	int drv_dbcnt;
54 	int drv_new_dbcnt;
55 	daddr_t drv_dbsize;
56 	mddrivename_t *drv_dnp;
57 	struct md_ctlr_drv_t *drv_next;
58 };
59 typedef struct md_ctlr_drv_t md_ctlr_drv_t;
60 
61 struct md_ctlr_ctl_t {
62 	mdcinfo_t *ctl_cinfop;
63 	int ctl_dbcnt;
64 	int ctl_drcnt;
65 	md_ctlr_drv_t *ctl_drvs;
66 	struct md_ctlr_ctl_t *ctl_next;
67 };
68 typedef struct md_ctlr_ctl_t md_ctlr_ctl_t;
69 
70 static int
add_replica(mdsetname_t * sp,mddrivename_t * dnp,int dbcnt,daddr_t dbsize,md_error_t * ep)71 add_replica(
72 	mdsetname_t		*sp,
73 	mddrivename_t		*dnp,
74 	int			dbcnt,
75 	daddr_t			dbsize,
76 	md_error_t		*ep
77 )
78 {
79 	mdnamelist_t		*nlp = NULL;
80 	mdname_t		*np;
81 	md_set_desc		*sd;
82 	uint_t			rep_slice;
83 
84 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
85 		return (-1);
86 
87 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
88 		return (-1);
89 
90 	(void) metanamelist_append(&nlp, np);
91 
92 	if ((sd = metaget_setdesc(sp, ep)) == NULL) {
93 		metafreenamelist(nlp);
94 		return (-1);
95 	}
96 
97 	if (meta_db_attach(sp, nlp, (MDCHK_DRVINSET | MDCHK_SET_LOCKED),
98 	    (&sd->sd_ctime), dbcnt, dbsize, NULL, ep) == -1) {
99 		metafreenamelist(nlp);
100 		return (-1);
101 	}
102 
103 	metafreenamelist(nlp);
104 	return (0);
105 }
106 
107 static int
del_replica(mdsetname_t * sp,mddrivename_t * dnp,md_error_t * ep)108 del_replica(
109 	mdsetname_t		*sp,
110 	mddrivename_t		*dnp,
111 	md_error_t		*ep
112 )
113 {
114 	mdnamelist_t		*nlp = NULL;
115 	mdname_t		*np;
116 	uint_t			rep_slice;
117 
118 	if (meta_replicaslice(dnp, &rep_slice, ep) != 0)
119 		return (-1);
120 
121 	if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
122 		return (-1);
123 
124 	(void) metanamelist_append(&nlp, np);
125 
126 	if (meta_db_detach(sp, nlp, (MDFORCE_DS | MDFORCE_SET_LOCKED),
127 	    NULL, ep) == -1) {
128 		metafreenamelist(nlp);
129 		return (-1);
130 	}
131 
132 	metafreenamelist(nlp);
133 	return (0);
134 }
135 
136 static int
rep_has_err(md_replicalist_t * rlp,mdname_t * np)137 rep_has_err(md_replicalist_t *rlp, mdname_t *np)
138 {
139 	md_replicalist_t	*rl;
140 
141 	for (rl = rlp; rl != NULL; rl = rl->rl_next) {
142 		md_replica_t	*r = rl->rl_repp;
143 
144 		if (strcmp(r->r_namep->cname, np->cname) != 0)
145 			continue;
146 
147 		if (r->r_flags & (MDDB_F_EREAD | MDDB_F_EFMT | MDDB_F_EDATA |
148 		    MDDB_F_EMASTER | MDDB_F_EWRITE))
149 			return (1);
150 
151 	}
152 	return (0);
153 }
154 
155 static int
add_drv_to_ctl_lst(md_ctlr_ctl_t ** clpp,md_replicalist_t * rlp,mddrivename_t * dnp,int dbcnt,daddr_t dbsize,mdcinfo_t * cinfop,int indiskset,int with_bus,int errored,md_error_t * ep)156 add_drv_to_ctl_lst(
157 	md_ctlr_ctl_t		**clpp,
158 	md_replicalist_t	*rlp,
159 	mddrivename_t		*dnp,
160 	int			dbcnt,
161 	daddr_t			dbsize,
162 	mdcinfo_t		*cinfop,
163 	int			indiskset,
164 	int			with_bus,
165 	int			errored,
166 	md_error_t		*ep
167 )
168 {
169 	md_ctlr_drv_t		**dpp;
170 	mdname_t		*np;
171 	mdcinfo_t		*tcinfop;
172 	char			*cmp_name_1, *cmp_name_2;
173 	int			not_found;
174 
175 	/*
176 	 * The user must pass in a list head.
177 	 */
178 	assert(clpp != NULL);
179 
180 	if (cinfop == NULL) {
181 		uint_t	rep_slice;
182 
183 		if (meta_replicaslice(dnp, &rep_slice, ep) != 0) {
184 			/*
185 			 * A failure to get the slice information can occur
186 			 * because the drive has failed, if this is the
187 			 * case then there is nothing that can be done
188 			 * with this drive, so do not include it in the
189 			 * list of drives. Clear the error and return.
190 			 */
191 			mdclrerror(ep);
192 			return (0);
193 		}
194 
195 		if ((np = metaslicename(dnp, rep_slice, ep)) == NULL)
196 			return (-1);
197 
198 		if ((tcinfop = metagetcinfo(np, ep)) == NULL)
199 			return (-1);
200 
201 		if (metagetvtoc(np, FALSE, NULL, ep) == NULL)
202 			errored = 1;
203 
204 		if (rep_has_err(rlp, np))
205 			errored = 1;
206 	} else
207 		tcinfop = cinfop;
208 
209 	for (/* void */; *clpp != NULL; clpp = &(*clpp)->ctl_next) {
210 		/*
211 		 * Try to locate ctlr.
212 		 */
213 		(void) sdssc_convert_cluster_path(tcinfop->cname, &cmp_name_1);
214 		(void) sdssc_convert_cluster_path((*clpp)->ctl_cinfop->cname,
215 		    &cmp_name_2);
216 
217 		if (tcinfop->ctype != (*clpp)->ctl_cinfop->ctype ||
218 		    tcinfop->cnum != (*clpp)->ctl_cinfop->cnum ||
219 		    strncmp(cmp_name_1, cmp_name_2, 16) != 0 ||
220 		    (with_bus && tcinfop->bus != (*clpp)->ctl_cinfop->bus)) {
221 			not_found = 1;
222 		} else
223 			not_found = 0;
224 
225 
226 		sdssc_convert_path_free(cmp_name_1);
227 		sdssc_convert_path_free(cmp_name_2);
228 
229 		if (not_found)
230 			continue;
231 
232 		/*
233 		 * Found ctlr, try to locate the drive.
234 		 */
235 		for (dpp = &(*clpp)->ctl_drvs; *dpp != NULL;
236 		    dpp = &(*dpp)->drv_next) {
237 			(void) sdssc_convert_cluster_path(
238 			    (*dpp)->drv_dnp->cname, &cmp_name_1);
239 			(void) sdssc_convert_cluster_path(dnp->cname,
240 			    &cmp_name_2);
241 
242 			not_found = strcmp(cmp_name_1, cmp_name_2);
243 
244 			sdssc_convert_path_free(cmp_name_1);
245 			sdssc_convert_path_free(cmp_name_2);
246 
247 			if (not_found)
248 				continue;
249 
250 			/*
251 			 * Found drive, must be deleting.
252 			 */
253 			(*dpp)->drv_op = DRV_DEL;
254 			if (indiskset)
255 				(*dpp)->drv_flags |= DRV_F_INDISKSET;
256 			if (errored) {
257 				mdclrerror(ep);
258 				(*dpp)->drv_flags |= DRV_F_ERROR;
259 			}
260 			(*clpp)->ctl_dbcnt -= (*dpp)->drv_dbcnt;
261 			(*clpp)->ctl_drcnt--;
262 			return (0);
263 		}
264 		/*
265 		 * The ctlr was found, but not the drive, so add
266 		 * the drive
267 		 */
268 		(*dpp) = Zalloc(sizeof (**dpp));
269 
270 
271 		if (indiskset) {
272 			(*dpp)->drv_op = DRV_NOP;
273 			(*dpp)->drv_flags |= DRV_F_INDISKSET;
274 			if (errored) {
275 				mdclrerror(ep);
276 				(*dpp)->drv_flags |= DRV_F_ERROR;
277 			}
278 		} else {
279 			(*dpp)->drv_op = DRV_ADD;
280 			if (errored) {
281 				(*dpp)->drv_flags |= DRV_F_ERROR;
282 				return (-1);
283 			}
284 			assert(dbsize != 0);
285 		}
286 		(*dpp)->drv_dbcnt = dbcnt;
287 		(*dpp)->drv_dbsize = dbsize;
288 		(*dpp)->drv_dnp = dnp;
289 		(*clpp)->ctl_dbcnt += dbcnt;
290 		(*clpp)->ctl_drcnt++;
291 		return (0);
292 	}
293 	/*
294 	 * No ctlr was located, so add the ctlr, then recurse to add the
295 	 * drive to the ctlr.
296 	 */
297 	(*clpp) = Zalloc(sizeof (**clpp));
298 
299 	(*clpp)->ctl_cinfop = tcinfop;
300 
301 	return (add_drv_to_ctl_lst(clpp, rlp, dnp, dbcnt, dbsize, tcinfop,
302 	    indiskset, with_bus, errored, ep));
303 }
304 
305 static int
add_replica_to_ctl(mdsetname_t * sp,md_ctlr_ctl_t * c,int minimum_replicas,md_error_t * ep)306 add_replica_to_ctl(
307 	mdsetname_t		*sp,
308 	md_ctlr_ctl_t		*c,
309 	int			minimum_replicas,
310 	md_error_t		*ep
311 )
312 {
313 	md_ctlr_drv_t		*d;
314 	int			maxdb = 0;
315 
316 	/*
317 	 * If this ctrl has no "usable" drives, assert() or just return if
318 	 * assert()'s are turned off.
319 	 */
320 	if (c->ctl_drcnt == 0) {
321 		assert(0);
322 		return (0);
323 	}
324 
325 	/*
326 	 * Determine the largest DB count on a drive.
327 	 */
328 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
329 		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
330 			maxdb = d->drv_dbcnt;
331 
332 	/*
333 	 * Make sure we start at a reasonable number
334 	 */
335 	if (maxdb == 0)
336 		maxdb = 1;
337 
338 	/*
339 	 * Add a replica to a drive on this ctrl.
340 	 */
341 	/*CONSTCOND*/
342 	while (1) {
343 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
344 			/*
345 			 * If this drive is being deleted, skip it.
346 			 */
347 			if (d->drv_op == DRV_DEL)
348 				continue;
349 
350 			if (d->drv_flags & DRV_F_ERROR)
351 				continue;
352 			/*
353 			 * Make sure that the replicas are distributed across
354 			 * the drives.
355 			 */
356 			if (d->drv_dbcnt >= maxdb)
357 				continue;
358 			/*
359 			 * See if the drive already has replicas,
360 			 * if it does, then delete the exisiting
361 			 * replica(s) and re-add n+1 replicas to the drive.
362 			 */
363 			/* ==== Vulnerability - no DB's start ==== */
364 			if (d->drv_dbcnt > 0) {
365 				if (del_replica(sp, d->drv_dnp, ep) == -1) {
366 					d->drv_flags |= DRV_F_ERROR;
367 					if (! (d->drv_flags & DRV_F_INDISKSET))
368 						return (-1);
369 					mdclrerror(ep);
370 					continue;
371 				}
372 			}
373 			if (add_replica(sp, d->drv_dnp, (d->drv_dbcnt + 1),
374 			    d->drv_dbsize, ep) == -1) {
375 				md_error_t nep = mdnullerror;
376 
377 				if (d->drv_dbcnt) {
378 					/*
379 					 * We have to to bring the replica
380 					 * in the drive to the previous
381 					 * status by adding the original no
382 					 * of replicas to the drive since
383 					 * the addition of (drv_dbcnt+1) no
384 					 * of replicas has failed. If we
385 					 * leave it at this state, we might
386 					 * end up having no replicas at
387 					 * all for the diskset.
388 					 */
389 					if (add_replica(sp, d->drv_dnp,
390 					    d->drv_dbcnt, d->drv_dbsize,
391 					    &nep) == -1) {
392 						c->ctl_dbcnt -= d->drv_dbcnt;
393 						d->drv_dbcnt = 0;
394 						mdclrerror(&nep);
395 					}
396 				}
397 
398 				if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
399 					return (-1);
400 
401 				if (mdismddberror(ep, MDE_REPLICA_TOOSMALL))
402 					continue;
403 
404 				d->drv_flags |= DRV_F_ERROR;
405 				if (! (d->drv_flags & DRV_F_INDISKSET))
406 					return (-1);
407 				mdclrerror(ep);
408 				continue;
409 			}
410 
411 			d->drv_dbcnt++;
412 			c->ctl_dbcnt++;
413 			/* ==== Vulnerability - no DB's end ==== */
414 			return (1);
415 		}
416 		maxdb++;
417 		if (maxdb > minimum_replicas)
418 			return (0);
419 	}
420 	/*NOTREACHED*/
421 }
422 
423 static int
del_replica_from_ctl(mdsetname_t * sp,md_ctlr_ctl_t * c,md_error_t * ep)424 del_replica_from_ctl(
425 	mdsetname_t		*sp,
426 	md_ctlr_ctl_t		*c,
427 	md_error_t		*ep
428 )
429 {
430 	md_ctlr_drv_t		*d;
431 	int			maxdb = 0;
432 
433 	/*
434 	 * If this ctrl has no "usable" drives, assert() or just return if
435 	 * assert()'s are turned off.
436 	 */
437 	if (c->ctl_drcnt == 0) {
438 		assert(0);
439 		return (0);
440 	}
441 
442 	/*
443 	 * Determine the largest DB count on a drive.
444 	 */
445 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
446 		if (d->drv_dbcnt > maxdb && d->drv_op != DRV_DEL)
447 			maxdb = d->drv_dbcnt;
448 
449 	if (maxdb == 0)
450 		return (0);
451 
452 	/*
453 	 * Delete a replica from a drive on this ctrl.
454 	 */
455 	/*CONSTCOND*/
456 	while (1) {
457 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
458 			/*
459 			 * If this drive is being deleted, skip it.
460 			 */
461 			if (d->drv_op == DRV_DEL)
462 				continue;
463 
464 			/*
465 			 * Make sure that there are replicas on this drive to
466 			 * delete.
467 			 */
468 			if (d->drv_dbcnt == 0)
469 				continue;
470 
471 			if (d->drv_flags & DRV_F_ERROR)
472 				continue;
473 
474 			/*
475 			 * We need to keep the DB's distributed across the
476 			 * drives.
477 			 */
478 			if (d->drv_dbcnt < maxdb)
479 				continue;
480 
481 			/*
482 			 * Delete all the replicas on the drive.
483 			 */
484 			/* ==== Vulnerability - no DB's start ==== */
485 			if (del_replica(sp, d->drv_dnp, ep) == -1) {
486 				d->drv_flags |= DRV_F_ERROR;
487 				if (! (d->drv_flags & DRV_F_INDISKSET))
488 					return (-1);
489 				mdclrerror(ep);
490 				continue;
491 			}
492 			d->drv_dbcnt--;
493 			c->ctl_dbcnt--;
494 			/*
495 			 * If there is still a dbcnt for this drive, then add
496 			 * back the needed DB's.
497 			 */
498 			if (d->drv_dbcnt > 0) {
499 				if (add_replica(sp, d->drv_dnp, d->drv_dbcnt,
500 				    d->drv_dbsize, ep) == -1) {
501 					c->ctl_dbcnt -= d->drv_dbcnt;
502 					d->drv_dbcnt = 0;
503 
504 					if (mdismddberror(ep,
505 					    MDE_TOOMANY_REPLICAS))
506 						return (-1);
507 
508 					d->drv_flags |= DRV_F_ERROR;
509 					if (! (d->drv_flags & DRV_F_INDISKSET))
510 						return (-1);
511 					mdclrerror(ep);
512 					continue;
513 				}
514 			}
515 			/* ==== Vulnerability - no DB's end ==== */
516 			return (1);
517 		}
518 		maxdb--;
519 		if (maxdb <= 0)
520 			return (0);
521 	}
522 	/*NOTREACHED*/
523 }
524 
525 static int
del_replicas(mdsetname_t * sp,md_ctlr_ctl_t * clp,md_error_t * ep)526 del_replicas(mdsetname_t *sp, md_ctlr_ctl_t *clp, md_error_t *ep)
527 {
528 	md_ctlr_ctl_t		*c;
529 	md_ctlr_drv_t		*d;
530 	mdnamelist_t		*nlp;
531 	mdname_t		*np;
532 
533 	for (c = clp; c != NULL; c = c->ctl_next) {
534 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
535 			uint_t	rep_slice;
536 
537 			if (! (d->drv_flags & DRV_F_ERROR) &&
538 			    (d->drv_op != DRV_DEL))
539 				continue;
540 
541 			if (d->drv_dbcnt == 0)
542 				continue;
543 
544 			if (meta_replicaslice(d->drv_dnp,
545 			    &rep_slice, ep) != 0)
546 				return (-1);
547 
548 			np = metaslicename(d->drv_dnp, rep_slice, ep);
549 			if (np == NULL)
550 				return (-1);
551 
552 			nlp = NULL;
553 			(void) metanamelist_append(&nlp, np);
554 
555 			/*
556 			 * Delete the replicas listed.
557 			 */
558 			if (meta_db_detach(sp, nlp,
559 			    (MDFORCE_DS | MDFORCE_SET_LOCKED), NULL,
560 			    ep) == -1) {
561 				metafreenamelist(nlp);
562 				if (d->drv_flags & DRV_F_INDISKSET) {
563 					mdclrerror(ep);
564 					continue;
565 				}
566 				return (-1);
567 			}
568 			metafreenamelist(nlp);
569 		}
570 	}
571 
572 	return (0);
573 }
574 
575 static void
free_ctlr_lst(md_ctlr_ctl_t ** clpp)576 free_ctlr_lst(md_ctlr_ctl_t **clpp)
577 {
578 	md_ctlr_ctl_t		*c, *tc = NULL;
579 	md_ctlr_drv_t		*d, *td = NULL;
580 
581 	for (c = *clpp; c != NULL; c = tc) {
582 		tc = c->ctl_next;
583 		for (d = c->ctl_drvs; d != NULL; d = td) {
584 			td = d->drv_next;
585 			Free(d);
586 		}
587 		Free(c);
588 	}
589 	*clpp = NULL;
590 }
591 
592 static int
build_ctlr_lst(mdsetname_t * sp,md_ctlr_ctl_t ** clpp,md_drive_desc * opdd,md_drive_desc * curdd,int with_bus,daddr_t dbsize,md_error_t * ep)593 build_ctlr_lst(
594 	mdsetname_t		*sp,
595 	md_ctlr_ctl_t		**clpp,
596 	md_drive_desc		*opdd,
597 	md_drive_desc		*curdd,
598 	int			with_bus,
599 	daddr_t			dbsize,
600 	md_error_t		*ep
601 )
602 {
603 	md_drive_desc			*d;
604 	md_set_desc			*sd;
605 	daddr_t				nblks;
606 	md_replicalist_t		*rlp = NULL;
607 	static	daddr_t			min_dbsize = 0;
608 
609 	if (min_dbsize == 0) {
610 		if ((nblks = meta_db_minreplica(sp, ep)) < 0) {
611 			min_dbsize = MD_DBSIZE;
612 
613 			if (! metaislocalset(sp)) {
614 				if ((sd = metaget_setdesc(sp, ep)) == NULL)
615 					return (-1);
616 
617 				if (MD_MNSET_DESC(sd))
618 					min_dbsize = MD_MN_DBSIZE;
619 			}
620 			mdclrerror(ep);
621 		} else
622 			min_dbsize = nblks;
623 	}
624 
625 	if (metareplicalist(sp, MD_BASICNAME_OK, &rlp, ep) < 0) {
626 		if (! mdismddberror(ep, MDE_DB_NODB) &&
627 		    ! mdismddberror(ep, MDE_DB_NOTOWNER))
628 			return (-1);
629 		mdclrerror(ep);
630 	}
631 
632 	/*
633 	 * Add drives currently in the set to the ctlr list.
634 	 */
635 	for (d = curdd; d != NULL; d = d->dd_next) {
636 		daddr_t	this_dbsize = d->dd_dbsize;
637 
638 		if (this_dbsize == 0)
639 			this_dbsize = min_dbsize;
640 
641 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, d->dd_dbcnt,
642 		    this_dbsize, NULL, TRUE, with_bus, 0, ep) == -1)
643 			return (-1);
644 	}
645 
646 	/*
647 	 * Add the drives that are being operated on to the ctlr list.
648 	 */
649 	for (d = opdd; d != NULL; d = d->dd_next)
650 		if (add_drv_to_ctl_lst(clpp, rlp, d->dd_dnp, 0, dbsize, NULL,
651 		    FALSE, with_bus, 0, ep) == -1)
652 			return (-1);
653 
654 	metafreereplicalist(rlp);
655 	return (0);
656 }
657 
658 static int
count_replica_on_ctl(md_ctlr_ctl_t * c,int adding,int * db_cnt,int minimum_replicas)659 count_replica_on_ctl(
660 	md_ctlr_ctl_t		*c,
661 	int			adding,
662 	int			*db_cnt,
663 	int			minimum_replicas
664 )
665 {
666 	md_ctlr_drv_t		*d;
667 	int			maxdb = 0;
668 
669 	/*
670 	 * If this ctrl has no "usable" drives, nothing to do.
671 	 */
672 	if (c->ctl_drcnt == 0)
673 		return (0);
674 
675 	/*
676 	 * Determine the largest DB count on a drive.
677 	 */
678 	for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
679 		if (d->drv_new_dbcnt > maxdb && d->drv_op != DRV_DEL)
680 			maxdb = d->drv_new_dbcnt;
681 
682 	/*
683 	 * Make sure we start at a reasonable number
684 	 */
685 	if (maxdb == 0) {
686 		if (!adding)
687 			return (0);
688 		maxdb = 1;
689 	}
690 
691 	/*
692 	 * Count or Un-Count replicas that would be
693 	 * added or deleted respectively.
694 	 */
695 	/*CONSTCOND*/
696 	while (1) {
697 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
698 			/*
699 			 * If this drive is being deleted, skip it.
700 			 */
701 			if (d->drv_op == DRV_DEL)
702 				continue;
703 
704 			/*
705 			 * If the drive is errored and adding, skip it.
706 			 */
707 			if (adding && (d->drv_flags & DRV_F_ERROR))
708 				continue;
709 
710 			/*
711 			 * Make sure that the replicas are distributed across
712 			 * the drives.
713 			 */
714 			if (adding) {
715 				if (d->drv_new_dbcnt >= maxdb)
716 					continue;
717 			} else {
718 				if (d->drv_new_dbcnt == 0)
719 					continue;
720 				if (d->drv_new_dbcnt < maxdb)
721 					continue;
722 			}
723 
724 			/*
725 			 * Count or Un-Count replicas here.
726 			 */
727 			if (adding) {
728 				mdpart_t	*partp;
729 				uint_t		rep_slice;
730 				md_error_t	mde = mdnullerror;
731 
732 				if (meta_replicaslice(d->drv_dnp,
733 				    &rep_slice, &mde) != 0) {
734 					mdclrerror(&mde);
735 					continue;
736 				}
737 
738 				partp = &d->drv_dnp->vtoc.parts[rep_slice];
739 				if (! partp)
740 					continue;
741 
742 				if (((d->drv_new_dbcnt + 1) * d->drv_dbsize) >
743 				    (partp->size - 16))
744 					continue;
745 				(*db_cnt)++;
746 				d->drv_new_dbcnt++;
747 			} else {
748 				(*db_cnt)--;
749 				d->drv_new_dbcnt--;
750 			}
751 			return (0);
752 		}
753 
754 		/*
755 		 * This should make sure they get spread
756 		 * around.  This is to emulate the {add,del}_replica
757 		 * routines.
758 		 */
759 		if (adding) {
760 			maxdb++;
761 			if (maxdb > minimum_replicas)
762 				return (-1);
763 		} else {
764 			maxdb--;
765 			if (maxdb <= 0)
766 				return (-1);
767 		}
768 	}
769 	/*NOTREACHED*/
770 }
771 
772 static int
count_replicas(md_ctlr_ctl_t * clp,int min_reps)773 count_replicas(
774 	md_ctlr_ctl_t		*clp,
775 	int			min_reps
776 )
777 {
778 	md_ctlr_ctl_t		*c;
779 	md_ctlr_drv_t		*d;
780 	int			db_cnt;
781 	int			uctlrs = 0;
782 	int			total_cnt = 0;
783 
784 	/*
785 	 * Count the number of controllers,
786 	 * counting the replicas is slightly different based
787 	 * on the controller count.
788 	 */
789 	for (c = clp; c != NULL; c = c->ctl_next)
790 		if (c->ctl_drcnt > 0) {
791 			uctlrs++;
792 			for (d = c->ctl_drvs; d != NULL; d = d->drv_next)
793 				d->drv_new_dbcnt = d->drv_dbcnt;
794 		}
795 
796 	if (uctlrs > 2) {
797 		for (c = clp; c != NULL; c = c->ctl_next) {
798 			if (c->ctl_drcnt == 0)
799 				continue;
800 
801 			db_cnt = c->ctl_dbcnt;
802 			/*
803 			 * Count the replicas that would be added.
804 			 */
805 			while (db_cnt < min_reps)
806 				if (count_replica_on_ctl(c, TRUE,
807 				    &db_cnt, min_reps))
808 					return (-1);
809 
810 			/*
811 			 * Un-Count the replicas that would be deleted.
812 			 */
813 			while (db_cnt > min_reps)
814 				if (count_replica_on_ctl(c, FALSE,
815 				    &db_cnt, min_reps))
816 					return (-1);
817 			total_cnt += db_cnt;
818 		}
819 	} else {
820 		for (c = clp; c != NULL; c = c->ctl_next) {
821 			if (c->ctl_drcnt == 0)
822 				continue;
823 
824 			db_cnt = c->ctl_dbcnt;
825 			/*
826 			 * Count the replicas that woud be added.
827 			 */
828 			while (db_cnt < (min_reps * c->ctl_drcnt))
829 				if (count_replica_on_ctl(c, TRUE,
830 				    &db_cnt, min_reps))
831 					return (-1);
832 
833 			total_cnt += db_cnt;
834 		}
835 	}
836 
837 	return (total_cnt);
838 }
839 
840 static int
balance_replicas(mdsetname_t * sp,md_ctlr_ctl_t ** clpp,md_drive_desc * opdd,md_drive_desc * curdd,daddr_t dbsize,int * minimum_replicas,md_error_t * ep)841 balance_replicas(
842 	mdsetname_t		*sp,
843 	md_ctlr_ctl_t		**clpp,
844 	md_drive_desc		*opdd,
845 	md_drive_desc		*curdd,
846 	daddr_t			dbsize,
847 	int			*minimum_replicas,
848 	md_error_t		*ep
849 )
850 {
851 	int			n;
852 	int			rctlrs = 0;
853 	int			uctlrs;
854 	int			ructlrs;
855 	int			octlrs;
856 	int			save_done;
857 	int			prevcnt = 0, issame = 1;
858 	uint_t			drvcnt = ~0U;
859 	uint_t			save_cnum;
860 	mhd_ctlrtype_t		save_ctype;
861 	char			save_cname[16];
862 	char			*cmp_name_1, *cmp_name_2;
863 	int			reps;
864 	md_ctlr_ctl_t		*c;
865 
866 	/*
867 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
868 	 */
869 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
870 		return (-1);
871 
872 	/*
873 	 * Determine what controllers are usable in the sense of being able to
874 	 * add a replica to a drive on the controller.
875 	 * Also find the minimum number of drives on a controller.
876 	 */
877 	for (c = *clpp; c != NULL; c = c->ctl_next) {
878 		if (c->ctl_drcnt > 0) {
879 			rctlrs++;
880 			drvcnt = min(drvcnt, c->ctl_drcnt);
881 			if (prevcnt == 0)
882 				prevcnt = c->ctl_drcnt;
883 			else if (prevcnt != c->ctl_drcnt)
884 				issame = 0;
885 		}
886 	}
887 
888 	if ((rctlrs <= 2) || (issame && (drvcnt >= 30)))
889 		goto cont;
890 
891 	/*
892 	 * If here: Handling 3 or more controllers most
893 	 *	    likely with non-symmetrical number of
894 	 *	    disks. The number of replicas will be
895 	 *	    the minimum number of disks on a controller.
896 	 *
897 	 *	    The main point is to insure that a
898 	 *	    controller does not have more than half
899 	 *	    of the replicas.
900 	 */
901 	drvcnt = min(drvcnt, 12);
902 	drvcnt = max(drvcnt, MD_MINBALREP);
903 
904 	/*
905 	 * Can we find fewer than the maximum replicas by reducing the
906 	 * number of replicas per drive.
907 	 */
908 	for (n = drvcnt; n > 0; n--) {
909 		reps = count_replicas(*clpp, n);
910 		if (reps > 0 && reps <= MDDB_NLB) {
911 			*minimum_replicas = n;
912 			return (0);
913 		}
914 	}
915 
916 cont:
917 	free_ctlr_lst(clpp);
918 
919 	/*
920 	 * Build a ctlr list with SSA-100 busses as separate controllers.
921 	 *
922 	 * If Here: Try to put 2 replicas per controller/bus
923 	 *	    If that doesn't work put 1 replica per controller/bus
924 	 */
925 	if (build_ctlr_lst(sp, clpp, opdd, curdd, TRUE, dbsize, ep) == -1)
926 		return (-1);
927 
928 	/*
929 	 * If the number of "real" controllers is 2, special handling may be
930 	 * needed.
931 	 */
932 	if (rctlrs != 2) {
933 		drvcnt = MD_MINBALREP;
934 		goto other;
935 	}
936 
937 	/*
938 	 * Determine what controllers are usable in the sense of being able to
939 	 * add a replica to a drive on the controller.
940 	 * Also find the minimum number of drives on a controller.
941 	 */
942 	drvcnt = ~0U;
943 	uctlrs = 0;
944 	for (c = *clpp; c != NULL; c = c->ctl_next) {
945 		if (c->ctl_drcnt > 0) {
946 			uctlrs++;
947 			drvcnt = min(drvcnt, c->ctl_drcnt);
948 		}
949 	}
950 
951 	/*
952 	 * If the number of controllers is not changed, continue with original
953 	 * strategy.
954 	 */
955 	if (uctlrs == rctlrs) {
956 		drvcnt = MD_MINBALREP;
957 		goto other;
958 	}
959 
960 	/*
961 	 * Check the distribution of bus ctlrs across real controllers.
962 	 */
963 	ructlrs = 0;
964 	octlrs = 0;
965 	save_done = 0;
966 	for (c = *clpp; c != NULL; c = c->ctl_next) {
967 		if (c->ctl_drcnt == 0)
968 			continue;
969 
970 		if (! save_done) {
971 			save_cnum = c->ctl_cinfop->cnum;
972 			save_ctype = c->ctl_cinfop->ctype;
973 			(void) strncpy(save_cname, c->ctl_cinfop->cname, 16);
974 			save_done = 1;
975 		}
976 
977 		(void) sdssc_convert_cluster_path(c->ctl_cinfop->cname,
978 		    &cmp_name_1);
979 		(void) sdssc_convert_cluster_path(save_cname, &cmp_name_2);
980 
981 		if (save_ctype != c->ctl_cinfop->ctype ||
982 		    save_cnum != c->ctl_cinfop->cnum ||
983 		    strncmp(cmp_name_1, cmp_name_2, 16) != 0)
984 			octlrs++;
985 		else
986 			ructlrs++;
987 
988 		sdssc_convert_path_free(cmp_name_1);
989 		sdssc_convert_path_free(cmp_name_2);
990 	}
991 
992 	/*
993 	 * Take the largest of the counts
994 	 */
995 	ructlrs = max(ructlrs, octlrs);
996 
997 	/*
998 	 * If the distribution of bus controlers is half of the total, then
999 	 * this layout strategy will work, doit.
1000 	 */
1001 	if ((uctlrs / 2) == ructlrs) {
1002 		drvcnt = MD_MINBALREP;
1003 		goto other;
1004 	}
1005 
1006 	/*
1007 	 * If here, there is a distribution of bus controllers that will cause
1008 	 * the real controller distribution to be unbalanced, so a different
1009 	 * strategy is used.
1010 	 */
1011 	free_ctlr_lst(clpp);
1012 
1013 	/*
1014 	 * Build the ctlr list with SSA-100 busses NOT as separate controllers.
1015 	 */
1016 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
1017 		return (-1);
1018 
1019 	/*
1020 	 * Make ctl_drcnt limit the number of replicas
1021 	 */
1022 	for (c = *clpp; c != NULL; c = c->ctl_next)
1023 		c->ctl_drcnt = min(drvcnt, c->ctl_drcnt);
1024 
1025 	/*
1026 	 * Try at least MD_MINBALREP's per controller after changing ctl_drcnt
1027 	 */
1028 	drvcnt = MD_MINBALREP;
1029 
1030 other:
1031 	/*
1032 	 * Can we find fewer than the maximum replicas by reducing the number
1033 	 * of replicas per drive.
1034 	 */
1035 	for (n = drvcnt; n > 0; n--) {
1036 		reps = count_replicas(*clpp, n);
1037 		if (reps > 0 && reps <= MDDB_NLB) {
1038 			*minimum_replicas = n;
1039 			return (0);
1040 		}
1041 	}
1042 
1043 	free_ctlr_lst(clpp);
1044 
1045 	/*
1046 	 * Build a ctlr list with SSA-100 busses NOT as separate controllers.
1047 	 *
1048 	 * If Here: Try to put 2 replicas per controller (not on busses)
1049 	 *	    If that doesn't work put 1 replica per controller
1050 	 */
1051 	if (build_ctlr_lst(sp, clpp, opdd, curdd, FALSE, dbsize, ep) == -1)
1052 		return (-1);
1053 
1054 	/*
1055 	 * Can we find fewer than the maximum replicas by reducing the
1056 	 * number of replicas per drive.
1057 	 */
1058 	for (n = MD_MINBALREP; n > 0; n--) {
1059 		reps = count_replicas(*clpp, n);
1060 		if (reps > 0 && reps <= MDDB_NLB) {
1061 			*minimum_replicas = n;
1062 			return (0);
1063 		}
1064 	}
1065 
1066 	/*
1067 	 * Return a ctrl list that does not include the SSA-100 buses as
1068 	 * separate controllers.  This will create fewer separate controllers.
1069 	 */
1070 	*minimum_replicas = 1;
1071 	return (0);
1072 }
1073 
1074 static int
morethan2_ctl_balance(mdsetname_t * sp,md_ctlr_ctl_t * clp,int min_reps,md_error_t * ep)1075 morethan2_ctl_balance(
1076 	mdsetname_t		*sp,
1077 	md_ctlr_ctl_t		*clp,
1078 	int			min_reps,
1079 	md_error_t		*ep
1080 )
1081 {
1082 	md_ctlr_ctl_t		*c;
1083 	int			err;
1084 	int			multiple_reps = 0;
1085 	md_ctlr_drv_t		*d;
1086 
1087 	for (c = clp; c != NULL; c = c->ctl_next) {
1088 		if (c->ctl_drcnt == 0)
1089 			continue;
1090 
1091 		/*
1092 		 * check for multiple databases on a disk and compensate
1093 		 */
1094 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1095 			if (d->drv_dbcnt)
1096 				multiple_reps += d->drv_dbcnt - 1;
1097 		}
1098 
1099 		/*
1100 		 * remove the number of multiple databases count from the
1101 		 * total db count. This enables us to rebalance if one of
1102 		 * the disks has a large enough slice for 2 metadb's. If we
1103 		 * then add a disk with a smaller slice into the set, we want
1104 		 * that disk to get a replica on it. If we just compare to
1105 		 * ctl_dbcnt, it won't.
1106 		 */
1107 		while ((c->ctl_dbcnt - multiple_reps) <
1108 		    min_reps) {
1109 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
1110 				return (-1);
1111 			if (err == 0)
1112 				break;
1113 		}
1114 
1115 		while (c->ctl_dbcnt > min_reps) {
1116 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
1117 				return (-1);
1118 			if (err == 0)
1119 				break;
1120 		}
1121 	}
1122 
1123 	return (0);
1124 }
1125 
1126 static int
lessthan3_ctl_balance(mdsetname_t * sp,md_ctlr_ctl_t * clp,int min_reps,md_error_t * ep)1127 lessthan3_ctl_balance(
1128 	mdsetname_t		*sp,
1129 	md_ctlr_ctl_t		*clp,
1130 	int			min_reps,
1131 	md_error_t		*ep
1132 )
1133 {
1134 	md_ctlr_ctl_t		*c;
1135 	int			err;
1136 	int			multiple_reps = 0;
1137 	md_ctlr_drv_t		*d;
1138 
1139 	for (c = clp; c != NULL; c = c->ctl_next) {
1140 		if (c->ctl_drcnt == 0)
1141 			continue;
1142 
1143 		/*
1144 		 * check for multiple databases on a disk and compensate
1145 		 */
1146 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1147 			if (d->drv_dbcnt)
1148 				multiple_reps += d->drv_dbcnt - 1;
1149 		}
1150 
1151 		/*
1152 		 * remove the number of multiple databases count from the
1153 		 * total db count. This enables us to rebalance if one of
1154 		 * the disks has a large enough slice for 2 metadb's. If we
1155 		 * then add a disk with a smaller slice into the set, we want
1156 		 * that disk to get a replica on it. If we just compare to
1157 		 * ctl_dbcnt, it won't.
1158 		 */
1159 		while ((c->ctl_dbcnt - multiple_reps) <
1160 		    (min_reps * c->ctl_drcnt)) {
1161 			if ((err = add_replica_to_ctl(sp, c, min_reps, ep)) < 0)
1162 				return (-1);
1163 			if (err == 0)
1164 				break;
1165 		}
1166 
1167 		while (c->ctl_dbcnt > (min_reps * c->ctl_drcnt)) {
1168 			if ((err = del_replica_from_ctl(sp, c, ep)) < 0)
1169 				return (-1);
1170 			if (err == 0)
1171 				break;
1172 		}
1173 	}
1174 
1175 	return (0);
1176 }
1177 
1178 static int
try_again(md_ctlr_ctl_t * clp,md_error_t * ep)1179 try_again(
1180 	md_ctlr_ctl_t	*clp,
1181 	md_error_t	*ep
1182 )
1183 {
1184 	md_ctlr_ctl_t	*c;
1185 	md_ctlr_drv_t	*d;
1186 
1187 	if (mdismddberror(ep, MDE_TOOMANY_REPLICAS))
1188 		return (TRUE);
1189 
1190 	/*
1191 	 * retry if all the errored drives are already in the diskset.
1192 	 */
1193 	for (c = clp; c != NULL; c = c->ctl_next) {
1194 		for (d = c->ctl_drvs; d != NULL; d = d->drv_next) {
1195 			if ((d->drv_flags & (DRV_F_INDISKSET|DRV_F_ERROR))
1196 			    == DRV_F_ERROR)
1197 				return (FALSE);
1198 		}
1199 	}
1200 	return (TRUE);
1201 }
1202 
1203 int
meta_db_balance(mdsetname_t * sp,md_drive_desc * opdd,md_drive_desc * curdd,daddr_t dbsize,md_error_t * ep)1204 meta_db_balance(
1205 	mdsetname_t		*sp,
1206 	md_drive_desc		*opdd,
1207 	md_drive_desc		*curdd,
1208 	daddr_t			dbsize,
1209 	md_error_t		*ep
1210 )
1211 {
1212 	int			min_reps;
1213 	md_ctlr_ctl_t		*c, *cl = NULL;
1214 	int			uctlrs = 0;
1215 	int			retry = 0;
1216 	int			rval = 0;
1217 
1218 	if (balance_replicas(sp, &cl, opdd, curdd, dbsize, &min_reps, ep) == -1)
1219 		return (-1);
1220 
1221 	/*
1222 	 * Determine what controllers are usable in the sense of being able to
1223 	 * add a replica to a drive on the controller.
1224 	 */
1225 	for (c = cl; c != NULL; c = c->ctl_next)
1226 		if (c->ctl_drcnt > 0)
1227 			uctlrs++;
1228 
1229 	/*
1230 	 * Add replicas to achieve a balance.
1231 	 */
1232 	if (uctlrs > 2)
1233 		rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
1234 	else
1235 		rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
1236 
1237 	if (rval) {
1238 		if ((retry = try_again(cl, ep)) == TRUE) {
1239 			mdclrerror(ep);
1240 			rval = 0;
1241 		}
1242 	}
1243 
1244 	/*
1245 	 * Delete all the replicas from drives that are so marked.
1246 	 */
1247 	if (! rval)
1248 		rval = del_replicas(sp, cl, ep);
1249 
1250 	if (retry) {
1251 		if (uctlrs > 2)
1252 			rval = morethan2_ctl_balance(sp, cl, min_reps, ep);
1253 		else
1254 			rval = lessthan3_ctl_balance(sp, cl, min_reps, ep);
1255 
1256 		if (rval && mdismddberror(ep, MDE_TOOMANY_REPLICAS)) {
1257 			mdclrerror(ep);
1258 			rval = 0;
1259 		}
1260 	}
1261 
1262 	/*
1263 	 * Free up the ctlr list.
1264 	 */
1265 	free_ctlr_lst(&cl);
1266 
1267 	return (rval);
1268 }
1269