xref: /titanic_41/usr/src/uts/common/io/lvm/md/md_rename.c (revision 890e8ff10cfc85bc7d33064a9a30c3e8477b4813)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 
29 /*
30  * rename or exchange identities of virtual device nodes
31  */
32 
33 #include <sys/param.h>
34 #include <sys/systm.h>
35 #include <sys/debug.h>
36 #include <sys/sysmacros.h>
37 #include <sys/types.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 
41 #include <sys/lvm/mdvar.h>
42 #include <sys/lvm/md_rename.h>
43 
44 #include <sys/sysevent/eventdefs.h>
45 #include <sys/sysevent/svm.h>
46 
47 extern	major_t		md_major;
48 extern	unit_t		md_nunits;
49 extern	set_t		md_nsets;
50 extern	md_set_t	md_set[];
51 
52 #define	ROLE(r)						\
53 	((r) == MDRR_PARENT?	"parent":		\
54 	(r) == MDRR_SELF?	"self":			\
55 	(r) == MDRR_CHILD?	"child":		\
56 	(r) == MDRR_UNK?	"<unknown>": "<garbage>")
57 
58 #define	OP_STR(op)							\
59 		(((op) == MDRNOP_UNK)?		"<unknown>"	:	\
60 		    ((op) == MDRNOP_RENAME)?	"rename"	:	\
61 		    ((op) == MDRNOP_EXCHANGE)?	"exchange"	:	\
62 						"<garbage>")
63 int md_rename_debug = 0;
64 
65 /* delta guard rails */
66 const unsigned long long	DELTA_BEG	= (0xDad08888a110beefull);
67 const unsigned long long	DELTA_END	= (0xa110Beef88880Dadull);
68 
69 const unsigned long long	DELTA_BEG_FREED	= (0xBad0c0ed0fed0dadull);
70 const unsigned long long	DELTA_END_FREED	= (0x0Fed0dadbad0c0edull);
71 
72 /* transaction guard rails */
73 const unsigned long long	TXN_BEG		= (0xDad01eadc0ed2badull);
74 const unsigned long long	TXN_END		= (0xc0ed2badDad01eadull);
75 
76 const unsigned long long	TXNUN_BEG	= (0xcafe0fedbad0beefull);
77 const unsigned long long	TXNUN_END	= (0xbad0beefcafe0fedull);
78 
79 const unsigned int		guard_shift	= (sizeof (u_longlong_t) - 3);
80 const md_stackcap_t		MD_CAN_DO_ANYTHING	= (md_stackcap_t)0;
81 
82 typedef struct role_change_mapping_tab_t {
83 	const int			ord;
84 	const md_renrole_t		old_role;
85 	const md_renrole_t		new_role;
86 	const char			*svc_name;
87 	md_ren_roleswap_svc_t * const	default_svc;
88 } role_change_tab_t;
89 
90 /*
91  *  The actual table is at the end of the file, so we don't need
92  *  many forward references
93  */
94 static	role_change_tab_t	role_swap_tab[];
95 
96 #define	ILLEGAL_ROLESWAP_SVC	((md_ren_roleswap_svc_t *)(0xA1100BAD))
97 #define	NO_DEFAULT_ROLESWAP_SVC	((md_ren_roleswap_svc_t *)(NULL))
98 #define	ILLEGAL_SVC_NAME	(NULL)
99 
100 /*
101  *
102  * Role swap rule table:
103  *
104  *                                New Role
105  *      +---------------------------------------------------------------|
106  *      |        |    Parent       |       Self     |      Child        |
107  *      +--------+-----------------+----------------+-------------------+
108  *      | Parent | no default      | ...no default  | illegal	        |
109  *      |        | 1 (update kids) | 2  (update to) | 3	                |
110  * Old  +--------+-----------------+----------------+-------------------+
111  * Role | Self   | ...self update  | ...rename self | no default (down  |
112  *      |        | 4   update up | 5	            | 6    update from) |
113  *      +--------+-----------------+----------------+-------------------+
114  *      | Child  | illegal         | ...child       | ...update         |
115  *      |        | 7	           | 8   update to  | 9	parent          |
116  *      +---------------------------------------------------------------+
117  *
118  * and notes:
119  *
120  * - Boxes 1, 4 and 6 are the most interesting. They are responsible
121  *   for updating the from unit's data structures. These may involve
122  *   finding (former or future) children, resetting name keys and the like.
123  *
124  * - The "rename" operation is boxes 1, 5 and 9. Most of the work
125  *   is done in box 5, since that contains both the "from" and "to"
126  *   unit struct for rename.
127  *
128  *  (There's got to be an eigen function for this; that diagonal
129  *   axis is a role identity operation searching for an expression.)
130  *
131  * - Almost every transaction will call more than one of these.
132  *   (Only a rename of a unit with no relatives will only call
133  *   a single box.)
134  *
135  * - Box 4 "...update from" is the generic self->parent modifier.
136  * - Box 8 "...update to" is the generic child->self modifier.
137  *   These can be generic because all of the information which
138  *   needs to be updated is in the common portion of the unit
139  *   structure when changing from their respective roles.
140  *
141  * - Boxes 1, 2 and 6 ("no default") indicate that per-metadevice
142  *   information must be updated. For example, in box 1, children
143  *   identities must be updated. Since different metadevice types
144  *   detect and manipulate their children differently, there can
145  *   be no generic "md_rename" function in this box.
146  *
147  * In addition to the named services in the table above, there
148  * are other named services used by rename/exchange.
149  * MDRNM_LIST_URFOLKS, MDRNM_LIST_URSELF, MDRNM_LIST_URKIDS
150  * list a device's parents, self and children, respectively.
151  * In most cases the default functions can be used for parents
152  * and self. Top-level devices are not required to have a
153  * "list folks" named service. Likewise, devices which cannot
154  * have metadevice children are not required to have the
155  * "list kids" named service. The LIST_UR* functions call back into
156  * the base driver (md_build_rendelta()) to package the changes to
157  * a device for addition onto the tree. The LIST_UR* named service
158  * then adds this "rename delta" onto the delta tree itself.
159  * This keeps private knowledge appropriately encapsulated.
160  * They return the number of devices which will need to be changed,
161  * and hence the number of elements they've added to the delta list
162  * or -1 for error.
163  *
164  * Other named services used by rename/exchange are:
165  * "lock" (MDRNM_LOCK), "unlock" (MDRNM_UNLOCK) and "check" (MDRNM_CHECK).
166  * These (un) write-lock all of the relevant in-core structs,
167  * including the unit structs for the device and quiesce i/o as necessary.
168  * The "check" named service verifies that this device
169  * is in a state where rename could and may occur at this time.
170  * Since the role_swap functions themselves cannot be undone
171  * (at least in this implementation), it is check()'s job to
172  * verify that the device is renamable (sic) or, if not, abort.
173  * The check function for the device participating in the role
174  * of "self" is usually where rename or exchange validity is verified.
175  *
176  * All of these functions take two arguments which may be thought
177  * of as the collective state changes of the tree of devices
178  * (md_rendelta_t *family) and the rename transaction state
179  * (md_rentxn_t rtxn or rtxnp).
180  *
181  */
182 
183 
184 /*
185  * rename unit lock
186  * (default name service routine MDRNM_LOCK)
187  */
188 static intptr_t
md_rename_lock(md_rendelta_t * delta,md_rentxn_t * rtxnp)189 md_rename_lock(md_rendelta_t *delta, md_rentxn_t *rtxnp)
190 {
191 	minor_t		 mnum;
192 	md_renop_t	 op;
193 
194 	ASSERT(delta);
195 	ASSERT(rtxnp);
196 
197 	if (!delta || !rtxnp) {
198 		(void) mdsyserror(&rtxnp->mde, EINVAL);
199 		return (EINVAL);
200 	}
201 	mnum = md_getminor(delta->dev);
202 	op = rtxnp->op;
203 
204 	/*
205 	 * target doesn't exist if renaming (by definition),
206 	 * so it need not be locked
207 	 */
208 	if (op == MDRNOP_RENAME && mnum == rtxnp->to.mnum) {
209 		return (0);
210 	}
211 
212 	ASSERT(delta->uip);
213 	if (!delta->uip) {
214 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
215 		return (ENODEV);
216 	}
217 
218 	ASSERT(delta->unp);
219 	if (!delta->unp) {
220 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
221 		return (ENODEV);
222 	}
223 
224 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
225 
226 	(void) md_unit_writerlock(delta->uip);
227 
228 	ASSERT(UNIT_WRITER_HELD(delta->unp));
229 
230 	return (0);
231 }
232 
233 /*
234  * (default name service routine MDRNM_UNLOCK)
235  */
236 /* ARGSUSED */
237 static void
md_rename_unlock(md_rendelta_t * delta,md_rentxn_t * rtxnp)238 md_rename_unlock(
239 	md_rendelta_t	*delta,
240 	md_rentxn_t	*rtxnp)
241 {
242 	ASSERT(delta);
243 	ASSERT(delta->uip);
244 	ASSERT(delta->unp);
245 
246 	ASSERT(UNIT_WRITER_HELD(delta->unp));
247 
248 	(void) md_unit_writerexit(delta->uip);
249 
250 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
251 }
252 
253 /*
254  * This is used by the various MDRNM_LIST* named services.
255  */
256 md_rendelta_t *
md_build_rendelta(md_renrole_t old_role,md_renrole_t new_role,md_dev64_t dev,md_rendelta_t * prev,md_unit_t * unp,mdi_unit_t * uip,md_error_t * ep)257 md_build_rendelta(
258 	md_renrole_t	 old_role,
259 	md_renrole_t	 new_role,
260 	md_dev64_t	 dev,
261 	md_rendelta_t	*prev,
262 	md_unit_t	*unp,
263 	mdi_unit_t	*uip,
264 	md_error_t	*ep)
265 {
266 	int		 err	= 0;
267 	md_rendelta_t	*new;
268 
269 	new = (md_rendelta_t *)kmem_alloc(sizeof (md_rendelta_t), KM_SLEEP);
270 
271 	new->beginning	= DELTA_BEG;
272 	new->dev	= dev;
273 	new->new_role	= new_role;
274 	new->old_role	= old_role;
275 	new->next	= NULL;
276 	new->prev	= prev;
277 	new->unp = unp;
278 	new->uip = uip;
279 	bzero((void *) &new->txn_stat, sizeof (md_rendstat_t));
280 
281 	/*
282 	 * For non-meta devices that are being renamed (in the future,
283 	 * that is) we would need to pass in default functions to
284 	 * accommodate them, provided the default function is
285 	 * truly capable of performing the lock/check/unlock function
286 	 * on opaque devices.
287 	 */
288 
289 	new->lock	= md_get_named_service(dev, /* modindex */ 0,
290 						MDRNM_LOCK, md_rename_lock);
291 
292 	new->unlock	= (md_ren_void_svc_t *)md_get_named_service(dev,
293 					/* modindex */ 0, MDRNM_UNLOCK,
294 					(intptr_t (*)()) md_rename_unlock);
295 
296 	new->check	= md_get_named_service(dev, /* modindex */ 0,
297 					    MDRNM_CHECK, /* Default */ NULL);
298 
299 	new->role_swap	= NULL;	/* set this when the roles are determined */
300 
301 	if (!new->lock || !new->unlock || !new->check) {
302 		(void) mdmderror(ep, MDE_RENAME_CONFIG_ERROR, md_getminor(dev));
303 		err = EINVAL;
304 		goto out;
305 	}
306 
307 	new->end = DELTA_END;
308 
309 out:
310 	if (err != 0) {
311 		if (new) {
312 			new->beginning	= DELTA_BEG_FREED;
313 			new->end	= DELTA_END_FREED;
314 
315 			kmem_free(new, sizeof (md_rendelta_t));
316 			new = NULL;
317 		}
318 	}
319 
320 	if (prev) {
321 		prev->next = new;
322 	}
323 
324 	return (new);
325 }
326 
327 /*
328  * md_store_recid()
329  * used by role swap functions
330  */
331 void
md_store_recid(int * prec_idx,mddb_recid_t * recid_list,md_unit_t * un)332 md_store_recid(
333 	int		*prec_idx,
334 	mddb_recid_t	*recid_list,
335 	md_unit_t	*un)
336 {
337 	mddb_recid_t	*rp;
338 	bool_t		 add_recid;
339 
340 	ASSERT(prec_idx);
341 	ASSERT(recid_list);
342 	ASSERT(recid_list[*prec_idx] == 0);
343 	ASSERT(*prec_idx >= 0);
344 
345 	for (add_recid = TRUE, rp = recid_list; add_recid && rp && *rp; rp++) {
346 		if (MD_RECID(un) == *rp) {
347 			add_recid = FALSE;
348 		}
349 	}
350 
351 	if (add_recid) {
352 		recid_list[(*prec_idx)++] = MD_RECID(un);
353 	}
354 }
355 
356 /*
357  * MDRNM_LIST_URFOLKS: generic named svc entry point
358  * add all parents onto the list pointed to by dlpp
359  * (only weird multi-parented devices need to have their
360  * own named svc  to do this.)
361  */
362 static int
md_rename_listfolks(md_rendelta_t ** dlpp,md_rentxn_t * rtxnp)363 md_rename_listfolks(md_rendelta_t **dlpp, md_rentxn_t *rtxnp)
364 {
365 	md_rendelta_t	*new;
366 
367 	ASSERT(rtxnp);
368 	ASSERT(dlpp);
369 	ASSERT(*dlpp == NULL);
370 	ASSERT((rtxnp->op == MDRNOP_EXCHANGE) || (rtxnp->op == MDRNOP_RENAME));
371 	ASSERT(rtxnp->from.uip);
372 	ASSERT(rtxnp->from.unp);
373 
374 	if ((!rtxnp->from.uip) || (!rtxnp->from.unp)) {
375 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP,
376 							rtxnp->from.mnum);
377 		return (-1);
378 	}
379 
380 	if (!MD_HAS_PARENT(MD_PARENT(rtxnp->from.unp))) {
381 		return (0);
382 	}
383 
384 	/*
385 	 * If supporting log renaming (and other multiparented devices)
386 	 * callout to each misc module to claim this waif and return the
387 	 * md_dev64_t of its parents.
388 	 */
389 	if (MD_PARENT(rtxnp->from.unp) == MD_MULTI_PARENT) {
390 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD,
391 							rtxnp->from.mnum);
392 		return (2);
393 	}
394 
395 	if ((rtxnp->op == MDRNOP_RENAME) ||
396 	    (MD_PARENT(rtxnp->from.unp) != MD_SID(rtxnp->to.unp))) {
397 
398 		new = md_build_rendelta(
399 			    MDRR_PARENT,
400 			    MDRR_PARENT,
401 			    md_makedevice(md_major, MD_PARENT(rtxnp->from.unp)),
402 			    NULL,
403 			    MD_UNIT(MD_PARENT(rtxnp->from.unp)),
404 			    MDI_UNIT(MD_PARENT(rtxnp->from.unp)),
405 			    &rtxnp->mde);
406 	} else {
407 		/* parent is swapping roles with self */
408 		new = md_build_rendelta(
409 			    MDRR_PARENT,
410 			    MDRR_SELF,
411 			    md_makedevice(md_major, MD_SID(rtxnp->to.unp)),
412 			    NULL,
413 			    rtxnp->to.unp,
414 			    rtxnp->to.uip,
415 			    &rtxnp->mde);
416 	}
417 
418 	if (!new) {
419 		if (mdisok(&rtxnp->mde)) {
420 			(void) mdsyserror(&rtxnp->mde, ENOMEM);
421 		}
422 		return (-1);
423 	}
424 
425 	*dlpp = new;
426 
427 	return (1);
428 }
429 
430 /*
431  * MDRNM_LIST_URSELF: named svc entry point
432  * add all delta entries appropriate for ourselves onto the deltalist pointed
433  * to by dlpp
434  */
435 static int
md_rename_listself(md_rendelta_t ** dlpp,md_rentxn_t * rtxnp)436 md_rename_listself(md_rendelta_t **dlpp, md_rentxn_t *rtxnp)
437 {
438 	md_rendelta_t	*new, *p;
439 	bool_t		 exchange_up	= FALSE;
440 
441 	ASSERT(rtxnp);
442 	ASSERT(dlpp);
443 	ASSERT((rtxnp->op == MDRNOP_EXCHANGE) || (rtxnp->op == MDRNOP_RENAME));
444 	ASSERT(rtxnp->from.unp);
445 	ASSERT(rtxnp->from.uip);
446 
447 	if ((!rtxnp->from.uip) || (!rtxnp->from.unp)) {
448 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP,
449 							rtxnp->from.mnum);
450 		return (-1);
451 	}
452 
453 	for (p = *dlpp; p && p->next != NULL; p = p->next) {
454 		/* NULL */
455 	}
456 
457 	/*
458 	 * renaming or
459 	 * from's parent is not to and to's parent is not from
460 	 */
461 	if (rtxnp->op == MDRNOP_RENAME) {
462 		new = md_build_rendelta(
463 				MDRR_SELF,
464 				MDRR_SELF,
465 				md_makedevice(md_major, rtxnp->from.mnum),
466 				p,
467 				rtxnp->from.unp,
468 				rtxnp->from.uip,
469 				&rtxnp->mde);
470 	} else {
471 
472 		if (MD_PARENT(rtxnp->from.unp) == MD_SID(rtxnp->to.unp)) {
473 			exchange_up = TRUE;
474 		}
475 
476 		/* self and parent are flipping */
477 		new = md_build_rendelta(
478 				MDRR_SELF,
479 				exchange_up? MDRR_PARENT: MDRR_CHILD,
480 				md_makedevice(md_major, rtxnp->from.mnum),
481 				p,
482 				rtxnp->from.unp,
483 				rtxnp->from.uip,
484 				&rtxnp->mde);
485 	}
486 
487 	if (!new) {
488 		if (mdisok(&rtxnp->mde)) {
489 			(void) mdsyserror(&rtxnp->mde, ENOMEM);
490 		}
491 		return (-1);
492 	}
493 
494 	if (!*dlpp) {
495 		*dlpp = new;
496 	}
497 
498 	return (1);
499 }
500 
501 /*
502  * free the tree of all deltas to devices involved in the rename transaction
503  */
504 static void
free_dtree(md_rendelta_t * family)505 free_dtree(md_rendelta_t *family)
506 {
507 	md_rendelta_t	*next		= NULL;
508 	int		 i		= 0;
509 	md_rendelta_t	*r;
510 
511 	for (r = family; (NULL != r); r = next, i++) {
512 
513 		next		= r->next;
514 
515 		/* shift << because it makes the resultant pattern readable */
516 		r->beginning	= DELTA_BEG_FREED ^ (i << guard_shift);
517 		r->end		= DELTA_END_FREED ^ (i << guard_shift);
518 
519 		kmem_free(r, sizeof (md_rendelta_t));
520 	}
521 }
522 
523 /*
524  * walk down family tree, calling lock service function
525  */
526 static int
lock_dtree(md_rendelta_t * family,md_rentxn_t * rtxnp)527 lock_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
528 {
529 	md_rendelta_t	*r;
530 	int		 rc;
531 
532 	ASSERT(family);
533 	ASSERT(rtxnp);
534 
535 	if (!family || !rtxnp) {
536 		return (EINVAL);
537 	}
538 
539 	for (rc = 0, r = family; r; r = r->next) {
540 
541 		ASSERT(r->unp);
542 		ASSERT(!UNIT_WRITER_HELD(r->unp));
543 		ASSERT(r->lock);
544 
545 		if ((rc = (int)(*r->lock) (r, rtxnp)) != 0) {
546 			return (rc);
547 		}
548 		r->txn_stat.locked = TRUE;
549 	}
550 
551 	return (0);
552 }
553 
554 /*
555  * We rely on check() (MDRNM_CHECK) to make exhaustive checks,
556  * since we don't attempt to undo role_swap() failures.
557  *
558  * To implement an undo() function would require each role_swap()
559  * to store a log of previous state of the structures it changes,
560  * presumably anchored by the rendelta.
561  *
562  */
563 static int
check_dtree(md_rendelta_t * family,md_rentxn_t * rtxnp)564 check_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
565 {
566 	md_rendelta_t	*r;
567 	int		 rc;
568 
569 	ASSERT(family);
570 	ASSERT(rtxnp);
571 
572 	if (!family || !rtxnp) {
573 		/* no error packet to set? */
574 		return (EINVAL);
575 	}
576 
577 	for (r = family, rc = 0; r; r = r->next) {
578 
579 		ASSERT(UNIT_WRITER_HELD(r->unp));
580 		ASSERT(r->txn_stat.locked);
581 
582 		/*
583 		 * <to> doesn't exist for rename
584 		 */
585 		if (!(rtxnp->op == MDRNOP_RENAME &&
586 		    md_getminor(r->dev) == rtxnp->to.mnum)) {
587 			ASSERT(r->uip);
588 			r->txn_stat.is_open = md_unit_isopen(r->uip);
589 		}
590 
591 		/*
592 		 * if only allowing offline rename/exchanges, check
593 		 * for top being trans because it opens its sub-devices
594 		 */
595 
596 		switch (rtxnp->revision) {
597 		case MD_RENAME_VERSION_OFFLINE:
598 			if ((r->txn_stat.is_open) &&
599 				(!rtxnp->stat.trans_in_stack)) {
600 				(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
601 							md_getminor(r->dev));
602 				return (EBUSY);
603 			}
604 			break;
605 
606 		case MD_RENAME_VERSION_ONLINE:
607 			break;
608 
609 		default:
610 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
611 						md_getminor(r->dev));
612 			return (EINVAL);
613 		}
614 
615 		/* MD_UN_MOD_INPROGRESS includes the MD_UN_RENAMING bit */
616 
617 		if (MD_STATUS(r->unp) & MD_UN_MOD_INPROGRESS) {
618 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
619 							md_getminor(r->dev));
620 			return (EBUSY);
621 		}
622 
623 		MD_STATUS(r->unp) |= MD_UN_RENAMING;
624 
625 		if ((rc = (int)(*r->check)(r, rtxnp)) != 0) {
626 			return (rc);
627 		}
628 
629 		/* and be sure we can proceed */
630 		if (!(r->role_swap)) {
631 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
632 							md_getminor(r->dev));
633 			return (EINVAL);
634 		}
635 		r->txn_stat.checked = TRUE;
636 	}
637 
638 	return (0);
639 }
640 
641 
642 /*
643  * rename role_swap() functions are responsible for updating their
644  * own parent, self and children references in both on-disk
645  * and in-core structures, as well as storing the changed
646  * record ids into recids and incrementing rec_idx.
647  */
648 
649 static void
role_swap_dtree(md_rendelta_t * family,md_rentxn_t * rtxnp)650 role_swap_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
651 {
652 	md_rendelta_t	*r;
653 
654 	ASSERT(family);
655 	ASSERT(rtxnp);
656 
657 	for (r = family; r; r = r->next) {
658 		ASSERT(r->role_swap);
659 		ASSERT(r->txn_stat.locked);
660 		ASSERT(r->txn_stat.checked);
661 
662 		(*r->role_swap)(r, rtxnp);
663 
664 		r->txn_stat.role_swapped = TRUE;
665 	}
666 
667 	/*
668 	 * there's some work to do, but not more than expected
669 	 */
670 	ASSERT(rtxnp->rec_idx > 0);
671 	ASSERT(rtxnp->rec_idx < rtxnp->n_recids);
672 
673 	if (rtxnp->rec_idx >= rtxnp->n_recids || rtxnp->rec_idx <= 0) {
674 		/*
675 		 * There's no way to indicate error from here,
676 		 * and even if we could, there's no undo mechanism.
677 		 * We've already modified the in-core structs, so
678 		 * We can't continue w/o committing, but we
679 		 * don't appear to have anything to commit.
680 		 */
681 		cmn_err(CE_PANIC,
682 			"md_rename: role_swap_dtree(family:%p, rtxnp:%p)",
683 					(void *) family, (void *) rtxnp);
684 		return;
685 	}
686 	rtxnp->recids[rtxnp->rec_idx] = 0;
687 
688 	mddb_commitrecs_wrapper(rtxnp->recids);
689 }
690 
691 /*
692  * walk down delta tree, calling the unlock service for each device,
693  * provided any of the devices appear to have been locked
694  */
695 static void
unlock_dtree(md_rendelta_t * family,md_rentxn_t * rtxnp)696 unlock_dtree(md_rendelta_t *family, md_rentxn_t *rtxnp)
697 {
698 	md_rendelta_t	*r;
699 	uint_t		 any_locked	= FALSE;
700 
701 	ASSERT(family);
702 	ASSERT(rtxnp);
703 
704 	for (r = family; r; r = r->next) {
705 
706 		ASSERT(!(r->txn_stat.unlocked)); /* "has been unlocked" */
707 		any_locked |= r->txn_stat.locked;
708 	}
709 
710 	if (any_locked) {
711 
712 		/* unwind in reverse order */
713 		for (r = family; NULL != r->next; r = r->next) {
714 			/* NULL */
715 		}
716 
717 		for (; NULL != r; r = r->prev) {
718 			MD_STATUS(r->unp) &= ~MD_UN_RENAMING;
719 			ASSERT(r->unlock);
720 			r->unlock(r, rtxnp);
721 			r->txn_stat.unlocked = TRUE;
722 		}
723 	}
724 }
725 
726 /*
727  * MDRNM_UPDATE_SELF
728  * This role swap function is identical for all unit types,
729  * so keep it here. It's also the best example because it
730  * touches all the modified portions of the relevant
731  * in-common structures.
732  */
733 static void
md_rename_update_self(md_rendelta_t * delta,md_rentxn_t * rtxnp)734 md_rename_update_self(
735 	md_rendelta_t	*delta,
736 	md_rentxn_t	*rtxnp)
737 {
738 	minor_t		from_min, to_min;
739 	sv_dev_t	sv;
740 	mddb_de_ic_t	*dep;
741 	mddb_rb32_t	*rbp;
742 
743 	ASSERT(rtxnp);
744 	ASSERT(rtxnp->op == MDRNOP_RENAME);
745 	ASSERT(delta);
746 	ASSERT(delta->unp);
747 	ASSERT(delta->uip);
748 	ASSERT(rtxnp->rec_idx >= 0);
749 	ASSERT(rtxnp->recids);
750 	ASSERT(delta->old_role == MDRR_SELF);
751 	ASSERT(delta->new_role == MDRR_SELF);
752 	ASSERT(md_getminor(delta->dev) == rtxnp->from.mnum);
753 
754 	from_min = rtxnp->from.mnum;
755 	to_min = rtxnp->to.mnum;
756 
757 	/*
758 	 * self id changes in our own unit struct
759 	 */
760 	MD_SID(delta->unp) = to_min;
761 
762 	/*
763 	 * make sure that dest always has correct un_revision
764 	 * and rb_revision
765 	 */
766 	delta->unp->c.un_revision |= MD_FN_META_DEV;
767 	dep = mddb_getrecdep(MD_RECID(delta->unp));
768 	ASSERT(dep);
769 	rbp = dep->de_rb;
770 	if (rbp->rb_revision & MDDB_REV_RB) {
771 		rbp->rb_revision = MDDB_REV_RBFN;
772 	} else if (rbp->rb_revision & MDDB_REV_RB64) {
773 		rbp->rb_revision = MDDB_REV_RB64FN;
774 	}
775 
776 	/*
777 	 * clear old array pointers to unit in-core and unit
778 	 */
779 
780 	MDI_VOIDUNIT(from_min) = NULL;
781 	MD_VOIDUNIT(from_min) = NULL;
782 
783 	/*
784 	 * and point the new slots at the unit in-core and unit structs
785 	 */
786 
787 	MDI_VOIDUNIT(to_min) = delta->uip;
788 	MD_VOIDUNIT(to_min) = delta->unp;
789 
790 	/*
791 	 * recreate kstats
792 	 * - destroy the ones associated with our former identity
793 	 * - reallocate and associate them with our new identity
794 	 */
795 	md_kstat_destroy_ui(delta->uip);
796 	md_kstat_init_ui(to_min, delta->uip);
797 
798 	/*
799 	 * the unit in-core reference to the get next link's id changes
800 	 */
801 
802 	delta->uip->ui_link.ln_id = to_min;
803 
804 	/*
805 	 * name space addition of new key was done from user-level
806 	 * remove the old name's key here
807 	 */
808 
809 	sv.setno = MD_MIN2SET(from_min);
810 	sv.key = rtxnp->from.key;
811 
812 	md_rem_names(&sv, 1);
813 
814 	/*
815 	 * Remove associated device node as well
816 	 */
817 	md_remove_minor_node(from_min);
818 
819 	/*
820 	 * and store the record id (from the unit struct) into recids
821 	 * for later commitment by md_rename()
822 	 */
823 	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
824 }
825 
826 /*
827  * Either one of our siblings and/or our parent changed identities.
828  */
829 static void
md_renexch_update_parent(md_rendelta_t * delta,md_rentxn_t * rtxnp)830 md_renexch_update_parent(
831 	md_rendelta_t	*delta,
832 	md_rentxn_t	*rtxnp)
833 {
834 	ASSERT(rtxnp);
835 	ASSERT((MDRNOP_RENAME == rtxnp->op) || (rtxnp->op == MDRNOP_EXCHANGE));
836 	ASSERT(rtxnp->rec_idx >= 0);
837 	ASSERT(rtxnp->recids);
838 	ASSERT(delta);
839 	ASSERT(delta->unp);
840 	ASSERT(delta->old_role == MDRR_CHILD);
841 	ASSERT(delta->new_role == MDRR_CHILD);
842 	ASSERT((MD_PARENT(delta->unp) == rtxnp->from.mnum) ||
843 		(MD_PARENT(delta->unp) == rtxnp->to.mnum));
844 
845 	if (MD_PARENT(delta->unp) == rtxnp->from.mnum) {
846 		MD_PARENT(delta->unp) = rtxnp->to.mnum;
847 	}
848 
849 	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
850 }
851 
852 /*
853  * exchange up (child->self)
854  */
855 static void
md_exchange_child_update_to(md_rendelta_t * delta,md_rentxn_t * rtxnp)856 md_exchange_child_update_to(
857 	md_rendelta_t	*delta,
858 	md_rentxn_t	*rtxnp)
859 {
860 	minor_t from_min, to_min;
861 
862 	ASSERT(rtxnp);
863 	ASSERT(rtxnp->op == MDRNOP_EXCHANGE);
864 	ASSERT(rtxnp->rec_idx >= 0);
865 	ASSERT(rtxnp->recids);
866 	ASSERT(delta);
867 	ASSERT(delta->unp);
868 	ASSERT(delta->uip);
869 	ASSERT(delta->old_role == MDRR_CHILD);
870 	ASSERT(delta->new_role == MDRR_SELF);
871 	ASSERT(md_getminor(delta->dev) == rtxnp->to.mnum);
872 
873 	from_min = rtxnp->from.mnum;
874 	to_min = rtxnp->to.mnum;
875 
876 	/*
877 	 * self id changes in our own unit struct
878 	 * Note:
879 	 * - Since we're assuming the identity of "from" we use its mnum even
880 	 *   though we're updating the "to" structures.
881 	 */
882 
883 	MD_SID(delta->unp) = from_min;
884 
885 	/*
886 	 * our parent identifier becomes the new self, who was "to"
887 	 */
888 
889 	MD_PARENT(delta->unp) = to_min;
890 
891 	/*
892 	 * point the set array pointers at the "new" unit and unit in-cores
893 	 * Note:
894 	 * - The other half of this transfer is done in the "update from"
895 	 *   rename/exchange named service.
896 	 */
897 
898 	MD_VOIDUNIT(from_min) = delta->unp;
899 	MDI_VOIDUNIT(from_min) = delta->uip;
900 
901 	/*
902 	 * transfer kstats
903 	 */
904 
905 	delta->uip->ui_kstat = rtxnp->from.kstatp;
906 
907 	/*
908 	 * the unit in-core reference to the get next link's id changes
909 	 */
910 
911 	delta->uip->ui_link.ln_id = from_min;
912 
913 	/*
914 	 * name space additions, if necessary, were done from user-level.
915 	 * name space deletions, if necessary, were done in "exchange_from"
916 	 */
917 
918 	/*
919 	 * and store the record id (from the unit struct) into recids
920 	 * for later comitment by md_rename()
921 	 */
922 
923 	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
924 }
925 
926 /*
927  * exchange up (self->parent)
928  */
929 static void
md_exchange_self_update_from_up(md_rendelta_t * delta,md_rentxn_t * rtxnp)930 md_exchange_self_update_from_up(
931 	md_rendelta_t	*delta,
932 	md_rentxn_t	*rtxnp)
933 {
934 	minor_t from_min, to_min;
935 
936 	ASSERT(rtxnp);
937 	ASSERT(rtxnp->op == MDRNOP_EXCHANGE);
938 	ASSERT(rtxnp->rec_idx >= 0);
939 	ASSERT(rtxnp->recids);
940 	ASSERT(delta);
941 	ASSERT(delta->unp);
942 	ASSERT(delta->uip);
943 	ASSERT(delta->old_role == MDRR_SELF);
944 	ASSERT(delta->new_role == MDRR_PARENT);
945 	ASSERT(md_getminor(delta->dev) == rtxnp->from.mnum);
946 
947 	from_min = rtxnp->from.mnum;
948 	to_min = rtxnp->to.mnum;
949 
950 	/*
951 	 * self id changes in our own unit struct
952 	 * Note:
953 	 * - Since we're assuming the identity of "to" we use its mnum
954 	 *   while we're updating the "to" structures.
955 	 */
956 
957 	MD_SID(delta->unp) = to_min;
958 
959 	/*
960 	 * our parent identifier becomes the new parent, who was "from"
961 	 */
962 
963 	MD_PARENT(delta->unp) = from_min;
964 
965 	/*
966 	 * point the set array pointers at the "new" unit and unit in-cores
967 	 * Note:
968 	 * - The other half of this transfer is done in the "update from"
969 	 *   rename/exchange named service.
970 	 */
971 
972 	MD_VOIDUNIT(to_min) = delta->unp;
973 	MDI_VOIDUNIT(to_min) = delta->uip;
974 
975 	/*
976 	 * transfer kstats
977 	 */
978 
979 	delta->uip->ui_kstat = rtxnp->to.kstatp;
980 
981 	/*
982 	 * the unit in-core reference to the get next link's id changes
983 	 */
984 
985 	delta->uip->ui_link.ln_id = to_min;
986 
987 	/*
988 	 * name space additions, if necessary, were done from user-level.
989 	 * name space deletions, if necessary, were done in "exchange_from"
990 	 */
991 
992 	/*
993 	 * and store the record id (from the unit struct) into recids
994 	 * for later comitment by md_rename()
995 	 */
996 
997 	md_store_recid(&rtxnp->rec_idx, rtxnp->recids, delta->unp);
998 }
999 
1000 /*
1001  * The order of the called role swap functions is critical.
1002  * If they're not ordered as "all parents", then "all self"
1003  * then "all child" transitions, we will almost certainly
1004  * corrupt the data base and the in-core linkages. So,
1005  * verify that the list built by the individual drivers is
1006  * ok here.
1007  *
1008  * We could have done fancy bit encodings of the roles so
1009  * it all fit into a single word and we wouldn't need the
1010  * prev_ord field. But, since cpu power is cheaper than
1011  * than people power, they're all separate for easier
1012  * debugging and maintaining. (In the unlikely event that
1013  * rename/exchange ever becomes cpu-limited, and this
1014  * algorithm is the bottleneck, we should revisit this.)
1015  */
1016 
1017 static bool_t
role_swap_is_valid(int previous,int current,md_rendelta_t * delta,md_rentxn_t * rtxnp)1018 role_swap_is_valid(
1019 	int		 previous,
1020 	int		 current,
1021 	md_rendelta_t	*delta,
1022 	md_rentxn_t	*rtxnp)
1023 {
1024 	bool_t	valid	= FALSE;
1025 
1026 	/*
1027 	 * we've backed up in processing the role table
1028 	 */
1029 	if ((previous > current) &&
1030 	    (delta->prev && (delta->old_role != delta->prev->old_role))) {
1031 		goto out;
1032 	}
1033 
1034 	/*
1035 	 * we're repeating the same role transition
1036 	 */
1037 	if (previous == current) {
1038 		switch (delta->old_role) {
1039 		case MDRR_PARENT:
1040 			/*
1041 			 * require at least one of the devices to
1042 			 * be multiparented for us to allow another
1043 			 * parent transition
1044 			 */
1045 			if ((MD_MULTI_PARENT != MD_PARENT(rtxnp->from.unp)) &&
1046 			    (MD_MULTI_PARENT != MD_PARENT(rtxnp->to.unp))) {
1047 				goto out;
1048 			}
1049 			break;
1050 
1051 		case MDRR_CHILD:
1052 			/* it's ok to have multiple children */
1053 			break;
1054 
1055 		case MDRR_SELF:
1056 			/* it's never ok to have multiple self transitions */
1057 			/* FALLTHROUGH */
1058 		default:
1059 			goto out;
1060 		}
1061 	}
1062 
1063 	valid = TRUE;
1064 out:
1065 	if (!valid) {
1066 		if (md_rename_debug != 0) {
1067 			cmn_err(CE_NOTE, "previous: %d, current: %d, role: %s",
1068 					previous, current,
1069 					ROLE(delta->old_role));
1070 			delay(3*drv_usectohz(1000000));
1071 			ASSERT(FALSE);
1072 		}
1073 	}
1074 
1075 	return (valid);
1076 }
1077 
1078 static role_change_tab_t *
lookup_role(md_renrole_t old_role,md_renrole_t new_role)1079 lookup_role(md_renrole_t old_role, md_renrole_t new_role)
1080 {
1081 	role_change_tab_t	*rp;
1082 	role_change_tab_t	*found = NULL;
1083 
1084 	for (rp = role_swap_tab; !found && (rp->old_role != MDRR_UNK); rp++) {
1085 
1086 		if (rp->old_role == old_role && rp->new_role == new_role) {
1087 			found = rp;
1088 		}
1089 	}
1090 	/*
1091 	 * we require a named svc if we've got two devices
1092 	 * claiming to be changing roles in this manner
1093 	 */
1094 	ASSERT(found);
1095 	ASSERT(found->default_svc != ILLEGAL_ROLESWAP_SVC);
1096 	ASSERT(found->svc_name != ILLEGAL_SVC_NAME);
1097 
1098 	if (!found ||
1099 	    (found->default_svc == ILLEGAL_ROLESWAP_SVC) ||
1100 	    (found->svc_name == ILLEGAL_SVC_NAME)) {
1101 		return (NULL);
1102 	}
1103 
1104 	return (found);
1105 }
1106 
1107 /*
1108  * fill in the role swap named svc., now that we know each device
1109  * and its changing role
1110  */
1111 static int
valid_roleswap_dtree(md_rendelta_t * family,md_rentxn_t * rtxnp)1112 valid_roleswap_dtree(
1113 	md_rendelta_t	*family,
1114 	md_rentxn_t	*rtxnp
1115 )
1116 {
1117 	md_rendelta_t		*r;
1118 	role_change_tab_t	*rolep;
1119 	minor_t			 from_min, to_min;
1120 	int			 prev_ord	= -1;
1121 	bool_t			found_self	= FALSE;
1122 	int			 err		= 0;
1123 
1124 	ASSERT(family);
1125 	ASSERT(rtxnp);
1126 
1127 	from_min = rtxnp->from.mnum;
1128 	to_min = rtxnp->to.mnum;
1129 
1130 	for (r = family; r; r = r->next, prev_ord = rolep->ord) {
1131 
1132 		if (!(rolep = lookup_role(r->old_role, r->new_role))) {
1133 			(void) mdmderror(&rtxnp->mde,
1134 					MDE_RENAME_CONFIG_ERROR, from_min);
1135 			err = EOPNOTSUPP;
1136 			goto out;
1137 		}
1138 		r->role_swap = (md_ren_roleswap_svc_t *)md_get_named_service(
1139 					r->dev, /* modindex */ 0,
1140 					(char *)rolep->svc_name,
1141 					(intptr_t (*)()) rolep->default_svc);
1142 
1143 		/*
1144 		 * someone probably called the ioctl directly and
1145 		 * incorrectly, rather than via the libmeta wrappers
1146 		 */
1147 		if (!(r->role_swap)) {
1148 			(void) mdmderror(&rtxnp->mde,
1149 					MDE_RENAME_TARGET_UNRELATED, to_min);
1150 			err = EOPNOTSUPP;
1151 			goto out;
1152 		}
1153 
1154 		if (!role_swap_is_valid(prev_ord, rolep->ord, r, rtxnp)) {
1155 			(void) mdmderror(&rtxnp->mde,
1156 					MDE_RENAME_CONFIG_ERROR, from_min);
1157 			err = EINVAL;
1158 			goto out;
1159 		}
1160 
1161 		if (rolep->old_role == MDRR_SELF) {
1162 			found_self = TRUE;
1163 		}
1164 
1165 		if (MD_PARENT(r->unp) == MD_MULTI_PARENT) {
1166 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_TARGET_BAD,
1167 							md_getminor(r->dev));
1168 			err = EINVAL;
1169 			goto out;
1170 		}
1171 	}
1172 
1173 	/*
1174 	 * must be at least one selfish device
1175 	 */
1176 	ASSERT(found_self);
1177 	if (!found_self) {
1178 		(void) mdmderror(&rtxnp->mde,
1179 					MDE_RENAME_CONFIG_ERROR, from_min);
1180 		err = EINVAL;
1181 		goto out;
1182 	}
1183 
1184 out:
1185 	return (err);
1186 }
1187 
1188 /*
1189  * dump contents of rename transaction
1190  */
1191 static void
dump_txn(md_rentxn_t * rtxnp)1192 dump_txn(md_rentxn_t *rtxnp) {
1193 
1194 	if (md_rename_debug == 0) {
1195 		return;
1196 	}
1197 
1198 	cmn_err(CE_NOTE, "rtxnp: %p", (void *) rtxnp);
1199 	if (rtxnp) {
1200 		cmn_err(CE_NOTE, "beginning: %llx, op: %s",
1201 			rtxnp->beginning, OP_STR(rtxnp->op));
1202 
1203 		cmn_err(CE_NOTE,
1204 	"revision: %d, uflags: %d, rec_idx: %d, n_recids: %d, rec_ids: %p%s",
1205 			rtxnp->revision, rtxnp->uflags,
1206 			rtxnp->rec_idx, rtxnp->n_recids, (void *) rtxnp->recids,
1207 			rtxnp->stat.trans_in_stack? " (trans in stack)": "");
1208 		cmn_err(CE_NOTE, " from: beginning: %llx",
1209 							rtxnp->from.beginning);
1210 		cmn_err(CE_NOTE, "    minor: %lX, key: %lX",
1211 			(ulong_t)rtxnp->from.mnum, (ulong_t)rtxnp->from.key);
1212 		cmn_err(CE_NOTE, "    unp: %lX, uip: %lX",
1213 			(ulong_t)rtxnp->from.unp, (ulong_t)rtxnp->from.uip);
1214 		cmn_err(CE_NOTE, "    end: %llx", rtxnp->from.end);
1215 		cmn_err(CE_NOTE, "  to: beginning: %llx", rtxnp->to.beginning);
1216 		cmn_err(CE_NOTE, "    minor: %lX, key: %lX",
1217 			(ulong_t)rtxnp->to.mnum, (ulong_t)rtxnp->to.key);
1218 		cmn_err(CE_NOTE, "    unp: %lX, uip: %lX",
1219 			(ulong_t)rtxnp->to.unp, (ulong_t)rtxnp->to.uip);
1220 		cmn_err(CE_NOTE, "    end: %llx", rtxnp->to.end);
1221 		cmn_err(CE_NOTE, "end: %llx\n", rtxnp->end);
1222 	}
1223 	delay(drv_usectohz(1000000));
1224 }
1225 
1226 /*
1227  * dump contents of all deltas
1228  */
1229 static void
dump_dtree(md_rendelta_t * family)1230 dump_dtree(md_rendelta_t *family)
1231 {
1232 	md_rendelta_t	*r;
1233 	int		i;
1234 
1235 	if (md_rename_debug == 0) {
1236 		return;
1237 	}
1238 
1239 	for (r = family, i = 0; r; r = r->next, i++) {
1240 		cmn_err(CE_NOTE, "%d.  beginning: %llx", i, r->beginning);
1241 		cmn_err(CE_NOTE, "  r: %lX, dev: %lX, next: %lx, prev: %lx",
1242 					(ulong_t)r, (ulong_t)r->dev,
1243 					(ulong_t)r->next, (ulong_t)r->prev);
1244 
1245 		cmn_err(CE_NOTE, "  role: %s -> %s, unp: %lx, uip: %lx",
1246 			ROLE(r->old_role), ROLE(r->new_role),
1247 			(ulong_t)r->unp, (ulong_t)r->uip);
1248 		cmn_err(CE_NOTE,
1249 		"  lock: %lx, unlock: %lx\n\t  check: %lx, role_swap: %lx",
1250 			(ulong_t)r->lock, (ulong_t)r->unlock,
1251 			(ulong_t)r->check, (ulong_t)r->role_swap);
1252 		if (*((uint_t *)(&r->txn_stat)) != 0) {
1253 			cmn_err(CE_NOTE, "status: (0x%x) %s%s%s%s%s",
1254 			*((uint_t *)(&r->txn_stat)),
1255 			r->txn_stat.is_open?		"is_open "	: "",
1256 			r->txn_stat.locked?		"locked "	: "",
1257 			r->txn_stat.checked?		"checked "	: "",
1258 			r->txn_stat.role_swapped?	"role_swapped "	: "",
1259 			r->txn_stat.unlocked?		"unlocked"	: "");
1260 		}
1261 		cmn_err(CE_NOTE, "end: %llx\n", r->end);
1262 	}
1263 	delay(drv_usectohz(1000000));
1264 }
1265 
1266 /*
1267  * validate the rename request parameters
1268  */
1269 static int
validate_txn_parms(md_rentxn_t * rtxnp)1270 validate_txn_parms(md_rentxn_t *rtxnp)
1271 {
1272 	minor_t	to_min, from_min;
1273 
1274 	ASSERT(rtxnp);
1275 
1276 	from_min = rtxnp->from.mnum;
1277 	to_min = rtxnp->to.mnum;
1278 
1279 	switch (rtxnp->revision) {
1280 	case MD_RENAME_VERSION_OFFLINE:
1281 		if (rtxnp->uflags != 0) {
1282 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1283 								from_min);
1284 			return (ENOTSUP);
1285 		}
1286 		break;
1287 
1288 	case MD_RENAME_VERSION_ONLINE:
1289 		/* not supported until 5.0 */
1290 		/* FALLTHROUGH */
1291 
1292 	default:
1293 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1294 								from_min);
1295 		return (EPROTONOSUPPORT);
1296 	}
1297 
1298 	if ((rtxnp->from.uip = MDI_UNIT(from_min)) == NULL) {
1299 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
1300 		return (ENODEV);
1301 	}
1302 
1303 	if (!md_dev_exists(md_makedevice(md_major, from_min))) {
1304 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
1305 		return (ENODEV);
1306 	}
1307 
1308 	if ((rtxnp->from.key == MD_KEYBAD) || (rtxnp->from.key == MD_KEYWILD)) {
1309 		(void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, from_min);
1310 		return (EINVAL);
1311 	}
1312 
1313 	rtxnp->from.kstatp = rtxnp->from.uip->ui_kstat;
1314 	rtxnp->from.unp = MD_UNIT(from_min);
1315 
1316 	if (MD_MIN2SET(to_min) != MD_MIN2SET(from_min)) {
1317 		(void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, to_min);
1318 		return (EINVAL);
1319 	}
1320 
1321 	switch (rtxnp->op) {
1322 	case MDRNOP_EXCHANGE:
1323 		rtxnp->to.unp = MD_UNIT(to_min);
1324 		rtxnp->to.uip = MDI_UNIT(to_min);
1325 
1326 		/*
1327 		 * exchange requires target to exist
1328 		 */
1329 
1330 		if ((rtxnp->to.uip == NULL) ||
1331 		    (md_dev_exists(md_makedevice(md_major, to_min)) == NULL)) {
1332 			(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP,
1333 									to_min);
1334 			return (ENODEV);
1335 		}
1336 
1337 		if ((rtxnp->to.key == MD_KEYBAD) ||
1338 		    (rtxnp->to.key == MD_KEYWILD)) {
1339 			(void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, to_min);
1340 			return (EINVAL);
1341 		}
1342 
1343 		/*
1344 		 * <from> is not in the role of <self>,
1345 		 * that is,
1346 		 * <from> has a parent, which is <to> and <to> has a parent too
1347 		 * or
1348 		 * <to> has a parent, which is <from> and <to> can have a child
1349 		 */
1350 		if ((MD_HAS_PARENT(MD_PARENT(rtxnp->from.unp))) &&
1351 		    (MD_PARENT(rtxnp->from.unp) == to_min) &&
1352 		    MD_HAS_PARENT(MD_PARENT(rtxnp->to.unp))) {
1353 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_ORDER,
1354 								from_min);
1355 			return (EINVAL);
1356 		}
1357 
1358 		if ((MD_HAS_PARENT(MD_PARENT(rtxnp->to.unp))) &&
1359 		    (MD_PARENT(rtxnp->to.unp) == from_min) &&
1360 		    (MD_CAPAB(rtxnp->to.unp) & MD_CAN_META_CHILD)) {
1361 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_ORDER,
1362 								from_min);
1363 			return (EINVAL);
1364 		}
1365 
1366 		rtxnp->to.kstatp = rtxnp->to.uip->ui_kstat;
1367 		break;
1368 
1369 	case MDRNOP_RENAME:
1370 
1371 		/*
1372 		 * rename requires <to> not to exist
1373 		 */
1374 
1375 		if (MDI_UNIT(to_min) ||
1376 		    md_dev_exists(md_makedevice(md_major, to_min))) {
1377 
1378 			(void) mdmderror(&rtxnp->mde, MDE_UNIT_ALREADY_SETUP,
1379 									to_min);
1380 			return (EEXIST);
1381 		}
1382 
1383 		/*
1384 		 * and to be within valid ranges for the current
1385 		 * limits on number of sets and metadevices
1386 		 */
1387 		if ((MD_MIN2SET(to_min) >= md_nsets) ||
1388 		    (MD_MIN2UNIT(to_min) >= md_nunits)) {
1389 			(void) mdmderror(&rtxnp->mde, MDE_INVAL_UNIT, to_min);
1390 			return (EINVAL);
1391 		}
1392 
1393 		rtxnp->to.unp = NULL;
1394 		rtxnp->to.uip = NULL;
1395 		rtxnp->to.kstatp = NULL;
1396 		break;
1397 
1398 	default:
1399 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1400 								from_min);
1401 		return (EINVAL);
1402 	}
1403 
1404 	/*
1405 	 * install guard rails
1406 	 */
1407 	rtxnp->beginning = TXN_BEG;
1408 
1409 	rtxnp->from.beginning	= TXNUN_BEG;
1410 	rtxnp->from.end		= TXNUN_END;
1411 
1412 	rtxnp->to.beginning	= TXNUN_BEG;
1413 	rtxnp->to.end		= TXNUN_END;
1414 
1415 	rtxnp->end = TXN_END;
1416 
1417 	return (0);
1418 }
1419 
1420 /*
1421  * If the device being changed exhibits this capability, set the list
1422  * relatives function pointer to the named service that lists the
1423  * appropriate relatives for this capability.
1424  */
1425 static int
set_list_rels_funcp(md_rentxn_t * rtxnp,md_stackcap_t capability,char * svc_name,md_ren_list_svc_t default_svc_func,md_ren_list_svc_t ** list_relatives_funcp)1426 set_list_rels_funcp(
1427 	md_rentxn_t		 *rtxnp,
1428 	md_stackcap_t		 capability,
1429 	char			 *svc_name,
1430 	md_ren_list_svc_t	 default_svc_func,
1431 	md_ren_list_svc_t	 **list_relatives_funcp
1432 )
1433 {
1434 	int		 err;
1435 	minor_t		 from_min;
1436 	md_dev64_t	 from_dev;
1437 	md_unit_t	*from_un;
1438 	mdi_unit_t	*from_ui;
1439 
1440 	ASSERT(rtxnp);
1441 	ASSERT((rtxnp->op == MDRNOP_RENAME) || (rtxnp->op == MDRNOP_EXCHANGE));
1442 	ASSERT(list_relatives_funcp);
1443 
1444 	from_min	= rtxnp->from.mnum;
1445 	from_dev	= md_makedevice(md_major, from_min);
1446 	from_un		= MD_UNIT(from_min);
1447 	from_ui		= MDI_UNIT(from_min);
1448 	err		= 0;
1449 
1450 	if (!from_ui || !from_un) {
1451 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, from_min);
1452 		err = EINVAL;
1453 		goto out;
1454 	}
1455 
1456 	if ((capability == MD_CAN_DO_ANYTHING) ||
1457 	    ((MD_CAPAB(from_un) & capability) == capability)) {
1458 
1459 			*list_relatives_funcp = (md_ren_list_svc_t *)
1460 					md_get_named_service(from_dev,
1461 					/* modindex */ 0, svc_name,
1462 					(intptr_t (*)()) default_svc_func);
1463 
1464 			ASSERT(*list_relatives_funcp);
1465 			if (!(*list_relatives_funcp)) {
1466 				(void) mdmderror(&rtxnp->mde,
1467 					MDE_RENAME_CONFIG_ERROR, from_min);
1468 				err = EINVAL;
1469 				goto out;
1470 			}
1471 	} else {
1472 		*list_relatives_funcp = (md_ren_list_svc_t *)NULL;
1473 	}
1474 
1475 out:
1476 	return (err);
1477 }
1478 
1479 /*
1480  * call list relations function, bump recid counter
1481  * by number of members added to the delta list.
1482  * Validate that the number of members added is within bounds.
1483  */
1484 static int
list_relations(md_rendelta_t ** family,md_rentxn_t * rtxnp,md_ren_list_svc_t * add_relatives_funcp,int valid_min,int valid_max)1485 list_relations(
1486 		md_rendelta_t		**family,
1487 		md_rentxn_t		 *rtxnp,
1488 		md_ren_list_svc_t	 *add_relatives_funcp,
1489 		int			  valid_min,
1490 		int			  valid_max
1491 )
1492 {
1493 	int	n_added;
1494 	int	err = 0;
1495 
1496 	ASSERT(family);
1497 	ASSERT(rtxnp);
1498 
1499 	if (!family || !rtxnp) {
1500 		err = EINVAL;
1501 		goto out;
1502 	}
1503 
1504 	n_added = 0;
1505 
1506 	/* no relations of this type */
1507 	if (!add_relatives_funcp) {
1508 		goto out;
1509 	}
1510 
1511 	n_added = (*add_relatives_funcp) (family, rtxnp);
1512 
1513 	if ((n_added < valid_min) || (n_added > valid_max)) {
1514 		if (mdisok(&rtxnp->mde)) {
1515 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1516 							rtxnp->from.mnum);
1517 		}
1518 		err = EINVAL;
1519 		goto out;
1520 	}
1521 
1522 	rtxnp->n_recids += n_added;
1523 
1524 out:
1525 	return (err);
1526 }
1527 
1528 /*
1529  * build recid array
1530  */
1531 static int
alloc_recids(md_rendelta_t * family,md_rentxn_t * rtxnp)1532 alloc_recids(md_rendelta_t *family, md_rentxn_t *rtxnp)
1533 {
1534 	int	err	= 0;
1535 
1536 	if (!family || !rtxnp) {
1537 		err = ENOMEM;
1538 		goto out;
1539 	}
1540 
1541 	rtxnp->rec_idx = 0;
1542 
1543 	if (rtxnp->n_recids == 0) {
1544 		err = EINVAL;
1545 		goto out;
1546 	}
1547 
1548 	rtxnp->n_recids += 1;	/* terminator */
1549 
1550 	rtxnp->recids = kmem_alloc(sizeof (mddb_recid_t) * rtxnp->n_recids,
1551 	    KM_SLEEP);
1552 	if (!(rtxnp->recids)) {
1553 		err = ENOMEM;
1554 		goto out;
1555 	}
1556 
1557 	bzero((void *) rtxnp->recids,
1558 				(sizeof (mddb_recid_t) * rtxnp->n_recids));
1559 out:
1560 	if (err != 0) {
1561 		(void) mdsyserror(&rtxnp->mde, err);
1562 	}
1563 
1564 	return (err);
1565 }
1566 
1567 /*
1568  * build family tree (parent(s), self, children)
1569  * The order of the resultant list is important, as it governs
1570  * the order of locking, checking and changing the unit structures.
1571  * Since we'll be changing them, we may not use the MD_UNIT, MDI_UNIT,
1572  * and other pointer which depend on the array being correct.
1573  * Use only the cached pointers (in rtxnp.)
1574  */
1575 static md_rendelta_t *
build_dtree(md_rentxn_t * rtxnp)1576 build_dtree(md_rentxn_t *rtxnp)
1577 {
1578 	md_ren_list_svc_t	*add_folks, *add_self, *add_kids;
1579 	int			 err;
1580 	md_rendelta_t		*family	= NULL;
1581 
1582 	ASSERT(rtxnp);
1583 	ASSERT((rtxnp->op == MDRNOP_RENAME) || (rtxnp->op == MDRNOP_EXCHANGE));
1584 
1585 	err = set_list_rels_funcp(rtxnp, MD_CAN_PARENT, MDRNM_LIST_URFOLKS,
1586 					md_rename_listfolks, &add_folks);
1587 
1588 	if (err) {
1589 		goto out;
1590 	}
1591 
1592 	err = set_list_rels_funcp(rtxnp, MD_CAN_DO_ANYTHING, MDRNM_LIST_URSELF,
1593 						md_rename_listself, &add_self);
1594 	if (err) {
1595 		goto out;
1596 	}
1597 
1598 	err = set_list_rels_funcp(rtxnp, MD_CAN_META_CHILD, MDRNM_LIST_URKIDS,
1599 				/* no default list func */ ((int (*)()) NULL),
1600 								&add_kids);
1601 	if (err) {
1602 		goto out;
1603 	}
1604 
1605 	rtxnp->n_recids = 0;	/* accumulated by list_relations() */
1606 
1607 	if ((err = list_relations(&family, rtxnp, add_folks, 0, 1)) != 0) {
1608 		goto out;
1609 	}
1610 
1611 	if ((err = list_relations(&family, rtxnp, add_self, 1, 1)) != 0) {
1612 		goto out;
1613 	}
1614 
1615 	err = list_relations(&family, rtxnp, add_kids, 0, md_nunits);
1616 	if (err != 0) {
1617 		goto out;
1618 	}
1619 
1620 	/*
1621 	 * delta tree is still empty?
1622 	 */
1623 	if ((!family) || (rtxnp->n_recids == 0)) {
1624 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
1625 							rtxnp->from.mnum);
1626 		err = EINVAL;
1627 		goto out;
1628 	}
1629 
1630 	/*
1631 	 * verify role change interactions
1632 	 */
1633 	if ((err = valid_roleswap_dtree(family, rtxnp)) != 0) {
1634 		goto out;
1635 	}
1636 
1637 	if ((err = alloc_recids(family, rtxnp)) != 0) {
1638 		goto out;
1639 	}
1640 
1641 out:
1642 	if (err != 0) {
1643 		free_dtree(family);
1644 		dump_dtree(family);	/* yes, after freeing it */
1645 		family = NULL;
1646 	}
1647 
1648 	return (family);
1649 }
1650 
1651 
1652 /*
1653  * (MD_IOCRENAME) rename/exchange ioctl entry point
1654  * calls individual driver named service entry points
1655  * to build a list of devices which need state changed,
1656  * to verify that they're in a state where renames may occur,
1657  * and to modify themselves into their new identities
1658  */
1659 
1660 int
md_rename(md_rename_t * mrp,IOLOCK * iolockp)1661 md_rename(
1662 	md_rename_t	*mrp,
1663 	IOLOCK		*iolockp)
1664 {
1665 	md_rendelta_t	*family		= NULL;
1666 	md_rentxn_t	rtxn;
1667 	int		err		= 0;
1668 	set_t		setno;
1669 	mdc_unit_t	*mdc;
1670 
1671 	ASSERT(iolockp);
1672 	if (mrp == NULL)
1673 		return (EINVAL);
1674 
1675 	setno = MD_MIN2SET(mrp->from.mnum);
1676 	if (setno >= md_nsets) {
1677 		return (EINVAL);
1678 	}
1679 
1680 	/*
1681 	 * Early exit if top is eof trans
1682 	 */
1683 	mdc = (mdc_unit_t *)md_set[setno].s_un[MD_MIN2UNIT(mrp->from.mnum)];
1684 	while (mdc != NULL) {
1685 	    if (!MD_HAS_PARENT(mdc->un_parent)) {
1686 		break;
1687 	    } else {
1688 		mdc = (mdc_unit_t *)md_set[setno].s_un[MD_MIN2UNIT
1689 		    (mdc->un_parent)];
1690 	    }
1691 	}
1692 
1693 	if (mdc && mdc->un_type == MD_METATRANS) {
1694 		return (EINVAL);
1695 	}
1696 
1697 
1698 	mdclrerror(&mrp->mde);
1699 
1700 	bzero((void *) &rtxn, sizeof (md_rentxn_t));
1701 	mdclrerror(&rtxn.mde);
1702 
1703 	/*
1704 	 * encapsulate user parameters
1705 	 */
1706 	rtxn.from.key	= mrp->from.key;
1707 	rtxn.to.key	= mrp->to.key;
1708 	rtxn.from.mnum	= mrp->from.mnum;
1709 	rtxn.to.mnum	= mrp->to.mnum;
1710 	rtxn.op		= mrp->op;
1711 	rtxn.uflags	= mrp->flags;
1712 	rtxn.revision	= mrp->revision;
1713 
1714 	if (MD_MIN2UNIT(mrp->to.mnum) >= md_nunits) {
1715 		err = EINVAL;
1716 		goto cleanup;
1717 	}
1718 
1719 	/*
1720 	 * catch this early, before taking any locks
1721 	 */
1722 	if (md_get_setstatus(setno) & MD_SET_STALE) {
1723 		(void) (mdmddberror(&rtxn.mde, MDE_DB_STALE, rtxn.from.mnum,
1724 						MD_MIN2SET(rtxn.from.mnum)));
1725 		err = EROFS;
1726 		goto cleanup;
1727 	}
1728 
1729 	/*
1730 	 * Locking and re-validation (of the per-unit state) is
1731 	 * done by the rename lock/unlock service, for now only take
1732 	 * the array lock.
1733 	 */
1734 	md_array_writer(iolockp);
1735 
1736 	/*
1737 	 * validate the rename/exchange parameters
1738 	 * rtxn is filled in on succesful completion of validate_txn_parms()
1739 	 */
1740 	if ((err = validate_txn_parms(&rtxn)) != 0) {
1741 		goto cleanup;
1742 	}
1743 
1744 	/*
1745 	 * build list of work to do, the "delta tree" for related devices
1746 	 */
1747 	if (!(family = build_dtree(&rtxn))) {
1748 		err = ENOMEM;
1749 		goto cleanup;
1750 	}
1751 	dump_txn(&rtxn);
1752 	dump_dtree(family);
1753 
1754 	if ((err = lock_dtree(family, &rtxn)) != 0) {
1755 		goto cleanup;
1756 	}
1757 
1758 	if ((err = check_dtree(family, &rtxn)) != 0) {
1759 		goto cleanup;
1760 	}
1761 	dump_txn(&rtxn);
1762 
1763 	role_swap_dtree(family, &rtxn);	/* commits the recids */
1764 
1765 	/*
1766 	 * let folks know
1767 	 */
1768 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_RENAME_SRC, SVM_TAG_METADEVICE,
1769 	    MD_MIN2SET(rtxn.from.mnum), rtxn.from.mnum);
1770 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_RENAME_DST, SVM_TAG_METADEVICE,
1771 	    MD_MIN2SET(rtxn.from.mnum), rtxn.from.mnum);
1772 
1773 cleanup:
1774 
1775 	if (err != 0 && mdisok(&rtxn.mde)) {
1776 		(void) mdsyserror(&rtxn.mde, EINVAL);
1777 	}
1778 
1779 	if (family) {
1780 		unlock_dtree(family, &rtxn);
1781 		free_dtree(family);
1782 		dump_dtree(family);
1783 		family = NULL;
1784 	}
1785 
1786 	if (rtxn.recids && (rtxn.n_recids > 0)) {
1787 		kmem_free(rtxn.recids, sizeof (mddb_recid_t) * rtxn.n_recids);
1788 	}
1789 
1790 	if (!mdisok(&rtxn.mde)) {
1791 		(void) mdstealerror(&mrp->mde, &rtxn.mde);
1792 	}
1793 
1794 	return (0);	/* success/failure will be communicated via rtxn.mde */
1795 }
1796 
1797 static role_change_tab_t
1798 role_swap_tab[] =
1799 {
1800 	{
1801 		1,			/* ordinal */
1802 		MDRR_PARENT,		/* old role */
1803 		MDRR_PARENT,		/* new role */
1804 		MDRNM_UPDATE_KIDS,	/* named service */
1805 		NO_DEFAULT_ROLESWAP_SVC	/* default role swap function */
1806 	},
1807 	{
1808 		2,
1809 		MDRR_PARENT,
1810 		MDRR_SELF,
1811 		MDRNM_PARENT_UPDATE_TO,
1812 		NO_DEFAULT_ROLESWAP_SVC
1813 	},
1814 	{
1815 		3,
1816 		MDRR_PARENT,
1817 		MDRR_CHILD,
1818 		ILLEGAL_SVC_NAME,
1819 		ILLEGAL_ROLESWAP_SVC
1820 	},
1821 	{
1822 		4,
1823 		MDRR_SELF,
1824 		MDRR_PARENT,
1825 		MDRNM_SELF_UPDATE_FROM_UP,
1826 		md_exchange_self_update_from_up
1827 	},
1828 	{
1829 		5,
1830 		MDRR_SELF,
1831 		MDRR_SELF,
1832 		MDRNM_UPDATE_SELF,
1833 		md_rename_update_self
1834 	},
1835 	{
1836 		6,
1837 		MDRR_SELF,
1838 		MDRR_CHILD,
1839 		MDRNM_SELF_UPDATE_FROM_DOWN,
1840 		NO_DEFAULT_ROLESWAP_SVC
1841 	},
1842 	{
1843 		7,
1844 		MDRR_CHILD,
1845 		MDRR_PARENT,
1846 		ILLEGAL_SVC_NAME,
1847 		ILLEGAL_ROLESWAP_SVC
1848 	},
1849 	{
1850 		8,
1851 		MDRR_CHILD,
1852 		MDRR_SELF,
1853 		MDRNM_CHILD_UPDATE_TO,
1854 		md_exchange_child_update_to
1855 	},
1856 	{
1857 		9,
1858 		MDRR_CHILD,
1859 		MDRR_CHILD,
1860 		MDRNM_UPDATE_FOLKS,
1861 		md_renexch_update_parent
1862 	},
1863 
1864 	/* terminator is old_role == MDRR_UNK */
1865 	{
1866 		0,
1867 		MDRR_UNK,
1868 		MDRR_UNK,
1869 		ILLEGAL_SVC_NAME,
1870 		NO_DEFAULT_ROLESWAP_SVC
1871 	}
1872 };
1873