xref: /titanic_41/usr/src/uts/common/io/lvm/raid/raid_ioctl.c (revision b97d6ca7333c353b6ca20c20c99fb1be8d32a8de)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright 2012 Milan Jurik. All rights reserved.
26  */
27 
28 /*
29  * NAME:	raid_ioctl.c
30  *
31  * DESCRIPTION: RAID driver source file containing IOCTL operations.
32  *
33  * ROUTINES PROVIDED FOR EXTERNAL USE:
34  *	  raid_commit() - commits MD database updates for a RAID metadevice
35  *	md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
36  *
37  * ROUTINES PROVIDED FOR INTERNAL USE:
38  *	 raid_getun() - Performs unit checking on a RAID metadevice
39  *    init_col_nextio() - normal backend when zeroing column of RAID metadevice.
40  *	 init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
41  *  raid_init_columns() - Zero one or more columns of a RAID metadevice.
42  *	     raid_set() - used to create a RAID metadevice
43  *	     raid_get() - used to get the unit structure of a RAID metadevice
44  *	 raid_replace() - used to replace a component of a RAID metadevice
45  *	    raid_grow() - Concatenate to a RAID metadevice
46  *	  raid_change() - change dynamic values of a RAID metadevice
47  *	   raid_reset() - used to reset (clear / remove) a RAID metadevice
48  *	raid_get_geom() - used to get the geometry of a RAID metadevice
49  *	raid_get_vtoc() - used to get the VTOC on a RAID metadevice
50  *	raid_set_vtoc() - used to set the VTOC on a RAID metadevice
51  *	raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
52  *	raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
53  *	 raid_getdevs() - return all devices within a RAID metadevice
54  *   raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
55  */
56 
57 
58 #include <sys/param.h>
59 #include <sys/systm.h>
60 #include <sys/conf.h>
61 #include <sys/file.h>
62 #include <sys/user.h>
63 #include <sys/uio.h>
64 #include <sys/t_lock.h>
65 #include <sys/buf.h>
66 #include <sys/dkio.h>
67 #include <sys/vtoc.h>
68 #include <sys/kmem.h>
69 #include <vm/page.h>
70 #include <sys/sysmacros.h>
71 #include <sys/types.h>
72 #include <sys/mkdev.h>
73 #include <sys/stat.h>
74 #include <sys/open.h>
75 #include <sys/disp.h>
76 #include <sys/modctl.h>
77 #include <sys/ddi.h>
78 #include <sys/sunddi.h>
79 #include <sys/cred.h>
80 #include <sys/lvm/mdvar.h>
81 #include <sys/lvm/md_names.h>
82 #include <sys/lvm/md_mddb.h>
83 #include <sys/lvm/md_raid.h>
84 #include <sys/lvm/md_convert.h>
85 
86 #include <sys/sysevent/eventdefs.h>
87 #include <sys/sysevent/svm.h>
88 
89 extern int		md_status;
90 extern unit_t		md_nunits;
91 extern set_t		md_nsets;
92 extern md_set_t		md_set[];
93 extern md_ops_t		raid_md_ops;
94 extern major_t		md_major;
95 extern md_krwlock_t	md_unit_array_rw;
96 extern mdq_anchor_t	md_done_daemon;
97 extern mdq_anchor_t	md_ff_daemonq;
98 extern	int		mdopen();
99 extern	int		mdclose();
100 extern	void		md_probe_one(probe_req_t *);
101 extern int		md_init_probereq(md_probedev_impl_t *,
102 				daemon_queue_t **);
103 extern md_resync_t	md_cpr_resync;
104 
105 
106 extern void dump_mr_unit(mr_unit_t *);
107 
108 typedef struct raid_ci {
109 	DAEMON_QUEUE
110 	struct raid_ci	*ci_next;
111 	mr_unit_t	*ci_un;
112 	int		ci_col;
113 	int		ci_err;
114 	int		ci_flag;
115 	size_t		ci_zerosize;
116 	diskaddr_t	ci_blkno;
117 	diskaddr_t	ci_lastblk;
118 	buf_t		ci_buf;
119 } raid_ci_t;
120 /* values for the ci_flag */
121 #define	COL_INITING	(0x0001)
122 #define	COL_INIT_DONE	(0x0002)
123 #define	COL_READY	(0x0004)
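/*
 * Added commentary (not in the original source): a column being zeroed
 * advances through the ci_flag values in order.  raid_init_columns()
 * starts each column as COL_INITING; init_col_nextio() marks it
 * COL_INIT_DONE once the final zeroing write completes; the waiting
 * thread then runs init_pw_area() and promotes it to COL_READY.
 */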
124 
125 /*
126  * NAME:	raid_getun
127  * DESCRIPTION: performs a lot of unit checking on a RAID metadevice
128  * PARAMETERS:	minor_t	      mnum - minor device number for RAID unit
129  *		md_error_t    *mde - pointer to error reporting structure
130  *		int	     flags - unit checking/locking flags:
131  *					STALE_OK - allow stale MD memory
132  *					  NO_OLD - unit must not exist
133  *					 NO_LOCK - no IOCTL lock needed
134  *					 WR_LOCK - write IOCTL lock needed
135  *					 RD_LOCK - read IOCTL lock needed
136  *		IOLOCK	     *lock - pointer to IOCTL lock
137  *
138  * LOCKS:	obtains unit reader or writer lock via IOLOCK
139  *
140  */
141 static mr_unit_t *
142 raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
143 {
144 	mr_unit_t	*un;
145 	mdi_unit_t	*ui;
146 	set_t		setno = MD_MIN2SET(mnum);
147 
148 	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
149 		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
150 		return (NULL);
151 	}
152 
153 	if (!(flags & STALE_OK)) {
154 		if (md_get_setstatus(setno) & MD_SET_STALE) {
155 			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
156 			return (NULL);
157 		}
158 	}
159 
160 	ui = MDI_UNIT(mnum);
161 	if (flags & NO_OLD) {
162 		if (ui != NULL) {
163 			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
164 			return (NULL);
165 		}
166 		return ((mr_unit_t *)1);
167 	}
168 
169 	if (ui == NULL) {
170 		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
171 		return (NULL);
172 	}
173 	if (flags & ARRAY_WRITER)
174 		md_array_writer(lock);
175 	else if (flags & ARRAY_READER)
176 		md_array_reader(lock);
177 
178 	if (!(flags & NO_LOCK)) {
179 		if (flags & WR_LOCK) {
180 			(void) md_ioctl_io_lock(lock, ui);
181 			(void) md_ioctl_writerlock(lock, ui);
182 		} else /* RD_LOCK */
183 			(void) md_ioctl_readerlock(lock, ui);
184 	}
185 	un = (mr_unit_t *)MD_UNIT(mnum);
186 
187 	if (un->c.un_type != MD_METARAID) {
188 		(void) mdmderror(mde, MDE_NOT_RAID, mnum);
189 		return (NULL);
190 	}
191 
192 	return (un);
193 }
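/*
 * Usage sketch (added commentary, not in the original source; the
 * handler name and parameter type are hypothetical).  Every ioctl
 * handler in this file validates and locks its unit through
 * raid_getun() before touching it, in the style of raid_get() below:
 *
 *	static int
 *	raid_example(md_i_get_t *p, IOLOCK *lock)
 *	{
 *		mr_unit_t	*un;
 *
 *		mdclrerror(&p->mde);
 *		if ((un = raid_getun(p->id, &p->mde, RD_LOCK, lock)) == NULL)
 *			return (0);	(error already recorded in p->mde)
 *		(examine un under the unit reader lock)
 *		return (0);	(locks are dropped by the ioctl framework)
 *	}
 */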
194 
195 
196 /*
197  * NAME:	raid_commit
198  * DESCRIPTION: commits MD database updates for a RAID metadevice
199  * PARAMETERS:	mr_unit_t	 *un - RAID unit to update in the MD database
200  *		mddb_recid_t *extras - array of other record IDs to update
201  *
202  * LOCKS:	assumes caller holds unit writer lock
203  *
204  */
205 void
206 raid_commit(mr_unit_t *un, mddb_recid_t *extras)
207 {
208 	mddb_recid_t	*recids;
209 	int 		ri = 0;
210 	int		nrecids = 0;
211 
212 	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
213 		return;
214 
215 	/* Count the extra recids */
216 	if (extras != NULL) {
217 		while (extras[nrecids] != 0) {
218 			nrecids++;
219 		}
220 	}
221 
222 	/*
223 	 * Allocate space for two recids in addition to the extras:
224 	 * one for the unit structure, one for the null terminator.
225 	 */
226 	nrecids += 2;
227 	recids = (mddb_recid_t *)
228 	    kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);
229 
230 	if (un != NULL) {
231 		ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
232 		recids[ri++] = un->c.un_record_id;
233 	}
234 
235 	if (extras != NULL) {
236 		while (*extras != 0) {
237 			recids[ri++] = *extras;
238 			extras++;
239 		}
240 	}
241 
242 	if (ri > 0) {
243 		mddb_commitrecs_wrapper(recids);
244 	}
245 
246 	kmem_free(recids, nrecids * sizeof (mddb_recid_t));
247 }
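/*
 * Usage sketch (added commentary, not part of the driver): the extras
 * argument is a 0-terminated array of additional record ids, exactly
 * as raid_replace() builds in extra_recids[]; pass NULL when only the
 * unit record needs committing.
 *
 *	mddb_recid_t	extras[3] = { 0, 0, 0 };
 *	int		n = 0;
 *
 *	extras[n++] = MD_RECID(comp_un);	(a component's record id)
 *	extras[n] = 0;				(terminator)
 *	raid_commit(un, extras);
 */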
248 
249 static int
250 raid_check_pw(mr_unit_t *un)
251 {
252 	buf_t		bp;
253 	char		*buf;
254 	mr_column_t	*colptr;
255 	minor_t		mnum = MD_SID(un);
256 	int		i;
257 	int		err = 0;
258 	minor_t		unit;
259 
260 	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);
261 
262 	for (i = 0; i < un->un_totalcolumncnt; i++) {
263 		md_dev64_t tmpdev;
264 
265 		colptr = &un->un_column[i];
266 
267 		tmpdev = colptr->un_dev;
268 		/*
269 		 * Open by device id
270 		 * If this device is hotspared
271 		 * use the hotspare key
272 		 */
273 		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
274 		    colptr->un_hs_key : colptr->un_orig_key);
275 		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
276 			colptr->un_dev = tmpdev;
277 			return (1);
278 		}
279 		colptr->un_dev = tmpdev;
280 
281 		bzero((caddr_t)&bp, sizeof (buf_t));
282 		bp.b_back = &bp;
283 		bp.b_forw = &bp;
284 		bp.b_flags = B_READ | B_BUSY;
285 		sema_init(&bp.b_io, 0, NULL,
286 		    SEMA_DEFAULT, NULL);
287 		sema_init(&bp.b_sem, 0, NULL,
288 		    SEMA_DEFAULT, NULL);
289 		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
290 		bp.b_lblkno = colptr->un_pwstart;
291 		bp.b_bcount = DEV_BSIZE;
292 		bp.b_bufsize = DEV_BSIZE;
293 		bp.b_un.b_addr = (caddr_t)buf;
294 		bp.b_offset = -1;
295 		(void) md_call_strategy(&bp, 0, NULL);
296 		if (biowait(&bp))
297 			err = 1;
298 		if (i == 0) {
299 			if (un->c.un_revision & MD_64BIT_META_DEV) {
300 				unit = ((raid_pwhdr_t *)buf)->rpw_unit;
301 			} else {
302 				unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
303 			}
304 		}
305 		/*
306 		 * Depending upon whether this is a 64-bit or a 32-bit
307 		 * RAID, the pre-write headers have a different layout.
308 		 */
309 		if (un->c.un_revision & MD_64BIT_META_DEV) {
310 			if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
311 			    (((raid_pwhdr_t *)buf)->rpw_unit != unit))
312 				err = 1;
313 		} else {
314 			if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
315 			    (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
316 				err = 1;
317 		}
318 		md_layered_close(colptr->un_dev, MD_OFLG_NULL);
319 		if (err)
320 			break;
321 	}
322 	kmem_free(buf, DEV_BSIZE);
323 	return (err);
324 }
325 
326 /*
327  * NAME:	init_col_nextio
328  * DESCRIPTION: normal backend process when zeroing column of a RAID metadevice.
329  * PARAMETERS:	raid_ci_t *cur - struct for column being zeroed
330  *
331  * LOCKS:	assumes caller holds unit reader lock,
332  *		periodically releases and reacquires unit reader lock,
333  *		broadcasts on unit conditional variable (un_cv)
334  *
335  */
336 #define	INIT_RLS_CNT	10
337 static void
338 init_col_nextio(raid_ci_t *cur)
339 {
340 	mr_unit_t	*un;
341 
342 	un = cur->ci_un;
343 
344 	cur->ci_blkno += cur->ci_zerosize;
345 
346 	mutex_enter(&un->un_mx);
347 	/* ===> update un_percent_done */
348 	un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
349 	mutex_exit(&un->un_mx);
350 
351 	/*
352 	 * When growing a device, normal I/O is still going on.
353 	 * The init thread still holds the unit reader lock which
354 	 * prevents I/O from doing state changes.
355 	 * So every INIT_RLS_CNT init I/Os, we will release the
356 	 * unit reader lock.
357 	 *
358 	 * CAVEAT:
359 	 * We know we are in the middle of a grow operation and the
360 	 * unit cannot be grown or removed (through reset or halt)
361 	 * so the mr_unit_t structure will not move or disappear.
362 	 * In addition, we know that only one of the init I/Os
363 	 * can be in col_init_nextio at a time because they are
364 	 * placed on the md_done_daemon queue and md only processes
365 	 * one element of this queue at a time. In addition, any
366 	 * code that needs to acquire the unit writer lock to change
367 	 * state is supposed to be on the md_mstr_daemon queue so
368 	 * it can be processing while we sit here waiting to get the
369 	 * unit reader lock back.
370 	 */
371 
372 	if (cur->ci_blkno < cur->ci_lastblk) {
373 		/* truncate last chunk to end_addr if needed */
374 		if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
375 			cur->ci_zerosize = (size_t)
376 			    (cur->ci_lastblk - cur->ci_blkno);
377 		}
378 
379 		/* set address and length for I/O bufs */
380 		cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
381 		cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
382 		cur->ci_buf.b_lblkno = cur->ci_blkno;
383 
384 		(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
385 		return;
386 	}
387 	/* finished initializing this column */
388 	mutex_enter(&un->un_mx);
389 	cur->ci_flag = COL_INIT_DONE;
390 	uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
391 	mutex_exit(&un->un_mx);
392 	cv_broadcast(&un->un_cv);
393 }
394 
395 /*
396  * NAME:	init_col_int
397  * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice.
398  * PARAMETERS:	buf_t	  *cb - I/O buffer for which interrupt occurred
399  *
400  * LOCKS:	assumes caller holds unit reader or writer lock
401  *
402  */
403 static int
404 init_col_int(buf_t *cb)
405 {
406 	raid_ci_t	*cur;
407 
408 	cur = (raid_ci_t *)cb->b_chain;
409 	if (cb->b_flags & B_ERROR) {
410 		mutex_enter(&cur->ci_un->un_mx);
411 		cur->ci_err = EIO;
412 		mutex_exit(&cur->ci_un->un_mx);
413 		cv_broadcast(&cur->ci_un->un_cv);
414 		return (1);
415 	}
416 	daemon_request(&md_done_daemon, init_col_nextio,
417 	    (daemon_queue_t *)cur, REQ_OLD);
418 	return (1);
419 }
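/*
 * Added commentary (not in the original source): the zeroing pipeline
 * runs from biodone context.  raid_init_columns() issues the first
 * write for each column; init_col_int() fires on completion and queues
 * init_col_nextio() on md_done_daemon, which advances ci_blkno by
 * ci_zerosize until ci_lastblk is reached, then marks the column
 * COL_INIT_DONE and wakes the initiating thread through un_cv.
 */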
420 
421 /*
422  * NAME:	raid_init_columns
423  * DESCRIPTION: Zero one or more columns of a RAID metadevice.
424  * PARAMETERS:	minor_t	 mnum - RAID unit minor identifier
425  *
426  * LOCKS:	obtains and releases unit reader lock,
427  *		obtains and releases unit writer lock,
428  *		obtains and releases md_unit_array_rw write lock,
429  *		obtains and releases unit mutex (un_mx) lock,
430  *		waits on unit conditional variable (un_cv)
431  *
432  */
433 static void
434 raid_init_columns(minor_t mnum)
435 {
436 	mr_unit_t	*un;
437 	mdi_unit_t	*ui;
438 	raid_ci_t	*ci_chain = NULL, *cur;
439 	rus_state_t	state;
440 	caddr_t		zero_addr;
441 	diskaddr_t	end_off;
442 	size_t		zerosize;
443 	int		err = 0;
444 	int		ix;
445 	int		colcnt = 0;
446 	int		col;
447 	set_t		setno = MD_MIN2SET(mnum);
448 
449 	/*
450 	 * Increment the raid resync count for cpr
451 	 */
452 	mutex_enter(&md_cpr_resync.md_resync_mutex);
453 	md_cpr_resync.md_raid_resync++;
454 	mutex_exit(&md_cpr_resync.md_resync_mutex);
455 
456 	/*
457 	 * Initialization is a multiple-step process.  The first step
458 	 * is to go through the unit structure and start each device
459 	 * in the init state, writing zeros over the component.
460 	 * Next, initialize the prewrite areas, so the device can be
461 	 * used if a metainit -k is done.  Now close the components.
462 	 *
463 	 * Once this is complete, set the state of each component being
464 	 * zeroed and set the correct state for the unit.
465 	 *
466 	 * Last, commit the records.
467 	 */
468 
469 	ui = MDI_UNIT(mnum);
470 	un = md_unit_readerlock(ui);
471 
472 	/* check for active init on this column */
473 	/* exiting is cpr safe */
474 	if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
475 		md_unit_readerexit(ui);
476 		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
477 		/*
478 		 * Decrement the raid resync count for cpr
479 		 */
480 		mutex_enter(&md_cpr_resync.md_resync_mutex);
481 		md_cpr_resync.md_raid_resync--;
482 		mutex_exit(&md_cpr_resync.md_resync_mutex);
483 		thread_exit();
484 	}
485 
486 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
487 	    MD_SID(un));
488 	un->un_init_colcnt = 0;
489 	un->un_init_iocnt = 0;
490 	end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
491 	zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);
492 
493 	/* allocate zero-filled buffer */
494 	zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);
495 
496 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
497 		if (un->un_column[ix].un_devstate != RCS_INIT)
498 			continue;
499 		/* allocate new column init structure */
500 		cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
501 		ASSERT(cur != NULL);
502 		un->un_init_colcnt++;
503 		cur->ci_next = ci_chain;
504 		ci_chain = cur;
505 		cur->ci_un = un;
506 		cur->ci_col = ix;
507 		cur->ci_err = 0;
508 		cur->ci_flag = COL_INITING;
509 		cur->ci_zerosize = zerosize;
510 		cur->ci_blkno = un->un_column[ix].un_pwstart;
511 		cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
512 		    + (un->un_segsize * un->un_segsincolumn);
513 		/* initialize static buf fields */
514 		cur->ci_buf.b_un.b_addr = zero_addr;
515 		cur->ci_buf.b_chain = (buf_t *)cur;
516 		cur->ci_buf.b_back = &cur->ci_buf;
517 		cur->ci_buf.b_forw = &cur->ci_buf;
518 		cur->ci_buf.b_iodone = init_col_int;
519 		cur->ci_buf.b_flags = B_BUSY | B_WRITE;
520 		cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
521 		sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
522 		sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
523 		/* set address and length for I/O bufs */
524 		cur->ci_buf.b_bufsize = dbtob(zerosize);
525 		cur->ci_buf.b_bcount = dbtob(zerosize);
526 		cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
527 		cur->ci_buf.b_offset = -1;
528 
529 		if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
530 			md_dev64_t tmpdev = un->un_column[ix].un_dev;
531 			/*
532 			 * Open by device id
533 			 * If this column is hotspared then
534 			 * use the hotspare key
535 			 */
536 			tmpdev = md_resolve_bydevid(mnum, tmpdev,
537 			    HOTSPARED(un, ix) ?
538 			    un->un_column[ix].un_hs_key :
539 			    un->un_column[ix].un_orig_key);
540 			if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
541 			    MD_OFLG_NULL)) == 0)
542 				un->un_column[ix].un_devflags |=
543 				    MD_RAID_DEV_ISOPEN;
544 			un->un_column[ix].un_dev = tmpdev;
545 		}
546 		if (cur->ci_err == 0)
547 			md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
548 	}
549 
550 	md_unit_readerexit(ui);
551 	state = un->un_state;
552 	colcnt = un->un_init_colcnt;
553 	mutex_enter(&un->un_mx);
554 	while (colcnt) {
555 		cv_wait(&un->un_cv, &un->un_mx);
556 
557 		colcnt = 0;
558 		for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
559 			col = cur->ci_col;
560 			if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
561 				if (cur->ci_err)
562 					err = cur->ci_err;
563 				else if (cur->ci_flag == COL_INIT_DONE) {
564 					(void) init_pw_area(un,
565 					    un->un_column[col].un_dev,
566 					    un->un_column[col].un_pwstart,
567 					    col);
568 					cur->ci_flag = COL_READY;
569 				}
570 			} else {
571 				colcnt++;
572 			}
573 		}
574 	}
575 	mutex_exit(&un->un_mx);
576 
577 	/* This prevents new opens */
578 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
579 	(void) md_io_writerlock(ui);
580 	un = (mr_unit_t *)md_unit_writerlock(ui);
581 	while (ci_chain) {
582 		cur = ci_chain;
583 
584 		/* take this element out of the chain */
585 		ci_chain = cur->ci_next;
586 		/* free this element */
587 		sema_destroy(&cur->ci_buf.b_io);
588 		sema_destroy(&cur->ci_buf.b_sem);
589 		if (cur->ci_err)
590 			raid_set_state(cur->ci_un, cur->ci_col,
591 			    RCS_INIT_ERRED, 0);
592 		else
593 			raid_set_state(cur->ci_un, cur->ci_col,
594 			    RCS_OKAY, 0);
595 		kmem_free(cur, sizeof (raid_ci_t));
596 	}
597 
598 	/* free the zeroed buffer */
599 	kmem_free(zero_addr, dbtob(zerosize));
600 
601 	/* determine new unit state */
602 	if (err == 0) {
603 		if (state == RUS_INIT)
604 			un->un_state = RUS_OKAY;
605 		else {
606 			un->c.un_total_blocks = un->un_grow_tb;
607 			md_nblocks_set(mnum, un->c.un_total_blocks);
608 			un->un_grow_tb = 0;
609 			if (raid_state_cnt(un, RCS_OKAY) ==
610 			    un->un_totalcolumncnt)
611 				un->un_state = RUS_OKAY;
612 		}
613 	} else {  /* error occurred */
614 		if (state & RUS_INIT)
615 			un->un_state = RUS_DOI;
616 	}
617 	uniqtime32(&un->un_timestamp);
618 	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
619 	un->un_init_colcnt = 0;
620 	un->un_init_iocnt = 0;
621 	raid_commit(un, NULL);
622 	md_unit_writerexit(ui);
623 	(void) md_io_writerexit(ui);
624 	rw_exit(&md_unit_array_rw.lock);
625 	if (err) {
626 		if (un->un_state & RUS_DOI) {
627 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
628 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
629 		} else {
630 			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
631 			    SVM_TAG_METADEVICE, setno, MD_SID(un));
632 		}
633 	} else {
634 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
635 		    SVM_TAG_METADEVICE, setno, MD_SID(un));
636 	}
637 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
638 	/*
639 	 * Decrement the raid resync count for cpr
640 	 */
641 	mutex_enter(&md_cpr_resync.md_resync_mutex);
642 	md_cpr_resync.md_raid_resync--;
643 	mutex_exit(&md_cpr_resync.md_resync_mutex);
644 	thread_exit();
645 	/*NOTREACHED*/
646 }
647 
648 static int
649 raid_init_unit(minor_t mnum, md_error_t *ep)
650 {
651 	mdi_unit_t	*ui;
652 	mr_unit_t	*un;
653 	int		rval, i;
654 	set_t		setno = MD_MIN2SET(mnum);
655 
656 	ui = MDI_UNIT(mnum);
657 	if (md_get_setstatus(setno) & MD_SET_STALE)
658 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
659 
660 	/* Don't start an init if the device is not available */
661 	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
662 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
663 	}
664 
665 	if (raid_internal_open(mnum, (FREAD | FWRITE),
666 	    OTYP_LYR, MD_OFLG_ISINIT)) {
667 		rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
668 		goto out;
669 	}
670 
671 	un = md_unit_readerlock(ui);
672 	un->un_percent_done = 0;
673 	md_unit_readerexit(ui);
674 	/* start resync_unit thread */
675 	(void) thread_create(NULL, 0, raid_init_columns,
676 	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
677 
678 	return (0);
679 
680 out:
681 	un = md_unit_writerlock(ui);
682 	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
683 	/* recover state */
684 	for (i = 0; i < un->un_totalcolumncnt; i++)
685 		if (COLUMN_STATE(un, i) == RCS_INIT)
686 			raid_set_state(un, i, RCS_ERRED, 0);
687 	if (un->un_state & RUS_INIT)
688 		un->un_state = RUS_DOI;
689 	raid_commit(un, NULL);
690 	md_unit_writerexit(ui);
691 	if (un->un_state & RUS_DOI) {
692 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
693 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
694 	} else {
695 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
696 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
697 	}
698 	return (rval);
699 }
700 
701 /*
702  * NAME:	regen_unit
703  *
704  * DESCRIPTION:	regenerate all the parity on the RAID device.  This
705  *		routine is the thread body started by raid_regen_unit()
706  *		to regenerate parity; if an I/O error occurs during
707  *		this process the entire device is placed in error.
708  *
709  * PARAMETERS:	minor_t mnum - RAID unit minor identifier
710  */
711 static void
712 regen_unit(minor_t mnum)
713 {
714 	mdi_unit_t	*ui = MDI_UNIT(mnum);
715 	mr_unit_t	*un = MD_UNIT(mnum);
716 	buf_t		buf, *bp;
717 	caddr_t		buffer;
718 	int		err = 0;
719 	diskaddr_t	total_segments;
720 	diskaddr_t	line;
721 	size_t		iosize;
722 
723 	/*
724 	 * Increment raid resync count for cpr
725 	 */
726 	mutex_enter(&md_cpr_resync.md_resync_mutex);
727 	md_cpr_resync.md_raid_resync++;
728 	mutex_exit(&md_cpr_resync.md_resync_mutex);
729 
730 	iosize = dbtob(un->un_segsize);
731 	buffer = kmem_alloc(iosize, KM_SLEEP);
732 	bp = &buf;
733 	total_segments = un->un_segsincolumn;
734 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
735 	    MD_UN2SET(un), MD_SID(un));
736 	un->un_percent_done = 0;
737 	init_buf(bp, B_READ | B_BUSY, iosize);
738 
739 	for (line = 0; line < total_segments; line++) {
740 		bp->b_lblkno = line *
741 		    ((un->un_origcolumncnt - 1) * un->un_segsize);
742 		bp->b_un.b_addr = buffer;
743 		bp->b_bcount = iosize;
744 		bp->b_iodone = NULL;
745 		/*
746 		 * The following assignment is only correct because
747 		 * md_raid_strategy is fine when it's only a minor number
748 		 * and not a real dev_t. Yuck.
749 		 */
750 		bp->b_edev = mnum;
751 		md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
752 		if (biowait(bp)) {
753 			err = 1;
754 			break;
755 		}
756 		un->un_percent_done = (uint_t)((line * 1000) /
757 		    un->un_segsincolumn);
758 		/* just to avoid rounding errors */
759 		if (un->un_percent_done > 1000)
760 			un->un_percent_done = 1000;
761 		reset_buf(bp, B_READ | B_BUSY, iosize);
762 	}
763 	destroy_buf(bp);
764 	kmem_free(buffer, iosize);
765 
766 	(void) md_io_writerlock(ui);
767 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
768 	(void) md_io_writerexit(ui);
769 	un = md_unit_writerlock(ui);
770 	if (!err &&
771 	    (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
772 			un->un_state = RUS_OKAY;
773 	raid_commit(un, NULL);
774 	md_unit_writerexit(ui);
775 	if (err ||
776 	    raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
777 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
778 		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
779 	} else {
780 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
781 		    MD_UN2SET(un), MD_SID(un));
782 	}
783 
784 	/*
785 	 * Decrement the raid resync count for cpr
786 	 */
787 	mutex_enter(&md_cpr_resync.md_resync_mutex);
788 	md_cpr_resync.md_raid_resync--;
789 	mutex_exit(&md_cpr_resync.md_resync_mutex);
790 	thread_exit();
791 }
792 
793 static int
794 raid_regen_unit(minor_t mnum, md_error_t *ep)
795 {
796 	mdi_unit_t	*ui;
797 	mr_unit_t	*un;
798 	int		i;
799 	set_t		setno = MD_MIN2SET(mnum);
800 
801 	ui = MDI_UNIT(mnum);
802 	un = (mr_unit_t *)MD_UNIT(mnum);
803 
804 	if (md_get_setstatus(setno) & MD_SET_STALE)
805 		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));
806 
807 	/* Don't start a regen if the device is not available */
808 	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
809 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
810 	}
811 
812 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
813 		(void) md_unit_writerlock(ui);
814 		for (i = 0; i < un->un_totalcolumncnt; i++)
815 			raid_set_state(un, i, RCS_ERRED, 0);
816 		md_unit_writerexit(ui);
817 		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
818 	}
819 
820 	/* start resync_unit thread */
821 	(void) thread_create(NULL, 0, regen_unit,
822 	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);
823 
824 	return (0);
825 }
826 
827 static int
828 raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
829 {
830 	minor_t		mnum = mrp->mnum;
831 	mr_unit_t	*un;
832 
833 	mdclrerror(&mrp->mde);
834 
835 	un = md_unit_readerlock(MDI_UNIT(mnum));
836 
837 	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
838 		md_unit_readerexit(MDI_UNIT(mnum));
839 		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
840 	}
841 
842 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
843 	    (raid_state_cnt(un, RCS_RESYNC))) {
844 		md_unit_readerexit(MDI_UNIT(mnum));
845 		return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
846 	}
847 
848 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
849 		md_unit_readerexit(MDI_UNIT(mnum));
850 		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
851 	}
852 
853 	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
854 	    (! (un->un_state & RUS_OKAY))) {
855 		md_unit_readerexit(MDI_UNIT(mnum));
856 		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
857 	}
858 
859 	md_unit_readerexit(MDI_UNIT(mnum));
860 
861 	/* get locks and recheck to be sure something did not change */
862 	if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
863 		return (0);
864 
865 	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
866 	    (! (un->un_state & RUS_OKAY))) {
867 		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
868 	}
869 
870 	raid_set_state(un, 0, RCS_REGEN, 0);
871 	raid_commit(un, NULL);
872 	md_ioctl_droplocks(lock);
873 	return (raid_regen_unit(mnum, &mrp->mde));
874 }
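/*
 * Locking note (added commentary): raid_regen() above, like
 * raid_replace() and raid_grow() below, follows a check/drop/recheck
 * pattern: unit state is screened under a plain reader lock, that lock
 * is dropped, and raid_getun(..., WRITERS, ...) then re-acquires the
 * ioctl locks and verifies the same conditions again, since the state
 * may have changed in the unlocked window.
 */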
875 
876 /*
877  * NAME:	raid_set
878  * DESCRIPTION: used to create a RAID metadevice
879  * PARAMETERS:	md_set_params_t *d   - pointer to set data structure
880  *		int		mode - must be FWRITE
881  *
882  * LOCKS:	none
883  *
884  */
885 static int
886 raid_set(void *d, int mode)
887 {
888 	minor_t		mnum;
889 	mr_unit_t	*un;
890 	mddb_recid_t	mr_recid;
891 	mddb_recid_t	*recids;
892 	mddb_type_t	typ1;
893 	int		err;
894 	set_t		setno;
895 	int		num_recs;
896 	int		rid;
897 	int		col;
898 	md_set_params_t	*msp = d;
899 
900 
901 	mnum = msp->mnum;
902 	setno = MD_MIN2SET(mnum);
903 
904 	mdclrerror(&msp->mde);
905 
906 	if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
907 		return (0);
908 
909 	typ1 = (mddb_type_t)md_getshared_key(setno,
910 	    raid_md_ops.md_driver.md_drivername);
911 
912 	/* create the db record for this mdstruct */
913 
914 	if (msp->options & MD_CRO_64BIT) {
915 #if defined(_ILP32)
916 		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
917 #else
918 		mr_recid = mddb_createrec(msp->size, typ1, 0,
919 		    MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
920 #endif
921 	} else {
922 		mr_recid = mddb_createrec(msp->size, typ1, 0,
923 		    MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
924 	}
925 
926 	if (mr_recid < 0)
927 		return (mddbstatus2error(&msp->mde,
928 		    (int)mr_recid, mnum, setno));
929 
930 	/* get the address of the mdstruct */
931 	un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
932 	/*
933 	 * It is okay that we muck with the mdstruct here,
934 	 * since no one else will know about the mdstruct
935 	 * until we commit it. If we crash, the record will
936 	 * be automatically purged, since we haven't
937 	 * committed it yet.
938 	 */
939 
940 	/* copy in the user's mdstruct */
941 	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
942 	    msp->size, mode)) {
943 		mddb_deleterec_wrapper(mr_recid);
944 		return (EFAULT);
945 	}
946 	/* All 64 bit metadevices only support EFI labels. */
947 	if (msp->options & MD_CRO_64BIT) {
948 		un->c.un_flag |= MD_EFILABEL;
949 	}
950 
951 	/*
952 	 * allocate the real recids array.  since we may have to commit
953 	 * underlying metadevice records, we need an array of size:
954 	 * total number of components in raid + 3 (1 for the raid itself,
955 	 * one for the hotspare, one for the end marker).
956 	 */
957 	num_recs = un->un_totalcolumncnt + 3;
958 	rid = 0;
959 	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
960 	recids[rid++] = mr_recid;
961 
962 	MD_SID(un) = mnum;
963 	MD_RECID(un) = recids[0];
964 	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
965 	MD_PARENT(un) = MD_NO_PARENT;
966 	un->un_resync_copysize = 0;
967 	un->c.un_revision |= MD_FN_META_DEV;
968 
969 	if (UNIT_STATE(un) == RUS_INIT)
970 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
971 
972 	if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
973 		mddb_deleterec_wrapper(mr_recid);
974 		err = mderror(&msp->mde, MDE_RAID_INVALID);
975 		goto out;
976 	}
977 
978 	if (err = raid_build_incore(un, 0)) {
979 		if (un->mr_ic) {
980 			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
981 			    un->un_totalcolumncnt);
982 			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
983 		}
984 
985 		md_nblocks_set(mnum, -1ULL);
986 		MD_UNIT(mnum) = NULL;
987 
988 		mddb_deleterec_wrapper(mr_recid);
989 		goto out;
990 	}
991 
992 	/*
993 	 * Update unit availability
994 	 */
995 	md_set[setno].s_un_avail--;
996 
997 	recids[rid] = 0;
998 	if (un->un_hsp_id != -1) {
999 		/* increment the reference count of the hot spare pool */
1000 		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
1001 		    &recids[rid], NULL, NULL, NULL);
1002 		if (err) {
1003 			md_nblocks_set(mnum, -1ULL);
1004 			MD_UNIT(mnum) = NULL;
1005 
1006 			mddb_deleterec_wrapper(mr_recid);
1007 			goto out;
1008 		}
1009 		rid++;
1010 	}
1011 
1012 	/*
1013 	 * set the parent on any metadevice components.
1014 	 * NOTE: currently soft partitions are the only metadevices
1015 	 * which can appear within a RAID metadevice.
1016 	 */
1017 	for (col = 0; col < un->un_totalcolumncnt; col++) {
1018 		mr_column_t	*mr_col = &un->un_column[col];
1019 		md_unit_t	*comp_un;
1020 
1021 		if (md_getmajor(mr_col->un_dev) == md_major) {
1022 			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1023 			recids[rid++] = MD_RECID(comp_un);
1024 			md_set_parent(mr_col->un_dev, MD_SID(un));
1025 		}
1026 	}
1027 
1028 	/* set the end marker */
1029 	recids[rid] = 0;
1030 
1031 	mddb_commitrecs_wrapper(recids);
1032 	md_create_unit_incore(mnum, &raid_md_ops, 1);
1033 
1034 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
1035 	    MD_SID(un));
1036 
1037 out:
1038 	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
1039 	if (err)
1040 		return (err);
1041 
1042 	/* only attempt to init a device that is in the init state */
1043 	if (UNIT_STATE(un) != RUS_INIT)
1044 		return (0);
1045 
1046 	return (raid_init_unit(mnum, &msp->mde));
1047 }
1048 
1049 /*
1050  * NAME:	raid_get
1051  * DESCRIPTION: used to get the unit structure of a RAID metadevice
1052  * PARAMETERS:	md_i_get_t   *migp - pointer to get data structure
1053  *		int	      mode - must be FREAD
1054  *		IOLOCK	     *lock - pointer to IOCTL lock
1055  *
1056  * LOCKS:	obtains unit reader lock via IOLOCK
1057  *
1058  */
1059 static int
1060 raid_get(
1061 	void		*migp,
1062 	int		mode,
1063 	IOLOCK		*lock
1064 )
1065 {
1066 	minor_t		mnum;
1067 	mr_unit_t	*un;
1068 	md_i_get_t	*migph = migp;
1069 
1070 
1071 	mnum = migph->id;
1072 
1073 	mdclrerror(&migph->mde);
1074 
1075 	if ((un = raid_getun(mnum, &migph->mde,
1076 	    RD_LOCK, lock)) == NULL)
1077 		return (0);
1078 
1079 	if (migph->size == 0) {
1080 		migph->size = un->c.un_size;
1081 		return (0);
1082 	}
1083 
1084 	if (migph->size < un->c.un_size) {
1085 		return (EFAULT);
1086 	}
1087 	if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
1088 	    un->c.un_size, mode))
1089 		return (EFAULT);
1090 
1091 	return (0);
1092 }
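/*
 * Userland usage sketch (added commentary; hypothetical caller, not
 * part of this file).  The size field implements a two-step query: a
 * first call with size == 0 asks for the required buffer length, a
 * second call with an allocated buffer retrieves the unit structure.
 *
 *	md_i_get_t	mig;
 *
 *	(void) memset(&mig, 0, sizeof (mig));
 *	mig.id = mnum;
 *	mig.size = 0;
 *	(first ioctl: driver sets mig.size to un->c.un_size)
 *	mig.mdp = (uintptr_t)malloc(mig.size);
 *	(second ioctl: driver copies the unit struct out to mig.mdp)
 */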
1093 
1094 
1095 /*
1096  * NAME:	raid_replace
1097  * DESCRIPTION: used to replace a component of a RAID metadevice
1098  * PARAMETERS:	replace_params_t *mrp - pointer to replace data structure
1099  *		IOLOCK	     *lock - pointer to IOCTL lock
1100  *
1101  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1102  *		obtains and releases md_unit_array_rw write lock
1103  *
1104  */
1105 static int
1106 raid_replace(
1107 	replace_params_t	*mrp,
1108 	IOLOCK			*lock
1109 )
1110 {
1111 	minor_t		mnum = mrp->mnum;
1112 	md_dev64_t	odev = mrp->old_dev;
1113 	md_error_t	*ep = &mrp->mde;
1114 	mr_unit_t	*un;
1115 	rcs_state_t	state;
1116 	int		ix, col = -1;
1117 	int		force = 0;
1118 	int		err = 0;
1119 	replace_cmd_t	cmd;
1120 	set_t		setno;
1121 	side_t		side;
1122 	mdkey_t		devkey;
1123 	int		nkeys;
1124 	mddb_recid_t	extra_recids[3] = { 0, 0, 0 };
1125 	int		extra_rids = 0;
1126 	md_error_t	mde = mdnullerror;
1127 	sv_dev_t	sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};
1128 
1129 	mdclrerror(ep);
1130 	setno = MD_MIN2SET(mnum);
1131 	side = mddb_getsidenum(setno);
1132 
1133 	un = md_unit_readerlock(MDI_UNIT(mnum));
1134 
1135 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1136 	    (raid_state_cnt(un, RCS_RESYNC) != 0)) {
1137 		md_unit_readerexit(MDI_UNIT(mnum));
1138 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1139 	}
1140 
1141 	if (un->un_state & RUS_DOI) {
1142 		md_unit_readerexit(MDI_UNIT(mnum));
1143 		return (mdmderror(ep, MDE_RAID_DOI, mnum));
1144 	}
1145 
1146 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1147 	    (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
1148 		md_unit_readerexit(MDI_UNIT(mnum));
1149 		return (mdmderror(ep, MDE_IN_USE, mnum));
1150 	}
1151 
1152 	md_unit_readerexit(MDI_UNIT(mnum));
1153 
1154 	/* get locks and recheck to be sure something did not change */
1155 	if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
1156 		return (0);
1157 
1158 	if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
1159 		return (mddeverror(ep, MDE_NAME_SPACE, odev));
1160 	}
1161 
1162 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1163 		md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
1164 		/*
1165 		 * Try to resolve devt again if NODEV64
1166 		 */
1167 		if (tmpdevt == NODEV64) {
1168 			tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
1169 			    un->un_column[ix].un_orig_key);
1170 			un->un_column[ix].un_orig_dev = tmpdevt;
1171 		}
1172 
1173 		if (un->un_column[ix].un_orig_dev == odev) {
1174 			col = ix;
1175 			break;
1176 		} else {
1177 			if (un->un_column[ix].un_orig_dev == NODEV64) {
1178 				/*
1179 				 * Now we use the keys to match.
1180 				 * If no key found, continue.
1181 				 */
1182 				if (nkeys == 0) {
1183 					continue;
1184 				}
1185 				if (un->un_column[ix].un_orig_key == devkey) {
1186 					if (nkeys > 1)
1187 						return (mddeverror(ep,
1188 						    MDE_MULTNM, odev));
1189 					col = ix;
1190 					break;
1191 				}
1192 			}
1193 		}
1194 	}
1195 
1196 	if (col == -1)
1197 		return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1198 		    mnum, odev));
1199 
1200 	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
1201 	    (raid_state_cnt(un, RCS_RESYNC) != 0))
1202 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1203 
1204 	if (un->un_state & RUS_DOI)
1205 		return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
1206 		    un->un_column[col].un_dev));
1207 
1208 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
1209 	    (MD_STATUS(un) & MD_UN_GROW_PENDING))
1210 		return (mdmderror(ep, MDE_IN_USE, mnum));
1211 
1212 	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
1213 		force = 1;
1214 	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
1215 		cmd = ENABLE_COMP;
1216 	if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
1217 		cmd = REPLACE_COMP;
1218 
1219 	if (un->un_state == RUS_LAST_ERRED) {
1220 		/* Must use -f force flag for unit in LAST_ERRED state */
1221 		if (!force)
1222 			return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));
1223 
1224 		/* Must use -f force flag on ERRED column first */
1225 		if (un->un_column[col].un_devstate != RCS_ERRED) {
1226 			for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1227 				if (un->un_column[ix].un_devstate & RCS_ERRED)
1228 					return (mdcomperror(ep,
1229 					    MDE_RAID_COMP_ERRED, mnum,
1230 					    un->un_column[ix].un_dev));
1231 			}
1232 		}
1233 
1234 		/* must use -f force flag on LAST_ERRED columns next */
1235 		if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
1236 		    (un->un_column[col].un_devstate != RCS_ERRED))
1237 			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1238 			    mnum, un->un_column[col].un_dev));
1239 	}
1240 
1241 	if (un->un_state == RUS_ERRED) {
1242 		if (! (un->un_column[col].un_devstate &
1243 		    (RCS_ERRED | RCS_INIT_ERRED)))
1244 			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
1245 			    mnum, un->un_column[col].un_dev));
1246 	}
1247 
1248 	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
1249 	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));
1250 
1251 	state = un->un_column[col].un_devstate;
1252 	if (state & RCS_INIT_ERRED) {
1253 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
1254 		un->un_percent_done = 0;
1255 		raid_set_state(un, col, RCS_INIT, 0);
1256 	} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
1257 	    resync_request(mnum, col, 0, ep))
1258 		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
1259 
1260 
1261 	if (cmd == REPLACE_COMP) {
1262 		md_dev64_t tmpdev = mrp->new_dev;
1263 
1264 		/*
1265 		 * open the device by device id
1266 		 */
1267 		tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
1268 		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
1269 			return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
1270 			    tmpdev));
1271 		}
1272 
1273 		/*
1274 		 * If it's a metadevice, make sure it gets reparented
1275 		 */
1276 		if (md_getmajor(tmpdev) == md_major) {
1277 			minor_t		new_mnum = md_getminor(tmpdev);
1278 			md_unit_t	*new_un = MD_UNIT(new_mnum);
1279 
1280 			md_set_parent(tmpdev, MD_SID(un));
1281 			extra_recids[extra_rids++] = MD_RECID(new_un);
1282 		}
1283 
1284 		mrp->new_dev = tmpdev;
1285 		un->un_column[col].un_orig_dev = tmpdev;
1286 		un->un_column[col].un_orig_key = mrp->new_key;
1287 		un->un_column[col].un_orig_pwstart = mrp->start_blk;
1288 		un->un_column[col].un_orig_devstart =
1289 		    mrp->start_blk + un->un_pwsize;
1290 
1291 		/*
1292 		 * If the old device was a metadevice, make sure to
1293 		 * reset its parent.
1294 		 */
1295 		if (md_getmajor(odev) == md_major) {
1296 			minor_t		old_mnum = md_getminor(odev);
1297 			md_unit_t	*old_un = MD_UNIT(old_mnum);
1298 
1299 			md_reset_parent(odev);
1300 			extra_recids[extra_rids++] =
1301 			    MD_RECID(old_un);
1302 		}
1303 
1304 		if (HOTSPARED(un, col)) {
1305 			md_layered_close(mrp->new_dev, MD_OFLG_NULL);
1306 			un->un_column[col].un_alt_dev = mrp->new_dev;
1307 			un->un_column[col].un_alt_pwstart = mrp->start_blk;
1308 			un->un_column[col].un_alt_devstart =
1309 			    mrp->start_blk + un->un_pwsize;
1310 			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1311 		} else {
1312 			/*
1313 			 * not hot spared.  Close the old device and
1314 			 * move the new device in.
1315 			 */
1316 			if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
1317 				md_layered_close(odev, MD_OFLG_NULL);
1318 			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1319 			un->un_column[col].un_dev = mrp->new_dev;
1320 			un->un_column[col].un_pwstart = mrp->start_blk;
1321 			un->un_column[col].un_devstart =
1322 			    mrp->start_blk + un->un_pwsize;
1323 			if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
1324 				un->un_column[col].un_devflags |=
1325 				    MD_RAID_REGEN_RESYNC;
1326 			}
1327 		}
1328 		/*
1329 		 * If the old device is not a metadevice then
1330 		 * save off the set number and key so that it
1331 		 * can be removed from the namespace later.
1332 		 */
1333 		if (md_getmajor(odev) != md_major) {
1334 			sv.setno = setno;
1335 			sv.key = devkey;
1336 		}
1337 	}
1338 
1339 	if (cmd == ENABLE_COMP) {
1340 		md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
1341 		mdkey_t raidkey =  un->un_column[col].un_orig_key;
1342 
1343 		/*
1344 		 * We trust the dev_t because we cannot determine the
1345 		 * dev_t from the device id, since a new disk is in the
1346 		 * same location. Since this is a call from metareplace -e dx
1347 		 * AND it is SCSI, a new dev_t is not generated.  So the
1348 		 * dev_t from the mddb is used. Before enabling the device
1349 		 * we check to make sure that multiple entries for the same
1350 		 * device do not exist in the namespace. If they do, we
1351 		 * fail the ioctl.
1352 		 * One of the many ways multiple entries in the namespace
1353 		 * can occur is if one removed the failed component of a
1354 		 * RAID metadevice and put in another disk that was part of
1355 		 * another metadevice. After reboot metadevadm would correctly
1356 		 * update the device name for the metadevice whose component
1357 		 * has moved. However, now in the metadb there are two entries
1358 		 * for the same name (ctds) that belong to different
1359 		 * metadevices. One is valid, the other is a ghost or "last
1360 		 * known as" ctds.
1361 		 */
1362 		tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
1363 		if (tmpdev == NODEV64)
1364 			tmpdev = md_getdevnum(setno, side, raidkey,
1365 			    MD_TRUST_DEVT);
1366 		/*
1367 		 * check for multiple entries in namespace for the
1368 		 * same dev
1369 		 */
1370 
1371 		if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
1372 		    &nkeys) != 0)
1373 			return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
1374 		/*
1375 		 * If the number of keys is greater than
1376 		 * 1, then we have an invalid
1377 		 * namespace. STOP and return.
1378 		 */
1379 		if (nkeys > 1)
1380 			return (mddeverror(ep, MDE_MULTNM, tmpdev));
1381 		if (devkey != raidkey)
1382 			return (mdcomperror(ep, MDE_CANT_FIND_COMP,
1383 			    mnum, tmpdev));
1384 
1385 		if (un->un_column[col].un_orig_dev == NODEV64)
1386 			un->un_column[col].un_orig_dev = tmpdev;
1387 
1388 		if (HOTSPARED(un, col)) {
1389 			un->un_column[col].un_alt_dev =
1390 			    un->un_column[col].un_orig_dev;
1391 			un->un_column[col].un_alt_pwstart =
1392 			    un->un_column[col].un_orig_pwstart;
1393 			un->un_column[col].un_alt_devstart =
1394 			    un->un_column[col].un_orig_devstart;
1395 			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
1396 		} else {
1397 			if (!(un->un_column[col].un_devflags &
1398 			    MD_RAID_DEV_ISOPEN)) {
1399 				if (md_layered_open(mnum, &tmpdev,
1400 				    MD_OFLG_NULL)) {
1401 					un->un_column[col].un_dev = tmpdev;
1402 					return (mdcomperror(ep,
1403 					    MDE_COMP_OPEN_ERR, mnum, tmpdev));
1404 				}
1405 				ASSERT(tmpdev != NODEV64 &&
1406 				    tmpdev != 0);
1407 
1408 				if ((md_getmajor(tmpdev) != md_major) &&
1409 				    (md_devid_found(setno, side, raidkey)
1410 				    == 1)) {
1411 					if (md_update_namespace_did(setno, side,
1412 					    raidkey, &mde) != 0) {
1413 						cmn_err(CE_WARN,
1414 						    "md: could not"
1415 						    " update namespace\n");
1416 					}
1417 				}
1418 				un->un_column[col].un_dev =
1419 				    un->un_column[col].un_orig_dev;
1420 			}
1421 			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
1422 			un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
1423 		}
1424 	}
1425 	if (mrp->has_label) {
1426 		un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
1427 	} else {
1428 		un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
1429 	}
1430 
1431 	raid_commit(un, extra_recids);
1432 
1433 	/* If the component has been replaced - clean up the name space */
1434 	if (sv.setno != MD_SET_BAD) {
1435 		md_rem_names(&sv, 1);
1436 	}
1437 
1438 	md_ioctl_droplocks(lock);
1439 
1440 	if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
1441 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
1442 		    setno, MD_SID(un));
1443 	} else {
1444 		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
1445 		    setno, MD_SID(un));
1446 	}
1447 
1448 	if (un->un_column[col].un_devstate & RCS_INIT)
1449 		err = raid_init_unit(mnum, ep);
1450 	else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
1451 		err = raid_resync_unit(mnum, ep);
1452 
1453 	mdclrerror(ep);
1454 	if (!err)
1455 		return (0);
1456 
1457 	/* be sure state */
1458 	/* is already set by this time */
1459 	/* fix state  and commit record */
1460 	un = md_unit_writerlock(MDI_UNIT(mnum));
1461 	if (state & RCS_INIT_ERRED)
1462 		raid_set_state(un, col, state, 1);
1463 	else if (state & RCS_OKAY)
1464 		raid_set_state(un, col, RCS_ERRED, 0);
1465 	else
1466 		raid_set_state(un, col, state, 1);
1467 	raid_commit(un, NULL);
1468 	md_unit_writerexit(MDI_UNIT(mnum));
1469 	mdclrerror(ep);
1470 	return (0);
1471 }
1472 
1473 
1474 /*
1475  * NAME:	raid_set_sync
1476  * DESCRIPTION: used to sync a component of a RAID metadevice
1477  * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data structure
1478  *		int	      mode - must be FWRITE
1479  *		IOLOCK	     *lock - pointer to IOCTL lock
1480  *
1481  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1482  *		obtains and releases md_unit_array_rw write lock
1483  *
1484  */
1485 static int
1486 raid_set_sync(
1487 	md_resync_ioctl_t	*rip,
1488 	IOLOCK			*lock
1489 )
1490 {
1491 	minor_t			mnum = rip->ri_mnum;
1492 	mr_unit_t		*un;
1493 	int			init = 0;
1494 	int			resync = 0;
1495 	int			regen = 0;
1496 	int			ix;
1497 	int			err;
1498 
1499 	mdclrerror(&rip->mde);
1500 
1501 	if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
1502 		return (0);
1503 
1504 	if (un->un_state & RUS_DOI)
1505 		return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));
1506 
1507 	if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
1508 		return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));
1509 
1510 	/* This prevents new opens */
1511 
1512 	rip->ri_flags = 0;
1513 	if (un->un_state & RUS_REGEN)
1514 		regen++;
1515 
1516 	if (raid_state_cnt(un, RCS_RESYNC))
1517 		resync++;
1518 
1519 	if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
1520 		init++;
1521 
1522 	ASSERT(!(resync && init && regen));
1523 	md_ioctl_droplocks(lock);
1524 	rip->ri_percent_done = 0;
1525 
1526 	if (init) {
1527 		MD_STATUS(un) |= MD_UN_GROW_PENDING;
1528 		return (raid_init_unit(mnum, &rip->mde));
1529 	}
1530 
1531 	/*
1532 	 * If resync is needed, it will call raid_internal_open forcing
1533 	 * replay before the open completes.
1534 	 * Otherwise, call raid_internal_open directly to force
1535 	 * replay to complete during boot (metasync -r).
1536 	 * NOTE: the unit writer lock must remain held while setting
1537 	 *	 MD_UN_RESYNC_ACTIVE but must be released before
1538 	 *	 calling raid_resync_unit or raid_internal_open.
1539 	 */
1540 	if (resync) {
1541 		ASSERT(resync < 2);
1542 		un = md_unit_writerlock(MDI_UNIT(mnum));
1543 		MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
1544 		/* Must release unit writer lock for resync */
1545 		/*
1546 		 * correctly setup the devices before trying to start the
1547 		 * resync operation.
1548 		 */
1549 		for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1550 			if (un->un_column[ix].un_devstate & RCS_RESYNC) {
1551 				if ((un->un_column[ix].un_devflags &
1552 				    MD_RAID_COPY_RESYNC) &&
1553 				    HOTSPARED(un, ix)) {
1554 					un->un_column[ix].un_alt_dev =
1555 					    un->un_column[ix].un_orig_dev;
1556 					un->un_column[ix].un_alt_devstart =
1557 					    un->un_column[ix].un_orig_devstart;
1558 					un->un_column[ix].un_alt_pwstart =
1559 					    un->un_column[ix].un_orig_pwstart;
1560 				}
1561 				break;
1562 			}
1563 		}
1564 		ASSERT(un->un_column[ix].un_devflags &
1565 		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
1566 		rip->ri_percent_done = 0;
1567 		un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
1568 		(void) resync_request(mnum, ix, 0, NULL);
1569 		md_unit_writerexit(MDI_UNIT(mnum));
1570 		err = raid_resync_unit(mnum, &rip->mde);
1571 		return (err);
1572 	}
1573 
1574 	if (regen) {
1575 		err = raid_regen_unit(mnum, &rip->mde);
1576 		return (err);
1577 	}
1578 
1579 	/* The unit requires no work, so just force replay of the device */
1580 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
1581 		return (mdmderror(&rip->mde,
1582 		    MDE_RAID_OPEN_FAILURE, mnum));
1583 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1584 
1585 	return (0);
1586 }
1587 
1588 /*
1589  * NAME:	raid_get_resync
1590  * DESCRIPTION: used to check resync status on a component of a RAID metadevice
1591  * PARAMETERS:	md_resync_ioctl_t *rip - pointer to resync data structure
1592  *		int	      mode - must be FWRITE
1593  *		IOLOCK	     *lock - pointer to IOCTL lock
1594  *
1595  * LOCKS:	none
1596  *
1597  */
1598 static int
1599 raid_get_resync(
1600 	md_resync_ioctl_t	*rip,
1601 	IOLOCK			*lock
1602 )
1603 {
1604 	minor_t			mnum = rip->ri_mnum;
1605 	mr_unit_t		*un;
1606 	u_longlong_t		percent;
1607 	int			cnt;
1608 	int			ix;
1609 	uint64_t		d;
1610 
1611 	mdclrerror(&rip->mde);
1612 
1613 	if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
1614 		return (0);
1615 
1616 	rip->ri_flags = 0;
1617 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1618 		d = un->un_segsincolumn;
1619 		percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
1620 		if (percent > 1000)
1621 			percent = 1000;	/* can't go over 100% */
1622 		rip->ri_percent_done = (int)percent;
1623 		rip->ri_flags |= MD_RI_INPROGRESS;
1624 	}
1625 
1626 	if (UNIT_STATE(un) & RUS_INIT) {
1627 		d = un->un_segsize * un->un_segsincolumn *
1628 		    un->un_totalcolumncnt;
1629 		percent =
1630 		    d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
1631 		if (percent > 1000)
1632 			percent = 1000;	/* can't go over 100% */
1633 		rip->ri_percent_done = (int)percent;
1634 		rip->ri_flags |= MD_GROW_INPROGRESS;
1635 	} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1636 		d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
1637 		percent =
1638 		    d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
1639 		if (percent > 1000)
1640 			percent = 1000;
1641 		rip->ri_percent_done = (int)percent;
1642 		rip->ri_flags |= MD_GROW_INPROGRESS;
1643 	}
1644 
1645 	if (un->un_state & RUS_REGEN)
1646 		rip->ri_percent_done = un->un_percent_done;
1647 
1648 	cnt = 0;
1649 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
1650 		switch (un->un_column[ix].un_devstate) {
1651 		case RCS_INIT:
1652 		case RCS_ERRED:
1653 		case RCS_LAST_ERRED:
1654 			cnt++;
1655 			break;
1656 		default:
1657 			break;
1658 		}
1659 	}
1660 	d = un->un_totalcolumncnt;
1661 	rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
1662 	return (0);
1663 }
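/*
 * Arithmetic note (added commentary): progress above is reported in
 * tenths of a percent using 64-bit integer math, clamped at 1000, e.g.
 * for an active resync:
 *
 *	percent = (1000 * un->un_resync_line_index) / un->un_segsincolumn;
 *	if (percent > 1000)
 *		percent = 1000;	(never report more than 100%)
 *
 * ri_percent_dirty is analogous but scaled to whole percents (x 100).
 */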
1664 
1665 /*
1666  * NAME:	raid_grow
1667  * DESCRIPTION: Concatenate to a RAID metadevice
1668  * PARAMETERS:	md_grow_params_t *mgp
1669  *			      - pointer to IOCGROW data structure
1670  *		int	 mode - must be FWRITE
1671  *		IOLOCK *lockp - IOCTL read/write and unit_array_rw lock
1672  *
1673  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun),
1674  *		obtains and releases md_unit_array_rw write lock
1675  *
1676  */
1677 static int
1678 raid_grow(void *mgp, int mode, IOLOCK *lock)
1679 {
1680 	minor_t		mnum;
1681 	mr_unit_t	*un, *new_un;
1682 	mdi_unit_t	*ui;
1683 	mddb_type_t	typ1;
1684 	mddb_recid_t	mr_recid;
1685 	mddb_recid_t	old_vtoc = 0;
1686 	mddb_recid_t	*recids;
1687 	md_create_rec_option_t options;
1688 	int		err;
1689 	int		col, i;
1690 	int64_t		tb, atb;
1691 	u_longlong_t	unrev;
1692 	int		tc;
1693 	int		rval = 0;
1694 	set_t		setno;
1695 	mr_column_ic_t	*mrc;
1696 	int		num_recs, rid;
1697 	md_grow_params_t	*mgph = mgp;
1698 
1699 
1700 	mnum = mgph->mnum;
1701 
1702 	mdclrerror(&mgph->mde);
1703 
1704 	ui = MDI_UNIT(mnum);
1705 	un = md_unit_readerlock(ui);
1706 
1707 	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
1708 		md_unit_readerexit(ui);
1709 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1710 	}
1711 
1712 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
1713 		md_unit_readerexit(ui);
1714 		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
1715 	}
1716 
1717 	if (UNIT_STATE(un) & RUS_LAST_ERRED) {
1718 		md_unit_readerexit(ui);
1719 		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
1720 	}
1721 
1722 	if (UNIT_STATE(un) & RUS_DOI) {
1723 		md_unit_readerexit(ui);
1724 		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1725 	}
1726 
1727 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
1728 		md_unit_readerexit(ui);
1729 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1730 	}
1731 
1732 	md_unit_readerexit(ui);
1733 
1734 	if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
1735 	    NULL)
1736 		return (0);
1737 
1738 	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
1739 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1740 
1741 	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
1742 		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
1743 
1744 	if (un->c.un_size >= mgph->size)
1745 		return (EINVAL);
1746 
1747 	if (UNIT_STATE(un) & RUS_LAST_ERRED)
1748 		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
1749 
1750 	if (UNIT_STATE(un) & RUS_DOI)
1751 		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
1752 
1753 	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
1754 		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
1755 
1756 	setno = MD_MIN2SET(mnum);
1757 
1758 	typ1 = (mddb_type_t)md_getshared_key(setno,
1759 	    raid_md_ops.md_driver.md_drivername);
1760 
1761 	/*
1762 	 * Preserve the friendly name nature of the device that is
1763 	 * growing.
1764 	 */
1765 	options = MD_CRO_RAID;
1766 	if (un->c.un_revision & MD_FN_META_DEV)
1767 		options |= MD_CRO_FN;
1768 	if (mgph->options & MD_CRO_64BIT) {
1769 #if defined(_ILP32)
1770 		return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
1771 #else
1772 		mr_recid = mddb_createrec(mgph->size, typ1, 0,
1773 		    MD_CRO_64BIT | options, setno);
1774 #endif
1775 	} else {
1776 		mr_recid = mddb_createrec(mgph->size, typ1, 0,
1777 		    MD_CRO_32BIT | options, setno);
1778 	}
1779 	if (mr_recid < 0) {
1780 		rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
1781 		    mnum, setno);
1782 		return (rval);
1783 	}
1784 
1785 	/* get the address of the new unit */
1786 	new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
1787 
1788 	/*
1789 	 * It is okay that we muck with the new unit here,
1790 	 * since no one else will know about the unit struct
1791 	 * until we commit it. If we crash, the record will
1792 	 * be automatically purged, since we haven't
1793 	 * committed it yet and the old unit struct will be found.
1794 	 */
1795 
1796 	/* copy in the user's unit struct */
1797 	err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
1798 	    mgph->size, mode);
1799 	if (err) {
1800 		mddb_deleterec_wrapper(mr_recid);
1801 		return (EFAULT);
1802 	}
1803 
1804 	/* make sure columns are being added */
1805 	if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
1806 		mddb_deleterec_wrapper(mr_recid);
1807 		return (EINVAL);
1808 	}
1809 
1810 	/*
1811 	 * Save a few of the new unit struct's fields
1812 	 * before they get clobbered by the bcopy below.
1813 	 */
1814 	tc = new_un->un_totalcolumncnt;
1815 	tb = new_un->c.un_total_blocks;
1816 	atb = new_un->c.un_actual_tb;
1817 	unrev = new_un->c.un_revision;
1818 
1819 	/*
1820 	 * Copy the old unit struct (static stuff)
1821 	 * into new unit struct
1822 	 */
1823 	bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);
1824 
1825 	/*
1826 	 * Restore a few of the new unit struct values.
1827 	 */
1828 	new_un->un_totalcolumncnt = tc;
1829 	new_un->c.un_actual_tb = atb;
1830 	new_un->un_grow_tb = tb;
1831 	new_un->c.un_revision = unrev;
1832 	new_un->c.un_record_id = mr_recid;
1833 	new_un->c.un_size = mgph->size;
1834 
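	/*
	 * The bcopy above copied the old unit's mr_ic pointer, so at
	 * this point the new unit shares the old unit's in-core data.
	 */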
1835 	ASSERT(new_un->mr_ic == un->mr_ic);
1836 
1837 	/*
1838 	 * Save old column slots
1839 	 */
1840 	mrc = un->un_column_ic;
1841 
1842 	/*
1843 	 * Allocate new column slot
1844 	 */
1845 	new_un->un_column_ic = (mr_column_ic_t *)
1846 	    kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
1847 	    KM_SLEEP);
1848 
1849 	/*
1850 	 * Restore old column slots
1851 	 * Free the old column slots
1852 	 */
1853 	bcopy(mrc, new_un->un_column_ic,
1854 	    sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1855 	kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1856 
1857 	/* All 64-bit metadevices support only EFI labels. */
1858 	if (mgph->options & MD_CRO_64BIT) {
1859 		new_un->c.un_flag |= MD_EFILABEL;
1860 		/*
1861 		 * If the device was previously smaller than a terabyte,
1862 		 * and had a vtoc record attached to it, we remove the
1863 		 * vtoc record, because the layout has changed completely.
1864 		 */
1865 		if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
1866 		    (un->c.un_vtoc_id != 0)) {
1867 			old_vtoc = un->c.un_vtoc_id;
1868 			new_un->c.un_vtoc_id =
1869 			    md_vtoc_to_efi_record(old_vtoc, setno);
1870 		}
1871 	}
1872 
1873 
1874 	/*
1875 	 * allocate the real recids array.  since we may have to commit
1876 	 * underlying metadevice records, we need an array of size:
1877 	 * total number of new components being attached + 2 (one for the
1878 	 * raid itself, one for the end marker).
1879 	 */
1880 	num_recs = new_un->un_totalcolumncnt + 2;
1881 	rid = 0;
1882 	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
1883 	recids[rid++] = mr_recid;
1884 
1885 	for (col = un->un_totalcolumncnt;
1886 	    (col < new_un->un_totalcolumncnt); col++) {
1887 		mr_column_t	*mr_col = &new_un->un_column[col];
1888 		md_unit_t	*comp_un;
1889 
1890 		if (raid_build_pw_reservation(new_un, col) != 0) {
1891 			/* release pwslots already allocated by grow */
1892 			for (i = un->un_totalcolumncnt; i < col; i++) {
1893 				raid_free_pw_reservation(new_un, i);
1894 			}
1895 			kmem_free(new_un->un_column_ic,
1896 			    sizeof (mr_column_ic_t) *
1897 			    new_un->un_totalcolumncnt);
1898 			kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
1899 			kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1900 			mddb_deleterec_wrapper(mr_recid);
1901 			return (EINVAL);
1902 		}
1903 		/*
1904 		 * set parent on metadevices being added.
1905 		 * NOTE: currently soft partitions are the only metadevices
1906 		 * which can appear within a RAID metadevice.
1907 		 */
1908 		if (md_getmajor(mr_col->un_dev) == md_major) {
1909 			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1910 			recids[rid++] = MD_RECID(comp_un);
1911 			md_set_parent(mr_col->un_dev, MD_SID(new_un));
1912 		}
1913 		new_un->un_column[col].un_devflags = 0;
1914 	}
1915 
1916 	/* set end marker */
1917 	recids[rid] = 0;
1918 
1919 	/* commit new unit struct */
1920 	mddb_commitrecs_wrapper(recids);
1921 
1922 	/* delete old unit struct */
1923 	mddb_deleterec_wrapper(un->c.un_record_id);
1924 
1925 	/* place new unit in in-core array */
1926 	md_nblocks_set(mnum, new_un->c.un_total_blocks);
1927 	MD_UNIT(mnum) = new_un;
1928 
1929 	/*
1930 	 * If old_vtoc has a non-zero value, we know:
1931 	 * - This unit grew from below one terabyte to one terabyte or more,
1932 	 * - There was a vtoc record for the unit,
1933 	 * - This vtoc record is no longer needed, because
1934 	 *   a new efi record has been created for this un.
1935 	 */
1936 	if (old_vtoc != 0) {
1937 		mddb_deleterec_wrapper(old_vtoc);
1938 	}
1939 
1940 	/* free recids */
1941 	kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1942 
1943 	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1944 	    MD_UN2SET(new_un), MD_SID(new_un));
1945 	MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
1946 
1947 	/*
1948 	 * Since the md_ioctl_writelock acquires the unit write lock
1949 	 * and open/close acquires the unit reader lock, it is necessary
1950 	 * to drop the unit write lock and then reacquire it as needed
1951 	 * later.
1952 	 */
1953 	md_unit_writerexit(ui);
1954 
1955 	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
1956 		rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
1957 		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
1958 		    MD_UN2SET(new_un), MD_SID(new_un));
1959 		return (rval);
1960 	}
1961 	(void) md_unit_writerlock(ui);
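	/*
	 * With the unit open, initialize the pre-write area of every
	 * column that is in the OKAY state.
	 */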
1962 	for (i = 0; i < new_un->un_totalcolumncnt; i++) {
1963 		if (new_un->un_column[i].un_devstate & RCS_OKAY)
1964 			(void) init_pw_area(new_un, new_un->un_column[i].un_dev,
1965 			    new_un->un_column[i].un_pwstart, i);
1966 	}
1967 	md_unit_writerexit(ui);
1968 	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1969 	(void) md_unit_writerlock(ui);
1970 	/* create a background thread to initialize the columns */
1971 	md_ioctl_droplocks(lock);
1972 
1973 	return (raid_init_unit(mnum, &mgph->mde));
1974 }
1975 
1976 /*
1977  * NAME:	raid_reset
1978  * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
1979  * PARAMETERS:	md_i_reset_t *mirp - pointer to reset data structure
1980  *
1981  * LOCKS:	obtains and releases md_unit_array_rw write lock
1982  *
1983  */
1984 static int
1985 raid_reset(md_i_reset_t	*mirp)
1986 {
1987 	minor_t		mnum = mirp->mnum;
1988 	mr_unit_t	*un;
1989 	mdi_unit_t	*ui;
1990 	set_t		setno = MD_MIN2SET(mnum);
1991 
1992 	mdclrerror(&mirp->mde);
1993 
1994 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1995 	/*
1996 	 * NOTE: need to get md_unit_writerlock to avoid conflict
1997 	 * with raid_init thread.
1998 	 */
1999 	if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
2000 	    NULL) {
2001 		rw_exit(&md_unit_array_rw.lock);
2002 		return (0);
2003 	}
2004 	ui = MDI_UNIT(mnum);
2005 
2006 	if (MD_HAS_PARENT(MD_PARENT(un))) {
2007 		rw_exit(&md_unit_array_rw.lock);
2008 		return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
2009 	}
2010 
2011 	un = (mr_unit_t *)md_unit_openclose_enter(ui);
2012 	if (md_unit_isopen(MDI_UNIT(mnum))) {
2013 		md_unit_openclose_exit(ui);
2014 		rw_exit(&md_unit_array_rw.lock);
2015 		return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
2016 	}
2017 	md_unit_openclose_exit(ui);
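	/* a unit that is not in the OKAY state may only be cleared by force */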
2018 	if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
2019 		rw_exit(&md_unit_array_rw.lock);
2020 		return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
2021 	}
2022 
2023 	reset_raid(un, mnum, 1);
2024 
2025 	/*
2026 	 * Update unit availability
2027 	 */
2028 	md_set[setno].s_un_avail++;
2029 
2030 	/*
2031 	 * If MN set, reset s_un_next so all nodes can have
2032 	 * the same view of the next available slot when
2033 	 * nodes are withdrawn (-w) and rejoined (-j).
2034 	 */
2035 	if (MD_MNSET_SETNO(setno)) {
2036 		(void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
2037 	}
2038 
2039 	rw_exit(&md_unit_array_rw.lock);
2040 
2041 	return (0);
2042 }
2043 
2044 /*
2045  * NAME:	raid_get_geom
2046  * DESCRIPTION: used to get the geometry of a RAID metadevice
2047  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the geometry for
2048  *		struct dk_geom *gp - pointer to geometry data structure
2049  *
2050  * LOCKS:	none
2051  *
2052  */
2053 static int
2054 raid_get_geom(
2055 	mr_unit_t	*un,
2056 	struct dk_geom	*geomp
2057 )
2058 {
2059 	md_get_geom((md_unit_t *)un, geomp);
2060 
2061 	return (0);
2062 }
2063 
2064 /*
2065  * NAME:	raid_get_vtoc
2066  * DESCRIPTION: used to get the VTOC on a RAID metadevice
2067  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the VTOC from
2068  *		struct vtoc *vtocp - pointer to VTOC data structure
2069  *
2070  * LOCKS:	none
2071  *
2072  */
2073 static int
2074 raid_get_vtoc(
2075 	mr_unit_t	*un,
2076 	struct vtoc	*vtocp
2077 )
2078 {
2079 	md_get_vtoc((md_unit_t *)un, vtocp);
2080 
2081 	return (0);
2082 }
2083 
2084 /*
2085  * NAME:	raid_set_vtoc
2086  * DESCRIPTION: used to set the VTOC on a RAID metadevice
2087  * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
2088  *		struct vtoc *vtocp - pointer to VTOC data structure
2089  *
2090  * LOCKS:	none
2091  *
2092  */
2093 static int
2094 raid_set_vtoc(
2095 	mr_unit_t	*un,
2096 	struct vtoc	*vtocp
2097 )
2098 {
2099 	return (md_set_vtoc((md_unit_t *)un, vtocp));
2100 }
2101 
2102 
2103 /*
2104  * NAME:	raid_get_extvtoc
2105  * DESCRIPTION: used to get the extended VTOC on a RAID metadevice
2106  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the VTOC from
2107  *		struct extvtoc *vtocp - pointer to extended VTOC data structure
2108  *
2109  * LOCKS:	none
2110  *
2111  */
2112 static int
2113 raid_get_extvtoc(
2114 	mr_unit_t	*un,
2115 	struct extvtoc	*vtocp
2116 )
2117 {
2118 	md_get_extvtoc((md_unit_t *)un, vtocp);
2119 
2120 	return (0);
2121 }
2122 
2123 /*
2124  * NAME:	raid_set_extvtoc
2125  * DESCRIPTION: used to set the extended VTOC on a RAID metadevice
2126  * PARAMETERS:	mr_unit_t    *un - RAID unit to set the VTOC on
2127  *		struct extvtoc *vtocp - pointer to extended VTOC data structure
2128  *
2129  * LOCKS:	none
2130  *
2131  */
2132 static int
2133 raid_set_extvtoc(
2134 	mr_unit_t	*un,
2135 	struct extvtoc	*vtocp
2136 )
2137 {
2138 	return (md_set_extvtoc((md_unit_t *)un, vtocp));
2139 }
2140 
2141 
2142 
2143 /*
2144  * NAME:	raid_get_cgapart
2145  * DESCRIPTION: used to get the dk_map on a RAID metadevice
2146  * PARAMETERS:	mr_unit_t    *un - RAID unit to get the dk_map from
2147  *		struct dk_map *dkmapp - pointer to dk_map data structure
2148  *
2149  * LOCKS:	none
2150  *
2151  */
2152 
2153 static int
2154 raid_get_cgapart(
2155 	mr_unit_t	*un,
2156 	struct dk_map	*dkmapp
2157 )
2158 {
2159 	md_get_cgapart((md_unit_t *)un, dkmapp);
2160 	return (0);
2161 }
2162 
2163 /*
2164  * NAME:	raid_getdevs
2165  * DESCRIPTION: return all devices within a RAID metadevice
2166  * PARAMETERS:	md_getdevs_params_t *mgdp
2167  *			      - pointer to getdevs IOCTL data structure
2168  *		int	 mode - should be FREAD
2169  *		IOLOCK	*lock - IOCTL read/write lock
2170  *
2171  * LOCKS:	obtains unit reader lock via IOLOCK
2172  *
2173  */
2174 static int
2175 raid_getdevs(
2176 	void			*mgdp,
2177 	int			mode,
2178 	IOLOCK			*lock
2179 )
2180 {
2181 	minor_t			mnum;
2182 	mr_unit_t		*un;
2183 	md_dev64_t		*udevs;
2184 	int			i, cnt;
2185 	md_dev64_t		unit_dev;
2186 	md_getdevs_params_t	*mgdph = mgdp;
2187 
2188 
2189 	mnum = mgdph->mnum;
2190 
2191 	/* check out unit */
2192 	mdclrerror(&mgdph->mde);
2193 
2194 	if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
2195 		return (0);
2196 
2197 	udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
2198 
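	/*
	 * Each column contributes its original device and, when it is
	 * hot-spared, the active hot spare device as well; cnt can
	 * therefore exceed the column count.
	 */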
2199 	for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
2200 		if (cnt < mgdph->cnt) {
2201 			unit_dev = un->un_column[i].un_orig_dev;
2202 			if (md_getmajor(unit_dev) != md_major) {
2203 				if ((unit_dev = md_xlate_mini_2_targ
2204 				    (unit_dev)) == NODEV64)
2205 					return (ENODEV);
2206 			}
2207 
2208 			if (ddi_copyout((caddr_t)&unit_dev,
2209 			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2210 				return (EFAULT);
2211 		}
2212 		if (HOTSPARED(un, i)) {
2213 			cnt++;
2214 			if (cnt >= mgdph->cnt)
2215 				continue;
2216 
2217 			unit_dev = un->un_column[i].un_dev;
2218 			if (md_getmajor(unit_dev) != md_major) {
2219 				if ((unit_dev = md_xlate_mini_2_targ
2220 				    (unit_dev)) == NODEV64)
2221 					return (ENODEV);
2222 			}
2223 
2224 			if (ddi_copyout((caddr_t)&unit_dev,
2225 			    (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2226 				return (EFAULT);
2227 		}
2228 	}
2229 	mgdph->cnt = cnt;
2230 	return (0);
2231 }
2232 
2233 /*
2234  * NAME:	raid_change
2235  * DESCRIPTION: used to change the following dynamic values:
2236  *			the hot spare pool
2237  *		in the unit structure of a RAID metadevice
2238  * PARAMETERS:	md_raid_params_t     *mrp - pointer to change data structure
2239  *		IOLOCK	     *lock - pointer to IOCTL lock
2240  *
2241  * LOCKS:	obtains unit writer lock via IOLOCK (through raid_getun)
2242  *
2243  */
2244 static int
2245 raid_change(
2246 	md_raid_params_t	*mrp,
2247 	IOLOCK			*lock
2248 )
2249 {
2250 	minor_t		mnum = mrp->mnum;
2251 	mr_unit_t	*un;
2252 	int		ix;
2253 	mddb_recid_t	recids[3] = {0, 0, 0};
2254 	int		err;
2255 	int		irecid;
2256 	int		inc_new_hsp = 0;
2257 
2258 	mdclrerror(&mrp->mde);
2259 
2260 	if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
2261 		return (0);
2262 
2263 	if (!mrp->params.change_hsp_id)
2264 		return (0);
2265 
2266 	/* verify that no hotspare is in use */
2267 	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
2268 		if (HOTSPARED(un, ix)) {
2269 			return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
2270 		}
2271 	}
2272 
2273 	/* replace the hot spare pool */
2274 
2275 	irecid = 0;
2276 	if (mrp->params.hsp_id != -1) {
2277 		/* increment the reference count of the new hsp */
2278 		err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
2279 		    &recids[0], NULL, NULL, NULL);
2280 		if (err) {
2281 			return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2282 			    mrp->params.hsp_id));
2283 		}
2284 		inc_new_hsp = 1;
2285 		irecid++;
2286 	}
2287 
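	/*
	 * The new pool's reference count was incremented before the old
	 * pool's is decremented, so a failure below can be rolled back
	 * without losing a committed reference.
	 */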
2288 	if (un->un_hsp_id != -1) {
2289 		/* decrement the reference count of the old hsp */
2290 		err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
2291 		    &recids[irecid], NULL, NULL, NULL);
2292 		if (err) {
2293 			err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2294 			    mrp->params.hsp_id);
2295 			if (inc_new_hsp) {
2296 				(void) md_hot_spare_ifc(HSP_DECREF,
2297 				    mrp->params.hsp_id, 0, 0,
2298 				    &recids[0], NULL, NULL, NULL);
2299 				/*
2300 				 * Don't need to commit the record,
2301 				 * because it wasn't committed before
2302 				 */
2303 			}
2304 			return (err);
2305 		}
2306 	}
2307 
2308 	un->un_hsp_id = mrp->params.hsp_id;
2309 
2310 	raid_commit(un, recids);
2311 	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
2312 	    MD_UN2SET(un), MD_SID(un));
2313 
2314 	/* Now trigger hot spare processing in case one is needed. */
2315 	if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
2316 		(void) raid_hotspares();
2317 
2318 	return (0);
2319 }
2320 
2321 /*
2322  * NAME:	raid_admin_ioctl
2323  * DESCRIPTION: IOCTL operations unique to metadevices and RAID
2324  * PARAMETERS:	int	  cmd - IOCTL command to be executed
2325  *		void	*data - pointer to IOCTL data structure
2326  *		int	 mode - either FREAD or FWRITE
2327  *		IOLOCK *lockp - IOCTL read/write lock
2328  *
2329  * LOCKS:	none
2330  *
2331  */
2332 static int
2333 raid_admin_ioctl(
2334 	int		cmd,
2335 	void		*data,
2336 	int		mode,
2337 	IOLOCK		*lockp
2338 )
2339 {
2340 	size_t		sz = 0;
2341 	void		*d = NULL;
2342 	int		err = 0;
2343 
2344 	/* We can only handle 32-bit clients for internal commands */
2345 	if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
2346 		return (EINVAL);
2347 	}
2348 
2349 
2350 	/* dispatch ioctl */
2351 	switch (cmd) {
2352 
2353 	case MD_IOCSET:
2354 	{
2355 		if (! (mode & FWRITE))
2356 			return (EACCES);
2357 
2358 		sz = sizeof (md_set_params_t);
2359 		d = kmem_alloc(sz, KM_SLEEP);
2360 
2361 		if (ddi_copyin(data, d, sz, mode)) {
2362 			err = EFAULT;
2363 			break;
2364 		}
2365 
2366 		err = raid_set(d, mode);
2367 		break;
2368 	}
2369 
2370 	case MD_IOCGET:
2371 	{
2372 		if (! (mode & FREAD))
2373 			return (EACCES);
2374 
2375 		sz = sizeof (md_i_get_t);
2376 		d = kmem_alloc(sz, KM_SLEEP);
2377 
2378 		if (ddi_copyin(data, d, sz, mode)) {
2379 			err = EFAULT;
2380 			break;
2381 		}
2382 
2383 		err = raid_get(d, mode, lockp);
2384 		break;
2385 	}
2386 
2387 	case MD_IOCREPLACE:
2388 	{
2389 		if (! (mode & FWRITE))
2390 			return (EACCES);
2391 
2392 		sz = sizeof (replace_params_t);
2393 		d = kmem_alloc(sz, KM_SLEEP);
2394 
2395 		if (ddi_copyin(data, d, sz, mode)) {
2396 			err = EFAULT;
2397 			break;
2398 		}
2399 
2400 		err = raid_replace((replace_params_t *)d, lockp);
2401 		break;
2402 	}
2403 
2404 	case MD_IOCSETSYNC:
2405 	{
2406 		if (! (mode & FWRITE))
2407 			return (EACCES);
2408 
2409 		sz = sizeof (md_resync_ioctl_t);
2410 		d = kmem_alloc(sz, KM_SLEEP);
2411 
2412 		if (ddi_copyin(data, d, sz, mode)) {
2413 			err = EFAULT;
2414 			break;
2415 		}
2416 
2417 		err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
2418 		break;
2419 	}
2420 
2421 	case MD_IOCGETSYNC:
2422 	{
2423 		if (! (mode & FREAD))
2424 			return (EACCES);
2425 
2426 		sz = sizeof (md_resync_ioctl_t);
2427 		d = kmem_alloc(sz, KM_SLEEP);
2428 
2429 		if (ddi_copyin(data, d, sz, mode)) {
2430 			err = EFAULT;
2431 			break;
2432 		}
2433 		err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
2434 
2435 		break;
2436 	}
2437 
2438 	case MD_IOCGROW:
2439 	{
2440 		if (! (mode & FWRITE))
2441 			return (EACCES);
2442 
2443 		sz = sizeof (md_grow_params_t);
2444 		d = kmem_alloc(sz, KM_SLEEP);
2445 
2446 		if (ddi_copyin(data, d, sz, mode)) {
2447 			err = EFAULT;
2448 			break;
2449 		}
2450 
2451 		err = raid_grow(d, mode, lockp);
2452 		break;
2453 	}
2454 
2455 	case MD_IOCCHANGE:
2456 	{
2457 		if (! (mode & FWRITE))
2458 			return (EACCES);
2459 
2460 		sz = sizeof (md_raid_params_t);
2461 		d = kmem_alloc(sz, KM_SLEEP);
2462 
2463 		if (ddi_copyin(data, d, sz, mode)) {
2464 			err = EFAULT;
2465 			break;
2466 		}
2467 
2468 		err = raid_change((md_raid_params_t *)d, lockp);
2469 		break;
2470 	}
2471 
2472 	case MD_IOCRESET:
2473 	{
2474 		if (! (mode & FWRITE))
2475 			return (EACCES);
2476 
2477 		sz = sizeof (md_i_reset_t);
2478 		d = kmem_alloc(sz, KM_SLEEP);
2479 
2480 		if (ddi_copyin(data, d, sz, mode)) {
2481 			err = EFAULT;
2482 			break;
2483 		}
2484 
2485 		err = raid_reset((md_i_reset_t *)d);
2486 		break;
2487 	}
2488 
2489 	case MD_IOCGET_DEVS:
2490 	{
2491 		if (! (mode & FREAD))
2492 			return (EACCES);
2493 
2494 		sz = sizeof (md_getdevs_params_t);
2495 		d = kmem_alloc(sz, KM_SLEEP);
2496 
2497 		if (ddi_copyin(data, d, sz, mode)) {
2498 			err = EFAULT;
2499 			break;
2500 		}
2501 
2502 		err = raid_getdevs(d, mode, lockp);
2503 		break;
2504 	}
2505 
2506 	case MD_IOCSETREGEN:
2507 	{
2508 		if (! (mode & FWRITE))
2509 			return (EACCES);
2510 
2511 		sz = sizeof (md_regen_param_t);
2512 		d = kmem_alloc(sz, KM_SLEEP);
2513 
2514 		if (ddi_copyin(data, d, sz, mode)) {
2515 			err = EFAULT;
2516 			break;
2517 		}
2518 
2519 		err = raid_regen((md_regen_param_t *)d, lockp);
2520 		break;
2521 	}
2522 
2523 	case MD_IOCPROBE_DEV:
2524 	{
2525 		md_probedev_impl_t	*p = NULL;
2526 		md_probedev_t		*ph = NULL;
2527 		daemon_queue_t		*hdr = NULL;
2528 		int			i;
2529 		size_t			sz1 = 0;
2530 
2531 
2532 		if (! (mode & FREAD))
2533 			return (EACCES);
2534 
2535 		sz = sizeof (md_probedev_t);
2536 
2537 		d = kmem_alloc(sz, KM_SLEEP);
2538 
2539 		/* now copy in the data */
2540 		if (ddi_copyin(data, d, sz, mode)) {
2541 			err = EFAULT;
2542 			goto free_mem;
2543 		}
2544 
2545 		/*
2546 		 * Sanity-test the args; the test name must contain the
2547 		 * keyword "probe".
2548 		 */
2549 		p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
2550 		p->probe_sema = NULL;
2551 		p->probe_mx = NULL;
2552 		p->probe.mnum_list = (uint64_t)NULL;
2553 
2554 		ph = (md_probedev_t *)d;
2555 		p->probe.nmdevs = ph->nmdevs;
2556 		(void) strcpy(p->probe.test_name, ph->test_name);
2557 		bcopy(&ph->md_driver, &(p->probe.md_driver),
2558 		    sizeof (md_driver_t));
2559 
2560 		if ((p->probe.nmdevs < 1) ||
2561 		    (strstr(p->probe.test_name, "probe") == NULL)) {
2562 			err = EINVAL;
2563 			goto free_mem;
2564 		}
2565 
2566 		sz1 = sizeof (minor_t) * p->probe.nmdevs;
2567 
2568 		p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
2569 		    KM_SLEEP);
2570 
2571 		if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
2572 		    (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
2573 			err = EFAULT;
2574 			goto free_mem;
2575 		}
2576 
2577 		if (err = md_init_probereq(p, &hdr))
2578 			goto free_mem;
2579 
2580 		/*
2581 		 * put the request on the queue and wait.
2582 		 */
2583 
2584 		daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
2585 
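		/*
		 * Drop the ioctl lock while waiting; each completed probe
		 * posts probe_sema, and the lock is retaken below before
		 * the results are cleaned up.
		 */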
2586 		(void) IOLOCK_RETURN(0, lockp);
2587 		/* wait for the events to occur */
2588 		for (i = 0; i < p->probe.nmdevs; i++) {
2589 			sema_p(PROBE_SEMA(p));
2590 		}
2591 		while (md_ioctl_lock_enter() == EINTR)
2592 			;
2593 
2594 		/*
2595 		 * Clean up. The hdr list is freed in the probe routines,
2596 		 * so it is NULL by the time we get here.
2597 		 */
2598 free_mem:
2599 		if (p) {
2600 			if (p->probe_sema != NULL) {
2601 				sema_destroy(PROBE_SEMA(p));
2602 				kmem_free(p->probe_sema, sizeof (ksema_t));
2603 			}
2604 			if (p->probe_mx != NULL) {
2605 				mutex_destroy(PROBE_MX(p));
2606 				kmem_free(p->probe_mx, sizeof (kmutex_t));
2607 			}
2608 			if (p->probe.mnum_list)
2609 				kmem_free((caddr_t)(uintptr_t)
2610 				    p->probe.mnum_list, sz1);
2611 
2612 			kmem_free(p, sizeof (md_probedev_impl_t));
2613 		}
2614 		break;
2615 	}
2616 
2617 	default:
2618 		return (ENOTTY);
2619 	}
2620 
2621 	/*
2622 	 * copyout and free any args
2623 	 */
2624 	if (sz != 0) {
2625 		if (err == 0) {
2626 			if (ddi_copyout(d, data, sz, mode) != 0) {
2627 				err = EFAULT;
2628 			}
2629 		}
2630 		kmem_free(d, sz);
2631 	}
2632 	return (err);
2633 }
2634 
2635 /*
2636  * NAME:	md_raid_ioctl
2637  * DESCRIPTION: RAID metadevice IOCTL operations entry point.
2638  * PARAMETERS:	dev_t	   dev - RAID device identifier
2639  *		int	  cmd  - IOCTL command to be executed
2640  *		void	*data  - pointer to IOCTL data structure
2641  *		int	 mode  - either FREAD or FWRITE
2642  *		IOLOCK *lockp  - IOCTL read/write lock
2643  *
2644  * LOCKS:	none
2645  *
2646  */
2647 int
2648 md_raid_ioctl(
2649 	dev_t		dev,
2650 	int		cmd,
2651 	void		*data,
2652 	int		mode,
2653 	IOLOCK		*lockp
2654 )
2655 {
2656 	minor_t		mnum = getminor(dev);
2657 	mr_unit_t	*un;
2658 	int		err = 0;
2659 
2660 	/* handle admin ioctls */
2661 	if (mnum == MD_ADM_MINOR)
2662 		return (raid_admin_ioctl(cmd, data, mode, lockp));
2663 
2664 	/* check unit */
2665 	if ((MD_MIN2SET(mnum) >= md_nsets) ||
2666 	    (MD_MIN2UNIT(mnum) >= md_nunits) ||
2667 	    ((un = MD_UNIT(mnum)) == NULL))
2668 		return (ENXIO);
2669 
2670 	/* is this a supported ioctl? */
2671 	err = md_check_ioctl_against_unit(cmd, un->c);
2672 	if (err != 0) {
2673 		return (err);
2674 	}
2675 
2676 	/* dispatch ioctl */
2677 	switch (cmd) {
2678 
2679 	case DKIOCINFO:
2680 	{
2681 		struct dk_cinfo *p;
2682 
2683 		if (! (mode & FREAD))
2684 			return (EACCES);
2685 
2686 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2687 
2688 		get_info(p, mnum);
2689 		if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
2690 			err = EFAULT;
2691 
2692 		kmem_free(p, sizeof (*p));
2693 		return (err);
2694 	}
2695 
2696 	case DKIOCGMEDIAINFO:
2697 	{
2698 		struct dk_minfo	p;
2699 
2700 		if (! (mode & FREAD))
2701 			return (EACCES);
2702 
2703 		get_minfo(&p, mnum);
2704 		if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
2705 			err = EFAULT;
2706 
2707 		return (err);
2708 	}
2709 
2710 	case DKIOCGGEOM:
2711 	{
2712 		struct dk_geom	*p;
2713 
2714 		if (! (mode & FREAD))
2715 			return (EACCES);
2716 
2717 		p = kmem_alloc(sizeof (*p), KM_SLEEP);
2718 
2719 		if ((err = raid_get_geom(un, p)) == 0) {
2720 			if (ddi_copyout((caddr_t)p, data, sizeof (*p),
2721 			    mode) != 0)
2722 				err = EFAULT;
2723 		}
2724 
2725 		kmem_free(p, sizeof (*p));
2726 		return (err);
2727 	}
2728 
2729 	case DKIOCGVTOC:
2730 	{
2731 		struct vtoc	*vtoc;
2732 
2733 		if (! (mode & FREAD))
2734 			return (EACCES);
2735 
2736 		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2737 		if ((err = raid_get_vtoc(un, vtoc)) != 0) {
2738 			kmem_free(vtoc, sizeof (*vtoc));
2739 			return (err);
2740 		}
2741 
2742 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2743 			if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
2744 				err = EFAULT;
2745 		}
2746 #ifdef _SYSCALL32
2747 		else {
2748 			struct vtoc32	*vtoc32;
2749 
2750 			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2751 
2752 			vtoctovtoc32((*vtoc), (*vtoc32));
2753 			if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
2754 				err = EFAULT;
2755 			kmem_free(vtoc32, sizeof (*vtoc32));
2756 		}
2757 #endif /* _SYSCALL32 */
2758 
2759 		kmem_free(vtoc, sizeof (*vtoc));
2760 		return (err);
2761 	}
2762 
2763 	case DKIOCSVTOC:
2764 	{
2765 		struct vtoc	*vtoc;
2766 
2767 		if (! (mode & FWRITE))
2768 			return (EACCES);
2769 
2770 		vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2771 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2772 			if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
2773 				err = EFAULT;
2774 			}
2775 		}
2776 #ifdef _SYSCALL32
2777 		else {
2778 			struct vtoc32	*vtoc32;
2779 
2780 			vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2781 
2782 			if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
2783 				err = EFAULT;
2784 			} else {
2785 				vtoc32tovtoc((*vtoc32), (*vtoc));
2786 			}
2787 			kmem_free(vtoc32, sizeof (*vtoc32));
2788 		}
2789 #endif /* _SYSCALL32 */
2790 
2791 		if (err == 0)
2792 			err = raid_set_vtoc(un, vtoc);
2793 
2794 		kmem_free(vtoc, sizeof (*vtoc));
2795 		return (err);
2796 	}
2797 
2798 	case DKIOCGEXTVTOC:
2799 	{
2800 		struct extvtoc	*extvtoc;
2801 
2802 		if (! (mode & FREAD))
2803 			return (EACCES);
2804 
2805 		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2806 		if ((err = raid_get_extvtoc(un, extvtoc)) != 0) {
2807 			kmem_free(extvtoc, sizeof (*extvtoc));
2808 			return (err);
2809 		}
2810 
2811 		if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
2812 			err = EFAULT;
2813 
2814 		kmem_free(extvtoc, sizeof (*extvtoc));
2815 		return (err);
2816 	}
2817 
2818 	case DKIOCSEXTVTOC:
2819 	{
2820 		struct extvtoc	*extvtoc;
2821 
2822 		if (! (mode & FWRITE))
2823 			return (EACCES);
2824 
2825 		extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2826 		if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
2827 			err = EFAULT;
2828 		}
2829 
2830 		if (err == 0)
2831 			err = raid_set_extvtoc(un, extvtoc);
2832 
2833 		kmem_free(extvtoc, sizeof (*extvtoc));
2834 		return (err);
2835 	}
2836 
2837 	case DKIOCGAPART:
2838 	{
2839 		struct dk_map	dmp;
2840 
2841 		if ((err = raid_get_cgapart(un, &dmp)) != 0) {
2842 			return (err);
2843 		}
2844 
2845 		if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2846 			if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
2847 			    mode) != 0)
2848 				err = EFAULT;
2849 		}
2850 #ifdef _SYSCALL32
2851 		else {
2852 			struct dk_map32 dmp32;
2853 
2854 			dmp32.dkl_cylno = dmp.dkl_cylno;
2855 			dmp32.dkl_nblk = dmp.dkl_nblk;
2856 
2857 			if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
2858 			    mode) != 0)
2859 				err = EFAULT;
2860 		}
2861 #endif /* _SYSCALL32 */
2862 
2863 		return (err);
2864 	}
2865 	case DKIOCGETEFI:
2866 	{
2867 		/*
2868 		 * This can be handled centrally; there is no need to
2869 		 * duplicate the code for every type of metadevice.
2870 		 */
2871 		return (md_dkiocgetefi(mnum, data, mode));
2872 	}
2873 
2874 	case DKIOCSETEFI:
2875 	{
2876 		/*
2877 		 * This can be handled centrally; there is no need to
2878 		 * duplicate the code for every type of metadevice.
2879 		 */
2880 		return (md_dkiocsetefi(mnum, data, mode));
2881 	}
2882 
2883 	case DKIOCPARTITION:
2884 	{
2885 		return (md_dkiocpartition(mnum, data, mode));
2886 	}
2887 
2888 	default:
2889 		return (ENOTTY);
2890 	}
2891 }
2892 
2893 /*
2894  * rename/exchange named service entry points and support functions follow.
2895  * Most functions are handled generically, except for raid-specific locking
2896  * and checking.
2897  */
2898 
2899 /*
2900  * NAME:	raid_may_renexch_self
2901  * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
2902  * PARAMETERS:	mr_unit_t	*un - unit struct of raid unit to be renamed
2903  *		mdi_unit_t	*ui - in-core unit struct of same raid unit
2904  *		md_rentxn_t	*rtxnp - rename transaction state
2905  *
2906  * LOCKS:	none
2907  *
2908  */
2909 static int
2910 raid_may_renexch_self(
2911 	mr_unit_t	*un,
2912 	mdi_unit_t	*ui,
2913 	md_rentxn_t	*rtxnp)
2914 {
2915 	minor_t	from_min;
2916 	minor_t	to_min;
2917 	bool_t	toplevel;
2918 	bool_t	related;
2919 
2920 	from_min = rtxnp->from.mnum;
2921 	to_min = rtxnp->to.mnum;
2922 
2923 	if (!un || !ui) {
2924 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2925 		    from_min);
2926 		return (EINVAL);
2927 	}
2928 
2929 	ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
2930 	if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
2931 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2932 		return (EINVAL);
2933 	}
2934 
2935 	if (MD_PARENT(un) == MD_MULTI_PARENT) {
2936 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2937 		return (EINVAL);
2938 	}
2939 
2940 	toplevel = !MD_HAS_PARENT(MD_PARENT(un));
2941 
2942 	/* we're related if trying to swap with our parent */
2943 	related = (!toplevel) && (MD_PARENT(un) == to_min);
2944 
2945 	switch (rtxnp->op) {
2946 	case MDRNOP_EXCHANGE:
2947 
2948 		if (!related) {
2949 			(void) mdmderror(&rtxnp->mde,
2950 			    MDE_RENAME_TARGET_UNRELATED, to_min);
2951 			return (EINVAL);
2952 		}
2953 
2954 		break;
2955 
2956 	case MDRNOP_RENAME:
2957 		/*
2958 		 * if from is top-level and is open, then the kernel is using
2959 		 * the md_dev64_t.
2960 		 */
2961 
2962 		if (toplevel && md_unit_isopen(ui)) {
2963 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
2964 			    from_min);
2965 			return (EBUSY);
2966 		}
2967 		break;
2968 
2969 	default:
2970 		(void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2971 		    from_min);
2972 		return (EINVAL);
2973 	}
2974 
2975 	return (0);	/* ok */
2976 }
2977 
2978 /*
2979  * NAME:	raid_rename_check
2980  * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
2981  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
2982  *					 raid device for rename transaction
2983  *		md_rentxn_t	*rtxnp - rename transaction state
2984  *
2985  * LOCKS:	none
2986  *
2987  */
2988 intptr_t
2989 raid_rename_check(
2990 	md_rendelta_t	*delta,
2991 	md_rentxn_t	*rtxnp)
2992 {
2993 	int		 err	= 0;
2994 	int		 column;
2995 	mr_unit_t	*un;
2996 
2997 	ASSERT(delta);
2998 	ASSERT(rtxnp);
2999 	ASSERT(delta->unp);
3000 	ASSERT(delta->uip);
3001 
3002 	if (!delta || !rtxnp || !delta->unp || !delta->uip) {
3003 		(void) mdsyserror(&rtxnp->mde, EINVAL);
3004 		return (EINVAL);
3005 	}
3006 
3007 	un = (mr_unit_t *)delta->unp;
3008 
3009 	for (column = 0; column < un->un_totalcolumncnt; column++) {
3010 		rcs_state_t	colstate;
3011 
3012 		colstate = un->un_column[column].un_devstate;
3013 
3014 		if (colstate & RCS_LAST_ERRED) {
3015 			(void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
3016 			    md_getminor(delta->dev));
3017 			return (EINVAL);
3018 		}
3019 
3020 		if (colstate & RCS_INIT_ERRED) {
3021 			(void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
3022 			    md_getminor(delta->dev));
3023 			return (EINVAL);
3024 		}
3025 
3026 		/* How did we get this far before detecting this? */
3027 		if (colstate & RCS_RESYNC) {
3028 			(void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
3029 			    md_getminor(delta->dev));
3030 			return (EBUSY);
3031 		}
3032 
3033 		if (colstate & RCS_ERRED) {
3034 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3035 			    md_getminor(delta->dev));
3036 			return (EINVAL);
3037 		}
3038 
3039 		if (!(colstate & RCS_OKAY)) {
3040 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3041 			    md_getminor(delta->dev));
3042 			return (EINVAL);
3043 		}
3044 
3045 		if (HOTSPARED(un, column)) {
3046 			(void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3047 			    md_getminor(delta->dev));
3048 			return (EINVAL);
3049 		}
3050 	}
3051 
3052 	/* self does additional checks */
3053 	if (delta->old_role == MDRR_SELF) {
3054 		err = raid_may_renexch_self((mr_unit_t *)delta->unp,
3055 		    delta->uip, rtxnp);
3056 	}
3057 	return (err);
3058 }
3059 
3060 /*
3061  * NAME:	raid_rename_lock
3062  * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
3063  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
3064  *					 raid device for rename transaction
3065  *		md_rentxn_t	*rtxnp - rename transaction state
3066  *
3067  * LOCKS:	io and unit locks (taken explicitly *not* via ioctl wrappers)
3068  *
3069  */
3070 intptr_t
3071 raid_rename_lock(
3072 	md_rendelta_t	*delta,
3073 	md_rentxn_t	*rtxnp)
3074 {
3075 	minor_t		mnum;
3076 
3077 	ASSERT(delta);
3078 	ASSERT(rtxnp);
3079 
3080 	mnum = md_getminor(delta->dev);
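	/*
	 * The 'to' side of a pure rename has no unit of its own to
	 * lock; only an exchange involves a second existing unit.
	 */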
3081 	if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
3082 		return (0);
3083 	}
3084 
3085 	ASSERT(delta->uip);
3086 	if (!delta->uip) {
3087 		(void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
3088 		return (ENODEV);
3089 	}
3090 
3091 	ASSERT(delta->unp);
3092 	if (!delta->unp) {
3093 
3094 		return (ENODEV);
3095 	}
3096 
3097 	ASSERT(!IO_WRITER_HELD(delta->unp));
3098 	(void) md_io_writerlock(delta->uip);
3099 	ASSERT(IO_WRITER_HELD(delta->unp));
3100 
3101 
3102 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
3103 	(void) md_unit_writerlock(delta->uip);
3104 	ASSERT(UNIT_WRITER_HELD(delta->unp));
3105 
3106 	return (0);
3107 }
3108 
3109 /*
3110  * NAME:	raid_rename_unlock
3111  * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
3112  * PARAMETERS:	md_rendelta_t	*delta - describes changes to be made to this
3113  *					 raid device for rename transaction
3114  *		md_rentxn_t	*rtxnp - rename transaction state
3115  *
3116  * LOCKS:	drops io and unit locks
3117  *
3118  */
3119 /* ARGSUSED */
3120 void
3121 raid_rename_unlock(
3122 	md_rendelta_t	*delta,
3123 	md_rentxn_t	*rtxnp)
3124 {
3125 	mr_unit_t	*un = (mr_unit_t *)delta->unp;
3126 	minor_t		mnum = MD_SID(un);
3127 	int		col;
3128 
3129 	ASSERT(delta);
3130 	ASSERT(delta->unp);
3131 	ASSERT(delta->uip);
3132 
3133 	ASSERT(UNIT_WRITER_HELD(delta->unp));
3134 	md_unit_writerexit(delta->uip);
3135 	ASSERT(!UNIT_WRITER_HELD(delta->unp));
3136 
3137 	if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
3138 		goto out;
3139 	}
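	/*
	 * The roles were swapped while the unit was open: re-open it
	 * layered and re-initialize the pre-write areas of the healthy
	 * columns before dropping the io lock.
	 */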
3140 	if (raid_internal_open(mnum, (FREAD | FWRITE),
3141 	    OTYP_LYR, MD_OFLG_ISINIT) == 0) {
3142 		for (col = 0; col < un->un_totalcolumncnt; col++) {
3143 			if (un->un_column[col].un_devstate & RCS_OKAY)
3144 				(void) init_pw_area(un,
3145 				    un->un_column[col].un_dev,
3146 				    un->un_column[col].un_pwstart, col);
3147 		}
3148 		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
3149 	}
3150 
3151 out:
3152 	ASSERT(IO_WRITER_HELD(delta->unp));
3153 	md_io_writerexit(delta->uip);
3154 	ASSERT(!IO_WRITER_HELD(delta->unp));
3155 }
3156 /* end of rename/exchange named service and support functions */
3157