/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright 2012 Milan Jurik. All rights reserved.
 */

/*
 * NAME: raid_ioctl.c
 *
 * DESCRIPTION: RAID driver source file containing IOCTL operations.
 *
 * ROUTINES PROVIDED FOR EXTERNAL USE:
 *	raid_commit() - commits MD database updates for a RAID metadevice
 *	md_raid_ioctl() - RAID metadevice IOCTL operations entry point.
 *
 * ROUTINES PROVIDED FOR INTERNAL USE:
 *	raid_getun() - Performs unit checking on a RAID metadevice
 *	init_col_nextio() - normal backend when zeroing column of RAID
 *		metadevice.
 *	init_col_int() - I/O interrupt while zeroing column of RAID metadevice.
 *	raid_init_columns() - Zero one or more columns of a RAID metadevice.
 *	raid_set() - used to create a RAID metadevice
 *	raid_get() - used to get the unit structure of a RAID metadevice
 *	raid_replace() - used to replace a component of a RAID metadevice
 *	raid_grow() - Concatenate to a RAID metadevice
 *	raid_change() - change dynamic values of a RAID metadevice
 *	raid_reset() - used to reset (clear / remove) a RAID metadevice
 *	raid_get_geom() - used to get the geometry of a RAID metadevice
 *	raid_get_vtoc() - used to get the VTOC on a RAID metadevice
 *	raid_set_vtoc() - used to set the VTOC on a RAID metadevice
 *	raid_get_extvtoc() - used to get the extended VTOC on a RAID metadevice
 *	raid_set_extvtoc() - used to set the extended VTOC on a RAID metadevice
 *	raid_getdevs() - return all devices within a RAID metadevice
 *	raid_admin_ioctl() - IOCTL operations unique to metadevices and RAID
 */


#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/disp.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/cred.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_names.h>
#include <sys/lvm/md_mddb.h>
#include <sys/lvm/md_raid.h>
#include <sys/lvm/md_convert.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

extern int		md_status;
extern unit_t		md_nunits;
extern set_t		md_nsets;
extern md_set_t		md_set[];
extern md_ops_t		raid_md_ops;
extern major_t		md_major;
extern md_krwlock_t	md_unit_array_rw;
extern mdq_anchor_t	md_done_daemon;
extern mdq_anchor_t	md_ff_daemonq;
extern int		mdopen();
extern int		mdclose();
extern void		md_probe_one(probe_req_t *);
extern int		md_init_probereq(md_probedev_impl_t *,
			    daemon_queue_t **);
extern md_resync_t	md_cpr_resync;


extern void		dump_mr_unit(mr_unit_t *);

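/*
 * raid_ci_t describes one in-flight column initialization.  One of
 * these is chained (via ci_next) for each column being zeroed;
 * ci_blkno and ci_lastblk track the progress of the zeroing I/O,
 * ci_buf is the buf used for that I/O, and ci_flag holds one of the
 * COL_* states defined below.
 */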
typedef struct raid_ci {
	DAEMON_QUEUE
	struct raid_ci	*ci_next;
	mr_unit_t	*ci_un;
	int		ci_col;
	int		ci_err;
	int		ci_flag;
	size_t		ci_zerosize;
	diskaddr_t	ci_blkno;
	diskaddr_t	ci_lastblk;
	buf_t		ci_buf;
} raid_ci_t;
/* values for the ci_flag */
#define	COL_INITING	(0x0001)
#define	COL_INIT_DONE	(0x0002)
#define	COL_READY	(0x0004)

/*
 * NAME: raid_getun
 * DESCRIPTION: performs unit checking and locking on a RAID metadevice
 * PARAMETERS: minor_t mnum - minor device number for RAID unit
 *	       md_error_t *mde - pointer to error reporting structure
 *	       int flags - unit check and lock flags:
 *			STALE_OK - allow stale MD memory
 *			NO_OLD - unit must not exist
 *			NO_LOCK - no IOCTL lock needed
 *			WR_LOCK - write IOCTL lock needed
 *			RD_LOCK - read IOCTL lock needed
 *			ARRAY_WRITER - md_array_writer lock needed
 *			ARRAY_READER - md_array_reader lock needed
 *	       IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS: obtains unit reader or writer lock via IOLOCK
 *
 */
static mr_unit_t *
raid_getun(minor_t mnum, md_error_t *mde, int flags, IOLOCK *lock)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	set_t		setno = MD_MIN2SET(mnum);

	if ((setno >= md_nsets) || (MD_MIN2UNIT(mnum) >= md_nunits)) {
		(void) mdmderror(mde, MDE_INVAL_UNIT, mnum);
		return (NULL);
	}

	if (!(flags & STALE_OK)) {
		if (md_get_setstatus(setno) & MD_SET_STALE) {
			(void) mdmddberror(mde, MDE_DB_STALE, mnum, setno);
			return (NULL);
		}
	}

	ui = MDI_UNIT(mnum);
	if (flags & NO_OLD) {
		if (ui != NULL) {
			(void) mdmderror(mde, MDE_UNIT_ALREADY_SETUP, mnum);
			return (NULL);
		}
		return ((mr_unit_t *)1);
	}

	if (ui == NULL) {
		(void) mdmderror(mde, MDE_UNIT_NOT_SETUP, mnum);
		return (NULL);
	}
	if (flags & ARRAY_WRITER)
		md_array_writer(lock);
	else if (flags & ARRAY_READER)
		md_array_reader(lock);

	if (!(flags & NO_LOCK)) {
		if (flags & WR_LOCK) {
			(void) md_ioctl_io_lock(lock, ui);
			(void) md_ioctl_writerlock(lock, ui);
		} else /* RD_LOCK */
			(void) md_ioctl_readerlock(lock, ui);
	}
	un = (mr_unit_t *)MD_UNIT(mnum);

	if (un->c.un_type != MD_METARAID) {
		(void) mdmderror(mde, MDE_NOT_RAID, mnum);
		return (NULL);
	}

	return (un);
}


/*
 * NAME: raid_commit
 * DESCRIPTION: commits MD database updates for a RAID metadevice
 * PARAMETERS: mr_unit_t *un - RAID unit to update in the MD database
 *	       mddb_recid_t *extras - array of other record IDs to update
 *
 * LOCKS: assumes caller holds unit writer lock
 *
 */
void
raid_commit(mr_unit_t *un, mddb_recid_t *extras)
{
	mddb_recid_t	*recids;
	int		ri = 0;
	int		nrecids = 0;

	if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
		return;

	/* Count the extra recids */
	if (extras != NULL) {
		while (extras[nrecids] != 0) {
			nrecids++;
		}
	}

	/*
	 * Allocate space for two recids in addition to the extras:
	 * one for the unit structure, one for the null terminator.
	 */
	nrecids += 2;
	recids = (mddb_recid_t *)
	    kmem_zalloc(nrecids * sizeof (mddb_recid_t), KM_SLEEP);

	if (un != NULL) {
		ASSERT(MDI_UNIT(MD_SID(un)) ? UNIT_WRITER_HELD(un) : 1);
		recids[ri++] = un->c.un_record_id;
	}

	if (extras != NULL) {
		while (*extras != 0) {
			recids[ri++] = *extras;
			extras++;
		}
	}

	if (ri > 0) {
		mddb_commitrecs_wrapper(recids);
	}

	kmem_free(recids, nrecids * sizeof (mddb_recid_t));
}

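/*
 * NAME: raid_check_pw
 * DESCRIPTION: reads the pre-write header of each column and checks
 *		that it records the expected column and unit numbers;
 *		returns 0 if all headers are consistent, 1 otherwise
 * PARAMETERS: mr_unit_t *un - RAID unit to check
 *
 * LOCKS: none, opens and closes the components via md_layered_open()
 *	  and md_layered_close()
 */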
static int
raid_check_pw(mr_unit_t *un)
{
	buf_t		bp;
	char		*buf;
	mr_column_t	*colptr;
	minor_t		mnum = MD_SID(un);
	int		i;
	int		err = 0;
	minor_t		unit;

	buf = kmem_zalloc((uint_t)DEV_BSIZE, KM_SLEEP);

	for (i = 0; i < un->un_totalcolumncnt; i++) {
		md_dev64_t tmpdev;

		colptr = &un->un_column[i];

		tmpdev = colptr->un_dev;
		/*
		 * Open by device id
		 * If this device is hotspared
		 * use the hotspare key
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, HOTSPARED(un, i) ?
		    colptr->un_hs_key : colptr->un_orig_key);
		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
			colptr->un_dev = tmpdev;
			/* free the header buffer before bailing out */
			kmem_free(buf, DEV_BSIZE);
			return (1);
		}
		colptr->un_dev = tmpdev;

		bzero((caddr_t)&bp, sizeof (buf_t));
		bp.b_back = &bp;
		bp.b_forw = &bp;
		bp.b_flags = B_READ | B_BUSY;
		sema_init(&bp.b_io, 0, NULL,
		    SEMA_DEFAULT, NULL);
		sema_init(&bp.b_sem, 0, NULL,
		    SEMA_DEFAULT, NULL);
		bp.b_edev = md_dev64_to_dev(colptr->un_dev);
		bp.b_lblkno = colptr->un_pwstart;
		bp.b_bcount = DEV_BSIZE;
		bp.b_bufsize = DEV_BSIZE;
		bp.b_un.b_addr = (caddr_t)buf;
		bp.b_offset = -1;
		(void) md_call_strategy(&bp, 0, NULL);
		if (biowait(&bp))
			err = 1;
		if (i == 0) {
			if (un->c.un_revision & MD_64BIT_META_DEV) {
				unit = ((raid_pwhdr_t *)buf)->rpw_unit;
			} else {
				unit = ((raid_pwhdr32_od_t *)buf)->rpw_unit;
			}
		}
		/*
		 * Depending on whether this is a 64-bit or 32-bit RAID,
		 * the pre-write headers have different layouts.
		 */
		if (un->c.un_revision & MD_64BIT_META_DEV) {
			if ((((raid_pwhdr_t *)buf)->rpw_column != i) ||
			    (((raid_pwhdr_t *)buf)->rpw_unit != unit))
				err = 1;
		} else {
			if ((((raid_pwhdr32_od_t *)buf)->rpw_column != i) ||
			    (((raid_pwhdr32_od_t *)buf)->rpw_unit != unit))
				err = 1;
		}
		md_layered_close(colptr->un_dev, MD_OFLG_NULL);
		if (err)
			break;
	}
	kmem_free(buf, DEV_BSIZE);
	return (err);
}


/*
 * NAME: init_col_nextio
 * DESCRIPTION: normal backend process when zeroing column of a RAID
 *		metadevice.
 * PARAMETERS: raid_ci_t *cur - struct for column being zeroed
 *
 * LOCKS: assumes caller holds unit reader lock,
 *	  periodically releases and reacquires unit reader lock,
 *	  broadcasts on unit conditional variable (un_cv)
 *
 */
#define	INIT_RLS_CNT	10
static void
init_col_nextio(raid_ci_t *cur)
{
	mr_unit_t	*un;

	un = cur->ci_un;

	cur->ci_blkno += cur->ci_zerosize;

	mutex_enter(&un->un_mx);
	/* ===> update un_percent_done */
	un->un_init_iocnt += btodb(cur->ci_buf.b_bcount);
	mutex_exit(&un->un_mx);

	/*
	 * When growing a device, normal I/O is still going on.
	 * The init thread still holds the unit reader lock which
	 * prevents I/O from doing state changes.
	 * So every INIT_RLS_CNT init I/Os, we will release the
	 * unit reader lock.
	 *
	 * CAVEAT:
	 * We know we are in the middle of a grow operation and the
	 * unit cannot be grown or removed (through reset or halt)
	 * so the mr_unit_t structure will not move or disappear.
	 * In addition, we know that only one of the init I/Os
	 * can be in col_init_nextio at a time because they are
	 * placed on the md_done_daemon queue and md only processes
	 * one element of this queue at a time. In addition, any
	 * code that needs to acquire the unit writer lock to change
	 * state is supposed to be on the md_mstr_daemon queue so
	 * it can be processing while we sit here waiting to get the
	 * unit reader lock back.
	 */

	if (cur->ci_blkno < cur->ci_lastblk) {
		/* truncate last chunk to end_addr if needed */
		if (cur->ci_blkno + cur->ci_zerosize > cur->ci_lastblk) {
			cur->ci_zerosize = (size_t)
			    (cur->ci_lastblk - cur->ci_blkno);
		}

		/* set address and length for I/O bufs */
		cur->ci_buf.b_bufsize = dbtob(cur->ci_zerosize);
		cur->ci_buf.b_bcount = dbtob(cur->ci_zerosize);
		cur->ci_buf.b_lblkno = cur->ci_blkno;

		(void) md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
		return;
	}
	/* finished initializing this column */
	mutex_enter(&un->un_mx);
	cur->ci_flag = COL_INIT_DONE;
	uniqtime32(&un->un_column[cur->ci_col].un_devtimestamp);
	mutex_exit(&un->un_mx);
	cv_broadcast(&un->un_cv);
}

/*
 * NAME: init_col_int
 * DESCRIPTION: I/O interrupt while zeroing column of a RAID metadevice.
 * PARAMETERS: buf_t *cb - I/O buffer for which interrupt occurred
 *
 * LOCKS: assumes caller holds unit reader or writer lock
 *
 */
static int
init_col_int(buf_t *cb)
{
	raid_ci_t	*cur;

	cur = (raid_ci_t *)cb->b_chain;
	if (cb->b_flags & B_ERROR) {
		mutex_enter(&cur->ci_un->un_mx);
		cur->ci_err = EIO;
		mutex_exit(&cur->ci_un->un_mx);
		cv_broadcast(&cur->ci_un->un_cv);
		return (1);
	}
	daemon_request(&md_done_daemon, init_col_nextio,
	    (daemon_queue_t *)cur, REQ_OLD);
	return (1);
}

/*
 * NAME: raid_init_columns
 * DESCRIPTION: Zero one or more columns of a RAID metadevice.
 * PARAMETERS: minor_t mnum - RAID unit minor identifier
 *
 * LOCKS: obtains and releases unit reader lock,
 *	  obtains and releases unit writer lock,
 *	  obtains and releases md_unit_array_rw write lock,
 *	  obtains and releases unit mutex (un_mx) lock,
 *	  waits on unit conditional variable (un_cv)
 *
 */
static void
raid_init_columns(minor_t mnum)
{
	mr_unit_t	*un;
	mdi_unit_t	*ui;
	raid_ci_t	*ci_chain = NULL, *cur;
	rus_state_t	state;
	caddr_t		zero_addr;
	diskaddr_t	end_off;
	size_t		zerosize;
	int		err = 0;
	int		ix;
	int		colcnt = 0;
	int		col;
	set_t		setno = MD_MIN2SET(mnum);

	/*
	 * Increment the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync++;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	/*
	 * Initialization is a multiple step process.  The first step
	 * is to go through the unit structure and start each device
	 * in the init state writing zeros over the component.
	 * Next initialize the prewrite areas, so the device can be
	 * used if a metainit -k is done.  Now close the components.
	 *
	 * Once this is complete, set the state of each component being
	 * zeroed and set the correct state for the unit.
	 *
	 * Last, commit the records.
	 */

	ui = MDI_UNIT(mnum);
	un = md_unit_readerlock(ui);

	/* check for active init on this column */
	/* exiting is cpr safe */
	if ((un->un_init_colcnt > 0) && (un->un_resync_index != -1)) {
		md_unit_readerexit(ui);
		(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
		/*
		 * Decrement the raid resync count for cpr
		 */
		mutex_enter(&md_cpr_resync.md_resync_mutex);
		md_cpr_resync.md_raid_resync--;
		mutex_exit(&md_cpr_resync.md_resync_mutex);
		thread_exit();
	}

	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_START, SVM_TAG_METADEVICE, setno,
	    MD_SID(un));
	un->un_init_colcnt = 0;
	un->un_init_iocnt = 0;
	end_off = un->un_pwsize + (un->un_segsize * un->un_segsincolumn);
	zerosize = (size_t)MIN((diskaddr_t)un->un_maxio, end_off);

	/* allocate zero-filled buffer */
	zero_addr = kmem_zalloc(dbtob(zerosize), KM_SLEEP);

	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		if (un->un_column[ix].un_devstate != RCS_INIT)
			continue;
		/* allocate new column init structure */
		cur = (raid_ci_t *)kmem_zalloc((sizeof (raid_ci_t)), KM_SLEEP);
		ASSERT(cur != NULL);
		un->un_init_colcnt++;
		cur->ci_next = ci_chain;
		ci_chain = cur;
		cur->ci_un = un;
		cur->ci_col = ix;
		cur->ci_err = 0;
		cur->ci_flag = COL_INITING;
		cur->ci_zerosize = zerosize;
		cur->ci_blkno = un->un_column[ix].un_pwstart;
		cur->ci_lastblk = cur->ci_blkno + un->un_pwsize
		    + (un->un_segsize * un->un_segsincolumn);
		/* initialize static buf fields */
		cur->ci_buf.b_un.b_addr = zero_addr;
		cur->ci_buf.b_chain = (buf_t *)cur;
		cur->ci_buf.b_back = &cur->ci_buf;
		cur->ci_buf.b_forw = &cur->ci_buf;
		cur->ci_buf.b_iodone = init_col_int;
		cur->ci_buf.b_flags = B_BUSY | B_WRITE;
		cur->ci_buf.b_edev = md_dev64_to_dev(un->un_column[ix].un_dev);
		sema_init(&cur->ci_buf.b_io, 0, NULL, SEMA_DEFAULT, NULL);
		sema_init(&cur->ci_buf.b_sem, 0, NULL, SEMA_DEFAULT, NULL);
		/* set address and length for I/O bufs */
		cur->ci_buf.b_bufsize = dbtob(zerosize);
		cur->ci_buf.b_bcount = dbtob(zerosize);
		cur->ci_buf.b_lblkno = un->un_column[ix].un_pwstart;
		cur->ci_buf.b_offset = -1;

		if (! (un->un_column[ix].un_devflags & MD_RAID_DEV_ISOPEN)) {
			md_dev64_t tmpdev = un->un_column[ix].un_dev;
			/*
			 * Open by device id
			 * If this column is hotspared then
			 * use the hotspare key
			 */
			tmpdev = md_resolve_bydevid(mnum, tmpdev,
			    HOTSPARED(un, ix) ?
			    un->un_column[ix].un_hs_key :
			    un->un_column[ix].un_orig_key);
			if ((cur->ci_err = md_layered_open(mnum, &tmpdev,
			    MD_OFLG_NULL)) == 0)
				un->un_column[ix].un_devflags |=
				    MD_RAID_DEV_ISOPEN;
			un->un_column[ix].un_dev = tmpdev;
		}
		if (cur->ci_err == 0)
			md_call_strategy(&cur->ci_buf, MD_STR_NOTTOP, NULL);
	}

	md_unit_readerexit(ui);
	state = un->un_state;
	colcnt = un->un_init_colcnt;
	mutex_enter(&un->un_mx);
	while (colcnt) {
		cv_wait(&un->un_cv, &un->un_mx);

		colcnt = 0;
		for (cur = ci_chain; cur != NULL; cur = cur->ci_next) {
			col = cur->ci_col;
			if ((cur->ci_flag != COL_INITING) || (cur->ci_err)) {
				if (cur->ci_err)
					err = cur->ci_err;
				else if (cur->ci_flag == COL_INIT_DONE) {
					(void) init_pw_area(un,
					    un->un_column[col].un_dev,
					    un->un_column[col].un_pwstart,
					    col);
					cur->ci_flag = COL_READY;
				}
			} else {
				colcnt++;
			}
		}
	}
	mutex_exit(&un->un_mx);

	/* This prevents new opens */
	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
	(void) md_io_writerlock(ui);
	un = (mr_unit_t *)md_unit_writerlock(ui);
	while (ci_chain) {
		cur = ci_chain;

		/* take this element out of the chain */
		ci_chain = cur->ci_next;
		/* free this element */
		sema_destroy(&cur->ci_buf.b_io);
		sema_destroy(&cur->ci_buf.b_sem);
		if (cur->ci_err)
			raid_set_state(cur->ci_un, cur->ci_col,
			    RCS_INIT_ERRED, 0);
		else
			raid_set_state(cur->ci_un, cur->ci_col,
			    RCS_OKAY, 0);
		kmem_free(cur, sizeof (raid_ci_t));
	}

	/* free the zeroed buffer */
	kmem_free(zero_addr, dbtob(zerosize));

	/* determine new unit state */
	if (err == 0) {
		if (state == RUS_INIT)
			un->un_state = RUS_OKAY;
		else {
			un->c.un_total_blocks = un->un_grow_tb;
			md_nblocks_set(mnum, un->c.un_total_blocks);
			un->un_grow_tb = 0;
			if (raid_state_cnt(un, RCS_OKAY) ==
			    un->un_totalcolumncnt)
				un->un_state = RUS_OKAY;
		}
	} else {	/* error occurred */
		if (state & RUS_INIT)
			un->un_state = RUS_DOI;
	}
	uniqtime32(&un->un_timestamp);
	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
	un->un_init_colcnt = 0;
	un->un_init_iocnt = 0;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	(void) md_io_writerexit(ui);
	rw_exit(&md_unit_array_rw.lock);
	if (err) {
		if (un->un_state & RUS_DOI) {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		} else {
			SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
			    SVM_TAG_METADEVICE, setno, MD_SID(un));
		}
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_SUCCESS,
		    SVM_TAG_METADEVICE, setno, MD_SID(un));
	}
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	/*
	 * Decrement the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync--;
	mutex_exit(&md_cpr_resync.md_resync_mutex);
	thread_exit();
	/*NOTREACHED*/
}

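/*
 * NAME: raid_init_unit
 * DESCRIPTION: opens a RAID metadevice and starts a raid_init_columns()
 *		thread to zero any columns in the RCS_INIT state; on an
 *		open failure the INIT columns are marked errored
 * PARAMETERS: minor_t mnum - RAID unit minor identifier
 *	       md_error_t *ep - pointer to error reporting structure
 *
 * LOCKS: obtains and releases unit reader and writer locks
 */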
static int
raid_init_unit(minor_t mnum, md_error_t *ep)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	int		rval, i;
	set_t		setno = MD_MIN2SET(mnum);

	ui = MDI_UNIT(mnum);
	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));

	/* Don't start an init if the device is not available */
	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	if (raid_internal_open(mnum, (FREAD | FWRITE),
	    OTYP_LYR, MD_OFLG_ISINIT)) {
		rval = mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum);
		goto out;
	}

	un = md_unit_readerlock(ui);
	un->un_percent_done = 0;
	md_unit_readerexit(ui);
	/* start raid_init_columns thread */
	(void) thread_create(NULL, 0, raid_init_columns,
	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);

	return (0);

out:
	un = md_unit_writerlock(ui);
	MD_STATUS(un) &= ~MD_UN_GROW_PENDING;
	/* recover state */
	for (i = 0; i < un->un_totalcolumncnt; i++)
		if (COLUMN_STATE(un, i) == RCS_INIT)
			raid_set_state(un, i, RCS_ERRED, 0);
	if (un->un_state & RUS_INIT)
		un->un_state = RUS_DOI;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	if (un->un_state & RUS_DOI) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FATAL,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_INIT_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	}
	return (rval);
}

/*
 * NAME: regen_unit
 *
 * DESCRIPTION: thread that regenerates all the parity on the raid
 *		device. If an I/O error occurs during this process
 *		the entire device is placed in error.
 *
 * PARAMETERS: minor_t mnum - RAID unit minor identifier
 */
static void
regen_unit(minor_t mnum)
{
	mdi_unit_t	*ui = MDI_UNIT(mnum);
	mr_unit_t	*un = MD_UNIT(mnum);
	buf_t		buf, *bp;
	caddr_t		buffer;
	int		err = 0;
	diskaddr_t	total_segments;
	diskaddr_t	line;
	size_t		iosize;

	/*
	 * Increment raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync++;
	mutex_exit(&md_cpr_resync.md_resync_mutex);

	iosize = dbtob(un->un_segsize);
	buffer = kmem_alloc(iosize, KM_SLEEP);
	bp = &buf;
	total_segments = un->un_segsincolumn;
	SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_START, SVM_TAG_METADEVICE,
	    MD_UN2SET(un), MD_SID(un));
	un->un_percent_done = 0;
	init_buf(bp, B_READ | B_BUSY, iosize);

	for (line = 0; line < total_segments; line++) {
		bp->b_lblkno = line *
		    ((un->un_origcolumncnt - 1) * un->un_segsize);
		bp->b_un.b_addr = buffer;
		bp->b_bcount = iosize;
		bp->b_iodone = NULL;
		/*
		 * The following assignment is only correct because
		 * md_raid_strategy is fine when it's only a minor number
		 * and not a real dev_t. Yuck.
		 */
		bp->b_edev = mnum;
		md_raid_strategy(bp, MD_STR_NOTTOP, NULL);
		if (biowait(bp)) {
			err = 1;
			break;
		}
		un->un_percent_done = (uint_t)((line * 1000) /
		    un->un_segsincolumn);
		/* just to avoid rounding errors */
		if (un->un_percent_done > 1000)
			un->un_percent_done = 1000;
		reset_buf(bp, B_READ | B_BUSY, iosize);
	}
	destroy_buf(bp);
	kmem_free(buffer, iosize);

	(void) md_io_writerlock(ui);
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
	(void) md_io_writerexit(ui);
	un = md_unit_writerlock(ui);
	if (!err &&
	    (raid_state_cnt(un, RCS_OKAY) == un->un_totalcolumncnt))
		un->un_state = RUS_OKAY;
	raid_commit(un, NULL);
	md_unit_writerexit(ui);
	if (err ||
	    raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_FAILED,
		    SVM_TAG_METADEVICE, MD_UN2SET(un), MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_REGEN_DONE, SVM_TAG_METADEVICE,
		    MD_UN2SET(un), MD_SID(un));
	}

	/*
	 * Decrement the raid resync count for cpr
	 */
	mutex_enter(&md_cpr_resync.md_resync_mutex);
	md_cpr_resync.md_raid_resync--;
	mutex_exit(&md_cpr_resync.md_resync_mutex);
	thread_exit();
}

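/*
 * NAME: raid_regen_unit
 * DESCRIPTION: opens a RAID metadevice and starts a regen_unit() thread
 *		to regenerate its parity; on an open failure all columns
 *		are marked errored
 * PARAMETERS: minor_t mnum - RAID unit minor identifier
 *	       md_error_t *ep - pointer to error reporting structure
 *
 * LOCKS: obtains and releases unit writer lock on the error path
 */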
static int
raid_regen_unit(minor_t mnum, md_error_t *ep)
{
	mdi_unit_t	*ui;
	mr_unit_t	*un;
	int		i;
	set_t		setno = MD_MIN2SET(mnum);

	ui = MDI_UNIT(mnum);
	un = (mr_unit_t *)MD_UNIT(mnum);

	if (md_get_setstatus(setno) & MD_SET_STALE)
		return (mdmddberror(ep, MDE_DB_STALE, mnum, setno));

	/* Don't start a regen if the device is not available */
	if ((ui == NULL) || (ui->ui_tstate & MD_DEV_ERRORED)) {
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
		(void) md_unit_writerlock(ui);
		for (i = 0; i < un->un_totalcolumncnt; i++)
			raid_set_state(un, i, RCS_ERRED, 0);
		md_unit_writerexit(ui);
		return (mdmderror(ep, MDE_RAID_OPEN_FAILURE, mnum));
	}

	/* start regen_unit thread */
	(void) thread_create(NULL, 0, regen_unit,
	    (void *)(uintptr_t)mnum, 0, &p0, TS_RUN, minclsyspri);

	return (0);
}

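/*
 * NAME: raid_regen
 * DESCRIPTION: IOCTL entry point that validates the unit state (no
 *		grow, resync or init in progress, all columns okay) and
 *		then starts parity regeneration via raid_regen_unit()
 * PARAMETERS: md_regen_param_t *mrp - pointer to regen ioctl packet
 *	       IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun)
 */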
static int
raid_regen(md_regen_param_t *mrp, IOLOCK *lock)
{
	minor_t		mnum = mrp->mnum;
	mr_unit_t	*un;

	mdclrerror(&mrp->mde);

	un = md_unit_readerlock(MDI_UNIT(mnum));

	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
	}

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC))) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_IN_USE, mnum));
	}

	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
	    (! (un->un_state & RUS_OKAY))) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
	}

	md_unit_readerexit(MDI_UNIT(mnum));

	/* get locks and recheck to be sure something did not change */
	if ((un = raid_getun(mnum, &mrp->mde, WRITERS, lock)) == NULL)
		return (0);

	if ((raid_state_cnt(un, RCS_OKAY) != un->un_totalcolumncnt) ||
	    (! (un->un_state & RUS_OKAY))) {
		return (mdmderror(&mrp->mde, MDE_RAID_NOT_OKAY, mnum));
	}

	raid_set_state(un, 0, RCS_REGEN, 0);
	raid_commit(un, NULL);
	md_ioctl_droplocks(lock);
	return (raid_regen_unit(mnum, &mrp->mde));
}

/*
 * NAME: raid_set
 * DESCRIPTION: used to create a RAID metadevice
 * PARAMETERS: md_set_params_t *d - pointer to set data structure
 *	       int mode - must be FWRITE
 *
 * LOCKS: none
 *
 */
static int
raid_set(void *d, int mode)
{
	minor_t		mnum;
	mr_unit_t	*un;
	mddb_recid_t	mr_recid;
	mddb_recid_t	*recids;
	mddb_type_t	typ1;
	int		err;
	set_t		setno;
	int		num_recs;
	int		rid;
	int		col;
	md_set_params_t	*msp = d;


	mnum = msp->mnum;
	setno = MD_MIN2SET(mnum);

	mdclrerror(&msp->mde);

	if (raid_getun(mnum, &msp->mde, NO_OLD, NULL) == NULL)
		return (0);

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);

	/* create the db record for this mdstruct */

	if (msp->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		return (mdmderror(&msp->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
		mr_recid = mddb_createrec(msp->size, typ1, 0,
		    MD_CRO_64BIT | MD_CRO_RAID | MD_CRO_FN, setno);
#endif
	} else {
		mr_recid = mddb_createrec(msp->size, typ1, 0,
		    MD_CRO_32BIT | MD_CRO_RAID | MD_CRO_FN, setno);
	}

	if (mr_recid < 0)
		return (mddbstatus2error(&msp->mde,
		    (int)mr_recid, mnum, setno));

	/* get the address of the mdstruct */
	un = (mr_unit_t *)mddb_getrecaddr(mr_recid);
	/*
	 * It is okay that we muck with the mdstruct here,
	 * since no one else will know about the mdstruct
	 * until we commit it. If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet.
	 */

	/* copy in the user's mdstruct */
	if (err = ddi_copyin((caddr_t)(uintptr_t)msp->mdp, un,
	    msp->size, mode)) {
		mddb_deleterec_wrapper(mr_recid);
		return (EFAULT);
	}
	/* All 64 bit metadevices only support EFI labels. */
	if (msp->options & MD_CRO_64BIT) {
		un->c.un_flag |= MD_EFILABEL;
	}

	/*
	 * allocate the real recids array. since we may have to commit
	 * underlying metadevice records, we need an array of size:
	 * total number of components in raid + 3 (1 for the raid itself,
	 * one for the hotspare, one for the end marker).
	 */
	num_recs = un->un_totalcolumncnt + 3;
	rid = 0;
	recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
	recids[rid++] = mr_recid;

	MD_SID(un) = mnum;
	MD_RECID(un) = recids[0];
	MD_CAPAB(un) = MD_CAN_PARENT | MD_CAN_SP;
	MD_PARENT(un) = MD_NO_PARENT;
	un->un_resync_copysize = 0;
	un->c.un_revision |= MD_FN_META_DEV;

	if (UNIT_STATE(un) == RUS_INIT)
		MD_STATUS(un) |= MD_UN_GROW_PENDING;

	if ((UNIT_STATE(un) != RUS_INIT) && raid_check_pw(un)) {
		mddb_deleterec_wrapper(mr_recid);
		err = mderror(&msp->mde, MDE_RAID_INVALID);
		goto out;
	}

	if (err = raid_build_incore(un, 0)) {
		if (un->mr_ic) {
			kmem_free(un->un_column_ic, sizeof (mr_column_ic_t) *
			    un->un_totalcolumncnt);
			kmem_free(un->mr_ic, sizeof (*un->mr_ic));
		}

		md_nblocks_set(mnum, -1ULL);
		MD_UNIT(mnum) = NULL;

		mddb_deleterec_wrapper(mr_recid);
		goto out;
	}

	/*
	 * Update unit availability
	 */
	md_set[setno].s_un_avail--;

	recids[rid] = 0;
	if (un->un_hsp_id != -1) {
		/* increment the reference count of the hot spare pool */
		err = md_hot_spare_ifc(HSP_INCREF, un->un_hsp_id, 0, 0,
		    &recids[rid], NULL, NULL, NULL);
		if (err) {
			md_nblocks_set(mnum, -1ULL);
			MD_UNIT(mnum) = NULL;

			mddb_deleterec_wrapper(mr_recid);
			goto out;
		}
		rid++;
	}

	/*
	 * set the parent on any metadevice components.
	 * NOTE: currently soft partitions are the only metadevices
	 * which can appear within a RAID metadevice.
	 */
	for (col = 0; col < un->un_totalcolumncnt; col++) {
		mr_column_t	*mr_col = &un->un_column[col];
		md_unit_t	*comp_un;

		if (md_getmajor(mr_col->un_dev) == md_major) {
			comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
			recids[rid++] = MD_RECID(comp_un);
			md_set_parent(mr_col->un_dev, MD_SID(un));
		}
	}

	/* set the end marker */
	recids[rid] = 0;

	mddb_commitrecs_wrapper(recids);
	md_create_unit_incore(mnum, &raid_md_ops, 1);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_CREATE, SVM_TAG_METADEVICE, setno,
	    MD_SID(un));

out:
	kmem_free(recids, (num_recs * sizeof (mddb_recid_t)));
	if (err)
		return (err);

	/* only attempt to init a device that is in the init state */
	if (UNIT_STATE(un) != RUS_INIT)
		return (0);

	return (raid_init_unit(mnum, &msp->mde));
}

/*
 * NAME: raid_get
 * DESCRIPTION: used to get the unit structure of a RAID metadevice
 * PARAMETERS: md_i_get_t *migp - pointer to get data structure
 *	       int mode - must be FREAD
 *	       IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS: obtains unit reader lock via IOLOCK
 *
 */
static int
raid_get(
	void		*migp,
	int		mode,
	IOLOCK		*lock
)
{
	minor_t		mnum;
	mr_unit_t	*un;
	md_i_get_t	*migph = migp;


	mnum = migph->id;

	mdclrerror(&migph->mde);

	if ((un = raid_getun(mnum, &migph->mde,
	    RD_LOCK, lock)) == NULL)
		return (0);

	if (migph->size == 0) {
		migph->size = un->c.un_size;
		return (0);
	}

	if (migph->size < un->c.un_size) {
		return (EFAULT);
	}
	if (ddi_copyout(un, (void *)(uintptr_t)migph->mdp,
	    un->c.un_size, mode))
		return (EFAULT);

	return (0);
}


/*
 * NAME: raid_replace
 * DESCRIPTION: used to replace a component of a RAID metadevice
 * PARAMETERS: replace_params_t *mrp - pointer to replace data structure
 *	       IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun),
 *	  obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_replace(
	replace_params_t	*mrp,
	IOLOCK			*lock
)
{
	minor_t		mnum = mrp->mnum;
	md_dev64_t	odev = mrp->old_dev;
	md_error_t	*ep = &mrp->mde;
	mr_unit_t	*un;
	rcs_state_t	state;
	int		ix, col = -1;
	int		force = 0;
	int		err = 0;
	replace_cmd_t	cmd;
	set_t		setno;
	side_t		side;
	mdkey_t		devkey;
	int		nkeys;
	mddb_recid_t	extra_recids[3] = { 0, 0, 0 };
	int		extra_rids = 0;
	md_error_t	mde = mdnullerror;
	sv_dev_t	sv = {MD_SET_BAD, MD_SIDEWILD, MD_KEYWILD};

	mdclrerror(ep);
	setno = MD_MIN2SET(mnum);
	side = mddb_getsidenum(setno);

	un = md_unit_readerlock(MDI_UNIT(mnum));

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC) != 0)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));
	}

	if (un->un_state & RUS_DOI) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_RAID_DOI, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
	    (MD_STATUS(un) & MD_UN_GROW_PENDING)) {
		md_unit_readerexit(MDI_UNIT(mnum));
		return (mdmderror(ep, MDE_IN_USE, mnum));
	}

	md_unit_readerexit(MDI_UNIT(mnum));

	/* get locks and recheck to be sure something did not change */
	if ((un = raid_getun(mnum, ep, WRITERS, lock)) == NULL)
		return (0);

	if (md_getkeyfromdev(setno, side, odev, &devkey, &nkeys) != 0) {
		return (mddeverror(ep, MDE_NAME_SPACE, odev));
	}

	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		md_dev64_t tmpdevt = un->un_column[ix].un_orig_dev;
		/*
		 * Try to resolve devt again if NODEV64
		 */
		if (tmpdevt == NODEV64) {
			tmpdevt = md_resolve_bydevid(mnum, tmpdevt,
			    un->un_column[ix].un_orig_key);
			un->un_column[ix].un_orig_dev = tmpdevt;
		}

		if (un->un_column[ix].un_orig_dev == odev) {
			col = ix;
			break;
		} else {
			if (un->un_column[ix].un_orig_dev == NODEV64) {
				/*
				 * Now we use the keys to match.
				 * If no key found, continue.
				 */
				if (nkeys == 0) {
					continue;
				}
				if (un->un_column[ix].un_orig_key == devkey) {
					if (nkeys > 1)
						return (mddeverror(ep,
						    MDE_MULTNM, odev));
					col = ix;
					break;
				}
			}
		}
	}

	if (col == -1)
		return (mdcomperror(ep, MDE_CANT_FIND_COMP,
		    mnum, odev));

	if ((MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ||
	    (raid_state_cnt(un, RCS_RESYNC) != 0))
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));

	if (un->un_state & RUS_DOI)
		return (mdcomperror(ep, MDE_REPL_INVAL_STATE, mnum,
		    un->un_column[col].un_dev));

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT) ||
	    (MD_STATUS(un) & MD_UN_GROW_PENDING))
		return (mdmderror(ep, MDE_IN_USE, mnum));

	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == FORCE_REPLACE_COMP))
		force = 1;
	if ((mrp->cmd == FORCE_ENABLE_COMP) || (mrp->cmd == ENABLE_COMP))
		cmd = ENABLE_COMP;
	if ((mrp->cmd == FORCE_REPLACE_COMP) || (mrp->cmd == REPLACE_COMP))
		cmd = REPLACE_COMP;

	if (un->un_state == RUS_LAST_ERRED) {
		/* Must use -f force flag for unit in LAST_ERRED state */
		if (!force)
			return (mdmderror(ep, MDE_RAID_NEED_FORCE, mnum));

		/* Must use -f force flag on ERRED column first */
		if (un->un_column[col].un_devstate != RCS_ERRED) {
			for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
				if (un->un_column[ix].un_devstate & RCS_ERRED)
					return (mdcomperror(ep,
					    MDE_RAID_COMP_ERRED, mnum,
					    un->un_column[ix].un_dev));
			}
		}

		/* must use -f force flag on LAST_ERRED columns next */
		if ((un->un_column[col].un_devstate != RCS_LAST_ERRED) &&
		    (un->un_column[col].un_devstate != RCS_ERRED))
			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
			    mnum, un->un_column[col].un_dev));
	}

	if (un->un_state == RUS_ERRED) {
		if (! (un->un_column[col].un_devstate &
		    (RCS_ERRED | RCS_INIT_ERRED)))
			return (mdcomperror(ep, MDE_RAID_COMP_ERRED,
			    mnum, un->un_column[col].un_dev));
	}

	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_ALT_ISOPEN));
	ASSERT(!(un->un_column[col].un_devflags & MD_RAID_WRITE_ALT));

	state = un->un_column[col].un_devstate;
	if (state & RCS_INIT_ERRED) {
		MD_STATUS(un) |= MD_UN_GROW_PENDING;
		un->un_percent_done = 0;
		raid_set_state(un, col, RCS_INIT, 0);
	} else if (((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) &&
	    resync_request(mnum, col, 0, ep))
		return (mdmderror(ep, MDE_RESYNC_ACTIVE, mnum));


	if (cmd == REPLACE_COMP) {
		md_dev64_t tmpdev = mrp->new_dev;

		/*
		 * open the device by device id
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, mrp->new_key);
		if (md_layered_open(mnum, &tmpdev, MD_OFLG_NULL)) {
			return (mdcomperror(ep, MDE_COMP_OPEN_ERR, mnum,
			    tmpdev));
		}

		/*
		 * If it's a metadevice, make sure it gets reparented
		 */
		if (md_getmajor(tmpdev) == md_major) {
			minor_t		new_mnum = md_getminor(tmpdev);
			md_unit_t	*new_un = MD_UNIT(new_mnum);

			md_set_parent(tmpdev, MD_SID(un));
			extra_recids[extra_rids++] = MD_RECID(new_un);
		}

		mrp->new_dev = tmpdev;
		un->un_column[col].un_orig_dev = tmpdev;
		un->un_column[col].un_orig_key = mrp->new_key;
		un->un_column[col].un_orig_pwstart = mrp->start_blk;
		un->un_column[col].un_orig_devstart =
		    mrp->start_blk + un->un_pwsize;

		/*
		 * If the old device was a metadevice, make sure to
		 * reset its parent.
		 */
		if (md_getmajor(odev) == md_major) {
			minor_t		old_mnum = md_getminor(odev);
			md_unit_t	*old_un = MD_UNIT(old_mnum);

			md_reset_parent(odev);
			extra_recids[extra_rids++] =
			    MD_RECID(old_un);
		}

		if (HOTSPARED(un, col)) {
			md_layered_close(mrp->new_dev, MD_OFLG_NULL);
			un->un_column[col].un_alt_dev = mrp->new_dev;
			un->un_column[col].un_alt_pwstart = mrp->start_blk;
			un->un_column[col].un_alt_devstart =
			    mrp->start_blk + un->un_pwsize;
			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
		} else {
			/*
			 * not hot spared. Close the old device and
			 * move the new device in.
			 */
			if (un->un_column[col].un_devflags & MD_RAID_DEV_ISOPEN)
				md_layered_close(odev, MD_OFLG_NULL);
			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
			un->un_column[col].un_dev = mrp->new_dev;
			un->un_column[col].un_pwstart = mrp->start_blk;
			un->un_column[col].un_devstart =
			    mrp->start_blk + un->un_pwsize;
			if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0) {
				un->un_column[col].un_devflags |=
				    MD_RAID_REGEN_RESYNC;
			}
		}
		/*
		 * If the old device is not a metadevice then
		 * save off the set number and key so that it
		 * can be removed from the namespace later.
		 */
		if (md_getmajor(odev) != md_major) {
			sv.setno = setno;
			sv.key = devkey;
		}
	}

	if (cmd == ENABLE_COMP) {
		md_dev64_t tmpdev = un->un_column[col].un_orig_dev;
		mdkey_t raidkey = un->un_column[col].un_orig_key;

		/*
		 * We trust the dev_t because we cannot determine the
		 * dev_t from the device id since a new disk is in the
		 * same location. Since this is a call from metareplace -e dx
		 * AND it is SCSI, a new dev_t is not generated.  So the
		 * dev_t from the mddb is used. Before enabling the device
		 * we check to make sure that multiple entries for the same
		 * device do not exist in the namespace. If they do, we
		 * fail the ioctl.
		 * One of the many ways multiple entries in the name space
		 * can occur is if one removed the failed component in a
		 * RAID metadevice and put in another disk that was part of
		 * another metadevice. After reboot metadevadm would correctly
		 * update the device name for the metadevice whose component
		 * has moved. However, now in the metadb there are two entries
		 * for the same name (ctds) that belong to different
		 * metadevices. One is valid, the other is a ghost or "last
		 * known as" ctds.
		 */
		tmpdev = md_resolve_bydevid(mnum, tmpdev, raidkey);
		if (tmpdev == NODEV64)
			tmpdev = md_getdevnum(setno, side, raidkey,
			    MD_TRUST_DEVT);
		/*
		 * check for multiple entries in namespace for the
		 * same dev
		 */

		if (md_getkeyfromdev(setno, side, tmpdev, &devkey,
		    &nkeys) != 0)
			return (mddeverror(ep, MDE_NAME_SPACE, tmpdev));
		/*
		 * If the number of keys is greater than 1, then we
		 * have an invalid namespace. STOP and return.
		 */
		if (nkeys > 1)
			return (mddeverror(ep, MDE_MULTNM, tmpdev));
		if (devkey != raidkey)
			return (mdcomperror(ep, MDE_CANT_FIND_COMP,
			    mnum, tmpdev));

		if (un->un_column[col].un_orig_dev == NODEV64)
			un->un_column[col].un_orig_dev = tmpdev;

		if (HOTSPARED(un, col)) {
			un->un_column[col].un_alt_dev =
			    un->un_column[col].un_orig_dev;
			un->un_column[col].un_alt_pwstart =
			    un->un_column[col].un_orig_pwstart;
			un->un_column[col].un_alt_devstart =
			    un->un_column[col].un_orig_devstart;
			un->un_column[col].un_devflags |= MD_RAID_COPY_RESYNC;
		} else {
			if (!(un->un_column[col].un_devflags &
			    MD_RAID_DEV_ISOPEN)) {
				if (md_layered_open(mnum, &tmpdev,
				    MD_OFLG_NULL)) {
					un->un_column[col].un_dev = tmpdev;
					return (mdcomperror(ep,
					    MDE_COMP_OPEN_ERR, mnum, tmpdev));
				}
				ASSERT(tmpdev != NODEV64 &&
				    tmpdev != 0);

				if ((md_getmajor(tmpdev) != md_major) &&
				    (md_devid_found(setno, side, raidkey)
				    == 1)) {
					if (md_update_namespace_did(setno, side,
					    raidkey, &mde) != 0) {
						cmn_err(CE_WARN,
						    "md: could not"
						    " update namespace\n");
					}
				}
				un->un_column[col].un_dev =
				    un->un_column[col].un_orig_dev;
			}
			un->un_column[col].un_devflags |= MD_RAID_DEV_ISOPEN;
			un->un_column[col].un_devflags |= MD_RAID_REGEN_RESYNC;
		}
	}
	if (mrp->has_label) {
		un->un_column[col].un_devflags |= MD_RAID_HAS_LABEL;
	} else {
		un->un_column[col].un_devflags &= ~MD_RAID_HAS_LABEL;
	}

	raid_commit(un, extra_recids);

	/* If the component has been replaced - clean up the name space */
	if (sv.setno != MD_SET_BAD) {
		md_rem_names(&sv, 1);
	}

	md_ioctl_droplocks(lock);

	if ((cmd == ENABLE_COMP) || (cmd == FORCE_ENABLE_COMP)) {
		SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ENABLE, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));
	} else {
		SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_REPLACE, SVM_TAG_METADEVICE,
		    setno, MD_SID(un));
	}

	if (un->un_column[col].un_devstate & RCS_INIT)
		err = raid_init_unit(mnum, ep);
	else if ((mrp->options & MDIOCTL_NO_RESYNC_RAID) == 0)
		err = raid_resync_unit(mnum, ep);

	mdclrerror(ep);
	if (!err)
		return (0);

	/*
	 * The component state is already set by this time, so just
	 * fix the state and commit the record.
	 */
	un = md_unit_writerlock(MDI_UNIT(mnum));
	if (state & RCS_INIT_ERRED)
		raid_set_state(un, col, state, 1);
	else if (state & RCS_OKAY)
		raid_set_state(un, col, RCS_ERRED, 0);
	else
		raid_set_state(un, col, state, 1);
	raid_commit(un, NULL);
	md_unit_writerexit(MDI_UNIT(mnum));
	mdclrerror(ep);
	return (0);
}


/*
 * NAME: raid_set_sync
 * DESCRIPTION: used to sync a component of a RAID metadevice
 * PARAMETERS: md_resync_ioctl_t *rip - pointer to resync data structure
 *	       IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun),
 *	  obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_set_sync(
	md_resync_ioctl_t	*rip,
	IOLOCK			*lock
)
{
	minor_t		mnum = rip->ri_mnum;
	mr_unit_t	*un;
	int		init = 0;
	int		resync = 0;
	int		regen = 0;
	int		ix;
	int		err;

	mdclrerror(&rip->mde);

	if ((un = raid_getun(mnum, &rip->mde, WRITERS, lock)) == NULL)
		return (0);

	if (un->un_state & RUS_DOI)
		return (mdmderror(&rip->mde, MDE_RAID_DOI, mnum));

	if (un->c.un_status & MD_UN_RESYNC_ACTIVE)
		return (mdmderror(&rip->mde, MDE_RESYNC_ACTIVE, mnum));

	rip->ri_flags = 0;
	if (un->un_state & RUS_REGEN)
		regen++;

	if (raid_state_cnt(un, RCS_RESYNC))
		resync++;

	if (raid_state_cnt(un, RCS_INIT) || (un->un_state & RUS_INIT))
		init++;

	ASSERT(!(resync && init && regen));
	md_ioctl_droplocks(lock);
	rip->ri_percent_done = 0;

	if (init) {
		MD_STATUS(un) |= MD_UN_GROW_PENDING;
		return (raid_init_unit(mnum, &rip->mde));
	}

	/*
	 * If resync is needed, it will call raid_internal_open forcing
	 * replay before the open completes.
	 * Otherwise, call raid_internal_open directly to force
	 * replay to complete during boot (metasync -r).
	 * NOTE: the unit writer lock must remain held while setting
	 *	 MD_UN_RESYNC_ACTIVE but must be released before
	 *	 calling raid_resync_unit or raid_internal_open.
	 */
	if (resync) {
		ASSERT(resync < 2);
		un = md_unit_writerlock(MDI_UNIT(mnum));
		MD_STATUS(un) |= MD_UN_RESYNC_ACTIVE;
		/* Must release unit writer lock for resync */
		/*
		 * correctly set up the devices before trying to start the
		 * resync operation.
		 */
		for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
			if (un->un_column[ix].un_devstate & RCS_RESYNC) {
				if ((un->un_column[ix].un_devflags &
				    MD_RAID_COPY_RESYNC) &&
				    HOTSPARED(un, ix)) {
					un->un_column[ix].un_alt_dev =
					    un->un_column[ix].un_orig_dev;
					un->un_column[ix].un_alt_devstart =
					    un->un_column[ix].un_orig_devstart;
					un->un_column[ix].un_alt_pwstart =
					    un->un_column[ix].un_orig_pwstart;
				}
				break;
			}
		}
		ASSERT(un->un_column[ix].un_devflags &
		    (MD_RAID_COPY_RESYNC | MD_RAID_REGEN_RESYNC));
		rip->ri_percent_done = 0;
		un->un_column[ix].un_devflags |= MD_RAID_RESYNC;
		(void) resync_request(mnum, ix, 0, NULL);
		md_unit_writerexit(MDI_UNIT(mnum));
		err = raid_resync_unit(mnum, &rip->mde);
		return (err);
	}

	if (regen) {
		err = raid_regen_unit(mnum, &rip->mde);
		return (err);
	}

	/* The unit requires no work, so just force replay of the device */
	if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0))
		return (mdmderror(&rip->mde,
		    MDE_RAID_OPEN_FAILURE, mnum));
	(void) raid_internal_close(mnum, OTYP_LYR, 0, 0);

	return (0);
}

/*
 * NAME: raid_get_resync
 * DESCRIPTION: used to check resync status on a component of a RAID metadevice
 * PARAMETERS: md_resync_ioctl_t *rip - pointer to resync data structure
 *	       IOLOCK *lock - pointer to IOCTL lock
 *
 * LOCKS: obtains unit reader lock via IOLOCK (through raid_getun)
 *
 */
static int
raid_get_resync(
	md_resync_ioctl_t	*rip,
	IOLOCK			*lock
)
{
	minor_t		mnum = rip->ri_mnum;
	mr_unit_t	*un;
	u_longlong_t	percent;
	int		cnt;
	int		ix;
	uint64_t	d;

	mdclrerror(&rip->mde);

	if ((un = raid_getun(mnum, &rip->mde, RD_LOCK, lock)) == NULL)
		return (0);

	rip->ri_flags = 0;
	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
		d = un->un_segsincolumn;
		percent = d ? ((1000 * un->un_resync_line_index) / d) : 0;
		if (percent > 1000)
			percent = 1000;	/* can't go over 100% */
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_RI_INPROGRESS;
	}

	if (UNIT_STATE(un) & RUS_INIT) {
		d = un->un_segsize * un->un_segsincolumn *
		    un->un_totalcolumncnt;
		percent =
		    d ? ((1000 * (u_longlong_t)un->un_init_iocnt) / d) : 0;
		if (percent > 1000)
			percent = 1000;	/* can't go over 100% */
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_GROW_INPROGRESS;
	} else if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		d = un->un_segsize * un->un_segsincolumn * un->un_init_colcnt;
		percent =
		    d ? (((u_longlong_t)un->un_init_iocnt * 1000) / d) : 0;
		if (percent > 1000)
			percent = 1000;
		rip->ri_percent_done = (int)percent;
		rip->ri_flags |= MD_GROW_INPROGRESS;
	}

	if (un->un_state & RUS_REGEN)
		rip->ri_percent_done = un->un_percent_done;

	cnt = 0;
	for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
		switch (un->un_column[ix].un_devstate) {
		case RCS_INIT:
		case RCS_ERRED:
		case RCS_LAST_ERRED:
			cnt++;
			break;
		default:
			break;
		}
	}
	d = un->un_totalcolumncnt;
	rip->ri_percent_dirty = d ? (((u_longlong_t)cnt * 100) / d) : 0;
	return (0);
}

/*
 * NAME: raid_grow
 * DESCRIPTION: Concatenate to a RAID metadevice
 * PARAMETERS: md_grow_params_t *mgp
 *			- pointer to IOCGROW data structure
 *	       int mode - must be FWRITE
 *	       IOLOCK *lock - IOCTL read/write and unit_array_rw lock
 *
 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun),
 *	  obtains and releases md_unit_array_rw write lock
 *
 */
static int
raid_grow(void *mgp, int mode, IOLOCK *lock)
{
	minor_t		mnum;
	mr_unit_t	*un, *new_un;
	mdi_unit_t	*ui;
	mddb_type_t	typ1;
	mddb_recid_t	mr_recid;
	mddb_recid_t	old_vtoc = 0;
	mddb_recid_t	*recids;
	md_create_rec_option_t options;
	int		err;
	int		col, i;
	int64_t		tb, atb;
	u_longlong_t	unrev;
	int		tc;
	int		rval = 0;
	set_t		setno;
	mr_column_ic_t	*mrc;
	int		num_recs, rid;
	md_grow_params_t	*mgph = mgp;


	mnum = mgph->mnum;

	mdclrerror(&mgph->mde);

	ui = MDI_UNIT(mnum);
	un = md_unit_readerlock(ui);

	if (MD_STATUS(un) & MD_UN_GROW_PENDING) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
	}

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));
	}

	if (UNIT_STATE(un) & RUS_LAST_ERRED) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));
	}

	if (UNIT_STATE(un) & RUS_DOI) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));
	}

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT)) {
		md_unit_readerexit(ui);
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));
	}

	md_unit_readerexit(ui);

	if ((un = raid_getun(mnum, &mgph->mde, WRITERS, lock)) ==
	    NULL)
		return (0);

	if (MD_STATUS(un) & MD_UN_GROW_PENDING)
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));

	if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
		return (mdmderror(&mgph->mde, MDE_RESYNC_ACTIVE, mnum));

	if (un->c.un_size >= mgph->size)
		return (EINVAL);

	if (UNIT_STATE(un) & RUS_LAST_ERRED)
		return (mdmderror(&mgph->mde, MDE_RAID_LAST_ERRED, mnum));

	if (UNIT_STATE(un) & RUS_DOI)
		return (mdmderror(&mgph->mde, MDE_RAID_DOI, mnum));

	if ((raid_state_cnt(un, RCS_INIT) != 0) || (un->un_state & RUS_INIT))
		return (mdmderror(&mgph->mde, MDE_IN_USE, mnum));

	setno = MD_MIN2SET(mnum);

	typ1 = (mddb_type_t)md_getshared_key(setno,
	    raid_md_ops.md_driver.md_drivername);

	/*
	 * Preserve the friendly name nature of the device that is
	 * growing.
	 */
	options = MD_CRO_RAID;
	if (un->c.un_revision & MD_FN_META_DEV)
		options |= MD_CRO_FN;
	if (mgph->options & MD_CRO_64BIT) {
#if defined(_ILP32)
		return (mdmderror(&mgph->mde, MDE_UNIT_TOO_LARGE, mnum));
#else
		mr_recid = mddb_createrec(mgph->size, typ1, 0,
		    MD_CRO_64BIT | options, setno);
#endif
	} else {
		mr_recid = mddb_createrec(mgph->size, typ1, 0,
		    MD_CRO_32BIT | options, setno);
	}
	if (mr_recid < 0) {
		rval = mddbstatus2error(&mgph->mde, (int)mr_recid,
		    mnum, setno);
		return (rval);
	}

	/* get the address of the new unit */
	new_un = (mr_unit_t *)mddb_getrecaddr(mr_recid);

	/*
	 * It is okay that we muck with the new unit here,
	 * since no one else will know about the unit struct
	 * until we commit it. If we crash, the record will
	 * be automatically purged, since we haven't
	 * committed it yet and the old unit struct will be found.
	 */

	/* copy in the user's unit struct */
	err = ddi_copyin((void *)(uintptr_t)mgph->mdp, new_un,
	    mgph->size, mode);
	if (err) {
		mddb_deleterec_wrapper(mr_recid);
		return (EFAULT);
	}

	/* make sure columns are being added */
	if (un->un_totalcolumncnt >= new_un->un_totalcolumncnt) {
		mddb_deleterec_wrapper(mr_recid);
		return (EINVAL);
	}

	/*
	 * Save a few of the new unit struct's fields before they get
	 * clobbered.
	 */
1814 tc = new_un->un_totalcolumncnt;
1815 tb = new_un->c.un_total_blocks;
1816 atb = new_un->c.un_actual_tb;
1817 unrev = new_un->c.un_revision;
1818
1819 /*
1820 * Copy the old unit struct (static stuff)
1821 * into new unit struct
1822 */
1823 bcopy((caddr_t)un, (caddr_t)new_un, un->c.un_size);
1824
1825 /*
1826 * Restore a few of the new unit struct values.
1827 */
1828 new_un->un_totalcolumncnt = tc;
1829 new_un->c.un_actual_tb = atb;
1830 new_un->un_grow_tb = tb;
1831 new_un->c.un_revision = unrev;
1832 new_un->c.un_record_id = mr_recid;
1833 new_un->c.un_size = mgph->size;
1834
1835 ASSERT(new_un->mr_ic == un->mr_ic);
1836
1837 /*
1838 * Save old column slots
1839 */
1840 mrc = un->un_column_ic;
1841
1842 /*
1843 * Allocate new column slot
1844 */
1845 new_un->un_column_ic = (mr_column_ic_t *)
1846 kmem_zalloc(sizeof (mr_column_ic_t) * new_un->un_totalcolumncnt,
1847 KM_SLEEP);
1848
1849 /*
1850 * Restore old column slots
1851 * Free the old column slots
1852 */
1853 bcopy(mrc, new_un->un_column_ic,
1854 sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1855 kmem_free(mrc, sizeof (mr_column_ic_t) * un->un_totalcolumncnt);
1856
1857 /* All 64 bit metadevices only support EFI labels. */
1858 if (mgph->options & MD_CRO_64BIT) {
1859 new_un->c.un_flag |= MD_EFILABEL;
1860 /*
1861 * If the device was previously smaller than a terabyte,
1862 * and had a vtoc record attached to it, we remove the
1863 * vtoc record, because the layout has changed completely.
1864 */
1865 if (((un->c.un_revision & MD_64BIT_META_DEV) == 0) &&
1866 (un->c.un_vtoc_id != 0)) {
1867 old_vtoc = un->c.un_vtoc_id;
1868 new_un->c.un_vtoc_id =
1869 md_vtoc_to_efi_record(old_vtoc, setno);
1870 }
1871 }
1872
1873
1874 /*
1875 * allocate the real recids array. since we may have to commit
1876 * underlying metadevice records, we need an array of size:
1877 * total number of new components being attach + 2 (one for the
1878 * raid itself, one for the end marker).
1879 */
1880 num_recs = new_un->un_totalcolumncnt + 2;
1881 rid = 0;
1882 recids = kmem_alloc(num_recs * sizeof (mddb_recid_t), KM_SLEEP);
1883 recids[rid++] = mr_recid;
1884
1885 for (col = un->un_totalcolumncnt;
1886 (col < new_un->un_totalcolumncnt); col++) {
1887 mr_column_t *mr_col = &new_un->un_column[col];
1888 md_unit_t *comp_un;
1889
1890 if (raid_build_pw_reservation(new_un, col) != 0) {
1891 /* release pwslots already allocated by grow */
1892 for (i = un->un_totalcolumncnt; i < col; i++) {
1893 raid_free_pw_reservation(new_un, i);
1894 }
1895 kmem_free(new_un->un_column_ic,
1896 sizeof (mr_column_ic_t) *
1897 new_un->un_totalcolumncnt);
1898 kmem_free(new_un->mr_ic, sizeof (*un->mr_ic));
1899 kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1900 mddb_deleterec_wrapper(mr_recid);
1901 return (EINVAL);
1902 }
1903 /*
1904 * set parent on metadevices being added.
1905 * NOTE: currently soft partitions are the only metadevices
1906 * which can appear within a RAID metadevice.
1907 */
1908 if (md_getmajor(mr_col->un_dev) == md_major) {
1909 comp_un = MD_UNIT(md_getminor(mr_col->un_dev));
1910 recids[rid++] = MD_RECID(comp_un);
1911 md_set_parent(mr_col->un_dev, MD_SID(new_un));
1912 }
1913 new_un->un_column[col].un_devflags = 0;
1914 }
1915
1916 /* set end marker */
1917 recids[rid] = 0;
1918
1919 /* commit new unit struct */
1920 mddb_commitrecs_wrapper(recids);
1921
1922 /* delete old unit struct */
1923 mddb_deleterec_wrapper(un->c.un_record_id);
1924
1925 /* place new unit in in-core array */
1926 md_nblocks_set(mnum, new_un->c.un_total_blocks);
1927 MD_UNIT(mnum) = new_un;
1928
1929 	/*
1930 	 * If old_vtoc has a non-zero value, we know:
1931 	 * - this unit just grew across the one terabyte boundary,
1932 	 * - there was a vtoc record for the unit, and
1933 	 * - that vtoc record is no longer needed, because
1934 	 *   a new efi record has been created for this unit.
1935 	 */
1936 if (old_vtoc != 0) {
1937 mddb_deleterec_wrapper(old_vtoc);
1938 }
1939
1940 /* free recids */
1941 kmem_free(recids, num_recs * sizeof (mddb_recid_t));
1942
1943 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_GROW, SVM_TAG_METADEVICE,
1944 MD_UN2SET(new_un), MD_SID(new_un));
1945 MD_STATUS(new_un) |= MD_UN_GROW_PENDING;
1946
1947 	/*
1948 	 * Since md_ioctl_writelock acquires the unit write lock
1949 	 * and open/close acquires the unit reader lock, it is
1950 	 * necessary to drop the unit write lock here and then
1951 	 * reacquire it as needed later.
1952 	 */
1953 md_unit_writerexit(ui);
1954
1955 if (raid_internal_open(mnum, (FREAD | FWRITE), OTYP_LYR, 0)) {
1956 rval = mdmderror(&mgph->mde, MDE_RAID_OPEN_FAILURE, mnum);
1957 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL, SVM_TAG_METADEVICE,
1958 MD_UN2SET(new_un), MD_SID(new_un));
1959 return (rval);
1960 }
1961 (void) md_unit_writerlock(ui);
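	/*
	 * With the unit open and the writer lock re-taken, initialize
	 * the pre-write area of every column that is in the okay state,
	 * so the newly attached columns start out with valid pre-write
	 * areas.
	 */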
1962 for (i = 0; i < new_un->un_totalcolumncnt; i++) {
1963 if (new_un->un_column[i].un_devstate & RCS_OKAY)
1964 (void) init_pw_area(new_un, new_un->un_column[i].un_dev,
1965 new_un->un_column[i].un_pwstart, i);
1966 }
1967 md_unit_writerexit(ui);
1968 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
1969 (void) md_unit_writerlock(ui);
1970 /* create a background thread to initialize the columns */
1971 md_ioctl_droplocks(lock);
1972
1973 return (raid_init_unit(mnum, &mgph->mde));
1974 }
1975
1976 /*
1977 * NAME: raid_reset
1978 * DESCRIPTION: used to reset (clear / remove) a RAID metadevice
1979 * PARAMETERS: md_i_reset_t *mirp - pointer to reset data structure
1980 *
1981 * LOCKS: obtains and releases md_unit_array_rw write lock
1982 *
1983 */
1984 static int
1985 raid_reset(md_i_reset_t *mirp)
1986 {
1987 minor_t mnum = mirp->mnum;
1988 mr_unit_t *un;
1989 mdi_unit_t *ui;
1990 set_t setno = MD_MIN2SET(mnum);
1991
1992 mdclrerror(&mirp->mde);
1993
1994 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1995 /*
1996 * NOTE: need to get md_unit_writerlock to avoid conflict
1997 * with raid_init thread.
1998 */
1999 if ((un = raid_getun(mnum, &mirp->mde, NO_LOCK, NULL)) ==
2000 NULL) {
2001 rw_exit(&md_unit_array_rw.lock);
2002 return (0);
2003 }
2004 ui = MDI_UNIT(mnum);
2005
2006 if (MD_HAS_PARENT(MD_PARENT(un))) {
2007 rw_exit(&md_unit_array_rw.lock);
2008 return (mdmderror(&mirp->mde, MDE_IN_USE, mnum));
2009 }
2010
2011 un = (mr_unit_t *)md_unit_openclose_enter(ui);
2012 if (md_unit_isopen(MDI_UNIT(mnum))) {
2013 md_unit_openclose_exit(ui);
2014 rw_exit(&md_unit_array_rw.lock);
2015 return (mdmderror(&mirp->mde, MDE_IS_OPEN, mnum));
2016 }
2017 md_unit_openclose_exit(ui);
2018 if (UNIT_STATE(un) != RUS_OKAY && !mirp->force) {
2019 rw_exit(&md_unit_array_rw.lock);
2020 return (mdmderror(&mirp->mde, MDE_RAID_NEED_FORCE, mnum));
2021 }
2022
2023 reset_raid(un, mnum, 1);
2024
2025 /*
2026 * Update unit availability
2027 */
2028 md_set[setno].s_un_avail++;
2029
2030 	/*
2031 	 * If this is a multi-node set, reset s_un_next so all nodes
2032 	 * have the same view of the next available slot when nodes
2033 	 * are withdrawn (-w) and rejoined (-j).
2034 	 */
2035 if (MD_MNSET_SETNO(setno)) {
2036 (void) md_upd_set_unnext(setno, MD_MIN2UNIT(mnum));
2037 }
2038
2039 rw_exit(&md_unit_array_rw.lock);
2040
2041 return (0);
2042 }
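
/*
 * Illustrative sketch (not part of this driver): a userland caller
 * reaches raid_reset() by issuing MD_IOCRESET against the md admin
 * device. Assuming the usual /dev/md/admin node, the call has
 * roughly this shape:
 *
 *	md_i_reset_t	mir;
 *	int		fd = open("/dev/md/admin", O_RDWR);
 *
 *	(void) memset(&mir, 0, sizeof (mir));
 *	mir.mnum = mnum;	(minor number of the raid unit)
 *	mir.force = 0;		(nonzero forces reset of a non-okay unit)
 *	(void) ioctl(fd, MD_IOCRESET, &mir);
 *
 * In practice libmeta constructs these requests on behalf of commands
 * such as metaclear(1M); mir.mde carries any driver-reported error
 * back to the caller.
 */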
2043
2044 /*
2045 * NAME: raid_get_geom
2046 * DESCRIPTION: used to get the geometry of a RAID metadevice
2047 * PARAMETERS: mr_unit_t *un - RAID unit to get the geometry for
2048 * struct dk_geom *gp - pointer to geometry data structure
2049 *
2050 * LOCKS: none
2051 *
2052 */
2053 static int
2054 raid_get_geom(
2055 mr_unit_t *un,
2056 struct dk_geom *geomp
2057 )
2058 {
2059 md_get_geom((md_unit_t *)un, geomp);
2060
2061 return (0);
2062 }
2063
2064 /*
2065 * NAME: raid_get_vtoc
2066 * DESCRIPTION: used to get the VTOC on a RAID metadevice
2067 * PARAMETERS: mr_unit_t *un - RAID unit to get the VTOC from
2068 * struct vtoc *vtocp - pointer to VTOC data structure
2069 *
2070 * LOCKS: none
2071 *
2072 */
2073 static int
2074 raid_get_vtoc(
2075 mr_unit_t *un,
2076 struct vtoc *vtocp
2077 )
2078 {
2079 md_get_vtoc((md_unit_t *)un, vtocp);
2080
2081 return (0);
2082 }
2083
2084 /*
2085 * NAME: raid_set_vtoc
2086 * DESCRIPTION: used to set the VTOC on a RAID metadevice
2087 * PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on
2088 * struct vtoc *vtocp - pointer to VTOC data structure
2089 *
2090 * LOCKS: none
2091 *
2092 */
2093 static int
2094 raid_set_vtoc(
2095 mr_unit_t *un,
2096 struct vtoc *vtocp
2097 )
2098 {
2099 return (md_set_vtoc((md_unit_t *)un, vtocp));
2100 }
2101
2102
2103 /*
2104 * NAME: raid_get_extvtoc
2105 * DESCRIPTION: used to get the extended VTOC on a RAID metadevice
2106 * PARAMETERS: mr_unit_t *un - RAID unit to get the VTOC from
2107 * struct extvtoc *vtocp - pointer to extended VTOC data structure
2108 *
2109 * LOCKS: none
2110 *
2111 */
2112 static int
2113 raid_get_extvtoc(
2114 mr_unit_t *un,
2115 struct extvtoc *vtocp
2116 )
2117 {
2118 md_get_extvtoc((md_unit_t *)un, vtocp);
2119
2120 return (0);
2121 }
2122
2123 /*
2124 * NAME: raid_set_extvtoc
2125 * DESCRIPTION: used to set the extended VTOC on a RAID metadevice
2126 * PARAMETERS: mr_unit_t *un - RAID unit to set the VTOC on
2127 * struct extvtoc *vtocp - pointer to extended VTOC data structure
2128 *
2129 * LOCKS: none
2130 *
2131 */
2132 static int
2133 raid_set_extvtoc(
2134 mr_unit_t *un,
2135 struct extvtoc *vtocp
2136 )
2137 {
2138 return (md_set_extvtoc((md_unit_t *)un, vtocp));
2139 }
2140
2141
2142
2143 /*
2144 * NAME: raid_get_cgapart
2145 * DESCRIPTION: used to get the dk_map on a RAID metadevice
2146  * PARAMETERS: mr_unit_t *un - RAID unit to get the dk_map from
2147  *		struct dk_map *dkmapp - pointer to dk_map data structure
2148 *
2149 * LOCKS: none
2150 *
2151 */
2152
2153 static int
2154 raid_get_cgapart(
2155 mr_unit_t *un,
2156 struct dk_map *dkmapp
2157 )
2158 {
2159 md_get_cgapart((md_unit_t *)un, dkmapp);
2160 return (0);
2161 }
2162
2163 /*
2164 * NAME: raid_getdevs
2165 * DESCRIPTION: return all devices within a RAID metadevice
2166 * PARAMETERS: md_getdevs_params_t *mgdp
2167 * - pointer to getdevs IOCTL data structure
2168 * int mode - should be FREAD
2169 * IOLOCK *lockp - IOCTL read/write lock
2170 *
2171 * LOCKS: obtains unit reader lock via IOLOCK
2172 *
2173 */
2174 static int
2175 raid_getdevs(
2176 void *mgdp,
2177 int mode,
2178 IOLOCK *lock
2179 )
2180 {
2181 minor_t mnum;
2182 mr_unit_t *un;
2183 md_dev64_t *udevs;
2184 int i, cnt;
2185 md_dev64_t unit_dev;
2186 md_getdevs_params_t *mgdph = mgdp;
2187
2188
2189 mnum = mgdph->mnum;
2190
2191 /* check out unit */
2192 mdclrerror(&mgdph->mde);
2193
2194 if ((un = raid_getun(mnum, &mgdph->mde, RD_LOCK, lock)) == NULL)
2195 return (0);
2196
2197 udevs = (md_dev64_t *)(uintptr_t)mgdph->devs;
2198
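	/*
	 * Each column contributes its original device and, when a hot
	 * spare is active, the hot spare device as well; cnt counts
	 * every device seen, while copies are done only for the slots
	 * that fit in the caller-supplied array.
	 */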
2199 for (cnt = 0, i = 0; i < un->un_totalcolumncnt; i++, cnt++) {
2200 if (cnt < mgdph->cnt) {
2201 unit_dev = un->un_column[i].un_orig_dev;
2202 if (md_getmajor(unit_dev) != md_major) {
2203 if ((unit_dev = md_xlate_mini_2_targ
2204 (unit_dev)) == NODEV64)
2205 return (ENODEV);
2206 }
2207
2208 if (ddi_copyout((caddr_t)&unit_dev,
2209 (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2210 return (EFAULT);
2211 }
2212 if (HOTSPARED(un, i)) {
2213 cnt++;
2214 if (cnt >= mgdph->cnt)
2215 continue;
2216
2217 unit_dev = un->un_column[i].un_dev;
2218 if (md_getmajor(unit_dev) != md_major) {
2219 if ((unit_dev = md_xlate_mini_2_targ
2220 (unit_dev)) == NODEV64)
2221 return (ENODEV);
2222 }
2223
2224 if (ddi_copyout((caddr_t)&unit_dev,
2225 (caddr_t)&udevs[cnt], sizeof (*udevs), mode) != 0)
2226 return (EFAULT);
2227 }
2228 }
2229 mgdph->cnt = cnt;
2230 return (0);
2231 }
2232
2233 /*
2234 * NAME: raid_change
2235 * DESCRIPTION: used to change the following dynamic values:
2236 * the hot spare pool
2237 * in the unit structure of a RAID metadevice
2238  * PARAMETERS: md_raid_params_t *mrp - pointer to change data structure
2239 * IOLOCK *lock - pointer to IOCTL lock
2240 *
2241 * LOCKS: obtains unit writer lock via IOLOCK (through raid_getun)
2242 *
2243 */
2244 static int
2245 raid_change(
2246 md_raid_params_t *mrp,
2247 IOLOCK *lock
2248 )
2249 {
2250 minor_t mnum = mrp->mnum;
2251 mr_unit_t *un;
2252 int ix;
2253 mddb_recid_t recids[3] = {0, 0, 0};
2254 int err;
2255 int irecid;
2256 int inc_new_hsp = 0;
2257
2258 mdclrerror(&mrp->mde);
2259
2260 if ((un = raid_getun(mnum, &mrp->mde, WR_LOCK, lock)) == NULL)
2261 return (0);
2262
2263 if (!mrp->params.change_hsp_id)
2264 return (0);
2265
2266 /* verify that no hotspare is in use */
2267 for (ix = 0; ix < un->un_totalcolumncnt; ix++) {
2268 if (HOTSPARED(un, ix)) {
2269 return (mdmderror(&mrp->mde, MDE_HS_IN_USE, mnum));
2270 }
2271 }
2272
2273 /* replace the hot spare pool */
2274
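	/*
	 * The swap is done in two steps with rollback: bump the new
	 * pool's reference count first, and if releasing the old pool
	 * then fails, undo that bump so the counts stay consistent.
	 */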
2275 irecid = 0;
2276 if (mrp->params.hsp_id != -1) {
2277 /* increment the reference count of the new hsp */
2278 err = md_hot_spare_ifc(HSP_INCREF, mrp->params.hsp_id, 0, 0,
2279 &recids[0], NULL, NULL, NULL);
2280 if (err) {
2281 return (mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2282 mrp->params.hsp_id));
2283 }
2284 inc_new_hsp = 1;
2285 irecid++;
2286 }
2287
2288 if (un->un_hsp_id != -1) {
2289 /* decrement the reference count of the old hsp */
2290 err = md_hot_spare_ifc(HSP_DECREF, un->un_hsp_id, 0, 0,
2291 &recids[irecid], NULL, NULL, NULL);
2292 if (err) {
2293 err = mdhsperror(&mrp->mde, MDE_INVAL_HSP,
2294 mrp->params.hsp_id);
2295 if (inc_new_hsp) {
2296 (void) md_hot_spare_ifc(HSP_DECREF,
2297 mrp->params.hsp_id, 0, 0,
2298 &recids[0], NULL, NULL, NULL);
2299 /*
2300 * Don't need to commit the record,
2301 * because it wasn't committed before
2302 */
2303 }
2304 return (err);
2305 }
2306 }
2307
2308 un->un_hsp_id = mrp->params.hsp_id;
2309
2310 raid_commit(un, recids);
2311 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_CHANGE, SVM_TAG_METADEVICE,
2312 MD_UN2SET(un), MD_SID(un));
2313
2314 /* Now trigger hot spare processing in case one is needed. */
2315 if ((un->un_hsp_id != -1) && (un->un_state == RUS_ERRED))
2316 (void) raid_hotspares();
2317
2318 return (0);
2319 }
2320
2321 /*
2322 * NAME: raid_admin_ioctl
2323 * DESCRIPTION: IOCTL operations unique to metadevices and RAID
2324 * PARAMETERS: int cmd - IOCTL command to be executed
2325 * void *data - pointer to IOCTL data structure
2326 * int mode - either FREAD or FWRITE
2327 * IOLOCK *lockp - IOCTL read/write lock
2328 *
2329 * LOCKS: none
2330 *
2331 */
2332 static int
2333 raid_admin_ioctl(
2334 int cmd,
2335 void *data,
2336 int mode,
2337 IOLOCK *lockp
2338 )
2339 {
2340 size_t sz = 0;
2341 void *d = NULL;
2342 int err = 0;
2343
2344 /* We can only handle 32-bit clients for internal commands */
2345 if ((mode & DATAMODEL_MASK) != DATAMODEL_ILP32) {
2346 return (EINVAL);
2347 }
2348
2349
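	/*
	 * The commands below share a common pattern: allocate a kernel
	 * buffer for the ioctl argument, copy the argument in, dispatch
	 * to the worker routine, and finally (at the bottom of this
	 * function) copy the possibly updated argument back out and
	 * free the buffer. Commands that fail the access check return
	 * before any allocation.
	 */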
2350 /* dispatch ioctl */
2351 switch (cmd) {
2352
2353 case MD_IOCSET:
2354 {
2355 if (! (mode & FWRITE))
2356 return (EACCES);
2357
2358 sz = sizeof (md_set_params_t);
2359 d = kmem_alloc(sz, KM_SLEEP);
2360
2361 if (ddi_copyin(data, d, sz, mode)) {
2362 err = EFAULT;
2363 break;
2364 }
2365
2366 err = raid_set(d, mode);
2367 break;
2368 }
2369
2370 case MD_IOCGET:
2371 {
2372 if (! (mode & FREAD))
2373 return (EACCES);
2374
2375 sz = sizeof (md_i_get_t);
2376 d = kmem_alloc(sz, KM_SLEEP);
2377
2378 if (ddi_copyin(data, d, sz, mode)) {
2379 err = EFAULT;
2380 break;
2381 }
2382
2383 err = raid_get(d, mode, lockp);
2384 break;
2385 }
2386
2387 case MD_IOCREPLACE:
2388 {
2389 if (! (mode & FWRITE))
2390 return (EACCES);
2391
2392 sz = sizeof (replace_params_t);
2393 d = kmem_alloc(sz, KM_SLEEP);
2394
2395 if (ddi_copyin(data, d, sz, mode)) {
2396 err = EFAULT;
2397 break;
2398 }
2399
2400 err = raid_replace((replace_params_t *)d, lockp);
2401 break;
2402 }
2403
2404 case MD_IOCSETSYNC:
2405 {
2406 if (! (mode & FWRITE))
2407 return (EACCES);
2408
2409 sz = sizeof (md_resync_ioctl_t);
2410 d = kmem_alloc(sz, KM_SLEEP);
2411
2412 if (ddi_copyin(data, d, sz, mode)) {
2413 err = EFAULT;
2414 break;
2415 }
2416
2417 err = raid_set_sync((md_resync_ioctl_t *)d, lockp);
2418 break;
2419 }
2420
2421 case MD_IOCGETSYNC:
2422 {
2423 if (! (mode & FREAD))
2424 return (EACCES);
2425
2426 sz = sizeof (md_resync_ioctl_t);
2427 d = kmem_alloc(sz, KM_SLEEP);
2428
2429 if (ddi_copyin(data, d, sz, mode)) {
2430 err = EFAULT;
2431 break;
2432 }
2433 err = raid_get_resync((md_resync_ioctl_t *)d, lockp);
2434
2435 break;
2436 }
2437
2438 case MD_IOCGROW:
2439 {
2440 if (! (mode & FWRITE))
2441 return (EACCES);
2442
2443 sz = sizeof (md_grow_params_t);
2444 d = kmem_alloc(sz, KM_SLEEP);
2445
2446 if (ddi_copyin(data, d, sz, mode)) {
2447 err = EFAULT;
2448 break;
2449 }
2450
2451 err = raid_grow(d, mode, lockp);
2452 break;
2453 }
2454
2455 case MD_IOCCHANGE:
2456 {
2457 if (! (mode & FWRITE))
2458 return (EACCES);
2459
2460 sz = sizeof (md_raid_params_t);
2461 d = kmem_alloc(sz, KM_SLEEP);
2462
2463 if (ddi_copyin(data, d, sz, mode)) {
2464 err = EFAULT;
2465 break;
2466 }
2467
2468 err = raid_change((md_raid_params_t *)d, lockp);
2469 break;
2470 }
2471
2472 case MD_IOCRESET:
2473 {
2474 if (! (mode & FWRITE))
2475 return (EACCES);
2476
2477 sz = sizeof (md_i_reset_t);
2478 d = kmem_alloc(sz, KM_SLEEP);
2479
2480 if (ddi_copyin(data, d, sz, mode)) {
2481 err = EFAULT;
2482 break;
2483 }
2484
2485 err = raid_reset((md_i_reset_t *)d);
2486 break;
2487 }
2488
2489 case MD_IOCGET_DEVS:
2490 {
2491 if (! (mode & FREAD))
2492 return (EACCES);
2493
2494 sz = sizeof (md_getdevs_params_t);
2495 d = kmem_alloc(sz, KM_SLEEP);
2496
2497 if (ddi_copyin(data, d, sz, mode)) {
2498 err = EFAULT;
2499 break;
2500 }
2501
2502 err = raid_getdevs(d, mode, lockp);
2503 break;
2504 }
2505
2506 case MD_IOCSETREGEN:
2507 {
2508 if (! (mode & FWRITE))
2509 return (EACCES);
2510
2511 sz = sizeof (md_regen_param_t);
2512 d = kmem_alloc(sz, KM_SLEEP);
2513
2514 if (ddi_copyin(data, d, sz, mode)) {
2515 err = EFAULT;
2516 break;
2517 }
2518
2519 err = raid_regen((md_regen_param_t *)d, lockp);
2520 break;
2521 }
2522
2523 case MD_IOCPROBE_DEV:
2524 {
2525 md_probedev_impl_t *p = NULL;
2526 md_probedev_t *ph = NULL;
2527 daemon_queue_t *hdr = NULL;
2528 int i;
2529 size_t sz1 = 0;
2530
2531
2532 if (! (mode & FREAD))
2533 return (EACCES);
2534
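		/*
		 * Probe flow: copy in the request header, validate it,
		 * copy in the minor-number list, hand the per-unit
		 * requests to the md_ff_daemonq daemon queue, then drop
		 * the ioctl lock and wait on the semaphore once per
		 * unit before re-taking the lock and cleaning up.
		 */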
2535 sz = sizeof (md_probedev_t);
2536
2537 d = kmem_alloc(sz, KM_SLEEP);
2538
2539 /* now copy in the data */
2540 if (ddi_copyin(data, d, sz, mode)) {
2541 err = EFAULT;
2542 goto free_mem;
2543 }
2544
2545 		/*
2546 		 * Sanity-check the args: the test name must contain
2547 		 * the keyword "probe".
2548 		 */
2549 p = kmem_alloc(sizeof (md_probedev_impl_t), KM_SLEEP);
2550 p->probe_sema = NULL;
2551 p->probe_mx = NULL;
2552 p->probe.mnum_list = (uint64_t)NULL;
2553
2554 ph = (md_probedev_t *)d;
2555 p->probe.nmdevs = ph->nmdevs;
2556 (void) strcpy(p->probe.test_name, ph->test_name);
2557 bcopy(&ph->md_driver, &(p->probe.md_driver),
2558 sizeof (md_driver_t));
2559
2560 if ((p->probe.nmdevs < 1) ||
2561 (strstr(p->probe.test_name, "probe") == NULL)) {
2562 err = EINVAL;
2563 goto free_mem;
2564 }
2565
2566 sz1 = sizeof (minor_t) * p->probe.nmdevs;
2567
2568 p->probe.mnum_list = (uint64_t)(uintptr_t)kmem_alloc(sz1,
2569 KM_SLEEP);
2570
2571 if (ddi_copyin((caddr_t)(uintptr_t)ph->mnum_list,
2572 (caddr_t)(uintptr_t)p->probe.mnum_list, sz1, mode)) {
2573 err = EFAULT;
2574 goto free_mem;
2575 }
2576
2577 if (err = md_init_probereq(p, &hdr))
2578 goto free_mem;
2579
2580 /*
2581 * put the request on the queue and wait.
2582 */
2583
2584 daemon_request_new(&md_ff_daemonq, md_probe_one, hdr, REQ_NEW);
2585
2586 (void) IOLOCK_RETURN(0, lockp);
2587 /* wait for the events to occur */
2588 for (i = 0; i < p->probe.nmdevs; i++) {
2589 sema_p(PROBE_SEMA(p));
2590 }
2591 while (md_ioctl_lock_enter() == EINTR)
2592 ;
2593
2594 		/*
2595 		 * Clean up. The hdr list is freed in the probe routines,
2596 		 * so the list is already NULL by the time we get here.
2597 		 */
2598 free_mem:
2599 if (p) {
2600 if (p->probe_sema != NULL) {
2601 sema_destroy(PROBE_SEMA(p));
2602 kmem_free(p->probe_sema, sizeof (ksema_t));
2603 }
2604 if (p->probe_mx != NULL) {
2605 mutex_destroy(PROBE_MX(p));
2606 kmem_free(p->probe_mx, sizeof (kmutex_t));
2607 }
2608 if (p->probe.mnum_list)
2609 kmem_free((caddr_t)(uintptr_t)
2610 p->probe.mnum_list, sz1);
2611
2612 kmem_free(p, sizeof (md_probedev_impl_t));
2613 }
2614 break;
2615 }
2616
2617 default:
2618 return (ENOTTY);
2619 }
2620
2621 /*
2622 * copyout and free any args
2623 */
2624 if (sz != 0) {
2625 if (err == 0) {
2626 if (ddi_copyout(d, data, sz, mode) != 0) {
2627 err = EFAULT;
2628 }
2629 }
2630 kmem_free(d, sz);
2631 }
2632 return (err);
2633 }
2634
2635 /*
2636 * NAME: md_raid_ioctl
2637 * DESCRIPTION: RAID metadevice IOCTL operations entry point.
2638  * PARAMETERS: dev_t dev - RAID device identifier
2639 * int cmd - IOCTL command to be executed
2640 * void *data - pointer to IOCTL data structure
2641 * int mode - either FREAD or FWRITE
2642 * IOLOCK *lockp - IOCTL read/write lock
2643 *
2644 * LOCKS: none
2645 *
2646 */
2647 int
2648 md_raid_ioctl(
2649 dev_t dev,
2650 int cmd,
2651 void *data,
2652 int mode,
2653 IOLOCK *lockp
2654 )
2655 {
2656 minor_t mnum = getminor(dev);
2657 mr_unit_t *un;
2658 int err = 0;
2659
2660 /* handle admin ioctls */
2661 if (mnum == MD_ADM_MINOR)
2662 return (raid_admin_ioctl(cmd, data, mode, lockp));
2663
2664 /* check unit */
2665 if ((MD_MIN2SET(mnum) >= md_nsets) ||
2666 (MD_MIN2UNIT(mnum) >= md_nunits) ||
2667 ((un = MD_UNIT(mnum)) == NULL))
2668 return (ENXIO);
2669
2670 /* is this a supported ioctl? */
2671 err = md_check_ioctl_against_unit(cmd, un->c);
2672 if (err != 0) {
2673 return (err);
2674 }
2675
2676 /* dispatch ioctl */
2677 switch (cmd) {
2678
2679 case DKIOCINFO:
2680 {
2681 struct dk_cinfo *p;
2682
2683 if (! (mode & FREAD))
2684 return (EACCES);
2685
2686 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2687
2688 get_info(p, mnum);
2689 if (ddi_copyout((caddr_t)p, data, sizeof (*p), mode) != 0)
2690 err = EFAULT;
2691
2692 kmem_free(p, sizeof (*p));
2693 return (err);
2694 }
2695
2696 case DKIOCGMEDIAINFO:
2697 {
2698 struct dk_minfo p;
2699
2700 if (! (mode & FREAD))
2701 return (EACCES);
2702
2703 get_minfo(&p, mnum);
2704 if (ddi_copyout(&p, data, sizeof (struct dk_minfo), mode) != 0)
2705 err = EFAULT;
2706
2707 return (err);
2708 }
2709
2710 case DKIOCGGEOM:
2711 {
2712 struct dk_geom *p;
2713
2714 if (! (mode & FREAD))
2715 return (EACCES);
2716
2717 p = kmem_alloc(sizeof (*p), KM_SLEEP);
2718
2719 if ((err = raid_get_geom(un, p)) == 0) {
2720 if (ddi_copyout((caddr_t)p, data, sizeof (*p),
2721 mode) != 0)
2722 err = EFAULT;
2723 }
2724
2725 kmem_free(p, sizeof (*p));
2726 return (err);
2727 }
2728
2729 case DKIOCGVTOC:
2730 {
2731 struct vtoc *vtoc;
2732
2733 if (! (mode & FREAD))
2734 return (EACCES);
2735
2736 vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2737 if ((err = raid_get_vtoc(un, vtoc)) != 0) {
2738 kmem_free(vtoc, sizeof (*vtoc));
2739 return (err);
2740 }
2741
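	/*
	 * A 32-bit client passes a struct vtoc32, so the native vtoc
	 * is converted before the copyout; a native client receives
	 * the structure unchanged.
	 */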
2742 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2743 if (ddi_copyout(vtoc, data, sizeof (*vtoc), mode))
2744 err = EFAULT;
2745 }
2746 #ifdef _SYSCALL32
2747 else {
2748 struct vtoc32 *vtoc32;
2749
2750 vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2751
2752 vtoctovtoc32((*vtoc), (*vtoc32));
2753 if (ddi_copyout(vtoc32, data, sizeof (*vtoc32), mode))
2754 err = EFAULT;
2755 kmem_free(vtoc32, sizeof (*vtoc32));
2756 }
2757 #endif /* _SYSCALL32 */
2758
2759 kmem_free(vtoc, sizeof (*vtoc));
2760 return (err);
2761 }
2762
2763 case DKIOCSVTOC:
2764 {
2765 struct vtoc *vtoc;
2766
2767 if (! (mode & FWRITE))
2768 return (EACCES);
2769
2770 vtoc = kmem_zalloc(sizeof (*vtoc), KM_SLEEP);
2771 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2772 if (ddi_copyin(data, vtoc, sizeof (*vtoc), mode)) {
2773 err = EFAULT;
2774 }
2775 }
2776 #ifdef _SYSCALL32
2777 else {
2778 struct vtoc32 *vtoc32;
2779
2780 vtoc32 = kmem_zalloc(sizeof (*vtoc32), KM_SLEEP);
2781
2782 if (ddi_copyin(data, vtoc32, sizeof (*vtoc32), mode)) {
2783 err = EFAULT;
2784 } else {
2785 vtoc32tovtoc((*vtoc32), (*vtoc));
2786 }
2787 kmem_free(vtoc32, sizeof (*vtoc32));
2788 }
2789 #endif /* _SYSCALL32 */
2790
2791 if (err == 0)
2792 err = raid_set_vtoc(un, vtoc);
2793
2794 kmem_free(vtoc, sizeof (*vtoc));
2795 return (err);
2796 }
2797
2798 case DKIOCGEXTVTOC:
2799 {
2800 struct extvtoc *extvtoc;
2801
2802 if (! (mode & FREAD))
2803 return (EACCES);
2804
2805 extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2806 if ((err = raid_get_extvtoc(un, extvtoc)) != 0) {
2807 kmem_free(extvtoc, sizeof (*extvtoc));
2808 return (err);
2809 }
2810
2811 if (ddi_copyout(extvtoc, data, sizeof (*extvtoc), mode))
2812 err = EFAULT;
2813
2814 kmem_free(extvtoc, sizeof (*extvtoc));
2815 return (err);
2816 }
2817
2818 case DKIOCSEXTVTOC:
2819 {
2820 struct extvtoc *extvtoc;
2821
2822 if (! (mode & FWRITE))
2823 return (EACCES);
2824
2825 extvtoc = kmem_zalloc(sizeof (*extvtoc), KM_SLEEP);
2826 if (ddi_copyin(data, extvtoc, sizeof (*extvtoc), mode)) {
2827 err = EFAULT;
2828 }
2829
2830 if (err == 0)
2831 err = raid_set_extvtoc(un, extvtoc);
2832
2833 kmem_free(extvtoc, sizeof (*extvtoc));
2834 return (err);
2835 }
2836
2837 case DKIOCGAPART:
2838 {
2839 struct dk_map dmp;
2840
2841 if ((err = raid_get_cgapart(un, &dmp)) != 0) {
2842 return (err);
2843 }
2844
2845 if ((mode & DATAMODEL_MASK) == DATAMODEL_NATIVE) {
2846 if (ddi_copyout((caddr_t)&dmp, data, sizeof (dmp),
2847 mode) != 0)
2848 err = EFAULT;
2849 }
2850 #ifdef _SYSCALL32
2851 else {
2852 struct dk_map32 dmp32;
2853
2854 dmp32.dkl_cylno = dmp.dkl_cylno;
2855 dmp32.dkl_nblk = dmp.dkl_nblk;
2856
2857 if (ddi_copyout((caddr_t)&dmp32, data, sizeof (dmp32),
2858 mode) != 0)
2859 err = EFAULT;
2860 }
2861 #endif /* _SYSCALL32 */
2862
2863 return (err);
2864 }
2865 case DKIOCGETEFI:
2866 {
2867 		/*
2868 		 * This one can be handled centrally; there is no need to
2869 		 * duplicate the same code for every metadevice type.
2870 		 */
2871 return (md_dkiocgetefi(mnum, data, mode));
2872 }
2873
2874 case DKIOCSETEFI:
2875 {
2876 		/*
2877 		 * This one can be handled centrally; there is no need to
2878 		 * duplicate the same code for every metadevice type.
2879 		 */
2880 return (md_dkiocsetefi(mnum, data, mode));
2881 }
2882
2883 case DKIOCPARTITION:
2884 {
2885 return (md_dkiocpartition(mnum, data, mode));
2886 }
2887
2888 default:
2889 return (ENOTTY);
2890 }
2891 }
2892
2893 /*
2894 * rename/exchange named service entry points and support functions follow.
2895 * Most functions are handled generically, except for raid-specific locking
2896 * and checking
2897 */
2898
2899 /*
2900 * NAME: raid_may_renexch_self
2901 * DESCRIPTION: support routine for rename check ("MDRNM_CHECK") named service
2902 * PARAMETERS: mr_unit_t *un - unit struct of raid unit to be renamed
2903 * mdi_unit_t *ui - in-core unit struct of same raid unit
2904 * md_rentxn_t *rtxnp - rename transaction state
2905 *
2906 * LOCKS: none
2907 *
2908 */
2909 static int
2910 raid_may_renexch_self(
2911 mr_unit_t *un,
2912 mdi_unit_t *ui,
2913 md_rentxn_t *rtxnp)
2914 {
2915 minor_t from_min;
2916 minor_t to_min;
2917 bool_t toplevel;
2918 bool_t related;
2919
2920 from_min = rtxnp->from.mnum;
2921 to_min = rtxnp->to.mnum;
2922
2923 if (!un || !ui) {
2924 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2925 from_min);
2926 return (EINVAL);
2927 }
2928
2929 ASSERT(!(MD_CAPAB(un) & MD_CAN_META_CHILD));
2930 if (MD_CAPAB(un) & MD_CAN_META_CHILD) {
2931 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2932 return (EINVAL);
2933 }
2934
2935 if (MD_PARENT(un) == MD_MULTI_PARENT) {
2936 (void) mdmderror(&rtxnp->mde, MDE_RENAME_SOURCE_BAD, from_min);
2937 return (EINVAL);
2938 }
2939
2940 toplevel = !MD_HAS_PARENT(MD_PARENT(un));
2941
2942 /* we're related if trying to swap with our parent */
2943 related = (!toplevel) && (MD_PARENT(un) == to_min);
2944
2945 switch (rtxnp->op) {
2946 case MDRNOP_EXCHANGE:
2947
2948 if (!related) {
2949 (void) mdmderror(&rtxnp->mde,
2950 MDE_RENAME_TARGET_UNRELATED, to_min);
2951 return (EINVAL);
2952 }
2953
2954 break;
2955
2956 case MDRNOP_RENAME:
2957 		/*
2958 		 * If 'from' is top-level and is open, the kernel is
2959 		 * using its md_dev64_t, so the rename must be refused.
2960 		 */
2961
2962 if (toplevel && md_unit_isopen(ui)) {
2963 (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
2964 from_min);
2965 return (EBUSY);
2966 }
2967 break;
2968
2969 default:
2970 (void) mdmderror(&rtxnp->mde, MDE_RENAME_CONFIG_ERROR,
2971 from_min);
2972 return (EINVAL);
2973 }
2974
2975 return (0); /* ok */
2976 }
2977
2978 /*
2979 * NAME: raid_rename_check
2980 * DESCRIPTION: ("MDRNM_CHECK") rename/exchange named service entry point
2981 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
2982 * raid device for rename transaction
2983 * md_rentxn_t *rtxnp - rename transaction state
2984 *
2985 * LOCKS: none
2986 *
2987 */
2988 intptr_t
2989 raid_rename_check(
2990 md_rendelta_t *delta,
2991 md_rentxn_t *rtxnp)
2992 {
2993 int err = 0;
2994 int column;
2995 mr_unit_t *un;
2996
2997 ASSERT(delta);
2998 ASSERT(rtxnp);
2999 ASSERT(delta->unp);
3000 ASSERT(delta->uip);
3001
3002 if (!delta || !rtxnp || !delta->unp || !delta->uip) {
3003 (void) mdsyserror(&rtxnp->mde, EINVAL);
3004 return (EINVAL);
3005 }
3006
3007 un = (mr_unit_t *)delta->unp;
3008
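	/*
	 * A rename or exchange is allowed only while every column is
	 * plainly okay: any errored, resyncing, or hot-spared column
	 * fails the check below.
	 */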
3009 for (column = 0; column < un->un_totalcolumncnt; column++) {
3010 rcs_state_t colstate;
3011
3012 colstate = un->un_column[column].un_devstate;
3013
3014 if (colstate & RCS_LAST_ERRED) {
3015 (void) mdmderror(&rtxnp->mde, MDE_RAID_LAST_ERRED,
3016 md_getminor(delta->dev));
3017 return (EINVAL);
3018 }
3019
3020 if (colstate & RCS_INIT_ERRED) {
3021 (void) mdmderror(&rtxnp->mde, MDE_RAID_DOI,
3022 md_getminor(delta->dev));
3023 return (EINVAL);
3024 }
3025
3026 /* How did we get this far before detecting this? */
3027 if (colstate & RCS_RESYNC) {
3028 (void) mdmderror(&rtxnp->mde, MDE_RENAME_BUSY,
3029 md_getminor(delta->dev));
3030 return (EBUSY);
3031 }
3032
3033 if (colstate & RCS_ERRED) {
3034 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3035 md_getminor(delta->dev));
3036 return (EINVAL);
3037 }
3038
3039 if (!(colstate & RCS_OKAY)) {
3040 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3041 md_getminor(delta->dev));
3042 return (EINVAL);
3043 }
3044
3045 if (HOTSPARED(un, column)) {
3046 (void) mdmderror(&rtxnp->mde, MDE_RAID_NOT_OKAY,
3047 md_getminor(delta->dev));
3048 return (EINVAL);
3049 }
3050 }
3051
3052 /* self does additional checks */
3053 if (delta->old_role == MDRR_SELF) {
3054 err = raid_may_renexch_self((mr_unit_t *)delta->unp,
3055 delta->uip, rtxnp);
3056 }
3057 return (err);
3058 }
3059
3060 /*
3061 * NAME: raid_rename_lock
3062 * DESCRIPTION: ("MDRNM_LOCK") rename/exchange named service entry point
3063 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
3064 * raid device for rename transaction
3065 * md_rentxn_t *rtxnp - rename transaction state
3066 *
3067 * LOCKS: io and unit locks (taken explicitly *not* via ioctl wrappers)
3068 *
3069 */
3070 intptr_t
3071 raid_rename_lock(
3072 md_rendelta_t *delta,
3073 md_rentxn_t *rtxnp)
3074 {
3075 minor_t mnum;
3076
3077 ASSERT(delta);
3078 ASSERT(rtxnp);
3079
3080 mnum = md_getminor(delta->dev);
3081 if (mnum == rtxnp->to.mnum && rtxnp->op == MDRNOP_RENAME) {
3082 return (0);
3083 }
3084
3085 ASSERT(delta->uip);
3086 if (!delta->uip) {
3087 (void) mdmderror(&rtxnp->mde, MDE_UNIT_NOT_SETUP, mnum);
3088 return (ENODEV);
3089 }
3090
3091 ASSERT(delta->unp);
3092 if (!delta->unp) {
3093
3094 return (ENODEV);
3095 }
3096
3097 ASSERT(!IO_WRITER_HELD(delta->unp));
3098 (void) md_io_writerlock(delta->uip);
3099 ASSERT(IO_WRITER_HELD(delta->unp));
3100
3101
3102 ASSERT(!UNIT_WRITER_HELD(delta->unp));
3103 (void) md_unit_writerlock(delta->uip);
3104 ASSERT(UNIT_WRITER_HELD(delta->unp));
3105
3106 return (0);
3107 }
3108
3109 /*
3110 * NAME: raid_rename_unlock
3111 * DESCRIPTION: ("MDRNM_UNLOCK") rename/exchange named service entry point
3112 * PARAMETERS: md_rendelta_t *delta - describes changes to be made to this
3113 * raid device for rename transaction
3114 * md_rentxn_t *rtxnp - rename transaction state
3115 *
3116 * LOCKS: drops io and unit locks
3117 *
3118 */
3119 /* ARGSUSED */
3120 void
3121 raid_rename_unlock(
3122 md_rendelta_t *delta,
3123 md_rentxn_t *rtxnp)
3124 {
3125 mr_unit_t *un = (mr_unit_t *)delta->unp;
3126 minor_t mnum = MD_SID(un);
3127 int col;
3128
3129 ASSERT(delta);
3130 ASSERT(delta->unp);
3131 ASSERT(delta->uip);
3132
3133 ASSERT(UNIT_WRITER_HELD(delta->unp));
3134 md_unit_writerexit(delta->uip);
3135 ASSERT(!UNIT_WRITER_HELD(delta->unp));
3136
3137 if (! (delta->txn_stat.role_swapped) || ! (delta->txn_stat.is_open)) {
3138 goto out;
3139 }
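	/*
	 * The roles were swapped while the unit was open, so re-open
	 * it layered and re-initialize the pre-write areas of the okay
	 * columns under the unit's new identity before dropping the
	 * io lock.
	 */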
3140 if (raid_internal_open(mnum, (FREAD | FWRITE),
3141 OTYP_LYR, MD_OFLG_ISINIT) == 0) {
3142 for (col = 0; col < un->un_totalcolumncnt; col++) {
3143 if (un->un_column[col].un_devstate & RCS_OKAY)
3144 (void) init_pw_area(un,
3145 un->un_column[col].un_dev,
3146 un->un_column[col].un_pwstart, col);
3147 }
3148 (void) raid_internal_close(mnum, OTYP_LYR, 0, 0);
3149 }
3150
3151 out:
3152 ASSERT(IO_WRITER_HELD(delta->unp));
3153 md_io_writerexit(delta->uip);
3154 ASSERT(!IO_WRITER_HELD(delta->unp));
3155 }
3156 /* end of rename/exchange named service and support functions */
3157