1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * Driver for Virtual Disk.
29 */
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/user.h>
35 #include <sys/uio.h>
36 #include <sys/proc.h>
37 #include <sys/t_lock.h>
38 #include <sys/dkio.h>
39 #include <sys/kmem.h>
40 #include <sys/debug.h>
41 #include <sys/cmn_err.h>
42 #include <sys/sysmacros.h>
43 #include <sys/types.h>
44 #include <sys/mkdev.h>
45 #include <sys/vtoc.h>
46 #include <sys/open.h>
47 #include <sys/file.h>
48 #include <vm/page.h>
49 #include <sys/callb.h>
50 #include <sys/disp.h>
51 #include <sys/modctl.h>
52 #include <sys/errno.h>
53 #include <sys/door.h>
54 #include <sys/lvm/mdmn_commd.h>
55 #include <sys/lvm/md_hotspares.h>
56
57 #include <sys/lvm/mdvar.h>
58 #include <sys/lvm/md_names.h>
59
60 #include <sys/ddi.h>
61 #include <sys/proc.h>
62 #include <sys/sunddi.h>
63 #include <sys/esunddi.h>
64
65 #include <sys/sysevent.h>
66 #include <sys/sysevent/eventdefs.h>
67
68 #include <sys/sysevent/svm.h>
69 #include <sys/lvm/md_basic.h>
70
71
72 /*
73 * Machine specific Hertz is kept here
74 */
75 extern clock_t md_hz;
76
77 /*
78 * Externs.
79 */
80 extern int (*mdv_strategy_tstpnt)(buf_t *, int, void*);
81 extern major_t md_major;
82 extern unit_t md_nunits;
83 extern set_t md_nsets;
84 extern md_set_t md_set[];
85 extern md_set_io_t md_set_io[];
86 extern md_ops_t **md_ops;
87 extern md_ops_t *md_opslist;
88 extern ddi_modhandle_t *md_mods;
89 extern dev_info_t *md_devinfo;
90
91 extern md_krwlock_t md_unit_array_rw;
92 extern kmutex_t md_mx;
93 extern kcondvar_t md_cv;
94
95 extern md_krwlock_t hsp_rwlp;
96 extern md_krwlock_t ni_rwlp;
97
98 extern int md_num_daemons;
99 extern int md_status;
100 extern int md_ioctl_cnt;
101 extern int md_mtioctl_cnt;
102
103 extern struct metatransops metatransops;
104 extern md_event_queue_t *md_event_queue;
105 extern md_resync_t md_cpr_resync;
106 extern int md_done_daemon_threads;
107 extern int md_ff_daemon_threads;
108
109
110 extern mddb_set_t *mddb_setenter(set_t setno, int flag, int *errorcodep);
111 extern void mddb_setexit(mddb_set_t *s);
112 extern void *lookup_entry(struct nm_next_hdr *, set_t,
113 side_t, mdkey_t, md_dev64_t, int);
114 extern struct nm_next_hdr *get_first_record(set_t, int, int);
115 extern dev_t getrootdev(void);
116
117 struct mdq_anchor md_done_daemon; /* done request queue */
118 struct mdq_anchor md_mstr_daemon; /* mirror error, WOW requests */
119 struct mdq_anchor md_mhs_daemon; /* mirror hotspare requests queue */
120 struct mdq_anchor md_hs_daemon; /* raid hotspare requests queue */
121 struct mdq_anchor md_ff_daemonq; /* failfast request queue */
122 struct mdq_anchor md_mirror_daemon; /* mirror owner queue */
123 struct mdq_anchor md_mirror_io_daemon; /* mirror owner i/o queue */
124 struct mdq_anchor md_mirror_rs_daemon; /* mirror resync done queue */
125 struct mdq_anchor md_sp_daemon; /* soft-part error daemon queue */
126 struct mdq_anchor md_mto_daemon; /* mirror timeout daemon queue */
127
128 int md_done_daemon_threads = 1; /* threads for md_done_daemon requestq */
129 int md_mstr_daemon_threads = 1; /* threads for md_mstr_daemon requestq */
130 int md_mhs_daemon_threads = 1; /* threads for md_mhs_daemon requestq */
131 int md_hs_daemon_threads = 1; /* threads for md_hs_daemon requestq */
132 int md_ff_daemon_threads = 3; /* threads for md_ff_daemon requestq */
133 int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
134 int md_sp_daemon_threads = 1; /* threads for md_sp_daemon requestq */
135 int md_mto_daemon_threads = 1; /* threads for md_mto_daemon requestq */
136
137 #ifdef DEBUG
138 /* Flag to switch on debug messages */
139 int md_release_reacquire_debug = 0; /* debug flag */
140 #endif
141
/*
 * The md_daemon_queues table below holds pointers to the request queues
 * and the number of threads associated with each request queue.
 * When the number of threads is set to 1, the order of execution is
 * sequential.
 * The number of threads for each queue is defined as a global variable
 * to enable kernel tuning.
 */
152
153 #define MD_DAEMON_QUEUES 11
154
155 md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
156 {&md_done_daemon, &md_done_daemon_threads},
157 {&md_mstr_daemon, &md_mstr_daemon_threads},
158 {&md_hs_daemon, &md_hs_daemon_threads},
159 {&md_ff_daemonq, &md_ff_daemon_threads},
160 {&md_mirror_daemon, &md_mirror_daemon_threads},
161 {&md_mirror_io_daemon, &md_mirror_daemon_threads},
162 {&md_mirror_rs_daemon, &md_mirror_daemon_threads},
163 {&md_sp_daemon, &md_sp_daemon_threads},
164 {&md_mhs_daemon, &md_mhs_daemon_threads},
165 {&md_mto_daemon, &md_mto_daemon_threads},
166 {0, 0}
167 };
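
/*
 * The per-queue thread counts above are plain global variables, so they can
 * be tuned without rebuilding the module.  A purely illustrative /etc/system
 * entry (variable name taken from the definitions above):
 *
 *	set md:md_done_daemon_threads = 2
 */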
168
169 /*
170 * Number of times a message is retried before issuing a warning to the operator
171 */
172 #define MD_MN_WARN_INTVL 10
173
/*
 * Setting the retry count to one (pre-decremented) so that we actually do no
 * retries when committing/deleting an mddb rec.  The underlying disk driver
 * does several retries to check if the disk is really dead or not, so there
 * is no reason for us to retry on top of the driver's retries.
 */
180
181 uint_t md_retry_cnt = 1; /* global so it can be patched */
182
183 /*
184 * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
185 * Again, made patchable here should it prove useful.
186 */
187 uint_t md_send_retry_limit = 30;
188
/*
 * Bug # 1212146
 * Before this change the user had to pass in a short-aligned buffer because of
 * problems in some underlying device drivers.  This problem seems to have been
 * corrected in the underlying drivers so we will default to not requiring any
 * alignment.  If the user needs to check for a specific alignment,
 * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
 * the behavior before this fix, set md_uio_alignment_mask to 1; to check for
 * word alignment, set it to 3; for double-word alignment, set it to 7; and
 * so on.
 *
 * [Other part of fix is in function md_chk_uio()]
 */
202 static int md_uio_alignment_mask = 0;
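
/*
 * As a purely illustrative example, the pre-fix behaviour described above
 * (short alignment checking) could be restored from /etc/system with:
 *
 *	set md:md_uio_alignment_mask = 1
 */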
203
204 /*
205 * for md_dev64_t translation
206 */
207 struct md_xlate_table *md_tuple_table;
208 struct md_xlate_major_table *md_major_tuple_table;
209 int md_tuple_length;
210 uint_t md_majortab_len;
211
212 /* Function declarations */
213
214 static int md_create_probe_rqlist(md_probedev_impl_t *plist,
215 daemon_queue_t **hdr, intptr_t (*probe_test)());
216
217 /*
218 * manipulate global status
219 */
220 void
md_set_status(int bits)
222 {
223 mutex_enter(&md_mx);
224 md_status |= bits;
225 mutex_exit(&md_mx);
226 }
227
228 void
md_clr_status(int bits)
230 {
231 mutex_enter(&md_mx);
232 md_status &= ~bits;
233 mutex_exit(&md_mx);
234 }
235
236 int
md_get_status()
238 {
239 int result;
240 mutex_enter(&md_mx);
241 result = md_status;
242 mutex_exit(&md_mx);
243 return (result);
244 }
245
246 void
md_set_setstatus(set_t setno, int bits)
248 {
249 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
250
251 mutex_enter(&md_mx);
252 md_set[setno].s_status |= bits;
253 mutex_exit(&md_mx);
254 }
255
256 void
md_clr_setstatus(set_t setno, int bits)
258 {
259 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
260
261 mutex_enter(&md_mx);
262 md_set[setno].s_status &= ~bits;
263 mutex_exit(&md_mx);
264 }
265
266 uint_t
md_get_setstatus(set_t setno)
268 {
269 uint_t result;
270
271 ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
272
273 mutex_enter(&md_mx);
274 result = md_set[setno].s_status;
275 mutex_exit(&md_mx);
276 return (result);
277 }
278
279 /*
280 * md_unit_readerlock_common:
281 * -------------------------
282 * Mark the given unit as having a reader reference. Spin waiting for any
283 * writer references to be released.
284 *
285 * Input:
286 * ui unit reference
287 * lock_held 0 => ui_mx needs to be grabbed
288 * 1 => ui_mx already held
289 * Output:
290 * mm_unit_t corresponding to unit structure
291 * ui->ui_readercnt incremented
292 */
293 static void *
md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
295 {
296 uint_t flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
297
298 if (!lock_held)
299 mutex_enter(&ui->ui_mx);
300 while (ui->ui_lock & flag) {
301 if (panicstr) {
302 if (ui->ui_lock & MD_UL_WRITER)
303 panic("md: writer lock is held");
304 break;
305 }
306 cv_wait(&ui->ui_cv, &ui->ui_mx);
307 }
308 ui->ui_readercnt++;
309 if (!lock_held)
310 mutex_exit(&ui->ui_mx);
311 return (MD_UNIT(ui->ui_link.ln_id));
312 }
313
314 void *
md_unit_readerlock(mdi_unit_t *ui)
316 {
317 return (md_unit_readerlock_common(ui, 0));
318 }
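
/*
 * Minimal usage sketch (hypothetical caller; the returned pointer is the
 * unit structure, e.g. an mm_unit_t for a mirror):
 *
 *	un = md_unit_readerlock(ui);
 *	... inspect unit state; no writer can be active here ...
 *	md_unit_readerexit(ui);
 */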
319
320 /*
321 * md_unit_writerlock_common:
322 * -------------------------
323 * Acquire a unique writer reference. Causes previous readers to drain.
324 * Spins if a writer reference already exists or if a previous reader/writer
325 * dropped the lock to allow a ksend_message to be despatched.
326 *
327 * Input:
328 * ui unit reference
329 * lock_held 0 => grab ui_mx
330 * 1 => ui_mx already held on entry
331 * Output:
332 * mm_unit_t reference
333 */
334 static void *
md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
336 {
337 uint_t flag = MD_UL_WRITER;
338
339 if (panicstr)
340 panic("md: writer lock not allowed");
341
342 if (!lock_held)
343 mutex_enter(&ui->ui_mx);
344
345 while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
346 ui->ui_wanabecnt++;
347 ui->ui_lock |= MD_UL_WANABEWRITER;
348 cv_wait(&ui->ui_cv, &ui->ui_mx);
349 if (--ui->ui_wanabecnt == 0)
350 ui->ui_lock &= ~MD_UL_WANABEWRITER;
351 }
352 ui->ui_lock |= MD_UL_WRITER;
353 ui->ui_owner = curthread;
354
355 if (!lock_held)
356 mutex_exit(&ui->ui_mx);
357 return (MD_UNIT(ui->ui_link.ln_id));
358 }
359
360 void *
md_unit_writerlock(mdi_unit_t *ui)
362 {
363 return (md_unit_writerlock_common(ui, 0));
364 }
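
/*
 * Corresponding writer-side sketch (hypothetical caller); the writer is
 * exclusive, so existing readers drain before the lock call returns:
 *
 *	un = md_unit_writerlock(ui);
 *	... modify unit state ...
 *	md_unit_writerexit(ui);
 */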
365
366 /*
367 * md_unit_readerexit_common:
368 * -------------------------
369 * Release the readerlock for the specified unit. If the reader count reaches
370 * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
371 *
372 * Input:
373 * ui unit reference
374 * lock_held 0 => ui_mx needs to be acquired
375 * 1 => ui_mx already held
376 */
377 static void
md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
379 {
380 if (!lock_held)
381 mutex_enter(&ui->ui_mx);
382 ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
383 ASSERT(ui->ui_readercnt != 0);
384 ui->ui_readercnt--;
385 if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
386 cv_broadcast(&ui->ui_cv);
387
388 if (!lock_held)
389 mutex_exit(&ui->ui_mx);
390 }
391
392 void
md_unit_readerexit(mdi_unit_t *ui)
394 {
395 md_unit_readerexit_common(ui, 0);
396 }
397
398 /*
399 * md_unit_writerexit_common:
400 * -------------------------
401 * Release the writerlock currently held on the unit. Wake any threads waiting
402 * on becoming reader or writer (MD_UL_WANABEWRITER set).
403 *
404 * Input:
405 * ui unit reference
406 * lock_held 0 => ui_mx to be acquired
407 * 1 => ui_mx already held
408 */
409 static void
md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
411 {
412 if (!lock_held)
413 mutex_enter(&ui->ui_mx);
414 ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
415 ASSERT(ui->ui_readercnt == 0);
416 ui->ui_lock &= ~MD_UL_WRITER;
417 ui->ui_owner = NULL;
418
419 cv_broadcast(&ui->ui_cv);
420 if (!lock_held)
421 mutex_exit(&ui->ui_mx);
422 }
423
424 void
md_unit_writerexit(mdi_unit_t *ui)
426 {
427 md_unit_writerexit_common(ui, 0);
428 }
429
430 void *
md_io_readerlock(mdi_unit_t *ui)
432 {
433 md_io_lock_t *io = ui->ui_io_lock;
434
435 ASSERT(io); /* checks case where no io lock allocated */
436 mutex_enter(&io->io_mx);
437 while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
438 if (panicstr) {
439 if (io->io_lock & MD_UL_WRITER)
440 panic("md: writer lock is held");
441 break;
442 }
443 cv_wait(&io->io_cv, &io->io_mx);
444 }
445 io->io_readercnt++;
446 mutex_exit(&io->io_mx);
447 return (MD_UNIT(ui->ui_link.ln_id));
448 }
449
450 void *
md_io_writerlock(mdi_unit_t *ui)
452 {
453 md_io_lock_t *io = ui->ui_io_lock;
454
455 ASSERT(io); /* checks case where no io lock allocated */
456 if (panicstr)
457 panic("md: writer lock not allowed");
458
459 mutex_enter(&io->io_mx);
460 while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
461 io->io_wanabecnt++;
462 io->io_lock |= MD_UL_WANABEWRITER;
463 cv_wait(&io->io_cv, &io->io_mx);
464 if (--io->io_wanabecnt == 0)
465 io->io_lock &= ~MD_UL_WANABEWRITER;
466 }
467 io->io_lock |= MD_UL_WRITER;
468 io->io_owner = curthread;
469
470 mutex_exit(&io->io_mx);
471 return (MD_UNIT(ui->ui_link.ln_id));
472 }
473
474 void
md_io_readerexit(mdi_unit_t *ui)
476 {
477 md_io_lock_t *io = ui->ui_io_lock;
478
479 mutex_enter(&io->io_mx);
480 ASSERT((io->io_lock & MD_UL_WRITER) == 0);
481 ASSERT(io->io_readercnt != 0);
482 io->io_readercnt--;
483 if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
484 cv_broadcast(&io->io_cv);
485 }
486 mutex_exit(&io->io_mx);
487 }
488
489 void
md_io_writerexit(mdi_unit_t *ui)
491 {
492 md_io_lock_t *io = ui->ui_io_lock;
493
494 mutex_enter(&io->io_mx);
495 ASSERT((io->io_lock & MD_UL_WRITER) != 0);
496 ASSERT(io->io_readercnt == 0);
497 io->io_lock &= ~MD_UL_WRITER;
498 io->io_owner = NULL;
499
500 cv_broadcast(&io->io_cv);
501 mutex_exit(&io->io_mx);
502 }
503
504 /*
505 * Attempt to grab that set of locks defined as global.
506 * A mask containing the set of global locks that are owned upon
507 * entry is input. Any additional global locks are then grabbed.
508 * This keeps the caller from having to know the set of global
509 * locks.
510 */
511 static int
md_global_lock_enter(int global_locks_owned_mask)
513 {
514
515 /*
516 * The current implementation has been verified by inspection
517 * and test to be deadlock free. If another global lock is
518 * added, changing the algorithm used by this function should
519 * be considered. With more than 2 locks it is difficult to
520 * guarantee that locks are being acquired in the correct order.
521 * The safe approach would be to drop all of the locks that are
522 * owned at function entry and then reacquire all of the locks
523 * in the order defined by the lock hierarchy.
524 */
525 mutex_enter(&md_mx);
526 if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
527 while ((md_mtioctl_cnt != 0) ||
528 (md_status & MD_GBL_IOCTL_LOCK)) {
529 if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
530 mutex_exit(&md_mx);
531 return (EINTR);
532 }
533 }
534 md_status |= MD_GBL_IOCTL_LOCK;
535 md_ioctl_cnt++;
536 }
537 if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
538 while (md_status & MD_GBL_HS_LOCK) {
539 if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
540 md_status &= ~MD_GBL_IOCTL_LOCK;
541 mutex_exit(&md_mx);
542 return (EINTR);
543 }
544 }
545 md_status |= MD_GBL_HS_LOCK;
546 }
547 mutex_exit(&md_mx);
548 return (0);
549 }
550
551 /*
552 * Release the set of global locks that were grabbed in md_global_lock_enter
553 * that were not already owned by the calling thread. The set of previously
554 * owned global locks is passed in as a mask parameter.
555 */
556 static int
md_global_lock_exit(int global_locks_owned_mask, int code,
558 int flags, mdi_unit_t *ui)
559 {
560 mutex_enter(&md_mx);
561
562 /* If MT ioctl decrement mt_ioctl_cnt */
563 if ((flags & MD_MT_IOCTL)) {
564 md_mtioctl_cnt--;
565 } else {
566 if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
567 /* clear the lock and decrement count */
568 ASSERT(md_ioctl_cnt == 1);
569 md_ioctl_cnt--;
570 md_status &= ~MD_GBL_IOCTL_LOCK;
571 }
572 if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
573 md_status &= ~MD_GBL_HS_LOCK;
574 }
575 if (flags & MD_READER_HELD)
576 md_unit_readerexit(ui);
577 if (flags & MD_WRITER_HELD)
578 md_unit_writerexit(ui);
579 if (flags & MD_IO_HELD)
580 md_io_writerexit(ui);
581 if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
582 rw_exit(&md_unit_array_rw.lock);
583 }
584 cv_broadcast(&md_cv);
585 mutex_exit(&md_mx);
586
587 return (code);
588 }
589
590 /*
591 * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make
592 * use of the md_global_lock_{enter|exit} functions to avoid duplication
593 * of code. They rely upon the fact that the locks that are specified in
594 * the input mask are not acquired or freed. If this algorithm changes
595 * as described in the block comment at the beginning of md_global_lock_enter
596 * then it will be necessary to change these 2 functions. Otherwise these
597 * functions will be grabbing and holding global locks unnecessarily.
598 */
599 int
md_ioctl_lock_enter(void)
601 {
602 /* grab only the ioctl lock */
603 return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
604 }
605
606 /*
607 * If md_ioctl_lock_exit is being called at the end of an ioctl before
608 * returning to user space, then ioctl_end is set to 1.
609 * Otherwise, the ioctl lock is being dropped in the middle of handling
610 * an ioctl and will be reacquired before the end of the ioctl.
611 * Do not attempt to process the MN diskset mddb parse flags unless
612 * ioctl_end is true - otherwise a deadlock situation could arise.
613 */
614 int
md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
616 {
617 int ret_val;
618 uint_t status;
619 mddb_set_t *s;
620 int i;
621 int err;
622 md_mn_msg_mddb_parse_t *mddb_parse_msg;
623 md_mn_kresult_t *kresult;
624 mddb_lb_t *lbp;
625 int rval = 1;
626 int flag;
627
628 /* release only the ioctl lock */
629 ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
630
631 /*
632 * If md_ioctl_lock_exit is being called with a possible lock held
633 * (ioctl_end is 0), then don't check the MN disksets since the
634 * call to mddb_setenter may cause a lock ordering deadlock.
635 */
636 if (!ioctl_end)
637 return (ret_val);
638
639 /*
640 * Walk through disksets to see if there is a MN diskset that
641 * has messages that need to be sent. Set must be snarfed and
642 * be a MN diskset in order to be checked.
643 *
644 * In a MN diskset, this routine may send messages to the
645 * rpc.mdcommd in order to have the slave nodes re-parse parts
646 * of the mddb. Messages can only be sent with no locks held,
647 * so if mddb change occurred while the ioctl lock is held, this
648 * routine must send the messages.
649 */
650 for (i = 1; i < md_nsets; i++) {
651 status = md_get_setstatus(i);
652
653 /* Set must be snarfed and be a MN diskset */
654 if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
655 (MD_SET_SNARFED | MD_SET_MNSET))
656 continue;
657
658 /* Grab set lock so that set can't change */
659 if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
660 continue;
661
662 lbp = s->s_lbp;
663
664 /* Re-get set status now that lock is held */
665 status = md_get_setstatus(i);
666
667 /*
668 * If MN parsing block flag is set - continue to next set.
669 *
670 * If s_mn_parseflags_sending is non-zero, then another thread
671 * is already currently sending a parse message, so just
672 * release the set mutex. If this ioctl had caused an mddb
673 * change that results in a parse message to be generated,
674 * the thread that is currently sending a parse message would
675 * generate the additional parse message.
676 *
677 * If s_mn_parseflags_sending is zero then loop until
678 * s_mn_parseflags is 0 (until there are no more
679 * messages to send).
680 * While s_mn_parseflags is non-zero,
681 * put snapshot of parse_flags in s_mn_parseflags_sending
682 * set s_mn_parseflags to zero
683 * release set mutex
684 * send message
685 * re-grab set mutex
686 * set s_mn_parseflags_sending to zero
687 *
688 * If set is STALE, send message with NO_LOG flag so that
689 * rpc.mdcommd won't attempt to log message to non-writeable
690 * replica.
691 */
692 mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
693 KM_SLEEP);
694 while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
695 (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
696 (!(status & MD_SET_MNPARSE_BLK))) {
697
698 /* Grab snapshot of parse flags */
699 s->s_mn_parseflags_sending = s->s_mn_parseflags;
700 s->s_mn_parseflags = 0;
701
702 mutex_exit(&md_set[(s)->s_setno].s_dbmx);
703
			/*
			 * Send the message to the slaves to re-parse
			 * the indicated portions of the mddb. Send the status
			 * of the 50 mddbs in this set so that the slaves know
			 * which mddbs the master node thinks are 'good';
			 * otherwise, a slave may reparse, but from the wrong
			 * replica.
			 */
712 mddb_parse_msg->msg_parse_flags =
713 s->s_mn_parseflags_sending;
714
715 for (i = 0; i < MDDB_NLB; i++) {
716 mddb_parse_msg->msg_lb_flags[i] =
717 lbp->lb_locators[i].l_flags;
718 }
719 kresult = kmem_alloc(sizeof (md_mn_kresult_t),
720 KM_SLEEP);
721 while (rval != 0) {
722 flag = 0;
723 if (status & MD_SET_STALE)
724 flag |= MD_MSGF_NO_LOG;
725 rval = mdmn_ksend_message(s->s_setno,
726 MD_MN_MSG_MDDB_PARSE, flag, 0,
727 (char *)mddb_parse_msg,
728 sizeof (md_mn_msg_mddb_parse_t), kresult);
729 /* if the node hasn't yet joined, it's Ok. */
730 if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
731 (kresult->kmmr_comm_state !=
732 MDMNE_NOT_JOINED)) {
733 mdmn_ksend_show_error(rval, kresult,
734 "MD_MN_MSG_MDDB_PARSE");
735 cmn_err(CE_WARN, "md_ioctl_lock_exit: "
736 "Unable to send mddb update "
737 "message to other nodes in "
738 "diskset %s\n", s->s_setname);
739 rval = 1;
740 }
741 }
742 kmem_free(kresult, sizeof (md_mn_kresult_t));
743
744 /*
745 * Re-grab mutex to clear sending field and to
746 * see if another parse message needs to be generated.
747 */
748 mutex_enter(&md_set[(s)->s_setno].s_dbmx);
749 s->s_mn_parseflags_sending = 0;
750 }
751 kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
752 mutex_exit(&md_set[(s)->s_setno].s_dbmx);
753 }
754 return (ret_val);
755 }
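
/*
 * Hedged sketch of how an ioctl handler might bracket its work with the
 * global ioctl lock (the error value and the absence of unit locks are
 * illustrative assumptions):
 *
 *	if (md_ioctl_lock_enter() == EINTR)
 *		return (EINTR);
 *	... perform the configuration change ...
 *	return (md_ioctl_lock_exit(err, 0, NULL, 1));
 */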
756
757 /*
758 * Called when in an ioctl and need readerlock.
759 */
760 void *
md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
762 {
763 ASSERT(lock != NULL);
764 lock->l_ui = ui;
765 lock->l_flags |= MD_READER_HELD;
766 return (md_unit_readerlock_common(ui, 0));
767 }
768
769 /*
770 * Called when in an ioctl and need writerlock.
771 */
772 void *
md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
774 {
775 ASSERT(lock != NULL);
776 lock->l_ui = ui;
777 lock->l_flags |= MD_WRITER_HELD;
778 return (md_unit_writerlock_common(ui, 0));
779 }
780
781 void *
md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
783 {
784 ASSERT(lock != NULL);
785 lock->l_ui = ui;
786 lock->l_flags |= MD_IO_HELD;
787 return (md_io_writerlock(ui));
788 }
789
790 void
md_ioctl_readerexit(IOLOCK *lock)
792 {
793 ASSERT(lock != NULL);
794 lock->l_flags &= ~MD_READER_HELD;
795 md_unit_readerexit(lock->l_ui);
796 }
797
798 void
md_ioctl_writerexit(IOLOCK *lock)
800 {
801 ASSERT(lock != NULL);
802 lock->l_flags &= ~MD_WRITER_HELD;
803 md_unit_writerexit(lock->l_ui);
804 }
805
806 void
md_ioctl_io_exit(IOLOCK *lock)
808 {
809 ASSERT(lock != NULL);
810 lock->l_flags &= ~MD_IO_HELD;
811 md_io_writerexit(lock->l_ui);
812 }
813
814 /*
815 * md_ioctl_releaselocks:
816 * --------------------
817 * Release the unit locks that are held and stop subsequent
818 * md_unit_reader/writerlock calls from progressing. This allows the caller
819 * to send messages across the cluster when running in a multinode
820 * environment.
821 * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
822 * allowed to progress as normal. This is required as these typically are
823 * invoked by the message handler that may be called while a unit lock is
824 * marked as released.
825 *
826 * On entry:
827 * variety of unit locks may be held including ioctl lock
828 *
829 * On exit:
830 * locks released and unit structure updated to prevent subsequent reader/
831 * writer locks being acquired until md_ioctl_reacquirelocks is called
832 */
833 void
md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
835 {
836 /* This actually releases the locks. */
837 (void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
838 }
839
840 /*
841 * md_ioctl_reacquirelocks:
842 * ----------------------
843 * Reacquire the locks that were held when md_ioctl_releaselocks
844 * was called.
845 *
846 * On entry:
847 * No unit locks held
848 * On exit:
849 * locks held that were held at md_ioctl_releaselocks time including
850 * the ioctl lock.
851 */
852 void
md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
854 {
855 if (flags & MD_MT_IOCTL) {
856 mutex_enter(&md_mx);
857 md_mtioctl_cnt++;
858 mutex_exit(&md_mx);
859 } else {
860 while (md_ioctl_lock_enter() == EINTR)
861 ;
862 }
863 if (flags & MD_ARRAY_WRITER) {
864 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
865 } else if (flags & MD_ARRAY_READER) {
866 rw_enter(&md_unit_array_rw.lock, RW_READER);
867 }
868 if (ui != (mdi_unit_t *)NULL) {
869 if (flags & MD_IO_HELD) {
870 (void) md_io_writerlock(ui);
871 }
872
873 mutex_enter(&ui->ui_mx);
874 if (flags & MD_READER_HELD) {
875 (void) md_unit_readerlock_common(ui, 1);
876 } else if (flags & MD_WRITER_HELD) {
877 (void) md_unit_writerlock_common(ui, 1);
878 }
879 /* Wake up any blocked readerlock() calls */
880 cv_broadcast(&ui->ui_cv);
881 mutex_exit(&ui->ui_mx);
882 }
883 }
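
/*
 * Illustrative release/reacquire pattern around a cluster message in a
 * multinode diskset (the message type and payload are hypothetical; the
 * flags and ui must match the locks that were actually held):
 *
 *	md_ioctl_releaselocks(0, MD_WRITER_HELD, ui);
 *	rval = mdmn_ksend_message(setno, msgtype, 0, 0,
 *	    (char *)msgdata, msgsize, kresult);
 *	md_ioctl_reacquirelocks(MD_WRITER_HELD, ui);
 */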
884
885 void
md_ioctl_droplocks(IOLOCK *lock)
887 {
888 mdi_unit_t *ui;
889 int flags;
890
891 ASSERT(lock != NULL);
892 ui = lock->l_ui;
893 flags = lock->l_flags;
894 if (flags & MD_READER_HELD) {
895 lock->l_flags &= ~MD_READER_HELD;
896 md_unit_readerexit(ui);
897 }
898 if (flags & MD_WRITER_HELD) {
899 lock->l_flags &= ~MD_WRITER_HELD;
900 md_unit_writerexit(ui);
901 }
902 if (flags & MD_IO_HELD) {
903 lock->l_flags &= ~MD_IO_HELD;
904 md_io_writerexit(ui);
905 }
906 if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
907 lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
908 rw_exit(&md_unit_array_rw.lock);
909 }
910 }
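
/*
 * Sketch of the IOLOCK convention (hypothetical handler; lockp is assumed
 * to point at a zeroed IOLOCK):
 *
 *	un = md_ioctl_readerlock(lockp, ui);
 *	... read-only examination of the unit ...
 *	md_ioctl_droplocks(lockp);
 */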
911
912 void
md_array_writer(IOLOCK *lock)
914 {
915 ASSERT(lock != NULL);
916 lock->l_flags |= MD_ARRAY_WRITER;
917 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
918 }
919
920 void
md_array_reader(IOLOCK *lock)
922 {
923 ASSERT(lock != NULL);
924 lock->l_flags |= MD_ARRAY_READER;
925 rw_enter(&md_unit_array_rw.lock, RW_READER);
926 }
927
928 /*
929 * Called when in an ioctl and need opencloselock.
930 * Sets flags in lockp for READER_HELD.
931 */
932 void *
md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
934 {
935 void *un;
936
937 ASSERT(lockp != NULL);
938 mutex_enter(&ui->ui_mx);
939 while (ui->ui_lock & MD_UL_OPENORCLOSE)
940 cv_wait(&ui->ui_cv, &ui->ui_mx);
941 ui->ui_lock |= MD_UL_OPENORCLOSE;
942
943 /* Maintain mutex across the readerlock call */
944 lockp->l_ui = ui;
945 lockp->l_flags |= MD_READER_HELD;
946 un = md_unit_readerlock_common(ui, 1);
947 mutex_exit(&ui->ui_mx);
948
949 return (un);
950 }
951
952 /*
953 * Clears reader lock using md_ioctl instead of md_unit
954 * and updates lockp.
955 */
956 void
md_ioctl_openclose_exit(IOLOCK *lockp)
958 {
959 mdi_unit_t *ui;
960
961 ASSERT(lockp != NULL);
962 ui = lockp->l_ui;
963 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
964
965 md_ioctl_readerexit(lockp);
966
967 mutex_enter(&ui->ui_mx);
968 ui->ui_lock &= ~MD_UL_OPENORCLOSE;
969
970 cv_broadcast(&ui->ui_cv);
971 mutex_exit(&ui->ui_mx);
972 }
973
974 /*
975 * Clears reader lock using md_ioctl instead of md_unit
976 * and updates lockp.
977 * Does not acquire or release the ui_mx lock since the calling
978 * routine has already acquired this lock.
979 */
980 void
md_ioctl_openclose_exit_lh(IOLOCK *lockp)
982 {
983 mdi_unit_t *ui;
984
985 ASSERT(lockp != NULL);
986 ui = lockp->l_ui;
987 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
988
989 lockp->l_flags &= ~MD_READER_HELD;
990 md_unit_readerexit_common(lockp->l_ui, 1);
991
992 ui->ui_lock &= ~MD_UL_OPENORCLOSE;
993 cv_broadcast(&ui->ui_cv);
994 }
995
996 void *
md_unit_openclose_enter(mdi_unit_t *ui)
998 {
999 void *un;
1000
1001 mutex_enter(&ui->ui_mx);
1002 while (ui->ui_lock & (MD_UL_OPENORCLOSE))
1003 cv_wait(&ui->ui_cv, &ui->ui_mx);
1004 ui->ui_lock |= MD_UL_OPENORCLOSE;
1005
1006 /* Maintain mutex across the readerlock call */
1007 un = md_unit_readerlock_common(ui, 1);
1008 mutex_exit(&ui->ui_mx);
1009
1010 return (un);
1011 }
1012
1013 void
md_unit_openclose_exit(mdi_unit_t *ui)
1015 {
1016 md_unit_readerexit(ui);
1017
1018 mutex_enter(&ui->ui_mx);
1019 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1020 ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1021
1022 cv_broadcast(&ui->ui_cv);
1023 mutex_exit(&ui->ui_mx);
1024 }
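
/*
 * Typical pairing (md_layered_open() later in this file is a real caller
 * of this sequence):
 *
 *	(void) md_unit_openclose_enter(ui);
 *	err = md_unit_incopen(mnum, flag, OTYP_LYR);
 *	md_unit_openclose_exit(ui);
 */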
1025
1026 /*
1027 * Drop the openclose and readerlocks without acquiring or
1028 * releasing the ui_mx lock since the calling routine has
1029 * already acquired this lock.
1030 */
1031 void
md_unit_openclose_exit_lh(mdi_unit_t *ui)
1033 {
1034 md_unit_readerexit_common(ui, 1);
1035 ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1036 ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1037 cv_broadcast(&ui->ui_cv);
1038 }
1039
1040 int
md_unit_isopen(
1042 mdi_unit_t *ui
1043 )
1044 {
1045 int isopen;
1046
1047 /* check status */
1048 mutex_enter(&ui->ui_mx);
1049 isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
1050 mutex_exit(&ui->ui_mx);
1051 return (isopen);
1052 }
1053
1054 int
md_unit_incopen(
1056 minor_t mnum,
1057 int flag,
1058 int otyp
1059 )
1060 {
1061 mdi_unit_t *ui = MDI_UNIT(mnum);
1062 int err = 0;
1063
1064 /* check type and flags */
1065 ASSERT(ui != NULL);
1066 mutex_enter(&ui->ui_mx);
1067 if ((otyp < 0) || (otyp >= OTYPCNT)) {
1068 err = EINVAL;
1069 goto out;
1070 }
1071 if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
1072 (ui->ui_lock & MD_UL_EXCL)) {
1073 err = EBUSY;
1074 goto out;
1075 }
1076
1077 /* count and flag open */
1078 ui->ui_ocnt[otyp]++;
1079 ui->ui_lock |= MD_UL_OPEN;
1080 if (flag & FEXCL)
1081 ui->ui_lock |= MD_UL_EXCL;
1082
1083 /* setup kstat, return success */
1084 mutex_exit(&ui->ui_mx);
1085 md_kstat_init(mnum);
1086 return (0);
1087
1088 /* return error */
1089 out:
1090 mutex_exit(&ui->ui_mx);
1091 return (err);
1092 }
1093
1094 int
md_unit_decopen(
1096 minor_t mnum,
1097 int otyp
1098 )
1099 {
1100 mdi_unit_t *ui = MDI_UNIT(mnum);
1101 int err = 0;
1102 unsigned i;
1103
1104 /* check type and flags */
1105 ASSERT(ui != NULL);
1106 mutex_enter(&ui->ui_mx);
1107 if ((otyp < 0) || (otyp >= OTYPCNT)) {
1108 err = EINVAL;
1109 goto out;
1110 } else if (ui->ui_ocnt[otyp] == 0) {
1111 err = ENXIO;
1112 goto out;
1113 }
1114
1115 /* count and flag closed */
1116 if (otyp == OTYP_LYR)
1117 ui->ui_ocnt[otyp]--;
1118 else
1119 ui->ui_ocnt[otyp] = 0;
1120 ui->ui_lock &= ~MD_UL_OPEN;
1121 for (i = 0; (i < OTYPCNT); ++i)
1122 if (ui->ui_ocnt[i] != 0)
1123 ui->ui_lock |= MD_UL_OPEN;
1124 if (! (ui->ui_lock & MD_UL_OPEN))
1125 ui->ui_lock &= ~MD_UL_EXCL;
1126
1127 /* teardown kstat, return success */
1128 if (! (ui->ui_lock & MD_UL_OPEN)) {
1129
1130 /*
1131 * We have a race condition inherited from specfs between
1132 * open() and close() calls. This results in the kstat
1133 * for a pending I/O being torn down, and then a panic.
1134 * To avoid this, only tear the kstat down if there are
1135 * no other readers on this device.
1136 */
1137 if (ui->ui_readercnt > 1) {
1138 mutex_exit(&ui->ui_mx);
1139 } else {
1140 mutex_exit(&ui->ui_mx);
1141 md_kstat_destroy(mnum);
1142 }
1143 return (0);
1144 }
1145
1146 /* return success */
1147 out:
1148 mutex_exit(&ui->ui_mx);
1149 return (err);
1150 }
1151
1152 md_dev64_t
md_xlate_targ_2_mini(md_dev64_t targ_devt)
1154 {
1155 dev32_t mini_32_devt, targ_32_devt;
1156 int i;
1157
	/*
	 * Check to see if we're in an upgrade situation.
	 * If we are not in upgrade, just return the input device.
	 */
1162
1163 if (!MD_UPGRADE)
1164 return (targ_devt);
1165
1166 targ_32_devt = md_cmpldev(targ_devt);
1167
1168 i = 0;
1169 while (i != md_tuple_length) {
1170 if (md_tuple_table[i].targ_devt == targ_32_devt) {
1171 mini_32_devt = md_tuple_table[i].mini_devt;
1172 return (md_expldev((md_dev64_t)mini_32_devt));
1173 }
1174 i++;
1175 }
1176 return (NODEV64);
1177 }
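
/*
 * Illustrative use during upgrade (hypothetical caller): translate the
 * target device and reject anything that has no translation.
 *
 *	md_dev64_t mini = md_xlate_targ_2_mini(targ);
 *	if (mini == NODEV64)
 *		return (ENODEV);
 */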
1178
1179 md_dev64_t
md_xlate_mini_2_targ(md_dev64_t mini_devt)
1181 {
1182 dev32_t mini_32_devt, targ_32_devt;
1183 int i;
1184
1185 if (!MD_UPGRADE)
1186 return (mini_devt);
1187
1188 mini_32_devt = md_cmpldev(mini_devt);
1189
1190 i = 0;
1191 while (i != md_tuple_length) {
1192 if (md_tuple_table[i].mini_devt == mini_32_devt) {
1193 targ_32_devt = md_tuple_table[i].targ_devt;
1194 return (md_expldev((md_dev64_t)targ_32_devt));
1195 }
1196 i++;
1197 }
1198 return (NODEV64);
1199 }
1200
1201 void
md_xlate_free(int size)
1203 {
1204 kmem_free(md_tuple_table, size);
1205 }
1206
1207 char *
md_targ_major_to_name(major_t maj)
1209 {
1210 char *drv_name = NULL;
1211 int i;
1212
1213 if (!MD_UPGRADE)
1214 return (ddi_major_to_name(maj));
1215
1216 for (i = 0; i < md_majortab_len; i++) {
1217 if (md_major_tuple_table[i].targ_maj == maj) {
1218 drv_name = md_major_tuple_table[i].drv_name;
1219 break;
1220 }
1221 }
1222 return (drv_name);
1223 }
1224
1225 major_t
md_targ_name_to_major(char *drv_name)
1227 {
1228 major_t maj;
1229 int i;
1230
1231 maj = md_getmajor(NODEV64);
1232 if (!MD_UPGRADE)
1233 return (ddi_name_to_major(drv_name));
1234
1235 for (i = 0; i < md_majortab_len; i++) {
1236 if ((strcmp(md_major_tuple_table[i].drv_name,
1237 drv_name)) == 0) {
1238 maj = md_major_tuple_table[i].targ_maj;
1239 break;
1240 }
1241 }
1242
1243 return (maj);
1244 }
1245
1246 void
md_majortab_free()
1248 {
1249 size_t sz;
1250 int i;
1251
1252 for (i = 0; i < md_majortab_len; i++) {
1253 freestr(md_major_tuple_table[i].drv_name);
1254 }
1255
1256 sz = md_majortab_len * sizeof (struct md_xlate_major_table);
1257 kmem_free(md_major_tuple_table, sz);
1258 }
1259
1260 /* functions return a pointer to a function which returns an int */
1261
1262 intptr_t (*
md_get_named_service(md_dev64_t dev, int modindex, char *name,
1264 intptr_t (*Default)()))()
1265 {
1266 mdi_unit_t *ui;
1267 md_named_services_t *sp;
1268 int i;
1269
1270 /*
1271 * Return the first named service found.
1272 * Use this path when it is known that there is only
1273 * one named service possible (e.g., hotspare interface)
1274 */
1275 if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
1276 for (i = 0; i < MD_NOPS; i++) {
1277 if (md_ops[i] == NULL) {
1278 continue;
1279 }
1280 sp = md_ops[i]->md_services;
1281 if (sp == NULL)
1282 continue;
1283 while (sp->md_service != NULL) {
1284 if (strcmp(name, sp->md_name) == 0)
1285 return (sp->md_service);
1286 sp++;
1287 }
1288 }
1289 return (Default);
1290 }
1291
1292 /*
1293 * Return the named service for the given modindex.
1294 * This is used if there are multiple possible named services
1295 * and each one needs to be called (e.g., poke hotspares)
1296 */
1297 if (dev == NODEV64) {
1298 if (modindex >= MD_NOPS)
1299 return (Default);
1300
1301 if (md_ops[modindex] == NULL)
1302 return (Default);
1303
1304 sp = md_ops[modindex]->md_services;
1305 if (sp == NULL)
1306 return (Default);
1307
1308 while (sp->md_service != NULL) {
1309 if (strcmp(name, sp->md_name) == 0)
1310 return (sp->md_service);
1311 sp++;
1312 }
1313 return (Default);
1314 }
1315
1316 /*
1317 * Return the named service for this md_dev64_t
1318 */
1319 if (md_getmajor(dev) != md_major)
1320 return (Default);
1321
1322 if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
1323 (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
1324 return (NULL);
1325
1326
1327 if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
1328 return (NULL);
1329
1330 sp = md_ops[ui->ui_opsindex]->md_services;
1331 if (sp == NULL)
1332 return (Default);
1333 while (sp->md_service != NULL) {
1334 if (strcmp(name, sp->md_name) == 0)
1335 return (sp->md_service);
1336 sp++;
1337 }
1338 return (Default);
1339 }
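
/*
 * Hedged lookup sketch; "poke hotspares" is only a placeholder service
 * name here, not necessarily one registered by any module:
 *
 *	intptr_t (*svc)();
 *
 *	svc = md_get_named_service(NODEV64, ANY_SERVICE,
 *	    "poke hotspares", 0);
 *	if (svc != NULL)
 *		(void) (*svc)();
 */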
1340
1341 /*
1342 * md_daemon callback routine
1343 */
1344 boolean_t
callb_md_cpr(void *arg, int code)
1346 {
1347 callb_cpr_t *cp = (callb_cpr_t *)arg;
1348 int ret = 0; /* assume success */
1349 clock_t delta;
1350
1351 mutex_enter(cp->cc_lockp);
1352
1353 switch (code) {
1354 case CB_CODE_CPR_CHKPT:
1355 /*
1356 * Check for active resync threads
1357 */
1358 mutex_enter(&md_cpr_resync.md_resync_mutex);
1359 if ((md_cpr_resync.md_mirror_resync > 0) ||
1360 (md_cpr_resync.md_raid_resync > 0)) {
1361 mutex_exit(&md_cpr_resync.md_resync_mutex);
1362 cmn_err(CE_WARN, "There are Solaris Volume Manager "
1363 "synchronization threads running.");
1364 cmn_err(CE_WARN, "Please try system suspension at "
1365 "a later time.");
1366 ret = -1;
1367 break;
1368 }
1369 mutex_exit(&md_cpr_resync.md_resync_mutex);
1370
1371 cp->cc_events |= CALLB_CPR_START;
1372 delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
1373 while (!(cp->cc_events & CALLB_CPR_SAFE))
1374 /* cv_reltimedwait() returns -1 if it times out. */
1375 if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
1376 cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
1377 break;
1378 break;
1379
1380 case CB_CODE_CPR_RESUME:
1381 cp->cc_events &= ~CALLB_CPR_START;
1382 cv_signal(&cp->cc_stop_cv);
1383 break;
1384 }
1385 mutex_exit(cp->cc_lockp);
1386 return (ret != -1);
1387 }
1388
1389 void
md_daemon(int pass_thru, mdq_anchor_t *anchor)
1391 {
1392 daemon_queue_t *dq;
1393 callb_cpr_t cprinfo;
1394
1395 if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
1396 return;
1397 /*
1398 * Register cpr callback
1399 */
1400 CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
1401
1402 /*CONSTCOND*/
1403 while (1) {
1404 mutex_enter(&anchor->a_mx);
1405 while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
1406 if (pass_thru) {
1407 /*
1408 * CALLB_CPR_EXIT Will do
1409 * mutex_exit(&anchor->a_mx)
1410 */
1411 CALLB_CPR_EXIT(&cprinfo);
1412 return;
1413 }
1414 if (md_get_status() & MD_GBL_DAEMONS_DIE) {
1415 mutex_exit(&anchor->a_mx);
1416 mutex_enter(&md_mx);
1417 md_num_daemons--;
1418 mutex_exit(&md_mx);
1419 /*
1420 * CALLB_CPR_EXIT will do
1421 * mutex_exit(&anchor->a_mx)
1422 */
1423 mutex_enter(&anchor->a_mx);
1424 CALLB_CPR_EXIT(&cprinfo);
1425 thread_exit();
1426 }
1427 CALLB_CPR_SAFE_BEGIN(&cprinfo);
1428 cv_wait(&anchor->a_cv, &anchor->a_mx);
1429 CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
1430 }
1431 dq->dq_prev->dq_next = dq->dq_next;
1432 dq->dq_next->dq_prev = dq->dq_prev;
1433 dq->dq_prev = dq->dq_next = NULL;
1434 anchor->dq.qlen--;
1435 mutex_exit(&anchor->a_mx);
1436 (*(dq->dq_call))(dq);
1437 }
1438 /*NOTREACHED*/
1439 }
1440
1441 /*
1442 * daemon_request:
1443 *
1444 * Adds requests to appropriate requestq which is
1445 * anchored by *anchor.
1446 * The request is the first element of a doubly linked circular list.
1447 * When the request is a single element, the forward and backward
1448 * pointers MUST point to the element itself.
1449 */
1450
1451 void
daemon_request(mdq_anchor_t *anchor, void (*func)(),
1453 daemon_queue_t *request, callstyle_t style)
1454 {
1455 daemon_queue_t *rqtp;
1456 int i = 0;
1457
1458 rqtp = request;
1459 if (style == REQ_OLD) {
1460 ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
1461 /* set it to the new style */
1462 rqtp->dq_prev = rqtp->dq_next = rqtp;
1463 }
1464 ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
1465
1466 /* scan the list and add the function to each element */
1467
1468 do {
1469 rqtp->dq_call = func;
1470 i++;
1471 rqtp = rqtp->dq_next;
1472 } while (rqtp != request);
1473
1474 /* save pointer to tail of the request list */
1475 rqtp = request->dq_prev;
1476
1477 mutex_enter(&anchor->a_mx);
1478 /* stats */
1479 anchor->dq.qlen += i;
1480 anchor->dq.treqs += i;
1481 anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
1482 anchor->dq.qlen : anchor->dq.maxq_len;
1483
1484 /* now add the list to request queue */
1485 request->dq_prev = anchor->dq.dq_prev;
1486 rqtp->dq_next = &anchor->dq;
1487 anchor->dq.dq_prev->dq_next = request;
1488 anchor->dq.dq_prev = rqtp;
1489 cv_broadcast(&anchor->a_cv);
1490 mutex_exit(&anchor->a_mx);
1491 }
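
/*
 * Single-request sketch (REQ_OLD style, so daemon_request() builds the
 * circular links itself; dqp points at the daemon_queue_t embedded in a
 * hypothetical request structure and "handler" is a hypothetical callback):
 *
 *	dqp->dq_next = dqp->dq_prev = NULL;
 *	daemon_request(&md_done_daemon, handler, dqp, REQ_OLD);
 */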
1492
1493 void
mddb_commitrec_wrapper(mddb_recid_t recid)
1495 {
1496 int sent_log = 0;
1497 uint_t retry = md_retry_cnt;
1498 set_t setno;
1499
1500 while (mddb_commitrec(recid)) {
1501 if (! sent_log) {
1502 cmn_err(CE_WARN,
1503 "md: state database commit failed");
1504 sent_log = 1;
1505 }
1506 delay(md_hz);
1507
		/*
		 * Setting the retry count to one (pre-decremented) so that
		 * we actually do no retries when committing/deleting an
		 * mddb rec.  The underlying disk driver does several retries
		 * to check if the disk is really dead or not, so there is no
		 * reason for us to retry on top of the driver's retries.
		 */
1515
1516 if (--retry == 0) {
1517 setno = mddb_getsetnum(recid);
1518 if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1519 panic(
1520 "md: Panic due to lack of DiskSuite state\n"
1521 " database replicas. Fewer than 50%% of "
1522 "the total were available,\n so panic to "
1523 "ensure data integrity.");
1524 } else {
1525 panic("md: state database problem");
1526 }
1527 /*NOTREACHED*/
1528 }
1529 }
1530 }
1531
1532 void
mddb_commitrecs_wrapper(mddb_recid_t *recids)
1534 {
1535 int sent_log = 0;
1536 uint_t retry = md_retry_cnt;
1537 set_t setno;
1538
1539 while (mddb_commitrecs(recids)) {
1540 if (! sent_log) {
1541 cmn_err(CE_WARN,
1542 "md: state database commit failed");
1543 sent_log = 1;
1544 }
1545 delay(md_hz);
1546
		/*
		 * Setting the retry count to one (pre-decremented) so that
		 * we actually do no retries when committing/deleting an
		 * mddb rec.  The underlying disk driver does several retries
		 * to check if the disk is really dead or not, so there is no
		 * reason for us to retry on top of the driver's retries.
		 */
1554
1555 if (--retry == 0) {
			/*
			 * Since all the records are part of the same set,
			 * use the first one to get setno.
			 */
1560 setno = mddb_getsetnum(*recids);
1561 if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1562 panic(
1563 "md: Panic due to lack of DiskSuite state\n"
1564 " database replicas. Fewer than 50%% of "
1565 "the total were available,\n so panic to "
1566 "ensure data integrity.");
1567 } else {
1568 panic("md: state database problem");
1569 }
1570 /*NOTREACHED*/
1571 }
1572 }
1573 }
1574
1575 void
mddb_deleterec_wrapper(mddb_recid_t recid)
1577 {
1578 int sent_log = 0;
1579 uint_t retry = md_retry_cnt;
1580 set_t setno;
1581
1582 while (mddb_deleterec(recid)) {
1583 if (! sent_log) {
1584 cmn_err(CE_WARN,
1585 "md: state database delete failed");
1586 sent_log = 1;
1587 }
1588 delay(md_hz);
1589
		/*
		 * Setting the retry count to one (pre-decremented) so that
		 * we actually do no retries when committing/deleting an
		 * mddb rec.  The underlying disk driver does several retries
		 * to check if the disk is really dead or not, so there is no
		 * reason for us to retry on top of the driver's retries.
		 */
1597
1598 if (--retry == 0) {
1599 setno = mddb_getsetnum(recid);
1600 if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1601 panic(
1602 "md: Panic due to lack of DiskSuite state\n"
1603 " database replicas. Fewer than 50%% of "
1604 "the total were available,\n so panic to "
1605 "ensure data integrity.");
1606 } else {
1607 panic("md: state database problem");
1608 }
1609 /*NOTREACHED*/
1610 }
1611 }
1612 }
1613
1614 /*
1615 * md_holdset_enter is called in order to hold the set in its
1616 * current state (loaded, unloaded, snarfed, unsnarfed, etc)
1617 * until md_holdset_exit is called. This is used by the mirror
1618 * code to mark the set as HOLD so that the set won't be
1619 * unloaded while hotspares are being allocated in check_4_hotspares.
1620 * The original fix to the mirror code to hold the set was to call
1621 * md_haltsnarf_enter, but this will block all ioctls and ioctls
1622 * must work for a MN diskset while hotspares are allocated.
1623 */
1624 void
md_holdset_enter(set_t setno)
1626 {
1627 mutex_enter(&md_mx);
1628 while (md_set[setno].s_status & MD_SET_HOLD)
1629 cv_wait(&md_cv, &md_mx);
1630 md_set[setno].s_status |= MD_SET_HOLD;
1631 mutex_exit(&md_mx);
1632 }
1633
1634 void
md_holdset_exit(set_t setno)
1636 {
1637 mutex_enter(&md_mx);
1638 md_set[setno].s_status &= ~MD_SET_HOLD;
1639 cv_broadcast(&md_cv);
1640 mutex_exit(&md_mx);
1641 }
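
/*
 * Illustrative hold around hotspare allocation (hypothetical caller):
 *
 *	md_holdset_enter(setno);
 *	... the set cannot be unloaded/released while HOLD is set ...
 *	md_holdset_exit(setno);
 */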
1642
1643 /*
1644 * Returns a 0 if this thread marked the set as HOLD (success),
1645 * returns a -1 if set was already marked HOLD (failure).
1646 * Used by the release_set code to see if set is marked HOLD.
1647 * HOLD is set by a daemon when hotspares are being allocated
1648 * to mirror units.
1649 */
1650 int
md_holdset_testandenter(set_t setno)
1652 {
1653 mutex_enter(&md_mx);
1654 if (md_set[setno].s_status & MD_SET_HOLD) {
1655 mutex_exit(&md_mx);
1656 return (-1);
1657 }
1658 md_set[setno].s_status |= MD_SET_HOLD;
1659 mutex_exit(&md_mx);
1660 return (0);
1661 }
1662
1663 void
md_haltsnarf_enter(set_t setno)
1665 {
1666 mutex_enter(&md_mx);
1667 while (md_set[setno].s_status & MD_SET_SNARFING)
1668 cv_wait(&md_cv, &md_mx);
1669
1670 md_set[setno].s_status |= MD_SET_SNARFING;
1671 mutex_exit(&md_mx);
1672 }
1673
1674 void
md_haltsnarf_exit(set_t setno)
1676 {
1677 mutex_enter(&md_mx);
1678 md_set[setno].s_status &= ~MD_SET_SNARFING;
1679 cv_broadcast(&md_cv);
1680 mutex_exit(&md_mx);
1681 }
1682
1683 void
md_haltsnarf_wait(set_t setno)
1685 {
1686 mutex_enter(&md_mx);
1687 while (md_set[setno].s_status & MD_SET_SNARFING)
1688 cv_wait(&md_cv, &md_mx);
1689 mutex_exit(&md_mx);
1690 }
1691
1692 /*
1693 * ASSUMED that the md_unit_array_rw WRITER lock is held.
1694 */
1695 int
md_halt_set(set_t setno, enum md_haltcmd cmd)
1697 {
1698 int i, err;
1699
1700 if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
1701 return (0);
1702 }
1703
1704 if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
1705 for (i = 0; i < MD_NOPS; i++) {
1706 if (md_ops[i] == NULL)
1707 continue;
1708 if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
1709 for (--i; i > 0; --i) {
1710 if (md_ops[i] == NULL)
1711 continue;
1712 (void) (*(md_ops[i]->md_halt))
1713 (MD_HALT_OPEN, setno);
1714 }
1715 return (EBUSY);
1716 }
1717 }
1718
1719 for (i = 0; i < MD_NOPS; i++) {
1720 if (md_ops[i] == NULL)
1721 continue;
1722 if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
1723 for (i = 0; i < MD_NOPS; i++) {
1724 if (md_ops[i] == NULL)
1725 continue;
1726 (void) (*(md_ops[i]->md_halt))
1727 (MD_HALT_OPEN, setno);
1728 }
1729 return (EBUSY);
1730 }
1731 }
1732 }
1733
1734 if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
1735 for (i = 0; i < MD_NOPS; i++) {
1736 if (md_ops[i] == NULL)
1737 continue;
1738 err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
1739 if (err != 0)
1740 cmn_err(CE_NOTE,
1741 "md: halt failed for %s, error %d",
1742 md_ops[i]->md_driver.md_drivername, err);
1743 }
1744
1745 /*
1746 * Unload the devid namespace if it is loaded
1747 */
1748 md_unload_namespace(setno, NM_DEVID);
1749 md_unload_namespace(setno, 0L);
1750 md_clr_setstatus(setno, MD_SET_SNARFED);
1751 }
1752
1753 return (0);
1754 }
1755
1756 int
md_halt(int global_locks_owned_mask)
1758 {
1759 set_t i, j;
1760 int err;
1761 int init_queues;
1762 md_requestq_entry_t *rqp;
1763 md_ops_t **pops, *ops, *lops;
1764 ddi_modhandle_t mod;
1765 char *name;
1766
1767 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1768
	/*
	 * Grab all of the global locks that are not
	 * already owned to ensure that there isn't another
	 * thread trying to access a global resource
	 * while the halt is in progress.
	 */
1775 if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
1776 return (EINTR);
1777
1778 for (i = 0; i < md_nsets; i++)
1779 md_haltsnarf_enter(i);
1780
1781 /*
1782 * Kill the daemon threads.
1783 */
1784 init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
1785 md_clr_status(MD_GBL_DAEMONS_LIVE);
1786 md_set_status(MD_GBL_DAEMONS_DIE);
1787
1788 rqp = &md_daemon_queues[0];
1789 i = 0;
1790 while (!NULL_REQUESTQ_ENTRY(rqp)) {
1791 cv_broadcast(&rqp->dispq_headp->a_cv);
1792 rqp = &md_daemon_queues[++i];
1793 }
1794
1795 mutex_enter(&md_mx);
1796 while (md_num_daemons != 0) {
1797 mutex_exit(&md_mx);
1798 delay(md_hz);
1799 mutex_enter(&md_mx);
1800 }
1801 mutex_exit(&md_mx);
1802 md_clr_status(MD_GBL_DAEMONS_DIE);
1803
1804 for (i = 0; i < md_nsets; i++)
		/*
		 * Only call into md_halt_set if s_un / s_ui are both set.
		 * If they are NULL this set hasn't been accessed, so it's
		 * pointless performing the call.
		 */
1810 if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1811 if (md_halt_set(i, MD_HALT_CHECK)) {
1812 if (md_start_daemons(init_queues))
1813 cmn_err(CE_WARN,
1814 "md: restart of daemon threads "
1815 "failed");
1816 for (j = 0; j < md_nsets; j++)
1817 md_haltsnarf_exit(j);
1818
1819 return (md_global_lock_exit(
1820 global_locks_owned_mask, EBUSY,
1821 MD_ARRAY_WRITER, NULL));
1822 }
1823 }
1824
1825 /*
1826 * if we get here we are going to do it
1827 */
1828 for (i = 0; i < md_nsets; i++) {
		/*
		 * Only call into md_halt_set if s_un / s_ui are both set.
		 * If they are NULL this set hasn't been accessed, so it's
		 * pointless performing the call.
		 */
1834 if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1835 err = md_halt_set(i, MD_HALT_DOIT);
1836 if (err != 0)
1837 cmn_err(CE_NOTE,
1838 "md: halt failed set %u, error %d",
1839 (unsigned)i, err);
1840 }
1841 }
1842
	/*
	 * Issue a halt unload to each module to indicate that it
	 * is about to be unloaded.  Each module is called once; the set
	 * number has no meaning at this point in time.
	 */
1848 for (i = 0; i < MD_NOPS; i++) {
1849 if (md_ops[i] == NULL)
1850 continue;
1851 err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
1852 if (err != 0)
1853 cmn_err(CE_NOTE,
1854 "md: halt failed for %s, error %d",
1855 md_ops[i]->md_driver.md_drivername, err);
1856 }
1857
1858 /* ddi_modclose the submodules */
1859 for (i = 0; i < MD_NOPS; i++) {
1860 /* skip if not open */
1861 if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
1862 continue;
1863
1864 /* find and unlink from md_opslist */
1865 ops = md_ops[i];
1866 mod = md_mods[i];
1867 pops = &md_opslist;
1868 for (lops = *pops; lops;
1869 pops = &lops->md_next, lops = *pops) {
1870 if (lops == ops) {
1871 *pops = ops->md_next;
1872 ops->md_next = NULL;
1873 break;
1874 }
1875 }
1876
1877 /* uninitialize */
1878 name = ops->md_driver.md_drivername;
1879 md_ops[i] = NULL;
1880 md_mods[i] = NULL;
1881 ops->md_selfindex = 0;
1882 ops->md_driver.md_drivername[0] = '\0';
1883 rw_destroy(&ops->md_link_rw.lock);
1884
1885 /* close */
1886 err = ddi_modclose(mod);
1887 if (err != 0)
1888 cmn_err(CE_NOTE,
1889 "md: halt close failed for %s, error %d",
1890 name ? name : "UNKNOWN", err);
1891 }
1892
1893 /* Unload the database */
1894 mddb_unload();
1895
1896 md_set_status(MD_GBL_HALTED); /* we are ready to be unloaded */
1897
1898 for (i = 0; i < md_nsets; i++)
1899 md_haltsnarf_exit(i);
1900
1901 return (md_global_lock_exit(global_locks_owned_mask, 0,
1902 MD_ARRAY_WRITER, NULL));
1903 }
1904
1905 /*
1906 * md_layered_open() is an internal routine only for SVM modules.
1907 * So the input device will be a md_dev64_t, because all SVM modules internally
1908 * work with that device type.
1909 * ddi routines on the other hand work with dev_t. So, if we call any ddi
1910 * routines from here we first have to convert that device into a dev_t.
1911 */
1912
1913 int
md_layered_open(
1915 minor_t mnum,
1916 md_dev64_t *dev,
1917 int md_oflags
1918 )
1919 {
1920 int flag = (FREAD | FWRITE);
1921 cred_t *cred_p = kcred;
1922 major_t major;
1923 int err;
1924 dev_t ddi_dev = md_dev64_to_dev(*dev);
1925
1926 if (ddi_dev == NODEV)
1927 return (ENODEV);
1928
1929 major = getmajor(ddi_dev);
1930
1931 /* metadevice */
1932 if (major == md_major) {
1933 mdi_unit_t *ui;
1934
1935 /* open underlying driver */
1936 mnum = getminor(ddi_dev);
1937
1938 ui = MDI_UNIT(mnum);
1939 if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1940 int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
1941 flag, OTYP_LYR, cred_p, md_oflags);
1942 /*
1943 * As open() may change the device,
1944 * send this info back to the caller.
1945 */
1946 *dev = md_expldev(ddi_dev);
1947 return (ret);
1948 }
1949
1950 /* or do it ourselves */
1951 (void) md_unit_openclose_enter(ui);
1952 err = md_unit_incopen(mnum, flag, OTYP_LYR);
1953 md_unit_openclose_exit(ui);
1954 /* convert our ddi_dev back to the dev we were given */
1955 *dev = md_expldev(ddi_dev);
1956 return (err);
1957 }
1958
1959 /*
1960 * Open regular device, since open() may change dev_t give new dev_t
1961 * back to the caller.
1962 */
1963 err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
1964 *dev = md_expldev(ddi_dev);
1965 return (err);
1966 }
1967
1968 /*
1969 * md_layered_close() is an internal routine only for SVM modules.
1970 * So the input device will be a md_dev64_t, because all SVM modules internally
1971 * work with that device type.
1972 * ddi routines on the other hand work with dev_t. So, if we call any ddi
1973 * routines from here we first have to convert that device into a dev_t.
1974 */
1975 void
md_layered_close(
1977 md_dev64_t dev,
1978 int md_cflags
1979 )
1980 {
1981 int flag = (FREAD | FWRITE);
1982 cred_t *cred_p = kcred;
1983 dev_t ddi_dev = md_dev64_to_dev(dev);
1984 major_t major = getmajor(ddi_dev);
1985 minor_t mnum = getminor(ddi_dev);
1986
1987 /* metadevice */
1988 if (major == md_major) {
1989 mdi_unit_t *ui = MDI_UNIT(mnum);
1990
1991 /* close underlying driver */
1992 if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1993 (*md_ops[ui->ui_opsindex]->md_close)
1994 (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
1995 return;
1996 }
1997
1998 /* or do it ourselves */
1999 (void) md_unit_openclose_enter(ui);
2000 (void) md_unit_decopen(mnum, OTYP_LYR);
2001 md_unit_openclose_exit(ui);
2002 return;
2003 }
2004
2005 /* close regular device */
2006 (void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
2007 }
2008
2009 /*
2010 * saves a little code in mdstrategy
2011 */
2012 int
2013 errdone(mdi_unit_t *ui, struct buf *bp, int err)
2014 {
2015 if ((bp->b_error = err) != 0)
2016 bp->b_flags |= B_ERROR;
2017 else
2018 bp->b_resid = bp->b_bcount;
2019 md_unit_readerexit(ui);
2020 md_biodone(bp);
2021 return (1);
2022 }
2023
2024 static int md_write_label = 0;
2025
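/*
 * md_checkbuf: validate an incoming buf against the unit before issuing
 * I/O. Fails writes to a STALE set (EROFS), out-of-range block numbers
 * (EINVAL) and writes to block 0 of a labeled unit unless md_write_label
 * is set, and trims requests that run past the end of the metadevice.
 * Returns non-zero if the buf has already been completed via errdone().
 */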
2026 int
2027 md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
2028 {
2029 diskaddr_t endblk;
2030 set_t setno = MD_UN2SET(un);
2031
2032 if ((md_get_setstatus(setno) & MD_SET_STALE) &&
2033 (! (bp->b_flags & B_READ)))
2034 return (errdone(ui, bp, EROFS));
2035 /*
2036 * Check early for unreasonable block number.
2037 *
2038 	 * b_blkno is defined as a daddr_t which is typedef'd to a long.
2039 * A problem occurs if b_blkno has bit 31 set and un_total_blocks
2040 * doesn't, b_blkno is then compared as a negative number which is
2041 * always less than a positive.
2042 */
2043 if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
2044 return (errdone(ui, bp, EINVAL));
2045
2046 if (bp->b_lblkno == un->c.un_total_blocks)
2047 return (errdone(ui, bp, 0));
2048
2049 /*
2050 * make sure we don't clobber any labels
2051 */
2052 if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
2053 (un->c.un_flag & MD_LABELED) && (! md_write_label)) {
2054 cmn_err(CE_NOTE, "md: %s: write to label",
2055 md_shortname(getminor(bp->b_edev)));
2056 return (errdone(ui, bp, EINVAL));
2057 }
2058
2059 bp->b_resid = 0;
2060 endblk = (diskaddr_t)(bp->b_lblkno +
2061 howmany(bp->b_bcount, DEV_BSIZE) - 1);
2062
2063 if (endblk > (un->c.un_total_blocks - 1)) {
2064 bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
2065 endblk = un->c.un_total_blocks - 1;
2066 bp->b_bcount -= bp->b_resid;
2067 }
2068 return (0);
2069 }
2070
2071 /*
2072  * init_requestq: initializes the request queue and creates the threads.
2073  * return value = 0 : invalid num_threads
2074  *              = n : n is the number of threads created.
2075 */
2076
2077 int
2078 init_requestq(
2079 md_requestq_entry_t *rq, /* request queue info */
2080 void (*threadfn)(), /* function to start the thread */
2081 caddr_t threadfn_args, /* args to the function */
2082 int pri, /* thread priority */
2083 int init_queue) /* flag to init queues */
2084 {
2085 struct mdq_anchor *rqhead;
2086 int i;
2087 int num_threads;
2088
2089
2090 num_threads = *(rq->num_threadsp);
2091 rqhead = rq->dispq_headp;
2092
2093 if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
2094 return (0);
2095
2096 if (init_queue) {
2097 rqhead->dq.maxq_len = 0;
2098 rqhead->dq.treqs = 0;
2099 rqhead->dq.dq_next = &rqhead->dq;
2100 rqhead->dq.dq_prev = &rqhead->dq;
2101 cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
2102 mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
2103 }
2104 for (i = 0; i < num_threads; i++) {
2105 (void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
2106 TS_RUN, pri);
2107 }
2108 return (i);
2109 }
2110
2111 static void
2112 start_daemon(struct mdq_anchor *q)
2113 {
2114 md_daemon(0, q);
2115 ASSERT(0);
2116 }
2117
2118 /*
2119 * Creates all the md daemons.
2120 * Global:
2121 * md_num_daemons is set to number of daemons.
2122 * MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
2123 *
2124 * Return value: 0 success
2125 * 1 failure
2126 */
2127 int
2128 md_start_daemons(int init_queue)
2129 {
2130 md_requestq_entry_t *rqp;
2131 int cnt;
2132 int i;
2133 int retval = 0;
2134
2135
2136 if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
2137 return (retval);
2138 }
2139 md_clr_status(MD_GBL_DAEMONS_DIE);
2140
2141 rqp = &md_daemon_queues[0];
2142 i = 0;
2143 while (!NULL_REQUESTQ_ENTRY(rqp)) {
2144 cnt = init_requestq(rqp, start_daemon,
2145 (caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
2146
2147 if (cnt && cnt != *rqp->num_threadsp) {
2148 retval = 1;
2149 break;
2150 }
2151 /*
2152 * initialize variables
2153 */
2154 md_num_daemons += cnt;
2155 rqp = &md_daemon_queues[++i];
2156 }
2157
2158 md_set_status(MD_GBL_DAEMONS_LIVE);
2159 return (retval);
2160 }
2161
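/*
 * md_loadsubmod: ddi_modopen() the named submodule if it is not already
 * open, record its md_ops in md_ops[]/md_mods[] and on the md_opslist
 * chain, and return its index. Returns -1 on failure (module table full,
 * open or symbol lookup fails, or a name record cannot be added because
 * the set is stale).
 */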
2162 int
2163 md_loadsubmod(set_t setno, char *name, int drvrid)
2164 {
2165 ddi_modhandle_t mod;
2166 md_ops_t **pops, *ops;
2167 int i, err;
2168
2169 /*
2170 * See if the submodule is mdopened. If not, i is the index of the
2171 * next empty slot.
2172 */
2173 for (i = 0; md_ops[i] != NULL; i++) {
2174 if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2175 MD_DRIVERNAMELEN) == 0)
2176 return (i);
2177
2178 if (i == (MD_NOPS - 1))
2179 return (-1);
2180 }
2181
2182 if (drvrid < 0) {
2183 /* Do not try to add any records to the DB when stale. */
2184 if (md_get_setstatus(setno) & MD_SET_STALE)
2185 return (-1);
2186 drvrid = md_setshared_name(setno, name, 0L);
2187 }
2188
2189 if (drvrid < 0)
2190 return (-1);
2191
2192 /* open and import the md_ops of the submodules */
2193 mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
2194 if (mod == NULL) {
2195 cmn_err(CE_WARN, "md_loadsubmod: "
2196 "unable to ddi_modopen %s, error %d\n", name, err);
2197 return (-1);
2198 }
2199 pops = ddi_modsym(mod, "md_interface_ops", &err);
2200 if (pops == NULL) {
2201 cmn_err(CE_WARN, "md_loadsubmod: "
2202 "unable to import md_interface_ops from %s, error %d\n",
2203 name, err);
2204 (void) ddi_modclose(mod);
2205 return (-1);
2206 }
2207
2208 /* ddi_modsym returns pointer to md_interface_ops in submod */
2209 ops = *pops;
2210
2211 /* initialize */
2212 ops->md_selfindex = i;
2213 rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
2214 (void) strncpy(ops->md_driver.md_drivername, name,
2215 MD_DRIVERNAMELEN);
2216
2217 /* plumb */
2218 md_ops[i] = ops;
2219 md_mods[i] = mod;
2220 ops->md_next = md_opslist;
2221 md_opslist = ops;
2222
2223 /* return index */
2224 return (i);
2225 }
2226
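/*
 * md_getmodindex: map a driver name and set to its md_ops[] index,
 * loading the submodule via md_loadsubmod() and snarfing its records if
 * necessary. Module binding is single-threaded against md_snarf_db_set()
 * by md_haltsnarf_enter()/md_haltsnarf_exit(). Returns -1 on failure.
 */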
2227 int
2228 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
2229 {
2230 int i;
2231 int modindex;
2232 char *name = driver->md_drivername;
2233 set_t setno = driver->md_setno;
2234 int drvid;
2235 int local_dont_load;
2236
2237 if (setno >= md_nsets)
2238 return (-1);
2239
2240 for (i = 0; name[i] != 0; i++)
2241 if (i == (MD_DRIVERNAMELEN -1))
2242 return (-1);
2243
2244 /*
2245 * If set is STALE, set local_dont_load to 1 since no records
2246 * should be added to DB when stale.
2247 */
2248 if (md_get_setstatus(setno) & MD_SET_STALE) {
2249 local_dont_load = 1;
2250 } else {
2251 local_dont_load = dont_load;
2252 }
2253
2254 /*
2255 * Single thread ioctl module binding with respect to
2256 * similar code executed in md_loadsubmod that is called
2257 * from md_snarf_db_set (which is where that path does
2258 * its md_haltsnarf_enter call).
2259 */
2260 md_haltsnarf_enter(setno);
2261
2262 /* See if the submodule is already ddi_modopened. */
2263 for (i = 0; md_ops[i] != NULL; i++) {
2264 if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2265 MD_DRIVERNAMELEN) == 0) {
2266 if (! local_dont_load &&
2267 (md_getshared_key(setno, name) == MD_KEYBAD)) {
2268 if (md_setshared_name(setno, name, 0L)
2269 == MD_KEYBAD) {
2270 if (!db_notrequired)
2271 goto err;
2272 }
2273 }
2274 md_haltsnarf_exit(setno);
2275 return (i);
2276 }
2277
2278 if (i == (MD_NOPS -1))
2279 break;
2280 }
2281
2282 if (local_dont_load)
2283 goto err;
2284
2285 drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
2286
2287 /* ddi_modopen the submodule */
2288 modindex = md_loadsubmod(setno, name, drvid);
2289 if (modindex < 0)
2290 goto err;
2291
2292 if (md_ops[modindex]->md_snarf != NULL)
2293 (*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
2294
2295 md_haltsnarf_exit(setno);
2296 return (modindex);
2297
2298 err: md_haltsnarf_exit(setno);
2299 return (-1);
2300 }
2301
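/*
 * md_call_strategy: route a buf to the strategy routine of the owning
 * submodule, or to bdev_strategy() if the target is not a metadevice.
 * A test point (mdv_strategy_tstpnt) may intercept the request first.
 */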
2302 void
2303 md_call_strategy(buf_t *bp, int flags, void *private)
2304 {
2305 mdi_unit_t *ui;
2306
2307 if (mdv_strategy_tstpnt)
2308 if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
2309 return;
2310 if (getmajor(bp->b_edev) != md_major) {
2311 (void) bdev_strategy(bp);
2312 return;
2313 }
2314
2315 flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
2316 ui = MDI_UNIT(getminor(bp->b_edev));
2317 ASSERT(ui != NULL);
2318 (*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
2319 }
2320
2321 /*
2322 * md_call_ioctl:
2323 * -------------
2324 * Issue the specified ioctl to the device associated with the given md_dev64_t
2325 *
2326 * Arguments:
2327 * dev - underlying device [md_dev64_t]
2328 * cmd - ioctl to perform
2329 * data - arguments / result location
2330 * mode - read/write/layered ioctl
2331 * lockp - lock reference
2332 *
2333 * Returns:
2334 * 0 success
2335 * !=0 Failure (error code)
2336 */
2337 int
2338 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
2339 {
2340 dev_t device = md_dev64_to_dev(dev);
2341 int rval;
2342 mdi_unit_t *ui;
2343
2344 /*
2345 * See if device is a metadevice. If not call cdev_ioctl(), otherwise
2346 * call the ioctl entry-point in the metadevice.
2347 */
2348 if (md_getmajor(dev) != md_major) {
2349 int rv;
2350 rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
2351 ddi_get_cred(), &rv);
2352 } else {
2353 ui = MDI_UNIT(md_getminor(dev));
2354 ASSERT(ui != NULL);
2355 rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
2356 mode, lockp);
2357 }
2358 return (rval);
2359 }
2360
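/*
 * md_rem_link: remove the entry matching (setno, id) from the given
 * md_link_t list under the writer lock; silently returns if none found.
 */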
2361 void
2362 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
2363 {
2364 md_link_t *next;
2365 md_link_t **pprev;
2366
2367 rw_enter(rw, RW_WRITER);
2368
2369 next = *head;
2370 pprev = head;
2371 while (next) {
2372 if ((next->ln_setno == setno) && (next->ln_id == id)) {
2373 *pprev = next->ln_next;
2374 rw_exit(rw);
2375 return;
2376 }
2377 pprev = &next->ln_next;
2378 next = next->ln_next;
2379 }
2380
2381 rw_exit(rw);
2382 }
2383
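/*
 * md_dev_exists: return non-zero if the given device exists. Devices
 * that are not metadevices are assumed to exist; a metadevice must have
 * valid set/unit numbers and an incore unit structure.
 */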
2384 int
2385 md_dev_exists(md_dev64_t dev)
2386 {
2387
2388 if (dev == NODEV64)
2389 return (0);
2390
2391 if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
2392 return (1);
2393
2394 if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
2395 (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
2396 return (0);
2397
2398 if (MDI_UNIT(md_getminor(dev)) != NULL)
2399 return (1);
2400
2401 return (0);
2402 }
2403
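/*
 * md_get_parent / md_set_parent / md_reset_parent: accessors for the
 * parent field in a metadevice's common unit structure, read and updated
 * under the unit readerlock. Non-metadevices report MD_NO_PARENT.
 */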
2404 md_parent_t
2405 md_get_parent(md_dev64_t dev)
2406 {
2407 md_unit_t *un;
2408 mdi_unit_t *ui;
2409 md_parent_t parent;
2410
2411 if (md_getmajor(dev) != md_major)
2412 return (MD_NO_PARENT);
2413
2414 ui = MDI_UNIT(md_getminor(dev));
2415
2416 un = (md_unit_t *)md_unit_readerlock(ui);
2417 parent = un->c.un_parent;
2418 md_unit_readerexit(ui);
2419
2420 return (parent);
2421 }
2422
2423 void
2424 md_set_parent(md_dev64_t dev, md_parent_t parent)
2425 {
2426 md_unit_t *un;
2427 mdi_unit_t *ui;
2428
2429 if (md_getmajor(dev) != md_major)
2430 return;
2431
2432 ui = MDI_UNIT(md_getminor(dev));
2433
2434 un = (md_unit_t *)md_unit_readerlock(ui);
2435 un->c.un_parent = parent;
2436 md_unit_readerexit(ui);
2437 }
2438
2439 void
2440 md_reset_parent(md_dev64_t dev)
2441 {
2442 md_unit_t *un;
2443 mdi_unit_t *ui;
2444
2445 if (md_getmajor(dev) != md_major)
2446 return;
2447
2448 ui = MDI_UNIT(md_getminor(dev));
2449
2450 un = (md_unit_t *)md_unit_readerlock(ui);
2451 un->c.un_parent = MD_NO_PARENT;
2452 md_unit_readerexit(ui);
2453 }
2454
2455
2456 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
2457
2458 int
2459 md_hot_spare_ifc(
2460 hs_cmds_t cmd,
2461 mddb_recid_t id,
2462 u_longlong_t size,
2463 int labeled,
2464 mddb_recid_t *hs_id,
2465 mdkey_t *key,
2466 md_dev64_t *dev,
2467 diskaddr_t *sblock)
2468 {
2469 int err;
2470
2471 /*
2472 * RW lock on hot_spare_interface. We don't want it to change from
2473 * underneath us. If hot_spare_interface is NULL we're going to
2474 * need to set it. So we need to upgrade to a WRITER lock. If that
2475 * doesn't work, we drop the lock and reenter as WRITER. This leaves
2476 * a small hole during which hot_spare_interface could be modified
2477 * so we check it for NULL again. What a pain. Then if still null
2478 * load from md_get_named_service.
2479 */
2480
2481 rw_enter(&hsp_rwlp.lock, RW_READER);
2482 if (hot_spare_interface == NULL) {
2483 if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
2484 rw_exit(&hsp_rwlp.lock);
2485 rw_enter(&hsp_rwlp.lock, RW_WRITER);
2486 if (hot_spare_interface != NULL) {
2487 err = ((*hot_spare_interface)
2488 (cmd, id, size, labeled, hs_id, key, dev,
2489 sblock));
2490 rw_exit(&hsp_rwlp.lock);
2491 return (err);
2492 }
2493 }
2494 hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2495 "hot spare interface", 0);
2496 rw_downgrade(&hsp_rwlp.lock);
2497 }
2498
2499 if (hot_spare_interface == NULL) {
2500 cmn_err(CE_WARN, "md: no hotspare interface");
2501 rw_exit(&hsp_rwlp.lock);
2502 return (0);
2503 }
2504
2505 err = ((*hot_spare_interface)
2506 (cmd, id, size, labeled, hs_id, key, dev, sblock));
2507 rw_exit(&hsp_rwlp.lock);
2508 return (err);
2509 }
2510
2511 void
2512 md_clear_hot_spare_interface()
2513 {
2514 rw_enter(&hsp_rwlp.lock, RW_WRITER);
2515 hot_spare_interface = NULL;
2516 rw_exit(&hsp_rwlp.lock);
2517 }
2518
2519
2520 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
2521
2522 int
2523 md_notify_interface(
2524 md_event_cmds_t cmd,
2525 md_tags_t tag,
2526 set_t set,
2527 md_dev64_t dev,
2528 md_event_type_t event
2529 )
2530 {
2531 int err;
2532
2533 if (md_event_queue == NULL)
2534 return (0);
2535 rw_enter(&ni_rwlp.lock, RW_READER);
2536 if (notify_interface == NULL) {
2537 if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
2538 rw_exit(&ni_rwlp.lock);
2539 rw_enter(&ni_rwlp.lock, RW_WRITER);
2540 if (notify_interface != NULL) {
2541 err = ((*notify_interface)
2542 (cmd, tag, set, dev, event));
2543 rw_exit(&ni_rwlp.lock);
2544 return (err);
2545 }
2546 }
2547 notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2548 "notify interface", 0);
2549 rw_downgrade(&ni_rwlp.lock);
2550 }
2551 if (notify_interface == NULL) {
2552 cmn_err(CE_WARN, "md: no notify interface");
2553 rw_exit(&ni_rwlp.lock);
2554 return (0);
2555 }
2556 err = ((*notify_interface)(cmd, tag, set, dev, event));
2557 rw_exit(&ni_rwlp.lock);
2558 return (err);
2559 }
2560
2561 char *
2562 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
2563 {
2564 char *setname;
2565 char name[MD_MAX_CTDLEN];
2566 minor_t mnum = md_getminor(dev);
2567 major_t maj = md_getmajor(dev);
2568 int rtn = 0;
2569
2570 /*
2571 * Verify that the passed dev_t refers to a valid metadevice.
2572 * If it doesn't we can make no assumptions as to what the device
2573 * name is. Return NULL in these cases.
2574 */
2575 if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
2576 (MD_MIN2SET(mnum) >= md_nsets)) {
2577 return (NULL);
2578 }
2579
2580 setname = NULL;
2581 name[0] = '\0';
2582 switch (tag) {
2583 case SVM_TAG_HSP:
2584 if (setno == 0) {
2585 rtn = snprintf(name, sizeof (name), "hsp%u",
2586 (unsigned)MD_MIN2UNIT(mnum));
2587 } else {
2588 setname = mddb_getsetname(setno);
2589 if (setname != NULL) {
2590 rtn = snprintf(name, sizeof (name), "%s/hsp%u",
2591 setname, (unsigned)MD_MIN2UNIT(mnum));
2592 }
2593 }
2594 break;
2595 case SVM_TAG_DRIVE:
2596 (void) sprintf(name, "drive");
2597 break;
2598 case SVM_TAG_HOST:
2599 (void) sprintf(name, "host");
2600 break;
2601 case SVM_TAG_SET:
2602 rtn = snprintf(name, sizeof (name), "%s",
2603 mddb_getsetname(setno));
2604 if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2605 (void) sprintf(name, "diskset");
2606 rtn = 0;
2607 }
2608 break;
2609 default:
2610 rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
2611 break;
2612 }
2613
2614 /* Check if we got any rubbish for any of the snprintf's */
2615 if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2616 return (NULL);
2617 }
2618
2619 return (md_strdup(name));
2620 }
2621
2622 /* Sysevent subclass and mdnotify event type pairs */
2623 struct node {
2624 char *se_ev;
2625 md_event_type_t md_ev;
2626 };
2627
2628 /*
2629  * Table must be sorted in case-sensitive ascending order of
2630  * the sysevent values
2631 */
2632 static struct node ev_table[] = {
2633 { ESC_SVM_ADD, EQ_ADD },
2634 { ESC_SVM_ATTACH, EQ_ATTACH },
2635 { ESC_SVM_ATTACHING, EQ_ATTACHING },
2636 { ESC_SVM_CHANGE, EQ_CHANGE },
2637 { ESC_SVM_CREATE, EQ_CREATE },
2638 { ESC_SVM_DELETE, EQ_DELETE },
2639 { ESC_SVM_DETACH, EQ_DETACH },
2640 { ESC_SVM_DETACHING, EQ_DETACHING },
2641 { ESC_SVM_DRIVE_ADD, EQ_DRIVE_ADD },
2642 { ESC_SVM_DRIVE_DELETE, EQ_DRIVE_DELETE },
2643 { ESC_SVM_ENABLE, EQ_ENABLE },
2644 { ESC_SVM_ERRED, EQ_ERRED },
2645 { ESC_SVM_EXCHANGE, EQ_EXCHANGE },
2646 { ESC_SVM_GROW, EQ_GROW },
2647 { ESC_SVM_HS_CHANGED, EQ_HS_CHANGED },
2648 { ESC_SVM_HS_FREED, EQ_HS_FREED },
2649 { ESC_SVM_HOST_ADD, EQ_HOST_ADD },
2650 { ESC_SVM_HOST_DELETE, EQ_HOST_DELETE },
2651 { ESC_SVM_HOTSPARED, EQ_HOTSPARED },
2652 { ESC_SVM_INIT_FAILED, EQ_INIT_FAILED },
2653 { ESC_SVM_INIT_FATAL, EQ_INIT_FATAL },
2654 { ESC_SVM_INIT_START, EQ_INIT_START },
2655 { ESC_SVM_INIT_SUCCESS, EQ_INIT_SUCCESS },
2656 { ESC_SVM_IOERR, EQ_IOERR },
2657 { ESC_SVM_LASTERRED, EQ_LASTERRED },
2658 { ESC_SVM_MEDIATOR_ADD, EQ_MEDIATOR_ADD },
2659 { ESC_SVM_MEDIATOR_DELETE, EQ_MEDIATOR_DELETE },
2660 { ESC_SVM_OFFLINE, EQ_OFFLINE },
2661 { ESC_SVM_OK, EQ_OK },
2662 { ESC_SVM_ONLINE, EQ_ONLINE },
2663 { ESC_SVM_OPEN_FAIL, EQ_OPEN_FAIL },
2664 { ESC_SVM_REGEN_DONE, EQ_REGEN_DONE },
2665 { ESC_SVM_REGEN_FAILED, EQ_REGEN_FAILED },
2666 { ESC_SVM_REGEN_START, EQ_REGEN_START },
2667 { ESC_SVM_RELEASE, EQ_RELEASE },
2668 { ESC_SVM_REMOVE, EQ_REMOVE },
2669 { ESC_SVM_RENAME_DST, EQ_RENAME_DST },
2670 { ESC_SVM_RENAME_SRC, EQ_RENAME_SRC },
2671 { ESC_SVM_REPLACE, EQ_REPLACE },
2672 { ESC_SVM_RESYNC_DONE, EQ_RESYNC_DONE },
2673 { ESC_SVM_RESYNC_FAILED, EQ_RESYNC_FAILED },
2674 { ESC_SVM_RESYNC_START, EQ_RESYNC_START },
2675 { ESC_SVM_RESYNC_SUCCESS, EQ_RESYNC_SUCCESS },
2676 { ESC_SVM_TAKEOVER, EQ_TAKEOVER }
2677 };
2678
2679 static md_tags_t md_tags[] = {
2680 TAG_UNK,
2681 TAG_METADEVICE,
2682 TAG_UNK,
2683 TAG_UNK,
2684 TAG_UNK,
2685 TAG_UNK,
2686 TAG_REPLICA,
2687 TAG_HSP,
2688 TAG_HS,
2689 TAG_SET,
2690 TAG_DRIVE,
2691 TAG_HOST,
2692 TAG_MEDIATOR
2693 };
2694
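/*
 * ev_get: binary search of ev_table (sorted by sysevent subclass) to
 * translate a sysevent subclass into its mdnotify event type.
 * Returns EQ_EMPTY if the subclass is not in the table.
 */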
2695 md_event_type_t
2696 ev_get(char *subclass)
2697 {
2698 int high, mid, low, p;
2699
2700 low = 0;
2701 high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
2702 while (low <= high) {
2703 mid = (high + low) / 2;
2704 p = strcmp(subclass, ev_table[mid].se_ev);
2705 if (p == 0) {
2706 return (ev_table[mid].md_ev);
2707 } else if (p < 0) {
2708 high = mid - 1;
2709 } else {
2710 low = mid + 1;
2711 }
2712 }
2713
2714 return (EQ_EMPTY);
2715 }
2716
2717 /*
2718 * Log mdnotify event
2719 */
2720 void
2721 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
2722 {
2723 md_event_type_t ev_type;
2724 md_tags_t md_tag;
2725
2726 /* Translate sysevent into mdnotify event */
2727 ev_type = ev_get(se_subclass);
2728
2729 if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
2730 md_tag = TAG_UNK;
2731 } else {
2732 md_tag = md_tags[tag];
2733 }
2734
2735 NOTIFY_MD(md_tag, setno, devid, ev_type);
2736 }
2737
2738 /*
2739 * Log SVM sys events
2740 */
2741 void
2742 svm_gen_sysevent(
2743 char *se_class,
2744 char *se_subclass,
2745 uint32_t tag,
2746 set_t setno,
2747 md_dev64_t devid
2748 )
2749 {
2750 nvlist_t *attr_list;
2751 sysevent_id_t eid;
2752 int err = DDI_SUCCESS;
2753 char *devname;
2754 extern dev_info_t *md_devinfo;
2755
2756 /* Raise the mdnotify event before anything else */
2757 do_mdnotify(se_subclass, tag, setno, devid);
2758
2759 if (md_devinfo == NULL) {
2760 return;
2761 }
2762
2763 err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
2764
2765 if (err == DDI_SUCCESS) {
2766 		/* Add the version number */
2767 err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
2768 (uint32_t)SVM_VERSION);
2769 if (err != DDI_SUCCESS) {
2770 goto fail;
2771 }
2772
2773 /* Add the tag attribute */
2774 err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
2775 if (err != DDI_SUCCESS) {
2776 goto fail;
2777 }
2778
2779 /* Add the set number attribute */
2780 err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
2781 if (err != DDI_SUCCESS) {
2782 goto fail;
2783 }
2784
2785 /* Add the device id attribute */
2786 err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
2787 if (err != DDI_SUCCESS) {
2788 goto fail;
2789 }
2790
2791 /* Add the device name attribute */
2792 devname = obj2devname(tag, setno, devid);
2793 if (devname != NULL) {
2794 err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2795 devname);
2796 freestr(devname);
2797 } else {
2798 err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2799 "unspecified");
2800 }
2801 if (err != DDI_SUCCESS) {
2802 goto fail;
2803 }
2804
2805 /* Attempt to post event */
2806 err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
2807 se_subclass, attr_list, &eid, DDI_SLEEP);
2808
2809 nvlist_free(attr_list);
2810 if (err != DDI_SUCCESS) {
2811 cmn_err(CE_WARN, "Failed to log event for %s, %s,"
2812 " err=%x", se_class, se_subclass, err);
2813 }
2814 }
2815
2816 return;
2817
2818 fail:
2819 nvlist_free(attr_list);
2820 cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
2821 se_class, se_subclass, err);
2822 }
2823
2824 void
2825 md_clear_named_service()
2826 {
2827 rw_enter(&ni_rwlp.lock, RW_WRITER);
2828 notify_interface = NULL;
2829 rw_exit(&ni_rwlp.lock);
2830 }
2831
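/*
 * md_create_unit_incore: allocate and initialize the incore state
 * (mdi_unit_t) for a unit, optionally with a separate I/O lock, and
 * plumb it into the MDI unit array and the submodule's unit list.
 */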
2832 void
2833 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
2834 {
2835 mdi_unit_t *ui;
2836 set_t setno = MD_MIN2SET(mnum);
2837
2838 ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
2839 ui->ui_opsindex = ops->md_selfindex;
2840
2841 /* initialize all the incore conditional variables */
2842 mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
2843 cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
2844
2845 if (alloc_lock) {
2846 ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
2847 mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
2848 cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
2849 mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
2850 MUTEX_DEFAULT, NULL);
2851 ui->ui_io_lock->io_list_front = NULL;
2852 ui->ui_io_lock->io_list_back = NULL;
2853 }
2854 if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
2855 rw_enter(&md_unit_array_rw.lock, RW_WRITER);
2856 MDI_VOIDUNIT(mnum) = (void *) ui;
2857 rw_exit(&md_unit_array_rw.lock);
2858 } else
2859 MDI_VOIDUNIT(mnum) = (void *) ui;
2860
2861 rw_enter(&ops->md_link_rw.lock, RW_WRITER);
2862 ui->ui_link.ln_next = ops->md_head;
2863 ui->ui_link.ln_setno = setno;
2864 ui->ui_link.ln_id = mnum;
2865 ops->md_head = &ui->ui_link;
2866 /* setup the unavailable field */
2867 #if defined(_ILP32)
2868 if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
2869 ui->ui_tstate |= MD_64MD_ON_32KERNEL;
2870 cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
2871 "metadevices are not accessible on a 32 bit kernel",
2872 mnum);
2873 }
2874 #endif
2875
2876 rw_exit(&ops->md_link_rw.lock);
2877 }
2878
2879 void
2880 md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
2881 {
2882 mdi_unit_t *ui;
2883
2884 /*
2885 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
2886 */
2887 ui = MDI_UNIT(mnum);
2888 if (ui == NULL)
2889 return;
2890
2891 md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
2892 &ops->md_head);
2893
2894 /* destroy the io lock if one is being used */
2895 if (ui->ui_io_lock) {
2896 mutex_destroy(&ui->ui_io_lock->io_mx);
2897 cv_destroy(&ui->ui_io_lock->io_cv);
2898 kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
2899 }
2900
2901 /* teardown kstat */
2902 md_kstat_destroy(mnum);
2903
2904 /* destroy all the incore conditional variables */
2905 mutex_destroy(&ui->ui_mx);
2906 cv_destroy(&ui->ui_cv);
2907
2908 kmem_free(ui, sizeof (mdi_unit_t));
2909 MDI_VOIDUNIT(mnum) = (void *) NULL;
2910 }
2911
2912 void
2913 md_rem_names(sv_dev_t *sv, int nsv)
2914 {
2915 int i, s;
2916 int max_sides;
2917
2918 if (nsv == 0)
2919 return;
2920
2921 /* All entries removed are in the same diskset */
2922 if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
2923 max_sides = MD_MNMAXSIDES;
2924 else
2925 max_sides = MD_MAXSIDES;
2926
2927 for (i = 0; i < nsv; i++)
2928 for (s = 0; s < max_sides; s++)
2929 (void) md_remdevname(sv[i].setno, s, sv[i].key);
2930 }
2931
2932 /*
2933 * Checking user args before we get into physio - returns 0 for ok, else errno
2934 * We do a lot of checking against illegal arguments here because some of the
2935  * real disk drivers don't like certain kinds of arguments. (e.g. xy doesn't
2936  * like an odd-address user buffer.) Those drivers capture bad arguments in
2937  * xxread and xxwrite. But since the meta-driver calls their strategy routines
2938  * directly, two bad scenarios might happen:
2939 * 1. the real strategy doesn't like it and panic.
2940 * 2. the real strategy doesn't like it and set B_ERROR.
2941 *
2942 * The second case is no better than the first one, since the meta-driver
2943 * will treat it as a media-error and off line the mirror metapartition.
2944 * (Too bad there is no way to tell what error it is.)
2945 *
2946 */
2947 int
2948 md_chk_uio(struct uio *uio)
2949 {
2950 int i;
2951 struct iovec *iov;
2952
2953 /*
2954 * Check for negative or not block-aligned offset
2955 */
2956 if ((uio->uio_loffset < 0) ||
2957 ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
2958 return (EINVAL);
2959 }
2960 iov = uio->uio_iov;
2961 i = uio->uio_iovcnt;
2962
2963 while (i--) {
2964 if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
2965 return (EINVAL);
2966 /*
2967 * Bug # 1212146
2968 * The default is to not check alignment, but we can now check
2969 * for a larger number of alignments if desired.
2970 */
2971 if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
2972 return (EINVAL);
2973 iov++;
2974 }
2975 return (0);
2976 }
2977
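/*
 * md_shortname: return a printable name for the unit, either derived
 * from the minor number (e.g. "d10" or "setname/d10") or, for friendly
 * name metadevices, looked up in the namespace. Uses a static buffer.
 */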
2978 char *
2979 md_shortname(
2980 minor_t mnum
2981 )
2982 {
2983 static char buf[MAXPATHLEN];
2984 char *devname;
2985 char *invalid = " (Invalid minor number %u) ";
2986 char *metaname;
2987 mdc_unit_t *un;
2988 side_t side;
2989 set_t setno = MD_MIN2SET(mnum);
2990 unit_t unit = MD_MIN2UNIT(mnum);
2991
2992 if ((un = MD_UNIT(mnum)) == NULL) {
2993 (void) snprintf(buf, sizeof (buf), invalid, mnum);
2994 return (buf);
2995 }
2996
2997 /*
2998 * If unit is not a friendly name unit, derive the name from the
2999 * minor number.
3000 */
3001 if ((un->un_revision & MD_FN_META_DEV) == 0) {
3002 /* This is a traditional metadevice */
3003 if (setno == MD_LOCAL_SET) {
3004 (void) snprintf(buf, sizeof (buf), "d%u",
3005 (unsigned)unit);
3006 } else {
3007 (void) snprintf(buf, sizeof (buf), "%s/d%u",
3008 mddb_getsetname(setno), (unsigned)unit);
3009 }
3010 return (buf);
3011 }
3012
3013 /*
3014 * It is a friendly name metadevice, so we need to get its name.
3015 */
3016 side = mddb_getsidenum(setno);
3017 devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
3018 if (md_getdevname(setno, side, MD_KEYWILD,
3019 md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
3020 /*
3021 * md_getdevname has given us either /dev/md/dsk/<metaname>
3022 * or /dev/md/<setname>/dsk/<metname> depending on whether
3023 * or not we are in the local set. Thus, we'll pull the
3024 * metaname from this string.
3025 */
3026 if ((metaname = strrchr(devname, '/')) == NULL) {
3027 (void) snprintf(buf, sizeof (buf), invalid, mnum);
3028 goto out;
3029 }
3030 metaname++; /* move past slash */
3031 if (setno == MD_LOCAL_SET) {
3032 /* No set name. */
3033 (void) snprintf(buf, sizeof (buf), "%s", metaname);
3034 } else {
3035 /* Include setname */
3036 (void) snprintf(buf, sizeof (buf), "%s/%s",
3037 mddb_getsetname(setno), metaname);
3038 }
3039 } else {
3040 /* We couldn't find the name. */
3041 (void) snprintf(buf, sizeof (buf), invalid, mnum);
3042 }
3043
3044 out:
3045 kmem_free(devname, MAXPATHLEN);
3046 return (buf);
3047 }
3048
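/*
 * md_devname: look up the name of the given device in the set's
 * namespace. If buf is NULL a static buffer is used. On failure the
 * result is "(Unavailable)" or "(major.minor)".
 */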
3049 char *
3050 md_devname(
3051 set_t setno,
3052 md_dev64_t dev,
3053 char *buf,
3054 size_t size
3055 )
3056 {
3057 static char mybuf[MD_MAX_CTDLEN];
3058 int err;
3059
3060 if (buf == NULL) {
3061 buf = mybuf;
3062 size = sizeof (mybuf);
3063 } else {
3064 ASSERT(size >= MD_MAX_CTDLEN);
3065 }
3066
3067 err = md_getdevname_common(setno, mddb_getsidenum(setno),
3068 0, dev, buf, size, MD_NOWAIT_LOCK);
3069 if (err) {
3070 if (err == ENOENT) {
3071 (void) sprintf(buf, "(Unavailable)");
3072 } else {
3073 (void) sprintf(buf, "(%u.%u)",
3074 md_getmajor(dev), md_getminor(dev));
3075 }
3076 }
3077
3078 return (buf);
3079 }
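/*
 * md_minphys: minphys routine for physio; clamps the transfer size to
 * md_maxbcount.
 */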
3080 void
3081 md_minphys(buf_t *pb)
3082 {
3083 extern unsigned md_maxbcount;
3084
3085 if (pb->b_bcount > md_maxbcount)
3086 pb->b_bcount = md_maxbcount;
3087 }
3088
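/*
 * md_bioinit / md_bioreset: initialize or reset a buf for metadevice
 * use, linking b_forw/b_back to itself and marking it B_BUSY.
 */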
3089 void
3090 md_bioinit(struct buf *bp)
3091 {
3092 ASSERT(bp);
3093
3094 bioinit(bp);
3095 bp->b_back = bp;
3096 bp->b_forw = bp;
3097 bp->b_flags = B_BUSY; /* initialize flags */
3098 }
3099
3100 void
3101 md_bioreset(struct buf *bp)
3102 {
3103 ASSERT(bp);
3104
3105 bioreset(bp);
3106 bp->b_back = bp;
3107 bp->b_forw = bp;
3108 bp->b_flags = B_BUSY; /* initialize flags */
3109 }
3110
3111 /*
3112 * md_bioclone is needed as long as the real bioclone only takes a daddr_t
3113 * as block number.
3114 * We simply call bioclone with all input parameters but blkno, and set the
3115 * correct blkno afterwards.
3116 * Caveat Emptor: bp_mem must not be NULL!
3117 */
3118 buf_t *
3119 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
3120 int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
3121 {
3122 (void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
3123 bp_mem->b_lblkno = blkno;
3124 return (bp_mem);
3125 }
3126
3127
3128 /*
3129 * kstat stuff
3130 */
3131 void
3132 md_kstat_init_ui(
3133 minor_t mnum,
3134 mdi_unit_t *ui
3135 )
3136 {
3137 if ((ui != NULL) && (ui->ui_kstat == NULL)) {
3138 set_t setno = MD_MIN2SET(mnum);
3139 unit_t unit = MD_MIN2UNIT(mnum);
3140 char module[KSTAT_STRLEN];
3141 char *p = module;
3142
3143 if (setno != MD_LOCAL_SET) {
3144 char buf[64];
3145 char *s = buf;
3146 char *e = module + sizeof (module) - 4;
3147
3148 (void) sprintf(buf, "%u", setno);
3149 while ((p < e) && (*s != '\0'))
3150 *p++ = *s++;
3151 *p++ = '/';
3152 }
3153 *p++ = 'm';
3154 *p++ = 'd';
3155 *p = '\0';
3156 if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
3157 KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
3158 ui->ui_kstat->ks_lock = &ui->ui_mx;
3159 kstat_install(ui->ui_kstat);
3160 }
3161 }
3162 }
3163
3164 void
3165 md_kstat_init(
3166 minor_t mnum
3167 )
3168 {
3169 md_kstat_init_ui(mnum, MDI_UNIT(mnum));
3170 }
3171
3172 void
3173 md_kstat_destroy_ui(
3174 mdi_unit_t *ui
3175 )
3176 {
3177 /*
3178 	 * The kstat_delete() interface has its own locking mechanism and
3179 	 * does not allow holding of the kstat lock (ks_lock).
3180 	 * Note: ks_lock == ui_mx from md_kstat_init_ui().
3181 */
3182 if ((ui != NULL) && (ui->ui_kstat != NULL)) {
3183 kstat_delete(ui->ui_kstat);
3184 ui->ui_kstat = NULL;
3185 }
3186 }
3187
3188 void
3189 md_kstat_destroy(
3190 minor_t mnum
3191 )
3192 {
3193 md_kstat_destroy_ui(MDI_UNIT(mnum));
3194 }
3195
3196 /*
3197 * In the following subsequent routines, locks are held before checking the
3198 * validity of ui_kstat. This is done to make sure that we don't trip over
3199 * a NULL ui_kstat anymore.
3200 */
3201
3202 void
3203 md_kstat_waitq_enter(
3204 mdi_unit_t *ui
3205 )
3206 {
3207 mutex_enter(&ui->ui_mx);
3208 if (ui->ui_kstat != NULL)
3209 kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3210 mutex_exit(&ui->ui_mx);
3211 }
3212
3213 void
3214 md_kstat_waitq_to_runq(
3215 mdi_unit_t *ui
3216 )
3217 {
3218 mutex_enter(&ui->ui_mx);
3219 if (ui->ui_kstat != NULL)
3220 kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
3221 mutex_exit(&ui->ui_mx);
3222 }
3223
3224 void
3225 md_kstat_waitq_exit(
3226 mdi_unit_t *ui
3227 )
3228 {
3229 mutex_enter(&ui->ui_mx);
3230 if (ui->ui_kstat != NULL)
3231 kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3232 mutex_exit(&ui->ui_mx);
3233 }
3234
3235 void
3236 md_kstat_runq_enter(
3237 mdi_unit_t *ui
3238 )
3239 {
3240 mutex_enter(&ui->ui_mx);
3241 if (ui->ui_kstat != NULL)
3242 kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3243 mutex_exit(&ui->ui_mx);
3244 }
3245
3246 void
3247 md_kstat_runq_exit(
3248 mdi_unit_t *ui
3249 )
3250 {
3251 mutex_enter(&ui->ui_mx);
3252 if (ui->ui_kstat != NULL)
3253 kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3254 mutex_exit(&ui->ui_mx);
3255 }
3256
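/*
 * md_kstat_done: account a completed I/O against the unit's kstat and
 * exit the run queue. A read is accounted as a write when the war flag
 * is set.
 */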
3257 void
3258 md_kstat_done(
3259 mdi_unit_t *ui,
3260 buf_t *bp,
3261 int war
3262 )
3263 {
3264 size_t n_done;
3265
3266 /* check for end of device */
3267 if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
3268 n_done = bp->b_bcount;
3269 } else if (bp->b_bcount < bp->b_resid) {
3270 n_done = 0;
3271 } else {
3272 n_done = bp->b_bcount - bp->b_resid;
3273 }
3274
3275 /* do accounting */
3276 mutex_enter(&ui->ui_mx);
3277 if (ui->ui_kstat != NULL) {
3278 if ((! war) && (bp->b_flags & B_READ)) {
3279 KSTAT_IO_PTR(ui->ui_kstat)->reads++;
3280 KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
3281 } else {
3282 KSTAT_IO_PTR(ui->ui_kstat)->writes++;
3283 KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
3284 }
3285 kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3286 }
3287 mutex_exit(&ui->ui_mx);
3288 }
3289
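/*
 * md_getpid / md_getproc: return the caller's pid and proc pointer as
 * obtained from drv_getparm(), asserting (and returning 0/NULL) on
 * failure.
 */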
3290 pid_t
3291 md_getpid()
3292 {
3293 pid_t valuep;
3294 if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
3295 ASSERT(0);
3296 return ((pid_t)0);
3297 } else {
3298 ASSERT(valuep);
3299 return (valuep);
3300 }
3301 }
3302
3303
3304 proc_t *
3305 md_getproc()
3306 {
3307 proc_t *valuep;
3308 if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
3309 ASSERT(0);
3310 return ((proc_t *)NULL);
3311 } else {
3312 ASSERT(valuep);
3313 return (valuep);
3314 }
3315 }
3316
3317 extern kmutex_t pidlock;
3318
3319 /*
3320  * This checks to see if a process/pid pair is still running. For the
3321  * disk set lock, when both pid and proc are zero the lock is not
3322  * currently held.
3323 */
3324 int
3325 md_checkpid(pid_t pid, proc_t *proc)
3326 {
3327 int retval = 1;
3328
3329 if (pid == 0 && proc == NULL)
3330 return (0);
3331
3332 mutex_enter(&pidlock);
3333 if (prfind(pid) != proc)
3334 retval = 0;
3335 mutex_exit(&pidlock);
3336 return (retval);
3337 }
3338
3339 /*
3340 * NAME: md_init_probereq
3341 *
3342 * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
3343 * they can be dispatched to multiple daemon threads.
3344 *
3345 * PARAMETERS: struct md_probedev *p pointer ioctl input
3346 *
3347 * RETURN VALUE: Returns errno
3348 *
3349 */
3350
3351 int
3352 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
3353 {
3354 int err = 0;
3355 int modindx;
3356 intptr_t (*probe_test)();
3357
3358 /*
3359 * Initialize the semaphores and mutex
3360 * for the request
3361 */
3362
3363 p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
3364
3365 p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
3366 sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
3367 mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
3368
3369 modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
3370 probe_test = md_get_named_service(NODEV64, modindx,
3371 p->probe.test_name, 0);
3372 if (probe_test == NULL) {
3373 err = EINVAL;
3374 goto err_out;
3375 }
3376
3377 err = md_create_probe_rqlist(p, hdrpp, probe_test);
3378 err_out:
3379 return (err);
3380 }
3381
3382 /*
3383 * NAME: md_probe_one
3384 *
3385 * DESCRIPTION: Generic routine for probing disks. This is called from the
3386 * daemon.
3387 *
3388 * PARAMETERS: probe_req_t *reqp pointer to the probe request structure.
3389 *
3390 */
3391
3392 void
3393 md_probe_one(probe_req_t *reqp)
3394 {
3395 mdi_unit_t *ui;
3396 md_probedev_impl_t *p;
3397 int err = 0;
3398 set_t setno;
3399
3400 p = (md_probedev_impl_t *)reqp->private_handle;
3401 /*
3402 * Validate the unit while holding the global ioctl lock, then
3403 * obtain the unit_writerlock. Once the writerlock has been obtained
3404 * we can release the global lock. As long as we hold one of these
3405 * locks this will prevent a metaclear operation being performed
3406 * on the metadevice because metaclear takes the readerlock (via
3407 * openclose lock).
3408 * To avoid a potential deadlock with the probe_fcn() causing i/o to
3409 * be issued to the writerlock'd metadevice we only grab the writerlock
3410 * if the unit is not an SVM root device.
3411 */
3412 while (md_ioctl_lock_enter() == EINTR)
3413 ;
3414 setno = MD_MIN2SET(reqp->mnum);
3415 ui = MDI_UNIT(reqp->mnum);
3416 if (ui != NULL) {
3417 int writer_grabbed;
3418 dev_t svm_root;
3419
3420 if ((setno == MD_LOCAL_SET) && root_is_svm) {
3421 svm_root = getrootdev();
3422
3423 if (getminor(svm_root) == reqp->mnum) {
3424 writer_grabbed = 0;
3425 } else {
3426 writer_grabbed = 1;
3427 (void) md_unit_writerlock_common(ui, 0);
3428 }
3429 } else {
3430 writer_grabbed = 1;
3431 (void) md_unit_writerlock_common(ui, 0);
3432 }
3433 (void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3434 err = (*reqp->probe_fcn)(ui, reqp->mnum);
3435 if (writer_grabbed) {
3436 md_unit_writerexit(ui);
3437 }
3438 } else {
3439 (void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3440 }
3441
3442 /* update the info in the probe structure */
3443
3444 mutex_enter(PROBE_MX(p));
3445 if (err != 0) {
3446 cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
3447 reqp->mnum);
3448 (void) mdsyserror(&(p->probe.mde), err);
3449 }
3450
3451 mutex_exit(PROBE_MX(p));
3452 sema_v(PROBE_SEMA(p));
3453
3454 kmem_free(reqp, sizeof (probe_req_t));
3455 }
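/*
 * md_strdup / freestr: kmem-backed duplicate and free of NUL-terminated
 * strings.
 */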
3456 char *
3457 md_strdup(char *cp)
3458 {
3459 char *new_cp = NULL;
3460
3461 new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
3462
3463 return (strcpy(new_cp, cp));
3464 }
3465
3466 void
3467 freestr(char *cp)
3468 {
3469 kmem_free(cp, strlen(cp) + 1);
3470 }
3471
3472 /*
3473 * Validate the list and skip invalid devices. Then create
3474 * a doubly linked circular list of devices to probe.
3475 * The hdr points to the head and tail of this list.
3476 */
3477
3478 static int
3479 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
3480 intptr_t (*probe_test)())
3481 {
3482 int i, err, nodevcnt;
3483 probe_req_t *tp;
3484 daemon_queue_t *hp;
3485 minor_t mnum;
3486
3487 nodevcnt = 0;
3488
3489 hp = NULL;
3490
3491 for (i = 0; i < plist->probe.nmdevs; i++) {
3492 mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
3493 if (MDI_UNIT(mnum) == NULL) {
3494 cmn_err(CE_WARN, "md: Cannot probe %s since it does "
3495 "not exist", md_shortname(mnum));
3496 nodevcnt++;
3497 continue;
3498 }
3499 tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
3500 tp->mnum = mnum;
3501 tp->private_handle = (void *)plist;
3502 tp->probe_fcn = probe_test;
3503 if (hp == NULL) {
3504 hp = (daemon_queue_t *)tp;
3505 hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
3506 } else {
3507 tp->dq.dq_next = hp;
3508 tp->dq.dq_prev = hp->dq_prev;
3509 hp->dq_prev->dq_next = (daemon_queue_t *)tp;
3510 hp->dq_prev = (daemon_queue_t *)tp;
3511 }
3512 }
3513
3514 *hdr = hp;
3515 if (nodevcnt > 0)
3516 plist->probe.nmdevs -= nodevcnt;
3517
3518 /*
3519 * If there are no devices to be probed because they were
3520 * incorrect, then return an error.
3521 */
3522 err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
3523
3524 return (err);
3525 }
3526
3527 /*
3528 * This routine increments the I/O count for set I/O operations. This
3529  * value is used to determine if an I/O can be done. If a release is in
3530  * progress this will return an error and cause the I/O to be errored.
3531 */
3532 int
3533 md_inc_iocount(set_t setno)
3534 {
3535 int rc = 0;
3536
3537 if (setno == 0)
3538 return (0);
3539
3540 mutex_enter(&md_set_io[setno].md_io_mx);
3541 if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
3542 rc = EIO;
3543 goto out;
3544 }
3545
3546 ASSERT(md_set_io[setno].io_cnt >= 0);
3547 md_set_io[setno].io_cnt++;
3548
3549 out: mutex_exit(&md_set_io[setno].md_io_mx);
3550 return (rc);
3551 }
3552
3553 void
3554 md_inc_iocount_noblock(set_t setno)
3555 {
3556
3557 if (setno == 0)
3558 return;
3559
3560 mutex_enter(&md_set_io[setno].md_io_mx);
3561 md_set_io[setno].io_cnt++;
3562 mutex_exit(&md_set_io[setno].md_io_mx);
3563 }
3564 void
3565 md_dec_iocount(set_t setno)
3566 {
3567
3568 if (setno == 0)
3569 return;
3570
3571 mutex_enter(&md_set_io[setno].md_io_mx);
3572 md_set_io[setno].io_cnt--;
3573 ASSERT(md_set_io[setno].io_cnt >= 0);
3574 if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
3575 (md_set_io[setno].io_cnt == 0))
3576 cv_broadcast(&md_set_io[setno].md_io_cv);
3577 mutex_exit(&md_set_io[setno].md_io_mx);
3578 }
3579
3580 int
3581 md_isblock_setio(set_t setno)
3582 {
3583 int rc = 0;
3584
3585 if (setno == 0)
3586 return (0);
3587
3588 mutex_enter(&md_set_io[setno].md_io_mx);
3589 if (md_set_io[setno].io_state & MD_SET_RELEASE)
3590 rc = 1;
3591
3592 mutex_exit(&md_set_io[setno].md_io_mx);
3593 return (rc);
3594 }
3595
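/*
 * md_block_setio: mark the set's I/O state as RELEASE so no new I/O is
 * started, then wait for all outstanding set I/O to drain. Returns 1.
 */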
3596 int
3597 md_block_setio(set_t setno)
3598 {
3599 int rc = 0;
3600
3601 if (setno == 0)
3602 return (1);
3603
3604 mutex_enter(&md_set_io[setno].md_io_mx);
3605 md_set_io[setno].io_state = MD_SET_RELEASE;
3606
3607 while (md_set_io[setno].io_cnt > 0) {
3608 cv_wait(&md_set_io[setno].md_io_cv,
3609 &md_set_io[setno].md_io_mx);
3610 }
3611 rc = 1;
3612
3613
3614 ASSERT(md_set_io[setno].io_cnt == 0);
3615 mutex_exit(&md_set_io[setno].md_io_mx);
3616
3617 return (rc);
3618 }
3619
3620 void
3621 md_clearblock_setio(set_t setno)
3622 {
3623 if (setno == 0)
3624 return;
3625
3626 mutex_enter(&md_set_io[setno].md_io_mx);
3627 md_set_io[setno].io_state = MD_SET_ACTIVE;
3628 mutex_exit(&md_set_io[setno].md_io_mx);
3629 }
3630
3631 void
3632 md_unblock_setio(set_t setno)
3633 {
3634 if (setno == 0)
3635 return;
3636
3637 mutex_enter(&md_set_io[setno].md_io_mx);
3638 #ifdef DEBUG
3639 if (md_set_io[setno].io_cnt != 0) {
3640 cmn_err(CE_NOTE, "set %d count was %ld at take",
3641 setno, md_set_io[setno].io_cnt);
3642 }
3643 #endif /* DEBUG */
3644
3645 md_set_io[setno].io_state = MD_SET_ACTIVE;
3646 md_set_io[setno].io_cnt = 0;
3647 mutex_exit(&md_set_io[setno].md_io_mx);
3648 }
3649
3650 /*
3651 * Test and set version of the md_block_setio.
3652 * Set the io_state to keep new I/O from being issued.
3653 * If there is I/O currently in progress, then set io_state to active
3654 * and return failure. Otherwise, return a 1 for success.
3655 *
3656 * Used in a MN diskset since the commd must be suspended before
3657 * this node can attempt to withdraw from a diskset. But, with commd
3658 * suspended, I/O may have been issued that can never finish until
3659 * commd is resumed (allocation of hotspare, etc). So, if I/O is
3660 * outstanding after diskset io_state is marked RELEASE, then set diskset
3661 * io_state back to ACTIVE and return failure.
3662 */
3663 int
3664 md_tas_block_setio(set_t setno)
3665 {
3666 int rc;
3667
3668 if (setno == 0)
3669 return (1);
3670
3671 mutex_enter(&md_set_io[setno].md_io_mx);
3672 md_set_io[setno].io_state = MD_SET_RELEASE;
3673
3674 if (md_set_io[setno].io_cnt > 0) {
3675 md_set_io[setno].io_state = MD_SET_ACTIVE;
3676 rc = 0;
3677 } else {
3678 rc = 1;
3679 }
3680
3681 mutex_exit(&md_set_io[setno].md_io_mx);
3682
3683 return (rc);
3684 }
3685
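/*
 * md_biodone: biodone() wrapper that also decrements the per-set I/O
 * count for non-local sets and, under DEBUG, warns about I/O after
 * close or during a set RELEASE.
 */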
3686 void
3687 md_biodone(struct buf *pb)
3688 {
3689 minor_t mnum;
3690 set_t setno;
3691 mdi_unit_t *ui;
3692
3693 mnum = getminor(pb->b_edev);
3694 setno = MD_MIN2SET(mnum);
3695
3696 if (setno == 0) {
3697 biodone(pb);
3698 return;
3699 }
3700
3701 #ifdef DEBUG
3702 ui = MDI_UNIT(mnum);
3703 if (!md_unit_isopen(ui))
3704 cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
3705 #endif /* DEBUG */
3706
3707 /*
3708 * Handle the local diskset
3709 */
3710 if (md_set_io[setno].io_cnt > 0)
3711 md_dec_iocount(setno);
3712
3713 #ifdef DEBUG
3714 /*
3715 * this is being done after the lock is dropped so there
3716 * are cases it may be invalid. It is advisory.
3717 */
3718 if (md_set_io[setno].io_state & MD_SET_RELEASE) {
3719 /* Only display this error once for this metadevice */
3720 if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
3721 cmn_err(CE_NOTE,
3722 "I/O to %s attempted during set RELEASE\n",
3723 md_shortname(mnum));
3724 ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
3725 }
3726 }
3727 #endif /* DEBUG */
3728
3729 biodone(pb);
3730 }
3731
3732
3733 /*
3734 * Driver special private devt handling routine
3735 * INPUT: md_dev64_t
3736 * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
3737 */
3738 dev_t
3739 md_dev64_to_dev(md_dev64_t dev)
3740 {
3741 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3742 minor_t minor = (minor_t)(dev & MAXMIN64);
3743
3744 return (makedevice(major, minor));
3745
3746 }
3747
3748 /*
3749 * Driver private makedevice routine
3750 * INPUT: major_t major, minor_t minor
3751 * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel.
3752 */
3753 md_dev64_t
3754 md_makedevice(major_t major, minor_t minor)
3755 {
3756 return (((md_dev64_t)major << NBITSMINOR64) | minor);
3757
3758 }
3759
3760
3761 /*
3762 * Driver private devt md_getmajor routine
3763 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device
3764 * OUTPUT: the appropriate major number
3765 */
3766 major_t
3767 md_getmajor(md_dev64_t dev)
3768 {
3769 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3770
3771 if (major == 0) {
3772 /* Here we were given a 32bit dev */
3773 major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
3774 }
3775 return (major);
3776 }
3777
3778 /*
3779 * Driver private devt md_getminor routine
3780 * INPUT: dev a 64 bit container holding either a 32 bit or a 64 bit device
3781 * OUTPUT: the appropriate minor number
3782 */
3783 minor_t
3784 md_getminor(md_dev64_t dev)
3785 {
3786 minor_t minor;
3787 major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3788
3789 if (major == 0) {
3790 /* Here we were given a 32bit dev */
3791 minor = (minor_t)(dev & MAXMIN32);
3792 } else {
3793 minor = (minor_t)(dev & MAXMIN64);
3794 }
3795 return (minor);
3796 }
3797
3798 int
3799 md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
3800 {
3801 /*
3802 * If the metadevice is an old style device, it has a vtoc,
3803 * in that case all reading EFI ioctls are not applicable.
3804 * If the metadevice has an EFI label, reading vtoc and geom ioctls
3805 * are not supposed to work.
3806 */
3807 switch (cmd) {
3808 case DKIOCGGEOM:
3809 case DKIOCGAPART:
3810 /* if > 2 TB then fail */
3811 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3812 return (ENOTSUP);
3813 }
3814 break;
3815 case DKIOCGVTOC:
3816 /* if > 2 TB then fail */
3817 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3818 return (ENOTSUP);
3819 }
3820
3821 /* if > 1 TB but < 2TB return overflow */
3822 if (c.un_revision & MD_64BIT_META_DEV) {
3823 return (EOVERFLOW);
3824 }
3825 break;
3826 case DKIOCGEXTVTOC:
3827 /* if > 2 TB then fail */
3828 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3829 return (ENOTSUP);
3830 }
3831 break;
3832 case DKIOCGETEFI:
3833 case DKIOCPARTITION:
3834 if ((c.un_flag & MD_EFILABEL) == 0) {
3835 return (ENOTSUP);
3836 }
3837 break;
3838
3839 case DKIOCSETEFI:
3840 /* setting an EFI label should always be ok */
3841 return (0);
3842
3843 case DKIOCSVTOC:
3844 /* if > 2 TB then fail */
3845 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3846 return (ENOTSUP);
3847 }
3848
3849 /* if > 1 TB but < 2TB return overflow */
3850 if (c.un_revision & MD_64BIT_META_DEV) {
3851 return (EOVERFLOW);
3852 }
3853 break;
3854 case DKIOCSEXTVTOC:
3855 if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3856 return (ENOTSUP);
3857 }
3858 break;
3859 }
3860 return (0);
3861 }
3862
3863 /*
3864 * md_vtoc_to_efi_record()
3865 * Input: record id of the vtoc record
3866 * Output: record id of the efi record
3867 * Function:
3868 * - reads the volume name from the vtoc record
3869 * - converts the volume name to a format, libefi understands
3870 * - creates a new record of size MD_EFI_PARTNAME_BYTES
3871 * - stores the volname in that record,
3872 * - commits that record
3873 * - returns the recid of the efi record.
3874 * Caveat Emptor:
3875 * The calling routine must do something like
3876 * - un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
3877 * - commit(un)
3878 * - delete(vtoc_recid)
3879 * in order to keep the mddb consistent in case of a panic in the middle.
3880 * Errors:
3881 * - returns 0 on any error
3882 */
3883 mddb_recid_t
3884 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
3885 {
3886 struct vtoc *vtoc;
3887 ushort_t *v;
3888 mddb_recid_t efi_recid;
3889 int i;
3890
3891 if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
3892 return (0);
3893 }
3894 vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
3895 efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
3896 MD_CRO_32BIT, setno);
3897 if (efi_recid < 0) {
3898 return (0);
3899 }
3900 v = (ushort_t *)mddb_getrecaddr(efi_recid);
3901
3902 /* This for loop read, converts and writes */
3903 for (i = 0; i < LEN_DKL_VVOL; i++) {
3904 v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
3905 }
3906 /* commit the new record */
3907 mddb_commitrec_wrapper(efi_recid);
3908
3909 return (efi_recid);
3910 }
3911
3912 /*
3913 * Send a kernel message.
3914  * The caller has to provide an allocated result structure.
3915 * If the door handler disappears we retry, emitting warnings every so often.
3916 *
3917 * The recipient argument is almost always unused, and is therefore typically
3918 * set to zero, as zero is an invalid cluster nodeid. The exceptions are the
3919 * marking and clearing of the DRL from a node that is not currently the
3920 * owner. In these cases, the recipient argument will be the nodeid of the
3921 * mirror owner, and MD_MSGF_DIRECTED will be set in the flags. Non-owner
3922 * nodes will not receive these messages.
3923 *
3924 * For the case where md_mn_is_commd_present() is false, we simply pre-set
3925 * the result->kmmr_comm_state to MDMNE_RPC_FAIL.
3926 * This covers the case where the service mdcommd has been killed and so we do
3927 * not get a 'new' result structure copied back. Instead we return with the
3928 * supplied result field, and we need to flag a failure to the caller.
3929 */
3930 int
3931 mdmn_ksend_message(
3932 set_t setno,
3933 md_mn_msgtype_t type,
3934 uint_t flags,
3935 md_mn_nodeid_t recipient,
3936 char *data,
3937 int size,
3938 md_mn_kresult_t *result)
3939 {
3940 door_arg_t da;
3941 md_mn_kmsg_t *kmsg;
3942 uint_t send_try_cnt = 0;
3943 uint_t retry_noise_cnt = 0;
3944 int rval;
3945 k_sigset_t oldmask, newmask;
3946
3947 /*
3948 * Ensure that we default to a recoverable failure state if the
3949 * door upcall cannot pass the request on to rpc.mdcommd.
3950 * This may occur when shutting the node down while there is still
3951 * a mirror resync or metadevice state update occurring.
3952 */
3953 result->kmmr_comm_state = MDMNE_RPC_FAIL;
3954 result->kmmr_exitval = ~0;
3955
3956 if (size > MDMN_MAX_KMSG_DATA)
3957 return (ENOMEM);
3958 kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
3959 kmsg->kmsg_flags = flags;
3960 kmsg->kmsg_setno = setno;
3961 kmsg->kmsg_recipient = recipient;
3962 kmsg->kmsg_type = type;
3963 kmsg->kmsg_size = size;
3964 bcopy(data, &(kmsg->kmsg_data), size);
3965
3966 /*
3967 * Wait for the door handle to be established.
3968 */
3969 while (mdmn_door_did == -1) {
3970 if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
3971 cmn_err(CE_WARN, "door handle not yet ready. "
3972 "Check if /usr/lib/lvm/mddoors is running");
3973 }
3974 delay(md_hz);
3975 }
3976
3977 /*
3978 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
3979 * do not fail if the user process receives a signal while we're
3980 * active in the door interface.
3981 */
3982 if (flags & MD_MSGF_BLK_SIGNAL) {
3983 sigfillset(&newmask);
3984 sigreplace(&newmask, &oldmask);
3985 }
3986
3987 /*
3988 * If message failed with an RPC_FAILURE when rpc.mdcommd had
3989 * been gracefully shutdown (md_mn_is_commd_present returns FALSE)
3990 * then don't retry the message anymore. If message
3991 * failed due to any other reason, then retry up to MD_MN_WARN_INTVL
3992 * times which should allow a shutting down system time to
3993 * notify the kernel of a graceful shutdown of rpc.mdcommd.
3994 *
3995 * Caller of this routine will need to check the md_mn_commd_present
3996 * flag and the failure error in order to determine whether to panic
3997 * or not. If md_mn_commd_present is set to 0 and failure error
3998 * is RPC_FAILURE, the calling routine should not panic since the
3999 * system is in the process of being shutdown.
4000 *
4001 */
4002
4003 retry_noise_cnt = send_try_cnt = 0;
4004 while (md_mn_is_commd_present_lite()) {
4005 /*
4006 * data_ptr and data_size are initialized here because on
4007 * return from the upcall, they contain data duplicated from
4008 * rbuf and rsize. This causes subsequent upcalls to fail.
4009 */
4010 da.data_ptr = (char *)(kmsg);
4011 da.data_size = sizeof (md_mn_kmsg_t);
4012 da.desc_ptr = NULL;
4013 da.desc_num = 0;
4014 da.rbuf = (char *)result;
4015 da.rsize = sizeof (*result);
4016
4017 while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
4018 NULL, SIZE_MAX, 0)) != 0) {
4019 if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
4020 if (rval == EAGAIN) {
4021 cmn_err(CE_WARN,
4022 "md: door_upcall failed. "
4023 "Check if mddoors is running.");
4024 } else if (rval == EINTR) {
4025 cmn_err(CE_WARN,
4026 "md: door_upcall failed. "
4027 "Check if rpc.mdcommd is running.");
4028 } else {
4029 cmn_err(CE_WARN,
4030 "md: door_upcall failed. "
4031 "Returned %d",
4032 rval);
4033 }
4034 }
4035 if (++send_try_cnt >= md_send_retry_limit)
4036 break;
4037
4038 delay(md_hz);
4039
4040 /*
4041 * data_ptr and data_size are re-initialized here
4042 * because on return from the upcall, they contain
4043 * data duplicated from rbuf and rsize. This causes
4044 * subsequent upcalls to fail.
4045 */
4046 da.data_ptr = (char *)(kmsg);
4047 da.data_size = sizeof (md_mn_kmsg_t);
4048 da.desc_ptr = NULL;
4049 da.desc_num = 0;
4050 da.rbuf = (char *)result;
4051 da.rsize = sizeof (*result);
4052 }
4053
4054
4055 /*
4056 * If:
4057 * - the send succeeded (MDMNE_ACK)
4058 * - we had an MDMNE_RPC_FAIL and commd is now gone
4059 * (note: since the outer loop is commd-dependent,
4060 		 *   checking MDMNE_RPC_FAIL here is meaningless)
4061 * - we were told not to retry
4062 * - we exceeded the RPC failure send limit
4063 * punch out of the outer loop prior to the delay()
4064 */
4065 if (result->kmmr_comm_state == MDMNE_ACK ||
4066 (flags & MD_MSGF_KSEND_NORETRY) ||
4067 (++send_try_cnt % md_send_retry_limit) == 0 ||
4068 !md_mn_is_commd_present())
4069 break;
4070 delay(md_hz);
4071 }
4072
4073 if (flags & MD_MSGF_BLK_SIGNAL) {
4074 sigreplace(&oldmask, (k_sigset_t *)NULL);
4075 }
4076 kmem_free(kmsg, sizeof (md_mn_kmsg_t));
4077
4078 return (0);
4079 }
4080
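/*
 * Illustrative sketch (not part of the original source): a typical caller
 * of mdmn_ksend_message() allocates the result buffer itself, sends the
 * message and then uses MDMN_KSEND_MSG_OK()/mdmn_ksend_show_error() to
 * decide how to proceed.  The message type, flags and payload below are
 * placeholders (setno and msg are assumed to be in scope); see
 * mdmn_send_capability_message() for a real caller.
 *
 *	md_mn_kresult_t	*kres;
 *	int		ret;
 *
 *	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
 *	ret = mdmn_ksend_message(setno, MD_MN_MSG_SET_CAP,
 *	    MD_MSGF_NO_LOG | MD_MSGF_BLK_SIGNAL, 0, (char *)&msg,
 *	    sizeof (msg), kres);
 *	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
 *		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
 *		ret = EIO;
 *	}
 *	kmem_free(kres, sizeof (md_mn_kresult_t));
 */
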
4081 /*
4082 * Called to propagate the capability of a metadevice to all nodes in the set.
4083 *
4084 * On entry, lockp is set if the function has been called from within an ioctl.
4085 *
4086  * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock, is called in this
4087 * routine to enable other mdioctls to enter the kernel while this
4088 * thread of execution waits on the completion of mdmn_ksend_message. When
4089  * the message is completed, the thread continues and md_ioctl_lock must be
4090  * reacquired. Even though md_ioctl_lock is interruptible, we choose to
4091 * ignore EINTR as we must not return without acquiring md_ioctl_lock.
4092 */
4093
4094 int
4095 mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
4096 {
4097 md_mn_msg_setcap_t msg;
4098 md_mn_kresult_t *kres;
4099 mdi_unit_t *ui = MDI_UNIT(mnum);
4100 int ret;
4101 k_sigset_t oldmask, newmask;
4102
4103 (void) strncpy((char *)&msg.msg_setcap_driver,
4104 md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
4105 msg.msg_setcap_mnum = mnum;
4106 msg.msg_setcap_set = vc.vc_set;
4107
4108 if (lockp)
4109 IOLOCK_RETURN_RELEASE(0, lockp);
4110 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4111
4112 /*
4113 	 * Mask signals for the mdmn_ksend_message call. This keeps the door
4114 * interface from failing if the user process receives a signal while
4115 * in mdmn_ksend_message.
4116 */
4117 sigfillset(&newmask);
4118 sigreplace(&newmask, &oldmask);
4119 ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
4120 MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
4121 kres));
4122 sigreplace(&oldmask, (k_sigset_t *)NULL);
4123
4124 if (!MDMN_KSEND_MSG_OK(ret, kres)) {
4125 mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
4126 ret = EIO;
4127 }
4128 kmem_free(kres, sizeof (md_mn_kresult_t));
4129
4130 if (lockp) {
4131 IOLOCK_RETURN_REACQUIRE(lockp);
4132 }
4133 return (ret);
4134 }
4135
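/*
 * Illustrative sketch (not part of the original source): an ioctl path that
 * enables a capability such as ABR would typically fill in a volcap_t and
 * pass its IOLOCK so that md_ioctl_lock is dropped around the blocking
 * send.  The handler name is hypothetical and DKV_ABR_CAP is assumed to be
 * the relevant capability bit.
 *
 *	static int
 *	example_set_abr(minor_t mnum, IOLOCK *lockp)
 *	{
 *		volcap_t	vc;
 *
 *		vc.vc_set = DKV_ABR_CAP;
 *		return (mdmn_send_capability_message(mnum, vc, lockp));
 *	}
 */
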
4136 /*
4137 * Called to clear all of the transient capabilities for a metadevice when it is
4138  * not open on any node in the cluster.
4139 * Called from close for mirror and sp.
4140 */
4141
4142 void
4143 mdmn_clear_all_capabilities(minor_t mnum)
4144 {
4145 md_isopen_t clumsg;
4146 int ret;
4147 md_mn_kresult_t *kresult;
4148 volcap_t vc;
4149 k_sigset_t oldmask, newmask;
4150
4151 clumsg.dev = md_makedevice(md_major, mnum);
4152 clumsg.mde = mdnullerror;
4153 /*
4154 * The check open message doesn't have to be logged, nor should the
4155 * result be stored in the MCT. We want an up-to-date state.
4156 */
4157 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4158
4159 /*
4160 	 * Mask signals for the mdmn_ksend_message call. This keeps the door
4161 * interface from failing if the user process receives a signal while
4162 * in mdmn_ksend_message.
4163 */
4164 sigfillset(&newmask);
4165 sigreplace(&newmask, &oldmask);
4166 ret = mdmn_ksend_message(MD_MIN2SET(mnum),
4167 MD_MN_MSG_CLU_CHECK,
4168 MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
4169 (char *)&clumsg, sizeof (clumsg), kresult);
4170 sigreplace(&oldmask, (k_sigset_t *)NULL);
4171
4172 if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
4173 /*
4174 		 * Not open on any node, clear all capabilities, e.g. ABR and
4175 * DMR
4176 */
4177 vc.vc_set = 0;
4178 (void) mdmn_send_capability_message(mnum, vc, NULL);
4179 }
4180 kmem_free(kresult, sizeof (md_mn_kresult_t));
4181 }
4182
4183 /*
4184 * mdmn_ksend_show_error:
4185 * ---------------------
4186 * Called to display the error contents of a failing mdmn_ksend_message() result
4187 *
4188 * Input:
4189 * rv - return value from mdmn_ksend_message()
4190 * kres - pointer to result structure filled in by mdmn_ksend_message
4191 * s - Informative message to identify failing condition (e.g.
4192  *		"Ownership change"). This string will be displayed with
4193 * cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
4194 * administrator
4195 */
4196 void
4197 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
4198 {
4199 if (rv == 0) {
4200 cmn_err(CE_WARN, "%s *FAILED*", s);
4201 cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
4202 " = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
4203 kres->kmmr_failing_node);
4204 } else {
4205 cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
4206 }
4207 }
4208
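/*
 * Illustrative sketch (not part of the original source): as noted above
 * mdmn_ksend_message(), a caller that would normally panic on a failed
 * message is expected to tolerate MDMNE_RPC_FAIL when rpc.mdcommd has been
 * shut down gracefully:
 *
 *	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
 *		mdmn_ksend_show_error(ret, kres, "example message");
 *		if ((kres->kmmr_comm_state != MDMNE_RPC_FAIL) ||
 *		    md_mn_is_commd_present()) {
 *			cmn_err(CE_PANIC, "ksend_message failure");
 *		}
 *	}
 */
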
4209 /*
4210  * Callback routine for resync thread. If requested to suspend, we mark the
4211 * commd as not being present.
4212 */
4213 boolean_t
4214 callb_md_mrs_cpr(void *arg, int code)
4215 {
4216 callb_cpr_t *cp = (callb_cpr_t *)arg;
4217 int ret = 0; /* assume success */
4218 clock_t delta;
4219
4220 mutex_enter(cp->cc_lockp);
4221
4222 switch (code) {
4223 case CB_CODE_CPR_CHKPT:
4224 /*
4225 * Mark the rpc.mdcommd as no longer present. We are trying to
4226 * suspend the system and so we should expect RPC failures to
4227 * occur.
4228 */
4229 md_mn_clear_commd_present();
4230 cp->cc_events |= CALLB_CPR_START;
4231 delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
4232 while (!(cp->cc_events & CALLB_CPR_SAFE))
4233 			/* cv_reltimedwait() returns -1 if it times out. */
4234 if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
4235 cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
4236 break;
4237 break;
4238
4239 case CB_CODE_CPR_RESUME:
4240 cp->cc_events &= ~CALLB_CPR_START;
4241 cv_signal(&cp->cc_stop_cv);
4242 break;
4243 }
4244 mutex_exit(cp->cc_lockp);
4245 return (ret != -1);
4246 }
4247
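/*
 * Illustrative sketch (not part of the original source): a resync thread
 * registers the callback above with the CPR framework roughly as follows.
 * The mutex, the callb_cpr_t variable and the thread name are placeholders.
 *
 *	callb_cpr_t	cprinfo;
 *	kmutex_t	cpr_mx;
 *
 *	mutex_init(&cpr_mx, NULL, MUTEX_DEFAULT, NULL);
 *	CALLB_CPR_INIT(&cprinfo, &cpr_mx, callb_md_mrs_cpr, "md_resync");
 *	...
 *	mutex_enter(&cpr_mx);
 *	CALLB_CPR_EXIT(&cprinfo);	(this also drops cpr_mx)
 */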
4248
4249 void
4250 md_rem_hspname(set_t setno, mdkey_t n_key)
4251 {
4252 int s;
4253 int max_sides;
4254
4255
4256 /* All entries removed are in the same diskset */
4257 if (md_get_setstatus(setno) & MD_SET_MNSET)
4258 max_sides = MD_MNMAXSIDES;
4259 else
4260 max_sides = MD_MAXSIDES;
4261
4262 for (s = 0; s < max_sides; s++)
4263 (void) md_remdevname(setno, s, n_key);
4264 }
4265
4266
4267 int
4268 md_rem_selfname(minor_t selfid)
4269 {
4270 int s;
4271 set_t setno = MD_MIN2SET(selfid);
4272 int max_sides;
4273 md_dev64_t dev;
4274 struct nm_next_hdr *nh;
4275 struct nm_name *n;
4276 mdkey_t key;
4277
4278 /*
4279 	 * Get the key since the remove routine expects it
4280 */
4281 dev = md_makedevice(md_major, selfid);
4282 if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
4283 return (ENOENT);
4284 }
4285
4286 if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
4287 MD_KEYWILD, dev, 0L)) == NULL) {
4288 return (ENOENT);
4289 }
4290
4291 /* All entries removed are in the same diskset */
4292 key = n->n_key;
4293 if (md_get_setstatus(setno) & MD_SET_MNSET)
4294 max_sides = MD_MNMAXSIDES;
4295 else
4296 max_sides = MD_MAXSIDES;
4297
4298 for (s = 0; s < max_sides; s++)
4299 (void) md_remdevname(setno, s, key);
4300
4301 return (0);
4302 }
4303
4304 void
4305 md_upd_set_unnext(set_t setno, unit_t un)
4306 {
4307 if (un < md_set[setno].s_un_next) {
4308 md_set[setno].s_un_next = un;
4309 }
4310 }
4311
4312 struct hot_spare_pool *
4313 find_hot_spare_pool(set_t setno, int hsp_id)
4314 {
4315 hot_spare_pool_t *hsp;
4316
4317 hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
4318 while (hsp != NULL) {
4319 if (hsp->hsp_self_id == hsp_id)
4320 return (hsp);
4321 hsp = hsp->hsp_next;
4322 }
4323
4324 return ((hot_spare_pool_t *)0);
4325 }
4326
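/*
 * Illustrative sketch (not part of the original source): callers check for
 * a NULL return to distinguish "no such pool" from a valid pool; any
 * locking of the set's hot spare pool list is the caller's responsibility.
 *
 *	hot_spare_pool_t	*hsp;
 *
 *	if ((hsp = find_hot_spare_pool(setno, hsp_id)) == NULL)
 *		return (ENOENT);
 */
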
4327 /*
4328 * md_create_taskq:
4329 *
4330 * Create a kernel taskq for the given set/unit combination. This is typically
4331  * used to complete an RR_CLEAN request when the callee is unable to obtain the
4332 * mutex / condvar access required to update the DRL safely.
4333 */
4334 void *
4335 md_create_taskq(set_t setno, minor_t mnum)
4336 {
4337 char name[20];
4338 ddi_taskq_t *tqp;
4339
4340 (void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum));
4341
4342 tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
4343
4344 return ((void *)tqp);
4345 }
4346
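/*
 * Illustrative sketch (not part of the original source): the opaque handle
 * returned by md_create_taskq() is a ddi_taskq_t and is used with the
 * standard DDI taskq interfaces.  The worker function and its argument are
 * hypothetical.
 *
 *	ddi_taskq_t	*tqp;
 *
 *	tqp = (ddi_taskq_t *)md_create_taskq(setno, mnum);
 *	if (ddi_taskq_dispatch(tqp, example_rr_clean_worker, arg,
 *	    DDI_NOSLEEP) != DDI_SUCCESS) {
 *		(process the RR_CLEAN request inline instead)
 *	}
 *	ddi_taskq_destroy(tqp);	(when the unit is torn down)
 */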