xref: /titanic_41/usr/src/uts/common/io/lvm/md/md_subr.c (revision b30678564674f0af0118547884def6cf721f1360)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * Driver for Virtual Disk.
29  */
30 #include <sys/param.h>
31 #include <sys/systm.h>
32 #include <sys/buf.h>
33 #include <sys/conf.h>
34 #include <sys/user.h>
35 #include <sys/uio.h>
36 #include <sys/proc.h>
37 #include <sys/t_lock.h>
38 #include <sys/dkio.h>
39 #include <sys/kmem.h>
40 #include <sys/debug.h>
41 #include <sys/cmn_err.h>
42 #include <sys/sysmacros.h>
43 #include <sys/types.h>
44 #include <sys/mkdev.h>
45 #include <sys/vtoc.h>
46 #include <sys/open.h>
47 #include <sys/file.h>
48 #include <vm/page.h>
49 #include <sys/callb.h>
50 #include <sys/disp.h>
51 #include <sys/modctl.h>
52 #include <sys/errno.h>
53 #include <sys/door.h>
54 #include <sys/lvm/mdmn_commd.h>
55 #include <sys/lvm/md_hotspares.h>
56 
57 #include <sys/lvm/mdvar.h>
58 #include <sys/lvm/md_names.h>
59 
60 #include <sys/ddi.h>
61 #include <sys/proc.h>
62 #include <sys/sunddi.h>
63 #include <sys/esunddi.h>
64 
65 #include <sys/sysevent.h>
66 #include <sys/sysevent/eventdefs.h>
67 
68 #include <sys/sysevent/svm.h>
69 #include <sys/lvm/md_basic.h>
70 
71 
72 /*
73  * Machine specific Hertz is kept here
74  */
75 extern clock_t			md_hz;
76 
77 /*
78  * Externs.
79  */
80 extern int			(*mdv_strategy_tstpnt)(buf_t *, int, void*);
81 extern major_t			md_major;
82 extern unit_t			md_nunits;
83 extern set_t			md_nsets;
84 extern md_set_t			md_set[];
85 extern md_set_io_t		md_set_io[];
86 extern md_ops_t			**md_ops;
87 extern md_ops_t			*md_opslist;
88 extern ddi_modhandle_t		*md_mods;
89 extern dev_info_t		*md_devinfo;
90 
91 extern md_krwlock_t		md_unit_array_rw;
92 extern kmutex_t			md_mx;
93 extern kcondvar_t		md_cv;
94 
95 extern md_krwlock_t		hsp_rwlp;
96 extern md_krwlock_t		ni_rwlp;
97 
98 extern int			md_num_daemons;
99 extern int			md_status;
100 extern int			md_ioctl_cnt;
101 extern int			md_mtioctl_cnt;
102 
103 extern struct metatransops	metatransops;
104 extern md_event_queue_t		*md_event_queue;
105 extern md_resync_t		md_cpr_resync;
106 extern int			md_done_daemon_threads;
107 extern int			md_ff_daemon_threads;
108 
109 
110 extern mddb_set_t	*mddb_setenter(set_t setno, int flag, int *errorcodep);
111 extern void		mddb_setexit(mddb_set_t *s);
112 extern void		*lookup_entry(struct nm_next_hdr *, set_t,
113 				side_t, mdkey_t, md_dev64_t, int);
114 extern struct nm_next_hdr	*get_first_record(set_t, int, int);
115 extern dev_t		getrootdev(void);
116 
117 struct mdq_anchor	md_done_daemon; /* done request queue */
118 struct mdq_anchor	md_mstr_daemon; /* mirror error, WOW requests */
119 struct mdq_anchor	md_mhs_daemon;	/* mirror hotspare requests queue */
120 struct mdq_anchor	md_hs_daemon;	/* raid hotspare requests queue */
121 struct mdq_anchor	md_ff_daemonq;	/* failfast request queue */
122 struct mdq_anchor	md_mirror_daemon; /* mirror owner queue */
123 struct mdq_anchor	md_mirror_io_daemon; /* mirror owner i/o queue */
124 struct mdq_anchor	md_mirror_rs_daemon; /* mirror resync done queue */
125 struct mdq_anchor	md_sp_daemon;	/* soft-part error daemon queue */
126 struct mdq_anchor	md_mto_daemon;	/* mirror timeout daemon queue */
127 
128 int md_done_daemon_threads = 1;	/* threads for md_done_daemon requestq */
129 int md_mstr_daemon_threads = 1;	/* threads for md_mstr_daemon requestq */
130 int md_mhs_daemon_threads = 1;	/* threads for md_mhs_daemon requestq */
131 int md_hs_daemon_threads = 1;	/* threads for md_hs_daemon requestq */
132 int md_ff_daemon_threads = 3;	/* threads for md_ff_daemon requestq */
133 int md_mirror_daemon_threads = 1; /* threads for md_mirror_daemon requestq */
134 int md_sp_daemon_threads = 1;	/* threads for md_sp_daemon requestq */
135 int md_mto_daemon_threads = 1;	/* threads for md_mto_daemon requestq */
136 
137 #ifdef DEBUG
138 /* Flag to switch on debug messages */
139 int md_release_reacquire_debug = 0;	/* debug flag */
140 #endif
141 
142 /*
143  *
144  * The md_request_queues is table of pointers to request queues and the number
145  * of threads associated with the request queues.
146  * When the number of threads is set to 1, then the order of execution is
147  * sequential.
148  * The number of threads for all the queues have been defined as global
149  * variables to enable kernel tuning.
150  *
151  */
152 
153 #define	MD_DAEMON_QUEUES 11
154 
155 md_requestq_entry_t md_daemon_queues[MD_DAEMON_QUEUES] = {
156 	{&md_done_daemon, &md_done_daemon_threads},
157 	{&md_mstr_daemon, &md_mstr_daemon_threads},
158 	{&md_hs_daemon, &md_hs_daemon_threads},
159 	{&md_ff_daemonq, &md_ff_daemon_threads},
160 	{&md_mirror_daemon, &md_mirror_daemon_threads},
161 	{&md_mirror_io_daemon, &md_mirror_daemon_threads},
162 	{&md_mirror_rs_daemon, &md_mirror_daemon_threads},
163 	{&md_sp_daemon, &md_sp_daemon_threads},
164 	{&md_mhs_daemon, &md_mhs_daemon_threads},
165 	{&md_mto_daemon, &md_mto_daemon_threads},
166 	{0, 0}
167 };
168 
169 /*
170  * Number of times a message is retried before issuing a warning to the operator
171  */
172 #define	MD_MN_WARN_INTVL	10
173 
174 /*
175  * Setting retry cnt to one (pre decremented) so that we actually do no
176  * retries when committing/deleting a mddb rec. The underlying disk driver
177  * does several retries to check if the disk is really dead or not so there
178  * is no reason for us to retry on top of the drivers retries.
179  */
180 
181 uint_t			md_retry_cnt = 1; /* global so it can be patched */
182 
183 /*
184  * How many times to try to do the door_ki_upcall() in mdmn_ksend_message.
185  * Again, made patchable here should it prove useful.
186  */
187 uint_t			md_send_retry_limit = 30;
188 
189 /*
190  * Bug # 1212146
191  * Before this change the user had to pass in a short aligned buffer because of
192  * problems in some underlying device drivers.  This problem seems to have been
193  * corrected in the underlying drivers so we will default to not requiring any
194  * alignment.  If the user needs to check for a specific alignment,
195  * md_uio_alignment_mask may be set in /etc/system to accomplish this.  To get
196  * the behavior before this fix, the md_uio_alignment_mask would be set to 1,
197  * to check for word alignment, it can be set to 3, for double word alignment,
198  * it can be set to 7, etc.
199  *
200  * [Other part of fix is in function md_chk_uio()]
201  */
202 static int		md_uio_alignment_mask = 0;
203 
204 /*
205  * for md_dev64_t translation
206  */
207 struct md_xlate_table		*md_tuple_table;
208 struct md_xlate_major_table	*md_major_tuple_table;
209 int				md_tuple_length;
210 uint_t				md_majortab_len;
211 
212 /* Function declarations */
213 
214 static int md_create_probe_rqlist(md_probedev_impl_t *plist,
215 			daemon_queue_t **hdr, intptr_t (*probe_test)());
216 
217 /*
218  * manipulate global status
219  */
220 void
md_set_status(int bits)221 md_set_status(int bits)
222 {
223 	mutex_enter(&md_mx);
224 	md_status |= bits;
225 	mutex_exit(&md_mx);
226 }
227 
228 void
md_clr_status(int bits)229 md_clr_status(int bits)
230 {
231 	mutex_enter(&md_mx);
232 	md_status &= ~bits;
233 	mutex_exit(&md_mx);
234 }
235 
236 int
md_get_status()237 md_get_status()
238 {
239 	int result;
240 	mutex_enter(&md_mx);
241 	result = md_status;
242 	mutex_exit(&md_mx);
243 	return (result);
244 }
245 
246 void
md_set_setstatus(set_t setno,int bits)247 md_set_setstatus(set_t setno, int bits)
248 {
249 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
250 
251 	mutex_enter(&md_mx);
252 	md_set[setno].s_status |= bits;
253 	mutex_exit(&md_mx);
254 }
255 
256 void
md_clr_setstatus(set_t setno,int bits)257 md_clr_setstatus(set_t setno, int bits)
258 {
259 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
260 
261 	mutex_enter(&md_mx);
262 	md_set[setno].s_status &= ~bits;
263 	mutex_exit(&md_mx);
264 }
265 
266 uint_t
md_get_setstatus(set_t setno)267 md_get_setstatus(set_t setno)
268 {
269 	uint_t result;
270 
271 	ASSERT(setno != MD_SET_BAD && setno < MD_MAXSETS);
272 
273 	mutex_enter(&md_mx);
274 	result = md_set[setno].s_status;
275 	mutex_exit(&md_mx);
276 	return (result);
277 }
278 
279 /*
280  * md_unit_readerlock_common:
281  * -------------------------
282  * Mark the given unit as having a reader reference. Spin waiting for any
283  * writer references to be released.
284  *
285  * Input:
286  *	ui		unit reference
287  *	lock_held	0 => ui_mx needs to be grabbed
288  *			1 => ui_mx already held
289  * Output:
290  *	mm_unit_t corresponding to unit structure
291  *	ui->ui_readercnt incremented
292  */
293 static void *
md_unit_readerlock_common(mdi_unit_t * ui,int lock_held)294 md_unit_readerlock_common(mdi_unit_t *ui, int lock_held)
295 {
296 	uint_t	flag = MD_UL_WRITER | MD_UL_WANABEWRITER;
297 
298 	if (!lock_held)
299 		mutex_enter(&ui->ui_mx);
300 	while (ui->ui_lock & flag) {
301 		if (panicstr) {
302 			if (ui->ui_lock & MD_UL_WRITER)
303 				panic("md: writer lock is held");
304 			break;
305 		}
306 		cv_wait(&ui->ui_cv, &ui->ui_mx);
307 	}
308 	ui->ui_readercnt++;
309 	if (!lock_held)
310 		mutex_exit(&ui->ui_mx);
311 	return (MD_UNIT(ui->ui_link.ln_id));
312 }
313 
314 void *
md_unit_readerlock(mdi_unit_t * ui)315 md_unit_readerlock(mdi_unit_t *ui)
316 {
317 	return (md_unit_readerlock_common(ui, 0));
318 }
319 
320 /*
321  * md_unit_writerlock_common:
322  * -------------------------
323  * Acquire a unique writer reference. Causes previous readers to drain.
324  * Spins if a writer reference already exists or if a previous reader/writer
325  * dropped the lock to allow a ksend_message to be despatched.
326  *
327  * Input:
328  *	ui		unit reference
329  *	lock_held	0 => grab ui_mx
330  *			1 => ui_mx already held on entry
331  * Output:
332  *	mm_unit_t reference
333  */
334 static void *
md_unit_writerlock_common(mdi_unit_t * ui,int lock_held)335 md_unit_writerlock_common(mdi_unit_t *ui, int lock_held)
336 {
337 	uint_t	flag = MD_UL_WRITER;
338 
339 	if (panicstr)
340 		panic("md: writer lock not allowed");
341 
342 	if (!lock_held)
343 		mutex_enter(&ui->ui_mx);
344 
345 	while ((ui->ui_lock & flag) || (ui->ui_readercnt != 0)) {
346 		ui->ui_wanabecnt++;
347 		ui->ui_lock |= MD_UL_WANABEWRITER;
348 		cv_wait(&ui->ui_cv, &ui->ui_mx);
349 		if (--ui->ui_wanabecnt == 0)
350 			ui->ui_lock &= ~MD_UL_WANABEWRITER;
351 	}
352 	ui->ui_lock |= MD_UL_WRITER;
353 	ui->ui_owner = curthread;
354 
355 	if (!lock_held)
356 		mutex_exit(&ui->ui_mx);
357 	return (MD_UNIT(ui->ui_link.ln_id));
358 }
359 
360 void *
md_unit_writerlock(mdi_unit_t * ui)361 md_unit_writerlock(mdi_unit_t *ui)
362 {
363 	return (md_unit_writerlock_common(ui, 0));
364 }
365 
366 /*
367  * md_unit_readerexit_common:
368  * -------------------------
369  * Release the readerlock for the specified unit. If the reader count reaches
370  * zero and there are waiting writers (MD_UL_WANABEWRITER set) wake them up.
371  *
372  * Input:
373  *	ui		unit reference
374  *	lock_held	0 => ui_mx needs to be acquired
375  *			1 => ui_mx already held
376  */
377 static void
md_unit_readerexit_common(mdi_unit_t * ui,int lock_held)378 md_unit_readerexit_common(mdi_unit_t *ui, int lock_held)
379 {
380 	if (!lock_held)
381 		mutex_enter(&ui->ui_mx);
382 	ASSERT((ui->ui_lock & MD_UL_WRITER) == 0);
383 	ASSERT(ui->ui_readercnt != 0);
384 	ui->ui_readercnt--;
385 	if ((ui->ui_wanabecnt != 0) && (ui->ui_readercnt == 0))
386 		cv_broadcast(&ui->ui_cv);
387 
388 	if (!lock_held)
389 		mutex_exit(&ui->ui_mx);
390 }
391 
392 void
md_unit_readerexit(mdi_unit_t * ui)393 md_unit_readerexit(mdi_unit_t *ui)
394 {
395 	md_unit_readerexit_common(ui, 0);
396 }
397 
398 /*
399  * md_unit_writerexit_common:
400  * -------------------------
401  * Release the writerlock currently held on the unit. Wake any threads waiting
402  * on becoming reader or writer (MD_UL_WANABEWRITER set).
403  *
404  * Input:
405  *	ui		unit reference
406  *	lock_held	0 => ui_mx to be acquired
407  *			1 => ui_mx already held
408  */
409 static void
md_unit_writerexit_common(mdi_unit_t * ui,int lock_held)410 md_unit_writerexit_common(mdi_unit_t *ui, int lock_held)
411 {
412 	if (!lock_held)
413 		mutex_enter(&ui->ui_mx);
414 	ASSERT((ui->ui_lock & MD_UL_WRITER) != 0);
415 	ASSERT(ui->ui_readercnt == 0);
416 	ui->ui_lock &= ~MD_UL_WRITER;
417 	ui->ui_owner = NULL;
418 
419 	cv_broadcast(&ui->ui_cv);
420 	if (!lock_held)
421 		mutex_exit(&ui->ui_mx);
422 }
423 
424 void
md_unit_writerexit(mdi_unit_t * ui)425 md_unit_writerexit(mdi_unit_t *ui)
426 {
427 	md_unit_writerexit_common(ui, 0);
428 }
429 
430 void *
md_io_readerlock(mdi_unit_t * ui)431 md_io_readerlock(mdi_unit_t *ui)
432 {
433 	md_io_lock_t	*io = ui->ui_io_lock;
434 
435 	ASSERT(io);  /* checks case where no io lock allocated */
436 	mutex_enter(&io->io_mx);
437 	while (io->io_lock & (MD_UL_WRITER | MD_UL_WANABEWRITER)) {
438 		if (panicstr) {
439 			if (io->io_lock & MD_UL_WRITER)
440 				panic("md: writer lock is held");
441 			break;
442 		}
443 		cv_wait(&io->io_cv, &io->io_mx);
444 	}
445 	io->io_readercnt++;
446 	mutex_exit(&io->io_mx);
447 	return (MD_UNIT(ui->ui_link.ln_id));
448 }
449 
450 void *
md_io_writerlock(mdi_unit_t * ui)451 md_io_writerlock(mdi_unit_t *ui)
452 {
453 	md_io_lock_t	*io = ui->ui_io_lock;
454 
455 	ASSERT(io);  /* checks case where no io lock allocated */
456 	if (panicstr)
457 		panic("md: writer lock not allowed");
458 
459 	mutex_enter(&io->io_mx);
460 	while ((io->io_lock & MD_UL_WRITER) || (io->io_readercnt != 0)) {
461 		io->io_wanabecnt++;
462 		io->io_lock |= MD_UL_WANABEWRITER;
463 		cv_wait(&io->io_cv, &io->io_mx);
464 		if (--io->io_wanabecnt == 0)
465 			io->io_lock &= ~MD_UL_WANABEWRITER;
466 	}
467 	io->io_lock |= MD_UL_WRITER;
468 	io->io_owner = curthread;
469 
470 	mutex_exit(&io->io_mx);
471 	return (MD_UNIT(ui->ui_link.ln_id));
472 }
473 
474 void
md_io_readerexit(mdi_unit_t * ui)475 md_io_readerexit(mdi_unit_t *ui)
476 {
477 	md_io_lock_t	*io = ui->ui_io_lock;
478 
479 	mutex_enter(&io->io_mx);
480 	ASSERT((io->io_lock & MD_UL_WRITER) == 0);
481 	ASSERT(io->io_readercnt != 0);
482 	io->io_readercnt--;
483 	if ((io->io_wanabecnt != 0) && (io->io_readercnt == 0)) {
484 		cv_broadcast(&io->io_cv);
485 	}
486 	mutex_exit(&io->io_mx);
487 }
488 
489 void
md_io_writerexit(mdi_unit_t * ui)490 md_io_writerexit(mdi_unit_t *ui)
491 {
492 	md_io_lock_t	*io = ui->ui_io_lock;
493 
494 	mutex_enter(&io->io_mx);
495 	ASSERT((io->io_lock & MD_UL_WRITER) != 0);
496 	ASSERT(io->io_readercnt == 0);
497 	io->io_lock &= ~MD_UL_WRITER;
498 	io->io_owner = NULL;
499 
500 	cv_broadcast(&io->io_cv);
501 	mutex_exit(&io->io_mx);
502 }
503 
504 /*
505  * Attempt to grab that set of locks defined as global.
506  * A mask containing the set of global locks that are owned upon
507  * entry is input.  Any additional global locks are then grabbed.
508  * This keeps the caller from having to know the set of global
509  * locks.
510  */
511 static int
md_global_lock_enter(int global_locks_owned_mask)512 md_global_lock_enter(int global_locks_owned_mask)
513 {
514 
515 	/*
516 	 * The current implementation has been verified by inspection
517 	 * and test to be deadlock free.  If another global lock is
518 	 * added, changing the algorithm used by this function should
519 	 * be considered.  With more than 2 locks it is difficult to
520 	 * guarantee that locks are being acquired in the correct order.
521 	 * The safe approach would be to drop all of the locks that are
522 	 * owned at function entry and then reacquire all of the locks
523 	 * in the order defined by the lock hierarchy.
524 	 */
525 	mutex_enter(&md_mx);
526 	if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
527 		while ((md_mtioctl_cnt != 0) ||
528 		    (md_status & MD_GBL_IOCTL_LOCK)) {
529 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
530 				mutex_exit(&md_mx);
531 				return (EINTR);
532 			}
533 		}
534 		md_status |= MD_GBL_IOCTL_LOCK;
535 		md_ioctl_cnt++;
536 	}
537 	if (!(global_locks_owned_mask & MD_GBL_HS_LOCK)) {
538 		while (md_status & MD_GBL_HS_LOCK) {
539 			if (cv_wait_sig_swap(&md_cv, &md_mx) == 0) {
540 				md_status &= ~MD_GBL_IOCTL_LOCK;
541 				mutex_exit(&md_mx);
542 				return (EINTR);
543 			}
544 		}
545 		md_status |= MD_GBL_HS_LOCK;
546 	}
547 	mutex_exit(&md_mx);
548 	return (0);
549 }
550 
551 /*
552  * Release the set of global locks that were grabbed in md_global_lock_enter
553  * that were not already owned by the calling thread.  The set of previously
554  * owned global locks is passed in as a mask parameter.
555  */
556 static int
md_global_lock_exit(int global_locks_owned_mask,int code,int flags,mdi_unit_t * ui)557 md_global_lock_exit(int global_locks_owned_mask, int code,
558 	int flags, mdi_unit_t *ui)
559 {
560 	mutex_enter(&md_mx);
561 
562 	/* If MT ioctl decrement mt_ioctl_cnt */
563 	if ((flags & MD_MT_IOCTL)) {
564 		md_mtioctl_cnt--;
565 	} else {
566 		if (!(global_locks_owned_mask & MD_GBL_IOCTL_LOCK)) {
567 			/* clear the lock and decrement count */
568 			ASSERT(md_ioctl_cnt == 1);
569 			md_ioctl_cnt--;
570 			md_status &= ~MD_GBL_IOCTL_LOCK;
571 		}
572 		if (!(global_locks_owned_mask & MD_GBL_HS_LOCK))
573 			md_status &= ~MD_GBL_HS_LOCK;
574 	}
575 	if (flags & MD_READER_HELD)
576 		md_unit_readerexit(ui);
577 	if (flags & MD_WRITER_HELD)
578 		md_unit_writerexit(ui);
579 	if (flags & MD_IO_HELD)
580 		md_io_writerexit(ui);
581 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
582 		rw_exit(&md_unit_array_rw.lock);
583 	}
584 	cv_broadcast(&md_cv);
585 	mutex_exit(&md_mx);
586 
587 	return (code);
588 }
589 
590 /*
591  * The two functions, md_ioctl_lock_enter, and md_ioctl_lock_exit make
592  * use of the md_global_lock_{enter|exit} functions to avoid duplication
593  * of code.  They rely upon the fact that the locks that are specified in
594  * the input mask are not acquired or freed.  If this algorithm changes
595  * as described in the block comment at the beginning of md_global_lock_enter
596  * then it will be necessary to change these 2 functions.  Otherwise these
597  * functions will be grabbing and holding global locks unnecessarily.
598  */
599 int
md_ioctl_lock_enter(void)600 md_ioctl_lock_enter(void)
601 {
602 	/* grab only the ioctl lock */
603 	return (md_global_lock_enter(~MD_GBL_IOCTL_LOCK));
604 }
605 
606 /*
607  * If md_ioctl_lock_exit is being called at the end of an ioctl before
608  * returning to user space, then ioctl_end is set to 1.
609  * Otherwise, the ioctl lock is being dropped in the middle of handling
610  * an ioctl and will be reacquired before the end of the ioctl.
611  * Do not attempt to process the MN diskset mddb parse flags unless
612  * ioctl_end is true - otherwise a deadlock situation could arise.
613  */
614 int
md_ioctl_lock_exit(int code,int flags,mdi_unit_t * ui,int ioctl_end)615 md_ioctl_lock_exit(int code, int flags, mdi_unit_t *ui, int ioctl_end)
616 {
617 	int				ret_val;
618 	uint_t				status;
619 	mddb_set_t			*s;
620 	int				i;
621 	int				err;
622 	md_mn_msg_mddb_parse_t		*mddb_parse_msg;
623 	md_mn_kresult_t			*kresult;
624 	mddb_lb_t			*lbp;
625 	int				rval = 1;
626 	int				flag;
627 
628 	/* release only the ioctl lock */
629 	ret_val = md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
630 
631 	/*
632 	 * If md_ioctl_lock_exit is being called with a possible lock held
633 	 * (ioctl_end is 0), then don't check the MN disksets since the
634 	 * call to mddb_setenter may cause a lock ordering deadlock.
635 	 */
636 	if (!ioctl_end)
637 		return (ret_val);
638 
639 	/*
640 	 * Walk through disksets to see if there is a MN diskset that
641 	 * has messages that need to be sent.  Set must be snarfed and
642 	 * be a MN diskset in order to be checked.
643 	 *
644 	 * In a MN diskset, this routine may send messages to the
645 	 * rpc.mdcommd in order to have the slave nodes re-parse parts
646 	 * of the mddb.  Messages can only be sent with no locks held,
647 	 * so if mddb change occurred while the ioctl lock is held, this
648 	 * routine must send the messages.
649 	 */
650 	for (i = 1; i < md_nsets; i++) {
651 		status = md_get_setstatus(i);
652 
653 		/* Set must be snarfed and be a MN diskset */
654 		if ((status & (MD_SET_SNARFED | MD_SET_MNSET)) !=
655 		    (MD_SET_SNARFED | MD_SET_MNSET))
656 			continue;
657 
658 		/* Grab set lock so that set can't change */
659 		if ((s = mddb_setenter(i, MDDB_MUSTEXIST, &err)) == NULL)
660 			continue;
661 
662 		lbp = s->s_lbp;
663 
664 		/* Re-get set status now that lock is held */
665 		status = md_get_setstatus(i);
666 
667 		/*
668 		 * If MN parsing block flag is set - continue to next set.
669 		 *
670 		 * If s_mn_parseflags_sending is non-zero, then another thread
671 		 * is already currently sending a parse message, so just
672 		 * release the set mutex.  If this ioctl had caused an mddb
673 		 * change that results in a parse message to be generated,
674 		 * the thread that is currently sending a parse message would
675 		 * generate the additional parse message.
676 		 *
677 		 * If s_mn_parseflags_sending is zero then loop until
678 		 * s_mn_parseflags is 0 (until there are no more
679 		 * messages to send).
680 		 * While s_mn_parseflags is non-zero,
681 		 *	put snapshot of parse_flags in s_mn_parseflags_sending
682 		 *	set s_mn_parseflags to zero
683 		 *	release set mutex
684 		 *	send message
685 		 *	re-grab set mutex
686 		 *	set s_mn_parseflags_sending to zero
687 		 *
688 		 * If set is STALE, send message with NO_LOG flag so that
689 		 * rpc.mdcommd won't attempt to log message to non-writeable
690 		 * replica.
691 		 */
692 		mddb_parse_msg = kmem_zalloc(sizeof (md_mn_msg_mddb_parse_t),
693 		    KM_SLEEP);
694 		while (((s->s_mn_parseflags_sending & MDDB_PARSE_MASK) == 0) &&
695 		    (s->s_mn_parseflags & MDDB_PARSE_MASK) &&
696 		    (!(status & MD_SET_MNPARSE_BLK))) {
697 
698 			/* Grab snapshot of parse flags */
699 			s->s_mn_parseflags_sending = s->s_mn_parseflags;
700 			s->s_mn_parseflags = 0;
701 
702 			mutex_exit(&md_set[(s)->s_setno].s_dbmx);
703 
704 			/*
705 			 * Send the message to the slaves to re-parse
706 			 * the indicated portions of the mddb. Send the status
707 			 * of the 50 mddbs in this set so that slaves know
708 			 * which mddbs that the master node thinks are 'good'.
709 			 * Otherwise, slave may reparse, but from wrong
710 			 * replica.
711 			 */
712 			mddb_parse_msg->msg_parse_flags =
713 			    s->s_mn_parseflags_sending;
714 
715 			for (i = 0; i < MDDB_NLB; i++) {
716 				mddb_parse_msg->msg_lb_flags[i] =
717 				    lbp->lb_locators[i].l_flags;
718 			}
719 			kresult = kmem_alloc(sizeof (md_mn_kresult_t),
720 			    KM_SLEEP);
721 			while (rval != 0) {
722 				flag = 0;
723 				if (status & MD_SET_STALE)
724 					flag |= MD_MSGF_NO_LOG;
725 				rval = mdmn_ksend_message(s->s_setno,
726 				    MD_MN_MSG_MDDB_PARSE, flag, 0,
727 				    (char *)mddb_parse_msg,
728 				    sizeof (md_mn_msg_mddb_parse_t), kresult);
729 				/* if the node hasn't yet joined, it's Ok. */
730 				if ((!MDMN_KSEND_MSG_OK(rval, kresult)) &&
731 				    (kresult->kmmr_comm_state !=
732 				    MDMNE_NOT_JOINED)) {
733 					mdmn_ksend_show_error(rval, kresult,
734 					    "MD_MN_MSG_MDDB_PARSE");
735 					cmn_err(CE_WARN, "md_ioctl_lock_exit: "
736 					    "Unable to send mddb update "
737 					    "message to other nodes in "
738 					    "diskset %s\n", s->s_setname);
739 					rval = 1;
740 				}
741 			}
742 			kmem_free(kresult, sizeof (md_mn_kresult_t));
743 
744 			/*
745 			 * Re-grab mutex to clear sending field and to
746 			 * see if another parse message needs to be generated.
747 			 */
748 			mutex_enter(&md_set[(s)->s_setno].s_dbmx);
749 			s->s_mn_parseflags_sending = 0;
750 		}
751 		kmem_free(mddb_parse_msg, sizeof (md_mn_msg_mddb_parse_t));
752 		mutex_exit(&md_set[(s)->s_setno].s_dbmx);
753 	}
754 	return (ret_val);
755 }
756 
757 /*
758  * Called when in an ioctl and need readerlock.
759  */
760 void *
md_ioctl_readerlock(IOLOCK * lock,mdi_unit_t * ui)761 md_ioctl_readerlock(IOLOCK *lock, mdi_unit_t *ui)
762 {
763 	ASSERT(lock != NULL);
764 	lock->l_ui = ui;
765 	lock->l_flags |= MD_READER_HELD;
766 	return (md_unit_readerlock_common(ui, 0));
767 }
768 
769 /*
770  * Called when in an ioctl and need writerlock.
771  */
772 void *
md_ioctl_writerlock(IOLOCK * lock,mdi_unit_t * ui)773 md_ioctl_writerlock(IOLOCK *lock, mdi_unit_t *ui)
774 {
775 	ASSERT(lock != NULL);
776 	lock->l_ui = ui;
777 	lock->l_flags |= MD_WRITER_HELD;
778 	return (md_unit_writerlock_common(ui, 0));
779 }
780 
781 void *
md_ioctl_io_lock(IOLOCK * lock,mdi_unit_t * ui)782 md_ioctl_io_lock(IOLOCK *lock, mdi_unit_t *ui)
783 {
784 	ASSERT(lock != NULL);
785 	lock->l_ui = ui;
786 	lock->l_flags |= MD_IO_HELD;
787 	return (md_io_writerlock(ui));
788 }
789 
790 void
md_ioctl_readerexit(IOLOCK * lock)791 md_ioctl_readerexit(IOLOCK *lock)
792 {
793 	ASSERT(lock != NULL);
794 	lock->l_flags &= ~MD_READER_HELD;
795 	md_unit_readerexit(lock->l_ui);
796 }
797 
798 void
md_ioctl_writerexit(IOLOCK * lock)799 md_ioctl_writerexit(IOLOCK *lock)
800 {
801 	ASSERT(lock != NULL);
802 	lock->l_flags &= ~MD_WRITER_HELD;
803 	md_unit_writerexit(lock->l_ui);
804 }
805 
806 void
md_ioctl_io_exit(IOLOCK * lock)807 md_ioctl_io_exit(IOLOCK *lock)
808 {
809 	ASSERT(lock != NULL);
810 	lock->l_flags &= ~MD_IO_HELD;
811 	md_io_writerexit(lock->l_ui);
812 }
813 
814 /*
815  * md_ioctl_releaselocks:
816  * --------------------
817  * Release the unit locks that are held and stop subsequent
818  * md_unit_reader/writerlock calls from progressing. This allows the caller
819  * to send messages across the cluster when running in a multinode
820  * environment.
821  * ioctl originated locks (via md_ioctl_readerlock/md_ioctl_writerlock) are
822  * allowed to progress as normal. This is required as these typically are
823  * invoked by the message handler that may be called while a unit lock is
824  * marked as released.
825  *
826  * On entry:
827  *	variety of unit locks may be held including ioctl lock
828  *
829  * On exit:
830  *      locks released and unit structure updated to prevent subsequent reader/
831  *      writer locks being acquired until md_ioctl_reacquirelocks is called
832  */
833 void
md_ioctl_releaselocks(int code,int flags,mdi_unit_t * ui)834 md_ioctl_releaselocks(int code, int flags, mdi_unit_t *ui)
835 {
836 	/* This actually releases the locks. */
837 	(void) md_global_lock_exit(~MD_GBL_IOCTL_LOCK, code, flags, ui);
838 }
839 
840 /*
841  * md_ioctl_reacquirelocks:
842  * ----------------------
843  * Reacquire the locks that were held when md_ioctl_releaselocks
844  * was called.
845  *
846  * On entry:
847  *      No unit locks held
848  * On exit:
849  *	locks held that were held at md_ioctl_releaselocks time including
850  *	the ioctl lock.
851  */
852 void
md_ioctl_reacquirelocks(int flags,mdi_unit_t * ui)853 md_ioctl_reacquirelocks(int flags, mdi_unit_t *ui)
854 {
855 	if (flags & MD_MT_IOCTL) {
856 		mutex_enter(&md_mx);
857 		md_mtioctl_cnt++;
858 		mutex_exit(&md_mx);
859 	} else {
860 		while (md_ioctl_lock_enter() == EINTR)
861 			;
862 	}
863 	if (flags & MD_ARRAY_WRITER) {
864 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
865 	} else if (flags & MD_ARRAY_READER) {
866 		rw_enter(&md_unit_array_rw.lock, RW_READER);
867 	}
868 	if (ui != (mdi_unit_t *)NULL) {
869 		if (flags & MD_IO_HELD) {
870 			(void) md_io_writerlock(ui);
871 		}
872 
873 		mutex_enter(&ui->ui_mx);
874 		if (flags & MD_READER_HELD) {
875 			(void) md_unit_readerlock_common(ui, 1);
876 		} else if (flags & MD_WRITER_HELD) {
877 			(void) md_unit_writerlock_common(ui, 1);
878 		}
879 		/* Wake up any blocked readerlock() calls */
880 		cv_broadcast(&ui->ui_cv);
881 		mutex_exit(&ui->ui_mx);
882 	}
883 }
884 
885 void
md_ioctl_droplocks(IOLOCK * lock)886 md_ioctl_droplocks(IOLOCK *lock)
887 {
888 	mdi_unit_t	*ui;
889 	int		flags;
890 
891 	ASSERT(lock != NULL);
892 	ui = lock->l_ui;
893 	flags = lock->l_flags;
894 	if (flags & MD_READER_HELD) {
895 		lock->l_flags &= ~MD_READER_HELD;
896 		md_unit_readerexit(ui);
897 	}
898 	if (flags & MD_WRITER_HELD) {
899 		lock->l_flags &= ~MD_WRITER_HELD;
900 		md_unit_writerexit(ui);
901 	}
902 	if (flags & MD_IO_HELD) {
903 		lock->l_flags &= ~MD_IO_HELD;
904 		md_io_writerexit(ui);
905 	}
906 	if (flags & (MD_ARRAY_WRITER | MD_ARRAY_READER)) {
907 		lock->l_flags &= ~(MD_ARRAY_WRITER | MD_ARRAY_READER);
908 		rw_exit(&md_unit_array_rw.lock);
909 	}
910 }
911 
912 void
md_array_writer(IOLOCK * lock)913 md_array_writer(IOLOCK *lock)
914 {
915 	ASSERT(lock != NULL);
916 	lock->l_flags |= MD_ARRAY_WRITER;
917 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
918 }
919 
920 void
md_array_reader(IOLOCK * lock)921 md_array_reader(IOLOCK *lock)
922 {
923 	ASSERT(lock != NULL);
924 	lock->l_flags |= MD_ARRAY_READER;
925 	rw_enter(&md_unit_array_rw.lock, RW_READER);
926 }
927 
928 /*
929  * Called when in an ioctl and need opencloselock.
930  * Sets flags in lockp for READER_HELD.
931  */
932 void *
md_ioctl_openclose_enter(IOLOCK * lockp,mdi_unit_t * ui)933 md_ioctl_openclose_enter(IOLOCK *lockp, mdi_unit_t *ui)
934 {
935 	void	*un;
936 
937 	ASSERT(lockp != NULL);
938 	mutex_enter(&ui->ui_mx);
939 	while (ui->ui_lock & MD_UL_OPENORCLOSE)
940 		cv_wait(&ui->ui_cv, &ui->ui_mx);
941 	ui->ui_lock |= MD_UL_OPENORCLOSE;
942 
943 	/* Maintain mutex across the readerlock call */
944 	lockp->l_ui = ui;
945 	lockp->l_flags |= MD_READER_HELD;
946 	un = md_unit_readerlock_common(ui, 1);
947 	mutex_exit(&ui->ui_mx);
948 
949 	return (un);
950 }
951 
952 /*
953  * Clears reader lock using md_ioctl instead of md_unit
954  * and updates lockp.
955  */
956 void
md_ioctl_openclose_exit(IOLOCK * lockp)957 md_ioctl_openclose_exit(IOLOCK *lockp)
958 {
959 	mdi_unit_t	*ui;
960 
961 	ASSERT(lockp != NULL);
962 	ui = lockp->l_ui;
963 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
964 
965 	md_ioctl_readerexit(lockp);
966 
967 	mutex_enter(&ui->ui_mx);
968 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
969 
970 	cv_broadcast(&ui->ui_cv);
971 	mutex_exit(&ui->ui_mx);
972 }
973 
974 /*
975  * Clears reader lock using md_ioctl instead of md_unit
976  * and updates lockp.
977  * Does not acquire or release the ui_mx lock since the calling
978  * routine has already acquired this lock.
979  */
980 void
md_ioctl_openclose_exit_lh(IOLOCK * lockp)981 md_ioctl_openclose_exit_lh(IOLOCK *lockp)
982 {
983 	mdi_unit_t	*ui;
984 
985 	ASSERT(lockp != NULL);
986 	ui = lockp->l_ui;
987 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
988 
989 	lockp->l_flags &= ~MD_READER_HELD;
990 	md_unit_readerexit_common(lockp->l_ui, 1);
991 
992 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
993 	cv_broadcast(&ui->ui_cv);
994 }
995 
996 void *
md_unit_openclose_enter(mdi_unit_t * ui)997 md_unit_openclose_enter(mdi_unit_t *ui)
998 {
999 	void	*un;
1000 
1001 	mutex_enter(&ui->ui_mx);
1002 	while (ui->ui_lock & (MD_UL_OPENORCLOSE))
1003 		cv_wait(&ui->ui_cv, &ui->ui_mx);
1004 	ui->ui_lock |= MD_UL_OPENORCLOSE;
1005 
1006 	/* Maintain mutex across the readerlock call */
1007 	un = md_unit_readerlock_common(ui, 1);
1008 	mutex_exit(&ui->ui_mx);
1009 
1010 	return (un);
1011 }
1012 
1013 void
md_unit_openclose_exit(mdi_unit_t * ui)1014 md_unit_openclose_exit(mdi_unit_t *ui)
1015 {
1016 	md_unit_readerexit(ui);
1017 
1018 	mutex_enter(&ui->ui_mx);
1019 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1020 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1021 
1022 	cv_broadcast(&ui->ui_cv);
1023 	mutex_exit(&ui->ui_mx);
1024 }
1025 
1026 /*
1027  * Drop the openclose and readerlocks without acquiring or
1028  * releasing the ui_mx lock since the calling routine has
1029  * already acquired this lock.
1030  */
1031 void
md_unit_openclose_exit_lh(mdi_unit_t * ui)1032 md_unit_openclose_exit_lh(mdi_unit_t *ui)
1033 {
1034 	md_unit_readerexit_common(ui, 1);
1035 	ASSERT(ui->ui_lock & MD_UL_OPENORCLOSE);
1036 	ui->ui_lock &= ~MD_UL_OPENORCLOSE;
1037 	cv_broadcast(&ui->ui_cv);
1038 }
1039 
1040 int
md_unit_isopen(mdi_unit_t * ui)1041 md_unit_isopen(
1042 	mdi_unit_t	*ui
1043 )
1044 {
1045 	int		isopen;
1046 
1047 	/* check status */
1048 	mutex_enter(&ui->ui_mx);
1049 	isopen = ((ui->ui_lock & MD_UL_OPEN) ? 1 : 0);
1050 	mutex_exit(&ui->ui_mx);
1051 	return (isopen);
1052 }
1053 
1054 int
md_unit_incopen(minor_t mnum,int flag,int otyp)1055 md_unit_incopen(
1056 	minor_t		mnum,
1057 	int		flag,
1058 	int		otyp
1059 )
1060 {
1061 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1062 	int		err = 0;
1063 
1064 	/* check type and flags */
1065 	ASSERT(ui != NULL);
1066 	mutex_enter(&ui->ui_mx);
1067 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1068 		err = EINVAL;
1069 		goto out;
1070 	}
1071 	if (((flag & FEXCL) && (ui->ui_lock & MD_UL_OPEN)) ||
1072 	    (ui->ui_lock & MD_UL_EXCL)) {
1073 		err = EBUSY;
1074 		goto out;
1075 	}
1076 
1077 	/* count and flag open */
1078 	ui->ui_ocnt[otyp]++;
1079 	ui->ui_lock |= MD_UL_OPEN;
1080 	if (flag & FEXCL)
1081 		ui->ui_lock |= MD_UL_EXCL;
1082 
1083 	/* setup kstat, return success */
1084 	mutex_exit(&ui->ui_mx);
1085 	md_kstat_init(mnum);
1086 	return (0);
1087 
1088 	/* return error */
1089 out:
1090 	mutex_exit(&ui->ui_mx);
1091 	return (err);
1092 }
1093 
1094 int
md_unit_decopen(minor_t mnum,int otyp)1095 md_unit_decopen(
1096 	minor_t		mnum,
1097 	int		otyp
1098 )
1099 {
1100 	mdi_unit_t	*ui = MDI_UNIT(mnum);
1101 	int		err = 0;
1102 	unsigned	i;
1103 
1104 	/* check type and flags */
1105 	ASSERT(ui != NULL);
1106 	mutex_enter(&ui->ui_mx);
1107 	if ((otyp < 0) || (otyp >= OTYPCNT)) {
1108 		err = EINVAL;
1109 		goto out;
1110 	} else if (ui->ui_ocnt[otyp] == 0) {
1111 		err = ENXIO;
1112 		goto out;
1113 	}
1114 
1115 	/* count and flag closed */
1116 	if (otyp == OTYP_LYR)
1117 		ui->ui_ocnt[otyp]--;
1118 	else
1119 		ui->ui_ocnt[otyp] = 0;
1120 	ui->ui_lock &= ~MD_UL_OPEN;
1121 	for (i = 0; (i < OTYPCNT); ++i)
1122 		if (ui->ui_ocnt[i] != 0)
1123 			ui->ui_lock |= MD_UL_OPEN;
1124 	if (! (ui->ui_lock & MD_UL_OPEN))
1125 		ui->ui_lock &= ~MD_UL_EXCL;
1126 
1127 	/* teardown kstat, return success */
1128 	if (! (ui->ui_lock & MD_UL_OPEN)) {
1129 
1130 		/*
1131 		 * We have a race condition inherited from specfs between
1132 		 * open() and close() calls. This results in the kstat
1133 		 * for a pending I/O being torn down, and then a panic.
1134 		 * To avoid this, only tear the kstat down if there are
1135 		 * no other readers on this device.
1136 		 */
1137 		if (ui->ui_readercnt > 1) {
1138 			mutex_exit(&ui->ui_mx);
1139 		} else {
1140 			mutex_exit(&ui->ui_mx);
1141 			md_kstat_destroy(mnum);
1142 		}
1143 		return (0);
1144 	}
1145 
1146 	/* return success */
1147 out:
1148 	mutex_exit(&ui->ui_mx);
1149 	return (err);
1150 }
1151 
1152 md_dev64_t
md_xlate_targ_2_mini(md_dev64_t targ_devt)1153 md_xlate_targ_2_mini(md_dev64_t targ_devt)
1154 {
1155 	dev32_t		mini_32_devt, targ_32_devt;
1156 	int		i;
1157 
1158 	/*
1159 	 * check to see if we're in an upgrade situation
1160 	 * if we are not in upgrade just return the input device
1161 	 */
1162 
1163 	if (!MD_UPGRADE)
1164 		return (targ_devt);
1165 
1166 	targ_32_devt = md_cmpldev(targ_devt);
1167 
1168 	i = 0;
1169 	while (i != md_tuple_length) {
1170 		if (md_tuple_table[i].targ_devt == targ_32_devt) {
1171 			mini_32_devt = md_tuple_table[i].mini_devt;
1172 			return (md_expldev((md_dev64_t)mini_32_devt));
1173 		}
1174 		i++;
1175 	}
1176 	return (NODEV64);
1177 }
1178 
1179 md_dev64_t
md_xlate_mini_2_targ(md_dev64_t mini_devt)1180 md_xlate_mini_2_targ(md_dev64_t mini_devt)
1181 {
1182 	dev32_t		mini_32_devt, targ_32_devt;
1183 	int		i;
1184 
1185 	if (!MD_UPGRADE)
1186 		return (mini_devt);
1187 
1188 	mini_32_devt = md_cmpldev(mini_devt);
1189 
1190 	i = 0;
1191 	while (i != md_tuple_length) {
1192 		if (md_tuple_table[i].mini_devt == mini_32_devt) {
1193 			targ_32_devt = md_tuple_table[i].targ_devt;
1194 			return (md_expldev((md_dev64_t)targ_32_devt));
1195 		}
1196 		i++;
1197 	}
1198 	return (NODEV64);
1199 }
1200 
1201 void
md_xlate_free(int size)1202 md_xlate_free(int size)
1203 {
1204 	kmem_free(md_tuple_table, size);
1205 }
1206 
1207 char *
md_targ_major_to_name(major_t maj)1208 md_targ_major_to_name(major_t maj)
1209 {
1210 	char *drv_name = NULL;
1211 	int	i;
1212 
1213 	if (!MD_UPGRADE)
1214 		return (ddi_major_to_name(maj));
1215 
1216 	for (i = 0; i < md_majortab_len; i++) {
1217 		if (md_major_tuple_table[i].targ_maj == maj) {
1218 			drv_name = md_major_tuple_table[i].drv_name;
1219 			break;
1220 		}
1221 	}
1222 	return (drv_name);
1223 }
1224 
1225 major_t
md_targ_name_to_major(char * drv_name)1226 md_targ_name_to_major(char *drv_name)
1227 {
1228 	major_t maj;
1229 	int	i;
1230 
1231 	maj = md_getmajor(NODEV64);
1232 	if (!MD_UPGRADE)
1233 		return (ddi_name_to_major(drv_name));
1234 
1235 	for (i = 0; i < md_majortab_len; i++) {
1236 		if ((strcmp(md_major_tuple_table[i].drv_name,
1237 		    drv_name)) == 0) {
1238 			maj = md_major_tuple_table[i].targ_maj;
1239 			break;
1240 		}
1241 	}
1242 
1243 	return (maj);
1244 }
1245 
1246 void
md_majortab_free()1247 md_majortab_free()
1248 {
1249 	size_t	sz;
1250 	int	i;
1251 
1252 	for (i = 0; i < md_majortab_len; i++) {
1253 		freestr(md_major_tuple_table[i].drv_name);
1254 	}
1255 
1256 	sz = md_majortab_len * sizeof (struct md_xlate_major_table);
1257 	kmem_free(md_major_tuple_table, sz);
1258 }
1259 
1260 /* functions return a pointer to a function which returns an int */
1261 
1262 intptr_t (*
md_get_named_service(md_dev64_t dev,int modindex,char * name,intptr_t (* Default)())1263 md_get_named_service(md_dev64_t dev, int modindex, char *name,
1264 	intptr_t (*Default)()))()
1265 {
1266 	mdi_unit_t		*ui;
1267 	md_named_services_t	*sp;
1268 	int			i;
1269 
1270 	/*
1271 	 * Return the first named service found.
1272 	 * Use this path when it is known that there is only
1273 	 * one named service possible (e.g., hotspare interface)
1274 	 */
1275 	if ((dev == NODEV64) && (modindex == ANY_SERVICE)) {
1276 		for (i = 0; i < MD_NOPS; i++) {
1277 			if (md_ops[i] == NULL) {
1278 				continue;
1279 			}
1280 			sp = md_ops[i]->md_services;
1281 			if (sp == NULL)
1282 				continue;
1283 			while (sp->md_service != NULL) {
1284 				if (strcmp(name, sp->md_name) == 0)
1285 					return (sp->md_service);
1286 				sp++;
1287 			}
1288 		}
1289 		return (Default);
1290 	}
1291 
1292 	/*
1293 	 * Return the named service for the given modindex.
1294 	 * This is used if there are multiple possible named services
1295 	 * and each one needs to be called (e.g., poke hotspares)
1296 	 */
1297 	if (dev == NODEV64) {
1298 		if (modindex >= MD_NOPS)
1299 			return (Default);
1300 
1301 		if (md_ops[modindex] == NULL)
1302 			return (Default);
1303 
1304 		sp = md_ops[modindex]->md_services;
1305 		if (sp == NULL)
1306 			return (Default);
1307 
1308 		while (sp->md_service != NULL) {
1309 			if (strcmp(name, sp->md_name) == 0)
1310 				return (sp->md_service);
1311 			sp++;
1312 		}
1313 		return (Default);
1314 	}
1315 
1316 	/*
1317 	 * Return the named service for this md_dev64_t
1318 	 */
1319 	if (md_getmajor(dev) != md_major)
1320 		return (Default);
1321 
1322 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
1323 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
1324 		return (NULL);
1325 
1326 
1327 	if ((ui = MDI_UNIT(md_getminor(dev))) == NULL)
1328 		return (NULL);
1329 
1330 	sp = md_ops[ui->ui_opsindex]->md_services;
1331 	if (sp == NULL)
1332 		return (Default);
1333 	while (sp->md_service != NULL) {
1334 		if (strcmp(name, sp->md_name) == 0)
1335 			return (sp->md_service);
1336 		sp++;
1337 	}
1338 	return (Default);
1339 }
1340 
1341 /*
1342  * md_daemon callback routine
1343  */
1344 boolean_t
callb_md_cpr(void * arg,int code)1345 callb_md_cpr(void *arg, int code)
1346 {
1347 	callb_cpr_t *cp = (callb_cpr_t *)arg;
1348 	int ret = 0;				/* assume success */
1349 	clock_t delta;
1350 
1351 	mutex_enter(cp->cc_lockp);
1352 
1353 	switch (code) {
1354 	case CB_CODE_CPR_CHKPT:
1355 		/*
1356 		 * Check for active resync threads
1357 		 */
1358 		mutex_enter(&md_cpr_resync.md_resync_mutex);
1359 		if ((md_cpr_resync.md_mirror_resync > 0) ||
1360 		    (md_cpr_resync.md_raid_resync > 0)) {
1361 			mutex_exit(&md_cpr_resync.md_resync_mutex);
1362 			cmn_err(CE_WARN, "There are Solaris Volume Manager "
1363 			    "synchronization threads running.");
1364 			cmn_err(CE_WARN, "Please try system suspension at "
1365 			    "a later time.");
1366 			ret = -1;
1367 			break;
1368 		}
1369 		mutex_exit(&md_cpr_resync.md_resync_mutex);
1370 
1371 		cp->cc_events |= CALLB_CPR_START;
1372 		delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
1373 		while (!(cp->cc_events & CALLB_CPR_SAFE))
1374 			/* cv_reltimedwait() returns -1 if it times out. */
1375 			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
1376 			    cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
1377 				break;
1378 			break;
1379 
1380 	case CB_CODE_CPR_RESUME:
1381 		cp->cc_events &= ~CALLB_CPR_START;
1382 		cv_signal(&cp->cc_stop_cv);
1383 		break;
1384 	}
1385 	mutex_exit(cp->cc_lockp);
1386 	return (ret != -1);
1387 }
1388 
1389 void
md_daemon(int pass_thru,mdq_anchor_t * anchor)1390 md_daemon(int pass_thru, mdq_anchor_t *anchor)
1391 {
1392 	daemon_queue_t  *dq;
1393 	callb_cpr_t	cprinfo;
1394 
1395 	if (pass_thru && (md_get_status() & MD_GBL_DAEMONS_LIVE))
1396 		return;
1397 	/*
1398 	 * Register cpr callback
1399 	 */
1400 	CALLB_CPR_INIT(&cprinfo, &anchor->a_mx, callb_md_cpr, "md_daemon");
1401 
1402 	/*CONSTCOND*/
1403 	while (1) {
1404 		mutex_enter(&anchor->a_mx);
1405 		while ((dq = anchor->dq.dq_next) == &(anchor->dq)) {
1406 			if (pass_thru) {
1407 				/*
1408 				 * CALLB_CPR_EXIT Will do
1409 				 * mutex_exit(&anchor->a_mx)
1410 				 */
1411 				CALLB_CPR_EXIT(&cprinfo);
1412 				return;
1413 			}
1414 			if (md_get_status() & MD_GBL_DAEMONS_DIE) {
1415 				mutex_exit(&anchor->a_mx);
1416 				mutex_enter(&md_mx);
1417 				md_num_daemons--;
1418 				mutex_exit(&md_mx);
1419 				/*
1420 				 * CALLB_CPR_EXIT will do
1421 				 * mutex_exit(&anchor->a_mx)
1422 				 */
1423 				mutex_enter(&anchor->a_mx);
1424 				CALLB_CPR_EXIT(&cprinfo);
1425 				thread_exit();
1426 			}
1427 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1428 			cv_wait(&anchor->a_cv, &anchor->a_mx);
1429 			CALLB_CPR_SAFE_END(&cprinfo, &anchor->a_mx);
1430 		}
1431 		dq->dq_prev->dq_next = dq->dq_next;
1432 		dq->dq_next->dq_prev = dq->dq_prev;
1433 		dq->dq_prev = dq->dq_next = NULL;
1434 		anchor->dq.qlen--;
1435 		mutex_exit(&anchor->a_mx);
1436 		(*(dq->dq_call))(dq);
1437 	}
1438 	/*NOTREACHED*/
1439 }
1440 
1441 /*
1442  * daemon_request:
1443  *
1444  * Adds requests to appropriate requestq which is
1445  * anchored by *anchor.
1446  * The request is the first element of a doubly linked circular list.
1447  * When the request is a single element, the forward and backward
1448  * pointers MUST point to the element itself.
1449  */
1450 
1451 void
daemon_request(mdq_anchor_t * anchor,void (* func)(),daemon_queue_t * request,callstyle_t style)1452 daemon_request(mdq_anchor_t *anchor, void (*func)(),
1453 				daemon_queue_t *request, callstyle_t style)
1454 {
1455 	daemon_queue_t *rqtp;
1456 	int i = 0;
1457 
1458 	rqtp = request;
1459 	if (style == REQ_OLD) {
1460 		ASSERT((rqtp->dq_next == NULL) && (rqtp->dq_prev == NULL));
1461 		/* set it to the new style */
1462 		rqtp->dq_prev = rqtp->dq_next = rqtp;
1463 	}
1464 	ASSERT((rqtp->dq_next != NULL) && (rqtp->dq_prev != NULL));
1465 
1466 	/* scan the list and add the function to each element */
1467 
1468 	do {
1469 		rqtp->dq_call = func;
1470 		i++;
1471 		rqtp = rqtp->dq_next;
1472 	} while (rqtp != request);
1473 
1474 	/* save pointer to tail of the request list */
1475 	rqtp = request->dq_prev;
1476 
1477 	mutex_enter(&anchor->a_mx);
1478 	/* stats */
1479 	anchor->dq.qlen += i;
1480 	anchor->dq.treqs += i;
1481 	anchor->dq.maxq_len = (anchor->dq.qlen > anchor->dq.maxq_len) ?
1482 	    anchor->dq.qlen : anchor->dq.maxq_len;
1483 
1484 	/* now add the list to request queue */
1485 	request->dq_prev = anchor->dq.dq_prev;
1486 	rqtp->dq_next = &anchor->dq;
1487 	anchor->dq.dq_prev->dq_next = request;
1488 	anchor->dq.dq_prev = rqtp;
1489 	cv_broadcast(&anchor->a_cv);
1490 	mutex_exit(&anchor->a_mx);
1491 }
1492 
1493 void
mddb_commitrec_wrapper(mddb_recid_t recid)1494 mddb_commitrec_wrapper(mddb_recid_t recid)
1495 {
1496 	int sent_log = 0;
1497 	uint_t retry = md_retry_cnt;
1498 	set_t	setno;
1499 
1500 	while (mddb_commitrec(recid)) {
1501 		if (! sent_log) {
1502 			cmn_err(CE_WARN,
1503 			    "md: state database commit failed");
1504 			sent_log = 1;
1505 		}
1506 		delay(md_hz);
1507 
1508 		/*
1509 		 * Setting retry cnt to one (pre decremented) so that we
1510 		 * actually do no retries when committing/deleting a mddb rec.
1511 		 * The underlying disk driver does several retries to check
1512 		 * if the disk is really dead or not so there
1513 		 * is no reason for us to retry on top of the drivers retries.
1514 		 */
1515 
1516 		if (--retry == 0) {
1517 			setno = mddb_getsetnum(recid);
1518 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1519 				panic(
1520 				    "md: Panic due to lack of DiskSuite state\n"
1521 				    " database replicas. Fewer than 50%% of "
1522 				    "the total were available,\n so panic to "
1523 				    "ensure data integrity.");
1524 			} else {
1525 				panic("md: state database problem");
1526 			}
1527 			/*NOTREACHED*/
1528 		}
1529 	}
1530 }
1531 
1532 void
mddb_commitrecs_wrapper(mddb_recid_t * recids)1533 mddb_commitrecs_wrapper(mddb_recid_t *recids)
1534 {
1535 	int sent_log = 0;
1536 	uint_t retry = md_retry_cnt;
1537 	set_t	setno;
1538 
1539 	while (mddb_commitrecs(recids)) {
1540 		if (! sent_log) {
1541 			cmn_err(CE_WARN,
1542 			    "md: state database commit failed");
1543 			sent_log = 1;
1544 		}
1545 		delay(md_hz);
1546 
1547 		/*
1548 		 * Setting retry cnt to one (pre decremented) so that we
1549 		 * actually do no retries when committing/deleting a mddb rec.
1550 		 * The underlying disk driver does several retries to check
1551 		 * if the disk is really dead or not so there
1552 		 * is no reason for us to retry on top of the drivers retries.
1553 		 */
1554 
1555 		if (--retry == 0) {
1556 			/*
1557 			 * since all the records are part of the same set
1558 			 * use the first one to get setno
1559 			 */
1560 			setno = mddb_getsetnum(*recids);
1561 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1562 				panic(
1563 				    "md: Panic due to lack of DiskSuite state\n"
1564 				    " database replicas. Fewer than 50%% of "
1565 				    "the total were available,\n so panic to "
1566 				    "ensure data integrity.");
1567 			} else {
1568 				panic("md: state database problem");
1569 			}
1570 			/*NOTREACHED*/
1571 		}
1572 	}
1573 }
1574 
1575 void
mddb_deleterec_wrapper(mddb_recid_t recid)1576 mddb_deleterec_wrapper(mddb_recid_t recid)
1577 {
1578 	int sent_log = 0;
1579 	uint_t retry = md_retry_cnt;
1580 	set_t	setno;
1581 
1582 	while (mddb_deleterec(recid)) {
1583 		if (! sent_log) {
1584 			cmn_err(CE_WARN,
1585 			    "md: state database delete failed");
1586 			sent_log = 1;
1587 		}
1588 		delay(md_hz);
1589 
1590 		/*
1591 		 * Setting retry cnt to one (pre decremented) so that we
1592 		 * actually do no retries when committing/deleting a mddb rec.
1593 		 * The underlying disk driver does several retries to check
1594 		 * if the disk is really dead or not so there
1595 		 * is no reason for us to retry on top of the drivers retries.
1596 		 */
1597 
1598 		if (--retry == 0) {
1599 			setno = mddb_getsetnum(recid);
1600 			if (md_get_setstatus(setno) & MD_SET_TOOFEW) {
1601 				panic(
1602 				    "md: Panic due to lack of DiskSuite state\n"
1603 				    " database replicas. Fewer than 50%% of "
1604 				    "the total were available,\n so panic to "
1605 				    "ensure data integrity.");
1606 			} else {
1607 				panic("md: state database problem");
1608 			}
1609 			/*NOTREACHED*/
1610 		}
1611 	}
1612 }
1613 
1614 /*
1615  * md_holdset_enter is called in order to hold the set in its
1616  * current state (loaded, unloaded, snarfed, unsnarfed, etc)
1617  * until md_holdset_exit is called.  This is used by the mirror
1618  * code to mark the set as HOLD so that the set won't be
1619  * unloaded while hotspares are being allocated in check_4_hotspares.
1620  * The original fix to the mirror code to hold the set was to call
1621  * md_haltsnarf_enter, but this will block all ioctls and ioctls
1622  * must work for a MN diskset while hotspares are allocated.
1623  */
1624 void
md_holdset_enter(set_t setno)1625 md_holdset_enter(set_t setno)
1626 {
1627 	mutex_enter(&md_mx);
1628 	while (md_set[setno].s_status & MD_SET_HOLD)
1629 		cv_wait(&md_cv, &md_mx);
1630 	md_set[setno].s_status |= MD_SET_HOLD;
1631 	mutex_exit(&md_mx);
1632 }
1633 
1634 void
md_holdset_exit(set_t setno)1635 md_holdset_exit(set_t setno)
1636 {
1637 	mutex_enter(&md_mx);
1638 	md_set[setno].s_status &= ~MD_SET_HOLD;
1639 	cv_broadcast(&md_cv);
1640 	mutex_exit(&md_mx);
1641 }
1642 
1643 /*
1644  * Returns a 0 if this thread marked the set as HOLD (success),
1645  * returns a -1 if set was already marked HOLD (failure).
1646  * Used by the release_set code to see if set is marked HOLD.
1647  * HOLD is set by a daemon when hotspares are being allocated
1648  * to mirror units.
1649  */
1650 int
md_holdset_testandenter(set_t setno)1651 md_holdset_testandenter(set_t setno)
1652 {
1653 	mutex_enter(&md_mx);
1654 	if (md_set[setno].s_status & MD_SET_HOLD) {
1655 		mutex_exit(&md_mx);
1656 		return (-1);
1657 	}
1658 	md_set[setno].s_status |= MD_SET_HOLD;
1659 	mutex_exit(&md_mx);
1660 	return (0);
1661 }
1662 
1663 void
md_haltsnarf_enter(set_t setno)1664 md_haltsnarf_enter(set_t setno)
1665 {
1666 	mutex_enter(&md_mx);
1667 	while (md_set[setno].s_status & MD_SET_SNARFING)
1668 		cv_wait(&md_cv, &md_mx);
1669 
1670 	md_set[setno].s_status |= MD_SET_SNARFING;
1671 	mutex_exit(&md_mx);
1672 }
1673 
1674 void
md_haltsnarf_exit(set_t setno)1675 md_haltsnarf_exit(set_t setno)
1676 {
1677 	mutex_enter(&md_mx);
1678 	md_set[setno].s_status &= ~MD_SET_SNARFING;
1679 	cv_broadcast(&md_cv);
1680 	mutex_exit(&md_mx);
1681 }
1682 
1683 void
md_haltsnarf_wait(set_t setno)1684 md_haltsnarf_wait(set_t setno)
1685 {
1686 	mutex_enter(&md_mx);
1687 	while (md_set[setno].s_status & MD_SET_SNARFING)
1688 		cv_wait(&md_cv, &md_mx);
1689 	mutex_exit(&md_mx);
1690 }
1691 
1692 /*
1693  * ASSUMED that the md_unit_array_rw WRITER lock is held.
1694  */
1695 int
md_halt_set(set_t setno,enum md_haltcmd cmd)1696 md_halt_set(set_t setno, enum md_haltcmd cmd)
1697 {
1698 	int	i, err;
1699 
1700 	if (md_set[setno].s_un == NULL || md_set[setno].s_ui == NULL) {
1701 		return (0);
1702 	}
1703 
1704 	if ((cmd == MD_HALT_CHECK) || (cmd == MD_HALT_ALL)) {
1705 		for (i = 0; i < MD_NOPS; i++) {
1706 			if (md_ops[i] == NULL)
1707 				continue;
1708 			if ((*(md_ops[i]->md_halt))(MD_HALT_CLOSE, setno)) {
1709 				for (--i; i > 0; --i) {
1710 					if (md_ops[i] == NULL)
1711 						continue;
1712 					(void) (*(md_ops[i]->md_halt))
1713 					    (MD_HALT_OPEN, setno);
1714 				}
1715 				return (EBUSY);
1716 			}
1717 		}
1718 
1719 		for (i = 0; i < MD_NOPS; i++) {
1720 			if (md_ops[i] == NULL)
1721 				continue;
1722 			if ((*(md_ops[i]->md_halt))(MD_HALT_CHECK, setno)) {
1723 				for (i = 0; i < MD_NOPS; i++) {
1724 					if (md_ops[i] == NULL)
1725 						continue;
1726 					(void) (*(md_ops[i]->md_halt))
1727 					    (MD_HALT_OPEN, setno);
1728 				}
1729 				return (EBUSY);
1730 			}
1731 		}
1732 	}
1733 
1734 	if ((cmd == MD_HALT_DOIT) || (cmd == MD_HALT_ALL)) {
1735 		for (i = 0; i < MD_NOPS; i++) {
1736 			if (md_ops[i] == NULL)
1737 				continue;
1738 			err = (*(md_ops[i]->md_halt))(MD_HALT_DOIT, setno);
1739 			if (err != 0)
1740 				cmn_err(CE_NOTE,
1741 				    "md: halt failed for %s, error %d",
1742 				    md_ops[i]->md_driver.md_drivername, err);
1743 		}
1744 
1745 		/*
1746 		 * Unload the devid namespace if it is loaded
1747 		 */
1748 		md_unload_namespace(setno, NM_DEVID);
1749 		md_unload_namespace(setno, 0L);
1750 		md_clr_setstatus(setno, MD_SET_SNARFED);
1751 	}
1752 
1753 	return (0);
1754 }
1755 
1756 int
md_halt(int global_locks_owned_mask)1757 md_halt(int global_locks_owned_mask)
1758 {
1759 	set_t			i, j;
1760 	int			err;
1761 	int			init_queues;
1762 	md_requestq_entry_t	*rqp;
1763 	md_ops_t		**pops, *ops, *lops;
1764 	ddi_modhandle_t		mod;
1765 	char			*name;
1766 
1767 	rw_enter(&md_unit_array_rw.lock, RW_WRITER);
1768 
1769 	/*
1770 	 * Grab the all of the global locks that are not
1771 	 * already owned to ensure that there isn't another
1772 	 * thread trying to access a global resource
1773 	 * while the halt is in progress
1774 	 */
1775 	if (md_global_lock_enter(global_locks_owned_mask) == EINTR)
1776 		return (EINTR);
1777 
1778 	for (i = 0; i < md_nsets; i++)
1779 		md_haltsnarf_enter(i);
1780 
1781 	/*
1782 	 * Kill the daemon threads.
1783 	 */
1784 	init_queues = ((md_get_status() & MD_GBL_DAEMONS_LIVE) ? FALSE : TRUE);
1785 	md_clr_status(MD_GBL_DAEMONS_LIVE);
1786 	md_set_status(MD_GBL_DAEMONS_DIE);
1787 
1788 	rqp = &md_daemon_queues[0];
1789 	i = 0;
1790 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
1791 		cv_broadcast(&rqp->dispq_headp->a_cv);
1792 		rqp = &md_daemon_queues[++i];
1793 	}
1794 
1795 	mutex_enter(&md_mx);
1796 	while (md_num_daemons != 0) {
1797 		mutex_exit(&md_mx);
1798 		delay(md_hz);
1799 		mutex_enter(&md_mx);
1800 	}
1801 	mutex_exit(&md_mx);
1802 	md_clr_status(MD_GBL_DAEMONS_DIE);
1803 
1804 	for (i = 0; i < md_nsets; i++)
1805 		/*
1806 		 * Only call into md_halt_set if s_un / s_ui are both set.
1807 		 * If they are NULL this set hasn't been accessed, so its
1808 		 * pointless performing the call.
1809 		 */
1810 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1811 			if (md_halt_set(i, MD_HALT_CHECK)) {
1812 				if (md_start_daemons(init_queues))
1813 					cmn_err(CE_WARN,
1814 					    "md: restart of daemon threads "
1815 					    "failed");
1816 				for (j = 0; j < md_nsets; j++)
1817 					md_haltsnarf_exit(j);
1818 
1819 				return (md_global_lock_exit(
1820 				    global_locks_owned_mask, EBUSY,
1821 				    MD_ARRAY_WRITER, NULL));
1822 			}
1823 		}
1824 
1825 	/*
1826 	 * if we get here we are going to do it
1827 	 */
1828 	for (i = 0; i < md_nsets; i++) {
1829 		/*
1830 		 * Only call into md_halt_set if s_un / s_ui are both set.
1831 		 * If they are NULL this set hasn't been accessed, so its
1832 		 * pointless performing the call.
1833 		 */
1834 		if (md_set[i].s_un != NULL && md_set[i].s_ui != NULL) {
1835 			err = md_halt_set(i, MD_HALT_DOIT);
1836 			if (err != 0)
1837 				cmn_err(CE_NOTE,
1838 				    "md: halt failed set %u, error %d",
1839 				    (unsigned)i, err);
1840 		}
1841 	}
1842 
1843 	/*
1844 	 * issue a halt unload to each module to indicate that it
1845 	 * is about to be unloaded.  Each module is called once, set
1846 	 * has no meaning at this point in time.
1847 	 */
1848 	for (i = 0; i < MD_NOPS; i++) {
1849 		if (md_ops[i] == NULL)
1850 			continue;
1851 		err = (*(md_ops[i]->md_halt))(MD_HALT_UNLOAD, 0);
1852 		if (err != 0)
1853 			cmn_err(CE_NOTE,
1854 			    "md: halt failed for %s, error %d",
1855 			    md_ops[i]->md_driver.md_drivername, err);
1856 	}
1857 
1858 	/* ddi_modclose the submodules */
1859 	for (i = 0; i < MD_NOPS; i++) {
1860 		/* skip if not open */
1861 		if ((md_ops[i] == NULL) || (md_mods[i] == NULL))
1862 			continue;
1863 
1864 		/* find and unlink from md_opslist */
1865 		ops = md_ops[i];
1866 		mod = md_mods[i];
1867 		pops = &md_opslist;
1868 		for (lops = *pops; lops;
1869 		    pops = &lops->md_next, lops = *pops) {
1870 			if (lops == ops) {
1871 				*pops = ops->md_next;
1872 				ops->md_next = NULL;
1873 				break;
1874 			}
1875 		}
1876 
1877 		/* uninitialize */
1878 		name = ops->md_driver.md_drivername;
1879 		md_ops[i] = NULL;
1880 		md_mods[i] = NULL;
1881 		ops->md_selfindex = 0;
1882 		ops->md_driver.md_drivername[0] = '\0';
1883 		rw_destroy(&ops->md_link_rw.lock);
1884 
1885 		/* close */
1886 		err = ddi_modclose(mod);
1887 		if (err != 0)
1888 			cmn_err(CE_NOTE,
1889 			    "md: halt close failed for %s, error %d",
1890 			    name ? name : "UNKNOWN", err);
1891 	}
1892 
1893 	/* Unload the database */
1894 	mddb_unload();
1895 
1896 	md_set_status(MD_GBL_HALTED);	/* we are ready to be unloaded */
1897 
1898 	for (i = 0; i < md_nsets; i++)
1899 		md_haltsnarf_exit(i);
1900 
1901 	return (md_global_lock_exit(global_locks_owned_mask, 0,
1902 	    MD_ARRAY_WRITER, NULL));
1903 }
1904 
1905 /*
1906  * md_layered_open() is an internal routine only for SVM modules.
1907  * So the input device will be a md_dev64_t, because all SVM modules internally
1908  * work with that device type.
1909  * ddi routines on the other hand work with dev_t. So, if we call any ddi
1910  * routines from here we first have to convert that device into a dev_t.
1911  */
1912 
1913 int
md_layered_open(minor_t mnum,md_dev64_t * dev,int md_oflags)1914 md_layered_open(
1915 	minor_t		mnum,
1916 	md_dev64_t	*dev,
1917 	int		md_oflags
1918 )
1919 {
1920 	int		flag = (FREAD | FWRITE);
1921 	cred_t		*cred_p = kcred;
1922 	major_t		major;
1923 	int		err;
1924 	dev_t		ddi_dev = md_dev64_to_dev(*dev);
1925 
1926 	if (ddi_dev == NODEV)
1927 		return (ENODEV);
1928 
1929 	major = getmajor(ddi_dev);
1930 
1931 	/* metadevice */
1932 	if (major == md_major) {
1933 		mdi_unit_t	*ui;
1934 
1935 		/* open underlying driver */
1936 		mnum = getminor(ddi_dev);
1937 
1938 		ui = MDI_UNIT(mnum);
1939 		if (md_ops[ui->ui_opsindex]->md_open != NULL) {
1940 			int ret = (*md_ops[ui->ui_opsindex]->md_open)(&ddi_dev,
1941 			    flag, OTYP_LYR, cred_p, md_oflags);
1942 			/*
1943 			 * As open() may change the device,
1944 			 * send this info back to the caller.
1945 			 */
1946 			*dev = md_expldev(ddi_dev);
1947 			return (ret);
1948 		}
1949 
1950 		/* or do it ourselves */
1951 		(void) md_unit_openclose_enter(ui);
1952 		err = md_unit_incopen(mnum, flag, OTYP_LYR);
1953 		md_unit_openclose_exit(ui);
1954 		/* convert our ddi_dev back to the dev we were given */
1955 		*dev = md_expldev(ddi_dev);
1956 		return (err);
1957 	}
1958 
1959 	/*
1960 	 * Open regular device, since open() may change dev_t give new dev_t
1961 	 * back to the caller.
1962 	 */
1963 	err = dev_lopen(&ddi_dev, flag, OTYP_LYR, cred_p);
1964 	*dev = md_expldev(ddi_dev);
1965 	return (err);
1966 }
1967 
1968 /*
1969  * md_layered_close() is an internal routine only for SVM modules.
1970  * So the input device will be a md_dev64_t, because all SVM modules internally
1971  * work with that device type.
1972  * ddi routines on the other hand work with dev_t. So, if we call any ddi
1973  * routines from here we first have to convert that device into a dev_t.
1974  */
1975 void
md_layered_close(md_dev64_t dev,int md_cflags)1976 md_layered_close(
1977 	md_dev64_t	dev,
1978 	int		md_cflags
1979 )
1980 {
1981 	int		flag = (FREAD | FWRITE);
1982 	cred_t		*cred_p = kcred;
1983 	dev_t		ddi_dev = md_dev64_to_dev(dev);
1984 	major_t		major = getmajor(ddi_dev);
1985 	minor_t		mnum = getminor(ddi_dev);
1986 
1987 	/* metadevice */
1988 	if (major == md_major) {
1989 		mdi_unit_t	*ui = MDI_UNIT(mnum);
1990 
1991 		/* close underlying driver */
1992 		if (md_ops[ui->ui_opsindex]->md_close != NULL) {
1993 			(*md_ops[ui->ui_opsindex]->md_close)
1994 			    (ddi_dev, flag, OTYP_LYR, cred_p, md_cflags);
1995 			return;
1996 		}
1997 
1998 		/* or do it ourselves */
1999 		(void) md_unit_openclose_enter(ui);
2000 		(void) md_unit_decopen(mnum, OTYP_LYR);
2001 		md_unit_openclose_exit(ui);
2002 		return;
2003 	}
2004 
2005 	/* close regular device */
2006 	(void) dev_lclose(ddi_dev, flag, OTYP_LYR, cred_p);
2007 }
2008 
2009 /*
2010  * saves a little code in mdstrategy
2011  */
2012 int
errdone(mdi_unit_t * ui,struct buf * bp,int err)2013 errdone(mdi_unit_t *ui, struct buf *bp, int err)
2014 {
2015 	if ((bp->b_error = err) != 0)
2016 		bp->b_flags |= B_ERROR;
2017 	else
2018 		bp->b_resid = bp->b_bcount;
2019 	md_unit_readerexit(ui);
2020 	md_biodone(bp);
2021 	return (1);
2022 }
2023 
2024 static int	md_write_label = 0;
2025 
2026 int
md_checkbuf(mdi_unit_t * ui,md_unit_t * un,buf_t * bp)2027 md_checkbuf(mdi_unit_t *ui, md_unit_t *un, buf_t *bp)
2028 {
2029 	diskaddr_t endblk;
2030 	set_t	setno = MD_UN2SET(un);
2031 
2032 	if ((md_get_setstatus(setno) & MD_SET_STALE) &&
2033 	    (! (bp->b_flags & B_READ)))
2034 		return (errdone(ui, bp, EROFS));
2035 	/*
2036 	 * Check early for unreasonable block number.
2037 	 *
2038 	 * b_blkno is defined as adaddr_t which is typedef'd to a long.
2039 	 * A problem occurs if b_blkno has bit 31 set and un_total_blocks
2040 	 * doesn't, b_blkno is then compared as a negative number which is
2041 	 * always less than a positive.
2042 	 */
2043 	if ((u_longlong_t)bp->b_lblkno > (u_longlong_t)un->c.un_total_blocks)
2044 		return (errdone(ui, bp, EINVAL));
2045 
2046 	if (bp->b_lblkno == un->c.un_total_blocks)
2047 		return (errdone(ui, bp, 0));
2048 
2049 	/*
2050 	 * make sure we don't clobber any labels
2051 	 */
2052 	if ((bp->b_lblkno == 0) && (! (bp->b_flags & B_READ)) &&
2053 	    (un->c.un_flag & MD_LABELED) && (! md_write_label)) {
2054 		cmn_err(CE_NOTE, "md: %s: write to label",
2055 		    md_shortname(getminor(bp->b_edev)));
2056 		return (errdone(ui, bp, EINVAL));
2057 	}
2058 
2059 	bp->b_resid = 0;
2060 	endblk = (diskaddr_t)(bp->b_lblkno +
2061 	    howmany(bp->b_bcount, DEV_BSIZE) - 1);
2062 
2063 	if (endblk > (un->c.un_total_blocks - 1)) {
2064 		bp->b_resid = dbtob(endblk - (un->c.un_total_blocks - 1));
2065 		endblk = un->c.un_total_blocks - 1;
2066 		bp->b_bcount -= bp->b_resid;
2067 	}
2068 	return (0);
2069 }
2070 
2071 /*
2072  * init_request_queue: initializes the request queues and creates the threads.
2073  *	return value =  0  :invalid num_threads
2074  *		     =  n   : n is the number of threads created.
2075  */
2076 
2077 int
init_requestq(md_requestq_entry_t * rq,void (* threadfn)(),caddr_t threadfn_args,int pri,int init_queue)2078 init_requestq(
2079 	md_requestq_entry_t *rq, /* request queue info */
2080 	void (*threadfn)(),	 /* function to start the thread */
2081 	caddr_t threadfn_args,	 /* args to the function */
2082 	int pri,		 /* thread priority */
2083 	int init_queue)		 /* flag to init queues */
2084 {
2085 	struct mdq_anchor *rqhead;
2086 	int	i;
2087 	int	num_threads;
2088 
2089 
2090 	num_threads = *(rq->num_threadsp);
2091 	rqhead = rq->dispq_headp;
2092 
2093 	if (NULL_REQUESTQ_ENTRY(rq) || num_threads == 0)
2094 		return (0);
2095 
2096 	if (init_queue) {
2097 		rqhead->dq.maxq_len = 0;
2098 		rqhead->dq.treqs = 0;
2099 		rqhead->dq.dq_next = &rqhead->dq;
2100 		rqhead->dq.dq_prev = &rqhead->dq;
2101 		cv_init(&rqhead->a_cv, NULL, CV_DEFAULT, NULL);
2102 		mutex_init(&rqhead->a_mx, NULL, MUTEX_DEFAULT, NULL);
2103 	}
2104 	for (i = 0; i < num_threads; i++) {
2105 		(void) thread_create(NULL, 0, threadfn, threadfn_args, 0, &p0,
2106 		    TS_RUN, pri);
2107 	}
2108 	return (i);
2109 }
2110 
2111 static void
start_daemon(struct mdq_anchor * q)2112 start_daemon(struct mdq_anchor *q)
2113 {
2114 	md_daemon(0, q);
2115 	ASSERT(0);
2116 }
2117 
2118 /*
2119  * Creates all the md daemons.
2120  * Global:
2121  *	md_num_daemons is set to number of daemons.
2122  *	MD_GBL_DAEMONS_LIVE flag set to indicate the daemons are active.
2123  *
2124  * Return value: 0  success
2125  *		 1  failure
2126  */
2127 int
md_start_daemons(int init_queue)2128 md_start_daemons(int init_queue)
2129 {
2130 	md_requestq_entry_t	*rqp;
2131 	int	cnt;
2132 	int	i;
2133 	int	retval = 0;
2134 
2135 
2136 	if (md_get_status() & MD_GBL_DAEMONS_LIVE) {
2137 		return (retval);
2138 	}
2139 	md_clr_status(MD_GBL_DAEMONS_DIE);
2140 
2141 	rqp = &md_daemon_queues[0];
2142 	i = 0;
2143 	while (!NULL_REQUESTQ_ENTRY(rqp)) {
2144 		cnt = init_requestq(rqp, start_daemon,
2145 		    (caddr_t)rqp->dispq_headp, minclsyspri, init_queue);
2146 
2147 		if (cnt && cnt != *rqp->num_threadsp) {
2148 			retval = 1;
2149 			break;
2150 		}
2151 		/*
2152 		 * initialize variables
2153 		 */
2154 		md_num_daemons += cnt;
2155 		rqp = &md_daemon_queues[++i];
2156 	}
2157 
2158 	md_set_status(MD_GBL_DAEMONS_LIVE);
2159 	return (retval);
2160 }
2161 
2162 int
md_loadsubmod(set_t setno,char * name,int drvrid)2163 md_loadsubmod(set_t setno, char *name, int drvrid)
2164 {
2165 	ddi_modhandle_t	mod;
2166 	md_ops_t	**pops, *ops;
2167 	int		i, err;
2168 
2169 	/*
2170 	 * See if the submodule is mdopened. If not, i is the index of the
2171 	 * next empty slot.
2172 	 */
2173 	for (i = 0; md_ops[i] != NULL; i++) {
2174 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2175 		    MD_DRIVERNAMELEN) == 0)
2176 			return (i);
2177 
2178 		if (i == (MD_NOPS - 1))
2179 			return (-1);
2180 	}
2181 
2182 	if (drvrid < 0) {
2183 		/* Do not try to add any records to the DB when stale. */
2184 		if (md_get_setstatus(setno) & MD_SET_STALE)
2185 			return (-1);
2186 		drvrid = md_setshared_name(setno, name, 0L);
2187 	}
2188 
2189 	if (drvrid < 0)
2190 		return (-1);
2191 
2192 	/* open and import the md_ops of the submodules */
2193 	mod = ddi_modopen(name, KRTLD_MODE_FIRST, &err);
2194 	if (mod == NULL) {
2195 		cmn_err(CE_WARN, "md_loadsubmod: "
2196 		    "unable to ddi_modopen %s, error %d\n", name, err);
2197 		return (-1);
2198 	}
2199 	pops = ddi_modsym(mod, "md_interface_ops", &err);
2200 	if (pops == NULL) {
2201 		cmn_err(CE_WARN, "md_loadsubmod: "
2202 		    "unable to import md_interface_ops from %s, error %d\n",
2203 		    name, err);
2204 		(void) ddi_modclose(mod);
2205 		return (-1);
2206 	}
2207 
2208 	/* ddi_modsym returns pointer to md_interface_ops in submod */
2209 	ops = *pops;
2210 
2211 	/* initialize */
2212 	ops->md_selfindex = i;
2213 	rw_init(&ops->md_link_rw.lock, NULL, RW_DEFAULT, NULL);
2214 	(void) strncpy(ops->md_driver.md_drivername, name,
2215 	    MD_DRIVERNAMELEN);
2216 
2217 	/* plumb */
2218 	md_ops[i] = ops;
2219 	md_mods[i] = mod;
2220 	ops->md_next = md_opslist;
2221 	md_opslist = ops;
2222 
2223 	/* return index */
2224 	return (i);
2225 }
2226 
2227 int
md_getmodindex(md_driver_t * driver,int dont_load,int db_notrequired)2228 md_getmodindex(md_driver_t *driver, int dont_load, int db_notrequired)
2229 {
2230 	int	i;
2231 	int	modindex;
2232 	char	*name = driver->md_drivername;
2233 	set_t	setno = driver->md_setno;
2234 	int	drvid;
2235 	int	local_dont_load;
2236 
2237 	if (setno >= md_nsets)
2238 		return (-1);
2239 
2240 	for (i = 0; name[i] != 0; i++)
2241 		if (i == (MD_DRIVERNAMELEN -1))
2242 			return (-1);
2243 
2244 	/*
2245 	 * If set is STALE, set local_dont_load to 1 since no records
2246 	 * should be added to DB when stale.
2247 	 */
2248 	if (md_get_setstatus(setno) & MD_SET_STALE) {
2249 		local_dont_load = 1;
2250 	} else {
2251 		local_dont_load = dont_load;
2252 	}
2253 
2254 	/*
2255 	 * Single thread ioctl module binding with respect to
2256 	 * similar code executed in md_loadsubmod that is called
2257 	 * from md_snarf_db_set (which is where that path does
2258 	 * its md_haltsnarf_enter call).
2259 	 */
2260 	md_haltsnarf_enter(setno);
2261 
2262 	/* See if the submodule is already ddi_modopened. */
2263 	for (i = 0; md_ops[i] != NULL; i++) {
2264 		if (strncmp(name, md_ops[i]->md_driver.md_drivername,
2265 		    MD_DRIVERNAMELEN) == 0) {
2266 			if (! local_dont_load &&
2267 			    (md_getshared_key(setno, name) == MD_KEYBAD)) {
2268 				if (md_setshared_name(setno, name, 0L)
2269 				    == MD_KEYBAD) {
2270 					if (!db_notrequired)
2271 						goto err;
2272 				}
2273 			}
2274 			md_haltsnarf_exit(setno);
2275 			return (i);
2276 		}
2277 
2278 		if (i == (MD_NOPS -1))
2279 			break;
2280 	}
2281 
2282 	if (local_dont_load)
2283 		goto err;
2284 
2285 	drvid = ((db_notrequired) ? 0 : (int)md_getshared_key(setno, name));
2286 
2287 	/* ddi_modopen the submodule */
2288 	modindex = md_loadsubmod(setno, name, drvid);
2289 	if (modindex < 0)
2290 		goto err;
2291 
2292 	if (md_ops[modindex]->md_snarf != NULL)
2293 		(*(md_ops[modindex]->md_snarf))(MD_SNARF_DOIT, setno);
2294 
2295 	md_haltsnarf_exit(setno);
2296 	return (modindex);
2297 
2298 err:	md_haltsnarf_exit(setno);
2299 	return (-1);
2300 }
2301 
2302 void
md_call_strategy(buf_t * bp,int flags,void * private)2303 md_call_strategy(buf_t *bp, int flags, void *private)
2304 {
2305 	mdi_unit_t	*ui;
2306 
2307 	if (mdv_strategy_tstpnt)
2308 		if ((*mdv_strategy_tstpnt)(bp, flags, private) != 0)
2309 			return;
2310 	if (getmajor(bp->b_edev) != md_major) {
2311 		(void) bdev_strategy(bp);
2312 		return;
2313 	}
2314 
2315 	flags = (flags & MD_STR_PASSEDON) | MD_STR_NOTTOP;
2316 	ui = MDI_UNIT(getminor(bp->b_edev));
2317 	ASSERT(ui != NULL);
2318 	(*md_ops[ui->ui_opsindex]->md_strategy)(bp, flags, private);
2319 }
2320 
2321 /*
2322  * md_call_ioctl:
2323  * -------------
2324  * Issue the specified ioctl to the device associated with the given md_dev64_t
2325  *
2326  * Arguments:
2327  *	dev	- underlying device [md_dev64_t]
2328  *	cmd	- ioctl to perform
2329  *	data	- arguments / result location
2330  *	mode	- read/write/layered ioctl
2331  *	lockp	- lock reference
2332  *
2333  * Returns:
2334  *	0	success
2335  *	!=0	Failure (error code)
2336  */
2337 int
md_call_ioctl(md_dev64_t dev,int cmd,void * data,int mode,IOLOCK * lockp)2338 md_call_ioctl(md_dev64_t dev, int cmd, void *data, int mode, IOLOCK *lockp)
2339 {
2340 	dev_t		device = md_dev64_to_dev(dev);
2341 	int		rval;
2342 	mdi_unit_t	*ui;
2343 
2344 	/*
2345 	 * See if device is a metadevice. If not call cdev_ioctl(), otherwise
2346 	 * call the ioctl entry-point in the metadevice.
2347 	 */
2348 	if (md_getmajor(dev) != md_major) {
2349 		int	rv;
2350 		rval = cdev_ioctl(device, cmd, (intptr_t)data, mode,
2351 		    ddi_get_cred(), &rv);
2352 	} else {
2353 		ui = MDI_UNIT(md_getminor(dev));
2354 		ASSERT(ui != NULL);
2355 		rval = (*md_ops[ui->ui_opsindex]->md_ioctl)(device, cmd, data,
2356 		    mode, lockp);
2357 	}
2358 	return (rval);
2359 }
2360 
2361 void
md_rem_link(set_t setno,int id,krwlock_t * rw,md_link_t ** head)2362 md_rem_link(set_t setno, int id, krwlock_t *rw, md_link_t **head)
2363 {
2364 	md_link_t	*next;
2365 	md_link_t	**pprev;
2366 
2367 	rw_enter(rw, RW_WRITER);
2368 
2369 	next = *head;
2370 	pprev = head;
2371 	while (next) {
2372 		if ((next->ln_setno == setno) && (next->ln_id == id)) {
2373 			*pprev = next->ln_next;
2374 			rw_exit(rw);
2375 			return;
2376 		}
2377 		pprev = &next->ln_next;
2378 		next = next->ln_next;
2379 	}
2380 
2381 	rw_exit(rw);
2382 }
2383 
2384 int
md_dev_exists(md_dev64_t dev)2385 md_dev_exists(md_dev64_t dev)
2386 {
2387 
2388 	if (dev == NODEV64)
2389 		return (0);
2390 
2391 	if (strcmp(ddi_major_to_name(md_getmajor(dev)), "md") != 0)
2392 		return (1);
2393 
2394 	if ((MD_MIN2SET(md_getminor(dev)) >= md_nsets) ||
2395 	    (MD_MIN2UNIT(md_getminor(dev)) >= md_nunits))
2396 		return (0);
2397 
2398 	if (MDI_UNIT(md_getminor(dev)) != NULL)
2399 		return (1);
2400 
2401 	return (0);
2402 }
2403 
2404 md_parent_t
md_get_parent(md_dev64_t dev)2405 md_get_parent(md_dev64_t dev)
2406 {
2407 	md_unit_t	*un;
2408 	mdi_unit_t	*ui;
2409 	md_parent_t	parent;
2410 
2411 	if (md_getmajor(dev) != md_major)
2412 		return (MD_NO_PARENT);
2413 
2414 	ui = MDI_UNIT(md_getminor(dev));
2415 
2416 	un = (md_unit_t *)md_unit_readerlock(ui);
2417 	parent = un->c.un_parent;
2418 	md_unit_readerexit(ui);
2419 
2420 	return (parent);
2421 }
2422 
2423 void
md_set_parent(md_dev64_t dev,md_parent_t parent)2424 md_set_parent(md_dev64_t dev, md_parent_t parent)
2425 {
2426 	md_unit_t	*un;
2427 	mdi_unit_t	*ui;
2428 
2429 	if (md_getmajor(dev) != md_major)
2430 		return;
2431 
2432 	ui = MDI_UNIT(md_getminor(dev));
2433 
2434 	un = (md_unit_t *)md_unit_readerlock(ui);
2435 	un->c.un_parent = parent;
2436 	md_unit_readerexit(ui);
2437 }
2438 
2439 void
md_reset_parent(md_dev64_t dev)2440 md_reset_parent(md_dev64_t dev)
2441 {
2442 	md_unit_t	*un;
2443 	mdi_unit_t	*ui;
2444 
2445 	if (md_getmajor(dev) != md_major)
2446 		return;
2447 
2448 	ui = MDI_UNIT(md_getminor(dev));
2449 
2450 	un = (md_unit_t *)md_unit_readerlock(ui);
2451 	un->c.un_parent = MD_NO_PARENT;
2452 	md_unit_readerexit(ui);
2453 }
2454 
2455 
2456 static intptr_t (*hot_spare_interface)() = (intptr_t (*)())NULL;
2457 
2458 int
md_hot_spare_ifc(hs_cmds_t cmd,mddb_recid_t id,u_longlong_t size,int labeled,mddb_recid_t * hs_id,mdkey_t * key,md_dev64_t * dev,diskaddr_t * sblock)2459 md_hot_spare_ifc(
2460 	hs_cmds_t	cmd,
2461 	mddb_recid_t	id,
2462 	u_longlong_t	size,
2463 	int		labeled,
2464 	mddb_recid_t	*hs_id,
2465 	mdkey_t		*key,
2466 	md_dev64_t	*dev,
2467 	diskaddr_t	*sblock)
2468 {
2469 	int		err;
2470 
2471 	/*
2472 	 * RW lock on hot_spare_interface. We don't want it to change from
2473 	 * underneath us. If hot_spare_interface is NULL we're going to
2474 	 * need to set it. So we need to upgrade to a WRITER lock. If that
2475 	 * doesn't work, we drop the lock and reenter as WRITER. This leaves
2476 	 * a small hole during which hot_spare_interface could be modified
2477 	 * so we check it for NULL again. What a pain. Then if still null
2478 	 * load from md_get_named_service.
2479 	 */
2480 
2481 	rw_enter(&hsp_rwlp.lock, RW_READER);
2482 	if (hot_spare_interface == NULL) {
2483 		if (rw_tryupgrade(&hsp_rwlp.lock) == 0) {
2484 			rw_exit(&hsp_rwlp.lock);
2485 			rw_enter(&hsp_rwlp.lock, RW_WRITER);
2486 			if (hot_spare_interface != NULL) {
2487 				err = ((*hot_spare_interface)
2488 				    (cmd, id, size, labeled, hs_id, key, dev,
2489 				    sblock));
2490 				rw_exit(&hsp_rwlp.lock);
2491 				return (err);
2492 			}
2493 		}
2494 		hot_spare_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2495 		    "hot spare interface", 0);
2496 		rw_downgrade(&hsp_rwlp.lock);
2497 	}
2498 
2499 	if (hot_spare_interface == NULL) {
2500 		cmn_err(CE_WARN, "md: no hotspare interface");
2501 		rw_exit(&hsp_rwlp.lock);
2502 		return (0);
2503 	}
2504 
2505 	err = ((*hot_spare_interface)
2506 	    (cmd, id, size, labeled, hs_id, key, dev, sblock));
2507 	rw_exit(&hsp_rwlp.lock);
2508 	return (err);
2509 }
2510 
2511 void
md_clear_hot_spare_interface()2512 md_clear_hot_spare_interface()
2513 {
2514 	rw_enter(&hsp_rwlp.lock, RW_WRITER);
2515 	hot_spare_interface = NULL;
2516 	rw_exit(&hsp_rwlp.lock);
2517 }
2518 
2519 
2520 static intptr_t (*notify_interface)() = (intptr_t (*)())NULL;
2521 
2522 int
md_notify_interface(md_event_cmds_t cmd,md_tags_t tag,set_t set,md_dev64_t dev,md_event_type_t event)2523 md_notify_interface(
2524 	md_event_cmds_t cmd,
2525 	md_tags_t	tag,
2526 	set_t		set,
2527 	md_dev64_t	dev,
2528 	md_event_type_t event
2529 )
2530 {
2531 	int		err;
2532 
2533 	if (md_event_queue == NULL)
2534 		return (0);
2535 	rw_enter(&ni_rwlp.lock, RW_READER);
2536 	if (notify_interface == NULL) {
2537 		if (rw_tryupgrade(&ni_rwlp.lock) == 0) {
2538 			rw_exit(&ni_rwlp.lock);
2539 			rw_enter(&ni_rwlp.lock, RW_WRITER);
2540 			if (notify_interface != NULL) {
2541 				err = ((*notify_interface)
2542 				    (cmd, tag, set, dev, event));
2543 				rw_exit(&ni_rwlp.lock);
2544 				return (err);
2545 			}
2546 		}
2547 		notify_interface = md_get_named_service(NODEV64, ANY_SERVICE,
2548 		    "notify interface", 0);
2549 		rw_downgrade(&ni_rwlp.lock);
2550 	}
2551 	if (notify_interface == NULL) {
2552 		cmn_err(CE_WARN, "md: no notify interface");
2553 		rw_exit(&ni_rwlp.lock);
2554 		return (0);
2555 	}
2556 	err = ((*notify_interface)(cmd, tag, set, dev, event));
2557 	rw_exit(&ni_rwlp.lock);
2558 	return (err);
2559 }
2560 
2561 char *
obj2devname(uint32_t tag,uint_t setno,md_dev64_t dev)2562 obj2devname(uint32_t tag, uint_t setno, md_dev64_t dev)
2563 {
2564 	char		*setname;
2565 	char		name[MD_MAX_CTDLEN];
2566 	minor_t		mnum = md_getminor(dev);
2567 	major_t		maj = md_getmajor(dev);
2568 	int		rtn = 0;
2569 
2570 	/*
2571 	 * Verify that the passed dev_t refers to a valid metadevice.
2572 	 * If it doesn't we can make no assumptions as to what the device
2573 	 * name is. Return NULL in these cases.
2574 	 */
2575 	if (((maj != md_major) || (MD_MIN2UNIT(mnum) >= md_nunits)) ||
2576 	    (MD_MIN2SET(mnum) >= md_nsets)) {
2577 		return (NULL);
2578 	}
2579 
2580 	setname = NULL;
2581 	name[0] = '\0';
2582 	switch (tag) {
2583 	case SVM_TAG_HSP:
2584 		if (setno == 0) {
2585 			rtn = snprintf(name, sizeof (name), "hsp%u",
2586 			    (unsigned)MD_MIN2UNIT(mnum));
2587 		} else {
2588 			setname = mddb_getsetname(setno);
2589 			if (setname != NULL) {
2590 				rtn = snprintf(name, sizeof (name), "%s/hsp%u",
2591 				    setname, (unsigned)MD_MIN2UNIT(mnum));
2592 			}
2593 		}
2594 		break;
2595 	case SVM_TAG_DRIVE:
2596 		(void) sprintf(name, "drive");
2597 		break;
2598 	case SVM_TAG_HOST:
2599 		(void) sprintf(name, "host");
2600 		break;
2601 	case SVM_TAG_SET:
2602 		rtn = snprintf(name, sizeof (name), "%s",
2603 		    mddb_getsetname(setno));
2604 		if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2605 			(void) sprintf(name, "diskset");
2606 			rtn = 0;
2607 		}
2608 		break;
2609 	default:
2610 		rtn = snprintf(name, sizeof (name), "%s", md_shortname(mnum));
2611 		break;
2612 	}
2613 
2614 	/* Check if we got any rubbish for any of the snprintf's */
2615 	if ((name[0] == '\0') || (rtn >= sizeof (name))) {
2616 		return (NULL);
2617 	}
2618 
2619 	return (md_strdup(name));
2620 }
2621 
2622 /* Sysevent subclass and mdnotify event type pairs */
2623 struct node {
2624 	char		*se_ev;
2625 	md_event_type_t	md_ev;
2626 };
2627 
2628 /*
2629  * Table must be sorted in case sensitive ascending order of
2630  * the sysevents values
2631  */
2632 static struct node ev_table[] = {
2633 	{ ESC_SVM_ADD,			EQ_ADD },
2634 	{ ESC_SVM_ATTACH,		EQ_ATTACH },
2635 	{ ESC_SVM_ATTACHING,		EQ_ATTACHING },
2636 	{ ESC_SVM_CHANGE,		EQ_CHANGE },
2637 	{ ESC_SVM_CREATE,		EQ_CREATE },
2638 	{ ESC_SVM_DELETE,		EQ_DELETE },
2639 	{ ESC_SVM_DETACH,		EQ_DETACH },
2640 	{ ESC_SVM_DETACHING,		EQ_DETACHING },
2641 	{ ESC_SVM_DRIVE_ADD,		EQ_DRIVE_ADD },
2642 	{ ESC_SVM_DRIVE_DELETE,		EQ_DRIVE_DELETE },
2643 	{ ESC_SVM_ENABLE,		EQ_ENABLE },
2644 	{ ESC_SVM_ERRED,		EQ_ERRED },
2645 	{ ESC_SVM_EXCHANGE,		EQ_EXCHANGE },
2646 	{ ESC_SVM_GROW,			EQ_GROW },
2647 	{ ESC_SVM_HS_CHANGED,		EQ_HS_CHANGED },
2648 	{ ESC_SVM_HS_FREED,		EQ_HS_FREED },
2649 	{ ESC_SVM_HOST_ADD,		EQ_HOST_ADD },
2650 	{ ESC_SVM_HOST_DELETE,		EQ_HOST_DELETE },
2651 	{ ESC_SVM_HOTSPARED,		EQ_HOTSPARED },
2652 	{ ESC_SVM_INIT_FAILED,		EQ_INIT_FAILED },
2653 	{ ESC_SVM_INIT_FATAL,		EQ_INIT_FATAL },
2654 	{ ESC_SVM_INIT_START,		EQ_INIT_START },
2655 	{ ESC_SVM_INIT_SUCCESS,		EQ_INIT_SUCCESS },
2656 	{ ESC_SVM_IOERR,		EQ_IOERR },
2657 	{ ESC_SVM_LASTERRED,		EQ_LASTERRED },
2658 	{ ESC_SVM_MEDIATOR_ADD,		EQ_MEDIATOR_ADD },
2659 	{ ESC_SVM_MEDIATOR_DELETE,	EQ_MEDIATOR_DELETE },
2660 	{ ESC_SVM_OFFLINE,		EQ_OFFLINE },
2661 	{ ESC_SVM_OK,			EQ_OK },
2662 	{ ESC_SVM_ONLINE,		EQ_ONLINE },
2663 	{ ESC_SVM_OPEN_FAIL,		EQ_OPEN_FAIL },
2664 	{ ESC_SVM_REGEN_DONE,		EQ_REGEN_DONE },
2665 	{ ESC_SVM_REGEN_FAILED,		EQ_REGEN_FAILED },
2666 	{ ESC_SVM_REGEN_START,		EQ_REGEN_START },
2667 	{ ESC_SVM_RELEASE,		EQ_RELEASE },
2668 	{ ESC_SVM_REMOVE,		EQ_REMOVE },
2669 	{ ESC_SVM_RENAME_DST,		EQ_RENAME_DST },
2670 	{ ESC_SVM_RENAME_SRC,		EQ_RENAME_SRC },
2671 	{ ESC_SVM_REPLACE,		EQ_REPLACE },
2672 	{ ESC_SVM_RESYNC_DONE,		EQ_RESYNC_DONE },
2673 	{ ESC_SVM_RESYNC_FAILED,	EQ_RESYNC_FAILED },
2674 	{ ESC_SVM_RESYNC_START,		EQ_RESYNC_START },
2675 	{ ESC_SVM_RESYNC_SUCCESS,	EQ_RESYNC_SUCCESS },
2676 	{ ESC_SVM_TAKEOVER,		EQ_TAKEOVER }
2677 };
2678 
2679 static md_tags_t md_tags[] = {
2680 	TAG_UNK,
2681 	TAG_METADEVICE,
2682 	TAG_UNK,
2683 	TAG_UNK,
2684 	TAG_UNK,
2685 	TAG_UNK,
2686 	TAG_REPLICA,
2687 	TAG_HSP,
2688 	TAG_HS,
2689 	TAG_SET,
2690 	TAG_DRIVE,
2691 	TAG_HOST,
2692 	TAG_MEDIATOR
2693 };
2694 
2695 md_event_type_t
ev_get(char * subclass)2696 ev_get(char *subclass)
2697 {
2698 	int	high, mid, low, p;
2699 
2700 	low = 0;
2701 	high = (sizeof (ev_table) / sizeof (ev_table[0])) - 1;
2702 	while (low <= high) {
2703 		mid = (high + low) / 2;
2704 		p = strcmp(subclass, ev_table[mid].se_ev);
2705 		if (p == 0) {
2706 			return (ev_table[mid].md_ev);
2707 		} else if (p < 0) {
2708 			high = mid - 1;
2709 		} else {
2710 			low = mid + 1;
2711 		}
2712 	}
2713 
2714 	return (EQ_EMPTY);
2715 }
2716 
2717 /*
2718  * Log mdnotify event
2719  */
2720 void
do_mdnotify(char * se_subclass,uint32_t tag,set_t setno,md_dev64_t devid)2721 do_mdnotify(char *se_subclass, uint32_t tag, set_t setno, md_dev64_t devid)
2722 {
2723 	md_event_type_t	ev_type;
2724 	md_tags_t	md_tag;
2725 
2726 	/* Translate sysevent into mdnotify event */
2727 	ev_type = ev_get(se_subclass);
2728 
2729 	if (tag >= (sizeof (md_tags) / sizeof (md_tags[0]))) {
2730 		md_tag = TAG_UNK;
2731 	} else {
2732 		md_tag = md_tags[tag];
2733 	}
2734 
2735 	NOTIFY_MD(md_tag, setno, devid, ev_type);
2736 }
2737 
2738 /*
2739  * Log SVM sys events
2740  */
2741 void
svm_gen_sysevent(char * se_class,char * se_subclass,uint32_t tag,set_t setno,md_dev64_t devid)2742 svm_gen_sysevent(
2743 	char		*se_class,
2744 	char		*se_subclass,
2745 	uint32_t	tag,
2746 	set_t		setno,
2747 	md_dev64_t	devid
2748 )
2749 {
2750 	nvlist_t		*attr_list;
2751 	sysevent_id_t		eid;
2752 	int			err = DDI_SUCCESS;
2753 	char			*devname;
2754 	extern dev_info_t	*md_devinfo;
2755 
2756 	/* Raise the mdnotify event before anything else */
2757 	do_mdnotify(se_subclass, tag, setno, devid);
2758 
2759 	if (md_devinfo == NULL) {
2760 		return;
2761 	}
2762 
2763 	err = nvlist_alloc(&attr_list, NV_UNIQUE_NAME, KM_NOSLEEP);
2764 
2765 	if (err == DDI_SUCCESS) {
2766 		/* Add the version numver */
2767 		err = nvlist_add_uint32(attr_list, SVM_VERSION_NO,
2768 		    (uint32_t)SVM_VERSION);
2769 		if (err != DDI_SUCCESS) {
2770 			goto fail;
2771 		}
2772 
2773 		/* Add the tag attribute */
2774 		err = nvlist_add_uint32(attr_list, SVM_TAG, (uint32_t)tag);
2775 		if (err != DDI_SUCCESS) {
2776 			goto fail;
2777 		}
2778 
2779 		/* Add the set number attribute */
2780 		err = nvlist_add_uint32(attr_list, SVM_SET_NO, (uint32_t)setno);
2781 		if (err != DDI_SUCCESS) {
2782 			goto fail;
2783 		}
2784 
2785 		/* Add the device id attribute */
2786 		err = nvlist_add_uint64(attr_list, SVM_DEV_ID, (uint64_t)devid);
2787 		if (err != DDI_SUCCESS) {
2788 			goto fail;
2789 		}
2790 
2791 		/* Add the device name attribute */
2792 		devname = obj2devname(tag, setno, devid);
2793 		if (devname != NULL) {
2794 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2795 			    devname);
2796 			freestr(devname);
2797 		} else {
2798 			err = nvlist_add_string(attr_list, SVM_DEV_NAME,
2799 			    "unspecified");
2800 		}
2801 		if (err != DDI_SUCCESS) {
2802 			goto fail;
2803 		}
2804 
2805 		/* Attempt to post event */
2806 		err = ddi_log_sysevent(md_devinfo, DDI_VENDOR_SUNW, se_class,
2807 		    se_subclass, attr_list, &eid, DDI_SLEEP);
2808 
2809 		nvlist_free(attr_list);
2810 		if (err != DDI_SUCCESS) {
2811 			cmn_err(CE_WARN, "Failed to log event for %s, %s,"
2812 			    " err=%x", se_class, se_subclass, err);
2813 		}
2814 	}
2815 
2816 	return;
2817 
2818 fail:
2819 	nvlist_free(attr_list);
2820 	cmn_err(CE_WARN, "Failed to setup attributes for event %s, %s, err=%x",
2821 	    se_class, se_subclass, err);
2822 }
2823 
2824 void
md_clear_named_service()2825 md_clear_named_service()
2826 {
2827 	rw_enter(&ni_rwlp.lock, RW_WRITER);
2828 	notify_interface = NULL;
2829 	rw_exit(&ni_rwlp.lock);
2830 }
2831 
2832 void
md_create_unit_incore(minor_t mnum,md_ops_t * ops,int alloc_lock)2833 md_create_unit_incore(minor_t mnum, md_ops_t *ops, int alloc_lock)
2834 {
2835 	mdi_unit_t	*ui;
2836 	set_t		setno = MD_MIN2SET(mnum);
2837 
2838 	ui = (mdi_unit_t *)kmem_zalloc(sizeof (mdi_unit_t), KM_SLEEP);
2839 	ui->ui_opsindex = ops->md_selfindex;
2840 
2841 	/* initialize all the incore conditional variables */
2842 	mutex_init(&ui->ui_mx, NULL, MUTEX_DEFAULT, NULL);
2843 	cv_init(&ui->ui_cv, NULL, CV_DEFAULT, NULL);
2844 
2845 	if (alloc_lock) {
2846 		ui->ui_io_lock = kmem_zalloc(sizeof (md_io_lock_t), KM_SLEEP);
2847 		mutex_init(&ui->ui_io_lock->io_mx, NULL, MUTEX_DEFAULT, NULL);
2848 		cv_init(&ui->ui_io_lock->io_cv, NULL, CV_DEFAULT, NULL);
2849 		mutex_init(&ui->ui_io_lock->io_list_mutex, NULL,
2850 		    MUTEX_DEFAULT, NULL);
2851 		ui->ui_io_lock->io_list_front = NULL;
2852 		ui->ui_io_lock->io_list_back = NULL;
2853 	}
2854 	if (! (md_get_setstatus(setno) & MD_SET_SNARFING)) {
2855 		rw_enter(&md_unit_array_rw.lock, RW_WRITER);
2856 		MDI_VOIDUNIT(mnum) = (void *) ui;
2857 		rw_exit(&md_unit_array_rw.lock);
2858 	} else
2859 		MDI_VOIDUNIT(mnum) = (void *) ui;
2860 
2861 	rw_enter(&ops->md_link_rw.lock, RW_WRITER);
2862 	ui->ui_link.ln_next = ops->md_head;
2863 	ui->ui_link.ln_setno = setno;
2864 	ui->ui_link.ln_id = mnum;
2865 	ops->md_head = &ui->ui_link;
2866 	/* setup the unavailable field */
2867 #if defined(_ILP32)
2868 	if (((md_unit_t *)MD_UNIT(mnum))->c.un_revision & MD_64BIT_META_DEV) {
2869 		ui->ui_tstate |= MD_64MD_ON_32KERNEL;
2870 		cmn_err(CE_NOTE, "d%d is unavailable because 64 bit "
2871 		    "metadevices are not accessible on a 32 bit kernel",
2872 		    mnum);
2873 	}
2874 #endif
2875 
2876 	rw_exit(&ops->md_link_rw.lock);
2877 }
2878 
2879 void
md_destroy_unit_incore(minor_t mnum,md_ops_t * ops)2880 md_destroy_unit_incore(minor_t mnum, md_ops_t *ops)
2881 {
2882 	mdi_unit_t	*ui;
2883 
2884 	/*
2885 	 * ASSUMPTION: md_unit_array_rw WRITER lock is held.
2886 	 */
2887 	ui = MDI_UNIT(mnum);
2888 	if (ui == NULL)
2889 		return;
2890 
2891 	md_rem_link(MD_MIN2SET(mnum), mnum, &ops->md_link_rw.lock,
2892 	    &ops->md_head);
2893 
2894 	/* destroy the io lock if one is being used */
2895 	if (ui->ui_io_lock) {
2896 		mutex_destroy(&ui->ui_io_lock->io_mx);
2897 		cv_destroy(&ui->ui_io_lock->io_cv);
2898 		kmem_free(ui->ui_io_lock, sizeof (md_io_lock_t));
2899 	}
2900 
2901 	/* teardown kstat */
2902 	md_kstat_destroy(mnum);
2903 
2904 	/* destroy all the incore conditional variables */
2905 	mutex_destroy(&ui->ui_mx);
2906 	cv_destroy(&ui->ui_cv);
2907 
2908 	kmem_free(ui, sizeof (mdi_unit_t));
2909 	MDI_VOIDUNIT(mnum) = (void *) NULL;
2910 }
2911 
2912 void
md_rem_names(sv_dev_t * sv,int nsv)2913 md_rem_names(sv_dev_t *sv, int nsv)
2914 {
2915 	int	i, s;
2916 	int	max_sides;
2917 
2918 	if (nsv == 0)
2919 		return;
2920 
2921 	/* All entries removed are in the same diskset */
2922 	if (md_get_setstatus(sv[0].setno) & MD_SET_MNSET)
2923 		max_sides = MD_MNMAXSIDES;
2924 	else
2925 		max_sides = MD_MAXSIDES;
2926 
2927 	for (i = 0; i < nsv; i++)
2928 		for (s = 0; s < max_sides; s++)
2929 			(void) md_remdevname(sv[i].setno, s, sv[i].key);
2930 }
2931 
2932 /*
2933  * Checking user args before we get into physio - returns 0 for ok, else errno
2934  * We do a lot of checking against illegal arguments here because some of the
2935  * real disk drivers don't like certain kinds of arguments. (e.g xy doesn't
2936  * like odd address user buffer.) Those drivers capture bad arguments in
2937  * xxread and xxwrite. But since meta-driver calls their strategy routines
2938  * directly, two bad scenario might happen:
2939  *	1. the real strategy doesn't like it and panic.
2940  *	2. the real strategy doesn't like it and set B_ERROR.
2941  *
2942  * The second case is no better than the first one, since the meta-driver
2943  * will treat it as a media-error and off line the mirror metapartition.
2944  * (Too bad there is no way to tell what error it is.)
2945  *
2946  */
2947 int
md_chk_uio(struct uio * uio)2948 md_chk_uio(struct uio *uio)
2949 {
2950 	int	i;
2951 	struct iovec *iov;
2952 
2953 	/*
2954 	 * Check for negative or not block-aligned offset
2955 	 */
2956 	if ((uio->uio_loffset < 0) ||
2957 	    ((uio->uio_loffset & (DEV_BSIZE - 1)) != 0)) {
2958 		return (EINVAL);
2959 	}
2960 	iov = uio->uio_iov;
2961 	i = uio->uio_iovcnt;
2962 
2963 	while (i--) {
2964 		if ((iov->iov_len & (DEV_BSIZE - 1)) != 0)
2965 			return (EINVAL);
2966 		/*
2967 		 * Bug # 1212146
2968 		 * The default is to not check alignment, but we can now check
2969 		 * for a larger number of alignments if desired.
2970 		 */
2971 		if ((uintptr_t)(iov->iov_base) & md_uio_alignment_mask)
2972 			return (EINVAL);
2973 		iov++;
2974 	}
2975 	return (0);
2976 }
2977 
2978 char *
md_shortname(minor_t mnum)2979 md_shortname(
2980 	minor_t		mnum
2981 )
2982 {
2983 	static char	buf[MAXPATHLEN];
2984 	char		*devname;
2985 	char		*invalid = " (Invalid minor number %u) ";
2986 	char		*metaname;
2987 	mdc_unit_t	*un;
2988 	side_t		side;
2989 	set_t		setno = MD_MIN2SET(mnum);
2990 	unit_t		unit = MD_MIN2UNIT(mnum);
2991 
2992 	if ((un = MD_UNIT(mnum)) == NULL) {
2993 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
2994 		return (buf);
2995 	}
2996 
2997 	/*
2998 	 * If unit is not a friendly name unit, derive the name from the
2999 	 * minor number.
3000 	 */
3001 	if ((un->un_revision & MD_FN_META_DEV) == 0) {
3002 		/* This is a traditional metadevice */
3003 		if (setno == MD_LOCAL_SET) {
3004 			(void) snprintf(buf, sizeof (buf), "d%u",
3005 			    (unsigned)unit);
3006 		} else {
3007 			(void) snprintf(buf, sizeof (buf), "%s/d%u",
3008 			    mddb_getsetname(setno), (unsigned)unit);
3009 		}
3010 		return (buf);
3011 	}
3012 
3013 	/*
3014 	 * It is a friendly name metadevice, so we need to get its name.
3015 	 */
3016 	side = mddb_getsidenum(setno);
3017 	devname = (char *)kmem_alloc(MAXPATHLEN, KM_SLEEP);
3018 	if (md_getdevname(setno, side, MD_KEYWILD,
3019 	    md_makedevice(md_major, mnum), devname, MAXPATHLEN) == 0) {
3020 		/*
3021 		 * md_getdevname has given us either /dev/md/dsk/<metaname>
3022 		 * or /dev/md/<setname>/dsk/<metname> depending on whether
3023 		 * or not we are in the local set.  Thus, we'll pull the
3024 		 * metaname from this string.
3025 		 */
3026 		if ((metaname = strrchr(devname, '/')) == NULL) {
3027 			(void) snprintf(buf, sizeof (buf), invalid, mnum);
3028 			goto out;
3029 		}
3030 		metaname++;	/* move past slash */
3031 		if (setno == MD_LOCAL_SET) {
3032 			/* No set name. */
3033 			(void) snprintf(buf, sizeof (buf), "%s", metaname);
3034 		} else {
3035 			/* Include setname */
3036 			(void) snprintf(buf, sizeof (buf), "%s/%s",
3037 			    mddb_getsetname(setno), metaname);
3038 		}
3039 	} else {
3040 		/* We couldn't find the name. */
3041 		(void) snprintf(buf, sizeof (buf), invalid, mnum);
3042 	}
3043 
3044 out:
3045 	kmem_free(devname, MAXPATHLEN);
3046 	return (buf);
3047 }
3048 
3049 char *
md_devname(set_t setno,md_dev64_t dev,char * buf,size_t size)3050 md_devname(
3051 	set_t		setno,
3052 	md_dev64_t	dev,
3053 	char		*buf,
3054 	size_t		size
3055 )
3056 {
3057 	static char	mybuf[MD_MAX_CTDLEN];
3058 	int		err;
3059 
3060 	if (buf == NULL) {
3061 		buf = mybuf;
3062 		size = sizeof (mybuf);
3063 	} else {
3064 		ASSERT(size >= MD_MAX_CTDLEN);
3065 	}
3066 
3067 	err = md_getdevname_common(setno, mddb_getsidenum(setno),
3068 	    0, dev, buf, size, MD_NOWAIT_LOCK);
3069 	if (err) {
3070 		if (err == ENOENT) {
3071 			(void) sprintf(buf, "(Unavailable)");
3072 		} else {
3073 			(void) sprintf(buf, "(%u.%u)",
3074 			    md_getmajor(dev), md_getminor(dev));
3075 		}
3076 	}
3077 
3078 	return (buf);
3079 }
3080 void
md_minphys(buf_t * pb)3081 md_minphys(buf_t *pb)
3082 {
3083 	extern unsigned md_maxbcount;
3084 
3085 	if (pb->b_bcount > md_maxbcount)
3086 		pb->b_bcount = md_maxbcount;
3087 }
3088 
3089 void
md_bioinit(struct buf * bp)3090 md_bioinit(struct buf *bp)
3091 {
3092 	ASSERT(bp);
3093 
3094 	bioinit(bp);
3095 	bp->b_back = bp;
3096 	bp->b_forw = bp;
3097 	bp->b_flags = B_BUSY;	/* initialize flags */
3098 }
3099 
3100 void
md_bioreset(struct buf * bp)3101 md_bioreset(struct buf *bp)
3102 {
3103 	ASSERT(bp);
3104 
3105 	bioreset(bp);
3106 	bp->b_back = bp;
3107 	bp->b_forw = bp;
3108 	bp->b_flags = B_BUSY;	/* initialize flags */
3109 }
3110 
3111 /*
3112  * md_bioclone is needed as long as the real bioclone only takes a daddr_t
3113  * as block number.
3114  * We simply call bioclone with all input parameters but blkno, and set the
3115  * correct blkno afterwards.
3116  * Caveat Emptor: bp_mem must not be NULL!
3117  */
3118 buf_t *
md_bioclone(buf_t * bp,off_t off,size_t len,dev_t dev,diskaddr_t blkno,int (* iodone)(buf_t *),buf_t * bp_mem,int sleep)3119 md_bioclone(buf_t *bp, off_t off, size_t len, dev_t dev, diskaddr_t blkno,
3120 		int (*iodone)(buf_t *), buf_t *bp_mem, int sleep)
3121 {
3122 	(void) bioclone(bp, off, len, dev, 0, iodone, bp_mem, sleep);
3123 	bp_mem->b_lblkno = blkno;
3124 	return (bp_mem);
3125 }
3126 
3127 
3128 /*
3129  * kstat stuff
3130  */
3131 void
md_kstat_init_ui(minor_t mnum,mdi_unit_t * ui)3132 md_kstat_init_ui(
3133 	minor_t		 mnum,
3134 	mdi_unit_t	*ui
3135 )
3136 {
3137 	if ((ui != NULL) && (ui->ui_kstat == NULL)) {
3138 		set_t	setno = MD_MIN2SET(mnum);
3139 		unit_t  unit = MD_MIN2UNIT(mnum);
3140 		char	module[KSTAT_STRLEN];
3141 		char	*p = module;
3142 
3143 		if (setno != MD_LOCAL_SET) {
3144 			char	buf[64];
3145 			char	*s = buf;
3146 			char	*e = module + sizeof (module) - 4;
3147 
3148 			(void) sprintf(buf, "%u", setno);
3149 			while ((p < e) && (*s != '\0'))
3150 				*p++ = *s++;
3151 			*p++ = '/';
3152 		}
3153 		*p++ = 'm';
3154 		*p++ = 'd';
3155 		*p = '\0';
3156 		if ((ui->ui_kstat = kstat_create(module, unit, NULL, "disk",
3157 		    KSTAT_TYPE_IO, 1, KSTAT_FLAG_PERSISTENT)) != NULL) {
3158 			ui->ui_kstat->ks_lock = &ui->ui_mx;
3159 			kstat_install(ui->ui_kstat);
3160 		}
3161 	}
3162 }
3163 
3164 void
md_kstat_init(minor_t mnum)3165 md_kstat_init(
3166 	minor_t		mnum
3167 )
3168 {
3169 	md_kstat_init_ui(mnum, MDI_UNIT(mnum));
3170 }
3171 
3172 void
md_kstat_destroy_ui(mdi_unit_t * ui)3173 md_kstat_destroy_ui(
3174 	mdi_unit_t	*ui
3175 )
3176 {
3177 	/*
3178 	 * kstat_delete() interface has it's own locking mechanism and
3179 	 * does not allow holding of kstat lock (ks_lock).
3180 	 * Note: ks_lock == ui_mx from the md_kstat_init_ui().
3181 	 */
3182 	if ((ui != NULL) && (ui->ui_kstat != NULL)) {
3183 		kstat_delete(ui->ui_kstat);
3184 		ui->ui_kstat = NULL;
3185 	}
3186 }
3187 
3188 void
md_kstat_destroy(minor_t mnum)3189 md_kstat_destroy(
3190 	minor_t		mnum
3191 )
3192 {
3193 	md_kstat_destroy_ui(MDI_UNIT(mnum));
3194 }
3195 
3196 /*
3197  * In the following subsequent routines, locks are held before checking the
3198  * validity of ui_kstat. This is done to make sure that we don't trip over
3199  * a NULL ui_kstat anymore.
3200  */
3201 
3202 void
md_kstat_waitq_enter(mdi_unit_t * ui)3203 md_kstat_waitq_enter(
3204 	mdi_unit_t	*ui
3205 )
3206 {
3207 	mutex_enter(&ui->ui_mx);
3208 	if (ui->ui_kstat != NULL)
3209 		kstat_waitq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3210 	mutex_exit(&ui->ui_mx);
3211 }
3212 
3213 void
md_kstat_waitq_to_runq(mdi_unit_t * ui)3214 md_kstat_waitq_to_runq(
3215 	mdi_unit_t	*ui
3216 )
3217 {
3218 	mutex_enter(&ui->ui_mx);
3219 	if (ui->ui_kstat != NULL)
3220 		kstat_waitq_to_runq(KSTAT_IO_PTR(ui->ui_kstat));
3221 	mutex_exit(&ui->ui_mx);
3222 }
3223 
3224 void
md_kstat_waitq_exit(mdi_unit_t * ui)3225 md_kstat_waitq_exit(
3226 	mdi_unit_t	*ui
3227 )
3228 {
3229 	mutex_enter(&ui->ui_mx);
3230 	if (ui->ui_kstat != NULL)
3231 		kstat_waitq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3232 	mutex_exit(&ui->ui_mx);
3233 }
3234 
3235 void
md_kstat_runq_enter(mdi_unit_t * ui)3236 md_kstat_runq_enter(
3237 	mdi_unit_t	*ui
3238 )
3239 {
3240 	mutex_enter(&ui->ui_mx);
3241 	if (ui->ui_kstat != NULL)
3242 		kstat_runq_enter(KSTAT_IO_PTR(ui->ui_kstat));
3243 	mutex_exit(&ui->ui_mx);
3244 }
3245 
3246 void
md_kstat_runq_exit(mdi_unit_t * ui)3247 md_kstat_runq_exit(
3248 	mdi_unit_t	*ui
3249 )
3250 {
3251 	mutex_enter(&ui->ui_mx);
3252 	if (ui->ui_kstat != NULL)
3253 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3254 	mutex_exit(&ui->ui_mx);
3255 }
3256 
3257 void
md_kstat_done(mdi_unit_t * ui,buf_t * bp,int war)3258 md_kstat_done(
3259 	mdi_unit_t	*ui,
3260 	buf_t		*bp,
3261 	int		war
3262 )
3263 {
3264 	size_t  n_done;
3265 
3266 	/* check for end of device */
3267 	if ((bp->b_resid != 0) && (! (bp->b_flags & B_ERROR))) {
3268 		n_done = bp->b_bcount;
3269 	} else if (bp->b_bcount < bp->b_resid) {
3270 		n_done = 0;
3271 	} else {
3272 		n_done = bp->b_bcount - bp->b_resid;
3273 	}
3274 
3275 	/* do accounting */
3276 	mutex_enter(&ui->ui_mx);
3277 	if (ui->ui_kstat != NULL) {
3278 		if ((! war) && (bp->b_flags & B_READ)) {
3279 			KSTAT_IO_PTR(ui->ui_kstat)->reads++;
3280 			KSTAT_IO_PTR(ui->ui_kstat)->nread += n_done;
3281 		} else {
3282 			KSTAT_IO_PTR(ui->ui_kstat)->writes++;
3283 			KSTAT_IO_PTR(ui->ui_kstat)->nwritten += n_done;
3284 		}
3285 		kstat_runq_exit(KSTAT_IO_PTR(ui->ui_kstat));
3286 	}
3287 	mutex_exit(&ui->ui_mx);
3288 }
3289 
3290 pid_t
md_getpid()3291 md_getpid()
3292 {
3293 	pid_t valuep;
3294 	if (drv_getparm(PPID, (pid_t *)&valuep) != 0) {
3295 		ASSERT(0);
3296 		return ((pid_t)0);
3297 	} else {
3298 		ASSERT(valuep);
3299 		return (valuep);
3300 	}
3301 }
3302 
3303 
3304 proc_t *
md_getproc()3305 md_getproc()
3306 {
3307 	proc_t  *valuep;
3308 	if (drv_getparm(UPROCP, (proc_t **)&valuep) != 0) {
3309 		ASSERT(0);
3310 		return ((proc_t *)NULL);
3311 	} else {
3312 		ASSERT(valuep);
3313 		return (valuep);
3314 	}
3315 }
3316 
3317 extern kmutex_t pidlock;
3318 
3319 /*
3320  * this check to see if a process pid pair are still running.  For the
3321  * disk set lock when both pid/proc are zero then the locks is not
3322  * currently held.
3323  */
3324 int
md_checkpid(pid_t pid,proc_t * proc)3325 md_checkpid(pid_t pid, proc_t *proc)
3326 {
3327 	int	retval = 1;
3328 
3329 	if (pid == 0 && proc == NULL)
3330 		return (0);
3331 
3332 	mutex_enter(&pidlock);
3333 	if (prfind(pid)  != proc)
3334 		retval = 0;
3335 	mutex_exit(&pidlock);
3336 	return (retval);
3337 }
3338 
3339 /*
3340  * NAME: md_init_probereq
3341  *
3342  * DESCRIPTION: initializes a probe request. Parcels out the mnums such that
3343  *		they can be dispatched to multiple daemon threads.
3344  *
3345  * PARAMETERS: struct md_probedev *p	pointer ioctl input
3346  *
3347  * RETURN VALUE: Returns errno
3348  *
3349  */
3350 
3351 int
md_init_probereq(struct md_probedev_impl * p,daemon_queue_t ** hdrpp)3352 md_init_probereq(struct md_probedev_impl *p, daemon_queue_t **hdrpp)
3353 {
3354 	int		err = 0;
3355 	int		modindx;
3356 	intptr_t	(*probe_test)();
3357 
3358 	/*
3359 	 * Initialize the semaphores and mutex
3360 	 * for the request
3361 	 */
3362 
3363 	p->probe_sema = kmem_alloc(sizeof (ksema_t), KM_SLEEP);
3364 
3365 	p->probe_mx = kmem_alloc(sizeof (kmutex_t), KM_SLEEP);
3366 	sema_init(PROBE_SEMA(p), 0, NULL, SEMA_DRIVER, NULL);
3367 	mutex_init(PROBE_MX(p), NULL, MUTEX_DEFAULT, NULL);
3368 
3369 	modindx = md_getmodindex(&(p->probe.md_driver), 1, 1);
3370 	probe_test = md_get_named_service(NODEV64, modindx,
3371 	    p->probe.test_name, 0);
3372 	if (probe_test == NULL) {
3373 		err = EINVAL;
3374 		goto err_out;
3375 	}
3376 
3377 	err = md_create_probe_rqlist(p, hdrpp, probe_test);
3378 err_out:
3379 	return (err);
3380 }
3381 
3382 /*
3383  * NAME: md_probe_one
3384  *
3385  * DESCRIPTION: Generic routine for probing disks. This is called from the
3386  *		daemon.
3387  *
3388  * PARAMETERS: probe_req_t	*reqp	pointer to the probe request structure.
3389  *
3390  */
3391 
3392 void
md_probe_one(probe_req_t * reqp)3393 md_probe_one(probe_req_t *reqp)
3394 {
3395 	mdi_unit_t		*ui;
3396 	md_probedev_impl_t	*p;
3397 	int			err = 0;
3398 	set_t			setno;
3399 
3400 	p = (md_probedev_impl_t *)reqp->private_handle;
3401 	/*
3402 	 * Validate the unit while holding the global ioctl lock, then
3403 	 * obtain the unit_writerlock. Once the writerlock has been obtained
3404 	 * we can release the global lock. As long as we hold one of these
3405 	 * locks this will prevent a metaclear operation being performed
3406 	 * on the metadevice because metaclear takes the readerlock (via
3407 	 * openclose lock).
3408 	 * To avoid a potential deadlock with the probe_fcn() causing i/o to
3409 	 * be issued to the writerlock'd metadevice we only grab the writerlock
3410 	 * if the unit is not an SVM root device.
3411 	 */
3412 	while (md_ioctl_lock_enter() == EINTR)
3413 		;
3414 	setno = MD_MIN2SET(reqp->mnum);
3415 	ui = MDI_UNIT(reqp->mnum);
3416 	if (ui != NULL) {
3417 		int	writer_grabbed;
3418 		dev_t	svm_root;
3419 
3420 		if ((setno == MD_LOCAL_SET) && root_is_svm) {
3421 			svm_root = getrootdev();
3422 
3423 			if (getminor(svm_root) == reqp->mnum) {
3424 				writer_grabbed = 0;
3425 			} else {
3426 				writer_grabbed = 1;
3427 				(void) md_unit_writerlock_common(ui, 0);
3428 			}
3429 		} else {
3430 			writer_grabbed = 1;
3431 			(void) md_unit_writerlock_common(ui, 0);
3432 		}
3433 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3434 		err = (*reqp->probe_fcn)(ui, reqp->mnum);
3435 		if (writer_grabbed) {
3436 			md_unit_writerexit(ui);
3437 		}
3438 	} else {
3439 		(void) md_ioctl_lock_exit(0, 0, 0, FALSE);
3440 	}
3441 
3442 	/* update the info in the probe structure */
3443 
3444 	mutex_enter(PROBE_MX(p));
3445 	if (err != 0) {
3446 		cmn_err(CE_NOTE, "md_probe_one: err %d mnum %d\n", err,
3447 		    reqp->mnum);
3448 		(void) mdsyserror(&(p->probe.mde), err);
3449 	}
3450 
3451 	mutex_exit(PROBE_MX(p));
3452 	sema_v(PROBE_SEMA(p));
3453 
3454 	kmem_free(reqp, sizeof (probe_req_t));
3455 }
3456 char *
md_strdup(char * cp)3457 md_strdup(char *cp)
3458 {
3459 	char *new_cp = NULL;
3460 
3461 	new_cp = kmem_alloc(strlen(cp) + 1, KM_SLEEP);
3462 
3463 	return (strcpy(new_cp, cp));
3464 }
3465 
3466 void
freestr(char * cp)3467 freestr(char *cp)
3468 {
3469 	kmem_free(cp, strlen(cp) + 1);
3470 }
3471 
3472 /*
3473  * Validate the list and skip invalid devices. Then create
3474  * a doubly linked circular list of devices to probe.
3475  * The hdr points to the head and tail of this list.
3476  */
3477 
3478 static int
md_create_probe_rqlist(md_probedev_impl_t * plist,daemon_queue_t ** hdr,intptr_t (* probe_test)())3479 md_create_probe_rqlist(md_probedev_impl_t *plist, daemon_queue_t **hdr,
3480 			intptr_t (*probe_test)())
3481 {
3482 	int i, err, nodevcnt;
3483 	probe_req_t *tp;
3484 	daemon_queue_t *hp;
3485 	minor_t mnum;
3486 
3487 	nodevcnt = 0;
3488 
3489 	hp = NULL;
3490 
3491 	for (i = 0; i <  plist->probe.nmdevs; i++) {
3492 		mnum = ((minor_t *)(uintptr_t)(plist->probe.mnum_list))[i];
3493 		if (MDI_UNIT(mnum) == NULL) {
3494 			cmn_err(CE_WARN, "md: Cannot probe %s since it does "
3495 			    "not exist", md_shortname(mnum));
3496 			nodevcnt++;
3497 			continue;
3498 		}
3499 		tp = kmem_alloc(sizeof (probe_req_t), KM_SLEEP);
3500 		tp->mnum = mnum;
3501 		tp->private_handle = (void *)plist;
3502 		tp->probe_fcn = probe_test;
3503 		if (hp == NULL) {
3504 			hp = (daemon_queue_t *)tp;
3505 			hp->dq_prev = hp->dq_next = (daemon_queue_t *)tp;
3506 		} else {
3507 			tp->dq.dq_next = hp;
3508 			tp->dq.dq_prev = hp->dq_prev;
3509 			hp->dq_prev->dq_next = (daemon_queue_t *)tp;
3510 			hp->dq_prev = (daemon_queue_t *)tp;
3511 		}
3512 	}
3513 
3514 	*hdr = hp;
3515 	if (nodevcnt > 0)
3516 		plist->probe.nmdevs -= nodevcnt;
3517 
3518 	/*
3519 	 * If there are no devices to be probed because they were
3520 	 * incorrect, then return an error.
3521 	 */
3522 	err = (plist->probe.nmdevs == 0) ? ENODEV : 0;
3523 
3524 	return (err);
3525 }
3526 
3527 /*
3528  * This routine increments the I/O count for set I/O operations.  This
3529  * value is used to determine if an I/O can done.  If a release is in
3530  * process this will return an error and cause the I/O to be errored.
3531  */
3532 int
md_inc_iocount(set_t setno)3533 md_inc_iocount(set_t setno)
3534 {
3535 	int	rc = 0;
3536 
3537 	if (setno == 0)
3538 		return (0);
3539 
3540 	mutex_enter(&md_set_io[setno].md_io_mx);
3541 	if (!(md_set_io[setno].io_state & MD_SET_ACTIVE)) {
3542 		rc = EIO;
3543 		goto out;
3544 	}
3545 
3546 	ASSERT(md_set_io[setno].io_cnt >= 0);
3547 	md_set_io[setno].io_cnt++;
3548 
3549 out:	mutex_exit(&md_set_io[setno].md_io_mx);
3550 	return (rc);
3551 }
3552 
3553 void
md_inc_iocount_noblock(set_t setno)3554 md_inc_iocount_noblock(set_t setno)
3555 {
3556 
3557 	if (setno == 0)
3558 		return;
3559 
3560 	mutex_enter(&md_set_io[setno].md_io_mx);
3561 	md_set_io[setno].io_cnt++;
3562 	mutex_exit(&md_set_io[setno].md_io_mx);
3563 }
3564 void
md_dec_iocount(set_t setno)3565 md_dec_iocount(set_t setno)
3566 {
3567 
3568 	if (setno == 0)
3569 		return;
3570 
3571 	mutex_enter(&md_set_io[setno].md_io_mx);
3572 	md_set_io[setno].io_cnt--;
3573 	ASSERT(md_set_io[setno].io_cnt >= 0);
3574 	if ((md_set_io[setno].io_state & MD_SET_RELEASE) &&
3575 	    (md_set_io[setno].io_cnt == 0))
3576 		cv_broadcast(&md_set_io[setno].md_io_cv);
3577 	mutex_exit(&md_set_io[setno].md_io_mx);
3578 }
3579 
3580 int
md_isblock_setio(set_t setno)3581 md_isblock_setio(set_t setno)
3582 {
3583 	int	rc = 0;
3584 
3585 	if (setno == 0)
3586 		return (0);
3587 
3588 	mutex_enter(&md_set_io[setno].md_io_mx);
3589 	if (md_set_io[setno].io_state & MD_SET_RELEASE)
3590 		rc = 1;
3591 
3592 	mutex_exit(&md_set_io[setno].md_io_mx);
3593 	return (rc);
3594 }
3595 
3596 int
md_block_setio(set_t setno)3597 md_block_setio(set_t setno)
3598 {
3599 	int	rc = 0;
3600 
3601 	if (setno == 0)
3602 		return (1);
3603 
3604 	mutex_enter(&md_set_io[setno].md_io_mx);
3605 	md_set_io[setno].io_state = MD_SET_RELEASE;
3606 
3607 	while (md_set_io[setno].io_cnt > 0) {
3608 		cv_wait(&md_set_io[setno].md_io_cv,
3609 		    &md_set_io[setno].md_io_mx);
3610 	}
3611 	rc = 1;
3612 
3613 
3614 	ASSERT(md_set_io[setno].io_cnt == 0);
3615 	mutex_exit(&md_set_io[setno].md_io_mx);
3616 
3617 	return (rc);
3618 }
3619 
3620 void
md_clearblock_setio(set_t setno)3621 md_clearblock_setio(set_t setno)
3622 {
3623 	if (setno == 0)
3624 		return;
3625 
3626 	mutex_enter(&md_set_io[setno].md_io_mx);
3627 	md_set_io[setno].io_state = MD_SET_ACTIVE;
3628 	mutex_exit(&md_set_io[setno].md_io_mx);
3629 }
3630 
3631 void
md_unblock_setio(set_t setno)3632 md_unblock_setio(set_t setno)
3633 {
3634 	if (setno == 0)
3635 		return;
3636 
3637 	mutex_enter(&md_set_io[setno].md_io_mx);
3638 #ifdef DEBUG
3639 	if (md_set_io[setno].io_cnt != 0) {
3640 		cmn_err(CE_NOTE, "set %d count was %ld at take",
3641 		    setno, md_set_io[setno].io_cnt);
3642 	}
3643 #endif /* DEBUG */
3644 
3645 	md_set_io[setno].io_state = MD_SET_ACTIVE;
3646 	md_set_io[setno].io_cnt = 0;
3647 	mutex_exit(&md_set_io[setno].md_io_mx);
3648 }
3649 
3650 /*
3651  * Test and set version of the md_block_setio.
3652  * Set the io_state to keep new I/O from being issued.
3653  * If there is I/O currently in progress, then set io_state to active
3654  * and return failure.  Otherwise, return a 1 for success.
3655  *
3656  * Used in a MN diskset since the commd must be suspended before
3657  * this node can attempt to withdraw from a diskset.  But, with commd
3658  * suspended, I/O may have been issued that can never finish until
3659  * commd is resumed (allocation of hotspare, etc). So, if I/O is
3660  * outstanding after diskset io_state is marked RELEASE, then set diskset
3661  * io_state back to ACTIVE and return failure.
3662  */
3663 int
md_tas_block_setio(set_t setno)3664 md_tas_block_setio(set_t setno)
3665 {
3666 	int	rc;
3667 
3668 	if (setno == 0)
3669 		return (1);
3670 
3671 	mutex_enter(&md_set_io[setno].md_io_mx);
3672 	md_set_io[setno].io_state = MD_SET_RELEASE;
3673 
3674 	if (md_set_io[setno].io_cnt > 0) {
3675 		md_set_io[setno].io_state = MD_SET_ACTIVE;
3676 		rc = 0;
3677 	} else {
3678 		rc = 1;
3679 	}
3680 
3681 	mutex_exit(&md_set_io[setno].md_io_mx);
3682 
3683 	return (rc);
3684 }
3685 
3686 void
md_biodone(struct buf * pb)3687 md_biodone(struct buf *pb)
3688 {
3689 	minor_t	mnum;
3690 	set_t	setno;
3691 	mdi_unit_t	*ui;
3692 
3693 	mnum = getminor(pb->b_edev);
3694 	setno = MD_MIN2SET(mnum);
3695 
3696 	if (setno == 0) {
3697 		biodone(pb);
3698 		return;
3699 	}
3700 
3701 #ifdef DEBUG
3702 	ui = MDI_UNIT(mnum);
3703 	if (!md_unit_isopen(ui))
3704 		cmn_err(CE_NOTE, "io after close on %s\n", md_shortname(mnum));
3705 #endif /* DEBUG */
3706 
3707 	/*
3708 	 * Handle the local diskset
3709 	 */
3710 	if (md_set_io[setno].io_cnt > 0)
3711 		md_dec_iocount(setno);
3712 
3713 #ifdef DEBUG
3714 	/*
3715 	 * this is being done after the lock is dropped so there
3716 	 * are cases it may be invalid.  It is advisory.
3717 	 */
3718 	if (md_set_io[setno].io_state & MD_SET_RELEASE) {
3719 		/* Only display this error once for this metadevice */
3720 		if ((ui->ui_tstate & MD_RELEASE_IOERR_DONE) == 0) {
3721 			cmn_err(CE_NOTE,
3722 			    "I/O to %s attempted during set RELEASE\n",
3723 			    md_shortname(mnum));
3724 			ui->ui_tstate |= MD_RELEASE_IOERR_DONE;
3725 		}
3726 	}
3727 #endif /* DEBUG */
3728 
3729 	biodone(pb);
3730 }
3731 
3732 
3733 /*
3734  * Driver special private devt handling routine
3735  * INPUT:  md_dev64_t
3736  * OUTPUT: dev_t, 32 bit on a 32 bit kernel, 64 bit on a 64 bit kernel.
3737  */
3738 dev_t
md_dev64_to_dev(md_dev64_t dev)3739 md_dev64_to_dev(md_dev64_t dev)
3740 {
3741 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3742 	minor_t minor = (minor_t)(dev & MAXMIN64);
3743 
3744 	return (makedevice(major, minor));
3745 
3746 }
3747 
3748 /*
3749  * Driver private makedevice routine
3750  * INPUT:  major_t major, minor_t minor
3751  * OUTPUT: md_dev64_t, no matter if on 32 bit or 64 bit kernel.
3752  */
3753 md_dev64_t
md_makedevice(major_t major,minor_t minor)3754 md_makedevice(major_t major, minor_t minor)
3755 {
3756 	return (((md_dev64_t)major << NBITSMINOR64) | minor);
3757 
3758 }
3759 
3760 
3761 /*
3762  * Driver private devt md_getmajor routine
3763  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3764  * OUTPUT: the appropriate major number
3765  */
3766 major_t
md_getmajor(md_dev64_t dev)3767 md_getmajor(md_dev64_t dev)
3768 {
3769 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3770 
3771 	if (major == 0) {
3772 		/* Here we were given a 32bit dev */
3773 		major = (major_t)(dev >> NBITSMINOR32) & MAXMAJ32;
3774 	}
3775 	return (major);
3776 }
3777 
3778 /*
3779  * Driver private devt md_getminor routine
3780  * INPUT:  dev	a 64 bit container holding either a 32 bit or a 64 bit device
3781  * OUTPUT: the appropriate minor number
3782  */
3783 minor_t
md_getminor(md_dev64_t dev)3784 md_getminor(md_dev64_t dev)
3785 {
3786 	minor_t minor;
3787 	major_t major = (major_t)(dev >> NBITSMINOR64) & MAXMAJ64;
3788 
3789 	if (major == 0) {
3790 		/* Here we were given a 32bit dev */
3791 		minor = (minor_t)(dev & MAXMIN32);
3792 	} else {
3793 		minor = (minor_t)(dev & MAXMIN64);
3794 	}
3795 	return (minor);
3796 }
3797 
3798 int
md_check_ioctl_against_unit(int cmd,mdc_unit_t c)3799 md_check_ioctl_against_unit(int cmd, mdc_unit_t c)
3800 {
3801 	/*
3802 	 * If the metadevice is an old style device, it has a vtoc,
3803 	 *	in that case all reading EFI ioctls are not applicable.
3804 	 * If the metadevice has an EFI label, reading vtoc and geom ioctls
3805 	 *	are not supposed to work.
3806 	 */
3807 	switch (cmd) {
3808 		case DKIOCGGEOM:
3809 		case DKIOCGAPART:
3810 			/* if > 2 TB then fail */
3811 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3812 				return (ENOTSUP);
3813 			}
3814 			break;
3815 		case DKIOCGVTOC:
3816 			/* if > 2 TB then fail */
3817 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3818 				return (ENOTSUP);
3819 			}
3820 
3821 			/* if > 1 TB but < 2TB return overflow */
3822 			if (c.un_revision & MD_64BIT_META_DEV) {
3823 				return (EOVERFLOW);
3824 			}
3825 			break;
3826 		case DKIOCGEXTVTOC:
3827 			/* if > 2 TB then fail */
3828 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3829 				return (ENOTSUP);
3830 			}
3831 			break;
3832 		case DKIOCGETEFI:
3833 		case DKIOCPARTITION:
3834 			if ((c.un_flag & MD_EFILABEL) == 0) {
3835 				return (ENOTSUP);
3836 			}
3837 			break;
3838 
3839 		case DKIOCSETEFI:
3840 		/* setting an EFI label should always be ok */
3841 			return (0);
3842 
3843 		case DKIOCSVTOC:
3844 			/* if > 2 TB then fail */
3845 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3846 				return (ENOTSUP);
3847 			}
3848 
3849 			/* if > 1 TB but < 2TB return overflow */
3850 			if (c.un_revision & MD_64BIT_META_DEV) {
3851 				return (EOVERFLOW);
3852 			}
3853 			break;
3854 		case DKIOCSEXTVTOC:
3855 			if (c.un_total_blocks > MD_MAX_BLKS_FOR_EXTVTOC) {
3856 				return (ENOTSUP);
3857 			}
3858 			break;
3859 	}
3860 	return (0);
3861 }
3862 
3863 /*
3864  * md_vtoc_to_efi_record()
3865  * Input:  record id of the vtoc record
3866  * Output: record id of the efi record
3867  * Function:
3868  *	- reads the  volume name from the vtoc record
3869  *	- converts the volume name to a format, libefi understands
3870  *	- creates a new record of size MD_EFI_PARTNAME_BYTES
3871  *	- stores the volname in that record,
3872  *	- commits that record
3873  *	- returns the recid of the efi record.
3874  * Caveat Emptor:
3875  *	The calling routine must do something like
3876  *	- un->c.un_vtoc_id = md_vtoc_to_efi_record(vtoc_recid)
3877  *	- commit(un)
3878  *	- delete(vtoc_recid)
3879  *	in order to keep the mddb consistent in case of a panic in the middle.
3880  * Errors:
3881  *	- returns 0 on any error
3882  */
3883 mddb_recid_t
md_vtoc_to_efi_record(mddb_recid_t vtoc_recid,set_t setno)3884 md_vtoc_to_efi_record(mddb_recid_t vtoc_recid, set_t setno)
3885 {
3886 	struct vtoc	*vtoc;
3887 	ushort_t	*v;
3888 	mddb_recid_t	efi_recid;
3889 	int		i;
3890 
3891 	if (mddb_getrecstatus(vtoc_recid) != MDDB_OK) {
3892 		return (0);
3893 	}
3894 	vtoc = (struct vtoc *)mddb_getrecaddr(vtoc_recid);
3895 	efi_recid = mddb_createrec(MD_EFI_PARTNAME_BYTES, MDDB_EFILABEL, 0,
3896 	    MD_CRO_32BIT, setno);
3897 	if (efi_recid < 0) {
3898 		return (0);
3899 	}
3900 	v = (ushort_t *)mddb_getrecaddr(efi_recid);
3901 
3902 	/* This for loop read, converts and writes */
3903 	for (i = 0; i < LEN_DKL_VVOL; i++) {
3904 		v[i] = LE_16((uint16_t)vtoc->v_volume[i]);
3905 	}
3906 	/* commit the new record */
3907 	mddb_commitrec_wrapper(efi_recid);
3908 
3909 	return (efi_recid);
3910 }
3911 
3912 /*
3913  * Send a kernel message.
3914  * user has to provide for an allocated result structure
3915  * If the door handler disappears we retry, emitting warnings every so often.
3916  *
3917  * The recipient argument is almost always unused, and is therefore typically
3918  * set to zero, as zero is an invalid cluster nodeid.  The exceptions are the
3919  * marking and clearing of the DRL from a node that is not currently the
3920  * owner.  In these cases, the recipient argument will be the nodeid of the
3921  * mirror owner, and MD_MSGF_DIRECTED will be set in the flags.  Non-owner
3922  * nodes will not receive these messages.
3923  *
3924  * For the case where md_mn_is_commd_present() is false, we simply pre-set
3925  * the result->kmmr_comm_state to MDMNE_RPC_FAIL.
3926  * This covers the case where the service mdcommd has been killed and so we do
3927  * not get a 'new' result structure copied back. Instead we return with the
3928  * supplied result field, and we need to flag a failure to the caller.
3929  */
3930 int
mdmn_ksend_message(set_t setno,md_mn_msgtype_t type,uint_t flags,md_mn_nodeid_t recipient,char * data,int size,md_mn_kresult_t * result)3931 mdmn_ksend_message(
3932 	set_t		setno,
3933 	md_mn_msgtype_t	type,
3934 	uint_t		flags,
3935 	md_mn_nodeid_t	recipient,
3936 	char		*data,
3937 	int		size,
3938 	md_mn_kresult_t	*result)
3939 {
3940 	door_arg_t	da;
3941 	md_mn_kmsg_t	*kmsg;
3942 	uint_t		send_try_cnt = 0;
3943 	uint_t		retry_noise_cnt = 0;
3944 	int		rval;
3945 	k_sigset_t	oldmask, newmask;
3946 
3947 	/*
3948 	 * Ensure that we default to a recoverable failure state if the
3949 	 * door upcall cannot pass the request on to rpc.mdcommd.
3950 	 * This may occur when shutting the node down while there is still
3951 	 * a mirror resync or metadevice state update occurring.
3952 	 */
3953 	result->kmmr_comm_state = MDMNE_RPC_FAIL;
3954 	result->kmmr_exitval = ~0;
3955 
3956 	if (size > MDMN_MAX_KMSG_DATA)
3957 		return (ENOMEM);
3958 	kmsg = kmem_zalloc(sizeof (md_mn_kmsg_t), KM_SLEEP);
3959 	kmsg->kmsg_flags = flags;
3960 	kmsg->kmsg_setno = setno;
3961 	kmsg->kmsg_recipient = recipient;
3962 	kmsg->kmsg_type	= type;
3963 	kmsg->kmsg_size	= size;
3964 	bcopy(data, &(kmsg->kmsg_data), size);
3965 
3966 	/*
3967 	 * Wait for the door handle to be established.
3968 	 */
3969 	while (mdmn_door_did == -1) {
3970 		if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
3971 			cmn_err(CE_WARN, "door handle not yet ready. "
3972 			    "Check if /usr/lib/lvm/mddoors is running");
3973 		}
3974 		delay(md_hz);
3975 	}
3976 
3977 	/*
3978 	 * If MD_MSGF_BLK_SIGNAL is set, mask out all signals so that we
3979 	 * do not fail if the user process receives a signal while we're
3980 	 * active in the door interface.
3981 	 */
3982 	if (flags & MD_MSGF_BLK_SIGNAL) {
3983 		sigfillset(&newmask);
3984 		sigreplace(&newmask, &oldmask);
3985 	}
3986 
3987 	/*
3988 	 * If message failed with an RPC_FAILURE when rpc.mdcommd had
3989 	 * been gracefully shutdown (md_mn_is_commd_present returns FALSE)
3990 	 * then don't retry the message anymore.  If message
3991 	 * failed due to any other reason, then retry up to MD_MN_WARN_INTVL
3992 	 * times which should allow a shutting down system time to
3993 	 * notify the kernel of a graceful shutdown of rpc.mdcommd.
3994 	 *
3995 	 * Caller of this routine will need to check the md_mn_commd_present
3996 	 * flag and the failure error in order to determine whether to panic
3997 	 * or not.  If md_mn_commd_present is set to 0 and failure error
3998 	 * is RPC_FAILURE, the calling routine should not panic since the
3999 	 * system is in the process of being shutdown.
4000 	 *
4001 	 */
4002 
4003 	retry_noise_cnt = send_try_cnt = 0;
4004 	while (md_mn_is_commd_present_lite()) {
4005 		/*
4006 		 * data_ptr and data_size are initialized here because on
4007 		 * return from the upcall, they contain data duplicated from
4008 		 * rbuf and rsize.  This causes subsequent upcalls to fail.
4009 		 */
4010 		da.data_ptr = (char *)(kmsg);
4011 		da.data_size = sizeof (md_mn_kmsg_t);
4012 		da.desc_ptr = NULL;
4013 		da.desc_num = 0;
4014 		da.rbuf = (char *)result;
4015 		da.rsize = sizeof (*result);
4016 
4017 		while ((rval = door_ki_upcall_limited(mdmn_door_handle, &da,
4018 		    NULL, SIZE_MAX, 0)) != 0) {
4019 			if ((++retry_noise_cnt % MD_MN_WARN_INTVL) == 0) {
4020 				if (rval == EAGAIN)  {
4021 					cmn_err(CE_WARN,
4022 					    "md: door_upcall failed. "
4023 					    "Check if mddoors is running.");
4024 				} else if (rval == EINTR) {
4025 					cmn_err(CE_WARN,
4026 					    "md: door_upcall failed. "
4027 					    "Check if rpc.mdcommd is running.");
4028 				} else {
4029 					cmn_err(CE_WARN,
4030 					    "md: door_upcall failed. "
4031 					    "Returned %d",
4032 					    rval);
4033 				}
4034 			}
4035 			if (++send_try_cnt >= md_send_retry_limit)
4036 				break;
4037 
4038 			delay(md_hz);
4039 
4040 			/*
4041 			 * data_ptr and data_size are re-initialized here
4042 			 * because on return from the upcall, they contain
4043 			 * data duplicated from rbuf and rsize.  This causes
4044 			 * subsequent upcalls to fail.
4045 			 */
4046 			da.data_ptr = (char *)(kmsg);
4047 			da.data_size = sizeof (md_mn_kmsg_t);
4048 			da.desc_ptr = NULL;
4049 			da.desc_num = 0;
4050 			da.rbuf = (char *)result;
4051 			da.rsize = sizeof (*result);
4052 		}
4053 
4054 
4055 		/*
4056 		 * If:
4057 		 * - the send succeeded (MDMNE_ACK)
4058 		 * - we had an MDMNE_RPC_FAIL and commd is now gone
4059 		 *   (note: since the outer loop is commd-dependent,
4060 		 *   checking MDMN_RPC_FAIL here is meaningless)
4061 		 * - we were told not to retry
4062 		 * - we exceeded the RPC failure send limit
4063 		 * punch out of the outer loop prior to the delay()
4064 		 */
4065 		if (result->kmmr_comm_state == MDMNE_ACK ||
4066 		    (flags & MD_MSGF_KSEND_NORETRY) ||
4067 		    (++send_try_cnt % md_send_retry_limit) == 0 ||
4068 		    !md_mn_is_commd_present())
4069 			break;
4070 		delay(md_hz);
4071 	}
4072 
4073 	if (flags & MD_MSGF_BLK_SIGNAL) {
4074 		sigreplace(&oldmask, (k_sigset_t *)NULL);
4075 	}
4076 	kmem_free(kmsg, sizeof (md_mn_kmsg_t));
4077 
4078 	return (0);
4079 }
4080 
4081 /*
4082  * Called to propagate the capability of a metadevice to all nodes in the set.
4083  *
4084  * On entry, lockp is set if the function has been called from within an ioctl.
4085  *
4086  * IOLOCK_RETURN_RELEASE, which drops the md_ioctl_lock is called in this
4087  * routine to enable other mdioctls to enter the kernel while this
4088  * thread of execution waits on the completion of mdmn_ksend_message. When
4089  * the message is completed the thread continues and md_ioctl_lock must be
4090  * reacquired.  Even though md_ioctl_lock is interruptable, we choose to
4091  * ignore EINTR as we must not return without acquiring md_ioctl_lock.
4092  */
4093 
4094 int
mdmn_send_capability_message(minor_t mnum,volcap_t vc,IOLOCK * lockp)4095 mdmn_send_capability_message(minor_t mnum, volcap_t vc, IOLOCK *lockp)
4096 {
4097 	md_mn_msg_setcap_t	msg;
4098 	md_mn_kresult_t		*kres;
4099 	mdi_unit_t		*ui = MDI_UNIT(mnum);
4100 	int			ret;
4101 	k_sigset_t		oldmask, newmask;
4102 
4103 	(void) strncpy((char *)&msg.msg_setcap_driver,
4104 	    md_ops[ui->ui_opsindex]->md_driver.md_drivername, MD_DRIVERNAMELEN);
4105 	msg.msg_setcap_mnum = mnum;
4106 	msg.msg_setcap_set = vc.vc_set;
4107 
4108 	if (lockp)
4109 		IOLOCK_RETURN_RELEASE(0, lockp);
4110 	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4111 
4112 	/*
4113 	 * Mask signals for the mdmd_ksend_message call.  This keeps the door
4114 	 * interface from failing if the user process receives a signal while
4115 	 * in mdmn_ksend_message.
4116 	 */
4117 	sigfillset(&newmask);
4118 	sigreplace(&newmask, &oldmask);
4119 	ret = (mdmn_ksend_message(MD_MIN2SET(mnum), MD_MN_MSG_SET_CAP,
4120 	    MD_MSGF_NO_LOG, 0, (char *)&msg, sizeof (md_mn_msg_setcap_t),
4121 	    kres));
4122 	sigreplace(&oldmask, (k_sigset_t *)NULL);
4123 
4124 	if (!MDMN_KSEND_MSG_OK(ret, kres)) {
4125 		mdmn_ksend_show_error(ret, kres, "MD_MN_MSG_SET_CAP");
4126 		ret = EIO;
4127 	}
4128 	kmem_free(kres, sizeof (md_mn_kresult_t));
4129 
4130 	if (lockp) {
4131 		IOLOCK_RETURN_REACQUIRE(lockp);
4132 	}
4133 	return (ret);
4134 }
4135 
4136 /*
4137  * Called to clear all of the transient capabilities for a metadevice when it is
4138  * not open on any node in the cluster
4139  * Called from close for mirror and sp.
4140  */
4141 
4142 void
mdmn_clear_all_capabilities(minor_t mnum)4143 mdmn_clear_all_capabilities(minor_t mnum)
4144 {
4145 	md_isopen_t	clumsg;
4146 	int		ret;
4147 	md_mn_kresult_t	*kresult;
4148 	volcap_t	vc;
4149 	k_sigset_t	oldmask, newmask;
4150 
4151 	clumsg.dev = md_makedevice(md_major, mnum);
4152 	clumsg.mde = mdnullerror;
4153 	/*
4154 	 * The check open message doesn't have to be logged, nor should the
4155 	 * result be stored in the MCT. We want an up-to-date state.
4156 	 */
4157 	kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
4158 
4159 	/*
4160 	 * Mask signals for the mdmd_ksend_message call.  This keeps the door
4161 	 * interface from failing if the user process receives a signal while
4162 	 * in mdmn_ksend_message.
4163 	 */
4164 	sigfillset(&newmask);
4165 	sigreplace(&newmask, &oldmask);
4166 	ret = mdmn_ksend_message(MD_MIN2SET(mnum),
4167 	    MD_MN_MSG_CLU_CHECK,
4168 	    MD_MSGF_STOP_ON_ERROR | MD_MSGF_NO_LOG | MD_MSGF_NO_MCT, 0,
4169 	    (char *)&clumsg, sizeof (clumsg), kresult);
4170 	sigreplace(&oldmask, (k_sigset_t *)NULL);
4171 
4172 	if ((ret == 0) && (kresult->kmmr_exitval == 0)) {
4173 		/*
4174 		 * Not open on any node, clear all capabilities, eg ABR and
4175 		 * DMR
4176 		 */
4177 		vc.vc_set = 0;
4178 		(void) mdmn_send_capability_message(mnum, vc, NULL);
4179 	}
4180 	kmem_free(kresult, sizeof (md_mn_kresult_t));
4181 }
4182 
4183 /*
4184  * mdmn_ksend_show_error:
4185  * ---------------------
4186  * Called to display the error contents of a failing mdmn_ksend_message() result
4187  *
4188  * Input:
4189  *	rv	- return value from mdmn_ksend_message()
4190  *	kres	- pointer to result structure filled in by mdmn_ksend_message
4191  *	s	- Informative message to identify failing condition (e.g.
4192  *		  "Ownership change") This string will be displayed with
4193  *		  cmn_err(CE_WARN, "%s *FAILED*",...) to alert the system
4194  *		  administrator
4195  */
4196 void
mdmn_ksend_show_error(int rv,md_mn_kresult_t * kres,const char * s)4197 mdmn_ksend_show_error(int rv, md_mn_kresult_t *kres, const char *s)
4198 {
4199 	if (rv == 0) {
4200 		cmn_err(CE_WARN, "%s *FAILED*", s);
4201 		cmn_err(CE_CONT, "exit_val = %d, comm_state = %d, failing_node"
4202 		    " = %d", kres->kmmr_exitval, kres->kmmr_comm_state,
4203 		    kres->kmmr_failing_node);
4204 	} else {
4205 		cmn_err(CE_WARN, "%s *FAILED*, return value = %d", s, rv);
4206 	}
4207 }
4208 
4209 /*
4210  * Callback routine for resync thread. If requested to suspend we mark the
4211  * commd as not being present.
4212  */
4213 boolean_t
callb_md_mrs_cpr(void * arg,int code)4214 callb_md_mrs_cpr(void *arg, int code)
4215 {
4216 	callb_cpr_t *cp = (callb_cpr_t *)arg;
4217 	int ret = 0;				/* assume success */
4218 	clock_t delta;
4219 
4220 	mutex_enter(cp->cc_lockp);
4221 
4222 	switch (code) {
4223 	case CB_CODE_CPR_CHKPT:
4224 		/*
4225 		 * Mark the rpc.mdcommd as no longer present. We are trying to
4226 		 * suspend the system and so we should expect RPC failures to
4227 		 * occur.
4228 		 */
4229 		md_mn_clear_commd_present();
4230 		cp->cc_events |= CALLB_CPR_START;
4231 		delta = CPR_KTHREAD_TIMEOUT_SEC * hz;
4232 		while (!(cp->cc_events & CALLB_CPR_SAFE))
4233 			/* cv_timedwait() returns -1 if it times out. */
4234 			if ((ret = cv_reltimedwait(&cp->cc_callb_cv,
4235 			    cp->cc_lockp, delta, TR_CLOCK_TICK)) == -1)
4236 				break;
4237 			break;
4238 
4239 	case CB_CODE_CPR_RESUME:
4240 		cp->cc_events &= ~CALLB_CPR_START;
4241 		cv_signal(&cp->cc_stop_cv);
4242 		break;
4243 	}
4244 	mutex_exit(cp->cc_lockp);
4245 	return (ret != -1);
4246 }
4247 
4248 
4249 void
md_rem_hspname(set_t setno,mdkey_t n_key)4250 md_rem_hspname(set_t setno, mdkey_t n_key)
4251 {
4252 	int	s;
4253 	int	max_sides;
4254 
4255 
4256 	/* All entries removed are in the same diskset */
4257 	if (md_get_setstatus(setno) & MD_SET_MNSET)
4258 		max_sides = MD_MNMAXSIDES;
4259 	else
4260 		max_sides = MD_MAXSIDES;
4261 
4262 	for (s = 0; s < max_sides; s++)
4263 		(void) md_remdevname(setno, s, n_key);
4264 }
4265 
4266 
4267 int
md_rem_selfname(minor_t selfid)4268 md_rem_selfname(minor_t selfid)
4269 {
4270 	int	s;
4271 	set_t	setno = MD_MIN2SET(selfid);
4272 	int	max_sides;
4273 	md_dev64_t	dev;
4274 	struct nm_next_hdr	*nh;
4275 	struct nm_name	*n;
4276 	mdkey_t key;
4277 
4278 	/*
4279 	 * Get the key since remove routine expects it
4280 	 */
4281 	dev = md_makedevice(md_major, selfid);
4282 	if ((nh = get_first_record(setno, 0, NM_NOTSHARED)) == NULL) {
4283 		return (ENOENT);
4284 	}
4285 
4286 	if ((n = (struct nm_name *)lookup_entry(nh, setno, MD_SIDEWILD,
4287 	    MD_KEYWILD, dev, 0L)) == NULL) {
4288 		return (ENOENT);
4289 	}
4290 
4291 	/* All entries removed are in the same diskset */
4292 	key = n->n_key;
4293 	if (md_get_setstatus(setno) & MD_SET_MNSET)
4294 		max_sides = MD_MNMAXSIDES;
4295 	else
4296 		max_sides = MD_MAXSIDES;
4297 
4298 	for (s = 0; s < max_sides; s++)
4299 		(void) md_remdevname(setno, s, key);
4300 
4301 	return (0);
4302 }
4303 
4304 void
md_upd_set_unnext(set_t setno,unit_t un)4305 md_upd_set_unnext(set_t setno, unit_t un)
4306 {
4307 	if (un < md_set[setno].s_un_next) {
4308 		md_set[setno].s_un_next = un;
4309 	}
4310 }
4311 
4312 struct hot_spare_pool *
find_hot_spare_pool(set_t setno,int hsp_id)4313 find_hot_spare_pool(set_t setno, int hsp_id)
4314 {
4315 	hot_spare_pool_t *hsp;
4316 
4317 	hsp = (hot_spare_pool_t *)md_set[setno].s_hsp;
4318 	while (hsp != NULL) {
4319 		if (hsp->hsp_self_id == hsp_id)
4320 			return (hsp);
4321 		hsp = hsp->hsp_next;
4322 	}
4323 
4324 	return ((hot_spare_pool_t *)0);
4325 }
4326 
4327 /*
4328  * md_create_taskq:
4329  *
4330  * Create a kernel taskq for the given set/unit combination. This is typically
4331  * used to complete a RR_CLEAN request when the callee is unable to obtain the
4332  * mutex / condvar access required to update the DRL safely.
4333  */
4334 void *
md_create_taskq(set_t setno,minor_t mnum)4335 md_create_taskq(set_t setno, minor_t mnum)
4336 {
4337 	char			name[20];
4338 	ddi_taskq_t		*tqp;
4339 
4340 	(void) snprintf(name, 20, "%d/d%d", setno, MD_MIN2UNIT(mnum));
4341 
4342 	tqp = ddi_taskq_create(md_devinfo, name, 1, TASKQ_DEFAULTPRI, 0);
4343 
4344 	return ((void *)tqp);
4345 }
4346