1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
25 */
26
27 #include <sys/param.h>
28 #include <sys/systm.h>
29 #include <sys/conf.h>
30 #include <sys/file.h>
31 #include <sys/user.h>
32 #include <sys/uio.h>
33 #include <sys/t_lock.h>
34 #include <sys/buf.h>
35 #include <sys/dkio.h>
36 #include <sys/vtoc.h>
37 #include <sys/kmem.h>
38 #include <vm/page.h>
39 #include <sys/cmn_err.h>
40 #include <sys/sysmacros.h>
41 #include <sys/types.h>
42 #include <sys/mkdev.h>
43 #include <sys/stat.h>
44 #include <sys/open.h>
45 #include <sys/modctl.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/debug.h>
49 #include <sys/dklabel.h>
50 #include <vm/hat.h>
51 #include <sys/lvm/mdvar.h>
52 #include <sys/lvm/md_mirror.h>
53 #include <sys/lvm/md_convert.h>
54 #include <sys/lvm/md_mddb.h>
55 #include <sys/esunddi.h>
56
57 #include <sys/sysevent/eventdefs.h>
58 #include <sys/sysevent/svm.h>
59 #include <sys/lvm/mdmn_commd.h>
60 #include <sys/avl.h>
61
62 md_ops_t mirror_md_ops;
63 #ifndef lint
64 md_ops_t *md_interface_ops = &mirror_md_ops;
65 #endif
66
67 extern mdq_anchor_t md_done_daemon;
68 extern mdq_anchor_t md_mstr_daemon;
69 extern mdq_anchor_t md_mirror_daemon;
70 extern mdq_anchor_t md_mirror_io_daemon;
71 extern mdq_anchor_t md_mirror_rs_daemon;
72 extern mdq_anchor_t md_mhs_daemon;
73
74 extern unit_t md_nunits;
75 extern set_t md_nsets;
76 extern md_set_t md_set[];
77
78 extern int md_status;
79 extern clock_t md_hz;
80
81 extern md_krwlock_t md_unit_array_rw;
82 extern kmutex_t md_mx;
83 extern kcondvar_t md_cv;
84 extern int md_mtioctl_cnt;
85
86 daemon_request_t mirror_timeout;
87 static daemon_request_t hotspare_request;
88 static daemon_request_t mn_hs_request[MD_MAXSETS]; /* Multinode hs req */
89
90 int md_mirror_mcs_buf_off;
91
92 /* Flags for mdmn_ksend_message to allow debugging */
93 int md_mirror_msg_flags;
94
95 #ifdef DEBUG
96 /* Flag to switch on debug messages */
97 int mirror_debug_flag = 0;
98 #endif
99
100 /*
101  * Struct used to hold the count of DMR reads and the timestamp of the last
102  * DMR read. It is used to verify, using a debugger, that the DMR read ioctl
103 * executed.
104 */
105 dmr_stats_t mirror_dmr_stats = {0, 0};
106
107 /*
108 * Mutex protecting list of non-failfast drivers.
109 */
110 static kmutex_t non_ff_drv_mutex;
111 extern char **non_ff_drivers;
112
113 extern major_t md_major;
114
115 /*
116 * Write-On-Write memory pool.
117 */
118 static void copy_write_cont(wowhdr_t *wowhdr);
119 static kmem_cache_t *mirror_wowblk_cache = NULL;
120 static int md_wowbuf_size = 16384;
121 static size_t md_wowblk_size;
122
123 /*
124 * This is a flag that allows:
125 * - disabling the write-on-write mechanism.
126 * - logging occurrences of write-on-write
127 * - switching wow handling procedure processing
128  * md_mirror_wow_cnt counts occurrences of WOW.
129 */
130 static uint_t md_mirror_wow_flg = 0;
131 static int md_mirror_wow_cnt = 0;
132
133 /*
134 * Tunable to enable/disable dirty region
135 * processing when closing down a mirror.
136 */
137 static int new_resync = 1;
138 kmem_cache_t *mirror_parent_cache = NULL;
139 kmem_cache_t *mirror_child_cache = NULL;
140
141 extern int md_ff_disable; /* disable failfast */
142
143 static int mirror_map_write(mm_unit_t *, md_mcs_t *, md_mps_t *, int);
144 static void mirror_read_strategy(buf_t *, int, void *);
145 static void mirror_write_strategy(buf_t *, int, void *);
146 static void become_owner(daemon_queue_t *);
147 static int mirror_done(struct buf *cb);
148 static int mirror_done_common(struct buf *cb);
149 static void clear_retry_error(struct buf *cb);
150
151 /*
152 * patchables
153 */
154 int md_min_rr_size = 200; /* 2000 blocks, or 100k */
155 int md_def_num_rr = 1000; /* Default number of dirty regions */
156
157 /*
158 * patchable to change delay before rescheduling mirror ownership request.
159 * Value is clock ticks, default 0.5 seconds
160 */
161 clock_t md_mirror_owner_to = 500000;
162
163 /*ARGSUSED1*/
164 static int
165 mirror_parent_constructor(void *p, void *d1, int d2)
166 {
167 mutex_init(&((md_mps_t *)p)->ps_mx, NULL, MUTEX_DEFAULT, NULL);
168 return (0);
169 }
170
171 static void
172 mirror_parent_init(md_mps_t *ps)
173 {
174 bzero(ps, offsetof(md_mps_t, ps_mx));
175 bzero(&ps->ps_overlap_node, sizeof (avl_node_t));
176 }
177
178 /*ARGSUSED1*/
179 static void
180 mirror_parent_destructor(void *p, void *d)
181 {
182 mutex_destroy(&((md_mps_t *)p)->ps_mx);
183 }
184
185 /*ARGSUSED1*/
186 static int
187 mirror_child_constructor(void *p, void *d1, int d2)
188 {
189 bioinit(&((md_mcs_t *)p)->cs_buf);
190 return (0);
191 }
192
193 void
194 mirror_child_init(md_mcs_t *cs)
195 {
196 cs->cs_ps = NULL;
197 cs->cs_mdunit = 0;
198 md_bioreset(&cs->cs_buf);
199 }
200
201 /*ARGSUSED1*/
202 static void
203 mirror_child_destructor(void *p, void *d)
204 {
205 biofini(&((md_mcs_t *)p)->cs_buf);
206 }
207
208 static void
209 mirror_wowblk_init(wowhdr_t *p)
210 {
211 bzero(p, md_wowblk_size);
212 }
213
214 static void
215 send_poke_hotspares_msg(daemon_request_t *drq)
216 {
217 int rval;
218 int nretries = 0;
219 md_mn_msg_pokehsp_t pokehsp;
220 md_mn_kresult_t *kresult;
221 set_t setno = (set_t)drq->dq.qlen;
222
223 pokehsp.pokehsp_setno = setno;
224
225 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
226
227 retry_sphmsg:
228 rval = mdmn_ksend_message(setno, MD_MN_MSG_POKE_HOTSPARES,
229 MD_MSGF_NO_LOG | MD_MSGF_NO_BCAST, 0, (char *)&pokehsp,
230 sizeof (pokehsp), kresult);
231
232 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
233 mdmn_ksend_show_error(rval, kresult, "POKE_HOTSPARES");
234 /* If we're shutting down already, pause things here. */
235 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
236 while (!md_mn_is_commd_present()) {
237 delay(md_hz);
238 }
239 /*
240 * commd has become reachable again, so retry once.
241 * If this fails we'll panic as the system is in an
242 * unexpected state.
243 */
244 if (nretries++ == 0)
245 goto retry_sphmsg;
246 }
247 cmn_err(CE_PANIC,
248 "ksend_message failure: POKE_HOTSPARES");
249 }
250 kmem_free(kresult, sizeof (md_mn_kresult_t));
251
252 /* Allow further requests to use this set's queue structure */
253 mutex_enter(&drq->dr_mx);
254 drq->dr_pending = 0;
255 mutex_exit(&drq->dr_mx);
256 }
257
258 /*
259 * Send a poke_hotspares message to the master node. To avoid swamping the
260 * commd handler with requests we only send a message if there is not one
261  * already outstanding. We punt the request to a separate thread context as
262  * we cannot afford to block waiting on the request to be serviced. This is
263 * essential when a reconfig cycle is in progress as any open() of a multinode
264 * metadevice may result in a livelock.
265 */
266 static void
267 send_poke_hotspares(set_t setno)
268 {
269 daemon_request_t *drq = &mn_hs_request[setno];
270
271 mutex_enter(&drq->dr_mx);
272 if (drq->dr_pending == 0) {
273 drq->dr_pending = 1;
274 drq->dq.qlen = (int)setno;
275 daemon_request(&md_mhs_daemon,
276 send_poke_hotspares_msg, (daemon_queue_t *)drq, REQ_OLD);
277 }
278 mutex_exit(&drq->dr_mx);
279 }
280
281 void
282 mirror_set_sm_state(
283 mm_submirror_t *sm,
284 mm_submirror_ic_t *smic,
285 sm_state_t newstate,
286 int force)
287 {
288 int compcnt;
289 int i;
290 int errcnt;
291 sm_state_t origstate;
292 md_m_shared_t *shared;
293
294 if (force) {
295 sm->sm_state = newstate;
296 uniqtime32(&sm->sm_timestamp);
297 return;
298 }
299
300 origstate = newstate;
301
302 compcnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
303 for (i = 0, errcnt = 0; i < compcnt; i++) {
304 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
305 (sm->sm_dev, sm, i);
306 if (shared->ms_state & (CS_ERRED | CS_LAST_ERRED))
307 newstate |= SMS_COMP_ERRED;
308 if (shared->ms_state & (CS_RESYNC))
309 newstate |= SMS_COMP_RESYNC;
310 if (shared->ms_state & CS_ERRED)
311 errcnt++;
312 }
313
314 if ((newstate & (SMS_COMP_ERRED | SMS_COMP_RESYNC)) != 0)
315 newstate &= ~origstate;
316
317 if (errcnt == compcnt)
318 newstate |= SMS_ALL_ERRED;
319 else
320 newstate &= ~SMS_ALL_ERRED;
321
322 sm->sm_state = newstate;
323 uniqtime32(&sm->sm_timestamp);
324 }
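
/*
 * Worked example (a sketch, not part of the original comments): if this
 * routine is called with newstate == SMS_RUNNING and force == 0 on a
 * three-component submirror in which exactly one component is CS_ERRED,
 * the component loop adds SMS_COMP_ERRED, the "newstate &= ~origstate"
 * step then strips the requested SMS_RUNNING bit, and errcnt != compcnt
 * keeps SMS_ALL_ERRED clear, so sm_state ends up as SMS_COMP_ERRED.
 */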
325
326 static int
327 mirror_geterror(mm_unit_t *un, int *smi, int *cip, int clr_error,
328 int frm_probe)
329 {
330 mm_submirror_t *sm;
331 mm_submirror_ic_t *smic;
332 md_m_shared_t *shared;
333 int ci;
334 int i;
335 int compcnt;
336 int open_comp; /* flag for open component */
337
338 for (i = *smi; i < NMIRROR; i++) {
339 sm = &un->un_sm[i];
340 smic = &un->un_smic[i];
341
342 if (!SMS_IS(sm, SMS_INUSE))
343 continue;
344
345 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
346 for (ci = *cip; ci < compcnt; ci++) {
347 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
348 (sm->sm_dev, sm, ci);
349 /*
350 			 * If called from any routine but probe, we check for the
351 			 * MDM_S_ISOPEN flag. Since probe does a pseudo open, it
352 			 * sets the MDM_S_PROBEOPEN flag and we test for that
353 			 * flag instead. The two tests are mutually exclusive.
354 */
355 open_comp = (frm_probe) ?
356 (shared->ms_flags & MDM_S_PROBEOPEN):
357 (shared->ms_flags & MDM_S_ISOPEN);
358 if (((shared->ms_flags & MDM_S_IOERR || !open_comp) &&
359 ((shared->ms_state == CS_OKAY) ||
360 (shared->ms_state == CS_RESYNC))) ||
361 (!open_comp &&
362 (shared->ms_state == CS_LAST_ERRED))) {
363 if (clr_error) {
364 shared->ms_flags &= ~MDM_S_IOERR;
365 }
366 *cip = ci;
367 *smi = i;
368 return (1);
369 }
370
371 if (clr_error && (shared->ms_flags & MDM_S_IOERR)) {
372 shared->ms_flags &= ~MDM_S_IOERR;
373 }
374 }
375
376 *cip = 0;
377 }
378 return (0);
379 }
380
381 /*ARGSUSED*/
382 static void
383 mirror_run_queue(void *d)
384 {
385 if (!(md_status & MD_GBL_DAEMONS_LIVE))
386 md_daemon(1, &md_done_daemon);
387 }
388 /*
389 * check_comp_4_hotspares
390 *
391 * This function attempts to allocate a hotspare for this component if the
392 * component is in error. In a MN set, the function can be called in 2 modes.
393 * It can be called either when a component error has been detected or when a
394 * new hotspare has been allocated. In this case, MD_HOTSPARE_XMIT is set
395 * in flags and the request is sent to all nodes.
396 * The handler on each of the nodes then calls this function with
397 * MD_HOTSPARE_XMIT unset and the hotspare allocation is then performed.
398 *
399 * For non-MN sets the function simply attempts to allocate a hotspare.
400 *
401 * On entry, the following locks are held
402 * mirror_md_ops.md_link_rw (if flags has MD_HOTSPARE_LINKHELD set)
403 * md_unit_writerlock
404 *
405 * Returns 0 if ok
406 * 1 if the unit containing the component has been cleared while
407 * the mdmn_ksend_message() was being executed
408 */
409 extern int
410 check_comp_4_hotspares(
411 mm_unit_t *un,
412 int smi,
413 int ci,
414 uint_t flags,
415 mddb_recid_t hs_id, /* Only used by MN disksets */
416 IOLOCK *lockp /* can be NULL */
417 )
418 {
419 mm_submirror_t *sm;
420 mm_submirror_ic_t *smic;
421 md_m_shared_t *shared;
422 mddb_recid_t recids[6];
423 minor_t mnum;
424 intptr_t (*hs_dev)();
425 void (*hs_done)();
426 void *hs_data;
427 md_error_t mde = mdnullerror;
428 set_t setno;
429 md_mn_msg_allochsp_t allochspmsg;
430 md_mn_kresult_t *kresult;
431 mm_unit_t *new_un;
432 int rval;
433 int nretries = 0;
434
435 mnum = MD_SID(un);
436 setno = MD_UN2SET(un);
437 sm = &un->un_sm[smi];
438 smic = &un->un_smic[smi];
439 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
440 (sm->sm_dev, sm, ci);
441
442 if (shared->ms_state != CS_ERRED)
443 return (0);
444
445 /* Don't start a new component resync if a resync is already running. */
446 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
447 return (0);
448
449 if (MD_MNSET_SETNO(setno) && (flags & MD_HOTSPARE_XMIT)) {
450 uint_t msgflags;
451 md_mn_msgtype_t msgtype;
452
453 /* Send allocate hotspare message to all nodes */
454
455 allochspmsg.msg_allochsp_mnum = un->c.un_self_id;
456 allochspmsg.msg_allochsp_sm = smi;
457 allochspmsg.msg_allochsp_comp = ci;
458 allochspmsg.msg_allochsp_hs_id = shared->ms_hs_id;
459
460 /*
461 * Before calling mdmn_ksend_message(), release locks
462 * Can never be in the context of an ioctl.
463 */
464 md_unit_writerexit(MDI_UNIT(mnum));
465 if (flags & MD_HOTSPARE_LINKHELD)
466 rw_exit(&mirror_md_ops.md_link_rw.lock);
467 #ifdef DEBUG
468 if (mirror_debug_flag)
469 printf("send alloc hotspare, flags="
470 "0x%x %x, %x, %x, %x\n", flags,
471 allochspmsg.msg_allochsp_mnum,
472 allochspmsg.msg_allochsp_sm,
473 allochspmsg.msg_allochsp_comp,
474 allochspmsg.msg_allochsp_hs_id);
475 #endif
476 if (flags & MD_HOTSPARE_WMUPDATE) {
477 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE2;
478 /*
479 * When coming from an update of watermarks, there
480 * must already be a message logged that triggered
481 * this action. So, no need to log this message, too.
482 */
483 msgflags = MD_MSGF_NO_LOG;
484 } else {
485 msgtype = MD_MN_MSG_ALLOCATE_HOTSPARE;
486 msgflags = MD_MSGF_DEFAULT_FLAGS;
487 }
488
489 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
490
491 cc4hs_msg:
492 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
493 (char *)&allochspmsg, sizeof (allochspmsg),
494 kresult);
495
496 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
497 #ifdef DEBUG
498 if (mirror_debug_flag)
499 mdmn_ksend_show_error(rval, kresult,
500 "ALLOCATE HOTSPARE");
501 #endif
502 /*
503 * If message is sent ok but exitval indicates an error
504 * it must be because the mirror has been cleared. In
505 * this case re-obtain lock and return an error
506 */
507 if ((rval == 0) && (kresult->kmmr_exitval != 0)) {
508 if (flags & MD_HOTSPARE_LINKHELD) {
509 rw_enter(&mirror_md_ops.md_link_rw.lock,
510 RW_READER);
511 }
512 kmem_free(kresult, sizeof (md_mn_kresult_t));
513 return (1);
514 }
515 /* If we're shutting down already, pause things here. */
516 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
517 while (!md_mn_is_commd_present()) {
518 delay(md_hz);
519 }
520 /*
521 * commd has become reachable again, so retry
522 * once. If this fails we'll panic as the
523 * system is in an unexpected state.
524 */
525 if (nretries++ == 0)
526 goto cc4hs_msg;
527 }
528 cmn_err(CE_PANIC,
529 "ksend_message failure: ALLOCATE_HOTSPARE");
530 }
531 kmem_free(kresult, sizeof (md_mn_kresult_t));
532
533 /*
534 * re-obtain the locks
535 */
536 if (flags & MD_HOTSPARE_LINKHELD)
537 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
538 new_un = md_unit_writerlock(MDI_UNIT(mnum));
539
540 /*
541 * As we had to release the locks in order to send the
542 * message to all nodes, we need to check to see if the
543 * unit has changed. If it has we release the writerlock
544 * and return fail.
545 */
546 if ((new_un != un) || (un->c.un_type != MD_METAMIRROR)) {
547 md_unit_writerexit(MDI_UNIT(mnum));
548 return (1);
549 }
550 } else {
551 if (MD_MNSET_SETNO(setno)) {
552 /*
553 * If 2 or more nodes simultaneously see a
554 * component failure, these nodes will each
555 * send an ALLOCATE_HOTSPARE[2] message.
556 * The first message will allocate the hotspare
557 * and the subsequent messages should do nothing.
558 *
559 * If a slave node doesn't have a hotspare allocated
560 * at the time the message is initiated, then the
561 * passed in hs_id will be 0. If the node
562 * executing this routine has a component shared
563 * ms_hs_id of non-zero, but the message shows a
564 * hs_id of 0, then just return since a hotspare
565 * has already been allocated for this failing
566 * component. When the slave node returns from
567 * the ksend_message the hotspare will have
568 * already been allocated.
569 *
570 * If the slave node does send an hs_id of non-zero,
571 * and the slave node's hs_id matches this node's
572 * ms_hs_id, then the hotspare has error'd and
573 * should be replaced.
574 *
575 * If the slave node sends an hs_id of non-zero and
576 * this node has a different shared ms_hs_id, then
577 * just return since this hotspare has already
578 * been hotspared.
579 */
580 if (shared->ms_hs_id != 0) {
581 if (hs_id == 0) {
582 #ifdef DEBUG
583 if (mirror_debug_flag) {
584 printf("check_comp_4_hotspares"
585 "(NOXMIT), short circuit "
586 "hs_id=0x%x, "
587 "ms_hs_id=0x%x\n",
588 hs_id, shared->ms_hs_id);
589 }
590 #endif
591 return (0);
592 }
593 if (hs_id != shared->ms_hs_id) {
594 #ifdef DEBUG
595 if (mirror_debug_flag) {
596 printf("check_comp_4_hotspares"
597 "(NOXMIT), short circuit2 "
598 "hs_id=0x%x, "
599 "ms_hs_id=0x%x\n",
600 hs_id, shared->ms_hs_id);
601 }
602 #endif
603 return (0);
604 }
605 }
606 }
607
608 sm = &un->un_sm[smi];
609 hs_dev = md_get_named_service(sm->sm_dev, 0,
610 "hotspare device", 0);
611 if ((*hs_dev)(sm->sm_dev, 0, ci, recids, 6, &hs_done,
612 &hs_data) != 0)
613 return (0);
614
615 /*
616 * set_sm_comp_state() commits the modified records.
617 * As we don't transmit the changes, no need to drop the lock.
618 */
619 set_sm_comp_state(un, smi, ci, CS_RESYNC, recids,
620 MD_STATE_NO_XMIT, (IOLOCK *)NULL);
621
622 (*hs_done)(sm->sm_dev, hs_data);
623
624 mirror_check_failfast(mnum);
625
626 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_HOTSPARED, SVM_TAG_METADEVICE,
627 setno, MD_SID(un));
628
629 /*
630 * For a multi-node set we need to reset the un_rs_type,
631 * un_rs_resync_done and un_rs_resync_2_do fields as the
632 * hot-spare resync must copy all applicable data.
633 */
634 if (MD_MNSET_SETNO(setno)) {
635 un->un_rs_type = MD_RS_NONE;
636 un->un_rs_resync_done = 0;
637 un->un_rs_resync_2_do = 0;
638 }
639
640 /*
641 * Must drop writer lock since mirror_resync_unit will
642 * open devices and must be able to grab readerlock.
643 * Don't need to drop IOLOCK since any descendent routines
644 * calling ksend_messages will drop the IOLOCK as needed.
645 *
646 */
647 if (lockp) {
648 md_ioctl_writerexit(lockp);
649 } else {
650 md_unit_writerexit(MDI_UNIT(mnum));
651 }
652
653 /* start resync */
654 (void) mirror_resync_unit(mnum, NULL, &mde, lockp);
655
656 if (lockp) {
657 new_un = md_ioctl_writerlock(lockp, MDI_UNIT(mnum));
658 } else {
659 new_un = md_unit_writerlock(MDI_UNIT(mnum));
660 }
661 }
662 return (0);
663 }
664
665 /*
666 * check_unit_4_hotspares
667 *
668 * For a given mirror, allocate hotspares, if available for any components
669 * that are in error
670 *
671 * Returns 0 if ok
672 * 1 if check_comp_4_hotspares returns non-zero. This will only
673 * happen for a MN unit where the unit has been cleared while
674 * the allocate hotspare message is sent to all nodes.
675 */
676 static int
677 check_unit_4_hotspares(mm_unit_t *un, int flags)
678 {
679 mm_submirror_t *sm;
680 mm_submirror_ic_t *smic;
681 int ci;
682 int i;
683 int compcnt;
684
685 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE)
686 return (0);
687
688 for (i = 0; i < NMIRROR; i++) {
689 sm = &un->un_sm[i];
690 smic = &un->un_smic[i];
691 if (!SMS_IS(sm, SMS_INUSE))
692 continue;
693 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, sm);
694 for (ci = 0; ci < compcnt; ci++) {
695 md_m_shared_t *shared;
696
697 shared = (md_m_shared_t *)
698 (*(smic->sm_shared_by_indx))(sm->sm_dev, sm, ci);
699 /*
700 * Never called from ioctl context, so pass in
701 * (IOLOCK *)NULL. Pass through flags from calling
702 * routine, also setting XMIT flag.
703 */
704 if (check_comp_4_hotspares(un, i, ci,
705 (MD_HOTSPARE_XMIT | flags),
706 shared->ms_hs_id, (IOLOCK *)NULL) != 0)
707 return (1);
708 }
709 }
710 return (0);
711 }
712
713 static void
714 check_4_hotspares(daemon_request_t *drq)
715 {
716 mdi_unit_t *ui;
717 mm_unit_t *un;
718 md_link_t *next;
719 int x;
720
721 mutex_enter(&drq->dr_mx); /* clear up front so can poke */
722 drq->dr_pending = 0; /* again in low level routine if */
723 mutex_exit(&drq->dr_mx); /* something found to do */
724
725 /*
726 * Used to have a problem here. The disksets weren't marked as being
727 * MNHOLD. This opened a window where we could be searching for
728 * hotspares and have the disk set unloaded (released) from under
729 * us causing a panic in stripe_component_count().
730 * The way to prevent that is to mark the set MNHOLD which prevents
731 * any diskset from being released while we are scanning the mirrors,
732 * submirrors and components.
733 */
734
735 for (x = 0; x < md_nsets; x++)
736 md_holdset_enter(x);
737
738 rw_enter(&mirror_md_ops.md_link_rw.lock, RW_READER);
739 for (next = mirror_md_ops.md_head; next != NULL; next = next->ln_next) {
740 ui = MDI_UNIT(next->ln_id);
741
742 un = (mm_unit_t *)md_unit_readerlock(ui);
743
744 /*
745 * Only check the unit if we are the master for this set
746 * For an MN set, poke_hotspares() is only effective on the
747 * master
748 */
749 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
750 md_set[MD_UN2SET(un)].s_am_i_master == 0) {
751 md_unit_readerexit(ui);
752 continue;
753 }
754 if (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) {
755 md_unit_readerexit(ui);
756 continue;
757 }
758 md_unit_readerexit(ui);
759
760 un = (mm_unit_t *)md_unit_writerlock(ui);
761 /*
762 * check_unit_4_hotspares will exit 1 if the unit has been
763 * removed during the process of allocating the hotspare.
764 * This can only happen for a MN metadevice. If unit no longer
765 * exists, no need to release writerlock
766 */
767 if (check_unit_4_hotspares(un, MD_HOTSPARE_LINKHELD) == 0)
768 md_unit_writerexit(ui);
769 else {
770 /*
771 * If check_unit_4_hotspares failed, queue another
772 * request and break out of this one
773 */
774 (void) poke_hotspares();
775 break;
776 }
777 }
778 rw_exit(&mirror_md_ops.md_link_rw.lock);
779
780 for (x = 0; x < md_nsets; x++)
781 md_holdset_exit(x);
782 }
783
784 /*
785 * poke_hotspares
786 *
787  * If there is not a poke_hotspares request already pending, queue a request
788 * to call check_4_hotspares(). This will scan all mirrors and attempt to
789 * allocate hotspares for all components in error.
790 */
791 int
792 poke_hotspares()
793 {
794 mutex_enter(&hotspare_request.dr_mx);
795 if (hotspare_request.dr_pending == 0) {
796 hotspare_request.dr_pending = 1;
797 daemon_request(&md_mhs_daemon,
798 check_4_hotspares, (daemon_queue_t *)&hotspare_request,
799 REQ_OLD);
800 }
801 mutex_exit(&hotspare_request.dr_mx);
802 return (0);
803 }
804
805 static void
806 free_all_ecomps(err_comp_t *ecomp)
807 {
808 err_comp_t *d;
809
810 while (ecomp != NULL) {
811 d = ecomp;
812 ecomp = ecomp->ec_next;
813 kmem_free(d, sizeof (err_comp_t));
814 }
815 }
816
817 /*
818 * NAME: mirror_openfail_console_info
819 *
820  * DESCRIPTION: Prints an informative message to the console when a mirror
821  * cannot be opened.
822 *
823 * PARAMETERS: mm_unit_t un - pointer to mirror unit structure
824 * int smi - submirror index
825 * int ci - component index
826 */
827
828 void
829 mirror_openfail_console_info(mm_unit_t *un, int smi, int ci)
830 {
831 void (*get_dev)();
832 ms_cd_info_t cd;
833 md_dev64_t tmpdev;
834
835 tmpdev = un->un_sm[smi].sm_dev;
836 get_dev = (void (*)())md_get_named_service(tmpdev, 0, "get device", 0);
837 if (get_dev != NULL) {
838 (void) (*get_dev)(tmpdev, smi, ci, &cd);
839 cmn_err(CE_WARN, "md %s: open error on %s",
840 md_shortname(MD_SID(un)), md_devname(MD_UN2SET(un),
841 cd.cd_dev, NULL, 0));
842 } else {
843 cmn_err(CE_WARN, "md %s: open error",
844 md_shortname(MD_SID(un)));
845 }
846 }
847
848 static int
849 mirror_close_all_devs(mm_unit_t *un, int md_cflags)
850 {
851 int i;
852 md_dev64_t dev;
853
854 for (i = 0; i < NMIRROR; i++) {
855 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
856 continue;
857 dev = un->un_sm[i].sm_dev;
858 md_layered_close(dev, md_cflags);
859 }
860 return (0);
861 }
862
863 /*
864 * Keep track of drivers that don't support failfast. We use this so that
865 * we only log one diagnostic message for each of these drivers, no matter
866 * how many times we run the mirror_check_failfast function.
867 * Return 1 if this is a new driver that does not support failfast,
868 * return 0 if we have already seen this non-failfast driver.
869 */
870 static int
871 new_non_ff_driver(const char *s)
872 {
873 mutex_enter(&non_ff_drv_mutex);
874 if (non_ff_drivers == NULL) {
875 non_ff_drivers = (char **)kmem_alloc(2 * sizeof (char *),
876 KM_NOSLEEP);
877 if (non_ff_drivers == NULL) {
878 mutex_exit(&non_ff_drv_mutex);
879 return (1);
880 }
881
882 non_ff_drivers[0] = (char *)kmem_alloc(strlen(s) + 1,
883 KM_NOSLEEP);
884 if (non_ff_drivers[0] == NULL) {
885 kmem_free(non_ff_drivers, 2 * sizeof (char *));
886 non_ff_drivers = NULL;
887 mutex_exit(&non_ff_drv_mutex);
888 return (1);
889 }
890
891 (void) strcpy(non_ff_drivers[0], s);
892 non_ff_drivers[1] = NULL;
893
894 } else {
895 int i;
896 char **tnames;
897 char **tmp;
898
899 for (i = 0; non_ff_drivers[i] != NULL; i++) {
900 if (strcmp(s, non_ff_drivers[i]) == 0) {
901 mutex_exit(&non_ff_drv_mutex);
902 return (0);
903 }
904 }
905
906 /* allow for new element and null */
907 i += 2;
908 tnames = (char **)kmem_alloc(i * sizeof (char *), KM_NOSLEEP);
909 if (tnames == NULL) {
910 mutex_exit(&non_ff_drv_mutex);
911 return (1);
912 }
913
914 for (i = 0; non_ff_drivers[i] != NULL; i++)
915 tnames[i] = non_ff_drivers[i];
916
917 tnames[i] = (char *)kmem_alloc(strlen(s) + 1, KM_NOSLEEP);
918 if (tnames[i] == NULL) {
919 /* adjust i so that it is the right count to free */
920 kmem_free(tnames, (i + 2) * sizeof (char *));
921 mutex_exit(&non_ff_drv_mutex);
922 return (1);
923 }
924
925 (void) strcpy(tnames[i++], s);
926 tnames[i] = NULL;
927
928 tmp = non_ff_drivers;
929 non_ff_drivers = tnames;
930 /* i now represents the count we previously alloced */
931 kmem_free(tmp, i * sizeof (char *));
932 }
933 mutex_exit(&non_ff_drv_mutex);
934
935 return (1);
936 }
937
938 /*
939 * Check for the "ddi-failfast-supported" devtree property on each submirror
940 * component to indicate if we should do I/O to that submirror with the
941 * B_FAILFAST flag set or not. This check is made at various state transitions
942 * in the mirror code (e.g. open, enable, hotspare, etc.). Sometimes we
943 * only need to check one drive (e.g. hotspare) but since the check is
944 * fast and infrequent and sometimes needs to be done on all components we
945 * just check all components on each call.
946 */
947 void
948 mirror_check_failfast(minor_t mnum)
949 {
950 int i;
951 mm_unit_t *un;
952
953 if (md_ff_disable)
954 return;
955
956 un = MD_UNIT(mnum);
957
958 for (i = 0; i < NMIRROR; i++) {
959 int ci;
960 int cnt;
961 int ff = 1;
962 mm_submirror_t *sm;
963 mm_submirror_ic_t *smic;
964 void (*get_dev)();
965
966 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
967 continue;
968
969 sm = &un->un_sm[i];
970 smic = &un->un_smic[i];
971
972 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
973 "get device", 0);
974
975 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, sm);
976 for (ci = 0; ci < cnt; ci++) {
977 int found = 0;
978 dev_t ci_dev;
979 major_t major;
980 dev_info_t *devi;
981 ms_cd_info_t cd;
982
983 /*
984 * this already returns the hs
985 * dev if the device is spared
986 */
987 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
988
989 ci_dev = md_dev64_to_dev(cd.cd_dev);
990 major = getmajor(ci_dev);
991
992 if (major == md_major) {
993 /*
994 * this component must be a soft
995 * partition; get the real dev
996 */
997 minor_t dev_mnum;
998 mdi_unit_t *ui;
999 mp_unit_t *un;
1000 set_t setno;
1001 side_t side;
1002 md_dev64_t tmpdev;
1003
1004 ui = MDI_UNIT(getminor(ci_dev));
1005
1006 /* grab necessary lock */
1007 un = (mp_unit_t *)md_unit_readerlock(ui);
1008
1009 dev_mnum = MD_SID(un);
1010 setno = MD_MIN2SET(dev_mnum);
1011 side = mddb_getsidenum(setno);
1012
1013 tmpdev = un->un_dev;
1014
1015 /* Get dev by device id */
1016 if (md_devid_found(setno, side,
1017 un->un_key) == 1) {
1018 tmpdev = md_resolve_bydevid(dev_mnum,
1019 tmpdev, un->un_key);
1020 }
1021
1022 md_unit_readerexit(ui);
1023
1024 ci_dev = md_dev64_to_dev(tmpdev);
1025 major = getmajor(ci_dev);
1026 }
1027
1028 if (ci_dev != NODEV32 &&
1029 (devi = e_ddi_hold_devi_by_dev(ci_dev, 0))
1030 != NULL) {
1031 ddi_prop_op_t prop_op = PROP_LEN_AND_VAL_BUF;
1032 int propvalue = 0;
1033 int proplength = sizeof (int);
1034 int error;
1035 struct cb_ops *cb;
1036
1037 if ((cb = devopsp[major]->devo_cb_ops) !=
1038 NULL) {
1039 error = (*cb->cb_prop_op)
1040 (DDI_DEV_T_ANY, devi, prop_op,
1041 DDI_PROP_NOTPROM|DDI_PROP_DONTPASS,
1042 "ddi-failfast-supported",
1043 (caddr_t)&propvalue, &proplength);
1044
1045 if (error == DDI_PROP_SUCCESS)
1046 found = 1;
1047 }
1048
1049 if (!found && new_non_ff_driver(
1050 ddi_driver_name(devi))) {
1051 					cmn_err(CE_NOTE, "!md: B_FAILFAST I/O "
1052 "disabled on %s",
1053 ddi_driver_name(devi));
1054 }
1055
1056 ddi_release_devi(devi);
1057 }
1058
1059 /*
1060 * All components must support
1061 * failfast in the submirror.
1062 */
1063 if (!found) {
1064 ff = 0;
1065 break;
1066 }
1067 }
1068
1069 if (ff) {
1070 sm->sm_flags |= MD_SM_FAILFAST;
1071 } else {
1072 sm->sm_flags &= ~MD_SM_FAILFAST;
1073 }
1074 }
1075 }
1076
1077 /*
1078 * Return true if the submirror is unavailable.
1079 * If any of the submirror components are opened then the submirror cannot
1080 * be unavailable (MD_INACCESSIBLE).
1081 * If any of the components are already in the errored state, then the submirror
1082 * cannot be unavailable (MD_INACCESSIBLE).
1083 */
1084 static bool_t
1085 submirror_unavailable(mm_unit_t *un, int smi, int from_probe)
1086 {
1087 mm_submirror_t *sm;
1088 mm_submirror_ic_t *smic;
1089 md_m_shared_t *shared;
1090 int ci;
1091 int compcnt;
1092
1093 sm = &un->un_sm[smi];
1094 smic = &un->un_smic[smi];
1095
1096 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
1097 for (ci = 0; ci < compcnt; ci++) {
1098 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
1099 (sm->sm_dev, sm, ci);
1100 if (from_probe) {
1101 if (shared->ms_flags & MDM_S_PROBEOPEN)
1102 return (B_FALSE);
1103 } else {
1104 if (shared->ms_flags & MDM_S_ISOPEN)
1105 return (B_FALSE);
1106 }
1107 if (shared->ms_state == CS_ERRED ||
1108 shared->ms_state == CS_LAST_ERRED)
1109 return (B_FALSE);
1110 }
1111
1112 return (B_TRUE);
1113 }
1114
1115 static int
1116 mirror_open_all_devs(minor_t mnum, int md_oflags, IOLOCK *lockp)
1117 {
1118 int i;
1119 mm_unit_t *un;
1120 mdi_unit_t *ui;
1121 int err;
1122 int smi;
1123 int ci;
1124 err_comp_t *c;
1125 err_comp_t *ecomps = NULL;
1126 int smmask = 0;
1127 set_t setno;
1128 int sm_cnt;
1129 int sm_unavail_cnt;
1130
1131 mirror_check_failfast(mnum);
1132
1133 un = MD_UNIT(mnum);
1134 ui = MDI_UNIT(mnum);
1135 setno = MD_UN2SET(un);
1136
1137 for (i = 0; i < NMIRROR; i++) {
1138 md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1139
1140 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1141 continue;
1142 if (md_layered_open(mnum, &tmpdev, md_oflags))
1143 smmask |= SMI2BIT(i);
1144 un->un_sm[i].sm_dev = tmpdev;
1145 }
1146
1147 /*
1148 * If smmask is clear, all submirrors are accessible. Clear the
1149 * MD_INACCESSIBLE bit in this case. This bit is also cleared for the
1150 * mirror device. If smmask is set, we have to determine which of the
1151 * submirrors are in error. If no submirror is accessible we mark the
1152 * whole mirror as MD_INACCESSIBLE.
1153 */
1154 if (smmask == 0) {
1155 if (lockp) {
1156 md_ioctl_readerexit(lockp);
1157 (void) md_ioctl_writerlock(lockp, ui);
1158 } else {
1159 md_unit_readerexit(ui);
1160 (void) md_unit_writerlock(ui);
1161 }
1162 ui->ui_tstate &= ~MD_INACCESSIBLE;
1163 if (lockp) {
1164 md_ioctl_writerexit(lockp);
1165 (void) md_ioctl_readerlock(lockp, ui);
1166 } else {
1167 md_unit_writerexit(ui);
1168 (void) md_unit_readerlock(ui);
1169 }
1170
1171 for (i = 0; i < NMIRROR; i++) {
1172 md_dev64_t tmpdev;
1173 mdi_unit_t *sm_ui;
1174
1175 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1176 continue;
1177
1178 tmpdev = un->un_sm[i].sm_dev;
1179 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1180 (void) md_unit_writerlock(sm_ui);
1181 sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1182 md_unit_writerexit(sm_ui);
1183 }
1184
1185 return (0);
1186 }
1187
1188 for (i = 0; i < NMIRROR; i++) {
1189 md_dev64_t tmpdev;
1190
1191 if (!(smmask & SMI2BIT(i)))
1192 continue;
1193
1194 tmpdev = un->un_sm[i].sm_dev;
1195 err = md_layered_open(mnum, &tmpdev, MD_OFLG_CONT_ERRS);
1196 un->un_sm[i].sm_dev = tmpdev;
1197 ASSERT(err == 0);
1198 }
1199
1200 if (lockp) {
1201 md_ioctl_readerexit(lockp);
1202 un = (mm_unit_t *)md_ioctl_writerlock(lockp, ui);
1203 } else {
1204 md_unit_readerexit(ui);
1205 un = (mm_unit_t *)md_unit_writerlock(ui);
1206 }
1207
1208 /*
1209 * We want to make sure the unavailable flag is not masking a real
1210 * error on the submirror.
1211 * For each submirror,
1212 * if all of the submirror components couldn't be opened and there
1213 * are no errors on the submirror, then set the unavailable flag
1214 * otherwise, clear unavailable.
1215 */
1216 sm_cnt = 0;
1217 sm_unavail_cnt = 0;
1218 for (i = 0; i < NMIRROR; i++) {
1219 md_dev64_t tmpdev;
1220 mdi_unit_t *sm_ui;
1221
1222 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
1223 continue;
1224
1225 sm_cnt++;
1226 tmpdev = un->un_sm[i].sm_dev;
1227 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
1228
1229 (void) md_unit_writerlock(sm_ui);
1230 if (submirror_unavailable(un, i, 0)) {
1231 sm_ui->ui_tstate |= MD_INACCESSIBLE;
1232 sm_unavail_cnt++;
1233 } else {
1234 sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
1235 }
1236 md_unit_writerexit(sm_ui);
1237 }
1238
1239 /*
1240 * If all of the submirrors are unavailable, the mirror is also
1241 * unavailable.
1242 */
1243 if (sm_cnt == sm_unavail_cnt) {
1244 ui->ui_tstate |= MD_INACCESSIBLE;
1245 } else {
1246 ui->ui_tstate &= ~MD_INACCESSIBLE;
1247 }
1248
1249 smi = 0;
1250 ci = 0;
1251 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
1252 if (mirror_other_sources(un, smi, ci, 1) == 1) {
1253
1254 free_all_ecomps(ecomps);
1255 (void) mirror_close_all_devs(un, md_oflags);
1256 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_OPEN_FAIL,
1257 SVM_TAG_METADEVICE, setno, MD_SID(un));
1258 mirror_openfail_console_info(un, smi, ci);
1259 if (lockp) {
1260 md_ioctl_writerexit(lockp);
1261 (void) md_ioctl_readerlock(lockp, ui);
1262 } else {
1263 md_unit_writerexit(ui);
1264 (void) md_unit_readerlock(ui);
1265 }
1266 return (ENXIO);
1267 }
1268
1269 /* track all component states that need changing */
1270 c = (err_comp_t *)kmem_alloc(sizeof (err_comp_t), KM_SLEEP);
1271 c->ec_next = ecomps;
1272 c->ec_smi = smi;
1273 c->ec_ci = ci;
1274 ecomps = c;
1275 ci++;
1276 }
1277
1278 /* Make all state changes and commit them */
1279 for (c = ecomps; c != NULL; c = c->ec_next) {
1280 /*
1281 * If lockp is set, then entering kernel through ioctl.
1282 * For a MN set, the only ioctl path is via a commd message
1283 * (ALLOCATE_HOTSPARE or *RESYNC* messages) that is already
1284 * being sent to each node.
1285 * In this case, set NO_XMIT so that set_sm_comp_state
1286 		 * won't attempt to send a message while already processing one.
1287 *
1288 * In !MN sets, the xmit flag is ignored, so it doesn't matter
1289 * which flag is passed.
1290 */
1291 if (lockp) {
1292 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1293 MD_STATE_NO_XMIT, lockp);
1294 } else {
1295 set_sm_comp_state(un, c->ec_smi, c->ec_ci, CS_ERRED, 0,
1296 (MD_STATE_XMIT | MD_STATE_OCHELD), lockp);
1297 }
1298 /*
1299 * For a MN set, the NOTIFY is done when the state change is
1300 * processed on each node
1301 */
1302 if (!MD_MNSET_SETNO(setno)) {
1303 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
1304 SVM_TAG_METADEVICE, setno, MD_SID(un));
1305 }
1306 }
1307
1308 if (lockp) {
1309 md_ioctl_writerexit(lockp);
1310 (void) md_ioctl_readerlock(lockp, ui);
1311 } else {
1312 md_unit_writerexit(ui);
1313 (void) md_unit_readerlock(ui);
1314 }
1315
1316 free_all_ecomps(ecomps);
1317
1318 /* allocate hotspares for all errored components */
1319 if (MD_MNSET_SETNO(setno)) {
1320 /*
1321 * If we're called from an ioctl (lockp set) then we cannot
1322 * directly call send_poke_hotspares as this will block until
1323 * the message gets despatched to all nodes. If the cluster is
1324 * going through a reconfig cycle then the message will block
1325 * until the cycle is complete, and as we originate from a
1326 * service call from commd we will livelock.
1327 */
1328 if (lockp == NULL) {
1329 md_unit_readerexit(ui);
1330 send_poke_hotspares(setno);
1331 (void) md_unit_readerlock(ui);
1332 }
1333 } else {
1334 (void) poke_hotspares();
1335 }
1336 return (0);
1337 }
1338
1339 void
1340 mirror_overlap_tree_remove(md_mps_t *ps)
1341 {
1342 mm_unit_t *un;
1343
1344 if (panicstr)
1345 return;
1346
1347 VERIFY(ps->ps_flags & MD_MPS_ON_OVERLAP);
1348 un = ps->ps_un;
1349
1350 mutex_enter(&un->un_overlap_tree_mx);
1351 avl_remove(&un->un_overlap_root, ps);
1352 ps->ps_flags &= ~MD_MPS_ON_OVERLAP;
1353 if (un->un_overlap_tree_flag != 0) {
1354 un->un_overlap_tree_flag = 0;
1355 cv_broadcast(&un->un_overlap_tree_cv);
1356 }
1357 mutex_exit(&un->un_overlap_tree_mx);
1358 }
1359
1360
1361 /*
1362 * wait_for_overlaps:
1363 * -----------------
1364 * Check that given i/o request does not cause an overlap with already pending
1365 * i/o. If it does, block until the overlapped i/o completes.
1366 *
1367 * The flag argument has MD_OVERLAP_ALLOW_REPEAT set if it is ok for the parent
1368 * structure to be already in the overlap tree and MD_OVERLAP_NO_REPEAT if
1369 * it must not already be in the tree.
1370 */
1371 static void
1372 wait_for_overlaps(md_mps_t *ps, int flags)
1373 {
1374 mm_unit_t *un;
1375 avl_index_t where;
1376 md_mps_t *ps1;
1377
1378 if (panicstr)
1379 return;
1380
1381 un = ps->ps_un;
1382 mutex_enter(&un->un_overlap_tree_mx);
1383 if ((flags & MD_OVERLAP_ALLOW_REPEAT) &&
1384 (ps->ps_flags & MD_MPS_ON_OVERLAP)) {
1385 mutex_exit(&un->un_overlap_tree_mx);
1386 return;
1387 }
1388
1389 VERIFY(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1390
1391 do {
1392 ps1 = avl_find(&un->un_overlap_root, ps, &where);
1393 if (ps1 == NULL) {
1394 /*
1395 * The candidate range does not overlap with any
1396 * range in the tree. Insert it and be done.
1397 */
1398 avl_insert(&un->un_overlap_root, ps, where);
1399 ps->ps_flags |= MD_MPS_ON_OVERLAP;
1400 } else {
1401 /*
1402 * The candidate range would overlap. Set the flag
1403 * indicating we need to be woken up, and sleep
1404 * until another thread removes a range. If upon
1405 * waking up we find this mps was put on the tree
1406 * by another thread, the loop terminates.
1407 */
1408 un->un_overlap_tree_flag = 1;
1409 cv_wait(&un->un_overlap_tree_cv,
1410 &un->un_overlap_tree_mx);
1411 }
1412 } while (!(ps->ps_flags & MD_MPS_ON_OVERLAP));
1413 mutex_exit(&un->un_overlap_tree_mx);
1414 }
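
/*
 * Typical usage sketch (editorial illustration based on the code above,
 * not text from the original source): a writer fills in ps->ps_firstblk
 * and ps->ps_lastblk for the request, calls
 * wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT) before issuing the i/o so
 * that the range is inserted into un_overlap_root, and calls
 * mirror_overlap_tree_remove(ps) once the i/o completes so that any
 * blocked overlapping requests are woken via un_overlap_tree_cv.
 */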
1415
1416 /*
1417 * This function is called from mirror_done to check whether any pages have
1418 * been modified while a mirrored write was in progress. Returns 0 if
1419 * all pages associated with bp are clean, 1 otherwise.
1420 */
1421 static int
1422 any_pages_dirty(struct buf *bp)
1423 {
1424 int rval;
1425
1426 rval = biomodified(bp);
1427 if (rval == -1)
1428 rval = 0;
1429
1430 return (rval);
1431 }
1432
1433 #define MAX_EXTRAS 10
1434
1435 void
1436 mirror_commit(
1437 mm_unit_t *un,
1438 int smmask,
1439 mddb_recid_t *extras
1440 )
1441 {
1442 mm_submirror_t *sm;
1443 md_unit_t *su;
1444 int i;
1445
1446 /* 2=mirror,null id */
1447 mddb_recid_t recids[NMIRROR+2+MAX_EXTRAS];
1448
1449 int ri = 0;
1450
1451 if (md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)
1452 return;
1453
1454 /* Add two, this includes the mirror unit and the null recid */
1455 if (extras != NULL) {
1456 int nrecids = 0;
1457 while (extras[nrecids] != 0) {
1458 nrecids++;
1459 }
1460 ASSERT(nrecids <= MAX_EXTRAS);
1461 }
1462
1463 if (un != NULL)
1464 recids[ri++] = un->c.un_record_id;
1465 for (i = 0; i < NMIRROR; i++) {
1466 if (!(smmask & SMI2BIT(i)))
1467 continue;
1468 sm = &un->un_sm[i];
1469 if (!SMS_IS(sm, SMS_INUSE))
1470 continue;
1471 if (md_getmajor(sm->sm_dev) != md_major)
1472 continue;
1473 su = MD_UNIT(md_getminor(sm->sm_dev));
1474 recids[ri++] = su->c.un_record_id;
1475 }
1476
1477 if (extras != NULL)
1478 while (*extras != 0) {
1479 recids[ri++] = *extras;
1480 extras++;
1481 }
1482
1483 if (ri == 0)
1484 return;
1485 recids[ri] = 0;
1486
1487 /*
1488 * Ok to hold ioctl lock across record commit to mddb as
1489 * long as the record(s) being committed aren't resync records.
1490 */
1491 mddb_commitrecs_wrapper(recids);
1492 }
1493
1494
1495 /*
1496  * This routine is used to set a bit in the writable_bm bitmap
1497  * for each submirror in a metamirror which is writable.
1498  *
1499  * The resulting bitmap is stored in ps_writable_sm, the number
1500  * of writable submirrors is stored in ps_active_cnt, and
1501  * ps_current_sm is reset to zero.
1502 */
1503
1504 static void
1505 select_write_units(struct mm_unit *un, md_mps_t *ps)
1506 {
1507
1508 int i;
1509 unsigned writable_bm = 0;
1510 unsigned nunits = 0;
1511
1512 for (i = 0; i < NMIRROR; i++) {
1513 if (SUBMIRROR_IS_WRITEABLE(un, i)) {
1514 /* set bit of all writable units */
1515 writable_bm |= SMI2BIT(i);
1516 nunits++;
1517 }
1518 }
1519 ps->ps_writable_sm = writable_bm;
1520 ps->ps_active_cnt = nunits;
1521 ps->ps_current_sm = 0;
1522 }
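
/*
 * Illustrative example (a sketch, not from the original source): if
 * submirrors 0 and 2 are writable, ps_writable_sm becomes
 * (SMI2BIT(0) | SMI2BIT(2)) == 0x5 and ps_active_cnt becomes 2; the
 * write path later walks this bitmap with md_find_nth_unit() (see
 * below) to start one write per writable submirror.
 */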
1523
1524 static
1525 unsigned
1526 select_write_after_read_units(struct mm_unit *un, md_mps_t *ps)
1527 {
1528
1529 int i;
1530 unsigned writable_bm = 0;
1531 unsigned nunits = 0;
1532
1533 for (i = 0; i < NMIRROR; i++) {
1534 if (SUBMIRROR_IS_WRITEABLE(un, i) &&
1535 un->un_sm[i].sm_flags & MD_SM_RESYNC_TARGET) {
1536 writable_bm |= SMI2BIT(i);
1537 nunits++;
1538 }
1539 }
1540 if ((writable_bm & ps->ps_allfrom_sm) != 0) {
1541 writable_bm &= ~ps->ps_allfrom_sm;
1542 nunits--;
1543 }
1544 ps->ps_writable_sm = writable_bm;
1545 ps->ps_active_cnt = nunits;
1546 ps->ps_current_sm = 0;
1547 return (nunits);
1548 }
1549
1550 static md_dev64_t
1551 select_read_unit(
1552 mm_unit_t *un,
1553 diskaddr_t blkno,
1554 u_longlong_t reqcount,
1555 u_longlong_t *cando,
1556 int must_be_opened,
1557 md_m_shared_t **shared,
1558 md_mcs_t *cs)
1559 {
1560 int i;
1561 md_m_shared_t *s;
1562 uint_t lasterrcnt = 0;
1563 md_dev64_t dev = 0;
1564 u_longlong_t cnt;
1565 u_longlong_t mincnt;
1566 mm_submirror_t *sm;
1567 mm_submirror_ic_t *smic;
1568 mdi_unit_t *ui;
1569
1570 mincnt = reqcount;
1571 for (i = 0; i < NMIRROR; i++) {
1572 if (!SUBMIRROR_IS_READABLE(un, i))
1573 continue;
1574 sm = &un->un_sm[i];
1575 smic = &un->un_smic[i];
1576 cnt = reqcount;
1577
1578 /*
1579 * If the current submirror is marked as inaccessible, do not
1580 * try to access it.
1581 */
1582 ui = MDI_UNIT(getminor(expldev(sm->sm_dev)));
1583 (void) md_unit_readerlock(ui);
1584 if (ui->ui_tstate & MD_INACCESSIBLE) {
1585 md_unit_readerexit(ui);
1586 continue;
1587 }
1588 md_unit_readerexit(ui);
1589
1590 s = (md_m_shared_t *)(*(smic->sm_shared_by_blk))
1591 (sm->sm_dev, sm, blkno, &cnt);
1592
1593 if (must_be_opened && !(s->ms_flags & MDM_S_ISOPEN))
1594 continue;
1595 if (s->ms_state == CS_OKAY) {
1596 *cando = cnt;
1597 if (shared != NULL)
1598 *shared = s;
1599
1600 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST &&
1601 cs != NULL) {
1602 cs->cs_buf.b_flags |= B_FAILFAST;
1603 }
1604
1605 return (un->un_sm[i].sm_dev);
1606 }
1607 if (s->ms_state != CS_LAST_ERRED)
1608 continue;
1609
1610 /* don't use B_FAILFAST since we're Last Erred */
1611
1612 if (mincnt > cnt)
1613 mincnt = cnt;
1614 if (s->ms_lasterrcnt > lasterrcnt) {
1615 lasterrcnt = s->ms_lasterrcnt;
1616 if (shared != NULL)
1617 *shared = s;
1618 dev = un->un_sm[i].sm_dev;
1619 }
1620 }
1621 *cando = mincnt;
1622 return (dev);
1623 }
1624
1625 /*
1626 * Given a 32-bit bitmap, this routine will return the bit number
1627  * of the nth bit set, where n (zero-based) is passed via the index integer.
1628  *
1629  * This routine is used to run through the writable submirror bitmap
1630  * when starting all of the writes. The value returned is the
1631  * index of the appropriate submirror structure in the md_sm
1632  * array for metamirrors.
1633 */
1634 static int
1635 md_find_nth_unit(uint_t mask, int index)
1636 {
1637 int bit, nfound;
1638
1639 for (bit = -1, nfound = -1; nfound != index; bit++) {
1640 ASSERT(mask != 0);
1641 nfound += (mask & 1);
1642 mask >>= 1;
1643 }
1644 return (bit);
1645 }
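
/*
 * Worked example (a sketch, values are hypothetical): with
 * mask == 0xB (binary 1011, i.e. bits 0, 1 and 3 set) and index == 2,
 * the loop skips the first two set bits and returns 3, the zero-based
 * bit position of the third set bit.
 */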
1646
1647 static int
1648 fast_select_read_unit(md_mps_t *ps, md_mcs_t *cs)
1649 {
1650 mm_unit_t *un;
1651 buf_t *bp;
1652 int i;
1653 unsigned nunits = 0;
1654 int iunit;
1655 uint_t running_bm = 0;
1656 uint_t sm_index;
1657
1658 bp = &cs->cs_buf;
1659 un = ps->ps_un;
1660
1661 for (i = 0; i < NMIRROR; i++) {
1662 if (!SMS_BY_INDEX_IS(un, i, SMS_RUNNING))
1663 continue;
1664 running_bm |= SMI2BIT(i);
1665 nunits++;
1666 }
1667 if (nunits == 0)
1668 return (1);
1669
1670 /*
1671 * For directed mirror read (DMR) we only use the specified side and
1672 * do not compute the source of the read.
1673 * If we're running with MD_MPS_DIRTY_RD set we always return the
1674 * first mirror side (this prevents unnecessary ownership switching).
1675 	 * Otherwise we return the submirror according to the mirror read option.
1676 */
1677 if (ps->ps_flags & MD_MPS_DMR) {
1678 sm_index = un->un_dmr_last_read;
1679 } else if (ps->ps_flags & MD_MPS_DIRTY_RD) {
1680 sm_index = md_find_nth_unit(running_bm, 0);
1681 } else {
1682 /* Normal (non-DMR) operation */
1683 switch (un->un_read_option) {
1684 case RD_GEOMETRY:
1685 iunit = (int)(bp->b_lblkno /
1686 howmany(un->c.un_total_blocks, nunits));
1687 sm_index = md_find_nth_unit(running_bm, iunit);
1688 break;
1689 case RD_FIRST:
1690 sm_index = md_find_nth_unit(running_bm, 0);
1691 break;
1692 case RD_LOAD_BAL:
1693 /* this is intentional to fall into the default */
1694 default:
1695 un->un_last_read = (un->un_last_read + 1) % nunits;
1696 sm_index = md_find_nth_unit(running_bm,
1697 un->un_last_read);
1698 break;
1699 }
1700 }
1701 bp->b_edev = md_dev64_to_dev(un->un_sm[sm_index].sm_dev);
1702 ps->ps_allfrom_sm = SMI2BIT(sm_index);
1703
1704 if (un->un_sm[sm_index].sm_flags & MD_SM_FAILFAST) {
1705 bp->b_flags |= B_FAILFAST;
1706 }
1707
1708 return (0);
1709 }
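
/*
 * Example of the read-option selection above (a sketch; the numbers are
 * hypothetical): with un_read_option == RD_GEOMETRY, two running
 * submirrors and un_total_blocks == 1000, a request at b_lblkno 600
 * yields iunit = 600 / howmany(1000, 2) = 1, so the second running
 * submirror is read; with RD_LOAD_BAL the running submirrors are used
 * in round-robin order via un_last_read.
 */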
1710
1711 static
1712 int
1713 mirror_are_submirrors_available(mm_unit_t *un)
1714 {
1715 int i;
1716 for (i = 0; i < NMIRROR; i++) {
1717 md_dev64_t tmpdev = un->un_sm[i].sm_dev;
1718
1719 if ((!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) ||
1720 md_getmajor(tmpdev) != md_major)
1721 continue;
1722
1723 if ((MD_MIN2SET(md_getminor(tmpdev)) >= md_nsets) ||
1724 (MD_MIN2UNIT(md_getminor(tmpdev)) >= md_nunits))
1725 return (0);
1726
1727 if (MDI_UNIT(md_getminor(tmpdev)) == NULL)
1728 return (0);
1729 }
1730 return (1);
1731 }
1732
1733 void
1734 build_submirror(mm_unit_t *un, int i, int snarfing)
1735 {
1736 struct mm_submirror *sm;
1737 struct mm_submirror_ic *smic;
1738 md_unit_t *su;
1739 set_t setno;
1740
1741 sm = &un->un_sm[i];
1742 smic = &un->un_smic[i];
1743
1744 sm->sm_flags = 0; /* sometime we may need to do more here */
1745
1746 setno = MD_UN2SET(un);
1747
1748 if (!SMS_IS(sm, SMS_INUSE))
1749 return;
1750 if (snarfing) {
1751 sm->sm_dev = md_getdevnum(setno, mddb_getsidenum(setno),
1752 sm->sm_key, MD_NOTRUST_DEVT);
1753 } else {
1754 if (md_getmajor(sm->sm_dev) == md_major) {
1755 su = MD_UNIT(md_getminor(sm->sm_dev));
1756 un->c.un_flag |= (su->c.un_flag & MD_LABELED);
1757 /* submirror can no longer be soft partitioned */
1758 MD_CAPAB(su) &= (~MD_CAN_SP);
1759 }
1760 }
1761 smic->sm_shared_by_blk = md_get_named_service(sm->sm_dev,
1762 0, "shared by blk", 0);
1763 smic->sm_shared_by_indx = md_get_named_service(sm->sm_dev,
1764 0, "shared by indx", 0);
1765 smic->sm_get_component_count = (int (*)())md_get_named_service(
1766 sm->sm_dev, 0, "get component count", 0);
1767 smic->sm_get_bcss = (int (*)())md_get_named_service(sm->sm_dev, 0,
1768 "get block count skip size", 0);
1769 sm->sm_state &= ~SMS_IGNORE;
1770 if (SMS_IS(sm, SMS_OFFLINE))
1771 MD_STATUS(un) |= MD_UN_OFFLINE_SM;
1772 md_set_parent(sm->sm_dev, MD_SID(un));
1773 }
1774
1775 static void
1776 mirror_cleanup(mm_unit_t *un)
1777 {
1778 mddb_recid_t recid;
1779 int smi;
1780 sv_dev_t sv[NMIRROR];
1781 int nsv = 0;
1782
1783 /*
1784 * If a MN diskset and this node is not the master, do
1785 * not delete any records on snarf of the mirror records.
1786 */
1787 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1788 md_set[MD_UN2SET(un)].s_am_i_master == 0) {
1789 return;
1790 }
1791
1792 for (smi = 0; smi < NMIRROR; smi++) {
1793 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
1794 continue;
1795 sv[nsv].setno = MD_UN2SET(un);
1796 sv[nsv++].key = un->un_sm[smi].sm_key;
1797 }
1798
1799 recid = un->un_rr_dirty_recid;
1800 mddb_deleterec_wrapper(un->c.un_record_id);
1801 if (recid > 0)
1802 mddb_deleterec_wrapper(recid);
1803
1804 md_rem_names(sv, nsv);
1805 }
1806
1807 /*
1808 * Comparison function for the avl tree which tracks
1809 * outstanding writes on submirrors.
1810 *
1811 * Returns:
1812 * -1: ps1 < ps2
1813 * 0: ps1 and ps2 overlap
1814 * 1: ps1 > ps2
1815 */
1816 static int
1817 mirror_overlap_compare(const void *p1, const void *p2)
1818 {
1819 const md_mps_t *ps1 = (md_mps_t *)p1;
1820 const md_mps_t *ps2 = (md_mps_t *)p2;
1821
1822 if (ps1->ps_firstblk < ps2->ps_firstblk) {
1823 if (ps1->ps_lastblk >= ps2->ps_firstblk)
1824 return (0);
1825 return (-1);
1826 }
1827
1828 if (ps1->ps_firstblk > ps2->ps_firstblk) {
1829 if (ps1->ps_firstblk <= ps2->ps_lastblk)
1830 return (0);
1831 return (1);
1832 }
1833
1834 return (0);
1835 }
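
/*
 * Illustrative example (not part of the original source): treating a
 * request as the block range [ps_firstblk, ps_lastblk], comparing
 * [100, 199] against [150, 249] returns 0 (overlap), [100, 149] against
 * [150, 249] returns -1, and [150, 249] against [100, 149] returns 1.
 */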
1836
1837 /*
1838 * Collapse any sparse submirror entries snarfed from the on-disk replica.
1839 * Only the in-core entries are updated. The replica will be updated on-disk
1840 * when the in-core replica is committed on shutdown of the SVM subsystem.
1841 */
1842 static void
1843 collapse_submirrors(mm_unit_t *un)
1844 {
1845 int smi, nremovals, smiremove;
1846 mm_submirror_t *sm, *new_sm, *old_sm;
1847 mm_submirror_ic_t *smic;
1848 int nsmidx = un->un_nsm - 1;
1849
1850 rescan:
1851 nremovals = 0;
1852 smiremove = -1;
1853
1854 for (smi = 0; smi <= nsmidx; smi++) {
1855 sm = &un->un_sm[smi];
1856
1857 /*
1858 * Check to see if this submirror is marked as in-use.
1859 * If it isn't then it is a potential sparse entry and
1860 * may need to be cleared from the configuration.
1861 * The records should _already_ have been cleared by the
1862 * original mirror_detach() code, but we need to shuffle
1863 * any NULL entries in un_sm[] to the end of the array.
1864 * Any NULL un_smic[] entries need to be reset to the underlying
1865 * submirror/slice accessor functions.
1866 */
1867 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE)) {
1868 nremovals++;
1869 smiremove = smi;
1870 break;
1871 }
1872 }
1873
1874 if (nremovals == 0) {
1875 /*
1876 * Ensure that we have a matching contiguous set of un_smic[]
1877 * entries for the corresponding un_sm[] entries
1878 */
1879 for (smi = 0; smi <= nsmidx; smi++) {
1880 smic = &un->un_smic[smi];
1881 sm = &un->un_sm[smi];
1882
1883 smic->sm_shared_by_blk =
1884 md_get_named_service(sm->sm_dev, 0,
1885 "shared by_blk", 0);
1886 smic->sm_shared_by_indx =
1887 md_get_named_service(sm->sm_dev, 0,
1888 "shared by indx", 0);
1889 smic->sm_get_component_count =
1890 (int (*)())md_get_named_service(sm->sm_dev, 0,
1891 "get component count", 0);
1892 smic->sm_get_bcss =
1893 (int (*)())md_get_named_service(sm->sm_dev, 0,
1894 "get block count skip size", 0);
1895 }
1896 return;
1897 }
1898
1899 /*
1900 * Reshuffle the submirror devices so that we do not have a dead record
1901 * in the middle of the array. Once we've done this we need to rescan
1902 * the mirror to check for any other holes.
1903 */
1904 for (smi = 0; smi < NMIRROR; smi++) {
1905 if (smi < smiremove)
1906 continue;
1907 if (smi > smiremove) {
1908 old_sm = &un->un_sm[smi];
1909 new_sm = &un->un_sm[smi - 1];
1910 bcopy(old_sm, new_sm, sizeof (mm_submirror_t));
1911 bzero(old_sm, sizeof (mm_submirror_t));
1912 }
1913 }
1914
1915 /*
1916 * Now we need to rescan the array to find the next potential dead
1917 * entry.
1918 */
1919 goto rescan;
1920 }
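
/*
 * Illustrative example (a sketch): with un_nsm == 3 and in-use entries
 * left in un_sm[0], un_sm[2] and un_sm[3] (un_sm[1] sparse), the first
 * pass finds slot 1 empty, shifts slots 2 and 3 down by one and zeroes
 * the vacated slot; the rescan then finds no further holes and resets
 * the un_smic[] accessor functions for the now-contiguous entries.
 */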
1921
1922 /* Return a -1 if optimized record unavailable and set should be released */
1923 int
1924 mirror_build_incore(mm_unit_t *un, int snarfing)
1925 {
1926 int i;
1927
1928 if (MD_STATUS(un) & MD_UN_BEING_RESET) {
1929 mddb_setrecprivate(un->c.un_record_id, MD_PRV_PENDCLEAN);
1930 return (1);
1931 }
1932
1933 if (mirror_are_submirrors_available(un) == 0)
1934 return (1);
1935
1936 if (MD_UNIT(MD_SID(un)) != NULL)
1937 return (0);
1938
1939 MD_STATUS(un) = 0;
1940
1941 /* pre-4.1 didn't define CAN_META_CHILD capability */
1942 MD_CAPAB(un) = MD_CAN_META_CHILD | MD_CAN_PARENT | MD_CAN_SP;
1943
1944 un->un_overlap_tree_flag = 0;
1945 avl_create(&un->un_overlap_root, mirror_overlap_compare,
1946 sizeof (md_mps_t), offsetof(md_mps_t, ps_overlap_node));
1947
1948 /*
1949 * We need to collapse any sparse submirror entries into a non-sparse
1950 * array. This is to cover the case where we have an old replica image
1951 * which has not been updated (i.e. snarfed) since being modified.
1952 * The new code expects all submirror access to be sequential (i.e.
1953 * both the un_sm[] and un_smic[] entries correspond to non-empty
1954 	 * submirrors).
1955 */
1956
1957 collapse_submirrors(un);
1958
1959 for (i = 0; i < NMIRROR; i++)
1960 build_submirror(un, i, snarfing);
1961
1962 if (unit_setup_resync(un, snarfing) != 0) {
1963 if (snarfing) {
1964 mddb_setrecprivate(un->c.un_record_id, MD_PRV_GOTIT);
1965 /*
1966 * If a MN set and set is not stale, then return -1
1967 * which will force the caller to unload the set.
1968 * The MN diskset nodes will return failure if
1969 * unit_setup_resync fails so that nodes won't
1970 * get out of sync.
1971 *
1972 * If set is STALE, the master node can't allocate
1973 * a resync record (if needed), but node needs to
1974 * join the set so that user can delete broken mddbs.
1975 * So, if set is STALE, just continue on.
1976 */
1977 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
1978 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
1979 return (-1);
1980 }
1981 } else
1982 return (1);
1983 }
1984
1985 mutex_init(&un->un_overlap_tree_mx, NULL, MUTEX_DEFAULT, NULL);
1986 cv_init(&un->un_overlap_tree_cv, NULL, CV_DEFAULT, NULL);
1987
1988 un->un_suspend_wr_flag = 0;
1989 mutex_init(&un->un_suspend_wr_mx, NULL, MUTEX_DEFAULT, NULL);
1990 cv_init(&un->un_suspend_wr_cv, NULL, CV_DEFAULT, NULL);
1991
1992 /*
1993 * Allocate mutexes for mirror-owner and resync-owner changes.
1994 * All references to the owner message state field must be guarded
1995 * by this mutex.
1996 */
1997 mutex_init(&un->un_owner_mx, NULL, MUTEX_DEFAULT, NULL);
1998
1999 /*
2000 * Allocate mutex and condvar for resync thread manipulation. These
2001 * will be used by mirror_resync_unit/mirror_ioctl_resync
2002 */
2003 mutex_init(&un->un_rs_thread_mx, NULL, MUTEX_DEFAULT, NULL);
2004 cv_init(&un->un_rs_thread_cv, NULL, CV_DEFAULT, NULL);
2005
2006 /*
2007 * Allocate mutex and condvar for resync progress thread manipulation.
2008 * This allows resyncs to be continued across an intervening reboot.
2009 */
2010 mutex_init(&un->un_rs_progress_mx, NULL, MUTEX_DEFAULT, NULL);
2011 cv_init(&un->un_rs_progress_cv, NULL, CV_DEFAULT, NULL);
2012
2013 /*
2014 * Allocate mutex and condvar for Directed Mirror Reads (DMR). This
2015 * provides synchronization between a user-ioctl and the resulting
2016 * strategy() call that performs the read().
2017 */
2018 mutex_init(&un->un_dmr_mx, NULL, MUTEX_DEFAULT, NULL);
2019 cv_init(&un->un_dmr_cv, NULL, CV_DEFAULT, NULL);
2020
2021 /*
2022 * Allocate rwlocks for un_pernode_dirty_bm accessing.
2023 */
2024 for (i = 0; i < MD_MNMAXSIDES; i++) {
2025 rw_init(&un->un_pernode_dirty_mx[i], NULL, RW_DEFAULT, NULL);
2026 }
2027
2028 /* place various information in the in-core data structures */
2029 md_nblocks_set(MD_SID(un), un->c.un_total_blocks);
2030 MD_UNIT(MD_SID(un)) = un;
2031
2032 return (0);
2033 }
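
/*
 * Usage sketch (illustrative only, not part of the driver): one way a
 * snarf-time caller could act on the return values documented above.
 * The caller shown here is hypothetical; only the meaning of the return
 * codes is taken from mirror_build_incore() itself.
 *
 *	rv = mirror_build_incore(un, 1);
 *	if (rv == -1) {
 *		release (unload) the MN set - resync record unavailable
 *	} else if (rv == 1) {
 *		skip this unit (being reset, submirrors missing, or
 *		resync setup failed on a non-MN/stale snarf)
 *	} else {
 *		unit is (or already was) built incore - nothing more to do
 *	}
 */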
2034
2035
2036 void
2037 reset_mirror(struct mm_unit *un, minor_t mnum, int removing)
2038 {
2039 mddb_recid_t recid, vtoc_id;
2040 size_t bitcnt;
2041 size_t shortcnt;
2042 int smi;
2043 sv_dev_t sv[NMIRROR];
2044 int nsv = 0;
2045 uint_t bits = 0;
2046 minor_t selfid;
2047 md_unit_t *su;
2048 int i;
2049
2050 md_destroy_unit_incore(mnum, &mirror_md_ops);
2051
2052 shortcnt = un->un_rrd_num * sizeof (short);
2053 bitcnt = howmany(un->un_rrd_num, NBBY);
2054
2055 if (un->un_outstanding_writes)
2056 kmem_free((caddr_t)un->un_outstanding_writes, shortcnt);
2057 if (un->un_goingclean_bm)
2058 kmem_free((caddr_t)un->un_goingclean_bm, bitcnt);
2059 if (un->un_goingdirty_bm)
2060 kmem_free((caddr_t)un->un_goingdirty_bm, bitcnt);
2061 if (un->un_resync_bm)
2062 kmem_free((caddr_t)un->un_resync_bm, bitcnt);
2063 if (un->un_pernode_dirty_sum)
2064 kmem_free((caddr_t)un->un_pernode_dirty_sum, un->un_rrd_num);
2065
2066 /*
2067 * Destroy the taskq for deferred processing of DRL clean requests.
2068 * This taskq will only be present for Multi Owner mirrors.
2069 */
2070 if (un->un_drl_task != NULL)
2071 ddi_taskq_destroy(un->un_drl_task);
2072
2073 md_nblocks_set(mnum, -1ULL);
2074 MD_UNIT(mnum) = NULL;
2075
2076 /*
2077 * Attempt release of its minor node
2078 */
2079 md_remove_minor_node(mnum);
2080
2081 if (!removing)
2082 return;
2083
2084 for (smi = 0; smi < NMIRROR; smi++) {
2085 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
2086 continue;
2087 /* reallow soft partitioning of submirror and reset parent */
2088 su = MD_UNIT(md_getminor(un->un_sm[smi].sm_dev));
2089 MD_CAPAB(su) |= MD_CAN_SP;
2090 md_reset_parent(un->un_sm[smi].sm_dev);
2091 reset_comp_states(&un->un_sm[smi], &un->un_smic[smi]);
2092
2093 sv[nsv].setno = MD_MIN2SET(mnum);
2094 sv[nsv++].key = un->un_sm[smi].sm_key;
2095 bits |= SMI2BIT(smi);
2096 }
2097
2098 MD_STATUS(un) |= MD_UN_BEING_RESET;
2099 recid = un->un_rr_dirty_recid;
2100 vtoc_id = un->c.un_vtoc_id;
2101 selfid = MD_SID(un);
2102
2103 mirror_commit(un, bits, 0);
2104
2105 avl_destroy(&un->un_overlap_root);
2106
2107 /* Destroy all mutexes and condvars before returning. */
2108 mutex_destroy(&un->un_suspend_wr_mx);
2109 cv_destroy(&un->un_suspend_wr_cv);
2110 mutex_destroy(&un->un_overlap_tree_mx);
2111 cv_destroy(&un->un_overlap_tree_cv);
2112 mutex_destroy(&un->un_owner_mx);
2113 mutex_destroy(&un->un_rs_thread_mx);
2114 cv_destroy(&un->un_rs_thread_cv);
2115 mutex_destroy(&un->un_rs_progress_mx);
2116 cv_destroy(&un->un_rs_progress_cv);
2117 mutex_destroy(&un->un_dmr_mx);
2118 cv_destroy(&un->un_dmr_cv);
2119
2120 for (i = 0; i < MD_MNMAXSIDES; i++) {
2121 rw_destroy(&un->un_pernode_dirty_mx[i]);
2122 if (un->un_pernode_dirty_bm[i])
2123 kmem_free((caddr_t)un->un_pernode_dirty_bm[i], bitcnt);
2124 }
2125
2126 /*
2127 * Remove self from the namespace
2128 */
2129 if (un->c.un_revision & MD_FN_META_DEV) {
2130 (void) md_rem_selfname(un->c.un_self_id);
2131 }
2132
2133 /* This frees the unit structure. */
2134 mddb_deleterec_wrapper(un->c.un_record_id);
2135
2136 if (recid != 0)
2137 mddb_deleterec_wrapper(recid);
2138
2139 /* Remove the vtoc, if present */
2140 if (vtoc_id)
2141 mddb_deleterec_wrapper(vtoc_id);
2142
2143 md_rem_names(sv, nsv);
2144
2145 SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, SVM_TAG_METADEVICE,
2146 MD_MIN2SET(selfid), selfid);
2147 }
2148
2149 int
2150 mirror_internal_open(
2151 minor_t mnum,
2152 int flag,
2153 int otyp,
2154 int md_oflags,
2155 IOLOCK *lockp /* can be NULL */
2156 )
2157 {
2158 mdi_unit_t *ui = MDI_UNIT(mnum);
2159 int err = 0;
2160
2161 tryagain:
2162 /* single thread */
2163 if (lockp) {
2164 /*
2165 * If ioctl lock is held, use openclose_enter
2166 * routine that will set the ioctl flag when
2167 * grabbing the readerlock.
2168 */
2169 (void) md_ioctl_openclose_enter(lockp, ui);
2170 } else {
2171 (void) md_unit_openclose_enter(ui);
2172 }
2173
2174 /*
2175 * The mirror_open_all_devs routine may end up sending a STATE_UPDATE
2176 * message in a MN diskset and this requires that the openclose
2177 * lock is dropped in order to send this message. So, another
2178 * flag (MD_UL_OPENINPROGRESS) is used to keep another thread from
2179 * attempting an open while this thread has an open in progress.
2180 * Call the *_lh version of the lock exit routines since the ui_mx
2181 * mutex must be held from checking for OPENINPROGRESS until
2182 * after the cv_wait call.
2183 */
2184 mutex_enter(&ui->ui_mx);
2185 if (ui->ui_lock & MD_UL_OPENINPROGRESS) {
2186 if (lockp) {
2187 (void) md_ioctl_openclose_exit_lh(lockp);
2188 } else {
2189 md_unit_openclose_exit_lh(ui);
2190 }
2191 cv_wait(&ui->ui_cv, &ui->ui_mx);
2192 mutex_exit(&ui->ui_mx);
2193 goto tryagain;
2194 }
2195
2196 ui->ui_lock |= MD_UL_OPENINPROGRESS;
2197 mutex_exit(&ui->ui_mx);
2198
2199 /* open devices, if necessary */
2200 if (! md_unit_isopen(ui) || (ui->ui_tstate & MD_INACCESSIBLE)) {
2201 if ((err = mirror_open_all_devs(mnum, md_oflags, lockp)) != 0)
2202 goto out;
2203 }
2204
2205 /* count open */
2206 if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
2207 goto out;
2208
2209 /* unlock, return success */
2210 out:
2211 mutex_enter(&ui->ui_mx);
2212 ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
2213 mutex_exit(&ui->ui_mx);
2214
2215 if (lockp) {
2216 /*
2217 * If ioctl lock is held, use openclose_exit
2218 * routine that will clear the lockp reader flag.
2219 */
2220 (void) md_ioctl_openclose_exit(lockp);
2221 } else {
2222 md_unit_openclose_exit(ui);
2223 }
2224 return (err);
2225 }
2226
2227 int
2228 mirror_internal_close(
2229 minor_t mnum,
2230 int otyp,
2231 int md_cflags,
2232 IOLOCK *lockp /* can be NULL */
2233 )
2234 {
2235 mdi_unit_t *ui = MDI_UNIT(mnum);
2236 mm_unit_t *un;
2237 int err = 0;
2238
2239 /* single thread */
2240 if (lockp) {
2241 /*
2242 * If ioctl lock is held, use openclose_enter
2243 * routine that will set the ioctl flag when
2244 * grabbing the readerlock.
2245 */
2246 un = (mm_unit_t *)md_ioctl_openclose_enter(lockp, ui);
2247 } else {
2248 un = (mm_unit_t *)md_unit_openclose_enter(ui);
2249 }
2250
2251 /* count closed */
2252 if ((err = md_unit_decopen(mnum, otyp)) != 0)
2253 goto out;
2254
2255 /* close devices, if necessary */
2256 if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
2257 /*
2258 * Clean up dirty bitmap for this unit. Do this
2259 * before closing the underlying devices to avoid
2260 * race conditions with reset_mirror() as a
2261 * result of a 'metaset -r' command running in
2262 * parallel. This might cause deallocation of
2263 * dirty region bitmaps; with underlying metadevices
2264 * in place this can't happen.
2265 * Don't do this if a MN set and ABR not set
2266 */
2267 if (new_resync && !(MD_STATUS(un) & MD_UN_KEEP_DIRTY)) {
2268 if (!MD_MNSET_SETNO(MD_UN2SET(un)) ||
2269 !(ui->ui_tstate & MD_ABR_CAP))
2270 mirror_process_unit_resync(un);
2271 }
2272 (void) mirror_close_all_devs(un, md_cflags);
2273
2274 /*
2275 		 * For a MN set with transient capabilities (e.g. ABR/DMR) set,
2276 		 * clear these capabilities on the last open in the cluster.
2277 		 * To do this we send a message to all nodes to see if the
2278 		 * device is open.
2279 */
2280 if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
2281 (ui->ui_tstate & (MD_ABR_CAP|MD_DMR_CAP))) {
2282 if (lockp) {
2283 (void) md_ioctl_openclose_exit(lockp);
2284 } else {
2285 md_unit_openclose_exit(ui);
2286 }
2287
2288 /*
2289 * if we are in the context of an ioctl, drop the
2290 * ioctl lock.
2291 * Otherwise, no other locks should be held.
2292 */
2293 if (lockp) {
2294 IOLOCK_RETURN_RELEASE(0, lockp);
2295 }
2296
2297 mdmn_clear_all_capabilities(mnum);
2298
2299 /* if dropped the lock previously, regain it */
2300 if (lockp) {
2301 IOLOCK_RETURN_REACQUIRE(lockp);
2302 }
2303 return (0);
2304 }
2305 /* unlock and return success */
2306 }
2307 out:
2308 /* Call whether lockp is NULL or not. */
2309 if (lockp) {
2310 md_ioctl_openclose_exit(lockp);
2311 } else {
2312 md_unit_openclose_exit(ui);
2313 }
2314 return (err);
2315 }
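
/*
 * Usage sketch (illustrative only): mirror_internal_open() and
 * mirror_internal_close() are expected to be called as a pair with a
 * matching otyp, and the IOLOCK pointer may be NULL when not running in
 * ioctl context, as noted in the prototypes above.  The flag values
 * shown here are just an example, not a required combination.
 *
 *	if (mirror_internal_open(mnum, FREAD | FWRITE, OTYP_LYR,
 *	    MD_OFLG_NULL, (IOLOCK *)NULL) == 0) {
 *		... perform i/o against the mirror ...
 *		(void) mirror_internal_close(mnum, OTYP_LYR,
 *		    MD_OFLG_NULL, (IOLOCK *)NULL);
 *	}
 */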
2316
2317 /*
2318 * When a component has completed resyncing and is now ok, check if the
2319 * corresponding component in the other submirrors is in the Last Erred
2320 * state. If it is, we want to change that to the Erred state so we stop
2321 * using that component and start using this good component instead.
2322 *
2323 * This is called from set_sm_comp_state and recursively calls
2324 * set_sm_comp_state if it needs to change the Last Erred state.
2325 */
2326 static void
2327 reset_lasterred(mm_unit_t *un, int smi, mddb_recid_t *extras, uint_t flags,
2328 IOLOCK *lockp)
2329 {
2330 mm_submirror_t *sm;
2331 mm_submirror_ic_t *smic;
2332 int ci;
2333 int i;
2334 int compcnt;
2335 int changed = 0;
2336
2337 for (i = 0; i < NMIRROR; i++) {
2338 sm = &un->un_sm[i];
2339 smic = &un->un_smic[i];
2340
2341 if (!SMS_IS(sm, SMS_INUSE))
2342 continue;
2343
2344 /* ignore the submirror that we just made ok */
2345 if (i == smi)
2346 continue;
2347
2348 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
2349 for (ci = 0; ci < compcnt; ci++) {
2350 md_m_shared_t *shared;
2351
2352 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2353 (sm->sm_dev, sm, ci);
2354
2355 if ((shared->ms_state & CS_LAST_ERRED) &&
2356 !mirror_other_sources(un, i, ci, 1)) {
2357
2358 set_sm_comp_state(un, i, ci, CS_ERRED, extras,
2359 flags, lockp);
2360 changed = 1;
2361 }
2362 }
2363 }
2364
2365 /* maybe there is a hotspare for this newly erred component */
2366 if (changed) {
2367 set_t setno;
2368
2369 setno = MD_UN2SET(un);
2370 if (MD_MNSET_SETNO(setno)) {
2371 send_poke_hotspares(setno);
2372 } else {
2373 (void) poke_hotspares();
2374 }
2375 }
2376 }
2377
2378 /*
2379 * set_sm_comp_state
2380 *
2381 * Set the state of a submirror component to the specified new state.
2382 * If the mirror is in a multi-node set, send messages to all nodes to
2383 * block all writes to the mirror and then update the state and release the
2384 * writes. These messages are only sent if MD_STATE_XMIT is set in flags.
2385 * MD_STATE_XMIT will be unset in 2 cases:
2386 * 1. When the state is changed to CS_RESYNC as this state change
2387 * will already have been updated on each node by the processing of the
2388 * distributed metasync command, hence no need to xmit.
2389  * 2. When the state is changed to CS_OKAY after a resync has completed. Again,
2390 * the resync completion will already have been processed on each node by
2391 * the processing of the MD_MN_MSG_RESYNC_PHASE_DONE message for a component
2392 * resync, hence no need to xmit.
2393 *
2394  * If we are called as part of a watermark update (in which case
2395  * MD_STATE_WMUPDATE will be set in ps->flags), this is due to a
2396  * metainit or similar. In this case the message that we send to propagate
2397 * the state change must not be a class1 message as that would deadlock with
2398 * the metainit command that is still being processed.
2399 * This we achieve by creating a class2 message MD_MN_MSG_STATE_UPDATE2
2400  * instead. This also makes the submessage generator create a class2
2401  * submessage rather than a class1 (which would also block).
2402 *
2403 * On entry, unit_writerlock is held
2404 * If MD_STATE_OCHELD is set in flags, then unit_openclose lock is
2405 * also held.
2406 */
2407 void
2408 set_sm_comp_state(
2409 mm_unit_t *un,
2410 int smi,
2411 int ci,
2412 int newstate,
2413 mddb_recid_t *extras,
2414 uint_t flags,
2415 IOLOCK *lockp
2416 )
2417 {
2418 mm_submirror_t *sm;
2419 mm_submirror_ic_t *smic;
2420 md_m_shared_t *shared;
2421 int origstate;
2422 void (*get_dev)();
2423 ms_cd_info_t cd;
2424 char devname[MD_MAX_CTDLEN];
2425 int err;
2426 set_t setno = MD_UN2SET(un);
2427 md_mn_msg_stch_t stchmsg;
2428 mdi_unit_t *ui = MDI_UNIT(MD_SID(un));
2429 md_mn_kresult_t *kresult;
2430 int rval;
2431 uint_t msgflags;
2432 md_mn_msgtype_t msgtype;
2433 int save_lock = 0;
2434 mdi_unit_t *ui_sm;
2435 int nretries = 0;
2436
2437 sm = &un->un_sm[smi];
2438 smic = &un->un_smic[smi];
2439
2440 /* If we have a real error status then turn off MD_INACCESSIBLE. */
2441 ui_sm = MDI_UNIT(getminor(md_dev64_to_dev(sm->sm_dev)));
2442 if (newstate & (CS_ERRED | CS_RESYNC | CS_LAST_ERRED) &&
2443 ui_sm->ui_tstate & MD_INACCESSIBLE) {
2444 ui_sm->ui_tstate &= ~MD_INACCESSIBLE;
2445 }
2446
2447 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
2448 (sm->sm_dev, sm, ci);
2449 origstate = shared->ms_state;
2450
2451 /*
2452 * If the new state is an error and the old one wasn't, generate
2453 * a console message. We do this before we send the state to other
2454 * nodes in a MN set because the state change may change the component
2455 * name if a hotspare is allocated.
2456 */
2457 if ((! (origstate & (CS_ERRED|CS_LAST_ERRED))) &&
2458 (newstate & (CS_ERRED|CS_LAST_ERRED))) {
2459
2460 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2461 "get device", 0);
2462 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2463
2464 err = md_getdevname(setno, mddb_getsidenum(setno), 0,
2465 cd.cd_dev, devname, sizeof (devname));
2466
2467 if (err == ENOENT) {
2468 (void) md_devname(setno, cd.cd_dev, devname,
2469 sizeof (devname));
2470 }
2471
2472 cmn_err(CE_WARN, "md: %s: %s needs maintenance",
2473 md_shortname(md_getminor(sm->sm_dev)), devname);
2474
2475 if (newstate & CS_LAST_ERRED) {
2476 cmn_err(CE_WARN, "md: %s: %s last erred",
2477 md_shortname(md_getminor(sm->sm_dev)),
2478 devname);
2479
2480 } else if (shared->ms_flags & MDM_S_ISOPEN) {
2481 /*
2482 * Close the broken device and clear the open flag on
2483 * it. Closing the device means the RCM framework will
2484 * be able to unconfigure the device if required.
2485 *
2486 * We have to check that the device is open, otherwise
2487 * the first open on it has resulted in the error that
2488 * is being processed and the actual cd.cd_dev will be
2489 * NODEV64.
2490 *
2491 * If this is a multi-node mirror, then the multinode
2492 * state checks following this code will cause the
2493 * slave nodes to close the mirror in the function
2494 * mirror_set_state().
2495 */
2496 md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2497 shared->ms_flags &= ~MDM_S_ISOPEN;
2498 }
2499
2500 } else if ((origstate & CS_LAST_ERRED) && (newstate & CS_ERRED) &&
2501 (shared->ms_flags & MDM_S_ISOPEN)) {
2502 /*
2503 * Similar to logic above except no log messages since we
2504 * are just transitioning from Last Erred to Erred.
2505 */
2506 get_dev = (void (*)())md_get_named_service(sm->sm_dev, 0,
2507 "get device", 0);
2508 (void) (*get_dev)(sm->sm_dev, sm, ci, &cd);
2509
2510 md_layered_close(cd.cd_dev, MD_OFLG_NULL);
2511 shared->ms_flags &= ~MDM_S_ISOPEN;
2512 }
2513
2514 if ((MD_MNSET_SETNO(setno)) && (origstate != newstate) &&
2515 (flags & MD_STATE_XMIT) && !(ui->ui_tstate & MD_ERR_PENDING)) {
2516 /*
2517 * For a multi-node mirror, send the state change to the
2518 * master, which broadcasts to all nodes, including this
2519 * one. Once the message is received, the state is set
2520 * in-core and the master commits the change to disk.
2521 * There is a case, comp_replace, where this function
2522 * can be called from within an ioctl and therefore in this
2523 * case, as the ioctl will already be called on each node,
2524 * there is no need to xmit the state change to the master for
2525 * distribution to the other nodes. MD_STATE_XMIT flag is used
2526 * to indicate whether a xmit is required. The mirror's
2527 * transient state is set to MD_ERR_PENDING to avoid sending
2528 * multiple messages.
2529 */
2530 if (newstate & (CS_ERRED|CS_LAST_ERRED))
2531 ui->ui_tstate |= MD_ERR_PENDING;
2532
2533 /*
2534 * Send a state update message to all nodes. This message
2535 * will generate 2 submessages, the first one to suspend
2536 * all writes to the mirror and the second to update the
2537 * state and resume writes.
2538 */
2539 stchmsg.msg_stch_mnum = un->c.un_self_id;
2540 stchmsg.msg_stch_sm = smi;
2541 stchmsg.msg_stch_comp = ci;
2542 stchmsg.msg_stch_new_state = newstate;
2543 stchmsg.msg_stch_hs_id = shared->ms_hs_id;
2544 #ifdef DEBUG
2545 if (mirror_debug_flag)
2546 printf("send set state, %x, %x, %x, %x, %x\n",
2547 stchmsg.msg_stch_mnum, stchmsg.msg_stch_sm,
2548 stchmsg.msg_stch_comp, stchmsg.msg_stch_new_state,
2549 stchmsg.msg_stch_hs_id);
2550 #endif
2551 if (flags & MD_STATE_WMUPDATE) {
2552 msgtype = MD_MN_MSG_STATE_UPDATE2;
2553 /*
2554 * When coming from an update of watermarks, there
2555 * must already be a message logged that triggered
2556 * this action. So, no need to log this message, too.
2557 */
2558 msgflags = MD_MSGF_NO_LOG;
2559 } else {
2560 msgtype = MD_MN_MSG_STATE_UPDATE;
2561 msgflags = MD_MSGF_DEFAULT_FLAGS;
2562 }
2563
2564 /*
2565 * If we are in the context of an ioctl, drop the ioctl lock.
2566 * lockp holds the list of locks held.
2567 *
2568 * Otherwise, increment the appropriate reacquire counters.
2569 		 * If the openclose lock is held, then we must reacquire the
2570 		 * reader lock before releasing the openclose lock.
2571 * Do not drop the ARRAY_WRITER lock as we may not be able
2572 * to reacquire it.
2573 */
2574 if (lockp) {
2575 if (lockp->l_flags & MD_ARRAY_WRITER) {
2576 save_lock = MD_ARRAY_WRITER;
2577 lockp->l_flags &= ~MD_ARRAY_WRITER;
2578 } else if (lockp->l_flags & MD_ARRAY_READER) {
2579 save_lock = MD_ARRAY_READER;
2580 lockp->l_flags &= ~MD_ARRAY_READER;
2581 }
2582 IOLOCK_RETURN_RELEASE(0, lockp);
2583 } else {
2584 if (flags & MD_STATE_OCHELD) {
2585 md_unit_writerexit(ui);
2586 (void) md_unit_readerlock(ui);
2587 md_unit_openclose_exit(ui);
2588 } else {
2589 md_unit_writerexit(ui);
2590 }
2591 }
2592
2593 kresult = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
2594 sscs_msg:
2595 rval = mdmn_ksend_message(setno, msgtype, msgflags, 0,
2596 (char *)&stchmsg, sizeof (stchmsg), kresult);
2597
2598 if (!MDMN_KSEND_MSG_OK(rval, kresult)) {
2599 mdmn_ksend_show_error(rval, kresult, "STATE UPDATE");
2600 /* If we're shutting down already, pause things here. */
2601 if (kresult->kmmr_comm_state == MDMNE_RPC_FAIL) {
2602 while (!md_mn_is_commd_present()) {
2603 delay(md_hz);
2604 }
2605 /*
2606 * commd is now available; retry the message
2607 * one time. If that fails we fall through and
2608 * panic as the system is in an unexpected state
2609 */
2610 if (nretries++ == 0)
2611 goto sscs_msg;
2612 }
2613 cmn_err(CE_PANIC,
2614 "ksend_message failure: STATE_UPDATE");
2615 }
2616 kmem_free(kresult, sizeof (md_mn_kresult_t));
2617
2618 /* if dropped the lock previously, regain it */
2619 if (lockp) {
2620 IOLOCK_RETURN_REACQUIRE(lockp);
2621 lockp->l_flags |= save_lock;
2622 } else {
2623 /*
2624 * Reacquire dropped locks and update acquirecnts
2625 * appropriately.
2626 */
2627 if (flags & MD_STATE_OCHELD) {
2628 /*
2629 * openclose also grabs readerlock.
2630 */
2631 (void) md_unit_openclose_enter(ui);
2632 md_unit_readerexit(ui);
2633 (void) md_unit_writerlock(ui);
2634 } else {
2635 (void) md_unit_writerlock(ui);
2636 }
2637 }
2638
2639 ui->ui_tstate &= ~MD_ERR_PENDING;
2640 } else {
2641 shared->ms_state = newstate;
2642 uniqtime32(&shared->ms_timestamp);
2643
2644 if (newstate == CS_ERRED)
2645 shared->ms_flags |= MDM_S_NOWRITE;
2646 else
2647 shared->ms_flags &= ~MDM_S_NOWRITE;
2648
2649 shared->ms_flags &= ~MDM_S_IOERR;
2650 un->un_changecnt++;
2651 shared->ms_lasterrcnt = un->un_changecnt;
2652
2653 mirror_set_sm_state(sm, smic, SMS_RUNNING, 0);
2654 mirror_commit(un, SMI2BIT(smi), extras);
2655 }
2656
2657 if ((origstate & CS_RESYNC) && (newstate & CS_OKAY)) {
2658 /*
2659 * Resetting the Last Erred state will recursively call back
2660 * into this function (set_sm_comp_state) to update the state.
2661 */
2662 reset_lasterred(un, smi, extras, flags, lockp);
2663 }
2664 }
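
/*
 * Caller sketch (illustrative only), summarising the flag handling
 * documented in the block comment above set_sm_comp_state().  The call
 * sites shown here are hypothetical; error_update_unit() below is a
 * real example of the first form.
 *
 *	i/o error path, not in ioctl context, broadcast in a MN set:
 *		set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
 *		    MD_STATE_XMIT, (IOLOCK *)NULL);
 *
 *	same, but triggered while updating watermarks (class2 message):
 *		set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
 *		    MD_STATE_XMIT | MD_STATE_WMUPDATE, (IOLOCK *)NULL);
 *
 *	resync/metasync driven change (already processed on every node,
 *	so MD_STATE_XMIT is left clear):
 *		set_sm_comp_state(un, smi, ci, CS_RESYNC, 0, flags, lockp);
 */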
2665
2666 static int
2667 find_another_logical(
2668 mm_unit_t *un,
2669 mm_submirror_t *esm,
2670 diskaddr_t blk,
2671 u_longlong_t cnt,
2672 int must_be_open,
2673 int state,
2674 int err_cnt)
2675 {
2676 u_longlong_t cando;
2677 md_dev64_t dev;
2678 md_m_shared_t *s;
2679
2680 esm->sm_state |= SMS_IGNORE;
2681 while (cnt != 0) {
2682 u_longlong_t mcnt;
2683
2684 mcnt = MIN(cnt, lbtodb(1024 * 1024 * 1024)); /* 1 Gig Blks */
2685
2686 dev = select_read_unit(un, blk, mcnt, &cando,
2687 must_be_open, &s, NULL);
2688 if (dev == (md_dev64_t)0)
2689 break;
2690
2691 if ((state == CS_LAST_ERRED) &&
2692 (s->ms_state == CS_LAST_ERRED) &&
2693 (err_cnt > s->ms_lasterrcnt))
2694 break;
2695
2696 cnt -= cando;
2697 blk += cando;
2698 }
2699 esm->sm_state &= ~SMS_IGNORE;
2700 return (cnt != 0);
2701 }
2702
2703 int
2704 mirror_other_sources(mm_unit_t *un, int smi, int ci, int must_be_open)
2705 {
2706 mm_submirror_t *sm;
2707 mm_submirror_ic_t *smic;
2708 size_t count;
2709 diskaddr_t block;
2710 u_longlong_t skip;
2711 u_longlong_t size;
2712 md_dev64_t dev;
2713 int cnt;
2714 md_m_shared_t *s;
2715 int not_found;
2716
2717 sm = &un->un_sm[smi];
2718 smic = &un->un_smic[smi];
2719 dev = sm->sm_dev;
2720
2721 /*
2722 * Make sure every component of the submirror
2723 * has other sources.
2724 */
2725 if (ci < 0) {
2726 /* Find the highest lasterrcnt */
2727 cnt = (*(smic->sm_get_component_count))(dev, sm);
2728 for (ci = 0; ci < cnt; ci++) {
2729 not_found = mirror_other_sources(un, smi, ci,
2730 must_be_open);
2731 if (not_found)
2732 return (1);
2733 }
2734 return (0);
2735 }
2736
2737 /*
2738 * Make sure this component has other sources
2739 */
2740 (void) (*(smic->sm_get_bcss))
2741 (dev, sm, ci, &block, &count, &skip, &size);
2742
2743 if (count == 0)
2744 return (1);
2745
2746 s = (md_m_shared_t *)(*(smic->sm_shared_by_indx))(dev, sm, ci);
2747
2748 while (count--) {
2749 if (block >= un->c.un_total_blocks)
2750 return (0);
2751
2752 if ((block + size) > un->c.un_total_blocks)
2753 size = un->c.un_total_blocks - block;
2754
2755 not_found = find_another_logical(un, sm, block, size,
2756 must_be_open, s->ms_state, s->ms_lasterrcnt);
2757 if (not_found)
2758 return (1);
2759
2760 block += size + skip;
2761 }
2762 return (0);
2763 }
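
/*
 * Usage sketch (illustrative only): passing a negative ci checks every
 * component of submirror smi, as the recursive loop above shows.  A
 * return of 1 means some block range has no alternate readable source,
 * which callers such as error_update_unit() use to choose CS_LAST_ERRED
 * over CS_ERRED:
 *
 *	if (mirror_other_sources(un, smi, -1, 0) == 1) {
 *		this submirror is the last source for some data
 *	}
 */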
2764
2765 static void
2766 finish_error(md_mps_t *ps)
2767 {
2768 struct buf *pb;
2769 mm_unit_t *un;
2770 mdi_unit_t *ui;
2771 uint_t new_str_flags;
2772
2773 pb = ps->ps_bp;
2774 un = ps->ps_un;
2775 ui = ps->ps_ui;
2776
2777 /*
2778 * Must flag any error to the resync originator if we're performing
2779 * a Write-after-Read. This corresponds to an i/o error on a resync
2780 * target device and in this case we ought to abort the resync as there
2781 * is nothing that can be done to recover from this without operator
2782 * intervention. If we don't set the B_ERROR flag we will continue
2783 * reading from the mirror but won't write to the target (as it will
2784 * have been placed into an errored state).
2785 * To handle the case of multiple components within a submirror we only
2786 	 * set the B_ERROR bit if explicitly requested via MD_MPS_FLAG_ERROR.
2787 * The originator of the resync read will cause this bit to be set if
2788 * the underlying component count is one for a submirror resync. All
2789 * other resync types will have the flag set as there is no underlying
2790 * resync which can be performed on a contained metadevice for these
2791 * resync types (optimized or component).
2792 */
2793
2794 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ) {
2795 if (ps->ps_flags & MD_MPS_FLAG_ERROR)
2796 pb->b_flags |= B_ERROR;
2797 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2798 MPS_FREE(mirror_parent_cache, ps);
2799 md_unit_readerexit(ui);
2800 md_biodone(pb);
2801 return;
2802 }
2803 /*
2804 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
2805 * operation therefore this I/O request has already been counted,
2806 * the I/O count variable will be decremented by mirror_done()'s
2807 * call to md_biodone().
2808 */
2809 if (ps->ps_changecnt != un->un_changecnt) {
2810 new_str_flags = MD_STR_NOTTOP | MD_IO_COUNTED;
2811 if (ps->ps_flags & MD_MPS_WOW)
2812 new_str_flags |= MD_STR_WOW;
2813 if (ps->ps_flags & MD_MPS_MAPPED)
2814 new_str_flags |= MD_STR_MAPPED;
2815 /*
2816 * If this I/O request was a read that was part of a resync,
2817 * set MD_STR_WAR for the retried read to ensure that the
2818 * resync write (i.e. write-after-read) will be performed
2819 */
2820 if (ps->ps_flags & MD_MPS_RESYNC_READ)
2821 new_str_flags |= MD_STR_WAR;
2822 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2823 MPS_FREE(mirror_parent_cache, ps);
2824 md_unit_readerexit(ui);
2825 (void) md_mirror_strategy(pb, new_str_flags, NULL);
2826 return;
2827 }
2828
2829 pb->b_flags |= B_ERROR;
2830 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
2831 MPS_FREE(mirror_parent_cache, ps);
2832 md_unit_readerexit(ui);
2833 md_biodone(pb);
2834 }
2835
2836 static void
2837 error_update_unit(md_mps_t *ps)
2838 {
2839 mm_unit_t *un;
2840 mdi_unit_t *ui;
2841 int smi; /* sub mirror index */
2842 int ci; /* errored component */
2843 set_t setno;
2844 uint_t flags; /* for set_sm_comp_state() */
2845 uint_t hspflags; /* for check_comp_4_hotspares() */
2846
2847 ui = ps->ps_ui;
2848 un = (mm_unit_t *)md_unit_writerlock(ui);
2849 setno = MD_UN2SET(un);
2850
2851 	/* All of these updates have to be propagated in the case of a MN set */
2852 flags = MD_STATE_XMIT;
2853 hspflags = MD_HOTSPARE_XMIT;
2854
2855 /* special treatment if we are called during updating watermarks */
2856 if (ps->ps_flags & MD_MPS_WMUPDATE) {
2857 flags |= MD_STATE_WMUPDATE;
2858 hspflags |= MD_HOTSPARE_WMUPDATE;
2859 }
2860 smi = 0;
2861 ci = 0;
2862 while (mirror_geterror(un, &smi, &ci, 1, 0) != 0) {
2863 if (mirror_other_sources(un, smi, ci, 0) == 1) {
2864
2865 /* Never called from ioctl context, so (IOLOCK *)NULL */
2866 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED, 0, flags,
2867 (IOLOCK *)NULL);
2868 /*
2869 * For a MN set, the NOTIFY is done when the state
2870 * change is processed on each node
2871 */
2872 if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2873 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_LASTERRED,
2874 SVM_TAG_METADEVICE, setno, MD_SID(un));
2875 }
2876 continue;
2877 }
2878 /* Never called from ioctl context, so (IOLOCK *)NULL */
2879 set_sm_comp_state(un, smi, ci, CS_ERRED, 0, flags,
2880 (IOLOCK *)NULL);
2881 /*
2882 * For a MN set, the NOTIFY is done when the state
2883 * change is processed on each node
2884 */
2885 if (!MD_MNSET_SETNO(MD_UN2SET(un))) {
2886 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
2887 SVM_TAG_METADEVICE, setno, MD_SID(un));
2888 }
2889 smi = 0;
2890 ci = 0;
2891 }
2892
2893 md_unit_writerexit(ui);
2894 if (MD_MNSET_SETNO(setno)) {
2895 send_poke_hotspares(setno);
2896 } else {
2897 (void) poke_hotspares();
2898 }
2899 (void) md_unit_readerlock(ui);
2900
2901 finish_error(ps);
2902 }
2903
2904 /*
2905 * When we have a B_FAILFAST IO error on a Last Erred component we need to
2906 * retry the IO without B_FAILFAST set so that we try to ensure that the
2907 * component "sees" each IO.
2908 */
2909 static void
2910 last_err_retry(md_mcs_t *cs)
2911 {
2912 struct buf *cb;
2913 md_mps_t *ps;
2914 uint_t flags;
2915
2916 cb = &cs->cs_buf;
2917 cb->b_flags &= ~B_FAILFAST;
2918
2919 	/* if we're panicking just let this I/O error out */
2920 if (panicstr) {
2921 (void) mirror_done(cb);
2922 return;
2923 }
2924
2925 /* reissue the I/O */
2926
2927 ps = cs->cs_ps;
2928
2929 bioerror(cb, 0);
2930
2931 mutex_enter(&ps->ps_mx);
2932
2933 flags = MD_STR_NOTTOP;
2934 if (ps->ps_flags & MD_MPS_MAPPED)
2935 flags |= MD_STR_MAPPED;
2936 if (ps->ps_flags & MD_MPS_NOBLOCK)
2937 flags |= MD_NOBLOCK;
2938
2939 mutex_exit(&ps->ps_mx);
2940
2941 clear_retry_error(cb);
2942
2943 cmn_err(CE_NOTE, "!md: %s: Last Erred, retry I/O without B_FAILFAST",
2944 md_shortname(getminor(cb->b_edev)));
2945
2946 md_call_strategy(cb, flags, NULL);
2947 }
2948
2949 static void
2950 mirror_error(md_mps_t *ps)
2951 {
2952 int smi; /* sub mirror index */
2953 int ci; /* errored component */
2954
2955 if (panicstr) {
2956 finish_error(ps);
2957 return;
2958 }
2959
2960 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
2961 mirror_overlap_tree_remove(ps);
2962
2963 smi = 0;
2964 ci = 0;
2965 if (mirror_geterror(ps->ps_un, &smi, &ci, 0, 0) != 0) {
2966 md_unit_readerexit(ps->ps_ui);
2967 daemon_request(&md_mstr_daemon, error_update_unit,
2968 (daemon_queue_t *)ps, REQ_OLD);
2969 return;
2970 }
2971
2972 finish_error(ps);
2973 }
2974
2975 static int
2976 copy_write_done(struct buf *cb)
2977 {
2978 md_mps_t *ps;
2979 buf_t *pb;
2980 char *wowbuf;
2981 wowhdr_t *wowhdr;
2982 ssize_t wow_resid;
2983
2984 	/* get wowbuf and save structure */
2985 wowbuf = cb->b_un.b_addr;
2986 wowhdr = WOWBUF_HDR(wowbuf);
2987 ps = wowhdr->wow_ps;
2988 pb = ps->ps_bp;
2989
2990 /* Save error information, then free cb */
2991 if (cb->b_flags & B_ERROR)
2992 pb->b_flags |= B_ERROR;
2993
2994 if (cb->b_flags & B_REMAPPED)
2995 bp_mapout(cb);
2996
2997 freerbuf(cb);
2998
2999 /* update residual and continue if needed */
3000 if ((pb->b_flags & B_ERROR) == 0) {
3001 wow_resid = pb->b_bcount - wowhdr->wow_offset;
3002 pb->b_resid = wow_resid;
3003 if (wow_resid > 0) {
3004 daemon_request(&md_mstr_daemon, copy_write_cont,
3005 (daemon_queue_t *)wowhdr, REQ_OLD);
3006 return (1);
3007 }
3008 }
3009
3010 /* Write is complete, release resources. */
3011 kmem_cache_free(mirror_wowblk_cache, wowhdr);
3012 ASSERT(!(ps->ps_flags & MD_MPS_ON_OVERLAP));
3013 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3014 MPS_FREE(mirror_parent_cache, ps);
3015 md_biodone(pb);
3016 return (0);
3017 }
3018
3019 static void
3020 copy_write_cont(wowhdr_t *wowhdr)
3021 {
3022 buf_t *pb;
3023 buf_t *cb;
3024 char *wowbuf;
3025 int wow_offset;
3026 size_t wow_resid;
3027 diskaddr_t wow_blkno;
3028
3029 wowbuf = WOWHDR_BUF(wowhdr);
3030 pb = wowhdr->wow_ps->ps_bp;
3031
3032 /* get data on current location */
3033 wow_offset = wowhdr->wow_offset;
3034 wow_resid = pb->b_bcount - wow_offset;
3035 wow_blkno = pb->b_lblkno + lbtodb(wow_offset);
3036
3037 /* setup child buffer */
3038 cb = getrbuf(KM_SLEEP);
3039 cb->b_flags = B_WRITE;
3040 cb->b_edev = pb->b_edev;
3041 cb->b_un.b_addr = wowbuf; /* change to point at WOWBUF */
3042 cb->b_bufsize = md_wowbuf_size; /* change to wowbuf_size */
3043 cb->b_iodone = copy_write_done;
3044 cb->b_bcount = MIN(md_wowbuf_size, wow_resid);
3045 cb->b_lblkno = wow_blkno;
3046
3047 /* move offset to next section */
3048 wowhdr->wow_offset += cb->b_bcount;
3049
3050 /* copy and setup write for current section */
3051 bcopy(&pb->b_un.b_addr[wow_offset], wowbuf, cb->b_bcount);
3052
3053 /* do it */
3054 /*
3055 * Do not set the MD_IO_COUNTED flag as this is a new I/O request
3056 * that handles the WOW condition. The resultant increment on the
3057 * I/O count variable is cleared by copy_write_done()'s call to
3058 * md_biodone().
3059 */
3060 (void) md_mirror_strategy(cb, MD_STR_NOTTOP | MD_STR_WOW
3061 | MD_STR_MAPPED, NULL);
3062 }
3063
3064 static void
3065 md_mirror_copy_write(md_mps_t *ps)
3066 {
3067 wowhdr_t *wowhdr;
3068
3069 wowhdr = kmem_cache_alloc(mirror_wowblk_cache, MD_ALLOCFLAGS);
3070 mirror_wowblk_init(wowhdr);
3071 wowhdr->wow_ps = ps;
3072 wowhdr->wow_offset = 0;
3073 copy_write_cont(wowhdr);
3074 }
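
/*
 * Worked example (illustrative only): md_mirror_copy_write() together
 * with copy_write_cont()/copy_write_done() above splits the parent write
 * into md_wowbuf_size chunks.  For a parent b_bcount of
 * 2.5 * md_wowbuf_size the sequence is:
 *
 *	copy_write_cont:  child #1, bcount = md_wowbuf_size
 *	copy_write_done:  resid = 1.5 * md_wowbuf_size > 0, requeue
 *	copy_write_cont:  child #2, bcount = md_wowbuf_size
 *	copy_write_done:  resid = 0.5 * md_wowbuf_size > 0, requeue
 *	copy_write_cont:  child #3, bcount = 0.5 * md_wowbuf_size
 *	copy_write_done:  resid == 0, free wowhdr/ps and md_biodone(pb)
 */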
3075
3076 static void
3077 handle_wow(md_mps_t *ps)
3078 {
3079 buf_t *pb;
3080
3081 pb = ps->ps_bp;
3082
3083 bp_mapin(pb);
3084
3085 md_mirror_wow_cnt++;
3086 if (!(pb->b_flags & B_PHYS) && (md_mirror_wow_flg & WOW_LOGIT)) {
3087 cmn_err(CE_NOTE,
3088 "md: %s, blk %lld, cnt %ld: Write on write %d occurred",
3089 md_shortname(getminor(pb->b_edev)),
3090 (longlong_t)pb->b_lblkno, pb->b_bcount, md_mirror_wow_cnt);
3091 }
3092
3093 /*
3094 * Set the MD_IO_COUNTED flag as we are retrying the same I/O
3095 * operation therefore this I/O request has already been counted,
3096 * the I/O count variable will be decremented by mirror_done()'s
3097 * call to md_biodone().
3098 */
3099 if (md_mirror_wow_flg & WOW_NOCOPY)
3100 (void) md_mirror_strategy(pb, MD_STR_NOTTOP | MD_STR_WOW |
3101 MD_STR_MAPPED | MD_IO_COUNTED, ps);
3102 else
3103 md_mirror_copy_write(ps);
3104 }
3105
3106 /*
3107 * Return true if the specified submirror is either in the Last Erred
3108 * state or is transitioning into the Last Erred state.
3109 */
3110 static bool_t
3111 submirror_is_lasterred(mm_unit_t *un, int smi)
3112 {
3113 mm_submirror_t *sm;
3114 mm_submirror_ic_t *smic;
3115 md_m_shared_t *shared;
3116 int ci;
3117 int compcnt;
3118
3119 sm = &un->un_sm[smi];
3120 smic = &un->un_smic[smi];
3121
3122 compcnt = (*(smic->sm_get_component_count)) (sm->sm_dev, un);
3123 for (ci = 0; ci < compcnt; ci++) {
3124 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3125 (sm->sm_dev, sm, ci);
3126
3127 if (shared->ms_state == CS_LAST_ERRED)
3128 return (B_TRUE);
3129
3130 /*
3131 * It is not currently Last Erred, check if entering Last Erred.
3132 */
3133 if ((shared->ms_flags & MDM_S_IOERR) &&
3134 ((shared->ms_state == CS_OKAY) ||
3135 (shared->ms_state == CS_RESYNC))) {
3136 if (mirror_other_sources(un, smi, ci, 0) == 1)
3137 return (B_TRUE);
3138 }
3139 }
3140
3141 return (B_FALSE);
3142 }
3143
3144
3145 static int
3146 mirror_done(struct buf *cb)
3147 {
3148 md_mps_t *ps;
3149 md_mcs_t *cs;
3150
3151 /*LINTED*/
3152 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3153 ps = cs->cs_ps;
3154
3155 mutex_enter(&ps->ps_mx);
3156
3157 /* check if we need to retry an errored failfast I/O */
3158 if (cb->b_flags & B_ERROR) {
3159 struct buf *pb = ps->ps_bp;
3160
3161 if (cb->b_flags & B_FAILFAST) {
3162 int i;
3163 mm_unit_t *un = ps->ps_un;
3164
3165 for (i = 0; i < NMIRROR; i++) {
3166 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE))
3167 continue;
3168
3169 if (cb->b_edev ==
3170 md_dev64_to_dev(un->un_sm[i].sm_dev)) {
3171
3172 /*
3173 * This is the submirror that had the
3174 * error. Check if it is Last Erred.
3175 */
3176 if (submirror_is_lasterred(un, i)) {
3177 daemon_queue_t *dqp;
3178
3179 mutex_exit(&ps->ps_mx);
3180 dqp = (daemon_queue_t *)cs;
3181 dqp->dq_prev = NULL;
3182 dqp->dq_next = NULL;
3183 daemon_request(&md_done_daemon,
3184 last_err_retry, dqp,
3185 REQ_OLD);
3186 return (1);
3187 }
3188 break;
3189 }
3190 }
3191 }
3192
3193 /* continue to process the buf without doing a retry */
3194 ps->ps_flags |= MD_MPS_ERROR;
3195 pb->b_error = cb->b_error;
3196 }
3197
3198 return (mirror_done_common(cb));
3199 }
3200
3201 /*
3202 * Split from the original mirror_done function so we can handle bufs after a
3203 * retry.
3204 * ps->ps_mx is already held in the caller of this function and the cb error
3205 * has already been checked and handled in the caller.
3206 */
3207 static int
3208 mirror_done_common(struct buf *cb)
3209 {
3210 struct buf *pb;
3211 mm_unit_t *un;
3212 mdi_unit_t *ui;
3213 md_mps_t *ps;
3214 md_mcs_t *cs;
3215 size_t end_rr, start_rr, current_rr;
3216
3217 /*LINTED*/
3218 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3219 ps = cs->cs_ps;
3220 pb = ps->ps_bp;
3221
3222 if (cb->b_flags & B_REMAPPED)
3223 bp_mapout(cb);
3224
3225 ps->ps_frags--;
3226 if (ps->ps_frags != 0) {
3227 mutex_exit(&ps->ps_mx);
3228 kmem_cache_free(mirror_child_cache, cs);
3229 return (1);
3230 }
3231 un = ps->ps_un;
3232 ui = ps->ps_ui;
3233
3234 /*
3235 * Do not update outstanding_writes if we're running with ABR
3236 * set for this mirror or the write() was issued with MD_STR_ABR set.
3237 * Also a resync initiated write() has no outstanding_writes update
3238 * either.
3239 */
3240 if (((cb->b_flags & B_READ) == 0) &&
3241 (un->un_nsm >= 2) &&
3242 (ps->ps_call == NULL) &&
3243 !((ui->ui_tstate & MD_ABR_CAP) || (ps->ps_flags & MD_MPS_ABR)) &&
3244 !(ps->ps_flags & MD_MPS_WRITE_AFTER_READ)) {
3245 BLK_TO_RR(end_rr, ps->ps_lastblk, un);
3246 BLK_TO_RR(start_rr, ps->ps_firstblk, un);
3247 mutex_enter(&un->un_resync_mx);
3248 for (current_rr = start_rr; current_rr <= end_rr; current_rr++)
3249 un->un_outstanding_writes[current_rr]--;
3250 mutex_exit(&un->un_resync_mx);
3251 }
3252 kmem_cache_free(mirror_child_cache, cs);
3253 mutex_exit(&ps->ps_mx);
3254
3255 if (ps->ps_call != NULL) {
3256 daemon_request(&md_done_daemon, ps->ps_call,
3257 (daemon_queue_t *)ps, REQ_OLD);
3258 return (1);
3259 }
3260
3261 if ((ps->ps_flags & MD_MPS_ERROR)) {
3262 daemon_request(&md_done_daemon, mirror_error,
3263 (daemon_queue_t *)ps, REQ_OLD);
3264 return (1);
3265 }
3266
3267 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
3268 mirror_overlap_tree_remove(ps);
3269
3270 /*
3271 * Handle Write-on-Write problem.
3272 	 * Skip in the case of raw and direct I/O as they are
3273 * handled earlier.
3274 *
3275 */
3276 if (!(md_mirror_wow_flg & WOW_DISABLE) &&
3277 !(pb->b_flags & B_READ) &&
3278 !(ps->ps_flags & MD_MPS_WOW) &&
3279 !(pb->b_flags & B_PHYS) &&
3280 any_pages_dirty(pb)) {
3281 md_unit_readerexit(ps->ps_ui);
3282 daemon_request(&md_mstr_daemon, handle_wow,
3283 (daemon_queue_t *)ps, REQ_OLD);
3284 return (1);
3285 }
3286
3287 md_kstat_done(ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3288 MPS_FREE(mirror_parent_cache, ps);
3289 md_unit_readerexit(ui);
3290 md_biodone(pb);
3291 return (0);
3292 }
3293
3294 /*
3295 * Clear error state in submirror component if the retry worked after
3296 * a failfast error.
3297 */
3298 static void
3299 clear_retry_error(struct buf *cb)
3300 {
3301 int smi;
3302 md_mcs_t *cs;
3303 mm_unit_t *un;
3304 mdi_unit_t *ui_sm;
3305 mm_submirror_t *sm;
3306 mm_submirror_ic_t *smic;
3307 u_longlong_t cnt;
3308 md_m_shared_t *shared;
3309
3310 /*LINTED*/
3311 cs = (md_mcs_t *)((caddr_t)cb - md_mirror_mcs_buf_off);
3312 un = cs->cs_ps->ps_un;
3313
3314 for (smi = 0; smi < NMIRROR; smi++) {
3315 if (!SMS_BY_INDEX_IS(un, smi, SMS_INUSE))
3316 continue;
3317
3318 if (cb->b_edev == md_dev64_to_dev(un->un_sm[smi].sm_dev))
3319 break;
3320 }
3321
3322 if (smi >= NMIRROR)
3323 return;
3324
3325 sm = &un->un_sm[smi];
3326 smic = &un->un_smic[smi];
3327 cnt = cb->b_bcount;
3328
3329 ui_sm = MDI_UNIT(getminor(cb->b_edev));
3330 (void) md_unit_writerlock(ui_sm);
3331
3332 shared = (md_m_shared_t *)(*(smic->sm_shared_by_blk))(sm->sm_dev, sm,
3333 cb->b_blkno, &cnt);
3334
3335 if (shared->ms_flags & MDM_S_IOERR) {
3336 shared->ms_flags &= ~MDM_S_IOERR;
3337
3338 } else {
3339 /* the buf spans components and the first one is not erred */
3340 int cnt;
3341 int i;
3342
3343 cnt = (*(smic->sm_get_component_count))(sm->sm_dev, un);
3344 for (i = 0; i < cnt; i++) {
3345 shared = (md_m_shared_t *)(*(smic->sm_shared_by_indx))
3346 (sm->sm_dev, sm, i);
3347
3348 if (shared->ms_flags & MDM_S_IOERR &&
3349 shared->ms_state == CS_OKAY) {
3350
3351 shared->ms_flags &= ~MDM_S_IOERR;
3352 break;
3353 }
3354 }
3355 }
3356
3357 md_unit_writerexit(ui_sm);
3358 }
3359
3360 static size_t
3361 mirror_map_read(
3362 md_mps_t *ps,
3363 md_mcs_t *cs,
3364 diskaddr_t blkno,
3365 u_longlong_t count
3366 )
3367 {
3368 mm_unit_t *un;
3369 buf_t *bp;
3370 u_longlong_t cando;
3371
3372 bp = &cs->cs_buf;
3373 un = ps->ps_un;
3374
3375 bp->b_lblkno = blkno;
3376 if (fast_select_read_unit(ps, cs) == 0) {
3377 bp->b_bcount = ldbtob(count);
3378 return (0);
3379 }
3380 bp->b_edev = md_dev64_to_dev(select_read_unit(un, blkno,
3381 count, &cando, 0, NULL, cs));
3382 bp->b_bcount = ldbtob(cando);
3383 if (count != cando)
3384 return (cando);
3385 return (0);
3386 }
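
/*
 * Usage sketch (illustrative only): a nonzero return from
 * mirror_map_read() is the number of blocks actually mapped (cando),
 * telling the caller that the request crosses a component boundary and
 * that a further child buf is needed for the remainder.  The caller
 * shown here is hypothetical; the real caller is the read strategy code.
 *
 *	more = mirror_map_read(ps, cs, blkno, count);
 *	if (more != 0) {
 *		blkno += more;	advance past the mapped portion
 *		count -= more;	and issue another child for the rest
 *	}
 */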
3387
3388 static void
3389 write_after_read(md_mps_t *ps)
3390 {
3391 struct buf *pb;
3392 int flags;
3393
3394 if (ps->ps_flags & MD_MPS_ERROR) {
3395 mirror_error(ps);
3396 return;
3397 }
3398
3399 pb = ps->ps_bp;
3400 md_kstat_done(ps->ps_ui, pb, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3401 ps->ps_call = NULL;
3402 ps->ps_flags |= MD_MPS_WRITE_AFTER_READ;
3403 flags = MD_STR_NOTTOP | MD_STR_WAR;
3404 if (ps->ps_flags & MD_MPS_MAPPED)
3405 flags |= MD_STR_MAPPED;
3406 if (ps->ps_flags & MD_MPS_NOBLOCK)
3407 flags |= MD_NOBLOCK;
3408 if (ps->ps_flags & MD_MPS_DIRTY_RD)
3409 flags |= MD_STR_DIRTY_RD;
3410 (void) mirror_write_strategy(pb, flags, ps);
3411 }
3412
3413 static void
3414 continue_serial(md_mps_t *ps)
3415 {
3416 md_mcs_t *cs;
3417 buf_t *cb;
3418 mm_unit_t *un;
3419 int flags;
3420
3421 un = ps->ps_un;
3422 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
3423 mirror_child_init(cs);
3424 cb = &cs->cs_buf;
3425 ps->ps_call = NULL;
3426 ps->ps_frags = 1;
3427 (void) mirror_map_write(un, cs, ps, 0);
3428 flags = MD_STR_NOTTOP;
3429 if (ps->ps_flags & MD_MPS_MAPPED)
3430 flags |= MD_STR_MAPPED;
3431 md_call_strategy(cb, flags, NULL);
3432 }
3433
3434 static int
3435 mirror_map_write(mm_unit_t *un, md_mcs_t *cs, md_mps_t *ps, int war)
3436 {
3437 int i;
3438 dev_t dev; /* needed for bioclone, so not md_dev64_t */
3439 buf_t *cb;
3440 buf_t *pb;
3441 diskaddr_t blkno;
3442 size_t bcount;
3443 off_t offset;
3444
3445 pb = ps->ps_bp;
3446 cb = &cs->cs_buf;
3447 cs->cs_ps = ps;
3448
3449 i = md_find_nth_unit(ps->ps_writable_sm, ps->ps_current_sm);
3450
3451 dev = md_dev64_to_dev(un->un_sm[i].sm_dev);
3452
3453 blkno = pb->b_lblkno;
3454 bcount = pb->b_bcount;
3455 offset = 0;
3456 if (war && (blkno == 0) && (un->c.un_flag & MD_LABELED)) {
3457 blkno = DK_LABEL_LOC + 1;
3458 /*
3459 * This handles the case where we're requesting
3460 * a write to block 0 on a label partition
3461 * and the request size was smaller than the
3462 * size of the label. If this is the case
3463 * then we'll return -1. Failure to do so will
3464 * either cause the calling thread to hang due to
3465 * an ssd bug, or worse if the bcount were allowed
3466 		 * to go negative (i.e. large).
3467 */
3468 if (bcount <= DEV_BSIZE*(DK_LABEL_LOC + 1))
3469 return (-1);
3470 bcount -= (DEV_BSIZE*(DK_LABEL_LOC + 1));
3471 offset = (DEV_BSIZE*(DK_LABEL_LOC + 1));
3472 }
3473
3474 cb = md_bioclone(pb, offset, bcount, dev, blkno, mirror_done,
3475 cb, KM_NOSLEEP);
3476 if (war)
3477 cb->b_flags = (cb->b_flags & ~B_READ) | B_WRITE;
3478
3479 /*
3480 	 * If the submirror is in the erred state, check if any component is
3481 * in the Last Erred state. If so, we don't want to use the B_FAILFAST
3482 * flag on the IO.
3483 *
3484 * Provide a fast path for the non-erred case (which should be the
3485 * normal case).
3486 */
3487 if (un->un_sm[i].sm_flags & MD_SM_FAILFAST) {
3488 if (un->un_sm[i].sm_state & SMS_COMP_ERRED) {
3489 mm_submirror_t *sm;
3490 mm_submirror_ic_t *smic;
3491 int ci;
3492 int compcnt;
3493
3494 sm = &un->un_sm[i];
3495 smic = &un->un_smic[i];
3496
3497 compcnt = (*(smic->sm_get_component_count))
3498 (sm->sm_dev, un);
3499 for (ci = 0; ci < compcnt; ci++) {
3500 md_m_shared_t *shared;
3501
3502 shared = (md_m_shared_t *)
3503 (*(smic->sm_shared_by_indx))(sm->sm_dev,
3504 sm, ci);
3505
3506 if (shared->ms_state == CS_LAST_ERRED)
3507 break;
3508 }
3509 if (ci >= compcnt)
3510 cb->b_flags |= B_FAILFAST;
3511
3512 } else {
3513 cb->b_flags |= B_FAILFAST;
3514 }
3515 }
3516
3517 ps->ps_current_sm++;
3518 if (ps->ps_current_sm != ps->ps_active_cnt) {
3519 if (un->un_write_option == WR_SERIAL) {
3520 ps->ps_call = continue_serial;
3521 return (0);
3522 }
3523 return (1);
3524 }
3525 return (0);
3526 }
3527
3528 /*
3529 * directed_read_done:
3530 * ------------------
3531 * Completion routine called when a DMR request has been returned from the
3532 * underlying driver. Wake-up the original ioctl() and return the data to
3533 * the user.
3534 */
3535 static void
3536 directed_read_done(md_mps_t *ps)
3537 {
3538 mm_unit_t *un;
3539 mdi_unit_t *ui;
3540
3541 un = ps->ps_un;
3542 ui = ps->ps_ui;
3543
3544 md_unit_readerexit(ui);
3545 md_kstat_done(ui, ps->ps_bp, (ps->ps_flags & MD_MPS_WRITE_AFTER_READ));
3546 ps->ps_call = NULL;
3547
3548 mutex_enter(&un->un_dmr_mx);
3549 cv_signal(&un->un_dmr_cv);
3550 mutex_exit(&un->un_dmr_mx);
3551
3552 /* release the parent structure */
3553 kmem_cache_free(mirror_parent_cache, ps);
3554 }
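
/*
 * Handshake sketch (illustrative only): the DMR ioctl side is not in
 * this section, but the wake-up performed above pairs with an ioctl
 * thread that is assumed to wait roughly as follows (the mutex/condvar
 * names are those initialised in mirror_build_incore()):
 *
 *	mutex_enter(&un->un_dmr_mx);
 *	issue the directed read, with ps->ps_call = directed_read_done;
 *	cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
 *	mutex_exit(&un->un_dmr_mx);
 *	copy the returned buffer out to the user
 */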
3555
3556 /*
3557 * daemon_io:
3558 * ------------
3559 * Called to issue a mirror_write_strategy() or mirror_read_strategy
3560 * call from a blockable context. NOTE: no mutex can be held on entry to this
3561 * routine
3562 */
3563 static void
3564 daemon_io(daemon_queue_t *dq)
3565 {
3566 md_mps_t *ps = (md_mps_t *)dq;
3567 int flag = MD_STR_NOTTOP;
3568 buf_t *pb = ps->ps_bp;
3569
3570 if (ps->ps_flags & MD_MPS_MAPPED)
3571 flag |= MD_STR_MAPPED;
3572 if (ps->ps_flags & MD_MPS_WOW)
3573 flag |= MD_STR_WOW;
3574 if (ps->ps_flags & MD_MPS_WRITE_AFTER_READ)
3575 flag |= MD_STR_WAR;
3576 if (ps->ps_flags & MD_MPS_ABR)
3577 flag |= MD_STR_ABR;
3578 if (ps->ps_flags & MD_MPS_BLOCKABLE_IO)
3579 flag |= MD_STR_BLOCK_OK;
3580
3581 /*
3582 * If this is a resync read, ie MD_STR_DIRTY_RD not set, set
3583 * MD_STR_WAR before calling mirror_read_strategy
3584 */
3585 if (pb->b_flags & B_READ) {
3586 if (!(ps->ps_flags & MD_MPS_DIRTY_RD))
3587 flag |= MD_STR_WAR;
3588 mirror_read_strategy(pb, flag, ps);
3589 } else
3590 mirror_write_strategy(pb, flag, ps);
3591 }
3592
3593 /*
3594 * update_resync:
3595 * -------------
3596 * Called to update the in-core version of the resync record with the latest
3597 * version that was committed to disk when the previous mirror owner
3598 * relinquished ownership. This call is likely to block as we must hold-off
3599 * any current resync processing that may be occurring.
3600 * On completion of the resync record update we issue the mirror_write_strategy
3601 * call to complete the i/o that first started this sequence. To remove a race
3602 * condition between a new write() request which is submitted and the resync
3603 * record update we acquire the writerlock. This will hold off all i/o to the
3604 * mirror until the resync update has completed.
3605 * NOTE: no mutex can be held on entry to this routine
3606 */
3607 static void
3608 update_resync(daemon_queue_t *dq)
3609 {
3610 md_mps_t *ps = (md_mps_t *)dq;
3611 buf_t *pb = ps->ps_bp;
3612 mdi_unit_t *ui = ps->ps_ui;
3613 mm_unit_t *un = MD_UNIT(ui->ui_link.ln_id);
3614 set_t setno;
3615 int restart_resync;
3616
3617 mutex_enter(&un->un_rrp_inflight_mx);
3618 (void) md_unit_writerlock(ui);
3619 ps->ps_un = un;
3620 setno = MD_MIN2SET(getminor(pb->b_edev));
3621 if (mddb_reread_rr(setno, un->un_rr_dirty_recid) == 0) {
3622 /*
3623 * Synchronize our in-core view of what regions need to be
3624 * resync'd with the on-disk version.
3625 */
3626 mirror_copy_rr(howmany(un->un_rrd_num, NBBY), un->un_resync_bm,
3627 un->un_dirty_bm);
3628
3629 /* Region dirty map is now up to date */
3630 }
3631 restart_resync = (un->un_rs_thread_flags & MD_RI_BLOCK_OWNER) ? 1 : 0;
3632 md_unit_writerexit(ui);
3633 mutex_exit(&un->un_rrp_inflight_mx);
3634
3635 /* Restart the resync thread if it was previously blocked */
3636 if (restart_resync) {
3637 mutex_enter(&un->un_rs_thread_mx);
3638 un->un_rs_thread_flags &= ~MD_RI_BLOCK_OWNER;
3639 cv_signal(&un->un_rs_thread_cv);
3640 mutex_exit(&un->un_rs_thread_mx);
3641 }
3642 /* Continue with original deferred i/o */
3643 daemon_io(dq);
3644 }
3645
3646 /*
3647 * owner_timeout:
3648 * -------------
3649 * Called if the original mdmn_ksend_message() failed and the request is to be
3650 * retried. Reattempt the original ownership change.
3651 *
3652 * NOTE: called at interrupt context (see timeout(9f)).
3653 */
3654 static void
3655 owner_timeout(void *arg)
3656 {
3657 daemon_queue_t *dq = (daemon_queue_t *)arg;
3658
3659 daemon_request(&md_mirror_daemon, become_owner, dq, REQ_OLD);
3660 }
3661
3662 /*
3663 * become_owner:
3664 * ------------
3665 * Called to issue RPC request to become the owner of the mirror
3666 * associated with this i/o request. We assume that the ownership request
3667 * is synchronous, so if it succeeds we will issue the request via
3668 * mirror_write_strategy().
3669 * If multiple i/o's are outstanding we will be called from the mirror_daemon
3670 * service thread.
3671 * NOTE: no mutex should be held on entry to this routine.
3672 */
3673 static void
3674 become_owner(daemon_queue_t *dq)
3675 {
3676 md_mps_t *ps = (md_mps_t *)dq;
3677 mm_unit_t *un = ps->ps_un;
3678 buf_t *pb = ps->ps_bp;
3679 set_t setno;
3680 md_mn_kresult_t *kres;
3681 int msg_flags = md_mirror_msg_flags;
3682 md_mps_t *ps1;
3683
3684 ASSERT(dq->dq_next == NULL && dq->dq_prev == NULL);
3685
3686 /*
3687 * If we're already the mirror owner we do not need to send a message
3688 * but can simply process the i/o request immediately.
3689 * If we've already sent the request to become owner we requeue the
3690 * request as we're waiting for the synchronous ownership message to
3691 * be processed.
3692 */
3693 if (MD_MN_MIRROR_OWNER(un)) {
3694 /*
3695 * As the strategy() call will potentially block we need to
3696 * punt this to a separate thread and complete this request
3697 * as quickly as possible. Note: if we're a read request
3698 		 * this must be a resync; we cannot afford to be queued
3699 * behind any intervening i/o requests. In this case we put the
3700 * request on the md_mirror_rs_daemon queue.
3701 */
3702 if (pb->b_flags & B_READ) {
3703 daemon_request(&md_mirror_rs_daemon, daemon_io, dq,
3704 REQ_OLD);
3705 } else {
3706 daemon_request(&md_mirror_io_daemon, daemon_io, dq,
3707 REQ_OLD);
3708 }
3709 } else {
3710 mutex_enter(&un->un_owner_mx);
3711 if ((un->un_owner_state & MM_MN_OWNER_SENT) == 0) {
3712 md_mn_req_owner_t *msg;
3713 int rval = 0;
3714
3715 /*
3716 * Check to see that we haven't exceeded the maximum
3717 * retry count. If we have we fail the i/o as the
3718 * comms mechanism has become wedged beyond recovery.
3719 */
3720 if (dq->qlen++ >= MD_OWNER_RETRIES) {
3721 mutex_exit(&un->un_owner_mx);
3722 cmn_err(CE_WARN,
3723 "md_mirror: Request exhausted ownership "
3724 "retry limit of %d attempts", dq->qlen);
3725 pb->b_error = EIO;
3726 pb->b_flags |= B_ERROR;
3727 pb->b_resid = pb->b_bcount;
3728 kmem_cache_free(mirror_parent_cache, ps);
3729 md_biodone(pb);
3730 return;
3731 }
3732
3733 /*
3734 * Issue request to change ownership. The call is
3735 * synchronous so when it returns we can complete the
3736 * i/o (if successful), or enqueue it again so that
3737 * the operation will be retried.
3738 */
3739 un->un_owner_state |= MM_MN_OWNER_SENT;
3740 mutex_exit(&un->un_owner_mx);
3741
3742 msg = kmem_zalloc(sizeof (md_mn_req_owner_t), KM_SLEEP);
3743 setno = MD_MIN2SET(getminor(pb->b_edev));
3744 msg->mnum = MD_SID(un);
3745 msg->owner = md_mn_mynode_id;
3746 msg_flags |= MD_MSGF_NO_LOG;
3747 /*
3748 * If this IO is triggered by updating a watermark,
3749 * it might be issued by the creation of a softpartition
3750 * while the commd subsystem is suspended.
3751 * We don't want this message to block.
3752 */
3753 if (ps->ps_flags & MD_MPS_WMUPDATE) {
3754 msg_flags |= MD_MSGF_OVERRIDE_SUSPEND;
3755 }
3756
3757 kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);
3758 rval = mdmn_ksend_message(setno,
3759 MD_MN_MSG_REQUIRE_OWNER, msg_flags, 0,
3760 (char *)msg, sizeof (md_mn_req_owner_t), kres);
3761
3762 kmem_free(msg, sizeof (md_mn_req_owner_t));
3763
3764 if (MDMN_KSEND_MSG_OK(rval, kres)) {
3765 dq->qlen = 0;
3766 /*
3767 * Successfully changed owner, reread the
3768 * resync record so that we have a valid idea of
3769 * any previously committed incomplete write()s.
3770 * NOTE: As we need to acquire the resync mutex
3771 * this may block, so we defer it to a separate
3772 * thread handler. This makes us (effectively)
3773 * non-blocking once the ownership message
3774 * handling has completed.
3775 */
3776 mutex_enter(&un->un_owner_mx);
3777 if (un->un_owner_state & MM_MN_BECOME_OWNER) {
3778 un->un_mirror_owner = md_mn_mynode_id;
3779 /* Sets owner of un_rr_dirty record */
3780 if (un->un_rr_dirty_recid)
3781 (void) mddb_setowner(
3782 un->un_rr_dirty_recid,
3783 md_mn_mynode_id);
3784 un->un_owner_state &=
3785 ~MM_MN_BECOME_OWNER;
3786 /*
3787 * Release the block on the current
3788 * resync region if it is blocked
3789 */
3790 ps1 = un->un_rs_prev_overlap;
3791 if ((ps1 != NULL) &&
3792 (ps1->ps_flags & MD_MPS_ON_OVERLAP))
3793 mirror_overlap_tree_remove(ps1);
3794 mutex_exit(&un->un_owner_mx);
3795
3796 /*
3797 * If we're a read, this must be a
3798 * resync request, issue
3799 * the i/o request on the
3800 * md_mirror_rs_daemon queue. This is
3801 * to avoid a deadlock between the
3802 * resync_unit thread and
3803 * subsequent i/o requests that may
3804 * block on the resync region.
3805 */
3806 if (pb->b_flags & B_READ) {
3807 daemon_request(
3808 &md_mirror_rs_daemon,
3809 update_resync, dq, REQ_OLD);
3810 } else {
3811 daemon_request(
3812 &md_mirror_io_daemon,
3813 update_resync, dq, REQ_OLD);
3814 }
3815 kmem_free(kres,
3816 sizeof (md_mn_kresult_t));
3817 return;
3818 } else {
3819 /*
3820 * Some other node has beaten us to
3821 * obtain ownership. We need to
3822 * reschedule our ownership request
3823 */
3824 mutex_exit(&un->un_owner_mx);
3825 }
3826 } else {
3827 mdmn_ksend_show_error(rval, kres,
3828 "MD_MN_MSG_REQUIRE_OWNER");
3829 /*
3830 * Message transport failure is handled by the
3831 * comms layer. If the ownership change request
3832 * does not succeed we need to flag the error to
3833 * the initiator of the i/o. This is handled by
3834 * the retry logic above. As the request failed
3835 * we do not know _who_ the owner of the mirror
3836 * currently is. We reset our idea of the owner
3837 * to None so that any further write()s will
3838 * attempt to become the owner again. This stops
3839 * multiple nodes writing to the same mirror
3840 * simultaneously.
3841 */
3842 mutex_enter(&un->un_owner_mx);
3843 un->un_owner_state &=
3844 ~(MM_MN_OWNER_SENT|MM_MN_BECOME_OWNER);
3845 un->un_mirror_owner = MD_MN_MIRROR_UNOWNED;
3846 mutex_exit(&un->un_owner_mx);
3847 }
3848 kmem_free(kres, sizeof (md_mn_kresult_t));
3849 } else
3850 mutex_exit(&un->un_owner_mx);
3851
3852 /*
3853 * Re-enqueue this request on the deferred i/o list. Delay the
3854 * request for md_mirror_owner_to usecs to stop thrashing.
3855 */
3856 (void) timeout(owner_timeout, dq,
3857 drv_usectohz(md_mirror_owner_to));
3858 }
3859 }
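
/*
 * Flow sketch (illustrative only) of the ownership hand-off implemented
 * above, summarising the state transitions in become_owner():
 *
 *	not owner, no request outstanding:
 *		set MM_MN_OWNER_SENT and send MD_MN_MSG_REQUIRE_OWNER
 *	message succeeds and MM_MN_BECOME_OWNER is set:
 *		record md_mn_mynode_id as un_mirror_owner, re-read the
 *		resync record (update_resync) and re-issue the i/o
 *	message fails, or another node won ownership first:
 *		clear MM_MN_OWNER_SENT/MM_MN_BECOME_OWNER and retry via
 *		owner_timeout() after md_mirror_owner_to usecs, up to
 *		MD_OWNER_RETRIES attempts before failing the i/o with EIO
 */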
3860
3861 static void
3862 mirror_write_strategy(buf_t *pb, int flag, void *private)
3863 {
3864 md_mps_t *ps;
3865 md_mcs_t *cs;
3866 int more;
3867 mm_unit_t *un;
3868 mdi_unit_t *ui;
3869 buf_t *cb; /* child buf pointer */
3870 set_t setno;
3871 int rs_on_overlap = 0;
3872
3873 ui = MDI_UNIT(getminor(pb->b_edev));
3874 un = (mm_unit_t *)MD_UNIT(getminor(pb->b_edev));
3875
3876
3877 md_kstat_waitq_enter(ui);
3878
3879 /*
3880 * If a state change is in progress for this mirror in a MN set,
3881 * suspend all non-resync writes until the state change is complete.
3882 * The objective of this suspend is to ensure that it is not
3883 * possible for one node to read data from a submirror that another node
3884 * has not written to because of the state change. Therefore we
3885 * suspend all writes until the state change has been made. As it is
3886 * not possible to read from the target of a resync, there is no need
3887 * to suspend resync writes.
3888 * Note that we only block here if the caller can handle a busy-wait.
3889 * The MD_STR_BLOCK_OK flag is set for daemon_io originated i/o only.
3890 */
3891
3892 if (!(flag & MD_STR_WAR)) {
3893 if (flag & MD_STR_BLOCK_OK) {
3894 mutex_enter(&un->un_suspend_wr_mx);
3895 while (un->un_suspend_wr_flag) {
3896 cv_wait(&un->un_suspend_wr_cv,
3897 &un->un_suspend_wr_mx);
3898 }
3899 mutex_exit(&un->un_suspend_wr_mx);
3900 }
3901 (void) md_unit_readerlock(ui);
3902 }
3903
3904 if (!(flag & MD_STR_NOTTOP)) {
3905 if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
3906 md_kstat_waitq_exit(ui);
3907 return;
3908 }
3909 }
3910
3911 setno = MD_MIN2SET(getminor(pb->b_edev));
3912
3913 /* If an ABR write has been requested, set MD_STR_ABR flag */
3914 if (MD_MNSET_SETNO(setno) && (pb->b_flags & B_ABRWRITE))
3915 flag |= MD_STR_ABR;
3916
3917 if (private == NULL) {
3918 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
3919 mirror_parent_init(ps);
3920 } else {
3921 ps = private;
3922 private = NULL;
3923 }
3924 if (flag & MD_STR_MAPPED)
3925 ps->ps_flags |= MD_MPS_MAPPED;
3926
3927 if (flag & MD_STR_WOW)
3928 ps->ps_flags |= MD_MPS_WOW;
3929
3930 if (flag & MD_STR_ABR)
3931 ps->ps_flags |= MD_MPS_ABR;
3932
3933 if (flag & MD_STR_WMUPDATE)
3934 ps->ps_flags |= MD_MPS_WMUPDATE;
3935
3936 /*
3937 * Save essential information from the original buffhdr
3938 * in the md_save structure.
3939 */
3940 ps->ps_un = un;
3941 ps->ps_ui = ui;
3942 ps->ps_bp = pb;
3943 ps->ps_addr = pb->b_un.b_addr;
3944 ps->ps_firstblk = pb->b_lblkno;
3945 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
3946 ps->ps_changecnt = un->un_changecnt;
3947
3948 /*
3949 * Check for suspended writes here. This is where we can defer the
3950 * write request to the daemon_io queue which will then call us with
3951 * the MD_STR_BLOCK_OK flag set and we'll busy-wait (if necessary) at
3952 * the top of this routine.
3953 */
3954 if (!(flag & MD_STR_WAR) && !(flag & MD_STR_BLOCK_OK)) {
3955 mutex_enter(&un->un_suspend_wr_mx);
3956 if (un->un_suspend_wr_flag) {
3957 ps->ps_flags |= MD_MPS_BLOCKABLE_IO;
3958 mutex_exit(&un->un_suspend_wr_mx);
3959 md_unit_readerexit(ui);
3960 daemon_request(&md_mirror_daemon, daemon_io,
3961 (daemon_queue_t *)ps, REQ_OLD);
3962 return;
3963 }
3964 mutex_exit(&un->un_suspend_wr_mx);
3965 }
3966
3967 /*
3968 * If not MN owner and this is an ABR write, make sure the current
3969 * resync region is in the overlaps tree
3970 */
3971 mutex_enter(&un->un_owner_mx);
3972 if (MD_MNSET_SETNO(setno) && (!(MD_MN_MIRROR_OWNER(un))) &&
3973 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
3974 md_mps_t *ps1;
3975 /* Block the current resync region, if not already blocked */
3976 ps1 = un->un_rs_prev_overlap;
3977
3978 if ((ps1 != NULL) && ((ps1->ps_firstblk != 0) ||
3979 (ps1->ps_lastblk != 0))) {
3980 /* Drop locks to avoid deadlock */
3981 mutex_exit(&un->un_owner_mx);
3982 md_unit_readerexit(ui);
3983 wait_for_overlaps(ps1, MD_OVERLAP_ALLOW_REPEAT);
3984 rs_on_overlap = 1;
3985 (void) md_unit_readerlock(ui);
3986 mutex_enter(&un->un_owner_mx);
3987 /*
3988 * Check to see if we have obtained ownership
3989 * while waiting for overlaps. If we have, remove
3990 * the resync_region entry from the overlap tree
3991 */
3992 if (MD_MN_MIRROR_OWNER(un) &&
3993 (ps1->ps_flags & MD_MPS_ON_OVERLAP)) {
3994 mirror_overlap_tree_remove(ps1);
3995 rs_on_overlap = 0;
3996 }
3997 }
3998 }
3999 mutex_exit(&un->un_owner_mx);
4000
4001
4002 /*
	 * The following keeps a write-after-read from writing to the
	 * source in the case where it all came from one place.
4005 */
4006 if (flag & MD_STR_WAR) {
4007 int abort_write = 0;
4008 /*
		 * We are performing a write-after-read. This is either as a
		 * result of a resync read or as a result of a read in a
		 * dirty resync region when the optimized resync is not
		 * complete. If in a MN set and this is a resync-generated
		 * i/o, terminate the write if the current block is not in
		 * the current resync region, as another node must have
		 * completed this resync region.
4016 */
4017 if ((MD_MNSET_SETNO(MD_UN2SET(un))) &&
4018 (!(flag & MD_STR_DIRTY_RD))) {
4019 if (!IN_RESYNC_REGION(un, ps))
4020 abort_write = 1;
4021 }
4022 if ((select_write_after_read_units(un, ps) == 0) ||
4023 (abort_write)) {
4024 #ifdef DEBUG
4025 if (mirror_debug_flag)
4026 printf("Abort resync write on %x, block %lld\n",
4027 MD_SID(un), ps->ps_firstblk);
4028 #endif
4029 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4030 mirror_overlap_tree_remove(ps);
4031 kmem_cache_free(mirror_parent_cache, ps);
4032 md_kstat_waitq_exit(ui);
4033 md_unit_readerexit(ui);
4034 md_biodone(pb);
4035 return;
4036 }
4037 } else {
4038 select_write_units(un, ps);
4039
4040 /* Drop readerlock to avoid deadlock */
4041 md_unit_readerexit(ui);
4042 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4043 un = md_unit_readerlock(ui);
4044 /*
4045 * For a MN set with an ABR write, if we are now the
4046 * owner and we have a resync region in the overlap
4047 * tree, remove the entry from overlaps and retry the write.
4048 */
4049
4050 if (MD_MNSET_SETNO(setno) &&
4051 ((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR))) {
4052 mutex_enter(&un->un_owner_mx);
4053 if (((MD_MN_MIRROR_OWNER(un))) && rs_on_overlap) {
4054 mirror_overlap_tree_remove(ps);
4055 md_kstat_waitq_exit(ui);
4056 mutex_exit(&un->un_owner_mx);
4057 md_unit_readerexit(ui);
4058 daemon_request(&md_mirror_daemon, daemon_io,
4059 (daemon_queue_t *)ps, REQ_OLD);
4060 return;
4061 }
4062 mutex_exit(&un->un_owner_mx);
4063 }
4064 }
4065
4066 /*
4067 * For Multinode mirrors with no owner and a Resync Region (not ABR)
4068 * we need to become the mirror owner before continuing with the
4069 * write(). For ABR mirrors we check that we 'own' the resync if
4070 * we're in write-after-read mode. We do this _after_ ensuring that
4071 * there are no overlaps to ensure that once we know that we are
4072 * the owner, the readerlock will not be released until the write is
4073 * complete. As a change of ownership in a MN set requires the
4074 * writerlock, this ensures that ownership cannot be changed until
4075 * the write is complete.
4076 */
4077 if (MD_MNSET_SETNO(setno) && (!((ui->ui_tstate & MD_ABR_CAP) ||
4078 (flag & MD_STR_ABR)) || (flag & MD_STR_WAR))) {
4079 if (MD_MN_NO_MIRROR_OWNER(un)) {
4080 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4081 mirror_overlap_tree_remove(ps);
4082 md_kstat_waitq_exit(ui);
4083 ASSERT(!(flag & MD_STR_WAR));
4084 md_unit_readerexit(ui);
4085 daemon_request(&md_mirror_daemon, become_owner,
4086 (daemon_queue_t *)ps, REQ_OLD);
4087 return;
4088 }
4089 }
4090
4091 /*
4092 * Mark resync region if mirror has a Resync Region _and_ we are not
4093 * a resync initiated write(). Don't mark region if we're flagged as
4094 * an ABR write.
4095 */
4096 if (!((ui->ui_tstate & MD_ABR_CAP) || (flag & MD_STR_ABR)) &&
4097 !(flag & MD_STR_WAR)) {
4098 if (mirror_mark_resync_region(un, ps->ps_firstblk,
4099 ps->ps_lastblk, md_mn_mynode_id)) {
4100 pb->b_flags |= B_ERROR;
4101 pb->b_resid = pb->b_bcount;
4102 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4103 mirror_overlap_tree_remove(ps);
4104 kmem_cache_free(mirror_parent_cache, ps);
4105 md_kstat_waitq_exit(ui);
4106 md_unit_readerexit(ui);
4107 md_biodone(pb);
4108 return;
4109 }
4110 }
4111
4112 ps->ps_childbflags = pb->b_flags | B_WRITE;
4113 ps->ps_childbflags &= ~B_READ;
4114 if (flag & MD_STR_MAPPED)
4115 ps->ps_childbflags &= ~B_PAGEIO;
4116
4117 if (!(flag & MD_STR_NOTTOP) && panicstr)
4118 /* Disable WOW and don't free ps */
4119 ps->ps_flags |= (MD_MPS_WOW|MD_MPS_DONTFREE);
4120
4121 md_kstat_waitq_to_runq(ui);
4122
4123 /*
4124 * Treat Raw and Direct I/O as Write-on-Write always
4125 */
4126
4127 if (!(md_mirror_wow_flg & WOW_DISABLE) &&
4128 (md_mirror_wow_flg & WOW_PHYS_ENABLE) &&
4129 (pb->b_flags & B_PHYS) &&
4130 !(ps->ps_flags & MD_MPS_WOW)) {
4131 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4132 mirror_overlap_tree_remove(ps);
4133 md_unit_readerexit(ui);
4134 daemon_request(&md_mstr_daemon, handle_wow,
4135 (daemon_queue_t *)ps, REQ_OLD);
4136 return;
4137 }
4138
4139 ps->ps_frags = 1;
4140 do {
4141 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4142 mirror_child_init(cs);
4143 cb = &cs->cs_buf;
4144 more = mirror_map_write(un, cs, ps, (flag & MD_STR_WAR));
4145
4146 /*
4147 * This handles the case where we're requesting
4148 * a write to block 0 on a label partition. (more < 0)
4149 * means that the request size was smaller than the
4150 * size of the label. If so this request is done.
4151 */
4152 if (more < 0) {
4153 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4154 mirror_overlap_tree_remove(ps);
4155 md_kstat_runq_exit(ui);
4156 kmem_cache_free(mirror_child_cache, cs);
4157 kmem_cache_free(mirror_parent_cache, ps);
4158 md_unit_readerexit(ui);
4159 md_biodone(pb);
4160 return;
4161 }
4162 if (more) {
4163 mutex_enter(&ps->ps_mx);
4164 ps->ps_frags++;
4165 mutex_exit(&ps->ps_mx);
4166 }
4167 md_call_strategy(cb, flag, private);
4168 } while (more);
4169
4170 if (!(flag & MD_STR_NOTTOP) && panicstr) {
4171 while (!(ps->ps_flags & MD_MPS_DONE)) {
4172 md_daemon(1, &md_done_daemon);
4173 drv_usecwait(10);
4174 }
4175 kmem_cache_free(mirror_parent_cache, ps);
4176 }
4177 }
4178
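/*
 * mirror_read_strategy:
 * --------------------
 * Read path for a mirror metadevice. The request is broken into one or
 * more child bufs, each cloned from the parent with md_bioclone() and
 * mapped onto a readable submirror by mirror_map_read(). Resync reads
 * (MD_STR_WAR) and reads of dirty regions while an optimized resync is
 * outstanding schedule a write-after-read so that every submirror ends up
 * with the same data; directed reads (MD_STR_DMR) skip this and simply
 * return the data from the submirror selected in un->un_dmr_last_read.
 */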
4179 static void
mirror_read_strategy(buf_t *pb, int flag, void *private)
4181 {
4182 md_mps_t *ps;
4183 md_mcs_t *cs;
4184 size_t more;
4185 mm_unit_t *un;
4186 mdi_unit_t *ui;
4187 size_t current_count;
4188 diskaddr_t current_blkno;
4189 off_t current_offset;
4190 buf_t *cb; /* child buf pointer */
4191 set_t setno;
4192
4193 ui = MDI_UNIT(getminor(pb->b_edev));
4194
4195 md_kstat_waitq_enter(ui);
4196
4197 un = (mm_unit_t *)md_unit_readerlock(ui);
4198
4199 if (!(flag & MD_STR_NOTTOP)) {
4200 if (md_checkbuf(ui, (md_unit_t *)un, pb)) {
4201 md_kstat_waitq_exit(ui);
4202 return;
4203 }
4204 }
4205
4206 if (private == NULL) {
4207 ps = kmem_cache_alloc(mirror_parent_cache, MD_ALLOCFLAGS);
4208 mirror_parent_init(ps);
4209 } else {
4210 ps = private;
4211 private = NULL;
4212 }
4213
4214 if (flag & MD_STR_MAPPED)
4215 ps->ps_flags |= MD_MPS_MAPPED;
4216 if (flag & MD_NOBLOCK)
4217 ps->ps_flags |= MD_MPS_NOBLOCK;
4218 if (flag & MD_STR_WMUPDATE)
4219 ps->ps_flags |= MD_MPS_WMUPDATE;
4220
4221 /*
4222 * Check to see if this is a DMR driven read. If so we need to use the
4223 * specified side (in un->un_dmr_last_read) for the source of the data.
4224 */
4225 if (flag & MD_STR_DMR)
4226 ps->ps_flags |= MD_MPS_DMR;
4227
4228 /*
4229 * Save essential information from the original buffhdr
4230 * in the md_save structure.
4231 */
4232 ps->ps_un = un;
4233 ps->ps_ui = ui;
4234 ps->ps_bp = pb;
4235 ps->ps_addr = pb->b_un.b_addr;
4236 ps->ps_firstblk = pb->b_lblkno;
4237 ps->ps_lastblk = pb->b_lblkno + lbtodb(pb->b_bcount) - 1;
4238 ps->ps_changecnt = un->un_changecnt;
4239
4240 current_count = btodb(pb->b_bcount);
4241 current_blkno = pb->b_lblkno;
4242 current_offset = 0;
4243
4244 /*
4245 * If flag has MD_STR_WAR set this means that the read is issued by a
4246 * resync thread which may or may not be an optimised resync.
4247 *
4248 * If MD_UN_OPT_NOT_DONE is set this means that the optimized resync
4249 * code has not completed; either a resync has not started since snarf,
4250 * or there is an optimized resync in progress.
4251 *
4252 * We need to generate a write after this read in the following two
4253 * cases,
4254 *
4255 * 1. Any Resync-Generated read
4256 *
4257 * 2. Any read to a DIRTY REGION if there is an optimized resync
4258 * pending or in progress.
4259 *
4260 * The write after read is done in these cases to ensure that all sides
4261 * of the mirror are in sync with the read data and that it is not
4262 * possible for an application to read the same block multiple times
4263 * and get different data.
4264 *
4265 * This would be possible if the block was in a dirty region.
4266 *
4267 * If we're performing a directed read we don't write the data out as
4268 * the application is responsible for restoring the mirror to a known
4269 * state.
4270 */
4271 if (((MD_STATUS(un) & MD_UN_OPT_NOT_DONE) || (flag & MD_STR_WAR)) &&
4272 !(flag & MD_STR_DMR)) {
4273 size_t start_rr, i, end_rr;
4274 int region_dirty = 1;
4275
4276 /*
4277 * We enter here under three circumstances,
4278 *
4279 * MD_UN_OPT_NOT_DONE MD_STR_WAR
4280 * 0 1
4281 * 1 0
4282 * 1 1
4283 *
4284 * To be optimal we only care to explicitly check for dirty
4285 * regions in the second case since if MD_STR_WAR is set we
4286 * always do the write after read.
4287 */
4288 if (!(flag & MD_STR_WAR)) {
4289 BLK_TO_RR(end_rr, ps->ps_lastblk, un);
4290 BLK_TO_RR(start_rr, ps->ps_firstblk, un);
4291
4292 for (i = start_rr; i <= end_rr; i++)
4293 if ((region_dirty = IS_KEEPDIRTY(i, un)) != 0)
4294 break;
4295 }
4296
4297 if ((region_dirty) &&
4298 !(md_get_setstatus(MD_UN2SET(un)) & MD_SET_STALE)) {
4299 ps->ps_call = write_after_read;
4300 /*
4301 * Mark this as a RESYNC_READ in ps_flags.
4302 * This is used if the read fails during a
4303 * resync of a 3-way mirror to ensure that
4304 * the retried read to the remaining
4305 * good submirror has MD_STR_WAR set. This
4306 * is needed to ensure that the resync write
4307 * (write-after-read) takes place.
4308 */
4309 ps->ps_flags |= MD_MPS_RESYNC_READ;
4310
4311 /*
4312 * If MD_STR_FLAG_ERR is set in the flags we
4313 * set MD_MPS_FLAG_ERROR so that an error on the resync
4314 * write (issued by write_after_read) will be flagged
4315 * to the biowait'ing resync thread. This allows us to
4316 * avoid issuing further resync requests to a device
4317 * that has had a write failure.
4318 */
4319 if (flag & MD_STR_FLAG_ERR)
4320 ps->ps_flags |= MD_MPS_FLAG_ERROR;
4321
4322 setno = MD_UN2SET(un);
4323 /*
4324 * Drop the readerlock to avoid
4325 * deadlock
4326 */
4327 md_unit_readerexit(ui);
4328 wait_for_overlaps(ps, MD_OVERLAP_NO_REPEAT);
4329 un = md_unit_readerlock(ui);
4330 /*
4331 * Ensure that we are owner
4332 */
4333 if (MD_MNSET_SETNO(setno)) {
4334 /*
4335 * For a non-resync read that requires a
4336 * write-after-read to be done, set a flag
4337 * in the parent structure, so that the
4338 * write_strategy routine can omit the
4339 * test that the write is still within the
4340 * resync region
4341 */
4342 if (!(flag & MD_STR_WAR))
4343 ps->ps_flags |= MD_MPS_DIRTY_RD;
4344
4345 /*
4346 * Before reading the buffer, see if
4347 * there is an owner.
4348 */
4349 if (MD_MN_NO_MIRROR_OWNER(un)) {
4350 ps->ps_call = NULL;
4351 mirror_overlap_tree_remove(ps);
4352 md_kstat_waitq_exit(ui);
4353 md_unit_readerexit(ui);
4354 daemon_request(
4355 &md_mirror_daemon,
4356 become_owner,
4357 (daemon_queue_t *)ps,
4358 REQ_OLD);
4359 return;
4360 }
4361 /*
4362 * For a resync read, check to see if I/O is
4363 * outside of the current resync region, or
4364 * the resync has finished. If so
4365 * just terminate the I/O
4366 */
4367 if ((flag & MD_STR_WAR) &&
4368 (!(un->c.un_status & MD_UN_WAR) ||
4369 (!IN_RESYNC_REGION(un, ps)))) {
4370 #ifdef DEBUG
4371 if (mirror_debug_flag)
4372 printf("Abort resync read "
4373 "%x: %lld\n",
4374 MD_SID(un),
4375 ps->ps_firstblk);
4376 #endif
4377 mirror_overlap_tree_remove(ps);
4378 kmem_cache_free(mirror_parent_cache,
4379 ps);
4380 md_kstat_waitq_exit(ui);
4381 md_unit_readerexit(ui);
4382 md_biodone(pb);
4383 return;
4384 }
4385 }
4386 }
4387 }
4388
4389 if (flag & MD_STR_DMR) {
4390 ps->ps_call = directed_read_done;
4391 }
4392
4393 if (!(flag & MD_STR_NOTTOP) && panicstr)
4394 ps->ps_flags |= MD_MPS_DONTFREE;
4395
4396 md_kstat_waitq_to_runq(ui);
4397
4398 ps->ps_frags++;
4399 do {
4400 cs = kmem_cache_alloc(mirror_child_cache, MD_ALLOCFLAGS);
4401 mirror_child_init(cs);
4402 cb = &cs->cs_buf;
4403 cs->cs_ps = ps;
4404
4405 cb = md_bioclone(pb, current_offset, current_count, NODEV,
4406 current_blkno, mirror_done, cb, KM_NOSLEEP);
4407
4408 more = mirror_map_read(ps, cs, current_blkno,
4409 (u_longlong_t)current_count);
4410 if (more) {
4411 mutex_enter(&ps->ps_mx);
4412 ps->ps_frags++;
4413 mutex_exit(&ps->ps_mx);
4414 }
4415
4416 /*
		 * Do these calculations now, so that we pick up a valid
		 * b_bcount from the child buf.
4419 */
4420 current_count -= more;
4421 current_offset += cb->b_bcount;
4422 current_blkno += more;
4423 md_call_strategy(cb, flag, private);
4424 } while (more);
4425
4426 if (!(flag & MD_STR_NOTTOP) && panicstr) {
4427 while (!(ps->ps_flags & MD_MPS_DONE)) {
4428 md_daemon(1, &md_done_daemon);
4429 drv_usecwait(10);
4430 }
4431 kmem_cache_free(mirror_parent_cache, ps);
4432 }
4433 }
4434
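/*
 * md_mirror_strategy:
 * ------------------
 * Top-level strategy entry point for the mirror driver. After waiting for
 * a halted multi-node set (top-level requests only) and accounting for the
 * i/o unless MD_IO_COUNTED is already set, the request is handed to
 * mirror_read_strategy() or mirror_write_strategy() as appropriate.
 */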
4435 void
md_mirror_strategy(buf_t *bp, int flag, void *private)
4437 {
4438 set_t setno = MD_MIN2SET(getminor(bp->b_edev));
4439
4440 /*
4441 * When doing IO to a multi owner meta device, check if set is halted.
4442 * We do this check without the needed lock held, for performance
4443 * reasons.
4444 * If an IO just slips through while the set is locked via an
4445 * MD_MN_SUSPEND_SET, we don't care about it.
4446 * Only check for suspension if we are a top-level i/o request
4447 * (MD_STR_NOTTOP is cleared in 'flag').
4448 */
4449 if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
4450 (MD_SET_HALTED | MD_SET_MNSET)) {
4451 if ((flag & MD_STR_NOTTOP) == 0) {
4452 mutex_enter(&md_mx);
4453 /* Here we loop until the set is no longer halted */
4454 while (md_set[setno].s_status & MD_SET_HALTED) {
4455 cv_wait(&md_cv, &md_mx);
4456 }
4457 mutex_exit(&md_mx);
4458 }
4459 }
4460
4461 if ((flag & MD_IO_COUNTED) == 0) {
4462 if ((flag & MD_NOBLOCK) == 0) {
4463 if (md_inc_iocount(setno) != 0) {
4464 bp->b_flags |= B_ERROR;
4465 bp->b_error = ENXIO;
4466 bp->b_resid = bp->b_bcount;
4467 biodone(bp);
4468 return;
4469 }
4470 } else {
4471 md_inc_iocount_noblock(setno);
4472 }
4473 }
4474
4475 if (bp->b_flags & B_READ)
4476 mirror_read_strategy(bp, flag, private);
4477 else
4478 mirror_write_strategy(bp, flag, private);
4479 }
4480
4481 /*
4482 * mirror_directed_read:
4483 * --------------------
4484 * Entry-point for the DKIOCDMR ioctl. We issue a read to a specified sub-mirror
4485 * so that the application can determine what (if any) resync needs to be
4486 * performed. The data is copied out to the user-supplied buffer.
4487 *
4488 * Parameters:
4489 * mdev - dev_t for the mirror device
4490 * vdr - directed read parameters specifying location and submirror
4491 * to perform the read from
4492 * mode - used to ddi_copyout() any resulting data from the read
4493 *
4494 * Returns:
4495 * 0 success
4496 * !0 error code
4497 * EINVAL - invalid request format
4498 */
4499 int
mirror_directed_read(dev_t mdev, vol_directed_rd_t *vdr, int mode)
4501 {
4502 buf_t *bp;
4503 minor_t mnum = getminor(mdev);
4504 mdi_unit_t *ui = MDI_UNIT(mnum);
4505 mm_unit_t *un;
4506 mm_submirror_t *sm;
4507 char *sm_nm;
4508 uint_t next_side;
4509 void *kbuffer;
4510
4511 if (ui == NULL)
4512 return (ENXIO);
4513
4514 if (!(vdr->vdr_flags & DKV_DMR_NEXT_SIDE)) {
4515 return (EINVAL);
4516 }
4517
4518 /* Check for aligned block access. We disallow non-aligned requests. */
4519 if (vdr->vdr_offset % DEV_BSIZE) {
4520 return (EINVAL);
4521 }
4522
4523 /*
4524 * Allocate kernel buffer for target of read(). If we had a reliable
4525 * (sorry functional) DDI this wouldn't be needed.
4526 */
4527 kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
4528 if (kbuffer == NULL) {
4529 cmn_err(CE_WARN, "mirror_directed_read: couldn't allocate %lx"
4530 " bytes\n", vdr->vdr_nbytes);
4531 return (ENOMEM);
4532 }
4533
4534 bp = getrbuf(KM_SLEEP);
4535
4536 bp->b_un.b_addr = kbuffer;
4537 bp->b_flags = B_READ;
4538 bp->b_bcount = vdr->vdr_nbytes;
4539 bp->b_lblkno = lbtodb(vdr->vdr_offset);
4540 bp->b_edev = mdev;
4541
4542 un = md_unit_readerlock(ui);
4543
4544 /*
4545 * If DKV_SIDE_INIT is set we need to determine the first available
4546 * side to start reading from. If it isn't set we increment to the
4547 * next readable submirror.
4548 * If there are no readable submirrors we error out with DKV_DMR_ERROR.
4549 * Note: we check for a readable submirror on completion of the i/o so
4550 * we should _always_ have one available. If this becomes unavailable
4551 * we have missed the 'DKV_DMR_DONE' opportunity. This could happen if
4552 * a metadetach is made between the completion of one DKIOCDMR ioctl
4553 * and the start of the next (i.e. a sys-admin 'accident' occurred).
4554 * The chance of this is small, but not non-existent.
4555 */
4556 if (vdr->vdr_side == DKV_SIDE_INIT) {
4557 next_side = 0;
4558 } else {
4559 next_side = vdr->vdr_side + 1;
4560 }
4561 while ((next_side < NMIRROR) &&
4562 !SUBMIRROR_IS_READABLE(un, next_side))
4563 next_side++;
4564 if (next_side >= NMIRROR) {
4565 vdr->vdr_flags |= DKV_DMR_ERROR;
4566 freerbuf(bp);
4567 vdr->vdr_bytesread = 0;
4568 md_unit_readerexit(ui);
4569 return (0);
4570 }
4571
4572 /* Set the side to read from */
4573 un->un_dmr_last_read = next_side;
4574
4575 md_unit_readerexit(ui);
4576
4577 /*
4578 * Save timestamp for verification purposes. Can be read by debugger
4579 * to verify that this ioctl has been executed and to find the number
4580 * of DMR reads and the time of the last DMR read.
4581 */
4582 uniqtime(&mirror_dmr_stats.dmr_timestamp);
4583 mirror_dmr_stats.dmr_count++;
4584
4585 /* Issue READ request and wait for completion */
4586 mirror_read_strategy(bp, MD_STR_DMR|MD_NOBLOCK|MD_STR_NOTTOP, NULL);
4587
4588 mutex_enter(&un->un_dmr_mx);
4589 cv_wait(&un->un_dmr_cv, &un->un_dmr_mx);
4590 mutex_exit(&un->un_dmr_mx);
4591
4592 /*
4593 * Check to see if we encountered an error during the read. If so we
4594 * can make no guarantee about any possibly returned data.
4595 */
4596 if ((bp->b_flags & B_ERROR) == 0) {
4597 vdr->vdr_flags &= ~DKV_DMR_ERROR;
4598 if (bp->b_resid) {
4599 vdr->vdr_flags |= DKV_DMR_SHORT;
4600 vdr->vdr_bytesread = vdr->vdr_nbytes - bp->b_resid;
4601 } else {
4602 vdr->vdr_flags |= DKV_DMR_SUCCESS;
4603 vdr->vdr_bytesread = vdr->vdr_nbytes;
4604 }
4605 /* Copy the data read back out to the user supplied buffer */
4606 if (ddi_copyout(kbuffer, vdr->vdr_data, vdr->vdr_bytesread,
4607 mode)) {
4608 kmem_free(kbuffer, vdr->vdr_nbytes);
4609 return (EFAULT);
4610 }
4611
4612 } else {
4613 /* Error out with DKV_DMR_ERROR */
4614 vdr->vdr_flags |= DKV_DMR_ERROR;
4615 vdr->vdr_flags &= ~(DKV_DMR_SUCCESS|DKV_DMR_SHORT|DKV_DMR_DONE);
4616 }
4617 /*
4618 * Update the DMR parameters with the side and name of submirror that
4619 * we have just read from (un->un_dmr_last_read)
4620 */
4621 un = md_unit_readerlock(ui);
4622
4623 vdr->vdr_side = un->un_dmr_last_read;
4624 sm = &un->un_sm[un->un_dmr_last_read];
4625 sm_nm = md_shortname(md_getminor(sm->sm_dev));
4626
4627 (void) strncpy(vdr->vdr_side_name, sm_nm, sizeof (vdr->vdr_side_name));
4628
4629 /*
4630 * Determine if we've completed the read cycle. This is true iff the
4631 * next computed submirror (side) equals or exceeds NMIRROR. We cannot
4632 * use un_nsm as we need to handle a sparse array of submirrors (which
4633 * can occur if a submirror is metadetached).
4634 */
4635 next_side = un->un_dmr_last_read + 1;
4636 while ((next_side < NMIRROR) &&
4637 !SUBMIRROR_IS_READABLE(un, next_side))
4638 next_side++;
4639 if (next_side >= NMIRROR) {
4640 /* We've finished */
4641 vdr->vdr_flags |= DKV_DMR_DONE;
4642 }
4643
4644 md_unit_readerexit(ui);
4645 freerbuf(bp);
4646 kmem_free(kbuffer, vdr->vdr_nbytes);
4647
4648 return (0);
4649 }
4650
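/*
 * Illustrative only: a minimal sketch, not part of this driver, of how a
 * userland application might drive the DKIOCDMR ioctl serviced by
 * mirror_directed_read() above. The application starts from DKV_SIDE_INIT
 * and loops until the driver reports DKV_DMR_DONE; fd, offset, nbytes and
 * buf are supplied by the application, and error handling plus the
 * comparison of the data returned for each side are omitted.
 *
 *	vol_directed_rd_t vdr;
 *
 *	bzero(&vdr, sizeof (vdr));
 *	vdr.vdr_flags = DKV_DMR_NEXT_SIDE;
 *	vdr.vdr_side = DKV_SIDE_INIT;
 *	vdr.vdr_offset = offset;	(must be DEV_BSIZE aligned)
 *	vdr.vdr_nbytes = nbytes;
 *	vdr.vdr_data = buf;
 *	do {
 *		if (ioctl(fd, DKIOCDMR, &vdr) != 0 ||
 *		    (vdr.vdr_flags & DKV_DMR_ERROR))
 *			break;
 *		(examine vdr.vdr_bytesread bytes of buf read from side
 *		vdr.vdr_side, named vdr.vdr_side_name)
 *	} while (!(vdr.vdr_flags & DKV_DMR_DONE));
 */
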
4651 /*
4652 * mirror_resync_message:
4653 * ---------------------
4654 * Handle the multi-node resync messages that keep all nodes within a given
4655 * disk-set in sync with their view of a mirror's resync status.
4656 *
4657 * The message types dealt with are:
4658 * MD_MN_MSG_RESYNC_STARTING - start a resync thread for a unit
4659 * MD_MN_MSG_RESYNC_NEXT - specified next region to be resynced
4660 * MD_MN_MSG_RESYNC_FINISH - stop the resync thread for a unit
4661 * MD_MN_MSG_RESYNC_PHASE_DONE - end of a resync phase, opt, submirror or comp
4662 *
4663 * Returns:
4664 * 0 Success
4665 * >0 Failure error number
4666 */
4667 int
mirror_resync_message(md_mn_rs_params_t *p, IOLOCK *lockp)
4669 {
4670 mdi_unit_t *ui;
4671 mm_unit_t *un;
4672 set_t setno;
4673 int is_ABR;
4674 int smi;
4675 int ci;
4676 sm_state_t state;
4677 int broke_out;
4678 mm_submirror_t *sm;
4679 mm_submirror_ic_t *smic;
4680 md_m_shared_t *shared;
4681 md_error_t mde = mdnullerror;
4682 md_mps_t *ps;
4683 int rs_active;
4684 int rr, rr_start, rr_end;
4685
4686 /* Check that the given device is part of a multi-node set */
4687 setno = MD_MIN2SET(p->mnum);
4688 if (setno >= md_nsets) {
4689 return (ENXIO);
4690 }
4691 if (!MD_MNSET_SETNO(setno)) {
4692 return (EINVAL);
4693 }
4694
4695 if ((un = mirror_getun(p->mnum, &p->mde, NO_LOCK, NULL)) == NULL)
4696 return (EINVAL);
4697 if ((ui = MDI_UNIT(p->mnum)) == NULL)
4698 return (EINVAL);
4699 is_ABR = (ui->ui_tstate & MD_ABR_CAP);
4700
4701 /* Obtain the current resync status */
4702 (void) md_ioctl_readerlock(lockp, ui);
4703 rs_active = (MD_STATUS(un) & MD_UN_RESYNC_ACTIVE) ? 1 : 0;
4704 md_ioctl_readerexit(lockp);
4705
4706 switch ((md_mn_msgtype_t)p->msg_type) {
4707 case MD_MN_MSG_RESYNC_STARTING:
4708 /* Start the resync thread for the mirror */
4709 (void) mirror_resync_unit(p->mnum, NULL, &p->mde, lockp);
4710 break;
4711
4712 case MD_MN_MSG_RESYNC_NEXT:
4713 /*
4714 * We have to release any previously marked overlap regions
4715 * so that i/o can resume. Then we need to block the region
		 * from [rs_start..rs_start+rs_size) so that no i/o is issued.
4717 * Update un_rs_resync_done and un_rs_resync_2_do.
4718 */
4719 (void) md_ioctl_readerlock(lockp, ui);
4720 /*
4721 * Ignore the message if there is no active resync thread or
4722 * if it is for a resync type that we have already completed.
4723 * un_resync_completed is set to the last resync completed
4724 * when processing a PHASE_DONE message.
4725 */
4726 if (!rs_active || (p->rs_type == un->un_resync_completed))
4727 break;
4728 /*
4729 * If this message is for the same resync and is for an earlier
4730 * resync region, just ignore it. This can only occur if this
4731 * node has progressed on to the next resync region before
4732 * we receive this message. This can occur if the class for
4733 * this message is busy and the originator has to retry thus
4734 * allowing this node to move onto the next resync_region.
4735 */
4736 if ((p->rs_type == un->un_rs_type) &&
4737 (p->rs_start < un->un_resync_startbl))
4738 break;
4739 ps = un->un_rs_prev_overlap;
4740
4741 /* Allocate previous overlap reference if needed */
4742 if (ps == NULL) {
4743 ps = kmem_cache_alloc(mirror_parent_cache,
4744 MD_ALLOCFLAGS);
4745 ps->ps_un = un;
4746 ps->ps_ui = ui;
4747 ps->ps_firstblk = 0;
4748 ps->ps_lastblk = 0;
4749 ps->ps_flags = 0;
4750 md_ioctl_readerexit(lockp);
4751 (void) md_ioctl_writerlock(lockp, ui);
4752 un->un_rs_prev_overlap = ps;
4753 md_ioctl_writerexit(lockp);
4754 } else
4755 md_ioctl_readerexit(lockp);
4756
4757 if (p->rs_originator != md_mn_mynode_id) {
4758 /*
4759 * Clear our un_resync_bm for the regions completed.
4760 * The owner (originator) will take care of itself.
4761 */
4762 BLK_TO_RR(rr_end, ps->ps_lastblk, un);
4763 BLK_TO_RR(rr_start, p->rs_start, un);
4764 if (ps->ps_lastblk && rr_end < rr_start) {
4765 BLK_TO_RR(rr_start, ps->ps_firstblk, un);
4766 mutex_enter(&un->un_resync_mx);
4767 /*
4768 * Update our resync bitmap to reflect that
4769 * another node has synchronized this range.
4770 */
4771 for (rr = rr_start; rr <= rr_end; rr++) {
4772 CLR_KEEPDIRTY(rr, un);
4773 }
4774 mutex_exit(&un->un_resync_mx);
4775 }
4776
4777 /*
4778 * On all but the originating node, first update
4779 * the resync state, then unblock the previous
4780 * region and block the next one. No need
4781 * to do this if the region is already blocked.
4782 * Update the submirror state and flags from the
4783 * originator. This keeps the cluster in sync with
4784 * regards to the resync status.
4785 */
4786
4787 (void) md_ioctl_writerlock(lockp, ui);
4788 un->un_rs_resync_done = p->rs_done;
4789 un->un_rs_resync_2_do = p->rs_2_do;
4790 un->un_rs_type = p->rs_type;
4791 un->un_resync_startbl = p->rs_start;
4792 md_ioctl_writerexit(lockp);
4793 /*
4794 * Use un_owner_mx to ensure that an ownership change
4795 * cannot happen at the same time as this message
4796 */
4797 mutex_enter(&un->un_owner_mx);
4798 if (MD_MN_MIRROR_OWNER(un)) {
4799 ps->ps_firstblk = p->rs_start;
4800 ps->ps_lastblk = ps->ps_firstblk +
4801 p->rs_size - 1;
4802 } else {
4803 if ((ps->ps_firstblk != p->rs_start) ||
4804 (ps->ps_lastblk != p->rs_start +
4805 p->rs_size - 1)) {
4806 /* Remove previous overlap range */
4807 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4808 mirror_overlap_tree_remove(ps);
4809
4810 ps->ps_firstblk = p->rs_start;
4811 ps->ps_lastblk = ps->ps_firstblk +
4812 p->rs_size - 1;
4813
4814 mutex_exit(&un->un_owner_mx);
4815 /* Block this range from all i/o. */
4816 if (ps->ps_firstblk != 0 ||
4817 ps->ps_lastblk != 0)
4818 wait_for_overlaps(ps,
4819 MD_OVERLAP_ALLOW_REPEAT);
4820 mutex_enter(&un->un_owner_mx);
4821 /*
4822 * Check to see if we have obtained
4823 * ownership while waiting for
4824 * overlaps. If we have, remove
4825 * the resync_region entry from the
4826 * overlap tree
4827 */
4828 if (MD_MN_MIRROR_OWNER(un) &&
4829 (ps->ps_flags & MD_MPS_ON_OVERLAP))
4830 mirror_overlap_tree_remove(ps);
4831 }
4832 }
4833 mutex_exit(&un->un_owner_mx);
4834
4835 /*
4836 * If this is the first RESYNC_NEXT message (i.e.
4837 * MD_MN_RS_FIRST_RESYNC_NEXT set in p->rs_flags),
4838 * issue RESYNC_START NOTIFY event
4839 */
4840 if (p->rs_flags & MD_MN_RS_FIRST_RESYNC_NEXT) {
4841 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_START,
4842 SVM_TAG_METADEVICE, MD_UN2SET(un),
4843 MD_SID(un));
4844 }
4845
4846 /* Ensure that our local resync thread is running */
4847 if (un->un_rs_thread == NULL) {
4848 (void) mirror_resync_unit(p->mnum, NULL,
4849 &p->mde, lockp);
4850 }
4851 }
4852
4853 break;
4854 case MD_MN_MSG_RESYNC_FINISH:
4855 /*
4856 * Complete the resync by stopping the resync thread.
4857 * Also release the previous overlap region field.
4858 * Update the resync_progress_thread by cv_signal'ing it so
4859 * that we mark the end of the resync as soon as possible. This
		 * stops an unnecessary delay should the node panic after resync
4861 * completion.
4862 */
4863 #ifdef DEBUG
4864 if (!rs_active) {
4865 if (mirror_debug_flag)
4866 printf("RESYNC_FINISH (mnum = %x), "
4867 "Resync *NOT* active",
4868 p->mnum);
4869 }
4870 #endif
4871
4872 if ((un->c.un_status & MD_UN_RESYNC_ACTIVE) &&
4873 (p->rs_originator != md_mn_mynode_id)) {
4874 mutex_enter(&un->un_rs_thread_mx);
4875 un->c.un_status &= ~MD_UN_RESYNC_CANCEL;
4876 un->un_rs_thread_flags |= MD_RI_SHUTDOWN;
4877 un->un_rs_thread_flags &=
4878 ~(MD_RI_BLOCK|MD_RI_BLOCK_OWNER);
4879 cv_signal(&un->un_rs_thread_cv);
4880 mutex_exit(&un->un_rs_thread_mx);
4881 }
4882 if (is_ABR) {
4883 /* Resync finished, if ABR set owner to NULL */
4884 mutex_enter(&un->un_owner_mx);
4885 un->un_mirror_owner = 0;
4886 mutex_exit(&un->un_owner_mx);
4887 }
4888 (void) md_ioctl_writerlock(lockp, ui);
4889 ps = un->un_rs_prev_overlap;
4890 if (ps != NULL) {
4891 /* Remove previous overlap range */
4892 if (ps->ps_flags & MD_MPS_ON_OVERLAP)
4893 mirror_overlap_tree_remove(ps);
4894 /*
4895 * Release the overlap range reference
4896 */
4897 un->un_rs_prev_overlap = NULL;
4898 kmem_cache_free(mirror_parent_cache,
4899 ps);
4900 }
4901 md_ioctl_writerexit(lockp);
4902
4903 /* Mark the resync as complete in the metadb */
4904 un->un_rs_resync_done = p->rs_done;
4905 un->un_rs_resync_2_do = p->rs_2_do;
4906 un->un_rs_type = p->rs_type;
4907 mutex_enter(&un->un_rs_progress_mx);
4908 cv_signal(&un->un_rs_progress_cv);
4909 mutex_exit(&un->un_rs_progress_mx);
4910
4911 un = md_ioctl_writerlock(lockp, ui);
4912 un->c.un_status &= ~MD_UN_RESYNC_ACTIVE;
4913 /* Deal with any pending grow_unit */
4914 if (un->c.un_status & MD_UN_GROW_PENDING) {
4915 if ((mirror_grow_unit(un, &mde) != 0) ||
4916 (! mdismderror(&mde, MDE_GROW_DELAYED))) {
4917 un->c.un_status &= ~MD_UN_GROW_PENDING;
4918 }
4919 }
4920 md_ioctl_writerexit(lockp);
4921 break;
4922
4923 case MD_MN_MSG_RESYNC_PHASE_DONE:
4924 /*
		 * A phase of the resync (optimized, component or
		 * submirror) is complete. Update mirror status.
		 * If the flag CLEAR_OPT_NOT_DONE is set, it means that the
		 * mirror owner is performing a resync. If we have just snarfed
4929 * this set, then we must clear any of the flags set at snarf
4930 * time by unit_setup_resync().
4931 * Note that unit_setup_resync() sets up these flags to
4932 * indicate that an optimized resync is required. These flags
4933 * need to be reset because if we get here, the mirror owner
4934 * will have handled the optimized resync.
4935 * The flags that must be cleared are MD_UN_OPT_NOT_DONE and
4936 * MD_UN_WAR. In addition, for each submirror,
4937 * MD_SM_RESYNC_TARGET must be cleared and SMS_OFFLINE_RESYNC
4938 * set to SMS_OFFLINE.
4939 */
4940 #ifdef DEBUG
4941 if (mirror_debug_flag)
4942 printf("phase done mess received from %d, mnum=%x,"
4943 "type=%x, flags=%x\n", p->rs_originator, p->mnum,
4944 p->rs_type, p->rs_flags);
4945 #endif
4946 /*
4947 * Ignore the message if there is no active resync thread.
4948 */
4949 if (!rs_active)
4950 break;
4951
4952 broke_out = p->rs_flags & MD_MN_RS_ERR;
4953 switch (RS_TYPE(p->rs_type)) {
4954 case MD_RS_OPTIMIZED:
4955 un = md_ioctl_writerlock(lockp, ui);
4956 if (p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE) {
4957 /* If we are originator, just clear rs_type */
4958 if (p->rs_originator == md_mn_mynode_id) {
4959 SET_RS_TYPE_NONE(un->un_rs_type);
4960 md_ioctl_writerexit(lockp);
4961 break;
4962 }
4963 /*
4964 * If CLEAR_OPT_NOT_DONE is set, only clear the
4965 * flags if OPT_NOT_DONE is set *and* rs_type
4966 * is MD_RS_NONE.
4967 */
4968 if ((un->c.un_status & MD_UN_OPT_NOT_DONE) &&
4969 (RS_TYPE(un->un_rs_type) == MD_RS_NONE)) {
4970 /* No resync in progress */
4971 un->c.un_status &= ~MD_UN_OPT_NOT_DONE;
4972 un->c.un_status &= ~MD_UN_WAR;
4973 } else {
4974 /*
4975 * We are in the middle of an
4976 * optimized resync and this message
4977 * should be ignored.
4978 */
4979 md_ioctl_writerexit(lockp);
4980 break;
4981 }
4982 } else {
4983 /*
4984 * This is the end of an optimized resync,
4985 * clear the OPT_NOT_DONE and OFFLINE_SM flags
4986 */
4987
4988 un->c.un_status &= ~MD_UN_KEEP_DIRTY;
4989 if (!broke_out)
4990 un->c.un_status &= ~MD_UN_WAR;
4991
4992 /*
4993 * Clear our un_resync_bm for the regions
4994 * completed. The owner (originator) will
4995 * take care of itself.
4996 */
4997 if (p->rs_originator != md_mn_mynode_id &&
4998 (ps = un->un_rs_prev_overlap) != NULL) {
4999 BLK_TO_RR(rr_start, ps->ps_firstblk,
5000 un);
5001 BLK_TO_RR(rr_end, ps->ps_lastblk, un);
5002 mutex_enter(&un->un_resync_mx);
5003 for (rr = rr_start; rr <= rr_end;
5004 rr++) {
5005 CLR_KEEPDIRTY(rr, un);
5006 }
5007 mutex_exit(&un->un_resync_mx);
5008 }
5009 }
5010
5011 /*
5012 * Set resync_completed to last resync type and then
5013 * clear resync_type to indicate no resync in progress
5014 */
5015 un->un_resync_completed = un->un_rs_type;
5016 SET_RS_TYPE_NONE(un->un_rs_type);
5017
5018 /*
5019 * If resync is as a result of a submirror ONLINE,
5020 * reset the submirror state to SMS_RUNNING if the
5021 * resync was ok else set back to SMS_OFFLINE.
5022 */
5023 for (smi = 0; smi < NMIRROR; smi++) {
5024 un->un_sm[smi].sm_flags &=
5025 ~MD_SM_RESYNC_TARGET;
5026 if (SMS_BY_INDEX_IS(un, smi,
5027 SMS_OFFLINE_RESYNC)) {
5028 if (p->rs_flags &
5029 MD_MN_RS_CLEAR_OPT_NOT_DONE) {
5030 state = SMS_OFFLINE;
5031 } else {
5032 state = (broke_out ?
5033 SMS_OFFLINE : SMS_RUNNING);
5034 }
5035 mirror_set_sm_state(
5036 &un->un_sm[smi],
5037 &un->un_smic[smi], state,
5038 broke_out);
5039 mirror_commit(un, NO_SUBMIRRORS,
5040 0);
5041 }
5042 /*
5043 * If we still have an offline submirror, reset
5044 * the OFFLINE_SM flag in the mirror status
5045 */
5046 if (SMS_BY_INDEX_IS(un, smi,
5047 SMS_OFFLINE))
5048 un->c.un_status |=
5049 MD_UN_OFFLINE_SM;
5050 }
5051 md_ioctl_writerexit(lockp);
5052 break;
5053 case MD_RS_SUBMIRROR:
5054 un = md_ioctl_writerlock(lockp, ui);
5055 smi = RS_SMI(p->rs_type);
5056 sm = &un->un_sm[smi];
5057 smic = &un->un_smic[smi];
5058 /* Clear RESYNC target */
5059 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5060 /*
5061 * Set resync_completed to last resync type and then
5062 * clear resync_type to indicate no resync in progress
5063 */
5064 un->un_resync_completed = un->un_rs_type;
5065 SET_RS_TYPE_NONE(un->un_rs_type);
5066 /*
5067 * If the resync completed ok reset the submirror
5068 * state to SMS_RUNNING else reset it to SMS_ATTACHED
5069 */
5070 state = (broke_out ?
5071 SMS_ATTACHED : SMS_RUNNING);
5072 mirror_set_sm_state(sm, smic, state, broke_out);
5073 un->c.un_status &= ~MD_UN_WAR;
5074 mirror_commit(un, SMI2BIT(smi), 0);
5075 md_ioctl_writerexit(lockp);
5076 break;
5077 case MD_RS_COMPONENT:
5078 un = md_ioctl_writerlock(lockp, ui);
5079 smi = RS_SMI(p->rs_type);
5080 ci = RS_CI(p->rs_type);
5081 sm = &un->un_sm[smi];
5082 smic = &un->un_smic[smi];
5083 shared = (md_m_shared_t *)
5084 (*(smic->sm_shared_by_indx))
5085 (sm->sm_dev, sm, ci);
5086 un->c.un_status &= ~MD_UN_WAR;
5087 /* Clear RESYNC target */
5088 un->un_sm[smi].sm_flags &= ~MD_SM_RESYNC_TARGET;
5089 /*
5090 * Set resync_completed to last resync type and then
5091 * clear resync_type to indicate no resync in progress
5092 */
5093 un->un_resync_completed = un->un_rs_type;
5094 SET_RS_TYPE_NONE(un->un_rs_type);
5095
5096 /*
5097 * If the resync completed ok, set the component state
5098 * to CS_OKAY.
5099 */
5100 if (broke_out)
5101 shared->ms_flags |= MDM_S_RS_TRIED;
5102 else {
5103 /*
5104 * As we don't transmit the changes,
5105 * no need to drop the lock.
5106 */
5107 set_sm_comp_state(un, smi, ci, CS_OKAY, 0,
5108 MD_STATE_NO_XMIT, (IOLOCK *)NULL);
5109 }
5110 md_ioctl_writerexit(lockp);
5111 default:
5112 break;
5113 }
5114 /*
5115 * If the purpose of this PHASE_DONE message is just to
5116 * indicate to all other nodes that the optimized resync
5117 * required (OPT_NOT_DONE) flag is to be cleared, there is
5118 * no need to generate a notify event as there has not
5119 * actually been a resync.
5120 */
5121 if (!(p->rs_flags & MD_MN_RS_CLEAR_OPT_NOT_DONE)) {
5122 if (broke_out) {
5123 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_FAILED,
5124 SVM_TAG_METADEVICE, MD_UN2SET(un),
5125 MD_SID(un));
5126 } else {
5127 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_RESYNC_DONE,
5128 SVM_TAG_METADEVICE, MD_UN2SET(un),
5129 MD_SID(un));
5130 }
5131 }
5132 break;
5133
5134 default:
5135 #ifdef DEBUG
5136 cmn_err(CE_PANIC, "mirror_resync_message: Unknown message type"
5137 " %x\n", p->msg_type);
5138 #endif
5139 return (EINVAL);
5140 }
5141 return (0);
5142 }
5143
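/*
 * NAME: mirror_snarf
 *
 * DESCRIPTION: snarf the mirror records for a set from the metadb and
 *		build the incore units for them. Old small (32-bit) records
 *		are converted to the current format before use. For
 *		MD_SNARF_CLEANUP, records marked MD_PRV_CLEANUP are cleaned
 *		up instead.
 */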
5144 /* Return a -1 if snarf of optimized record failed and set should be released */
5145 static int
mirror_snarf(md_snarfcmd_t cmd, set_t setno)
5147 {
5148 mddb_recid_t recid;
5149 int gotsomething;
5150 int all_mirrors_gotten;
5151 mm_unit_t *un;
5152 mddb_type_t typ1;
5153 mddb_de_ic_t *dep;
5154 mddb_rb32_t *rbp;
5155 size_t newreqsize;
5156 mm_unit_t *big_un;
5157 mm_unit32_od_t *small_un;
5158 int retval;
5159 mdi_unit_t *ui;
5160
5161 if (cmd == MD_SNARF_CLEANUP) {
5162 if (md_get_setstatus(setno) & MD_SET_STALE)
5163 return (0);
5164
5165 recid = mddb_makerecid(setno, 0);
5166 typ1 = (mddb_type_t)md_getshared_key(setno,
5167 mirror_md_ops.md_driver.md_drivername);
5168 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5169 if (mddb_getrecprivate(recid) & MD_PRV_CLEANUP) {
5170 un = (mm_unit_t *)mddb_getrecaddr(recid);
5171 mirror_cleanup(un);
5172 recid = mddb_makerecid(setno, 0);
5173 }
5174 }
5175 return (0);
5176 }
5177
5178 all_mirrors_gotten = 1;
5179 gotsomething = 0;
5180
5181 recid = mddb_makerecid(setno, 0);
5182 typ1 = (mddb_type_t)md_getshared_key(setno,
5183 mirror_md_ops.md_driver.md_drivername);
5184
5185 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5186 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5187 continue;
5188
5189 dep = mddb_getrecdep(recid);
5190 dep->de_flags = MDDB_F_MIRROR;
5191 rbp = dep->de_rb;
5192
5193 switch (rbp->rb_revision) {
5194 case MDDB_REV_RB:
5195 case MDDB_REV_RBFN:
5196 if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
5197 /*
				 * This means we have an old and small
5199 * record and this record hasn't already
5200 * been converted. Before we create an
5201 * incore metadevice from this we have to
5202 * convert it to a big record.
5203 */
5204 small_un =
5205 (mm_unit32_od_t *)mddb_getrecaddr(recid);
5206 newreqsize = sizeof (mm_unit_t);
5207 big_un = (mm_unit_t *)kmem_zalloc(newreqsize,
5208 KM_SLEEP);
5209 mirror_convert((caddr_t)small_un,
5210 (caddr_t)big_un, SMALL_2_BIG);
5211 kmem_free(small_un, dep->de_reqsize);
5212
5213 /*
				 * Update userdata and incore userdata;
				 * incores are at the end of un.
5216 */
5217 dep->de_rb_userdata_ic = big_un;
5218 dep->de_rb_userdata = big_un;
5219 dep->de_icreqsize = newreqsize;
5220 un = big_un;
5221 rbp->rb_private |= MD_PRV_CONVD;
5222 } else {
5223 /*
5224 * Unit already converted, just get the
5225 * record address.
5226 */
5227 un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5228 sizeof (*un), 0);
5229 }
5230 un->c.un_revision &= ~MD_64BIT_META_DEV;
5231 break;
5232 case MDDB_REV_RB64:
5233 case MDDB_REV_RB64FN:
5234 /* Big device */
5235 un = (mm_unit_t *)mddb_getrecaddr_resize(recid,
5236 sizeof (*un), 0);
5237 un->c.un_revision |= MD_64BIT_META_DEV;
5238 un->c.un_flag |= MD_EFILABEL;
5239 break;
5240 }
5241 MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);
5242
5243 /*
5244 * Create minor device node for snarfed entry.
5245 */
5246 (void) md_create_minor_node(setno, MD_SID(un));
5247
5248 if (MD_UNIT(MD_SID(un)) != NULL) {
5249 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5250 continue;
5251 }
5252 all_mirrors_gotten = 0;
5253 retval = mirror_build_incore(un, 1);
5254 if (retval == 0) {
5255 mddb_setrecprivate(recid, MD_PRV_GOTIT);
5256 md_create_unit_incore(MD_SID(un), &mirror_md_ops, 0);
5257 resync_start_timeout(setno);
5258 gotsomething = 1;
5259 } else {
5260 return (retval);
5261 }
5262 /*
5263 * Set flag to indicate that the mirror has not yet
5264 * been through a reconfig. This flag is used for MN sets
5265 * when determining whether to update the mirror state from
5266 * the Master node.
5267 */
5268 if (MD_MNSET_SETNO(setno)) {
5269 ui = MDI_UNIT(MD_SID(un));
5270 ui->ui_tstate |= MD_RESYNC_NOT_DONE;
5271 }
5272 }
5273
5274 if (!all_mirrors_gotten)
5275 return (gotsomething);
5276
5277 recid = mddb_makerecid(setno, 0);
5278 while ((recid = mddb_getnextrec(recid, typ1, RESYNC_REC)) > 0)
5279 if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
5280 mddb_setrecprivate(recid, MD_PRV_PENDDEL);
5281
5282 return (0);
5283 }
5284
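/*
 * NAME: mirror_halt
 *
 * DESCRIPTION: handle the md halt commands for this driver. MD_HALT_CHECK
 *		reports whether any mirror unit in the set is still open;
 *		MD_HALT_DOIT resets every mirror unit in the set and, if
 *		necessary, waits for the global dr_timeout to complete.
 *		The open, close and unload commands are no-ops.
 */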
5285 static int
mirror_halt(md_haltcmd_t cmd, set_t setno)
5287 {
5288 unit_t i;
5289 mdi_unit_t *ui;
5290 minor_t mnum;
5291 int reset_mirror_flag = 0;
5292
5293 if (cmd == MD_HALT_CLOSE)
5294 return (0);
5295
5296 if (cmd == MD_HALT_OPEN)
5297 return (0);
5298
5299 if (cmd == MD_HALT_UNLOAD)
5300 return (0);
5301
5302 if (cmd == MD_HALT_CHECK) {
5303 for (i = 0; i < md_nunits; i++) {
5304 mnum = MD_MKMIN(setno, i);
5305 if ((ui = MDI_UNIT(mnum)) == NULL)
5306 continue;
5307 if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5308 continue;
5309 if (md_unit_isopen(ui))
5310 return (1);
5311 }
5312 return (0);
5313 }
5314
5315 if (cmd != MD_HALT_DOIT)
5316 return (1);
5317
5318 for (i = 0; i < md_nunits; i++) {
5319 mnum = MD_MKMIN(setno, i);
5320 if ((ui = MDI_UNIT(mnum)) == NULL)
5321 continue;
5322 if (ui->ui_opsindex != mirror_md_ops.md_selfindex)
5323 continue;
5324 reset_mirror((mm_unit_t *)MD_UNIT(mnum), mnum, 0);
5325
5326 /* Set a flag if there is at least one mirror metadevice. */
5327 reset_mirror_flag = 1;
5328 }
5329
5330 /*
5331 * Only wait for the global dr_timeout to finish
5332 * - if there are mirror metadevices in this diskset or
5333 * - if this is the local set since an unload of the md_mirror
5334 * driver could follow a successful mirror halt in the local set.
5335 */
5336 if ((reset_mirror_flag != 0) || (setno == MD_LOCAL_SET)) {
5337 while ((mirror_md_ops.md_head == NULL) &&
5338 (mirror_timeout.dr_timeout_id != 0))
5339 delay(md_hz);
5340 }
5341
5342 return (0);
5343 }
5344
5345 /*ARGSUSED3*/
5346 static int
mirror_open(dev_t *dev, int flag, int otyp, cred_t *cred_p, int md_oflags)
5348 {
5349 IOLOCK lock;
5350 minor_t mnum = getminor(*dev);
5351 set_t setno;
5352
5353 /*
5354 * When doing an open of a multi owner metadevice, check to see if this
5355 * node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the
5357 * open (which involves I/O during sp_validate), so fail with ENXIO.
5358 */
5359 setno = MD_MIN2SET(mnum);
5360 if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
5361 (MD_SET_MNSET | MD_SET_MN_START_RC)) {
5362 return (ENXIO);
5363 }
5364
5365 if (md_oflags & MD_OFLG_FROMIOCTL) {
5366 /*
5367 * This indicates that the caller is an ioctl service routine.
5368 * In this case we initialise our stack-based IOLOCK and pass
5369 * this into the internal open routine. This allows multi-owner
5370 * metadevices to avoid deadlocking if an error is encountered
5371 * during the open() attempt. The failure case is:
5372 * s-p -> mirror -> s-p (with error). Attempting to metaclear
5373 * this configuration would deadlock as the mirror code has to
5374 * send a state-update to the other nodes when it detects the
5375 * failure of the underlying submirror with an errored soft-part
5376 * on it. As there is a class1 message in progress (metaclear)
5377 * set_sm_comp_state() cannot send another class1 message;
5378 * instead we do not send a state_update message as the
5379 * metaclear is distributed and the failed submirror will be
5380 * cleared from the configuration by the metaclear.
5381 */
5382 IOLOCK_INIT(&lock);
5383 return (mirror_internal_open(getminor(*dev), flag, otyp,
5384 md_oflags, &lock));
5385 } else {
5386 return (mirror_internal_open(getminor(*dev), flag, otyp,
5387 md_oflags, (IOLOCK *)NULL));
5388 }
5389 }
5390
5391
5392 /*ARGSUSED1*/
5393 static int
mirror_close(dev_t dev, int flag, int otyp, cred_t *cred_p, int md_cflags)
5395 {
5396 return (mirror_internal_close(getminor(dev), otyp, md_cflags,
5397 (IOLOCK *)NULL));
5398 }
5399
5400
5401 /*
5402 * This routine dumps memory to the disk. It assumes that the memory has
5403 * already been mapped into mainbus space. It is called at disk interrupt
5404 * priority when the system is in trouble.
5405 *
5406 */
5407 static int
mirror_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
5409 {
5410 mm_unit_t *un;
5411 dev_t mapdev;
5412 int result;
5413 int smi;
5414 int any_succeed = 0;
5415 int save_result = 0;
5416
5417 /*
5418 * Don't need to grab the unit lock.
	 * Because nothing else is supposed to be happening.
	 * Also dump is not supposed to sleep.
5421 */
5422 un = (mm_unit_t *)MD_UNIT(getminor(dev));
5423
5424 if ((diskaddr_t)blkno >= un->c.un_total_blocks)
5425 return (EINVAL);
5426
5427 if ((diskaddr_t)blkno + nblk > un->c.un_total_blocks)
5428 return (EINVAL);
5429
5430 for (smi = 0; smi < NMIRROR; smi++) {
5431 if (!SUBMIRROR_IS_WRITEABLE(un, smi))
5432 continue;
5433 mapdev = md_dev64_to_dev(un->un_sm[smi].sm_dev);
5434 result = bdev_dump(mapdev, addr, blkno, nblk);
5435 if (result)
5436 save_result = result;
5437
5438 if (result == 0)
5439 any_succeed++;
5440 }
5441
5442 if (any_succeed)
5443 return (0);
5444
5445 return (save_result);
5446 }
5447
5448 /*
5449 * NAME: mirror_probe_dev
5450 *
 * DESCRIPTION: force opens every component of a mirror.
5452 *
5453 * On entry the unit writerlock is held
5454 */
5455 static int
mirror_probe_dev(mdi_unit_t *ui, minor_t mnum)
5457 {
5458 int i;
5459 int smi;
5460 int ci;
5461 mm_unit_t *un;
5462 int md_devopen = 0;
5463 set_t setno;
5464 int sm_cnt;
5465 int sm_unavail_cnt;
5466
5467 if (md_unit_isopen(ui))
5468 md_devopen++;
5469
5470 un = MD_UNIT(mnum);
5471 setno = MD_UN2SET(un);
5472
5473 sm_cnt = 0;
5474 sm_unavail_cnt = 0;
5475 for (i = 0; i < NMIRROR; i++) {
5476 md_dev64_t tmpdev;
5477 mdi_unit_t *sm_ui;
5478
5479 if (!SMS_BY_INDEX_IS(un, i, SMS_INUSE)) {
5480 continue;
5481 }
5482
5483 sm_cnt++;
5484 tmpdev = un->un_sm[i].sm_dev;
5485 (void) md_layered_open(mnum, &tmpdev,
5486 MD_OFLG_CONT_ERRS | MD_OFLG_PROBEDEV);
5487 un->un_sm[i].sm_dev = tmpdev;
5488
5489 sm_ui = MDI_UNIT(getminor(md_dev64_to_dev(tmpdev)));
5490
5491 /*
5492 * Logic similar to that in mirror_open_all_devs. We set or
5493 * clear the submirror Unavailable bit.
5494 */
5495 (void) md_unit_writerlock(sm_ui);
5496 if (submirror_unavailable(un, i, 1)) {
5497 sm_ui->ui_tstate |= MD_INACCESSIBLE;
5498 sm_unavail_cnt++;
5499 } else {
5500 sm_ui->ui_tstate &= ~MD_INACCESSIBLE;
5501 }
5502 md_unit_writerexit(sm_ui);
5503 }
5504
5505 /*
5506 * If all of the submirrors are unavailable, the mirror is also
5507 * unavailable.
5508 */
5509 if (sm_cnt == sm_unavail_cnt) {
5510 ui->ui_tstate |= MD_INACCESSIBLE;
5511 } else {
5512 ui->ui_tstate &= ~MD_INACCESSIBLE;
5513 }
5514
5515 /*
5516 * Start checking from probe failures. If failures occur we
5517 * set the appropriate erred state only if the metadevice is in
5518 * use. This is specifically to prevent unnecessary resyncs.
5519 * For instance if the disks were accidentally disconnected when
5520 * the system booted up then until the metadevice is accessed
5521 * (like file system mount) the user can shutdown, recable and
5522 * reboot w/o incurring a potentially huge resync.
5523 */
5524
5525 smi = 0;
5526 ci = 0;
5527 while (mirror_geterror(un, &smi, &ci, 1, 1) != 0) {
5528
5529 if (mirror_other_sources(un, smi, ci, 0) == 1) {
5530 /*
5531 * Note that for a MN set, there is no need to call
5532 * SE_NOTIFY as that is done when processing the
5533 * state change
5534 */
5535 if (md_devopen) {
5536 /*
5537 * Never called from ioctl context,
5538 * so (IOLOCK *)NULL
5539 */
5540 set_sm_comp_state(un, smi, ci, CS_LAST_ERRED,
5541 0, MD_STATE_XMIT, (IOLOCK *)NULL);
5542 if (!MD_MNSET_SETNO(setno)) {
5543 SE_NOTIFY(EC_SVM_STATE,
5544 ESC_SVM_LASTERRED,
5545 SVM_TAG_METADEVICE, setno,
5546 MD_SID(un));
5547 }
5548 continue;
5549 } else {
5550 (void) mirror_close_all_devs(un,
5551 MD_OFLG_PROBEDEV);
5552 if (!MD_MNSET_SETNO(setno)) {
5553 SE_NOTIFY(EC_SVM_STATE,
5554 ESC_SVM_OPEN_FAIL,
5555 SVM_TAG_METADEVICE, setno,
5556 MD_SID(un));
5557 }
5558 mirror_openfail_console_info(un, smi, ci);
5559 return (ENXIO);
5560 }
5561 }
5562
5563 /*
5564 * Note that for a MN set, there is no need to call
5565 * SE_NOTIFY as that is done when processing the
5566 * state change
5567 */
5568 if (md_devopen) {
5569 /* Never called from ioctl context, so (IOLOCK *)NULL */
5570 set_sm_comp_state(un, smi, ci, CS_ERRED, 0,
5571 MD_STATE_XMIT, (IOLOCK *)NULL);
5572 if (!MD_MNSET_SETNO(setno)) {
5573 SE_NOTIFY(EC_SVM_STATE, ESC_SVM_ERRED,
5574 SVM_TAG_METADEVICE, setno,
5575 MD_SID(un));
5576 }
5577 }
5578 mirror_openfail_console_info(un, smi, ci);
5579 ci++;
5580 }
5581
5582 if (MD_MNSET_SETNO(setno)) {
5583 send_poke_hotspares(setno);
5584 } else {
5585 (void) poke_hotspares();
5586 }
5587 (void) mirror_close_all_devs(un, MD_OFLG_PROBEDEV);
5588
5589 return (0);
5590 }
5591
5592
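/*
 * NAME: mirror_imp_set
 *
 * DESCRIPTION: update the mirror records of an imported set so that the
 *		self id, parent id, record ids and submirror devices all
 *		refer to the set number the records are being imported
 *		into. Returns 1 if any mirror record was updated.
 */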
5593 static int
mirror_imp_set(
5595 set_t setno
5596 )
5597 {
5598
5599 mddb_recid_t recid;
5600 int gotsomething, i;
5601 mddb_type_t typ1;
5602 mddb_de_ic_t *dep;
5603 mddb_rb32_t *rbp;
5604 mm_unit32_od_t *un32;
5605 mm_unit_t *un64;
5606 md_dev64_t self_devt;
5607 minor_t *self_id; /* minor needs to be updated */
5608 md_parent_t *parent_id; /* parent needs to be updated */
5609 mddb_recid_t *record_id; /* record id needs to be updated */
5610 mddb_recid_t *optrec_id;
5611 md_dev64_t tmpdev;
5612
5613
5614 gotsomething = 0;
5615
5616 typ1 = (mddb_type_t)md_getshared_key(setno,
5617 mirror_md_ops.md_driver.md_drivername);
5618 recid = mddb_makerecid(setno, 0);
5619
5620 while ((recid = mddb_getnextrec(recid, typ1, MIRROR_REC)) > 0) {
5621 if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
5622 continue;
5623
5624 dep = mddb_getrecdep(recid);
5625 rbp = dep->de_rb;
5626
5627 switch (rbp->rb_revision) {
5628 case MDDB_REV_RB:
5629 case MDDB_REV_RBFN:
5630 /*
5631 * Small device
5632 */
5633 un32 = (mm_unit32_od_t *)mddb_getrecaddr(recid);
5634 self_id = &(un32->c.un_self_id);
5635 parent_id = &(un32->c.un_parent);
5636 record_id = &(un32->c.un_record_id);
5637 optrec_id = &(un32->un_rr_dirty_recid);
5638
5639 for (i = 0; i < un32->un_nsm; i++) {
5640 tmpdev = md_expldev(un32->un_sm[i].sm_dev);
5641 un32->un_sm[i].sm_dev = md_cmpldev
5642 (md_makedevice(md_major, MD_MKMIN(setno,
5643 MD_MIN2UNIT(md_getminor(tmpdev)))));
5644
5645 if (!md_update_minor(setno, mddb_getsidenum
5646 (setno), un32->un_sm[i].sm_key))
5647 goto out;
5648 }
5649 break;
5650 case MDDB_REV_RB64:
5651 case MDDB_REV_RB64FN:
5652 un64 = (mm_unit_t *)mddb_getrecaddr(recid);
5653 self_id = &(un64->c.un_self_id);
5654 parent_id = &(un64->c.un_parent);
5655 record_id = &(un64->c.un_record_id);
5656 optrec_id = &(un64->un_rr_dirty_recid);
5657
5658 for (i = 0; i < un64->un_nsm; i++) {
5659 tmpdev = un64->un_sm[i].sm_dev;
5660 un64->un_sm[i].sm_dev = md_makedevice
5661 (md_major, MD_MKMIN(setno, MD_MIN2UNIT
5662 (md_getminor(tmpdev))));
5663
5664 if (!md_update_minor(setno, mddb_getsidenum
5665 (setno), un64->un_sm[i].sm_key))
5666 goto out;
5667 }
5668 break;
5669 }
5670
5671 /*
5672 * If this is a top level and a friendly name metadevice,
5673 * update its minor in the namespace.
5674 */
5675 if ((*parent_id == MD_NO_PARENT) &&
5676 ((rbp->rb_revision == MDDB_REV_RBFN) ||
5677 (rbp->rb_revision == MDDB_REV_RB64FN))) {
5678
5679 self_devt = md_makedevice(md_major, *self_id);
5680 if (!md_update_top_device_minor(setno,
5681 mddb_getsidenum(setno), self_devt))
5682 goto out;
5683 }
5684
5685 /*
5686 * Update unit with the imported setno
5687 *
5688 */
5689 mddb_setrecprivate(recid, MD_PRV_GOTIT);
5690
5691 *self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
5692 if (*parent_id != MD_NO_PARENT)
5693 *parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
5694 *record_id = MAKERECID(setno, DBID(*record_id));
5695 *optrec_id = MAKERECID(setno, DBID(*optrec_id));
5696
5697 gotsomething = 1;
5698 }
5699
5700 out:
5701 return (gotsomething);
5702 }
5703
5704 /*
5705 * NAME: mirror_check_offline
5706 *
5707 * DESCRIPTION: return offline_status = 1 if any submirrors are offline
5708 *
5709 * Called from ioctl, so access to MD_UN_OFFLINE_SM in un_status is
5710 * protected by the global ioctl lock as it is only set by the MD_IOCOFFLINE
5711 * ioctl.
5712 */
5713 int
mirror_check_offline(md_dev64_t dev, int *offline_status)
5715 {
5716 mm_unit_t *un;
5717 md_error_t mde = mdnullerror;
5718
5719 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5720 return (EINVAL);
5721 *offline_status = 0;
5722 if (un->c.un_status & MD_UN_OFFLINE_SM)
5723 *offline_status = 1;
5724 return (0);
5725 }
5726
5727 /*
5728 * NAME: mirror_inc_abr_count
5729 *
5730 * DESCRIPTION: increment the count of layered soft parts with ABR set
5731 *
5732  * Called from ioctl, so access to un_abr_count is protected by the global
5733  * ioctl lock; the count is otherwise only referenced by the MD_IOCOFFLINE ioctl.
5734 */
5735 int
5736 mirror_inc_abr_count(md_dev64_t dev)
5737 {
5738 mm_unit_t *un;
5739 md_error_t mde = mdnullerror;
5740
5741 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5742 return (EINVAL);
5743 un->un_abr_count++;
5744 return (0);
5745 }
5746
5747 /*
5748 * NAME: mirror_dec_abr_count
5749 *
5750 * DESCRIPTION: decrement the count of layered soft parts with ABR set
5751 *
5752  * Called from ioctl, so access to un_abr_count is protected by the global
5753  * ioctl lock; the count is otherwise only referenced by the MD_IOCOFFLINE ioctl.
5754 */
5755 int
5756 mirror_dec_abr_count(md_dev64_t dev)
5757 {
5758 mm_unit_t *un;
5759 md_error_t mde = mdnullerror;
5760
5761 if ((un = mirror_getun(getminor(dev), &mde, NO_LOCK, NULL)) == NULL)
5762 return (EINVAL);
5763 un->un_abr_count--;
5764 return (0);
5765 }
5766
5767 static md_named_services_t mirror_named_services[] = {
5768 {(intptr_t (*)()) poke_hotspares, "poke hotspares" },
5769 {(intptr_t (*)()) mirror_rename_listkids, MDRNM_LIST_URKIDS },
5770 {mirror_rename_check, MDRNM_CHECK },
5771 {(intptr_t (*)()) mirror_renexch_update_kids, MDRNM_UPDATE_KIDS },
5772 {(intptr_t (*)()) mirror_exchange_parent_update_to,
5773 MDRNM_PARENT_UPDATE_TO},
5774 {(intptr_t (*)()) mirror_exchange_self_update_from_down,
5775 MDRNM_SELF_UPDATE_FROM_DOWN },
5776 {(intptr_t (*)())mirror_probe_dev, "probe open test" },
5777 {(intptr_t (*)())mirror_check_offline, MD_CHECK_OFFLINE },
5778 {(intptr_t (*)())mirror_inc_abr_count, MD_INC_ABR_COUNT },
5779 {(intptr_t (*)())mirror_dec_abr_count, MD_DEC_ABR_COUNT },
5780 { NULL, 0 }
5781 };
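/*
 * Illustrative sketch only (not built): layered drivers reach the
 * services above by name.  The lookup routine shown here,
 * md_get_named_service(), and its argument usage are assumptions about
 * the md core; only the service name MD_CHECK_OFFLINE and the callback
 * signature come from this file.
 */
#if 0
static int
example_query_offline(md_dev64_t mirror_dev, int *offlinep)
{
	int	(*check_offline)(md_dev64_t, int *);

	check_offline = (int (*)(md_dev64_t, int *))md_get_named_service(
	    mirror_dev, 0, MD_CHECK_OFFLINE, 0);
	if (check_offline == NULL)
		return (EINVAL);
	return ((*check_offline)(mirror_dev, offlinep));
}
#endif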
5782
5783 md_ops_t mirror_md_ops = {
5784 mirror_open, /* open */
5785 mirror_close, /* close */
5786 md_mirror_strategy, /* strategy */
5787 NULL, /* print */
5788 mirror_dump, /* dump */
5789 NULL, /* read */
5790 NULL, /* write */
5791 	md_mirror_ioctl,	/* ioctl */
5792 	mirror_snarf,		/* snarf */
5793 	mirror_halt,		/* halt */
5794 NULL, /* aread */
5795 NULL, /* awrite */
5796 mirror_imp_set, /* import set */
5797 mirror_named_services
5798 };
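/*
 * Illustrative sketch only (not built): the md core keeps one md_ops_t
 * per metadevice type and dispatches through it.  The md_ops[] array
 * and type index used below are assumptions about the core; the member
 * called corresponds to the strategy slot filled in above, so for a
 * mirror unit it would resolve to md_mirror_strategy().
 */
#if 0
static void
example_dispatch_strategy(int type, buf_t *bp)
{
	(void) (*md_ops[type]->md_strategy)(bp, 0, NULL);
}
#endif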
5799
5800 /* module specific initialization */
5801 static void
5802 init_init()
5803 {
5804 md_mirror_mcs_buf_off = sizeof (md_mcs_t) - sizeof (buf_t);
5805
5806 /* Initialize the parent and child save memory pools */
5807 mirror_parent_cache = kmem_cache_create("md_mirror_parent",
5808 sizeof (md_mps_t), 0, mirror_parent_constructor,
5809 mirror_parent_destructor, mirror_run_queue, NULL, NULL,
5810 0);
5811
5812 mirror_child_cache = kmem_cache_create("md_mirror_child",
5813 sizeof (md_mcs_t) - sizeof (buf_t) + biosize(), 0,
5814 mirror_child_constructor, mirror_child_destructor,
5815 mirror_run_queue, NULL, NULL, 0);
5816
5817 /*
5818 	 * Ensure wowbuf_size is a multiple of DEV_BSIZE,
5819 * then initialize wowbuf memory pool.
5820 */
5821 md_wowbuf_size = roundup(md_wowbuf_size, DEV_BSIZE);
5822 if (md_wowbuf_size <= 0)
5823 md_wowbuf_size = 2 * DEV_BSIZE;
5824 if (md_wowbuf_size > (32 * DEV_BSIZE))
5825 md_wowbuf_size = (32 * DEV_BSIZE);
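	/*
	 * Worked example (illustrative only): with DEV_BSIZE at 512, a
	 * tuned md_wowbuf_size of 10000 rounds up to 10240 (20 blocks)
	 * and passes both clamps; a value of 0 is raised to
	 * 2 * DEV_BSIZE and anything above 32 * DEV_BSIZE is capped.
	 */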
5826
5827 md_wowblk_size = md_wowbuf_size + sizeof (wowhdr_t);
5828 mirror_wowblk_cache = kmem_cache_create("md_mirror_wow",
5829 md_wowblk_size, 0, NULL, NULL, NULL, NULL, NULL, 0);
5830
5831 mutex_init(&mirror_timeout.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5832 mutex_init(&hotspare_request.dr_mx, NULL, MUTEX_DEFAULT, NULL);
5833
5834 mutex_init(&non_ff_drv_mutex, NULL, MUTEX_DEFAULT, NULL);
5835 }
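/*
 * Illustrative sketch only (not built): objects from the caches created
 * in init_init() are obtained and released in the I/O path elsewhere in
 * this file.  KM_SLEEP is ordinary kmem usage assumed here; the real
 * allocation sites and flags are in the strategy code above.
 */
#if 0
static md_mps_t *
example_get_parent_save(void)
{
	/* the constructor registered with kmem_cache_create() has run */
	return (kmem_cache_alloc(mirror_parent_cache, KM_SLEEP));
}

static void
example_put_parent_save(md_mps_t *ps)
{
	kmem_cache_free(mirror_parent_cache, ps);
}
#endif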
5836
5837 /* module specific uninitialization (undo init_init()) */
5838 static void
5839 fini_uninit()
5840 {
5841 kmem_cache_destroy(mirror_parent_cache);
5842 kmem_cache_destroy(mirror_child_cache);
5843 kmem_cache_destroy(mirror_wowblk_cache);
5844 mirror_parent_cache = mirror_child_cache =
5845 mirror_wowblk_cache = NULL;
5846
5847 mutex_destroy(&mirror_timeout.dr_mx);
5848 mutex_destroy(&hotspare_request.dr_mx);
5849 mutex_destroy(&non_ff_drv_mutex);
5850 }
5851
5852 /* define the module linkage */
5853 MD_PLUGIN_MISC_MODULE("mirrors module", init_init(), fini_uninit())
5854