/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 * Copyright (c) 2011 Bayard G. Bell. All rights reserved.
 */

/*
 * Soft partitioning metadevice driver (md_sp).
 *
 * This file contains the primary operations of the soft partitioning
 * metadevice driver. This includes all routines for normal operation
 * (open/close/read/write). Please see mdvar.h for a definition of the
 * metadevice operations vector (md_ops_t). This driver is loosely
 * based on the stripe driver (md_stripe).
 *
 * All metadevice administration is done through the use of ioctl's.
 * As such, all administrative routines appear in sp_ioctl.c.
 *
 * Soft partitions are represented both in-core and in the metadb with a
 * unit structure. The soft partition-specific information in the unit
 * structure includes the following information:
 * - Device information (md_dev64_t & md key) about the device on which
 *   the soft partition is built.
 * - Soft partition status information.
 * - The size of the soft partition and number of extents used to
 *   make up that size.
 * - An array of extents which define virtual/physical offset
 *   mappings and lengths for each extent.
 *
 * Typical soft partition operation proceeds as follows:
 * - The unit structure is fetched from the metadb and placed into
 *   an in-core array (as with other metadevices). This operation
 *   is performed via sp_build_incore( ) and takes place during
 *   "snarfing" (when all metadevices are brought in-core at
 *   once) and when a new soft partition is created.
 * - A soft partition is opened via sp_open( ). At open time the
 *   soft partition unit structure is verified with the soft
 *   partition on-disk structures. Additionally, the soft partition
 *   status is checked (only soft partitions in the OK state may be
 *   opened).
 * - Soft partition I/O is performed via sp_strategy( ) which relies on
 *   a support routine, sp_mapbuf( ), to do most of the work.
 *   sp_mapbuf( ) maps a buffer to a particular extent via a binary
 *   search of the extent array in the soft partition unit structure.
 *   Once a translation has been performed, the I/O is passed down
 *   to the next layer, which may be another metadevice or a physical
 *   disk. Since a soft partition may contain multiple, non-contiguous
 *   extents, a single I/O may have to be fragmented.
 * - Soft partitions are closed using sp_close( ).
 */
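
/*
 * As an illustration of the extent mapping described above (using
 * hypothetical numbers), consider a soft partition with two extents:
 *
 *	ext 0: un_voff = 0,   un_len = 100, un_poff = 2000
 *	ext 1: un_voff = 100, un_len = 50,  un_poff = 5000
 *
 * Virtual block 120 falls in extent 1 and maps to physical block
 * 5000 + (120 - 100) = 5020. An I/O spanning virtual blocks 90-110
 * crosses the extent boundary at block 100 and must therefore be
 * fragmented into two child I/Os by sp_strategy( )/sp_mapbuf( ).
 */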

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
#include <sys/file.h>
#include <sys/user.h>
#include <sys/uio.h>
#include <sys/t_lock.h>
#include <sys/buf.h>
#include <sys/dkio.h>
#include <sys/vtoc.h>
#include <sys/kmem.h>
#include <vm/page.h>
#include <sys/cmn_err.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/mkdev.h>
#include <sys/stat.h>
#include <sys/open.h>
#include <sys/lvm/mdvar.h>
#include <sys/lvm/md_sp.h>
#include <sys/lvm/md_convert.h>
#include <sys/lvm/md_notify.h>
#include <sys/lvm/md_crc.h>
#include <sys/modctl.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/debug.h>

#include <sys/sysevent/eventdefs.h>
#include <sys/sysevent/svm.h>

md_ops_t sp_md_ops;
#ifndef lint
md_ops_t *md_interface_ops = &sp_md_ops;
#endif

extern unit_t md_nunits;
extern set_t md_nsets;
extern md_set_t md_set[];

extern int md_status;
extern major_t md_major;
extern mdq_anchor_t md_done_daemon;
extern mdq_anchor_t md_sp_daemon;
extern kmutex_t md_mx;
extern kcondvar_t md_cv;
extern md_krwlock_t md_unit_array_rw;
extern clock_t md_hz;

static kmem_cache_t *sp_parent_cache = NULL;
static kmem_cache_t *sp_child_cache = NULL;
static void sp_send_stat_ok(mp_unit_t *);
static void sp_send_stat_err(mp_unit_t *);

/*
 * FUNCTION: sp_parent_constructor()
 * INPUT: p - ptr to parent save structure to be constructed.
 * OUTPUT: none.
 * RETURNS: int - 0 (always).
 * PURPOSE: initialize parent save structure.
 */
/*ARGSUSED1*/
static int
sp_parent_constructor(void *p, void *d1, int d2)
{
	mutex_init(&((md_spps_t *)p)->ps_mx,
	    NULL, MUTEX_DEFAULT, NULL);
	return (0);
}

static void
sp_parent_init(md_spps_t *ps)
{
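	/*
	 * Zero all fields up to, but not including, ps_mx; the mutex is
	 * constructed once by the kmem cache constructor and persists
	 * across allocations of parent save structures.
	 */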
	bzero(ps, offsetof(md_spps_t, ps_mx));
}

/*ARGSUSED1*/
static void
sp_parent_destructor(void *p, void *d)
{
	mutex_destroy(&((md_spps_t *)p)->ps_mx);
}

/*
 * FUNCTION: sp_child_constructor()
 * INPUT: p - ptr to child save structure to be constructed.
 * OUTPUT: none.
 * RETURNS: int - 0 (always).
 * PURPOSE: initialize child save structure.
 */
/*ARGSUSED1*/
static int
sp_child_constructor(void *p, void *d1, int d2)
{
	bioinit(&((md_spcs_t *)p)->cs_buf);
	return (0);
}

static void
sp_child_init(md_spcs_t *cs)
{
	cs->cs_mdunit = 0;
	cs->cs_ps = NULL;
	md_bioreset(&cs->cs_buf);
}

/*ARGSUSED1*/
static void
sp_child_destructor(void *p, void *d)
{
	biofini(&((md_spcs_t *)p)->cs_buf);
}

/*
 * FUNCTION: sp_run_queue()
 * INPUT: none.
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: run the md_daemon to clean up the memory pool.
 */
/*ARGSUSED*/
static void
sp_run_queue(void *d)
{
	if (!(md_status & MD_GBL_DAEMONS_LIVE))
		md_daemon(1, &md_done_daemon);
}


/*
 * FUNCTION: sp_build_incore()
 * INPUT: p - ptr to unit structure.
 *        snarfing - flag to tell us we are snarfing.
 * OUTPUT: none.
 * RETURNS: int - 0 (always).
 * PURPOSE: place unit structure into in-core unit array (keyed from
 *          minor number).
 */
int
sp_build_incore(void *p, int snarfing)
{
	mp_unit_t *un = (mp_unit_t *)p;
	minor_t mnum;
	set_t setno;
	md_dev64_t tmpdev;

	mnum = MD_SID(un);

	if (MD_UNIT(mnum) != NULL)
		return (0);

	MD_STATUS(un) = 0;

	if (snarfing) {
		/*
		 * if we are snarfing, we get the device information
		 * from the metadb record (using the metadb key for
		 * that device).
		 */
		setno = MD_MIN2SET(mnum);

		tmpdev = md_getdevnum(setno, mddb_getsidenum(setno),
		    un->un_key, MD_NOTRUST_DEVT);
		un->un_dev = tmpdev;
	}

	/* place various information in the in-core data structures */
	md_nblocks_set(mnum, un->c.un_total_blocks);
	MD_UNIT(mnum) = un;

	return (0);
}

/*
 * FUNCTION: reset_sp()
 * INPUT: un - unit structure to be reset/removed.
 *        mnum - minor number to be reset/removed.
 *        removing - flag to tell us if we are removing
 *                   permanently or just resetting in-core
 *                   structures.
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: used to either simply reset in-core structures or to
 *          permanently remove metadevices from the metadb.
 */
void
reset_sp(mp_unit_t *un, minor_t mnum, int removing)
{
	sv_dev_t *sv;
	mddb_recid_t vtoc_id;

	/* clean up in-core structures */
	md_destroy_unit_incore(mnum, &sp_md_ops);

	md_nblocks_set(mnum, -1ULL);
	MD_UNIT(mnum) = NULL;

	/*
	 * Attempt release of minor node
	 */
	md_remove_minor_node(mnum);

	if (!removing)
		return;

	/* we are removing the soft partition from the metadb */

	/*
	 * Save off device information so we can get to
	 * it after we do the mddb_deleterec().
	 */
	sv = (sv_dev_t *)kmem_alloc(sizeof (sv_dev_t), KM_SLEEP);
	sv->setno = MD_MIN2SET(mnum);
	sv->key = un->un_key;
	vtoc_id = un->c.un_vtoc_id;

	/*
	 * Remove self from the namespace
	 */
	if (un->c.un_revision & MD_FN_META_DEV) {
		(void) md_rem_selfname(un->c.un_self_id);
	}

	/* Remove the unit structure */
	mddb_deleterec_wrapper(un->c.un_record_id);

	if (vtoc_id)
		mddb_deleterec_wrapper(vtoc_id);

	SE_NOTIFY(EC_SVM_CONFIG, ESC_SVM_DELETE, TAG_METADEVICE,
	    MD_MIN2SET(mnum), MD_MIN2UNIT(mnum));

	/*
	 * remove the underlying device name from the metadb. if other
	 * soft partitions are built on this device, this will simply
	 * decrease the reference count for this device. otherwise the
	 * name record for this device will be removed from the metadb.
	 */
	md_rem_names(sv, 1);
	kmem_free(sv, sizeof (sv_dev_t));
}

/*
 * FUNCTION: sp_send_stat_msg
 * INPUT: un - unit reference
 *        status - status to be sent to master node
 *                 MD_SP_OK - soft-partition is now OK
 *                 MD_SP_ERR - soft-partition is now errored
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: send a soft-partition status change to the master node. If the
 *          message succeeds we simply return. If it fails we panic as the
 *          cluster-wide view of the metadevices is now inconsistent.
 * CALLING CONTEXT:
 *	Blockable. No locks can be held.
 */
static void
sp_send_stat_msg(mp_unit_t *un, sp_status_t status)
{
	md_mn_msg_sp_setstat_t sp_msg;
	md_mn_kresult_t *kres;
	set_t setno = MD_UN2SET(un);
	int rval;
	const char *str = (status == MD_SP_ERR) ? "MD_SP_ERR" : "MD_SP_OK";
	int nretries = 0;

	sp_msg.sp_setstat_mnum = MD_SID(un);
	sp_msg.sp_setstat_status = status;

	kres = kmem_alloc(sizeof (md_mn_kresult_t), KM_SLEEP);

spss_msg:
	rval = mdmn_ksend_message(setno, MD_MN_MSG_SP_SETSTAT2, MD_MSGF_NO_LOG,
	    0, (char *)&sp_msg, sizeof (sp_msg), kres);

	if (!MDMN_KSEND_MSG_OK(rval, kres)) {
		mdmn_ksend_show_error(rval, kres, "MD_MN_MSG_SP_SETSTAT2");
		/* If we're shutting down already, pause things here. */
		if (kres->kmmr_comm_state == MDMNE_RPC_FAIL) {
			while (!md_mn_is_commd_present()) {
				delay(md_hz);
			}
			/*
			 * commd is available again. Retry the message once.
			 * If it fails we panic as the system is in an
			 * unexpected state.
			 */
			if (nretries++ == 0)
				goto spss_msg;
		}
		/*
		 * Panic as we are now in an inconsistent state.
		 */
		cmn_err(CE_PANIC, "md: %s: %s could not be set on all nodes\n",
		    md_shortname(MD_SID(un)), str);
	}

	kmem_free(kres, sizeof (md_mn_kresult_t));
}

/*
 * FUNCTION: sp_finish_error
 * INPUT: ps - parent save structure for errored I/O.
 *        lock_held - set if the unit readerlock is held
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: report a driver error
 */
static void
sp_finish_error(md_spps_t *ps, int lock_held)
{
	struct buf *pb = ps->ps_bp;
	mdi_unit_t *ui = ps->ps_ui;
	md_dev64_t un_dev;	/* underlying device */
	md_dev64_t md_dev = md_expldev(pb->b_edev);	/* metadev in error */
	char *str;

	un_dev = md_expldev(ps->ps_un->un_dev);
	/* set error type */
	if (pb->b_flags & B_READ) {
		str = "read";
	} else {
		str = "write";
	}


	SPPS_FREE(sp_parent_cache, ps);
	pb->b_flags |= B_ERROR;

	md_kstat_done(ui, pb, 0);

	if (lock_held) {
		md_unit_readerexit(ui);
	}
	md_biodone(pb);

	cmn_err(CE_WARN, "md: %s: %s error on %s",
	    md_shortname(md_getminor(md_dev)), str,
	    md_devname(MD_DEV2SET(md_dev), un_dev, NULL, 0));
}


/*
 * FUNCTION: sp_xmit_ok
 * INPUT: dq - daemon queue referencing the ps structure to process
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: send a message to the master node in a multi-owner diskset to
 *          update all attached nodes' view of the soft-part to be MD_SP_OK.
 * CALLING CONTEXT:
 *	Blockable. No unit lock held.
 */
static void
sp_xmit_ok(daemon_queue_t *dq)
{
	md_spps_t *ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_OK);

	/*
	 * Successfully transmitted the OK state to all nodes, now release
	 * this parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

/*
 * FUNCTION: sp_xmit_error
 * INPUT: dq - daemon queue referencing the failing ps structure
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: send a message to the master node in a multi-owner diskset to
 *          update all attached nodes' view of the soft-part to be MD_SP_ERR.
 * CALLING CONTEXT:
 *	Blockable. No unit lock held.
 */
static void
sp_xmit_error(daemon_queue_t *dq)
{
	md_spps_t *ps = (md_spps_t *)dq;

	/* Send a MD_MN_MSG_SP_SETSTAT2 to the master */
	sp_send_stat_msg(ps->ps_un, MD_SP_ERR);

	/*
	 * Successfully transmitted error state to all nodes, now release this
	 * parent structure.
	 */
	SPPS_FREE(sp_parent_cache, ps);
}

static void
sp_send_stat_ok(mp_unit_t *un)
{
	minor_t mnum = MD_SID(un);
	md_spps_t *ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_ok, (daemon_queue_t *)ps,
	    REQ_OLD);
}

static void
sp_send_stat_err(mp_unit_t *un)
{
	minor_t mnum = MD_SID(un);
	md_spps_t *ps;

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);
	ps->ps_un = un;
	ps->ps_ui = MDI_UNIT(mnum);

	daemon_request(&md_sp_daemon, sp_xmit_error, (daemon_queue_t *)ps,
	    REQ_OLD);
}


/*
 * FUNCTION: sp_error()
 * INPUT: ps - parent save structure for errored I/O.
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: report a driver error.
 * CALLING CONTEXT:
 *	Interrupt - non-blockable
 */
static void
sp_error(md_spps_t *ps)
{
	set_t setno = MD_UN2SET(ps->ps_un);

	/*
	 * Drop the mutex associated with this request before (potentially)
	 * enqueuing the free onto a separate thread. We have to release the
	 * mutex before destroying the parent structure.
	 */
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		if (MUTEX_HELD(&ps->ps_mx)) {
			mutex_exit(&ps->ps_mx);
		}
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}

	/*
	 * For a multi-owner set we need to send a message to the master so
	 * that all nodes get the errored status when we first encounter it.
	 * To avoid deadlocking when multiple soft-partitions encounter an
	 * error on one physical unit we drop the unit readerlock before
	 * enqueueing the request. That way we can service any messages that
	 * require a writerlock to be held. Additionally, to avoid deadlocking
	 * when at the bottom of a metadevice stack and a higher level mirror
	 * has multiple requests outstanding on this soft-part, we clone the
	 * ps that failed and pass the error back up the stack to release the
	 * reference that this i/o may have in the higher-level metadevice.
	 * The other nodes in the cluster just have to modify the soft-part
	 * status and we do not need to block the i/o completion for this.
	 */
	if (MD_MNSET_SETNO(setno)) {
		md_spps_t *err_ps;
		err_ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
		sp_parent_init(err_ps);

		err_ps->ps_un = ps->ps_un;
		err_ps->ps_ui = ps->ps_ui;

		md_unit_readerexit(ps->ps_ui);

		daemon_request(&md_sp_daemon, sp_xmit_error,
		    (daemon_queue_t *)err_ps, REQ_OLD);

		sp_finish_error(ps, 0);

		return;
	} else {
		ps->ps_un->un_status = MD_SP_ERR;
	}

	/* Flag the error */
	sp_finish_error(ps, 1);
}

/*
 * FUNCTION: sp_mapbuf()
 * INPUT: un - unit structure for soft partition we are doing
 *             I/O on.
 *        voff - virtual offset in soft partition to map.
 *        bcount - # of blocks in the I/O.
 * OUTPUT: bp - translated buffer to be passed down to next layer.
 * RETURNS: 1 - request must be fragmented, more work to do,
 *          0 - request satisfied, no more work to do
 *          -1 - error
 * PURPOSE: Map the virtual offset in the soft partition (passed
 *          in via voff) to the "physical" offset on whatever the soft
 *          partition is built on top of. We do this by doing a binary
 *          search of the extent array in the soft partition unit
 *          structure. Once the current extent is found, we do the
 *          translation, determine if the I/O will cross extent
 *          boundaries (if so, we have to fragment the I/O), then
 *          fill in the buf structure to be passed down to the next layer.
 */
static int
sp_mapbuf(
	mp_unit_t *un,
	sp_ext_offset_t voff,
	sp_ext_length_t bcount,
	buf_t *bp
)
{
	int lo, mid, hi, found, more;
	size_t new_bcount;
	sp_ext_offset_t new_blkno;
	sp_ext_offset_t new_offset;
	sp_ext_offset_t ext_endblk;
	md_dev64_t new_edev;
	extern unsigned md_maxphys;

	found = 0;
	lo = 0;
	hi = un->un_numexts - 1;

	/*
	 * do a binary search to find the extent that contains the
	 * starting offset. after this loop, mid contains the index
	 * of the correct extent.
	 */
	while (lo <= hi && !found) {
		mid = (lo + hi) / 2;
		/* is the starting offset contained within the mid-ext? */
		if (voff >= un->un_ext[mid].un_voff &&
		    voff < un->un_ext[mid].un_voff + un->un_ext[mid].un_len)
			found = 1;
		else if (voff < un->un_ext[mid].un_voff)
			hi = mid - 1;
		else	/* voff >= un_ext[mid].un_voff + un_ext[mid].un_len */
			lo = mid + 1;
	}

	if (!found) {
		cmn_err(CE_WARN, "sp_mapbuf: invalid offset %llu.\n", voff);
		return (-1);
	}

	/* translate to underlying physical offset/device */
	new_offset = voff - un->un_ext[mid].un_voff;
	new_blkno = un->un_ext[mid].un_poff + new_offset;
	new_edev = un->un_dev;

	/* determine if we need to break the I/O into fragments */
	ext_endblk = un->un_ext[mid].un_voff + un->un_ext[mid].un_len;
	if (voff + btodb(bcount) > ext_endblk) {
		new_bcount = dbtob(ext_endblk - voff);
		more = 1;
	} else {
		new_bcount = bcount;
		more = 0;
	}

	/* only break up the I/O if we're not built on another metadevice */
	if ((md_getmajor(new_edev) != md_major) && (new_bcount > md_maxphys)) {
		new_bcount = md_maxphys;
		more = 1;
	}
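	/*
	 * (No md_maxphys clamp is applied when the underlying device is
	 * itself a metadevice; presumably that driver fragments the
	 * request according to its own limits.)
	 */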
	if (bp != (buf_t *)NULL) {
		/* do bp updates */
		bp->b_bcount = new_bcount;
		bp->b_lblkno = new_blkno;
		bp->b_edev = md_dev64_to_dev(new_edev);
	}
	return (more);
}

/*
 * FUNCTION: sp_validate()
 * INPUT: un - unit structure to be validated.
 * OUTPUT: none.
 * RETURNS: 0 - soft partition ok.
 *          -1 - error.
 * PURPOSE: called on open to sanity check the soft partition. In
 *          order to open a soft partition:
 *          - it must have at least one extent
 *          - the extent info in core and on disk must match
 *          - it may not be in an intermediate state (which would
 *            imply that a two-phase commit was interrupted)
 *
 *          If the extent checking fails (B_ERROR returned from the read
 *          strategy call) _and_ we're a multi-owner diskset, we send a
 *          message to the master so that all nodes inherit the same view
 *          of the soft partition.
 *          If we are checking a soft-part that is marked as in error, and
 *          we can actually read and validate the watermarks, we send a
 *          message to the master node to clear the error.
 */
static int
sp_validate(mp_unit_t *un)
{
	uint_t ext;
	struct buf *buf;
	sp_ext_length_t len;
	mp_watermark_t *wm;
	set_t setno;
	int reset_error = 0;

	setno = MD_UN2SET(un);

	/* sanity check unit structure components ?? */
	if (un->un_status != MD_SP_OK) {
		if (un->un_status != MD_SP_ERR) {
			cmn_err(CE_WARN, "md: %s: open failed, soft partition "
			    "status is %u.",
			    md_shortname(MD_SID(un)),
			    un->un_status);
			return (-1);
		} else {
			cmn_err(CE_WARN, "md: %s: open of soft partition "
			    "in Errored state.",
			    md_shortname(MD_SID(un)));
			reset_error = 1;
		}
	}

	if (un->un_numexts == 0) {
		cmn_err(CE_WARN, "md: %s: open failed, soft partition does "
		    "not have any extents.", md_shortname(MD_SID(un)));
		return (-1);
	}

	len = 0LL;
	for (ext = 0; ext < un->un_numexts; ext++) {

		/* tally extent lengths to check total size */
		len += un->un_ext[ext].un_len;

		/* allocate buffer for watermark */
		buf = getrbuf(KM_SLEEP);

		/* read watermark */
		buf->b_flags = B_READ;
		buf->b_edev = md_dev64_to_dev(un->un_dev);
		buf->b_iodone = NULL;
		buf->b_proc = NULL;
		buf->b_bcount = sizeof (mp_watermark_t);
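		/* the watermark occupies the block just before the extent */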
		buf->b_lblkno = un->un_ext[ext].un_poff - 1;
		buf->b_bufsize = sizeof (mp_watermark_t);
		buf->b_un.b_addr = kmem_alloc(sizeof (mp_watermark_t),
		    KM_SLEEP);

		/*
		 * make the call non-blocking so that it is not affected
		 * by a set take.
		 */
		md_call_strategy(buf, MD_STR_MAPPED|MD_NOBLOCK, NULL);
		(void) biowait(buf);

		if (buf->b_flags & B_ERROR) {
			cmn_err(CE_WARN, "md: %s: open failed, could not "
			    "read watermark at block %llu for extent %u, "
			    "error %d.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, buf->b_error);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);

			/*
			 * If we're a multi-owner diskset we send a message
			 * indicating that this soft-part has an invalid
			 * extent to the master node. This ensures a consistent
			 * view of the soft-part across the cluster.
			 */
			if (MD_MNSET_SETNO(setno)) {
				sp_send_stat_err(un);
			}
			return (-1);
		}

		wm = (mp_watermark_t *)buf->b_un.b_addr;

		/* make sure the checksum is correct first */
		if (crcchk((uchar_t *)wm, (uint_t *)&wm->wm_checksum,
		    (uint_t)sizeof (mp_watermark_t), (uchar_t *)NULL)) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid checksum 0x%08x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_checksum);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		if (wm->wm_magic != MD_SP_MAGIC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u does not have a "
			    "valid watermark magic number, expected 0x%x, "
			    "found 0x%x.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, MD_SP_MAGIC, wm->wm_magic);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure sequence number matches the current extent */
		if (wm->wm_seq != ext) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has invalid "
			    "sequence number %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_seq);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/* make sure watermark length matches unit structure */
		if (wm->wm_length != un->un_ext[ext].un_len) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u has inconsistent "
			    "length, expected %llu, found %llu.",
			    md_shortname(MD_SID(un)), buf->b_lblkno,
			    ext, un->un_ext[ext].un_len,
			    (u_longlong_t)wm->wm_length);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}

		/*
		 * make sure the type is a valid soft partition and not
		 * a free extent or the end.
		 */
		if (wm->wm_type != EXTTYP_ALLOC) {
			cmn_err(CE_WARN, "md: %s: open failed, watermark "
			    "at block %llu for extent %u is not marked "
			    "as in-use, type = %u.", md_shortname(MD_SID(un)),
			    buf->b_lblkno, ext, wm->wm_type);
			kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
			freerbuf(buf);
			return (-1);
		}
		/* free up buffer */
		kmem_free(buf->b_un.b_addr, sizeof (mp_watermark_t));
		freerbuf(buf);
	}

	if (len != un->un_length) {
		cmn_err(CE_WARN, "md: %s: open failed, computed length "
		    "%llu != expected length %llu.", md_shortname(MD_SID(un)),
		    len, un->un_length);
		return (-1);
	}

	/*
	 * If we're a multi-owner set _and_ reset_error is set, we should clear
	 * the error condition on all nodes in the set. Use SP_SETSTAT2 with
	 * MD_SP_OK.
	 */
	if (MD_MNSET_SETNO(setno) && reset_error) {
		sp_send_stat_ok(un);
	}
	return (0);
}

/*
 * FUNCTION: sp_done()
 * INPUT: child_buf - buffer attached to child save structure.
 *                    this is the buffer on which I/O has just
 *                    completed.
 * OUTPUT: none.
 * RETURNS: 0 - all fragments are complete.
 *          1 - error, or more fragments outstanding.
 * PURPOSE: called on I/O completion.
 */
static int
sp_done(struct buf *child_buf)
{
	struct buf *parent_buf;
	mdi_unit_t *ui;
	md_spps_t *ps;
	md_spcs_t *cs;

	/* find the child save structure to which this buffer belongs */
	cs = (md_spcs_t *)((caddr_t)child_buf -
	    (sizeof (md_spcs_t) - sizeof (buf_t)));
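	/*
	 * (The back-pointer arithmetic above relies on cs_buf being the
	 * last member of md_spcs_t: stepping back over the members that
	 * precede the embedded buf recovers the enclosing child save
	 * structure.)
	 */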
	/* now get the parent save structure */
	ps = cs->cs_ps;
	parent_buf = ps->ps_bp;

	mutex_enter(&ps->ps_mx);
	/* pass any errors back up to the parent */
	if (child_buf->b_flags & B_ERROR) {
		ps->ps_flags |= MD_SPPS_ERROR;
		parent_buf->b_error = child_buf->b_error;
	}
	/* mapout, if needed */
	if (child_buf->b_flags & B_REMAPPED)
		bp_mapout(child_buf);

	ps->ps_frags--;
	if (ps->ps_frags != 0) {
		/*
		 * if this parent has more children, we just free the
		 * child and return.
		 */
		kmem_cache_free(sp_child_cache, cs);
		mutex_exit(&ps->ps_mx);
		return (1);
	}
	/* there are no more children */
	kmem_cache_free(sp_child_cache, cs);
	if (ps->ps_flags & MD_SPPS_ERROR) {
		sp_error(ps);
		return (1);
	}
	ui = ps->ps_ui;
	if (!(ps->ps_flags & MD_SPPS_DONTFREE)) {
		mutex_exit(&ps->ps_mx);
	} else {
		/*
		 * this should only ever happen if we are panicking,
		 * since DONTFREE is only set on the parent if panicstr
		 * is non-NULL.
		 */
		ASSERT(panicstr);
	}
	SPPS_FREE(sp_parent_cache, ps);
	md_kstat_done(ui, parent_buf, 0);
	md_unit_readerexit(ui);
	md_biodone(parent_buf);
	return (0);
}

/*
 * FUNCTION: md_sp_strategy()
 * INPUT: parent_buf - parent buffer
 *        flag - flags
 *        private - private data
 * OUTPUT: none.
 * RETURNS: void.
 * PURPOSE: Soft partitioning I/O strategy. Performs the main work
 *          needed to do I/O to a soft partition. The basic
 *          algorithm is as follows:
 *          - Allocate a child save structure to keep track
 *            of the I/O we are going to pass down.
 *          - Map the I/O to the correct extent in the soft
 *            partition (see sp_mapbuf()).
 *          - bioclone() the buffer and pass it down the
 *            stack using md_call_strategy.
 *          - If the I/O needs to split across extents,
 *            repeat the above steps until all fragments
 *            are finished.
 */
static void
md_sp_strategy(buf_t *parent_buf, int flag, void *private)
{
	md_spps_t *ps;
	md_spcs_t *cs;
	int more;
	mp_unit_t *un;
	mdi_unit_t *ui;
	size_t current_count;
	off_t current_offset;
	sp_ext_offset_t current_blkno;
	buf_t *child_buf;
	set_t setno = MD_MIN2SET(getminor(parent_buf->b_edev));
	int strat_flag = flag;

	/*
	 * When doing I/O to a multi-owner metadevice, check if the set is
	 * halted. We do this check without the needed lock held, for
	 * performance reasons.
	 * If an I/O just slips through while the set is locked via an
	 * MD_MN_SUSPEND_SET, we don't care about it.
	 * Only check for suspension if we are a top-level i/o request
	 * (MD_STR_NOTTOP is cleared in 'flag').
	 */
	if ((md_set[setno].s_status & (MD_SET_HALTED | MD_SET_MNSET)) ==
	    (MD_SET_HALTED | MD_SET_MNSET)) {
		if ((flag & MD_STR_NOTTOP) == 0) {
			mutex_enter(&md_mx);
			/* Here we loop until the set is no longer halted */
			while (md_set[setno].s_status & MD_SET_HALTED) {
				cv_wait(&md_cv, &md_mx);
			}
			mutex_exit(&md_mx);
		}
	}

	ui = MDI_UNIT(getminor(parent_buf->b_edev));

	md_kstat_waitq_enter(ui);

	un = (mp_unit_t *)md_unit_readerlock(ui);

	if ((flag & MD_NOBLOCK) == 0) {
		if (md_inc_iocount(setno) != 0) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = ENXIO;
			parent_buf->b_resid = parent_buf->b_bcount;
			md_kstat_waitq_exit(ui);
			md_unit_readerexit(ui);
			biodone(parent_buf);
			return;
		}
	} else {
		md_inc_iocount_noblock(setno);
	}

	if (!(flag & MD_STR_NOTTOP)) {
		if (md_checkbuf(ui, (md_unit_t *)un, parent_buf) != 0) {
			md_kstat_waitq_exit(ui);
			return;
		}
	}

	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_blkno;
	current_offset = 0;

	/*
	 * if we are at the top and we are panicking,
	 * we don't free in order to save state.
	 */
	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL))
		ps->ps_flags |= MD_SPPS_DONTFREE;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;

	/*
	 * Mark this i/o as MD_STR_ABR if we've had ABR enabled on this
	 * metadevice.
	 */
	if (ui->ui_tstate & MD_ABR_CAP)
		strat_flag |= MD_STR_ABR;

	/*
	 * this loop does the main work of an I/O. we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack. repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			parent_buf->b_flags |= B_ERROR;
			parent_buf->b_error = EIO;
			md_kstat_done(ui, parent_buf, 0);
			md_unit_readerexit(ui);
			md_biodone(parent_buf);
			kmem_cache_free(sp_parent_cache, ps);
			return;
		}

		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, sp_done, child_buf,
		    KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		md_call_strategy(child_buf, strat_flag, private);
	} while (more);

	if (!(flag & MD_STR_NOTTOP) && (panicstr != NULL)) {
		while (!(ps->ps_flags & MD_SPPS_DONE)) {
			md_daemon(1, &md_done_daemon);
		}
		kmem_cache_free(sp_parent_cache, ps);
	}
}

/*
 * FUNCTION: sp_directed_read()
 * INPUT: mnum - minor number
 *        vdr - vol_directed_rd_t from user
 *        mode - access mode for copying data out.
 * OUTPUT: none.
 * RETURNS: 0 - success
 *          Exxxxx - failure error-code
 * PURPOSE: Construct the necessary sub-device i/o requests to perform the
 *          directed read as requested by the user. This is essentially the
 *          same as md_sp_strategy() with the exception being that the
 *          underlying 'md_call_strategy' is replaced with an ioctl call.
 */
int
sp_directed_read(minor_t mnum, vol_directed_rd_t *vdr, int mode)
{
	md_spps_t *ps;
	md_spcs_t *cs;
	int more;
	mp_unit_t *un;
	mdi_unit_t *ui;
	size_t current_count;
	off_t current_offset;
	sp_ext_offset_t current_blkno;
	buf_t *child_buf, *parent_buf;
	void *kbuffer;
	vol_directed_rd_t cvdr;
	caddr_t userbuf;
	offset_t useroff;
	int ret = 0;

	ui = MDI_UNIT(mnum);

	md_kstat_waitq_enter(ui);

	bzero(&cvdr, sizeof (cvdr));

	un = (mp_unit_t *)md_unit_readerlock(ui);

	/*
	 * Construct a parent_buf header which reflects the user-supplied
	 * request.
	 */

	kbuffer = kmem_alloc(vdr->vdr_nbytes, KM_NOSLEEP);
	if (kbuffer == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		return (ENOMEM);
	}

	parent_buf = getrbuf(KM_NOSLEEP);
	if (parent_buf == NULL) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
		md_kstat_waitq_exit(ui);
		md_unit_readerexit(ui);
		kmem_free(kbuffer, vdr->vdr_nbytes);
		return (ENOMEM);
	}
	parent_buf->b_un.b_addr = kbuffer;
	parent_buf->b_flags = B_READ;
	parent_buf->b_bcount = vdr->vdr_nbytes;
	parent_buf->b_lblkno = lbtodb(vdr->vdr_offset);
	parent_buf->b_edev = un->un_dev;


	ps = kmem_cache_alloc(sp_parent_cache, MD_ALLOCFLAGS);
	sp_parent_init(ps);

	/*
	 * Save essential information from the original buffhdr
	 * in the parent.
	 */
	ps->ps_un = un;
	ps->ps_ui = ui;
	ps->ps_bp = parent_buf;
	ps->ps_addr = parent_buf->b_un.b_addr;

	current_count = parent_buf->b_bcount;
	current_blkno = (sp_ext_offset_t)parent_buf->b_lblkno;
	current_offset = 0;

	md_kstat_waitq_to_runq(ui);

	ps->ps_frags++;
	vdr->vdr_bytesread = 0;

	/*
	 * this loop does the main work of an I/O. we allocate a
	 * child save for each buf, do the logical to physical
	 * mapping, decide if we need to frag the I/O, clone the
	 * new I/O to pass down the stack. repeat until we've
	 * taken care of the entire buf that was passed to us.
	 */
	do {
		cs = kmem_cache_alloc(sp_child_cache, MD_ALLOCFLAGS);
		sp_child_init(cs);
		child_buf = &cs->cs_buf;
		cs->cs_ps = ps;

		more = sp_mapbuf(un, current_blkno, current_count, child_buf);
		if (more == -1) {
			ret = EIO;
			vdr->vdr_flags |= DKV_DMR_SHORT;
			kmem_cache_free(sp_child_cache, cs);
			goto err_out;
		}

		cvdr.vdr_flags = vdr->vdr_flags;
		cvdr.vdr_side = vdr->vdr_side;
		cvdr.vdr_nbytes = child_buf->b_bcount;
		cvdr.vdr_offset = ldbtob(child_buf->b_lblkno);
		/* Work out where we are in the allocated buffer */
		useroff = (offset_t)(uintptr_t)kbuffer;
		useroff = useroff + (offset_t)current_offset;
		cvdr.vdr_data = (void *)(uintptr_t)useroff;
		child_buf = md_bioclone(parent_buf, current_offset,
		    child_buf->b_bcount, child_buf->b_edev,
		    child_buf->b_blkno, NULL,
		    child_buf, KM_NOSLEEP);
		/* calculate new offset, counts, etc... */
		current_offset += child_buf->b_bcount;
		current_count -= child_buf->b_bcount;
		current_blkno += (sp_ext_offset_t)(btodb(child_buf->b_bcount));

		if (more) {
			mutex_enter(&ps->ps_mx);
			ps->ps_frags++;
			mutex_exit(&ps->ps_mx);
		}

		ret = md_call_ioctl(child_buf->b_edev, DKIOCDMR, &cvdr,
		    (mode | FKIOCTL), NULL);

		/*
		 * Free the child structure as we've finished with it.
		 * Normally this would be done by sp_done() but we're just
		 * using md_bioclone() to segment the transfer and we never
		 * issue a strategy request so the iodone will not be called.
		 */
		kmem_cache_free(sp_child_cache, cs);
		if (ret == 0) {
			/* copyout the returned data to vdr_data + offset */
			userbuf = (caddr_t)kbuffer;
			userbuf += (caddr_t)(cvdr.vdr_data) - (caddr_t)kbuffer;
			if (ddi_copyout(userbuf, vdr->vdr_data,
			    cvdr.vdr_bytesread, mode)) {
				ret = EFAULT;
				goto err_out;
			}
			vdr->vdr_bytesread += cvdr.vdr_bytesread;
		} else {
			goto err_out;
		}
	} while (more);

	/*
	 * Update the user-supplied vol_directed_rd_t structure with the
	 * contents of the last issued child request.
	 */
	vdr->vdr_flags = cvdr.vdr_flags;
	vdr->vdr_side = cvdr.vdr_side;
	bcopy(cvdr.vdr_side_name, vdr->vdr_side_name, VOL_SIDENAME);

err_out:
	if (ret != 0) {
		vdr->vdr_flags |= DKV_DMR_ERROR;
	}
	if (vdr->vdr_bytesread != vdr->vdr_nbytes) {
		vdr->vdr_flags |= DKV_DMR_SHORT;
	}
	kmem_cache_free(sp_parent_cache, ps);
	kmem_free(kbuffer, vdr->vdr_nbytes);
	freerbuf(parent_buf);
	md_unit_readerexit(ui);
	return (ret);
}

/*
 * FUNCTION: sp_snarf()
 * INPUT: cmd - snarf cmd.
 *        setno - set number.
 * OUTPUT: none.
 * RETURNS: 1 - soft partitions were snarfed.
 *          0 - no soft partitions were snarfed.
 * PURPOSE: Snarf soft partition metadb records into their in-core
 *          structures. This routine is called at "snarf time" when
 *          md loads and gets all metadevice records into memory.
 *          The basic algorithm is simply to walk the soft partition
 *          records in the metadb and call the soft partitioning
 *          build_incore routine to set up the in-core structures.
 */
static int
sp_snarf(md_snarfcmd_t cmd, set_t setno)
{
	mp_unit_t *un;
	mddb_recid_t recid;
	int gotsomething;
	int all_sp_gotten;
	mddb_type_t rec_type;
	mddb_de_ic_t *dep;
	mddb_rb32_t *rbp;
	mp_unit_t *big_un;
	mp_unit32_od_t *small_un;
	size_t newreqsize;


	if (cmd == MD_SNARF_CLEANUP)
		return (0);

	all_sp_gotten = 1;
	gotsomething = 0;

	/* get the record type */
	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	/*
	 * walk soft partition records in the metadb and call
	 * sp_build_incore to build in-core structures.
	 */
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		/* if we've already gotten this record, go to the next one */
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;


		dep = mddb_getrecdep(recid);
		dep->de_flags = MDDB_F_SOFTPART;
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			if ((rbp->rb_private & MD_PRV_CONVD) == 0) {
				/*
				 * This is an old, small record that has
				 * not yet been converted. Before we create
				 * an in-core metadevice from it we have to
				 * convert it to a big record.
				 */
				small_un =
				    (mp_unit32_od_t *)mddb_getrecaddr(recid);
				newreqsize = sizeof (mp_unit_t) +
				    ((small_un->un_numexts - 1) *
				    sizeof (struct mp_ext));
				big_un = (mp_unit_t *)kmem_zalloc(newreqsize,
				    KM_SLEEP);
				softpart_convert((caddr_t)small_un,
				    (caddr_t)big_un, SMALL_2_BIG);
				kmem_free(small_un, dep->de_reqsize);
				dep->de_rb_userdata = big_un;
				dep->de_reqsize = newreqsize;
				rbp->rb_private |= MD_PRV_CONVD;
				un = big_un;
			} else {
				/* Record has already been converted */
				un = (mp_unit_t *)mddb_getrecaddr(recid);
			}
			un->c.un_revision &= ~MD_64BIT_META_DEV;
			break;
		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			/* Large device */
			un = (mp_unit_t *)mddb_getrecaddr(recid);
			un->c.un_revision |= MD_64BIT_META_DEV;
			un->c.un_flag |= MD_EFILABEL;
			break;
		}
		MDDB_NOTE_FN(rbp->rb_revision, un->c.un_revision);

		/*
		 * Create minor node for snarfed entry.
		 */
		(void) md_create_minor_node(MD_MIN2SET(MD_SID(un)),
		    MD_SID(un));

		if (MD_UNIT(MD_SID(un)) != NULL) {
			/* unit is already in-core */
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);
			continue;
		}
		all_sp_gotten = 0;
		if (sp_build_incore((void *)un, 1) == 0) {
			mddb_setrecprivate(recid, MD_PRV_GOTIT);
			md_create_unit_incore(MD_SID(un), &sp_md_ops, 0);
			gotsomething = 1;
		}
	}

	if (!all_sp_gotten)
		return (gotsomething);
	/* double-check records */
	recid = mddb_makerecid(setno, 0);
	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0)
		if (!(mddb_getrecprivate(recid) & MD_PRV_GOTIT))
			mddb_setrecprivate(recid, MD_PRV_PENDDEL);

	return (0);
}

/*
 * FUNCTION: sp_halt()
 * INPUT: cmd - halt cmd.
 *        setno - set number.
 * RETURNS: 0 - success.
 *          1 - err.
 * PURPOSE: Perform driver halt operations. As with stripe, we
 *          support MD_HALT_CHECK and MD_HALT_DOIT. The first
 *          does a check to see if halting can be done safely
 *          (no open soft partitions), the second cleans up and
 *          shuts down the driver.
 */
static int
sp_halt(md_haltcmd_t cmd, set_t setno)
{
	int i;
	mdi_unit_t *ui;
	minor_t mnum;

	if (cmd == MD_HALT_CLOSE)
		return (0);

	if (cmd == MD_HALT_OPEN)
		return (0);

	if (cmd == MD_HALT_UNLOAD)
		return (0);

	if (cmd == MD_HALT_CHECK) {
		for (i = 0; i < md_nunits; i++) {
			mnum = MD_MKMIN(setno, i);
			if ((ui = MDI_UNIT(mnum)) == NULL)
				continue;
			if (ui->ui_opsindex != sp_md_ops.md_selfindex)
				continue;
			if (md_unit_isopen(ui))
				return (1);
		}
		return (0);
	}

	if (cmd != MD_HALT_DOIT)
		return (1);

	for (i = 0; i < md_nunits; i++) {
		mnum = MD_MKMIN(setno, i);
		if ((ui = MDI_UNIT(mnum)) == NULL)
			continue;
		if (ui->ui_opsindex != sp_md_ops.md_selfindex)
			continue;
		reset_sp((mp_unit_t *)MD_UNIT(mnum), mnum, 0);
	}

	return (0);
}

/*
 * FUNCTION: sp_open_dev()
 * INPUT: un - unit structure.
 *        oflags - open flags.
 * OUTPUT: none.
 * RETURNS: 0 - success.
 *          non-zero - err.
 * PURPOSE: open underlying device via md_layered_open.
 */
static int
sp_open_dev(mp_unit_t *un, int oflags)
{
	minor_t mnum = MD_SID(un);
	int err;
	md_dev64_t tmpdev;
	set_t setno = MD_MIN2SET(MD_SID(un));
	side_t side = mddb_getsidenum(setno);

	tmpdev = un->un_dev;
	/*
	 * Do the open by device id if the underlying device is regular
	 * (i.e. not another metadevice).
	 */
	if ((md_getmajor(tmpdev) != md_major) &&
	    md_devid_found(setno, side, un->un_key) == 1) {
		tmpdev = md_resolve_bydevid(mnum, tmpdev, un->un_key);
	}
	err = md_layered_open(mnum, &tmpdev, oflags);
	un->un_dev = tmpdev;

	if (err)
		return (ENXIO);

	return (0);
}

/*
 * FUNCTION: sp_open()
 * INPUT: dev - device to open.
 *        flag - pass-through flag.
 *        otyp - pass-through open type.
 *        cred_p - credentials.
 *        md_oflags - open flags.
 * OUTPUT: none.
 * RETURNS: 0 - success.
 *          non-zero - err.
 * PURPOSE: open a soft partition.
 */
/* ARGSUSED */
static int
sp_open(
	dev_t *dev,
	int flag,
	int otyp,
	cred_t *cred_p,
	int md_oflags
)
{
	minor_t mnum = getminor(*dev);
	mdi_unit_t *ui = MDI_UNIT(mnum);
	mp_unit_t *un;
	int err = 0;
	set_t setno;

	/*
	 * When doing an open of a multi owner metadevice, check to see if this
	 * node is a starting node and if a reconfig cycle is underway.
	 * If so, the system isn't sufficiently set up to handle the
	 * open (which involves I/O during sp_validate), so fail with ENXIO.
	 */
	setno = MD_MIN2SET(mnum);
	if ((md_set[setno].s_status & (MD_SET_MNSET | MD_SET_MN_START_RC)) ==
	    (MD_SET_MNSET | MD_SET_MN_START_RC)) {
		return (ENXIO);
	}

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);
	setno = MD_UN2SET(un);

	/* open underlying device, if necessary */
	if (! md_unit_isopen(ui) || (md_oflags & MD_OFLG_PROBEDEV)) {
		if ((err = sp_open_dev(un, md_oflags)) != 0)
			goto out;

		if (MD_MNSET_SETNO(setno)) {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV)) {
				/*
				 * Don't call sp_validate while
				 * unit_openclose lock is held. So, actually
				 * open the device, drop openclose lock,
				 * call sp_validate, reacquire openclose lock,
				 * and close the device. If sp_validate
				 * succeeds, then device will be re-opened.
				 */
				if ((err = md_unit_incopen(mnum, flag,
				    otyp)) != 0)
					goto out;

				mutex_enter(&ui->ui_mx);
				ui->ui_lock |= MD_UL_OPENINPROGRESS;
				mutex_exit(&ui->ui_mx);
				md_unit_openclose_exit(ui);
				if (otyp != OTYP_LYR)
					rw_exit(&md_unit_array_rw.lock);

				err = sp_validate(un);

				if (otyp != OTYP_LYR)
					rw_enter(&md_unit_array_rw.lock,
					    RW_READER);
				(void) md_unit_openclose_enter(ui);
				(void) md_unit_decopen(mnum, otyp);
				mutex_enter(&ui->ui_mx);
				ui->ui_lock &= ~MD_UL_OPENINPROGRESS;
				cv_broadcast(&ui->ui_cv);
				mutex_exit(&ui->ui_mx);
				/*
				 * Should be in the same state as before
				 * the sp_validate.
				 */
				if (err != 0) {
					/* close the device opened above */
					md_layered_close(un->un_dev,
					    md_oflags);
					err = EIO;
					goto out;
				}
			}
			/*
			 * As we're a multi-owner metadevice we need to ensure
			 * that all nodes have the same idea of the status.
			 * sp_validate() will mark the device as errored (if
			 * it cannot read the watermark) or ok (if it was
			 * previously errored but the watermark is now valid).
			 * This code-path is only entered on the non-probe open
			 * so we will maintain the errored state during a probe
			 * call. This means the sys-admin must metarecover -m
			 * to reset the soft-partition error.
			 */
		} else {
			/* For probe, don't incur the overhead of validate */
			if (!(md_oflags & MD_OFLG_PROBEDEV) &&
			    (err = sp_validate(un)) != 0) {
				/* close the device opened above */
				md_layered_close(un->un_dev, md_oflags);
				err = EIO;
				goto out;
			} else {
				/*
				 * we succeeded in validating the on disk
				 * format versus the in core, so reset the
				 * status if it's in error
				 */
				if (un->un_status == MD_SP_ERR) {
					un->un_status = MD_SP_OK;
				}
			}
		}
	}

	/* count open */
	if ((err = md_unit_incopen(mnum, flag, otyp)) != 0)
		goto out;

out:
	md_unit_openclose_exit(ui);
	return (err);
}

/*
 * FUNCTION: sp_close()
 * INPUT: dev - device to close.
 *        flag - pass-through flag.
 *        otyp - pass-through type.
 *        cred_p - credentials.
 *        md_cflags - close flags.
 * OUTPUT: none.
 * RETURNS: 0 - success.
 *          non-zero - err.
 * PURPOSE: close a soft partition.
 */
/* ARGSUSED */
static int
sp_close(
	dev_t dev,
	int flag,
	int otyp,
	cred_t *cred_p,
	int md_cflags
)
{
	minor_t mnum = getminor(dev);
	mdi_unit_t *ui = MDI_UNIT(mnum);
	mp_unit_t *un;
	int err = 0;

	/* grab necessary locks */
	un = (mp_unit_t *)md_unit_openclose_enter(ui);

	/* count closed */
	if ((err = md_unit_decopen(mnum, otyp)) != 0)
		goto out;

	/* close devices, if necessary */
	if (! md_unit_isopen(ui) || (md_cflags & MD_OFLG_PROBEDEV)) {
		md_layered_close(un->un_dev, md_cflags);
	}

	/*
	 * If this is a MN set and transient capabilities (e.g. ABR/DMR) are
	 * set, clear these capabilities if this is the last close in
	 * the cluster.
	 */
	if (MD_MNSET_SETNO(MD_UN2SET(un)) &&
	    (ui->ui_tstate & MD_ABR_CAP)) {
		md_unit_openclose_exit(ui);
		mdmn_clear_all_capabilities(mnum);
		return (0);
	}
	/* unlock, return success */
out:
	md_unit_openclose_exit(ui);
	return (err);
}


/* used in sp_dump routine */
static struct buf dumpbuf;

/*
 * FUNCTION: sp_dump()
 * INPUT: dev - device to dump to.
 *        addr - address to dump.
 *        blkno - blkno on device.
 *        nblk - number of blocks to dump.
 * OUTPUT: none.
 * RETURNS: result from bdev_dump.
 * PURPOSE: This routine dumps memory to the disk. It assumes that
 *          the memory has already been mapped into mainbus space.
 *          It is called at disk interrupt priority when the system
 *          is in trouble.
 * NOTE: this function is defined using 32-bit arguments,
 *       but soft partitioning is internally 64-bit. Arguments
 *       are cast where appropriate.
 */
static int
sp_dump(dev_t dev, caddr_t addr, daddr_t blkno, int nblk)
{
	mp_unit_t *un;
	buf_t *bp;
	sp_ext_length_t nb;
	daddr_t mapblk;
	int result;
	int more;
	int saveresult = 0;

	/*
	 * We don't need to grab the unit lock because nothing else is
	 * supposed to be happening; also, dump is not supposed to sleep.
	 */
	un = (mp_unit_t *)MD_UNIT(getminor(dev));

	if ((diskaddr_t)blkno >= un->c.un_total_blocks)
		return (EINVAL);

	if (((diskaddr_t)blkno + nblk) > un->c.un_total_blocks)
		return (EINVAL);

	bp = &dumpbuf;
	nb = (sp_ext_length_t)dbtob(nblk);
	do {
		bzero((caddr_t)bp, sizeof (*bp));
		more = sp_mapbuf(un, (sp_ext_offset_t)blkno, nb, bp);
		nblk = (int)(btodb(bp->b_bcount));
		mapblk = bp->b_blkno;
		result = bdev_dump(bp->b_edev, addr, mapblk, nblk);
		if (result)
			saveresult = result;

		nb -= bp->b_bcount;
		addr += bp->b_bcount;
		blkno += nblk;
	} while (more);

	return (saveresult);
}

static int
sp_imp_set(
	set_t setno
)
{
	mddb_recid_t recid;
	int gotsomething;
	mddb_type_t rec_type;
	mddb_de_ic_t *dep;
	mddb_rb32_t *rbp;
	mp_unit_t *un64;
	mp_unit32_od_t *un32;
	md_dev64_t self_devt;
	minor_t *self_id;	/* minor needs to be updated */
	md_parent_t *parent_id;	/* parent needs to be updated */
	mddb_recid_t *record_id; /* record id needs to be updated */

	gotsomething = 0;

	rec_type = (mddb_type_t)md_getshared_key(setno,
	    sp_md_ops.md_driver.md_drivername);
	recid = mddb_makerecid(setno, 0);

	while ((recid = mddb_getnextrec(recid, rec_type, 0)) > 0) {
		if (mddb_getrecprivate(recid) & MD_PRV_GOTIT)
			continue;

		dep = mddb_getrecdep(recid);
		rbp = dep->de_rb;

		switch (rbp->rb_revision) {
		case MDDB_REV_RB:
		case MDDB_REV_RBFN:
			/*
			 * Small device
			 */
			un32 = (mp_unit32_od_t *)mddb_getrecaddr(recid);
			self_id = &(un32->c.un_self_id);
			parent_id = &(un32->c.un_parent);
			record_id = &(un32->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un32->un_key))
				goto out;
			break;

		case MDDB_REV_RB64:
		case MDDB_REV_RB64FN:
			un64 = (mp_unit_t *)mddb_getrecaddr(recid);
			self_id = &(un64->c.un_self_id);
			parent_id = &(un64->c.un_parent);
			record_id = &(un64->c.un_record_id);

			if (!md_update_minor(setno, mddb_getsidenum
			    (setno), un64->un_key))
				goto out;
			break;
		}

		/*
		 * If this is a top level and a friendly name metadevice,
		 * update its minor in the namespace.
		 */
		if ((*parent_id == MD_NO_PARENT) &&
		    ((rbp->rb_revision == MDDB_REV_RBFN) ||
		    (rbp->rb_revision == MDDB_REV_RB64FN))) {

			self_devt = md_makedevice(md_major, *self_id);
			if (!md_update_top_device_minor(setno,
			    mddb_getsidenum(setno), self_devt))
				goto out;
		}

		/*
		 * Update unit with the imported setno
		 */
		mddb_setrecprivate(recid, MD_PRV_GOTIT);

		*self_id = MD_MKMIN(setno, MD_MIN2UNIT(*self_id));
		if (*parent_id != MD_NO_PARENT)
			*parent_id = MD_MKMIN(setno, MD_MIN2UNIT(*parent_id));
		*record_id = MAKERECID(setno, DBID(*record_id));

		gotsomething = 1;
	}

out:
	return (gotsomething);
}

static md_named_services_t sp_named_services[] = {
	{NULL, 0}
};

md_ops_t sp_md_ops = {
	sp_open,		/* open */
	sp_close,		/* close */
	md_sp_strategy,		/* strategy */
	NULL,			/* print */
	sp_dump,		/* dump */
	NULL,			/* read */
	NULL,			/* write */
	md_sp_ioctl,		/* ioctl */
	sp_snarf,		/* snarf */
	sp_halt,		/* halt */
	NULL,			/* aread */
	NULL,			/* awrite */
	sp_imp_set,		/* import set */
	sp_named_services
};

static void
init_init()
{
	sp_parent_cache = kmem_cache_create("md_softpart_parent",
	    sizeof (md_spps_t), 0, sp_parent_constructor,
	    sp_parent_destructor, sp_run_queue, NULL, NULL, 0);
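	/*
	 * The child cache entry is sized so that the embedded buf at the
	 * end of md_spcs_t occupies biosize() bytes rather than
	 * sizeof (buf_t), tracking the system's actual buf size.
	 */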
	sp_child_cache = kmem_cache_create("md_softpart_child",
	    sizeof (md_spcs_t) - sizeof (buf_t) + biosize(), 0,
	    sp_child_constructor, sp_child_destructor, sp_run_queue,
	    NULL, NULL, 0);
}

static void
fini_uninit()
{
	kmem_cache_destroy(sp_parent_cache);
	kmem_cache_destroy(sp_child_cache);
	sp_parent_cache = sp_child_cache = NULL;
}

/* define the module linkage */
MD_PLUGIN_MISC_MODULE("soft partition module", init_init(), fini_uninit())