xref: /titanic_41/usr/src/uts/common/sys/lvm/md_mddb.h (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef _SYS_MD_MDDB_H
28 #define	_SYS_MD_MDDB_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <sys/types.h>
33 #include <sys/buf.h>
34 
35 #ifdef	__cplusplus
36 extern "C" {
37 #endif
38 
39 #if 0 /* DRP FOR DEBUGGING */
40 #define	MDDB_FAKE
41 #endif
42 
43 /* Private flags */
44 #define	MD_PRV_GOTIT		0x0001	/* Been snarfed */
45 #define	MD_PRV_DELETE		0x0002	/* Record pending to be deleted */
46 #define	MD_PRV_COMMIT		0x0004	/* Record pending to be committed */
47 #define	MD_PRV_CLEANUP		0x0008	/* Record pending to be cleaned up */
48 #define	MD_PRV_CONVD		0x0010  /* Record has been converted (32->64) */
49 #define	MD_PRV_PENDDEL		(MD_PRV_GOTIT | MD_PRV_DELETE)	/* GOTIT + DELETE */
50 #define	MD_PRV_PENDCOM		(MD_PRV_GOTIT | MD_PRV_COMMIT)	/* GOTIT + COMMIT */
51 #define	MD_PRV_PENDCLEAN	(MD_PRV_GOTIT | MD_PRV_CLEANUP)	/* GOTIT + CLEANUP */
52 
53 
54 #define	MDDB_E_INVALID	(-1)	/* an invalid argument was passed */
55 #define	MDDB_E_EXISTS	(-2)	/* doing an operation a 2nd time which can */
56 				/*	only be done once */
57 #define	MDDB_E_MASTER	(-3)	/* problem occurred accessing master block */
58 				/*	returned from NEW_DEV	*/
59 #define	MDDB_E_TOOSMALL	(-4)	/* device is not large enough */
60 #define	MDDB_E_NORECORD	(-5)	/* record does not exist */
61 				/*
62 				 *	returned from:	mddb_getnextrec
63 				 *			mddb_getrecsize
64 				 *			mddb_commitrec
65 				 *			mddb_commitrecs
66 				 *			mddb_deleterec
67 				 */
68 #define	MDDB_E_NOSPACE	(-6)	/* no space to create record */
69 #define	MDDB_E_NOTNOW	(-7)	/* do not presently have enough resources */
70 				/*	to perform requested operation */
71 #define	MDDB_E_NODB	(-8)	/* no database exists */
72 #define	MDDB_E_NOTOWNER (-9)	/* have not been told to grab this set */
73 #define	MDDB_E_STALE	(-10)	/* database is stale */
74 #define	MDDB_E_TOOFEW	(-11)	/* not enough replicas available */
75 #define	MDDB_E_TAGDATA	(-12)	/* tagged data detected */
76 #define	MDDB_E_ACCOK	(-13)	/* 50/50 mode */
77 #define	MDDB_E_NTAGDATA	(-14)	/* tagop try, no tag data */
78 #define	MDDB_E_ACCNOTOK	(-15)	/* accop try, no accept possible */
79 #define	MDDB_E_NOLOCBLK	(-16)	/* No valid locators found */
80 #define	MDDB_E_NOLOCNMS	(-17)	/* No valid locator name information */
81 #define	MDDB_E_NODIRBLK	(-18)	/* No directory blocks found */
82 #define	MDDB_E_NOTAGREC	(-19)	/* No tag record blocks found */
83 #define	MDDB_E_NOTAG	(-20)	/* No matching tag record found */
84 #define	MDDB_E_NODEVID	(-21)	/* No device id found */
85 
86 #define	MDDB_MINBLKS		16	/* enough for a few metadevices */
87 #define	MDDB_MAXBLKS		8192	/* size of free bit map (must be / 8) */
88 #define	MDDB_MN_MINBLKS		32768	/* Multinode metadb minimum size */
89 					/* 16MB */
90 #define	MDDB_MN_MAXBLKS		524288	/* size of free bit map (must be / 8) */
91 					/* 256MB */
92 
93 #define	MDDB_C_STALE		0x0001
94 #define	MDDB_C_TOOFEW		0x0002
95 #define	MDDB_C_NOTOWNER		0x0004
96 #define	MDDB_C_SET_MN_STALE	0x0008	/* Set MN set to stale */
97 #define	MDDB_C_IMPORT		0x0010
98 
99 /*
100  * Defines used to set/reset new master flag in set structure.
101  * Used during reconfig cycle to determine quickly if there is
102  * a new master for the set.
103  */
104 #define	MDDB_NM_SET		0x0001
105 #define	MDDB_NM_RESET		0x0002
106 #define	MDDB_NM_GET		0x0004
107 
108 /* Definitions of flag in Locator Block Device ID data area - mddb_did_info */
109 #define	MDDB_DID_EXISTS		0x0001	/* Device ID exists */
110 #define	MDDB_DID_VALID		0x0002	/* Device ID valid on current system */
111 #define	MDDB_DID_UPDATED	0x0004  /* locator/sidelocator info updated */
112 
113 /* Definitions of flag in Locator Block - mddb_lb */
114 #define	MDDB_DEVID_STYLE	0x0001	/* Locator Block in Device ID format */
115 #define	MDDB_MNSET		0x0002  /* MDDB is for a multi-node set */
116 
117 
118 #define	MDDB_MAX_PATCH	25		/* number of locations that */
119 					/*	can be patched in etc/system */
120 
121 /*
122  * Set struct used by all parts of the driver, to store anchor pointers.
123  */
124 typedef struct md_set {
125 	uint_t		s_status;	/* set status */
126 	void		**s_ui;		/* set unit incore anchor */
127 	void		**s_un;		/* set unit anchor */
128 	void		*s_hsp;		/* set Hot Spare Pool anchor */
129 	void		*s_hs;		/* set Hot Spare anchor */
130 	void		*s_db;		/* set MDDB anchor */
131 	kmutex_t	s_dbmx;		/* set MDDB mutex */
132 	void		*s_nm;		/* set namespace anchor */
133 	mddb_recid_t	s_nmid;		/* set namespace anchor record */
134 	void		*s_did_nm;	/* set device id namespace anchor */
135 	mddb_recid_t	s_did_nmid;	/* set device id namespace anchor rec */
136 	void		*s_dtp;		/* set data tag rec */
137 	int		s_am_i_master;	/* incore master flag for this node */
138 	md_mn_nodeid_t	s_nodeid;	/* nodeid of this node - for MN sets */
139 	uint_t		s_rcnt;		/* incore resync count for set */
140 } md_set_t;
141 
142 
143 #define	MDDB_MAGIC_MB	0x6d646d62	/* magic number for master blocks */
144 #define	MDDB_MAGIC_DB	0x6d646462	/* magic number for directory blocks */
145 #define	MDDB_MAGIC_RB	0x6d647262	/* magic number for record blocks */
146 #define	MDDB_MAGIC_LB	0x6d646c62	/* magic number for locator blocks */
147 #define	MDDB_MAGIC_LN	0x6d646c6e	/* magic number for locator names */
148 #define	MDDB_MAGIC_DT	0x6d646474	/* magic number for data tag */
149 #define	MDDB_MAGIC_DI	0x6d646469	/* magic number for device ID block */
150 #define	MDDB_MAGIC_DU	0x6d646475	/* magic num for dummy mb */
151 #define	MDDB_MAGIC_DE	0x6d646465	/* magic num for mb devid */
152 
153 #define	MDDB_GLOBAL_XOR 1234567890
154 
/*
 * Masks that split a revision word (see the MDDB_REV_* values) into its
 * major part (high byte) and minor part (low byte).
 *
 * The replacement lists are fully parenthesized so that the cast cannot
 * bind to a neighbouring operator at the point of use (for example
 * "sizeof MDDB_REV_MAJOR" or a postfix operator).
 */
#define	MDDB_REV_MAJOR  ((uint_t)0xff00)
#define	MDDB_REV_MINOR  ((uint_t)0x00ff)
157 
158 /*
159  * MDDB_REV_MNMB:
160  * If a MN diskset, master block revision is set to MDDB_REV_MNMB.
161  * Even though the master block structure is no different
162  * for a MN set, setting the revision field to a different
163  * number keeps any pre-MN_diskset code from accessing
164  * this diskset.  It also allows for an early determination
165  * of a MN diskset when reading in from disk so that the
166  * proper size locator block and locator names structure
167  * can be read in thus saving time on diskset startup.
168  * Since no change in master block structure, the MDDB_REV_MINOR
169  * portion of the revision was incremented.
170  *
171  * MDDB_REV_MNLB:
172  * If a MN diskset, the locator block structure is a different size in
173  * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
174  * with any nodeid (sideno) allowed.
175  * The revision is set to MDDB_REV_MNLB which is a change of the
176  * MDDB_REV_MAJOR portion of the revision.
177  *
178  * MDDB_REV_MNLN:
179  * If a MN diskset, the locator names is a different size in
180  * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
181  * with any nodeid (sideno) allowed.
182  * The revision is set to MDDB_REV_MNLN which is a change of the
183  * MDDB_REV_MAJOR portion of the revision.
184  */
185 
/*
 * Revision numbers stored in the various block types:
 *	MB/MNMB	master block (traditional / multi-node set)
 *	DB	directory block
 *	LB/MNLB	locator block (traditional / multi-node set)
 *	LN/MNLN	locator names (traditional / multi-node set)
 *	RB/RB64	record block
 *	DT	data tag
 *	DI	device ID block
 * The high byte is the major part and the low byte the minor part
 * (MDDB_REV_MAJOR / MDDB_REV_MINOR).
 *
 * Each replacement list is fully parenthesized so the cast cannot bind to
 * a neighbouring operator where the macro is used.
 */
#define	MDDB_REV_MB	((uint_t)0x0201)
#define	MDDB_REV_MNMB	((uint_t)0x0202)
#define	MDDB_REV_DB	((uint_t)0x0201)
#define	MDDB_REV_LB	((uint_t)0x0500)
#define	MDDB_REV_MNLB	((uint_t)0x0600)
#define	MDDB_REV_LN	((uint_t)0x0100)
#define	MDDB_REV_MNLN	((uint_t)0x0300)
#define	MDDB_REV_RB	((uint_t)0x0200)
#define	MDDB_REV_RB64	((uint_t)0x0201)
#define	MDDB_REV_DT	((uint_t)0x0100)
#define	MDDB_REV_DI	((uint_t)0x0100)

/* Database block size in bytes (DEV_BSIZE as an unsigned value). */
#define	MDDB_BSIZE	((uint_t)DEV_BSIZE)
#define	MDDB_PREFIXCNT	10	/* entries in ln_prefixes[] */
#define	MDDB_DRVNMCNT   10	/* entries in lb_drvnm[] */
201 
202 typedef int	mddb_block_t;
203 
204 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
205 #pragma pack(4)
206 #endif
207 typedef struct md_mnname_suffix {
208 	md_name_suffix	mn_ln_suffix;
209 	uint_t		mn_ln_sideno;
210 } md_mnname_suffix_t;
211 
212 typedef	struct mddb_ln {	/* locator names block (traditional/local set) */
213 	int			ln_magic;	/* presumably MDDB_MAGIC_LN - confirm */
214 	uint_t			ln_revision;	/* presumably MDDB_REV_LN - confirm */
215 	uint_t			ln_checksum;
216 	struct timeval32	ln_timestamp;
217 	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
218 	/* Don't change array sizes without changing RNDUP_BLKCNT */
219 	md_name_suffix		ln_suffixes[MD_MAXSIDES][MDDB_NLB]; /* [side][n] */
220 } mddb_ln_t;
221 
222 /*
223  * Locator name structure for MN diskset.  Same as for traditional
224  * and local diskset except that more sides are supported and the
225  * side number can be any number since the side number is stored
226  * in the ln_mnsuffixes structure instead of being used as an index
227  * into that array.  This means that the whole array may need to be
228  * searched in order to find the correct information given a side number.
229  */
230 typedef	struct mddb_mnln {
231 	int			ln_magic;
232 	uint_t			ln_revision;
233 	uint_t			ln_checksum;
234 	struct timeval32	ln_timestamp;
235 	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
236 	/* Don't change array sizes without changing MDDB_MNLNCNT */
237 	md_mnname_suffix_t	ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB];
238 } mddb_mnln_t;
239 
/*
 * RNDUP_BLKCNT(sz, delta):
 *	Number of MDDB_BSIZE blocks needed to hold
 *	sz - delta * (MD_MAXSIDES - 1) * MDDB_NLB bytes, rounded up.
 *	With delta == 0 this is simply sz rounded up to whole blocks.
 *	With delta == sizeof (one per-side element), the space taken by all
 *	but one side's row of MDDB_NLB elements is left out of the count.
 *
 * MDDB_LNCNT		blocks for a full mddb_ln_t
 * MDDB_LOCAL_LNCNT	blocks for an mddb_ln_t holding a single side
 * MDDB_MNLNCNT		sizeof (mddb_mnln_t) rounded up to whole blocks
 */
240 #define	RNDUP_BLKCNT(sz, delta)	(((sz) - \
241 				    ((delta) * \
242 				    ((MD_MAXSIDES  - 1) * MDDB_NLB)) + \
243 				    MDDB_BSIZE - 1) / MDDB_BSIZE)
244 #define	MDDB_LNCNT		RNDUP_BLKCNT(sizeof (mddb_ln_t), 0)
245 #define	MDDB_LOCAL_LNCNT	RNDUP_BLKCNT(sizeof (mddb_ln_t), \
246 				    sizeof (md_name_suffix))
247 
248 #define	MDDB_MNLNCNT		((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \
249 				    / MDDB_BSIZE)
250 
251 typedef struct mddb_dt {
252 	uint_t		dt_mag;
253 	uint_t		dt_rev;
254 	uint_t		dt_cks;
255 	mddb_dtag_t	dt_dtag;
256 } mddb_dt_t;
257 
258 #define	MDDB_DT_BYTES	(roundup(sizeof (mddb_dt_t), MDDB_BSIZE))
259 #define	MDDB_DT_BLOCKS	(btodb(MDDB_DT_BYTES))
260 
261 typedef union identifier {
262 	char			serial[MDDB_SN_LEN];
263 	struct timeval32	createtime;
264 } identifier_t;
265 
266 typedef struct mddb_locator {
267 	dev32_t		l_dev;
268 	daddr32_t	l_blkno;
269 	int		l_flags;
270 } mddb_locator_t;
271 
272 typedef struct mddb_sidelocator {
273 	uchar_t		l_drvnm_index;
274 	minor_t		l_mnum;
275 } mddb_sidelocator_t;
276 
277 typedef struct mddb_mnsidelocator {
278 	uchar_t		mnl_drvnm_index;
279 	minor_t		mnl_mnum;
280 	uint_t		mnl_sideno;
281 } mddb_mnsidelocator_t;
282 
283 typedef struct mddb_drvnm {
284 	uchar_t		dn_len;
285 	char		dn_data[MD_MAXDRVNM];
286 } mddb_drvnm_t;
287 
288 /*
289  * Locator Block Device ID Information
290  * Several device id's may share one disk block in an effort to
291  * conserve used replica space.
292  */
293 typedef struct mddb_did_info {
294 	uint_t		info_flags;	/* MDDB Device ID flags */
295 	uint_t		info_firstblk;	/* Device ID Start Block */
296 	uint_t		info_blkcnt;	/* Device ID Block Count */
297 	uint_t		info_offset;	/* Device ID offset w/i Block */
298 	uint_t		info_length;	/* Device ID Length */
299 	uint_t		info_checksum;	/* Device ID Checksum */
300 	char		info_minor_name[32]; /* Minor name of lb dev */
301 } mddb_did_info_t;
302 
303 typedef struct mddb_did_blk {
304 	int		blk_magic;	/* used for verification */
305 	uint_t		blk_revision;	/* used for verification */
306 	int		blk_checksum;	/* used for verification */
307 	uint_t		blk_commitcnt;	/* matches LB's commitcnt */
308 	mddb_did_info_t	blk_info[MDDB_NLB];
309 } mddb_did_blk_t;
310 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
311 #pragma pack()
312 #endif
313 
314 #define	MDDB_DID_BYTES	(roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE))
315 #define	MDDB_DID_BLOCKS	(btodb(MDDB_DID_BYTES))
316 
317 /*
318  * Device ID Disk Blocks.
319  * Incore linked list of disk blocks containing device IDs.
320  * The list is built when reading in the mddb_did_blk structure and
321  * when reading in the actual disk blocks containing device ids.
322  * This list is used to easily write out all disk blocks containing
323  * device ids.
324  */
325 typedef struct mddb_did_db {
326 	uint_t		db_firstblk;	/* Disk Block's logical addr */
327 	uint_t		db_blkcnt;	/* Contig Disk Block Count */
328 	caddr_t		db_ptr;		/* Ptr to incore Block(s) */
329 	struct mddb_did_db	*db_next;	/* Ptr to next in list */
330 } mddb_did_db_t;
331 
332 /*
333  * Device ID Free List.
334  * Incore linked list of free space in disk blocks containing device IDs.
335  * Used to manage placement of device IDs in disk blocks.
336  * All disk blocks on free list are also in linked list of disk block
337  * containing device IDs (mddb_did_db_t).
338  */
339 typedef struct mddb_did_free {
340 	uint_t			free_blk;	/* Disk Block's logical addr */
341 	uint_t			free_offset;	/* offset of free space */
342 	uint_t			free_length;	/* length of free space */
343 	struct mddb_did_free	*free_next;	/* Ptr to next in list */
344 } mddb_did_free_t;
345 
346 /*
347  * Device ID Incore Area
348  *    Contains pointer to Device ID Disk Block list and
349  *         Device ID Free List.
350  *    Also contains incore array of pointers to device IDs.  Pointers
351  *    point into the device ID Disk Block list and are used as a
352  *    shortcut to find incore device IDs.
353  */
354 typedef struct mddb_did_ic {
355 	mddb_did_blk_t	*did_ic_blkp;
356 	mddb_did_db_t	*did_ic_dbp;
357 	mddb_did_free_t	*did_ic_freep;
358 	ddi_devid_t	did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */
359 } mddb_did_ic_t;
360 
361 /*
362  * Locator Block (LB):
363  *	- Are fixed size, but the size is different
364  *		for local/shared set db replicas.
365  *	- All LB's start at logical block 0.
366  * 	- After a replica quorum is found, there is
367  *	  only one incore copy of the LB.
368  *	- LB's are only written when replicas are added, deleted, or errored.
369  *	- LB's provide information about other replica's and their state.
370  */
371 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
372 #pragma pack(4)
373 #endif
374 typedef struct mddb_lb {
375 	int			lb_magic;	/* used for verification */
376 	uint_t			lb_revision;	/* used for verification */
377 	int			lb_checksum;	/* used for verification */
378 	uint_t			lb_commitcnt;	/* IMPORTANT */
379 	struct timeval32	lb_timestamp;	/* informative only */
380 	int			lb_loccnt;	/* used for verification */
381 	identifier_t		lb_ident;	/* used for verification */
382 	uint_t			lb_flags;	/* flags describing LB */
383 	uint_t			lb_spare[8];	/* Spare/Pad */
384 	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
385 	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
386 	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
387 	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
388 	struct timeval32	lb_inittime;	/* creation of database */
389 	set_t			lb_setno;	/* used for verification */
390 	mddb_block_t		lb_blkcnt;	/* used for verification */
391 	mddb_block_t		lb_lnfirstblk;
392 	mddb_block_t		lb_lnblkcnt;
393 	mddb_block_t		lb_dbfirstblk;
394 	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
395 	mddb_locator_t		lb_locators[MDDB_NLB];
396 	/* Don't change array sizes without changing RNDUP_BLKCNT */
397 	mddb_sidelocator_t	lb_sidelocators[MD_MAXSIDES][MDDB_NLB];
398 } mddb_lb_t;
399 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
400 #pragma pack()
401 #endif
402 
403 /*
404  * Locator block structure for MN diskset.  Same as for traditional
405  * and local diskset except that more sides are supported and the
406  * side number can be any number since the side number is stored
407  * in the lb_mnsidelocators structure instead of being used as an index
408  * into that array.  This means that the whole array may need to be
409  * searched in order to find the correct information given a side number.
410  */
411 typedef struct mddb_mnlb {
412 	int			lb_magic;	/* used for verification */
413 	uint_t			lb_revision;	/* used for verification */
414 	int			lb_checksum;	/* used for verification */
415 	uint_t			lb_commitcnt;	/* IMPORTANT */
416 	struct timeval32	lb_timestamp;	/* informative only */
417 	int			lb_loccnt;	/* used for verification */
418 	identifier_t		lb_ident;	/* used for verification */
419 	uint_t			lb_flags;	/* flags describing LB */
420 	uint_t			lb_spare[8];	/* Spare/Pad */
421 	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
422 	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
423 	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
424 	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
425 	struct timeval32	lb_inittime;	/* creation of database */
426 	set_t			lb_setno;	/* used for verification */
427 	mddb_block_t		lb_blkcnt;	/* used for verification */
428 	mddb_block_t		lb_lnfirstblk;
429 	mddb_block_t		lb_lnblkcnt;
430 	mddb_block_t		lb_dbfirstblk;
431 	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
432 	mddb_locator_t		lb_locators[MDDB_NLB];
433 	/* Don't change array sizes without changing MDDB_MNLBCNT */
434 	mddb_mnsidelocator_t	lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB];
435 } mddb_mnlb_t;
436 
437 
/*
 * Locator block sizes, in MDDB_BSIZE blocks:
 *	MDDB_LBCNT		a full mddb_lb_t
 *	MDDB_LOCAL_LBCNT	an mddb_lb_t with the sidelocator rows for all
 *				but one side left out (see RNDUP_BLKCNT)
 *	MDDB_MNLBCNT		sizeof (mddb_mnlb_t) rounded up to whole blocks
 */
438 #define	MDDB_LBCNT		RNDUP_BLKCNT(sizeof (mddb_lb_t), 0)
439 #define	MDDB_LOCAL_LBCNT	RNDUP_BLKCNT(sizeof (mddb_lb_t), \
440 				    sizeof (mddb_sidelocator_t))
441 
442 #define	MDDB_MNLBCNT		((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \
443 				    / MDDB_BSIZE)
444 
445 typedef struct mddb_map {
446 	daddr32_t		m_consecutive;
447 	daddr32_t		m_firstblk;
448 } mddb_map_t;
449 
450 /*
451  * Master block(s) (MB)
452  * 	- Are written by userland; Never by the driver!
453  *	- Each replica has their own master blocks,
454  *		the master block(s) are not shared.
455  *	- MB's are not in the logical block address space of the database.
456  *	- MB's are a fixed size record (MDDB_BSIZE)
457  *	- MB's provide the logical to physical block translation,
458  *		for their replica.
459  */
460 typedef	struct mddb_mb {
461 	int			mb_magic;	/* used for verification */
462 	uint_t			mb_revision;	/* used for verification */
463 	uint_t			mb_checksum;	/* used for verification */
464 #ifdef _LP64
465 	uint32_t		mb_next;	/* incore to next mb */
466 #else
467 	struct mddb_mb		*mb_next;	/* incore to next mb */
468 #endif	/* _LP64 */
469 	daddr32_t		mb_nextblk;	/* block # for next mb */
470 	md_timeval32_t		mb_timestamp;	/* timestamp */
471 	daddr32_t		mb_blkcnt;	/* size of blkmap */
472 	daddr32_t		mb_blkno;	/* physical loc. for this MB */
473 	set_t			mb_setno;	/* used for verification */
474 	struct timeval32	mb_setcreatetime; /* set creation timestamp */
475 	int			spares[7];
476 	mddb_map_t		mb_blkmap;	/* logical->physical blk map */
477 	int			mb_devid_magic;	/* verify devid in mb */
478 	short			mb_devid_len;	/* len of following devid */
479 	char			mb_devid[1];	/* devid byte array */
480 } mddb_mb_t;
481 
482 /*
483  * In-core version of mddb_mb. It is known that the mddb_mb is 512 bytes on
484  * disk, really, and so this structure is 512 + sizeof(struct mddb_mb_ic *)
485  */
486 #define	MDDB_IC_BSIZE	(MDDB_BSIZE + sizeof (struct mddb_mb_ic *))
487 typedef struct mddb_mb_ic {
488 	struct mddb_mb_ic 	*mbi_next;
489 	struct mddb_mb		mbi_mddb_mb;
490 } mddb_mb_ic_t;
491 
492 
493 /*
494  * there can be no address in record block. The checksum must
495  * stay the same where ever the record is in memory. Many
496  * things depend on this. Also the timestamp is the time the
497  * record was committed not the time it was written to a particular
498  * device.
499  *
500  * Old definition of mddb_rb, for 32-bit apps and libraries
501  */
502 typedef struct mddb_rb {
503 	uint_t			rb_magic;
504 	uint_t			rb_revision;
505 	uint_t			rb_checksum;
506 	uint_t			rb_checksum_fiddle;
507 	uint_t			rb_private;
508 	void			*rb_userdata;
509 	uint_t			rb_commitcnt;
510 	uint_t			rb_spare[1];
511 	struct timeval32	rb_timestamp;
512 	int			rb_data[1];
513 } mddb_rb_t;
514 
515 /* This is, and always will be, the on-disk version of mddb_rb */
516 typedef struct mddb_rb32 {
517 	uint_t			rb_magic;
518 	uint_t			rb_revision;
519 	uint_t			rb_checksum;
520 	uint_t			rb_checksum_fiddle;
521 	uint_t			rb_private;
522 	uint32_t		rb_userdata;
523 	uint_t			rb_commitcnt;
524 	uint_t			rb_spare[1];
525 	struct timeval32	rb_timestamp;
526 	int			rb_data[1];
527 } mddb_rb32_t;
528 
529 /*
530  * directory entries
531  */
532 typedef struct mddb_optinfo {
533 	int		o_li;
534 	int		o_flags;
535 } mddb_optinfo_t;
536 
537 /* Old definition of mddb_de, for 32-bit apps and libraries */
538 typedef struct mddb_de {
539 	struct mddb_de	*de_next;
540 	mddb_rb_t	*de_rb;
541 	mddb_recid_t	de_recid;
542 	mddb_type_t	de_type1;
543 	uint_t		de_type2;
544 	uint_t		de_reqsize;
545 	uint_t		de_recsize;
546 	mddb_block_t	de_blkcount;
547 	uint_t		de_flags;
548 	mddb_optinfo_t	de_optinfo[2];
549 	mddb_block_t	de_blks[1];
550 } mddb_de_t;
551 
552 /*
553  * In core version of mddb_de, includes pointer for mddb_rb32_t user data
554  * mddb_rb32_t is used incore
555  */
556 typedef struct mddb_de_ic {
557 	void			*de_rb_userdata;
558 	void			*de_rb_userdata_ic;
559 	uint_t			de_owner_nodeid;
560 	struct mddb_de_ic	*de_next;
561 	mddb_rb32_t		*de_rb;
562 	mddb_recid_t		de_recid;
563 	mddb_type_t		de_type1;
564 	uint_t			de_type2;
565 	size_t			de_reqsize;
566 	size_t			de_icreqsize;
567 	size_t			de_recsize;
568 	uint_t			de_blkcount;
569 	uint_t			de_flags;
570 	mddb_optinfo_t		de_optinfo[2];
571 	mddb_block_t		de_blks[1];
572 } mddb_de_ic_t;
573 
574 typedef struct mddb_db {	/* directory block; on-disk form is mddb_db32_t */
575 	uint_t			db_magic;	/* presumably MDDB_MAGIC_DB - confirm */
576 	uint_t			db_revision;
577 	uint_t			db_checksum;
578 	mddb_block_t		db_blknum;
579 	struct mddb_db		*db_next;	/* a uint32_t in mddb_db32_t */
580 	mddb_block_t		db_nextblk;
581 	struct timeval32	db_timestamp;
582 	uint_t			db_recsum;
583 #ifdef _KERNEL
584 	mddb_de_ic_t		*db_firstentry;	/* kernel: incore entries */
585 #else
586 	mddb_de_t		*db_firstentry;	/* non-kernel: old mddb_de */
587 #endif
588 } mddb_db_t;
589 
590 /*
591  * This is, and always will be, the on-disk version of mddb_de
592  * When mddb_de32 is read in it is converted into mddb_de_ic
593  */
594 typedef struct mddb_de32 {
595 	uint32_t	de32_next;
596 	uint32_t	de32_rb;
597 	mddb_recid_t	de32_recid;
598 	mddb_type_t	de32_type1;
599 	uint_t		de32_type2;
600 	uint_t		de32_reqsize;
601 	uint_t		de32_recsize;
602 	mddb_block_t	de32_blkcount;
603 	uint_t		de32_flags;
604 	mddb_optinfo_t	de32_optinfo[2];
605 	mddb_block_t	de32_blks[1];
606 } mddb_de32_t;
607 
608 /*
609  * This is, and always will be, the on-disk version of mddb_db
610  * When mddb_db32 is read in it is converted into mddb_db
611  * To minimize impact on mddb format mddb_db fields remain intact
612  */
613 typedef struct mddb_db32 {
614 	uint_t			db32_magic;
615 	uint_t			db32_revision;
616 	uint_t			db32_checksum;
617 	mddb_block_t		db32_blknum;
618 	uint32_t		db32_next;
619 	mddb_block_t		db32_nextblk;
620 	struct timeval32	db32_timestamp;
621 	uint_t			db32_recsum;
622 	uint32_t		db32_firstentry;
623 } mddb_db32_t;
624 
/*
 * de32tode(from, to)
 *	Fill the incore directory entry *to (mddb_de_ic_t) from the on-disk
 *	directory entry *from (mddb_de32_t).
 *
 *	- de_rb_userdata is reset to NULL and de_owner_nodeid to
 *	  MD_MN_INVALID_NID; de_rb_userdata_ic and de_icreqsize are not
 *	  touched.
 *	- The 32-bit de32_next/de32_rb values are widened into pointers.
 *	- de32_blkcount entries of de32_blks[] are copied, so *to must have
 *	  room for that many de_blks[] entries.
 *
 *	Both arguments are evaluated more than once and must be free of side
 *	effects.  The body is a single do/while (0) statement, so the macro
 *	is used like a function call, terminated by the caller's semicolon,
 *	and is safe in an unbraced if/else.  The loop index has a
 *	macro-specific name so it cannot capture an "i" used in an argument.
 */
#define	de32tode(from, to) \
	do { \
		int de32tode_i; \
		(to)->de_rb_userdata = NULL; \
		(to)->de_owner_nodeid = MD_MN_INVALID_NID; \
		(to)->de_next = \
		    (struct mddb_de_ic *)(uintptr_t)(from)->de32_next; \
		(to)->de_rb = (mddb_rb32_t *)(uintptr_t)(from)->de32_rb; \
		(to)->de_recid = (from)->de32_recid; \
		(to)->de_type1 = (from)->de32_type1; \
		(to)->de_type2 = (from)->de32_type2; \
		(to)->de_reqsize = (from)->de32_reqsize; \
		(to)->de_recsize = (from)->de32_recsize; \
		(to)->de_blkcount = (from)->de32_blkcount; \
		(to)->de_flags = (from)->de32_flags; \
		(to)->de_optinfo[0] = (from)->de32_optinfo[0]; \
		(to)->de_optinfo[1] = (from)->de32_optinfo[1]; \
		for (de32tode_i = 0; \
		    de32tode_i < (from)->de32_blkcount; de32tode_i++) \
			(to)->de_blks[de32tode_i] = \
			    (from)->de32_blks[de32tode_i]; \
	} while (0)
644 
/*
 * detode32(from, to)
 *	Fill the on-disk directory entry *to (mddb_de32_t) from the directory
 *	entry *from.  This is the inverse of de32tode().
 *
 *	- de_next/de_rb are narrowed to 32 bits.
 *	- de_blkcount entries of de_blks[] are copied, so *to must have room
 *	  for that many de32_blks[] entries.
 *
 *	Both arguments are evaluated more than once and must be free of side
 *	effects.  The body is a single do/while (0) statement, so the macro
 *	is used like a function call, terminated by the caller's semicolon,
 *	and is safe in an unbraced if/else.  The loop index has a
 *	macro-specific name so it cannot capture an "i" used in an argument.
 */
#define	detode32(from, to) \
	do { \
		int detode32_i; \
		(to)->de32_next = (uint32_t)(uintptr_t)(from)->de_next; \
		(to)->de32_rb = (uint32_t)(uintptr_t)(from)->de_rb; \
		(to)->de32_recid = (from)->de_recid; \
		(to)->de32_type1 = (from)->de_type1; \
		(to)->de32_type2 = (from)->de_type2; \
		(to)->de32_reqsize = (from)->de_reqsize; \
		(to)->de32_recsize = (from)->de_recsize; \
		(to)->de32_blkcount = (from)->de_blkcount; \
		(to)->de32_flags = (from)->de_flags; \
		(to)->de32_optinfo[0] = (from)->de_optinfo[0]; \
		(to)->de32_optinfo[1] = (from)->de_optinfo[1]; \
		for (detode32_i = 0; \
		    detode32_i < (from)->de_blkcount; detode32_i++) \
			(to)->de32_blks[detode32_i] = \
			    (from)->de_blks[detode32_i]; \
	} while (0)
662 
/*
 * db32todb(from, to)
 *	Fill the incore directory block *to (mddb_db_t) from the on-disk
 *	directory block *from (mddb_db32_t).  The 32-bit db32_next and
 *	db32_firstentry values are widened into pointers.
 *
 *	db_firstentry is assigned through (void *) because its declared type
 *	differs between _KERNEL (mddb_de_ic_t *) and non-kernel
 *	(mddb_de_t *) builds.
 *
 *	Both arguments are evaluated more than once and must be free of side
 *	effects.  The assignments are wrapped in do/while (0), so the macro
 *	behaves as one statement (safe under an unbraced if/else) and is
 *	terminated by the caller's semicolon.
 */
#define	db32todb(from, to) \
	do { \
		(to)->db_magic = (from)->db32_magic; \
		(to)->db_revision = (from)->db32_revision; \
		(to)->db_checksum = (from)->db32_checksum; \
		(to)->db_blknum = (from)->db32_blknum; \
		(to)->db_next = \
		    (struct mddb_db *)(uintptr_t)(from)->db32_next; \
		(to)->db_nextblk = (from)->db32_nextblk; \
		(to)->db_timestamp = (from)->db32_timestamp; \
		(to)->db_recsum = (from)->db32_recsum; \
		(to)->db_firstentry = \
		    (void *)(uintptr_t)(from)->db32_firstentry; \
	} while (0)
673 
/*
 * dbtodb32(from, to)
 *	Fill the on-disk directory block *to (mddb_db32_t) from the incore
 *	directory block *from (mddb_db_t).  This is the inverse of
 *	db32todb(); the db_next and db_firstentry pointers are narrowed to
 *	32 bits.
 *
 *	Both arguments are evaluated more than once and must be free of side
 *	effects.  The assignments are wrapped in do/while (0), so the macro
 *	behaves as one statement (safe under an unbraced if/else) and is
 *	terminated by the caller's semicolon.
 */
#define	dbtodb32(from, to) \
	do { \
		(to)->db32_magic = (from)->db_magic; \
		(to)->db32_revision = (from)->db_revision; \
		(to)->db32_checksum = (from)->db_checksum; \
		(to)->db32_blknum = (from)->db_blknum; \
		(to)->db32_next = (uint32_t)(uintptr_t)(from)->db_next; \
		(to)->db32_nextblk = (from)->db_nextblk; \
		(to)->db32_timestamp = (from)->db_timestamp; \
		(to)->db32_recsum = (from)->db_recsum; \
		(to)->db32_firstentry = \
		    (uint32_t)(uintptr_t)(from)->db_firstentry; \
	} while (0)
684 
685 /*
686  * information about a replica of the data base
687  */
688 typedef struct mddb_ri {
689 	struct mddb_ri		*ri_next;
690 	uint_t			ri_flags;
691 	uint_t			ri_commitcnt;
692 	int			ri_transplant;
693 	md_dev64_t		ri_dev;
694 	daddr32_t		ri_blkno;
695 	char			ri_driver[16];
696 	mddb_mb_ic_t		*ri_mbip;
697 	mddb_lb_t		*ri_lbp;
698 	mddb_dt_t		*ri_dtp;
699 	mddb_did_ic_t		*ri_did_icp;
700 	ddi_devid_t		ri_devid;
701 	ddi_devid_t		ri_old_devid;
702 	char			ri_minor_name[MDDB_MINOR_NAME_MAX];
703 	char			ri_devname[MAXPATHLEN];
704 } mddb_ri_t;
705 
706 typedef struct mddb_bf {
707 	struct mddb_bf	*bf_next;
708 	mddb_locator_t	*bf_locator;
709 	buf_t		bf_buf;
710 } mddb_bf_t;
711 
712 /*
713  * Information for sets of databases (which include replicas)
714  */
715 #define	MDDB_BITSRECID	31
716 #define	MDDB_SETSHIFT	(MDDB_BITSRECID - MD_BITSSET)
717 #define	MDDB_SETMASK	(MD_SETMASK << MDDB_SETSHIFT)
718 #define	MDDB_RECIDMASK	((1 << MDDB_SETSHIFT) - 1)
719 
720 #define	DBSET(id)	(((id) & MDDB_SETMASK) >> MDDB_SETSHIFT)
721 #define	DBID(id)	((id) & MDDB_RECIDMASK)
722 #define	MAKERECID(s, i)	((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \
723 			((i) & MDDB_RECIDMASK))
724 
725 #define	MDDB_PARSE_LOCBLK	0x00000001
726 #define	MDDB_PARSE_LOCNM	0x00000002
727 #define	MDDB_PARSE_OPTRECS	0x00000004
728 #define	MDDB_PARSE_MASK		0x0000000F
729 
730 
731 #define	MDDB_BLOCK_PARSE	0x00000001	/* Block sending parse msgs */
732 #define	MDDB_UNBLOCK_PARSE	0x00000002	/* Unblock sending parse msgs */
733 
734 /*
735  * We need to keep s_ident and s_inittime 32 bit.  They are used in mddb_lb
736  */
737 typedef struct mddb_set {
738 	uint_t		s_setno;		/* set number */
739 	uint_t		s_sideno;		/* side number */
740 	identifier_t	s_ident;		/* set identifier */
741 	char		*s_setname;		/* set name */
742 	mddb_mb_ic_t	**s_mbiarray;		/* master blocks array */
743 	mddb_db_t	*s_dbp;			/* directory block */
744 	mddb_lb_t	*s_lbp;			/* locator block */
745 						/* May be cast to mddb_mnlb_t */
746 						/* if accessing sidenames in */
747 						/* MN diskset */
748 	mddb_ln_t	*s_lnp;			/* locator names block */
749 						/* May be cast to mddb_mnln_t */
750 						/* if accessing sidenames in */
751 						/* MN diskset */
752 	mddb_dtag_lst_t	*s_dtlp;		/* List of data tags found */
753 	mddb_did_ic_t	*s_did_icp;		/* Device ID incore area */
754 	mddb_ri_t	*s_rip;			/* replicas incore list */
755 	int		s_freeblkcnt;		/* visible for test code */
756 	int		s_totalblkcnt;		/* visible for test code */
757 	int		s_mn_parseflags;	/* mddb parse flags for MNset */
758 	int		s_mn_parseflags_sending; /* parse flgs sent to slaves */
759 	uchar_t		*s_freebitmap;		/* free blocks bitmap */
760 	uint_t		s_freebitmapsize;	/* size of bitmap */
761 	struct timeval32	s_inittime;	/* timestamp set created */
762 	mddb_recid_t	s_zombie;		/* zombie record - createrec */
763 	int		s_staledeletes;		/* number of stale deleterec */
764 	int		s_optcmtcnt;		/* Following are opt. record */
765 	int		s_opthavelck;		/*   bookkeeping records ... */
766 	int		s_optwantlck;
767 	kcondvar_t	s_optwantlck_cv;
768 	int		s_optwaiterr;
769 	int		s_opthungerr;
770 	kcondvar_t	s_opthungerr_cv;
771 	int		s_opthavequeuinglck;
772 	int		s_optwantqueuinglck;
773 	kcondvar_t	s_optqueuing_cv;
774 	ulong_t		s_bufmisses;
775 	mddb_bf_t	*s_freebufhead;
776 	int		s_bufwakeup;
777 	kcondvar_t	s_buf_cv;
778 	size_t		s_databuffer_size;
779 	void		*s_databuffer;
780 	int		s_singlelockgotten;
781 	int		s_singlelockwanted;
782 	kcondvar_t	s_single_thread_cv;
783 	md_hi_arr_t	s_med;
784 } mddb_set_t;
785 
786 #ifndef MDDB_FAKE
787 #ifdef _KERNEL
788 /* md_mddb.c */
789 extern uint_t			mddb_lb_did_convert(mddb_set_t *,
790 				    uint_t, uint_t *);
791 extern void			mddb_locatorblock2splitname(mddb_ln_t *,
792 				    int, side_t, md_splitname *);
793 extern int			mddb_configure(mddb_cfgcmd_t,
794 				    struct mddb_config *);
795 extern mddb_recid_t		mddb_getnextrec(mddb_recid_t,
796 				    mddb_type_t, uint_t);
797 extern int			mddb_getoptloc(mddb_optloc_t *);
798 extern void			*mddb_getrecaddr(mddb_recid_t);
799 extern void			*mddb_getrecaddr_resize(mddb_recid_t, size_t,
800 				    off_t);
801 extern int			mddb_getrecprivate(mddb_recid_t);
802 extern void			mddb_setrecprivate(mddb_recid_t, uint_t);
803 extern mddb_de_ic_t		*mddb_getrecdep(mddb_recid_t);
804 extern mddb_type_t		mddb_getrectype1(mddb_recid_t);
805 extern int			mddb_getrectype2(mddb_recid_t);
806 extern int			mddb_getrecsize(mddb_recid_t);
807 extern int			mddb_commitrec(mddb_recid_t);
808 extern int			mddb_commitrecs(mddb_recid_t *);
809 extern int			mddb_deleterec(mddb_recid_t);
810 extern mddb_recstatus_t		mddb_getrecstatus(mddb_recid_t);
811 extern mddb_recid_t		mddb_createrec(size_t usersize,
812 				    mddb_type_t type, uint_t type2,
813 				    md_create_rec_option_t option, set_t setno);
814 extern void			mddb_init(void);
815 extern void			mddb_unload(void);
816 extern void			mddb_unload_set(set_t setno);
817 extern mddb_recid_t		mddb_makerecid(set_t setno, mddb_recid_t id);
818 extern set_t			mddb_getsetnum(mddb_recid_t id);
819 extern char			*mddb_getsetname(set_t setno);
820 extern side_t			mddb_getsidenum(set_t setno);
821 extern int			mddb_ownset(set_t setno);
822 extern int			getmed_ioctl(mddb_med_parm_t *medpp, int mode);
823 extern int			setmed_ioctl(mddb_med_parm_t *medpp, int mode);
824 extern int			updmed_ioctl(mddb_med_upd_parm_t *medpp,
825 				    int mode);
826 extern int			take_set(mddb_config_t *cp, int mode);
827 extern int			release_set(mddb_config_t *cp, int mode);
828 extern int			gettag_ioctl(mddb_dtag_get_parm_t *dtgpp,
829 				    int mode);
830 extern int			usetag_ioctl(mddb_dtag_use_parm_t *dtupp,
831 				    int mode);
832 extern int			accept_ioctl(mddb_accept_parm_t *medpp,
833 				    int mode);
834 extern int			md_update_locator_namespace(set_t setno,
835 				    side_t side, char *dname, char *pname,
836 				    md_dev64_t devt);
837 extern int			mddb_validate_lb(set_t setno, int *rmaxsz);
838 extern int			mddb_getinvlb_devid(set_t setno, int count,
839 				    int size, char **ctdptr);
840 extern int			md_update_minor(set_t, side_t, mdkey_t);
841 #ifdef DEBUG
842 extern void			mddb_check(void);
843 #endif /* DEBUG */
844 #endif /* _KERNEL */
845 
846 #else
847 
848 caddr_t mddb_fakeit;
849 
/*
 * MDDB_FAKE stand-ins for the md_mddb.c entry points.  Each one discards
 * its arguments and yields a fixed value; mddb_getrecaddr() hands back the
 * buffer most recently allocated by mddb_createrec() (held in mddb_fakeit).
 *
 * mddb_lb_did_convert matches the name of the real function; the older,
 * misspelled md_lb_did_convert is kept so existing users still build.
 *
 * mddb_createrec() accepts both the historical three-argument form and the
 * five-argument form of the real prototype; only the size is used.  The
 * pointer is narrowed through uintptr_t before masking, so the conversion
 * to int is well defined on LP64 as well.
 */
#define	mddb_lb_did_convert(a, b, c)	(0)
#define	md_lb_did_convert(a, b, c)	(0)
#define	mddb_configure(a, b)	(0)
#define	mddb_getnextrec(a, b, c)		((mddb_recid_t)0)
#define	mddb_getrecaddr(a)	(mddb_fakeit)
#define	mddb_getrecprivate(a)	(0)
#define	mddb_setrecprivate(a, b) (0)
#define	mddb_getrectype1(a)	(0)
#define	mddb_getrectype2(a)	(0)
#define	mddb_getrecsize(a)	(0)
#define	mddb_commitrec(a)	(0)
#define	mddb_commitrecs(a)	(0)
#define	mddb_deleterec(a)	(0)
#define	mddb_getrecstatus(a)	(MDDB_OK)
#define	mddb_createrec(s, ...)	(0xffff & (int)(uintptr_t)(mddb_fakeit = \
					(caddr_t)kmem_zalloc((s), KM_SLEEP)))
#define	mddb_unload()		(0)
866 
867 #endif
868 
869 #define	MDDB_NOSLEEP	1
870 #define	MDDB_SLEEPOK	0
871 
872 #define	MDDB_NOOLDOK	0x1
873 #define	MDDB_MUSTEXIST	0x2
874 #define	MDDB_NOINIT	0x4
875 #define	MDDB_MULTINODE	0x8
876 #define	MDDB_MN_STALE	0x10	/* MN set is stale */
877 
878 /* Flags passed to selectreplicas - not a bit mask */
879 #define	MDDB_SCANALL		1
880 #define	MDDB_RETRYSCAN		0
881 #define	MDDB_SCANALLSYNC	2	/* During reconfig, sync up incore */
882 					/* and ondisk mddb by writing incore */
883 					/* values to disk.  Don't write */
884 					/* change log records. */
885 
886 /* Flags passed to writestart and writecopy */
887 #define	MDDB_WRITECOPY_ALL	1	/* Write all incore mddb to disk */
888 #define	MDDB_WRITECOPY_SYNC	2	/* Write incore mddb to disk except */
889 					/* 	- change log records */
890 					/*	- optimized resync records */
891 
892 
893 #define	MDDB_PROBE	1
894 #define	MDDB_NOPROBE	0
895 
896 
897 /*
898  * MN diskset definitions used to determine if a slave can write
899  * directly to the mddb.  ONLY_MASTER only allows the master node
900  * to write to the mddb.  ANY_NODE allows any node to write
901  * to the mddb.
902  */
903 #define	MDDB_WR_ONLY_MASTER	0
904 #define	MDDB_WR_ANY_NODE	1
905 
906 #define	MDDB_L_LOCKED	0x0001	/* this record is locked */
907 #define	MDDB_L_WANTED	0x0002
908 
909 #ifdef	__cplusplus
910 }
911 #endif
912 
913 #endif	/* _SYS_MD_MDDB_H */
914