xref: /titanic_41/usr/src/uts/common/sys/lvm/md_mddb.h (revision c10c16dec587a0662068f6e2991c29ed3a9db943)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #ifndef _SYS_MD_MDDB_H
27 #define	_SYS_MD_MDDB_H
28 
29 #pragma ident	"%Z%%M%	%I%	%E% SMI"
30 
31 #include <sys/types.h>
32 #include <sys/buf.h>
33 
34 #ifdef	__cplusplus
35 extern "C" {
36 #endif
37 
38 #if 0 /* DRP FOR DEBUGGING */
39 #define	MDDB_FAKE
40 #endif
41 
42 /*
 * Private flags.
 * MD_PRV_PENDDEL, MD_PRV_PENDCOM and MD_PRV_PENDCLEAN are composite values:
 * each ORs MD_PRV_GOTIT with exactly one of the pending-operation bits.
 */
43 #define	MD_PRV_GOTIT		0x0001	/* Been snarfed */
44 #define	MD_PRV_DELETE		0x0002	/* Record pending to be deleted */
45 #define	MD_PRV_COMMIT		0x0004	/* Record pending to be committed */
46 #define	MD_PRV_CLEANUP		0x0008	/* Record pending to be cleaned up */
47 #define	MD_PRV_CONVD		0x0010  /* Record has been converted (32->64) */
48 #define	MD_PRV_PENDDEL		(MD_PRV_GOTIT | MD_PRV_DELETE)
49 #define	MD_PRV_PENDCOM		(MD_PRV_GOTIT | MD_PRV_COMMIT)
50 #define	MD_PRV_PENDCLEAN	(MD_PRV_GOTIT | MD_PRV_CLEANUP)
51 
52 
/* MDDB_E_*: error codes; every value is negative. */
53 #define	MDDB_E_INVALID	(-1)	/* an invalid argument was passed */
54 #define	MDDB_E_EXISTS	(-2)	/* doing an operation a 2nd time which can */
55 				/*	only be done once */
56 #define	MDDB_E_MASTER	(-3)	/* problem occurred accessing master block */
57 				/*	returned from NEW_DEV	*/
58 #define	MDDB_E_TOOSMALL	(-4)	/* device is not large enough */
59 #define	MDDB_E_NORECORD	(-5)	/* record does not exist */
60 				/*
61 				 *	returned from:	mddb_getnextrec
62 				 *			mddb_getrecsize
63 				 *			mddb_commitrec
64 				 *			mddb_commitrecs
65 				 *			mddb_deleterec
66 				 */
67 #define	MDDB_E_NOSPACE	(-6)	/* no space to create record */
68 #define	MDDB_E_NOTNOW	(-7)	/* do not presently have enough resources */
69 				/*	to perform requested operation */
70 #define	MDDB_E_NODB	(-8)	/* no database exist */
71 #define	MDDB_E_NOTOWNER (-9)	/* have not been told to grab this set */
72 #define	MDDB_E_STALE	(-10)	/* database is stale */
73 #define	MDDB_E_TOOFEW	(-11)	/* not enough replicas available */
74 #define	MDDB_E_TAGDATA	(-12)	/* tagged data detected */
75 #define	MDDB_E_ACCOK	(-13)	/* 50/50 mode */
76 #define	MDDB_E_NTAGDATA	(-14)	/* tagop try, no tag data */
77 #define	MDDB_E_ACCNOTOK	(-15)	/* accop try, no accept possible */
78 #define	MDDB_E_NOLOCBLK	(-16)	/* No valid locators found */
79 #define	MDDB_E_NOLOCNMS	(-17)	/* No valid locator name information */
80 #define	MDDB_E_NODIRBLK	(-18)	/* No directory blocks found */
81 #define	MDDB_E_NOTAGREC	(-19)	/* No tag record blocks found */
82 #define	MDDB_E_NOTAG	(-20)	/* No matching tag record found */
83 #define	MDDB_E_NODEVID	(-21)	/* No device id found */
84 
/*
 * Replica size limits, in blocks.  The 16MB and 256MB figures quoted for the
 * multinode values correspond to 512-byte blocks (32768 * 512 and
 * 524288 * 512).
 */
85 #define	MDDB_MINBLKS		16	/* enough for a few metadevices */
86 #define	MDDB_MAXBLKS		8192	/* size of free bit map (must be / 8) */
87 #define	MDDB_MN_MINBLKS		32768	/* Multinode metadb minimum size */
88 					/* 16MB */
89 #define	MDDB_MN_MAXBLKS		524288	/* size of free bit map (must be / 8) */
90 					/* 256MB */
91 
/*
 * MDDB_C_* flag bits (distinct single bits, so they can be ORed together).
 * NOTE(review): STALE, TOOFEW and NOTOWNER share their names with the
 * MDDB_E_* error codes; presumably they report the same conditions in
 * bit-flag form -- confirm against the users of these flags.
 */
92 #define	MDDB_C_STALE		0x0001
93 #define	MDDB_C_TOOFEW		0x0002
94 #define	MDDB_C_NOTOWNER		0x0004
95 #define	MDDB_C_SET_MN_STALE	0x0008	/* Set MN set to stale */
96 #define	MDDB_C_IMPORT		0x0010
97 
98 /*
99  * Defines used to set/reset new master flag in set structure.
100  * Used during reconfig cycle to determine quickly if there is
101  * new master for the set.
102  */
103 #define	MDDB_NM_SET		0x0001
104 #define	MDDB_NM_RESET		0x0002
105 #define	MDDB_NM_GET		0x0004
106 
107 /* Definitions of flag in Locator Block Device ID data area - mddb_did_info */
108 #define	MDDB_DID_EXISTS		0x0001	/* Device ID exists */
109 #define	MDDB_DID_VALID		0x0002	/* Device ID valid on current system */
110 #define	MDDB_DID_UPDATED	0x0004  /* locator/sidelocator info updated */
111 
112 /* Definitions of flag in Locator Block - mddb_lb */
113 #define	MDDB_DEVID_STYLE	0x0001	/* Locator Block in Device ID format */
114 #define	MDDB_MNSET		0x0002  /* MDDB is for a multi-node set */
115 
116 
117 #define	MDDB_MAX_PATCH	25		/* number of locations that */
118 					/*	can be patched in etc/system */
119 
120 /*
121  * Set struct used by all parts of the driver, to store anchor pointers.
122  *
123  * Lock associated with field in this structure:
124  *
125  * Some of the fields are accessible by both the single threaded ioctl thread
126  * and internal threads such as resync, hotsparing...etc.  In this case
127  * additional protection is needed.  For example, s_db is protected by
128  * s_dbmx additionally and s_un, s_ui are protected by md_unit_array_rw.lock
129  * s_nm, s_nmid, s_did_nm and s_did_nmid and s_dtp are protected by nm_lock
130  * The rest of the fields are protected by md_mx.  Two fields s_un_next and
131  * s_un_avail are introduced by the friendly name project and are ONLY
132  * accessible via a single threaded ioctl thread which already is protected
133  * by the ioctl lock and there is no need to add extra protection to them.
134  * However, in the future if they become accessible by other internal threads
135  * then an additional protection such as md_mx lock is highly recommended.
136  *
137  */
138 typedef struct md_set {
139 	uint_t		s_status;	/* set status */
140 	void		**s_ui;		/* set unit incore anchor */
141 	void		**s_un;		/* set unit anchor */
142 	void		*s_hsp;		/* set Hot Spare Pool anchor */
143 	void		*s_hs;		/* set Hot Spare anchor */
144 	void		*s_db;		/* set MDDB anchor */
145 	kmutex_t	s_dbmx;		/* set MDDB mutex */
146 	void		*s_nm;		/* set namespace anchor */
147 	mddb_recid_t	s_nmid;		/* set namespace anchor record */
148 	void		*s_did_nm;	/* set device id namespace anchor */
149 	mddb_recid_t	s_did_nmid;	/* set device id namespace anchor rec */
150 	void		*s_dtp;		/* set data tag rec */
151 	int		s_am_i_master;	/* incore master flag for this node */
152 	md_mn_nodeid_t	s_nodeid;	/* nodeid of this node - for MN sets */
153 	uint_t		s_rcnt;		/* incore resync count for set */
154 	unit_t		s_un_next;	/* s_un scan starts here */
155 	unit_t		s_un_avail;	/* number of avail slots */
156 } md_set_t;
157 
158 
/*
 * Magic numbers.  Each value is four ASCII bytes: 'm' 'd' followed by a
 * two-letter block-type code (e.g. 0x6d646d62 is "mdmb", 0x6d646c6e is
 * "mdln").
 */
159 #define	MDDB_MAGIC_MB	0x6d646d62	/* magic number for master blocks */
160 #define	MDDB_MAGIC_DB	0x6d646462	/* magic number for directory blocks */
161 #define	MDDB_MAGIC_RB	0x6d647262	/* magic number for record blocks */
162 #define	MDDB_MAGIC_LB	0x6d646c62	/* magic number for locator blocks */
163 #define	MDDB_MAGIC_LN	0x6d646c6e	/* magic number for locator names */
164 #define	MDDB_MAGIC_DT	0x6d646474	/* magic number for data tag */
165 #define	MDDB_MAGIC_DI	0x6d646469	/* magic number for device ID block */
166 #define	MDDB_MAGIC_DU	0x6d646475	/* magic num for dummy mb */
167 #define	MDDB_MAGIC_DE	0x6d646465	/* magic num for mb devid */
168 
169 #define	MDDB_GLOBAL_XOR 1234567890
170 
/*
 * Masks that split a revision value into its major (high byte) and
 * minor (low byte) parts.  The expansions are fully parenthesized so the
 * casts cannot bind to neighbouring tokens (e.g. under sizeof).
 */
#define	MDDB_REV_MAJOR  ((uint_t)0xff00)
#define	MDDB_REV_MINOR  ((uint_t)0x00ff)
173 
174 /*
175  * MDDB_REV_MNMB:
176  * If a MN diskset, master block revision is set to MDDB_REV_MNMB.
177  * Even though the master block structure is no different
178  * for a MN set, setting the revision field to a different
179  * number keeps any pre-MN_diskset code from accessing
180  * this diskset.  It also allows for an early determination
181  * of a MN diskset when reading in from disk so that the
182  * proper size locator block and locator names structure
183  * can be read in thus saving time on diskset startup.
184  * Since no change in master block structure, the MDDB_REV_MINOR
185  * portion of the revision was incremented.
186  *
187  * MDDB_REV_MNLB:
188  * If a MN diskset, the locator block structure is a different size in
189  * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
190  * with any nodeid (sideno) allowed.
191  * The revision is set to MDDB_REV_MNLB which is a change of the
192  * MDDB_REV_MAJOR portion of the revision.
193  *
194  * MDDB_REV_MNLN:
195  * If a MN diskset, the locator names is a different size in
196  * order to accommodate up to MD_MNMAXSIDES nodes in a diskset
197  * with any nodeid (sideno) allowed.
198  * The revision is set to MDDB_REV_MNLN which is a change of the
199  * MDDB_REV_MAJOR portion of the revision.
200  *
201  * The record blocks have two binary properties.  A record block can
202  * represent either a 32 or 64 bit unit.  A record block can also represent
203  * a traditionally named unit or a friendly named unit.  Thus, there are
204  * minor revisions of record block.
205  *
206  *		Traditional		Friendly
207  *		Name			Name
208  *		-----------		--------
209  * 32 bit	MDDB_REV_RB		MDDB_REV_RBFN
210  * 64 bit	MDDB_REV_RB64		MDDB_REV_RB64FN
211  */
212 
/*
 * Revision numbers stored in the revision field of each block type.
 * The high byte is the major part and the low byte the minor part
 * (see MDDB_REV_MAJOR / MDDB_REV_MINOR).  Each expansion is fully
 * parenthesized so the cast cannot bind to neighbouring tokens; the
 * values remain integer constant expressions and are still usable as
 * case labels.
 */
#define	MDDB_REV_MB	((uint_t)0x0201)
#define	MDDB_REV_MNMB	((uint_t)0x0202)
#define	MDDB_REV_DB	((uint_t)0x0201)
#define	MDDB_REV_LB	((uint_t)0x0500)
#define	MDDB_REV_MNLB	((uint_t)0x0600)
#define	MDDB_REV_LN	((uint_t)0x0100)
#define	MDDB_REV_MNLN	((uint_t)0x0300)
#define	MDDB_REV_RB	((uint_t)0x0200)	/* 32 bit, traditional name */
#define	MDDB_REV_RB64	((uint_t)0x0201)	/* 64 bit, traditional name */
#define	MDDB_REV_RBFN	((uint_t)0x0202)	/* 32 bit, friendly name */
#define	MDDB_REV_RB64FN	((uint_t)0x0203)	/* 64 bit, friendly name */
#define	MDDB_REV_DT	((uint_t)0x0100)
#define	MDDB_REV_DI	((uint_t)0x0100)
226 
/*
 * Transfer record block friendly name status to unit/hs structure.
 *
 *	rbv	record block revision to examine
 *	unv	modifiable lvalue whose MD_FN_META_DEV bit is updated
 *
 * MDDB_REV_RB and MDDB_REV_RB64 (traditional names) clear MD_FN_META_DEV
 * in unv; MDDB_REV_RBFN and MDDB_REV_RB64FN (friendly names) set it.
 * Any other revision leaves unv unchanged (explicit default case).
 * The expansion is a bare switch statement, so do not use it as the
 * unbraced body of an if that has an else.
 */
#define	MDDB_NOTE_FN(rbv, unv)	switch (rbv) { \
				case MDDB_REV_RB: \
				case MDDB_REV_RB64: \
					(unv) &= ~MD_FN_META_DEV; \
					break; \
				case MDDB_REV_RBFN: \
				case MDDB_REV_RB64FN: \
					(unv) |= MD_FN_META_DEV; \
					break;	\
				default: \
					break; \
				}
240 
/*
 * MDDB_BSIZE: database block size in bytes, DEV_BSIZE as a uint_t.  The
 * expansion is fully parenthesized so the cast covers only DEV_BSIZE and
 * cannot bind to neighbouring tokens.
 * MDDB_PREFIXCNT: number of entries in the ln_prefixes[] arrays.
 * MDDB_DRVNMCNT: number of entries in the lb_drvnm[] arrays.
 */
#define	MDDB_BSIZE	((uint_t)DEV_BSIZE)
#define	MDDB_PREFIXCNT	10
#define	MDDB_DRVNMCNT   10

/* Block number / block count type used throughout the mddb structures. */
typedef int	mddb_block_t;
246 
247 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
248 #pragma pack(4)
249 #endif
/*
 * Locator-name suffix for a MN diskset: a md_name_suffix together with the
 * side number it belongs to (used as the element type of
 * mddb_mnln.ln_mnsuffixes).
 */
250 typedef struct md_mnname_suffix {
251 	md_name_suffix	mn_ln_suffix;
252 	uint_t		mn_ln_sideno;
253 } md_mnname_suffix_t;
254 
/*
 * Locator names structure for traditional and local disksets.
 * ln_suffixes is dimensioned [MD_MAXSIDES][MDDB_NLB]; the side number is
 * used directly as the first index.  The leading magic/revision/checksum/
 * timestamp/prefixes fields are declared identically in mddb_mnln.
 */
255 typedef	struct mddb_ln {
256 	int			ln_magic;
257 	uint_t			ln_revision;
258 	uint_t			ln_checksum;
259 	struct timeval32	ln_timestamp;
260 	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
261 	/* Don't change array sizes without changing RNDUP_BLKCNT */
262 	md_name_suffix		ln_suffixes[MD_MAXSIDES][MDDB_NLB];
263 } mddb_ln_t;
264 
265 /*
266  * Locator name structure for MN diskset.  Same as for traditional
267  * and local diskset except that more sides are supported and the
268  * side number can be any number since the side number is stored
269  * in the ln_mnsuffixes structure instead of being used as an index
270  * into that array.  This means that the whole array may need to be
271  * searched in order to find the correct information given a side number.
272  *
 * The array is dimensioned [MD_MNMAXSIDES][MDDB_NLB] and each element
 * carries its own side number in mn_ln_sideno.
273  */
273 typedef	struct mddb_mnln {
274 	int			ln_magic;
275 	uint_t			ln_revision;
276 	uint_t			ln_checksum;
277 	struct timeval32	ln_timestamp;
278 	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
279 	/* Don't change array sizes without changing MDDB_MNLNCNT */
280 	md_mnname_suffix_t	ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB];
281 } mddb_mnln_t;
282 
/*
 * RNDUP_BLKCNT(sz, delta): number of MDDB_BSIZE blocks needed to hold
 * sz - delta * (MD_MAXSIDES - 1) * MDDB_NLB bytes, rounded up.
 * With delta == 0 this is simply sz rounded up to whole blocks.  With delta
 * set to the size of one per-side array element, the space for all but one
 * of the MD_MAXSIDES rows of a [MD_MAXSIDES][MDDB_NLB] array is subtracted
 * first; the MDDB_LOCAL_* counts are computed that way.
 */
283 #define	RNDUP_BLKCNT(sz, delta)	(((sz) - \
284 				    ((delta) * \
285 				    ((MD_MAXSIDES  - 1) * MDDB_NLB)) + \
286 				    MDDB_BSIZE - 1) / MDDB_BSIZE)
/* Block counts for the full and the local (single-row) mddb_ln_t. */
287 #define	MDDB_LNCNT		RNDUP_BLKCNT(sizeof (mddb_ln_t), 0)
288 #define	MDDB_LOCAL_LNCNT	RNDUP_BLKCNT(sizeof (mddb_ln_t), \
289 				    sizeof (md_name_suffix))
290 
/* Block count for mddb_mnln_t: its size rounded up to whole blocks. */
291 #define	MDDB_MNLNCNT		((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \
292 				    / MDDB_BSIZE)
293 
/*
 * Data tag block: three uint_t header words followed by the data tag.
 * NOTE(review): dt_mag / dt_rev / dt_cks presumably hold MDDB_MAGIC_DT,
 * MDDB_REV_DT and a checksum -- confirm against the code that fills them.
 */
294 typedef struct mddb_dt {
295 	uint_t		dt_mag;
296 	uint_t		dt_rev;
297 	uint_t		dt_cks;
298 	mddb_dtag_t	dt_dtag;
299 } mddb_dt_t;
300 
/*
 * Size of a data tag block: sizeof (mddb_dt_t) rounded up to a multiple of
 * MDDB_BSIZE, in bytes and (via btodb) in disk blocks.
 */
301 #define	MDDB_DT_BYTES	(roundup(sizeof (mddb_dt_t), MDDB_BSIZE))
302 #define	MDDB_DT_BLOCKS	(btodb(MDDB_DT_BYTES))
303 
/*
 * Set identifier (see mddb_lb.lb_ident and mddb_set.s_ident): either a
 * serial string of MDDB_SN_LEN bytes or a 32-bit creation timestamp.  The
 * two members share the same storage.
 */
304 typedef union identifier {
305 	char			serial[MDDB_SN_LEN];
306 	struct timeval32	createtime;
307 } identifier_t;
308 
/*
 * Locator entry; mddb_lb and mddb_mnlb each hold MDDB_NLB of these in
 * lb_locators[].  All three fields are 32 bits wide.
 */
309 typedef struct mddb_locator {
310 	dev32_t		l_dev;
311 	daddr32_t	l_blkno;
312 	int		l_flags;
313 } mddb_locator_t;
314 
/*
 * Per-side locator data for traditional/local disksets; element type of
 * mddb_lb.lb_sidelocators[MD_MAXSIDES][MDDB_NLB].
 * NOTE(review): l_drvnm_index presumably indexes lb_drvnm[] -- confirm.
 */
315 typedef struct mddb_sidelocator {
316 	uchar_t		l_drvnm_index;
317 	minor_t		l_mnum;
318 } mddb_sidelocator_t;
319 
/*
 * Per-side locator data for MN disksets; element type of
 * mddb_mnlb.lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB].  Same fields as
 * mddb_sidelocator plus the side number (mnl_sideno) stored in the entry.
 */
320 typedef struct mddb_mnsidelocator {
321 	uchar_t		mnl_drvnm_index;
322 	minor_t		mnl_mnum;
323 	uint_t		mnl_sideno;
324 } mddb_mnsidelocator_t;
325 
/*
 * Driver name entry: a one-byte length and a MD_MAXDRVNM-byte character
 * buffer.  The locator blocks hold MDDB_DRVNMCNT of these in lb_drvnm[].
 */
326 typedef struct mddb_drvnm {
327 	uchar_t		dn_len;
328 	char		dn_data[MD_MAXDRVNM];
329 } mddb_drvnm_t;
330 
331 /*
332  * Locator Block Device ID Information
333  * Several device id's may share one disk block in an effort to
334  * conserve used replica space.
 *
 * mddb_did_blk.blk_info[] holds MDDB_NLB of these entries.  info_flags
 * takes the MDDB_DID_* flag values.
335  */
336 typedef struct mddb_did_info {
337 	uint_t		info_flags;	/* MDDB Device ID flags */
338 	uint_t		info_firstblk;	/* Device ID Start Block */
339 	uint_t		info_blkcnt;	/* Device ID Block Count */
340 	uint_t		info_offset;	/* Device ID offset w/i Block */
341 	uint_t		info_length;	/* Device ID Length */
342 	uint_t		info_checksum;	/* Device ID Checksum */
343 	char		info_minor_name[32]; /* Minor name of lb dev */
344 } mddb_did_info_t;
345 
/*
 * Device ID block: verification header plus one mddb_did_info_t per locator
 * slot (MDDB_NLB entries, the same count as lb_locators[]).
 */
346 typedef struct mddb_did_blk {
347 	int		blk_magic;	/* used for verification */
348 	uint_t		blk_revision;	/* used for verification */
349 	int		blk_checksum;	/* used for verification */
350 	uint_t		blk_commitcnt;	/* matches LB's commitcnt */
351 	mddb_did_info_t	blk_info[MDDB_NLB];
352 } mddb_did_blk_t;
353 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
354 #pragma pack()
355 #endif
356 
/*
 * Size of the device ID block: sizeof (mddb_did_blk_t) rounded up to a
 * multiple of MDDB_BSIZE, in bytes and (via btodb) in disk blocks.
 */
357 #define	MDDB_DID_BYTES	(roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE))
358 #define	MDDB_DID_BLOCKS	(btodb(MDDB_DID_BYTES))
359 
360 /*
361  * Device ID Disk Blocks.
362  * Incore linked list of disk blocks containing device IDs.
363  * The list is built when reading in the mddb_did_blk structure and
364  * when reading in the actual disk blocks containing device ids.
365  * This list is used to easily write out all disk blocks containing
366  * device ids.
 *
 * The list is singly linked through db_next.
367  */
368 typedef struct mddb_did_db {
369 	uint_t		db_firstblk;	/* Disk Block's logical addr */
370 	uint_t		db_blkcnt;	/* Contig Disk Block Count */
371 	caddr_t		db_ptr;		/* Ptr to incore Block(s) */
372 	struct mddb_did_db	*db_next;	/* Ptr to next in list */
373 } mddb_did_db_t;
374 
375 /*
376  * Device ID Free List.
377  * Incore linked list of free space in disk blocks containing device IDs.
378  * Used to manage placement of device IDs in disk blocks.
379  * All disk blocks on free list are also in linked list of disk block
380  * containing device IDs (mddb_did_db_t).
 *
 * The list is singly linked through free_next.
381  */
382 typedef struct mddb_did_free {
383 	uint_t			free_blk;	/* Disk Block's logical addr */
384 	uint_t			free_offset;	/* offset of free space */
385 	uint_t			free_length;	/* length of free space */
386 	struct mddb_did_free	*free_next;	/* Ptr to next in list */
387 } mddb_did_free_t;
388 
389 /*
390  * Device ID Incore Area
391  *    Contains pointer to Device ID Disk Block list and
392  *         Device ID Free List.
393  *    Also contains incore array of pointers to device IDs.  Pointers
394  *    point into the device ID Disk Block list and are used as a
395  *    shortcut to find incore device IDs.
396  */
397 typedef struct mddb_did_ic {
398 	mddb_did_blk_t	*did_ic_blkp;	/* device ID block */
399 	mddb_did_db_t	*did_ic_dbp;	/* Device ID Disk Block list */
400 	mddb_did_free_t	*did_ic_freep;	/* Device ID Free List */
401 	ddi_devid_t	did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */
402 } mddb_did_ic_t;
403 
404 /*
405  * Locator Block (LB):
406  *	- Are fixed size, but the size is different
407  *		for local/shared set db replicas.
408  *	- All LB's start at logical block 0.
409  * 	- After a replica quorum is found, there is
410  *	  only one incore copy of the LB.
411  *	- LB's are only written when replicas are added, deleted, or errored.
412  *	- LB's provide information about other replica's and their state.
 *
 * lb_flags takes the MDDB_DEVID_STYLE / MDDB_MNSET values.  The structure
 * is declared under #pragma pack(4) on platforms where
 * _LONG_LONG_ALIGNMENT is 8 and _LONG_LONG_ALIGNMENT_32 is 4.
413  */
414 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
415 #pragma pack(4)
416 #endif
417 typedef struct mddb_lb {
418 	int			lb_magic;	/* used for verification */
419 	uint_t			lb_revision;	/* used for verification */
420 	int			lb_checksum;	/* used for verification */
421 	uint_t			lb_commitcnt;	/* IMPORTANT */
422 	struct timeval32	lb_timestamp;	/* informative only */
423 	int			lb_loccnt;	/* used for verification */
424 	identifier_t		lb_ident;	/* used for verification */
425 	uint_t			lb_flags;	/* flags describing LB */
426 	uint_t			lb_spare[8];	/* Spare/Pad */
427 	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
428 	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
429 	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
430 	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
431 	struct timeval32	lb_inittime;	/* creation of database */
432 	set_t			lb_setno;	/* used for verification */
433 	mddb_block_t		lb_blkcnt;	/* used for verification */
434 	mddb_block_t		lb_lnfirstblk;
435 	mddb_block_t		lb_lnblkcnt;
436 	mddb_block_t		lb_dbfirstblk;
437 	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
438 	mddb_locator_t		lb_locators[MDDB_NLB];
439 	/* Don't change array sizes without changing RNDUP_BLKCNT */
440 	mddb_sidelocator_t	lb_sidelocators[MD_MAXSIDES][MDDB_NLB];
441 } mddb_lb_t;
442 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
443 #pragma pack()
444 #endif
445 
446 /*
447  * Locator block structure for MN diskset.  Same as for traditional
448  * and local diskset except that more sides are supported and the
449  * side number can be any number since the side number is stored
450  * in the lb_mnsidelocators structure instead of being used as an index
451  * into that array.  This means that the whole array may need to be
452  * searched in order to find the correct information given a side number.
 *
 * Every field up to and including lb_locators is declared exactly as in
 * mddb_lb; only the trailing side-locator array differs.
 * NOTE(review): unlike mddb_lb, this structure is declared after the
 * matching "#pragma pack()", i.e. outside the pack(4) region -- confirm
 * that this is intentional.
453  */
454 typedef struct mddb_mnlb {
455 	int			lb_magic;	/* used for verification */
456 	uint_t			lb_revision;	/* used for verification */
457 	int			lb_checksum;	/* used for verification */
458 	uint_t			lb_commitcnt;	/* IMPORTANT */
459 	struct timeval32	lb_timestamp;	/* informative only */
460 	int			lb_loccnt;	/* used for verification */
461 	identifier_t		lb_ident;	/* used for verification */
462 	uint_t			lb_flags;	/* flags describing LB */
463 	uint_t			lb_spare[8];	/* Spare/Pad */
464 	mddb_block_t		lb_didfirstblk;	/* Devid Array Start Block */
465 	mddb_block_t		lb_didblkcnt;	/* Devid Array Number Blocks */
466 	mddb_block_t		lb_dtfirstblk;	/* Data Tag Start Block */
467 	mddb_block_t		lb_dtblkcnt;	/* Data Tag Number Block(s) */
468 	struct timeval32	lb_inittime;	/* creation of database */
469 	set_t			lb_setno;	/* used for verification */
470 	mddb_block_t		lb_blkcnt;	/* used for verification */
471 	mddb_block_t		lb_lnfirstblk;
472 	mddb_block_t		lb_lnblkcnt;
473 	mddb_block_t		lb_dbfirstblk;
474 	mddb_drvnm_t		lb_drvnm[MDDB_DRVNMCNT];
475 	mddb_locator_t		lb_locators[MDDB_NLB];
476 	/* Don't change array sizes without changing MDDB_MNLBCNT */
477 	mddb_mnsidelocator_t	lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB];
478 } mddb_mnlb_t;
479 
480 
/*
 * Locator block sizes in MDDB_BSIZE blocks.  MDDB_LBCNT covers the whole
 * mddb_lb_t; MDDB_LOCAL_LBCNT passes sizeof (mddb_sidelocator_t) as the
 * RNDUP_BLKCNT delta, which subtracts all but one of the MD_MAXSIDES rows
 * of lb_sidelocators before rounding up.
 */
481 #define	MDDB_LBCNT		RNDUP_BLKCNT(sizeof (mddb_lb_t), 0)
482 #define	MDDB_LOCAL_LBCNT	RNDUP_BLKCNT(sizeof (mddb_lb_t), \
483 				    sizeof (mddb_sidelocator_t))
484 
/* Block count for mddb_mnlb_t: its size rounded up to whole blocks. */
485 #define	MDDB_MNLBCNT		((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \
486 				    / MDDB_BSIZE)
487 
/*
 * Block map embedded in the master block as mb_blkmap
 * ("logical->physical blk map").
 * NOTE(review): presumably describes a run of m_consecutive blocks starting
 * at m_firstblk -- confirm against the code that walks it.
 */
488 typedef struct mddb_map {
489 	daddr32_t		m_consecutive;
490 	daddr32_t		m_firstblk;
491 } mddb_map_t;
492 
493 /*
494  * Master block(s) (MB)
495  * 	- Are written by userland; Never by the driver!
496  *	- Each replica has their own master blocks,
497  *		the master block(s) are not shared.
498  *	- MB's are not in the logical block address space of the database.
499  *	- MB's are a fixed size record (MDDB_BSIZE)
500  *	- MB's provide the logical to physical block translation,
501  *		for their replica.
 *
 * Under _LP64 mb_next is declared as a uint32_t instead of a pointer, so
 * the field is 32 bits wide there -- presumably to keep the layout the
 * same in both data models; confirm.
502  */
503 typedef	struct mddb_mb {
504 	int			mb_magic;	/* used for verification */
505 	uint_t			mb_revision;	/* used for verification */
506 	uint_t			mb_checksum;	/* used for verification */
507 #ifdef _LP64
508 	uint32_t		mb_next;	/* incore to next mb */
509 #else
510 	struct mddb_mb		*mb_next;	/* incore to next mb */
511 #endif	/* _LP64 */
512 	daddr32_t		mb_nextblk;	/* block # for next mb */
513 	md_timeval32_t		mb_timestamp;	/* timestamp */
514 	daddr32_t		mb_blkcnt;	/* size of blkmap */
515 	daddr32_t		mb_blkno;	/* physical loc. for this MB */
516 	set_t			mb_setno;	/* used for verification */
517 	struct timeval32	mb_setcreatetime; /* set creation timestamp */
518 	int			spares[7];
519 	mddb_map_t		mb_blkmap;	/* logical->physical blk map */
520 	int			mb_devid_magic;	/* verify devid in mb */
521 	short			mb_devid_len;	/* len of following devid */
522 	char			mb_devid[1];	/* devid byte array */
523 } mddb_mb_t;
524 
525 /*
526  * In-core version of mddb_mb. It is known that the mddb_mb is 512 bytes on
527  * disk, really, and so this structure is 512 + sizeof(struct mddb_mb_ic *)
 *
 * MDDB_IC_BSIZE is that in-core size: MDDB_BSIZE plus one pointer for
 * mbi_next, which precedes the embedded master block.
528  */
529 #define	MDDB_IC_BSIZE	(MDDB_BSIZE + sizeof (struct mddb_mb_ic *))
530 typedef struct mddb_mb_ic {
531 	struct mddb_mb_ic 	*mbi_next;
532 	struct mddb_mb		mbi_mddb_mb;
533 } mddb_mb_ic_t;
534 
535 
536 /*
537  * there can be no address in record block. The checksum must
538  * stay the same where ever the record is in memory. Many
539  * things depend on this. Also the timestamp is the time the
540  * record was committed not the time it was written to a particular
541  * device.
542  *
543  * Old definition of mddb_rb, for 32-bit apps and libraries
 *
 * Field-for-field the same as mddb_rb32 except that rb_userdata is a
 * void * here and a uint32_t there.
544  */
545 typedef struct mddb_rb {
546 	uint_t			rb_magic;
547 	uint_t			rb_revision;
548 	uint_t			rb_checksum;
549 	uint_t			rb_checksum_fiddle;
550 	uint_t			rb_private;
551 	void			*rb_userdata;
552 	uint_t			rb_commitcnt;
553 	uint_t			rb_spare[1];
554 	struct timeval32	rb_timestamp;
555 	int			rb_data[1];
556 } mddb_rb_t;
557 
558 /*
 * This is, and always will be, the on-disk version of mddb_rb.
 * rb_userdata is a fixed 32-bit integer rather than a pointer, so the
 * member has the same width in every data model.
 */
559 typedef struct mddb_rb32 {
560 	uint_t			rb_magic;
561 	uint_t			rb_revision;
562 	uint_t			rb_checksum;
563 	uint_t			rb_checksum_fiddle;
564 	uint_t			rb_private;
565 	uint32_t		rb_userdata;
566 	uint_t			rb_commitcnt;
567 	uint_t			rb_spare[1];
568 	struct timeval32	rb_timestamp;
569 	int			rb_data[1];
570 } mddb_rb32_t;
571 
572 /*
573  * directory entries
 *
 * mddb_optinfo is embedded twice (de_optinfo[2] / de32_optinfo[2]) in each
 * of the directory entry structures that follow.
574  */
575 typedef struct mddb_optinfo {
576 	int		o_li;
577 	int		o_flags;
578 } mddb_optinfo_t;
579 
580 /*
 * Old definition of mddb_de, for 32-bit apps and libraries.
 * Outside the kernel (!_KERNEL) this is the type mddb_db.db_firstentry
 * points to.  de_blks is declared with one element.
 */
581 typedef struct mddb_de {
582 	struct mddb_de	*de_next;
583 	mddb_rb_t	*de_rb;
584 	mddb_recid_t	de_recid;
585 	mddb_type_t	de_type1;
586 	uint_t		de_type2;
587 	uint_t		de_reqsize;
588 	uint_t		de_recsize;
589 	mddb_block_t	de_blkcount;
590 	uint_t		de_flags;
591 	mddb_optinfo_t	de_optinfo[2];
592 	mddb_block_t	de_blks[1];
593 } mddb_de_t;
594 
595 /*
596  * In core version of mddb_de, includes pointer for mddb_rb32_t user data
597  * mddb_rb32_t is used incore
 *
 * Compared with the on-disk mddb_de32 this adds de_rb_userdata,
 * de_rb_userdata_ic, de_owner_nodeid and de_icreqsize, and widens the
 * size fields to size_t.  de_blkcount is a uint_t here and a mddb_block_t
 * (int) in mddb_de32.
598  */
599 typedef struct mddb_de_ic {
600 	void			*de_rb_userdata;
601 	void			*de_rb_userdata_ic;
602 	uint_t			de_owner_nodeid;
603 	struct mddb_de_ic	*de_next;
604 	mddb_rb32_t		*de_rb;
605 	mddb_recid_t		de_recid;
606 	mddb_type_t		de_type1;
607 	uint_t			de_type2;
608 	size_t			de_reqsize;
609 	size_t			de_icreqsize;
610 	size_t			de_recsize;
611 	uint_t			de_blkcount;
612 	uint_t			de_flags;
613 	mddb_optinfo_t		de_optinfo[2];
614 	mddb_block_t		de_blks[1];
615 } mddb_de_ic_t;
616 
/*
 * Directory block.  The type of db_firstentry depends on the compilation
 * environment: mddb_de_ic_t * when _KERNEL is defined, mddb_de_t *
 * otherwise.  mddb_db32 is the on-disk counterpart.
 */
617 typedef struct mddb_db {
618 	uint_t			db_magic;
619 	uint_t			db_revision;
620 	uint_t			db_checksum;
621 	mddb_block_t		db_blknum;
622 	struct mddb_db		*db_next;
623 	mddb_block_t		db_nextblk;
624 	struct timeval32	db_timestamp;
625 	uint_t			db_recsum;
626 #ifdef _KERNEL
627 	mddb_de_ic_t		*db_firstentry;
628 #else
629 	mddb_de_t		*db_firstentry;
630 #endif
631 } mddb_db_t;
632 
633 /*
634  * This is, and always will be, the on-disk version of mddb_de
635  * When mddb_de32 is read in it is converted into mddb_de_ic
 *
 * The two link fields (de32_next, de32_rb) are 32-bit integers rather than
 * pointers.  The de32tode() / detode32() macros copy between this layout
 * and mddb_de_ic.
636  */
637 typedef struct mddb_de32 {
638 	uint32_t	de32_next;
639 	uint32_t	de32_rb;
640 	mddb_recid_t	de32_recid;
641 	mddb_type_t	de32_type1;
642 	uint_t		de32_type2;
643 	uint_t		de32_reqsize;
644 	uint_t		de32_recsize;
645 	mddb_block_t	de32_blkcount;
646 	uint_t		de32_flags;
647 	mddb_optinfo_t	de32_optinfo[2];
648 	mddb_block_t	de32_blks[1];
649 } mddb_de32_t;
650 
651 /*
652  * This is, and always will be, the on-disk version of mddb_db
653  * When mddb_db32 is read in it is converted into mddb_db
654  * To minimize impact on mddb format mddb_db fields remain intact
 *
 * db32_next and db32_firstentry are 32-bit integers where mddb_db has
 * pointers.  The db32todb() / dbtodb32() macros copy between the two.
655  */
656 typedef struct mddb_db32 {
657 	uint_t			db32_magic;
658 	uint_t			db32_revision;
659 	uint_t			db32_checksum;
660 	mddb_block_t		db32_blknum;
661 	uint32_t		db32_next;
662 	mddb_block_t		db32_nextblk;
663 	struct timeval32	db32_timestamp;
664 	uint_t			db32_recsum;
665 	uint32_t		db32_firstentry;
666 } mddb_db32_t;
667 
/*
 * de32tode(from, to): fill the in-core directory entry *to (mddb_de_ic_t *)
 * from the on-disk directory entry *from (mddb_de32_t *).
 *
 * - de_rb_userdata is set to NULL and de_owner_nodeid to MD_MN_INVALID_NID.
 * - de32_next / de32_rb are widened to pointers through uintptr_t.
 * - de32_blkcount entries of de32_blks[] are copied into de_blks[]; the
 *   caller must have sized *to accordingly.
 * - de_rb_userdata_ic and de_icreqsize are NOT written by this macro.
 *
 * Both arguments are parenthesized and the loop counter has a macro-specific
 * name, so argument expressions such as &tbl[i] work even when the caller
 * has its own variable named i.  Arguments are evaluated more than once;
 * do not pass expressions with side effects.  The expansion is a braced
 * block, as before.
 */
#define	de32tode(from, to) \
	{ \
	int de32tode_i; \
	(to)->de_rb_userdata = NULL; \
	(to)->de_owner_nodeid = MD_MN_INVALID_NID; \
	(to)->de_next = (struct mddb_de_ic *)(uintptr_t)(from)->de32_next; \
	(to)->de_rb = (mddb_rb32_t *)(uintptr_t)(from)->de32_rb; \
	(to)->de_recid = (from)->de32_recid; \
	(to)->de_type1 = (from)->de32_type1; \
	(to)->de_type2 = (from)->de32_type2; \
	(to)->de_reqsize = (from)->de32_reqsize; \
	(to)->de_recsize = (from)->de32_recsize; \
	(to)->de_blkcount = (from)->de32_blkcount; \
	(to)->de_flags = (from)->de32_flags; \
	(to)->de_optinfo[0] = (from)->de32_optinfo[0]; \
	(to)->de_optinfo[1] = (from)->de32_optinfo[1]; \
	for (de32tode_i = 0; de32tode_i < (from)->de32_blkcount; \
	    de32tode_i++) \
		(to)->de_blks[de32tode_i] = (from)->de32_blks[de32tode_i]; \
	}
687 
/*
 * detode32(from, to): fill the on-disk directory entry *to (mddb_de32_t *)
 * from the in-core directory entry *from (mddb_de_ic_t *).
 *
 * - de_next / de_rb are narrowed to 32 bits through uintptr_t.
 * - de_reqsize / de_recsize (size_t) are stored into uint_t fields.
 * - de_blkcount entries of de_blks[] are copied into de32_blks[]; the
 *   caller must have sized *to accordingly.
 * - The in-core-only fields (de_rb_userdata, de_rb_userdata_ic,
 *   de_owner_nodeid, de_icreqsize) are not transferred.
 *
 * Both arguments are parenthesized and the loop counter has a macro-specific
 * name, so argument expressions such as &tbl[i] work even when the caller
 * has its own variable named i.  The counter is a uint_t to match
 * de_blkcount.  Arguments are evaluated more than once; do not pass
 * expressions with side effects.  The expansion is a braced block, as
 * before.
 */
#define	detode32(from, to) \
	{ \
	uint_t detode32_i; \
	(to)->de32_next = (uint32_t)(uintptr_t)(from)->de_next; \
	(to)->de32_rb = (uint32_t)(uintptr_t)(from)->de_rb; \
	(to)->de32_recid = (from)->de_recid; \
	(to)->de32_type1 = (from)->de_type1; \
	(to)->de32_type2 = (from)->de_type2; \
	(to)->de32_reqsize = (from)->de_reqsize; \
	(to)->de32_recsize = (from)->de_recsize; \
	(to)->de32_blkcount = (from)->de_blkcount; \
	(to)->de32_flags = (from)->de_flags; \
	(to)->de32_optinfo[0] = (from)->de_optinfo[0]; \
	(to)->de32_optinfo[1] = (from)->de_optinfo[1]; \
	for (detode32_i = 0; detode32_i < (from)->de_blkcount; \
	    detode32_i++) \
		(to)->de32_blks[detode32_i] = (from)->de_blks[detode32_i]; \
	}
705 
/*
 * db32todb(from, to): fill the in-core directory block *to (mddb_db_t *)
 * from the on-disk directory block *from (mddb_db32_t *).  db32_next and
 * db32_firstentry are widened to pointers through uintptr_t; every other
 * field is copied as is.
 *
 * The expansion is a single braced block (like de32tode/detode32), so the
 * whole copy is governed by an enclosing unbraced if/for/while instead of
 * only its first assignment.  Both arguments are parenthesized; they are
 * evaluated more than once, so do not pass expressions with side effects.
 *
 * NOTE(review): db_firstentry is cast to mddb_de_ic_t *, which matches
 * mddb_db_t only when _KERNEL is defined.
 */
#define	db32todb(from, to) \
	{ \
	(to)->db_magic = (from)->db32_magic; \
	(to)->db_revision = (from)->db32_revision; \
	(to)->db_checksum = (from)->db32_checksum; \
	(to)->db_blknum = (from)->db32_blknum; \
	(to)->db_next = (struct mddb_db *)(uintptr_t)(from)->db32_next; \
	(to)->db_nextblk = (from)->db32_nextblk; \
	(to)->db_timestamp = (from)->db32_timestamp; \
	(to)->db_recsum = (from)->db32_recsum; \
	(to)->db_firstentry = \
	    (mddb_de_ic_t *)(uintptr_t)(from)->db32_firstentry; \
	}
716 
/*
 * dbtodb32(from, to): fill the on-disk directory block *to (mddb_db32_t *)
 * from the in-core directory block *from (mddb_db_t *).  db_next and
 * db_firstentry are narrowed to 32 bits through uintptr_t; every other
 * field is copied as is.
 *
 * The expansion is a single braced block (like de32tode/detode32), so the
 * whole copy is governed by an enclosing unbraced if/for/while instead of
 * only its first assignment.  Both arguments are parenthesized; they are
 * evaluated more than once, so do not pass expressions with side effects.
 */
#define	dbtodb32(from, to) \
	{ \
	(to)->db32_magic = (from)->db_magic; \
	(to)->db32_revision = (from)->db_revision; \
	(to)->db32_checksum = (from)->db_checksum; \
	(to)->db32_blknum = (from)->db_blknum; \
	(to)->db32_next = (uint32_t)(uintptr_t)(from)->db_next; \
	(to)->db32_nextblk = (from)->db_nextblk; \
	(to)->db32_timestamp = (from)->db_timestamp; \
	(to)->db32_recsum = (from)->db_recsum; \
	(to)->db32_firstentry = (uint32_t)(uintptr_t)(from)->db_firstentry; \
	}
727 
728 /*
729  * information about a replica of the data base
 *
 * Entries are chained through ri_next; mddb_set.s_rip is described as the
 * "replicas incore list".  Each entry carries pointers to the replica's
 * in-core master block, locator block, data tag and device ID area.
730  */
731 typedef struct mddb_ri {
732 	struct mddb_ri		*ri_next;
733 	uint_t			ri_flags;
734 	uint_t			ri_commitcnt;
735 	int			ri_transplant;
736 	md_dev64_t		ri_dev;
737 	daddr32_t		ri_blkno;
738 	char			ri_driver[16];
739 	mddb_mb_ic_t		*ri_mbip;
740 	mddb_lb_t		*ri_lbp;
741 	mddb_dt_t		*ri_dtp;
742 	mddb_did_ic_t		*ri_did_icp;
743 	ddi_devid_t		ri_devid;
744 	ddi_devid_t		ri_old_devid;
745 	char			ri_minor_name[MDDB_MINOR_NAME_MAX];
746 	char			ri_devname[MAXPATHLEN];
747 } mddb_ri_t;
748 
/*
 * Buffer list element: a buf_t embedded by value, a pointer to a locator,
 * and a bf_next link.  mddb_set.s_freebufhead points to a list of these.
 */
749 typedef struct mddb_bf {
750 	struct mddb_bf	*bf_next;
751 	mddb_locator_t	*bf_locator;
752 	buf_t		bf_buf;
753 } mddb_bf_t;
754 
755 /*
756  * Information for sets of databases (which include replicas)
 *
 * Record id layout: a record id uses MDDB_BITSRECID (31) bits.  The low
 * MDDB_SETSHIFT bits hold the per-set record number and the bits selected
 * by MDDB_SETMASK (MD_SETMASK shifted up by MDDB_SETSHIFT) hold the set
 * number.  MD_BITSSET and MD_SETMASK are defined elsewhere.
 *
 *	DBSET(id)	extract the set number from a record id
 *	DBID(id)	extract the per-set record number from a record id
 *	MAKERECID(s, i)	combine set number s and record number i; both
 *			are masked to their fields
757  */
758 #define	MDDB_BITSRECID	31
759 #define	MDDB_SETSHIFT	(MDDB_BITSRECID - MD_BITSSET)
760 #define	MDDB_SETMASK	(MD_SETMASK << MDDB_SETSHIFT)
761 #define	MDDB_RECIDMASK	((1 << MDDB_SETSHIFT) - 1)
762 
763 #define	DBSET(id)	(((id) & MDDB_SETMASK) >> MDDB_SETSHIFT)
764 #define	DBID(id)	((id) & MDDB_RECIDMASK)
765 #define	MAKERECID(s, i)	((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \
766 			((i) & MDDB_RECIDMASK))
767 
/*
 * MDDB_PARSE_* bits.  MDDB_PARSE_MASK (0xF) covers the three bits defined
 * here plus 0x8, which has no name in this header.
 * NOTE(review): presumably the values kept in mddb_set.s_mn_parseflags
 * ("mddb parse flags for MNset") -- confirm.
 */
768 #define	MDDB_PARSE_LOCBLK	0x00000001
769 #define	MDDB_PARSE_LOCNM	0x00000002
770 #define	MDDB_PARSE_OPTRECS	0x00000004
771 #define	MDDB_PARSE_MASK		0x0000000F
772 
773 
774 #define	MDDB_BLOCK_PARSE	0x00000001	/* Block sending parse msgs */
775 #define	MDDB_UNBLOCK_PARSE	0x00000002	/* Unblock sending parse msgs */
776 
777 /*
778  * We need to keep s_ident and s_inittime 32 bit.  They are used in mddb_lb
 *
 * In-core state for one set of database replicas.  Besides the anchors for
 * the master, directory, locator and locator-name blocks it holds the free
 * block bitmap, optimized-record locking state with its condition
 * variables, a free buffer list and the single-thread lock state.
779  */
780 typedef struct mddb_set {
781 	uint_t		s_setno;		/* set number */
782 	uint_t		s_sideno;		/* side number */
783 	identifier_t	s_ident;		/* set identifier */
784 	char		*s_setname;		/* set name */
785 	mddb_mb_ic_t	**s_mbiarray;		/* master blocks array */
786 	mddb_db_t	*s_dbp;			/* directory block */
787 	mddb_lb_t	*s_lbp;			/* locator block */
788 						/* May be cast to mddb_mnlb_t */
789 						/* if accessing sidenames in */
790 						/* MN diskset */
791 	mddb_ln_t	*s_lnp;			/* locator names block */
792 						/* May be cast to mddb_mnln_t */
793 						/* if accessing sidenames in */
794 						/* MN diskset */
795 	mddb_dtag_lst_t	*s_dtlp;		/* List of data tags found */
796 	mddb_did_ic_t	*s_did_icp;		/* Device ID incore area */
797 	mddb_ri_t	*s_rip;			/* replicas incore list */
798 	int		s_freeblkcnt;		/* visible for test code */
799 	int		s_totalblkcnt;		/* visible for test code */
800 	int		s_mn_parseflags;	/* mddb parse flags for MNset */
801 	int		s_mn_parseflags_sending; /* parse flgs sent to slaves */
802 	uchar_t		*s_freebitmap;		/* free blocks bitmap */
803 	uint_t		s_freebitmapsize;	/* size of bitmap */
804 	struct timeval32	s_inittime;	/* timestamp set created */
805 	mddb_recid_t	s_zombie;		/* zombie record - createrec */
806 	int		s_staledeletes;		/* number of stale deleterec */
807 	int		s_optcmtcnt;		/* Following are opt. record */
808 	int		s_opthavelck;		/*   bookkeeping records ... */
809 	int		s_optwantlck;
810 	kcondvar_t	s_optwantlck_cv;
811 	int		s_optwaiterr;
812 	int		s_opthungerr;
813 	kcondvar_t	s_opthungerr_cv;
814 	int		s_opthavequeuinglck;
815 	int		s_optwantqueuinglck;
816 	kcondvar_t	s_optqueuing_cv;
817 	ulong_t		s_bufmisses;
818 	mddb_bf_t	*s_freebufhead;
819 	int		s_bufwakeup;
820 	kcondvar_t	s_buf_cv;
821 	size_t		s_databuffer_size;
822 	void		*s_databuffer;
823 	int		s_singlelockgotten;
824 	int		s_singlelockwanted;
825 	kcondvar_t	s_single_thread_cv;
826 	md_hi_arr_t	s_med;
827 } mddb_set_t;
828 
829 #ifndef MDDB_FAKE
830 #ifdef _KERNEL
831 /*
 * md_mddb.c
 *
 * Kernel-only entry points: these declarations are visible only when
 * MDDB_FAKE is not defined and _KERNEL is defined.  Per the MDDB_E_*
 * definitions earlier in this header, mddb_getnextrec, mddb_getrecsize,
 * mddb_commitrec, mddb_commitrecs and mddb_deleterec can return
 * MDDB_E_NORECORD.
 */
832 extern uint_t			mddb_lb_did_convert(mddb_set_t *,
833 				    uint_t, uint_t *);
834 extern void			mddb_locatorblock2splitname(mddb_ln_t *,
835 				    int, side_t, md_splitname *);
836 extern int			mddb_configure(mddb_cfgcmd_t,
837 				    struct mddb_config *);
838 extern mddb_recid_t		mddb_getnextrec(mddb_recid_t,
839 				    mddb_type_t, uint_t);
840 extern int			mddb_getoptloc(mddb_optloc_t *);
841 extern void			*mddb_getrecaddr(mddb_recid_t);
842 extern void			*mddb_getrecaddr_resize(mddb_recid_t, size_t,
843 				    off_t);
844 extern int			mddb_getrecprivate(mddb_recid_t);
845 extern void			mddb_setrecprivate(mddb_recid_t, uint_t);
846 extern mddb_de_ic_t		*mddb_getrecdep(mddb_recid_t);
847 extern mddb_type_t		mddb_getrectype1(mddb_recid_t);
848 extern int			mddb_getrectype2(mddb_recid_t);
849 extern int			mddb_getrecsize(mddb_recid_t);
850 extern int			mddb_commitrec(mddb_recid_t);
851 extern int			mddb_commitrecs(mddb_recid_t *);
852 extern int			mddb_deleterec(mddb_recid_t);
853 extern mddb_recstatus_t		mddb_getrecstatus(mddb_recid_t);
854 extern mddb_recid_t		mddb_createrec(size_t usersize,
855 				    mddb_type_t type, uint_t type2,
856 				    md_create_rec_option_t option, set_t setno);
857 extern void			mddb_init(void);
858 extern void			mddb_unload(void);
859 extern void			mddb_unload_set(set_t setno);
860 extern mddb_recid_t		mddb_makerecid(set_t setno, mddb_recid_t id);
861 extern set_t			mddb_getsetnum(mddb_recid_t id);
862 extern char			*mddb_getsetname(set_t setno);
863 extern side_t			mddb_getsidenum(set_t setno);
864 extern int			mddb_ownset(set_t setno);
865 extern int			getmed_ioctl(mddb_med_parm_t *medpp, int mode);
866 extern int			setmed_ioctl(mddb_med_parm_t *medpp, int mode);
867 extern int			updmed_ioctl(mddb_med_upd_parm_t *medpp,
868 				    int mode);
869 extern int			take_set(mddb_config_t *cp, int mode);
870 extern int			release_set(mddb_config_t *cp, int mode);
871 extern int			gettag_ioctl(mddb_dtag_get_parm_t *dtgpp,
872 				    int mode);
873 extern int			usetag_ioctl(mddb_dtag_use_parm_t *dtupp,
874 				    int mode);
875 extern int			accept_ioctl(mddb_accept_parm_t *medpp,
876 				    int mode);
877 extern int			md_update_locator_namespace(set_t setno,
878 				    side_t side, char *dname, char *pname,
879 				    md_dev64_t devt);
880 extern int			mddb_validate_lb(set_t setno, int *rmaxsz);
881 extern int			mddb_getinvlb_devid(set_t setno, int count,
882 				    int size, char **ctdptr);
883 extern int			md_update_minor(set_t, side_t, mdkey_t);
884 extern int			md_update_nm_rr_did_ioctl(mddb_config_t *cp);
885 extern int			md_update_top_device_minor(set_t, side_t,
886 				    md_dev64_t);
887 #ifdef DEBUG
888 extern void			mddb_check(void);
889 #endif /* DEBUG */
890 #endif /* _KERNEL */
891 
892 #else
893 
/*
 * Scratch pointer used by the MDDB_FAKE stubs: the mddb_createrec()
 * stub stores its freshly allocated buffer here and the
 * mddb_getrecaddr() stub returns it.
 *
 * NOTE(review): this is a definition rather than an extern declaration,
 * so every file that includes this header with MDDB_FAKE defined gets
 * its own (tentative) definition -- confirm that is acceptable for the
 * debugging build.
 */
894 caddr_t mddb_fakeit;
895 
/*
 * MDDB_FAKE stubs.
 *
 * With MDDB_FAKE defined, the mddb entry points listed here are replaced
 * by macros that ignore their arguments and evaluate to a constant
 * (0, or MDDB_OK for mddb_getrecstatus()).  The two exceptions are
 * mddb_createrec(), which allocates a zeroed buffer with kmem_zalloc()
 * and remembers it in mddb_fakeit, and mddb_getrecaddr(), which returns
 * that remembered buffer.
 *
 * NOTE(review): the mddb_createrec() stub takes three arguments while
 * the real prototype takes five (usersize, type, type2, option, setno).
 * The arity is left as-is to keep this macro's interface unchanged, but
 * five-argument callers will not build with MDDB_FAKE.  Several entry
 * points declared in the non-fake branch have no stub here at all.
 */

/*
 * The real function is named mddb_lb_did_convert(); stub it under that
 * name.  The older md_lb_did_convert spelling is kept as an alias so
 * anything already using it still builds.
 */
#define	mddb_lb_did_convert(a, b, c)	(0)
#define	md_lb_did_convert(a, b, c)	mddb_lb_did_convert(a, b, c)
#define	mddb_configure(a, b)	(0)
#define	mddb_getnextrec(a, b, c)		((mddb_recid_t)0)
#define	mddb_getrecaddr(a)	(mddb_fakeit)
#define	mddb_getrecprivate(a)	(0)
#define	mddb_setrecprivate(a, b) (0)
#define	mddb_getrectype1(a)	(0)
#define	mddb_getrectype2(a)	(0)
#define	mddb_getrecsize(a)	(0)
#define	mddb_commitrec(a)	(0)
#define	mddb_commitrecs(a)	(0)
#define	mddb_deleterec(a)	(0)
#define	mddb_getrecstatus(a)	(MDDB_OK)
/*
 * The fake record id is the low 16 bits of the buffer address.  The
 * pointer is converted through uintptr_t before masking so that no
 * pointer is cast directly to a narrower int on LP64; the resulting
 * value is the same.
 */
#define	mddb_createrec(s, a, b)	\
	((int)(0xffff & (uintptr_t)(mddb_fakeit = \
	    (caddr_t)kmem_zalloc((s), KM_SLEEP))))
#define	mddb_unload()		(0)
912 
913 #endif
914 
/*
 * NOTE(review): presumably selects whether an mddb operation may block;
 * the consumers are not visible in this header -- confirm.  "Sleep OK"
 * is the zero value.
 */
915 #define	MDDB_NOSLEEP	1
916 #define	MDDB_SLEEPOK	0
917 
/*
 * Single-bit flag values (each is a distinct power of two, so they can
 * be OR-ed together).  NOTE(review): the routine these are passed to is
 * not visible in this header -- confirm before documenting further.
 */
918 #define	MDDB_NOOLDOK	0x1
919 #define	MDDB_MUSTEXIST	0x2
920 #define	MDDB_NOINIT	0x4
921 #define	MDDB_MULTINODE	0x8
922 #define	MDDB_MN_STALE	0x10	/* MN set is stale */
923 
/*
 * Flags passed to selectreplicas.  These are mutually exclusive
 * selector values (0, 1, 2), not a bit mask, so they must be compared
 * for equality rather than tested with '&'.
 */
925 #define	MDDB_SCANALL		1
926 #define	MDDB_RETRYSCAN		0
927 #define	MDDB_SCANALLSYNC	2	/* During reconfig, sync up incore */
928 					/* and ondisk mddb by writing incore */
929 					/* values to disk.  Don't write */
930 					/* change log records. */
931 
/* Flags passed to writestart and writecopy */
933 #define	MDDB_WRITECOPY_ALL	1	/* Write all incore mddb to disk */
934 #define	MDDB_WRITECOPY_SYNC	2	/* Write incore mddb to disk except */
935 					/*	- change log records */
936 					/*	- optimized resync records */
937 
938 
/*
 * NOTE(review): presumably a yes/no selector for probing; the consumer
 * is not visible in this header -- confirm.
 */
939 #define	MDDB_PROBE	1
940 #define	MDDB_NOPROBE	0
941 
942 
/*
 * MN diskset definitions used to determine if a slave can write
 * directly to the mddb.  MDDB_WR_ONLY_MASTER only allows the master
 * node to write to the mddb.  MDDB_WR_ANY_NODE allows any node to
 * write to the mddb.
 */
949 #define	MDDB_WR_ONLY_MASTER	0
950 #define	MDDB_WR_ANY_NODE	1
951 
/*
 * Record lock state bits.  NOTE(review): MDDB_L_WANTED presumably marks
 * that someone is waiting for a locked record -- confirm against the
 * code that sets it.
 */
952 #define	MDDB_L_LOCKED	0x0001	/* this record is locked */
953 #define	MDDB_L_WANTED	0x0002
954 
955 #ifdef	__cplusplus
956 }
957 #endif
958 
959 #endif	/* _SYS_MD_MDDB_H */
960