/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#ifndef _SYS_MD_MDDB_H
#define	_SYS_MD_MDDB_H

#pragma ident "%Z%%M% %I% %E% SMI"

#include <sys/types.h>
#include <sys/buf.h>

#ifdef __cplusplus
extern "C" {
#endif

/* Flip the "#if 0" to build against the MDDB_FAKE stub macros instead. */
#if 0 /* DRP FOR DEBUGGING */
#define	MDDB_FAKE
#endif

/* Private flags */
#define	MD_PRV_GOTIT		0x0001	/* Been snarfed */
#define	MD_PRV_DELETE		0x0002	/* Record pending to be deleted */
#define	MD_PRV_COMMIT		0x0004	/* Record pending to be committed */
#define	MD_PRV_CLEANUP		0x0008	/* Record pending to be cleaned up */
#define	MD_PRV_CONVD		0x0010	/* Record has been converted (32->64) */
/* Composite values: MD_PRV_GOTIT combined with one pending-operation bit */
#define	MD_PRV_PENDDEL		(MD_PRV_GOTIT | MD_PRV_DELETE)
#define	MD_PRV_PENDCOM		(MD_PRV_GOTIT | MD_PRV_COMMIT)
#define	MD_PRV_PENDCLEAN	(MD_PRV_GOTIT | MD_PRV_CLEANUP)


/* MDDB error codes; all values are negative */
#define	MDDB_E_INVALID	(-1)	/* an invalid argument was passed */
#define	MDDB_E_EXISTS	(-2)	/* doing an operation a 2nd time which can */
				/* only be done once */
#define	MDDB_E_MASTER	(-3)	/* problem occurred accessing master block */
				/* returned from NEW_DEV */
#define	MDDB_E_TOOSMALL	(-4)	/* device is not large enough */
#define	MDDB_E_NORECORD	(-5)	/* record does not exist */
				/*
				 * returned from:	mddb_getnextrec
				 *			mddb_getrecsize
				 *			mddb_commitrec
				 *			mddb_commitrecs
				 *			mddb_deleterec
				 */
#define	MDDB_E_NOSPACE	(-6)	/* no space to create record */
#define	MDDB_E_NOTNOW	(-7)	/* do not presently have enough resources */
				/* to perform requested operation */
#define	MDDB_E_NODB	(-8)	/* no database exists */
#define	MDDB_E_NOTOWNER	(-9)	/* have not been told to grab this set */
#define	MDDB_E_STALE	(-10)	/* database is stale */
#define	MDDB_E_TOOFEW	(-11)	/* not enough replicas available */
#define	MDDB_E_TAGDATA	(-12)	/* tagged data detected */
#define	MDDB_E_ACCOK	(-13)	/* 50/50 mode */
#define	MDDB_E_NTAGDATA	(-14)	/* tagop try, no tag data */
#define	MDDB_E_ACCNOTOK	(-15)	/* accop try, no accept possible */
#define	MDDB_E_NOLOCBLK	(-16)	/* No valid locators found */
#define	MDDB_E_NOLOCNMS	(-17)	/* No valid locator name information */
#define	MDDB_E_NODIRBLK	(-18)	/* No directory blocks found */
#define	MDDB_E_NOTAGREC	(-19)	/* No tag record blocks found */
#define	MDDB_E_NOTAG	(-20)	/* No matching tag record found */
#define	MDDB_E_NODEVID	(-21)	/* No device id found */

#define	MDDB_MINBLKS		16	/* enough for a few metadevices */
#define	MDDB_MAXBLKS		8192	/* size of free bit map (must be / 8) */
#define	MDDB_MN_MINBLKS		32768	/* Multinode metadb minimum size */
					/* 16MB */
#define	MDDB_MN_MAXBLKS		524288	/* size of free bit map (must be / 8) */
					/* 256MB */

#define	MDDB_C_STALE		0x0001
#define	MDDB_C_TOOFEW		0x0002
#define	MDDB_C_NOTOWNER		0x0004
#define	MDDB_C_SET_MN_STALE	0x0008	/* Set MN set to stale */
#define	MDDB_C_IMPORT		0x0010

/*
 * Defines used to set/reset new master flag in set structure.
 * Used during reconfig cycle to determine quickly if there is
 * new master for the set.
 */
#define	MDDB_NM_SET		0x0001
#define	MDDB_NM_RESET		0x0002
#define	MDDB_NM_GET		0x0004

/* Definitions of flag in Locator Block Device ID data area - mddb_did_info */
#define	MDDB_DID_EXISTS		0x0001	/* Device ID exists */
#define	MDDB_DID_VALID		0x0002	/* Device ID valid on current system */
#define	MDDB_DID_UPDATED	0x0004	/* locator/sidelocator info updated */

/* Definitions of flag in Locator Block - mddb_lb */
#define	MDDB_DEVID_STYLE	0x0001	/* Locator Block in Device ID format */
#define	MDDB_MNSET		0x0002	/* MDDB is for a multi-node set */


#define	MDDB_MAX_PATCH	25		/* number of locations that */
					/* can be patched in etc/system */

/*
 * Set struct used by all parts of the driver, to store anchor pointers.
 *
 * Lock associated with field in this structure:
 *
 * Some of the fields are accessible by both the single threaded ioctl thread
 * and internal threads such as resync, hotsparing...etc. In this case
 * additional protection is needed. For example, s_db is protected by
 * s_dbmx additionally and s_un, s_ui are protected by md_unit_array_rw.lock
 * s_nm, s_nmid, s_did_nm and s_did_nmid and s_dtp are protected by nm_lock
 * Rest of other fields are protected by md_mx. Two fields s_un_next and
 * s_un_avail are introduced by the friendly name project and are ONLY
 * accessible via a single threaded ioctl thread which already is protected
 * by the ioctl lock and there is no need to add extra protection to them.
 * However, in the future if they become accessible by other internal threads
 * then an additional protection such as md_mx lock is highly recommended.
 *
 */
typedef struct md_set {
	uint_t		s_status;	/* set status */
	void		**s_ui;		/* set unit incore anchor */
	void		**s_un;		/* set unit anchor */
	void		*s_hsp;		/* set Hot Spare Pool anchor */
	void		*s_hs;		/* set Hot Spare anchor */
	void		*s_db;		/* set MDDB anchor */
	kmutex_t	s_dbmx;		/* set MDDB mutex */
	void		*s_nm;		/* set namespace anchor */
	mddb_recid_t	s_nmid;		/* set namespace anchor record */
	void		*s_did_nm;	/* set device id namespace anchor */
	mddb_recid_t	s_did_nmid;	/* set device id namespace anchor rec */
	void		*s_dtp;		/* set data tag rec */
	int		s_am_i_master;	/* incore master flag for this node */
	md_mn_nodeid_t	s_nodeid;	/* nodeid of this node - for MN sets */
	uint_t		s_rcnt;		/* incore resync count for set */
	unit_t		s_un_next;	/* s_un scan starts here */
	unit_t		s_un_avail;	/* number of avail slots */
} md_set_t;


/*
 * Block magic numbers.  Each value is four ASCII characters: "md"
 * followed by a two-letter block tag (e.g. 0x6d646d62 is "mdmb").
 */
#define	MDDB_MAGIC_MB	0x6d646d62	/* magic number for master blocks */
#define	MDDB_MAGIC_DB	0x6d646462	/* magic number for directory blocks */
#define	MDDB_MAGIC_RB	0x6d647262	/* magic number for record blocks */
#define	MDDB_MAGIC_LB	0x6d646c62	/* magic number for locator blocks */
#define	MDDB_MAGIC_LN	0x6d646c6e	/* magic number for locator names */
#define	MDDB_MAGIC_DT	0x6d646474	/* magic number for data tag */
#define	MDDB_MAGIC_DI	0x6d646469	/* magic number for device ID block */
#define	MDDB_MAGIC_DU	0x6d646475	/* magic num for dummy mb */
#define	MDDB_MAGIC_DE	0x6d646465	/* magic num for mb devid */

#define	MDDB_GLOBAL_XOR	1234567890

/* Masks selecting the major (high byte) and minor (low byte) revision */
#define	MDDB_REV_MAJOR	(uint_t)0xff00
#define	MDDB_REV_MINOR	(uint_t)0x00ff

/*
 * MDDB_REV_MNMB:
 *	If a MN diskset, master block revision is set to MDDB_REV_MNMB.
 *	Even though the master block structure is no different
 *	for a MN set, setting the revision field to a different
 *	number keeps any pre-MN_diskset code from accessing
 *	this diskset. It also allows for an early determination
 *	of a MN diskset when reading in from disk so that the
 *	proper size locator block and locator names structure
 *	can be read in thus saving time on diskset startup.
 *	Since no change in master block structure, the MDDB_REV_MINOR
 *	portion of the revision was incremented.
 *
 * MDDB_REV_MNLB:
 *	If a MN diskset, the locator block structure is a different size in
 *	order to accommodate up to MD_MNMAXSIDES nodes in a diskset
 *	with any nodeid (sideno) allowed.
 *	The revision is set to MDDB_REV_MNLB which is a change of the
 *	MDDB_REV_MAJOR portion of the revision.
 *
 * MDDB_REV_MNLN:
 *	If a MN diskset, the locator names is a different size in
 *	order to accommodate up to MD_MNMAXSIDES nodes in a diskset
 *	with any nodeid (sideno) allowed.
 *	The revision is set to MDDB_REV_MNLN which is a change of the
 *	MDDB_REV_MAJOR portion of the revision.
 *
 * The record blocks have two binary properties. A record block can
 * represent either a 32 or 64 bit unit. A record block can also represent
 * a traditionally named unit or a friendly named unit. Thus, there are
 * minor revisions of record block.
205 * 206 * Traditional Friendly 207 * Name Name 208 * ----------- -------- 209 * 32 bit MDDB_REV_RB MDDB_REV_RBFN 210 * 64 bit MDDB_REV_RB64 MDDB_REV_RB64FN 211 */ 212 213 #define MDDB_REV_MB (uint_t)0x0201 214 #define MDDB_REV_MNMB (uint_t)0x0202 215 #define MDDB_REV_DB (uint_t)0x0201 216 #define MDDB_REV_LB (uint_t)0x0500 217 #define MDDB_REV_MNLB (uint_t)0x0600 218 #define MDDB_REV_LN (uint_t)0x0100 219 #define MDDB_REV_MNLN (uint_t)0x0300 220 #define MDDB_REV_RB (uint_t)0x0200 221 #define MDDB_REV_RB64 (uint_t)0x0201 222 #define MDDB_REV_RBFN (uint_t)0x0202 223 #define MDDB_REV_RB64FN (uint_t)0x0203 224 #define MDDB_REV_DT (uint_t)0x0100 225 #define MDDB_REV_DI (uint_t)0x0100 226 227 /* 228 * Transfer record block friendly name status to unit/hs structure. 229 */ 230 #define NOTE_FN(rbv, unv) switch (rbv) { \ 231 case MDDB_REV_RB: \ 232 case MDDB_REV_RB64: \ 233 unv &= ~MD_FN_META_DEV; \ 234 break; \ 235 case MDDB_REV_RBFN: \ 236 case MDDB_REV_RB64FN: \ 237 unv |= MD_FN_META_DEV; \ 238 break; \ 239 } 240 241 #define MDDB_BSIZE (uint_t)DEV_BSIZE 242 #define MDDB_PREFIXCNT 10 243 #define MDDB_DRVNMCNT 10 244 245 typedef int mddb_block_t; 246 247 #if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 248 #pragma pack(4) 249 #endif 250 typedef struct md_mnname_suffix { 251 md_name_suffix mn_ln_suffix; 252 uint_t mn_ln_sideno; 253 } md_mnname_suffix_t; 254 255 typedef struct mddb_ln { 256 int ln_magic; 257 uint_t ln_revision; 258 uint_t ln_checksum; 259 struct timeval32 ln_timestamp; 260 md_name_prefix ln_prefixes[MDDB_PREFIXCNT]; 261 /* Don't change array sizes without changing RNDUP_BLKCNT */ 262 md_name_suffix ln_suffixes[MD_MAXSIDES][MDDB_NLB]; 263 } mddb_ln_t; 264 265 /* 266 * Locator name structure for MN diskset. 
 * Same as for traditional
 * and local diskset except that more sides are supported and the
 * side number can be any number since the side number is stored
 * in the ln_mnsuffixes structure instead of being used as an index
 * into that array. This means that the whole array may need to be
 * searched in order to find the correct information given a side number.
 */
typedef struct mddb_mnln {
	int			ln_magic;
	uint_t			ln_revision;
	uint_t			ln_checksum;
	struct timeval32	ln_timestamp;
	md_name_prefix		ln_prefixes[MDDB_PREFIXCNT];
	/* Don't change array sizes without changing MDDB_MNLNCNT */
	md_mnname_suffix_t	ln_mnsuffixes[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnln_t;

/*
 * Number of MDDB_BSIZE blocks (rounded up) needed for a structure of sz
 * bytes after subtracting delta bytes for each of the
 * (MD_MAXSIDES - 1) * MDDB_NLB per-side entries.  A delta of 0 counts the
 * full structure; the MDDB_LOCAL_* variants pass the per-entry size, so
 * only one side's worth of entries is counted.
 */
#define	RNDUP_BLKCNT(sz, delta)	(((sz) - \
				((delta) * \
				((MD_MAXSIDES - 1) * MDDB_NLB)) + \
				MDDB_BSIZE - 1) / MDDB_BSIZE)
#define	MDDB_LNCNT		RNDUP_BLKCNT(sizeof (mddb_ln_t), 0)
#define	MDDB_LOCAL_LNCNT	RNDUP_BLKCNT(sizeof (mddb_ln_t), \
				    sizeof (md_name_suffix))

/* Block count (rounded up) of the full MN locator names structure */
#define	MDDB_MNLNCNT		((sizeof (mddb_mnln_t) + (MDDB_BSIZE - 1)) \
				    / MDDB_BSIZE)

/* Data tag block: magic/revision/checksum header followed by the tag */
typedef struct mddb_dt {
	uint_t		dt_mag;
	uint_t		dt_rev;
	uint_t		dt_cks;
	mddb_dtag_t	dt_dtag;
} mddb_dt_t;

#define	MDDB_DT_BYTES	(roundup(sizeof (mddb_dt_t), MDDB_BSIZE))
#define	MDDB_DT_BLOCKS	(btodb(MDDB_DT_BYTES))

typedef union identifier {
	char			serial[MDDB_SN_LEN];
	struct timeval32	createtime;
} identifier_t;

typedef struct mddb_locator {
	dev32_t		l_dev;
	daddr32_t	l_blkno;
	int		l_flags;
} mddb_locator_t;

typedef struct mddb_sidelocator {
	uchar_t		l_drvnm_index;
	minor_t		l_mnum;
} mddb_sidelocator_t;

/* Side locator for MN disksets; carries its own side number */
typedef struct mddb_mnsidelocator {
	uchar_t		mnl_drvnm_index;
	minor_t		mnl_mnum;
	uint_t		mnl_sideno;
} mddb_mnsidelocator_t;

typedef struct mddb_drvnm {
	uchar_t		dn_len;
	char		dn_data[MD_MAXDRVNM];
} mddb_drvnm_t;

/*
 * Locator Block Device ID
 * Information
 *	Several device id's may share one disk block in an effort to
 *	conserve used replica space.
 */
typedef struct mddb_did_info {
	uint_t		info_flags;	/* MDDB Device ID flags */
	uint_t		info_firstblk;	/* Device ID Start Block */
	uint_t		info_blkcnt;	/* Device ID Block Count */
	uint_t		info_offset;	/* Device ID offset w/i Block */
	uint_t		info_length;	/* Device ID Length */
	uint_t		info_checksum;	/* Device ID Checksum */
	char		info_minor_name[32]; /* Minor name of lb dev */
} mddb_did_info_t;

/* Device ID block: verification header plus one info entry per locator */
typedef struct mddb_did_blk {
	int		blk_magic;	/* used for verification */
	uint_t		blk_revision;	/* used for verification */
	int		blk_checksum;	/* used for verification */
	uint_t		blk_commitcnt;	/* matches LB's commitcnt */
	mddb_did_info_t	blk_info[MDDB_NLB];
} mddb_did_blk_t;
/* End of the 4-byte packed region (restores default packing) */
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack()
#endif

#define	MDDB_DID_BYTES	(roundup(sizeof (mddb_did_blk_t), MDDB_BSIZE))
#define	MDDB_DID_BLOCKS	(btodb(MDDB_DID_BYTES))

/*
 * Device ID Disk Blocks.
 * Incore linked list of disk blocks containing device IDs.
 * The list is built when reading in the mddb_did_blk structure and
 * when reading in the actual disk blocks containing device ids.
 * This list is used to easily write out all disk blocks containing
 * device ids.
 */
typedef struct mddb_did_db {
	uint_t		db_firstblk;	/* Disk Block's logical addr */
	uint_t		db_blkcnt;	/* Contig Disk Block Count */
	caddr_t		db_ptr;		/* Ptr to incore Block(s) */
	struct mddb_did_db *db_next;	/* Ptr to next in list */
} mddb_did_db_t;

/*
 * Device ID Free List.
 * Incore linked list of free space in disk blocks containing device IDs.
 * Used to manage placement of device IDs in disk blocks.
 * All disk blocks on free list are also in linked list of disk block
 * containing device IDs (mddb_did_db_t).
 */
typedef struct mddb_did_free {
	uint_t		free_blk;	/* Disk Block's logical addr */
	uint_t		free_offset;	/* offset of free space */
	uint_t		free_length;	/* length of free space */
	struct mddb_did_free *free_next; /* Ptr to next in list */
} mddb_did_free_t;

/*
 * Device ID Incore Area
 * Contains pointer to Device ID Disk Block list and
 * Device ID Free List.
 * Also contains incore array of pointers to device IDs. Pointers
 * point into the device ID Disk Block list and are used as a
 * shortcut to find incore device IDs.
 */
typedef struct mddb_did_ic {
	mddb_did_blk_t	*did_ic_blkp;
	mddb_did_db_t	*did_ic_dbp;
	mddb_did_free_t	*did_ic_freep;
	ddi_devid_t	did_ic_devid[MDDB_NLB]; /* Ptr to device IDs */
} mddb_did_ic_t;

/*
 * Locator Block (LB):
 *	- Are fixed size, but the size is different
 *		for local/shared set db replicas.
 *	- All LB's start at logical block 0.
 *	- After a replica quorum is found, there
 *		is only one incore copy of the LB.
 *	- LB's are only written when replicas are added, deleted, or errored.
 *	- LB's provide information about other replicas and their state.
 */
/* mddb_lb is packed to 4-byte alignment under the same condition as above */
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack(4)
#endif
typedef struct mddb_lb {
	int		lb_magic;	/* used for verification */
	uint_t		lb_revision;	/* used for verification */
	int		lb_checksum;	/* used for verification */
	uint_t		lb_commitcnt;	/* IMPORTANT */
	struct timeval32 lb_timestamp;	/* informative only */
	int		lb_loccnt;	/* used for verification */
	identifier_t	lb_ident;	/* used for verification */
	uint_t		lb_flags;	/* flags describing LB */
	uint_t		lb_spare[8];	/* Spare/Pad */
	mddb_block_t	lb_didfirstblk;	/* Devid Array Start Block */
	mddb_block_t	lb_didblkcnt;	/* Devid Array Number Blocks */
	mddb_block_t	lb_dtfirstblk;	/* Data Tag Start Block */
	mddb_block_t	lb_dtblkcnt;	/* Data Tag Number Block(s) */
	struct timeval32 lb_inittime;	/* creation of database */
	set_t		lb_setno;	/* used for verification */
	mddb_block_t	lb_blkcnt;	/* used for verification */
	mddb_block_t	lb_lnfirstblk;
	mddb_block_t	lb_lnblkcnt;
	mddb_block_t	lb_dbfirstblk;
	mddb_drvnm_t	lb_drvnm[MDDB_DRVNMCNT];
	mddb_locator_t	lb_locators[MDDB_NLB];
	/* Don't change array sizes without changing RNDUP_BLKCNT */
	mddb_sidelocator_t lb_sidelocators[MD_MAXSIDES][MDDB_NLB];
} mddb_lb_t;
#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4
#pragma pack()
#endif

/*
 * Locator block structure for MN diskset. Same as for traditional
 * and local diskset except that more sides are supported and the
 * side number can be any number since the side number is stored
 * in the lb_mnsidelocators structure instead of being used as an index
 * into that array. This means that the whole array may need to be
 * searched in order to find the correct information given a side number.
 */
typedef struct mddb_mnlb {
	int		lb_magic;	/* used for verification */
	uint_t		lb_revision;	/* used for verification */
	int		lb_checksum;	/* used for verification */
	uint_t		lb_commitcnt;	/* IMPORTANT */
	struct timeval32 lb_timestamp;	/* informative only */
	int		lb_loccnt;	/* used for verification */
	identifier_t	lb_ident;	/* used for verification */
	uint_t		lb_flags;	/* flags describing LB */
	uint_t		lb_spare[8];	/* Spare/Pad */
	mddb_block_t	lb_didfirstblk;	/* Devid Array Start Block */
	mddb_block_t	lb_didblkcnt;	/* Devid Array Number Blocks */
	mddb_block_t	lb_dtfirstblk;	/* Data Tag Start Block */
	mddb_block_t	lb_dtblkcnt;	/* Data Tag Number Block(s) */
	struct timeval32 lb_inittime;	/* creation of database */
	set_t		lb_setno;	/* used for verification */
	mddb_block_t	lb_blkcnt;	/* used for verification */
	mddb_block_t	lb_lnfirstblk;
	mddb_block_t	lb_lnblkcnt;
	mddb_block_t	lb_dbfirstblk;
	mddb_drvnm_t	lb_drvnm[MDDB_DRVNMCNT];
	mddb_locator_t	lb_locators[MDDB_NLB];
	/* Don't change array sizes without changing MDDB_MNLBCNT */
	mddb_mnsidelocator_t lb_mnsidelocators[MD_MNMAXSIDES][MDDB_NLB];
} mddb_mnlb_t;


/*
 * Locator block sizes in MDDB_BSIZE blocks.  MDDB_LOCAL_LBCNT passes the
 * size of one side locator as the RNDUP_BLKCNT delta; MDDB_MNLBCNT is the
 * full MN locator block rounded up.
 */
#define	MDDB_LBCNT		RNDUP_BLKCNT(sizeof (mddb_lb_t), 0)
#define	MDDB_LOCAL_LBCNT	RNDUP_BLKCNT(sizeof (mddb_lb_t), \
				    sizeof (mddb_sidelocator_t))

#define	MDDB_MNLBCNT		((sizeof (mddb_mnlb_t) + (MDDB_BSIZE - 1)) \
				    / MDDB_BSIZE)

typedef struct mddb_map {
	daddr32_t	m_consecutive;
	daddr32_t	m_firstblk;
} mddb_map_t;

/*
 * Master block(s) (MB)
 *	- Are written by userland; Never by the driver!
 *	- Each replica has their own master blocks,
 *		the master block(s) are not shared.
 *	- MB's are not in the logical block address space of the database.
 *	- MB's are a fixed size record (MDDB_BSIZE)
 *	- MB's provide the logical to physical block translation,
 *		for their replica.
 */
typedef struct mddb_mb {
	int		mb_magic;	/* used for verification */
	uint_t		mb_revision;	/* used for verification */
	uint_t		mb_checksum;	/* used for verification */
	/* mb_next is kept 32 bits wide under _LP64 as well */
#ifdef _LP64
	uint32_t	mb_next;	/* incore to next mb */
#else
	struct mddb_mb	*mb_next;	/* incore to next mb */
#endif /* _LP64 */
	daddr32_t	mb_nextblk;	/* block # for next mb */
	md_timeval32_t	mb_timestamp;	/* timestamp */
	daddr32_t	mb_blkcnt;	/* size of blkmap */
	daddr32_t	mb_blkno;	/* physical loc. for this MB */
	set_t		mb_setno;	/* used for verification */
	struct timeval32 mb_setcreatetime; /* set creation timestamp */
	int		spares[7];
	mddb_map_t	mb_blkmap;	/* logical->physical blk map */
	int		mb_devid_magic;	/* verify devid in mb */
	short		mb_devid_len;	/* len of following devid */
	char		mb_devid[1];	/* devid byte array */
} mddb_mb_t;

/*
 * In-core version of mddb_mb. It is known that the mddb_mb is 512 bytes on
 * disk, really, and so this structure is 512 + sizeof(struct mddb_mb_ic *)
 */
#define	MDDB_IC_BSIZE	(MDDB_BSIZE + sizeof (struct mddb_mb_ic *))
typedef struct mddb_mb_ic {
	struct mddb_mb_ic	*mbi_next;
	struct mddb_mb		mbi_mddb_mb;
} mddb_mb_ic_t;


/*
 * there can be no address in record block. The checksum must
 * stay the same where ever the record is in memory. Many
 * things depend on this. Also the timestamp is the time the
 * record was committed not the time it was written to a particular
 * device.
542 * 543 * Old definition of mddb_rb, for 32-bit apps and libraries 544 */ 545 typedef struct mddb_rb { 546 uint_t rb_magic; 547 uint_t rb_revision; 548 uint_t rb_checksum; 549 uint_t rb_checksum_fiddle; 550 uint_t rb_private; 551 void *rb_userdata; 552 uint_t rb_commitcnt; 553 uint_t rb_spare[1]; 554 struct timeval32 rb_timestamp; 555 int rb_data[1]; 556 } mddb_rb_t; 557 558 /* This is, and always will be, the on-disk version of mddb_rb */ 559 typedef struct mddb_rb32 { 560 uint_t rb_magic; 561 uint_t rb_revision; 562 uint_t rb_checksum; 563 uint_t rb_checksum_fiddle; 564 uint_t rb_private; 565 uint32_t rb_userdata; 566 uint_t rb_commitcnt; 567 uint_t rb_spare[1]; 568 struct timeval32 rb_timestamp; 569 int rb_data[1]; 570 } mddb_rb32_t; 571 572 /* 573 * directory entries 574 */ 575 typedef struct mddb_optinfo { 576 int o_li; 577 int o_flags; 578 } mddb_optinfo_t; 579 580 /* Old definition of mddb_de, for 32-bit apps and libraries */ 581 typedef struct mddb_de { 582 struct mddb_de *de_next; 583 mddb_rb_t *de_rb; 584 mddb_recid_t de_recid; 585 mddb_type_t de_type1; 586 uint_t de_type2; 587 uint_t de_reqsize; 588 uint_t de_recsize; 589 mddb_block_t de_blkcount; 590 uint_t de_flags; 591 mddb_optinfo_t de_optinfo[2]; 592 mddb_block_t de_blks[1]; 593 } mddb_de_t; 594 595 /* 596 * In core version of mddb_de, includes pointer for mddb_rb32_t user data 597 * mddb_rb32_t is used incore 598 */ 599 typedef struct mddb_de_ic { 600 void *de_rb_userdata; 601 void *de_rb_userdata_ic; 602 uint_t de_owner_nodeid; 603 struct mddb_de_ic *de_next; 604 mddb_rb32_t *de_rb; 605 mddb_recid_t de_recid; 606 mddb_type_t de_type1; 607 uint_t de_type2; 608 size_t de_reqsize; 609 size_t de_icreqsize; 610 size_t de_recsize; 611 uint_t de_blkcount; 612 uint_t de_flags; 613 mddb_optinfo_t de_optinfo[2]; 614 mddb_block_t de_blks[1]; 615 } mddb_de_ic_t; 616 617 typedef struct mddb_db { 618 uint_t db_magic; 619 uint_t db_revision; 620 uint_t db_checksum; 621 mddb_block_t db_blknum; 622 struct 
mddb_db *db_next; 623 mddb_block_t db_nextblk; 624 struct timeval32 db_timestamp; 625 uint_t db_recsum; 626 #ifdef _KERNEL 627 mddb_de_ic_t *db_firstentry; 628 #else 629 mddb_de_t *db_firstentry; 630 #endif 631 } mddb_db_t; 632 633 /* 634 * This is, and always will be, the on-disk version of mddb_de 635 * When mddb_de32 is read in it is converted into mddb_de_ic 636 */ 637 typedef struct mddb_de32 { 638 uint32_t de32_next; 639 uint32_t de32_rb; 640 mddb_recid_t de32_recid; 641 mddb_type_t de32_type1; 642 uint_t de32_type2; 643 uint_t de32_reqsize; 644 uint_t de32_recsize; 645 mddb_block_t de32_blkcount; 646 uint_t de32_flags; 647 mddb_optinfo_t de32_optinfo[2]; 648 mddb_block_t de32_blks[1]; 649 } mddb_de32_t; 650 651 /* 652 * This is, and always will be, the on-disk version of mddb_db 653 * When mddb_db32 is read in it is converted into mddb_db 654 * To minimize impact on mddb format mddb_db fileds remain intact 655 */ 656 typedef struct mddb_db32 { 657 uint_t db32_magic; 658 uint_t db32_revision; 659 uint_t db32_checksum; 660 mddb_block_t db32_blknum; 661 uint32_t db32_next; 662 mddb_block_t db32_nextblk; 663 struct timeval32 db32_timestamp; 664 uint_t db32_recsum; 665 uint32_t db32_firstentry; 666 } mddb_db32_t; 667 668 #define de32tode(from, to) \ 669 { \ 670 int i; \ 671 to->de_rb_userdata = NULL; \ 672 to->de_owner_nodeid = MD_MN_INVALID_NID; \ 673 to->de_next = (struct mddb_de_ic *)(uintptr_t)from->de32_next; \ 674 to->de_rb = (mddb_rb32_t *)(uintptr_t)from->de32_rb; \ 675 to->de_recid = from->de32_recid; \ 676 to->de_type1 = from->de32_type1; \ 677 to->de_type2 = from->de32_type2; \ 678 to->de_reqsize = from->de32_reqsize; \ 679 to->de_recsize = from->de32_recsize; \ 680 to->de_blkcount = from->de32_blkcount; \ 681 to->de_flags = from->de32_flags; \ 682 to->de_optinfo[0] = from->de32_optinfo[0]; \ 683 to->de_optinfo[1] = from->de32_optinfo[1]; \ 684 for (i = 0; i < from->de32_blkcount; i++) \ 685 to->de_blks[i] = from->de32_blks[i]; \ 686 } 687 688 #define 
detode32(from, to) \ 689 { \ 690 int i; \ 691 to->de32_next = (uint32_t)(uintptr_t)from->de_next; \ 692 to->de32_rb = (uint32_t)(uintptr_t)from->de_rb; \ 693 to->de32_recid = from->de_recid; \ 694 to->de32_type1 = from->de_type1; \ 695 to->de32_type2 = from->de_type2; \ 696 to->de32_reqsize = from->de_reqsize; \ 697 to->de32_recsize = from->de_recsize; \ 698 to->de32_blkcount = from->de_blkcount; \ 699 to->de32_flags = from->de_flags; \ 700 to->de32_optinfo[0] = from->de_optinfo[0]; \ 701 to->de32_optinfo[1] = from->de_optinfo[1]; \ 702 for (i = 0; i < from->de_blkcount; i++) \ 703 to->de32_blks[i] = from->de_blks[i]; \ 704 } 705 706 #define db32todb(from, to) \ 707 to->db_magic = from->db32_magic; \ 708 to->db_revision = from->db32_revision; \ 709 to->db_checksum = from->db32_checksum; \ 710 to->db_blknum = from->db32_blknum; \ 711 to->db_next = (struct mddb_db *)(uintptr_t)from->db32_next; \ 712 to->db_nextblk = from->db32_nextblk; \ 713 to->db_timestamp = from->db32_timestamp; \ 714 to->db_recsum = from->db32_recsum; \ 715 to->db_firstentry = (mddb_de_ic_t *)(uintptr_t)from->db32_firstentry; 716 717 #define dbtodb32(from, to) \ 718 to->db32_magic = from->db_magic; \ 719 to->db32_revision = from->db_revision; \ 720 to->db32_checksum = from->db_checksum; \ 721 to->db32_blknum = from->db_blknum; \ 722 to->db32_next = (uint32_t)(uintptr_t)from->db_next; \ 723 to->db32_nextblk = from->db_nextblk; \ 724 to->db32_timestamp = from->db_timestamp; \ 725 to->db32_recsum = from->db_recsum; \ 726 to->db32_firstentry = (uint32_t)(uintptr_t)from->db_firstentry; 727 728 /* 729 * information about a replica of the data base 730 */ 731 typedef struct mddb_ri { 732 struct mddb_ri *ri_next; 733 uint_t ri_flags; 734 uint_t ri_commitcnt; 735 int ri_transplant; 736 md_dev64_t ri_dev; 737 daddr32_t ri_blkno; 738 char ri_driver[16]; 739 mddb_mb_ic_t *ri_mbip; 740 mddb_lb_t *ri_lbp; 741 mddb_dt_t *ri_dtp; 742 mddb_did_ic_t *ri_did_icp; 743 ddi_devid_t ri_devid; 744 ddi_devid_t 
ri_old_devid; 745 char ri_minor_name[MDDB_MINOR_NAME_MAX]; 746 char ri_devname[MAXPATHLEN]; 747 } mddb_ri_t; 748 749 typedef struct mddb_bf { 750 struct mddb_bf *bf_next; 751 mddb_locator_t *bf_locator; 752 buf_t bf_buf; 753 } mddb_bf_t; 754 755 /* 756 * Information for sets of databases (which include replicas) 757 */ 758 #define MDDB_BITSRECID 31 759 #define MDDB_SETSHIFT (MDDB_BITSRECID - MD_BITSSET) 760 #define MDDB_SETMASK (MD_SETMASK << MDDB_SETSHIFT) 761 #define MDDB_RECIDMASK ((1 << MDDB_SETSHIFT) - 1) 762 763 #define DBSET(id) (((id) & MDDB_SETMASK) >> MDDB_SETSHIFT) 764 #define DBID(id) ((id) & MDDB_RECIDMASK) 765 #define MAKERECID(s, i) ((((s) << MDDB_SETSHIFT) & MDDB_SETMASK) | \ 766 ((i) & MDDB_RECIDMASK)) 767 768 #define MDDB_PARSE_LOCBLK 0x00000001 769 #define MDDB_PARSE_LOCNM 0x00000002 770 #define MDDB_PARSE_OPTRECS 0x00000004 771 #define MDDB_PARSE_MASK 0x0000000F 772 773 774 #define MDDB_BLOCK_PARSE 0x00000001 /* Block sending parse msgs */ 775 #define MDDB_UNBLOCK_PARSE 0x00000002 /* Unblock sending parse msgs */ 776 777 /* 778 * We need to keep s_ident and s_inittime 32 bit. 
They are used in mddb_lb 779 */ 780 typedef struct mddb_set { 781 uint_t s_setno; /* set number */ 782 uint_t s_sideno; /* side number */ 783 identifier_t s_ident; /* set identifier */ 784 char *s_setname; /* set name */ 785 mddb_mb_ic_t **s_mbiarray; /* master blocks array */ 786 mddb_db_t *s_dbp; /* directory block */ 787 mddb_lb_t *s_lbp; /* locator block */ 788 /* May be cast to mddb_mnlb_t */ 789 /* if accessing sidenames in */ 790 /* MN diskset */ 791 mddb_ln_t *s_lnp; /* locator names block */ 792 /* May be cast to mddb_mnln_t */ 793 /* if accessing sidenames in */ 794 /* MN diskset */ 795 mddb_dtag_lst_t *s_dtlp; /* List of data tags found */ 796 mddb_did_ic_t *s_did_icp; /* Device ID incore area */ 797 mddb_ri_t *s_rip; /* replicas incore list */ 798 int s_freeblkcnt; /* visable for test code */ 799 int s_totalblkcnt; /* visable for test code */ 800 int s_mn_parseflags; /* mddb parse flags for MNset */ 801 int s_mn_parseflags_sending; /* parse flgs sent to slaves */ 802 uchar_t *s_freebitmap; /* free blocks bitmap */ 803 uint_t s_freebitmapsize; /* size of bitmap */ 804 struct timeval32 s_inittime; /* timestamp set created */ 805 mddb_recid_t s_zombie; /* zombie record - createrec */ 806 int s_staledeletes; /* number of stale deleterec */ 807 int s_optcmtcnt; /* Following are opt. record */ 808 int s_opthavelck; /* bookkeeping records ... 
*/ 809 int s_optwantlck; 810 kcondvar_t s_optwantlck_cv; 811 int s_optwaiterr; 812 int s_opthungerr; 813 kcondvar_t s_opthungerr_cv; 814 int s_opthavequeuinglck; 815 int s_optwantqueuinglck; 816 kcondvar_t s_optqueuing_cv; 817 ulong_t s_bufmisses; 818 mddb_bf_t *s_freebufhead; 819 int s_bufwakeup; 820 kcondvar_t s_buf_cv; 821 size_t s_databuffer_size; 822 void *s_databuffer; 823 int s_singlelockgotten; 824 int s_singlelockwanted; 825 kcondvar_t s_single_thread_cv; 826 md_hi_arr_t s_med; 827 } mddb_set_t; 828 829 #ifndef MDDB_FAKE 830 #ifdef _KERNEL 831 /* md_mddb.c */ 832 extern uint_t mddb_lb_did_convert(mddb_set_t *, 833 uint_t, uint_t *); 834 extern void mddb_locatorblock2splitname(mddb_ln_t *, 835 int, side_t, md_splitname *); 836 extern int mddb_configure(mddb_cfgcmd_t, 837 struct mddb_config *); 838 extern mddb_recid_t mddb_getnextrec(mddb_recid_t, 839 mddb_type_t, uint_t); 840 extern int mddb_getoptloc(mddb_optloc_t *); 841 extern void *mddb_getrecaddr(mddb_recid_t); 842 extern void *mddb_getrecaddr_resize(mddb_recid_t, size_t, 843 off_t); 844 extern int mddb_getrecprivate(mddb_recid_t); 845 extern void mddb_setrecprivate(mddb_recid_t, uint_t); 846 extern mddb_de_ic_t *mddb_getrecdep(mddb_recid_t); 847 extern mddb_type_t mddb_getrectype1(mddb_recid_t); 848 extern int mddb_getrectype2(mddb_recid_t); 849 extern int mddb_getrecsize(mddb_recid_t); 850 extern int mddb_commitrec(mddb_recid_t); 851 extern int mddb_commitrecs(mddb_recid_t *); 852 extern int mddb_deleterec(mddb_recid_t); 853 extern mddb_recstatus_t mddb_getrecstatus(mddb_recid_t); 854 extern mddb_recid_t mddb_createrec(size_t usersize, 855 mddb_type_t type, uint_t type2, 856 md_create_rec_option_t option, set_t setno); 857 extern void mddb_init(void); 858 extern void mddb_unload(void); 859 extern void mddb_unload_set(set_t setno); 860 extern mddb_recid_t mddb_makerecid(set_t setno, mddb_recid_t id); 861 extern set_t mddb_getsetnum(mddb_recid_t id); 862 extern char *mddb_getsetname(set_t setno); 863 
extern side_t mddb_getsidenum(set_t setno); 864 extern int mddb_ownset(set_t setno); 865 extern int getmed_ioctl(mddb_med_parm_t *medpp, int mode); 866 extern int setmed_ioctl(mddb_med_parm_t *medpp, int mode); 867 extern int updmed_ioctl(mddb_med_upd_parm_t *medpp, 868 int mode); 869 extern int take_set(mddb_config_t *cp, int mode); 870 extern int release_set(mddb_config_t *cp, int mode); 871 extern int gettag_ioctl(mddb_dtag_get_parm_t *dtgpp, 872 int mode); 873 extern int usetag_ioctl(mddb_dtag_use_parm_t *dtupp, 874 int mode); 875 extern int accept_ioctl(mddb_accept_parm_t *medpp, 876 int mode); 877 extern int md_update_locator_namespace(set_t setno, 878 side_t side, char *dname, char *pname, 879 md_dev64_t devt); 880 extern int mddb_validate_lb(set_t setno, int *rmaxsz); 881 extern int mddb_getinvlb_devid(set_t setno, int count, 882 int size, char **ctdptr); 883 extern int md_update_minor(set_t, side_t, mdkey_t); 884 extern int md_update_top_device_minor(set_t, side_t, 885 md_dev64_t); 886 #ifdef DEBUG 887 extern void mddb_check(void); 888 #endif /* DEBUG */ 889 #endif /* _KERNEL */ 890 891 #else 892 893 caddr_t mddb_fakeit; 894 895 #define md_lb_did_convert(a, b, c) (0) 896 #define mddb_configure(a, b) (0) 897 #define mddb_getnextrec(a, b, c) ((mddb_recid_t)0) 898 #define mddb_getrecaddr(a) (mddb_fakeit) 899 #define mddb_getrecprivate(a) (0) 900 #define mddb_setrecprivate(a, b) (0) 901 #define mddb_getrectype1(a) (0) 902 #define mddb_getrectype2(a) (0) 903 #define mddb_getrecsize(a) (0) 904 #define mddb_commitrec(a) (0) 905 #define mddb_commitrecs(a) (0) 906 #define mddb_deleterec(a) (0) 907 #define mddb_getrecstatus(a) (MDDB_OK) 908 #define mddb_createrec(s, a, b) (0xffff & (int)(mddb_fakeit = \ 909 (caddr_t)kmem_zalloc(s, KM_SLEEP))) 910 #define mddb_unload() (0) 911 912 #endif 913 914 #define MDDB_NOSLEEP 1 915 #define MDDB_SLEEPOK 0 916 917 #define MDDB_NOOLDOK 0x1 918 #define MDDB_MUSTEXIST 0x2 919 #define MDDB_NOINIT 0x4 920 #define MDDB_MULTINODE 0x8 
#define	MDDB_MN_STALE	0x10	/* MN set is stale */

/* Flags passed to selectreplicas - not a bit mask */
#define	MDDB_SCANALL		1
#define	MDDB_RETRYSCAN		0
#define	MDDB_SCANALLSYNC	2	/* During reconfig, sync up incore */
					/* and ondisk mddb by writing incore */
					/* values to disk. Don't write */
					/* change log records. */

/* Flags passed to writestart and writecopy */
#define	MDDB_WRITECOPY_ALL	1	/* Write all incore mddb to disk */
#define	MDDB_WRITECOPY_SYNC	2	/* Write incore mddb to disk except */
					/*	- change log records */
					/*	- optimized resync records */


#define	MDDB_PROBE	1
#define	MDDB_NOPROBE	0


/*
 * MN diskset definitions used to determine if a slave can write
 * directly to the mddb. ONLY_MASTER only allows the master node
 * to write to the mddb. ANY_NODE allows any node to write
 * to the mddb.
 */
#define	MDDB_WR_ONLY_MASTER	0
#define	MDDB_WR_ANY_NODE	1

#define	MDDB_L_LOCKED	0x0001	/* this record is locked */
#define	MDDB_L_WANTED	0x0002

#ifdef __cplusplus
}
#endif

#endif /* _SYS_MD_MDDB_H */