1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/endian.h> 33 34 #include <assert.h> 35 #include <stddef.h> 36 #include <stdint.h> 37 #include <stdlib.h> 38 #include <string.h> 39 40 #include <util.h> 41 42 #include "makefs.h" 43 #include "zfs.h" 44 45 typedef struct zfs_zap_entry { 46 char *name; /* entry key, private copy */ 47 uint64_t hash; /* key hash */ 48 union { 49 uint8_t *valp; 50 uint16_t *val16p; 51 uint32_t *val32p; 52 uint64_t *val64p; 53 }; /* entry value, an integer array */ 54 uint64_t val64; /* embedded value for a common case */ 55 size_t intsz; /* array element size; 1, 2, 4 or 8 */ 56 size_t intcnt; /* array size */ 57 STAILQ_ENTRY(zfs_zap_entry) next; 58 } zfs_zap_entry_t; 59 60 struct zfs_zap { 61 STAILQ_HEAD(, zfs_zap_entry) kvps; 62 uint64_t hashsalt; /* key hash input */ 63 unsigned long kvpcnt; /* number of key-value pairs */ 64 unsigned long chunks; /* count of chunks needed for fat ZAP */ 65 bool micro; /* can this be a micro ZAP? */ 66 67 dnode_phys_t *dnode; /* backpointer */ 68 zfs_objset_t *os; /* backpointer */ 69 }; 70 71 static uint16_t 72 zap_entry_chunks(zfs_zap_entry_t *ent) 73 { 74 return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) + 75 howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES)); 76 } 77 78 static uint64_t 79 zap_hash(uint64_t salt, const char *name) 80 { 81 static uint64_t crc64_table[256]; 82 const uint64_t crc64_poly = 0xC96C5795D7870F42UL; 83 const uint8_t *cp; 84 uint64_t crc; 85 uint8_t c; 86 87 assert(salt != 0); 88 if (crc64_table[128] == 0) { 89 for (int i = 0; i < 256; i++) { 90 uint64_t *t; 91 92 t = crc64_table + i; 93 *t = i; 94 for (int j = 8; j > 0; j--) 95 *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly); 96 } 97 } 98 assert(crc64_table[128] == crc64_poly); 99 100 for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++) 101 crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF]; 102 103 /* 104 * Only use 28 bits, since we need 4 bits in the cookie for the 105 * collision differentiator. We MUST use the high bits, since 106 * those are the ones that we first pay attention to when 107 * choosing the bucket. 108 */ 109 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 110 111 return (crc); 112 } 113 114 zfs_zap_t * 115 zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode) 116 { 117 zfs_zap_t *zap; 118 119 zap = ecalloc(1, sizeof(*zap)); 120 STAILQ_INIT(&zap->kvps); 121 zap->hashsalt = ((uint64_t)random() << 32) | random(); 122 zap->micro = true; 123 zap->kvpcnt = 0; 124 zap->chunks = 0; 125 zap->dnode = dnode; 126 zap->os = os; 127 return (zap); 128 } 129 130 void 131 zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt, 132 const uint8_t *val) 133 { 134 zfs_zap_entry_t *ent; 135 136 assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8); 137 assert(strlen(name) + 1 <= ZAP_MAXNAMELEN); 138 assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN); 139 140 ent = ecalloc(1, sizeof(*ent)); 141 ent->name = estrdup(name); 142 ent->hash = zap_hash(zap->hashsalt, ent->name); 143 ent->intsz = intsz; 144 ent->intcnt = intcnt; 145 if (intsz == sizeof(uint64_t) && intcnt == 1) { 146 /* 147 * Micro-optimization to elide a memory allocation in that most 148 * common case where this is a directory entry. 149 */ 150 ent->val64p = &ent->val64; 151 } else { 152 ent->valp = ecalloc(intcnt, intsz); 153 } 154 memcpy(ent->valp, val, intcnt * intsz); 155 zap->kvpcnt++; 156 zap->chunks += zap_entry_chunks(ent); 157 STAILQ_INSERT_TAIL(&zap->kvps, ent, next); 158 159 if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) || 160 strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX)) 161 zap->micro = false; 162 } 163 164 void 165 zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val) 166 { 167 zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val); 168 } 169 170 void 171 zap_add_uint64_self(zfs_zap_t *zap, uint64_t val) 172 { 173 char name[32]; 174 175 snprintf(name, sizeof(name), "%jx", (uintmax_t)val); 176 zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val); 177 } 178 179 void 180 zap_add_string(zfs_zap_t *zap, const char *name, const char *val) 181 { 182 zap_add(zap, name, 1, strlen(val) + 1, val); 183 } 184 185 bool 186 zap_entry_exists(zfs_zap_t *zap, const char *name) 187 { 188 zfs_zap_entry_t *ent; 189 190 STAILQ_FOREACH(ent, &zap->kvps, next) { 191 if (strcmp(ent->name, name) == 0) 192 return (true); 193 } 194 return (false); 195 } 196 197 static void 198 zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap) 199 { 200 dnode_phys_t *dnode; 201 zfs_zap_entry_t *ent; 202 mzap_phys_t *mzap; 203 mzap_ent_phys_t *ment; 204 off_t bytes, loc; 205 uint16_t cd; 206 207 _Static_assert(MZAP_ENT_MAX <= UINT16_MAX, 208 "micro ZAP collision differentiator must fit in 16 bits"); 209 210 memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); 211 mzap = (mzap_phys_t *)&zfs->filebuf[0]; 212 mzap->mz_block_type = ZBT_MICRO; 213 mzap->mz_salt = zap->hashsalt; 214 mzap->mz_normflags = 0; 215 216 bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment); 217 assert(bytes <= (off_t)MZAP_MAX_BLKSZ); 218 219 cd = 0; 220 ment = &mzap->mz_chunk[0]; 221 STAILQ_FOREACH(ent, &zap->kvps, next) { 222 memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt); 223 ment->mze_cd = cd++; 224 strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name)); 225 ment++; 226 } 227 228 loc = objset_space_alloc(zfs, zap->os, &bytes); 229 230 dnode = zap->dnode; 231 dnode->dn_maxblkid = 0; 232 dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT; 233 234 vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc); 235 } 236 237 /* 238 * Write some data to the fat ZAP leaf chunk starting at index "li". 239 * 240 * Note that individual integers in the value may be split among consecutive 241 * leaves. 242 */ 243 static void 244 zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz, 245 const uint8_t *val) 246 { 247 struct zap_leaf_array *la; 248 249 assert(sz <= ZAP_MAXVALUELEN); 250 251 for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) { 252 n = MIN(resid, ZAP_LEAF_ARRAY_BYTES); 253 254 la = &ZAP_LEAF_CHUNK(l, li).l_array; 255 assert(la->la_type == ZAP_CHUNK_FREE); 256 la->la_type = ZAP_CHUNK_ARRAY; 257 memcpy(la->la_array, val, n); 258 la->la_next = li + 1; 259 } 260 la->la_next = 0xffff; 261 } 262 263 /* 264 * Find the shortest hash prefix length which lets us distribute keys without 265 * overflowing a leaf block. This is not (space) optimal, but is simple, and 266 * directories large enough to overflow a single 128KB leaf block are uncommon. 267 */ 268 static unsigned int 269 zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l) 270 { 271 zfs_zap_entry_t *ent; 272 unsigned int prefixlen; 273 274 if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) { 275 /* 276 * All chunks will fit in a single leaf block. 277 */ 278 return (0); 279 } 280 281 for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) { 282 uint32_t *leafchunks; 283 284 leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks)); 285 STAILQ_FOREACH(ent, &zap->kvps, next) { 286 uint64_t li; 287 uint16_t chunks; 288 289 li = ZAP_HASH_IDX(ent->hash, prefixlen); 290 291 chunks = zap_entry_chunks(ent); 292 if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) { 293 /* 294 * Not enough space, grow the prefix and retry. 295 */ 296 break; 297 } 298 leafchunks[li] += chunks; 299 } 300 free(leafchunks); 301 302 if (ent == NULL) { 303 /* 304 * Everything fits, we're done. 305 */ 306 break; 307 } 308 } 309 310 /* 311 * If this fails, then we need to expand the pointer table. For now 312 * this situation is unhandled since it is hard to trigger. 313 */ 314 assert(prefixlen < (unsigned int)l->l_bs); 315 316 return (prefixlen); 317 } 318 319 /* 320 * Initialize a fat ZAP leaf block. 321 */ 322 static void 323 zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen) 324 { 325 zap_leaf_phys_t *leaf; 326 327 leaf = l->l_phys; 328 329 leaf->l_hdr.lh_block_type = ZBT_LEAF; 330 leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC; 331 leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); 332 leaf->l_hdr.lh_prefix = prefix; 333 leaf->l_hdr.lh_prefix_len = prefixlen; 334 335 /* Initialize the leaf hash table. */ 336 assert(leaf->l_hdr.lh_nfree < 0xffff); 337 memset(leaf->l_hash, 0xff, 338 ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash)); 339 340 /* Initialize the leaf chunks. */ 341 for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { 342 struct zap_leaf_free *lf; 343 344 lf = &ZAP_LEAF_CHUNK(l, i).l_free; 345 lf->lf_type = ZAP_CHUNK_FREE; 346 if (i + 1 == ZAP_LEAF_NUMCHUNKS(l)) 347 lf->lf_next = 0xffff; 348 else 349 lf->lf_next = i + 1; 350 } 351 } 352 353 static void 354 zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap) 355 { 356 struct dnode_cursor *c; 357 zap_leaf_t l; 358 zap_phys_t *zaphdr; 359 struct zap_table_phys *zt; 360 zfs_zap_entry_t *ent; 361 dnode_phys_t *dnode; 362 uint8_t *leafblks; 363 uint64_t lblkcnt, *ptrhasht; 364 off_t loc, blksz; 365 size_t blkshift; 366 unsigned int prefixlen; 367 int ptrcnt; 368 369 /* 370 * For simplicity, always use the largest block size. This should be ok 371 * since most directories will be micro ZAPs, but it's space inefficient 372 * for small ZAPs and might need to be revisited. 373 */ 374 blkshift = MAXBLOCKSHIFT; 375 blksz = (off_t)1 << blkshift; 376 377 /* 378 * Embedded pointer tables give up to 8192 entries. This ought to be 379 * enough for anything except massive directories. 380 */ 381 ptrcnt = (blksz / 2) / sizeof(uint64_t); 382 383 memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); 384 zaphdr = (zap_phys_t *)&zfs->filebuf[0]; 385 zaphdr->zap_block_type = ZBT_HEADER; 386 zaphdr->zap_magic = ZAP_MAGIC; 387 zaphdr->zap_num_entries = zap->kvpcnt; 388 zaphdr->zap_salt = zap->hashsalt; 389 390 l.l_bs = blkshift; 391 l.l_phys = NULL; 392 393 zt = &zaphdr->zap_ptrtbl; 394 zt->zt_blk = 0; 395 zt->zt_numblks = 0; 396 zt->zt_shift = flsll(ptrcnt) - 1; 397 zt->zt_nextblk = 0; 398 zt->zt_blks_copied = 0; 399 400 /* 401 * How many leaf blocks do we need? Initialize them and update the 402 * header. 403 */ 404 prefixlen = zap_fat_write_prefixlen(zap, &l); 405 lblkcnt = (uint64_t)1 << prefixlen; 406 leafblks = ecalloc(lblkcnt, blksz); 407 for (unsigned int li = 0; li < lblkcnt; li++) { 408 l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); 409 zap_fat_write_leaf_init(&l, li, prefixlen); 410 } 411 zaphdr->zap_num_leafs = lblkcnt; 412 zaphdr->zap_freeblk = lblkcnt + 1; 413 414 /* 415 * For each entry, figure out which leaf block it belongs to based on 416 * the upper bits of its hash, allocate chunks from that leaf, and fill 417 * them out. 418 */ 419 ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2); 420 STAILQ_FOREACH(ent, &zap->kvps, next) { 421 struct zap_leaf_entry *le; 422 uint16_t *lptr; 423 uint64_t hi, li; 424 uint16_t namelen, nchunks, nnamechunks, nvalchunks; 425 426 hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift); 427 li = ZAP_HASH_IDX(ent->hash, prefixlen); 428 assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1); 429 ptrhasht[hi] = li + 1; 430 l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); 431 432 namelen = strlen(ent->name) + 1; 433 434 /* 435 * How many leaf chunks do we need for this entry? 436 */ 437 nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES); 438 nvalchunks = howmany(ent->intcnt, 439 ZAP_LEAF_ARRAY_BYTES / ent->intsz); 440 nchunks = 1 + nnamechunks + nvalchunks; 441 442 /* 443 * Allocate a run of free leaf chunks for this entry, 444 * potentially extending a hash chain. 445 */ 446 assert(l.l_phys->l_hdr.lh_nfree >= nchunks); 447 l.l_phys->l_hdr.lh_nfree -= nchunks; 448 l.l_phys->l_hdr.lh_nentries++; 449 lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash); 450 while (*lptr != 0xffff) { 451 assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l)); 452 le = ZAP_LEAF_ENTRY(&l, *lptr); 453 assert(le->le_type == ZAP_CHUNK_ENTRY); 454 le->le_cd++; 455 lptr = &le->le_next; 456 } 457 *lptr = l.l_phys->l_hdr.lh_freelist; 458 l.l_phys->l_hdr.lh_freelist += nchunks; 459 assert(l.l_phys->l_hdr.lh_freelist <= 460 ZAP_LEAF_NUMCHUNKS(&l)); 461 if (l.l_phys->l_hdr.lh_freelist == 462 ZAP_LEAF_NUMCHUNKS(&l)) 463 l.l_phys->l_hdr.lh_freelist = 0xffff; 464 465 /* 466 * Integer values must be stored in big-endian format. 467 */ 468 switch (ent->intsz) { 469 case 1: 470 break; 471 case 2: 472 for (uint16_t *v = ent->val16p; 473 v - ent->val16p < (ptrdiff_t)ent->intcnt; 474 v++) 475 *v = htobe16(*v); 476 break; 477 case 4: 478 for (uint32_t *v = ent->val32p; 479 v - ent->val32p < (ptrdiff_t)ent->intcnt; 480 v++) 481 *v = htobe32(*v); 482 break; 483 case 8: 484 for (uint64_t *v = ent->val64p; 485 v - ent->val64p < (ptrdiff_t)ent->intcnt; 486 v++) 487 *v = htobe64(*v); 488 break; 489 default: 490 assert(0); 491 } 492 493 /* 494 * Finally, write out the leaf chunks for this entry. 495 */ 496 le = ZAP_LEAF_ENTRY(&l, *lptr); 497 assert(le->le_type == ZAP_CHUNK_FREE); 498 le->le_type = ZAP_CHUNK_ENTRY; 499 le->le_next = 0xffff; 500 le->le_name_chunk = *lptr + 1; 501 le->le_name_numints = namelen; 502 le->le_value_chunk = *lptr + 1 + nnamechunks; 503 le->le_value_intlen = ent->intsz; 504 le->le_value_numints = ent->intcnt; 505 le->le_hash = ent->hash; 506 zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name); 507 zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks, 508 ent->intcnt * ent->intsz, ent->valp); 509 } 510 511 /* 512 * Initialize unused slots of the pointer table. 513 */ 514 for (int i = 0; i < ptrcnt; i++) 515 if (ptrhasht[i] == 0) 516 ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1; 517 518 /* 519 * Write the whole thing to disk. 520 */ 521 dnode = zap->dnode; 522 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; 523 dnode->dn_maxblkid = lblkcnt + 1; 524 525 c = dnode_cursor_init(zfs, zap->os, zap->dnode, 526 (lblkcnt + 1) * blksz, blksz); 527 528 loc = objset_space_alloc(zfs, zap->os, &blksz); 529 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc, 530 dnode_cursor_next(zfs, c, 0)); 531 532 for (uint64_t i = 0; i < lblkcnt; i++) { 533 loc = objset_space_alloc(zfs, zap->os, &blksz); 534 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz, 535 blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz)); 536 } 537 538 dnode_cursor_finish(zfs, c); 539 540 free(leafblks); 541 } 542 543 void 544 zap_write(zfs_opt_t *zfs, zfs_zap_t *zap) 545 { 546 zfs_zap_entry_t *ent; 547 548 if (zap->micro) { 549 zap_micro_write(zfs, zap); 550 } else { 551 assert(!STAILQ_EMPTY(&zap->kvps)); 552 assert(zap->kvpcnt > 0); 553 zap_fat_write(zfs, zap); 554 } 555 556 while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) { 557 STAILQ_REMOVE_HEAD(&zap->kvps, next); 558 if (ent->val64p != &ent->val64) 559 free(ent->valp); 560 free(ent->name); 561 free(ent); 562 } 563 free(zap); 564 } 565