1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/endian.h> 33 34 #include <assert.h> 35 #include <stddef.h> 36 #include <stdlib.h> 37 #include <string.h> 38 39 #include <util.h> 40 41 #include "makefs.h" 42 #include "zfs.h" 43 44 typedef struct zfs_zap_entry { 45 char *name; /* entry key, private copy */ 46 uint64_t hash; /* key hash */ 47 union { 48 uint8_t *valp; 49 uint16_t *val16p; 50 uint32_t *val32p; 51 uint64_t *val64p; 52 }; /* entry value, an integer array */ 53 uint64_t val64; /* embedded value for a common case */ 54 size_t intsz; /* array element size; 1, 2, 4 or 8 */ 55 size_t intcnt; /* array size */ 56 STAILQ_ENTRY(zfs_zap_entry) next; 57 } zfs_zap_entry_t; 58 59 struct zfs_zap { 60 STAILQ_HEAD(, zfs_zap_entry) kvps; 61 uint64_t hashsalt; /* key hash input */ 62 unsigned long kvpcnt; /* number of key-value pairs */ 63 unsigned long chunks; /* count of chunks needed for fat ZAP */ 64 bool micro; /* can this be a micro ZAP? */ 65 66 dnode_phys_t *dnode; /* backpointer */ 67 zfs_objset_t *os; /* backpointer */ 68 }; 69 70 static uint16_t 71 zap_entry_chunks(zfs_zap_entry_t *ent) 72 { 73 return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) + 74 howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES)); 75 } 76 77 static uint64_t 78 zap_hash(uint64_t salt, const char *name) 79 { 80 static uint64_t crc64_table[256]; 81 const uint64_t crc64_poly = 0xC96C5795D7870F42UL; 82 const uint8_t *cp; 83 uint64_t crc; 84 uint8_t c; 85 86 assert(salt != 0); 87 if (crc64_table[128] == 0) { 88 for (int i = 0; i < 256; i++) { 89 uint64_t *t; 90 91 t = crc64_table + i; 92 *t = i; 93 for (int j = 8; j > 0; j--) 94 *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly); 95 } 96 } 97 assert(crc64_table[128] == crc64_poly); 98 99 for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++) 100 crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF]; 101 102 /* 103 * Only use 28 bits, since we need 4 bits in the cookie for the 104 * collision differentiator. We MUST use the high bits, since 105 * those are the ones that we first pay attention to when 106 * choosing the bucket. 107 */ 108 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); 109 110 return (crc); 111 } 112 113 zfs_zap_t * 114 zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode) 115 { 116 zfs_zap_t *zap; 117 118 zap = ecalloc(1, sizeof(*zap)); 119 STAILQ_INIT(&zap->kvps); 120 zap->hashsalt = ((uint64_t)random() << 32) | random(); 121 zap->micro = true; 122 zap->kvpcnt = 0; 123 zap->chunks = 0; 124 zap->dnode = dnode; 125 zap->os = os; 126 return (zap); 127 } 128 129 void 130 zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt, 131 const uint8_t *val) 132 { 133 zfs_zap_entry_t *ent; 134 135 assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8); 136 assert(strlen(name) + 1 <= ZAP_MAXNAMELEN); 137 assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN); 138 139 ent = ecalloc(1, sizeof(*ent)); 140 ent->name = estrdup(name); 141 ent->hash = zap_hash(zap->hashsalt, ent->name); 142 ent->intsz = intsz; 143 ent->intcnt = intcnt; 144 if (intsz == sizeof(uint64_t) && intcnt == 1) { 145 /* 146 * Micro-optimization to elide a memory allocation in that most 147 * common case where this is a directory entry. 148 */ 149 ent->val64p = &ent->val64; 150 } else { 151 ent->valp = ecalloc(intcnt, intsz); 152 } 153 memcpy(ent->valp, val, intcnt * intsz); 154 zap->kvpcnt++; 155 zap->chunks += zap_entry_chunks(ent); 156 STAILQ_INSERT_TAIL(&zap->kvps, ent, next); 157 158 if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) || 159 strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX)) 160 zap->micro = false; 161 } 162 163 void 164 zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val) 165 { 166 zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val); 167 } 168 169 void 170 zap_add_string(zfs_zap_t *zap, const char *name, const char *val) 171 { 172 zap_add(zap, name, 1, strlen(val) + 1, val); 173 } 174 175 bool 176 zap_entry_exists(zfs_zap_t *zap, const char *name) 177 { 178 zfs_zap_entry_t *ent; 179 180 STAILQ_FOREACH(ent, &zap->kvps, next) { 181 if (strcmp(ent->name, name) == 0) 182 return (true); 183 } 184 return (false); 185 } 186 187 static void 188 zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap) 189 { 190 dnode_phys_t *dnode; 191 zfs_zap_entry_t *ent; 192 mzap_phys_t *mzap; 193 mzap_ent_phys_t *ment; 194 off_t bytes, loc; 195 196 memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); 197 mzap = (mzap_phys_t *)&zfs->filebuf[0]; 198 mzap->mz_block_type = ZBT_MICRO; 199 mzap->mz_salt = zap->hashsalt; 200 mzap->mz_normflags = 0; 201 202 bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment); 203 assert(bytes <= (off_t)MZAP_MAX_BLKSZ); 204 205 ment = &mzap->mz_chunk[0]; 206 STAILQ_FOREACH(ent, &zap->kvps, next) { 207 memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt); 208 ment->mze_cd = 0; /* XXX-MJ */ 209 strlcpy(ment->mze_name, ent->name, sizeof(ment->mze_name)); 210 ment++; 211 } 212 213 loc = objset_space_alloc(zfs, zap->os, &bytes); 214 215 dnode = zap->dnode; 216 dnode->dn_maxblkid = 0; 217 dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT; 218 dnode->dn_flags = DNODE_FLAG_USED_BYTES; 219 220 vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc); 221 } 222 223 /* 224 * Write some data to the fat ZAP leaf chunk starting at index "li". 225 * 226 * Note that individual integers in the value may be split among consecutive 227 * leaves. 228 */ 229 static void 230 zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz, 231 const uint8_t *val) 232 { 233 struct zap_leaf_array *la; 234 235 assert(sz <= ZAP_MAXVALUELEN); 236 237 for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) { 238 n = MIN(resid, ZAP_LEAF_ARRAY_BYTES); 239 240 la = &ZAP_LEAF_CHUNK(l, li).l_array; 241 assert(la->la_type == ZAP_CHUNK_FREE); 242 la->la_type = ZAP_CHUNK_ARRAY; 243 memcpy(la->la_array, val, n); 244 la->la_next = li + 1; 245 } 246 la->la_next = 0xffff; 247 } 248 249 /* 250 * Find the shortest hash prefix length which lets us distribute keys without 251 * overflowing a leaf block. This is not (space) optimal, but is simple, and 252 * directories large enough to overflow a single 128KB leaf block are uncommon. 253 */ 254 static unsigned int 255 zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l) 256 { 257 zfs_zap_entry_t *ent; 258 unsigned int prefixlen; 259 260 if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) { 261 /* 262 * All chunks will fit in a single leaf block. 263 */ 264 return (0); 265 } 266 267 for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) { 268 uint32_t *leafchunks; 269 270 leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks)); 271 STAILQ_FOREACH(ent, &zap->kvps, next) { 272 uint64_t li; 273 uint16_t chunks; 274 275 li = ZAP_HASH_IDX(ent->hash, prefixlen); 276 277 chunks = zap_entry_chunks(ent); 278 if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) { 279 /* 280 * Not enough space, grow the prefix and retry. 281 */ 282 break; 283 } 284 leafchunks[li] += chunks; 285 } 286 free(leafchunks); 287 288 if (ent == NULL) { 289 /* 290 * Everything fits, we're done. 291 */ 292 break; 293 } 294 } 295 296 /* 297 * If this fails, then we need to expand the pointer table. For now 298 * this situation is unhandled since it is hard to trigger. 299 */ 300 assert(prefixlen < (unsigned int)l->l_bs); 301 302 return (prefixlen); 303 } 304 305 /* 306 * Initialize a fat ZAP leaf block. 307 */ 308 static void 309 zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen) 310 { 311 zap_leaf_phys_t *leaf; 312 313 leaf = l->l_phys; 314 315 leaf->l_hdr.lh_block_type = ZBT_LEAF; 316 leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC; 317 leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); 318 leaf->l_hdr.lh_prefix = prefix; 319 leaf->l_hdr.lh_prefix_len = prefixlen; 320 321 /* Initialize the leaf hash table. */ 322 assert(leaf->l_hdr.lh_nfree < 0xffff); 323 memset(leaf->l_hash, 0xff, 324 ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash)); 325 326 /* Initialize the leaf chunks. */ 327 for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) { 328 struct zap_leaf_free *lf; 329 330 lf = &ZAP_LEAF_CHUNK(l, i).l_free; 331 lf->lf_type = ZAP_CHUNK_FREE; 332 if (i + 1 == ZAP_LEAF_NUMCHUNKS(l)) 333 lf->lf_next = 0xffff; 334 else 335 lf->lf_next = i + 1; 336 } 337 } 338 339 static void 340 zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap) 341 { 342 struct dnode_cursor *c; 343 zap_leaf_t l; 344 zap_phys_t *zaphdr; 345 struct zap_table_phys *zt; 346 zfs_zap_entry_t *ent; 347 dnode_phys_t *dnode; 348 uint8_t *leafblks; 349 uint64_t lblkcnt, *ptrhasht; 350 off_t loc, blksz; 351 size_t blkshift; 352 unsigned int prefixlen; 353 int ptrcnt; 354 355 /* 356 * For simplicity, always use the largest block size. This should be ok 357 * since most directories will be micro ZAPs, but it's space inefficient 358 * for small ZAPs and might need to be revisited. 359 */ 360 blkshift = MAXBLOCKSHIFT; 361 blksz = (off_t)1 << blkshift; 362 363 /* 364 * Embedded pointer tables give up to 8192 entries. This ought to be 365 * enough for anything except massive directories. 366 */ 367 ptrcnt = (blksz / 2) / sizeof(uint64_t); 368 369 memset(zfs->filebuf, 0, sizeof(zfs->filebuf)); 370 zaphdr = (zap_phys_t *)&zfs->filebuf[0]; 371 zaphdr->zap_block_type = ZBT_HEADER; 372 zaphdr->zap_magic = ZAP_MAGIC; 373 zaphdr->zap_num_entries = zap->kvpcnt; 374 zaphdr->zap_salt = zap->hashsalt; 375 376 l.l_bs = blkshift; 377 l.l_phys = NULL; 378 379 zt = &zaphdr->zap_ptrtbl; 380 zt->zt_blk = 0; 381 zt->zt_numblks = 0; 382 zt->zt_shift = flsll(ptrcnt) - 1; 383 zt->zt_nextblk = 0; 384 zt->zt_blks_copied = 0; 385 386 /* 387 * How many leaf blocks do we need? Initialize them and update the 388 * header. 389 */ 390 prefixlen = zap_fat_write_prefixlen(zap, &l); 391 lblkcnt = 1 << prefixlen; 392 leafblks = ecalloc(lblkcnt, blksz); 393 for (unsigned int li = 0; li < lblkcnt; li++) { 394 l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); 395 zap_fat_write_leaf_init(&l, li, prefixlen); 396 } 397 zaphdr->zap_num_leafs = lblkcnt; 398 zaphdr->zap_freeblk = lblkcnt + 1; 399 400 /* 401 * For each entry, figure out which leaf block it belongs to based on 402 * the upper bits of its hash, allocate chunks from that leaf, and fill 403 * them out. 404 */ 405 ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2); 406 STAILQ_FOREACH(ent, &zap->kvps, next) { 407 struct zap_leaf_entry *le; 408 uint16_t *lptr; 409 uint64_t hi, li; 410 uint16_t namelen, nchunks, nnamechunks, nvalchunks; 411 412 hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift); 413 li = ZAP_HASH_IDX(ent->hash, prefixlen); 414 assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1); 415 ptrhasht[hi] = li + 1; 416 l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz); 417 418 namelen = strlen(ent->name) + 1; 419 420 /* 421 * How many leaf chunks do we need for this entry? 422 */ 423 nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES); 424 nvalchunks = howmany(ent->intcnt, 425 ZAP_LEAF_ARRAY_BYTES / ent->intsz); 426 nchunks = 1 + nnamechunks + nvalchunks; 427 428 /* 429 * Allocate a run of free leaf chunks for this entry, 430 * potentially extending a hash chain. 431 */ 432 assert(l.l_phys->l_hdr.lh_nfree >= nchunks); 433 l.l_phys->l_hdr.lh_nfree -= nchunks; 434 l.l_phys->l_hdr.lh_nentries++; 435 lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash); 436 while (*lptr != 0xffff) { 437 assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l)); 438 le = ZAP_LEAF_ENTRY(&l, *lptr); 439 assert(le->le_type == ZAP_CHUNK_ENTRY); 440 le->le_cd++; 441 lptr = &le->le_next; 442 } 443 *lptr = l.l_phys->l_hdr.lh_freelist; 444 l.l_phys->l_hdr.lh_freelist += nchunks; 445 assert(l.l_phys->l_hdr.lh_freelist <= 446 ZAP_LEAF_NUMCHUNKS(&l)); 447 if (l.l_phys->l_hdr.lh_freelist == 448 ZAP_LEAF_NUMCHUNKS(&l)) 449 l.l_phys->l_hdr.lh_freelist = 0xffff; 450 451 /* 452 * Integer values must be stored in big-endian format. 453 */ 454 switch (ent->intsz) { 455 case 1: 456 break; 457 case 2: 458 for (uint16_t *v = ent->val16p; 459 v - ent->val16p < (ptrdiff_t)ent->intcnt; 460 v++) 461 *v = htobe16(*v); 462 break; 463 case 4: 464 for (uint32_t *v = ent->val32p; 465 v - ent->val32p < (ptrdiff_t)ent->intcnt; 466 v++) 467 *v = htobe32(*v); 468 break; 469 case 8: 470 for (uint64_t *v = ent->val64p; 471 v - ent->val64p < (ptrdiff_t)ent->intcnt; 472 v++) 473 *v = htobe64(*v); 474 break; 475 default: 476 assert(0); 477 } 478 479 /* 480 * Finally, write out the leaf chunks for this entry. 481 */ 482 le = ZAP_LEAF_ENTRY(&l, *lptr); 483 assert(le->le_type == ZAP_CHUNK_FREE); 484 le->le_type = ZAP_CHUNK_ENTRY; 485 le->le_next = 0xffff; 486 le->le_name_chunk = *lptr + 1; 487 le->le_name_numints = namelen; 488 le->le_value_chunk = *lptr + 1 + nnamechunks; 489 le->le_value_intlen = ent->intsz; 490 le->le_value_numints = ent->intcnt; 491 le->le_hash = ent->hash; 492 zap_fat_write_array_chunk(&l, *lptr + 1, namelen, ent->name); 493 zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks, 494 ent->intcnt * ent->intsz, ent->valp); 495 } 496 497 /* 498 * Initialize unused slots of the pointer table. 499 */ 500 for (int i = 0; i < ptrcnt; i++) 501 if (ptrhasht[i] == 0) 502 ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1; 503 504 /* 505 * Write the whole thing to disk. 506 */ 507 dnode = zap->dnode; 508 dnode->dn_nblkptr = 1; 509 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT; 510 dnode->dn_maxblkid = lblkcnt + 1; 511 dnode->dn_flags = DNODE_FLAG_USED_BYTES; 512 513 c = dnode_cursor_init(zfs, zap->os, zap->dnode, 514 (lblkcnt + 1) * blksz, blksz); 515 516 loc = objset_space_alloc(zfs, zap->os, &blksz); 517 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc, 518 dnode_cursor_next(zfs, c, 0)); 519 520 for (uint64_t i = 0; i < lblkcnt; i++) { 521 loc = objset_space_alloc(zfs, zap->os, &blksz); 522 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz, 523 blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz)); 524 } 525 526 dnode_cursor_finish(zfs, c); 527 528 free(leafblks); 529 } 530 531 void 532 zap_write(zfs_opt_t *zfs, zfs_zap_t *zap) 533 { 534 zfs_zap_entry_t *ent; 535 536 if (zap->micro) { 537 zap_micro_write(zfs, zap); 538 } else { 539 assert(!STAILQ_EMPTY(&zap->kvps)); 540 assert(zap->kvpcnt > 0); 541 zap_fat_write(zfs, zap); 542 } 543 544 while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) { 545 STAILQ_REMOVE_HEAD(&zap->kvps, next); 546 if (ent->val64p != &ent->val64) 547 free(ent->valp); 548 free(ent->name); 549 free(ent); 550 } 551 free(zap); 552 } 553