1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 The FreeBSD Foundation
5 *
6 * This software was developed by Mark Johnston under sponsorship from
7 * the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions are
11 * met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/param.h>
32 #include <sys/endian.h>
33
34 #include <assert.h>
35 #include <stddef.h>
36 #include <stdint.h>
37 #include <stdlib.h>
38 #include <string.h>
39
40 #include <util.h>
41
42 #include "makefs.h"
43 #include "zfs.h"
44
/*
 * A single key-value pair queued for insertion into a ZAP object.
 */
typedef struct zfs_zap_entry {
	/* Entry key; a private copy owned by this entry. */
	char		*name;
	/* Hash of the key. */
	uint64_t	hash;
	/*
	 * Entry value, an integer array.  The same pointer can be viewed at
	 * each of the supported element widths.
	 */
	union {
		uint8_t		*valp;
		uint16_t	*val16p;
		uint32_t	*val32p;
		uint64_t	*val64p;
	};
	/* Embedded value storage used for a common case. */
	uint64_t	val64;
	/* Size of one array element: 1, 2, 4 or 8. */
	size_t		intsz;
	/* Number of elements in the array. */
	size_t		intcnt;
	/* List linkage. */
	STAILQ_ENTRY(zfs_zap_entry) next;
} zfs_zap_entry_t;
59
60 struct zfs_zap {
61 STAILQ_HEAD(, zfs_zap_entry) kvps;
62 uint64_t hashsalt; /* key hash input */
63 unsigned long kvpcnt; /* number of key-value pairs */
64 unsigned long chunks; /* count of chunks needed for fat ZAP */
65 bool micro; /* can this be a micro ZAP? */
66
67 dnode_phys_t *dnode; /* backpointer */
68 zfs_objset_t *os; /* backpointer */
69 };
70
71 static uint16_t
zap_entry_chunks(zfs_zap_entry_t * ent)72 zap_entry_chunks(zfs_zap_entry_t *ent)
73 {
74 return (1 + howmany(strlen(ent->name) + 1, ZAP_LEAF_ARRAY_BYTES) +
75 howmany(ent->intsz * ent->intcnt, ZAP_LEAF_ARRAY_BYTES));
76 }
77
78 static uint64_t
zap_hash(uint64_t salt,const char * name)79 zap_hash(uint64_t salt, const char *name)
80 {
81 static uint64_t crc64_table[256];
82 const uint64_t crc64_poly = 0xC96C5795D7870F42UL;
83 const uint8_t *cp;
84 uint64_t crc;
85 uint8_t c;
86
87 assert(salt != 0);
88 if (crc64_table[128] == 0) {
89 for (int i = 0; i < 256; i++) {
90 uint64_t *t;
91
92 t = crc64_table + i;
93 *t = i;
94 for (int j = 8; j > 0; j--)
95 *t = (*t >> 1) ^ (-(*t & 1) & crc64_poly);
96 }
97 }
98 assert(crc64_table[128] == crc64_poly);
99
100 for (cp = (const uint8_t *)name, crc = salt; (c = *cp) != '\0'; cp++)
101 crc = (crc >> 8) ^ crc64_table[(crc ^ c) & 0xFF];
102
103 /*
104 * Only use 28 bits, since we need 4 bits in the cookie for the
105 * collision differentiator. We MUST use the high bits, since
106 * those are the ones that we first pay attention to when
107 * choosing the bucket.
108 */
109 crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1);
110
111 return (crc);
112 }
113
114 zfs_zap_t *
zap_alloc(zfs_objset_t * os,dnode_phys_t * dnode)115 zap_alloc(zfs_objset_t *os, dnode_phys_t *dnode)
116 {
117 zfs_zap_t *zap;
118
119 zap = ecalloc(1, sizeof(*zap));
120 STAILQ_INIT(&zap->kvps);
121 zap->hashsalt = ((uint64_t)random() << 32) | random();
122 zap->micro = true;
123 zap->kvpcnt = 0;
124 zap->chunks = 0;
125 zap->dnode = dnode;
126 zap->os = os;
127 return (zap);
128 }
129
130 void
zap_add(zfs_zap_t * zap,const char * name,size_t intsz,size_t intcnt,const uint8_t * val)131 zap_add(zfs_zap_t *zap, const char *name, size_t intsz, size_t intcnt,
132 const uint8_t *val)
133 {
134 zfs_zap_entry_t *ent;
135
136 assert(intsz == 1 || intsz == 2 || intsz == 4 || intsz == 8);
137 assert(strlen(name) + 1 <= ZAP_MAXNAMELEN);
138 assert(intcnt <= ZAP_MAXVALUELEN && intcnt * intsz <= ZAP_MAXVALUELEN);
139
140 ent = ecalloc(1, sizeof(*ent));
141 ent->name = estrdup(name);
142 ent->hash = zap_hash(zap->hashsalt, ent->name);
143 ent->intsz = intsz;
144 ent->intcnt = intcnt;
145 if (intsz == sizeof(uint64_t) && intcnt == 1) {
146 /*
147 * Micro-optimization to elide a memory allocation in that most
148 * common case where this is a directory entry.
149 */
150 ent->val64p = &ent->val64;
151 } else {
152 ent->valp = ecalloc(intcnt, intsz);
153 }
154 memcpy(ent->valp, val, intcnt * intsz);
155 zap->kvpcnt++;
156 zap->chunks += zap_entry_chunks(ent);
157 STAILQ_INSERT_TAIL(&zap->kvps, ent, next);
158
159 if (zap->micro && (intcnt != 1 || intsz != sizeof(uint64_t) ||
160 strlen(name) + 1 > MZAP_NAME_LEN || zap->kvpcnt > MZAP_ENT_MAX))
161 zap->micro = false;
162 }
163
164 void
zap_add_uint64(zfs_zap_t * zap,const char * name,uint64_t val)165 zap_add_uint64(zfs_zap_t *zap, const char *name, uint64_t val)
166 {
167 zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
168 }
169
170 void
zap_add_uint64_self(zfs_zap_t * zap,uint64_t val)171 zap_add_uint64_self(zfs_zap_t *zap, uint64_t val)
172 {
173 char name[32];
174
175 (void)snprintf(name, sizeof(name), "%jx", (uintmax_t)val);
176 zap_add(zap, name, sizeof(uint64_t), 1, (uint8_t *)&val);
177 }
178
179 void
zap_add_string(zfs_zap_t * zap,const char * name,const char * val)180 zap_add_string(zfs_zap_t *zap, const char *name, const char *val)
181 {
182 zap_add(zap, name, 1, strlen(val) + 1, (const uint8_t *)val);
183 }
184
185 bool
zap_entry_exists(zfs_zap_t * zap,const char * name)186 zap_entry_exists(zfs_zap_t *zap, const char *name)
187 {
188 zfs_zap_entry_t *ent;
189
190 STAILQ_FOREACH(ent, &zap->kvps, next) {
191 if (strcmp(ent->name, name) == 0)
192 return (true);
193 }
194 return (false);
195 }
196
197 static void
zap_micro_write(zfs_opt_t * zfs,zfs_zap_t * zap)198 zap_micro_write(zfs_opt_t *zfs, zfs_zap_t *zap)
199 {
200 dnode_phys_t *dnode;
201 zfs_zap_entry_t *ent;
202 mzap_phys_t *mzap;
203 mzap_ent_phys_t *ment;
204 off_t bytes, loc;
205 uint16_t cd;
206
207 _Static_assert(MZAP_ENT_MAX <= UINT16_MAX,
208 "micro ZAP collision differentiator must fit in 16 bits");
209
210 memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
211 mzap = (mzap_phys_t *)&zfs->filebuf[0];
212 mzap->mz_block_type = ZBT_MICRO;
213 mzap->mz_salt = zap->hashsalt;
214 mzap->mz_normflags = 0;
215
216 bytes = sizeof(*mzap) + (zap->kvpcnt - 1) * sizeof(*ment);
217 assert(bytes <= (off_t)MZAP_MAX_BLKSZ);
218
219 cd = 0;
220 ment = &mzap->mz_chunk[0];
221 STAILQ_FOREACH(ent, &zap->kvps, next) {
222 memcpy(&ment->mze_value, ent->valp, ent->intsz * ent->intcnt);
223 ment->mze_cd = cd++;
224 (void)strlcpy(ment->mze_name, ent->name,
225 sizeof(ment->mze_name));
226 ment++;
227 }
228
229 loc = objset_space_alloc(zfs, zap->os, &bytes);
230
231 dnode = zap->dnode;
232 dnode->dn_maxblkid = 0;
233 dnode->dn_datablkszsec = bytes >> MINBLOCKSHIFT;
234
235 vdev_pwrite_dnode_data(zfs, dnode, zfs->filebuf, bytes, loc);
236 }
237
238 /*
239 * Write some data to the fat ZAP leaf chunk starting at index "li".
240 *
241 * Note that individual integers in the value may be split among consecutive
242 * leaves.
243 */
244 static void
zap_fat_write_array_chunk(zap_leaf_t * l,uint16_t li,size_t sz,const uint8_t * val)245 zap_fat_write_array_chunk(zap_leaf_t *l, uint16_t li, size_t sz,
246 const uint8_t *val)
247 {
248 struct zap_leaf_array *la;
249
250 assert(sz <= ZAP_MAXVALUELEN);
251 assert(sz > 0);
252
253 for (uint16_t n, resid = sz; resid > 0; resid -= n, val += n, li++) {
254 n = MIN(resid, ZAP_LEAF_ARRAY_BYTES);
255
256 la = &ZAP_LEAF_CHUNK(l, li).l_array;
257 assert(la->la_type == ZAP_CHUNK_FREE);
258 la->la_type = ZAP_CHUNK_ARRAY;
259 memcpy(la->la_array, val, n);
260 la->la_next = li + 1;
261 }
262 la->la_next = 0xffff;
263 }
264
265 /*
266 * Find the shortest hash prefix length which lets us distribute keys without
267 * overflowing a leaf block. This is not (space) optimal, but is simple, and
268 * directories large enough to overflow a single 128KB leaf block are uncommon.
269 */
270 static unsigned int
zap_fat_write_prefixlen(zfs_zap_t * zap,zap_leaf_t * l)271 zap_fat_write_prefixlen(zfs_zap_t *zap, zap_leaf_t *l)
272 {
273 zfs_zap_entry_t *ent;
274 unsigned int prefixlen;
275
276 if (zap->chunks <= ZAP_LEAF_NUMCHUNKS(l)) {
277 /*
278 * All chunks will fit in a single leaf block.
279 */
280 return (0);
281 }
282
283 for (prefixlen = 1; prefixlen < (unsigned int)l->l_bs; prefixlen++) {
284 uint32_t *leafchunks;
285
286 leafchunks = ecalloc(1u << prefixlen, sizeof(*leafchunks));
287 STAILQ_FOREACH(ent, &zap->kvps, next) {
288 uint64_t li;
289 uint16_t chunks;
290
291 li = ZAP_HASH_IDX(ent->hash, prefixlen);
292
293 chunks = zap_entry_chunks(ent);
294 if (ZAP_LEAF_NUMCHUNKS(l) - leafchunks[li] < chunks) {
295 /*
296 * Not enough space, grow the prefix and retry.
297 */
298 break;
299 }
300 leafchunks[li] += chunks;
301 }
302 free(leafchunks);
303
304 if (ent == NULL) {
305 /*
306 * Everything fits, we're done.
307 */
308 break;
309 }
310 }
311
312 /*
313 * If this fails, then we need to expand the pointer table. For now
314 * this situation is unhandled since it is hard to trigger.
315 */
316 assert(prefixlen < (unsigned int)l->l_bs);
317
318 return (prefixlen);
319 }
320
321 /*
322 * Initialize a fat ZAP leaf block.
323 */
324 static void
zap_fat_write_leaf_init(zap_leaf_t * l,uint64_t prefix,int prefixlen)325 zap_fat_write_leaf_init(zap_leaf_t *l, uint64_t prefix, int prefixlen)
326 {
327 zap_leaf_phys_t *leaf;
328
329 leaf = l->l_phys;
330
331 leaf->l_hdr.lh_block_type = ZBT_LEAF;
332 leaf->l_hdr.lh_magic = ZAP_LEAF_MAGIC;
333 leaf->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l);
334 leaf->l_hdr.lh_prefix = prefix;
335 leaf->l_hdr.lh_prefix_len = prefixlen;
336
337 /* Initialize the leaf hash table. */
338 assert(leaf->l_hdr.lh_nfree < 0xffff);
339 memset(leaf->l_hash, 0xff,
340 ZAP_LEAF_HASH_NUMENTRIES(l) * sizeof(*leaf->l_hash));
341
342 /* Initialize the leaf chunks. */
343 for (uint16_t i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
344 struct zap_leaf_free *lf;
345
346 lf = &ZAP_LEAF_CHUNK(l, i).l_free;
347 lf->lf_type = ZAP_CHUNK_FREE;
348 if (i + 1 == ZAP_LEAF_NUMCHUNKS(l))
349 lf->lf_next = 0xffff;
350 else
351 lf->lf_next = i + 1;
352 }
353 }
354
355 static void
zap_fat_write(zfs_opt_t * zfs,zfs_zap_t * zap)356 zap_fat_write(zfs_opt_t *zfs, zfs_zap_t *zap)
357 {
358 struct dnode_cursor *c;
359 zap_leaf_t l;
360 zap_phys_t *zaphdr;
361 struct zap_table_phys *zt;
362 zfs_zap_entry_t *ent;
363 dnode_phys_t *dnode;
364 uint8_t *leafblks;
365 uint64_t lblkcnt, *ptrhasht;
366 off_t loc, blksz;
367 size_t blkshift;
368 unsigned int prefixlen;
369 int ptrcnt;
370
371 /*
372 * For simplicity, always use the largest block size. This should be ok
373 * since most directories will be micro ZAPs, but it's space inefficient
374 * for small ZAPs and might need to be revisited.
375 */
376 blkshift = MAXBLOCKSHIFT;
377 blksz = (off_t)1 << blkshift;
378
379 /*
380 * Embedded pointer tables give up to 8192 entries. This ought to be
381 * enough for anything except massive directories.
382 */
383 ptrcnt = (blksz / 2) / sizeof(uint64_t);
384
385 memset(zfs->filebuf, 0, sizeof(zfs->filebuf));
386 zaphdr = (zap_phys_t *)&zfs->filebuf[0];
387 zaphdr->zap_block_type = ZBT_HEADER;
388 zaphdr->zap_magic = ZAP_MAGIC;
389 zaphdr->zap_num_entries = zap->kvpcnt;
390 zaphdr->zap_salt = zap->hashsalt;
391
392 l.l_bs = blkshift;
393 l.l_phys = NULL;
394
395 zt = &zaphdr->zap_ptrtbl;
396 zt->zt_blk = 0;
397 zt->zt_numblks = 0;
398 zt->zt_shift = flsll(ptrcnt) - 1;
399 zt->zt_nextblk = 0;
400 zt->zt_blks_copied = 0;
401
402 /*
403 * How many leaf blocks do we need? Initialize them and update the
404 * header.
405 */
406 prefixlen = zap_fat_write_prefixlen(zap, &l);
407 lblkcnt = (uint64_t)1 << prefixlen;
408 leafblks = ecalloc(lblkcnt, blksz);
409 for (unsigned int li = 0; li < lblkcnt; li++) {
410 l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
411 zap_fat_write_leaf_init(&l, li, prefixlen);
412 }
413 zaphdr->zap_num_leafs = lblkcnt;
414 zaphdr->zap_freeblk = lblkcnt + 1;
415
416 /*
417 * For each entry, figure out which leaf block it belongs to based on
418 * the upper bits of its hash, allocate chunks from that leaf, and fill
419 * them out.
420 */
421 ptrhasht = (uint64_t *)(&zfs->filebuf[0] + blksz / 2);
422 STAILQ_FOREACH(ent, &zap->kvps, next) {
423 struct zap_leaf_entry *le;
424 uint16_t *lptr;
425 uint64_t hi, li;
426 uint16_t namelen, nchunks, nnamechunks, nvalchunks;
427
428 hi = ZAP_HASH_IDX(ent->hash, zt->zt_shift);
429 li = ZAP_HASH_IDX(ent->hash, prefixlen);
430 assert(ptrhasht[hi] == 0 || ptrhasht[hi] == li + 1);
431 ptrhasht[hi] = li + 1;
432 l.l_phys = (zap_leaf_phys_t *)(leafblks + li * blksz);
433
434 namelen = strlen(ent->name) + 1;
435
436 /*
437 * How many leaf chunks do we need for this entry?
438 */
439 nnamechunks = howmany(namelen, ZAP_LEAF_ARRAY_BYTES);
440 nvalchunks = howmany(ent->intcnt,
441 ZAP_LEAF_ARRAY_BYTES / ent->intsz);
442 nchunks = 1 + nnamechunks + nvalchunks;
443
444 /*
445 * Allocate a run of free leaf chunks for this entry,
446 * potentially extending a hash chain.
447 */
448 assert(l.l_phys->l_hdr.lh_nfree >= nchunks);
449 l.l_phys->l_hdr.lh_nfree -= nchunks;
450 l.l_phys->l_hdr.lh_nentries++;
451 lptr = ZAP_LEAF_HASH_ENTPTR(&l, ent->hash);
452 while (*lptr != 0xffff) {
453 assert(*lptr < ZAP_LEAF_NUMCHUNKS(&l));
454 le = ZAP_LEAF_ENTRY(&l, *lptr);
455 assert(le->le_type == ZAP_CHUNK_ENTRY);
456 le->le_cd++;
457 lptr = &le->le_next;
458 }
459 *lptr = l.l_phys->l_hdr.lh_freelist;
460 l.l_phys->l_hdr.lh_freelist += nchunks;
461 assert(l.l_phys->l_hdr.lh_freelist <=
462 ZAP_LEAF_NUMCHUNKS(&l));
463 if (l.l_phys->l_hdr.lh_freelist ==
464 ZAP_LEAF_NUMCHUNKS(&l))
465 l.l_phys->l_hdr.lh_freelist = 0xffff;
466
467 /*
468 * Integer values must be stored in big-endian format.
469 */
470 switch (ent->intsz) {
471 case 1:
472 break;
473 case 2:
474 for (uint16_t *v = ent->val16p;
475 v - ent->val16p < (ptrdiff_t)ent->intcnt;
476 v++)
477 *v = htobe16(*v);
478 break;
479 case 4:
480 for (uint32_t *v = ent->val32p;
481 v - ent->val32p < (ptrdiff_t)ent->intcnt;
482 v++)
483 *v = htobe32(*v);
484 break;
485 case 8:
486 for (uint64_t *v = ent->val64p;
487 v - ent->val64p < (ptrdiff_t)ent->intcnt;
488 v++)
489 *v = htobe64(*v);
490 break;
491 default:
492 assert(0);
493 }
494
495 /*
496 * Finally, write out the leaf chunks for this entry.
497 */
498 le = ZAP_LEAF_ENTRY(&l, *lptr);
499 assert(le->le_type == ZAP_CHUNK_FREE);
500 le->le_type = ZAP_CHUNK_ENTRY;
501 le->le_next = 0xffff;
502 le->le_name_chunk = *lptr + 1;
503 le->le_name_numints = namelen;
504 le->le_value_chunk = *lptr + 1 + nnamechunks;
505 le->le_value_intlen = ent->intsz;
506 le->le_value_numints = ent->intcnt;
507 le->le_hash = ent->hash;
508 zap_fat_write_array_chunk(&l, *lptr + 1, namelen,
509 (uint8_t *)ent->name);
510 zap_fat_write_array_chunk(&l, *lptr + 1 + nnamechunks,
511 ent->intcnt * ent->intsz, ent->valp);
512 }
513
514 /*
515 * Initialize unused slots of the pointer table.
516 */
517 for (int i = 0; i < ptrcnt; i++)
518 if (ptrhasht[i] == 0)
519 ptrhasht[i] = (i >> (zt->zt_shift - prefixlen)) + 1;
520
521 /*
522 * Write the whole thing to disk.
523 */
524 dnode = zap->dnode;
525 dnode->dn_datablkszsec = blksz >> MINBLOCKSHIFT;
526 dnode->dn_maxblkid = lblkcnt + 1;
527
528 c = dnode_cursor_init(zfs, zap->os, zap->dnode,
529 (lblkcnt + 1) * blksz, blksz);
530
531 loc = objset_space_alloc(zfs, zap->os, &blksz);
532 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, zfs->filebuf, blksz, loc,
533 dnode_cursor_next(zfs, c, 0));
534
535 for (uint64_t i = 0; i < lblkcnt; i++) {
536 loc = objset_space_alloc(zfs, zap->os, &blksz);
537 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, leafblks + i * blksz,
538 blksz, loc, dnode_cursor_next(zfs, c, (i + 1) * blksz));
539 }
540
541 dnode_cursor_finish(zfs, c);
542
543 free(leafblks);
544 }
545
546 void
zap_write(zfs_opt_t * zfs,zfs_zap_t * zap)547 zap_write(zfs_opt_t *zfs, zfs_zap_t *zap)
548 {
549 zfs_zap_entry_t *ent;
550
551 if (zap->micro) {
552 zap_micro_write(zfs, zap);
553 } else {
554 assert(!STAILQ_EMPTY(&zap->kvps));
555 assert(zap->kvpcnt > 0);
556 zap_fat_write(zfs, zap);
557 }
558
559 while ((ent = STAILQ_FIRST(&zap->kvps)) != NULL) {
560 STAILQ_REMOVE_HEAD(&zap->kvps, next);
561 if (ent->val64p != &ent->val64)
562 free(ent->valp);
563 free(ent->name);
564 free(ent);
565 }
566 free(zap);
567 }
568