1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause
3 *
4 * Copyright (c) 2022 The FreeBSD Foundation
5 *
6 * This software was developed by Mark Johnston under sponsorship from
7 * the FreeBSD Foundation.
8 *
9 * Redistribution and use in source and binary forms, with or without
10 * modification, are permitted provided that the following conditions are
11 * met:
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the distribution.
17 *
18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28 * SUCH DAMAGE.
29 */
30
31 #include <sys/stat.h>
32
33 #include <assert.h>
34 #include <dirent.h>
35 #include <fcntl.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <unistd.h>
39
40 #include <util.h>
41
42 #include "makefs.h"
43 #include "zfs.h"
44
45 typedef struct {
46 const char *name;
47 unsigned int id;
48 uint16_t size;
49 sa_bswap_type_t bs;
50 } zfs_sattr_t;
51
52 typedef struct zfs_fs {
53 zfs_objset_t *os;
54
55 /* Offset table for system attributes, indexed by a zpl_attr_t. */
56 uint16_t *saoffs;
57 size_t sacnt;
58 const zfs_sattr_t *satab;
59 } zfs_fs_t;
60
/*
 * ZPL system attribute numbers.  The ordering carries no meaning of its own;
 * it is simply the one hard-coded by OpenZFS, taken from a zdb dump of the
 * SA_REGISTRY table.  Each value doubles as an index into zpl_attrs[].
 */
typedef enum zpl_attr {
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_GEN,
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_PARENT,
	ZPL_LINKS,
	ZPL_XATTR,
	ZPL_RDEV,
	ZPL_FLAGS,
	ZPL_UID,
	ZPL_GID,
	ZPL_PAD,
	ZPL_ZNODE_ACL,
	ZPL_DACL_COUNT,
	ZPL_SYMLINK,
	ZPL_SCANSTAMP,
	ZPL_DACL_ACES,
	ZPL_DXATTR,
	ZPL_PROJID,
} zpl_attr_t;
89
90 /*
91 * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
92 */
93 static const zfs_sattr_t zpl_attrs[] = {
94 #define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b }
95 _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
96 _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
97 _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
98 _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
99 _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
100 _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
101 _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
102 _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
103 _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
104 _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
105 _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
106 _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
107 _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
108 _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
109 _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
110 _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
111 _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
112 _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
113 _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
114 _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
115 _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
116 _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
117 #undef ZPL_ATTR
118 };
119
120 /*
121 * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
122 * It need not match in general, but FreeBSD's loader doesn't bother parsing the
123 * layout and just hard-codes attribute offsets.
124 */
125 static const sa_attr_type_t zpl_attr_layout[] = {
126 ZPL_MODE,
127 ZPL_SIZE,
128 ZPL_GEN,
129 ZPL_UID,
130 ZPL_GID,
131 ZPL_PARENT,
132 ZPL_FLAGS,
133 ZPL_ATIME,
134 ZPL_MTIME,
135 ZPL_CTIME,
136 ZPL_CRTIME,
137 ZPL_LINKS,
138 ZPL_DACL_COUNT,
139 ZPL_DACL_ACES,
140 ZPL_SYMLINK,
141 };
142
/*
 * Keys under which the ZPL attribute layouts are stored in the SA layout ZAP.
 * Indices 0 and 1 are reserved for legacy attribute encoding, so the layouts
 * defined here start at 2.
 */
#define	SA_LAYOUT_INDEX_DEFAULT	2	/* regular files and directories */
#define	SA_LAYOUT_INDEX_SYMLINK	3	/* symbolic links */
149
150 struct fs_populate_dir {
151 SLIST_ENTRY(fs_populate_dir) next;
152 int dirfd;
153 uint64_t objid;
154 zfs_zap_t *zap;
155 };
156
157 struct fs_populate_arg {
158 zfs_opt_t *zfs;
159 zfs_fs_t *fs; /* owning filesystem */
160 uint64_t rootdirid; /* root directory dnode ID */
161 int rootdirfd; /* root directory fd */
162 SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */
163 };
164
165 static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
166
/*
 * Close a descriptor; a failing close(2) is fatal.
 */
static void
eclose(int fd)
{
	if (close(fd) == 0)
		return;
	err(1, "close");
}
173
174 static bool
fsnode_isroot(const fsnode * cur)175 fsnode_isroot(const fsnode *cur)
176 {
177 return (strcmp(cur->name, ".") == 0);
178 }
179
180 static bool
fsnode_valid(const fsnode * cur)181 fsnode_valid(const fsnode *cur)
182 {
183 return (cur->type == S_IFREG || cur->type == S_IFDIR ||
184 cur->type == S_IFLNK);
185 }
186
187 /*
188 * Visit each node in a directory hierarchy, in pre-order depth-first order.
189 */
190 static void
fsnode_foreach(fsnode * root,int (* cb)(fsnode *,void *),void * arg)191 fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
192 {
193 assert(root->type == S_IFDIR);
194
195 for (fsnode *cur = root; cur != NULL; cur = cur->next) {
196 if (!fsnode_valid(cur)) {
197 warnx("skipping unhandled %s %s/%s",
198 inode_type(cur->type), cur->path, cur->name);
199 continue;
200 }
201 if (cb(cur, arg) == 0)
202 continue;
203 if (cur->type == S_IFDIR && cur->child != NULL)
204 fsnode_foreach(cur->child, cb, arg);
205 }
206 }
207
208 static void
fs_populate_dirent(struct fs_populate_arg * arg,fsnode * cur,uint64_t dnid)209 fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
210 {
211 struct fs_populate_dir *dir;
212 uint64_t type;
213
214 switch (cur->type) {
215 case S_IFREG:
216 type = DT_REG;
217 break;
218 case S_IFDIR:
219 type = DT_DIR;
220 break;
221 case S_IFLNK:
222 type = DT_LNK;
223 break;
224 default:
225 assert(0);
226 }
227
228 dir = SLIST_FIRST(&arg->dirs);
229 zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
230 }
231
232 static void
fs_populate_attr(zfs_fs_t * fs,char * attrbuf,const void * val,uint16_t ind,size_t * szp)233 fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
234 size_t *szp)
235 {
236 assert(ind < fs->sacnt);
237 assert(fs->saoffs[ind] != 0xffff);
238
239 memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
240 *szp += fs->satab[ind].size;
241 }
242
243 static void
fs_populate_varszattr(zfs_fs_t * fs,char * attrbuf,const void * val,size_t valsz,size_t varoff,uint16_t ind,size_t * szp)244 fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
245 size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
246 {
247 assert(ind < fs->sacnt);
248 assert(fs->saoffs[ind] != 0xffff);
249 assert(fs->satab[ind].size == 0);
250
251 memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
252 *szp += valsz;
253 }
254
255 /*
256 * Derive the relative fd/path combo needed to access a file. Ideally we'd
257 * always be able to use relative lookups (i.e., use the *at() system calls),
258 * since they require less path translation and are more amenable to sandboxing,
259 * but the handling of multiple staging directories makes that difficult. To
260 * make matters worse, we have no choice but to use relative lookups when
261 * dealing with an mtree manifest, so both mechanisms are implemented.
262 */
263 static void
fs_populate_path(const fsnode * cur,struct fs_populate_arg * arg,char * path,size_t sz,int * dirfdp)264 fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg,
265 char *path, size_t sz, int *dirfdp)
266 {
267 if (cur->contents != NULL) {
268 size_t n;
269
270 *dirfdp = AT_FDCWD;
271 n = strlcpy(path, cur->contents, sz);
272 assert(n < sz);
273 } else if (cur->root == NULL) {
274 size_t n;
275
276 *dirfdp = SLIST_FIRST(&arg->dirs)->dirfd;
277 n = strlcpy(path, cur->name, sz);
278 assert(n < sz);
279 } else {
280 int n;
281
282 *dirfdp = AT_FDCWD;
283 n = snprintf(path, sz, "%s/%s/%s",
284 cur->root, cur->path, cur->name);
285 assert(n >= 0);
286 assert((size_t)n < sz);
287 }
288 }
289
290 static int
fs_open(const fsnode * cur,struct fs_populate_arg * arg,int flags)291 fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags)
292 {
293 char path[PATH_MAX];
294 int fd;
295
296 fs_populate_path(cur, arg, path, sizeof(path), &fd);
297
298 fd = openat(fd, path, flags);
299 if (fd < 0)
300 err(1, "openat(%s)", path);
301 return (fd);
302 }
303
304 static int
fs_open_can_fail(const fsnode * cur,struct fs_populate_arg * arg,int flags)305 fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags)
306 {
307 int fd;
308 char path[PATH_MAX];
309
310 fs_populate_path(cur, arg, path, sizeof(path), &fd);
311
312 return (openat(fd, path, flags));
313 }
314
315 static void
fs_readlink(const fsnode * cur,struct fs_populate_arg * arg,char * buf,size_t bufsz)316 fs_readlink(const fsnode *cur, struct fs_populate_arg *arg,
317 char *buf, size_t bufsz)
318 {
319 char path[PATH_MAX];
320 int fd;
321
322 if (cur->symlink != NULL) {
323 size_t n;
324
325 n = strlcpy(buf, cur->symlink, bufsz);
326 assert(n < bufsz);
327 } else {
328 ssize_t n;
329
330 fs_populate_path(cur, arg, path, sizeof(path), &fd);
331
332 n = readlinkat(fd, path, buf, bufsz - 1);
333 if (n == -1)
334 err(1, "readlinkat(%s)", cur->name);
335 buf[n] = '\0';
336 }
337 }
338
339 static void
fs_populate_time(zfs_fs_t * fs,char * attrbuf,struct timespec * ts,uint16_t ind,size_t * szp)340 fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts,
341 uint16_t ind, size_t *szp)
342 {
343 uint64_t timebuf[2];
344
345 assert(ind < fs->sacnt);
346 assert(fs->saoffs[ind] != 0xffff);
347 assert(fs->satab[ind].size == sizeof(timebuf));
348
349 timebuf[0] = ts->tv_sec;
350 timebuf[1] = ts->tv_nsec;
351 fs_populate_attr(fs, attrbuf, timebuf, ind, szp);
352 }
353
354 static void
fs_populate_sattrs(struct fs_populate_arg * arg,const fsnode * cur,dnode_phys_t * dnode)355 fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
356 dnode_phys_t *dnode)
357 {
358 char target[PATH_MAX];
359 zfs_fs_t *fs;
360 zfs_ace_hdr_t aces[3];
361 struct stat *sb;
362 sa_hdr_phys_t *sahdr;
363 uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
364 char *attrbuf;
365 size_t bonussz, hdrsz;
366 int layout;
367
368 assert(dnode->dn_bonustype == DMU_OT_SA);
369 assert(dnode->dn_nblkptr == 1);
370
371 fs = arg->fs;
372 sb = &cur->inode->st;
373
374 switch (cur->type) {
375 case S_IFREG:
376 layout = SA_LAYOUT_INDEX_DEFAULT;
377 links = cur->inode->nlink;
378 objsize = sb->st_size;
379 parent = SLIST_FIRST(&arg->dirs)->objid;
380 break;
381 case S_IFDIR:
382 layout = SA_LAYOUT_INDEX_DEFAULT;
383 links = 1; /* .. */
384 objsize = 1; /* .. */
385
386 /*
387 * The size of a ZPL directory is the number of entries
388 * (including "." and ".."), and the link count is the number of
389 * entries which are directories (including "." and "..").
390 */
391 for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
392 c != NULL; c = c->next) {
393 switch (c->type) {
394 case S_IFDIR:
395 links++;
396 /* FALLTHROUGH */
397 case S_IFREG:
398 case S_IFLNK:
399 objsize++;
400 break;
401 }
402 }
403
404 /* The root directory is its own parent. */
405 parent = SLIST_EMPTY(&arg->dirs) ?
406 arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
407 break;
408 case S_IFLNK:
409 fs_readlink(cur, arg, target, sizeof(target));
410
411 layout = SA_LAYOUT_INDEX_SYMLINK;
412 links = 1;
413 objsize = strlen(target);
414 parent = SLIST_FIRST(&arg->dirs)->objid;
415 break;
416 default:
417 assert(0);
418 }
419
420 daclcount = nitems(aces);
421 flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_ARCHIVE |
422 ZFS_AV_MODIFIED;
423 gen = 1;
424 gid = sb->st_gid;
425 mode = sb->st_mode;
426 uid = sb->st_uid;
427
428 memset(aces, 0, sizeof(aces));
429 aces[0].z_flags = ACE_OWNER;
430 aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
431 aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
432 ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
433 ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
434 if ((mode & S_IRUSR) != 0)
435 aces[0].z_access_mask |= ACE_READ_DATA;
436 if ((mode & S_IWUSR) != 0)
437 aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
438 if ((mode & S_IXUSR) != 0)
439 aces[0].z_access_mask |= ACE_EXECUTE;
440
441 aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
442 aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
443 aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
444 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
445 if ((mode & S_IRGRP) != 0)
446 aces[1].z_access_mask |= ACE_READ_DATA;
447 if ((mode & S_IWGRP) != 0)
448 aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
449 if ((mode & S_IXGRP) != 0)
450 aces[1].z_access_mask |= ACE_EXECUTE;
451
452 aces[2].z_flags = ACE_EVERYONE;
453 aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
454 aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
455 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
456 if ((mode & S_IROTH) != 0)
457 aces[2].z_access_mask |= ACE_READ_DATA;
458 if ((mode & S_IWOTH) != 0)
459 aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
460 if ((mode & S_IXOTH) != 0)
461 aces[2].z_access_mask |= ACE_EXECUTE;
462
463 switch (layout) {
464 case SA_LAYOUT_INDEX_DEFAULT:
465 /* At most one variable-length attribute. */
466 hdrsz = sizeof(uint64_t);
467 break;
468 case SA_LAYOUT_INDEX_SYMLINK:
469 /* At most five variable-length attributes. */
470 hdrsz = sizeof(uint64_t) * 2;
471 break;
472 default:
473 assert(0);
474 }
475
476 sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
477 sahdr->sa_magic = SA_MAGIC;
478 SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);
479
480 bonussz = SA_HDR_SIZE(sahdr);
481 attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);
482
483 fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
484 fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
485 fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
486 fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
487 fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
488 fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
489 fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
490 fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
491 fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);
492
493 /*
494 * We deliberately set atime = mtime here to ensure that images are
495 * reproducible.
496 */
497 fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
498 fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
499 fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
500 #ifdef __linux__
501 /* Linux has no st_birthtim; approximate with st_ctim */
502 fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz);
503 #else
504 fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
505 #endif
506
507 fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
508 ZPL_DACL_ACES, &bonussz);
509 sahdr->sa_lengths[0] = sizeof(aces);
510
511 if (cur->type == S_IFLNK) {
512 assert(layout == SA_LAYOUT_INDEX_SYMLINK);
513 /* Need to use a spill block pointer if the target is long. */
514 assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
515 fs_populate_varszattr(fs, attrbuf, target, objsize,
516 sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
517 sahdr->sa_lengths[1] = (uint16_t)objsize;
518 }
519
520 dnode->dn_bonuslen = bonussz;
521 }
522
523 static void
fs_populate_file(fsnode * cur,struct fs_populate_arg * arg)524 fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
525 {
526 struct dnode_cursor *c;
527 dnode_phys_t *dnode;
528 zfs_opt_t *zfs;
529 char *buf;
530 uint64_t dnid;
531 ssize_t n;
532 size_t bufsz;
533 off_t nbytes, reqbytes, size;
534 int fd;
535
536 assert(cur->type == S_IFREG);
537 assert((cur->inode->flags & FI_ROOT) == 0);
538
539 zfs = arg->zfs;
540
541 assert(cur->inode->ino != 0);
542 if ((cur->inode->flags & FI_ALLOCATED) != 0) {
543 /*
544 * This is a hard link of an existing file.
545 *
546 * XXX-MJ need to check whether it crosses datasets, add a test
547 * case for that
548 */
549 fs_populate_dirent(arg, cur, cur->inode->ino);
550 return;
551 }
552
553 dnode = objset_dnode_bonus_alloc(arg->fs->os,
554 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
555 cur->inode->ino = dnid;
556 cur->inode->flags |= FI_ALLOCATED;
557
558 fd = fs_open(cur, arg, O_RDONLY);
559
560 buf = zfs->filebuf;
561 bufsz = sizeof(zfs->filebuf);
562 size = cur->inode->st.st_size;
563 c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
564 for (off_t foff = 0; foff < size; foff += nbytes) {
565 off_t loc, sofar;
566
567 /*
568 * Fill up our buffer, handling partial reads.
569 */
570 sofar = 0;
571 nbytes = MIN(size - foff, (off_t)bufsz);
572 do {
573 n = read(fd, buf + sofar, nbytes);
574 if (n < 0)
575 err(1, "reading from '%s'", cur->name);
576 if (n == 0)
577 errx(1, "unexpected EOF reading '%s'",
578 cur->name);
579 sofar += n;
580 } while (sofar < nbytes);
581
582 if (nbytes < (off_t)bufsz)
583 memset(buf + nbytes, 0, bufsz - nbytes);
584
585 reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE;
586 loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes);
587 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc,
588 dnode_cursor_next(zfs, c, foff));
589 }
590 eclose(fd);
591 dnode_cursor_finish(zfs, c);
592
593 fs_populate_sattrs(arg, cur, dnode);
594 fs_populate_dirent(arg, cur, dnid);
595 }
596
597 static void
fs_populate_dir(fsnode * cur,struct fs_populate_arg * arg)598 fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
599 {
600 dnode_phys_t *dnode;
601 zfs_objset_t *os;
602 uint64_t dnid;
603 int dirfd;
604
605 assert(cur->type == S_IFDIR);
606 assert((cur->inode->flags & FI_ALLOCATED) == 0);
607
608 os = arg->fs->os;
609
610 dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
611 DMU_OT_SA, 0, &dnid);
612
613 /*
614 * Add an entry to the parent directory and open this directory.
615 */
616 if (!SLIST_EMPTY(&arg->dirs)) {
617 fs_populate_dirent(arg, cur, dnid);
618 /*
619 * We only need the directory fd if we're finding files in
620 * it. If it's just there for other directories or
621 * files using contents= we don't need to succeed here.
622 */
623 dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY);
624 } else {
625 arg->rootdirid = dnid;
626 dirfd = arg->rootdirfd;
627 arg->rootdirfd = -1;
628 }
629
630 /*
631 * Set ZPL attributes.
632 */
633 fs_populate_sattrs(arg, cur, dnode);
634
635 /*
636 * If this is a root directory, then its children belong to a different
637 * dataset and this directory remains empty in the current objset.
638 */
639 if ((cur->inode->flags & FI_ROOT) == 0) {
640 struct fs_populate_dir *dir;
641
642 dir = ecalloc(1, sizeof(*dir));
643 dir->dirfd = dirfd;
644 dir->objid = dnid;
645 dir->zap = zap_alloc(os, dnode);
646 SLIST_INSERT_HEAD(&arg->dirs, dir, next);
647 } else {
648 zap_write(arg->zfs, zap_alloc(os, dnode));
649 fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
650 }
651 }
652
653 static void
fs_populate_symlink(fsnode * cur,struct fs_populate_arg * arg)654 fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
655 {
656 dnode_phys_t *dnode;
657 uint64_t dnid;
658
659 assert(cur->type == S_IFLNK);
660 assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
661
662 dnode = objset_dnode_bonus_alloc(arg->fs->os,
663 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
664
665 fs_populate_dirent(arg, cur, dnid);
666
667 fs_populate_sattrs(arg, cur, dnode);
668 }
669
670 static fsnode *
fsnode_next(fsnode * cur)671 fsnode_next(fsnode *cur)
672 {
673 for (cur = cur->next; cur != NULL; cur = cur->next) {
674 if (fsnode_valid(cur))
675 return (cur);
676 }
677 return (NULL);
678 }
679
680 static int
fs_foreach_populate(fsnode * cur,void * _arg)681 fs_foreach_populate(fsnode *cur, void *_arg)
682 {
683 struct fs_populate_arg *arg;
684 struct fs_populate_dir *dir;
685 int ret;
686
687 arg = _arg;
688 switch (cur->type) {
689 case S_IFREG:
690 fs_populate_file(cur, arg);
691 break;
692 case S_IFDIR:
693 if (fsnode_isroot(cur))
694 break;
695 fs_populate_dir(cur, arg);
696 break;
697 case S_IFLNK:
698 fs_populate_symlink(cur, arg);
699 break;
700 default:
701 assert(0);
702 }
703
704 ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
705
706 if (fsnode_next(cur) == NULL &&
707 (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
708 /*
709 * We reached a terminal node in a subtree. Walk back up and
710 * write out directories. We're done once we hit the root of a
711 * dataset or find a level where we're not on the edge of the
712 * tree.
713 */
714 do {
715 dir = SLIST_FIRST(&arg->dirs);
716 SLIST_REMOVE_HEAD(&arg->dirs, next);
717 zap_write(arg->zfs, dir->zap);
718 if (dir->dirfd != -1)
719 eclose(dir->dirfd);
720 free(dir);
721 cur = cur->parent;
722 } while (cur != NULL && fsnode_next(cur) == NULL &&
723 (cur->inode->flags & FI_ROOT) == 0);
724 }
725
726 return (ret);
727 }
728
729 static void
fs_add_zpl_attr_layout(zfs_zap_t * zap,unsigned int index,const sa_attr_type_t layout[],size_t sacnt)730 fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
731 const sa_attr_type_t layout[], size_t sacnt)
732 {
733 char ti[16];
734
735 assert(sizeof(layout[0]) == 2);
736
737 snprintf(ti, sizeof(ti), "%u", index);
738 zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
739 (const uint8_t *)layout);
740 }
741
742 /*
743 * Initialize system attribute tables.
744 *
745 * There are two elements to this. First, we write the zpl_attrs[] and
746 * zpl_attr_layout[] tables to disk. Then we create a lookup table which
747 * allows us to set file attributes quickly.
748 */
749 static uint64_t
fs_set_zpl_attrs(zfs_opt_t * zfs,zfs_fs_t * fs)750 fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
751 {
752 zfs_zap_t *sazap, *salzap, *sarzap;
753 zfs_objset_t *os;
754 dnode_phys_t *saobj, *salobj, *sarobj;
755 uint64_t saobjid, salobjid, sarobjid;
756 uint16_t offset;
757
758 os = fs->os;
759
760 /*
761 * The on-disk tables are stored in two ZAP objects, the registry object
762 * and the layout object. Individual attributes are described by
763 * entries in the registry object; for example, the value for the
764 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
765 * The attributes of a file are ordered according to one of the layouts
766 * defined in the layout object. The master node object is simply used
767 * to locate the registry and layout objects.
768 */
769 saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
770 salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
771 sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
772
773 sarzap = zap_alloc(os, sarobj);
774 for (size_t i = 0; i < nitems(zpl_attrs); i++) {
775 const zfs_sattr_t *sa;
776 uint64_t attr;
777
778 attr = 0;
779 sa = &zpl_attrs[i];
780 SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
781 zap_add_uint64(sarzap, sa->name, attr);
782 }
783 zap_write(zfs, sarzap);
784
785 /*
786 * Layouts are arrays of indices into the registry. We define two
787 * layouts for use by the ZPL, one for non-symlinks and one for
788 * symlinks. They are identical except that the symlink layout includes
789 * ZPL_SYMLINK as its final attribute.
790 */
791 salzap = zap_alloc(os, salobj);
792 assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
793 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
794 zpl_attr_layout, nitems(zpl_attr_layout) - 1);
795 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
796 zpl_attr_layout, nitems(zpl_attr_layout));
797 zap_write(zfs, salzap);
798
799 sazap = zap_alloc(os, saobj);
800 zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
801 zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
802 zap_write(zfs, sazap);
803
804 /* Sanity check. */
805 for (size_t i = 0; i < nitems(zpl_attrs); i++)
806 assert(i == zpl_attrs[i].id);
807
808 /*
809 * Build the offset table used when setting file attributes. File
810 * attributes are stored in the object's bonus buffer; this table
811 * provides the buffer offset of attributes referenced by the layout
812 * table.
813 */
814 fs->sacnt = nitems(zpl_attrs);
815 fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
816 for (size_t i = 0; i < fs->sacnt; i++)
817 fs->saoffs[i] = 0xffff;
818 offset = 0;
819 for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
820 uint16_t size;
821
822 assert(zpl_attr_layout[i] < fs->sacnt);
823
824 fs->saoffs[zpl_attr_layout[i]] = offset;
825 size = zpl_attrs[zpl_attr_layout[i]].size;
826 offset += size;
827 }
828 fs->satab = zpl_attrs;
829
830 return (saobjid);
831 }
832
833 static void
fs_layout_one(zfs_opt_t * zfs,zfs_dsl_dir_t * dsldir,void * arg)834 fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
835 {
836 char *mountpoint, *origmountpoint, *name, *next;
837 fsnode *cur, *root;
838 uint64_t canmount;
839
840 if (!dsl_dir_has_dataset(dsldir))
841 return;
842
843 if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
844 return;
845 mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
846 if (mountpoint == NULL)
847 return;
848
849 /*
850 * If we were asked to specify a bootfs, set it here.
851 */
852 if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
853 dsl_dir_fullname(dsldir)) == 0) {
854 zap_add_uint64(zfs->poolprops, "bootfs",
855 dsl_dir_dataset_id(dsldir));
856 }
857
858 origmountpoint = mountpoint;
859
860 /*
861 * Figure out which fsnode corresponds to our mountpoint.
862 */
863 root = arg;
864 cur = root;
865 if (strcmp(mountpoint, zfs->rootpath) != 0) {
866 mountpoint += strlen(zfs->rootpath);
867
868 /*
869 * Look up the directory in the staged tree. For example, if
870 * the dataset's mount point is /foo/bar/baz, we'll search the
871 * root directory for "foo", search "foo" for "baz", and so on.
872 * Each intermediate name must refer to a directory; the final
873 * component need not exist.
874 */
875 cur = root;
876 for (next = name = mountpoint; next != NULL;) {
877 for (; *next == '/'; next++)
878 ;
879 name = strsep(&next, "/");
880
881 for (; cur != NULL && strcmp(cur->name, name) != 0;
882 cur = cur->next)
883 ;
884 if (cur == NULL) {
885 if (next == NULL)
886 break;
887 errx(1, "missing mountpoint directory for `%s'",
888 dsl_dir_fullname(dsldir));
889 }
890 if (cur->type != S_IFDIR) {
891 errx(1,
892 "mountpoint for `%s' is not a directory",
893 dsl_dir_fullname(dsldir));
894 }
895 if (next != NULL)
896 cur = cur->child;
897 }
898 }
899
900 if (cur != NULL) {
901 assert(cur->type == S_IFDIR);
902
903 /*
904 * Multiple datasets shouldn't share a mountpoint. It's
905 * technically allowed, but it's not clear what makefs should do
906 * in that case.
907 */
908 assert((cur->inode->flags & FI_ROOT) == 0);
909 if (cur != root)
910 cur->inode->flags |= FI_ROOT;
911 assert(cur->inode->param == NULL);
912 cur->inode->param = dsldir;
913 }
914
915 free(origmountpoint);
916 }
917
918 static int
fs_foreach_mark(fsnode * cur,void * arg)919 fs_foreach_mark(fsnode *cur, void *arg)
920 {
921 uint64_t *countp;
922
923 countp = arg;
924 if (cur->type == S_IFDIR && fsnode_isroot(cur))
925 return (1);
926
927 if (cur->inode->ino == 0) {
928 cur->inode->ino = ++(*countp);
929 cur->inode->nlink = 1;
930 } else {
931 cur->inode->nlink++;
932 }
933
934 return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
935 }
936
937 /*
938 * Create a filesystem dataset. More specifically:
939 * - create an object set for the dataset,
940 * - add required metadata (SA tables, property definitions, etc.) to that
941 * object set,
942 * - optionally populate the object set with file objects, using "root" as the
943 * root directory.
944 *
945 * "dirfd" is a directory descriptor for the directory referenced by "root". It
946 * is closed before returning.
947 */
948 static void
fs_build_one(zfs_opt_t * zfs,zfs_dsl_dir_t * dsldir,fsnode * root,int dirfd)949 fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
950 {
951 struct fs_populate_arg arg;
952 zfs_fs_t fs;
953 zfs_zap_t *masterzap;
954 zfs_objset_t *os;
955 dnode_phys_t *deleteq, *masterobj;
956 uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
957 bool fakedroot;
958
959 /*
960 * This dataset's mountpoint doesn't exist in the staging tree, or the
961 * dataset doesn't have a mountpoint at all. In either case we still
962 * need a root directory. Fake up a root fsnode to handle this case.
963 */
964 fakedroot = root == NULL;
965 if (fakedroot) {
966 struct stat *stp;
967
968 assert(dirfd == -1);
969
970 root = ecalloc(1, sizeof(*root));
971 root->inode = ecalloc(1, sizeof(*root->inode));
972 root->name = estrdup(".");
973 root->type = S_IFDIR;
974
975 stp = &root->inode->st;
976 stp->st_uid = 0;
977 stp->st_gid = 0;
978 stp->st_mode = S_IFDIR | 0755;
979 }
980 assert(root->type == S_IFDIR);
981 assert(fsnode_isroot(root));
982
983 /*
984 * Initialize the object set for this dataset.
985 */
986 os = objset_alloc(zfs, DMU_OST_ZFS);
987 masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
988 assert(moid == MASTER_NODE_OBJ);
989
990 memset(&fs, 0, sizeof(fs));
991 fs.os = os;
992
993 /*
994 * Create the ZAP SA layout now since filesystem object dnodes will
995 * refer to those attributes.
996 */
997 saobjid = fs_set_zpl_attrs(zfs, &fs);
998
999 /*
1000 * Make a pass over the staged directory to detect hard links and assign
1001 * virtual dnode numbers.
1002 */
1003 dnodecount = 1; /* root directory */
1004 fsnode_foreach(root, fs_foreach_mark, &dnodecount);
1005
1006 /*
1007 * Make a second pass to populate the dataset with files from the
1008 * staged directory. Most of our runtime is spent here.
1009 */
1010 arg.rootdirfd = dirfd;
1011 arg.zfs = zfs;
1012 arg.fs = &fs;
1013 SLIST_INIT(&arg.dirs);
1014 fs_populate_dir(root, &arg);
1015 assert(!SLIST_EMPTY(&arg.dirs));
1016 fsnode_foreach(root, fs_foreach_populate, &arg);
1017 assert(SLIST_EMPTY(&arg.dirs));
1018 rootdirid = arg.rootdirid;
1019
1020 /*
1021 * Create an empty delete queue. We don't do anything with it, but
1022 * OpenZFS will refuse to mount filesystems that don't have one.
1023 */
1024 deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
1025 zap_write(zfs, zap_alloc(os, deleteq));
1026
1027 /*
1028 * Populate and write the master node object. This is a ZAP object
1029 * containing various dataset properties and the object IDs of the root
1030 * directory and delete queue.
1031 */
1032 masterzap = zap_alloc(os, masterobj);
1033 zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
1034 zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
1035 zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
1036 zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
1037 zap_add_uint64(masterzap, "normalization", 0 /* off */);
1038 zap_add_uint64(masterzap, "utf8only", 0 /* off */);
1039 zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
1040 zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
1041 zap_write(zfs, masterzap);
1042
1043 /*
1044 * All finished with this object set, we may as well write it now.
1045 * The DSL layer will sum up the bytes consumed by each dataset using
1046 * information stored in the object set, so it can't be freed just yet.
1047 */
1048 dsl_dir_dataset_write(zfs, os, dsldir);
1049
1050 if (fakedroot) {
1051 free(root->inode);
1052 free(root->name);
1053 free(root);
1054 }
1055 free(fs.saoffs);
1056 }
1057
1058 /*
1059 * Create an object set for each DSL directory which has a dataset and doesn't
1060 * already have an object set.
1061 */
1062 static void
fs_build_unmounted(zfs_opt_t * zfs,zfs_dsl_dir_t * dsldir,void * arg __unused)1063 fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
1064 {
1065 if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
1066 fs_build_one(zfs, dsldir, NULL, -1);
1067 }
1068
1069 /*
1070 * Create our datasets and populate them with files.
1071 */
1072 void
fs_build(zfs_opt_t * zfs,int dirfd,fsnode * root)1073 fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
1074 {
1075 /*
1076 * Run through our datasets and find the root fsnode for each one. Each
1077 * root fsnode is flagged so that we can figure out which dataset it
1078 * belongs to.
1079 */
1080 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);
1081
1082 /*
1083 * Did we find our boot filesystem?
1084 */
1085 if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
1086 errx(1, "no mounted dataset matches bootfs property `%s'",
1087 zfs->bootfs);
1088
1089 /*
1090 * Traverse the file hierarchy starting from the root fsnode. One
1091 * dataset, not necessarily the root dataset, must "own" the root
1092 * directory by having its mountpoint be equal to the root path.
1093 *
1094 * As roots of other datasets are encountered during the traversal,
1095 * fs_build_one() recursively creates the corresponding object sets and
1096 * populates them. Once this function has returned, all datasets will
1097 * have been fully populated.
1098 */
1099 fs_build_one(zfs, root->inode->param, root, dirfd);
1100
1101 /*
1102 * Now create object sets for datasets whose mountpoints weren't found
1103 * in the staging directory, either because there is no mountpoint, or
1104 * because the mountpoint doesn't correspond to an existing directory.
1105 */
1106 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
1107 }
1108