xref: /freebsd/usr.sbin/makefs/zfs/fs.c (revision 8eca3207980a8c2f6457c1cd2e9ff6b235a3018d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/dirent.h>
32 #include <sys/stat.h>
33 
34 #include <assert.h>
35 #include <fcntl.h>
36 #include <string.h>
37 #include <unistd.h>
38 
39 #include <util.h>
40 
41 #include "makefs.h"
42 #include "zfs.h"
43 
/*
 * Description of a single ZPL system attribute, as listed in zpl_attrs[] and
 * encoded into the on-disk SA registry by fs_set_zpl_attrs().
 */
typedef struct {
	const char	*name;	/* registry ZAP key: the stringified attribute name */
	unsigned int	id;	/* attribute number, a zpl_attr_t value */
	uint16_t	size;	/* size in bytes; 0 for variable-length attributes */
	sa_bswap_type_t	bs;	/* byte-swap type handed to SA_ATTR_ENCODE() */
} zfs_sattr_t;
50 
/*
 * State describing one ZPL filesystem while it is being built.
 */
typedef struct zfs_fs {
	zfs_objset_t	*os;	/* object set holding this filesystem's objects */

	/* Offset table for system attributes, indexed by a zpl_attr_t. */
	uint16_t	*saoffs;	/* 0xffff: attribute is not in the layout */
	size_t		sacnt;		/* number of entries in saoffs[] */
	const zfs_sattr_t *satab;	/* attribute descriptions (zpl_attrs[]) */
} zfs_fs_t;
59 
/*
 * ZPL system attribute identifiers.
 *
 * The order of the attributes doesn't matter, this is simply the one hard-coded
 * by OpenZFS, based on a zdb dump of the SA_REGISTRY table.  The enumerator
 * values are also used as indices into zpl_attrs[]; fs_set_zpl_attrs() asserts
 * that each table entry's id equals its index, so the two must stay in step.
 */
typedef enum zpl_attr {
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_GEN,
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_PARENT,
	ZPL_LINKS,
	ZPL_XATTR,
	ZPL_RDEV,
	ZPL_FLAGS,
	ZPL_UID,
	ZPL_GID,
	ZPL_PAD,
	ZPL_ZNODE_ACL,
	ZPL_DACL_COUNT,
	ZPL_SYMLINK,
	ZPL_SCANSTAMP,
	ZPL_DACL_ACES,
	ZPL_DXATTR,
	ZPL_PROJID,
} zpl_attr_t;
88 
89 /*
90  * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
91  */
92 static const zfs_sattr_t zpl_attrs[] = {
93 #define	_ZPL_ATTR(n, s, b)	{ .name = #n, .id = n, .size = s, .bs = b }
94 	_ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
95 	_ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
96 	_ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
97 	_ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
98 	_ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
99 	_ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
100 	_ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
101 	_ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
102 	_ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
103 	_ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
104 	_ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
105 	_ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
106 	_ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
107 	_ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
108 	_ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
109 	_ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
110 	_ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
111 	_ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
112 	_ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
113 	_ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
114 	_ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
115 	_ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
116 #undef ZPL_ATTR
117 };
118 
/*
 * Order in which attributes are laid out in a file's bonus buffer.
 *
 * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
 * It need not match in general, but FreeBSD's loader doesn't bother parsing the
 * layout and just hard-codes attribute offsets.
 *
 * ZPL_SYMLINK must remain the final entry: fs_set_zpl_attrs() asserts this and
 * derives the non-symlink layout by dropping the last element.
 */
static const sa_attr_type_t zpl_attr_layout[] = {
	ZPL_MODE,
	ZPL_SIZE,
	ZPL_GEN,
	ZPL_UID,
	ZPL_GID,
	ZPL_PARENT,
	ZPL_FLAGS,
	ZPL_ATIME,
	ZPL_MTIME,
	ZPL_CTIME,
	ZPL_CRTIME,
	ZPL_LINKS,
	ZPL_DACL_COUNT,
	ZPL_DACL_ACES,
	ZPL_SYMLINK,
};
141 
/*
 * Keys for the ZPL attribute tables in the SA layout ZAP.  The first two
 * indices are reserved for legacy attribute encoding.
 */
/* Layout used for regular files and directories. */
#define	SA_LAYOUT_INDEX_DEFAULT	2
/* Layout used for symbolic links; adds ZPL_SYMLINK as the last attribute. */
#define	SA_LAYOUT_INDEX_SYMLINK	3
148 
/*
 * One element of the stack of directories that are currently being populated.
 * An element is pushed by fs_populate_dir() and popped, written out and freed
 * by fs_foreach_populate().
 */
struct fs_populate_dir {
	SLIST_ENTRY(fs_populate_dir) next;
	int			dirfd;	/* open directory fd, or -1 if none */
	uint64_t		objid;	/* dnode ID of the directory */
	zfs_zap_t		*zap;	/* ZAP collecting the directory's entries */
};
155 
/*
 * Context threaded through the populate pass (see fs_foreach_populate()) for a
 * single dataset.
 */
struct fs_populate_arg {
	zfs_opt_t	*zfs;			/* global makefs ZFS state */
	zfs_fs_t	*fs;			/* owning filesystem */
	uint64_t	rootdirid;		/* root directory dnode ID */
	int		rootdirfd;		/* root directory fd */
	SLIST_HEAD(, fs_populate_dir) dirs;	/* stack of directories */
};
163 
/*
 * Forward declaration: fs_populate_dir() and fs_build_one() call each other,
 * the former to build nested datasets and the latter to create a root
 * directory.
 */
static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
165 
/*
 * Close "fd", exiting with an error message if close(2) fails.
 */
static void
eclose(int fd)
{
	const int error = close(fd);

	if (error == 0)
		return;
	err(1, "close");
}
172 
173 static bool
174 fsnode_isroot(const fsnode *cur)
175 {
176 	return (strcmp(cur->name, ".") == 0);
177 }
178 
179 /*
180  * Visit each node in a directory hierarchy, in pre-order depth-first order.
181  */
182 static void
183 fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
184 {
185 	assert(root->type == S_IFDIR);
186 
187 	for (fsnode *cur = root; cur != NULL; cur = cur->next) {
188 		assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
189 		    cur->type == S_IFLNK);
190 
191 		if (cb(cur, arg) == 0)
192 			continue;
193 		if (cur->type == S_IFDIR && cur->child != NULL)
194 			fsnode_foreach(cur->child, cb, arg);
195 	}
196 }
197 
198 static void
199 fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
200 {
201 	struct fs_populate_dir *dir;
202 	uint64_t type;
203 
204 	switch (cur->type) {
205 	case S_IFREG:
206 		type = DT_REG;
207 		break;
208 	case S_IFDIR:
209 		type = DT_DIR;
210 		break;
211 	case S_IFLNK:
212 		type = DT_LNK;
213 		break;
214 	default:
215 		assert(0);
216 	}
217 
218 	dir = SLIST_FIRST(&arg->dirs);
219 	zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
220 }
221 
222 static void
223 fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
224     size_t *szp)
225 {
226 	assert(ind < fs->sacnt);
227 	assert(fs->saoffs[ind] != 0xffff);
228 
229 	memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
230 	*szp += fs->satab[ind].size;
231 }
232 
233 static void
234 fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
235     size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
236 {
237 	assert(ind < fs->sacnt);
238 	assert(fs->saoffs[ind] != 0xffff);
239 	assert(fs->satab[ind].size == 0);
240 
241 	memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
242 	*szp += valsz;
243 }
244 
245 /*
246  * Derive the relative fd/path combo needed to access a file.  Ideally we'd
247  * always be able to use relative lookups (i.e., use the *at() system calls),
248  * since they require less path translation and are more amenable to sandboxing,
249  * but the handling of multiple staging directories makes that difficult.  To
250  * make matters worse, we have no choice but to use relative lookups when
251  * dealing with an mtree manifest, so both mechanisms are implemented.
252  */
253 static void
254 fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg,
255     char *path, size_t sz, int *dirfdp)
256 {
257 	if (cur->root == NULL) {
258 		size_t n;
259 
260 		*dirfdp = SLIST_FIRST(&arg->dirs)->dirfd;
261 		n = strlcpy(path, cur->name, sz);
262 		assert(n < sz);
263 	} else {
264 		int n;
265 
266 		*dirfdp = AT_FDCWD;
267 		n = snprintf(path, sz, "%s/%s/%s",
268 		    cur->root, cur->path, cur->name);
269 		assert(n >= 0);
270 		assert((size_t)n < sz);
271 	}
272 }
273 
274 static int
275 fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags)
276 {
277 	char path[PATH_MAX];
278 	int fd;
279 
280 	fs_populate_path(cur, arg, path, sizeof(path), &fd);
281 
282 	fd = openat(fd, path, flags);
283 	if (fd < 0)
284 		err(1, "openat(%s)", path);
285 	return (fd);
286 }
287 
288 static void
289 fs_readlink(const fsnode *cur, struct fs_populate_arg *arg,
290     char *buf, size_t bufsz)
291 {
292 	char path[PATH_MAX];
293 	ssize_t n;
294 	int fd;
295 
296 	fs_populate_path(cur, arg, path, sizeof(path), &fd);
297 
298 	n = readlinkat(fd, path, buf, bufsz - 1);
299 	if (n == -1)
300 		err(1, "readlinkat(%s)", cur->name);
301 	buf[n] = '\0';
302 }
303 
/*
 * Fill in the system attribute (SA) bonus buffer of "dnode" with the ZPL
 * attributes of the file, directory or symlink described by "cur".
 *
 * The bonus buffer consists of an SA header followed by the attributes, placed
 * at the offsets recorded in the filesystem's offset table.  On return
 * dn_bonuslen holds the header size plus the size of all attributes written.
 */
static void
fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
    dnode_phys_t *dnode)
{
	char target[PATH_MAX];
	zfs_fs_t *fs;
	zfs_ace_hdr_t aces[3];
	struct stat *sb;
	sa_hdr_phys_t *sahdr;
	uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
	char *attrbuf;
	size_t bonussz, hdrsz;
	int layout;

	assert(dnode->dn_bonustype == DMU_OT_SA);
	assert(dnode->dn_nblkptr == 1);

	fs = arg->fs;
	sb = &cur->inode->st;

	/* Work out the type-dependent attributes and pick the SA layout. */
	switch (cur->type) {
	case S_IFREG:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = cur->inode->nlink;
		objsize = sb->st_size;
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFDIR:
		layout = SA_LAYOUT_INDEX_DEFAULT;
		links = 1; /* .. */
		objsize = 1; /* .. */

		/*
		 * The size of a ZPL directory is the number of entries
		 * (including "." and ".."), and the link count is the number of
		 * entries which are directories (including "." and "..").
		 */
		for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
		    c != NULL; c = c->next) {
			if (c->type == S_IFDIR)
				links++;
			objsize++;
		}

		/* The root directory is its own parent. */
		parent = SLIST_EMPTY(&arg->dirs) ?
		    arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
		break;
	case S_IFLNK:
		fs_readlink(cur, arg, target, sizeof(target));

		layout = SA_LAYOUT_INDEX_SYMLINK;
		links = 1;
		/* The size of a symlink is the length of its target. */
		objsize = strlen(target);
		parent = SLIST_FIRST(&arg->dirs)->objid;
		break;
	default:
		assert(0);
	}

	daclcount = nitems(aces);
	flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED |
	    ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */
	gen = 1;
	gid = sb->st_gid;
	mode = sb->st_mode;
	uid = sb->st_uid;

	/*
	 * Build three "allow" ACEs, for the owner, the group and everyone,
	 * granting data access according to the corresponding mode bits.
	 */
	memset(aces, 0, sizeof(aces));
	aces[0].z_flags = ACE_OWNER;
	aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
	    ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
	    ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRUSR) != 0)
		aces[0].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWUSR) != 0)
		aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXUSR) != 0)
		aces[0].z_access_mask |= ACE_EXECUTE;

	aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
	aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IRGRP) != 0)
		aces[1].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWGRP) != 0)
		aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXGRP) != 0)
		aces[1].z_access_mask |= ACE_EXECUTE;

	aces[2].z_flags = ACE_EVERYONE;
	aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
	aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
	if ((mode & S_IROTH) != 0)
		aces[2].z_access_mask |= ACE_READ_DATA;
	if ((mode & S_IWOTH) != 0)
		aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
	if ((mode & S_IXOTH) != 0)
		aces[2].z_access_mask |= ACE_EXECUTE;

	/* The SA header size depends on the layout in use. */
	switch (layout) {
	case SA_LAYOUT_INDEX_DEFAULT:
		/* At most one variable-length attribute. */
		hdrsz = sizeof(uint64_t);
		break;
	case SA_LAYOUT_INDEX_SYMLINK:
		/* At most five variable-length attributes. */
		hdrsz = sizeof(uint64_t) * 2;
		break;
	default:
		assert(0);
	}

	sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
	sahdr->sa_magic = SA_MAGIC;
	SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);

	/* Attributes follow the header; bonussz accumulates the bytes used. */
	bonussz = SA_HDR_SIZE(sahdr);
	attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);

	fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
	fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
	fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
	fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
	fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
	fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
	fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
	fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
	fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);

	/*
	 * We deliberately set atime = mtime here to ensure that images are
	 * reproducible.
	 */
	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
	assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
	assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
	assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size);
	fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);

	/* First variable-length attribute: the ACEs built above. */
	fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
	    ZPL_DACL_ACES, &bonussz);
	sahdr->sa_lengths[0] = sizeof(aces);

	if (cur->type == S_IFLNK) {
		assert(layout == SA_LAYOUT_INDEX_SYMLINK);
		/* Need to use a spill block pointer if the target is long. */
		assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
		/* The link target is stored right after the ACEs. */
		fs_populate_varszattr(fs, attrbuf, target, objsize,
		    sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
		sahdr->sa_lengths[1] = (uint16_t)objsize;
	}

	dnode->dn_bonuslen = bonussz;
}
465 
466 static void
467 fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
468 {
469 	struct dnode_cursor *c;
470 	dnode_phys_t *dnode;
471 	zfs_opt_t *zfs;
472 	char *buf;
473 	uint64_t dnid;
474 	ssize_t n;
475 	size_t bufsz;
476 	off_t size, target;
477 	int fd;
478 
479 	assert(cur->type == S_IFREG);
480 	assert((cur->inode->flags & FI_ROOT) == 0);
481 
482 	zfs = arg->zfs;
483 
484 	assert(cur->inode->ino != 0);
485 	if ((cur->inode->flags & FI_ALLOCATED) != 0) {
486 		/*
487 		 * This is a hard link of an existing file.
488 		 *
489 		 * XXX-MJ need to check whether it crosses datasets, add a test
490 		 * case for that
491 		 */
492 		fs_populate_dirent(arg, cur, cur->inode->ino);
493 		return;
494 	}
495 
496 	dnode = objset_dnode_bonus_alloc(arg->fs->os,
497 	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
498 	cur->inode->ino = dnid;
499 	cur->inode->flags |= FI_ALLOCATED;
500 
501 	fd = fs_open(cur, arg, O_RDONLY);
502 
503 	buf = zfs->filebuf;
504 	bufsz = sizeof(zfs->filebuf);
505 	size = cur->inode->st.st_size;
506 	c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
507 	for (off_t foff = 0; foff < size; foff += target) {
508 		off_t loc, sofar;
509 
510 		/*
511 		 * Fill up our buffer, handling partial reads.
512 		 *
513 		 * It might be profitable to use copy_file_range(2) here.
514 		 */
515 		sofar = 0;
516 		target = MIN(size - foff, (off_t)bufsz);
517 		do {
518 			n = read(fd, buf + sofar, target);
519 			if (n < 0)
520 				err(1, "reading from '%s'", cur->name);
521 			if (n == 0)
522 				errx(1, "unexpected EOF reading '%s'",
523 				    cur->name);
524 			sofar += n;
525 		} while (sofar < target);
526 
527 		if (target < (off_t)bufsz)
528 			memset(buf + target, 0, bufsz - target);
529 
530 		loc = objset_space_alloc(zfs, arg->fs->os, &target);
531 		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc,
532 		    dnode_cursor_next(zfs, c, foff));
533 	}
534 	eclose(fd);
535 	dnode_cursor_finish(zfs, c);
536 
537 	fs_populate_sattrs(arg, cur, dnode);
538 	fs_populate_dirent(arg, cur, dnid);
539 }
540 
541 static void
542 fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
543 {
544 	dnode_phys_t *dnode;
545 	zfs_objset_t *os;
546 	uint64_t dnid;
547 	int dirfd;
548 
549 	assert(cur->type == S_IFDIR);
550 	assert((cur->inode->flags & FI_ALLOCATED) == 0);
551 
552 	os = arg->fs->os;
553 
554 	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
555 	    DMU_OT_SA, 0, &dnid);
556 
557 	/*
558 	 * Add an entry to the parent directory and open this directory.
559 	 */
560 	if (!SLIST_EMPTY(&arg->dirs)) {
561 		fs_populate_dirent(arg, cur, dnid);
562 		dirfd = fs_open(cur, arg, O_DIRECTORY | O_RDONLY);
563 	} else {
564 		arg->rootdirid = dnid;
565 		dirfd = arg->rootdirfd;
566 		arg->rootdirfd = -1;
567 	}
568 
569 	/*
570 	 * Set ZPL attributes.
571 	 */
572 	fs_populate_sattrs(arg, cur, dnode);
573 
574 	/*
575 	 * If this is a root directory, then its children belong to a different
576 	 * dataset and this directory remains empty in the current objset.
577 	 */
578 	if ((cur->inode->flags & FI_ROOT) == 0) {
579 		struct fs_populate_dir *dir;
580 
581 		dir = ecalloc(1, sizeof(*dir));
582 		dir->dirfd = dirfd;
583 		dir->objid = dnid;
584 		dir->zap = zap_alloc(os, dnode);
585 		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
586 	} else {
587 		zap_write(arg->zfs, zap_alloc(os, dnode));
588 		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
589 	}
590 }
591 
592 static void
593 fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
594 {
595 	dnode_phys_t *dnode;
596 	uint64_t dnid;
597 
598 	assert(cur->type == S_IFLNK);
599 	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
600 
601 	dnode = objset_dnode_bonus_alloc(arg->fs->os,
602 	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
603 
604 	fs_populate_dirent(arg, cur, dnid);
605 
606 	fs_populate_sattrs(arg, cur, dnode);
607 }
608 
/*
 * fsnode_foreach() callback for the populate pass: create the on-disk object
 * for "cur" and, once the last node of a subtree has been handled, write out
 * the directories that are now complete.
 *
 * "_arg" is the struct fs_populate_arg for the dataset being built.
 *
 * Returns 0 if "cur" is flagged FI_ROOT, so that the traversal does not
 * descend into a directory belonging to another dataset, and 1 otherwise.
 */
static int
fs_foreach_populate(fsnode *cur, void *_arg)
{
	struct fs_populate_arg *arg;
	struct fs_populate_dir *dir;
	int ret;

	arg = _arg;
	switch (cur->type) {
	case S_IFREG:
		fs_populate_file(cur, arg);
		break;
	case S_IFDIR:
		/* "." nodes get no object of their own here. */
		if (fsnode_isroot(cur))
			break;
		fs_populate_dir(cur, arg);
		break;
	case S_IFLNK:
		fs_populate_symlink(cur, arg);
		break;
	default:
		assert(0);
	}

	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;

	if (cur->next == NULL &&
	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
		/*
		 * We reached a terminal node in a subtree.  Walk back up and
		 * write out directories.  We're done once we hit the root of a
		 * dataset or find a level where we're not on the edge of the
		 * tree.
		 */
		do {
			/* Pop the finished directory and write its ZAP. */
			dir = SLIST_FIRST(&arg->dirs);
			SLIST_REMOVE_HEAD(&arg->dirs, next);
			zap_write(arg->zfs, dir->zap);
			if (dir->dirfd != -1)
				eclose(dir->dirfd);
			free(dir);
			cur = cur->parent;
		} while (cur != NULL && cur->next == NULL &&
		    (cur->inode->flags & FI_ROOT) == 0);
	}

	return (ret);
}
657 
658 static void
659 fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
660     const sa_attr_type_t layout[], size_t sacnt)
661 {
662 	char ti[16];
663 
664 	assert(sizeof(layout[0]) == 2);
665 
666 	snprintf(ti, sizeof(ti), "%u", index);
667 	zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
668 	    (const uint8_t *)layout);
669 }
670 
671 /*
672  * Initialize system attribute tables.
673  *
674  * There are two elements to this.  First, we write the zpl_attrs[] and
675  * zpl_attr_layout[] tables to disk.  Then we create a lookup table which
676  * allows us to set file attributes quickly.
677  */
678 static uint64_t
679 fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
680 {
681 	zfs_zap_t *sazap, *salzap, *sarzap;
682 	zfs_objset_t *os;
683 	dnode_phys_t *saobj, *salobj, *sarobj;
684 	uint64_t saobjid, salobjid, sarobjid;
685 	uint16_t offset;
686 
687 	os = fs->os;
688 
689 	/*
690 	 * The on-disk tables are stored in two ZAP objects, the registry object
691 	 * and the layout object.  Individual attributes are described by
692 	 * entries in the registry object; for example, the value for the
693 	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
694 	 * The attributes of a file are ordered according to one of the layouts
695 	 * defined in the layout object.  The master node object is simply used
696 	 * to locate the registry and layout objects.
697 	 */
698 	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
699 	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
700 	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
701 
702 	sarzap = zap_alloc(os, sarobj);
703 	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
704 		const zfs_sattr_t *sa;
705 		uint64_t attr;
706 
707 		attr = 0;
708 		sa = &zpl_attrs[i];
709 		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
710 		zap_add_uint64(sarzap, sa->name, attr);
711 	}
712 	zap_write(zfs, sarzap);
713 
714 	/*
715 	 * Layouts are arrays of indices into the registry.  We define two
716 	 * layouts for use by the ZPL, one for non-symlinks and one for
717 	 * symlinks.  They are identical except that the symlink layout includes
718 	 * ZPL_SYMLINK as its final attribute.
719 	 */
720 	salzap = zap_alloc(os, salobj);
721 	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
722 	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
723 	    zpl_attr_layout, nitems(zpl_attr_layout) - 1);
724 	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
725 	    zpl_attr_layout, nitems(zpl_attr_layout));
726 	zap_write(zfs, salzap);
727 
728 	sazap = zap_alloc(os, saobj);
729 	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
730 	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
731 	zap_write(zfs, sazap);
732 
733 	/* Sanity check. */
734 	for (size_t i = 0; i < nitems(zpl_attrs); i++)
735 		assert(i == zpl_attrs[i].id);
736 
737 	/*
738 	 * Build the offset table used when setting file attributes.  File
739 	 * attributes are stored in the object's bonus buffer; this table
740 	 * provides the buffer offset of attributes referenced by the layout
741 	 * table.
742 	 */
743 	fs->sacnt = nitems(zpl_attrs);
744 	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
745 	for (size_t i = 0; i < fs->sacnt; i++)
746 		fs->saoffs[i] = 0xffff;
747 	offset = 0;
748 	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
749 		uint16_t size;
750 
751 		assert(zpl_attr_layout[i] < fs->sacnt);
752 
753 		fs->saoffs[zpl_attr_layout[i]] = offset;
754 		size = zpl_attrs[zpl_attr_layout[i]].size;
755 		offset += size;
756 	}
757 	fs->satab = zpl_attrs;
758 
759 	return (saobjid);
760 }
761 
/*
 * dsl_dir_foreach() callback: locate the staged directory that corresponds to
 * the mountpoint of the dataset in "dsldir" and record the dataset in that
 * directory's inode.  "arg" is the root fsnode of the staged tree.
 *
 * DSL directories without a dataset, with canmount set to 0, or without a
 * mountpoint are skipped.  If the dataset matches the requested bootfs, the
 * "bootfs" pool property is set as a side effect.
 */
static void
fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
{
	char *mountpoint, *origmountpoint, *name, *next;
	fsnode *cur, *root;
	uint64_t canmount;

	if (!dsl_dir_has_dataset(dsldir))
		return;

	if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
		return;
	mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
	if (mountpoint == NULL)
		return;

	/*
	 * If we were asked to specify a bootfs, set it here.
	 */
	if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
	    dsl_dir_fullname(dsldir)) == 0) {
		zap_add_uint64(zfs->poolprops, "bootfs",
		    dsl_dir_dataset_id(dsldir));
	}

	/* "mountpoint" is advanced below; keep the original for free(). */
	origmountpoint = mountpoint;

	/*
	 * Figure out which fsnode corresponds to our mountpoint.
	 */
	root = arg;
	cur = root;
	if (strcmp(mountpoint, zfs->rootpath) != 0) {
		/* NOTE(review): assumes mountpoint begins with rootpath. */
		mountpoint += strlen(zfs->rootpath);

		/*
		 * Look up the directory in the staged tree.  For example, if
		 * the dataset's mount point is /foo/bar/baz, we'll search the
		 * root directory for "foo", search "foo" for "bar", and so on.
		 * Each intermediate name must refer to a directory; the final
		 * component need not exist.
		 */
		cur = root;
		for (next = name = mountpoint; next != NULL;) {
			/* Skip any run of path separators. */
			for (; *next == '/'; next++)
				;
			name = strsep(&next, "/");

			/* Search the current directory level for "name". */
			for (; cur != NULL && strcmp(cur->name, name) != 0;
			    cur = cur->next)
				;
			if (cur == NULL) {
				/* Only the final component may be missing. */
				if (next == NULL)
					break;
				errx(1, "missing mountpoint directory for `%s'",
				    dsl_dir_fullname(dsldir));
			}
			if (cur->type != S_IFDIR) {
				errx(1,
				    "mountpoint for `%s' is not a directory",
				    dsl_dir_fullname(dsldir));
			}
			if (next != NULL)
				cur = cur->child;
		}
	}

	if (cur != NULL) {
		assert(cur->type == S_IFDIR);

		/*
		 * Multiple datasets shouldn't share a mountpoint.  It's
		 * technically allowed, but it's not clear what makefs should do
		 * in that case.
		 */
		assert((cur->inode->flags & FI_ROOT) == 0);
		/* The root of the staged tree itself is never flagged. */
		if (cur != root)
			cur->inode->flags |= FI_ROOT;
		assert(cur->inode->param == NULL);
		cur->inode->param = dsldir;
	}

	free(origmountpoint);
}
846 
847 static int
848 fs_foreach_mark(fsnode *cur, void *arg)
849 {
850 	uint64_t *countp;
851 
852 	countp = arg;
853 	if (cur->type == S_IFDIR && fsnode_isroot(cur))
854 		return (1);
855 
856 	if (cur->inode->ino == 0) {
857 		cur->inode->ino = ++(*countp);
858 		cur->inode->nlink = 1;
859 	} else {
860 		cur->inode->nlink++;
861 	}
862 
863 	return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
864 }
865 
866 /*
867  * Create a filesystem dataset.  More specifically:
868  * - create an object set for the dataset,
869  * - add required metadata (SA tables, property definitions, etc.) to that
870  *   object set,
871  * - optionally populate the object set with file objects, using "root" as the
872  *   root directory.
873  *
874  * "dirfd" is a directory descriptor for the directory referenced by "root".  It
875  * is closed before returning.
876  */
877 static void
878 fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
879 {
880 	struct fs_populate_arg arg;
881 	zfs_fs_t fs;
882 	zfs_zap_t *masterzap;
883 	zfs_objset_t *os;
884 	dnode_phys_t *deleteq, *masterobj;
885 	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
886 	bool fakedroot;
887 
888 	/*
889 	 * This dataset's mountpoint doesn't exist in the staging tree, or the
890 	 * dataset doesn't have a mountpoint at all.  In either case we still
891 	 * need a root directory.  Fake up a root fsnode to handle this case.
892 	 */
893 	fakedroot = root == NULL;
894 	if (fakedroot) {
895 		struct stat *stp;
896 
897 		assert(dirfd == -1);
898 
899 		root = ecalloc(1, sizeof(*root));
900 		root->inode = ecalloc(1, sizeof(*root->inode));
901 		root->name = estrdup(".");
902 		root->type = S_IFDIR;
903 
904 		stp = &root->inode->st;
905 		stp->st_uid = 0;
906 		stp->st_gid = 0;
907 		stp->st_mode = S_IFDIR | 0755;
908 	}
909 	assert(root->type == S_IFDIR);
910 	assert(fsnode_isroot(root));
911 
912 	/*
913 	 * Initialize the object set for this dataset.
914 	 */
915 	os = objset_alloc(zfs, DMU_OST_ZFS);
916 	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
917 	assert(moid == MASTER_NODE_OBJ);
918 
919 	memset(&fs, 0, sizeof(fs));
920 	fs.os = os;
921 
922 	/*
923 	 * Create the ZAP SA layout now since filesystem object dnodes will
924 	 * refer to those attributes.
925 	 */
926 	saobjid = fs_set_zpl_attrs(zfs, &fs);
927 
928 	/*
929 	 * Make a pass over the staged directory to detect hard links and assign
930 	 * virtual dnode numbers.
931 	 */
932 	dnodecount = 1; /* root directory */
933 	fsnode_foreach(root, fs_foreach_mark, &dnodecount);
934 
935 	/*
936 	 * Make a second pass to populate the dataset with files from the
937 	 * staged directory.  Most of our runtime is spent here.
938 	 */
939 	arg.rootdirfd = dirfd;
940 	arg.zfs = zfs;
941 	arg.fs = &fs;
942 	SLIST_INIT(&arg.dirs);
943 	fs_populate_dir(root, &arg);
944 	assert(!SLIST_EMPTY(&arg.dirs));
945 	fsnode_foreach(root, fs_foreach_populate, &arg);
946 	assert(SLIST_EMPTY(&arg.dirs));
947 	rootdirid = arg.rootdirid;
948 
949 	/*
950 	 * Create an empty delete queue.  We don't do anything with it, but
951 	 * OpenZFS will refuse to mount filesystems that don't have one.
952 	 */
953 	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
954 	zap_write(zfs, zap_alloc(os, deleteq));
955 
956 	/*
957 	 * Populate and write the master node object.  This is a ZAP object
958 	 * containing various dataset properties and the object IDs of the root
959 	 * directory and delete queue.
960 	 */
961 	masterzap = zap_alloc(os, masterobj);
962 	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
963 	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
964 	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
965 	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
966 	zap_add_uint64(masterzap, "normalization", 0 /* off */);
967 	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
968 	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
969 	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
970 	zap_write(zfs, masterzap);
971 
972 	/*
973 	 * All finished with this object set, we may as well write it now.
974 	 * The DSL layer will sum up the bytes consumed by each dataset using
975 	 * information stored in the object set, so it can't be freed just yet.
976 	 */
977 	dsl_dir_dataset_write(zfs, os, dsldir);
978 
979 	if (fakedroot) {
980 		free(root->inode);
981 		free(root->name);
982 		free(root);
983 	}
984 	free(fs.saoffs);
985 }
986 
987 /*
988  * Create an object set for each DSL directory which has a dataset and doesn't
989  * already have an object set.
990  */
991 static void
992 fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
993 {
994 	if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
995 		fs_build_one(zfs, dsldir, NULL, -1);
996 }
997 
998 /*
999  * Create our datasets and populate them with files.
1000  */
1001 void
1002 fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
1003 {
1004 	/*
1005 	 * Run through our datasets and find the root fsnode for each one.  Each
1006 	 * root fsnode is flagged so that we can figure out which dataset it
1007 	 * belongs to.
1008 	 */
1009 	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);
1010 
1011 	/*
1012 	 * Did we find our boot filesystem?
1013 	 */
1014 	if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
1015 		errx(1, "no mounted dataset matches bootfs property `%s'",
1016 		    zfs->bootfs);
1017 
1018 	/*
1019 	 * Traverse the file hierarchy starting from the root fsnode.  One
1020 	 * dataset, not necessarily the root dataset, must "own" the root
1021 	 * directory by having its mountpoint be equal to the root path.
1022 	 *
1023 	 * As roots of other datasets are encountered during the traversal,
1024 	 * fs_build_one() recursively creates the corresponding object sets and
1025 	 * populates them.  Once this function has returned, all datasets will
1026 	 * have been fully populated.
1027 	 */
1028 	fs_build_one(zfs, root->inode->param, root, dirfd);
1029 
1030 	/*
1031 	 * Now create object sets for datasets whose mountpoints weren't found
1032 	 * in the staging directory, either because there is no mountpoint, or
1033 	 * because the mountpoint doesn't correspond to an existing directory.
1034 	 */
1035 	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
1036 }
1037