xref: /freebsd/usr.sbin/makefs/zfs/fs.c (revision 2008043f386721d58158e37e0d7e50df8095942d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 The FreeBSD Foundation
5  *
6  * This software was developed by Mark Johnston under sponsorship from
7  * the FreeBSD Foundation.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions are
11  * met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/stat.h>
32 
33 #include <assert.h>
34 #include <dirent.h>
35 #include <fcntl.h>
36 #include <stdlib.h>
37 #include <string.h>
38 #include <unistd.h>
39 
40 #include <util.h>
41 
42 #include "makefs.h"
43 #include "zfs.h"
44 
45 typedef struct {
46 	const char	*name;
47 	unsigned int	id;
48 	uint16_t	size;
49 	sa_bswap_type_t	bs;
50 } zfs_sattr_t;
51 
52 typedef struct zfs_fs {
53 	zfs_objset_t	*os;
54 
55 	/* Offset table for system attributes, indexed by a zpl_attr_t. */
56 	uint16_t	*saoffs;
57 	size_t		sacnt;
58 	const zfs_sattr_t *satab;
59 } zfs_fs_t;
60 
/*
 * ZPL system attribute numbers.  The ordering carries no meaning of its own;
 * it is simply the one hard-coded by OpenZFS, taken from a zdb dump of the
 * SA_REGISTRY table.  The values are spelled out because they double as
 * indices into zpl_attrs[].
 */
typedef enum zpl_attr {
	ZPL_ATIME	= 0,
	ZPL_MTIME	= 1,
	ZPL_CTIME	= 2,
	ZPL_CRTIME	= 3,
	ZPL_GEN		= 4,
	ZPL_MODE	= 5,
	ZPL_SIZE	= 6,
	ZPL_PARENT	= 7,
	ZPL_LINKS	= 8,
	ZPL_XATTR	= 9,
	ZPL_RDEV	= 10,
	ZPL_FLAGS	= 11,
	ZPL_UID		= 12,
	ZPL_GID		= 13,
	ZPL_PAD		= 14,
	ZPL_ZNODE_ACL	= 15,
	ZPL_DACL_COUNT	= 16,
	ZPL_SYMLINK	= 17,
	ZPL_SCANSTAMP	= 18,
	ZPL_DACL_ACES	= 19,
	ZPL_DXATTR	= 20,
	ZPL_PROJID	= 21,
} zpl_attr_t;
89 
90 /*
91  * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t.
92  */
93 static const zfs_sattr_t zpl_attrs[] = {
94 #define	_ZPL_ATTR(n, s, b)	{ .name = #n, .id = n, .size = s, .bs = b }
95 	_ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
96 	_ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
97 	_ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
98 	_ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY),
99 	_ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY),
100 	_ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY),
101 	_ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY),
102 	_ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY),
103 	_ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY),
104 	_ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY),
105 	_ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY),
106 	_ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY),
107 	_ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY),
108 	_ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY),
109 	_ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY),
110 	_ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY),
111 	_ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY),
112 	_ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY),
113 	_ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY),
114 	_ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL),
115 	_ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY),
116 	_ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY),
117 #undef ZPL_ATTR
118 };
119 
120 /*
121  * This layout matches that of a filesystem created using OpenZFS on FreeBSD.
122  * It need not match in general, but FreeBSD's loader doesn't bother parsing the
123  * layout and just hard-codes attribute offsets.
124  */
125 static const sa_attr_type_t zpl_attr_layout[] = {
126 	ZPL_MODE,
127 	ZPL_SIZE,
128 	ZPL_GEN,
129 	ZPL_UID,
130 	ZPL_GID,
131 	ZPL_PARENT,
132 	ZPL_FLAGS,
133 	ZPL_ATIME,
134 	ZPL_MTIME,
135 	ZPL_CTIME,
136 	ZPL_CRTIME,
137 	ZPL_LINKS,
138 	ZPL_DACL_COUNT,
139 	ZPL_DACL_ACES,
140 	ZPL_SYMLINK,
141 };
142 
/*
 * Keys for the ZPL attribute tables in the SA layout ZAP.  Indices 0 and 1
 * are reserved for legacy attribute encoding, so ours start at 2.
 */
enum {
	SA_LAYOUT_INDEX_DEFAULT	= 2,	/* layout without ZPL_SYMLINK */
	SA_LAYOUT_INDEX_SYMLINK	= 3,	/* layout ending with ZPL_SYMLINK */
};
149 
150 struct fs_populate_dir {
151 	SLIST_ENTRY(fs_populate_dir) next;
152 	int			dirfd;
153 	uint64_t		objid;
154 	zfs_zap_t		*zap;
155 };
156 
157 struct fs_populate_arg {
158 	zfs_opt_t	*zfs;
159 	zfs_fs_t	*fs;			/* owning filesystem */
160 	uint64_t	rootdirid;		/* root directory dnode ID */
161 	int		rootdirfd;		/* root directory fd */
162 	SLIST_HEAD(, fs_populate_dir) dirs;	/* stack of directories */
163 };
164 
165 static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int);
166 
/*
 * Close a file descriptor, exiting with an error message if close(2) fails.
 */
static void
eclose(int fd)
{
	int error;

	error = close(fd);
	if (error == 0)
		return;
	err(1, "close");
}
173 
174 static bool
175 fsnode_isroot(const fsnode *cur)
176 {
177 	return (strcmp(cur->name, ".") == 0);
178 }
179 
180 /*
181  * Visit each node in a directory hierarchy, in pre-order depth-first order.
182  */
183 static void
184 fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg)
185 {
186 	assert(root->type == S_IFDIR);
187 
188 	for (fsnode *cur = root; cur != NULL; cur = cur->next) {
189 		assert(cur->type == S_IFREG || cur->type == S_IFDIR ||
190 		    cur->type == S_IFLNK);
191 
192 		if (cb(cur, arg) == 0)
193 			continue;
194 		if (cur->type == S_IFDIR && cur->child != NULL)
195 			fsnode_foreach(cur->child, cb, arg);
196 	}
197 }
198 
199 static void
200 fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid)
201 {
202 	struct fs_populate_dir *dir;
203 	uint64_t type;
204 
205 	switch (cur->type) {
206 	case S_IFREG:
207 		type = DT_REG;
208 		break;
209 	case S_IFDIR:
210 		type = DT_DIR;
211 		break;
212 	case S_IFLNK:
213 		type = DT_LNK;
214 		break;
215 	default:
216 		assert(0);
217 	}
218 
219 	dir = SLIST_FIRST(&arg->dirs);
220 	zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid));
221 }
222 
223 static void
224 fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind,
225     size_t *szp)
226 {
227 	assert(ind < fs->sacnt);
228 	assert(fs->saoffs[ind] != 0xffff);
229 
230 	memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size);
231 	*szp += fs->satab[ind].size;
232 }
233 
234 static void
235 fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val,
236     size_t valsz, size_t varoff, uint16_t ind, size_t *szp)
237 {
238 	assert(ind < fs->sacnt);
239 	assert(fs->saoffs[ind] != 0xffff);
240 	assert(fs->satab[ind].size == 0);
241 
242 	memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz);
243 	*szp += valsz;
244 }
245 
246 /*
247  * Derive the relative fd/path combo needed to access a file.  Ideally we'd
248  * always be able to use relative lookups (i.e., use the *at() system calls),
249  * since they require less path translation and are more amenable to sandboxing,
250  * but the handling of multiple staging directories makes that difficult.  To
251  * make matters worse, we have no choice but to use relative lookups when
252  * dealing with an mtree manifest, so both mechanisms are implemented.
253  */
254 static void
255 fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg,
256     char *path, size_t sz, int *dirfdp)
257 {
258 	if (cur->contents != NULL) {
259 		size_t n;
260 
261 		*dirfdp = AT_FDCWD;
262 		n = strlcpy(path, cur->contents, sz);
263 		assert(n < sz);
264 	} else if (cur->root == NULL) {
265 		size_t n;
266 
267 		*dirfdp = SLIST_FIRST(&arg->dirs)->dirfd;
268 		n = strlcpy(path, cur->name, sz);
269 		assert(n < sz);
270 	} else {
271 		int n;
272 
273 		*dirfdp = AT_FDCWD;
274 		n = snprintf(path, sz, "%s/%s/%s",
275 		    cur->root, cur->path, cur->name);
276 		assert(n >= 0);
277 		assert((size_t)n < sz);
278 	}
279 }
280 
281 static int
282 fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags)
283 {
284 	char path[PATH_MAX];
285 	int fd;
286 
287 	fs_populate_path(cur, arg, path, sizeof(path), &fd);
288 
289 	fd = openat(fd, path, flags);
290 	if (fd < 0)
291 		err(1, "openat(%s)", path);
292 	return (fd);
293 }
294 
295 static int
296 fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags)
297 {
298 	int fd;
299 	char path[PATH_MAX];
300 
301 	fs_populate_path(cur, arg, path, sizeof(path), &fd);
302 
303 	return (openat(fd, path, flags));
304 }
305 
306 static void
307 fs_readlink(const fsnode *cur, struct fs_populate_arg *arg,
308     char *buf, size_t bufsz)
309 {
310 	char path[PATH_MAX];
311 	int fd;
312 
313 	if (cur->symlink != NULL) {
314 		size_t n;
315 
316 		n = strlcpy(buf, cur->symlink, bufsz);
317 		assert(n < bufsz);
318 	} else {
319 		ssize_t n;
320 
321 		fs_populate_path(cur, arg, path, sizeof(path), &fd);
322 
323 		n = readlinkat(fd, path, buf, bufsz - 1);
324 		if (n == -1)
325 			err(1, "readlinkat(%s)", cur->name);
326 		buf[n] = '\0';
327 	}
328 }
329 
330 static void
331 fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts,
332     uint16_t ind, size_t *szp)
333 {
334 	uint64_t timebuf[2];
335 
336 	assert(ind < fs->sacnt);
337 	assert(fs->saoffs[ind] != 0xffff);
338 	assert(fs->satab[ind].size == sizeof(timebuf));
339 
340 	timebuf[0] = ts->tv_sec;
341 	timebuf[1] = ts->tv_nsec;
342 	fs_populate_attr(fs, attrbuf, timebuf, ind, szp);
343 }
344 
345 static void
346 fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur,
347     dnode_phys_t *dnode)
348 {
349 	char target[PATH_MAX];
350 	zfs_fs_t *fs;
351 	zfs_ace_hdr_t aces[3];
352 	struct stat *sb;
353 	sa_hdr_phys_t *sahdr;
354 	uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid;
355 	char *attrbuf;
356 	size_t bonussz, hdrsz;
357 	int layout;
358 
359 	assert(dnode->dn_bonustype == DMU_OT_SA);
360 	assert(dnode->dn_nblkptr == 1);
361 
362 	fs = arg->fs;
363 	sb = &cur->inode->st;
364 
365 	switch (cur->type) {
366 	case S_IFREG:
367 		layout = SA_LAYOUT_INDEX_DEFAULT;
368 		links = cur->inode->nlink;
369 		objsize = sb->st_size;
370 		parent = SLIST_FIRST(&arg->dirs)->objid;
371 		break;
372 	case S_IFDIR:
373 		layout = SA_LAYOUT_INDEX_DEFAULT;
374 		links = 1; /* .. */
375 		objsize = 1; /* .. */
376 
377 		/*
378 		 * The size of a ZPL directory is the number of entries
379 		 * (including "." and ".."), and the link count is the number of
380 		 * entries which are directories (including "." and "..").
381 		 */
382 		for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child;
383 		    c != NULL; c = c->next) {
384 			if (c->type == S_IFDIR)
385 				links++;
386 			objsize++;
387 		}
388 
389 		/* The root directory is its own parent. */
390 		parent = SLIST_EMPTY(&arg->dirs) ?
391 		    arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid;
392 		break;
393 	case S_IFLNK:
394 		fs_readlink(cur, arg, target, sizeof(target));
395 
396 		layout = SA_LAYOUT_INDEX_SYMLINK;
397 		links = 1;
398 		objsize = strlen(target);
399 		parent = SLIST_FIRST(&arg->dirs)->objid;
400 		break;
401 	default:
402 		assert(0);
403 	}
404 
405 	daclcount = nitems(aces);
406 	flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED |
407 	    ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */
408 	gen = 1;
409 	gid = sb->st_gid;
410 	mode = sb->st_mode;
411 	uid = sb->st_uid;
412 
413 	memset(aces, 0, sizeof(aces));
414 	aces[0].z_flags = ACE_OWNER;
415 	aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
416 	aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER |
417 	    ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL |
418 	    ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
419 	if ((mode & S_IRUSR) != 0)
420 		aces[0].z_access_mask |= ACE_READ_DATA;
421 	if ((mode & S_IWUSR) != 0)
422 		aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
423 	if ((mode & S_IXUSR) != 0)
424 		aces[0].z_access_mask |= ACE_EXECUTE;
425 
426 	aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP;
427 	aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
428 	aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
429 	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
430 	if ((mode & S_IRGRP) != 0)
431 		aces[1].z_access_mask |= ACE_READ_DATA;
432 	if ((mode & S_IWGRP) != 0)
433 		aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
434 	if ((mode & S_IXGRP) != 0)
435 		aces[1].z_access_mask |= ACE_EXECUTE;
436 
437 	aces[2].z_flags = ACE_EVERYONE;
438 	aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE;
439 	aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES |
440 	    ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE;
441 	if ((mode & S_IROTH) != 0)
442 		aces[2].z_access_mask |= ACE_READ_DATA;
443 	if ((mode & S_IWOTH) != 0)
444 		aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA;
445 	if ((mode & S_IXOTH) != 0)
446 		aces[2].z_access_mask |= ACE_EXECUTE;
447 
448 	switch (layout) {
449 	case SA_LAYOUT_INDEX_DEFAULT:
450 		/* At most one variable-length attribute. */
451 		hdrsz = sizeof(uint64_t);
452 		break;
453 	case SA_LAYOUT_INDEX_SYMLINK:
454 		/* At most five variable-length attributes. */
455 		hdrsz = sizeof(uint64_t) * 2;
456 		break;
457 	default:
458 		assert(0);
459 	}
460 
461 	sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode);
462 	sahdr->sa_magic = SA_MAGIC;
463 	SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz);
464 
465 	bonussz = SA_HDR_SIZE(sahdr);
466 	attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr);
467 
468 	fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz);
469 	fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz);
470 	fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz);
471 	fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz);
472 	fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz);
473 	fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz);
474 	fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz);
475 	fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz);
476 	fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz);
477 
478 	/*
479 	 * We deliberately set atime = mtime here to ensure that images are
480 	 * reproducible.
481 	 */
482 	fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz);
483 	fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz);
484 	fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz);
485 #ifdef __linux__
486 	/* Linux has no st_birthtim; approximate with st_ctim */
487 	fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz);
488 #else
489 	fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz);
490 #endif
491 
492 	fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0,
493 	    ZPL_DACL_ACES, &bonussz);
494 	sahdr->sa_lengths[0] = sizeof(aces);
495 
496 	if (cur->type == S_IFLNK) {
497 		assert(layout == SA_LAYOUT_INDEX_SYMLINK);
498 		/* Need to use a spill block pointer if the target is long. */
499 		assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN);
500 		fs_populate_varszattr(fs, attrbuf, target, objsize,
501 		    sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz);
502 		sahdr->sa_lengths[1] = (uint16_t)objsize;
503 	}
504 
505 	dnode->dn_bonuslen = bonussz;
506 }
507 
508 static void
509 fs_populate_file(fsnode *cur, struct fs_populate_arg *arg)
510 {
511 	struct dnode_cursor *c;
512 	dnode_phys_t *dnode;
513 	zfs_opt_t *zfs;
514 	char *buf;
515 	uint64_t dnid;
516 	ssize_t n;
517 	size_t bufsz;
518 	off_t nbytes, reqbytes, size;
519 	int fd;
520 
521 	assert(cur->type == S_IFREG);
522 	assert((cur->inode->flags & FI_ROOT) == 0);
523 
524 	zfs = arg->zfs;
525 
526 	assert(cur->inode->ino != 0);
527 	if ((cur->inode->flags & FI_ALLOCATED) != 0) {
528 		/*
529 		 * This is a hard link of an existing file.
530 		 *
531 		 * XXX-MJ need to check whether it crosses datasets, add a test
532 		 * case for that
533 		 */
534 		fs_populate_dirent(arg, cur, cur->inode->ino);
535 		return;
536 	}
537 
538 	dnode = objset_dnode_bonus_alloc(arg->fs->os,
539 	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
540 	cur->inode->ino = dnid;
541 	cur->inode->flags |= FI_ALLOCATED;
542 
543 	fd = fs_open(cur, arg, O_RDONLY);
544 
545 	buf = zfs->filebuf;
546 	bufsz = sizeof(zfs->filebuf);
547 	size = cur->inode->st.st_size;
548 	c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0);
549 	for (off_t foff = 0; foff < size; foff += nbytes) {
550 		off_t loc, sofar;
551 
552 		/*
553 		 * Fill up our buffer, handling partial reads.
554 		 */
555 		sofar = 0;
556 		nbytes = MIN(size - foff, (off_t)bufsz);
557 		do {
558 			n = read(fd, buf + sofar, nbytes);
559 			if (n < 0)
560 				err(1, "reading from '%s'", cur->name);
561 			if (n == 0)
562 				errx(1, "unexpected EOF reading '%s'",
563 				    cur->name);
564 			sofar += n;
565 		} while (sofar < nbytes);
566 
567 		if (nbytes < (off_t)bufsz)
568 			memset(buf + nbytes, 0, bufsz - nbytes);
569 
570 		reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE;
571 		loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes);
572 		vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc,
573 		    dnode_cursor_next(zfs, c, foff));
574 	}
575 	eclose(fd);
576 	dnode_cursor_finish(zfs, c);
577 
578 	fs_populate_sattrs(arg, cur, dnode);
579 	fs_populate_dirent(arg, cur, dnid);
580 }
581 
582 static void
583 fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg)
584 {
585 	dnode_phys_t *dnode;
586 	zfs_objset_t *os;
587 	uint64_t dnid;
588 	int dirfd;
589 
590 	assert(cur->type == S_IFDIR);
591 	assert((cur->inode->flags & FI_ALLOCATED) == 0);
592 
593 	os = arg->fs->os;
594 
595 	dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS,
596 	    DMU_OT_SA, 0, &dnid);
597 
598 	/*
599 	 * Add an entry to the parent directory and open this directory.
600 	 */
601 	if (!SLIST_EMPTY(&arg->dirs)) {
602 		fs_populate_dirent(arg, cur, dnid);
603 		/*
604 		 * We only need the directory fd if we're finding files in
605 		 * it.  If it's just there for other directories or
606 		 * files using contents= we don't need to succeed here.
607 		 */
608 		dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY);
609 	} else {
610 		arg->rootdirid = dnid;
611 		dirfd = arg->rootdirfd;
612 		arg->rootdirfd = -1;
613 	}
614 
615 	/*
616 	 * Set ZPL attributes.
617 	 */
618 	fs_populate_sattrs(arg, cur, dnode);
619 
620 	/*
621 	 * If this is a root directory, then its children belong to a different
622 	 * dataset and this directory remains empty in the current objset.
623 	 */
624 	if ((cur->inode->flags & FI_ROOT) == 0) {
625 		struct fs_populate_dir *dir;
626 
627 		dir = ecalloc(1, sizeof(*dir));
628 		dir->dirfd = dirfd;
629 		dir->objid = dnid;
630 		dir->zap = zap_alloc(os, dnode);
631 		SLIST_INSERT_HEAD(&arg->dirs, dir, next);
632 	} else {
633 		zap_write(arg->zfs, zap_alloc(os, dnode));
634 		fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd);
635 	}
636 }
637 
638 static void
639 fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg)
640 {
641 	dnode_phys_t *dnode;
642 	uint64_t dnid;
643 
644 	assert(cur->type == S_IFLNK);
645 	assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0);
646 
647 	dnode = objset_dnode_bonus_alloc(arg->fs->os,
648 	    DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid);
649 
650 	fs_populate_dirent(arg, cur, dnid);
651 
652 	fs_populate_sattrs(arg, cur, dnode);
653 }
654 
655 static int
656 fs_foreach_populate(fsnode *cur, void *_arg)
657 {
658 	struct fs_populate_arg *arg;
659 	struct fs_populate_dir *dir;
660 	int ret;
661 
662 	arg = _arg;
663 	switch (cur->type) {
664 	case S_IFREG:
665 		fs_populate_file(cur, arg);
666 		break;
667 	case S_IFDIR:
668 		if (fsnode_isroot(cur))
669 			break;
670 		fs_populate_dir(cur, arg);
671 		break;
672 	case S_IFLNK:
673 		fs_populate_symlink(cur, arg);
674 		break;
675 	default:
676 		assert(0);
677 	}
678 
679 	ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1;
680 
681 	if (cur->next == NULL &&
682 	    (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) {
683 		/*
684 		 * We reached a terminal node in a subtree.  Walk back up and
685 		 * write out directories.  We're done once we hit the root of a
686 		 * dataset or find a level where we're not on the edge of the
687 		 * tree.
688 		 */
689 		do {
690 			dir = SLIST_FIRST(&arg->dirs);
691 			SLIST_REMOVE_HEAD(&arg->dirs, next);
692 			zap_write(arg->zfs, dir->zap);
693 			if (dir->dirfd != -1)
694 				eclose(dir->dirfd);
695 			free(dir);
696 			cur = cur->parent;
697 		} while (cur != NULL && cur->next == NULL &&
698 		    (cur->inode->flags & FI_ROOT) == 0);
699 	}
700 
701 	return (ret);
702 }
703 
704 static void
705 fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index,
706     const sa_attr_type_t layout[], size_t sacnt)
707 {
708 	char ti[16];
709 
710 	assert(sizeof(layout[0]) == 2);
711 
712 	snprintf(ti, sizeof(ti), "%u", index);
713 	zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt,
714 	    (const uint8_t *)layout);
715 }
716 
717 /*
718  * Initialize system attribute tables.
719  *
720  * There are two elements to this.  First, we write the zpl_attrs[] and
721  * zpl_attr_layout[] tables to disk.  Then we create a lookup table which
722  * allows us to set file attributes quickly.
723  */
724 static uint64_t
725 fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs)
726 {
727 	zfs_zap_t *sazap, *salzap, *sarzap;
728 	zfs_objset_t *os;
729 	dnode_phys_t *saobj, *salobj, *sarobj;
730 	uint64_t saobjid, salobjid, sarobjid;
731 	uint16_t offset;
732 
733 	os = fs->os;
734 
735 	/*
736 	 * The on-disk tables are stored in two ZAP objects, the registry object
737 	 * and the layout object.  Individual attributes are described by
738 	 * entries in the registry object; for example, the value for the
739 	 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute.
740 	 * The attributes of a file are ordered according to one of the layouts
741 	 * defined in the layout object.  The master node object is simply used
742 	 * to locate the registry and layout objects.
743 	 */
744 	saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid);
745 	salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid);
746 	sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid);
747 
748 	sarzap = zap_alloc(os, sarobj);
749 	for (size_t i = 0; i < nitems(zpl_attrs); i++) {
750 		const zfs_sattr_t *sa;
751 		uint64_t attr;
752 
753 		attr = 0;
754 		sa = &zpl_attrs[i];
755 		SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs);
756 		zap_add_uint64(sarzap, sa->name, attr);
757 	}
758 	zap_write(zfs, sarzap);
759 
760 	/*
761 	 * Layouts are arrays of indices into the registry.  We define two
762 	 * layouts for use by the ZPL, one for non-symlinks and one for
763 	 * symlinks.  They are identical except that the symlink layout includes
764 	 * ZPL_SYMLINK as its final attribute.
765 	 */
766 	salzap = zap_alloc(os, salobj);
767 	assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK);
768 	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT,
769 	    zpl_attr_layout, nitems(zpl_attr_layout) - 1);
770 	fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK,
771 	    zpl_attr_layout, nitems(zpl_attr_layout));
772 	zap_write(zfs, salzap);
773 
774 	sazap = zap_alloc(os, saobj);
775 	zap_add_uint64(sazap, SA_LAYOUTS, salobjid);
776 	zap_add_uint64(sazap, SA_REGISTRY, sarobjid);
777 	zap_write(zfs, sazap);
778 
779 	/* Sanity check. */
780 	for (size_t i = 0; i < nitems(zpl_attrs); i++)
781 		assert(i == zpl_attrs[i].id);
782 
783 	/*
784 	 * Build the offset table used when setting file attributes.  File
785 	 * attributes are stored in the object's bonus buffer; this table
786 	 * provides the buffer offset of attributes referenced by the layout
787 	 * table.
788 	 */
789 	fs->sacnt = nitems(zpl_attrs);
790 	fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs));
791 	for (size_t i = 0; i < fs->sacnt; i++)
792 		fs->saoffs[i] = 0xffff;
793 	offset = 0;
794 	for (size_t i = 0; i < nitems(zpl_attr_layout); i++) {
795 		uint16_t size;
796 
797 		assert(zpl_attr_layout[i] < fs->sacnt);
798 
799 		fs->saoffs[zpl_attr_layout[i]] = offset;
800 		size = zpl_attrs[zpl_attr_layout[i]].size;
801 		offset += size;
802 	}
803 	fs->satab = zpl_attrs;
804 
805 	return (saobjid);
806 }
807 
808 static void
809 fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg)
810 {
811 	char *mountpoint, *origmountpoint, *name, *next;
812 	fsnode *cur, *root;
813 	uint64_t canmount;
814 
815 	if (!dsl_dir_has_dataset(dsldir))
816 		return;
817 
818 	if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0)
819 		return;
820 	mountpoint = dsl_dir_get_mountpoint(zfs, dsldir);
821 	if (mountpoint == NULL)
822 		return;
823 
824 	/*
825 	 * If we were asked to specify a bootfs, set it here.
826 	 */
827 	if (zfs->bootfs != NULL && strcmp(zfs->bootfs,
828 	    dsl_dir_fullname(dsldir)) == 0) {
829 		zap_add_uint64(zfs->poolprops, "bootfs",
830 		    dsl_dir_dataset_id(dsldir));
831 	}
832 
833 	origmountpoint = mountpoint;
834 
835 	/*
836 	 * Figure out which fsnode corresponds to our mountpoint.
837 	 */
838 	root = arg;
839 	cur = root;
840 	if (strcmp(mountpoint, zfs->rootpath) != 0) {
841 		mountpoint += strlen(zfs->rootpath);
842 
843 		/*
844 		 * Look up the directory in the staged tree.  For example, if
845 		 * the dataset's mount point is /foo/bar/baz, we'll search the
846 		 * root directory for "foo", search "foo" for "baz", and so on.
847 		 * Each intermediate name must refer to a directory; the final
848 		 * component need not exist.
849 		 */
850 		cur = root;
851 		for (next = name = mountpoint; next != NULL;) {
852 			for (; *next == '/'; next++)
853 				;
854 			name = strsep(&next, "/");
855 
856 			for (; cur != NULL && strcmp(cur->name, name) != 0;
857 			    cur = cur->next)
858 				;
859 			if (cur == NULL) {
860 				if (next == NULL)
861 					break;
862 				errx(1, "missing mountpoint directory for `%s'",
863 				    dsl_dir_fullname(dsldir));
864 			}
865 			if (cur->type != S_IFDIR) {
866 				errx(1,
867 				    "mountpoint for `%s' is not a directory",
868 				    dsl_dir_fullname(dsldir));
869 			}
870 			if (next != NULL)
871 				cur = cur->child;
872 		}
873 	}
874 
875 	if (cur != NULL) {
876 		assert(cur->type == S_IFDIR);
877 
878 		/*
879 		 * Multiple datasets shouldn't share a mountpoint.  It's
880 		 * technically allowed, but it's not clear what makefs should do
881 		 * in that case.
882 		 */
883 		assert((cur->inode->flags & FI_ROOT) == 0);
884 		if (cur != root)
885 			cur->inode->flags |= FI_ROOT;
886 		assert(cur->inode->param == NULL);
887 		cur->inode->param = dsldir;
888 	}
889 
890 	free(origmountpoint);
891 }
892 
893 static int
894 fs_foreach_mark(fsnode *cur, void *arg)
895 {
896 	uint64_t *countp;
897 
898 	countp = arg;
899 	if (cur->type == S_IFDIR && fsnode_isroot(cur))
900 		return (1);
901 
902 	if (cur->inode->ino == 0) {
903 		cur->inode->ino = ++(*countp);
904 		cur->inode->nlink = 1;
905 	} else {
906 		cur->inode->nlink++;
907 	}
908 
909 	return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1);
910 }
911 
912 /*
913  * Create a filesystem dataset.  More specifically:
914  * - create an object set for the dataset,
915  * - add required metadata (SA tables, property definitions, etc.) to that
916  *   object set,
917  * - optionally populate the object set with file objects, using "root" as the
918  *   root directory.
919  *
920  * "dirfd" is a directory descriptor for the directory referenced by "root".  It
921  * is closed before returning.
922  */
923 static void
924 fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd)
925 {
926 	struct fs_populate_arg arg;
927 	zfs_fs_t fs;
928 	zfs_zap_t *masterzap;
929 	zfs_objset_t *os;
930 	dnode_phys_t *deleteq, *masterobj;
931 	uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid;
932 	bool fakedroot;
933 
934 	/*
935 	 * This dataset's mountpoint doesn't exist in the staging tree, or the
936 	 * dataset doesn't have a mountpoint at all.  In either case we still
937 	 * need a root directory.  Fake up a root fsnode to handle this case.
938 	 */
939 	fakedroot = root == NULL;
940 	if (fakedroot) {
941 		struct stat *stp;
942 
943 		assert(dirfd == -1);
944 
945 		root = ecalloc(1, sizeof(*root));
946 		root->inode = ecalloc(1, sizeof(*root->inode));
947 		root->name = estrdup(".");
948 		root->type = S_IFDIR;
949 
950 		stp = &root->inode->st;
951 		stp->st_uid = 0;
952 		stp->st_gid = 0;
953 		stp->st_mode = S_IFDIR | 0755;
954 	}
955 	assert(root->type == S_IFDIR);
956 	assert(fsnode_isroot(root));
957 
958 	/*
959 	 * Initialize the object set for this dataset.
960 	 */
961 	os = objset_alloc(zfs, DMU_OST_ZFS);
962 	masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid);
963 	assert(moid == MASTER_NODE_OBJ);
964 
965 	memset(&fs, 0, sizeof(fs));
966 	fs.os = os;
967 
968 	/*
969 	 * Create the ZAP SA layout now since filesystem object dnodes will
970 	 * refer to those attributes.
971 	 */
972 	saobjid = fs_set_zpl_attrs(zfs, &fs);
973 
974 	/*
975 	 * Make a pass over the staged directory to detect hard links and assign
976 	 * virtual dnode numbers.
977 	 */
978 	dnodecount = 1; /* root directory */
979 	fsnode_foreach(root, fs_foreach_mark, &dnodecount);
980 
981 	/*
982 	 * Make a second pass to populate the dataset with files from the
983 	 * staged directory.  Most of our runtime is spent here.
984 	 */
985 	arg.rootdirfd = dirfd;
986 	arg.zfs = zfs;
987 	arg.fs = &fs;
988 	SLIST_INIT(&arg.dirs);
989 	fs_populate_dir(root, &arg);
990 	assert(!SLIST_EMPTY(&arg.dirs));
991 	fsnode_foreach(root, fs_foreach_populate, &arg);
992 	assert(SLIST_EMPTY(&arg.dirs));
993 	rootdirid = arg.rootdirid;
994 
995 	/*
996 	 * Create an empty delete queue.  We don't do anything with it, but
997 	 * OpenZFS will refuse to mount filesystems that don't have one.
998 	 */
999 	deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid);
1000 	zap_write(zfs, zap_alloc(os, deleteq));
1001 
1002 	/*
1003 	 * Populate and write the master node object.  This is a ZAP object
1004 	 * containing various dataset properties and the object IDs of the root
1005 	 * directory and delete queue.
1006 	 */
1007 	masterzap = zap_alloc(os, masterobj);
1008 	zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid);
1009 	zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid);
1010 	zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid);
1011 	zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */);
1012 	zap_add_uint64(masterzap, "normalization", 0 /* off */);
1013 	zap_add_uint64(masterzap, "utf8only", 0 /* off */);
1014 	zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */);
1015 	zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */);
1016 	zap_write(zfs, masterzap);
1017 
1018 	/*
1019 	 * All finished with this object set, we may as well write it now.
1020 	 * The DSL layer will sum up the bytes consumed by each dataset using
1021 	 * information stored in the object set, so it can't be freed just yet.
1022 	 */
1023 	dsl_dir_dataset_write(zfs, os, dsldir);
1024 
1025 	if (fakedroot) {
1026 		free(root->inode);
1027 		free(root->name);
1028 		free(root);
1029 	}
1030 	free(fs.saoffs);
1031 }
1032 
1033 /*
1034  * Create an object set for each DSL directory which has a dataset and doesn't
1035  * already have an object set.
1036  */
1037 static void
1038 fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused)
1039 {
1040 	if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir))
1041 		fs_build_one(zfs, dsldir, NULL, -1);
1042 }
1043 
1044 /*
1045  * Create our datasets and populate them with files.
1046  */
1047 void
1048 fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root)
1049 {
1050 	/*
1051 	 * Run through our datasets and find the root fsnode for each one.  Each
1052 	 * root fsnode is flagged so that we can figure out which dataset it
1053 	 * belongs to.
1054 	 */
1055 	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root);
1056 
1057 	/*
1058 	 * Did we find our boot filesystem?
1059 	 */
1060 	if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs"))
1061 		errx(1, "no mounted dataset matches bootfs property `%s'",
1062 		    zfs->bootfs);
1063 
1064 	/*
1065 	 * Traverse the file hierarchy starting from the root fsnode.  One
1066 	 * dataset, not necessarily the root dataset, must "own" the root
1067 	 * directory by having its mountpoint be equal to the root path.
1068 	 *
1069 	 * As roots of other datasets are encountered during the traversal,
1070 	 * fs_build_one() recursively creates the corresponding object sets and
1071 	 * populates them.  Once this function has returned, all datasets will
1072 	 * have been fully populated.
1073 	 */
1074 	fs_build_one(zfs, root->inode->param, root, dirfd);
1075 
1076 	/*
1077 	 * Now create object sets for datasets whose mountpoints weren't found
1078 	 * in the staging directory, either because there is no mountpoint, or
1079 	 * because the mountpoint doesn't correspond to an existing directory.
1080 	 */
1081 	dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL);
1082 }
1083