1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/stat.h> 32 33 #include <assert.h> 34 #include <dirent.h> 35 #include <fcntl.h> 36 #include <stdlib.h> 37 #include <string.h> 38 #include <unistd.h> 39 40 #include <util.h> 41 42 #include "makefs.h" 43 #include "zfs.h" 44 45 typedef struct { 46 const char *name; 47 unsigned int id; 48 uint16_t size; 49 sa_bswap_type_t bs; 50 } zfs_sattr_t; 51 52 typedef struct zfs_fs { 53 zfs_objset_t *os; 54 55 /* Offset table for system attributes, indexed by a zpl_attr_t. */ 56 uint16_t *saoffs; 57 size_t sacnt; 58 const zfs_sattr_t *satab; 59 } zfs_fs_t; 60 61 /* 62 * The order of the attributes doesn't matter, this is simply the one hard-coded 63 * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. 64 */ 65 typedef enum zpl_attr { 66 ZPL_ATIME, 67 ZPL_MTIME, 68 ZPL_CTIME, 69 ZPL_CRTIME, 70 ZPL_GEN, 71 ZPL_MODE, 72 ZPL_SIZE, 73 ZPL_PARENT, 74 ZPL_LINKS, 75 ZPL_XATTR, 76 ZPL_RDEV, 77 ZPL_FLAGS, 78 ZPL_UID, 79 ZPL_GID, 80 ZPL_PAD, 81 ZPL_ZNODE_ACL, 82 ZPL_DACL_COUNT, 83 ZPL_SYMLINK, 84 ZPL_SCANSTAMP, 85 ZPL_DACL_ACES, 86 ZPL_DXATTR, 87 ZPL_PROJID, 88 } zpl_attr_t; 89 90 /* 91 * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. 92 */ 93 static const zfs_sattr_t zpl_attrs[] = { 94 #define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } 95 _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 96 _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 97 _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 98 _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 99 _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), 100 _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), 101 _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), 102 _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), 103 _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), 104 _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), 105 _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), 106 _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), 107 _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), 108 _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), 109 _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), 110 _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), 111 _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), 112 _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), 113 _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), 114 _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), 115 _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), 116 _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY), 117 #undef ZPL_ATTR 118 }; 119 120 /* 121 * This layout matches that of a filesystem created using OpenZFS on FreeBSD. 122 * It need not match in general, but FreeBSD's loader doesn't bother parsing the 123 * layout and just hard-codes attribute offsets. 124 */ 125 static const sa_attr_type_t zpl_attr_layout[] = { 126 ZPL_MODE, 127 ZPL_SIZE, 128 ZPL_GEN, 129 ZPL_UID, 130 ZPL_GID, 131 ZPL_PARENT, 132 ZPL_FLAGS, 133 ZPL_ATIME, 134 ZPL_MTIME, 135 ZPL_CTIME, 136 ZPL_CRTIME, 137 ZPL_LINKS, 138 ZPL_DACL_COUNT, 139 ZPL_DACL_ACES, 140 ZPL_SYMLINK, 141 }; 142 143 /* 144 * Keys for the ZPL attribute tables in the SA layout ZAP. The first two 145 * indices are reserved for legacy attribute encoding. 146 */ 147 #define SA_LAYOUT_INDEX_DEFAULT 2 148 #define SA_LAYOUT_INDEX_SYMLINK 3 149 150 struct fs_populate_dir { 151 SLIST_ENTRY(fs_populate_dir) next; 152 int dirfd; 153 uint64_t objid; 154 zfs_zap_t *zap; 155 }; 156 157 struct fs_populate_arg { 158 zfs_opt_t *zfs; 159 zfs_fs_t *fs; /* owning filesystem */ 160 uint64_t rootdirid; /* root directory dnode ID */ 161 int rootdirfd; /* root directory fd */ 162 SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ 163 }; 164 165 static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); 166 167 static void 168 eclose(int fd) 169 { 170 if (close(fd) != 0) 171 err(1, "close"); 172 } 173 174 static bool 175 fsnode_isroot(const fsnode *cur) 176 { 177 return (strcmp(cur->name, ".") == 0); 178 } 179 180 static bool 181 fsnode_valid(const fsnode *cur) 182 { 183 return (cur->type == S_IFREG || cur->type == S_IFDIR || 184 cur->type == S_IFLNK); 185 } 186 187 /* 188 * Visit each node in a directory hierarchy, in pre-order depth-first order. 189 */ 190 static void 191 fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) 192 { 193 assert(root->type == S_IFDIR); 194 195 for (fsnode *cur = root; cur != NULL; cur = cur->next) { 196 if (!fsnode_valid(cur)) { 197 warnx("skipping unhandled %s %s/%s", 198 inode_type(cur->type), cur->path, cur->name); 199 continue; 200 } 201 if (cb(cur, arg) == 0) 202 continue; 203 if (cur->type == S_IFDIR && cur->child != NULL) 204 fsnode_foreach(cur->child, cb, arg); 205 } 206 } 207 208 static void 209 fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) 210 { 211 struct fs_populate_dir *dir; 212 uint64_t type; 213 214 switch (cur->type) { 215 case S_IFREG: 216 type = DT_REG; 217 break; 218 case S_IFDIR: 219 type = DT_DIR; 220 break; 221 case S_IFLNK: 222 type = DT_LNK; 223 break; 224 default: 225 assert(0); 226 } 227 228 dir = SLIST_FIRST(&arg->dirs); 229 zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); 230 } 231 232 static void 233 fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, 234 size_t *szp) 235 { 236 assert(ind < fs->sacnt); 237 assert(fs->saoffs[ind] != 0xffff); 238 239 memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); 240 *szp += fs->satab[ind].size; 241 } 242 243 static void 244 fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, 245 size_t valsz, size_t varoff, uint16_t ind, size_t *szp) 246 { 247 assert(ind < fs->sacnt); 248 assert(fs->saoffs[ind] != 0xffff); 249 assert(fs->satab[ind].size == 0); 250 251 memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); 252 *szp += valsz; 253 } 254 255 /* 256 * Derive the relative fd/path combo needed to access a file. Ideally we'd 257 * always be able to use relative lookups (i.e., use the *at() system calls), 258 * since they require less path translation and are more amenable to sandboxing, 259 * but the handling of multiple staging directories makes that difficult. To 260 * make matters worse, we have no choice but to use relative lookups when 261 * dealing with an mtree manifest, so both mechanisms are implemented. 262 */ 263 static void 264 fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg, 265 char *path, size_t sz, int *dirfdp) 266 { 267 if (cur->contents != NULL) { 268 size_t n; 269 270 *dirfdp = AT_FDCWD; 271 n = strlcpy(path, cur->contents, sz); 272 assert(n < sz); 273 } else if (cur->root == NULL) { 274 size_t n; 275 276 *dirfdp = SLIST_FIRST(&arg->dirs)->dirfd; 277 n = strlcpy(path, cur->name, sz); 278 assert(n < sz); 279 } else { 280 int n; 281 282 *dirfdp = AT_FDCWD; 283 n = snprintf(path, sz, "%s/%s/%s", 284 cur->root, cur->path, cur->name); 285 assert(n >= 0); 286 assert((size_t)n < sz); 287 } 288 } 289 290 static int 291 fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags) 292 { 293 char path[PATH_MAX]; 294 int fd; 295 296 fs_populate_path(cur, arg, path, sizeof(path), &fd); 297 298 fd = openat(fd, path, flags); 299 if (fd < 0) 300 err(1, "openat(%s)", path); 301 return (fd); 302 } 303 304 static int 305 fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags) 306 { 307 int fd; 308 char path[PATH_MAX]; 309 310 fs_populate_path(cur, arg, path, sizeof(path), &fd); 311 312 return (openat(fd, path, flags)); 313 } 314 315 static void 316 fs_readlink(const fsnode *cur, struct fs_populate_arg *arg, 317 char *buf, size_t bufsz) 318 { 319 char path[PATH_MAX]; 320 int fd; 321 322 if (cur->symlink != NULL) { 323 size_t n; 324 325 n = strlcpy(buf, cur->symlink, bufsz); 326 assert(n < bufsz); 327 } else { 328 ssize_t n; 329 330 fs_populate_path(cur, arg, path, sizeof(path), &fd); 331 332 n = readlinkat(fd, path, buf, bufsz - 1); 333 if (n == -1) 334 err(1, "readlinkat(%s)", cur->name); 335 buf[n] = '\0'; 336 } 337 } 338 339 static void 340 fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts, 341 uint16_t ind, size_t *szp) 342 { 343 uint64_t timebuf[2]; 344 345 assert(ind < fs->sacnt); 346 assert(fs->saoffs[ind] != 0xffff); 347 assert(fs->satab[ind].size == sizeof(timebuf)); 348 349 timebuf[0] = ts->tv_sec; 350 timebuf[1] = ts->tv_nsec; 351 fs_populate_attr(fs, attrbuf, timebuf, ind, szp); 352 } 353 354 static void 355 fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, 356 dnode_phys_t *dnode) 357 { 358 char target[PATH_MAX]; 359 zfs_fs_t *fs; 360 zfs_ace_hdr_t aces[3]; 361 struct stat *sb; 362 sa_hdr_phys_t *sahdr; 363 uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; 364 char *attrbuf; 365 size_t bonussz, hdrsz; 366 int layout; 367 368 assert(dnode->dn_bonustype == DMU_OT_SA); 369 assert(dnode->dn_nblkptr == 1); 370 371 fs = arg->fs; 372 sb = &cur->inode->st; 373 374 switch (cur->type) { 375 case S_IFREG: 376 layout = SA_LAYOUT_INDEX_DEFAULT; 377 links = cur->inode->nlink; 378 objsize = sb->st_size; 379 parent = SLIST_FIRST(&arg->dirs)->objid; 380 break; 381 case S_IFDIR: 382 layout = SA_LAYOUT_INDEX_DEFAULT; 383 links = 1; /* .. */ 384 objsize = 1; /* .. */ 385 386 /* 387 * The size of a ZPL directory is the number of entries 388 * (including "." and ".."), and the link count is the number of 389 * entries which are directories (including "." and ".."). 390 */ 391 for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child; 392 c != NULL; c = c->next) { 393 switch (c->type) { 394 case S_IFDIR: 395 links++; 396 /* FALLTHROUGH */ 397 case S_IFREG: 398 case S_IFLNK: 399 objsize++; 400 break; 401 } 402 } 403 404 /* The root directory is its own parent. */ 405 parent = SLIST_EMPTY(&arg->dirs) ? 406 arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; 407 break; 408 case S_IFLNK: 409 fs_readlink(cur, arg, target, sizeof(target)); 410 411 layout = SA_LAYOUT_INDEX_SYMLINK; 412 links = 1; 413 objsize = strlen(target); 414 parent = SLIST_FIRST(&arg->dirs)->objid; 415 break; 416 default: 417 assert(0); 418 } 419 420 daclcount = nitems(aces); 421 flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_ARCHIVE | 422 ZFS_AV_MODIFIED; 423 gen = 1; 424 gid = sb->st_gid; 425 mode = sb->st_mode; 426 uid = sb->st_uid; 427 428 memset(aces, 0, sizeof(aces)); 429 aces[0].z_flags = ACE_OWNER; 430 aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 431 aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER | 432 ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL | 433 ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 434 if ((mode & S_IRUSR) != 0) 435 aces[0].z_access_mask |= ACE_READ_DATA; 436 if ((mode & S_IWUSR) != 0) 437 aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 438 if ((mode & S_IXUSR) != 0) 439 aces[0].z_access_mask |= ACE_EXECUTE; 440 441 aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; 442 aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 443 aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | 444 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 445 if ((mode & S_IRGRP) != 0) 446 aces[1].z_access_mask |= ACE_READ_DATA; 447 if ((mode & S_IWGRP) != 0) 448 aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 449 if ((mode & S_IXGRP) != 0) 450 aces[1].z_access_mask |= ACE_EXECUTE; 451 452 aces[2].z_flags = ACE_EVERYONE; 453 aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 454 aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | 455 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 456 if ((mode & S_IROTH) != 0) 457 aces[2].z_access_mask |= ACE_READ_DATA; 458 if ((mode & S_IWOTH) != 0) 459 aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 460 if ((mode & S_IXOTH) != 0) 461 aces[2].z_access_mask |= ACE_EXECUTE; 462 463 switch (layout) { 464 case SA_LAYOUT_INDEX_DEFAULT: 465 /* At most one variable-length attribute. */ 466 hdrsz = sizeof(uint64_t); 467 break; 468 case SA_LAYOUT_INDEX_SYMLINK: 469 /* At most five variable-length attributes. */ 470 hdrsz = sizeof(uint64_t) * 2; 471 break; 472 default: 473 assert(0); 474 } 475 476 sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); 477 sahdr->sa_magic = SA_MAGIC; 478 SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); 479 480 bonussz = SA_HDR_SIZE(sahdr); 481 attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); 482 483 fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); 484 fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); 485 fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); 486 fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); 487 fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); 488 fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); 489 fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); 490 fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); 491 fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); 492 493 /* 494 * We deliberately set atime = mtime here to ensure that images are 495 * reproducible. 496 */ 497 fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); 498 fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); 499 fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); 500 #ifdef __linux__ 501 /* Linux has no st_birthtim; approximate with st_ctim */ 502 fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz); 503 #else 504 fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); 505 #endif 506 507 fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, 508 ZPL_DACL_ACES, &bonussz); 509 sahdr->sa_lengths[0] = sizeof(aces); 510 511 if (cur->type == S_IFLNK) { 512 assert(layout == SA_LAYOUT_INDEX_SYMLINK); 513 /* Need to use a spill block pointer if the target is long. */ 514 assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); 515 fs_populate_varszattr(fs, attrbuf, target, objsize, 516 sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); 517 sahdr->sa_lengths[1] = (uint16_t)objsize; 518 } 519 520 dnode->dn_bonuslen = bonussz; 521 } 522 523 static void 524 fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) 525 { 526 struct dnode_cursor *c; 527 dnode_phys_t *dnode; 528 zfs_opt_t *zfs; 529 char *buf; 530 uint64_t dnid; 531 ssize_t n; 532 size_t bufsz; 533 off_t nbytes, reqbytes, size; 534 int fd; 535 536 assert(cur->type == S_IFREG); 537 assert((cur->inode->flags & FI_ROOT) == 0); 538 539 zfs = arg->zfs; 540 541 assert(cur->inode->ino != 0); 542 if ((cur->inode->flags & FI_ALLOCATED) != 0) { 543 /* 544 * This is a hard link of an existing file. 545 * 546 * XXX-MJ need to check whether it crosses datasets, add a test 547 * case for that 548 */ 549 fs_populate_dirent(arg, cur, cur->inode->ino); 550 return; 551 } 552 553 dnode = objset_dnode_bonus_alloc(arg->fs->os, 554 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); 555 cur->inode->ino = dnid; 556 cur->inode->flags |= FI_ALLOCATED; 557 558 fd = fs_open(cur, arg, O_RDONLY); 559 560 buf = zfs->filebuf; 561 bufsz = sizeof(zfs->filebuf); 562 size = cur->inode->st.st_size; 563 c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); 564 for (off_t foff = 0; foff < size; foff += nbytes) { 565 off_t loc, sofar; 566 567 /* 568 * Fill up our buffer, handling partial reads. 569 */ 570 sofar = 0; 571 nbytes = MIN(size - foff, (off_t)bufsz); 572 do { 573 n = read(fd, buf + sofar, nbytes); 574 if (n < 0) 575 err(1, "reading from '%s'", cur->name); 576 if (n == 0) 577 errx(1, "unexpected EOF reading '%s'", 578 cur->name); 579 sofar += n; 580 } while (sofar < nbytes); 581 582 if (nbytes < (off_t)bufsz) 583 memset(buf + nbytes, 0, bufsz - nbytes); 584 585 reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE; 586 loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes); 587 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc, 588 dnode_cursor_next(zfs, c, foff)); 589 } 590 eclose(fd); 591 dnode_cursor_finish(zfs, c); 592 593 fs_populate_sattrs(arg, cur, dnode); 594 fs_populate_dirent(arg, cur, dnid); 595 } 596 597 static void 598 fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg) 599 { 600 dnode_phys_t *dnode; 601 zfs_objset_t *os; 602 uint64_t dnid; 603 int dirfd; 604 605 assert(cur->type == S_IFDIR); 606 assert((cur->inode->flags & FI_ALLOCATED) == 0); 607 608 os = arg->fs->os; 609 610 dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS, 611 DMU_OT_SA, 0, &dnid); 612 613 /* 614 * Add an entry to the parent directory and open this directory. 615 */ 616 if (!SLIST_EMPTY(&arg->dirs)) { 617 fs_populate_dirent(arg, cur, dnid); 618 /* 619 * We only need the directory fd if we're finding files in 620 * it. If it's just there for other directories or 621 * files using contents= we don't need to succeed here. 622 */ 623 dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY); 624 } else { 625 arg->rootdirid = dnid; 626 dirfd = arg->rootdirfd; 627 arg->rootdirfd = -1; 628 } 629 630 /* 631 * Set ZPL attributes. 632 */ 633 fs_populate_sattrs(arg, cur, dnode); 634 635 /* 636 * If this is a root directory, then its children belong to a different 637 * dataset and this directory remains empty in the current objset. 638 */ 639 if ((cur->inode->flags & FI_ROOT) == 0) { 640 struct fs_populate_dir *dir; 641 642 dir = ecalloc(1, sizeof(*dir)); 643 dir->dirfd = dirfd; 644 dir->objid = dnid; 645 dir->zap = zap_alloc(os, dnode); 646 SLIST_INSERT_HEAD(&arg->dirs, dir, next); 647 } else { 648 zap_write(arg->zfs, zap_alloc(os, dnode)); 649 fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd); 650 } 651 } 652 653 static void 654 fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) 655 { 656 dnode_phys_t *dnode; 657 uint64_t dnid; 658 659 assert(cur->type == S_IFLNK); 660 assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0); 661 662 dnode = objset_dnode_bonus_alloc(arg->fs->os, 663 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); 664 665 fs_populate_dirent(arg, cur, dnid); 666 667 fs_populate_sattrs(arg, cur, dnode); 668 } 669 670 static fsnode * 671 fsnode_next(fsnode *cur) 672 { 673 for (cur = cur->next; cur != NULL; cur = cur->next) { 674 if (fsnode_valid(cur)) 675 return (cur); 676 } 677 return (NULL); 678 } 679 680 static int 681 fs_foreach_populate(fsnode *cur, void *_arg) 682 { 683 struct fs_populate_arg *arg; 684 struct fs_populate_dir *dir; 685 int ret; 686 687 arg = _arg; 688 switch (cur->type) { 689 case S_IFREG: 690 fs_populate_file(cur, arg); 691 break; 692 case S_IFDIR: 693 if (fsnode_isroot(cur)) 694 break; 695 fs_populate_dir(cur, arg); 696 break; 697 case S_IFLNK: 698 fs_populate_symlink(cur, arg); 699 break; 700 default: 701 assert(0); 702 } 703 704 ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; 705 706 if (fsnode_next(cur) == NULL && 707 (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { 708 /* 709 * We reached a terminal node in a subtree. Walk back up and 710 * write out directories. We're done once we hit the root of a 711 * dataset or find a level where we're not on the edge of the 712 * tree. 713 */ 714 do { 715 dir = SLIST_FIRST(&arg->dirs); 716 SLIST_REMOVE_HEAD(&arg->dirs, next); 717 zap_write(arg->zfs, dir->zap); 718 if (dir->dirfd != -1) 719 eclose(dir->dirfd); 720 free(dir); 721 cur = cur->parent; 722 } while (cur != NULL && fsnode_next(cur) == NULL && 723 (cur->inode->flags & FI_ROOT) == 0); 724 } 725 726 return (ret); 727 } 728 729 static void 730 fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index, 731 const sa_attr_type_t layout[], size_t sacnt) 732 { 733 char ti[16]; 734 735 assert(sizeof(layout[0]) == 2); 736 737 snprintf(ti, sizeof(ti), "%u", index); 738 zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt, 739 (const uint8_t *)layout); 740 } 741 742 /* 743 * Initialize system attribute tables. 744 * 745 * There are two elements to this. First, we write the zpl_attrs[] and 746 * zpl_attr_layout[] tables to disk. Then we create a lookup table which 747 * allows us to set file attributes quickly. 748 */ 749 static uint64_t 750 fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs) 751 { 752 zfs_zap_t *sazap, *salzap, *sarzap; 753 zfs_objset_t *os; 754 dnode_phys_t *saobj, *salobj, *sarobj; 755 uint64_t saobjid, salobjid, sarobjid; 756 uint16_t offset; 757 758 os = fs->os; 759 760 /* 761 * The on-disk tables are stored in two ZAP objects, the registry object 762 * and the layout object. Individual attributes are described by 763 * entries in the registry object; for example, the value for the 764 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute. 765 * The attributes of a file are ordered according to one of the layouts 766 * defined in the layout object. The master node object is simply used 767 * to locate the registry and layout objects. 768 */ 769 saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid); 770 salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid); 771 sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid); 772 773 sarzap = zap_alloc(os, sarobj); 774 for (size_t i = 0; i < nitems(zpl_attrs); i++) { 775 const zfs_sattr_t *sa; 776 uint64_t attr; 777 778 attr = 0; 779 sa = &zpl_attrs[i]; 780 SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs); 781 zap_add_uint64(sarzap, sa->name, attr); 782 } 783 zap_write(zfs, sarzap); 784 785 /* 786 * Layouts are arrays of indices into the registry. We define two 787 * layouts for use by the ZPL, one for non-symlinks and one for 788 * symlinks. They are identical except that the symlink layout includes 789 * ZPL_SYMLINK as its final attribute. 790 */ 791 salzap = zap_alloc(os, salobj); 792 assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK); 793 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT, 794 zpl_attr_layout, nitems(zpl_attr_layout) - 1); 795 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK, 796 zpl_attr_layout, nitems(zpl_attr_layout)); 797 zap_write(zfs, salzap); 798 799 sazap = zap_alloc(os, saobj); 800 zap_add_uint64(sazap, SA_LAYOUTS, salobjid); 801 zap_add_uint64(sazap, SA_REGISTRY, sarobjid); 802 zap_write(zfs, sazap); 803 804 /* Sanity check. */ 805 for (size_t i = 0; i < nitems(zpl_attrs); i++) 806 assert(i == zpl_attrs[i].id); 807 808 /* 809 * Build the offset table used when setting file attributes. File 810 * attributes are stored in the object's bonus buffer; this table 811 * provides the buffer offset of attributes referenced by the layout 812 * table. 813 */ 814 fs->sacnt = nitems(zpl_attrs); 815 fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs)); 816 for (size_t i = 0; i < fs->sacnt; i++) 817 fs->saoffs[i] = 0xffff; 818 offset = 0; 819 for (size_t i = 0; i < nitems(zpl_attr_layout); i++) { 820 uint16_t size; 821 822 assert(zpl_attr_layout[i] < fs->sacnt); 823 824 fs->saoffs[zpl_attr_layout[i]] = offset; 825 size = zpl_attrs[zpl_attr_layout[i]].size; 826 offset += size; 827 } 828 fs->satab = zpl_attrs; 829 830 return (saobjid); 831 } 832 833 static void 834 fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg) 835 { 836 char *mountpoint, *origmountpoint, *name, *next; 837 fsnode *cur, *root; 838 uint64_t canmount; 839 840 if (!dsl_dir_has_dataset(dsldir)) 841 return; 842 843 if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) 844 return; 845 mountpoint = dsl_dir_get_mountpoint(zfs, dsldir); 846 if (mountpoint == NULL) 847 return; 848 849 /* 850 * If we were asked to specify a bootfs, set it here. 851 */ 852 if (zfs->bootfs != NULL && strcmp(zfs->bootfs, 853 dsl_dir_fullname(dsldir)) == 0) { 854 zap_add_uint64(zfs->poolprops, "bootfs", 855 dsl_dir_dataset_id(dsldir)); 856 } 857 858 origmountpoint = mountpoint; 859 860 /* 861 * Figure out which fsnode corresponds to our mountpoint. 862 */ 863 root = arg; 864 cur = root; 865 if (strcmp(mountpoint, zfs->rootpath) != 0) { 866 mountpoint += strlen(zfs->rootpath); 867 868 /* 869 * Look up the directory in the staged tree. For example, if 870 * the dataset's mount point is /foo/bar/baz, we'll search the 871 * root directory for "foo", search "foo" for "baz", and so on. 872 * Each intermediate name must refer to a directory; the final 873 * component need not exist. 874 */ 875 cur = root; 876 for (next = name = mountpoint; next != NULL;) { 877 for (; *next == '/'; next++) 878 ; 879 name = strsep(&next, "/"); 880 881 for (; cur != NULL && strcmp(cur->name, name) != 0; 882 cur = cur->next) 883 ; 884 if (cur == NULL) { 885 if (next == NULL) 886 break; 887 errx(1, "missing mountpoint directory for `%s'", 888 dsl_dir_fullname(dsldir)); 889 } 890 if (cur->type != S_IFDIR) { 891 errx(1, 892 "mountpoint for `%s' is not a directory", 893 dsl_dir_fullname(dsldir)); 894 } 895 if (next != NULL) 896 cur = cur->child; 897 } 898 } 899 900 if (cur != NULL) { 901 assert(cur->type == S_IFDIR); 902 903 /* 904 * Multiple datasets shouldn't share a mountpoint. It's 905 * technically allowed, but it's not clear what makefs should do 906 * in that case. 907 */ 908 assert((cur->inode->flags & FI_ROOT) == 0); 909 if (cur != root) 910 cur->inode->flags |= FI_ROOT; 911 assert(cur->inode->param == NULL); 912 cur->inode->param = dsldir; 913 } 914 915 free(origmountpoint); 916 } 917 918 static int 919 fs_foreach_mark(fsnode *cur, void *arg) 920 { 921 uint64_t *countp; 922 923 countp = arg; 924 if (cur->type == S_IFDIR && fsnode_isroot(cur)) 925 return (1); 926 927 if (cur->inode->ino == 0) { 928 cur->inode->ino = ++(*countp); 929 cur->inode->nlink = 1; 930 } else { 931 cur->inode->nlink++; 932 } 933 934 return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1); 935 } 936 937 /* 938 * Create a filesystem dataset. More specifically: 939 * - create an object set for the dataset, 940 * - add required metadata (SA tables, property definitions, etc.) to that 941 * object set, 942 * - optionally populate the object set with file objects, using "root" as the 943 * root directory. 944 * 945 * "dirfd" is a directory descriptor for the directory referenced by "root". It 946 * is closed before returning. 947 */ 948 static void 949 fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd) 950 { 951 struct fs_populate_arg arg; 952 zfs_fs_t fs; 953 zfs_zap_t *masterzap; 954 zfs_objset_t *os; 955 dnode_phys_t *deleteq, *masterobj; 956 uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid; 957 bool fakedroot; 958 959 /* 960 * This dataset's mountpoint doesn't exist in the staging tree, or the 961 * dataset doesn't have a mountpoint at all. In either case we still 962 * need a root directory. Fake up a root fsnode to handle this case. 963 */ 964 fakedroot = root == NULL; 965 if (fakedroot) { 966 struct stat *stp; 967 968 assert(dirfd == -1); 969 970 root = ecalloc(1, sizeof(*root)); 971 root->inode = ecalloc(1, sizeof(*root->inode)); 972 root->name = estrdup("."); 973 root->type = S_IFDIR; 974 975 stp = &root->inode->st; 976 stp->st_uid = 0; 977 stp->st_gid = 0; 978 stp->st_mode = S_IFDIR | 0755; 979 } 980 assert(root->type == S_IFDIR); 981 assert(fsnode_isroot(root)); 982 983 /* 984 * Initialize the object set for this dataset. 985 */ 986 os = objset_alloc(zfs, DMU_OST_ZFS); 987 masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid); 988 assert(moid == MASTER_NODE_OBJ); 989 990 memset(&fs, 0, sizeof(fs)); 991 fs.os = os; 992 993 /* 994 * Create the ZAP SA layout now since filesystem object dnodes will 995 * refer to those attributes. 996 */ 997 saobjid = fs_set_zpl_attrs(zfs, &fs); 998 999 /* 1000 * Make a pass over the staged directory to detect hard links and assign 1001 * virtual dnode numbers. 1002 */ 1003 dnodecount = 1; /* root directory */ 1004 fsnode_foreach(root, fs_foreach_mark, &dnodecount); 1005 1006 /* 1007 * Make a second pass to populate the dataset with files from the 1008 * staged directory. Most of our runtime is spent here. 1009 */ 1010 arg.rootdirfd = dirfd; 1011 arg.zfs = zfs; 1012 arg.fs = &fs; 1013 SLIST_INIT(&arg.dirs); 1014 fs_populate_dir(root, &arg); 1015 assert(!SLIST_EMPTY(&arg.dirs)); 1016 fsnode_foreach(root, fs_foreach_populate, &arg); 1017 assert(SLIST_EMPTY(&arg.dirs)); 1018 rootdirid = arg.rootdirid; 1019 1020 /* 1021 * Create an empty delete queue. We don't do anything with it, but 1022 * OpenZFS will refuse to mount filesystems that don't have one. 1023 */ 1024 deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid); 1025 zap_write(zfs, zap_alloc(os, deleteq)); 1026 1027 /* 1028 * Populate and write the master node object. This is a ZAP object 1029 * containing various dataset properties and the object IDs of the root 1030 * directory and delete queue. 1031 */ 1032 masterzap = zap_alloc(os, masterobj); 1033 zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid); 1034 zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid); 1035 zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid); 1036 zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */); 1037 zap_add_uint64(masterzap, "normalization", 0 /* off */); 1038 zap_add_uint64(masterzap, "utf8only", 0 /* off */); 1039 zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */); 1040 zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */); 1041 zap_write(zfs, masterzap); 1042 1043 /* 1044 * All finished with this object set, we may as well write it now. 1045 * The DSL layer will sum up the bytes consumed by each dataset using 1046 * information stored in the object set, so it can't be freed just yet. 1047 */ 1048 dsl_dir_dataset_write(zfs, os, dsldir); 1049 1050 if (fakedroot) { 1051 free(root->inode); 1052 free(root->name); 1053 free(root); 1054 } 1055 free(fs.saoffs); 1056 } 1057 1058 /* 1059 * Create an object set for each DSL directory which has a dataset and doesn't 1060 * already have an object set. 1061 */ 1062 static void 1063 fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused) 1064 { 1065 if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir)) 1066 fs_build_one(zfs, dsldir, NULL, -1); 1067 } 1068 1069 /* 1070 * Create our datasets and populate them with files. 1071 */ 1072 void 1073 fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root) 1074 { 1075 /* 1076 * Run through our datasets and find the root fsnode for each one. Each 1077 * root fsnode is flagged so that we can figure out which dataset it 1078 * belongs to. 1079 */ 1080 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root); 1081 1082 /* 1083 * Did we find our boot filesystem? 1084 */ 1085 if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs")) 1086 errx(1, "no mounted dataset matches bootfs property `%s'", 1087 zfs->bootfs); 1088 1089 /* 1090 * Traverse the file hierarchy starting from the root fsnode. One 1091 * dataset, not necessarily the root dataset, must "own" the root 1092 * directory by having its mountpoint be equal to the root path. 1093 * 1094 * As roots of other datasets are encountered during the traversal, 1095 * fs_build_one() recursively creates the corresponding object sets and 1096 * populates them. Once this function has returned, all datasets will 1097 * have been fully populated. 1098 */ 1099 fs_build_one(zfs, root->inode->param, root, dirfd); 1100 1101 /* 1102 * Now create object sets for datasets whose mountpoints weren't found 1103 * in the staging directory, either because there is no mountpoint, or 1104 * because the mountpoint doesn't correspond to an existing directory. 1105 */ 1106 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL); 1107 } 1108