1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/dirent.h> 32 #include <sys/stat.h> 33 34 #include <assert.h> 35 #include <fcntl.h> 36 #include <string.h> 37 #include <unistd.h> 38 39 #include <util.h> 40 41 #include "makefs.h" 42 #include "zfs.h" 43 44 typedef struct { 45 const char *name; 46 unsigned int id; 47 uint16_t size; 48 sa_bswap_type_t bs; 49 } zfs_sattr_t; 50 51 typedef struct zfs_fs { 52 zfs_objset_t *os; 53 54 /* Offset table for system attributes, indexed by a zpl_attr_t. */ 55 uint16_t *saoffs; 56 size_t sacnt; 57 const zfs_sattr_t *satab; 58 } zfs_fs_t; 59 60 /* 61 * The order of the attributes doesn't matter, this is simply the one hard-coded 62 * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. 63 */ 64 typedef enum zpl_attr { 65 ZPL_ATIME, 66 ZPL_MTIME, 67 ZPL_CTIME, 68 ZPL_CRTIME, 69 ZPL_GEN, 70 ZPL_MODE, 71 ZPL_SIZE, 72 ZPL_PARENT, 73 ZPL_LINKS, 74 ZPL_XATTR, 75 ZPL_RDEV, 76 ZPL_FLAGS, 77 ZPL_UID, 78 ZPL_GID, 79 ZPL_PAD, 80 ZPL_ZNODE_ACL, 81 ZPL_DACL_COUNT, 82 ZPL_SYMLINK, 83 ZPL_SCANSTAMP, 84 ZPL_DACL_ACES, 85 ZPL_DXATTR, 86 ZPL_PROJID, 87 } zpl_attr_t; 88 89 /* 90 * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. 91 */ 92 static const zfs_sattr_t zpl_attrs[] = { 93 #define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } 94 _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 95 _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 96 _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 97 _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 98 _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), 99 _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), 100 _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), 101 _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), 102 _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), 103 _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), 104 _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), 105 _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), 106 _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), 107 _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), 108 _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), 109 _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), 110 _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), 111 _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), 112 _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), 113 _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), 114 _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), 115 _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY), 116 #undef ZPL_ATTR 117 }; 118 119 /* 120 * This layout matches that of a filesystem created using OpenZFS on FreeBSD. 121 * It need not match in general, but FreeBSD's loader doesn't bother parsing the 122 * layout and just hard-codes attribute offsets. 123 */ 124 static const sa_attr_type_t zpl_attr_layout[] = { 125 ZPL_MODE, 126 ZPL_SIZE, 127 ZPL_GEN, 128 ZPL_UID, 129 ZPL_GID, 130 ZPL_PARENT, 131 ZPL_FLAGS, 132 ZPL_ATIME, 133 ZPL_MTIME, 134 ZPL_CTIME, 135 ZPL_CRTIME, 136 ZPL_LINKS, 137 ZPL_DACL_COUNT, 138 ZPL_DACL_ACES, 139 ZPL_SYMLINK, 140 }; 141 142 /* 143 * Keys for the ZPL attribute tables in the SA layout ZAP. The first two 144 * indices are reserved for legacy attribute encoding. 145 */ 146 #define SA_LAYOUT_INDEX_DEFAULT 2 147 #define SA_LAYOUT_INDEX_SYMLINK 3 148 149 struct fs_populate_dir { 150 SLIST_ENTRY(fs_populate_dir) next; 151 int dirfd; 152 uint64_t objid; 153 zfs_zap_t *zap; 154 }; 155 156 struct fs_populate_arg { 157 zfs_opt_t *zfs; 158 zfs_fs_t *fs; /* owning filesystem */ 159 uint64_t rootdirid; /* root directory dnode ID */ 160 int rootdirfd; /* root directory fd */ 161 SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ 162 }; 163 164 static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); 165 166 static void 167 eclose(int fd) 168 { 169 if (close(fd) != 0) 170 err(1, "close"); 171 } 172 173 static bool 174 fsnode_isroot(const fsnode *cur) 175 { 176 return (strcmp(cur->name, ".") == 0); 177 } 178 179 /* 180 * Visit each node in a directory hierarchy, in pre-order depth-first order. 181 */ 182 static void 183 fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) 184 { 185 assert(root->type == S_IFDIR); 186 187 for (fsnode *cur = root; cur != NULL; cur = cur->next) { 188 assert(cur->type == S_IFREG || cur->type == S_IFDIR || 189 cur->type == S_IFLNK); 190 191 if (cb(cur, arg) == 0) 192 continue; 193 if (cur->type == S_IFDIR && cur->child != NULL) 194 fsnode_foreach(cur->child, cb, arg); 195 } 196 } 197 198 static void 199 fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) 200 { 201 struct fs_populate_dir *dir; 202 uint64_t type; 203 204 switch (cur->type) { 205 case S_IFREG: 206 type = DT_REG; 207 break; 208 case S_IFDIR: 209 type = DT_DIR; 210 break; 211 case S_IFLNK: 212 type = DT_LNK; 213 break; 214 default: 215 assert(0); 216 } 217 218 dir = SLIST_FIRST(&arg->dirs); 219 zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); 220 } 221 222 static void 223 fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, 224 size_t *szp) 225 { 226 assert(ind < fs->sacnt); 227 assert(fs->saoffs[ind] != 0xffff); 228 229 memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); 230 *szp += fs->satab[ind].size; 231 } 232 233 static void 234 fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, 235 size_t valsz, size_t varoff, uint16_t ind, size_t *szp) 236 { 237 assert(ind < fs->sacnt); 238 assert(fs->saoffs[ind] != 0xffff); 239 assert(fs->satab[ind].size == 0); 240 241 memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); 242 *szp += valsz; 243 } 244 245 /* 246 * Derive the relative fd/path combo needed to access a file. Ideally we'd 247 * always be able to use relative lookups (i.e., use the *at() system calls), 248 * since they require less path translation and are more amenable to sandboxing, 249 * but the handling of multiple staging directories makes that difficult. To 250 * make matters worse, we have no choice but to use relative lookups when 251 * dealing with an mtree manifest, so both mechanisms are implemented. 252 */ 253 static void 254 fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg, 255 char *path, size_t sz, int *dirfdp) 256 { 257 if (cur->root == NULL) { 258 size_t n; 259 260 *dirfdp = SLIST_FIRST(&arg->dirs)->dirfd; 261 n = strlcpy(path, cur->name, sz); 262 assert(n < sz); 263 } else { 264 int n; 265 266 *dirfdp = AT_FDCWD; 267 n = snprintf(path, sz, "%s/%s/%s", 268 cur->root, cur->path, cur->name); 269 assert(n >= 0); 270 assert((size_t)n < sz); 271 } 272 } 273 274 static int 275 fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags) 276 { 277 char path[PATH_MAX]; 278 int fd; 279 280 fs_populate_path(cur, arg, path, sizeof(path), &fd); 281 282 fd = openat(fd, path, flags); 283 if (fd < 0) 284 err(1, "openat(%s)", path); 285 return (fd); 286 } 287 288 static void 289 fs_readlink(const fsnode *cur, struct fs_populate_arg *arg, 290 char *buf, size_t bufsz) 291 { 292 char path[PATH_MAX]; 293 ssize_t n; 294 int fd; 295 296 fs_populate_path(cur, arg, path, sizeof(path), &fd); 297 298 n = readlinkat(fd, path, buf, bufsz - 1); 299 if (n == -1) 300 err(1, "readlinkat(%s)", cur->name); 301 buf[n] = '\0'; 302 } 303 304 static void 305 fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, 306 dnode_phys_t *dnode) 307 { 308 char target[PATH_MAX]; 309 zfs_fs_t *fs; 310 zfs_ace_hdr_t aces[3]; 311 struct stat *sb; 312 sa_hdr_phys_t *sahdr; 313 uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; 314 char *attrbuf; 315 size_t bonussz, hdrsz; 316 int layout; 317 318 assert(dnode->dn_bonustype == DMU_OT_SA); 319 assert(dnode->dn_nblkptr == 1); 320 321 fs = arg->fs; 322 sb = &cur->inode->st; 323 324 switch (cur->type) { 325 case S_IFREG: 326 layout = SA_LAYOUT_INDEX_DEFAULT; 327 links = cur->inode->nlink; 328 objsize = sb->st_size; 329 parent = SLIST_FIRST(&arg->dirs)->objid; 330 break; 331 case S_IFDIR: 332 layout = SA_LAYOUT_INDEX_DEFAULT; 333 links = 1; /* .. */ 334 objsize = 1; /* .. */ 335 336 /* 337 * The size of a ZPL directory is the number of entries 338 * (including "." and ".."), and the link count is the number of 339 * entries which are directories (including "." and ".."). 340 */ 341 for (fsnode *c = fsnode_isroot(cur) ? cur->next : cur->child; 342 c != NULL; c = c->next) { 343 if (c->type == S_IFDIR) 344 links++; 345 objsize++; 346 } 347 348 /* The root directory is its own parent. */ 349 parent = SLIST_EMPTY(&arg->dirs) ? 350 arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; 351 break; 352 case S_IFLNK: 353 fs_readlink(cur, arg, target, sizeof(target)); 354 355 layout = SA_LAYOUT_INDEX_SYMLINK; 356 links = 1; 357 objsize = strlen(target); 358 parent = SLIST_FIRST(&arg->dirs)->objid; 359 break; 360 default: 361 assert(0); 362 } 363 364 daclcount = nitems(aces); 365 flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_NO_EXECS_DENIED | 366 ZFS_ARCHIVE | ZFS_AV_MODIFIED; /* XXX-MJ */ 367 gen = 1; 368 gid = sb->st_gid; 369 mode = sb->st_mode; 370 uid = sb->st_uid; 371 372 memset(aces, 0, sizeof(aces)); 373 aces[0].z_flags = ACE_OWNER; 374 aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 375 aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER | 376 ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL | 377 ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 378 if ((mode & S_IRUSR) != 0) 379 aces[0].z_access_mask |= ACE_READ_DATA; 380 if ((mode & S_IWUSR) != 0) 381 aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 382 if ((mode & S_IXUSR) != 0) 383 aces[0].z_access_mask |= ACE_EXECUTE; 384 385 aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; 386 aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 387 aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | 388 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 389 if ((mode & S_IRGRP) != 0) 390 aces[1].z_access_mask |= ACE_READ_DATA; 391 if ((mode & S_IWGRP) != 0) 392 aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 393 if ((mode & S_IXGRP) != 0) 394 aces[1].z_access_mask |= ACE_EXECUTE; 395 396 aces[2].z_flags = ACE_EVERYONE; 397 aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 398 aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | 399 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 400 if ((mode & S_IROTH) != 0) 401 aces[2].z_access_mask |= ACE_READ_DATA; 402 if ((mode & S_IWOTH) != 0) 403 aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 404 if ((mode & S_IXOTH) != 0) 405 aces[2].z_access_mask |= ACE_EXECUTE; 406 407 switch (layout) { 408 case SA_LAYOUT_INDEX_DEFAULT: 409 /* At most one variable-length attribute. */ 410 hdrsz = sizeof(uint64_t); 411 break; 412 case SA_LAYOUT_INDEX_SYMLINK: 413 /* At most five variable-length attributes. */ 414 hdrsz = sizeof(uint64_t) * 2; 415 break; 416 default: 417 assert(0); 418 } 419 420 sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); 421 sahdr->sa_magic = SA_MAGIC; 422 SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); 423 424 bonussz = SA_HDR_SIZE(sahdr); 425 attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); 426 427 fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); 428 fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); 429 fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); 430 fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); 431 fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); 432 fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); 433 fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); 434 fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); 435 fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); 436 437 /* 438 * We deliberately set atime = mtime here to ensure that images are 439 * reproducible. 440 */ 441 assert(sizeof(sb->st_mtim) == fs->satab[ZPL_ATIME].size); 442 fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); 443 assert(sizeof(sb->st_ctim) == fs->satab[ZPL_CTIME].size); 444 fs_populate_attr(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); 445 assert(sizeof(sb->st_mtim) == fs->satab[ZPL_MTIME].size); 446 fs_populate_attr(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); 447 assert(sizeof(sb->st_birthtim) == fs->satab[ZPL_CRTIME].size); 448 fs_populate_attr(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); 449 450 fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, 451 ZPL_DACL_ACES, &bonussz); 452 sahdr->sa_lengths[0] = sizeof(aces); 453 454 if (cur->type == S_IFLNK) { 455 assert(layout == SA_LAYOUT_INDEX_SYMLINK); 456 /* Need to use a spill block pointer if the target is long. */ 457 assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); 458 fs_populate_varszattr(fs, attrbuf, target, objsize, 459 sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); 460 sahdr->sa_lengths[1] = (uint16_t)objsize; 461 } 462 463 dnode->dn_bonuslen = bonussz; 464 } 465 466 static void 467 fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) 468 { 469 struct dnode_cursor *c; 470 dnode_phys_t *dnode; 471 zfs_opt_t *zfs; 472 char *buf; 473 uint64_t dnid; 474 ssize_t n; 475 size_t bufsz; 476 off_t size, target; 477 int fd; 478 479 assert(cur->type == S_IFREG); 480 assert((cur->inode->flags & FI_ROOT) == 0); 481 482 zfs = arg->zfs; 483 484 assert(cur->inode->ino != 0); 485 if ((cur->inode->flags & FI_ALLOCATED) != 0) { 486 /* 487 * This is a hard link of an existing file. 488 * 489 * XXX-MJ need to check whether it crosses datasets, add a test 490 * case for that 491 */ 492 fs_populate_dirent(arg, cur, cur->inode->ino); 493 return; 494 } 495 496 dnode = objset_dnode_bonus_alloc(arg->fs->os, 497 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); 498 cur->inode->ino = dnid; 499 cur->inode->flags |= FI_ALLOCATED; 500 501 fd = fs_open(cur, arg, O_RDONLY); 502 503 buf = zfs->filebuf; 504 bufsz = sizeof(zfs->filebuf); 505 size = cur->inode->st.st_size; 506 c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); 507 for (off_t foff = 0; foff < size; foff += target) { 508 off_t loc, sofar; 509 510 /* 511 * Fill up our buffer, handling partial reads. 512 * 513 * It might be profitable to use copy_file_range(2) here. 514 */ 515 sofar = 0; 516 target = MIN(size - foff, (off_t)bufsz); 517 do { 518 n = read(fd, buf + sofar, target); 519 if (n < 0) 520 err(1, "reading from '%s'", cur->name); 521 if (n == 0) 522 errx(1, "unexpected EOF reading '%s'", 523 cur->name); 524 sofar += n; 525 } while (sofar < target); 526 527 if (target < (off_t)bufsz) 528 memset(buf + target, 0, bufsz - target); 529 530 loc = objset_space_alloc(zfs, arg->fs->os, &target); 531 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, target, loc, 532 dnode_cursor_next(zfs, c, foff)); 533 } 534 eclose(fd); 535 dnode_cursor_finish(zfs, c); 536 537 fs_populate_sattrs(arg, cur, dnode); 538 fs_populate_dirent(arg, cur, dnid); 539 } 540 541 static void 542 fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg) 543 { 544 dnode_phys_t *dnode; 545 zfs_objset_t *os; 546 uint64_t dnid; 547 int dirfd; 548 549 assert(cur->type == S_IFDIR); 550 assert((cur->inode->flags & FI_ALLOCATED) == 0); 551 552 os = arg->fs->os; 553 554 dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS, 555 DMU_OT_SA, 0, &dnid); 556 557 /* 558 * Add an entry to the parent directory and open this directory. 559 */ 560 if (!SLIST_EMPTY(&arg->dirs)) { 561 fs_populate_dirent(arg, cur, dnid); 562 dirfd = fs_open(cur, arg, O_DIRECTORY | O_RDONLY); 563 } else { 564 arg->rootdirid = dnid; 565 dirfd = arg->rootdirfd; 566 arg->rootdirfd = -1; 567 } 568 569 /* 570 * Set ZPL attributes. 571 */ 572 fs_populate_sattrs(arg, cur, dnode); 573 574 /* 575 * If this is a root directory, then its children belong to a different 576 * dataset and this directory remains empty in the current objset. 577 */ 578 if ((cur->inode->flags & FI_ROOT) == 0) { 579 struct fs_populate_dir *dir; 580 581 dir = ecalloc(1, sizeof(*dir)); 582 dir->dirfd = dirfd; 583 dir->objid = dnid; 584 dir->zap = zap_alloc(os, dnode); 585 SLIST_INSERT_HEAD(&arg->dirs, dir, next); 586 } else { 587 zap_write(arg->zfs, zap_alloc(os, dnode)); 588 fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd); 589 } 590 } 591 592 static void 593 fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) 594 { 595 dnode_phys_t *dnode; 596 uint64_t dnid; 597 598 assert(cur->type == S_IFLNK); 599 assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0); 600 601 dnode = objset_dnode_bonus_alloc(arg->fs->os, 602 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); 603 604 fs_populate_dirent(arg, cur, dnid); 605 606 fs_populate_sattrs(arg, cur, dnode); 607 } 608 609 static int 610 fs_foreach_populate(fsnode *cur, void *_arg) 611 { 612 struct fs_populate_arg *arg; 613 struct fs_populate_dir *dir; 614 int ret; 615 616 arg = _arg; 617 switch (cur->type) { 618 case S_IFREG: 619 fs_populate_file(cur, arg); 620 break; 621 case S_IFDIR: 622 if (fsnode_isroot(cur)) 623 break; 624 fs_populate_dir(cur, arg); 625 break; 626 case S_IFLNK: 627 fs_populate_symlink(cur, arg); 628 break; 629 default: 630 assert(0); 631 } 632 633 ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; 634 635 if (cur->next == NULL && 636 (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { 637 /* 638 * We reached a terminal node in a subtree. Walk back up and 639 * write out directories. We're done once we hit the root of a 640 * dataset or find a level where we're not on the edge of the 641 * tree. 642 */ 643 do { 644 dir = SLIST_FIRST(&arg->dirs); 645 SLIST_REMOVE_HEAD(&arg->dirs, next); 646 zap_write(arg->zfs, dir->zap); 647 if (dir->dirfd != -1) 648 eclose(dir->dirfd); 649 free(dir); 650 cur = cur->parent; 651 } while (cur != NULL && cur->next == NULL && 652 (cur->inode->flags & FI_ROOT) == 0); 653 } 654 655 return (ret); 656 } 657 658 static void 659 fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index, 660 const sa_attr_type_t layout[], size_t sacnt) 661 { 662 char ti[16]; 663 664 assert(sizeof(layout[0]) == 2); 665 666 snprintf(ti, sizeof(ti), "%u", index); 667 zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt, 668 (const uint8_t *)layout); 669 } 670 671 /* 672 * Initialize system attribute tables. 673 * 674 * There are two elements to this. First, we write the zpl_attrs[] and 675 * zpl_attr_layout[] tables to disk. Then we create a lookup table which 676 * allows us to set file attributes quickly. 677 */ 678 static uint64_t 679 fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs) 680 { 681 zfs_zap_t *sazap, *salzap, *sarzap; 682 zfs_objset_t *os; 683 dnode_phys_t *saobj, *salobj, *sarobj; 684 uint64_t saobjid, salobjid, sarobjid; 685 uint16_t offset; 686 687 os = fs->os; 688 689 /* 690 * The on-disk tables are stored in two ZAP objects, the registry object 691 * and the layout object. Individual attributes are described by 692 * entries in the registry object; for example, the value for the 693 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute. 694 * The attributes of a file are ordered according to one of the layouts 695 * defined in the layout object. The master node object is simply used 696 * to locate the registry and layout objects. 697 */ 698 saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid); 699 salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid); 700 sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid); 701 702 sarzap = zap_alloc(os, sarobj); 703 for (size_t i = 0; i < nitems(zpl_attrs); i++) { 704 const zfs_sattr_t *sa; 705 uint64_t attr; 706 707 attr = 0; 708 sa = &zpl_attrs[i]; 709 SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs); 710 zap_add_uint64(sarzap, sa->name, attr); 711 } 712 zap_write(zfs, sarzap); 713 714 /* 715 * Layouts are arrays of indices into the registry. We define two 716 * layouts for use by the ZPL, one for non-symlinks and one for 717 * symlinks. They are identical except that the symlink layout includes 718 * ZPL_SYMLINK as its final attribute. 719 */ 720 salzap = zap_alloc(os, salobj); 721 assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK); 722 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT, 723 zpl_attr_layout, nitems(zpl_attr_layout) - 1); 724 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK, 725 zpl_attr_layout, nitems(zpl_attr_layout)); 726 zap_write(zfs, salzap); 727 728 sazap = zap_alloc(os, saobj); 729 zap_add_uint64(sazap, SA_LAYOUTS, salobjid); 730 zap_add_uint64(sazap, SA_REGISTRY, sarobjid); 731 zap_write(zfs, sazap); 732 733 /* Sanity check. */ 734 for (size_t i = 0; i < nitems(zpl_attrs); i++) 735 assert(i == zpl_attrs[i].id); 736 737 /* 738 * Build the offset table used when setting file attributes. File 739 * attributes are stored in the object's bonus buffer; this table 740 * provides the buffer offset of attributes referenced by the layout 741 * table. 742 */ 743 fs->sacnt = nitems(zpl_attrs); 744 fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs)); 745 for (size_t i = 0; i < fs->sacnt; i++) 746 fs->saoffs[i] = 0xffff; 747 offset = 0; 748 for (size_t i = 0; i < nitems(zpl_attr_layout); i++) { 749 uint16_t size; 750 751 assert(zpl_attr_layout[i] < fs->sacnt); 752 753 fs->saoffs[zpl_attr_layout[i]] = offset; 754 size = zpl_attrs[zpl_attr_layout[i]].size; 755 offset += size; 756 } 757 fs->satab = zpl_attrs; 758 759 return (saobjid); 760 } 761 762 static void 763 fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg) 764 { 765 char *mountpoint, *origmountpoint, *name, *next; 766 fsnode *cur, *root; 767 uint64_t canmount; 768 769 if (!dsl_dir_has_dataset(dsldir)) 770 return; 771 772 if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) 773 return; 774 mountpoint = dsl_dir_get_mountpoint(zfs, dsldir); 775 if (mountpoint == NULL) 776 return; 777 778 /* 779 * If we were asked to specify a bootfs, set it here. 780 */ 781 if (zfs->bootfs != NULL && strcmp(zfs->bootfs, 782 dsl_dir_fullname(dsldir)) == 0) { 783 zap_add_uint64(zfs->poolprops, "bootfs", 784 dsl_dir_dataset_id(dsldir)); 785 } 786 787 origmountpoint = mountpoint; 788 789 /* 790 * Figure out which fsnode corresponds to our mountpoint. 791 */ 792 root = arg; 793 cur = root; 794 if (strcmp(mountpoint, zfs->rootpath) != 0) { 795 mountpoint += strlen(zfs->rootpath); 796 797 /* 798 * Look up the directory in the staged tree. For example, if 799 * the dataset's mount point is /foo/bar/baz, we'll search the 800 * root directory for "foo", search "foo" for "baz", and so on. 801 * Each intermediate name must refer to a directory; the final 802 * component need not exist. 803 */ 804 cur = root; 805 for (next = name = mountpoint; next != NULL;) { 806 for (; *next == '/'; next++) 807 ; 808 name = strsep(&next, "/"); 809 810 for (; cur != NULL && strcmp(cur->name, name) != 0; 811 cur = cur->next) 812 ; 813 if (cur == NULL) { 814 if (next == NULL) 815 break; 816 errx(1, "missing mountpoint directory for `%s'", 817 dsl_dir_fullname(dsldir)); 818 } 819 if (cur->type != S_IFDIR) { 820 errx(1, 821 "mountpoint for `%s' is not a directory", 822 dsl_dir_fullname(dsldir)); 823 } 824 if (next != NULL) 825 cur = cur->child; 826 } 827 } 828 829 if (cur != NULL) { 830 assert(cur->type == S_IFDIR); 831 832 /* 833 * Multiple datasets shouldn't share a mountpoint. It's 834 * technically allowed, but it's not clear what makefs should do 835 * in that case. 836 */ 837 assert((cur->inode->flags & FI_ROOT) == 0); 838 if (cur != root) 839 cur->inode->flags |= FI_ROOT; 840 assert(cur->inode->param == NULL); 841 cur->inode->param = dsldir; 842 } 843 844 free(origmountpoint); 845 } 846 847 static int 848 fs_foreach_mark(fsnode *cur, void *arg) 849 { 850 uint64_t *countp; 851 852 countp = arg; 853 if (cur->type == S_IFDIR && fsnode_isroot(cur)) 854 return (1); 855 856 if (cur->inode->ino == 0) { 857 cur->inode->ino = ++(*countp); 858 cur->inode->nlink = 1; 859 } else { 860 cur->inode->nlink++; 861 } 862 863 return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1); 864 } 865 866 /* 867 * Create a filesystem dataset. More specifically: 868 * - create an object set for the dataset, 869 * - add required metadata (SA tables, property definitions, etc.) to that 870 * object set, 871 * - optionally populate the object set with file objects, using "root" as the 872 * root directory. 873 * 874 * "dirfd" is a directory descriptor for the directory referenced by "root". It 875 * is closed before returning. 876 */ 877 static void 878 fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd) 879 { 880 struct fs_populate_arg arg; 881 zfs_fs_t fs; 882 zfs_zap_t *masterzap; 883 zfs_objset_t *os; 884 dnode_phys_t *deleteq, *masterobj; 885 uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid; 886 bool fakedroot; 887 888 /* 889 * This dataset's mountpoint doesn't exist in the staging tree, or the 890 * dataset doesn't have a mountpoint at all. In either case we still 891 * need a root directory. Fake up a root fsnode to handle this case. 892 */ 893 fakedroot = root == NULL; 894 if (fakedroot) { 895 struct stat *stp; 896 897 assert(dirfd == -1); 898 899 root = ecalloc(1, sizeof(*root)); 900 root->inode = ecalloc(1, sizeof(*root->inode)); 901 root->name = estrdup("."); 902 root->type = S_IFDIR; 903 904 stp = &root->inode->st; 905 stp->st_uid = 0; 906 stp->st_gid = 0; 907 stp->st_mode = S_IFDIR | 0755; 908 } 909 assert(root->type == S_IFDIR); 910 assert(fsnode_isroot(root)); 911 912 /* 913 * Initialize the object set for this dataset. 914 */ 915 os = objset_alloc(zfs, DMU_OST_ZFS); 916 masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid); 917 assert(moid == MASTER_NODE_OBJ); 918 919 memset(&fs, 0, sizeof(fs)); 920 fs.os = os; 921 922 /* 923 * Create the ZAP SA layout now since filesystem object dnodes will 924 * refer to those attributes. 925 */ 926 saobjid = fs_set_zpl_attrs(zfs, &fs); 927 928 /* 929 * Make a pass over the staged directory to detect hard links and assign 930 * virtual dnode numbers. 931 */ 932 dnodecount = 1; /* root directory */ 933 fsnode_foreach(root, fs_foreach_mark, &dnodecount); 934 935 /* 936 * Make a second pass to populate the dataset with files from the 937 * staged directory. Most of our runtime is spent here. 938 */ 939 arg.rootdirfd = dirfd; 940 arg.zfs = zfs; 941 arg.fs = &fs; 942 SLIST_INIT(&arg.dirs); 943 fs_populate_dir(root, &arg); 944 assert(!SLIST_EMPTY(&arg.dirs)); 945 fsnode_foreach(root, fs_foreach_populate, &arg); 946 assert(SLIST_EMPTY(&arg.dirs)); 947 rootdirid = arg.rootdirid; 948 949 /* 950 * Create an empty delete queue. We don't do anything with it, but 951 * OpenZFS will refuse to mount filesystems that don't have one. 952 */ 953 deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid); 954 zap_write(zfs, zap_alloc(os, deleteq)); 955 956 /* 957 * Populate and write the master node object. This is a ZAP object 958 * containing various dataset properties and the object IDs of the root 959 * directory and delete queue. 960 */ 961 masterzap = zap_alloc(os, masterobj); 962 zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid); 963 zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid); 964 zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid); 965 zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */); 966 zap_add_uint64(masterzap, "normalization", 0 /* off */); 967 zap_add_uint64(masterzap, "utf8only", 0 /* off */); 968 zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */); 969 zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */); 970 zap_write(zfs, masterzap); 971 972 /* 973 * All finished with this object set, we may as well write it now. 974 * The DSL layer will sum up the bytes consumed by each dataset using 975 * information stored in the object set, so it can't be freed just yet. 976 */ 977 dsl_dir_dataset_write(zfs, os, dsldir); 978 979 if (fakedroot) { 980 free(root->inode); 981 free(root->name); 982 free(root); 983 } 984 free(fs.saoffs); 985 } 986 987 /* 988 * Create an object set for each DSL directory which has a dataset and doesn't 989 * already have an object set. 990 */ 991 static void 992 fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused) 993 { 994 if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir)) 995 fs_build_one(zfs, dsldir, NULL, -1); 996 } 997 998 /* 999 * Create our datasets and populate them with files. 1000 */ 1001 void 1002 fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root) 1003 { 1004 /* 1005 * Run through our datasets and find the root fsnode for each one. Each 1006 * root fsnode is flagged so that we can figure out which dataset it 1007 * belongs to. 1008 */ 1009 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root); 1010 1011 /* 1012 * Did we find our boot filesystem? 1013 */ 1014 if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs")) 1015 errx(1, "no mounted dataset matches bootfs property `%s'", 1016 zfs->bootfs); 1017 1018 /* 1019 * Traverse the file hierarchy starting from the root fsnode. One 1020 * dataset, not necessarily the root dataset, must "own" the root 1021 * directory by having its mountpoint be equal to the root path. 1022 * 1023 * As roots of other datasets are encountered during the traversal, 1024 * fs_build_one() recursively creates the corresponding object sets and 1025 * populates them. Once this function has returned, all datasets will 1026 * have been fully populated. 1027 */ 1028 fs_build_one(zfs, root->inode->param, root, dirfd); 1029 1030 /* 1031 * Now create object sets for datasets whose mountpoints weren't found 1032 * in the staging directory, either because there is no mountpoint, or 1033 * because the mountpoint doesn't correspond to an existing directory. 1034 */ 1035 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL); 1036 } 1037