1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause 3 * 4 * Copyright (c) 2022 The FreeBSD Foundation 5 * 6 * This software was developed by Mark Johnston under sponsorship from 7 * the FreeBSD Foundation. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions are 11 * met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in 16 * the documentation and/or other materials provided with the distribution. 17 * 18 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 21 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 22 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 23 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 24 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 25 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 26 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 27 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 28 * SUCH DAMAGE. 29 */ 30 31 #include <sys/param.h> 32 #include <sys/stat.h> 33 34 #include <assert.h> 35 #include <dirent.h> 36 #include <fcntl.h> 37 #include <stdlib.h> 38 #include <string.h> 39 #include <unistd.h> 40 41 #include <util.h> 42 43 #include "makefs.h" 44 #include "zfs.h" 45 46 typedef struct { 47 const char *name; 48 unsigned int id; 49 uint16_t size; 50 sa_bswap_type_t bs; 51 } zfs_sattr_t; 52 53 typedef struct zfs_fs { 54 zfs_objset_t *os; 55 56 /* Offset table for system attributes, indexed by a zpl_attr_t. */ 57 uint16_t *saoffs; 58 size_t sacnt; 59 const zfs_sattr_t *satab; 60 } zfs_fs_t; 61 62 /* 63 * The order of the attributes doesn't matter, this is simply the one hard-coded 64 * by OpenZFS, based on a zdb dump of the SA_REGISTRY table. 65 */ 66 typedef enum zpl_attr { 67 ZPL_ATIME, 68 ZPL_MTIME, 69 ZPL_CTIME, 70 ZPL_CRTIME, 71 ZPL_GEN, 72 ZPL_MODE, 73 ZPL_SIZE, 74 ZPL_PARENT, 75 ZPL_LINKS, 76 ZPL_XATTR, 77 ZPL_RDEV, 78 ZPL_FLAGS, 79 ZPL_UID, 80 ZPL_GID, 81 ZPL_PAD, 82 ZPL_ZNODE_ACL, 83 ZPL_DACL_COUNT, 84 ZPL_SYMLINK, 85 ZPL_SCANSTAMP, 86 ZPL_DACL_ACES, 87 ZPL_DXATTR, 88 ZPL_PROJID, 89 } zpl_attr_t; 90 91 /* 92 * This table must be kept in sync with zpl_attr_layout[] and zpl_attr_t. 93 */ 94 static const zfs_sattr_t zpl_attrs[] = { 95 #define _ZPL_ATTR(n, s, b) { .name = #n, .id = n, .size = s, .bs = b } 96 _ZPL_ATTR(ZPL_ATIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 97 _ZPL_ATTR(ZPL_MTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 98 _ZPL_ATTR(ZPL_CTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 99 _ZPL_ATTR(ZPL_CRTIME, sizeof(uint64_t) * 2, SA_UINT64_ARRAY), 100 _ZPL_ATTR(ZPL_GEN, sizeof(uint64_t), SA_UINT64_ARRAY), 101 _ZPL_ATTR(ZPL_MODE, sizeof(uint64_t), SA_UINT64_ARRAY), 102 _ZPL_ATTR(ZPL_SIZE, sizeof(uint64_t), SA_UINT64_ARRAY), 103 _ZPL_ATTR(ZPL_PARENT, sizeof(uint64_t), SA_UINT64_ARRAY), 104 _ZPL_ATTR(ZPL_LINKS, sizeof(uint64_t), SA_UINT64_ARRAY), 105 _ZPL_ATTR(ZPL_XATTR, sizeof(uint64_t), SA_UINT64_ARRAY), 106 _ZPL_ATTR(ZPL_RDEV, sizeof(uint64_t), SA_UINT64_ARRAY), 107 _ZPL_ATTR(ZPL_FLAGS, sizeof(uint64_t), SA_UINT64_ARRAY), 108 _ZPL_ATTR(ZPL_UID, sizeof(uint64_t), SA_UINT64_ARRAY), 109 _ZPL_ATTR(ZPL_GID, sizeof(uint64_t), SA_UINT64_ARRAY), 110 _ZPL_ATTR(ZPL_PAD, sizeof(uint64_t), SA_UINT64_ARRAY), 111 _ZPL_ATTR(ZPL_ZNODE_ACL, 88, SA_UINT64_ARRAY), 112 _ZPL_ATTR(ZPL_DACL_COUNT, sizeof(uint64_t), SA_UINT64_ARRAY), 113 _ZPL_ATTR(ZPL_SYMLINK, 0, SA_UINT8_ARRAY), 114 _ZPL_ATTR(ZPL_SCANSTAMP, sizeof(uint64_t) * 4, SA_UINT8_ARRAY), 115 _ZPL_ATTR(ZPL_DACL_ACES, 0, SA_ACL), 116 _ZPL_ATTR(ZPL_DXATTR, 0, SA_UINT8_ARRAY), 117 _ZPL_ATTR(ZPL_PROJID, sizeof(uint64_t), SA_UINT64_ARRAY), 118 #undef ZPL_ATTR 119 }; 120 121 /* 122 * This layout matches that of a filesystem created using OpenZFS on FreeBSD. 123 * It need not match in general, but FreeBSD's loader doesn't bother parsing the 124 * layout and just hard-codes attribute offsets. 125 */ 126 static const sa_attr_type_t zpl_attr_layout[] = { 127 ZPL_MODE, 128 ZPL_SIZE, 129 ZPL_GEN, 130 ZPL_UID, 131 ZPL_GID, 132 ZPL_PARENT, 133 ZPL_FLAGS, 134 ZPL_ATIME, 135 ZPL_MTIME, 136 ZPL_CTIME, 137 ZPL_CRTIME, 138 ZPL_LINKS, 139 ZPL_DACL_COUNT, 140 ZPL_DACL_ACES, 141 ZPL_SYMLINK, 142 }; 143 144 /* 145 * Keys for the ZPL attribute tables in the SA layout ZAP. The first two 146 * indices are reserved for legacy attribute encoding. 147 */ 148 #define SA_LAYOUT_INDEX_DEFAULT 2 149 #define SA_LAYOUT_INDEX_SYMLINK 3 150 151 struct fs_populate_dir { 152 SLIST_ENTRY(fs_populate_dir) next; 153 int dirfd; 154 uint64_t objid; 155 zfs_zap_t *zap; 156 }; 157 158 struct fs_populate_arg { 159 zfs_opt_t *zfs; 160 zfs_fs_t *fs; /* owning filesystem */ 161 uint64_t rootdirid; /* root directory dnode ID */ 162 int rootdirfd; /* root directory fd */ 163 SLIST_HEAD(, fs_populate_dir) dirs; /* stack of directories */ 164 }; 165 166 static void fs_build_one(zfs_opt_t *, zfs_dsl_dir_t *, fsnode *, int); 167 168 static void 169 eclose(int fd) 170 { 171 if (close(fd) != 0) 172 err(1, "close"); 173 } 174 175 static bool 176 fsnode_isroot(const fsnode *cur) 177 { 178 return (strcmp(cur->name, ".") == 0); 179 } 180 181 static bool 182 fsnode_valid(const fsnode *cur) 183 { 184 return (cur->type == S_IFREG || cur->type == S_IFDIR || 185 cur->type == S_IFLNK); 186 } 187 188 /* 189 * Visit each node in a directory hierarchy, in pre-order depth-first order. 190 */ 191 static void 192 fsnode_foreach(fsnode *root, int (*cb)(fsnode *, void *), void *arg) 193 { 194 assert(root->type == S_IFDIR); 195 196 for (fsnode *cur = root; cur != NULL; cur = cur->next) { 197 if (!fsnode_valid(cur)) { 198 warnx("skipping unhandled %s %s/%s", 199 inode_type(cur->type), cur->path, cur->name); 200 continue; 201 } 202 if (cb(cur, arg) == 0) 203 continue; 204 if (cur->type == S_IFDIR && cur->child != NULL) 205 fsnode_foreach(cur->child, cb, arg); 206 } 207 } 208 209 static void 210 fs_populate_dirent(struct fs_populate_arg *arg, fsnode *cur, uint64_t dnid) 211 { 212 struct fs_populate_dir *dir; 213 uint64_t type; 214 215 switch (cur->type) { 216 case S_IFREG: 217 type = DT_REG; 218 break; 219 case S_IFDIR: 220 type = DT_DIR; 221 break; 222 case S_IFLNK: 223 type = DT_LNK; 224 break; 225 default: 226 assert(0); 227 } 228 229 dir = SLIST_FIRST(&arg->dirs); 230 zap_add_uint64(dir->zap, cur->name, ZFS_DIRENT_MAKE(type, dnid)); 231 } 232 233 static void 234 fs_populate_attr(zfs_fs_t *fs, char *attrbuf, const void *val, uint16_t ind, 235 size_t *szp) 236 { 237 assert(ind < fs->sacnt); 238 assert(fs->saoffs[ind] != 0xffff); 239 240 memcpy(attrbuf + fs->saoffs[ind], val, fs->satab[ind].size); 241 *szp += fs->satab[ind].size; 242 } 243 244 static void 245 fs_populate_varszattr(zfs_fs_t *fs, char *attrbuf, const void *val, 246 size_t valsz, size_t varoff, uint16_t ind, size_t *szp) 247 { 248 assert(ind < fs->sacnt); 249 assert(fs->saoffs[ind] != 0xffff); 250 assert(fs->satab[ind].size == 0); 251 252 memcpy(attrbuf + fs->saoffs[ind] + varoff, val, valsz); 253 *szp += valsz; 254 } 255 256 /* 257 * Derive the relative fd/path combo needed to access a file. Ideally we'd 258 * always be able to use relative lookups (i.e., use the *at() system calls), 259 * since they require less path translation and are more amenable to sandboxing, 260 * but the handling of multiple staging directories makes that difficult. To 261 * make matters worse, we have no choice but to use relative lookups when 262 * dealing with an mtree manifest, so both mechanisms are implemented. 263 */ 264 static void 265 fs_populate_path(const fsnode *cur, struct fs_populate_arg *arg, 266 char *path, size_t sz, int *dirfdp) 267 { 268 if (cur->contents != NULL) { 269 size_t n; 270 271 *dirfdp = AT_FDCWD; 272 n = strlcpy(path, cur->contents, sz); 273 assert(n < sz); 274 } else if (cur->root == NULL) { 275 size_t n; 276 277 *dirfdp = SLIST_FIRST(&arg->dirs)->dirfd; 278 n = strlcpy(path, cur->name, sz); 279 assert(n < sz); 280 } else { 281 int n; 282 283 *dirfdp = AT_FDCWD; 284 n = snprintf(path, sz, "%s/%s/%s", 285 cur->root, cur->path, cur->name); 286 assert(n >= 0); 287 assert((size_t)n < sz); 288 } 289 } 290 291 static int 292 fs_open(const fsnode *cur, struct fs_populate_arg *arg, int flags) 293 { 294 char path[PATH_MAX]; 295 int fd; 296 297 fs_populate_path(cur, arg, path, sizeof(path), &fd); 298 299 fd = openat(fd, path, flags); 300 if (fd < 0) 301 err(1, "openat(%s)", path); 302 return (fd); 303 } 304 305 static int 306 fs_open_can_fail(const fsnode *cur, struct fs_populate_arg *arg, int flags) 307 { 308 int fd; 309 char path[PATH_MAX]; 310 311 fs_populate_path(cur, arg, path, sizeof(path), &fd); 312 313 return (openat(fd, path, flags)); 314 } 315 316 static void 317 fs_readlink(const fsnode *cur, struct fs_populate_arg *arg, 318 char *buf, size_t bufsz) 319 { 320 char path[PATH_MAX]; 321 int fd; 322 323 if (cur->symlink != NULL) { 324 size_t n; 325 326 n = strlcpy(buf, cur->symlink, bufsz); 327 assert(n < bufsz); 328 } else { 329 ssize_t n; 330 331 fs_populate_path(cur, arg, path, sizeof(path), &fd); 332 333 n = readlinkat(fd, path, buf, bufsz - 1); 334 if (n == -1) 335 err(1, "readlinkat(%s)", cur->name); 336 buf[n] = '\0'; 337 } 338 } 339 340 static void 341 fs_populate_time(zfs_fs_t *fs, char *attrbuf, struct timespec *ts, 342 uint16_t ind, size_t *szp) 343 { 344 uint64_t timebuf[2]; 345 346 assert(ind < fs->sacnt); 347 assert(fs->saoffs[ind] != 0xffff); 348 assert(fs->satab[ind].size == sizeof(timebuf)); 349 350 timebuf[0] = ts->tv_sec; 351 timebuf[1] = ts->tv_nsec; 352 fs_populate_attr(fs, attrbuf, timebuf, ind, szp); 353 } 354 355 static void 356 fs_populate_sattrs(struct fs_populate_arg *arg, const fsnode *cur, 357 dnode_phys_t *dnode) 358 { 359 char target[PATH_MAX]; 360 zfs_fs_t *fs; 361 zfs_ace_hdr_t aces[3]; 362 struct stat *sb; 363 sa_hdr_phys_t *sahdr; 364 uint64_t daclcount, flags, gen, gid, links, mode, parent, objsize, uid; 365 char *attrbuf; 366 size_t bonussz, hdrsz; 367 int layout; 368 369 assert(dnode->dn_bonustype == DMU_OT_SA); 370 assert(dnode->dn_nblkptr == 1); 371 372 fs = arg->fs; 373 sb = &cur->inode->st; 374 375 switch (cur->type) { 376 case S_IFREG: 377 layout = SA_LAYOUT_INDEX_DEFAULT; 378 links = cur->inode->nlink; 379 objsize = sb->st_size; 380 parent = SLIST_FIRST(&arg->dirs)->objid; 381 break; 382 case S_IFDIR: 383 layout = SA_LAYOUT_INDEX_DEFAULT; 384 links = 1; /* .. */ 385 objsize = 1; /* .. */ 386 387 if ((cur->inode->flags & FI_ROOT) == 0 ) { 388 /* 389 * The size of a ZPL directory is the number of entries 390 * (including "." and ".."), and the link count is the 391 * number of entries which are directories 392 * (including "." and ".."). 393 */ 394 for (fsnode *c = 395 fsnode_isroot(cur) ? cur->next : cur->child; 396 c != NULL; c = c->next) { 397 switch (c->type) { 398 case S_IFDIR: 399 links++; 400 /* FALLTHROUGH */ 401 case S_IFREG: 402 case S_IFLNK: 403 objsize++; 404 break; 405 } 406 } 407 } else { 408 /* 409 * Root directory children do belong to 410 * different dataset and this directory is 411 * empty in the current objset. 412 */ 413 links++; /* . */ 414 objsize++; /* . */ 415 } 416 417 /* The root directory is its own parent. */ 418 parent = SLIST_EMPTY(&arg->dirs) ? 419 arg->rootdirid : SLIST_FIRST(&arg->dirs)->objid; 420 break; 421 case S_IFLNK: 422 fs_readlink(cur, arg, target, sizeof(target)); 423 424 layout = SA_LAYOUT_INDEX_SYMLINK; 425 links = 1; 426 objsize = strlen(target); 427 parent = SLIST_FIRST(&arg->dirs)->objid; 428 break; 429 default: 430 assert(0); 431 } 432 433 daclcount = nitems(aces); 434 flags = ZFS_ACL_TRIVIAL | ZFS_ACL_AUTO_INHERIT | ZFS_ARCHIVE | 435 ZFS_AV_MODIFIED; 436 gen = 1; 437 gid = sb->st_gid; 438 mode = sb->st_mode; 439 uid = sb->st_uid; 440 441 memset(aces, 0, sizeof(aces)); 442 aces[0].z_flags = ACE_OWNER; 443 aces[0].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 444 aces[0].z_access_mask = ACE_WRITE_ATTRIBUTES | ACE_WRITE_OWNER | 445 ACE_WRITE_ACL | ACE_WRITE_NAMED_ATTRS | ACE_READ_ACL | 446 ACE_READ_ATTRIBUTES | ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 447 if ((mode & S_IRUSR) != 0) 448 aces[0].z_access_mask |= ACE_READ_DATA; 449 if ((mode & S_IWUSR) != 0) 450 aces[0].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 451 if ((mode & S_IXUSR) != 0) 452 aces[0].z_access_mask |= ACE_EXECUTE; 453 454 aces[1].z_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP; 455 aces[1].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 456 aces[1].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | 457 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 458 if ((mode & S_IRGRP) != 0) 459 aces[1].z_access_mask |= ACE_READ_DATA; 460 if ((mode & S_IWGRP) != 0) 461 aces[1].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 462 if ((mode & S_IXGRP) != 0) 463 aces[1].z_access_mask |= ACE_EXECUTE; 464 465 aces[2].z_flags = ACE_EVERYONE; 466 aces[2].z_type = ACE_ACCESS_ALLOWED_ACE_TYPE; 467 aces[2].z_access_mask = ACE_READ_ACL | ACE_READ_ATTRIBUTES | 468 ACE_READ_NAMED_ATTRS | ACE_SYNCHRONIZE; 469 if ((mode & S_IROTH) != 0) 470 aces[2].z_access_mask |= ACE_READ_DATA; 471 if ((mode & S_IWOTH) != 0) 472 aces[2].z_access_mask |= ACE_WRITE_DATA | ACE_APPEND_DATA; 473 if ((mode & S_IXOTH) != 0) 474 aces[2].z_access_mask |= ACE_EXECUTE; 475 476 switch (layout) { 477 case SA_LAYOUT_INDEX_DEFAULT: 478 /* At most one variable-length attribute. */ 479 hdrsz = sizeof(uint64_t); 480 break; 481 case SA_LAYOUT_INDEX_SYMLINK: 482 /* At most five variable-length attributes. */ 483 hdrsz = sizeof(uint64_t) * 2; 484 break; 485 default: 486 assert(0); 487 } 488 489 sahdr = (sa_hdr_phys_t *)DN_BONUS(dnode); 490 sahdr->sa_magic = SA_MAGIC; 491 SA_HDR_LAYOUT_INFO_ENCODE(sahdr->sa_layout_info, layout, hdrsz); 492 493 bonussz = SA_HDR_SIZE(sahdr); 494 attrbuf = (char *)sahdr + SA_HDR_SIZE(sahdr); 495 496 fs_populate_attr(fs, attrbuf, &daclcount, ZPL_DACL_COUNT, &bonussz); 497 fs_populate_attr(fs, attrbuf, &flags, ZPL_FLAGS, &bonussz); 498 fs_populate_attr(fs, attrbuf, &gen, ZPL_GEN, &bonussz); 499 fs_populate_attr(fs, attrbuf, &gid, ZPL_GID, &bonussz); 500 fs_populate_attr(fs, attrbuf, &links, ZPL_LINKS, &bonussz); 501 fs_populate_attr(fs, attrbuf, &mode, ZPL_MODE, &bonussz); 502 fs_populate_attr(fs, attrbuf, &parent, ZPL_PARENT, &bonussz); 503 fs_populate_attr(fs, attrbuf, &objsize, ZPL_SIZE, &bonussz); 504 fs_populate_attr(fs, attrbuf, &uid, ZPL_UID, &bonussz); 505 506 /* 507 * We deliberately set atime = mtime here to ensure that images are 508 * reproducible. 509 */ 510 fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_ATIME, &bonussz); 511 fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CTIME, &bonussz); 512 fs_populate_time(fs, attrbuf, &sb->st_mtim, ZPL_MTIME, &bonussz); 513 #ifdef __linux__ 514 /* Linux has no st_birthtim; approximate with st_ctim */ 515 fs_populate_time(fs, attrbuf, &sb->st_ctim, ZPL_CRTIME, &bonussz); 516 #else 517 fs_populate_time(fs, attrbuf, &sb->st_birthtim, ZPL_CRTIME, &bonussz); 518 #endif 519 520 fs_populate_varszattr(fs, attrbuf, aces, sizeof(aces), 0, 521 ZPL_DACL_ACES, &bonussz); 522 sahdr->sa_lengths[0] = sizeof(aces); 523 524 if (cur->type == S_IFLNK) { 525 assert(layout == SA_LAYOUT_INDEX_SYMLINK); 526 /* Need to use a spill block pointer if the target is long. */ 527 assert(bonussz + objsize <= DN_OLD_MAX_BONUSLEN); 528 fs_populate_varszattr(fs, attrbuf, target, objsize, 529 sahdr->sa_lengths[0], ZPL_SYMLINK, &bonussz); 530 sahdr->sa_lengths[1] = (uint16_t)objsize; 531 } 532 533 dnode->dn_bonuslen = bonussz; 534 } 535 536 static void 537 fs_populate_file(fsnode *cur, struct fs_populate_arg *arg) 538 { 539 struct dnode_cursor *c; 540 dnode_phys_t *dnode; 541 zfs_opt_t *zfs; 542 char *buf; 543 uint64_t dnid; 544 ssize_t n; 545 size_t bufsz; 546 off_t nbytes, reqbytes, size; 547 int fd; 548 549 assert(cur->type == S_IFREG); 550 assert((cur->inode->flags & FI_ROOT) == 0); 551 552 zfs = arg->zfs; 553 554 assert(cur->inode->ino != 0); 555 if ((cur->inode->flags & FI_ALLOCATED) != 0) { 556 /* 557 * This is a hard link of an existing file. 558 * 559 * XXX-MJ need to check whether it crosses datasets, add a test 560 * case for that 561 */ 562 fs_populate_dirent(arg, cur, cur->inode->ino); 563 return; 564 } 565 566 dnode = objset_dnode_bonus_alloc(arg->fs->os, 567 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); 568 cur->inode->ino = dnid; 569 cur->inode->flags |= FI_ALLOCATED; 570 571 fd = fs_open(cur, arg, O_RDONLY); 572 573 buf = zfs->filebuf; 574 bufsz = sizeof(zfs->filebuf); 575 size = cur->inode->st.st_size; 576 c = dnode_cursor_init(zfs, arg->fs->os, dnode, size, 0); 577 for (off_t foff = 0; foff < size; foff += nbytes) { 578 off_t loc, sofar; 579 580 /* 581 * Fill up our buffer, handling partial reads. 582 */ 583 sofar = 0; 584 nbytes = MIN(size - foff, (off_t)bufsz); 585 do { 586 n = read(fd, buf + sofar, nbytes); 587 if (n < 0) 588 err(1, "reading from '%s'", cur->name); 589 if (n == 0) 590 errx(1, "unexpected EOF reading '%s'", 591 cur->name); 592 sofar += n; 593 } while (sofar < nbytes); 594 595 if (nbytes < (off_t)bufsz) 596 memset(buf + nbytes, 0, bufsz - nbytes); 597 598 reqbytes = foff == 0 ? nbytes : MAXBLOCKSIZE; 599 loc = objset_space_alloc(zfs, arg->fs->os, &reqbytes); 600 vdev_pwrite_dnode_indir(zfs, dnode, 0, 1, buf, reqbytes, loc, 601 dnode_cursor_next(zfs, c, foff)); 602 } 603 eclose(fd); 604 dnode_cursor_finish(zfs, c); 605 606 fs_populate_sattrs(arg, cur, dnode); 607 fs_populate_dirent(arg, cur, dnid); 608 } 609 610 static void 611 fs_populate_dir(fsnode *cur, struct fs_populate_arg *arg) 612 { 613 dnode_phys_t *dnode; 614 zfs_objset_t *os; 615 uint64_t dnid; 616 int dirfd; 617 618 assert(cur->type == S_IFDIR); 619 assert((cur->inode->flags & FI_ALLOCATED) == 0); 620 621 os = arg->fs->os; 622 623 dnode = objset_dnode_bonus_alloc(os, DMU_OT_DIRECTORY_CONTENTS, 624 DMU_OT_SA, 0, &dnid); 625 626 /* 627 * Add an entry to the parent directory and open this directory. 628 */ 629 if (!SLIST_EMPTY(&arg->dirs)) { 630 fs_populate_dirent(arg, cur, dnid); 631 /* 632 * We only need the directory fd if we're finding files in 633 * it. If it's just there for other directories or 634 * files using contents= we don't need to succeed here. 635 */ 636 dirfd = fs_open_can_fail(cur, arg, O_DIRECTORY | O_RDONLY); 637 } else { 638 arg->rootdirid = dnid; 639 dirfd = arg->rootdirfd; 640 arg->rootdirfd = -1; 641 } 642 643 /* 644 * Set ZPL attributes. 645 */ 646 fs_populate_sattrs(arg, cur, dnode); 647 648 /* 649 * If this is a root directory, then its children belong to a different 650 * dataset and this directory remains empty in the current objset. 651 */ 652 if ((cur->inode->flags & FI_ROOT) == 0) { 653 struct fs_populate_dir *dir; 654 655 dir = ecalloc(1, sizeof(*dir)); 656 dir->dirfd = dirfd; 657 dir->objid = dnid; 658 dir->zap = zap_alloc(os, dnode); 659 SLIST_INSERT_HEAD(&arg->dirs, dir, next); 660 } else { 661 zap_write(arg->zfs, zap_alloc(os, dnode)); 662 fs_build_one(arg->zfs, cur->inode->param, cur->child, dirfd); 663 } 664 } 665 666 static void 667 fs_populate_symlink(fsnode *cur, struct fs_populate_arg *arg) 668 { 669 dnode_phys_t *dnode; 670 uint64_t dnid; 671 672 assert(cur->type == S_IFLNK); 673 assert((cur->inode->flags & (FI_ALLOCATED | FI_ROOT)) == 0); 674 675 dnode = objset_dnode_bonus_alloc(arg->fs->os, 676 DMU_OT_PLAIN_FILE_CONTENTS, DMU_OT_SA, 0, &dnid); 677 678 fs_populate_dirent(arg, cur, dnid); 679 680 fs_populate_sattrs(arg, cur, dnode); 681 } 682 683 static fsnode * 684 fsnode_next(fsnode *cur) 685 { 686 for (cur = cur->next; cur != NULL; cur = cur->next) { 687 if (fsnode_valid(cur)) 688 return (cur); 689 } 690 return (NULL); 691 } 692 693 static int 694 fs_foreach_populate(fsnode *cur, void *_arg) 695 { 696 struct fs_populate_arg *arg; 697 struct fs_populate_dir *dir; 698 int ret; 699 700 arg = _arg; 701 switch (cur->type) { 702 case S_IFREG: 703 fs_populate_file(cur, arg); 704 break; 705 case S_IFDIR: 706 if (fsnode_isroot(cur)) 707 break; 708 fs_populate_dir(cur, arg); 709 break; 710 case S_IFLNK: 711 fs_populate_symlink(cur, arg); 712 break; 713 default: 714 assert(0); 715 } 716 717 ret = (cur->inode->flags & FI_ROOT) != 0 ? 0 : 1; 718 719 if (fsnode_next(cur) == NULL && 720 (cur->child == NULL || (cur->inode->flags & FI_ROOT) != 0)) { 721 /* 722 * We reached a terminal node in a subtree. Walk back up and 723 * write out directories. We're done once we hit the root of a 724 * dataset or find a level where we're not on the edge of the 725 * tree. 726 */ 727 do { 728 dir = SLIST_FIRST(&arg->dirs); 729 SLIST_REMOVE_HEAD(&arg->dirs, next); 730 zap_write(arg->zfs, dir->zap); 731 if (dir->dirfd != -1) 732 eclose(dir->dirfd); 733 free(dir); 734 cur = cur->parent; 735 } while (cur != NULL && fsnode_next(cur) == NULL && 736 (cur->inode->flags & FI_ROOT) == 0); 737 } 738 739 return (ret); 740 } 741 742 static void 743 fs_add_zpl_attr_layout(zfs_zap_t *zap, unsigned int index, 744 const sa_attr_type_t layout[], size_t sacnt) 745 { 746 char ti[16]; 747 748 assert(sizeof(layout[0]) == 2); 749 750 (void)snprintf(ti, sizeof(ti), "%u", index); 751 zap_add(zap, ti, sizeof(sa_attr_type_t), sacnt, 752 (const uint8_t *)layout); 753 } 754 755 /* 756 * Initialize system attribute tables. 757 * 758 * There are two elements to this. First, we write the zpl_attrs[] and 759 * zpl_attr_layout[] tables to disk. Then we create a lookup table which 760 * allows us to set file attributes quickly. 761 */ 762 static uint64_t 763 fs_set_zpl_attrs(zfs_opt_t *zfs, zfs_fs_t *fs) 764 { 765 zfs_zap_t *sazap, *salzap, *sarzap; 766 zfs_objset_t *os; 767 dnode_phys_t *saobj, *salobj, *sarobj; 768 uint64_t saobjid, salobjid, sarobjid; 769 uint16_t offset; 770 771 os = fs->os; 772 773 /* 774 * The on-disk tables are stored in two ZAP objects, the registry object 775 * and the layout object. Individual attributes are described by 776 * entries in the registry object; for example, the value for the 777 * "ZPL_SIZE" key gives the size and encoding of the ZPL_SIZE attribute. 778 * The attributes of a file are ordered according to one of the layouts 779 * defined in the layout object. The master node object is simply used 780 * to locate the registry and layout objects. 781 */ 782 saobj = objset_dnode_alloc(os, DMU_OT_SA_MASTER_NODE, &saobjid); 783 salobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_LAYOUTS, &salobjid); 784 sarobj = objset_dnode_alloc(os, DMU_OT_SA_ATTR_REGISTRATION, &sarobjid); 785 786 sarzap = zap_alloc(os, sarobj); 787 for (size_t i = 0; i < nitems(zpl_attrs); i++) { 788 const zfs_sattr_t *sa; 789 uint64_t attr; 790 791 attr = 0; 792 sa = &zpl_attrs[i]; 793 SA_ATTR_ENCODE(attr, (uint64_t)i, sa->size, sa->bs); 794 zap_add_uint64(sarzap, sa->name, attr); 795 } 796 zap_write(zfs, sarzap); 797 798 /* 799 * Layouts are arrays of indices into the registry. We define two 800 * layouts for use by the ZPL, one for non-symlinks and one for 801 * symlinks. They are identical except that the symlink layout includes 802 * ZPL_SYMLINK as its final attribute. 803 */ 804 salzap = zap_alloc(os, salobj); 805 assert(zpl_attr_layout[nitems(zpl_attr_layout) - 1] == ZPL_SYMLINK); 806 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_DEFAULT, 807 zpl_attr_layout, nitems(zpl_attr_layout) - 1); 808 fs_add_zpl_attr_layout(salzap, SA_LAYOUT_INDEX_SYMLINK, 809 zpl_attr_layout, nitems(zpl_attr_layout)); 810 zap_write(zfs, salzap); 811 812 sazap = zap_alloc(os, saobj); 813 zap_add_uint64(sazap, SA_LAYOUTS, salobjid); 814 zap_add_uint64(sazap, SA_REGISTRY, sarobjid); 815 zap_write(zfs, sazap); 816 817 /* Sanity check. */ 818 for (size_t i = 0; i < nitems(zpl_attrs); i++) 819 assert(i == zpl_attrs[i].id); 820 821 /* 822 * Build the offset table used when setting file attributes. File 823 * attributes are stored in the object's bonus buffer; this table 824 * provides the buffer offset of attributes referenced by the layout 825 * table. 826 */ 827 fs->sacnt = nitems(zpl_attrs); 828 fs->saoffs = ecalloc(fs->sacnt, sizeof(*fs->saoffs)); 829 for (size_t i = 0; i < fs->sacnt; i++) 830 fs->saoffs[i] = 0xffff; 831 offset = 0; 832 for (size_t i = 0; i < nitems(zpl_attr_layout); i++) { 833 uint16_t size; 834 835 assert(zpl_attr_layout[i] < fs->sacnt); 836 837 fs->saoffs[zpl_attr_layout[i]] = offset; 838 size = zpl_attrs[zpl_attr_layout[i]].size; 839 offset += size; 840 } 841 fs->satab = zpl_attrs; 842 843 return (saobjid); 844 } 845 846 static void 847 fs_layout_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg) 848 { 849 char *mountpoint, *origmountpoint, *name, *next; 850 fsnode *cur, *root; 851 uint64_t canmount; 852 853 if (!dsl_dir_has_dataset(dsldir)) 854 return; 855 856 if (dsl_dir_get_canmount(dsldir, &canmount) == 0 && canmount == 0) 857 return; 858 mountpoint = dsl_dir_get_mountpoint(zfs, dsldir); 859 if (mountpoint == NULL) 860 return; 861 862 /* 863 * If we were asked to specify a bootfs, set it here. 864 */ 865 if (zfs->bootfs != NULL && strcmp(zfs->bootfs, 866 dsl_dir_fullname(dsldir)) == 0) { 867 zap_add_uint64(zfs->poolprops, "bootfs", 868 dsl_dir_dataset_id(dsldir)); 869 } 870 871 origmountpoint = mountpoint; 872 873 /* 874 * Figure out which fsnode corresponds to our mountpoint. 875 */ 876 root = arg; 877 cur = root; 878 if (strcmp(mountpoint, zfs->rootpath) != 0) { 879 mountpoint += strlen(zfs->rootpath); 880 881 /* 882 * Look up the directory in the staged tree. For example, if 883 * the dataset's mount point is /foo/bar/baz, we'll search the 884 * root directory for "foo", search "foo" for "baz", and so on. 885 * Each intermediate name must refer to a directory; the final 886 * component need not exist. 887 */ 888 cur = root; 889 for (next = name = mountpoint; next != NULL;) { 890 for (; *next == '/'; next++) 891 ; 892 name = strsep(&next, "/"); 893 894 for (; cur != NULL && strcmp(cur->name, name) != 0; 895 cur = cur->next) 896 ; 897 if (cur == NULL) { 898 if (next == NULL) 899 break; 900 errx(1, "missing mountpoint directory for `%s'", 901 dsl_dir_fullname(dsldir)); 902 } 903 if (cur->type != S_IFDIR) { 904 errx(1, 905 "mountpoint for `%s' is not a directory", 906 dsl_dir_fullname(dsldir)); 907 } 908 if (next != NULL) 909 cur = cur->child; 910 } 911 } 912 913 if (cur != NULL) { 914 assert(cur->type == S_IFDIR); 915 916 /* 917 * Multiple datasets shouldn't share a mountpoint. It's 918 * technically allowed, but it's not clear what makefs should do 919 * in that case. 920 */ 921 assert((cur->inode->flags & FI_ROOT) == 0); 922 if (cur != root) 923 cur->inode->flags |= FI_ROOT; 924 assert(cur->inode->param == NULL); 925 cur->inode->param = dsldir; 926 } 927 928 free(origmountpoint); 929 } 930 931 static int 932 fs_foreach_mark(fsnode *cur, void *arg) 933 { 934 uint64_t *countp; 935 936 countp = arg; 937 if (cur->type == S_IFDIR && fsnode_isroot(cur)) 938 return (1); 939 940 if (cur->inode->ino == 0) { 941 cur->inode->ino = ++(*countp); 942 cur->inode->nlink = 1; 943 } else { 944 cur->inode->nlink++; 945 } 946 947 return ((cur->inode->flags & FI_ROOT) != 0 ? 0 : 1); 948 } 949 950 /* 951 * Create a filesystem dataset. More specifically: 952 * - create an object set for the dataset, 953 * - add required metadata (SA tables, property definitions, etc.) to that 954 * object set, 955 * - optionally populate the object set with file objects, using "root" as the 956 * root directory. 957 * 958 * "dirfd" is a directory descriptor for the directory referenced by "root". It 959 * is closed before returning. 960 */ 961 static void 962 fs_build_one(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, fsnode *root, int dirfd) 963 { 964 struct fs_populate_arg arg; 965 zfs_fs_t fs; 966 zfs_zap_t *masterzap; 967 zfs_objset_t *os; 968 dnode_phys_t *deleteq, *masterobj; 969 uint64_t deleteqid, dnodecount, moid, rootdirid, saobjid; 970 bool fakedroot; 971 972 /* 973 * This dataset's mountpoint doesn't exist in the staging tree, or the 974 * dataset doesn't have a mountpoint at all. In either case we still 975 * need a root directory. Fake up a root fsnode to handle this case. 976 */ 977 fakedroot = root == NULL; 978 if (fakedroot) { 979 struct stat *stp; 980 981 assert(dirfd == -1); 982 983 root = ecalloc(1, sizeof(*root)); 984 root->inode = ecalloc(1, sizeof(*root->inode)); 985 root->name = estrdup("."); 986 root->type = S_IFDIR; 987 988 stp = &root->inode->st; 989 stp->st_uid = 0; 990 stp->st_gid = 0; 991 stp->st_mode = S_IFDIR | 0755; 992 } 993 assert(root->type == S_IFDIR); 994 assert(fsnode_isroot(root)); 995 996 /* 997 * Initialize the object set for this dataset. 998 */ 999 os = objset_alloc(zfs, DMU_OST_ZFS); 1000 masterobj = objset_dnode_alloc(os, DMU_OT_MASTER_NODE, &moid); 1001 assert(moid == MASTER_NODE_OBJ); 1002 1003 memset(&fs, 0, sizeof(fs)); 1004 fs.os = os; 1005 1006 /* 1007 * Create the ZAP SA layout now since filesystem object dnodes will 1008 * refer to those attributes. 1009 */ 1010 saobjid = fs_set_zpl_attrs(zfs, &fs); 1011 1012 /* 1013 * Make a pass over the staged directory to detect hard links and assign 1014 * virtual dnode numbers. 1015 */ 1016 dnodecount = 1; /* root directory */ 1017 fsnode_foreach(root, fs_foreach_mark, &dnodecount); 1018 1019 /* 1020 * Make a second pass to populate the dataset with files from the 1021 * staged directory. Most of our runtime is spent here. 1022 */ 1023 arg.rootdirfd = dirfd; 1024 arg.zfs = zfs; 1025 arg.fs = &fs; 1026 SLIST_INIT(&arg.dirs); 1027 fs_populate_dir(root, &arg); 1028 assert(!SLIST_EMPTY(&arg.dirs)); 1029 fsnode_foreach(root, fs_foreach_populate, &arg); 1030 assert(SLIST_EMPTY(&arg.dirs)); 1031 rootdirid = arg.rootdirid; 1032 1033 /* 1034 * Create an empty delete queue. We don't do anything with it, but 1035 * OpenZFS will refuse to mount filesystems that don't have one. 1036 */ 1037 deleteq = objset_dnode_alloc(os, DMU_OT_UNLINKED_SET, &deleteqid); 1038 zap_write(zfs, zap_alloc(os, deleteq)); 1039 1040 /* 1041 * Populate and write the master node object. This is a ZAP object 1042 * containing various dataset properties and the object IDs of the root 1043 * directory and delete queue. 1044 */ 1045 masterzap = zap_alloc(os, masterobj); 1046 zap_add_uint64(masterzap, ZFS_ROOT_OBJ, rootdirid); 1047 zap_add_uint64(masterzap, ZFS_UNLINKED_SET, deleteqid); 1048 zap_add_uint64(masterzap, ZFS_SA_ATTRS, saobjid); 1049 zap_add_uint64(masterzap, ZPL_VERSION_OBJ, 5 /* ZPL_VERSION_SA */); 1050 zap_add_uint64(masterzap, "normalization", 0 /* off */); 1051 zap_add_uint64(masterzap, "utf8only", 0 /* off */); 1052 zap_add_uint64(masterzap, "casesensitivity", 0 /* case sensitive */); 1053 zap_add_uint64(masterzap, "acltype", 2 /* NFSv4 */); 1054 zap_write(zfs, masterzap); 1055 1056 /* 1057 * All finished with this object set, we may as well write it now. 1058 * The DSL layer will sum up the bytes consumed by each dataset using 1059 * information stored in the object set, so it can't be freed just yet. 1060 */ 1061 dsl_dir_dataset_write(zfs, os, dsldir); 1062 1063 if (fakedroot) { 1064 free(root->inode); 1065 free(root->name); 1066 free(root); 1067 } 1068 free(fs.saoffs); 1069 } 1070 1071 /* 1072 * Create an object set for each DSL directory which has a dataset and doesn't 1073 * already have an object set. 1074 */ 1075 static void 1076 fs_build_unmounted(zfs_opt_t *zfs, zfs_dsl_dir_t *dsldir, void *arg __unused) 1077 { 1078 if (dsl_dir_has_dataset(dsldir) && !dsl_dir_dataset_has_objset(dsldir)) 1079 fs_build_one(zfs, dsldir, NULL, -1); 1080 } 1081 1082 /* 1083 * Create our datasets and populate them with files. 1084 */ 1085 void 1086 fs_build(zfs_opt_t *zfs, int dirfd, fsnode *root) 1087 { 1088 /* 1089 * Run through our datasets and find the root fsnode for each one. Each 1090 * root fsnode is flagged so that we can figure out which dataset it 1091 * belongs to. 1092 */ 1093 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_layout_one, root); 1094 1095 /* 1096 * Did we find our boot filesystem? 1097 */ 1098 if (zfs->bootfs != NULL && !zap_entry_exists(zfs->poolprops, "bootfs")) 1099 errx(1, "no mounted dataset matches bootfs property `%s'", 1100 zfs->bootfs); 1101 1102 /* 1103 * Traverse the file hierarchy starting from the root fsnode. One 1104 * dataset, not necessarily the root dataset, must "own" the root 1105 * directory by having its mountpoint be equal to the root path. 1106 * 1107 * As roots of other datasets are encountered during the traversal, 1108 * fs_build_one() recursively creates the corresponding object sets and 1109 * populates them. Once this function has returned, all datasets will 1110 * have been fully populated. 1111 */ 1112 fs_build_one(zfs, root->inode->param, root, dirfd); 1113 1114 /* 1115 * Now create object sets for datasets whose mountpoints weren't found 1116 * in the staging directory, either because there is no mountpoint, or 1117 * because the mountpoint doesn't correspond to an existing directory. 1118 */ 1119 dsl_dir_foreach(zfs, zfs->rootdsldir, fs_build_unmounted, NULL); 1120 } 1121