// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"


/*
 * Change the attributes of an overlay inode.
 *
 * The request is first validated against the overlay dentry with
 * setattr_prepare(). The file is then copied up (including its data when
 * ATTR_SIZE is set) and the change is applied to the upper dentry through
 * ovl_do_notify_change(). On success ovl_copyattr() is called on the overlay
 * inode.
 *
 * @idmap is not used; &nop_mnt_idmap is passed to setattr_prepare().
 * @attr->ia_valid is modified in place: ATTR_FILE and ATTR_OPEN are always
 * cleared and ATTR_MODE is cleared when a kill-suid/sgid flag is present.
 *
 * Returns 0 on success or the first error encountered.
 */
int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		struct iattr *attr)
{
	int err;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	bool full_copy_up = false;
	struct dentry *upperdentry;

	err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
	if (err)
		return err;

	if (attr->ia_valid & ATTR_SIZE) {
		/* Truncate should trigger data copy up as well */
		full_copy_up = true;
	}

	if (!full_copy_up)
		err = ovl_copy_up(dentry);
	else
		err = ovl_copy_up_with_data(dentry);
	if (!err) {
		/* Non-NULL only while write access is held for a size change */
		struct inode *winode = NULL;

		upperdentry = ovl_dentry_upper(dentry);

		if (attr->ia_valid & ATTR_SIZE) {
			winode = d_inode(upperdentry);
			err = get_write_access(winode);
			if (err)
				goto out;
		}

		/*
		 * NOTE(review): presumably ATTR_MODE is dropped so that the
		 * underlying notify_change() derives the new mode from the
		 * ATTR_KILL_* flags itself - confirm.
		 */
		if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
			attr->ia_valid &= ~ATTR_MODE;

		/*
		 * We might have to translate ovl file into real file object
		 * once use cases emerge.  For now, simply don't let underlying
		 * filesystem rely on attr->ia_file
		 */
		attr->ia_valid &= ~ATTR_FILE;

		/*
		 * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN
		 * set.  Overlayfs does not pass O_TRUNC flag to underlying
		 * filesystem during open -> do not pass ATTR_OPEN.  This
		 * disables optimization in fuse which assumes open(O_TRUNC)
		 * already set file size to 0.  But we never passed O_TRUNC to
		 * fuse.  So by clearing ATTR_OPEN, fuse will be forced to send
		 * setattr request to server.
		 */
		attr->ia_valid &= ~ATTR_OPEN;

		err = ovl_want_write(dentry);
		if (err)
			goto out_put_write;

		inode_lock(upperdentry->d_inode);
		/* Only the notify_change call runs inside with_ovl_creds() */
		with_ovl_creds(dentry->d_sb)
			err = ovl_do_notify_change(ofs, upperdentry, attr);
		if (!err)
			ovl_copyattr(dentry->d_inode);
		inode_unlock(upperdentry->d_inode);
		ovl_drop_write(dentry);

out_put_write:
		if (winode)
			put_write_access(winode);
	}
out:
	return err;
}

/*
 * Fill in stat->dev, and adjust stat->ino where needed, so that the
 * st_dev/st_ino pair reported for @dentry is unique.
 *
 * @fsid is shifted into the high bits of st_ino in the xino case and is used
 * to index ofs->fs[] for the pseudo_dev fallback.
 */
static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	bool samefs = ovl_same_fs(ofs);
	unsigned int xinobits = ovl_xino_bits(ofs);
	/* Only used as a shift count on the paths where xinobits != 0 */
	unsigned int xinoshift = 64 - xinobits;

	if (samefs) {
		/*
		 * When all layers are on the same fs, all real inode
		 * numbers are unique, so we use the overlay st_dev,
		 * which is friendly to du -x.
		 */
		stat->dev = dentry->d_sb->s_dev;
		return;
	} else if (xinobits) {
		/*
		 * All inode numbers of underlying fs should not be using the
		 * high xinobits, so we use high xinobits to partition the
		 * overlay st_ino address space. The high bits holds the fsid
		 * (upper fsid is 0). The lowest xinobit is reserved for mapping
		 * the non-persistent inode numbers range in case of overflow.
		 * This way all overlay inode numbers are unique and use the
		 * overlay st_dev.
		 */
		if (likely(!(stat->ino >> xinoshift))) {
			stat->ino |= ((u64)fsid) << (xinoshift + 1);
			stat->dev = dentry->d_sb->s_dev;
			return;
		} else if (ovl_xino_warn(ofs)) {
			pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
					    dentry, stat->ino, xinobits);
		}
	}

	/* The inode could not be mapped to a unified st_ino address space */
	if (S_ISDIR(dentry->d_inode->i_mode)) {
		/*
		 * Always use the overlay st_dev for directories, so 'find
		 * -xdev' will scan the entire overlay mount and won't cross the
		 * overlay mount boundaries.
		 *
		 * If not all layers are on the same fs the pair {real st_ino;
		 * overlay st_dev} is not unique, so use the non persistent
		 * overlay st_ino for directories.
		 */
		stat->dev = dentry->d_sb->s_dev;
		stat->ino = dentry->d_inode->i_ino;
	} else {
		/*
		 * For non-samefs setup, if we cannot map all layers st_ino
		 * to a unified address space, we need to make sure that st_dev
		 * is unique per underlying fs, so we use the unique anonymous
		 * bdev assigned to the underlying fs.
		 */
		stat->dev = ofs->fs[fsid].pseudo_dev;
	}
}

/*
 * Call vfs_getattr_nosec() on a real (underlying) @path inside
 * with_ovl_creds(@sb) and return its result.
 */
static inline int ovl_real_getattr_nosec(struct super_block *sb,
					 const struct path *path,
					 struct kstat *stat, u32 request_mask,
					 unsigned int flags)
{
	with_ovl_creds(sb)
		return vfs_getattr_nosec(path, stat, request_mask, flags);
}

/*
 * ->getattr() for overlay inodes.
 *
 * The attributes are read from the real path of @path->dentry and then
 * adjusted: st_ino/st_dev are mapped by ovl_map_dev_ino() (possibly using the
 * st_ino of the lower origin), st_blocks of a metacopy dentry is taken from
 * the lower data, and st_nlink is overridden for merge dirs and for indexed
 * non-directories.
 *
 * @idmap is not used. Returns 0 on success or the error of a failed real
 * getattr call.
 */
int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
		struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct dentry *dentry = path->dentry;
	struct super_block *sb = dentry->d_sb;
	enum ovl_path_type type;
	struct path realpath;
	struct inode *inode = d_inode(dentry);
	bool is_dir = S_ISDIR(inode->i_mode);
	/* Stays 0 unless the reported st_ino comes from a lower layer */
	int fsid = 0;
	int err;
	/* True while st_blocks still has to be fetched from the lower data */
	bool metacopy_blocks = false;

	metacopy_blocks = ovl_is_metacopy_dentry(dentry);

	type = ovl_path_real(dentry, &realpath);
	err = ovl_real_getattr_nosec(sb, &realpath, stat, request_mask, flags);
	if (err)
		return err;

	/* Report the effective immutable/append-only STATX flags */
	generic_fill_statx_attr(inode, stat);

	/*
	 * For non-dir or same fs, we use st_ino of the copy up origin.
	 * This guarantees constant st_dev/st_ino across copy up.
	 * With xino feature and non-samefs, we use st_ino of the copy up
	 * origin masked with high bits that represent the layer id.
	 *
	 * If lower filesystem supports NFS file handles, this also guarantees
	 * persistent st_ino across mount cycle.
	 */
	if (!is_dir || ovl_same_dev(OVL_FS(dentry->d_sb))) {
		if (!OVL_TYPE_UPPER(type)) {
			fsid = ovl_layer_lower(dentry)->fsid;
		} else if (OVL_TYPE_ORIGIN(type)) {
			struct kstat lowerstat;
			u32 lowermask = STATX_INO | STATX_BLOCKS |
					(!is_dir ? STATX_NLINK : 0);

			/* realpath is reused here for the lower path */
			ovl_path_lower(dentry, &realpath);
			err = ovl_real_getattr_nosec(sb, &realpath, &lowerstat, lowermask, flags);
			if (err)
				return err;

			/*
			 * Lower hardlinks may be broken on copy up to different
			 * upper files, so we cannot use the lower origin st_ino
			 * for those different files, even for the same fs case.
			 *
			 * Similarly, several redirected dirs can point to the
			 * same dir on a lower layer. With the "verify_lower"
			 * feature, we do not use the lower origin st_ino, if
			 * we haven't verified that this redirect is unique.
			 *
			 * With inodes index enabled, it is safe to use st_ino
			 * of an indexed origin. The index validates that the
			 * upper hardlink is not broken and that a redirected
			 * dir is the only redirect to that origin.
			 */
			if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
			    (!ovl_verify_lower(dentry->d_sb) &&
			     (is_dir || lowerstat.nlink == 1))) {
				fsid = ovl_layer_lower(dentry)->fsid;
				stat->ino = lowerstat.ino;
			}

			/*
			 * If we are querying a metacopy dentry and lower
			 * dentry is data dentry, then use the blocks we
			 * queried just now. We don't have to do additional
			 * vfs_getattr(). If lower itself is metacopy, then
			 * additional vfs_getattr() is unavoidable.
			 */
			if (metacopy_blocks &&
			    realpath.dentry == ovl_dentry_lowerdata(dentry)) {
				stat->blocks = lowerstat.blocks;
				metacopy_blocks = false;
			}
		}

		if (metacopy_blocks) {
			/*
			 * If lower is not same as lowerdata or if there was
			 * no origin on upper, we can end up here.
			 * With lazy lowerdata lookup, guess lowerdata blocks
			 * from size to avoid lowerdata lookup on stat(2).
			 */
			struct kstat lowerdatastat;
			u32 lowermask = STATX_BLOCKS;

			ovl_path_lowerdata(dentry, &realpath);
			if (realpath.dentry) {
				err = ovl_real_getattr_nosec(sb, &realpath, &lowerdatastat,
							     lowermask, flags);
				if (err)
					return err;
			} else {
				/* Estimate in 512-byte units (>> 9) */
				lowerdatastat.blocks =
					round_up(stat->size, stat->blksize) >> 9;
			}
			stat->blocks = lowerdatastat.blocks;
		}
	}

	ovl_map_dev_ino(dentry, stat, fsid);

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count.  nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (is_dir && OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	/*
	 * Return the overlay inode nlinks for indexed upper inodes.
	 * Overlay inode nlink counts the union of the upper hardlinks
	 * and non-covered lower hardlinks. It does not include the upper
	 * index hardlink.
	 */
	if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		stat->nlink = dentry->d_inode->i_nlink;

	return err;
}

/*
 * ->permission() for overlay inodes.
 *
 * The overlay inode is checked with generic_permission() first; the real
 * inode is then checked with inode_permission() inside with_ovl_creds().
 * @idmap is not used.
 *
 * Returns -ECHILD when no real inode is available, otherwise 0 or the error
 * of the first failing check.
 */
int ovl_permission(struct mnt_idmap *idmap,
		   struct inode *inode, int mask)
{
	struct inode *upperinode = ovl_inode_upper(inode);
	struct inode *realinode;
	struct path realpath;
	int err;

	/* Careful in RCU walk mode */
	realinode = ovl_i_path_real(inode, &realpath);
	if (!realinode) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		return -ECHILD;
	}

	/*
	 * Check overlay inode with the creds of task and underlying inode
	 * with creds of mounter
	 */
	err = generic_permission(&nop_mnt_idmap, inode, mask);
	if (err)
		return err;

	/*
	 * Write request on a non-special file that has no upper inode: check
	 * read permission on the real inode instead of write permission.
	 */
	if (!upperinode &&
	    !special_file(realinode->i_mode) && mask & MAY_WRITE) {
		mask &= ~(MAY_WRITE | MAY_APPEND);
		/* Make sure mounter can read file for copy up later */
		mask |= MAY_READ;
	}

	with_ovl_creds(inode->i_sb)
		return inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
}

/*
 * ->get_link() for overlay symlinks: read the link of the real dentry with
 * vfs_get_link() inside with_ovl_creds().
 *
 * Returns ERR_PTR(-ECHILD) when called without a dentry
 * (NOTE(review): presumably the RCU-walk case - confirm).
 */
static const char *ovl_get_link(struct dentry *dentry,
				struct inode *inode,
				struct delayed_call *done)
{
	if (!dentry)
		return ERR_PTR(-ECHILD);

	with_ovl_creds(dentry->d_sb)
		return vfs_get_link(ovl_dentry_real(dentry), done);
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
 * of the POSIX ACLs retrieved from the lower layer to this function to not
 * alter the POSIX ACLs for the underlying filesystem.
 *
 * Only ACL_USER and ACL_GROUP entries are rewritten, in place in @acl; all
 * other entry tags are left untouched.
 */
static void ovl_idmap_posix_acl(const struct inode *realinode,
				struct mnt_idmap *idmap,
				struct posix_acl *acl)
{
	struct user_namespace *fs_userns = i_user_ns(realinode);

	for (unsigned int i = 0; i < acl->a_count; i++) {
		vfsuid_t vfsuid;
		vfsgid_t vfsgid;

		struct posix_acl_entry *e = &acl->a_entries[i];
		switch (e->e_tag) {
		case ACL_USER:
			vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid);
			e->e_uid = vfsuid_into_kuid(vfsuid);
			break;
		case ACL_GROUP:
			vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid);
			e->e_gid = vfsgid_into_kgid(vfsgid);
			break;
		}
	}
}

/*
 * The @noperm argument is used to skip permission checking and is a temporary
 * measure. Quoting Miklos from an earlier discussion:
 *
 * > So there are two paths to getting an acl:
 * > 1) permission checking and 2) retrieving the value via getxattr(2).
 * > This is a similar situation as reading a symlink vs. following it.
 * > When following a symlink overlayfs always reads the link on the
 * > underlying fs just as if it was a readlink(2) call, calling
 * > security_inode_readlink() instead of security_inode_follow_link().
 * > This is logical: we are reading the link from the underlying storage,
 * > and following it on overlayfs.
 * >
 * > Applying the same logic to acl: we do need to call the
 * > security_inode_getxattr() on the underlying fs, even if just want to
 * > check permissions on overlay. This is currently not done, which is an
 * > inconsistency.
 * >
 * > Maybe adding the check to ovl_get_acl() is the right way to go, but
 * > I'm a little afraid of a performance regression.  Will look into that.
 *
 * Until we have made a decision allow this helper to take the @noperm
 * argument. We should hopefully be able to remove it soon.
 *
 * Returns the ACL named @acl_name of @path; a NULL or ERR_PTR result of the
 * lookup is passed through unchanged. When @path->mnt is an idmapped mount,
 * the looked-up ACL is released and a clone translated by
 * ovl_idmap_posix_acl() is returned instead, or ERR_PTR(-ENOMEM) when the
 * clone cannot be allocated.
 */
struct posix_acl *ovl_get_acl_path(const struct path *path,
				   const char *acl_name, bool noperm)
{
	struct posix_acl *real_acl, *clone;
	struct mnt_idmap *idmap;
	struct inode *realinode = d_inode(path->dentry);

	idmap = mnt_idmap(path->mnt);

	if (noperm)
		real_acl = get_inode_acl(realinode, posix_acl_type(acl_name));
	else
		real_acl = vfs_get_acl(idmap, path->dentry, acl_name);
	if (IS_ERR_OR_NULL(real_acl))
		return real_acl;

	/* Nothing to translate: hand the real ACL to the caller as is */
	if (!is_idmapped_mnt(path->mnt))
		return real_acl;

	/*
	* We cannot alter the ACLs returned from the relevant layer as that
	* would alter the cached values filesystem wide for the lower
	* filesystem. Instead we can clone the ACLs and then apply the
	* relevant idmapping of the layer.
	*/
	clone = posix_acl_clone(real_acl, GFP_KERNEL);
	posix_acl_release(real_acl); /* release original acl */
	if (!clone)
		return ERR_PTR(-ENOMEM);

	ovl_idmap_posix_acl(realinode, idmap, clone);
	return clone;
}

/*
 * When the relevant layer is an idmapped mount we need to take the idmapping
 * of the layer into account and translate any ACL_{GROUP,USER} values
 * according to the idmapped mount.
 *
 * We cannot alter the ACLs returned from the relevant layer as that would
 * alter the cached values filesystem wide for the lower filesystem. Instead we
 * can clone the ACLs and then apply the relevant idmapping of the layer.
 *
 * This is obviously only relevant when idmapped layers are used.
 *
 * Returns ERR_PTR(-ECHILD) when no real inode is available, or when @rcu is
 * set and the real layer is an idmapped mount; NULL when the real inode does
 * not have IS_POSIXACL(); otherwise the result of get_cached_acl_rcu() (@rcu)
 * or ovl_get_acl_path() (!@rcu). @idmap is not used.
 */
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
				 struct inode *inode, int type,
				 bool rcu, bool noperm)
{
	struct inode *realinode;
	struct posix_acl *acl;
	struct path realpath;

	/* Careful in RCU walk mode */
	realinode = ovl_i_path_real(inode, &realpath);
	if (!realinode) {
		WARN_ON(!rcu);
		return ERR_PTR(-ECHILD);
	}

	if (!IS_POSIXACL(realinode))
		return NULL;

	if (rcu) {
		/*
		 * If the layer is idmapped drop out of RCU path walk
		 * so we can clone the ACLs.
		 */
		if (is_idmapped_mnt(realpath.mnt))
			return ERR_PTR(-ECHILD);

		acl = get_cached_acl_rcu(realinode, type);
	} else {
		with_ovl_creds(inode->i_sb)
			acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
	}

	return acl;
}

/*
 * Set (@acl != NULL) or remove (@acl == NULL) the ACL of @type on the upper
 * dentry of @dentry, copying the file up first when it has no upper dentry.
 *
 * ovl_copyattr(@inode) is called after the set/remove attempt whether or not
 * it succeeded. Returns 0 on success or a negative error.
 */
static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
				 struct posix_acl *acl, int type)
{
	int err;
	struct path realpath;
	const char *acl_name;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	struct dentry *upperdentry = ovl_dentry_upper(dentry);
	struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);

	/*
	 * If ACL is to be removed from a lower file, check if it exists in
	 * the first place before copying it up.
	 */
	acl_name = posix_acl_xattr_name(type);
	if (!acl && !upperdentry) {
		struct posix_acl *real_acl;

		/* realpath is only needed for the idmap of the lower mount */
		ovl_path_lower(dentry, &realpath);
		with_ovl_creds(dentry->d_sb)
			real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name);
		if (IS_ERR(real_acl)) {
			err = PTR_ERR(real_acl);
			goto out;
		}
		/* Only existence was of interest; drop the reference again */
		posix_acl_release(real_acl);
	}

	if (!upperdentry) {
		err = ovl_copy_up(dentry);
		if (err)
			goto out;

		realdentry = ovl_dentry_upper(dentry);
	}

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	with_ovl_creds(dentry->d_sb) {
		if (acl)
			err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
		else
			err = ovl_do_remove_acl(ofs, realdentry, acl_name);
	}
	ovl_drop_write(dentry);

	/* copy c/mtime */
	ovl_copyattr(inode);
out:
	return err;
}

/*
 * ->set_acl() for overlay inodes.
 *
 * Returns -EOPNOTSUPP when the workdir inode does not have IS_POSIXACL() or
 * the real inode has no ->set_acl; for a default ACL on a non-directory
 * returns -EACCES when setting and 0 when removing; returns -EPERM when the
 * caller fails inode_owner_or_capable(). Otherwise the result of
 * ovl_set_or_remove_acl() (or of the preceding sgid-clearing ovl_setattr()).
 * @idmap is not used.
 */
int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		struct posix_acl *acl, int type)
{
	int err;
	struct inode *inode = d_inode(dentry);
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *realinode = ovl_inode_real(inode);

	if (!IS_POSIXACL(d_inode(workdir)))
		return -EOPNOTSUPP;
	if (!realinode->i_op->set_acl)
		return -EOPNOTSUPP;
	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
		return acl ? -EACCES : 0;
	if (!inode_owner_or_capable(&nop_mnt_idmap, inode))
		return -EPERM;

	/*
	 * Check if sgid bit needs to be cleared (actual setacl operation will
	 * be done with mounter's capabilities and so that won't do it for us).
	 */
	if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
	    !in_group_p(inode->i_gid) &&
	    !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) {
		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };

		err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr);
		if (err)
			return err;
	}

	return ovl_set_or_remove_acl(dentry, inode, acl, type);
}
#endif

/*
 * ->update_time() for overlay inodes.
 *
 * Only S_ATIME is handled: when the inode has an upper dentry, touch_atime()
 * is called on the upper path and the upper inode's atime is then copied to
 * the overlay inode. All other flags are ignored. Always returns 0.
 */
int ovl_update_time(struct inode *inode, int flags)
{
	if (flags & S_ATIME) {
		struct ovl_fs *ofs = OVL_FS(inode->i_sb);
		struct path upperpath = {
			.mnt = ovl_upper_mnt(ofs),
			.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
		};

		if (upperpath.dentry) {
			touch_atime(&upperpath);
			inode_set_atime_to_ts(inode,
					      inode_get_atime(d_inode(upperpath.dentry)));
		}
	}
	return 0;
}

/*
 * ->fiemap() for overlay regular files: forward the request to the ->fiemap
 * of the real data inode inside with_ovl_creds().
 *
 * Returns -EIO when there is no real data inode and -EOPNOTSUPP when the real
 * inode does not implement ->fiemap.
 */
static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		      u64 start, u64 len)
{
	struct inode *realinode = ovl_inode_realdata(inode);

	if (!realinode)
		return -EIO;

	if (!realinode->i_op->fiemap)
		return -EOPNOTSUPP;

	with_ovl_creds(inode->i_sb)
		return realinode->i_op->fiemap(realinode, fieinfo, start, len);
}

/*
 * Work around the fact that security_file_ioctl() takes a file argument.
 * Introducing security_inode_fileattr_get/set() hooks would solve this issue
 * properly.
 *
 * @realpath is opened read-only with current_cred() just to obtain a file for
 * the hook. The ioctl command passed to the hook is the FSGETXATTR/FSSETXATTR
 * variant when @fa->fsx_valid is set and the GETFLAGS/SETFLAGS variant
 * otherwise. Returns the open error or the result of security_file_ioctl().
 */
static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa,
				 bool set)
{
	struct file *file;
	unsigned int cmd;
	int err;
	unsigned int flags;

	flags = O_RDONLY;
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	file = dentry_open(realpath, flags, current_cred());
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (set)
		cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS;
	else
		cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS;

	err = security_file_ioctl(file, cmd, 0);
	fput(file);

	return err;
}

/*
 * Set the file attributes @fa on the real path @realpath, after the security
 * check in ovl_security_fileattr() has passed.
 */
int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa)
{
	int err;

	err = ovl_security_fileattr(realpath, fa, true);
	if (err)
		return err;

	return vfs_fileattr_set(mnt_idmap(realpath->mnt), realpath->dentry, fa);
}

/*
 * ->fileattr_set() for overlay inodes.
 *
 * Copies the file up, stores the protection flags with ovl_set_protattr(),
 * sets the attributes on the real path and then recomputes the overlay
 * inode's i_flags. @idmap is not used. Returns 0 on success or a negative
 * error.
 */
int ovl_fileattr_set(struct mnt_idmap *idmap,
		     struct dentry *dentry, struct file_kattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct path upperpath;
	unsigned int flags;
	int err;

	err = ovl_copy_up(dentry);
	if (!err) {
		ovl_path_real(dentry, &upperpath);

		err = ovl_want_write(dentry);
		if (err)
			goto out;

		with_ovl_creds(inode->i_sb) {
			/*
			 * Store immutable/append-only flags in xattr and clear them
			 * in upper fileattr (in case they were set by older kernel)
			 * so children of "ovl-immutable" directories lower aliases of
			 * "ovl-immutable" hardlinks could be copied up.
			 * Clear xattr when flags are cleared.
			 */
			err = ovl_set_protattr(inode, upperpath.dentry, fa);
			if (!err)
				err = ovl_real_fileattr_set(&upperpath, fa);
		}
		ovl_drop_write(dentry);

		/*
		 * Merge real inode flags with inode flags read from
		 * overlay.protattr xattr
		 */
		flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK;

		BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK);
		flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK;
		inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK);

		/* Update ctime */
		ovl_copyattr(inode);
	}
out:
	return err;
}

/* Convert inode protection flags to fileattr flags */
static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa)
{
	BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
	BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);

	/* Flags are only ever added to @fa here, never cleared */
	if (inode->i_flags & S_APPEND) {
		fa->flags |= FS_APPEND_FL;
		fa->fsx_xflags |= FS_XFLAG_APPEND;
	}
	if (inode->i_flags & S_IMMUTABLE) {
		fa->flags |= FS_IMMUTABLE_FL;
		fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
	}
}

/*
 * Read the file attributes of the real path @realpath into @fa, after the
 * security check in ovl_security_fileattr() has passed. An -ENOIOCTLCMD
 * result of vfs_fileattr_get() is reported as -ENOTTY.
 */
int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
{
	int err;

	err = ovl_security_fileattr(realpath, fa, false);
	if (err)
		return err;

	err = vfs_fileattr_get(realpath->dentry, fa);
	if (err == -ENOIOCTLCMD)
		err = -ENOTTY;
	return err;
}

/*
 * ->fileattr_get() for overlay inodes: read the attributes of the real path
 * and add the overlay inode's protection flags. The protection flags are
 * added to @fa even when reading the real attributes failed; the error of
 * that read is returned.
 */
int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct path realpath;
	int err;

	ovl_path_real(dentry, &realpath);

	with_ovl_creds(inode->i_sb)
		err = ovl_real_fileattr_get(&realpath, fa);
	ovl_fileattr_prot_flags(inode, fa);

	return err;
}

/* Inode operations installed on regular files by ovl_fill_inode() */
static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_inode_acl	= ovl_get_inode_acl,
	.get_acl	= ovl_get_acl,
	.set_acl	= ovl_set_acl,
	.update_time	= ovl_update_time,
	.fiemap		= ovl_fiemap,
	.fileattr_get	= ovl_fileattr_get,
	.fileattr_set	= ovl_fileattr_set,
};

/* Inode operations installed on symlinks by ovl_fill_inode() */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.get_link	= ovl_get_link,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.update_time	= ovl_update_time,
};

/* Inode operations installed on all other non-directory file types */
static const struct inode_operations ovl_special_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_inode_acl	= ovl_get_inode_acl,
	.get_acl	= ovl_get_acl,
	.set_acl	= ovl_set_acl,
	.update_time	= ovl_update_time,
};

static const struct address_space_operations ovl_aops = {
	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
	.direct_IO		= noop_direct_IO,
};

/*
 * It is possible to stack overlayfs instance on top of another
 * overlayfs instance as lower layer. We need to annotate the
 * stackable i_mutex locks according to stack level of the super
 * block instance. An overlayfs instance can never be in stack
 * depth 0 (there is always a real fs below it).  An overlayfs
 * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
 *
 * For example, here is a snip from /proc/lockdep_chains after
 * dir_iterate of nested overlayfs:
 *
 * [...] &ovl_i_mutex_dir_key[depth]   (stack_depth=2)
 * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
 * [...] &type->i_mutex_dir_key        (stack_depth=0)
 *
 * Locking order w.r.t ovl_want_write() is important for nested overlayfs.
780 * 781 * This chain is valid: 782 * - inode->i_rwsem (inode_lock[2]) 783 * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) 784 * - OVL_I(inode)->lock (ovl_inode_lock[2]) 785 * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) 786 * 787 * And this chain is valid: 788 * - inode->i_rwsem (inode_lock[2]) 789 * - OVL_I(inode)->lock (ovl_inode_lock[2]) 790 * - lowerinode->i_rwsem (inode_lock[1]) 791 * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) 792 * 793 * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is 794 * held, because it is in reverse order of the non-nested case using the same 795 * upper fs: 796 * - inode->i_rwsem (inode_lock[1]) 797 * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) 798 * - OVL_I(inode)->lock (ovl_inode_lock[1]) 799 */ 800 #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH 801 802 static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) 803 { 804 #ifdef CONFIG_LOCKDEP 805 static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; 806 static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; 807 static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; 808 809 int depth = inode->i_sb->s_stack_depth - 1; 810 811 if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) 812 depth = 0; 813 814 if (S_ISDIR(inode->i_mode)) 815 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); 816 else 817 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); 818 819 lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); 820 #endif 821 } 822 823 static void ovl_next_ino(struct inode *inode) 824 { 825 struct ovl_fs *ofs = OVL_FS(inode->i_sb); 826 827 inode->i_ino = atomic_long_inc_return(&ofs->last_ino); 828 if (unlikely(!inode->i_ino)) 829 inode->i_ino = atomic_long_inc_return(&ofs->last_ino); 830 } 831 832 static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) 833 { 834 struct ovl_fs *ofs = OVL_FS(inode->i_sb); 835 int xinobits = ovl_xino_bits(ofs); 836 
unsigned int xinoshift = 64 - xinobits; 837 838 /* 839 * When d_ino is consistent with st_ino (samefs or i_ino has enough 840 * bits to encode layer), set the same value used for st_ino to i_ino, 841 * so inode number exposed via /proc/locks and a like will be 842 * consistent with d_ino and st_ino values. An i_ino value inconsistent 843 * with d_ino also causes nfsd readdirplus to fail. 844 */ 845 inode->i_ino = ino; 846 if (ovl_same_fs(ofs)) { 847 return; 848 } else if (xinobits && likely(!(ino >> xinoshift))) { 849 inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); 850 return; 851 } 852 853 /* 854 * For directory inodes on non-samefs with xino disabled or xino 855 * overflow, we allocate a non-persistent inode number, to be used for 856 * resolving st_ino collisions in ovl_map_dev_ino(). 857 * 858 * To avoid ino collision with legitimate xino values from upper 859 * layer (fsid 0), use the lowest xinobit to map the non 860 * persistent inode numbers to the unified st_ino address space. 
861 */ 862 if (S_ISDIR(inode->i_mode)) { 863 ovl_next_ino(inode); 864 if (xinobits) { 865 inode->i_ino &= ~0UL >> xinobits; 866 inode->i_ino |= 1UL << xinoshift; 867 } 868 } 869 } 870 871 void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, 872 unsigned long ino, int fsid) 873 { 874 struct inode *realinode; 875 struct ovl_inode *oi = OVL_I(inode); 876 877 oi->__upperdentry = oip->upperdentry; 878 oi->oe = oip->oe; 879 oi->redirect = oip->redirect; 880 oi->lowerdata_redirect = oip->lowerdata_redirect; 881 882 realinode = ovl_inode_real(inode); 883 ovl_copyattr(inode); 884 ovl_copyflags(realinode, inode); 885 ovl_map_ino(inode, ino, fsid); 886 } 887 888 static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) 889 { 890 inode->i_mode = mode; 891 inode->i_flags |= S_NOCMTIME; 892 #ifdef CONFIG_FS_POSIX_ACL 893 inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; 894 #endif 895 896 ovl_lockdep_annotate_inode_mutex_key(inode); 897 898 switch (mode & S_IFMT) { 899 case S_IFREG: 900 inode->i_op = &ovl_file_inode_operations; 901 inode->i_fop = &ovl_file_operations; 902 inode->i_mapping->a_ops = &ovl_aops; 903 break; 904 905 case S_IFDIR: 906 inode->i_op = &ovl_dir_inode_operations; 907 inode->i_fop = &ovl_dir_operations; 908 break; 909 910 case S_IFLNK: 911 inode->i_op = &ovl_symlink_inode_operations; 912 break; 913 914 default: 915 inode->i_op = &ovl_special_inode_operations; 916 init_special_inode(inode, mode, rdev); 917 break; 918 } 919 } 920 921 /* 922 * With inodes index enabled, an overlay inode nlink counts the union of upper 923 * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure 924 * upper inode, the following nlink modifying operations can happen: 925 * 926 * 1. Lower hardlink copy up 927 * 2. Upper hardlink created, unlinked or renamed over 928 * 3. 
Lower hardlink whiteout or renamed over
 *
 * For the first, copy up case, the union nlink does not change, whether the
 * operation succeeds or fails, but the upper inode nlink may change.
 * Therefore, before copy up, we store the union nlink value relative to the
 * lower inode nlink in the index inode xattr .overlay.nlink.
 *
 * For the second, upper hardlink case, the union nlink should be incremented
 * or decremented IFF the operation succeeds, aligned with nlink change of the
 * upper inode. Therefore, before link/unlink/rename, we store the union nlink
 * value relative to the upper inode nlink in the index inode.
 *
 * For the last, lower cover up case, we simplify things by preceding the
 * whiteout or cover up with copy up. This makes sure that there is an index
 * upper inode where the nlink xattr can be stored before the copied up upper
 * entry is unlinked.
 */
#define OVL_NLINK_ADD_UPPER	(1 << 0)

/*
 * On-disk format for indexed nlink:
 *
 * nlink relative to the upper inode - "U[+-]NUM"
 * nlink relative to the lower inode - "L[+-]NUM"
 */

/*
 * Store the overlay nlink of @dentry relative to @realdentry's nlink.
 *
 * The signed difference between the overlay inode's i_nlink and the real
 * inode's i_nlink is formatted with @format ("U%+i" or "L%+i" in the callers
 * below) and written to the OVL_XATTR_NLINK xattr of the upper dentry.
 *
 * Returns -EIO (with a WARN) if the formatted value does not fit in the local
 * buffer, otherwise the return value of ovl_setxattr().
 */
static int ovl_set_nlink_common(struct dentry *dentry,
				struct dentry *realdentry, const char *format)
{
	struct inode *inode = d_inode(dentry);
	struct inode *realinode = d_inode(realdentry);
	/*
	 * Room for the prefix letter, a sign, the decimal digits of the
	 * difference and a terminating NUL (13 bytes suffice for a 32-bit
	 * int: 1 + 1 + 10 + 1).
	 */
	char buf[13];
	int len;

	len = snprintf(buf, sizeof(buf), format,
		       (int) (inode->i_nlink - realinode->i_nlink));

	/* snprintf() returns the untruncated length; >= size means truncation */
	if (WARN_ON(len >= sizeof(buf)))
		return -EIO;

	/* Only the @len formatted bytes are stored, without the NUL */
	return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry),
			    OVL_XATTR_NLINK, buf, len);
}

/* Record the overlay nlink relative to the upper inode nlink ("U[+-]NUM") */
int ovl_set_nlink_upper(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i");
}

/* Record the overlay nlink relative to the lower inode nlink ("L[+-]NUM") */
int ovl_set_nlink_lower(struct dentry *dentry)
{
	return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i");
}

/*
 * Compute the overlay nlink from the OVL_XATTR_NLINK xattr on @upperdentry.
 *
 * @fallback is returned unchanged when:
 *  - either @lowerdentry or @upperdentry is NULL,
 *  - the lower inode has i_nlink == 1,
 *  - the xattr cannot be read, is not of the form "[LU][+-]NUM", or
 *  - the resulting nlink would not be positive.
 * All but the first two cases also emit a rate-limited warning.
 *
 * Otherwise the stored signed difference is added to the i_nlink of the lower
 * inode (prefix 'L') or of the upper inode (prefix 'U') and returned.
 */
unsigned int ovl_get_nlink(struct ovl_fs *ofs, struct dentry *lowerdentry,
			   struct dentry *upperdentry,
			   unsigned int fallback)
{
	int nlink_diff;
	int nlink;
	char buf[13];
	int err;

	if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1)
		return fallback;

	/* Read at most sizeof(buf) - 1 bytes to leave room for the NUL below */
	err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK,
				 &buf, sizeof(buf) - 1);
	if (err < 0)
		goto fail;

	/* A non-negative return value is used as the number of bytes read */
	buf[err] = '\0';
	/*
	 * NOTE(review): when the format check below fails, err still holds
	 * the (non-negative) xattr length, so the warning at "fail" prints
	 * that length rather than an error code. Likewise err is 0 when the
	 * "nlink <= 0" check fails. Confirm whether that is intended.
	 */
	if ((buf[0] != 'L' && buf[0] != 'U') ||
	    (buf[1] != '+' && buf[1] != '-'))
		goto fail;

	/* Parse the sign and digits that follow the prefix letter */
	err = kstrtoint(buf + 1, 10, &nlink_diff);
	if (err < 0)
		goto fail;

	/* The prefix letter selects which real inode the diff is relative to */
	nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink;
	nlink += nlink_diff;

	if (nlink <= 0)
		goto fail;

	return nlink;

fail:
	pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n",
			    upperdentry, err);
	return fallback;
}

/*
 * Allocate a new inode on @sb with new_inode() and, if that succeeds,
 * initialize it with ovl_fill_inode() using @mode and @rdev.
 *
 * Returns the new inode, or NULL if new_inode() failed.
 */
struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev)
{
	struct inode *inode;

	inode = new_inode(sb);
	if (inode)
		ovl_fill_inode(inode, mode, rdev);

	return inode;
}

/*
 * Match callback handed to ilookup5()/iget5_locked()/inode_insert5() in this
 * file: an overlay inode matches when its i_private equals the key pointer
 * passed as @data (a real inode pointer at every call site here).
 */
static int ovl_inode_test(struct inode *inode, void *data)
{
	return inode->i_private == data;
}

/*
 * Set callback paired with ovl_inode_test(): records the key pointer @data in
 * i_private. Always returns 0.
 */
static int ovl_inode_set(struct inode *inode, void *data)
{
	inode->i_private = data;
	return 0;
}

/*
 * Check that the real lower/upper inodes recorded in overlay @inode are
 * consistent with @lowerdentry and @upperdentry.
 *
 * A NULL dentry is only treated as a mismatch for directories when @strict is
 * set; otherwise NULL means "not checked" for that layer.
 *
 * Returns true if the inode is acceptable, false on mismatch.
 */
static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry,
			     struct dentry *upperdentry, bool strict)
{
	/*
	 * For directories, @strict verify from lookup path performs consistency
	 * checks, so NULL lower/upper in dentry must match NULL lower/upper in
	 * inode. Non @strict verify from NFS handle decode path passes NULL for
	 * 'unknown' lower/upper.
	 */
	if (S_ISDIR(inode->i_mode) && strict) {
		/* Real lower dir moved to upper layer under us? */
		if (!lowerdentry && ovl_inode_lower(inode))
			return false;

		/* Lookup of an uncovered redirect origin? */
		if (!upperdentry && ovl_inode_upper(inode))
			return false;
	}

	/*
	 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL.
	 * This happens when finding a copied up overlay inode for a renamed
	 * or hardlinked overlay dentry and lower dentry cannot be followed
	 * by origin because lower fs does not support file handles.
	 */
	if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry))
		return false;

	/*
	 * Allow non-NULL __upperdentry in inode even if upperdentry is NULL.
	 * This happens when finding a lower alias for a copied up hard link.
	 */
	if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry))
		return false;

	return true;
}

/*
 * Look up an already-hashed overlay inode keyed by the real inode of @real.
 *
 * @is_upper tells whether @real is an upper or a lower dentry, which selects
 * the side that the non-strict ovl_verify_inode() check is applied to.
 *
 * Returns NULL if no such inode is hashed, ERR_PTR(-ESTALE) if one is found
 * but fails verification (the reference is dropped with iput()), or the inode
 * returned by ilookup5() otherwise.
 */
struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real,
			       bool is_upper)
{
	struct inode *inode, *key = d_inode(real);

	inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
	if (!inode)
		return NULL;

	if (!ovl_verify_inode(inode, is_upper ? NULL : real,
			      is_upper ? real : NULL, false)) {
		iput(inode);
		return ERR_PTR(-ESTALE);
	}

	return inode;
}

/*
 * Test whether the inode hashed under the real inode of @dir is a trap inode.
 *
 * Returns true only if an inode is found for that key, IS_DEADDIR() holds for
 * it and it has neither an upper nor a lower real inode. The looked-up
 * reference is always dropped before returning.
 */
bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir)
{
	struct inode *key = d_inode(dir);
	struct inode *trap;
	bool res;

	trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key);
	if (!trap)
		return false;

	res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) &&
				  !ovl_inode_lower(trap);

	iput(trap);
	return res;
}

/*
 * Create an inode cache entry for layer root dir, that will intentionally
 * fail ovl_verify_inode(), so any lookup that will find some layer root
 * will fail.
 *
 * Returns the new trap inode, or an ERR_PTR():
 *  -ENOTDIR if @dir is not a directory,
 *  -ENOMEM  if iget5_locked() returned NULL,
 *  -ELOOP   if an inode was already hashed under the same key.
 */
struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir)
{
	struct inode *key = d_inode(dir);
	struct inode *trap;

	if (!d_is_dir(dir))
		return ERR_PTR(-ENOTDIR);

	trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test,
			    ovl_inode_set, key);
	if (!trap)
		return ERR_PTR(-ENOMEM);

	/* Not I_NEW means an inode with this key was already in the cache */
	if (!(inode_state_read_once(trap) & I_NEW)) {
		/* Conflicting layer roots? */
		iput(trap);
		return ERR_PTR(-ELOOP);
	}

	/* Mark as a dead directory before making it visible to lookups */
	trap->i_mode = S_IFDIR;
	trap->i_flags = S_DEAD;
	unlock_new_inode(trap);

	return trap;
}

/*
 * Does overlay inode need to be hashed by lower inode?
 *
 * @upper/@lower are the real dentries (either may be NULL) and @index is the
 * caller's "indexed" indication. The checks below are evaluated in order and
 * the first one that applies decides the result.
 */
static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper,
			     struct dentry *lower, bool index)
{
	struct ovl_fs *ofs = OVL_FS(sb);

	/* No, if pure upper */
	if (!lower)
		return false;

	/* Yes, if already indexed */
	if (index)
		return true;

	/* Yes, if won't be copied up */
	if (!ovl_upper_mnt(ofs))
		return true;

	/* No, if lower hardlink is or will be broken on copy up */
	if ((upper || !ovl_indexdir(sb)) &&
	    !d_is_dir(lower) && d_inode(lower)->i_nlink > 1)
		return false;

	/* No, if non-indexed upper with NFS export */
	if (ofs->config.nfs_export && upper)
		return false;

	/* Otherwise, hash by lower inode for fsnotify */
	return true;
}

/*
 * Get the overlay inode hashed under @key.
 *
 * If the caller supplies a pre-allocated @newinode it is passed to
 * inode_insert5(); otherwise iget5_locked() is used on @sb. Both use
 * ovl_inode_test()/ovl_inode_set() with @key as hash value and match data.
 */
static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode,
			       struct inode *key)
{
	return newinode ? inode_insert5(newinode, (unsigned long) key,
					 ovl_inode_test, ovl_inode_set, key) :
			  iget5_locked(sb, (unsigned long) key,
				       ovl_inode_test, ovl_inode_set, key);
}

/*
 * Get (find or create) the overlay inode described by @oip on @sb.
 *
 * The inode is hashed by the lower real inode when ovl_hash_bylower() says so,
 * by the upper real inode when there is an upper dentry, and is left unhashed
 * (plain new_inode()) otherwise.
 *
 * When an existing, already initialized inode is found and verified, the
 * upper dentry reference, oip->oe, oip->redirect and oip->lowerdata_redirect
 * are released here and the existing inode is returned.
 *
 * Returns the inode, or ERR_PTR():
 *  -ESTALE if a cached inode does not match the given real dentries,
 *  -EEXIST / -ENOMEM if ovl_iget5() returned NULL with / without
 *          oip->newinode supplied,
 *  -ENOMEM if new_inode() failed.
 * Every error path also emits a rate-limited warning.
 */
struct inode *ovl_get_inode(struct super_block *sb,
			    struct ovl_inode_params *oip)
{
	struct ovl_fs *ofs = OVL_FS(sb);
	struct dentry *upperdentry = oip->upperdentry;
	struct ovl_path *lowerpath = ovl_lowerpath(oip->oe);
	struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL;
	struct inode *inode;
	struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL;
	/*
	 * Real path preferring upper over lower.
	 * NOTE(review): lowerpath is dereferenced here whenever upperdentry is
	 * NULL, so callers presumably guarantee at least one of upper/lower -
	 * confirm against callers.
	 */
	struct path realpath = {
		.dentry = upperdentry ?: lowerdentry,
		.mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt,
	};
	bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry,
					oip->index);
	/* ovl_hash_bylower() only returns true for a non-NULL lower dentry */
	int fsid = bylower ? lowerpath->layer->fsid : 0;
	bool is_dir;
	unsigned long ino = 0;
	/* Error reported if ovl_iget5() below returns NULL */
	int err = oip->newinode ? -EEXIST : -ENOMEM;

	if (!realinode)
		realinode = d_inode(lowerdentry);

	/*
	 * Copy up origin (lower) may exist for non-indexed upper, but we must
	 * not use lower as hash key if this is a broken hardlink.
	 */
	is_dir = S_ISDIR(realinode->i_mode);
	if (upperdentry || bylower) {
		struct inode *key = d_inode(bylower ? lowerdentry :
						      upperdentry);
		unsigned int nlink = is_dir ? 1 : realinode->i_nlink;

		inode = ovl_iget5(sb, oip->newinode, key);
		if (!inode)
			goto out_err;
		if (!(inode_state_read_once(inode) & I_NEW)) {
			/*
			 * Verify that the underlying files stored in the inode
			 * match those in the dentry.
			 */
			if (!ovl_verify_inode(inode, lowerdentry, upperdentry,
					      true)) {
				iput(inode);
				err = -ESTALE;
				goto out_err;
			}

			/*
			 * Existing inode is reused as is; release what was
			 * passed in via @oip since it is not handed to
			 * ovl_inode_init() on this path.
			 */
			dput(upperdentry);
			ovl_free_entry(oip->oe);
			kfree(oip->redirect);
			kfree(oip->lowerdata_redirect);
			goto out;
		}

		/* Recalculate nlink for non-dir due to indexing */
		if (!is_dir)
			nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry,
					      nlink);
		set_nlink(inode, nlink);
		ino = key->i_ino;
	} else {
		/* Lower hardlink that will be broken on copy up */
		inode = new_inode(sb);
		if (!inode) {
			err = -ENOMEM;
			goto out_err;
		}
		ino = realinode->i_ino;
		fsid = lowerpath->layer->fsid;
	}
	ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev);
	ovl_inode_init(inode, oip, ino, fsid);
	WARN_ON_ONCE(!!IS_CASEFOLDED(inode) != ofs->casefold);

	if (upperdentry && ovl_is_impuredir(sb, upperdentry))
		ovl_set_flag(OVL_IMPURE, inode);

	if (oip->index)
		ovl_set_flag(OVL_INDEX, inode);

	if (bylower)
		ovl_set_flag(OVL_CONST_INO, inode);

	/* Check for non-merge dir that may have whiteouts */
	if (is_dir) {
		if (((upperdentry && lowerdentry) || ovl_numlower(oip->oe) > 1) ||
		    ovl_path_check_origin_xattr(ofs, &realpath)) {
			ovl_set_flag(OVL_WHITEOUTS, inode);
		}
	}

	/* Check for immutable/append-only inode flags in xattr */
	if (upperdentry)
		ovl_check_protattr(inode, upperdentry);

	/* Inodes from new_inode() are not I_NEW, so nothing to unlock there */
	if (inode_state_read_once(inode) & I_NEW)
		unlock_new_inode(inode);
out:
	return inode;

out_err:
	pr_warn_ratelimited("failed to get inode (%i)\n", err);
	inode = ERR_PTR(err);
	goto out;
}