// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright (C) 2011 Novell Inc.
 */

#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/xattr.h>
#include <linux/ratelimit.h>
#include <linux/fiemap.h>
#include <linux/fileattr.h>
#include <linux/security.h>
#include <linux/namei.h>
#include <linux/posix_acl.h>
#include <linux/posix_acl_xattr.h>
#include "overlayfs.h"


/*
 * Apply the attribute change described by @attr to an overlay dentry.
 *
 * The request is validated with setattr_prepare(), the dentry is copied up
 * (including data when ATTR_SIZE is set), and the change is then applied to
 * the upper dentry through ovl_do_notify_change(). On success the overlay
 * inode is refreshed with ovl_copyattr().
 *
 * @idmap is not used in this function; nop_mnt_idmap is passed to
 * setattr_prepare().
 *
 * Returns 0 on success or the first error encountered.
 */
int ovl_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
		struct iattr *attr)
{
	int err;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	bool full_copy_up = false;
	struct dentry *upperdentry;

	err = setattr_prepare(&nop_mnt_idmap, dentry, attr);
	if (err)
		return err;

	if (attr->ia_valid & ATTR_SIZE) {
		/* Truncate should trigger data copy up as well */
		full_copy_up = true;
	}

	if (!full_copy_up)
		err = ovl_copy_up(dentry);
	else
		err = ovl_copy_up_with_data(dentry);
	if (!err) {
		/* Non-NULL only while write access is held for a size change */
		struct inode *winode = NULL;

		upperdentry = ovl_dentry_upper(dentry);

		if (attr->ia_valid & ATTR_SIZE) {
			winode = d_inode(upperdentry);
			err = get_write_access(winode);
			if (err)
				goto out;
		}

		/* A SUID/SGID kill request drops ATTR_MODE from the request */
		if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID))
			attr->ia_valid &= ~ATTR_MODE;

		/*
		 * We might have to translate ovl file into real file object
		 * once use cases emerge. For now, simply don't let underlying
		 * filesystem rely on attr->ia_file
		 */
		attr->ia_valid &= ~ATTR_FILE;

		/*
		 * If open(O_TRUNC) is done, VFS calls ->setattr with ATTR_OPEN
		 * set. Overlayfs does not pass O_TRUNC flag to underlying
		 * filesystem during open -> do not pass ATTR_OPEN. This
		 * disables optimization in fuse which assumes open(O_TRUNC)
		 * already set file size to 0. But we never passed O_TRUNC to
		 * fuse. So by clearing ATTR_OPEN, fuse will be forced to send
		 * setattr request to server.
		 */
		attr->ia_valid &= ~ATTR_OPEN;

		err = ovl_want_write(dentry);
		if (err)
			goto out_put_write;

		/* The change is applied under the upper inode lock */
		inode_lock(upperdentry->d_inode);
		with_ovl_creds(dentry->d_sb)
			err = ovl_do_notify_change(ofs, upperdentry, attr);
		if (!err)
			ovl_copyattr(dentry->d_inode);
		inode_unlock(upperdentry->d_inode);
		ovl_drop_write(dentry);

out_put_write:
		if (winode)
			put_write_access(winode);
	}
out:
	return err;
}

/*
 * Choose the st_dev reported in @stat for @dentry and, where needed, adjust
 * the st_ino. @fsid is used to tag the inode number in the xino case and to
 * select ofs->fs[fsid].pseudo_dev for non-directories that could not be
 * mapped to the unified st_ino address space.
 */
static void ovl_map_dev_ino(struct dentry *dentry, struct kstat *stat, int fsid)
{
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	bool samefs = ovl_same_fs(ofs);
	unsigned int xinobits = ovl_xino_bits(ofs);
	/* Only used for shifting when xinobits is non-zero */
	unsigned int xinoshift = 64 - xinobits;

	if (samefs) {
		/*
		 * When all layers are on the same fs, all real inode
		 * number are unique, so we use the overlay st_dev,
		 * which is friendly to du -x.
		 */
		stat->dev = dentry->d_sb->s_dev;
		return;
	} else if (xinobits) {
		/*
		 * All inode numbers of underlying fs should not be using the
		 * high xinobits, so we use high xinobits to partition the
		 * overlay st_ino address space. The high bits holds the fsid
		 * (upper fsid is 0). The lowest xinobit is reserved for mapping
		 * the non-persistent inode numbers range in case of overflow.
		 * This way all overlay inode numbers are unique and use the
		 * overlay st_dev.
		 */
		if (likely(!(stat->ino >> xinoshift))) {
			stat->ino |= ((u64)fsid) << (xinoshift + 1);
			stat->dev = dentry->d_sb->s_dev;
			return;
		} else if (ovl_xino_warn(ofs)) {
			pr_warn_ratelimited("inode number too big (%pd2, ino=%llu, xinobits=%d)\n",
					    dentry, stat->ino, xinobits);
		}
	}

	/* The inode could not be mapped to a unified st_ino address space */
	if (S_ISDIR(dentry->d_inode->i_mode)) {
		/*
		 * Always use the overlay st_dev for directories, so 'find
		 * -xdev' will scan the entire overlay mount and won't cross the
		 * overlay mount boundaries.
		 *
		 * If not all layers are on the same fs the pair {real st_ino;
		 * overlay st_dev} is not unique, so use the non persistent
		 * overlay st_ino for directories.
		 */
		stat->dev = dentry->d_sb->s_dev;
		stat->ino = dentry->d_inode->i_ino;
	} else {
		/*
		 * For non-samefs setup, if we cannot map all layers st_ino
		 * to a unified address space, we need to make sure that st_dev
		 * is unique per underlying fs, so we use the unique anonymous
		 * bdev assigned to the underlying fs.
		 */
		stat->dev = ofs->fs[fsid].pseudo_dev;
	}
}

/*
 * Call vfs_getattr_nosec() on a real @path inside a with_ovl_creds(@sb)
 * scope and return its result.
 */
static inline int ovl_real_getattr_nosec(struct super_block *sb,
					 const struct path *path,
					 struct kstat *stat, u32 request_mask,
					 unsigned int flags)
{
	with_ovl_creds(sb)
		return vfs_getattr_nosec(path, stat, request_mask, flags);
}

/*
 * ->getattr for overlay inodes.
 *
 * The real path is queried with vfs_getattr_nosec(); st_ino/st_dev,
 * st_blocks and st_nlink are then adjusted as described in the comments
 * below. @idmap is not used in this function.
 *
 * Returns 0 on success or the error of a failed getattr on a real path.
 */
int ovl_getattr(struct mnt_idmap *idmap, const struct path *path,
		struct kstat *stat, u32 request_mask, unsigned int flags)
{
	struct dentry *dentry = path->dentry;
	struct super_block *sb = dentry->d_sb;
	enum ovl_path_type type;
	struct path realpath;
	struct inode *inode = d_inode(dentry);
	bool is_dir = S_ISDIR(inode->i_mode);
	int fsid = 0;
	int err;
	/* True while st_blocks still has to be taken from the lower data */
	bool metacopy_blocks = false;

	metacopy_blocks = ovl_is_metacopy_dentry(dentry);

	type = ovl_path_real(dentry, &realpath);
	err = ovl_real_getattr_nosec(sb, &realpath, stat, request_mask, flags);
	if (err)
		return err;

	/* Report the effective immutable/append-only STATX flags */
	generic_fill_statx_attr(inode, stat);

	/*
	 * For non-dir or same fs, we use st_ino of the copy up origin.
	 * This guarantees constant st_dev/st_ino across copy up.
	 * With xino feature and non-samefs, we use st_ino of the copy up
	 * origin masked with high bits that represent the layer id.
	 *
	 * If lower filesystem supports NFS file handles, this also guarantees
	 * persistent st_ino across mount cycle.
	 */
	if (!is_dir || ovl_same_dev(OVL_FS(dentry->d_sb))) {
		if (!OVL_TYPE_UPPER(type)) {
			fsid = ovl_layer_lower(dentry)->fsid;
		} else if (OVL_TYPE_ORIGIN(type)) {
			struct kstat lowerstat;
			u32 lowermask = STATX_INO | STATX_BLOCKS |
					(!is_dir ? STATX_NLINK : 0);

			/* realpath is reused here for the lower path */
			ovl_path_lower(dentry, &realpath);
			err = ovl_real_getattr_nosec(sb, &realpath, &lowerstat, lowermask, flags);
			if (err)
				return err;

			/*
			 * Lower hardlinks may be broken on copy up to different
			 * upper files, so we cannot use the lower origin st_ino
			 * for those different files, even for the same fs case.
			 *
			 * Similarly, several redirected dirs can point to the
			 * same dir on a lower layer. With the "verify_lower"
			 * feature, we do not use the lower origin st_ino, if
			 * we haven't verified that this redirect is unique.
			 *
			 * With inodes index enabled, it is safe to use st_ino
			 * of an indexed origin. The index validates that the
			 * upper hardlink is not broken and that a redirected
			 * dir is the only redirect to that origin.
			 */
			if (ovl_test_flag(OVL_INDEX, d_inode(dentry)) ||
			    (!ovl_verify_lower(dentry->d_sb) &&
			     (is_dir || lowerstat.nlink == 1))) {
				fsid = ovl_layer_lower(dentry)->fsid;
				stat->ino = lowerstat.ino;
			}

			/*
			 * If we are querying a metacopy dentry and lower
			 * dentry is data dentry, then use the blocks we
			 * queried just now. We don't have to do additional
			 * vfs_getattr(). If lower itself is metacopy, then
			 * additional vfs_getattr() is unavoidable.
			 */
			if (metacopy_blocks &&
			    realpath.dentry == ovl_dentry_lowerdata(dentry)) {
				stat->blocks = lowerstat.blocks;
				metacopy_blocks = false;
			}
		}

		if (metacopy_blocks) {
			/*
			 * If lower is not same as lowerdata or if there was
			 * no origin on upper, we can end up here.
			 * With lazy lowerdata lookup, guess lowerdata blocks
			 * from size to avoid lowerdata lookup on stat(2).
			 */
			struct kstat lowerdatastat;
			u32 lowermask = STATX_BLOCKS;

			ovl_path_lowerdata(dentry, &realpath);
			if (realpath.dentry) {
				err = ovl_real_getattr_nosec(sb, &realpath, &lowerdatastat,
							     lowermask, flags);
				if (err)
					return err;
			} else {
				lowerdatastat.blocks =
					round_up(stat->size, stat->blksize) >> 9;
			}
			stat->blocks = lowerdatastat.blocks;
		}
	}

	ovl_map_dev_ino(dentry, stat, fsid);

	/*
	 * It's probably not worth it to count subdirs to get the
	 * correct link count. nlink=1 seems to pacify 'find' and
	 * other utilities.
	 */
	if (is_dir && OVL_TYPE_MERGE(type))
		stat->nlink = 1;

	/*
	 * Return the overlay inode nlinks for indexed upper inodes.
	 * Overlay inode nlink counts the union of the upper hardlinks
	 * and non-covered lower hardlinks. It does not include the upper
	 * index hardlink.
	 */
	if (!is_dir && ovl_test_flag(OVL_INDEX, d_inode(dentry)))
		stat->nlink = dentry->d_inode->i_nlink;

	return err;
}

/*
 * ->permission for overlay inodes.
 *
 * @mask is checked on the overlay inode with generic_permission() and then
 * on the real inode with inode_permission() inside a with_ovl_creds() scope.
 * Returns -ECHILD when no real inode is available, otherwise 0 or the error
 * of the first failing check. @idmap is not used in this function.
 */
int ovl_permission(struct mnt_idmap *idmap,
		   struct inode *inode, int mask)
{
	struct inode *upperinode = ovl_inode_upper(inode);
	struct inode *realinode;
	struct path realpath;
	int err;

	/* Careful in RCU walk mode */
	realinode = ovl_i_path_real(inode, &realpath);
	if (!realinode) {
		WARN_ON(!(mask & MAY_NOT_BLOCK));
		return -ECHILD;
	}

	/*
	 * Check overlay inode with the creds of task and underlying inode
	 * with creds of mounter
	 */
	err = generic_permission(&nop_mnt_idmap, inode, mask);
	if (err)
		return err;

	/*
	 * Write request on a non-special inode that has no upper inode: the
	 * real inode is checked for read access instead of write/append.
	 */
	if (!upperinode &&
	    !special_file(realinode->i_mode) && mask & MAY_WRITE) {
		mask &= ~(MAY_WRITE | MAY_APPEND);
		/* Make sure mounter can read file for copy up later */
		mask |= MAY_READ;
	}

	with_ovl_creds(inode->i_sb)
		return inode_permission(mnt_idmap(realpath.mnt), realinode, mask);
}

/*
 * ->get_link for overlay symlinks: returns ERR_PTR(-ECHILD) when called
 * without a dentry, otherwise the result of vfs_get_link() on the real
 * dentry, obtained inside a with_ovl_creds() scope.
 */
static const char *ovl_get_link(struct dentry *dentry,
				struct inode *inode,
				struct delayed_call *done)
{
	if (!dentry)
		return ERR_PTR(-ECHILD);

	with_ovl_creds(dentry->d_sb)
		return vfs_get_link(ovl_dentry_real(dentry), done);
}

#ifdef CONFIG_FS_POSIX_ACL
/*
 * Apply the idmapping of the layer to POSIX ACLs. The caller must pass a clone
 * of the POSIX ACLs retrieved from the lower layer to this function to not
 * alter the POSIX ACLs for the underlying filesystem.
 */
static void ovl_idmap_posix_acl(const struct inode *realinode,
				struct mnt_idmap *idmap,
				struct posix_acl *acl)
{
	struct user_namespace *fs_userns = i_user_ns(realinode);

	for (unsigned int i = 0; i < acl->a_count; i++) {
		vfsuid_t vfsuid;
		vfsgid_t vfsgid;

		struct posix_acl_entry *e = &acl->a_entries[i];
		/* Only ACL_USER and ACL_GROUP entries are rewritten */
		switch (e->e_tag) {
		case ACL_USER:
			vfsuid = make_vfsuid(idmap, fs_userns, e->e_uid);
			e->e_uid = vfsuid_into_kuid(vfsuid);
			break;
		case ACL_GROUP:
			vfsgid = make_vfsgid(idmap, fs_userns, e->e_gid);
			e->e_gid = vfsgid_into_kgid(vfsgid);
			break;
		}
	}
}

/*
 * The @noperm argument is used to skip permission checking and is a temporary
 * measure. Quoting Miklos from an earlier discussion:
 *
 * > So there are two paths to getting an acl:
 * > 1) permission checking and 2) retrieving the value via getxattr(2).
 * > This is a similar situation as reading a symlink vs. following it.
 * > When following a symlink overlayfs always reads the link on the
 * > underlying fs just as if it was a readlink(2) call, calling
 * > security_inode_readlink() instead of security_inode_follow_link().
 * > This is logical: we are reading the link from the underlying storage,
 * > and following it on overlayfs.
377 * > 378 * > Applying the same logic to acl: we do need to call the 379 * > security_inode_getxattr() on the underlying fs, even if just want to 380 * > check permissions on overlay. This is currently not done, which is an 381 * > inconsistency. 382 * > 383 * > Maybe adding the check to ovl_get_acl() is the right way to go, but 384 * > I'm a little afraid of a performance regression. Will look into that. 385 * 386 * Until we have made a decision allow this helper to take the @noperm 387 * argument. We should hopefully be able to remove it soon. 388 */ 389 struct posix_acl *ovl_get_acl_path(const struct path *path, 390 const char *acl_name, bool noperm) 391 { 392 struct posix_acl *real_acl, *clone; 393 struct mnt_idmap *idmap; 394 struct inode *realinode = d_inode(path->dentry); 395 396 idmap = mnt_idmap(path->mnt); 397 398 if (noperm) 399 real_acl = get_inode_acl(realinode, posix_acl_type(acl_name)); 400 else 401 real_acl = vfs_get_acl(idmap, path->dentry, acl_name); 402 if (IS_ERR_OR_NULL(real_acl)) 403 return real_acl; 404 405 if (!is_idmapped_mnt(path->mnt)) 406 return real_acl; 407 408 /* 409 * We cannot alter the ACLs returned from the relevant layer as that 410 * would alter the cached values filesystem wide for the lower 411 * filesystem. Instead we can clone the ACLs and then apply the 412 * relevant idmapping of the layer. 413 */ 414 clone = posix_acl_clone(real_acl, GFP_KERNEL); 415 posix_acl_release(real_acl); /* release original acl */ 416 if (!clone) 417 return ERR_PTR(-ENOMEM); 418 419 ovl_idmap_posix_acl(realinode, idmap, clone); 420 return clone; 421 } 422 423 /* 424 * When the relevant layer is an idmapped mount we need to take the idmapping 425 * of the layer into account and translate any ACL_{GROUP,USER} values 426 * according to the idmapped mount. 427 * 428 * We cannot alter the ACLs returned from the relevant layer as that would 429 * alter the cached values filesystem wide for the lower filesystem. 
 * Instead we can clone the ACLs and then apply the relevant idmapping of the
 * layer.
 *
 * This is obviously only relevant when idmapped layers are used.
 *
 * Returns the ACL, NULL when the real inode does not have POSIX ACLs enabled
 * (IS_POSIXACL() is false) or no ACL was found, or an ERR_PTR(): -ECHILD is
 * used when there is no real inode or when an RCU walk hits an idmapped
 * layer. @idmap is not used in this function.
 */
struct posix_acl *do_ovl_get_acl(struct mnt_idmap *idmap,
				 struct inode *inode, int type,
				 bool rcu, bool noperm)
{
	struct inode *realinode;
	struct posix_acl *acl;
	struct path realpath;

	/* Careful in RCU walk mode */
	realinode = ovl_i_path_real(inode, &realpath);
	if (!realinode) {
		WARN_ON(!rcu);
		return ERR_PTR(-ECHILD);
	}

	if (!IS_POSIXACL(realinode))
		return NULL;

	if (rcu) {
		/*
		 * If the layer is idmapped drop out of RCU path walk
		 * so we can clone the ACLs.
		 */
		if (is_idmapped_mnt(realpath.mnt))
			return ERR_PTR(-ECHILD);

		acl = get_cached_acl_rcu(realinode, type);
	} else {
		with_ovl_creds(inode->i_sb)
			acl = ovl_get_acl_path(&realpath, posix_acl_xattr_name(type), noperm);
	}

	return acl;
}

/*
 * Set @acl of @type on @dentry, or remove the ACL of that type when @acl is
 * NULL. The dentry is copied up first when it has no upper dentry yet; the
 * change is then made on the upper dentry inside a with_ovl_creds() scope.
 *
 * Returns 0 on success or a negative error.
 */
static int ovl_set_or_remove_acl(struct dentry *dentry, struct inode *inode,
				 struct posix_acl *acl, int type)
{
	int err;
	struct path realpath;
	const char *acl_name;
	struct ovl_fs *ofs = OVL_FS(dentry->d_sb);
	struct dentry *upperdentry = ovl_dentry_upper(dentry);
	struct dentry *realdentry = upperdentry ?: ovl_dentry_lower(dentry);

	/*
	 * If ACL is to be removed from a lower file, check if it exists in
	 * the first place before copying it up.
	 */
	acl_name = posix_acl_xattr_name(type);
	if (!acl && !upperdentry) {
		struct posix_acl *real_acl;

		ovl_path_lower(dentry, &realpath);
		with_ovl_creds(dentry->d_sb)
			real_acl = vfs_get_acl(mnt_idmap(realpath.mnt), realdentry, acl_name);
		if (IS_ERR(real_acl)) {
			err = PTR_ERR(real_acl);
			goto out;
		}
		posix_acl_release(real_acl);
	}

	if (!upperdentry) {
		err = ovl_copy_up(dentry);
		if (err)
			goto out;

		/* From here on operate on the freshly copied up dentry */
		realdentry = ovl_dentry_upper(dentry);
	}

	err = ovl_want_write(dentry);
	if (err)
		goto out;

	with_ovl_creds(dentry->d_sb) {
		if (acl)
			err = ovl_do_set_acl(ofs, realdentry, acl_name, acl);
		else
			err = ovl_do_remove_acl(ofs, realdentry, acl_name);
	}
	ovl_drop_write(dentry);

	/* copy c/mtime */
	ovl_copyattr(inode);
out:
	return err;
}

/*
 * ->set_acl for overlay inodes.
 *
 * Returns -EOPNOTSUPP when the workdir inode does not have POSIX ACLs
 * enabled (IS_POSIXACL() is false) or the real inode has no ->set_acl, and
 * -EPERM when inode_owner_or_capable() fails for the overlay inode. A
 * default ACL on a non-directory yields -EACCES when setting and 0 when
 * removing. Otherwise the result of ovl_set_or_remove_acl() is returned.
 * @idmap is not used in this function.
 */
int ovl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		struct posix_acl *acl, int type)
{
	int err;
	struct inode *inode = d_inode(dentry);
	struct dentry *workdir = ovl_workdir(dentry);
	struct inode *realinode = ovl_inode_real(inode);

	if (!IS_POSIXACL(d_inode(workdir)))
		return -EOPNOTSUPP;
	if (!realinode->i_op->set_acl)
		return -EOPNOTSUPP;
	if (type == ACL_TYPE_DEFAULT && !S_ISDIR(inode->i_mode))
		return acl ? -EACCES : 0;
	if (!inode_owner_or_capable(&nop_mnt_idmap, inode))
		return -EPERM;

	/*
	 * Check if sgid bit needs to be cleared (actual setacl operation will
	 * be done with mounter's capabilities and so that won't do it for us).
	 */
	if (unlikely(inode->i_mode & S_ISGID) && type == ACL_TYPE_ACCESS &&
	    !in_group_p(inode->i_gid) &&
	    !capable_wrt_inode_uidgid(&nop_mnt_idmap, inode, CAP_FSETID)) {
		struct iattr iattr = { .ia_valid = ATTR_KILL_SGID };

		err = ovl_setattr(&nop_mnt_idmap, dentry, &iattr);
		if (err)
			return err;
	}

	return ovl_set_or_remove_acl(dentry, inode, acl, type);
}
#endif

/*
 * ->update_time for overlay inodes.
 *
 * Only FS_UPD_ATIME is acted upon, and only when the inode has an upper
 * dentry: atime is touched on the upper path and then copied into the
 * overlay inode. In that case -EAGAIN is returned instead when IOCB_NOWAIT
 * is set in @flags. Every other case returns 0 without doing anything.
 */
int ovl_update_time(struct inode *inode, enum fs_update_time type,
		    unsigned int flags)
{
	if (type == FS_UPD_ATIME) {
		struct ovl_fs *ofs = OVL_FS(inode->i_sb);
		struct path upperpath = {
			.mnt = ovl_upper_mnt(ofs),
			.dentry = ovl_upperdentry_dereference(OVL_I(inode)),
		};

		if (upperpath.dentry) {
			if (flags & IOCB_NOWAIT)
				return -EAGAIN;
			touch_atime(&upperpath);
			inode_set_atime_to_ts(inode,
					      inode_get_atime(d_inode(upperpath.dentry)));
		}
	}
	return 0;
}

/*
 * ->fiemap for overlay regular files: forward the request to the ->fiemap
 * of the real data inode inside a with_ovl_creds() scope.
 *
 * Returns -EIO when there is no real data inode and -EOPNOTSUPP when that
 * inode has no ->fiemap.
 */
static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
		      u64 start, u64 len)
{
	struct inode *realinode = ovl_inode_realdata(inode);

	if (!realinode)
		return -EIO;

	if (!realinode->i_op->fiemap)
		return -EOPNOTSUPP;

	with_ovl_creds(inode->i_sb)
		return realinode->i_op->fiemap(realinode, fieinfo, start, len);
}

/*
 * Work around the fact that security_file_ioctl() takes a file argument.
 * Introducing security_inode_fileattr_get/set() hooks would solve this issue
 * properly.
 *
 * @realpath is opened read-only just for the duration of the check. The
 * ioctl command passed to security_file_ioctl() is chosen from @set and
 * fa->fsx_valid. Returns the error of dentry_open() or the result of
 * security_file_ioctl().
 */
static int ovl_security_fileattr(const struct path *realpath, struct file_kattr *fa,
				 bool set)
{
	struct file *file;
	unsigned int cmd;
	int err;
	unsigned int flags;

	flags = O_RDONLY;
	if (force_o_largefile())
		flags |= O_LARGEFILE;

	file = dentry_open(realpath, flags, current_cred());
	if (IS_ERR(file))
		return PTR_ERR(file);

	if (set)
		cmd = fa->fsx_valid ? FS_IOC_FSSETXATTR : FS_IOC_SETFLAGS;
	else
		cmd = fa->fsx_valid ? FS_IOC_FSGETXATTR : FS_IOC_GETFLAGS;

	err = security_file_ioctl(file, cmd, 0);
	fput(file);

	return err;
}

/*
 * Run the security check for a fileattr "set" on @realpath and, if it
 * passes, apply @fa with vfs_fileattr_set().
 */
int ovl_real_fileattr_set(const struct path *realpath, struct file_kattr *fa)
{
	int err;

	err = ovl_security_fileattr(realpath, fa, true);
	if (err)
		return err;

	return vfs_fileattr_set(mnt_idmap(realpath->mnt), realpath->dentry, fa);
}

/*
 * ->fileattr_set for overlay inodes.
 *
 * Copies the dentry up, then stores @fa on the real path inside a
 * with_ovl_creds() scope and finally refreshes the overlay inode flags and
 * attributes. The refresh also runs when storing @fa failed. Returns 0 or
 * the first error encountered. @idmap is not used in this function.
 */
int ovl_fileattr_set(struct mnt_idmap *idmap,
		     struct dentry *dentry, struct file_kattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct path upperpath;
	unsigned int flags;
	int err;

	err = ovl_copy_up(dentry);
	if (!err) {
		ovl_path_real(dentry, &upperpath);

		err = ovl_want_write(dentry);
		if (err)
			goto out;

		with_ovl_creds(inode->i_sb) {
			/*
			 * Store immutable/append-only flags in xattr and clear them
			 * in upper fileattr (in case they were set by older kernel)
			 * so children of "ovl-immutable" directories lower aliases of
			 * "ovl-immutable" hardlinks could be copied up.
			 * Clear xattr when flags are cleared.
			 */
			err = ovl_set_protattr(inode, upperpath.dentry, fa);
			if (!err)
				err = ovl_real_fileattr_set(&upperpath, fa);
		}
		ovl_drop_write(dentry);

		/*
		 * Merge real inode flags with inode flags read from
		 * overlay.protattr xattr
		 */
		flags = ovl_inode_real(inode)->i_flags & OVL_COPY_I_FLAGS_MASK;

		BUILD_BUG_ON(OVL_PROT_I_FLAGS_MASK & ~OVL_COPY_I_FLAGS_MASK);
		flags |= inode->i_flags & OVL_PROT_I_FLAGS_MASK;
		inode_set_flags(inode, flags, OVL_COPY_I_FLAGS_MASK);

		/* Update ctime */
		ovl_copyattr(inode);
	}
out:
	return err;
}

/* Convert inode protection flags to fileattr flags */
static void ovl_fileattr_prot_flags(struct inode *inode, struct file_kattr *fa)
{
	BUILD_BUG_ON(OVL_PROT_FS_FLAGS_MASK & ~FS_COMMON_FL);
	BUILD_BUG_ON(OVL_PROT_FSX_FLAGS_MASK & ~FS_XFLAG_COMMON);

	/* Flags are only ever added to @fa here, never cleared */
	if (inode->i_flags & S_APPEND) {
		fa->flags |= FS_APPEND_FL;
		fa->fsx_xflags |= FS_XFLAG_APPEND;
	}
	if (inode->i_flags & S_IMMUTABLE) {
		fa->flags |= FS_IMMUTABLE_FL;
		fa->fsx_xflags |= FS_XFLAG_IMMUTABLE;
	}
}

/*
 * Run the security check for a fileattr "get" on @realpath and, if it
 * passes, read the attributes with vfs_fileattr_get(). An -ENOIOCTLCMD
 * result is reported as -ENOTTY.
 */
int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa)
{
	int err;

	err = ovl_security_fileattr(realpath, fa, false);
	if (err)
		return err;

	err = vfs_fileattr_get(realpath->dentry, fa);
	if (err == -ENOIOCTLCMD)
		err = -ENOTTY;
	return err;
}

/*
 * ->fileattr_get for overlay inodes: read the attributes of the real path
 * inside a with_ovl_creds() scope, then add the protection flags of the
 * overlay inode. The protection flags are added whether or not reading the
 * real attributes succeeded; the result of that read is returned.
 */
int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa)
{
	struct inode *inode = d_inode(dentry);
	struct path realpath;
	int err;

	ovl_path_real(dentry, &realpath);

	with_ovl_creds(inode->i_sb)
		err = ovl_real_fileattr_get(&realpath, fa);
	ovl_fileattr_prot_flags(inode, fa);

	return err;
}

/* Inode operations installed by ovl_fill_inode() for regular files */
static const struct inode_operations ovl_file_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_inode_acl	= ovl_get_inode_acl,
	.get_acl	= ovl_get_acl,
	.set_acl	= ovl_set_acl,
	.update_time	= ovl_update_time,
	.fiemap		= ovl_fiemap,
	.fileattr_get	= ovl_fileattr_get,
	.fileattr_set	= ovl_fileattr_set,
};

/* Inode operations installed by ovl_fill_inode() for symlinks */
static const struct inode_operations ovl_symlink_inode_operations = {
	.setattr	= ovl_setattr,
	.get_link	= ovl_get_link,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.update_time	= ovl_update_time,
};

/*
 * Inode operations installed by ovl_fill_inode() for every file type other
 * than regular file, directory and symlink
 */
static const struct inode_operations ovl_special_inode_operations = {
	.setattr	= ovl_setattr,
	.permission	= ovl_permission,
	.getattr	= ovl_getattr,
	.listxattr	= ovl_listxattr,
	.get_inode_acl	= ovl_get_inode_acl,
	.get_acl	= ovl_get_acl,
	.set_acl	= ovl_set_acl,
	.update_time	= ovl_update_time,
};

/* Address space operations installed by ovl_fill_inode() for regular files */
static const struct address_space_operations ovl_aops = {
	/* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
	.direct_IO		= noop_direct_IO,
};

/*
 * It is possible to stack overlayfs instance on top of another
 * overlayfs instance as lower layer. We need to annotate the
 * stackable i_mutex locks according to stack level of the super
 * block instance. An overlayfs instance can never be in stack
 * depth 0 (there is always a real fs below it). An overlayfs
 * inode lock will use the lockdep annotation ovl_i_mutex_key[depth].
 *
 * For example, here is a snip from /proc/lockdep_chains after
 * dir_iterate of nested overlayfs:
 *
 * [...] &ovl_i_mutex_dir_key[depth]   (stack_depth=2)
 * [...] &ovl_i_mutex_dir_key[depth]#2 (stack_depth=1)
 * [...] &type->i_mutex_dir_key        (stack_depth=0)
 *
 * Locking order w.r.t ovl_want_write() is important for nested overlayfs.
783 * 784 * This chain is valid: 785 * - inode->i_rwsem (inode_lock[2]) 786 * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) 787 * - OVL_I(inode)->lock (ovl_inode_lock[2]) 788 * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) 789 * 790 * And this chain is valid: 791 * - inode->i_rwsem (inode_lock[2]) 792 * - OVL_I(inode)->lock (ovl_inode_lock[2]) 793 * - lowerinode->i_rwsem (inode_lock[1]) 794 * - OVL_I(lowerinode)->lock (ovl_inode_lock[1]) 795 * 796 * But lowerinode->i_rwsem SHOULD NOT be acquired while ovl_want_write() is 797 * held, because it is in reverse order of the non-nested case using the same 798 * upper fs: 799 * - inode->i_rwsem (inode_lock[1]) 800 * - upper_mnt->mnt_sb->s_writers (ovl_want_write[0]) 801 * - OVL_I(inode)->lock (ovl_inode_lock[1]) 802 */ 803 #define OVL_MAX_NESTING FILESYSTEM_MAX_STACK_DEPTH 804 805 static inline void ovl_lockdep_annotate_inode_mutex_key(struct inode *inode) 806 { 807 #ifdef CONFIG_LOCKDEP 808 static struct lock_class_key ovl_i_mutex_key[OVL_MAX_NESTING]; 809 static struct lock_class_key ovl_i_mutex_dir_key[OVL_MAX_NESTING]; 810 static struct lock_class_key ovl_i_lock_key[OVL_MAX_NESTING]; 811 812 int depth = inode->i_sb->s_stack_depth - 1; 813 814 if (WARN_ON_ONCE(depth < 0 || depth >= OVL_MAX_NESTING)) 815 depth = 0; 816 817 if (S_ISDIR(inode->i_mode)) 818 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_dir_key[depth]); 819 else 820 lockdep_set_class(&inode->i_rwsem, &ovl_i_mutex_key[depth]); 821 822 lockdep_set_class(&OVL_I(inode)->lock, &ovl_i_lock_key[depth]); 823 #endif 824 } 825 826 static void ovl_next_ino(struct inode *inode) 827 { 828 struct ovl_fs *ofs = OVL_FS(inode->i_sb); 829 830 inode->i_ino = atomic_long_inc_return(&ofs->last_ino); 831 if (unlikely(!inode->i_ino)) 832 inode->i_ino = atomic_long_inc_return(&ofs->last_ino); 833 } 834 835 static void ovl_map_ino(struct inode *inode, unsigned long ino, int fsid) 836 { 837 struct ovl_fs *ofs = OVL_FS(inode->i_sb); 838 int xinobits = ovl_xino_bits(ofs); 839 
unsigned int xinoshift = 64 - xinobits; 840 841 /* 842 * When d_ino is consistent with st_ino (samefs or i_ino has enough 843 * bits to encode layer), set the same value used for st_ino to i_ino, 844 * so inode number exposed via /proc/locks and a like will be 845 * consistent with d_ino and st_ino values. An i_ino value inconsistent 846 * with d_ino also causes nfsd readdirplus to fail. 847 */ 848 inode->i_ino = ino; 849 if (ovl_same_fs(ofs)) { 850 return; 851 } else if (xinobits && likely(!(ino >> xinoshift))) { 852 inode->i_ino |= (unsigned long)fsid << (xinoshift + 1); 853 return; 854 } 855 856 /* 857 * For directory inodes on non-samefs with xino disabled or xino 858 * overflow, we allocate a non-persistent inode number, to be used for 859 * resolving st_ino collisions in ovl_map_dev_ino(). 860 * 861 * To avoid ino collision with legitimate xino values from upper 862 * layer (fsid 0), use the lowest xinobit to map the non 863 * persistent inode numbers to the unified st_ino address space. 
864 */ 865 if (S_ISDIR(inode->i_mode)) { 866 ovl_next_ino(inode); 867 if (xinobits) { 868 inode->i_ino &= ~0UL >> xinobits; 869 inode->i_ino |= 1UL << xinoshift; 870 } 871 } 872 } 873 874 void ovl_inode_init(struct inode *inode, struct ovl_inode_params *oip, 875 unsigned long ino, int fsid) 876 { 877 struct inode *realinode; 878 struct ovl_inode *oi = OVL_I(inode); 879 880 oi->__upperdentry = oip->upperdentry; 881 oi->oe = oip->oe; 882 oi->redirect = oip->redirect; 883 oi->lowerdata_redirect = oip->lowerdata_redirect; 884 885 realinode = ovl_inode_real(inode); 886 ovl_copyattr(inode); 887 ovl_copyflags(realinode, inode); 888 ovl_map_ino(inode, ino, fsid); 889 } 890 891 static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) 892 { 893 inode->i_mode = mode; 894 inode->i_flags |= S_NOCMTIME; 895 #ifdef CONFIG_FS_POSIX_ACL 896 inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; 897 #endif 898 899 ovl_lockdep_annotate_inode_mutex_key(inode); 900 901 switch (mode & S_IFMT) { 902 case S_IFREG: 903 inode->i_op = &ovl_file_inode_operations; 904 inode->i_fop = &ovl_file_operations; 905 inode->i_mapping->a_ops = &ovl_aops; 906 break; 907 908 case S_IFDIR: 909 inode->i_op = &ovl_dir_inode_operations; 910 inode->i_fop = &ovl_dir_operations; 911 break; 912 913 case S_IFLNK: 914 inode->i_op = &ovl_symlink_inode_operations; 915 break; 916 917 default: 918 inode->i_op = &ovl_special_inode_operations; 919 init_special_inode(inode, mode, rdev); 920 break; 921 } 922 } 923 924 /* 925 * With inodes index enabled, an overlay inode nlink counts the union of upper 926 * hardlinks and non-covered lower hardlinks. During the lifetime of a non-pure 927 * upper inode, the following nlink modifying operations can happen: 928 * 929 * 1. Lower hardlink copy up 930 * 2. Upper hardlink created, unlinked or renamed over 931 * 3. 
Lower hardlink whiteout or renamed over 932 * 933 * For the first, copy up case, the union nlink does not change, whether the 934 * operation succeeds or fails, but the upper inode nlink may change. 935 * Therefore, before copy up, we store the union nlink value relative to the 936 * lower inode nlink in the index inode xattr .overlay.nlink. 937 * 938 * For the second, upper hardlink case, the union nlink should be incremented 939 * or decremented IFF the operation succeeds, aligned with nlink change of the 940 * upper inode. Therefore, before link/unlink/rename, we store the union nlink 941 * value relative to the upper inode nlink in the index inode. 942 * 943 * For the last, lower cover up case, we simplify things by preceding the 944 * whiteout or cover up with copy up. This makes sure that there is an index 945 * upper inode where the nlink xattr can be stored before the copied up upper 946 * entry is unlink. 947 */ 948 #define OVL_NLINK_ADD_UPPER (1 << 0) 949 950 /* 951 * On-disk format for indexed nlink: 952 * 953 * nlink relative to the upper inode - "U[+-]NUM" 954 * nlink relative to the lower inode - "L[+-]NUM" 955 */ 956 957 static int ovl_set_nlink_common(struct dentry *dentry, 958 struct dentry *realdentry, const char *format) 959 { 960 struct inode *inode = d_inode(dentry); 961 struct inode *realinode = d_inode(realdentry); 962 char buf[13]; 963 int len; 964 965 len = snprintf(buf, sizeof(buf), format, 966 (int) (inode->i_nlink - realinode->i_nlink)); 967 968 if (WARN_ON(len >= sizeof(buf))) 969 return -EIO; 970 971 return ovl_setxattr(OVL_FS(inode->i_sb), ovl_dentry_upper(dentry), 972 OVL_XATTR_NLINK, buf, len); 973 } 974 975 int ovl_set_nlink_upper(struct dentry *dentry) 976 { 977 return ovl_set_nlink_common(dentry, ovl_dentry_upper(dentry), "U%+i"); 978 } 979 980 int ovl_set_nlink_lower(struct dentry *dentry) 981 { 982 return ovl_set_nlink_common(dentry, ovl_dentry_lower(dentry), "L%+i"); 983 } 984 985 unsigned int ovl_get_nlink(struct ovl_fs *ofs, 
struct dentry *lowerdentry, 986 struct dentry *upperdentry, 987 unsigned int fallback) 988 { 989 int nlink_diff; 990 int nlink; 991 char buf[13]; 992 int err; 993 994 if (!lowerdentry || !upperdentry || d_inode(lowerdentry)->i_nlink == 1) 995 return fallback; 996 997 err = ovl_getxattr_upper(ofs, upperdentry, OVL_XATTR_NLINK, 998 &buf, sizeof(buf) - 1); 999 if (err < 0) 1000 goto fail; 1001 1002 buf[err] = '\0'; 1003 if ((buf[0] != 'L' && buf[0] != 'U') || 1004 (buf[1] != '+' && buf[1] != '-')) 1005 goto fail; 1006 1007 err = kstrtoint(buf + 1, 10, &nlink_diff); 1008 if (err < 0) 1009 goto fail; 1010 1011 nlink = d_inode(buf[0] == 'L' ? lowerdentry : upperdentry)->i_nlink; 1012 nlink += nlink_diff; 1013 1014 if (nlink <= 0) 1015 goto fail; 1016 1017 return nlink; 1018 1019 fail: 1020 pr_warn_ratelimited("failed to get index nlink (%pd2, err=%i)\n", 1021 upperdentry, err); 1022 return fallback; 1023 } 1024 1025 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) 1026 { 1027 struct inode *inode; 1028 1029 inode = new_inode(sb); 1030 if (inode) 1031 ovl_fill_inode(inode, mode, rdev); 1032 1033 return inode; 1034 } 1035 1036 static int ovl_inode_test(struct inode *inode, void *data) 1037 { 1038 return inode->i_private == data; 1039 } 1040 1041 static int ovl_inode_set(struct inode *inode, void *data) 1042 { 1043 inode->i_private = data; 1044 return 0; 1045 } 1046 1047 static bool ovl_verify_inode(struct inode *inode, struct dentry *lowerdentry, 1048 struct dentry *upperdentry, bool strict) 1049 { 1050 /* 1051 * For directories, @strict verify from lookup path performs consistency 1052 * checks, so NULL lower/upper in dentry must match NULL lower/upper in 1053 * inode. Non @strict verify from NFS handle decode path passes NULL for 1054 * 'unknown' lower/upper. 1055 */ 1056 if (S_ISDIR(inode->i_mode) && strict) { 1057 /* Real lower dir moved to upper layer under us? 
*/ 1058 if (!lowerdentry && ovl_inode_lower(inode)) 1059 return false; 1060 1061 /* Lookup of an uncovered redirect origin? */ 1062 if (!upperdentry && ovl_inode_upper(inode)) 1063 return false; 1064 } 1065 1066 /* 1067 * Allow non-NULL lower inode in ovl_inode even if lowerdentry is NULL. 1068 * This happens when finding a copied up overlay inode for a renamed 1069 * or hardlinked overlay dentry and lower dentry cannot be followed 1070 * by origin because lower fs does not support file handles. 1071 */ 1072 if (lowerdentry && ovl_inode_lower(inode) != d_inode(lowerdentry)) 1073 return false; 1074 1075 /* 1076 * Allow non-NULL __upperdentry in inode even if upperdentry is NULL. 1077 * This happens when finding a lower alias for a copied up hard link. 1078 */ 1079 if (upperdentry && ovl_inode_upper(inode) != d_inode(upperdentry)) 1080 return false; 1081 1082 return true; 1083 } 1084 1085 struct inode *ovl_lookup_inode(struct super_block *sb, struct dentry *real, 1086 bool is_upper) 1087 { 1088 struct inode *inode, *key = d_inode(real); 1089 1090 inode = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); 1091 if (!inode) 1092 return NULL; 1093 1094 if (!ovl_verify_inode(inode, is_upper ? NULL : real, 1095 is_upper ? real : NULL, false)) { 1096 iput(inode); 1097 return ERR_PTR(-ESTALE); 1098 } 1099 1100 return inode; 1101 } 1102 1103 bool ovl_lookup_trap_inode(struct super_block *sb, struct dentry *dir) 1104 { 1105 struct inode *key = d_inode(dir); 1106 struct inode *trap; 1107 bool res; 1108 1109 trap = ilookup5(sb, (unsigned long) key, ovl_inode_test, key); 1110 if (!trap) 1111 return false; 1112 1113 res = IS_DEADDIR(trap) && !ovl_inode_upper(trap) && 1114 !ovl_inode_lower(trap); 1115 1116 iput(trap); 1117 return res; 1118 } 1119 1120 /* 1121 * Create an inode cache entry for layer root dir, that will intentionally 1122 * fail ovl_verify_inode(), so any lookup that will find some layer root 1123 * will fail. 
1124 */ 1125 struct inode *ovl_get_trap_inode(struct super_block *sb, struct dentry *dir) 1126 { 1127 struct inode *key = d_inode(dir); 1128 struct inode *trap; 1129 1130 if (!d_is_dir(dir)) 1131 return ERR_PTR(-ENOTDIR); 1132 1133 trap = iget5_locked(sb, (unsigned long) key, ovl_inode_test, 1134 ovl_inode_set, key); 1135 if (!trap) 1136 return ERR_PTR(-ENOMEM); 1137 1138 if (!(inode_state_read_once(trap) & I_NEW)) { 1139 /* Conflicting layer roots? */ 1140 iput(trap); 1141 return ERR_PTR(-ELOOP); 1142 } 1143 1144 trap->i_mode = S_IFDIR; 1145 trap->i_flags = S_DEAD; 1146 unlock_new_inode(trap); 1147 1148 return trap; 1149 } 1150 1151 /* 1152 * Does overlay inode need to be hashed by lower inode? 1153 */ 1154 static bool ovl_hash_bylower(struct super_block *sb, struct dentry *upper, 1155 struct dentry *lower, bool index) 1156 { 1157 struct ovl_fs *ofs = OVL_FS(sb); 1158 1159 /* No, if pure upper */ 1160 if (!lower) 1161 return false; 1162 1163 /* Yes, if already indexed */ 1164 if (index) 1165 return true; 1166 1167 /* Yes, if won't be copied up */ 1168 if (!ovl_upper_mnt(ofs)) 1169 return true; 1170 1171 /* No, if lower hardlink is or will be broken on copy up */ 1172 if ((upper || !ovl_indexdir(sb)) && 1173 !d_is_dir(lower) && d_inode(lower)->i_nlink > 1) 1174 return false; 1175 1176 /* No, if non-indexed upper with NFS export */ 1177 if (ofs->config.nfs_export && upper) 1178 return false; 1179 1180 /* Otherwise, hash by lower inode for fsnotify */ 1181 return true; 1182 } 1183 1184 static struct inode *ovl_iget5(struct super_block *sb, struct inode *newinode, 1185 struct inode *key) 1186 { 1187 return newinode ? 
inode_insert5(newinode, (unsigned long) key, 1188 ovl_inode_test, ovl_inode_set, key) : 1189 iget5_locked(sb, (unsigned long) key, 1190 ovl_inode_test, ovl_inode_set, key); 1191 } 1192 1193 struct inode *ovl_get_inode(struct super_block *sb, 1194 struct ovl_inode_params *oip) 1195 { 1196 struct ovl_fs *ofs = OVL_FS(sb); 1197 struct dentry *upperdentry = oip->upperdentry; 1198 struct ovl_path *lowerpath = ovl_lowerpath(oip->oe); 1199 struct inode *realinode = upperdentry ? d_inode(upperdentry) : NULL; 1200 struct inode *inode; 1201 struct dentry *lowerdentry = lowerpath ? lowerpath->dentry : NULL; 1202 struct path realpath = { 1203 .dentry = upperdentry ?: lowerdentry, 1204 .mnt = upperdentry ? ovl_upper_mnt(ofs) : lowerpath->layer->mnt, 1205 }; 1206 bool bylower = ovl_hash_bylower(sb, upperdentry, lowerdentry, 1207 oip->index); 1208 int fsid = bylower ? lowerpath->layer->fsid : 0; 1209 bool is_dir; 1210 unsigned long ino = 0; 1211 int err = oip->newinode ? -EEXIST : -ENOMEM; 1212 1213 if (!realinode) 1214 realinode = d_inode(lowerdentry); 1215 1216 /* 1217 * Copy up origin (lower) may exist for non-indexed upper, but we must 1218 * not use lower as hash key if this is a broken hardlink. 1219 */ 1220 is_dir = S_ISDIR(realinode->i_mode); 1221 if (upperdentry || bylower) { 1222 struct inode *key = d_inode(bylower ? lowerdentry : 1223 upperdentry); 1224 unsigned int nlink = is_dir ? 1 : realinode->i_nlink; 1225 1226 inode = ovl_iget5(sb, oip->newinode, key); 1227 if (!inode) 1228 goto out_err; 1229 if (!(inode_state_read_once(inode) & I_NEW)) { 1230 /* 1231 * Verify that the underlying files stored in the inode 1232 * match those in the dentry. 
1233 */ 1234 if (!ovl_verify_inode(inode, lowerdentry, upperdentry, 1235 true)) { 1236 iput(inode); 1237 err = -ESTALE; 1238 goto out_err; 1239 } 1240 1241 dput(upperdentry); 1242 ovl_free_entry(oip->oe); 1243 kfree(oip->redirect); 1244 kfree(oip->lowerdata_redirect); 1245 goto out; 1246 } 1247 1248 /* Recalculate nlink for non-dir due to indexing */ 1249 if (!is_dir) 1250 nlink = ovl_get_nlink(ofs, lowerdentry, upperdentry, 1251 nlink); 1252 set_nlink(inode, nlink); 1253 ino = key->i_ino; 1254 } else { 1255 /* Lower hardlink that will be broken on copy up */ 1256 inode = new_inode(sb); 1257 if (!inode) { 1258 err = -ENOMEM; 1259 goto out_err; 1260 } 1261 ino = realinode->i_ino; 1262 fsid = lowerpath->layer->fsid; 1263 } 1264 ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); 1265 ovl_inode_init(inode, oip, ino, fsid); 1266 WARN_ON_ONCE(!!IS_CASEFOLDED(inode) != ofs->casefold); 1267 1268 if (upperdentry && ovl_is_impuredir(sb, upperdentry)) 1269 ovl_set_flag(OVL_IMPURE, inode); 1270 1271 if (oip->index) 1272 ovl_set_flag(OVL_INDEX, inode); 1273 1274 if (bylower) 1275 ovl_set_flag(OVL_CONST_INO, inode); 1276 1277 /* Check for non-merge dir that may have whiteouts */ 1278 if (is_dir) { 1279 if (((upperdentry && lowerdentry) || ovl_numlower(oip->oe) > 1) || 1280 ovl_path_check_origin_xattr(ofs, &realpath)) { 1281 ovl_set_flag(OVL_WHITEOUTS, inode); 1282 } 1283 } 1284 1285 /* Check for immutable/append-only inode flags in xattr */ 1286 if (upperdentry) 1287 ovl_check_protattr(inode, upperdentry); 1288 1289 if (inode_state_read_once(inode) & I_NEW) 1290 unlock_new_inode(inode); 1291 out: 1292 return inode; 1293 1294 out_err: 1295 pr_warn_ratelimited("failed to get inode (%i)\n", err); 1296 inode = ERR_PTR(err); 1297 goto out; 1298 } 1299