1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * fs/libfs.c 4 * Library for filesystems writers. 5 */ 6 7 #include <linux/blkdev.h> 8 #include <linux/export.h> 9 #include <linux/pagemap.h> 10 #include <linux/slab.h> 11 #include <linux/cred.h> 12 #include <linux/mount.h> 13 #include <linux/vfs.h> 14 #include <linux/quotaops.h> 15 #include <linux/mutex.h> 16 #include <linux/namei.h> 17 #include <linux/exportfs.h> 18 #include <linux/iversion.h> 19 #include <linux/writeback.h> 20 #include <linux/buffer_head.h> /* sync_mapping_buffers */ 21 #include <linux/fs_context.h> 22 #include <linux/pseudo_fs.h> 23 #include <linux/fsnotify.h> 24 #include <linux/unicode.h> 25 #include <linux/fscrypt.h> 26 #include <linux/pidfs.h> 27 28 #include <linux/uaccess.h> 29 30 #include "internal.h" 31 32 int simple_getattr(struct mnt_idmap *idmap, const struct path *path, 33 struct kstat *stat, u32 request_mask, 34 unsigned int query_flags) 35 { 36 struct inode *inode = d_inode(path->dentry); 37 generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); 38 stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); 39 return 0; 40 } 41 EXPORT_SYMBOL(simple_getattr); 42 43 int simple_statfs(struct dentry *dentry, struct kstatfs *buf) 44 { 45 u64 id = huge_encode_dev(dentry->d_sb->s_dev); 46 47 buf->f_fsid = u64_to_fsid(id); 48 buf->f_type = dentry->d_sb->s_magic; 49 buf->f_bsize = PAGE_SIZE; 50 buf->f_namelen = NAME_MAX; 51 return 0; 52 } 53 EXPORT_SYMBOL(simple_statfs); 54 55 /* 56 * Retaining negative dentries for an in-memory filesystem just wastes 57 * memory and lookup time: arrange for them to be deleted immediately. 58 */ 59 int always_delete_dentry(const struct dentry *dentry) 60 { 61 return 1; 62 } 63 EXPORT_SYMBOL(always_delete_dentry); 64 65 const struct dentry_operations simple_dentry_operations = { 66 .d_delete = always_delete_dentry, 67 }; 68 EXPORT_SYMBOL(simple_dentry_operations); 69 70 /* 71 * Lookup the data. 
This is trivial - if the dentry didn't already 72 * exist, we know it is negative. Set d_op to delete negative dentries. 73 */ 74 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 75 { 76 if (dentry->d_name.len > NAME_MAX) 77 return ERR_PTR(-ENAMETOOLONG); 78 if (!dentry->d_sb->s_d_op) 79 d_set_d_op(dentry, &simple_dentry_operations); 80 81 if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) 82 return NULL; 83 84 d_add(dentry, NULL); 85 return NULL; 86 } 87 EXPORT_SYMBOL(simple_lookup); 88 89 int dcache_dir_open(struct inode *inode, struct file *file) 90 { 91 file->private_data = d_alloc_cursor(file->f_path.dentry); 92 93 return file->private_data ? 0 : -ENOMEM; 94 } 95 EXPORT_SYMBOL(dcache_dir_open); 96 97 int dcache_dir_close(struct inode *inode, struct file *file) 98 { 99 dput(file->private_data); 100 return 0; 101 } 102 EXPORT_SYMBOL(dcache_dir_close); 103 104 /* parent is locked at least shared */ 105 /* 106 * Returns an element of siblings' list. 107 * We are looking for <count>th positive after <p>; if 108 * found, dentry is grabbed and returned to caller. 109 * If no such element exists, NULL is returned. 
110 */ 111 static struct dentry *scan_positives(struct dentry *cursor, 112 struct hlist_node **p, 113 loff_t count, 114 struct dentry *last) 115 { 116 struct dentry *dentry = cursor->d_parent, *found = NULL; 117 118 spin_lock(&dentry->d_lock); 119 while (*p) { 120 struct dentry *d = hlist_entry(*p, struct dentry, d_sib); 121 p = &d->d_sib.next; 122 // we must at least skip cursors, to avoid livelocks 123 if (d->d_flags & DCACHE_DENTRY_CURSOR) 124 continue; 125 if (simple_positive(d) && !--count) { 126 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 127 if (simple_positive(d)) 128 found = dget_dlock(d); 129 spin_unlock(&d->d_lock); 130 if (likely(found)) 131 break; 132 count = 1; 133 } 134 if (need_resched()) { 135 if (!hlist_unhashed(&cursor->d_sib)) 136 __hlist_del(&cursor->d_sib); 137 hlist_add_behind(&cursor->d_sib, &d->d_sib); 138 p = &cursor->d_sib.next; 139 spin_unlock(&dentry->d_lock); 140 cond_resched(); 141 spin_lock(&dentry->d_lock); 142 } 143 } 144 spin_unlock(&dentry->d_lock); 145 dput(last); 146 return found; 147 } 148 149 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) 150 { 151 struct dentry *dentry = file->f_path.dentry; 152 switch (whence) { 153 case 1: 154 offset += file->f_pos; 155 fallthrough; 156 case 0: 157 if (offset >= 0) 158 break; 159 fallthrough; 160 default: 161 return -EINVAL; 162 } 163 if (offset != file->f_pos) { 164 struct dentry *cursor = file->private_data; 165 struct dentry *to = NULL; 166 167 inode_lock_shared(dentry->d_inode); 168 169 if (offset > 2) 170 to = scan_positives(cursor, &dentry->d_children.first, 171 offset - 2, NULL); 172 spin_lock(&dentry->d_lock); 173 hlist_del_init(&cursor->d_sib); 174 if (to) 175 hlist_add_behind(&cursor->d_sib, &to->d_sib); 176 spin_unlock(&dentry->d_lock); 177 dput(to); 178 179 file->f_pos = offset; 180 181 inode_unlock_shared(dentry->d_inode); 182 } 183 return offset; 184 } 185 EXPORT_SYMBOL(dcache_dir_lseek); 186 187 /* 188 * Directory is locked and all positive 
dentries in it are safe, since
 * for ramfs-type trees they can't go away without unlink() or rmdir(),
 * both impossible due to the lock on directory.
 */

int dcache_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dentry = file->f_path.dentry;
	struct dentry *cursor = file->private_data;
	struct dentry *next = NULL;
	struct hlist_node **p;

	if (!dir_emit_dots(file, ctx))
		return 0;

	/* position 2 starts at the first child; otherwise resume after the cursor */
	if (ctx->pos == 2)
		p = &dentry->d_children.first;
	else
		p = &cursor->d_sib.next;

	/* each call hands the previous entry back to scan_positives() to be put */
	while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
			      d_inode(next)->i_ino,
			      fs_umode_to_dtype(d_inode(next)->i_mode)))
			break;
		ctx->pos++;
		p = &next->d_sib.next;
	}
	/*
	 * Leave the cursor in front of the entry that was not emitted, or
	 * unhashed if the end of the list was reached.
	 */
	spin_lock(&dentry->d_lock);
	hlist_del_init(&cursor->d_sib);
	if (next)
		hlist_add_before(&cursor->d_sib, &next->d_sib);
	spin_unlock(&dentry->d_lock);
	dput(next);

	return 0;
}
EXPORT_SYMBOL(dcache_readdir);

/* ->read for directories: always fails with -EISDIR. */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
	return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

/* file operations for directories backed by the dcache child list */
const struct file_operations simple_dir_operations = {
	.open = dcache_dir_open,
	.release = dcache_dir_close,
	.llseek = dcache_dir_lseek,
	.read = generic_read_dir,
	.iterate_shared = dcache_readdir,
	.fsync = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

/* inode operations for a directory that only supports lookup */
const struct inode_operations simple_dir_inode_operations = {
	.lookup = simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

/* 0 is '.', 1 is '..', so always start with offset 2 or more */
enum {
	DIR_OFFSET_MIN = 2,
};

/* The directory offset of a dentry is kept in its ->d_fsdata. */
static void offset_set(struct dentry *dentry, long offset)
{
	dentry->d_fsdata = (void *)offset;
}

/* Read back the offset stored by offset_set(); 0 means "none assigned". */
static long dentry2offset(struct dentry *dentry)
{
	return (long)dentry->d_fsdata;
}

static struct lock_class_key simple_offset_lock_class;

/**
 * simple_offset_init - initialize an offset_ctx
 * @octx: directory offset map to be initialized
 *
 */
void simple_offset_init(struct offset_ctx *octx)
{
	mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE);
	lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class);
	octx->next_offset = DIR_OFFSET_MIN;
}

/**
 * simple_offset_add - Add an entry to a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: new dentry being added
 *
 * Returns zero on success. @octx and the dentry's offset are updated.
 * Otherwise, a negative errno value is returned.
 */
int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry)
{
	unsigned long offset;
	int ret;

	/* a non-zero stored offset means the dentry is already in a map */
	if (dentry2offset(dentry) != 0)
		return -EBUSY;

	ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN,
				 LONG_MAX, &octx->next_offset, GFP_KERNEL);
	if (ret < 0)
		return ret;

	offset_set(dentry, offset);
	return 0;
}

/*
 * Store @dentry at @offset in @octx's map and record @offset on the dentry.
 * Returns 0 on success or the error from mtree_store().
 */
static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry,
				 long offset)
{
	int ret;

	ret = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL);
	if (ret)
		return ret;
	offset_set(dentry, offset);
	return 0;
}

/**
 * simple_offset_remove - Remove an entry from a directory's offset map
 * @octx: directory offset ctx to be updated
 * @dentry: dentry being removed
 *
 */
void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry)
{
	long offset;

	offset = dentry2offset(dentry);
	/* nothing to do if the dentry never had an offset assigned */
	if (offset == 0)
		return;

	mtree_erase(&octx->mt, offset);
	offset_set(dentry, 0);
}

/**
 * simple_offset_empty - Check if a dentry can be unlinked
 * @dentry: dentry to be tested
 *
 * Returns 0 if @dentry is a 
non-empty directory; otherwise returns 1.
 */
int simple_offset_empty(struct dentry *dentry)
{
	struct inode *dir = d_inode(dentry);
	struct offset_ctx *octx;
	struct dentry *entry;
	unsigned long slot;

	/* negative dentries and non-directories are reported as empty */
	if (!dir || !S_ISDIR(dir->i_mode))
		return 1;

	slot = DIR_OFFSET_MIN;
	octx = dir->i_op->get_offset_ctx(dir);
	mt_for_each(&octx->mt, entry, slot, LONG_MAX) {
		bool positive;

		/* sample the state under the child's lock */
		spin_lock(&entry->d_lock);
		positive = simple_positive(entry);
		spin_unlock(&entry->d_lock);

		/* one positive child is enough to call it non-empty */
		if (positive)
			return 0;
	}

	return 1;
}

/**
 * simple_offset_rename - handle directory offsets for rename
 * @old_dir: parent directory of source entry
 * @old_dentry: dentry of source entry
 * @new_dir: parent directory of destination entry
 * @new_dentry: dentry of destination
 *
 * Caller provides appropriate serialization.
 *
 * User space expects the directory offset value of the replaced
 * (new) directory entry to be unchanged after a rename.
 *
 * Returns zero on success, a negative errno value on failure.
377 */ 378 int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, 379 struct inode *new_dir, struct dentry *new_dentry) 380 { 381 struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); 382 struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); 383 long new_offset = dentry2offset(new_dentry); 384 385 simple_offset_remove(old_ctx, old_dentry); 386 387 if (new_offset) { 388 offset_set(new_dentry, 0); 389 return simple_offset_replace(new_ctx, old_dentry, new_offset); 390 } 391 return simple_offset_add(new_ctx, old_dentry); 392 } 393 394 /** 395 * simple_offset_rename_exchange - exchange rename with directory offsets 396 * @old_dir: parent of dentry being moved 397 * @old_dentry: dentry being moved 398 * @new_dir: destination parent 399 * @new_dentry: destination dentry 400 * 401 * This API preserves the directory offset values. Caller provides 402 * appropriate serialization. 403 * 404 * Returns zero on success. Otherwise a negative errno is returned and the 405 * rename is rolled back. 
 */
int simple_offset_rename_exchange(struct inode *old_dir,
				  struct dentry *old_dentry,
				  struct inode *new_dir,
				  struct dentry *new_dentry)
{
	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
	/* remember both offsets up front: removal below zeroes them */
	long old_index = dentry2offset(old_dentry);
	long new_index = dentry2offset(new_dentry);
	int ret;

	simple_offset_remove(old_ctx, old_dentry);
	simple_offset_remove(new_ctx, new_dentry);

	/* swap: each dentry is stored at the other's former offset */
	ret = simple_offset_replace(new_ctx, old_dentry, new_index);
	if (ret)
		goto out_restore;

	ret = simple_offset_replace(old_ctx, new_dentry, old_index);
	if (ret) {
		/* undo the first half of the swap before restoring */
		simple_offset_remove(new_ctx, old_dentry);
		goto out_restore;
	}

	ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
	if (ret) {
		/* undo both halves of the swap before restoring */
		simple_offset_remove(new_ctx, old_dentry);
		simple_offset_remove(old_ctx, new_dentry);
		goto out_restore;
	}
	return 0;

out_restore:
	/* put both dentries back at their original offsets; errors are ignored */
	(void)simple_offset_replace(old_ctx, old_dentry, old_index);
	(void)simple_offset_replace(new_ctx, new_dentry, new_index);
	return ret;
}

/**
 * simple_offset_destroy - Release offset map
 * @octx: directory offset ctx that is about to be destroyed
 *
 * During fs teardown (eg. umount), a directory's offset map might still
 * contain entries. mtree_destroy() cleans out anything that remains.
 */
void simple_offset_destroy(struct offset_ctx *octx)
{
	mtree_destroy(&octx->mt);
}

/*
 * ->open for offset-mapped directories: snapshot the map's next_offset into
 * file->private_data.  offset_readdir() later uses it as an upper bound.
 */
static int offset_dir_open(struct inode *inode, struct file *file)
{
	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);

	file->private_data = (void *)ctx->next_offset;
	return 0;
}

/**
 * offset_dir_llseek - Advance the read position of a directory descriptor
 * @file: an open directory whose position is to be updated
 * @offset: a byte offset
 * @whence: enumerator describing the starting position for this update
 *
 * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
 *
 * Returns the updated read position if successful; otherwise a
 * negative errno is returned and the read position remains unchanged.
 */
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
	struct inode *inode = file->f_inode;
	struct offset_ctx *ctx = inode->i_op->get_offset_ctx(inode);

	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		fallthrough;
	case SEEK_SET:
		if (offset >= 0)
			break;
		fallthrough;
	default:
		return -EINVAL;
	}

	/* In this case, ->private_data is protected by f_pos_lock */
	/* rewinding to 0 refreshes the next_offset snapshot taken at open */
	if (!offset)
		file->private_data = (void *)ctx->next_offset;
	return vfs_setpos(file, offset, LONG_MAX);
}

/*
 * Find the first map entry at or after @offset and, if it is still a
 * positive dentry, return it with a reference held.  Returns NULL when
 * nothing is found or the entry found is no longer positive.
 */
static struct dentry *offset_find_next(struct offset_ctx *octx, loff_t offset)
{
	MA_STATE(mas, &octx->mt, offset, offset);
	struct dentry *child, *found = NULL;

	rcu_read_lock();
	child = mas_find(&mas, LONG_MAX);
	if (!child)
		goto out;
	spin_lock(&child->d_lock);
	if (simple_positive(child))
		found = dget_dlock(child);
	spin_unlock(&child->d_lock);
out:
	rcu_read_unlock();
	return found;
}

/*
 * Hand one entry to the readdir actor, using the dentry's stored directory
 * offset as the entry's position.  Returns whatever ctx->actor returns.
 */
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);
	long offset = dentry2offset(dentry);

	return ctx->actor(ctx, dentry->d_name.name, dentry->d_name.len, offset,
			  inode->i_ino, fs_umode_to_dtype(inode->i_mode));
}

/*
 * Emit entries starting at ctx->pos until the map is exhausted, an entry's
 * offset reaches @last_index, or the actor refuses an entry.  ctx->pos is
 * advanced past each entry that was emitted.
 */
static void offset_iterate_dir(struct inode *inode, struct dir_context *ctx, long last_index)
{
	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
	struct dentry *dentry;

	while (true) {
		dentry = offset_find_next(octx, ctx->pos);
		if (!dentry)
			return;

		/* entries at or beyond the bound are not reported */
		if (dentry2offset(dentry) >= last_index) {
			dput(dentry);
			return;
		}

		if (!offset_dir_emit(ctx, dentry)) {
			dput(dentry);
			return;
		}

		ctx->pos = dentry2offset(dentry) + 1;
		dput(dentry);
	}
}

/**
 * offset_readdir - Emit entries starting at offset @ctx->pos
 * @file: an open directory to iterate over
 * @ctx: directory iteration context
 *
 * Caller must hold @file's i_rwsem to prevent insertion or removal of
 * entries during this call.
 *
 * On entry, @ctx->pos contains an offset that represents the first entry
 * to be read from the directory.
 *
 * The operation continues until there are no more entries to read, or
 * until the ctx->actor indicates there is no more space in the caller's
 * output buffer.
 *
 * On return, @ctx->pos contains an offset that will read the next entry
 * in this directory when offset_readdir() is called again with @ctx.
 *
 * Return values:
 *   %0 - Complete
 */
static int offset_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dir = file->f_path.dentry;
	/* upper bound stored by offset_dir_open() / offset_dir_llseek() */
	long last_index = (long)file->private_data;

	lockdep_assert_held(&d_inode(dir)->i_rwsem);

	if (!dir_emit_dots(file, ctx))
		return 0;

	offset_iterate_dir(d_inode(dir), ctx, last_index);
	return 0;
}

/* file operations for directories that use an offset map */
const struct file_operations simple_offset_dir_operations = {
	.open = offset_dir_open,
	.llseek = offset_dir_llseek,
	.iterate_shared = offset_readdir,
	.read = generic_read_dir,
	.fsync = noop_fsync,
};

/*
 * Return the next positive child of @parent after @prev (or the first one
 * when @prev is NULL) with a reference held, or NULL if there is none.
 * The reference on @prev is dropped in either case.
 */
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
	struct dentry *child = NULL, *d;

	spin_lock(&parent->d_lock);
	d = prev ? d_next_sibling(prev) : d_first_child(parent);
	hlist_for_each_entry_from(d, d_sib) {
		if (simple_positive(d)) {
			/* recheck under the child's own lock before grabbing it */
			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
			if (simple_positive(d))
				child = dget_dlock(d);
			spin_unlock(&d->d_lock);
			if (likely(child))
				break;
		}
	}
	spin_unlock(&parent->d_lock);
	dput(prev);
	return child;
}

/*
 * Remove @dentry and everything below it.  The tree is walked iteratively:
 * descend while a positive child exists, otherwise kill the current node and
 * ascend to its parent.  @callback, if non-NULL, is invoked on each victim
 * that is still positive, before its pin is dropped.
 */
void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
	struct dentry *this = dget(dentry);
	while (true) {
		struct dentry *victim = NULL, *child;
		struct inode *inode = this->d_inode;

		inode_lock(inode);
		if (d_is_dir(this))
			inode->i_flags |= S_DEAD;
		while ((child = find_next_child(this, victim)) == NULL) {
			// kill and ascend
			// update metadata while it's still locked
			inode_set_ctime_current(inode);
			clear_nlink(inode);
			inode_unlock(inode);
			victim = this;
			this = this->d_parent;
			inode = this->d_inode;
			inode_lock(inode);
			if (simple_positive(victim)) {
				d_invalidate(victim);	// avoid lost mounts
				if (d_is_dir(victim))
					fsnotify_rmdir(inode, victim);
				else
					fsnotify_unlink(inode, victim);
				if (callback)
					callback(victim);
				dput(victim);		// unpin it
			}
			/* back at the starting point: fix up its parent and stop */
			if (victim == dentry) {
				inode_set_mtime_to_ts(inode,
						      inode_set_ctime_current(inode));
				if (d_is_dir(dentry))
					drop_nlink(inode);
				inode_unlock(inode);
				dput(dentry);
				return;
			}
		}
		inode_unlock(inode);
		this = child;
	}
}
EXPORT_SYMBOL(simple_recursive_removal);

/* default super operations: only ->statfs is provided */
static const struct super_operations simple_super_operations = {
	.statfs = simple_statfs,
};

/*
 * fill_super callback for pseudo filesystems: configure the superblock from
 * the pseudo_fs_context in fc->fs_private and create the root directory.
 * Returns 0, or -ENOMEM if the root inode or dentry cannot be allocated.
 */
static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = fc->fs_private;
	struct inode *root;

	s->s_maxbytes = MAX_LFS_FILESIZE;
	s->s_blocksize = PAGE_SIZE;
	s->s_blocksize_bits = PAGE_SHIFT;
	s->s_magic = ctx->magic;
	/* fall back to the simple super operations when none were supplied */
	s->s_op = ctx->ops ?: &simple_super_operations;
	s->s_xattr = ctx->xattr;
	s->s_time_gran = 1;
	root = new_inode(s);
	if (!root)
		return -ENOMEM;

	/*
	 * since this is the first inode, make it number 1. New inodes created
	 * after this must take care not to collide with it (by passing
	 * max_reserved of 1 to iunique).
	 */
	root->i_ino = 1;
	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
	simple_inode_init_ts(root);
	s->s_root = d_make_root(root);
	if (!s->s_root)
		return -ENOMEM;
	s->s_d_op = ctx->dops;
	return 0;
}

/* ->get_tree: build a device-less superblock via pseudo_fs_fill_super() */
static int pseudo_fs_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, pseudo_fs_fill_super);
}

/* ->free: release the pseudo_fs_context allocated by init_pseudo() */
static void pseudo_fs_free(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

static const struct fs_context_operations pseudo_fs_context_ops = {
	.free = pseudo_fs_free,
	.get_tree = pseudo_fs_get_tree,
};

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 *
 * Returns the newly allocated context, or NULL if the allocation failed;
 * @fc is left untouched in the failure case.
 */
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
					unsigned long magic)
{
	struct pseudo_fs_context *ctx;

	ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
	if (likely(ctx)) {
		ctx->magic = magic;
		fc->fs_private = ctx;
		fc->ops = &pseudo_fs_context_ops;
		fc->sb_flags |= SB_NOUSER;
		fc->global = true;
	}
	return ctx;
}
EXPORT_SYMBOL(init_pseudo);

/*
 * ->open helper: copy a non-NULL inode->i_private into file->private_data.
 * Always returns 0.
 */
int simple_open(struct inode *inode, struct file *file)
{
	if (inode->i_private)
		file->private_data = inode->i_private;
	return 0;
}
EXPORT_SYMBOL(simple_open);

/*
 * ->link helper: bump the inode's link count, update timestamps on the inode
 * and @dir, and instantiate @dentry with an extra reference held on both the
 * inode and the dentry.  Always returns 0.
 */
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(old_dentry);

	/* one timestamp: inode ctime, then dir ctime and mtime */
	inode_set_mtime_to_ts(dir,
			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
	inc_nlink(inode);
	ihold(inode);
	dget(dentry);
	d_instantiate(dentry, inode);
	return 0;
}
EXPORT_SYMBOL(simple_link);

/*
 * Returns 1 if @dentry has no positive children, 0 otherwise.
 */
int simple_empty(struct dentry *dentry)
{
	struct dentry *child;
	int ret = 0;

	spin_lock(&dentry->d_lock);
	hlist_for_each_entry(child, &dentry->d_children, d_sib) {
		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
		if (simple_positive(child)) {
			spin_unlock(&child->d_lock);
			goto out;
		}
		spin_unlock(&child->d_lock);
	}
	ret = 1;
out:
	spin_unlock(&dentry->d_lock);
	return ret;
}
EXPORT_SYMBOL(simple_empty);

/*
 * ->unlink helper: update timestamps on the inode and @dir, drop one link
 * and release a reference on @dentry.  Always returns 0.
 */
int simple_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	inode_set_mtime_to_ts(dir,
			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
	drop_nlink(inode);
	dput(dentry);
	return 0;
}
EXPORT_SYMBOL(simple_unlink);

/*
 * ->rmdir helper: fails with -ENOTEMPTY unless simple_empty() says the
 * directory is empty; otherwise drops the directory's links via an extra
 * drop_nlink() plus simple_unlink(), and one link on @dir.
 */
int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
	if (!simple_empty(dentry))
		return -ENOTEMPTY;

	drop_nlink(d_inode(dentry));
	simple_unlink(dir, dentry);
	drop_nlink(dir);
	return 0;
}
EXPORT_SYMBOL(simple_rmdir);

/**
 * simple_rename_timestamp - update the various inode timestamps for rename
 * @old_dir: old parent directory
 * @old_dentry: dentry that is being renamed
 * @new_dir: new parent directory
 * @new_dentry: target for rename
 *
 * POSIX mandates that the old and new parent directories have their ctime and
 * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
 * their ctime updated.
810 */ 811 void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, 812 struct inode *new_dir, struct dentry *new_dentry) 813 { 814 struct inode *newino = d_inode(new_dentry); 815 816 inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); 817 if (new_dir != old_dir) 818 inode_set_mtime_to_ts(new_dir, 819 inode_set_ctime_current(new_dir)); 820 inode_set_ctime_current(d_inode(old_dentry)); 821 if (newino) 822 inode_set_ctime_current(newino); 823 } 824 EXPORT_SYMBOL_GPL(simple_rename_timestamp); 825 826 int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, 827 struct inode *new_dir, struct dentry *new_dentry) 828 { 829 bool old_is_dir = d_is_dir(old_dentry); 830 bool new_is_dir = d_is_dir(new_dentry); 831 832 if (old_dir != new_dir && old_is_dir != new_is_dir) { 833 if (old_is_dir) { 834 drop_nlink(old_dir); 835 inc_nlink(new_dir); 836 } else { 837 drop_nlink(new_dir); 838 inc_nlink(old_dir); 839 } 840 } 841 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 842 return 0; 843 } 844 EXPORT_SYMBOL_GPL(simple_rename_exchange); 845 846 int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, 847 struct dentry *old_dentry, struct inode *new_dir, 848 struct dentry *new_dentry, unsigned int flags) 849 { 850 int they_are_dirs = d_is_dir(old_dentry); 851 852 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 853 return -EINVAL; 854 855 if (flags & RENAME_EXCHANGE) 856 return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); 857 858 if (!simple_empty(new_dentry)) 859 return -ENOTEMPTY; 860 861 if (d_really_is_positive(new_dentry)) { 862 simple_unlink(new_dir, new_dentry); 863 if (they_are_dirs) { 864 drop_nlink(d_inode(new_dentry)); 865 drop_nlink(old_dir); 866 } 867 } else if (they_are_dirs) { 868 drop_nlink(old_dir); 869 inc_nlink(new_dir); 870 } 871 872 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 873 return 0; 874 } 875 EXPORT_SYMBOL(simple_rename); 876 877 
/** 878 * simple_setattr - setattr for simple filesystem 879 * @idmap: idmap of the target mount 880 * @dentry: dentry 881 * @iattr: iattr structure 882 * 883 * Returns 0 on success, -error on failure. 884 * 885 * simple_setattr is a simple ->setattr implementation without a proper 886 * implementation of size changes. 887 * 888 * It can either be used for in-memory filesystems or special files 889 * on simple regular filesystems. Anything that needs to change on-disk 890 * or wire state on size changes needs its own setattr method. 891 */ 892 int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 893 struct iattr *iattr) 894 { 895 struct inode *inode = d_inode(dentry); 896 int error; 897 898 error = setattr_prepare(idmap, dentry, iattr); 899 if (error) 900 return error; 901 902 if (iattr->ia_valid & ATTR_SIZE) 903 truncate_setsize(inode, iattr->ia_size); 904 setattr_copy(idmap, inode, iattr); 905 mark_inode_dirty(inode); 906 return 0; 907 } 908 EXPORT_SYMBOL(simple_setattr); 909 910 static int simple_read_folio(struct file *file, struct folio *folio) 911 { 912 folio_zero_range(folio, 0, folio_size(folio)); 913 flush_dcache_folio(folio); 914 folio_mark_uptodate(folio); 915 folio_unlock(folio); 916 return 0; 917 } 918 919 int simple_write_begin(struct file *file, struct address_space *mapping, 920 loff_t pos, unsigned len, 921 struct folio **foliop, void **fsdata) 922 { 923 struct folio *folio; 924 925 folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, 926 mapping_gfp_mask(mapping)); 927 if (IS_ERR(folio)) 928 return PTR_ERR(folio); 929 930 *foliop = folio; 931 932 if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { 933 size_t from = offset_in_folio(folio, pos); 934 935 folio_zero_segments(folio, 0, from, 936 from + len, folio_size(folio)); 937 } 938 return 0; 939 } 940 EXPORT_SYMBOL(simple_write_begin); 941 942 /** 943 * simple_write_end - .write_end helper for non-block-device FSes 944 * @file: See .write_end of 
address_space_operations 945 * @mapping: " 946 * @pos: " 947 * @len: " 948 * @copied: " 949 * @folio: " 950 * @fsdata: " 951 * 952 * simple_write_end does the minimum needed for updating a folio after 953 * writing is done. It has the same API signature as the .write_end of 954 * address_space_operations vector. So it can just be set onto .write_end for 955 * FSes that don't need any other processing. i_mutex is assumed to be held. 956 * Block based filesystems should use generic_write_end(). 957 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty 958 * is not called, so a filesystem that actually does store data in .write_inode 959 * should extend on what's done here with a call to mark_inode_dirty() in the 960 * case that i_size has changed. 961 * 962 * Use *ONLY* with simple_read_folio() 963 */ 964 static int simple_write_end(struct file *file, struct address_space *mapping, 965 loff_t pos, unsigned len, unsigned copied, 966 struct folio *folio, void *fsdata) 967 { 968 struct inode *inode = folio->mapping->host; 969 loff_t last_pos = pos + copied; 970 971 /* zero the stale part of the folio if we did a short copy */ 972 if (!folio_test_uptodate(folio)) { 973 if (copied < len) { 974 size_t from = offset_in_folio(folio, pos); 975 976 folio_zero_range(folio, from + copied, len - copied); 977 } 978 folio_mark_uptodate(folio); 979 } 980 /* 981 * No need to use i_size_read() here, the i_size 982 * cannot change under us because we hold the i_mutex. 983 */ 984 if (last_pos > inode->i_size) 985 i_size_write(inode, last_pos); 986 987 folio_mark_dirty(folio); 988 folio_unlock(folio); 989 folio_put(folio); 990 991 return copied; 992 } 993 994 /* 995 * Provides ramfs-style behavior: data in the pagecache, but no writeback. 
996 */ 997 const struct address_space_operations ram_aops = { 998 .read_folio = simple_read_folio, 999 .write_begin = simple_write_begin, 1000 .write_end = simple_write_end, 1001 .dirty_folio = noop_dirty_folio, 1002 }; 1003 EXPORT_SYMBOL(ram_aops); 1004 1005 /* 1006 * the inodes created here are not hashed. If you use iunique to generate 1007 * unique inode values later for this filesystem, then you must take care 1008 * to pass it an appropriate max_reserved value to avoid collisions. 1009 */ 1010 int simple_fill_super(struct super_block *s, unsigned long magic, 1011 const struct tree_descr *files) 1012 { 1013 struct inode *inode; 1014 struct dentry *dentry; 1015 int i; 1016 1017 s->s_blocksize = PAGE_SIZE; 1018 s->s_blocksize_bits = PAGE_SHIFT; 1019 s->s_magic = magic; 1020 s->s_op = &simple_super_operations; 1021 s->s_time_gran = 1; 1022 1023 inode = new_inode(s); 1024 if (!inode) 1025 return -ENOMEM; 1026 /* 1027 * because the root inode is 1, the files array must not contain an 1028 * entry at index 1 1029 */ 1030 inode->i_ino = 1; 1031 inode->i_mode = S_IFDIR | 0755; 1032 simple_inode_init_ts(inode); 1033 inode->i_op = &simple_dir_inode_operations; 1034 inode->i_fop = &simple_dir_operations; 1035 set_nlink(inode, 2); 1036 s->s_root = d_make_root(inode); 1037 if (!s->s_root) 1038 return -ENOMEM; 1039 for (i = 0; !files->name || files->name[0]; i++, files++) { 1040 if (!files->name) 1041 continue; 1042 1043 /* warn if it tries to conflict with the root inode */ 1044 if (unlikely(i == 1)) 1045 printk(KERN_WARNING "%s: %s passed in a files array" 1046 "with an index of 1!\n", __func__, 1047 s->s_type->name); 1048 1049 dentry = d_alloc_name(s->s_root, files->name); 1050 if (!dentry) 1051 return -ENOMEM; 1052 inode = new_inode(s); 1053 if (!inode) { 1054 dput(dentry); 1055 return -ENOMEM; 1056 } 1057 inode->i_mode = S_IFREG | files->mode; 1058 simple_inode_init_ts(inode); 1059 inode->i_fop = files->ops; 1060 inode->i_ino = i; 1061 d_add(dentry, inode); 1062 } 1063 
return 0; 1064 } 1065 EXPORT_SYMBOL(simple_fill_super); 1066 1067 static DEFINE_SPINLOCK(pin_fs_lock); 1068 1069 int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) 1070 { 1071 struct vfsmount *mnt = NULL; 1072 spin_lock(&pin_fs_lock); 1073 if (unlikely(!*mount)) { 1074 spin_unlock(&pin_fs_lock); 1075 mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); 1076 if (IS_ERR(mnt)) 1077 return PTR_ERR(mnt); 1078 spin_lock(&pin_fs_lock); 1079 if (!*mount) 1080 *mount = mnt; 1081 } 1082 mntget(*mount); 1083 ++*count; 1084 spin_unlock(&pin_fs_lock); 1085 mntput(mnt); 1086 return 0; 1087 } 1088 EXPORT_SYMBOL(simple_pin_fs); 1089 1090 void simple_release_fs(struct vfsmount **mount, int *count) 1091 { 1092 struct vfsmount *mnt; 1093 spin_lock(&pin_fs_lock); 1094 mnt = *mount; 1095 if (!--*count) 1096 *mount = NULL; 1097 spin_unlock(&pin_fs_lock); 1098 mntput(mnt); 1099 } 1100 EXPORT_SYMBOL(simple_release_fs); 1101 1102 /** 1103 * simple_read_from_buffer - copy data from the buffer to user space 1104 * @to: the user space buffer to read to 1105 * @count: the maximum number of bytes to read 1106 * @ppos: the current position in the buffer 1107 * @from: the buffer to read from 1108 * @available: the size of the buffer 1109 * 1110 * The simple_read_from_buffer() function reads up to @count bytes from the 1111 * buffer @from at offset @ppos into the user space address starting at @to. 1112 * 1113 * On success, the number of bytes read is returned and the offset @ppos is 1114 * advanced by this number, or negative value is returned on error. 
1115 **/ 1116 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, 1117 const void *from, size_t available) 1118 { 1119 loff_t pos = *ppos; 1120 size_t ret; 1121 1122 if (pos < 0) 1123 return -EINVAL; 1124 if (pos >= available || !count) 1125 return 0; 1126 if (count > available - pos) 1127 count = available - pos; 1128 ret = copy_to_user(to, from + pos, count); 1129 if (ret == count) 1130 return -EFAULT; 1131 count -= ret; 1132 *ppos = pos + count; 1133 return count; 1134 } 1135 EXPORT_SYMBOL(simple_read_from_buffer); 1136 1137 /** 1138 * simple_write_to_buffer - copy data from user space to the buffer 1139 * @to: the buffer to write to 1140 * @available: the size of the buffer 1141 * @ppos: the current position in the buffer 1142 * @from: the user space buffer to read from 1143 * @count: the maximum number of bytes to read 1144 * 1145 * The simple_write_to_buffer() function reads up to @count bytes from the user 1146 * space address starting at @from into the buffer @to at offset @ppos. 1147 * 1148 * On success, the number of bytes written is returned and the offset @ppos is 1149 * advanced by this number, or negative value is returned on error. 
1150 **/ 1151 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 1152 const void __user *from, size_t count) 1153 { 1154 loff_t pos = *ppos; 1155 size_t res; 1156 1157 if (pos < 0) 1158 return -EINVAL; 1159 if (pos >= available || !count) 1160 return 0; 1161 if (count > available - pos) 1162 count = available - pos; 1163 res = copy_from_user(to + pos, from, count); 1164 if (res == count) 1165 return -EFAULT; 1166 count -= res; 1167 *ppos = pos + count; 1168 return count; 1169 } 1170 EXPORT_SYMBOL(simple_write_to_buffer); 1171 1172 /** 1173 * memory_read_from_buffer - copy data from the buffer 1174 * @to: the kernel space buffer to read to 1175 * @count: the maximum number of bytes to read 1176 * @ppos: the current position in the buffer 1177 * @from: the buffer to read from 1178 * @available: the size of the buffer 1179 * 1180 * The memory_read_from_buffer() function reads up to @count bytes from the 1181 * buffer @from at offset @ppos into the kernel space address starting at @to. 1182 * 1183 * On success, the number of bytes read is returned and the offset @ppos is 1184 * advanced by this number, or negative value is returned on error. 1185 **/ 1186 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, 1187 const void *from, size_t available) 1188 { 1189 loff_t pos = *ppos; 1190 1191 if (pos < 0) 1192 return -EINVAL; 1193 if (pos >= available) 1194 return 0; 1195 if (count > available - pos) 1196 count = available - pos; 1197 memcpy(to, from + pos, count); 1198 *ppos = pos + count; 1199 1200 return count; 1201 } 1202 EXPORT_SYMBOL(memory_read_from_buffer); 1203 1204 /* 1205 * Transaction based IO. 1206 * The file expects a single write which triggers the transaction, and then 1207 * possibly a read which collects the result - which is stored in a 1208 * file-local buffer. 
*/

/*
 * Record that the transaction buffer attached to @file now holds @n bytes
 * of reply data.  @n must not exceed SIMPLE_TRANSACTION_LIMIT.
 */
void simple_transaction_set(struct file *file, size_t n)
{
	struct simple_transaction_argresp *ar = file->private_data;

	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);

	/*
	 * The barrier ensures that ar->size will really remain zero until
	 * ar->data is ready for reading.
	 */
	smp_mb();
	ar->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);

/*
 * Start a transaction on @file: allocate a zeroed page, attach it as
 * file->private_data and copy @size bytes of the request from @buf into it.
 *
 * Returns the start of the copied request data, or an ERR_PTR():
 *   -EFBIG   @size is larger than SIMPLE_TRANSACTION_LIMIT - 1
 *   -ENOMEM  no page could be allocated
 *   -EBUSY   @file already has a transaction buffer attached
 *   -EFAULT  the request could not be copied from user space
 */
char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
	struct simple_transaction_argresp *ar;
	static DEFINE_SPINLOCK(simple_transaction_lock);

	if (size > SIMPLE_TRANSACTION_LIMIT - 1)
		return ERR_PTR(-EFBIG);

	/* Allocate before taking the spinlock; GFP_KERNEL is used here. */
	ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
	if (!ar)
		return ERR_PTR(-ENOMEM);

	spin_lock(&simple_transaction_lock);

	/* only one write allowed per open */
	if (file->private_data) {
		spin_unlock(&simple_transaction_lock);
		free_page((unsigned long)ar);
		return ERR_PTR(-EBUSY);
	}

	file->private_data = ar;

	spin_unlock(&simple_transaction_lock);

	/*
	 * On failure the page stays attached to the file; it is not freed
	 * here but by simple_transaction_release().
	 */
	if (copy_from_user(ar->data, buf, size))
		return ERR_PTR(-EFAULT);

	return ar->data;
}
EXPORT_SYMBOL(simple_transaction_get);

/*
 * Read handler for transaction files: returns the first ar->size bytes of
 * the transaction buffer, or 0 if no transaction buffer has been attached.
 */
ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
	struct simple_transaction_argresp *ar = file->private_data;

	if (!ar)
		return 0;
	return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
}
EXPORT_SYMBOL(simple_transaction_read);

/* Release handler: free the page stored in file->private_data. */
int simple_transaction_release(struct inode *inode, struct file *file)
{
	free_page((unsigned long)file->private_data);
	return 0;
}
EXPORT_SYMBOL(simple_transaction_release);

/* Simple attribute files */

/* Per-open state of a simple attribute file, allocated by simple_attr_open(). */
struct simple_attr {
	int (*get)(void *, u64 *);	/* reads the value; NULL makes reads fail with -EACCES */
	int (*set)(void *, u64);	/* stores a value; NULL makes writes fail with -EACCES */
	char get_buf[24];	/* enough to store a u64 and "\n\0" */
	char set_buf[24];	/* NUL-terminated copy of the text written by user space */
	void *data;		/* passed to get/set; taken from inode->i_private */
	const char *fmt;	/* format for read operation */
	struct mutex mutex;	/* protects access to these buffers */
};

/* simple_attr_open is called by an actual attribute open file operation
 * to set the attribute specific access operations.
 *
 * Returns -ENOMEM if the state cannot be allocated, otherwise the result of
 * nonseekable_open(). */
int simple_attr_open(struct inode *inode, struct file *file,
		     int (*get)(void *, u64 *), int (*set)(void *, u64),
		     const char *fmt)
{
	struct simple_attr *attr;

	attr = kzalloc(sizeof(*attr), GFP_KERNEL);
	if (!attr)
		return -ENOMEM;

	attr->get = get;
	attr->set = set;
	attr->data = inode->i_private;
	attr->fmt = fmt;
	mutex_init(&attr->mutex);

	file->private_data = attr;

	return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);

/* Release handler: free the struct simple_attr stored in file->private_data. */
int simple_attr_release(struct inode *inode, struct file *file)
{
	kfree(file->private_data);
	return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release);	/* GPL-only?  This?  Really?
*/ 1317 1318 /* read from the buffer that is filled with the get function */ 1319 ssize_t simple_attr_read(struct file *file, char __user *buf, 1320 size_t len, loff_t *ppos) 1321 { 1322 struct simple_attr *attr; 1323 size_t size; 1324 ssize_t ret; 1325 1326 attr = file->private_data; 1327 1328 if (!attr->get) 1329 return -EACCES; 1330 1331 ret = mutex_lock_interruptible(&attr->mutex); 1332 if (ret) 1333 return ret; 1334 1335 if (*ppos && attr->get_buf[0]) { 1336 /* continued read */ 1337 size = strlen(attr->get_buf); 1338 } else { 1339 /* first read */ 1340 u64 val; 1341 ret = attr->get(attr->data, &val); 1342 if (ret) 1343 goto out; 1344 1345 size = scnprintf(attr->get_buf, sizeof(attr->get_buf), 1346 attr->fmt, (unsigned long long)val); 1347 } 1348 1349 ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); 1350 out: 1351 mutex_unlock(&attr->mutex); 1352 return ret; 1353 } 1354 EXPORT_SYMBOL_GPL(simple_attr_read); 1355 1356 /* interpret the buffer as a number to call the set function with */ 1357 static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, 1358 size_t len, loff_t *ppos, bool is_signed) 1359 { 1360 struct simple_attr *attr; 1361 unsigned long long val; 1362 size_t size; 1363 ssize_t ret; 1364 1365 attr = file->private_data; 1366 if (!attr->set) 1367 return -EACCES; 1368 1369 ret = mutex_lock_interruptible(&attr->mutex); 1370 if (ret) 1371 return ret; 1372 1373 ret = -EFAULT; 1374 size = min(sizeof(attr->set_buf) - 1, len); 1375 if (copy_from_user(attr->set_buf, buf, size)) 1376 goto out; 1377 1378 attr->set_buf[size] = '\0'; 1379 if (is_signed) 1380 ret = kstrtoll(attr->set_buf, 0, &val); 1381 else 1382 ret = kstrtoull(attr->set_buf, 0, &val); 1383 if (ret) 1384 goto out; 1385 ret = attr->set(attr->data, val); 1386 if (ret == 0) 1387 ret = len; /* on success, claim we got the whole input */ 1388 out: 1389 mutex_unlock(&attr->mutex); 1390 return ret; 1391 } 1392 1393 ssize_t simple_attr_write(struct file *file, 
const char __user *buf, 1394 size_t len, loff_t *ppos) 1395 { 1396 return simple_attr_write_xsigned(file, buf, len, ppos, false); 1397 } 1398 EXPORT_SYMBOL_GPL(simple_attr_write); 1399 1400 ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, 1401 size_t len, loff_t *ppos) 1402 { 1403 return simple_attr_write_xsigned(file, buf, len, ppos, true); 1404 } 1405 EXPORT_SYMBOL_GPL(simple_attr_write_signed); 1406 1407 /** 1408 * generic_encode_ino32_fh - generic export_operations->encode_fh function 1409 * @inode: the object to encode 1410 * @fh: where to store the file handle fragment 1411 * @max_len: maximum length to store there (in 4 byte units) 1412 * @parent: parent directory inode, if wanted 1413 * 1414 * This generic encode_fh function assumes that the 32 inode number 1415 * is suitable for locating an inode, and that the generation number 1416 * can be used to check that it is still valid. It places them in the 1417 * filehandle fragment where export_decode_fh expects to find them. 
1418 */ 1419 int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, 1420 struct inode *parent) 1421 { 1422 struct fid *fid = (void *)fh; 1423 int len = *max_len; 1424 int type = FILEID_INO32_GEN; 1425 1426 if (parent && (len < 4)) { 1427 *max_len = 4; 1428 return FILEID_INVALID; 1429 } else if (len < 2) { 1430 *max_len = 2; 1431 return FILEID_INVALID; 1432 } 1433 1434 len = 2; 1435 fid->i32.ino = inode->i_ino; 1436 fid->i32.gen = inode->i_generation; 1437 if (parent) { 1438 fid->i32.parent_ino = parent->i_ino; 1439 fid->i32.parent_gen = parent->i_generation; 1440 len = 4; 1441 type = FILEID_INO32_GEN_PARENT; 1442 } 1443 *max_len = len; 1444 return type; 1445 } 1446 EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); 1447 1448 /** 1449 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation 1450 * @sb: filesystem to do the file handle conversion on 1451 * @fid: file handle to convert 1452 * @fh_len: length of the file handle in bytes 1453 * @fh_type: type of file handle 1454 * @get_inode: filesystem callback to retrieve inode 1455 * 1456 * This function decodes @fid as long as it has one of the well-known 1457 * Linux filehandle types and calls @get_inode on it to retrieve the 1458 * inode for the object specified in the file handle. 
1459 */ 1460 struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, 1461 int fh_len, int fh_type, struct inode *(*get_inode) 1462 (struct super_block *sb, u64 ino, u32 gen)) 1463 { 1464 struct inode *inode = NULL; 1465 1466 if (fh_len < 2) 1467 return NULL; 1468 1469 switch (fh_type) { 1470 case FILEID_INO32_GEN: 1471 case FILEID_INO32_GEN_PARENT: 1472 inode = get_inode(sb, fid->i32.ino, fid->i32.gen); 1473 break; 1474 } 1475 1476 return d_obtain_alias(inode); 1477 } 1478 EXPORT_SYMBOL_GPL(generic_fh_to_dentry); 1479 1480 /** 1481 * generic_fh_to_parent - generic helper for the fh_to_parent export operation 1482 * @sb: filesystem to do the file handle conversion on 1483 * @fid: file handle to convert 1484 * @fh_len: length of the file handle in bytes 1485 * @fh_type: type of file handle 1486 * @get_inode: filesystem callback to retrieve inode 1487 * 1488 * This function decodes @fid as long as it has one of the well-known 1489 * Linux filehandle types and calls @get_inode on it to retrieve the 1490 * inode for the _parent_ object specified in the file handle if it 1491 * is specified in the file handle, or NULL otherwise. 1492 */ 1493 struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, 1494 int fh_len, int fh_type, struct inode *(*get_inode) 1495 (struct super_block *sb, u64 ino, u32 gen)) 1496 { 1497 struct inode *inode = NULL; 1498 1499 if (fh_len <= 2) 1500 return NULL; 1501 1502 switch (fh_type) { 1503 case FILEID_INO32_GEN_PARENT: 1504 inode = get_inode(sb, fid->i32.parent_ino, 1505 (fh_len > 3 ? 
fid->i32.parent_gen : 0)); 1506 break; 1507 } 1508 1509 return d_obtain_alias(inode); 1510 } 1511 EXPORT_SYMBOL_GPL(generic_fh_to_parent); 1512 1513 /** 1514 * __generic_file_fsync - generic fsync implementation for simple filesystems 1515 * 1516 * @file: file to synchronize 1517 * @start: start offset in bytes 1518 * @end: end offset in bytes (inclusive) 1519 * @datasync: only synchronize essential metadata if true 1520 * 1521 * This is a generic implementation of the fsync method for simple 1522 * filesystems which track all non-inode metadata in the buffers list 1523 * hanging off the address_space structure. 1524 */ 1525 int __generic_file_fsync(struct file *file, loff_t start, loff_t end, 1526 int datasync) 1527 { 1528 struct inode *inode = file->f_mapping->host; 1529 int err; 1530 int ret; 1531 1532 err = file_write_and_wait_range(file, start, end); 1533 if (err) 1534 return err; 1535 1536 inode_lock(inode); 1537 ret = sync_mapping_buffers(inode->i_mapping); 1538 if (!(inode->i_state & I_DIRTY_ALL)) 1539 goto out; 1540 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 1541 goto out; 1542 1543 err = sync_inode_metadata(inode, 1); 1544 if (ret == 0) 1545 ret = err; 1546 1547 out: 1548 inode_unlock(inode); 1549 /* check and advance again to catch errors after syncing out buffers */ 1550 err = file_check_and_advance_wb_err(file); 1551 if (ret == 0) 1552 ret = err; 1553 return ret; 1554 } 1555 EXPORT_SYMBOL(__generic_file_fsync); 1556 1557 /** 1558 * generic_file_fsync - generic fsync implementation for simple filesystems 1559 * with flush 1560 * @file: file to synchronize 1561 * @start: start offset in bytes 1562 * @end: end offset in bytes (inclusive) 1563 * @datasync: only synchronize essential metadata if true 1564 * 1565 */ 1566 1567 int generic_file_fsync(struct file *file, loff_t start, loff_t end, 1568 int datasync) 1569 { 1570 struct inode *inode = file->f_mapping->host; 1571 int err; 1572 1573 err = __generic_file_fsync(file, start, end, datasync); 
1574 if (err) 1575 return err; 1576 return blkdev_issue_flush(inode->i_sb->s_bdev); 1577 } 1578 EXPORT_SYMBOL(generic_file_fsync); 1579 1580 /** 1581 * generic_check_addressable - Check addressability of file system 1582 * @blocksize_bits: log of file system block size 1583 * @num_blocks: number of blocks in file system 1584 * 1585 * Determine whether a file system with @num_blocks blocks (and a 1586 * block size of 2**@blocksize_bits) is addressable by the sector_t 1587 * and page cache of the system. Return 0 if so and -EFBIG otherwise. 1588 */ 1589 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) 1590 { 1591 u64 last_fs_block = num_blocks - 1; 1592 u64 last_fs_page = 1593 last_fs_block >> (PAGE_SHIFT - blocksize_bits); 1594 1595 if (unlikely(num_blocks == 0)) 1596 return 0; 1597 1598 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) 1599 return -EINVAL; 1600 1601 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || 1602 (last_fs_page > (pgoff_t)(~0ULL))) { 1603 return -EFBIG; 1604 } 1605 return 0; 1606 } 1607 EXPORT_SYMBOL(generic_check_addressable); 1608 1609 /* 1610 * No-op implementation of ->fsync for in-memory filesystems. 1611 */ 1612 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) 1613 { 1614 return 0; 1615 } 1616 EXPORT_SYMBOL(noop_fsync); 1617 1618 ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 1619 { 1620 /* 1621 * iomap based filesystems support direct I/O without need for 1622 * this callback. However, it still needs to be set in 1623 * inode->a_ops so that open/fcntl know that direct I/O is 1624 * generally supported. 
1625 */ 1626 return -EINVAL; 1627 } 1628 EXPORT_SYMBOL_GPL(noop_direct_IO); 1629 1630 /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ 1631 void kfree_link(void *p) 1632 { 1633 kfree(p); 1634 } 1635 EXPORT_SYMBOL(kfree_link); 1636 1637 struct inode *alloc_anon_inode(struct super_block *s) 1638 { 1639 static const struct address_space_operations anon_aops = { 1640 .dirty_folio = noop_dirty_folio, 1641 }; 1642 struct inode *inode = new_inode_pseudo(s); 1643 1644 if (!inode) 1645 return ERR_PTR(-ENOMEM); 1646 1647 inode->i_ino = get_next_ino(); 1648 inode->i_mapping->a_ops = &anon_aops; 1649 1650 /* 1651 * Mark the inode dirty from the very beginning, 1652 * that way it will never be moved to the dirty 1653 * list because mark_inode_dirty() will think 1654 * that it already _is_ on the dirty list. 1655 */ 1656 inode->i_state = I_DIRTY; 1657 inode->i_mode = S_IRUSR | S_IWUSR; 1658 inode->i_uid = current_fsuid(); 1659 inode->i_gid = current_fsgid(); 1660 inode->i_flags |= S_PRIVATE; 1661 simple_inode_init_ts(inode); 1662 return inode; 1663 } 1664 EXPORT_SYMBOL(alloc_anon_inode); 1665 1666 /** 1667 * simple_nosetlease - generic helper for prohibiting leases 1668 * @filp: file pointer 1669 * @arg: type of lease to obtain 1670 * @flp: new lease supplied for insertion 1671 * @priv: private data for lm_setup operation 1672 * 1673 * Generic helper for filesystems that do not wish to allow leases to be set. 1674 * All arguments are ignored and it just returns -EINVAL. 
*/
int
simple_nosetlease(struct file *filp, int arg, struct file_lease **flp,
		  void **priv)
{
	/* Unconditional refusal; none of the arguments is looked at. */
	return -EINVAL;
}
EXPORT_SYMBOL(simple_nosetlease);

/**
 * simple_get_link - generic helper to get the target of "fast" symlinks
 * @dentry: not used here
 * @inode: the symlink inode
 * @done: not used here
 *
 * Generic helper for filesystems to use for symlink inodes where a pointer to
 * the symlink target is stored in ->i_link.  NOTE: this isn't normally called,
 * since as an optimization the path lookup code uses any non-NULL ->i_link
 * directly, without calling ->get_link().  But ->get_link() still must be set,
 * to mark the inode_operations as being for a symlink.
 *
 * Return: the symlink target
 */
const char *simple_get_link(struct dentry *dentry, struct inode *inode,
			    struct delayed_call *done)
{
	return inode->i_link;
}
EXPORT_SYMBOL(simple_get_link);

/* Inode operations for symlinks whose target is kept in ->i_link. */
const struct inode_operations simple_symlink_inode_operations = {
	.get_link = simple_get_link,
};
EXPORT_SYMBOL(simple_symlink_inode_operations);

/*
 * Operations for a permanently empty directory.
1712 */ 1713 static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 1714 { 1715 return ERR_PTR(-ENOENT); 1716 } 1717 1718 static int empty_dir_setattr(struct mnt_idmap *idmap, 1719 struct dentry *dentry, struct iattr *attr) 1720 { 1721 return -EPERM; 1722 } 1723 1724 static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) 1725 { 1726 return -EOPNOTSUPP; 1727 } 1728 1729 static const struct inode_operations empty_dir_inode_operations = { 1730 .lookup = empty_dir_lookup, 1731 .setattr = empty_dir_setattr, 1732 .listxattr = empty_dir_listxattr, 1733 }; 1734 1735 static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) 1736 { 1737 /* An empty directory has two entries . and .. at offsets 0 and 1 */ 1738 return generic_file_llseek_size(file, offset, whence, 2, 2); 1739 } 1740 1741 static int empty_dir_readdir(struct file *file, struct dir_context *ctx) 1742 { 1743 dir_emit_dots(file, ctx); 1744 return 0; 1745 } 1746 1747 static const struct file_operations empty_dir_operations = { 1748 .llseek = empty_dir_llseek, 1749 .read = generic_read_dir, 1750 .iterate_shared = empty_dir_readdir, 1751 .fsync = noop_fsync, 1752 }; 1753 1754 1755 void make_empty_dir_inode(struct inode *inode) 1756 { 1757 set_nlink(inode, 2); 1758 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 1759 inode->i_uid = GLOBAL_ROOT_UID; 1760 inode->i_gid = GLOBAL_ROOT_GID; 1761 inode->i_rdev = 0; 1762 inode->i_size = 0; 1763 inode->i_blkbits = PAGE_SHIFT; 1764 inode->i_blocks = 0; 1765 1766 inode->i_op = &empty_dir_inode_operations; 1767 inode->i_opflags &= ~IOP_XATTR; 1768 inode->i_fop = &empty_dir_operations; 1769 } 1770 1771 bool is_empty_dir_inode(struct inode *inode) 1772 { 1773 return (inode->i_fop == &empty_dir_operations) && 1774 (inode->i_op == &empty_dir_inode_operations); 1775 } 1776 1777 #if IS_ENABLED(CONFIG_UNICODE) 1778 /** 1779 * generic_ci_d_compare - generic d_compare implementation for casefolding 
* filesystems
 * @dentry:	dentry whose name we are checking against
 * @len:	len of name of dentry
 * @str:	str pointer to name of dentry
 * @name:	Name to compare against
 *
 * Return: 0 if names match, 1 if mismatch, or -ERRNO
 */
int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
			 const char *str, const struct qstr *name)
{
	const struct dentry *parent;
	const struct inode *dir;
	char strbuf[DNAME_INLINE_LEN];
	struct qstr qstr;

	/*
	 * Attempt a case-sensitive match first. It is cheaper and
	 * should cover most lookups, including all the sane
	 * applications that expect a case-sensitive filesystem.
	 *
	 * This comparison is safe under RCU because the caller
	 * guarantees the consistency between str and len. See
	 * __d_lookup_rcu_op_compare() for details.
	 */
	if (len == name->len && !memcmp(str, name->name, len))
		return 0;

	/*
	 * Without a casefolded parent directory the exact comparison above
	 * is final, so report a mismatch.
	 */
	parent = READ_ONCE(dentry->d_parent);
	dir = READ_ONCE(parent->d_inode);
	if (!dir || !IS_CASEFOLDED(dir))
		return 1;

	/*
	 * If the dentry name is stored in-line, then it may be concurrently
	 * modified by a rename.  If this happens, the VFS will eventually retry
	 * the lookup, so it doesn't matter what ->d_compare() returns.
	 * However, it's unsafe to call utf8_strncasecmp() with an unstable
	 * string.  Therefore, we have to copy the name into a temporary buffer.
	 */
	if (len <= DNAME_INLINE_LEN - 1) {
		memcpy(strbuf, str, len);
		strbuf[len] = 0;
		str = strbuf;
		/* prevent compiler from optimizing out the temporary buffer */
		barrier();
	}
	qstr.len = len;
	qstr.name = str;

	return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr);
}
EXPORT_SYMBOL(generic_ci_d_compare);

/**
 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
 * @dentry:	dentry of the parent directory
 * @str:	qstr of name whose hash we should fill in
 *
 * Return: 0 if hash was successful or unchanged, and -EINVAL on error
 */
int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
{
	const struct inode *dir = READ_ONCE(dentry->d_inode);
	struct super_block *sb = dentry->d_sb;
	const struct unicode_map *um = sb->s_encoding;
	int ret;

	/* Leave the hash alone unless the directory is casefolded. */
	if (!dir || !IS_CASEFOLDED(dir))
		return 0;

	/* A casefold failure is only an error under strict encoding. */
	ret = utf8_casefold_hash(um, dentry, str);
	if (ret < 0 && sb_has_strict_encoding(sb))
		return -EINVAL;
	return 0;
}
EXPORT_SYMBOL(generic_ci_d_hash);

/* Dentry operations for casefolding filesystems (plus fscrypt revalidation if enabled). */
static const struct dentry_operations generic_ci_dentry_ops = {
	.d_hash = generic_ci_d_hash,
	.d_compare = generic_ci_d_compare,
#ifdef CONFIG_FS_ENCRYPTION
	.d_revalidate = fscrypt_d_revalidate,
#endif
};

/**
 * generic_ci_match() - Match a name (case-insensitively) with a dirent.
 * This is a filesystem helper for comparison with directory entries.
 * generic_ci_d_compare should be used in VFS' ->d_compare instead.
 *
 * @parent: Inode of the parent of the dirent under comparison
 * @name: name under lookup.
 * @folded_name: Optional pre-folded name under lookup
 * @de_name: Dirent name.
 * @de_name_len: dirent name length.
 *
 * Test whether a case-insensitive directory entry matches the filename
 * being searched.
If @folded_name is provided, it is used instead of 1878 * recalculating the casefold of @name. 1879 * 1880 * Return: > 0 if the directory entry matches, 0 if it doesn't match, or 1881 * < 0 on error. 1882 */ 1883 int generic_ci_match(const struct inode *parent, 1884 const struct qstr *name, 1885 const struct qstr *folded_name, 1886 const u8 *de_name, u32 de_name_len) 1887 { 1888 const struct super_block *sb = parent->i_sb; 1889 const struct unicode_map *um = sb->s_encoding; 1890 struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); 1891 struct qstr dirent = QSTR_INIT(de_name, de_name_len); 1892 int res = 0; 1893 1894 if (IS_ENCRYPTED(parent)) { 1895 const struct fscrypt_str encrypted_name = 1896 FSTR_INIT((u8 *) de_name, de_name_len); 1897 1898 if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent))) 1899 return -EINVAL; 1900 1901 decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); 1902 if (!decrypted_name.name) 1903 return -ENOMEM; 1904 res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, 1905 &decrypted_name); 1906 if (res < 0) { 1907 kfree(decrypted_name.name); 1908 return res; 1909 } 1910 dirent.name = decrypted_name.name; 1911 dirent.len = decrypted_name.len; 1912 } 1913 1914 /* 1915 * Attempt a case-sensitive match first. It is cheaper and 1916 * should cover most lookups, including all the sane 1917 * applications that expect a case-sensitive filesystem. 
1918 */ 1919 1920 if (dirent.len == name->len && 1921 !memcmp(name->name, dirent.name, dirent.len)) 1922 goto out; 1923 1924 if (folded_name->name) 1925 res = utf8_strncasecmp_folded(um, folded_name, &dirent); 1926 else 1927 res = utf8_strncasecmp(um, name, &dirent); 1928 1929 out: 1930 kfree(decrypted_name.name); 1931 if (res < 0 && sb_has_strict_encoding(sb)) { 1932 pr_err_ratelimited("Directory contains filename that is invalid UTF-8"); 1933 return 0; 1934 } 1935 return !res; 1936 } 1937 EXPORT_SYMBOL(generic_ci_match); 1938 #endif 1939 1940 #ifdef CONFIG_FS_ENCRYPTION 1941 static const struct dentry_operations generic_encrypted_dentry_ops = { 1942 .d_revalidate = fscrypt_d_revalidate, 1943 }; 1944 #endif 1945 1946 /** 1947 * generic_set_sb_d_ops - helper for choosing the set of 1948 * filesystem-wide dentry operations for the enabled features 1949 * @sb: superblock to be configured 1950 * 1951 * Filesystems supporting casefolding and/or fscrypt can call this 1952 * helper at mount-time to configure sb->s_d_op to best set of dentry 1953 * operations required for the enabled features. The helper must be 1954 * called after these have been configured, but before the root dentry 1955 * is created. 1956 */ 1957 void generic_set_sb_d_ops(struct super_block *sb) 1958 { 1959 #if IS_ENABLED(CONFIG_UNICODE) 1960 if (sb->s_encoding) { 1961 sb->s_d_op = &generic_ci_dentry_ops; 1962 return; 1963 } 1964 #endif 1965 #ifdef CONFIG_FS_ENCRYPTION 1966 if (sb->s_cop) { 1967 sb->s_d_op = &generic_encrypted_dentry_ops; 1968 return; 1969 } 1970 #endif 1971 } 1972 EXPORT_SYMBOL(generic_set_sb_d_ops); 1973 1974 /** 1975 * inode_maybe_inc_iversion - increments i_version 1976 * @inode: inode with the i_version that should be updated 1977 * @force: increment the counter even if it's not necessary? 1978 * 1979 * Every time the inode is modified, the i_version field must be seen to have 1980 * changed by any observer. 
*
 * If "force" is set or the QUERIED flag is set, then ensure that we increment
 * the value, and clear the queried flag.
 *
 * In the common case where neither is set, then we can return "false" without
 * updating i_version.
 *
 * If this function returns false, and no other metadata has changed, then we
 * can avoid logging the metadata.
 *
 * Return: true if i_version was incremented, false if it was left untouched.
 */
bool inode_maybe_inc_iversion(struct inode *inode, bool force)
{
	u64 cur, new;

	/*
	 * The i_version field is not strictly ordered with any other inode
	 * information, but the legacy inode_inc_iversion code used a spinlock
	 * to serialize increments.
	 *
	 * We add a full memory barrier to ensure that any de facto ordering
	 * with other state is preserved (either implicitly coming from cmpxchg
	 * or explicitly from smp_mb if we don't know upfront if we will execute
	 * the former).
	 *
	 * These barriers pair with inode_query_iversion().
	 */
	cur = inode_peek_iversion_raw(inode);
	if (!force && !(cur & I_VERSION_QUERIED)) {
		/* Likely no cmpxchg ahead: fence explicitly, then re-read. */
		smp_mb();
		cur = inode_peek_iversion_raw(inode);
	}

	/* Retry until the new value is installed or no update is needed. */
	do {
		/* If flag is clear then we needn't do anything */
		if (!force && !(cur & I_VERSION_QUERIED))
			return false;

		/* Since lowest bit is flag, add 2 to avoid it */
		new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
	} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
	return true;
}
EXPORT_SYMBOL(inode_maybe_inc_iversion);

/**
 * inode_query_iversion - read i_version for later use
 * @inode: inode from which i_version should be read
 *
 * Read the inode i_version counter. This should be used by callers that wish
 * to store the returned i_version for later comparison. This will guarantee
 * that a later query of the i_version will result in a different value if
 * anything has changed.
*
 * In this implementation, we fetch the current value, set the QUERIED flag and
 * then try to swap it into place with a cmpxchg, if it wasn't already set. If
 * that fails, we try again with the newly fetched value from the cmpxchg.
 *
 * Return: the counter value with the flag bit shifted out.
 */
u64 inode_query_iversion(struct inode *inode)
{
	u64 cur, new;
	bool fenced = false;

	/*
	 * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with
	 * inode_maybe_inc_iversion(), see that routine for more details.
	 */
	cur = inode_peek_iversion_raw(inode);
	do {
		/* If flag is already set, then no need to swap */
		if (cur & I_VERSION_QUERIED) {
			/* No cmpxchg was attempted yet, so fence explicitly. */
			if (!fenced)
				smp_mb();
			break;
		}

		/* A cmpxchg attempt follows and counts as the fence. */
		fenced = true;
		new = cur | I_VERSION_QUERIED;
	} while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
	return cur >> I_VERSION_QUERIED_SHIFT;
}
EXPORT_SYMBOL(inode_query_iversion);

/*
 * Finish a write in which direct I/O wrote @direct_written bytes and a
 * buffered fallback then returned @buffered_written (bytes or a negative
 * error).  On success the buffered range is written back and dropped from
 * the page cache and the sum of both counts is returned.
 */
ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter,
		ssize_t direct_written, ssize_t buffered_written)
{
	struct address_space *mapping = iocb->ki_filp->f_mapping;
	/* [pos, end] is the byte range covered by the buffered part. */
	loff_t pos = iocb->ki_pos - buffered_written;
	loff_t end = iocb->ki_pos - 1;
	int err;

	/*
	 * If the buffered write fallback returned an error, we want to return
	 * the number of bytes which were written by direct I/O, or the error
	 * code if that was zero.
	 *
	 * Note that this differs from normal direct-io semantics, which will
	 * return -EFOO even if some bytes were written.
	 */
	if (unlikely(buffered_written < 0)) {
		if (direct_written)
			return direct_written;
		return buffered_written;
	}

	/*
	 * We need to ensure that the page cache pages are written to disk and
	 * invalidated to preserve the expected O_DIRECT semantics.
	 */
	err = filemap_write_and_wait_range(mapping, pos, end);
	if (err < 0) {
		/*
		 * We don't know how much we wrote, so just return the number of
		 * bytes which were direct-written
		 */
		iocb->ki_pos -= buffered_written;
		if (direct_written)
			return direct_written;
		return err;
	}
	invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT);
	return direct_written + buffered_written;
}
EXPORT_SYMBOL_GPL(direct_write_fallback);

/**
 * simple_inode_init_ts - initialize the timestamps for a new inode
 * @inode: inode to be initialized
 *
 * When a new inode is created, most filesystems set the timestamps to the
 * current time. Add a helper to do this.
 *
 * Return: the timestamp that ctime, atime and mtime were set to.
 */
struct timespec64 simple_inode_init_ts(struct inode *inode)
{
	struct timespec64 ts = inode_set_ctime_current(inode);

	inode_set_atime_to_ts(inode, ts);
	inode_set_mtime_to_ts(inode, ts);
	return ts;
}
EXPORT_SYMBOL(simple_inode_init_ts);

/*
 * Under an RCU read-side guard, return the dentry stored in *@stashed with a
 * reference taken, or NULL if nothing is stashed or the stashed dentry is
 * already dead.
 */
static inline struct dentry *get_stashed_dentry(struct dentry **stashed)
{
	struct dentry *dentry;

	guard(rcu)();
	dentry = rcu_dereference(*stashed);
	if (!dentry)
		return NULL;
	if (!lockref_get_not_dead(&dentry->d_lockref))
		return NULL;
	return dentry;
}

/*
 * Allocate an immutable regular-file inode on @sb, let the filesystem's
 * stashed_operations (taken from sb->s_fs_info) initialise it from @data, and
 * attach it to a new anonymous dentry whose d_fsdata records @stashed.
 *
 * Returns the dentry or an ERR_PTR().  @data is released via ->put_data()
 * here only when the inode allocation itself fails.
 */
static struct dentry *prepare_anon_dentry(struct dentry **stashed,
					  struct super_block *sb,
					  void *data)
{
	struct dentry *dentry;
	struct inode *inode;
	const struct stashed_operations *sops = sb->s_fs_info;
	int ret;

	inode = new_inode_pseudo(sb);
	if (!inode) {
		sops->put_data(data);
		return ERR_PTR(-ENOMEM);
	}

	inode->i_flags |= S_IMMUTABLE;
	inode->i_mode = S_IFREG;
	simple_inode_init_ts(inode);

	ret = sops->init_inode(inode, data);
	if (ret < 0) {
		/* NOTE(review): presumably iput() also releases @data - confirm. */
		iput(inode);
		return ERR_PTR(ret);
	}

	/* Notice when this is changed. */
	WARN_ON_ONCE(!S_ISREG(inode->i_mode));
	WARN_ON_ONCE(!IS_IMMUTABLE(inode));

	dentry = d_alloc_anon(sb);
	if (!dentry) {
		iput(inode);
		return ERR_PTR(-ENOMEM);
	}

	/* Store address of location where dentry's supposed to be stashed. */
	dentry->d_fsdata = stashed;

	/* @data is now owned by the fs */
	d_instantiate(dentry, inode);
	return dentry;
}

/*
 * Try to install @dentry in *@stashed.  Returns @dentry if it was installed,
 * or the dentry that was already stashed (with a reference taken) if that
 * one is still alive.
 */
static struct dentry *stash_dentry(struct dentry **stashed,
				   struct dentry *dentry)
{
	guard(rcu)();
	for (;;) {
		struct dentry *old;

		/* Assume any old dentry was cleared out. */
		old = cmpxchg(stashed, NULL, dentry);
		if (likely(!old))
			return dentry;

		/* Check if somebody else installed a reusable dentry. */
		if (lockref_get_not_dead(&old->d_lockref))
			return old;

		/* There's an old dead dentry there, try to take it over. */
		if (likely(try_cmpxchg(stashed, &old, dentry)))
			return dentry;
	}
}

/**
 * path_from_stashed - create path from stashed or new dentry
 * @stashed:    where to retrieve or stash dentry
 * @mnt:        mnt of the filesystems to use
 * @data:       data to store in inode->i_private
 * @path:       path to create
 *
 * The function tries to retrieve a stashed dentry from @stashed. If the dentry
 * is still valid then it will be reused. If the dentry isn't usable the
 * function will allocate a new dentry and inode. It will then check again
 * whether it can reuse an existing dentry in case one has been added in the
 * meantime or update @stashed with the newly added dentry.
 *
 * Special-purpose helper for nsfs and pidfs.
 *
 * Return: On success zero and on failure a negative error is returned.
2216 */ 2217 int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, 2218 struct path *path) 2219 { 2220 struct dentry *dentry; 2221 const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; 2222 2223 /* See if dentry can be reused. */ 2224 path->dentry = get_stashed_dentry(stashed); 2225 if (path->dentry) { 2226 sops->put_data(data); 2227 goto out_path; 2228 } 2229 2230 /* Allocate a new dentry. */ 2231 dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data); 2232 if (IS_ERR(dentry)) 2233 return PTR_ERR(dentry); 2234 2235 /* Added a new dentry. @data is now owned by the filesystem. */ 2236 path->dentry = stash_dentry(stashed, dentry); 2237 if (path->dentry != dentry) 2238 dput(dentry); 2239 2240 out_path: 2241 WARN_ON_ONCE(path->dentry->d_fsdata != stashed); 2242 WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); 2243 path->mnt = mntget(mnt); 2244 return 0; 2245 } 2246 2247 void stashed_dentry_prune(struct dentry *dentry) 2248 { 2249 struct dentry **stashed = dentry->d_fsdata; 2250 struct inode *inode = d_inode(dentry); 2251 2252 if (WARN_ON_ONCE(!stashed)) 2253 return; 2254 2255 if (!inode) 2256 return; 2257 2258 /* 2259 * Only replace our own @dentry as someone else might've 2260 * already cleared out @dentry and stashed their own 2261 * dentry in there. 2262 */ 2263 cmpxchg(stashed, dentry, NULL); 2264 } 2265