1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * fs/libfs.c 4 * Library for filesystems writers. 5 */ 6 7 #include <linux/blkdev.h> 8 #include <linux/export.h> 9 #include <linux/pagemap.h> 10 #include <linux/slab.h> 11 #include <linux/cred.h> 12 #include <linux/mount.h> 13 #include <linux/vfs.h> 14 #include <linux/quotaops.h> 15 #include <linux/mutex.h> 16 #include <linux/namei.h> 17 #include <linux/exportfs.h> 18 #include <linux/iversion.h> 19 #include <linux/writeback.h> 20 #include <linux/buffer_head.h> /* sync_mapping_buffers */ 21 #include <linux/fs_context.h> 22 #include <linux/pseudo_fs.h> 23 #include <linux/fsnotify.h> 24 #include <linux/unicode.h> 25 #include <linux/fscrypt.h> 26 #include <linux/pidfs.h> 27 28 #include <linux/uaccess.h> 29 30 #include "internal.h" 31 32 int simple_getattr(struct mnt_idmap *idmap, const struct path *path, 33 struct kstat *stat, u32 request_mask, 34 unsigned int query_flags) 35 { 36 struct inode *inode = d_inode(path->dentry); 37 generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); 38 stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9); 39 return 0; 40 } 41 EXPORT_SYMBOL(simple_getattr); 42 43 int simple_statfs(struct dentry *dentry, struct kstatfs *buf) 44 { 45 u64 id = huge_encode_dev(dentry->d_sb->s_dev); 46 47 buf->f_fsid = u64_to_fsid(id); 48 buf->f_type = dentry->d_sb->s_magic; 49 buf->f_bsize = PAGE_SIZE; 50 buf->f_namelen = NAME_MAX; 51 return 0; 52 } 53 EXPORT_SYMBOL(simple_statfs); 54 55 /* 56 * Retaining negative dentries for an in-memory filesystem just wastes 57 * memory and lookup time: arrange for them to be deleted immediately. 58 */ 59 int always_delete_dentry(const struct dentry *dentry) 60 { 61 return 1; 62 } 63 EXPORT_SYMBOL(always_delete_dentry); 64 65 const struct dentry_operations simple_dentry_operations = { 66 .d_delete = always_delete_dentry, 67 }; 68 EXPORT_SYMBOL(simple_dentry_operations); 69 70 /* 71 * Lookup the data. 
This is trivial - if the dentry didn't already 72 * exist, we know it is negative. Set d_op to delete negative dentries. 73 */ 74 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 75 { 76 if (dentry->d_name.len > NAME_MAX) 77 return ERR_PTR(-ENAMETOOLONG); 78 if (!dentry->d_sb->s_d_op) 79 d_set_d_op(dentry, &simple_dentry_operations); 80 81 if (IS_ENABLED(CONFIG_UNICODE) && IS_CASEFOLDED(dir)) 82 return NULL; 83 84 d_add(dentry, NULL); 85 return NULL; 86 } 87 EXPORT_SYMBOL(simple_lookup); 88 89 int dcache_dir_open(struct inode *inode, struct file *file) 90 { 91 file->private_data = d_alloc_cursor(file->f_path.dentry); 92 93 return file->private_data ? 0 : -ENOMEM; 94 } 95 EXPORT_SYMBOL(dcache_dir_open); 96 97 int dcache_dir_close(struct inode *inode, struct file *file) 98 { 99 dput(file->private_data); 100 return 0; 101 } 102 EXPORT_SYMBOL(dcache_dir_close); 103 104 /* parent is locked at least shared */ 105 /* 106 * Returns an element of siblings' list. 107 * We are looking for <count>th positive after <p>; if 108 * found, dentry is grabbed and returned to caller. 109 * If no such element exists, NULL is returned. 
110 */ 111 static struct dentry *scan_positives(struct dentry *cursor, 112 struct hlist_node **p, 113 loff_t count, 114 struct dentry *last) 115 { 116 struct dentry *dentry = cursor->d_parent, *found = NULL; 117 118 spin_lock(&dentry->d_lock); 119 while (*p) { 120 struct dentry *d = hlist_entry(*p, struct dentry, d_sib); 121 p = &d->d_sib.next; 122 // we must at least skip cursors, to avoid livelocks 123 if (d->d_flags & DCACHE_DENTRY_CURSOR) 124 continue; 125 if (simple_positive(d) && !--count) { 126 spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); 127 if (simple_positive(d)) 128 found = dget_dlock(d); 129 spin_unlock(&d->d_lock); 130 if (likely(found)) 131 break; 132 count = 1; 133 } 134 if (need_resched()) { 135 if (!hlist_unhashed(&cursor->d_sib)) 136 __hlist_del(&cursor->d_sib); 137 hlist_add_behind(&cursor->d_sib, &d->d_sib); 138 p = &cursor->d_sib.next; 139 spin_unlock(&dentry->d_lock); 140 cond_resched(); 141 spin_lock(&dentry->d_lock); 142 } 143 } 144 spin_unlock(&dentry->d_lock); 145 dput(last); 146 return found; 147 } 148 149 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) 150 { 151 struct dentry *dentry = file->f_path.dentry; 152 switch (whence) { 153 case 1: 154 offset += file->f_pos; 155 fallthrough; 156 case 0: 157 if (offset >= 0) 158 break; 159 fallthrough; 160 default: 161 return -EINVAL; 162 } 163 if (offset != file->f_pos) { 164 struct dentry *cursor = file->private_data; 165 struct dentry *to = NULL; 166 167 inode_lock_shared(dentry->d_inode); 168 169 if (offset > 2) 170 to = scan_positives(cursor, &dentry->d_children.first, 171 offset - 2, NULL); 172 spin_lock(&dentry->d_lock); 173 hlist_del_init(&cursor->d_sib); 174 if (to) 175 hlist_add_behind(&cursor->d_sib, &to->d_sib); 176 spin_unlock(&dentry->d_lock); 177 dput(to); 178 179 file->f_pos = offset; 180 181 inode_unlock_shared(dentry->d_inode); 182 } 183 return offset; 184 } 185 EXPORT_SYMBOL(dcache_dir_lseek); 186 187 /* 188 * Directory is locked and all positive 
dentries in it are safe, since
 * for ramfs-type trees they can't go away without unlink() or rmdir(),
 * both impossible due to the lock on directory.
 */

/*
 * dcache_readdir - emit the positive children of a dcache-backed directory
 * @file: open directory; ->private_data is the cursor dentry that marks
 *        the current position in the parent's list of children
 * @ctx: directory iteration context
 *
 * Always returns 0.
 */
int dcache_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dentry = file->f_path.dentry;
	struct dentry *cursor = file->private_data;
	struct dentry *next = NULL;
	struct hlist_node **p;

	/* "." and ".." come first; give up if they could not be emitted */
	if (!dir_emit_dots(file, ctx))
		return 0;

	/* right after the dots start at the first child, else after the cursor */
	if (ctx->pos == 2)
		p = &dentry->d_children.first;
	else
		p = &cursor->d_sib.next;

	/* each call drops the previous entry's reference and grabs the next */
	while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
		if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
			      d_inode(next)->i_ino,
			      fs_umode_to_dtype(d_inode(next)->i_mode)))
			break;
		ctx->pos++;
		p = &next->d_sib.next;
	}
	/*
	 * Re-park the cursor: in front of the entry that was not emitted, or
	 * off the list altogether when the walk ran to the end.
	 */
	spin_lock(&dentry->d_lock);
	hlist_del_init(&cursor->d_sib);
	if (next)
		hlist_add_before(&cursor->d_sib, &next->d_sib);
	spin_unlock(&dentry->d_lock);
	dput(next);

	return 0;
}
EXPORT_SYMBOL(dcache_readdir);

/* ->read for directories: always fails with -EISDIR */
ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
	return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

/* file operations for directories that live purely in the dcache */
const struct file_operations simple_dir_operations = {
	.open		= dcache_dir_open,
	.release	= dcache_dir_close,
	.llseek		= dcache_dir_lseek,
	.read		= generic_read_dir,
	.iterate_shared	= dcache_readdir,
	.fsync		= noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

/* inode operations for such directories: lookup only */
const struct inode_operations simple_dir_inode_operations = {
	.lookup		= simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

/* simple_offset_add() never assigns these to a dentry */
enum {
	DIR_OFFSET_FIRST	= 2,		/* Find first real entry */
	DIR_OFFSET_EOD		= S32_MAX,
};

/* simple_offset_add() allocation range */
enum {
	DIR_OFFSET_MIN		= DIR_OFFSET_FIRST + 1,
	DIR_OFFSET_MAX		= DIR_OFFSET_EOD - 1,
};

260 static void offset_set(struct dentry *dentry, long offset) 261 { 262 dentry->d_fsdata = (void *)offset; 263 } 264 265 static long dentry2offset(struct dentry *dentry) 266 { 267 return (long)dentry->d_fsdata; 268 } 269 270 static struct lock_class_key simple_offset_lock_class; 271 272 /** 273 * simple_offset_init - initialize an offset_ctx 274 * @octx: directory offset map to be initialized 275 * 276 */ 277 void simple_offset_init(struct offset_ctx *octx) 278 { 279 mt_init_flags(&octx->mt, MT_FLAGS_ALLOC_RANGE); 280 lockdep_set_class(&octx->mt.ma_lock, &simple_offset_lock_class); 281 octx->next_offset = DIR_OFFSET_MIN; 282 } 283 284 /** 285 * simple_offset_add - Add an entry to a directory's offset map 286 * @octx: directory offset ctx to be updated 287 * @dentry: new dentry being added 288 * 289 * Returns zero on success. @octx and the dentry's offset are updated. 290 * Otherwise, a negative errno value is returned. 291 */ 292 int simple_offset_add(struct offset_ctx *octx, struct dentry *dentry) 293 { 294 unsigned long offset; 295 int ret; 296 297 if (dentry2offset(dentry) != 0) 298 return -EBUSY; 299 300 ret = mtree_alloc_cyclic(&octx->mt, &offset, dentry, DIR_OFFSET_MIN, 301 DIR_OFFSET_MAX, &octx->next_offset, 302 GFP_KERNEL); 303 if (unlikely(ret < 0)) 304 return ret == -EBUSY ? 
-ENOSPC : ret; 305 306 offset_set(dentry, offset); 307 return 0; 308 } 309 310 static int simple_offset_replace(struct offset_ctx *octx, struct dentry *dentry, 311 long offset) 312 { 313 int ret; 314 315 ret = mtree_store(&octx->mt, offset, dentry, GFP_KERNEL); 316 if (ret) 317 return ret; 318 offset_set(dentry, offset); 319 return 0; 320 } 321 322 /** 323 * simple_offset_remove - Remove an entry to a directory's offset map 324 * @octx: directory offset ctx to be updated 325 * @dentry: dentry being removed 326 * 327 */ 328 void simple_offset_remove(struct offset_ctx *octx, struct dentry *dentry) 329 { 330 long offset; 331 332 offset = dentry2offset(dentry); 333 if (offset == 0) 334 return; 335 336 mtree_erase(&octx->mt, offset); 337 offset_set(dentry, 0); 338 } 339 340 /** 341 * simple_offset_rename - handle directory offsets for rename 342 * @old_dir: parent directory of source entry 343 * @old_dentry: dentry of source entry 344 * @new_dir: parent_directory of destination entry 345 * @new_dentry: dentry of destination 346 * 347 * Caller provides appropriate serialization. 348 * 349 * User space expects the directory offset value of the replaced 350 * (new) directory entry to be unchanged after a rename. 351 * 352 * Returns zero on success, a negative errno value on failure. 
353 */ 354 int simple_offset_rename(struct inode *old_dir, struct dentry *old_dentry, 355 struct inode *new_dir, struct dentry *new_dentry) 356 { 357 struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir); 358 struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir); 359 long new_offset = dentry2offset(new_dentry); 360 361 simple_offset_remove(old_ctx, old_dentry); 362 363 if (new_offset) { 364 offset_set(new_dentry, 0); 365 return simple_offset_replace(new_ctx, old_dentry, new_offset); 366 } 367 return simple_offset_add(new_ctx, old_dentry); 368 } 369 370 /** 371 * simple_offset_rename_exchange - exchange rename with directory offsets 372 * @old_dir: parent of dentry being moved 373 * @old_dentry: dentry being moved 374 * @new_dir: destination parent 375 * @new_dentry: destination dentry 376 * 377 * This API preserves the directory offset values. Caller provides 378 * appropriate serialization. 379 * 380 * Returns zero on success. Otherwise a negative errno is returned and the 381 * rename is rolled back. 
*/
int simple_offset_rename_exchange(struct inode *old_dir,
				  struct dentry *old_dentry,
				  struct inode *new_dir,
				  struct dentry *new_dentry)
{
	struct offset_ctx *old_ctx = old_dir->i_op->get_offset_ctx(old_dir);
	struct offset_ctx *new_ctx = new_dir->i_op->get_offset_ctx(new_dir);
	/* remember both offsets so they can be swapped, or restored on error */
	long old_index = dentry2offset(old_dentry);
	long new_index = dentry2offset(new_dentry);
	int ret;

	/* take both entries out of their maps before re-inserting them swapped */
	simple_offset_remove(old_ctx, old_dentry);
	simple_offset_remove(new_ctx, new_dentry);

	ret = simple_offset_replace(new_ctx, old_dentry, new_index);
	if (ret)
		goto out_restore;

	ret = simple_offset_replace(old_ctx, new_dentry, old_index);
	if (ret) {
		/* undo the first half of the swap before restoring */
		simple_offset_remove(new_ctx, old_dentry);
		goto out_restore;
	}

	ret = simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
	if (ret) {
		/* undo both halves of the swap before restoring */
		simple_offset_remove(new_ctx, old_dentry);
		simple_offset_remove(old_ctx, new_dentry);
		goto out_restore;
	}
	return 0;

out_restore:
	/* best effort: failures while re-inserting are deliberately ignored */
	(void)simple_offset_replace(old_ctx, old_dentry, old_index);
	(void)simple_offset_replace(new_ctx, new_dentry, new_index);
	return ret;
}

/**
 * simple_offset_destroy - Release offset map
 * @octx: directory offset ctx that is about to be destroyed
 *
 * During fs teardown (eg. umount), a directory's offset map might still
 * contain entries. mtree_destroy() cleans out anything that remains.
 */
void simple_offset_destroy(struct offset_ctx *octx)
{
	mtree_destroy(&octx->mt);
}

/**
 * offset_dir_llseek - Advance the read position of a directory descriptor
 * @file: an open directory whose position is to be updated
 * @offset: a byte offset
 * @whence: enumerator describing the starting position for this update
 *
 * SEEK_END, SEEK_DATA, and SEEK_HOLE are not supported for directories.
*
 * Returns the updated read position if successful; otherwise a
 * negative errno is returned and the read position remains unchanged.
 */
static loff_t offset_dir_llseek(struct file *file, loff_t offset, int whence)
{
	switch (whence) {
	case SEEK_CUR:
		offset += file->f_pos;
		fallthrough;
	case SEEK_SET:
		if (offset >= 0)
			break;
		fallthrough;
	default:
		return -EINVAL;
	}

	return vfs_setpos(file, offset, LONG_MAX);
}

/*
 * Find a positive child of @parent and return it with a reference held,
 * or NULL if there is none.
 *
 * The search starts at the sibling after @dentry when @next is true; else
 * at @dentry itself, or at @parent's first child when @dentry is NULL.
 * The sibling list is walked under @parent's d_lock.
 */
static struct dentry *find_positive_dentry(struct dentry *parent,
					   struct dentry *dentry,
					   bool next)
{
	struct dentry *found = NULL;

	spin_lock(&parent->d_lock);
	if (next)
		dentry = d_next_sibling(dentry);
	else if (!dentry)
		dentry = d_first_child(parent);
	hlist_for_each_entry_from(dentry, d_sib) {
		if (!simple_positive(dentry))
			continue;
		/* recheck under the child's own lock before taking a reference */
		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
		if (simple_positive(dentry))
			found = dget_dlock(dentry);
		spin_unlock(&dentry->d_lock);
		if (likely(found))
			break;
	}
	spin_unlock(&parent->d_lock);
	return found;
}

/*
 * Translate a directory offset into the dentry at which iteration should
 * (re)start.  Returns a referenced positive dentry, or NULL.
 */
static noinline_for_stack struct dentry *
offset_dir_lookup(struct dentry *parent, loff_t offset)
{
	struct inode *inode = d_inode(parent);
	struct offset_ctx *octx = inode->i_op->get_offset_ctx(inode);
	struct dentry *child, *found = NULL;

	MA_STATE(mas, &octx->mt, offset, offset);

	if (offset == DIR_OFFSET_FIRST)
		/* position right after the dots: start at the first child */
		found = find_positive_dentry(parent, NULL, false);
	else {
		/*
		 * NOTE(review): mas_find_rev() presumably yields the entry at
		 * @offset or the nearest one below it, no lower than
		 * DIR_OFFSET_MIN - confirm against the maple tree API.  A NULL
		 * result makes find_positive_dentry() start at the first child.
		 */
		rcu_read_lock();
		child = mas_find_rev(&mas, DIR_OFFSET_MIN);
		found = find_positive_dentry(parent, child, false);
		rcu_read_unlock();
	}
	return found;
}

/* Hand one entry (name, inode number, type) to the dir_context actor. */
static bool offset_dir_emit(struct dir_context *ctx, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	return dir_emit(ctx, dentry->d_name.name, dentry->d_name.len,
			inode->i_ino, fs_umode_to_dtype(inode->i_mode));
}

/*
 * Emit entries starting from the one ctx->pos resolves to.  ctx->pos
 * tracks the offset of the entry being emitted; it is left there if the
 * emit fails and set to DIR_OFFSET_EOD once the entries are exhausted.
 */
static void offset_iterate_dir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dir = file->f_path.dentry;
	struct dentry *dentry;

	dentry = offset_dir_lookup(dir, ctx->pos);
	if (!dentry)
		goto out_eod;
	while (true) {
		struct dentry *next;

		ctx->pos = dentry2offset(dentry);
		if (!offset_dir_emit(ctx, dentry))
			break;

		/* grab the successor before dropping the current entry */
		next = find_positive_dentry(dir, dentry, true);
		dput(dentry);

		if (!next)
			goto out_eod;
		dentry = next;
	}
	/* emit failed: drop the reference on the entry that did not fit */
	dput(dentry);
	return;

out_eod:
	ctx->pos = DIR_OFFSET_EOD;
}

/**
 * offset_readdir - Emit entries starting at offset @ctx->pos
 * @file: an open directory to iterate over
 * @ctx: directory iteration context
 *
 * Caller must hold @file's i_rwsem to prevent insertion or removal of
 * entries during this call.
 *
 * On entry, @ctx->pos contains an offset that represents the first entry
 * to be read from the directory.
 *
 * The operation continues until there are no more entries to read, or
 * until the ctx->actor indicates there is no more space in the caller's
 * output buffer.
 *
 * On return, @ctx->pos contains an offset that will read the next entry
 * in this directory when offset_readdir() is called again with @ctx.
 * Caller places this value in the d_off field of the last entry in the
 * user's buffer.
*
 * Return values:
 *   %0 - Complete
 */
static int offset_readdir(struct file *file, struct dir_context *ctx)
{
	struct dentry *dir = file->f_path.dentry;

	lockdep_assert_held(&d_inode(dir)->i_rwsem);

	if (!dir_emit_dots(file, ctx))
		return 0;
	/* once end-of-directory has been recorded, emit nothing further */
	if (ctx->pos != DIR_OFFSET_EOD)
		offset_iterate_dir(file, ctx);
	return 0;
}

/* file operations for directories that use an offset map (see above) */
const struct file_operations simple_offset_dir_operations = {
	.llseek		= offset_dir_llseek,
	.iterate_shared	= offset_readdir,
	.read		= generic_read_dir,
	.fsync		= noop_fsync,
};

/*
 * Return the next positive child of @parent after @prev (or the first one
 * when @prev is NULL) with a reference held, or NULL if there is none.
 * The reference on @prev is dropped in either case.
 */
struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
	struct dentry *child = NULL, *d;

	spin_lock(&parent->d_lock);
	d = prev ? d_next_sibling(prev) : d_first_child(parent);
	hlist_for_each_entry_from(d, d_sib) {
		if (simple_positive(d)) {
			/* recheck under the child's own lock before grabbing it */
			spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
			if (simple_positive(d))
				child = dget_dlock(d);
			spin_unlock(&d->d_lock);
			if (likely(child))
				break;
		}
	}
	spin_unlock(&parent->d_lock);
	dput(prev);
	return child;
}
EXPORT_SYMBOL(find_next_child);

/*
 * simple_recursive_removal - remove @dentry and everything below it
 * @dentry: root of the subtree to remove
 * @callback: optional; called on each victim that is still positive,
 *            after the fsnotify event and before its pinning reference
 *            is dropped
 *
 * Iterative depth-first walk: descend into the first remaining positive
 * child; when a node has none left, kill it and ascend to its parent.
 * The walk ends once @dentry itself has been killed.
 */
void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
	struct dentry *this = dget(dentry);
	while (true) {
		struct dentry *victim = NULL, *child;
		struct inode *inode = this->d_inode;

		inode_lock(inode);
		if (d_is_dir(this))
			inode->i_flags |= S_DEAD;
		while ((child = find_next_child(this, victim)) == NULL) {
			// kill and ascend
			// update metadata while it's still locked
			inode_set_ctime_current(inode);
			clear_nlink(inode);
			inode_unlock(inode);
			victim = this;
			this = this->d_parent;
			inode = this->d_inode;
			/* from here on @inode is the victim's parent, locked */
			inode_lock(inode);
			if (simple_positive(victim)) {
				d_invalidate(victim);	// avoid lost mounts
				if (d_is_dir(victim))
					fsnotify_rmdir(inode, victim);
				else
					fsnotify_unlink(inode, victim);
				if (callback)
					callback(victim);
				dput(victim);		// unpin it
			}
			if (victim == dentry) {
				/* the whole subtree is gone: fix up its parent */
				inode_set_mtime_to_ts(inode,
						      inode_set_ctime_current(inode));
				if (d_is_dir(dentry))
					drop_nlink(inode);
				inode_unlock(inode);
				/* pairs with the dget() at the top */
				dput(dentry);
				return;
			}
		}
		inode_unlock(inode);
		this = child;
	}
}
EXPORT_SYMBOL(simple_recursive_removal);

/* default super_operations for pseudo and simple filesystems */
static const struct super_operations simple_super_operations = {
	.statfs		= simple_statfs,
};

/*
 * fill_super callback for pseudo filesystems: set up the superblock from
 * the pseudo_fs_context in fc->fs_private and create the root directory.
 * Returns 0, or -ENOMEM if the root inode or dentry cannot be allocated.
 */
static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
	struct pseudo_fs_context *ctx = fc->fs_private;
	struct inode *root;

	s->s_maxbytes = MAX_LFS_FILESIZE;
	s->s_blocksize = PAGE_SIZE;
	s->s_blocksize_bits = PAGE_SHIFT;
	s->s_magic = ctx->magic;
	/* fall back to the default ops when the context supplies none */
	s->s_op = ctx->ops ?: &simple_super_operations;
	s->s_export_op = ctx->eops;
	s->s_xattr = ctx->xattr;
	s->s_time_gran = 1;
	root = new_inode(s);
	if (!root)
		return -ENOMEM;

	/*
	 * since this is the first inode, make it number 1. New inodes created
	 * after this must take care not to collide with it (by passing
	 * max_reserved of 1 to iunique).
	 */
	root->i_ino = 1;
	root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
	simple_inode_init_ts(root);
	s->s_root = d_make_root(root);
	if (!s->s_root)
		return -ENOMEM;
	s->s_d_op = ctx->dops;
	return 0;
}

/* ->get_tree: build the superblock with pseudo_fs_fill_super() */
static int pseudo_fs_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, pseudo_fs_fill_super);
}

/* ->free: release the pseudo_fs_context allocated by init_pseudo() */
static void pseudo_fs_free(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

static const struct fs_context_operations pseudo_fs_context_ops = {
	.free		= pseudo_fs_free,
	.get_tree	= pseudo_fs_get_tree,
};

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 *
 * Allocates a zeroed pseudo_fs_context carrying @magic and attaches it,
 * together with the pseudo-fs context operations, to @fc.  Returns the
 * context so the caller can fill in further fields, or NULL if the
 * allocation failed (in which case @fc is left untouched).
 */
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
					unsigned long magic)
{
	struct pseudo_fs_context *ctx;

	ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
	if (likely(ctx)) {
		ctx->magic = magic;
		fc->fs_private = ctx;
		fc->ops = &pseudo_fs_context_ops;
		fc->sb_flags |= SB_NOUSER;
		fc->global = true;
	}
	return ctx;
}
EXPORT_SYMBOL(init_pseudo);

/*
 * ->open helper: copy the inode's i_private into file->private_data when
 * it is non-NULL.  Always returns 0.
 */
int simple_open(struct inode *inode, struct file *file)
{
	if (inode->i_private)
		file->private_data = inode->i_private;
	return 0;
}
EXPORT_SYMBOL(simple_open);

/*
 * ->link helper: make @dentry in @dir another name for @old_dentry's
 * inode.  Always returns 0.
 */
int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(old_dentry);

	/* one timestamp for the inode's ctime and the directory's ctime/mtime */
	inode_set_mtime_to_ts(dir,
			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
	inc_nlink(inode);
	ihold(inode);
	/* extra dentry reference; simple_unlink() drops one with dput() */
	dget(dentry);
	d_instantiate(dentry, inode);
	return 0;
}
EXPORT_SYMBOL(simple_link);

/*
 * Returns 1 if @dentry has no positive children, 0 otherwise.  The list
 * of children is walked under @dentry's d_lock.
 */
int simple_empty(struct dentry *dentry)
{
	struct dentry *child;
	int ret = 0;

	spin_lock(&dentry->d_lock);
	hlist_for_each_entry(child, &dentry->d_children, d_sib) {
		spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
		if (simple_positive(child)) {
			spin_unlock(&child->d_lock);
			goto out;
		}
		spin_unlock(&child->d_lock);
	}
	ret = 1;
out:
	spin_unlock(&dentry->d_lock);
	return ret;
}
EXPORT_SYMBOL(simple_empty);

/*
 * ->unlink helper: update timestamps, drop one link from the inode and
 * drop one reference on @dentry.  Always returns 0.
 */
int simple_unlink(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	/* one timestamp for the inode's ctime and the directory's ctime/mtime */
	inode_set_mtime_to_ts(dir,
			      inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode)));
	drop_nlink(inode);
	dput(dentry);
	return 0;
}
EXPORT_SYMBOL(simple_unlink);

/*
 * ->rmdir helper.  Returns -ENOTEMPTY if @dentry still has positive
 * children, 0 otherwise.
 */
int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
	if (!simple_empty(dentry))
		return -ENOTEMPTY;

	/* the victim loses two links: one here, one in simple_unlink() */
	drop_nlink(d_inode(dentry));
	simple_unlink(dir, dentry);
	/* and the parent loses one */
	drop_nlink(dir);
	return 0;
}
EXPORT_SYMBOL(simple_rmdir);

/**
 * simple_rename_timestamp - update the various inode timestamps for rename
 * @old_dir: old parent directory
 * @old_dentry: dentry that is being renamed
 * @new_dir: new parent directory
 * @new_dentry: target for rename
 *
 * POSIX mandates that the old and new parent directories have their ctime and
 * mtime updated, and that inodes of @old_dentry and @new_dentry (if any), have
 * their ctime updated.
804 */ 805 void simple_rename_timestamp(struct inode *old_dir, struct dentry *old_dentry, 806 struct inode *new_dir, struct dentry *new_dentry) 807 { 808 struct inode *newino = d_inode(new_dentry); 809 810 inode_set_mtime_to_ts(old_dir, inode_set_ctime_current(old_dir)); 811 if (new_dir != old_dir) 812 inode_set_mtime_to_ts(new_dir, 813 inode_set_ctime_current(new_dir)); 814 inode_set_ctime_current(d_inode(old_dentry)); 815 if (newino) 816 inode_set_ctime_current(newino); 817 } 818 EXPORT_SYMBOL_GPL(simple_rename_timestamp); 819 820 int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry, 821 struct inode *new_dir, struct dentry *new_dentry) 822 { 823 bool old_is_dir = d_is_dir(old_dentry); 824 bool new_is_dir = d_is_dir(new_dentry); 825 826 if (old_dir != new_dir && old_is_dir != new_is_dir) { 827 if (old_is_dir) { 828 drop_nlink(old_dir); 829 inc_nlink(new_dir); 830 } else { 831 drop_nlink(new_dir); 832 inc_nlink(old_dir); 833 } 834 } 835 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 836 return 0; 837 } 838 EXPORT_SYMBOL_GPL(simple_rename_exchange); 839 840 int simple_rename(struct mnt_idmap *idmap, struct inode *old_dir, 841 struct dentry *old_dentry, struct inode *new_dir, 842 struct dentry *new_dentry, unsigned int flags) 843 { 844 int they_are_dirs = d_is_dir(old_dentry); 845 846 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) 847 return -EINVAL; 848 849 if (flags & RENAME_EXCHANGE) 850 return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); 851 852 if (!simple_empty(new_dentry)) 853 return -ENOTEMPTY; 854 855 if (d_really_is_positive(new_dentry)) { 856 simple_unlink(new_dir, new_dentry); 857 if (they_are_dirs) { 858 drop_nlink(d_inode(new_dentry)); 859 drop_nlink(old_dir); 860 } 861 } else if (they_are_dirs) { 862 drop_nlink(old_dir); 863 inc_nlink(new_dir); 864 } 865 866 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry); 867 return 0; 868 } 869 EXPORT_SYMBOL(simple_rename); 870 871 
/** 872 * simple_setattr - setattr for simple filesystem 873 * @idmap: idmap of the target mount 874 * @dentry: dentry 875 * @iattr: iattr structure 876 * 877 * Returns 0 on success, -error on failure. 878 * 879 * simple_setattr is a simple ->setattr implementation without a proper 880 * implementation of size changes. 881 * 882 * It can either be used for in-memory filesystems or special files 883 * on simple regular filesystems. Anything that needs to change on-disk 884 * or wire state on size changes needs its own setattr method. 885 */ 886 int simple_setattr(struct mnt_idmap *idmap, struct dentry *dentry, 887 struct iattr *iattr) 888 { 889 struct inode *inode = d_inode(dentry); 890 int error; 891 892 error = setattr_prepare(idmap, dentry, iattr); 893 if (error) 894 return error; 895 896 if (iattr->ia_valid & ATTR_SIZE) 897 truncate_setsize(inode, iattr->ia_size); 898 setattr_copy(idmap, inode, iattr); 899 mark_inode_dirty(inode); 900 return 0; 901 } 902 EXPORT_SYMBOL(simple_setattr); 903 904 static int simple_read_folio(struct file *file, struct folio *folio) 905 { 906 folio_zero_range(folio, 0, folio_size(folio)); 907 flush_dcache_folio(folio); 908 folio_mark_uptodate(folio); 909 folio_unlock(folio); 910 return 0; 911 } 912 913 int simple_write_begin(struct file *file, struct address_space *mapping, 914 loff_t pos, unsigned len, 915 struct folio **foliop, void **fsdata) 916 { 917 struct folio *folio; 918 919 folio = __filemap_get_folio(mapping, pos / PAGE_SIZE, FGP_WRITEBEGIN, 920 mapping_gfp_mask(mapping)); 921 if (IS_ERR(folio)) 922 return PTR_ERR(folio); 923 924 *foliop = folio; 925 926 if (!folio_test_uptodate(folio) && (len != folio_size(folio))) { 927 size_t from = offset_in_folio(folio, pos); 928 929 folio_zero_segments(folio, 0, from, 930 from + len, folio_size(folio)); 931 } 932 return 0; 933 } 934 EXPORT_SYMBOL(simple_write_begin); 935 936 /** 937 * simple_write_end - .write_end helper for non-block-device FSes 938 * @file: See .write_end of 
address_space_operations 939 * @mapping: " 940 * @pos: " 941 * @len: " 942 * @copied: " 943 * @folio: " 944 * @fsdata: " 945 * 946 * simple_write_end does the minimum needed for updating a folio after 947 * writing is done. It has the same API signature as the .write_end of 948 * address_space_operations vector. So it can just be set onto .write_end for 949 * FSes that don't need any other processing. i_mutex is assumed to be held. 950 * Block based filesystems should use generic_write_end(). 951 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty 952 * is not called, so a filesystem that actually does store data in .write_inode 953 * should extend on what's done here with a call to mark_inode_dirty() in the 954 * case that i_size has changed. 955 * 956 * Use *ONLY* with simple_read_folio() 957 */ 958 static int simple_write_end(struct file *file, struct address_space *mapping, 959 loff_t pos, unsigned len, unsigned copied, 960 struct folio *folio, void *fsdata) 961 { 962 struct inode *inode = folio->mapping->host; 963 loff_t last_pos = pos + copied; 964 965 /* zero the stale part of the folio if we did a short copy */ 966 if (!folio_test_uptodate(folio)) { 967 if (copied < len) { 968 size_t from = offset_in_folio(folio, pos); 969 970 folio_zero_range(folio, from + copied, len - copied); 971 } 972 folio_mark_uptodate(folio); 973 } 974 /* 975 * No need to use i_size_read() here, the i_size 976 * cannot change under us because we hold the i_mutex. 977 */ 978 if (last_pos > inode->i_size) 979 i_size_write(inode, last_pos); 980 981 folio_mark_dirty(folio); 982 folio_unlock(folio); 983 folio_put(folio); 984 985 return copied; 986 } 987 988 /* 989 * Provides ramfs-style behavior: data in the pagecache, but no writeback. 
990 */ 991 const struct address_space_operations ram_aops = { 992 .read_folio = simple_read_folio, 993 .write_begin = simple_write_begin, 994 .write_end = simple_write_end, 995 .dirty_folio = noop_dirty_folio, 996 }; 997 EXPORT_SYMBOL(ram_aops); 998 999 /* 1000 * the inodes created here are not hashed. If you use iunique to generate 1001 * unique inode values later for this filesystem, then you must take care 1002 * to pass it an appropriate max_reserved value to avoid collisions. 1003 */ 1004 int simple_fill_super(struct super_block *s, unsigned long magic, 1005 const struct tree_descr *files) 1006 { 1007 struct inode *inode; 1008 struct dentry *dentry; 1009 int i; 1010 1011 s->s_blocksize = PAGE_SIZE; 1012 s->s_blocksize_bits = PAGE_SHIFT; 1013 s->s_magic = magic; 1014 s->s_op = &simple_super_operations; 1015 s->s_time_gran = 1; 1016 1017 inode = new_inode(s); 1018 if (!inode) 1019 return -ENOMEM; 1020 /* 1021 * because the root inode is 1, the files array must not contain an 1022 * entry at index 1 1023 */ 1024 inode->i_ino = 1; 1025 inode->i_mode = S_IFDIR | 0755; 1026 simple_inode_init_ts(inode); 1027 inode->i_op = &simple_dir_inode_operations; 1028 inode->i_fop = &simple_dir_operations; 1029 set_nlink(inode, 2); 1030 s->s_root = d_make_root(inode); 1031 if (!s->s_root) 1032 return -ENOMEM; 1033 for (i = 0; !files->name || files->name[0]; i++, files++) { 1034 if (!files->name) 1035 continue; 1036 1037 /* warn if it tries to conflict with the root inode */ 1038 if (unlikely(i == 1)) 1039 printk(KERN_WARNING "%s: %s passed in a files array" 1040 "with an index of 1!\n", __func__, 1041 s->s_type->name); 1042 1043 dentry = d_alloc_name(s->s_root, files->name); 1044 if (!dentry) 1045 return -ENOMEM; 1046 inode = new_inode(s); 1047 if (!inode) { 1048 dput(dentry); 1049 return -ENOMEM; 1050 } 1051 inode->i_mode = S_IFREG | files->mode; 1052 simple_inode_init_ts(inode); 1053 inode->i_fop = files->ops; 1054 inode->i_ino = i; 1055 d_add(dentry, inode); 1056 } 1057 
return 0; 1058 } 1059 EXPORT_SYMBOL(simple_fill_super); 1060 1061 static DEFINE_SPINLOCK(pin_fs_lock); 1062 1063 int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count) 1064 { 1065 struct vfsmount *mnt = NULL; 1066 spin_lock(&pin_fs_lock); 1067 if (unlikely(!*mount)) { 1068 spin_unlock(&pin_fs_lock); 1069 mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); 1070 if (IS_ERR(mnt)) 1071 return PTR_ERR(mnt); 1072 spin_lock(&pin_fs_lock); 1073 if (!*mount) 1074 *mount = mnt; 1075 } 1076 mntget(*mount); 1077 ++*count; 1078 spin_unlock(&pin_fs_lock); 1079 mntput(mnt); 1080 return 0; 1081 } 1082 EXPORT_SYMBOL(simple_pin_fs); 1083 1084 void simple_release_fs(struct vfsmount **mount, int *count) 1085 { 1086 struct vfsmount *mnt; 1087 spin_lock(&pin_fs_lock); 1088 mnt = *mount; 1089 if (!--*count) 1090 *mount = NULL; 1091 spin_unlock(&pin_fs_lock); 1092 mntput(mnt); 1093 } 1094 EXPORT_SYMBOL(simple_release_fs); 1095 1096 /** 1097 * simple_read_from_buffer - copy data from the buffer to user space 1098 * @to: the user space buffer to read to 1099 * @count: the maximum number of bytes to read 1100 * @ppos: the current position in the buffer 1101 * @from: the buffer to read from 1102 * @available: the size of the buffer 1103 * 1104 * The simple_read_from_buffer() function reads up to @count bytes from the 1105 * buffer @from at offset @ppos into the user space address starting at @to. 1106 * 1107 * On success, the number of bytes read is returned and the offset @ppos is 1108 * advanced by this number, or negative value is returned on error. 
1109 **/ 1110 ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos, 1111 const void *from, size_t available) 1112 { 1113 loff_t pos = *ppos; 1114 size_t ret; 1115 1116 if (pos < 0) 1117 return -EINVAL; 1118 if (pos >= available || !count) 1119 return 0; 1120 if (count > available - pos) 1121 count = available - pos; 1122 ret = copy_to_user(to, from + pos, count); 1123 if (ret == count) 1124 return -EFAULT; 1125 count -= ret; 1126 *ppos = pos + count; 1127 return count; 1128 } 1129 EXPORT_SYMBOL(simple_read_from_buffer); 1130 1131 /** 1132 * simple_write_to_buffer - copy data from user space to the buffer 1133 * @to: the buffer to write to 1134 * @available: the size of the buffer 1135 * @ppos: the current position in the buffer 1136 * @from: the user space buffer to read from 1137 * @count: the maximum number of bytes to read 1138 * 1139 * The simple_write_to_buffer() function reads up to @count bytes from the user 1140 * space address starting at @from into the buffer @to at offset @ppos. 1141 * 1142 * On success, the number of bytes written is returned and the offset @ppos is 1143 * advanced by this number, or negative value is returned on error. 
1144 **/ 1145 ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, 1146 const void __user *from, size_t count) 1147 { 1148 loff_t pos = *ppos; 1149 size_t res; 1150 1151 if (pos < 0) 1152 return -EINVAL; 1153 if (pos >= available || !count) 1154 return 0; 1155 if (count > available - pos) 1156 count = available - pos; 1157 res = copy_from_user(to + pos, from, count); 1158 if (res == count) 1159 return -EFAULT; 1160 count -= res; 1161 *ppos = pos + count; 1162 return count; 1163 } 1164 EXPORT_SYMBOL(simple_write_to_buffer); 1165 1166 /** 1167 * memory_read_from_buffer - copy data from the buffer 1168 * @to: the kernel space buffer to read to 1169 * @count: the maximum number of bytes to read 1170 * @ppos: the current position in the buffer 1171 * @from: the buffer to read from 1172 * @available: the size of the buffer 1173 * 1174 * The memory_read_from_buffer() function reads up to @count bytes from the 1175 * buffer @from at offset @ppos into the kernel space address starting at @to. 1176 * 1177 * On success, the number of bytes read is returned and the offset @ppos is 1178 * advanced by this number, or negative value is returned on error. 1179 **/ 1180 ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos, 1181 const void *from, size_t available) 1182 { 1183 loff_t pos = *ppos; 1184 1185 if (pos < 0) 1186 return -EINVAL; 1187 if (pos >= available) 1188 return 0; 1189 if (count > available - pos) 1190 count = available - pos; 1191 memcpy(to, from + pos, count); 1192 *ppos = pos + count; 1193 1194 return count; 1195 } 1196 EXPORT_SYMBOL(memory_read_from_buffer); 1197 1198 /* 1199 * Transaction based IO. 1200 * The file expects a single write which triggers the transaction, and then 1201 * possibly a read which collects the result - which is stored in a 1202 * file-local buffer. 
1203 */ 1204 1205 void simple_transaction_set(struct file *file, size_t n) 1206 { 1207 struct simple_transaction_argresp *ar = file->private_data; 1208 1209 BUG_ON(n > SIMPLE_TRANSACTION_LIMIT); 1210 1211 /* 1212 * The barrier ensures that ar->size will really remain zero until 1213 * ar->data is ready for reading. 1214 */ 1215 smp_mb(); 1216 ar->size = n; 1217 } 1218 EXPORT_SYMBOL(simple_transaction_set); 1219 1220 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size) 1221 { 1222 struct simple_transaction_argresp *ar; 1223 static DEFINE_SPINLOCK(simple_transaction_lock); 1224 1225 if (size > SIMPLE_TRANSACTION_LIMIT - 1) 1226 return ERR_PTR(-EFBIG); 1227 1228 ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL); 1229 if (!ar) 1230 return ERR_PTR(-ENOMEM); 1231 1232 spin_lock(&simple_transaction_lock); 1233 1234 /* only one write allowed per open */ 1235 if (file->private_data) { 1236 spin_unlock(&simple_transaction_lock); 1237 free_page((unsigned long)ar); 1238 return ERR_PTR(-EBUSY); 1239 } 1240 1241 file->private_data = ar; 1242 1243 spin_unlock(&simple_transaction_lock); 1244 1245 if (copy_from_user(ar->data, buf, size)) 1246 return ERR_PTR(-EFAULT); 1247 1248 return ar->data; 1249 } 1250 EXPORT_SYMBOL(simple_transaction_get); 1251 1252 ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos) 1253 { 1254 struct simple_transaction_argresp *ar = file->private_data; 1255 1256 if (!ar) 1257 return 0; 1258 return simple_read_from_buffer(buf, size, pos, ar->data, ar->size); 1259 } 1260 EXPORT_SYMBOL(simple_transaction_read); 1261 1262 int simple_transaction_release(struct inode *inode, struct file *file) 1263 { 1264 free_page((unsigned long)file->private_data); 1265 return 0; 1266 } 1267 EXPORT_SYMBOL(simple_transaction_release); 1268 1269 /* Simple attribute files */ 1270 1271 struct simple_attr { 1272 int (*get)(void *, u64 *); 1273 int (*set)(void *, u64); 1274 char get_buf[24]; 
/* enough to store a u64 and "\n\0" */ 1275 char set_buf[24]; 1276 void *data; 1277 const char *fmt; /* format for read operation */ 1278 struct mutex mutex; /* protects access to these buffers */ 1279 }; 1280 1281 /* simple_attr_open is called by an actual attribute open file operation 1282 * to set the attribute specific access operations. */ 1283 int simple_attr_open(struct inode *inode, struct file *file, 1284 int (*get)(void *, u64 *), int (*set)(void *, u64), 1285 const char *fmt) 1286 { 1287 struct simple_attr *attr; 1288 1289 attr = kzalloc(sizeof(*attr), GFP_KERNEL); 1290 if (!attr) 1291 return -ENOMEM; 1292 1293 attr->get = get; 1294 attr->set = set; 1295 attr->data = inode->i_private; 1296 attr->fmt = fmt; 1297 mutex_init(&attr->mutex); 1298 1299 file->private_data = attr; 1300 1301 return nonseekable_open(inode, file); 1302 } 1303 EXPORT_SYMBOL_GPL(simple_attr_open); 1304 1305 int simple_attr_release(struct inode *inode, struct file *file) 1306 { 1307 kfree(file->private_data); 1308 return 0; 1309 } 1310 EXPORT_SYMBOL_GPL(simple_attr_release); /* GPL-only? This? Really? 
*/ 1311 1312 /* read from the buffer that is filled with the get function */ 1313 ssize_t simple_attr_read(struct file *file, char __user *buf, 1314 size_t len, loff_t *ppos) 1315 { 1316 struct simple_attr *attr; 1317 size_t size; 1318 ssize_t ret; 1319 1320 attr = file->private_data; 1321 1322 if (!attr->get) 1323 return -EACCES; 1324 1325 ret = mutex_lock_interruptible(&attr->mutex); 1326 if (ret) 1327 return ret; 1328 1329 if (*ppos && attr->get_buf[0]) { 1330 /* continued read */ 1331 size = strlen(attr->get_buf); 1332 } else { 1333 /* first read */ 1334 u64 val; 1335 ret = attr->get(attr->data, &val); 1336 if (ret) 1337 goto out; 1338 1339 size = scnprintf(attr->get_buf, sizeof(attr->get_buf), 1340 attr->fmt, (unsigned long long)val); 1341 } 1342 1343 ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size); 1344 out: 1345 mutex_unlock(&attr->mutex); 1346 return ret; 1347 } 1348 EXPORT_SYMBOL_GPL(simple_attr_read); 1349 1350 /* interpret the buffer as a number to call the set function with */ 1351 static ssize_t simple_attr_write_xsigned(struct file *file, const char __user *buf, 1352 size_t len, loff_t *ppos, bool is_signed) 1353 { 1354 struct simple_attr *attr; 1355 unsigned long long val; 1356 size_t size; 1357 ssize_t ret; 1358 1359 attr = file->private_data; 1360 if (!attr->set) 1361 return -EACCES; 1362 1363 ret = mutex_lock_interruptible(&attr->mutex); 1364 if (ret) 1365 return ret; 1366 1367 ret = -EFAULT; 1368 size = min(sizeof(attr->set_buf) - 1, len); 1369 if (copy_from_user(attr->set_buf, buf, size)) 1370 goto out; 1371 1372 attr->set_buf[size] = '\0'; 1373 if (is_signed) 1374 ret = kstrtoll(attr->set_buf, 0, &val); 1375 else 1376 ret = kstrtoull(attr->set_buf, 0, &val); 1377 if (ret) 1378 goto out; 1379 ret = attr->set(attr->data, val); 1380 if (ret == 0) 1381 ret = len; /* on success, claim we got the whole input */ 1382 out: 1383 mutex_unlock(&attr->mutex); 1384 return ret; 1385 } 1386 1387 ssize_t simple_attr_write(struct file *file, 
const char __user *buf, 1388 size_t len, loff_t *ppos) 1389 { 1390 return simple_attr_write_xsigned(file, buf, len, ppos, false); 1391 } 1392 EXPORT_SYMBOL_GPL(simple_attr_write); 1393 1394 ssize_t simple_attr_write_signed(struct file *file, const char __user *buf, 1395 size_t len, loff_t *ppos) 1396 { 1397 return simple_attr_write_xsigned(file, buf, len, ppos, true); 1398 } 1399 EXPORT_SYMBOL_GPL(simple_attr_write_signed); 1400 1401 /** 1402 * generic_encode_ino32_fh - generic export_operations->encode_fh function 1403 * @inode: the object to encode 1404 * @fh: where to store the file handle fragment 1405 * @max_len: maximum length to store there (in 4 byte units) 1406 * @parent: parent directory inode, if wanted 1407 * 1408 * This generic encode_fh function assumes that the 32 inode number 1409 * is suitable for locating an inode, and that the generation number 1410 * can be used to check that it is still valid. It places them in the 1411 * filehandle fragment where export_decode_fh expects to find them. 
1412 */ 1413 int generic_encode_ino32_fh(struct inode *inode, __u32 *fh, int *max_len, 1414 struct inode *parent) 1415 { 1416 struct fid *fid = (void *)fh; 1417 int len = *max_len; 1418 int type = FILEID_INO32_GEN; 1419 1420 if (parent && (len < 4)) { 1421 *max_len = 4; 1422 return FILEID_INVALID; 1423 } else if (len < 2) { 1424 *max_len = 2; 1425 return FILEID_INVALID; 1426 } 1427 1428 len = 2; 1429 fid->i32.ino = inode->i_ino; 1430 fid->i32.gen = inode->i_generation; 1431 if (parent) { 1432 fid->i32.parent_ino = parent->i_ino; 1433 fid->i32.parent_gen = parent->i_generation; 1434 len = 4; 1435 type = FILEID_INO32_GEN_PARENT; 1436 } 1437 *max_len = len; 1438 return type; 1439 } 1440 EXPORT_SYMBOL_GPL(generic_encode_ino32_fh); 1441 1442 /** 1443 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation 1444 * @sb: filesystem to do the file handle conversion on 1445 * @fid: file handle to convert 1446 * @fh_len: length of the file handle in bytes 1447 * @fh_type: type of file handle 1448 * @get_inode: filesystem callback to retrieve inode 1449 * 1450 * This function decodes @fid as long as it has one of the well-known 1451 * Linux filehandle types and calls @get_inode on it to retrieve the 1452 * inode for the object specified in the file handle. 
1453 */ 1454 struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid, 1455 int fh_len, int fh_type, struct inode *(*get_inode) 1456 (struct super_block *sb, u64 ino, u32 gen)) 1457 { 1458 struct inode *inode = NULL; 1459 1460 if (fh_len < 2) 1461 return NULL; 1462 1463 switch (fh_type) { 1464 case FILEID_INO32_GEN: 1465 case FILEID_INO32_GEN_PARENT: 1466 inode = get_inode(sb, fid->i32.ino, fid->i32.gen); 1467 break; 1468 } 1469 1470 return d_obtain_alias(inode); 1471 } 1472 EXPORT_SYMBOL_GPL(generic_fh_to_dentry); 1473 1474 /** 1475 * generic_fh_to_parent - generic helper for the fh_to_parent export operation 1476 * @sb: filesystem to do the file handle conversion on 1477 * @fid: file handle to convert 1478 * @fh_len: length of the file handle in bytes 1479 * @fh_type: type of file handle 1480 * @get_inode: filesystem callback to retrieve inode 1481 * 1482 * This function decodes @fid as long as it has one of the well-known 1483 * Linux filehandle types and calls @get_inode on it to retrieve the 1484 * inode for the _parent_ object specified in the file handle if it 1485 * is specified in the file handle, or NULL otherwise. 1486 */ 1487 struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, 1488 int fh_len, int fh_type, struct inode *(*get_inode) 1489 (struct super_block *sb, u64 ino, u32 gen)) 1490 { 1491 struct inode *inode = NULL; 1492 1493 if (fh_len <= 2) 1494 return NULL; 1495 1496 switch (fh_type) { 1497 case FILEID_INO32_GEN_PARENT: 1498 inode = get_inode(sb, fid->i32.parent_ino, 1499 (fh_len > 3 ? 
fid->i32.parent_gen : 0)); 1500 break; 1501 } 1502 1503 return d_obtain_alias(inode); 1504 } 1505 EXPORT_SYMBOL_GPL(generic_fh_to_parent); 1506 1507 /** 1508 * __generic_file_fsync - generic fsync implementation for simple filesystems 1509 * 1510 * @file: file to synchronize 1511 * @start: start offset in bytes 1512 * @end: end offset in bytes (inclusive) 1513 * @datasync: only synchronize essential metadata if true 1514 * 1515 * This is a generic implementation of the fsync method for simple 1516 * filesystems which track all non-inode metadata in the buffers list 1517 * hanging off the address_space structure. 1518 */ 1519 int __generic_file_fsync(struct file *file, loff_t start, loff_t end, 1520 int datasync) 1521 { 1522 struct inode *inode = file->f_mapping->host; 1523 int err; 1524 int ret; 1525 1526 err = file_write_and_wait_range(file, start, end); 1527 if (err) 1528 return err; 1529 1530 inode_lock(inode); 1531 ret = sync_mapping_buffers(inode->i_mapping); 1532 if (!(inode->i_state & I_DIRTY_ALL)) 1533 goto out; 1534 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 1535 goto out; 1536 1537 err = sync_inode_metadata(inode, 1); 1538 if (ret == 0) 1539 ret = err; 1540 1541 out: 1542 inode_unlock(inode); 1543 /* check and advance again to catch errors after syncing out buffers */ 1544 err = file_check_and_advance_wb_err(file); 1545 if (ret == 0) 1546 ret = err; 1547 return ret; 1548 } 1549 EXPORT_SYMBOL(__generic_file_fsync); 1550 1551 /** 1552 * generic_file_fsync - generic fsync implementation for simple filesystems 1553 * with flush 1554 * @file: file to synchronize 1555 * @start: start offset in bytes 1556 * @end: end offset in bytes (inclusive) 1557 * @datasync: only synchronize essential metadata if true 1558 * 1559 */ 1560 1561 int generic_file_fsync(struct file *file, loff_t start, loff_t end, 1562 int datasync) 1563 { 1564 struct inode *inode = file->f_mapping->host; 1565 int err; 1566 1567 err = __generic_file_fsync(file, start, end, datasync); 
1568 if (err) 1569 return err; 1570 return blkdev_issue_flush(inode->i_sb->s_bdev); 1571 } 1572 EXPORT_SYMBOL(generic_file_fsync); 1573 1574 /** 1575 * generic_check_addressable - Check addressability of file system 1576 * @blocksize_bits: log of file system block size 1577 * @num_blocks: number of blocks in file system 1578 * 1579 * Determine whether a file system with @num_blocks blocks (and a 1580 * block size of 2**@blocksize_bits) is addressable by the sector_t 1581 * and page cache of the system. Return 0 if so and -EFBIG otherwise. 1582 */ 1583 int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks) 1584 { 1585 u64 last_fs_block = num_blocks - 1; 1586 u64 last_fs_page = 1587 last_fs_block >> (PAGE_SHIFT - blocksize_bits); 1588 1589 if (unlikely(num_blocks == 0)) 1590 return 0; 1591 1592 if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT)) 1593 return -EINVAL; 1594 1595 if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) || 1596 (last_fs_page > (pgoff_t)(~0ULL))) { 1597 return -EFBIG; 1598 } 1599 return 0; 1600 } 1601 EXPORT_SYMBOL(generic_check_addressable); 1602 1603 /* 1604 * No-op implementation of ->fsync for in-memory filesystems. 1605 */ 1606 int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) 1607 { 1608 return 0; 1609 } 1610 EXPORT_SYMBOL(noop_fsync); 1611 1612 ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter) 1613 { 1614 /* 1615 * iomap based filesystems support direct I/O without need for 1616 * this callback. However, it still needs to be set in 1617 * inode->a_ops so that open/fcntl know that direct I/O is 1618 * generally supported. 
1619 */ 1620 return -EINVAL; 1621 } 1622 EXPORT_SYMBOL_GPL(noop_direct_IO); 1623 1624 /* Because kfree isn't assignment-compatible with void(void*) ;-/ */ 1625 void kfree_link(void *p) 1626 { 1627 kfree(p); 1628 } 1629 EXPORT_SYMBOL(kfree_link); 1630 1631 struct inode *alloc_anon_inode(struct super_block *s) 1632 { 1633 static const struct address_space_operations anon_aops = { 1634 .dirty_folio = noop_dirty_folio, 1635 }; 1636 struct inode *inode = new_inode_pseudo(s); 1637 1638 if (!inode) 1639 return ERR_PTR(-ENOMEM); 1640 1641 inode->i_ino = get_next_ino(); 1642 inode->i_mapping->a_ops = &anon_aops; 1643 1644 /* 1645 * Mark the inode dirty from the very beginning, 1646 * that way it will never be moved to the dirty 1647 * list because mark_inode_dirty() will think 1648 * that it already _is_ on the dirty list. 1649 */ 1650 inode->i_state = I_DIRTY; 1651 /* 1652 * Historically anonymous inodes didn't have a type at all and 1653 * userspace has come to rely on this. Internally they're just 1654 * regular files but S_IFREG is masked off when reporting 1655 * information to userspace. 1656 */ 1657 inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; 1658 inode->i_uid = current_fsuid(); 1659 inode->i_gid = current_fsgid(); 1660 inode->i_flags |= S_PRIVATE | S_ANON_INODE; 1661 simple_inode_init_ts(inode); 1662 return inode; 1663 } 1664 EXPORT_SYMBOL(alloc_anon_inode); 1665 1666 /** 1667 * simple_nosetlease - generic helper for prohibiting leases 1668 * @filp: file pointer 1669 * @arg: type of lease to obtain 1670 * @flp: new lease supplied for insertion 1671 * @priv: private data for lm_setup operation 1672 * 1673 * Generic helper for filesystems that do not wish to allow leases to be set. 1674 * All arguments are ignored and it just returns -EINVAL. 
1675 */ 1676 int 1677 simple_nosetlease(struct file *filp, int arg, struct file_lease **flp, 1678 void **priv) 1679 { 1680 return -EINVAL; 1681 } 1682 EXPORT_SYMBOL(simple_nosetlease); 1683 1684 /** 1685 * simple_get_link - generic helper to get the target of "fast" symlinks 1686 * @dentry: not used here 1687 * @inode: the symlink inode 1688 * @done: not used here 1689 * 1690 * Generic helper for filesystems to use for symlink inodes where a pointer to 1691 * the symlink target is stored in ->i_link. NOTE: this isn't normally called, 1692 * since as an optimization the path lookup code uses any non-NULL ->i_link 1693 * directly, without calling ->get_link(). But ->get_link() still must be set, 1694 * to mark the inode_operations as being for a symlink. 1695 * 1696 * Return: the symlink target 1697 */ 1698 const char *simple_get_link(struct dentry *dentry, struct inode *inode, 1699 struct delayed_call *done) 1700 { 1701 return inode->i_link; 1702 } 1703 EXPORT_SYMBOL(simple_get_link); 1704 1705 const struct inode_operations simple_symlink_inode_operations = { 1706 .get_link = simple_get_link, 1707 }; 1708 EXPORT_SYMBOL(simple_symlink_inode_operations); 1709 1710 /* 1711 * Operations for a permanently empty directory. 
1712 */ 1713 static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 1714 { 1715 return ERR_PTR(-ENOENT); 1716 } 1717 1718 static int empty_dir_setattr(struct mnt_idmap *idmap, 1719 struct dentry *dentry, struct iattr *attr) 1720 { 1721 return -EPERM; 1722 } 1723 1724 static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size) 1725 { 1726 return -EOPNOTSUPP; 1727 } 1728 1729 static const struct inode_operations empty_dir_inode_operations = { 1730 .lookup = empty_dir_lookup, 1731 .setattr = empty_dir_setattr, 1732 .listxattr = empty_dir_listxattr, 1733 }; 1734 1735 static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence) 1736 { 1737 /* An empty directory has two entries . and .. at offsets 0 and 1 */ 1738 return generic_file_llseek_size(file, offset, whence, 2, 2); 1739 } 1740 1741 static int empty_dir_readdir(struct file *file, struct dir_context *ctx) 1742 { 1743 dir_emit_dots(file, ctx); 1744 return 0; 1745 } 1746 1747 static const struct file_operations empty_dir_operations = { 1748 .llseek = empty_dir_llseek, 1749 .read = generic_read_dir, 1750 .iterate_shared = empty_dir_readdir, 1751 .fsync = noop_fsync, 1752 }; 1753 1754 1755 void make_empty_dir_inode(struct inode *inode) 1756 { 1757 set_nlink(inode, 2); 1758 inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; 1759 inode->i_uid = GLOBAL_ROOT_UID; 1760 inode->i_gid = GLOBAL_ROOT_GID; 1761 inode->i_rdev = 0; 1762 inode->i_size = 0; 1763 inode->i_blkbits = PAGE_SHIFT; 1764 inode->i_blocks = 0; 1765 1766 inode->i_op = &empty_dir_inode_operations; 1767 inode->i_opflags &= ~IOP_XATTR; 1768 inode->i_fop = &empty_dir_operations; 1769 } 1770 1771 bool is_empty_dir_inode(struct inode *inode) 1772 { 1773 return (inode->i_fop == &empty_dir_operations) && 1774 (inode->i_op == &empty_dir_inode_operations); 1775 } 1776 1777 #if IS_ENABLED(CONFIG_UNICODE) 1778 /** 1779 * generic_ci_d_compare - generic d_compare implementation for casefolding 
filesystems 1780 * @dentry: dentry whose name we are checking against 1781 * @len: len of name of dentry 1782 * @str: str pointer to name of dentry 1783 * @name: Name to compare against 1784 * 1785 * Return: 0 if names match, 1 if mismatch, or -ERRNO 1786 */ 1787 int generic_ci_d_compare(const struct dentry *dentry, unsigned int len, 1788 const char *str, const struct qstr *name) 1789 { 1790 const struct dentry *parent; 1791 const struct inode *dir; 1792 union shortname_store strbuf; 1793 struct qstr qstr; 1794 1795 /* 1796 * Attempt a case-sensitive match first. It is cheaper and 1797 * should cover most lookups, including all the sane 1798 * applications that expect a case-sensitive filesystem. 1799 * 1800 * This comparison is safe under RCU because the caller 1801 * guarantees the consistency between str and len. See 1802 * __d_lookup_rcu_op_compare() for details. 1803 */ 1804 if (len == name->len && !memcmp(str, name->name, len)) 1805 return 0; 1806 1807 parent = READ_ONCE(dentry->d_parent); 1808 dir = READ_ONCE(parent->d_inode); 1809 if (!dir || !IS_CASEFOLDED(dir)) 1810 return 1; 1811 1812 qstr.len = len; 1813 qstr.name = str; 1814 /* 1815 * If the dentry name is stored in-line, then it may be concurrently 1816 * modified by a rename. If this happens, the VFS will eventually retry 1817 * the lookup, so it doesn't matter what ->d_compare() returns. 1818 * However, it's unsafe to call utf8_strncasecmp() with an unstable 1819 * string. Therefore, we have to copy the name into a temporary buffer. 1820 * As above, len is guaranteed to match str, so the shortname case 1821 * is exactly when str points to ->d_shortname. 
1822 */ 1823 if (qstr.name == dentry->d_shortname.string) { 1824 strbuf = dentry->d_shortname; // NUL is guaranteed to be in there 1825 qstr.name = strbuf.string; 1826 /* prevent compiler from optimizing out the temporary buffer */ 1827 barrier(); 1828 } 1829 1830 return utf8_strncasecmp(dentry->d_sb->s_encoding, name, &qstr); 1831 } 1832 EXPORT_SYMBOL(generic_ci_d_compare); 1833 1834 /** 1835 * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems 1836 * @dentry: dentry of the parent directory 1837 * @str: qstr of name whose hash we should fill in 1838 * 1839 * Return: 0 if hash was successful or unchanged, and -EINVAL on error 1840 */ 1841 int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str) 1842 { 1843 const struct inode *dir = READ_ONCE(dentry->d_inode); 1844 struct super_block *sb = dentry->d_sb; 1845 const struct unicode_map *um = sb->s_encoding; 1846 int ret; 1847 1848 if (!dir || !IS_CASEFOLDED(dir)) 1849 return 0; 1850 1851 ret = utf8_casefold_hash(um, dentry, str); 1852 if (ret < 0 && sb_has_strict_encoding(sb)) 1853 return -EINVAL; 1854 return 0; 1855 } 1856 EXPORT_SYMBOL(generic_ci_d_hash); 1857 1858 static const struct dentry_operations generic_ci_dentry_ops = { 1859 .d_hash = generic_ci_d_hash, 1860 .d_compare = generic_ci_d_compare, 1861 #ifdef CONFIG_FS_ENCRYPTION 1862 .d_revalidate = fscrypt_d_revalidate, 1863 #endif 1864 }; 1865 1866 /** 1867 * generic_ci_match() - Match a name (case-insensitively) with a dirent. 1868 * This is a filesystem helper for comparison with directory entries. 1869 * generic_ci_d_compare should be used in VFS' ->d_compare instead. 1870 * 1871 * @parent: Inode of the parent of the dirent under comparison 1872 * @name: name under lookup. 1873 * @folded_name: Optional pre-folded name under lookup 1874 * @de_name: Dirent name. 1875 * @de_name_len: dirent name length. 1876 * 1877 * Test whether a case-insensitive directory entry matches the filename 1878 * being searched. 
If @folded_name is provided, it is used instead of 1879 * recalculating the casefold of @name. 1880 * 1881 * Return: > 0 if the directory entry matches, 0 if it doesn't match, or 1882 * < 0 on error. 1883 */ 1884 int generic_ci_match(const struct inode *parent, 1885 const struct qstr *name, 1886 const struct qstr *folded_name, 1887 const u8 *de_name, u32 de_name_len) 1888 { 1889 const struct super_block *sb = parent->i_sb; 1890 const struct unicode_map *um = sb->s_encoding; 1891 struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); 1892 struct qstr dirent = QSTR_INIT(de_name, de_name_len); 1893 int res = 0; 1894 1895 if (IS_ENCRYPTED(parent)) { 1896 const struct fscrypt_str encrypted_name = 1897 FSTR_INIT((u8 *) de_name, de_name_len); 1898 1899 if (WARN_ON_ONCE(!fscrypt_has_encryption_key(parent))) 1900 return -EINVAL; 1901 1902 decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); 1903 if (!decrypted_name.name) 1904 return -ENOMEM; 1905 res = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, 1906 &decrypted_name); 1907 if (res < 0) { 1908 kfree(decrypted_name.name); 1909 return res; 1910 } 1911 dirent.name = decrypted_name.name; 1912 dirent.len = decrypted_name.len; 1913 } 1914 1915 /* 1916 * Attempt a case-sensitive match first. It is cheaper and 1917 * should cover most lookups, including all the sane 1918 * applications that expect a case-sensitive filesystem. 
1919 */ 1920 1921 if (dirent.len == name->len && 1922 !memcmp(name->name, dirent.name, dirent.len)) 1923 goto out; 1924 1925 if (folded_name->name) 1926 res = utf8_strncasecmp_folded(um, folded_name, &dirent); 1927 else 1928 res = utf8_strncasecmp(um, name, &dirent); 1929 1930 out: 1931 kfree(decrypted_name.name); 1932 if (res < 0 && sb_has_strict_encoding(sb)) { 1933 pr_err_ratelimited("Directory contains filename that is invalid UTF-8"); 1934 return 0; 1935 } 1936 return !res; 1937 } 1938 EXPORT_SYMBOL(generic_ci_match); 1939 #endif 1940 1941 #ifdef CONFIG_FS_ENCRYPTION 1942 static const struct dentry_operations generic_encrypted_dentry_ops = { 1943 .d_revalidate = fscrypt_d_revalidate, 1944 }; 1945 #endif 1946 1947 /** 1948 * generic_set_sb_d_ops - helper for choosing the set of 1949 * filesystem-wide dentry operations for the enabled features 1950 * @sb: superblock to be configured 1951 * 1952 * Filesystems supporting casefolding and/or fscrypt can call this 1953 * helper at mount-time to configure sb->s_d_op to best set of dentry 1954 * operations required for the enabled features. The helper must be 1955 * called after these have been configured, but before the root dentry 1956 * is created. 1957 */ 1958 void generic_set_sb_d_ops(struct super_block *sb) 1959 { 1960 #if IS_ENABLED(CONFIG_UNICODE) 1961 if (sb->s_encoding) { 1962 sb->s_d_op = &generic_ci_dentry_ops; 1963 return; 1964 } 1965 #endif 1966 #ifdef CONFIG_FS_ENCRYPTION 1967 if (sb->s_cop) { 1968 sb->s_d_op = &generic_encrypted_dentry_ops; 1969 return; 1970 } 1971 #endif 1972 } 1973 EXPORT_SYMBOL(generic_set_sb_d_ops); 1974 1975 /** 1976 * inode_maybe_inc_iversion - increments i_version 1977 * @inode: inode with the i_version that should be updated 1978 * @force: increment the counter even if it's not necessary? 1979 * 1980 * Every time the inode is modified, the i_version field must be seen to have 1981 * changed by any observer. 
1982 * 1983 * If "force" is set or the QUERIED flag is set, then ensure that we increment 1984 * the value, and clear the queried flag. 1985 * 1986 * In the common case where neither is set, then we can return "false" without 1987 * updating i_version. 1988 * 1989 * If this function returns false, and no other metadata has changed, then we 1990 * can avoid logging the metadata. 1991 */ 1992 bool inode_maybe_inc_iversion(struct inode *inode, bool force) 1993 { 1994 u64 cur, new; 1995 1996 /* 1997 * The i_version field is not strictly ordered with any other inode 1998 * information, but the legacy inode_inc_iversion code used a spinlock 1999 * to serialize increments. 2000 * 2001 * We add a full memory barrier to ensure that any de facto ordering 2002 * with other state is preserved (either implicitly coming from cmpxchg 2003 * or explicitly from smp_mb if we don't know upfront if we will execute 2004 * the former). 2005 * 2006 * These barriers pair with inode_query_iversion(). 2007 */ 2008 cur = inode_peek_iversion_raw(inode); 2009 if (!force && !(cur & I_VERSION_QUERIED)) { 2010 smp_mb(); 2011 cur = inode_peek_iversion_raw(inode); 2012 } 2013 2014 do { 2015 /* If flag is clear then we needn't do anything */ 2016 if (!force && !(cur & I_VERSION_QUERIED)) 2017 return false; 2018 2019 /* Since lowest bit is flag, add 2 to avoid it */ 2020 new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT; 2021 } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); 2022 return true; 2023 } 2024 EXPORT_SYMBOL(inode_maybe_inc_iversion); 2025 2026 /** 2027 * inode_query_iversion - read i_version for later use 2028 * @inode: inode from which i_version should be read 2029 * 2030 * Read the inode i_version counter. This should be used by callers that wish 2031 * to store the returned i_version for later comparison. This will guarantee 2032 * that a later query of the i_version will result in a different value if 2033 * anything has changed. 
2034 * 2035 * In this implementation, we fetch the current value, set the QUERIED flag and 2036 * then try to swap it into place with a cmpxchg, if it wasn't already set. If 2037 * that fails, we try again with the newly fetched value from the cmpxchg. 2038 */ 2039 u64 inode_query_iversion(struct inode *inode) 2040 { 2041 u64 cur, new; 2042 bool fenced = false; 2043 2044 /* 2045 * Memory barriers (implicit in cmpxchg, explicit in smp_mb) pair with 2046 * inode_maybe_inc_iversion(), see that routine for more details. 2047 */ 2048 cur = inode_peek_iversion_raw(inode); 2049 do { 2050 /* If flag is already set, then no need to swap */ 2051 if (cur & I_VERSION_QUERIED) { 2052 if (!fenced) 2053 smp_mb(); 2054 break; 2055 } 2056 2057 fenced = true; 2058 new = cur | I_VERSION_QUERIED; 2059 } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new)); 2060 return cur >> I_VERSION_QUERIED_SHIFT; 2061 } 2062 EXPORT_SYMBOL(inode_query_iversion); 2063 2064 ssize_t direct_write_fallback(struct kiocb *iocb, struct iov_iter *iter, 2065 ssize_t direct_written, ssize_t buffered_written) 2066 { 2067 struct address_space *mapping = iocb->ki_filp->f_mapping; 2068 loff_t pos = iocb->ki_pos - buffered_written; 2069 loff_t end = iocb->ki_pos - 1; 2070 int err; 2071 2072 /* 2073 * If the buffered write fallback returned an error, we want to return 2074 * the number of bytes which were written by direct I/O, or the error 2075 * code if that was zero. 2076 * 2077 * Note that this differs from normal direct-io semantics, which will 2078 * return -EFOO even if some bytes were written. 2079 */ 2080 if (unlikely(buffered_written < 0)) { 2081 if (direct_written) 2082 return direct_written; 2083 return buffered_written; 2084 } 2085 2086 /* 2087 * We need to ensure that the page cache pages are written to disk and 2088 * invalidated to preserve the expected O_DIRECT semantics. 
2089 */ 2090 err = filemap_write_and_wait_range(mapping, pos, end); 2091 if (err < 0) { 2092 /* 2093 * We don't know how much we wrote, so just return the number of 2094 * bytes which were direct-written 2095 */ 2096 iocb->ki_pos -= buffered_written; 2097 if (direct_written) 2098 return direct_written; 2099 return err; 2100 } 2101 invalidate_mapping_pages(mapping, pos >> PAGE_SHIFT, end >> PAGE_SHIFT); 2102 return direct_written + buffered_written; 2103 } 2104 EXPORT_SYMBOL_GPL(direct_write_fallback); 2105 2106 /** 2107 * simple_inode_init_ts - initialize the timestamps for a new inode 2108 * @inode: inode to be initialized 2109 * 2110 * When a new inode is created, most filesystems set the timestamps to the 2111 * current time. Add a helper to do this. 2112 */ 2113 struct timespec64 simple_inode_init_ts(struct inode *inode) 2114 { 2115 struct timespec64 ts = inode_set_ctime_current(inode); 2116 2117 inode_set_atime_to_ts(inode, ts); 2118 inode_set_mtime_to_ts(inode, ts); 2119 return ts; 2120 } 2121 EXPORT_SYMBOL(simple_inode_init_ts); 2122 2123 struct dentry *stashed_dentry_get(struct dentry **stashed) 2124 { 2125 struct dentry *dentry; 2126 2127 guard(rcu)(); 2128 dentry = rcu_dereference(*stashed); 2129 if (!dentry) 2130 return NULL; 2131 if (!lockref_get_not_dead(&dentry->d_lockref)) 2132 return NULL; 2133 return dentry; 2134 } 2135 2136 static struct dentry *prepare_anon_dentry(struct dentry **stashed, 2137 struct super_block *sb, 2138 void *data) 2139 { 2140 struct dentry *dentry; 2141 struct inode *inode; 2142 const struct stashed_operations *sops = sb->s_fs_info; 2143 int ret; 2144 2145 inode = new_inode_pseudo(sb); 2146 if (!inode) { 2147 sops->put_data(data); 2148 return ERR_PTR(-ENOMEM); 2149 } 2150 2151 inode->i_flags |= S_IMMUTABLE; 2152 inode->i_mode = S_IFREG; 2153 simple_inode_init_ts(inode); 2154 2155 ret = sops->init_inode(inode, data); 2156 if (ret < 0) { 2157 iput(inode); 2158 return ERR_PTR(ret); 2159 } 2160 2161 /* Notice when this is changed. 
*/ 2162 WARN_ON_ONCE(!S_ISREG(inode->i_mode)); 2163 WARN_ON_ONCE(!IS_IMMUTABLE(inode)); 2164 2165 dentry = d_alloc_anon(sb); 2166 if (!dentry) { 2167 iput(inode); 2168 return ERR_PTR(-ENOMEM); 2169 } 2170 2171 /* Store address of location where dentry's supposed to be stashed. */ 2172 dentry->d_fsdata = stashed; 2173 2174 /* @data is now owned by the fs */ 2175 d_instantiate(dentry, inode); 2176 return dentry; 2177 } 2178 2179 static struct dentry *stash_dentry(struct dentry **stashed, 2180 struct dentry *dentry) 2181 { 2182 guard(rcu)(); 2183 for (;;) { 2184 struct dentry *old; 2185 2186 /* Assume any old dentry was cleared out. */ 2187 old = cmpxchg(stashed, NULL, dentry); 2188 if (likely(!old)) 2189 return dentry; 2190 2191 /* Check if somebody else installed a reusable dentry. */ 2192 if (lockref_get_not_dead(&old->d_lockref)) 2193 return old; 2194 2195 /* There's an old dead dentry there, try to take it over. */ 2196 if (likely(try_cmpxchg(stashed, &old, dentry))) 2197 return dentry; 2198 } 2199 } 2200 2201 /** 2202 * path_from_stashed - create path from stashed or new dentry 2203 * @stashed: where to retrieve or stash dentry 2204 * @mnt: mnt of the filesystems to use 2205 * @data: data to store in inode->i_private 2206 * @path: path to create 2207 * 2208 * The function tries to retrieve a stashed dentry from @stashed. If the dentry 2209 * is still valid then it will be reused. If the dentry isn't able the function 2210 * will allocate a new dentry and inode. It will then check again whether it 2211 * can reuse an existing dentry in case one has been added in the meantime or 2212 * update @stashed with the newly added dentry. 2213 * 2214 * Special-purpose helper for nsfs and pidfs. 2215 * 2216 * Return: On success zero and on failure a negative error is returned. 
2217 */ 2218 int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, 2219 struct path *path) 2220 { 2221 struct dentry *dentry; 2222 const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; 2223 2224 /* See if dentry can be reused. */ 2225 path->dentry = stashed_dentry_get(stashed); 2226 if (path->dentry) { 2227 sops->put_data(data); 2228 goto out_path; 2229 } 2230 2231 /* Allocate a new dentry. */ 2232 dentry = prepare_anon_dentry(stashed, mnt->mnt_sb, data); 2233 if (IS_ERR(dentry)) 2234 return PTR_ERR(dentry); 2235 2236 /* Added a new dentry. @data is now owned by the filesystem. */ 2237 path->dentry = stash_dentry(stashed, dentry); 2238 if (path->dentry != dentry) 2239 dput(dentry); 2240 2241 out_path: 2242 WARN_ON_ONCE(path->dentry->d_fsdata != stashed); 2243 WARN_ON_ONCE(d_inode(path->dentry)->i_private != data); 2244 path->mnt = mntget(mnt); 2245 return 0; 2246 } 2247 2248 void stashed_dentry_prune(struct dentry *dentry) 2249 { 2250 struct dentry **stashed = dentry->d_fsdata; 2251 struct inode *inode = d_inode(dentry); 2252 2253 if (WARN_ON_ONCE(!stashed)) 2254 return; 2255 2256 if (!inode) 2257 return; 2258 2259 /* 2260 * Only replace our own @dentry as someone else might've 2261 * already cleared out @dentry and stashed their own 2262 * dentry in there. 2263 */ 2264 cmpxchg(stashed, dentry, NULL); 2265 } 2266