// SPDX-License-Identifier: GPL-2.0-only
/*
 *      fs/libfs.c
 *      Library for filesystem writers.
 */

#include <linux/blkdev.h>
#include <linux/export.h>
#include <linux/pagemap.h>
#include <linux/slab.h>
#include <linux/cred.h>
#include <linux/mount.h>
#include <linux/vfs.h>
#include <linux/quotaops.h>
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>

#include <linux/uaccess.h>

#include "internal.h"

int simple_getattr(const struct path *path, struct kstat *stat,
                   u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        generic_fillattr(inode, stat);
        stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
        return 0;
}
EXPORT_SYMBOL(simple_getattr);

int simple_statfs(struct dentry *dentry, struct kstatfs *buf)
{
        buf->f_type = dentry->d_sb->s_magic;
        buf->f_bsize = PAGE_SIZE;
        buf->f_namelen = NAME_MAX;
        return 0;
}
EXPORT_SYMBOL(simple_statfs);

/*
 * Retaining negative dentries for an in-memory filesystem just wastes
 * memory and lookup time: arrange for them to be deleted immediately.
 */
int always_delete_dentry(const struct dentry *dentry)
{
        return 1;
}
EXPORT_SYMBOL(always_delete_dentry);

const struct dentry_operations simple_dentry_operations = {
        .d_delete = always_delete_dentry,
};
EXPORT_SYMBOL(simple_dentry_operations);

/*
 * Lookup the data. This is trivial - if the dentry didn't already
 * exist, we know it is negative.  Set d_op to delete negative dentries.
 */
struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        if (dentry->d_name.len > NAME_MAX)
                return ERR_PTR(-ENAMETOOLONG);
        if (!dentry->d_sb->s_d_op)
                d_set_d_op(dentry, &simple_dentry_operations);
        d_add(dentry, NULL);
        return NULL;
}
EXPORT_SYMBOL(simple_lookup);

int dcache_dir_open(struct inode *inode, struct file *file)
{
        file->private_data = d_alloc_cursor(file->f_path.dentry);

        return file->private_data ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(dcache_dir_open);

int dcache_dir_close(struct inode *inode, struct file *file)
{
        dput(file->private_data);
        return 0;
}
EXPORT_SYMBOL(dcache_dir_close);

/* parent is locked at least shared */
/*
 * Returns an element of siblings' list.
 * We are looking for <count>th positive after <p>; if
 * found, dentry is grabbed and returned to caller.
 * If no such element exists, NULL is returned.
 */
static struct dentry *scan_positives(struct dentry *cursor,
                                     struct list_head *p,
                                     loff_t count,
                                     struct dentry *last)
{
        struct dentry *dentry = cursor->d_parent, *found = NULL;

        spin_lock(&dentry->d_lock);
        while ((p = p->next) != &dentry->d_subdirs) {
                struct dentry *d = list_entry(p, struct dentry, d_child);
                // we must at least skip cursors, to avoid livelocks
                if (d->d_flags & DCACHE_DENTRY_CURSOR)
                        continue;
                if (simple_positive(d) && !--count) {
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                found = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(found))
                                break;
                        count = 1;
                }
                if (need_resched()) {
                        list_move(&cursor->d_child, p);
                        p = &cursor->d_child;
                        spin_unlock(&dentry->d_lock);
                        cond_resched();
                        spin_lock(&dentry->d_lock);
                }
        }
        spin_unlock(&dentry->d_lock);
        dput(last);
        return found;
}

loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
{
        struct dentry *dentry = file->f_path.dentry;
        switch (whence) {
                case 1:
                        offset += file->f_pos;
                        /* fall through */
                case 0:
                        if (offset >= 0)
                                break;
                        /* fall through */
                default:
                        return -EINVAL;
        }
        if (offset != file->f_pos) {
                struct dentry *cursor = file->private_data;
                struct dentry *to = NULL;

                inode_lock_shared(dentry->d_inode);

                if (offset > 2)
                        to = scan_positives(cursor, &dentry->d_subdirs,
                                            offset - 2, NULL);
                spin_lock(&dentry->d_lock);
                if (to)
                        list_move(&cursor->d_child, &to->d_child);
                else
                        list_del_init(&cursor->d_child);
                spin_unlock(&dentry->d_lock);
                dput(to);

                file->f_pos = offset;

                inode_unlock_shared(dentry->d_inode);
        }
        return offset;
}
EXPORT_SYMBOL(dcache_dir_lseek);

/* Relationship between i_mode and the DT_xxx types */
static inline unsigned char dt_type(struct inode *inode)
{
        return (inode->i_mode >> 12) & 15;
}

/*
 * Directory is locked and all positive dentries in it are safe, since
 * for ramfs-type trees they can't go away without unlink() or rmdir(),
 * both impossible due to the lock on directory.
 */

int dcache_readdir(struct file *file, struct dir_context *ctx)
{
        struct dentry *dentry = file->f_path.dentry;
        struct dentry *cursor = file->private_data;
        struct list_head *anchor = &dentry->d_subdirs;
        struct dentry *next = NULL;
        struct list_head *p;

        if (!dir_emit_dots(file, ctx))
                return 0;

        if (ctx->pos == 2)
                p = anchor;
        else if (!list_empty(&cursor->d_child))
                p = &cursor->d_child;
        else
                return 0;

        while ((next = scan_positives(cursor, p, 1, next)) != NULL) {
                if (!dir_emit(ctx, next->d_name.name, next->d_name.len,
                              d_inode(next)->i_ino, dt_type(d_inode(next))))
                        break;
                ctx->pos++;
                p = &next->d_child;
        }
        spin_lock(&dentry->d_lock);
        if (next)
                list_move_tail(&cursor->d_child, &next->d_child);
        else
                list_del_init(&cursor->d_child);
        spin_unlock(&dentry->d_lock);
        dput(next);

        return 0;
}
EXPORT_SYMBOL(dcache_readdir);

ssize_t generic_read_dir(struct file *filp, char __user *buf, size_t siz, loff_t *ppos)
{
        return -EISDIR;
}
EXPORT_SYMBOL(generic_read_dir);

const struct file_operations simple_dir_operations = {
        .open           = dcache_dir_open,
        .release        = dcache_dir_close,
        .llseek         = dcache_dir_lseek,
        .read           = generic_read_dir,
        .iterate_shared = dcache_readdir,
        .fsync          = noop_fsync,
};
EXPORT_SYMBOL(simple_dir_operations);

const struct inode_operations simple_dir_inode_operations = {
        .lookup         = simple_lookup,
};
EXPORT_SYMBOL(simple_dir_inode_operations);

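/*
 * Example (illustrative sketch, not part of this file): a minimal in-memory
 * filesystem can reuse the two tables above wholesale when it sets up a
 * directory inode.  foo_get_inode() is a hypothetical helper:
 *
 *      struct inode *foo_get_inode(struct super_block *sb, umode_t mode)
 *      {
 *              struct inode *inode = new_inode(sb);
 *
 *              if (!inode)
 *                      return NULL;
 *              inode->i_ino = get_next_ino();
 *              inode->i_mode = mode;
 *              inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
 *              if (S_ISDIR(mode)) {
 *                      inode->i_op = &simple_dir_inode_operations;
 *                      inode->i_fop = &simple_dir_operations;
 *                      inc_nlink(inode);       // "." makes directories start at nlink == 2
 *              }
 *              return inode;
 *      }
 */
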
static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev)
{
        struct dentry *child = NULL;
        struct list_head *p = prev ? &prev->d_child : &parent->d_subdirs;

        spin_lock(&parent->d_lock);
        while ((p = p->next) != &parent->d_subdirs) {
                struct dentry *d = container_of(p, struct dentry, d_child);
                if (simple_positive(d)) {
                        spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
                        if (simple_positive(d))
                                child = dget_dlock(d);
                        spin_unlock(&d->d_lock);
                        if (likely(child))
                                break;
                }
        }
        spin_unlock(&parent->d_lock);
        dput(prev);
        return child;
}

void simple_recursive_removal(struct dentry *dentry,
                              void (*callback)(struct dentry *))
{
        struct dentry *this = dget(dentry);
        while (true) {
                struct dentry *victim = NULL, *child;
                struct inode *inode = this->d_inode;

                inode_lock(inode);
                if (d_is_dir(this))
                        inode->i_flags |= S_DEAD;
                while ((child = find_next_child(this, victim)) == NULL) {
                        // kill and ascend
                        // update metadata while it's still locked
                        inode->i_ctime = current_time(inode);
                        clear_nlink(inode);
                        inode_unlock(inode);
                        victim = this;
                        this = this->d_parent;
                        inode = this->d_inode;
                        inode_lock(inode);
                        if (simple_positive(victim)) {
                                d_invalidate(victim);   // avoid lost mounts
                                if (d_is_dir(victim))
                                        fsnotify_rmdir(inode, victim);
                                else
                                        fsnotify_unlink(inode, victim);
                                if (callback)
                                        callback(victim);
                                dput(victim);           // unpin it
                        }
                        if (victim == dentry) {
                                inode->i_ctime = inode->i_mtime =
                                        current_time(inode);
                                if (d_is_dir(dentry))
                                        drop_nlink(inode);
                                inode_unlock(inode);
                                dput(dentry);
                                return;
                        }
                }
                inode_unlock(inode);
                this = child;
        }
}
EXPORT_SYMBOL(simple_recursive_removal);

static const struct super_operations simple_super_operations = {
        .statfs         = simple_statfs,
};

static int pseudo_fs_fill_super(struct super_block *s, struct fs_context *fc)
{
        struct pseudo_fs_context *ctx = fc->fs_private;
        struct inode *root;

        s->s_maxbytes = MAX_LFS_FILESIZE;
        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = ctx->magic;
        s->s_op = ctx->ops ?: &simple_super_operations;
        s->s_xattr = ctx->xattr;
        s->s_time_gran = 1;
        root = new_inode(s);
        if (!root)
                return -ENOMEM;

        /*
         * since this is the first inode, make it number 1. New inodes created
         * after this must take care not to collide with it (by passing
         * max_reserved of 1 to iunique).
         */
        root->i_ino = 1;
        root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
        root->i_atime = root->i_mtime = root->i_ctime = current_time(root);
        s->s_root = d_make_root(root);
        if (!s->s_root)
                return -ENOMEM;
        s->s_d_op = ctx->dops;
        return 0;
}

static int pseudo_fs_get_tree(struct fs_context *fc)
{
        return get_tree_nodev(fc, pseudo_fs_fill_super);
}

static void pseudo_fs_free(struct fs_context *fc)
{
        kfree(fc->fs_private);
}

static const struct fs_context_operations pseudo_fs_context_ops = {
        .free           = pseudo_fs_free,
        .get_tree       = pseudo_fs_get_tree,
};

/*
 * Common helper for pseudo-filesystems (sockfs, pipefs, bdev - stuff that
 * will never be mountable)
 */
struct pseudo_fs_context *init_pseudo(struct fs_context *fc,
                                      unsigned long magic)
{
        struct pseudo_fs_context *ctx;

        ctx = kzalloc(sizeof(struct pseudo_fs_context), GFP_KERNEL);
        if (likely(ctx)) {
                ctx->magic = magic;
                fc->fs_private = ctx;
                fc->ops = &pseudo_fs_context_ops;
                fc->sb_flags |= SB_NOUSER;
                fc->global = true;
        }
        return ctx;
}
EXPORT_SYMBOL(init_pseudo);

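/*
 * Example (sketch): a pseudo-filesystem hooks init_pseudo() into its
 * ->init_fs_context() method.  FOO_MAGIC, foo_dentry_operations and
 * foo_fs_type are hypothetical names:
 *
 *      static int foo_init_fs_context(struct fs_context *fc)
 *      {
 *              struct pseudo_fs_context *ctx = init_pseudo(fc, FOO_MAGIC);
 *
 *              if (!ctx)
 *                      return -ENOMEM;
 *              ctx->dops = &foo_dentry_operations;     // optional overrides
 *              return 0;
 *      }
 *
 *      static struct file_system_type foo_fs_type = {
 *              .name                   = "foo",
 *              .init_fs_context        = foo_init_fs_context,
 *              .kill_sb                = kill_anon_super,
 *      };
 */
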
int simple_open(struct inode *inode, struct file *file)
{
        if (inode->i_private)
                file->private_data = inode->i_private;
        return 0;
}
EXPORT_SYMBOL(simple_open);

int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(old_dentry);

        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
        inc_nlink(inode);
        ihold(inode);
        dget(dentry);
        d_instantiate(dentry, inode);
        return 0;
}
EXPORT_SYMBOL(simple_link);

int simple_empty(struct dentry *dentry)
{
        struct dentry *child;
        int ret = 0;

        spin_lock(&dentry->d_lock);
        list_for_each_entry(child, &dentry->d_subdirs, d_child) {
                spin_lock_nested(&child->d_lock, DENTRY_D_LOCK_NESTED);
                if (simple_positive(child)) {
                        spin_unlock(&child->d_lock);
                        goto out;
                }
                spin_unlock(&child->d_lock);
        }
        ret = 1;
out:
        spin_unlock(&dentry->d_lock);
        return ret;
}
EXPORT_SYMBOL(simple_empty);

int simple_unlink(struct inode *dir, struct dentry *dentry)
{
        struct inode *inode = d_inode(dentry);

        inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
        drop_nlink(inode);
        dput(dentry);
        return 0;
}
EXPORT_SYMBOL(simple_unlink);

int simple_rmdir(struct inode *dir, struct dentry *dentry)
{
        if (!simple_empty(dentry))
                return -ENOTEMPTY;

        drop_nlink(d_inode(dentry));
        simple_unlink(dir, dentry);
        drop_nlink(dir);
        return 0;
}
EXPORT_SYMBOL(simple_rmdir);

int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
                  struct inode *new_dir, struct dentry *new_dentry,
                  unsigned int flags)
{
        struct inode *inode = d_inode(old_dentry);
        int they_are_dirs = d_is_dir(old_dentry);

        if (flags & ~RENAME_NOREPLACE)
                return -EINVAL;

        if (!simple_empty(new_dentry))
                return -ENOTEMPTY;

        if (d_really_is_positive(new_dentry)) {
                simple_unlink(new_dir, new_dentry);
                if (they_are_dirs) {
                        drop_nlink(d_inode(new_dentry));
                        drop_nlink(old_dir);
                }
        } else if (they_are_dirs) {
                drop_nlink(old_dir);
                inc_nlink(new_dir);
        }

        old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime =
                new_dir->i_mtime = inode->i_ctime = current_time(old_dir);

        return 0;
}
EXPORT_SYMBOL(simple_rename);

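/*
 * Example (sketch): a ramfs-like filesystem can assemble most of its
 * directory inode_operations from the helpers above; foo_create and
 * foo_mkdir stand in for the filesystem-specific methods:
 *
 *      static const struct inode_operations foo_dir_inode_operations = {
 *              .create         = foo_create,
 *              .lookup         = simple_lookup,
 *              .link           = simple_link,
 *              .unlink         = simple_unlink,
 *              .mkdir          = foo_mkdir,
 *              .rmdir          = simple_rmdir,
 *              .rename         = simple_rename,
 *      };
 */
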
/**
 * simple_setattr - setattr for simple filesystem
 * @dentry: dentry
 * @iattr: iattr structure
 *
 * Returns 0 on success, -error on failure.
 *
 * simple_setattr is a simple ->setattr implementation without a proper
 * implementation of size changes.
 *
 * It can either be used for in-memory filesystems or special files
 * on simple regular filesystems.  Anything that needs to change on-disk
 * or wire state on size changes needs its own setattr method.
 */
int simple_setattr(struct dentry *dentry, struct iattr *iattr)
{
        struct inode *inode = d_inode(dentry);
        int error;

        error = setattr_prepare(dentry, iattr);
        if (error)
                return error;

        if (iattr->ia_valid & ATTR_SIZE)
                truncate_setsize(inode, iattr->ia_size);
        setattr_copy(inode, iattr);
        mark_inode_dirty(inode);
        return 0;
}
EXPORT_SYMBOL(simple_setattr);

int simple_readpage(struct file *file, struct page *page)
{
        clear_highpage(page);
        flush_dcache_page(page);
        SetPageUptodate(page);
        unlock_page(page);
        return 0;
}
EXPORT_SYMBOL(simple_readpage);

int simple_write_begin(struct file *file, struct address_space *mapping,
                       loff_t pos, unsigned len, unsigned flags,
                       struct page **pagep, void **fsdata)
{
        struct page *page;
        pgoff_t index;

        index = pos >> PAGE_SHIFT;

        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;

        *pagep = page;

        if (!PageUptodate(page) && (len != PAGE_SIZE)) {
                unsigned from = pos & (PAGE_SIZE - 1);

                zero_user_segments(page, 0, from, from + len, PAGE_SIZE);
        }
        return 0;
}
EXPORT_SYMBOL(simple_write_begin);

/**
 * simple_write_end - .write_end helper for non-block-device FSes
 * @file: See .write_end of address_space_operations
 * @mapping:            "
 * @pos:                "
 * @len:                "
 * @copied:             "
 * @page:               "
 * @fsdata:             "
 *
 * simple_write_end does the minimum needed for updating a page after writing is
 * done. It has the same API signature as the .write_end of
 * address_space_operations vector. So it can just be set onto .write_end for
 * FSes that don't need any other processing. i_mutex is assumed to be held.
 * Block based filesystems should use generic_write_end().
 * NOTE: Even though i_size might get updated by this function, mark_inode_dirty
 * is not called, so a filesystem that actually does store data in .write_inode
 * should extend on what's done here with a call to mark_inode_dirty() in the
 * case that i_size has changed.
 *
 * Use *ONLY* with simple_readpage()
 */
int simple_write_end(struct file *file, struct address_space *mapping,
                     loff_t pos, unsigned len, unsigned copied,
                     struct page *page, void *fsdata)
{
        struct inode *inode = page->mapping->host;
        loff_t last_pos = pos + copied;

        /* zero the stale part of the page if we did a short copy */
        if (!PageUptodate(page)) {
                if (copied < len) {
                        unsigned from = pos & (PAGE_SIZE - 1);

                        zero_user(page, from + copied, len - copied);
                }
                SetPageUptodate(page);
        }
        /*
         * No need to use i_size_read() here, the i_size
         * cannot change under us because we hold the i_mutex.
         */
        if (last_pos > inode->i_size)
                i_size_write(inode, last_pos);

        set_page_dirty(page);
        unlock_page(page);
        put_page(page);

        return copied;
}
EXPORT_SYMBOL(simple_write_end);

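/*
 * Example (sketch): a page-cache-backed in-memory filesystem can use the
 * three helpers above directly as its address_space_operations, much as
 * ramfs does:
 *
 *      static const struct address_space_operations foo_aops = {
 *              .readpage       = simple_readpage,
 *              .write_begin    = simple_write_begin,
 *              .write_end      = simple_write_end,
 *              .set_page_dirty = __set_page_dirty_no_writeback,
 *      };
 */
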
/*
 * the inodes created here are not hashed. If you use iunique to generate
 * unique inode values later for this filesystem, then you must take care
 * to pass it an appropriate max_reserved value to avoid collisions.
 */
int simple_fill_super(struct super_block *s, unsigned long magic,
                      const struct tree_descr *files)
{
        struct inode *inode;
        struct dentry *root;
        struct dentry *dentry;
        int i;

        s->s_blocksize = PAGE_SIZE;
        s->s_blocksize_bits = PAGE_SHIFT;
        s->s_magic = magic;
        s->s_op = &simple_super_operations;
        s->s_time_gran = 1;

        inode = new_inode(s);
        if (!inode)
                return -ENOMEM;
        /*
         * because the root inode is 1, the files array must not contain an
         * entry at index 1
         */
        inode->i_ino = 1;
        inode->i_mode = S_IFDIR | 0755;
        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
        inode->i_op = &simple_dir_inode_operations;
        inode->i_fop = &simple_dir_operations;
        set_nlink(inode, 2);
        root = d_make_root(inode);
        if (!root)
                return -ENOMEM;
        for (i = 0; !files->name || files->name[0]; i++, files++) {
                if (!files->name)
                        continue;

                /* warn if it tries to conflict with the root inode */
                if (unlikely(i == 1))
                        printk(KERN_WARNING "%s: %s passed in a files array "
                                "with an index of 1!\n", __func__,
                                s->s_type->name);

                dentry = d_alloc_name(root, files->name);
                if (!dentry)
                        goto out;
                inode = new_inode(s);
                if (!inode) {
                        dput(dentry);
                        goto out;
                }
                inode->i_mode = S_IFREG | files->mode;
                inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
                inode->i_fop = files->ops;
                inode->i_ino = i;
                d_add(dentry, inode);
        }
        s->s_root = root;
        return 0;
out:
        d_genocide(root);
        shrink_dcache_parent(root);
        dput(root);
        return -ENOMEM;
}
EXPORT_SYMBOL(simple_fill_super);

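/*
 * Example (sketch): a small control filesystem describes its root directory
 * with a tree_descr table.  Index 1 must stay unused (it would collide with
 * the root inode) and an empty name terminates the array; foo_status_fops
 * and FOO_SUPER_MAGIC are hypothetical:
 *
 *      static const struct tree_descr foo_files[] = {
 *              [2] = { "status", &foo_status_fops, S_IRUGO },
 *              { "" }  // terminator
 *      };
 *
 *      // from the filesystem's fill_super callback:
 *      err = simple_fill_super(sb, FOO_SUPER_MAGIC, foo_files);
 */
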
static DEFINE_SPINLOCK(pin_fs_lock);

int simple_pin_fs(struct file_system_type *type, struct vfsmount **mount, int *count)
{
        struct vfsmount *mnt = NULL;
        spin_lock(&pin_fs_lock);
        if (unlikely(!*mount)) {
                spin_unlock(&pin_fs_lock);
                mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL);
                if (IS_ERR(mnt))
                        return PTR_ERR(mnt);
                spin_lock(&pin_fs_lock);
                if (!*mount)
                        *mount = mnt;
        }
        mntget(*mount);
        ++*count;
        spin_unlock(&pin_fs_lock);
        mntput(mnt);
        return 0;
}
EXPORT_SYMBOL(simple_pin_fs);

void simple_release_fs(struct vfsmount **mount, int *count)
{
        struct vfsmount *mnt;
        spin_lock(&pin_fs_lock);
        mnt = *mount;
        if (!--*count)
                *mount = NULL;
        spin_unlock(&pin_fs_lock);
        mntput(mnt);
}
EXPORT_SYMBOL(simple_release_fs);

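/*
 * Example (sketch): callers keep a mount/count pair in static storage and
 * bracket object creation with pin/release, in the style of debugfs;
 * foo_fs_type is hypothetical:
 *
 *      static struct vfsmount *foo_mount;
 *      static int foo_mount_count;
 *
 *      err = simple_pin_fs(&foo_fs_type, &foo_mount, &foo_mount_count);
 *      if (err)
 *              return err;
 *      // ... create dentries/inodes on foo_mount ...
 *      simple_release_fs(&foo_mount, &foo_mount_count);        // on teardown
 */
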
/**
 * simple_read_from_buffer - copy data from the buffer to user space
 * @to: the user space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The simple_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the user space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or a negative value is returned on error.
 **/
ssize_t simple_read_from_buffer(void __user *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;
        size_t ret;

        if (pos < 0)
                return -EINVAL;
        if (pos >= available || !count)
                return 0;
        if (count > available - pos)
                count = available - pos;
        ret = copy_to_user(to, from + pos, count);
        if (ret == count)
                return -EFAULT;
        count -= ret;
        *ppos = pos + count;
        return count;
}
EXPORT_SYMBOL(simple_read_from_buffer);

/**
 * simple_write_to_buffer - copy data from user space to the buffer
 * @to: the buffer to write to
 * @available: the size of the buffer
 * @ppos: the current position in the buffer
 * @from: the user space buffer to read from
 * @count: the maximum number of bytes to read
 *
 * The simple_write_to_buffer() function reads up to @count bytes from the user
 * space address starting at @from into the buffer @to at offset @ppos.
 *
 * On success, the number of bytes written is returned and the offset @ppos is
 * advanced by this number, or a negative value is returned on error.
 **/
ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos,
                               const void __user *from, size_t count)
{
        loff_t pos = *ppos;
        size_t res;

        if (pos < 0)
                return -EINVAL;
        if (pos >= available || !count)
                return 0;
        if (count > available - pos)
                count = available - pos;
        res = copy_from_user(to + pos, from, count);
        if (res == count)
                return -EFAULT;
        count -= res;
        *ppos = pos + count;
        return count;
}
EXPORT_SYMBOL(simple_write_to_buffer);

/**
 * memory_read_from_buffer - copy data from the buffer
 * @to: the kernel space buffer to read to
 * @count: the maximum number of bytes to read
 * @ppos: the current position in the buffer
 * @from: the buffer to read from
 * @available: the size of the buffer
 *
 * The memory_read_from_buffer() function reads up to @count bytes from the
 * buffer @from at offset @ppos into the kernel space address starting at @to.
 *
 * On success, the number of bytes read is returned and the offset @ppos is
 * advanced by this number, or a negative value is returned on error.
 **/
ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
                                const void *from, size_t available)
{
        loff_t pos = *ppos;

        if (pos < 0)
                return -EINVAL;
        if (pos >= available)
                return 0;
        if (count > available - pos)
                count = available - pos;
        memcpy(to, from + pos, count);
        *ppos = pos + count;

        return count;
}
EXPORT_SYMBOL(memory_read_from_buffer);

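/*
 * Example (sketch): a trivial read() file operation built on
 * simple_read_from_buffer(); foo_read is hypothetical:
 *
 *      static ssize_t foo_read(struct file *file, char __user *buf,
 *                              size_t count, loff_t *ppos)
 *      {
 *              static const char msg[] = "hello\n";
 *
 *              return simple_read_from_buffer(buf, count, ppos,
 *                                             msg, sizeof(msg) - 1);
 *      }
 */
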
/*
 * Transaction based IO.
 * The file expects a single write which triggers the transaction, and then
 * possibly a read which collects the result - which is stored in a
 * file-local buffer.
 */

void simple_transaction_set(struct file *file, size_t n)
{
        struct simple_transaction_argresp *ar = file->private_data;

        BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);

        /*
         * The barrier ensures that ar->size will really remain zero until
         * ar->data is ready for reading.
         */
        smp_mb();
        ar->size = n;
}
EXPORT_SYMBOL(simple_transaction_set);

char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
{
        struct simple_transaction_argresp *ar;
        static DEFINE_SPINLOCK(simple_transaction_lock);

        if (size > SIMPLE_TRANSACTION_LIMIT - 1)
                return ERR_PTR(-EFBIG);

        ar = (struct simple_transaction_argresp *)get_zeroed_page(GFP_KERNEL);
        if (!ar)
                return ERR_PTR(-ENOMEM);

        spin_lock(&simple_transaction_lock);

        /* only one write allowed per open */
        if (file->private_data) {
                spin_unlock(&simple_transaction_lock);
                free_page((unsigned long)ar);
                return ERR_PTR(-EBUSY);
        }

        file->private_data = ar;

        spin_unlock(&simple_transaction_lock);

        if (copy_from_user(ar->data, buf, size))
                return ERR_PTR(-EFAULT);

        return ar->data;
}
EXPORT_SYMBOL(simple_transaction_get);

ssize_t simple_transaction_read(struct file *file, char __user *buf, size_t size, loff_t *pos)
{
        struct simple_transaction_argresp *ar = file->private_data;

        if (!ar)
                return 0;
        return simple_read_from_buffer(buf, size, pos, ar->data, ar->size);
}
EXPORT_SYMBOL(simple_transaction_read);

int simple_transaction_release(struct inode *inode, struct file *file)
{
        free_page((unsigned long)file->private_data);
        return 0;
}
EXPORT_SYMBOL(simple_transaction_release);

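/*
 * Example (sketch): a transaction file pairs a write method built on
 * simple_transaction_get()/simple_transaction_set() with the read and
 * release helpers above.  foo_process() is a hypothetical handler that
 * rewrites the buffer in place and returns the reply length (or a
 * negative errno):
 *
 *      static ssize_t foo_write(struct file *file, const char __user *buf,
 *                               size_t size, loff_t *pos)
 *      {
 *              ssize_t rv;
 *              char *data = simple_transaction_get(file, buf, size);
 *
 *              if (IS_ERR(data))
 *                      return PTR_ERR(data);
 *              rv = foo_process(data, size);
 *              if (rv < 0)
 *                      return rv;
 *              simple_transaction_set(file, rv);
 *              return size;
 *      }
 *
 *      static const struct file_operations foo_transaction_ops = {
 *              .write          = foo_write,
 *              .read           = simple_transaction_read,
 *              .release        = simple_transaction_release,
 *              .llseek         = default_llseek,
 *      };
 */
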
/* Simple attribute files */

struct simple_attr {
        int (*get)(void *, u64 *);
        int (*set)(void *, u64);
        char get_buf[24];       /* enough to store a u64 and "\n\0" */
        char set_buf[24];
        void *data;
        const char *fmt;        /* format for read operation */
        struct mutex mutex;     /* protects access to these buffers */
};

/* simple_attr_open is called by an actual attribute open file operation
 * to set the attribute specific access operations. */
int simple_attr_open(struct inode *inode, struct file *file,
                     int (*get)(void *, u64 *), int (*set)(void *, u64),
                     const char *fmt)
{
        struct simple_attr *attr;

        attr = kzalloc(sizeof(*attr), GFP_KERNEL);
        if (!attr)
                return -ENOMEM;

        attr->get = get;
        attr->set = set;
        attr->data = inode->i_private;
        attr->fmt = fmt;
        mutex_init(&attr->mutex);

        file->private_data = attr;

        return nonseekable_open(inode, file);
}
EXPORT_SYMBOL_GPL(simple_attr_open);

int simple_attr_release(struct inode *inode, struct file *file)
{
        kfree(file->private_data);
        return 0;
}
EXPORT_SYMBOL_GPL(simple_attr_release);  /* GPL-only?  This?  Really? */

/* read from the buffer that is filled with the get function */
ssize_t simple_attr_read(struct file *file, char __user *buf,
                         size_t len, loff_t *ppos)
{
        struct simple_attr *attr;
        size_t size;
        ssize_t ret;

        attr = file->private_data;

        if (!attr->get)
                return -EACCES;

        ret = mutex_lock_interruptible(&attr->mutex);
        if (ret)
                return ret;

        if (*ppos && attr->get_buf[0]) {
                /* continued read */
                size = strlen(attr->get_buf);
        } else {
                /* first read */
                u64 val;
                ret = attr->get(attr->data, &val);
                if (ret)
                        goto out;

                size = scnprintf(attr->get_buf, sizeof(attr->get_buf),
                                 attr->fmt, (unsigned long long)val);
        }

        ret = simple_read_from_buffer(buf, len, ppos, attr->get_buf, size);
out:
        mutex_unlock(&attr->mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(simple_attr_read);

/* interpret the buffer as a number to call the set function with */
ssize_t simple_attr_write(struct file *file, const char __user *buf,
                          size_t len, loff_t *ppos)
{
        struct simple_attr *attr;
        u64 val;
        size_t size;
        ssize_t ret;

        attr = file->private_data;
        if (!attr->set)
                return -EACCES;

        ret = mutex_lock_interruptible(&attr->mutex);
        if (ret)
                return ret;

        ret = -EFAULT;
        size = min(sizeof(attr->set_buf) - 1, len);
        if (copy_from_user(attr->set_buf, buf, size))
                goto out;

        attr->set_buf[size] = '\0';
        val = simple_strtoll(attr->set_buf, NULL, 0);
        ret = attr->set(attr->data, val);
        if (ret == 0)
                ret = len; /* on success, claim we got the whole input */
out:
        mutex_unlock(&attr->mutex);
        return ret;
}
EXPORT_SYMBOL_GPL(simple_attr_write);

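/*
 * Example (sketch): the usual consumer of the simple_attr_* helpers is the
 * DEFINE_SIMPLE_ATTRIBUTE() macro from <linux/fs.h>, which wires them into
 * a file_operations structure; foo_counter is hypothetical:
 *
 *      static int foo_counter_get(void *data, u64 *val)
 *      {
 *              *val = *(u64 *)data;
 *              return 0;
 *      }
 *
 *      static int foo_counter_set(void *data, u64 val)
 *      {
 *              *(u64 *)data = val;
 *              return 0;
 *      }
 *
 *      DEFINE_SIMPLE_ATTRIBUTE(foo_counter_fops, foo_counter_get,
 *                              foo_counter_set, "%llu\n");
 *
 * The resulting foo_counter_fops can then be handed to something like
 * debugfs_create_file(), with the attribute's storage passed as the data
 * pointer that ends up in inode->i_private.
 */
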
/**
 * generic_fh_to_dentry - generic helper for the fh_to_dentry export operation
 * @sb: filesystem to do the file handle conversion on
 * @fid: file handle to convert
 * @fh_len: length of the file handle in bytes
 * @fh_type: type of file handle
 * @get_inode: filesystem callback to retrieve inode
 *
 * This function decodes @fid as long as it has one of the well-known
 * Linux filehandle types and calls @get_inode on it to retrieve the
 * inode for the object specified in the file handle.
 */
struct dentry *generic_fh_to_dentry(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *inode = NULL;

        if (fh_len < 2)
                return NULL;

        switch (fh_type) {
        case FILEID_INO32_GEN:
        case FILEID_INO32_GEN_PARENT:
                inode = get_inode(sb, fid->i32.ino, fid->i32.gen);
                break;
        }

        return d_obtain_alias(inode);
}
EXPORT_SYMBOL_GPL(generic_fh_to_dentry);

/**
 * generic_fh_to_parent - generic helper for the fh_to_parent export operation
 * @sb: filesystem to do the file handle conversion on
 * @fid: file handle to convert
 * @fh_len: length of the file handle in bytes
 * @fh_type: type of file handle
 * @get_inode: filesystem callback to retrieve inode
 *
 * This function decodes @fid as long as it has one of the well-known
 * Linux filehandle types and calls @get_inode on it to retrieve the
 * inode for the _parent_ object specified in the file handle if it
 * is specified in the file handle, or NULL otherwise.
 */
struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
                int fh_len, int fh_type, struct inode *(*get_inode)
                        (struct super_block *sb, u64 ino, u32 gen))
{
        struct inode *inode = NULL;

        if (fh_len <= 2)
                return NULL;

        switch (fh_type) {
        case FILEID_INO32_GEN_PARENT:
                inode = get_inode(sb, fid->i32.parent_ino,
                                  (fh_len > 3 ? fid->i32.parent_gen : 0));
                break;
        }

        return d_obtain_alias(inode);
}
EXPORT_SYMBOL_GPL(generic_fh_to_parent);

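/*
 * Example (sketch): filesystems that store 32-bit inode numbers and
 * generations wrap the two helpers above with their own get_inode
 * callback; foo_nfs_get_inode is hypothetical, and fh_to_parent is
 * wrapped the same way around generic_fh_to_parent():
 *
 *      static struct dentry *foo_fh_to_dentry(struct super_block *sb,
 *                      struct fid *fid, int fh_len, int fh_type)
 *      {
 *              return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
 *                                          foo_nfs_get_inode);
 *      }
 *
 *      static const struct export_operations foo_export_ops = {
 *              .fh_to_dentry   = foo_fh_to_dentry,
 *              .fh_to_parent   = foo_fh_to_parent,
 *      };
 */
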
/**
 * __generic_file_fsync - generic fsync implementation for simple filesystems
 *
 * @file: file to synchronize
 * @start: start offset in bytes
 * @end: end offset in bytes (inclusive)
 * @datasync: only synchronize essential metadata if true
 *
 * This is a generic implementation of the fsync method for simple
 * filesystems which track all non-inode metadata in the buffers list
 * hanging off the address_space structure.
 */
int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
                         int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int err;
        int ret;

        err = file_write_and_wait_range(file, start, end);
        if (err)
                return err;

        inode_lock(inode);
        ret = sync_mapping_buffers(inode->i_mapping);
        if (!(inode->i_state & I_DIRTY_ALL))
                goto out;
        if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
                goto out;

        err = sync_inode_metadata(inode, 1);
        if (ret == 0)
                ret = err;

out:
        inode_unlock(inode);
        /* check and advance again to catch errors after syncing out buffers */
        err = file_check_and_advance_wb_err(file);
        if (ret == 0)
                ret = err;
        return ret;
}
EXPORT_SYMBOL(__generic_file_fsync);

/**
 * generic_file_fsync - generic fsync implementation for simple filesystems
 *                      with flush
 * @file: file to synchronize
 * @start: start offset in bytes
 * @end: end offset in bytes (inclusive)
 * @datasync: only synchronize essential metadata if true
 *
 */

int generic_file_fsync(struct file *file, loff_t start, loff_t end,
                       int datasync)
{
        struct inode *inode = file->f_mapping->host;
        int err;

        err = __generic_file_fsync(file, start, end, datasync);
        if (err)
                return err;
        return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
}
EXPORT_SYMBOL(generic_file_fsync);

/**
 * generic_check_addressable - Check addressability of file system
 * @blocksize_bits: log of file system block size
 * @num_blocks: number of blocks in file system
 *
 * Determine whether a file system with @num_blocks blocks (and a
 * block size of 2**@blocksize_bits) is addressable by the sector_t
 * and page cache of the system.  Return 0 if so and -EFBIG otherwise.
 */
int generic_check_addressable(unsigned blocksize_bits, u64 num_blocks)
{
        u64 last_fs_block = num_blocks - 1;
        u64 last_fs_page =
                last_fs_block >> (PAGE_SHIFT - blocksize_bits);

        if (unlikely(num_blocks == 0))
                return 0;

        if ((blocksize_bits < 9) || (blocksize_bits > PAGE_SHIFT))
                return -EINVAL;

        if ((last_fs_block > (sector_t)(~0ULL) >> (blocksize_bits - 9)) ||
            (last_fs_page > (pgoff_t)(~0ULL))) {
                return -EFBIG;
        }
        return 0;
}
EXPORT_SYMBOL(generic_check_addressable);

/*
 * No-op implementation of ->fsync for in-memory filesystems.
 */
int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
{
        return 0;
}
EXPORT_SYMBOL(noop_fsync);

int noop_set_page_dirty(struct page *page)
{
        /*
         * Unlike __set_page_dirty_no_writeback that handles dirty page
         * tracking in the page object, dax does all dirty tracking in
         * the inode address_space in response to mkwrite faults. In the
         * dax case we only need to worry about potentially dirty CPU
         * caches, not dirty page cache pages to write back.
         *
         * This callback is defined to prevent fallback to
         * __set_page_dirty_buffers() in set_page_dirty().
         */
        return 0;
}
EXPORT_SYMBOL_GPL(noop_set_page_dirty);

void noop_invalidatepage(struct page *page, unsigned int offset,
                         unsigned int length)
{
        /*
         * There is no page cache to invalidate in the dax case, however
         * we need this callback defined to prevent falling back to
         * block_invalidatepage() in do_invalidatepage().
         */
}
EXPORT_SYMBOL_GPL(noop_invalidatepage);

ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
        /*
         * iomap based filesystems support direct I/O without need for
         * this callback. However, it still needs to be set in
         * inode->a_ops so that open/fcntl know that direct I/O is
         * generally supported.
         */
        return -EINVAL;
}
EXPORT_SYMBOL_GPL(noop_direct_IO);

/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
void kfree_link(void *p)
{
        kfree(p);
}
EXPORT_SYMBOL(kfree_link);

/*
 * nop .set_page_dirty method so that people can use .page_mkwrite on
 * anon inodes.
 */
static int anon_set_page_dirty(struct page *page)
{
        return 0;
}

/*
 * A single inode exists for all anon_inode files. Contrary to pipes,
 * anon_inode inodes have no associated per-instance data, so we need
 * only allocate one of them.
 */
struct inode *alloc_anon_inode(struct super_block *s)
{
        static const struct address_space_operations anon_aops = {
                .set_page_dirty = anon_set_page_dirty,
        };
        struct inode *inode = new_inode_pseudo(s);

        if (!inode)
                return ERR_PTR(-ENOMEM);

        inode->i_ino = get_next_ino();
        inode->i_mapping->a_ops = &anon_aops;

        /*
         * Mark the inode dirty from the very beginning,
         * that way it will never be moved to the dirty
         * list because mark_inode_dirty() will think
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
        inode->i_mode = S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_flags |= S_PRIVATE;
        inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
        return inode;
}
EXPORT_SYMBOL(alloc_anon_inode);

/**
 * simple_nosetlease - generic helper for prohibiting leases
 * @filp: file pointer
 * @arg: type of lease to obtain
 * @flp: new lease supplied for insertion
 * @priv: private data for lm_setup operation
 *
 * Generic helper for filesystems that do not wish to allow leases to be set.
 * All arguments are ignored and it just returns -EINVAL.
 */
int simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
                      void **priv)
{
        return -EINVAL;
}
EXPORT_SYMBOL(simple_nosetlease);

/**
 * simple_get_link - generic helper to get the target of "fast" symlinks
 * @dentry: not used here
 * @inode: the symlink inode
 * @done: not used here
 *
 * Generic helper for filesystems to use for symlink inodes where a pointer to
 * the symlink target is stored in ->i_link.  NOTE: this isn't normally called,
 * since as an optimization the path lookup code uses any non-NULL ->i_link
 * directly, without calling ->get_link().  But ->get_link() still must be set,
 * to mark the inode_operations as being for a symlink.
 *
 * Return: the symlink target
 */
const char *simple_get_link(struct dentry *dentry, struct inode *inode,
                            struct delayed_call *done)
{
        return inode->i_link;
}
EXPORT_SYMBOL(simple_get_link);

const struct inode_operations simple_symlink_inode_operations = {
        .get_link = simple_get_link,
};
EXPORT_SYMBOL(simple_symlink_inode_operations);

/*
 * Operations for a permanently empty directory.
 */
static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{
        return ERR_PTR(-ENOENT);
}

static int empty_dir_getattr(const struct path *path, struct kstat *stat,
                             u32 request_mask, unsigned int query_flags)
{
        struct inode *inode = d_inode(path->dentry);
        generic_fillattr(inode, stat);
        return 0;
}

static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
{
        return -EPERM;
}

static ssize_t empty_dir_listxattr(struct dentry *dentry, char *list, size_t size)
{
        return -EOPNOTSUPP;
}

static const struct inode_operations empty_dir_inode_operations = {
        .lookup         = empty_dir_lookup,
        .permission     = generic_permission,
        .setattr        = empty_dir_setattr,
        .getattr        = empty_dir_getattr,
        .listxattr      = empty_dir_listxattr,
};

static loff_t empty_dir_llseek(struct file *file, loff_t offset, int whence)
{
        /* An empty directory has two entries . and .. at offsets 0 and 1 */
        return generic_file_llseek_size(file, offset, whence, 2, 2);
}

static int empty_dir_readdir(struct file *file, struct dir_context *ctx)
{
        dir_emit_dots(file, ctx);
        return 0;
}

static const struct file_operations empty_dir_operations = {
        .llseek         = empty_dir_llseek,
        .read           = generic_read_dir,
        .iterate_shared = empty_dir_readdir,
        .fsync          = noop_fsync,
};


void make_empty_dir_inode(struct inode *inode)
{
        set_nlink(inode, 2);
        inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
        inode->i_uid = GLOBAL_ROOT_UID;
        inode->i_gid = GLOBAL_ROOT_GID;
        inode->i_rdev = 0;
        inode->i_size = 0;
        inode->i_blkbits = PAGE_SHIFT;
        inode->i_blocks = 0;

        inode->i_op = &empty_dir_inode_operations;
        inode->i_opflags &= ~IOP_XATTR;
        inode->i_fop = &empty_dir_operations;
}

bool is_empty_dir_inode(struct inode *inode)
{
        return (inode->i_fop == &empty_dir_operations) &&
                (inode->i_op == &empty_dir_inode_operations);
}

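/*
 * Example (sketch): code that wants a directory nothing can ever be added
 * to (for instance a permanently empty mount point) calls
 * make_empty_dir_inode() on a freshly allocated inode, and can later
 * recognize such inodes with is_empty_dir_inode():
 *
 *      inode = new_inode(sb);
 *      if (!inode)
 *              return -ENOMEM;
 *      inode->i_ino = get_next_ino();
 *      make_empty_dir_inode(inode);
 */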