1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <crypto/hash.h>
7 #include <linux/kernel.h>
8 #include <linux/bio.h>
9 #include <linux/blk-cgroup.h>
10 #include <linux/file.h>
11 #include <linux/fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/highmem.h>
14 #include <linux/time.h>
15 #include <linux/init.h>
16 #include <linux/string.h>
17 #include <linux/backing-dev.h>
18 #include <linux/writeback.h>
19 #include <linux/compat.h>
20 #include <linux/xattr.h>
21 #include <linux/posix_acl.h>
22 #include <linux/falloc.h>
23 #include <linux/slab.h>
24 #include <linux/ratelimit.h>
25 #include <linux/btrfs.h>
26 #include <linux/blkdev.h>
27 #include <linux/posix_acl_xattr.h>
28 #include <linux/uio.h>
29 #include <linux/magic.h>
30 #include <linux/iversion.h>
31 #include <linux/swap.h>
32 #include <linux/migrate.h>
33 #include <linux/sched/mm.h>
34 #include <linux/iomap.h>
35 #include <linux/unaligned.h>
36 #include <linux/fsverity.h>
37 #include "misc.h"
38 #include "ctree.h"
39 #include "disk-io.h"
40 #include "transaction.h"
41 #include "btrfs_inode.h"
42 #include "ordered-data.h"
43 #include "xattr.h"
44 #include "tree-log.h"
45 #include "bio.h"
46 #include "compression.h"
47 #include "locking.h"
48 #include "props.h"
49 #include "qgroup.h"
50 #include "delalloc-space.h"
51 #include "block-group.h"
52 #include "space-info.h"
53 #include "zoned.h"
54 #include "subpage.h"
55 #include "inode-item.h"
56 #include "fs.h"
57 #include "accessors.h"
58 #include "extent-tree.h"
59 #include "root-tree.h"
60 #include "defrag.h"
61 #include "dir-item.h"
62 #include "file-item.h"
63 #include "uuid-tree.h"
64 #include "ioctl.h"
65 #include "file.h"
66 #include "acl.h"
67 #include "relocation.h"
68 #include "verity.h"
69 #include "super.h"
70 #include "orphan.h"
71 #include "backref.h"
72 #include "raid-stripe-tree.h"
73 #include "fiemap.h"
74
75 #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0)
76 #define COW_FILE_RANGE_NO_INLINE (1UL << 1)
77
78 struct btrfs_iget_args {
79 u64 ino;
80 struct btrfs_root *root;
81 };
82
83 struct btrfs_rename_ctx {
84 /* Output field. Stores the index number of the old directory entry. */
85 u64 index;
86 };
87
88 /*
89 * Used by data_reloc_print_warning_inode() to pass needed info for filename
90 * resolution and output of error message.
91 */
92 struct data_reloc_warn {
93 struct btrfs_path path;
94 struct btrfs_fs_info *fs_info;
95 u64 extent_item_size;
96 u64 logical;
97 int mirror_num;
98 };
99
/*
 * For the file_extent_tree, we want to hold the inode lock when we look up and
 * update the disk_i_size, but lockdep will complain because with our io_tree
 * we hold the tree lock and then take the inode lock when setting delalloc.
 * These two things are unrelated, so make a lockdep class for the
 * file_extent_tree so we don't get the two locking patterns mixed up.
 */
107 static struct lock_class_key file_extent_tree_class;
108
109 static const struct inode_operations btrfs_dir_inode_operations;
110 static const struct inode_operations btrfs_symlink_inode_operations;
111 static const struct inode_operations btrfs_special_inode_operations;
112 static const struct inode_operations btrfs_file_inode_operations;
113 static const struct address_space_operations btrfs_aops;
114 static const struct file_operations btrfs_dir_file_operations;
115
116 static struct kmem_cache *btrfs_inode_cachep;
117
118 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
119 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
120
121 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
122 struct folio *locked_folio, u64 start,
123 u64 end, struct writeback_control *wbc,
124 bool pages_dirty);
125
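/*
 * Backref walk callback used by print_data_reloc_error(): resolve the file
 * name(s) of one inode referencing the corrupted extent and print a checksum
 * error warning for each resolved path.
 */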
static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
					  u64 root, void *warn_ctx)
128 {
129 struct data_reloc_warn *warn = warn_ctx;
130 struct btrfs_fs_info *fs_info = warn->fs_info;
131 struct extent_buffer *eb;
132 struct btrfs_inode_item *inode_item;
133 struct inode_fs_paths *ipath = NULL;
134 struct btrfs_root *local_root;
135 struct btrfs_key key;
136 unsigned int nofs_flag;
137 u32 nlink;
138 int ret;
139
140 local_root = btrfs_get_fs_root(fs_info, root, true);
141 if (IS_ERR(local_root)) {
142 ret = PTR_ERR(local_root);
143 goto err;
144 }
145
146 /* This makes the path point to (inum INODE_ITEM ioff). */
147 key.objectid = inum;
148 key.type = BTRFS_INODE_ITEM_KEY;
149 key.offset = 0;
150
151 ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
152 if (ret) {
153 btrfs_put_root(local_root);
154 btrfs_release_path(&warn->path);
155 goto err;
156 }
157
158 eb = warn->path.nodes[0];
159 inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
160 nlink = btrfs_inode_nlink(eb, inode_item);
161 btrfs_release_path(&warn->path);
162
163 nofs_flag = memalloc_nofs_save();
164 ipath = init_ipath(4096, local_root, &warn->path);
165 memalloc_nofs_restore(nofs_flag);
166 if (IS_ERR(ipath)) {
167 btrfs_put_root(local_root);
168 ret = PTR_ERR(ipath);
169 ipath = NULL;
/*
 * -ENOMEM is not a critical error, just output a generic error
 * without the filename.
 */
174 btrfs_warn(fs_info,
175 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
176 warn->logical, warn->mirror_num, root, inum, offset);
177 return ret;
178 }
179 ret = paths_from_inode(inum, ipath);
180 if (ret < 0) {
181 btrfs_put_root(local_root);
182 goto err;
183 }
184
/*
 * We deliberately ignore the fact that ipath might have been too small
 * to hold all of the paths here.
 */
189 for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
190 btrfs_warn(fs_info,
191 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
192 warn->logical, warn->mirror_num, root, inum, offset,
193 fs_info->sectorsize, nlink,
194 (char *)(unsigned long)ipath->fspath->val[i]);
195 }
196
197 btrfs_put_root(local_root);
198 free_ipath(ipath);
199 return 0;
200
201 err:
202 btrfs_warn(fs_info,
203 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
204 warn->logical, warn->mirror_num, root, inum, offset, ret);
205
206 free_ipath(ipath);
207 return ret;
208 }
209
/*
 * Do extra user-friendly error output (e.g. lookup all the affected files).
 *
 * If the backref lookup fails we just fall back to the plain checksum error
 * message.
 */
static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
				   const u8 *csum, const u8 *csum_expected,
				   int mirror_num)
219 {
220 struct btrfs_fs_info *fs_info = inode->root->fs_info;
221 struct btrfs_path path = { 0 };
222 struct btrfs_key found_key = { 0 };
223 struct extent_buffer *eb;
224 struct btrfs_extent_item *ei;
225 const u32 csum_size = fs_info->csum_size;
226 u64 logical;
227 u64 flags;
228 u32 item_size;
229 int ret;
230
231 mutex_lock(&fs_info->reloc_mutex);
232 logical = btrfs_get_reloc_bg_bytenr(fs_info);
233 mutex_unlock(&fs_info->reloc_mutex);
234
235 if (logical == U64_MAX) {
236 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
237 btrfs_warn_rl(fs_info,
238 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
239 btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
240 CSUM_FMT_VALUE(csum_size, csum),
241 CSUM_FMT_VALUE(csum_size, csum_expected),
242 mirror_num);
243 return;
244 }
245
246 logical += file_off;
247 btrfs_warn_rl(fs_info,
248 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
249 btrfs_root_id(inode->root),
250 btrfs_ino(inode), file_off, logical,
251 CSUM_FMT_VALUE(csum_size, csum),
252 CSUM_FMT_VALUE(csum_size, csum_expected),
253 mirror_num);
254
255 ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
256 if (ret < 0) {
257 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
258 logical, ret);
259 return;
260 }
261 eb = path.nodes[0];
262 ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
263 item_size = btrfs_item_size(eb, path.slots[0]);
264 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
265 unsigned long ptr = 0;
266 u64 ref_root;
267 u8 ref_level;
268
269 while (true) {
270 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
271 item_size, &ref_root,
272 &ref_level);
273 if (ret < 0) {
274 btrfs_warn_rl(fs_info,
275 "failed to resolve tree backref for logical %llu: %d",
276 logical, ret);
277 break;
278 }
279 if (ret > 0)
280 break;
281
282 btrfs_warn_rl(fs_info,
283 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
284 logical, mirror_num,
285 (ref_level ? "node" : "leaf"),
286 ref_level, ref_root);
287 }
288 btrfs_release_path(&path);
289 } else {
290 struct btrfs_backref_walk_ctx ctx = { 0 };
291 struct data_reloc_warn reloc_warn = { 0 };
292
293 btrfs_release_path(&path);
294
295 ctx.bytenr = found_key.objectid;
296 ctx.extent_item_pos = logical - found_key.objectid;
297 ctx.fs_info = fs_info;
298
299 reloc_warn.logical = logical;
300 reloc_warn.extent_item_size = found_key.offset;
301 reloc_warn.mirror_num = mirror_num;
302 reloc_warn.fs_info = fs_info;
303
304 iterate_extent_inodes(&ctx, true,
305 data_reloc_print_warning_inode, &reloc_warn);
306 }
307 }
308
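/*
 * Print a rate limited warning about a data checksum mismatch. For the data
 * relocation tree, defer to print_data_reloc_error() so the original files
 * are reported instead of the relocation tree inode.
 */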
static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
		u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
311 {
312 struct btrfs_root *root = inode->root;
313 const u32 csum_size = root->fs_info->csum_size;
314
315 /* For data reloc tree, it's better to do a backref lookup instead. */
316 if (btrfs_is_data_reloc_root(root))
317 return print_data_reloc_error(inode, logical_start, csum,
318 csum_expected, mirror_num);
319
320 /* Output without objectid, which is more meaningful */
321 if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
322 btrfs_warn_rl(root->fs_info,
323 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
324 btrfs_root_id(root), btrfs_ino(inode),
325 logical_start,
326 CSUM_FMT_VALUE(csum_size, csum),
327 CSUM_FMT_VALUE(csum_size, csum_expected),
328 mirror_num);
329 } else {
330 btrfs_warn_rl(root->fs_info,
331 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
332 btrfs_root_id(root), btrfs_ino(inode),
333 logical_start,
334 CSUM_FMT_VALUE(csum_size, csum),
335 CSUM_FMT_VALUE(csum_size, csum_expected),
336 mirror_num);
337 }
338 }
339
/*
 * Lock inode i_rwsem based on arguments passed.
 *
 * ilock_flags can have the following bits set:
 *
 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
 * BTRFS_ILOCK_TRY - try to acquire the lock, and if it fails on the first
 *                   attempt return -EAGAIN
 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
 */
int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
351 {
352 if (ilock_flags & BTRFS_ILOCK_SHARED) {
353 if (ilock_flags & BTRFS_ILOCK_TRY) {
354 if (!inode_trylock_shared(&inode->vfs_inode))
355 return -EAGAIN;
356 else
357 return 0;
358 }
359 inode_lock_shared(&inode->vfs_inode);
360 } else {
361 if (ilock_flags & BTRFS_ILOCK_TRY) {
362 if (!inode_trylock(&inode->vfs_inode))
363 return -EAGAIN;
364 else
365 return 0;
366 }
367 inode_lock(&inode->vfs_inode);
368 }
369 if (ilock_flags & BTRFS_ILOCK_MMAP)
370 down_write(&inode->i_mmap_lock);
371 return 0;
372 }
373
374 /*
375 * Unlock inode i_rwsem.
376 *
377 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
378 * to decide whether the lock acquired is shared or exclusive.
379 */
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
381 {
382 if (ilock_flags & BTRFS_ILOCK_MMAP)
383 up_write(&inode->i_mmap_lock);
384 if (ilock_flags & BTRFS_ILOCK_SHARED)
385 inode_unlock_shared(&inode->vfs_inode);
386 else
387 inode_unlock(&inode->vfs_inode);
388 }
389
/*
 * Clean up all submitted ordered extents in the specified range to handle
 * errors from the btrfs_run_delalloc_range() callback.
 *
 * NOTE: the caller must ensure that when an error happens, it does not call
 * extent_clear_unlock_delalloc() with both EXTENT_DO_ACCOUNTING and
 * EXTENT_DELALLOC set, because that causes the reserved metadata to be
 * released, which we want to happen only when finishing the ordered extent
 * (btrfs_finish_ordered_io()).
 */
static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
						 u64 offset, u64 bytes)
402 {
403 pgoff_t index = offset >> PAGE_SHIFT;
404 const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
405 struct folio *folio;
406
407 while (index <= end_index) {
408 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
409 if (IS_ERR(folio)) {
410 index++;
411 continue;
412 }
413
414 index = folio_end(folio) >> PAGE_SHIFT;
415 /*
416 * Here we just clear all Ordered bits for every page in the
417 * range, then btrfs_mark_ordered_io_finished() will handle
418 * the ordered extent accounting for the range.
419 */
420 btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
421 offset, bytes);
422 folio_put(folio);
423 }
424
425 return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
426 }
427
428 static int btrfs_dirty_inode(struct btrfs_inode *inode);
429
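/*
 * Apply the default and access ACLs (if any) from @args and initialize the
 * security xattrs for a newly created inode.
 */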
static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
				     struct btrfs_new_inode_args *args)
432 {
433 int ret;
434
435 if (args->default_acl) {
436 ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
437 ACL_TYPE_DEFAULT);
438 if (ret)
439 return ret;
440 }
441 if (args->acl) {
442 ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
443 if (ret)
444 return ret;
445 }
446 if (!args->default_acl && !args->acl)
447 cache_no_acl(args->inode);
448 return btrfs_xattr_security_init(trans, args->inode, args->dir,
449 &args->dentry->d_name);
450 }
451
/*
 * This does all the hard work of inserting an inline extent into the btree.
 * The caller must have already done a btrfs_drop_extents() so that no
 * overlapping inline items exist in the btree.
 */
static int insert_inline_extent(struct btrfs_trans_handle *trans,
				struct btrfs_path *path,
				struct btrfs_inode *inode, bool extent_inserted,
				size_t size, size_t compressed_size,
				int compress_type,
				struct folio *compressed_folio,
				bool update_i_size)
464 {
465 struct btrfs_root *root = inode->root;
466 struct extent_buffer *leaf;
467 const u32 sectorsize = trans->fs_info->sectorsize;
468 char *kaddr;
469 unsigned long ptr;
470 struct btrfs_file_extent_item *ei;
471 int ret;
472 size_t cur_size = size;
473 u64 i_size;
474
475 /*
476 * The decompressed size must still be no larger than a sector. Under
477 * heavy race, we can have size == 0 passed in, but that shouldn't be a
478 * big deal and we can continue the insertion.
479 */
480 ASSERT(size <= sectorsize);
481
482 /*
483 * The compressed size also needs to be no larger than a sector.
484 * That's also why we only need one page as the parameter.
485 */
486 if (compressed_folio)
487 ASSERT(compressed_size <= sectorsize);
488 else
489 ASSERT(compressed_size == 0);
490
491 if (compressed_size && compressed_folio)
492 cur_size = compressed_size;
493
494 if (!extent_inserted) {
495 struct btrfs_key key;
496 size_t datasize;
497
498 key.objectid = btrfs_ino(inode);
499 key.type = BTRFS_EXTENT_DATA_KEY;
500 key.offset = 0;
501
502 datasize = btrfs_file_extent_calc_inline_size(cur_size);
503 ret = btrfs_insert_empty_item(trans, root, path, &key,
504 datasize);
505 if (ret)
506 goto fail;
507 }
508 leaf = path->nodes[0];
509 ei = btrfs_item_ptr(leaf, path->slots[0],
510 struct btrfs_file_extent_item);
511 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
512 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
513 btrfs_set_file_extent_encryption(leaf, ei, 0);
514 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
515 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
516 ptr = btrfs_file_extent_inline_start(ei);
517
518 if (compress_type != BTRFS_COMPRESS_NONE) {
519 kaddr = kmap_local_folio(compressed_folio, 0);
520 write_extent_buffer(leaf, kaddr, ptr, compressed_size);
521 kunmap_local(kaddr);
522
523 btrfs_set_file_extent_compression(leaf, ei,
524 compress_type);
525 } else {
526 struct folio *folio;
527
528 folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
529 ASSERT(!IS_ERR(folio));
530 btrfs_set_file_extent_compression(leaf, ei, 0);
531 kaddr = kmap_local_folio(folio, 0);
532 write_extent_buffer(leaf, kaddr, ptr, size);
533 kunmap_local(kaddr);
534 folio_put(folio);
535 }
536 btrfs_release_path(path);
537
/*
 * We align size to sectorsize for inline extents just for simplicity's
 * sake.
 */
542 ret = btrfs_inode_set_file_extent_range(inode, 0,
543 ALIGN(size, root->fs_info->sectorsize));
544 if (ret)
545 goto fail;
546
547 /*
548 * We're an inline extent, so nobody can extend the file past i_size
549 * without locking a page we already have locked.
550 *
551 * We must do any i_size and inode updates before we unlock the pages.
552 * Otherwise we could end up racing with unlink.
553 */
554 i_size = i_size_read(&inode->vfs_inode);
555 if (update_i_size && size > i_size) {
556 i_size_write(&inode->vfs_inode, size);
557 i_size = size;
558 }
559 inode->disk_i_size = i_size;
560
561 fail:
562 return ret;
563 }
564
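/*
 * Check whether a file range qualifies for an inline extent: it must start at
 * offset 0, fit within one block and the max_inline limit, and cover the
 * whole file (i_size).
 */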
static bool can_cow_file_range_inline(struct btrfs_inode *inode,
				      u64 offset, u64 size,
				      size_t compressed_size)
568 {
569 struct btrfs_fs_info *fs_info = inode->root->fs_info;
570 u64 data_len = (compressed_size ?: size);
571
572 /* Inline extents must start at offset 0. */
573 if (offset != 0)
574 return false;
575
576 /* Inline extents are limited to sectorsize. */
577 if (size > fs_info->sectorsize)
578 return false;
579
580 /* We do not allow a non-compressed extent to be as large as block size. */
581 if (data_len >= fs_info->sectorsize)
582 return false;
583
584 /* We cannot exceed the maximum inline data size. */
585 if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
586 return false;
587
588 /* We cannot exceed the user specified max_inline size. */
589 if (data_len > fs_info->max_inline)
590 return false;
591
592 /* Inline extents must be the entirety of the file. */
593 if (size < i_size_read(&inode->vfs_inode))
594 return false;
595
596 return true;
597 }
598
599 /*
600 * conditionally insert an inline extent into the file. This
601 * does the checks required to make sure the data is small enough
602 * to fit as an inline extent.
603 *
604 * If being used directly, you must have already checked we're allowed to cow
605 * the range by getting true from can_cow_file_range_inline().
606 */
static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
					    u64 size, size_t compressed_size,
					    int compress_type,
					    struct folio *compressed_folio,
					    bool update_i_size)
612 {
613 struct btrfs_drop_extents_args drop_args = { 0 };
614 struct btrfs_root *root = inode->root;
615 struct btrfs_fs_info *fs_info = root->fs_info;
616 struct btrfs_trans_handle *trans;
617 u64 data_len = (compressed_size ?: size);
618 int ret;
619 struct btrfs_path *path;
620
621 path = btrfs_alloc_path();
622 if (!path)
623 return -ENOMEM;
624
625 trans = btrfs_join_transaction(root);
626 if (IS_ERR(trans)) {
627 btrfs_free_path(path);
628 return PTR_ERR(trans);
629 }
630 trans->block_rsv = &inode->block_rsv;
631
632 drop_args.path = path;
633 drop_args.start = 0;
634 drop_args.end = fs_info->sectorsize;
635 drop_args.drop_cache = true;
636 drop_args.replace_extent = true;
637 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
638 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
639 if (unlikely(ret)) {
640 btrfs_abort_transaction(trans, ret);
641 goto out;
642 }
643
644 ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
645 size, compressed_size, compress_type,
646 compressed_folio, update_i_size);
647 if (unlikely(ret && ret != -ENOSPC)) {
648 btrfs_abort_transaction(trans, ret);
649 goto out;
650 } else if (ret == -ENOSPC) {
651 ret = 1;
652 goto out;
653 }
654
655 btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
656 ret = btrfs_update_inode(trans, inode);
657 if (unlikely(ret && ret != -ENOSPC)) {
658 btrfs_abort_transaction(trans, ret);
659 goto out;
660 } else if (ret == -ENOSPC) {
661 ret = 1;
662 goto out;
663 }
664
665 btrfs_set_inode_full_sync(inode);
666 out:
/*
 * Don't forget to free the reserved space: an inline extent doesn't
 * count as a data extent, so free the reservation directly here.
 * At reserve time the space is always aligned to the page size, so
 * just free one page here.
 */
673 btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
674 btrfs_free_path(path);
675 btrfs_end_transaction(trans);
676 return ret;
677 }
678
static noinline int cow_file_range_inline(struct btrfs_inode *inode,
					  struct folio *locked_folio,
					  u64 offset, u64 end,
					  size_t compressed_size,
					  int compress_type,
					  struct folio *compressed_folio,
					  bool update_i_size)
686 {
687 struct extent_state *cached = NULL;
688 unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
689 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
690 u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
691 int ret;
692
693 if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
694 return 1;
695
696 btrfs_lock_extent(&inode->io_tree, offset, end, &cached);
697 ret = __cow_file_range_inline(inode, size, compressed_size,
698 compress_type, compressed_folio,
699 update_i_size);
700 if (ret > 0) {
701 btrfs_unlock_extent(&inode->io_tree, offset, end, &cached);
702 return ret;
703 }
704
705 /*
706 * In the successful case (ret == 0 here), cow_file_range will return 1.
707 *
708 * Quite a bit further up the callstack in extent_writepage(), ret == 1
709 * is treated as a short circuited success and does not unlock the folio,
710 * so we must do it here.
711 *
712 * In the failure case, the locked_folio does get unlocked by
713 * btrfs_folio_end_all_writers, which asserts that it is still locked
714 * at that point, so we must *not* unlock it here.
715 *
716 * The other two callsites in compress_file_range do not have a
717 * locked_folio, so they are not relevant to this logic.
718 */
719 if (ret == 0)
720 locked_folio = NULL;
721
722 extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
723 clear_flags, PAGE_UNLOCK |
724 PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
725 return ret;
726 }
727
728 struct async_extent {
729 u64 start;
730 u64 ram_size;
731 u64 compressed_size;
732 struct folio **folios;
733 unsigned long nr_folios;
734 int compress_type;
735 struct list_head list;
736 };
737
738 struct async_chunk {
739 struct btrfs_inode *inode;
740 struct folio *locked_folio;
741 u64 start;
742 u64 end;
743 blk_opf_t write_flags;
744 struct list_head extents;
745 struct cgroup_subsys_state *blkcg_css;
746 struct btrfs_work work;
747 struct async_cow *async_cow;
748 };
749
750 struct async_cow {
751 atomic_t num_chunks;
752 struct async_chunk chunks[];
753 };
754
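/*
 * Allocate an async_extent describing a (possibly compressed) range and queue
 * it on the async_chunk's extent list for later submission.
 */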
static noinline int add_async_extent(struct async_chunk *cow,
				     u64 start, u64 ram_size,
				     u64 compressed_size,
				     struct folio **folios,
				     unsigned long nr_folios,
				     int compress_type)
761 {
762 struct async_extent *async_extent;
763
764 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
765 if (!async_extent)
766 return -ENOMEM;
767 async_extent->start = start;
768 async_extent->ram_size = ram_size;
769 async_extent->compressed_size = compressed_size;
770 async_extent->folios = folios;
771 async_extent->nr_folios = nr_folios;
772 async_extent->compress_type = compress_type;
773 list_add_tail(&async_extent->list, &cow->extents);
774 return 0;
775 }
776
777 /*
778 * Check if the inode needs to be submitted to compression, based on mount
779 * options, defragmentation, properties or heuristics.
780 */
static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
				      u64 end)
783 {
784 struct btrfs_fs_info *fs_info = inode->root->fs_info;
785
786 if (!btrfs_inode_can_compress(inode)) {
787 DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
788 return 0;
789 }
790
791 /* Defrag ioctl takes precedence over mount options and properties. */
792 if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
793 return 0;
794 if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
795 inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
796 return 1;
797 /* force compress */
798 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
799 return 1;
800 /* bad compression ratios */
801 if (inode->flags & BTRFS_INODE_NOCOMPRESS)
802 return 0;
803 if (btrfs_test_opt(fs_info, COMPRESS) ||
804 inode->flags & BTRFS_INODE_COMPRESS ||
805 inode->prop_compress)
806 return btrfs_compress_heuristic(inode, start, end);
807 return 0;
808 }
809
static inline void inode_should_defrag(struct btrfs_inode *inode,
				       u64 start, u64 end, u64 num_bytes, u32 small_write)
812 {
813 /* If this is a small write inside eof, kick off a defrag */
814 if (num_bytes < small_write &&
815 (start > 0 || end + 1 < inode->disk_i_size))
816 btrfs_add_inode_defrag(inode, small_write);
817 }
818
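/*
 * Clear the dirty flag for every folio in the range so that concurrent mmap
 * writes cannot change the contents while they are being compressed.
 * Returns the first folio lookup error, or 0 if all folios were found.
 */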
static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
820 {
821 const pgoff_t end_index = end >> PAGE_SHIFT;
822 struct folio *folio;
823 int ret = 0;
824
825 for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) {
826 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
827 if (IS_ERR(folio)) {
828 if (!ret)
829 ret = PTR_ERR(folio);
830 continue;
831 }
832 btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
833 end + 1 - start);
834 folio_put(folio);
835 }
836 return ret;
837 }
838
839 /*
 * Work queue callback to start compression on a file's pages.
841 *
842 * This is done inside an ordered work queue, and the compression is spread
843 * across many cpus. The actual IO submission is step two, and the ordered work
844 * queue takes care of making sure that happens in the same order things were
845 * put onto the queue by writepages and friends.
846 *
847 * If this code finds it can't get good compression, it puts an entry onto the
848 * work queue to write the uncompressed bytes. This makes sure that both
849 * compressed inodes and uncompressed inodes are written in the same order that
850 * the flusher thread sent them down.
851 */
static void compress_file_range(struct btrfs_work *work)
853 {
854 struct async_chunk *async_chunk =
855 container_of(work, struct async_chunk, work);
856 struct btrfs_inode *inode = async_chunk->inode;
857 struct btrfs_fs_info *fs_info = inode->root->fs_info;
858 struct address_space *mapping = inode->vfs_inode.i_mapping;
859 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
860 const u32 min_folio_size = btrfs_min_folio_size(fs_info);
861 u64 blocksize = fs_info->sectorsize;
862 u64 start = async_chunk->start;
863 u64 end = async_chunk->end;
864 u64 actual_end;
865 u64 i_size;
866 int ret = 0;
867 struct folio **folios;
868 unsigned long nr_folios;
869 unsigned long total_compressed = 0;
870 unsigned long total_in = 0;
871 unsigned int loff;
872 int i;
873 int compress_type = fs_info->compress_type;
874 int compress_level = fs_info->compress_level;
875
876 inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
877
878 /*
879 * We need to call clear_page_dirty_for_io on each page in the range.
880 * Otherwise applications with the file mmap'd can wander in and change
881 * the page contents while we are compressing them.
882 */
883 ret = extent_range_clear_dirty_for_io(inode, start, end);
884
885 /*
886 * All the folios should have been locked thus no failure.
887 *
888 * And even if some folios are missing, btrfs_compress_folios()
889 * would handle them correctly, so here just do an ASSERT() check for
890 * early logic errors.
891 */
892 ASSERT(ret == 0);
893
894 /*
895 * We need to save i_size before now because it could change in between
896 * us evaluating the size and assigning it. This is because we lock and
897 * unlock the page in truncate and fallocate, and then modify the i_size
898 * later on.
899 *
900 * The barriers are to emulate READ_ONCE, remove that once i_size_read
901 * does that for us.
902 */
903 barrier();
904 i_size = i_size_read(&inode->vfs_inode);
905 barrier();
906 actual_end = min_t(u64, i_size, end + 1);
907 again:
908 folios = NULL;
909 nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
910 nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
911
912 /*
913 * we don't want to send crud past the end of i_size through
914 * compression, that's just a waste of CPU time. So, if the
915 * end of the file is before the start of our current
916 * requested range of bytes, we bail out to the uncompressed
917 * cleanup code that can deal with all of this.
918 *
919 * It isn't really the fastest way to fix things, but this is a
920 * very uncommon corner.
921 */
922 if (actual_end <= start)
923 goto cleanup_and_bail_uncompressed;
924
925 total_compressed = actual_end - start;
926
/*
 * Skip compression for a small file range (<= blocksize) that
 * isn't an inline extent, since it doesn't save disk space at all.
 */
931 if (total_compressed <= blocksize &&
932 (start > 0 || end + 1 < inode->disk_i_size))
933 goto cleanup_and_bail_uncompressed;
934
935 total_compressed = min_t(unsigned long, total_compressed,
936 BTRFS_MAX_UNCOMPRESSED);
937 total_in = 0;
938 ret = 0;
939
940 /*
941 * We do compression for mount -o compress and when the inode has not
942 * been flagged as NOCOMPRESS. This flag can change at any time if we
943 * discover bad compression ratios.
944 */
945 if (!inode_need_compress(inode, start, end))
946 goto cleanup_and_bail_uncompressed;
947
948 folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
949 if (!folios) {
950 /*
951 * Memory allocation failure is not a fatal error, we can fall
952 * back to uncompressed code.
953 */
954 goto cleanup_and_bail_uncompressed;
955 }
956
957 if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
958 compress_type = inode->defrag_compress;
959 compress_level = inode->defrag_compress_level;
960 } else if (inode->prop_compress) {
961 compress_type = inode->prop_compress;
962 }
963
964 /* Compression level is applied here. */
965 ret = btrfs_compress_folios(compress_type, compress_level,
966 inode, start, folios, &nr_folios, &total_in,
967 &total_compressed);
968 if (ret)
969 goto mark_incompressible;
970
971 /*
972 * Zero the tail end of the last folio, as we might be sending it down
973 * to disk.
974 */
975 loff = (total_compressed & (min_folio_size - 1));
976 if (loff)
977 folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
978
979 /*
980 * Try to create an inline extent.
981 *
982 * If we didn't compress the entire range, try to create an uncompressed
983 * inline extent, else a compressed one.
984 *
985 * Check cow_file_range() for why we don't even try to create inline
986 * extent for the subpage case.
987 */
988 if (total_in < actual_end)
989 ret = cow_file_range_inline(inode, NULL, start, end, 0,
990 BTRFS_COMPRESS_NONE, NULL, false);
991 else
992 ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
993 compress_type, folios[0], false);
994 if (ret <= 0) {
995 if (ret < 0)
996 mapping_set_error(mapping, -EIO);
997 goto free_pages;
998 }
999
1000 /*
1001 * We aren't doing an inline extent. Round the compressed size up to a
1002 * block size boundary so the allocator does sane things.
1003 */
1004 total_compressed = ALIGN(total_compressed, blocksize);
1005
1006 /*
1007 * One last check to make sure the compression is really a win, compare
1008 * the page count read with the blocks on disk, compression must free at
1009 * least one sector.
1010 */
1011 total_in = round_up(total_in, fs_info->sectorsize);
1012 if (total_compressed + blocksize > total_in)
1013 goto mark_incompressible;
1014
1015 /*
1016 * The async work queues will take care of doing actual allocation on
1017 * disk for these compressed pages, and will submit the bios.
1018 */
1019 ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
1020 nr_folios, compress_type);
1021 BUG_ON(ret);
1022 if (start + total_in < end) {
1023 start += total_in;
1024 cond_resched();
1025 goto again;
1026 }
1027 return;
1028
1029 mark_incompressible:
1030 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1031 inode->flags |= BTRFS_INODE_NOCOMPRESS;
1032 cleanup_and_bail_uncompressed:
1033 ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1034 BTRFS_COMPRESS_NONE);
1035 BUG_ON(ret);
1036 free_pages:
1037 if (folios) {
1038 for (i = 0; i < nr_folios; i++) {
1039 WARN_ON(folios[i]->mapping);
1040 btrfs_free_compr_folio(folios[i]);
1041 }
1042 kfree(folios);
1043 }
1044 }
1045
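/* Free the compressed folios attached to an async_extent. */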
static void free_async_extent_pages(struct async_extent *async_extent)
1047 {
1048 int i;
1049
1050 if (!async_extent->folios)
1051 return;
1052
1053 for (i = 0; i < async_extent->nr_folios; i++) {
1054 WARN_ON(async_extent->folios[i]->mapping);
1055 btrfs_free_compr_folio(async_extent->folios[i]);
1056 }
1057 kfree(async_extent->folios);
1058 async_extent->nr_folios = 0;
1059 async_extent->folios = NULL;
1060 }
1061
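/*
 * Write back an async extent's range without compression by running the
 * regular COW path with a private writeback_control.
 */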
static void submit_uncompressed_range(struct btrfs_inode *inode,
				      struct async_extent *async_extent,
				      struct folio *locked_folio)
1065 {
1066 u64 start = async_extent->start;
1067 u64 end = async_extent->start + async_extent->ram_size - 1;
1068 int ret;
1069 struct writeback_control wbc = {
1070 .sync_mode = WB_SYNC_ALL,
1071 .range_start = start,
1072 .range_end = end,
1073 .no_cgroup_owner = 1,
1074 };
1075
1076 wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1077 ret = run_delalloc_cow(inode, locked_folio, start, end,
1078 &wbc, false);
1079 wbc_detach_inode(&wbc);
1080 if (ret < 0) {
1081 if (locked_folio)
1082 btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
1083 start, async_extent->ram_size);
1084 btrfs_err_rl(inode->root->fs_info,
1085 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
1086 __func__, btrfs_root_id(inode->root),
1087 btrfs_ino(inode), start, async_extent->ram_size, ret);
1088 }
1089 }
1090
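/*
 * Reserve disk space for one compressed async extent, create its extent map
 * and ordered extent, and submit the compressed write. Falls back to the
 * uncompressed path if the extent was not compressed or if the reservation
 * fails.
 */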
static void submit_one_async_extent(struct async_chunk *async_chunk,
				    struct async_extent *async_extent,
				    u64 *alloc_hint)
1094 {
1095 struct btrfs_inode *inode = async_chunk->inode;
1096 struct extent_io_tree *io_tree = &inode->io_tree;
1097 struct btrfs_root *root = inode->root;
1098 struct btrfs_fs_info *fs_info = root->fs_info;
1099 struct btrfs_ordered_extent *ordered;
1100 struct btrfs_file_extent file_extent;
1101 struct btrfs_key ins;
1102 struct folio *locked_folio = NULL;
1103 struct extent_state *cached = NULL;
1104 struct extent_map *em;
1105 int ret = 0;
1106 bool free_pages = false;
1107 u64 start = async_extent->start;
1108 u64 end = async_extent->start + async_extent->ram_size - 1;
1109
1110 if (async_chunk->blkcg_css)
1111 kthread_associate_blkcg(async_chunk->blkcg_css);
1112
1113 /*
1114 * If async_chunk->locked_folio is in the async_extent range, we need to
1115 * handle it.
1116 */
1117 if (async_chunk->locked_folio) {
1118 u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
1119 u64 locked_folio_end = locked_folio_start +
1120 folio_size(async_chunk->locked_folio) - 1;
1121
1122 if (!(start >= locked_folio_end || end <= locked_folio_start))
1123 locked_folio = async_chunk->locked_folio;
1124 }
1125
1126 if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1127 ASSERT(!async_extent->folios);
1128 ASSERT(async_extent->nr_folios == 0);
1129 submit_uncompressed_range(inode, async_extent, locked_folio);
1130 free_pages = true;
1131 goto done;
1132 }
1133
1134 ret = btrfs_reserve_extent(root, async_extent->ram_size,
1135 async_extent->compressed_size,
1136 async_extent->compressed_size,
1137 0, *alloc_hint, &ins, 1, 1);
1138 if (ret) {
1139 /*
1140 * We can't reserve contiguous space for the compressed size.
1141 * Unlikely, but it's possible that we could have enough
1142 * non-contiguous space for the uncompressed size instead. So
1143 * fall back to uncompressed.
1144 */
1145 submit_uncompressed_range(inode, async_extent, locked_folio);
1146 free_pages = true;
1147 goto done;
1148 }
1149
1150 btrfs_lock_extent(io_tree, start, end, &cached);
1151
1152 /* Here we're doing allocation and writeback of the compressed pages */
1153 file_extent.disk_bytenr = ins.objectid;
1154 file_extent.disk_num_bytes = ins.offset;
1155 file_extent.ram_bytes = async_extent->ram_size;
1156 file_extent.num_bytes = async_extent->ram_size;
1157 file_extent.offset = 0;
1158 file_extent.compression = async_extent->compress_type;
1159
1160 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
1161 if (IS_ERR(em)) {
1162 ret = PTR_ERR(em);
1163 goto out_free_reserve;
1164 }
1165 btrfs_free_extent_map(em);
1166
1167 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1168 1U << BTRFS_ORDERED_COMPRESSED);
1169 if (IS_ERR(ordered)) {
1170 btrfs_drop_extent_map_range(inode, start, end, false);
1171 ret = PTR_ERR(ordered);
1172 goto out_free_reserve;
1173 }
1174 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1175
1176 /* Clear dirty, set writeback and unlock the pages. */
1177 extent_clear_unlock_delalloc(inode, start, end,
1178 NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
1179 PAGE_UNLOCK | PAGE_START_WRITEBACK);
1180 btrfs_submit_compressed_write(ordered,
1181 async_extent->folios, /* compressed_folios */
1182 async_extent->nr_folios,
1183 async_chunk->write_flags, true);
1184 *alloc_hint = ins.objectid + ins.offset;
1185 done:
1186 if (async_chunk->blkcg_css)
1187 kthread_associate_blkcg(NULL);
1188 if (free_pages)
1189 free_async_extent_pages(async_extent);
1190 kfree(async_extent);
1191 return;
1192
1193 out_free_reserve:
1194 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1195 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
1196 mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1197 extent_clear_unlock_delalloc(inode, start, end,
1198 NULL, &cached,
1199 EXTENT_LOCKED | EXTENT_DELALLOC |
1200 EXTENT_DELALLOC_NEW |
1201 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1202 PAGE_UNLOCK | PAGE_START_WRITEBACK |
1203 PAGE_END_WRITEBACK);
1204 free_async_extent_pages(async_extent);
1205 if (async_chunk->blkcg_css)
1206 kthread_associate_blkcg(NULL);
1207 btrfs_debug(fs_info,
1208 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1209 btrfs_root_id(root), btrfs_ino(inode), start,
1210 async_extent->ram_size, ret);
1211 kfree(async_extent);
1212 }
1213
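/*
 * Pick a disk allocation hint for a new extent, preferring the block start of
 * an existing extent map covering @start (or of the first mapped extent of
 * the inode). Returns 0 if there is no usable hint.
 */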
u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
				     u64 num_bytes)
1216 {
1217 struct extent_map_tree *em_tree = &inode->extent_tree;
1218 struct extent_map *em;
1219 u64 alloc_hint = 0;
1220
1221 read_lock(&em_tree->lock);
1222 em = btrfs_search_extent_mapping(em_tree, start, num_bytes);
1223 if (em) {
1224 /*
1225 * if block start isn't an actual block number then find the
1226 * first block in this inode and use that as a hint. If that
1227 * block is also bogus then just don't worry about it.
1228 */
1229 if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
1230 btrfs_free_extent_map(em);
1231 em = btrfs_search_extent_mapping(em_tree, 0, 0);
1232 if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
1233 alloc_hint = btrfs_extent_map_block_start(em);
1234 if (em)
1235 btrfs_free_extent_map(em);
1236 } else {
1237 alloc_hint = btrfs_extent_map_block_start(em);
1238 btrfs_free_extent_map(em);
1239 }
1240 }
1241 read_unlock(&em_tree->lock);
1242
1243 return alloc_hint;
1244 }
1245
1246 /*
1247 * when extent_io.c finds a delayed allocation range in the file,
 * the callbacks end up in this code. The basic idea is to
1249 * allocate extents on disk for the range, and create ordered data structs
1250 * in ram to track those extents.
1251 *
1252 * locked_folio is the folio that writepage had locked already. We use
1253 * it to make sure we don't do extra locks or unlocks.
1254 *
1255 * When this function fails, it unlocks all folios except @locked_folio.
1256 *
1257 * When this function successfully creates an inline extent, it returns 1 and
1258 * unlocks all folios including locked_folio and starts I/O on them.
1259 * (In reality inline extents are limited to a single block, so locked_folio is
1260 * the only folio handled anyway).
1261 *
 * When this function succeeds and creates a normal extent, the folio locking
1263 * status depends on the passed in flags:
1264 *
1265 * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked.
1266 * - Else all folios except for @locked_folio are unlocked.
1267 *
1268 * When a failure happens in the second or later iteration of the
1269 * while-loop, the ordered extents created in previous iterations are cleaned up.
1270 */
static noinline int cow_file_range(struct btrfs_inode *inode,
				   struct folio *locked_folio, u64 start,
				   u64 end, u64 *done_offset,
				   unsigned long flags)
1275 {
1276 struct btrfs_root *root = inode->root;
1277 struct btrfs_fs_info *fs_info = root->fs_info;
1278 struct extent_state *cached = NULL;
1279 u64 alloc_hint = 0;
1280 u64 orig_start = start;
1281 u64 num_bytes;
1282 u64 cur_alloc_size = 0;
1283 u64 min_alloc_size;
1284 u64 blocksize = fs_info->sectorsize;
1285 struct btrfs_key ins;
1286 struct extent_map *em;
1287 unsigned clear_bits;
1288 unsigned long page_ops;
1289 int ret = 0;
1290
1291 if (btrfs_is_free_space_inode(inode)) {
1292 ret = -EINVAL;
1293 goto out_unlock;
1294 }
1295
1296 num_bytes = ALIGN(end - start + 1, blocksize);
1297 num_bytes = max(blocksize, num_bytes);
1298 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1299
1300 inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1301
1302 if (!(flags & COW_FILE_RANGE_NO_INLINE)) {
/* Let's try to make an inline extent. */
1304 ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
1305 BTRFS_COMPRESS_NONE, NULL, false);
1306 if (ret <= 0) {
1307 /*
1308 * We succeeded, return 1 so the caller knows we're done
1309 * with this page and already handled the IO.
1310 *
1311 * If there was an error then cow_file_range_inline() has
1312 * already done the cleanup.
1313 */
1314 if (ret == 0)
1315 ret = 1;
1316 goto done;
1317 }
1318 }
1319
1320 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
1321
1322 /*
1323 * We're not doing compressed IO, don't unlock the first page (which
1324 * the caller expects to stay locked), don't clear any dirty bits and
1325 * don't set any writeback bits.
1326 *
1327 * Do set the Ordered (Private2) bit so we know this page was properly
1328 * setup for writepage.
1329 */
1330 page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
1331 page_ops |= PAGE_SET_ORDERED;
1332
1333 /*
1334 * Relocation relies on the relocated extents to have exactly the same
1335 * size as the original extents. Normally writeback for relocation data
1336 * extents follows a NOCOW path because relocation preallocates the
1337 * extents. However, due to an operation such as scrub turning a block
1338 * group to RO mode, it may fallback to COW mode, so we must make sure
1339 * an extent allocated during COW has exactly the requested size and can
1340 * not be split into smaller extents, otherwise relocation breaks and
1341 * fails during the stage where it updates the bytenr of file extent
1342 * items.
1343 */
1344 if (btrfs_is_data_reloc_root(root))
1345 min_alloc_size = num_bytes;
1346 else
1347 min_alloc_size = fs_info->sectorsize;
1348
1349 while (num_bytes > 0) {
1350 struct btrfs_ordered_extent *ordered;
1351 struct btrfs_file_extent file_extent;
1352
1353 ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
1354 min_alloc_size, 0, alloc_hint,
1355 &ins, 1, 1);
1356 if (ret == -EAGAIN) {
1357 /*
1358 * btrfs_reserve_extent only returns -EAGAIN for zoned
1359 * file systems, which is an indication that there are
1360 * no active zones to allocate from at the moment.
1361 *
1362 * If this is the first loop iteration, wait for at
1363 * least one zone to finish before retrying the
1364 * allocation. Otherwise ask the caller to write out
1365 * the already allocated blocks before coming back to
1366 * us, or return -ENOSPC if it can't handle retries.
1367 */
1368 ASSERT(btrfs_is_zoned(fs_info));
1369 if (start == orig_start) {
1370 wait_on_bit_io(&inode->root->fs_info->flags,
1371 BTRFS_FS_NEED_ZONE_FINISH,
1372 TASK_UNINTERRUPTIBLE);
1373 continue;
1374 }
1375 if (done_offset) {
1376 /*
1377 * Move @end to the end of the processed range,
1378 * and exit the loop to unlock the processed extents.
1379 */
1380 end = start - 1;
1381 ret = 0;
1382 break;
1383 }
1384 ret = -ENOSPC;
1385 }
1386 if (ret < 0)
1387 goto out_unlock;
1388 cur_alloc_size = ins.offset;
1389
1390 file_extent.disk_bytenr = ins.objectid;
1391 file_extent.disk_num_bytes = ins.offset;
1392 file_extent.num_bytes = ins.offset;
1393 file_extent.ram_bytes = ins.offset;
1394 file_extent.offset = 0;
1395 file_extent.compression = BTRFS_COMPRESS_NONE;
1396
1397 /*
1398 * Locked range will be released either during error clean up or
1399 * after the whole range is finished.
1400 */
1401 btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
1402 &cached);
1403
1404 em = btrfs_create_io_em(inode, start, &file_extent,
1405 BTRFS_ORDERED_REGULAR);
1406 if (IS_ERR(em)) {
1407 btrfs_unlock_extent(&inode->io_tree, start,
1408 start + cur_alloc_size - 1, &cached);
1409 ret = PTR_ERR(em);
1410 goto out_reserve;
1411 }
1412 btrfs_free_extent_map(em);
1413
1414 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1415 1U << BTRFS_ORDERED_REGULAR);
1416 if (IS_ERR(ordered)) {
1417 btrfs_unlock_extent(&inode->io_tree, start,
1418 start + cur_alloc_size - 1, &cached);
1419 ret = PTR_ERR(ordered);
1420 goto out_drop_extent_cache;
1421 }
1422
1423 if (btrfs_is_data_reloc_root(root)) {
1424 ret = btrfs_reloc_clone_csums(ordered);
1425
1426 /*
1427 * Only drop cache here, and process as normal.
1428 *
1429 * We must not allow extent_clear_unlock_delalloc()
1430 * at out_unlock label to free meta of this ordered
1431 * extent, as its meta should be freed by
1432 * btrfs_finish_ordered_io().
1433 *
1434 * So we must continue until @start is increased to
1435 * skip current ordered extent.
1436 */
1437 if (ret)
1438 btrfs_drop_extent_map_range(inode, start,
1439 start + cur_alloc_size - 1,
1440 false);
1441 }
1442 btrfs_put_ordered_extent(ordered);
1443
1444 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1445
1446 if (num_bytes < cur_alloc_size)
1447 num_bytes = 0;
1448 else
1449 num_bytes -= cur_alloc_size;
1450 alloc_hint = ins.objectid + ins.offset;
1451 start += cur_alloc_size;
1452 cur_alloc_size = 0;
1453
/*
 * On a btrfs_reloc_clone_csums() error: since start has been increased,
 * extent_clear_unlock_delalloc() at the out_unlock label won't free the
 * metadata of the current ordered extent, so we're OK to exit.
 */
1459 if (ret)
1460 goto out_unlock;
1461 }
1462 extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
1463 EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
1464 done:
1465 if (done_offset)
1466 *done_offset = end;
1467 return ret;
1468
1469 out_drop_extent_cache:
1470 btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
1471 out_reserve:
1472 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1473 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
1474 out_unlock:
1475 /*
1476 * Now, we have three regions to clean up:
1477 *
1478 * |-------(1)----|---(2)---|-------------(3)----------|
1479 * `- orig_start `- start `- start + cur_alloc_size `- end
1480 *
1481 * We process each region below.
1482 */
1483
1484 /*
1485 * For the range (1). We have already instantiated the ordered extents
1486 * for this region, thus we need to cleanup those ordered extents.
1487 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
1488 * are also handled by the ordered extents cleanup.
1489 *
1490 * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and
 * finish the writeback of the involved folios, which will never be submitted.
1492 */
1493 if (orig_start < start) {
1494 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
1495 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1496
1497 if (!locked_folio)
1498 mapping_set_error(inode->vfs_inode.i_mapping, ret);
1499
1500 btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
1501 extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1502 locked_folio, NULL, clear_bits, page_ops);
1503 }
1504
1505 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1506 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1507 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1508
1509 /*
1510 * For the range (2). If we reserved an extent for our delalloc range
1511 * (or a subrange) and failed to create the respective ordered extent,
1512 * then it means that when we reserved the extent we decremented the
1513 * extent's size from the data space_info's bytes_may_use counter and
1514 * incremented the space_info's bytes_reserved counter by the same
1515 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1516 * to decrement again the data space_info's bytes_may_use counter,
1517 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1518 */
1519 if (cur_alloc_size) {
1520 extent_clear_unlock_delalloc(inode, start,
1521 start + cur_alloc_size - 1,
1522 locked_folio, &cached, clear_bits,
1523 page_ops);
1524 btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
1525 }
1526
1527 /*
1528 * For the range (3). We never touched the region. In addition to the
1529 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1530 * space_info's bytes_may_use counter, reserved in
1531 * btrfs_check_data_free_space().
1532 */
1533 if (start + cur_alloc_size < end) {
1534 clear_bits |= EXTENT_CLEAR_DATA_RESV;
1535 extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
1536 end, locked_folio,
1537 &cached, clear_bits, page_ops);
1538 btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
1539 end - start - cur_alloc_size + 1, NULL);
1540 }
1541 btrfs_err(fs_info,
1542 "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
1543 __func__, btrfs_root_id(inode->root),
1544 btrfs_ino(inode), orig_start, end + 1 - orig_start,
1545 start, cur_alloc_size, ret);
1546 return ret;
1547 }
1548
1549 /*
1550 * Phase two of compressed writeback. This is the ordered portion of the code,
1551 * which only gets called in the order the work was queued. We walk all the
1552 * async extents created by compress_file_range and send them down to the disk.
1553 *
1554 * If called with @do_free == true then it'll try to finish the work and free
1555 * the work struct eventually.
1556 */
static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
1558 {
1559 struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1560 work);
1561 struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1562 struct async_extent *async_extent;
1563 unsigned long nr_pages;
1564 u64 alloc_hint = 0;
1565
1566 if (do_free) {
1567 struct async_cow *async_cow;
1568
1569 btrfs_add_delayed_iput(async_chunk->inode);
1570 if (async_chunk->blkcg_css)
1571 css_put(async_chunk->blkcg_css);
1572
1573 async_cow = async_chunk->async_cow;
1574 if (atomic_dec_and_test(&async_cow->num_chunks))
1575 kvfree(async_cow);
1576 return;
1577 }
1578
1579 nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1580 PAGE_SHIFT;
1581
1582 while (!list_empty(&async_chunk->extents)) {
1583 async_extent = list_first_entry(&async_chunk->extents,
1584 struct async_extent, list);
1585 list_del(&async_extent->list);
1586 submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1587 }
1588
1589 /* atomic_sub_return implies a barrier */
1590 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1591 5 * SZ_1M)
1592 cond_wake_up_nomb(&fs_info->async_submit_wait);
1593 }
1594
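/*
 * Split the delalloc range into 512K chunks and queue each one for async
 * compression and submission on the delalloc workqueue. Returns false if the
 * context allocation fails, so the caller can fall back to plain COW.
 */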
static bool run_delalloc_compressed(struct btrfs_inode *inode,
				    struct folio *locked_folio, u64 start,
				    u64 end, struct writeback_control *wbc)
1598 {
1599 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1600 struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1601 struct async_cow *ctx;
1602 struct async_chunk *async_chunk;
1603 unsigned long nr_pages;
1604 u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1605 int i;
1606 unsigned nofs_flag;
1607 const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1608
1609 nofs_flag = memalloc_nofs_save();
1610 ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1611 memalloc_nofs_restore(nofs_flag);
1612 if (!ctx)
1613 return false;
1614
1615 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1616
1617 async_chunk = ctx->chunks;
1618 atomic_set(&ctx->num_chunks, num_chunks);
1619
1620 for (i = 0; i < num_chunks; i++) {
1621 u64 cur_end = min(end, start + SZ_512K - 1);
1622
1623 /*
1624 * igrab is called higher up in the call chain, take only the
1625 * lightweight reference for the callback lifetime
1626 */
1627 ihold(&inode->vfs_inode);
1628 async_chunk[i].async_cow = ctx;
1629 async_chunk[i].inode = inode;
1630 async_chunk[i].start = start;
1631 async_chunk[i].end = cur_end;
1632 async_chunk[i].write_flags = write_flags;
1633 INIT_LIST_HEAD(&async_chunk[i].extents);
1634
1635 /*
 * The locked_folio comes all the way from writepage and it's
 * the original folio we were actually given. As we spread
1638 * this large delalloc region across multiple async_chunk
1639 * structs, only the first struct needs a pointer to
1640 * locked_folio.
1641 *
1642 * This way we don't need racy decisions about who is supposed
1643 * to unlock it.
1644 */
1645 if (locked_folio) {
1646 /*
1647 * Depending on the compressibility, the pages might or
1648 * might not go through async. We want all of them to
1649 * be accounted against wbc once. Let's do it here
1650 * before the paths diverge. wbc accounting is used
1651 * only for foreign writeback detection and doesn't
1652 * need full accuracy. Just account the whole thing
1653 * against the first page.
1654 */
1655 wbc_account_cgroup_owner(wbc, locked_folio,
1656 cur_end - start);
1657 async_chunk[i].locked_folio = locked_folio;
1658 locked_folio = NULL;
1659 } else {
1660 async_chunk[i].locked_folio = NULL;
1661 }
1662
1663 if (blkcg_css != blkcg_root_css) {
1664 css_get(blkcg_css);
1665 async_chunk[i].blkcg_css = blkcg_css;
1666 async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1667 } else {
1668 async_chunk[i].blkcg_css = NULL;
1669 }
1670
1671 btrfs_init_work(&async_chunk[i].work, compress_file_range,
1672 submit_compressed_extents);
1673
1674 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1675 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1676
1677 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1678
1679 start = cur_end + 1;
1680 }
1681 return true;
1682 }
1683
1684 /*
1685 * Run the delalloc range from start to end, and write back any dirty pages
1686 * covered by the range.
1687 */
1688 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1689 struct folio *locked_folio, u64 start,
1690 u64 end, struct writeback_control *wbc,
1691 bool pages_dirty)
1692 {
1693 u64 done_offset = end;
1694 int ret;
1695
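	/*
	 * cow_file_range() may make only partial progress (@done_offset tells
	 * how far it got), so write back the allocated part and loop until the
	 * whole range is covered.
	 */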
1696 while (start <= end) {
1697 ret = cow_file_range(inode, locked_folio, start, end,
1698 &done_offset, COW_FILE_RANGE_KEEP_LOCKED);
1699 if (ret)
1700 return ret;
1701 extent_write_locked_range(&inode->vfs_inode, locked_folio,
1702 start, done_offset, wbc, pages_dirty);
1703 start = done_offset + 1;
1704 }
1705
1706 return 1;
1707 }
1708
1709 static int fallback_to_cow(struct btrfs_inode *inode,
1710 struct folio *locked_folio, const u64 start,
1711 const u64 end)
1712 {
1713 const bool is_space_ino = btrfs_is_free_space_inode(inode);
1714 const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1715 const u64 range_bytes = end + 1 - start;
1716 struct extent_io_tree *io_tree = &inode->io_tree;
1717 struct extent_state *cached_state = NULL;
1718 u64 range_start = start;
1719 u64 count;
1720 int ret;
1721
1722 /*
1723 * If EXTENT_NORESERVE is set it means that when the buffered write was
1724 * made we did not have enough available data space and therefore we did not
1725 * reserve data space for it, since we thought we could do NOCOW for the
1726 * respective file range (either there is a prealloc extent or the inode
1727 * has the NOCOW bit set).
1728 *
1729 * However when we need to fall back to COW mode (because for example the
1730 * block group for the corresponding extent was turned to RO mode by a
1731 * scrub or relocation) we need to do the following:
1732 *
1733 * 1) We increment the bytes_may_use counter of the data space info.
1734 * If COW succeeds, it allocates a new data extent and after doing
1735 * that it decrements the space info's bytes_may_use counter and
1736 * increments its bytes_reserved counter by the same amount (we do
1737 * this at btrfs_add_reserved_bytes()). So we need to increment the
1738 * bytes_may_use counter to compensate (when space is reserved at
1739 * buffered write time, the bytes_may_use counter is incremented);
1740 *
1741 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1742 * that if the COW path fails for any reason, it decrements (through
1743 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1744 * data space info, which we incremented in the step above.
1745 *
1746 * If we need to fall back to COW and the inode corresponds to a free
1747 * space cache inode or an inode of the data relocation tree, we must
1748 * also increment bytes_may_use of the data space_info for the same
1749 * reason. Space caches and relocated data extents always get a prealloc
1750 * extent for them, however scrub or balance may have set the block
1751 * group that contains that extent to RO mode and therefore force COW
1752 * when starting writeback.
1753 */
1754 btrfs_lock_extent(io_tree, start, end, &cached_state);
1755 count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
1756 EXTENT_NORESERVE, 0, NULL);
1757 if (count > 0 || is_space_ino || is_reloc_ino) {
1758 u64 bytes = count;
1759 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1760 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1761
1762 if (is_space_ino || is_reloc_ino)
1763 bytes = range_bytes;
1764
1765 spin_lock(&sinfo->lock);
1766 btrfs_space_info_update_bytes_may_use(sinfo, bytes);
1767 spin_unlock(&sinfo->lock);
1768
1769 if (count > 0)
1770 btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1771 &cached_state);
1772 }
1773 btrfs_unlock_extent(io_tree, start, end, &cached_state);
1774
1775 /*
1776 * Don't try to create inline extents, as a mix of inline extent that
1777 * is written out and unlocked directly and a normal NOCOW extent
1778 * doesn't work.
1779 *
1780 * And here we do not unlock the folio after a successful run.
1781 * The folios will be unlocked after everything is finished, or by error handling.
1782 *
1783 * This is to ensure error handling won't need to clear dirty/ordered flags without
1784 * a locked folio, which can race with writeback.
1785 */
1786 ret = cow_file_range(inode, locked_folio, start, end, NULL,
1787 COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED);
1788 ASSERT(ret != 1);
1789 return ret;
1790 }
1791
1792 struct can_nocow_file_extent_args {
1793 /* Input fields. */
1794
1795 /* Start file offset of the range we want to NOCOW. */
1796 u64 start;
1797 /* End file offset (inclusive) of the range we want to NOCOW. */
1798 u64 end;
1799 bool writeback_path;
1800 /*
1801 * Free the path passed to can_nocow_file_extent() once it's not needed
1802 * anymore.
1803 */
1804 bool free_path;
1805
1806 /*
1807 * Output fields. Only set when can_nocow_file_extent() returns 1.
1808 * The expected file extent for the NOCOW write.
1809 */
1810 struct btrfs_file_extent file_extent;
1811 };
1812
1813 /*
1814 * Check if we can NOCOW the file extent that the path points to.
1815 * This function may return with the path released, so the caller should check
1816 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1817 *
1818 * Returns: < 0 on error
1819 * 0 if we can not NOCOW
1820 * 1 if we can NOCOW
1821 */
1822 static int can_nocow_file_extent(struct btrfs_path *path,
1823 struct btrfs_key *key,
1824 struct btrfs_inode *inode,
1825 struct can_nocow_file_extent_args *args)
1826 {
1827 const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1828 struct extent_buffer *leaf = path->nodes[0];
1829 struct btrfs_root *root = inode->root;
1830 struct btrfs_file_extent_item *fi;
1831 struct btrfs_root *csum_root;
1832 u64 io_start;
1833 u64 extent_end;
1834 u8 extent_type;
1835 int can_nocow = 0;
1836 int ret = 0;
1837 bool nowait = path->nowait;
1838
1839 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1840 extent_type = btrfs_file_extent_type(leaf, fi);
1841
1842 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1843 goto out;
1844
1845 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1846 extent_type == BTRFS_FILE_EXTENT_REG)
1847 goto out;
1848
1849 /*
1850 * If the extent was created before the generation where the last snapshot
1851 * for its subvolume was created, then this implies the extent is shared,
1852 * hence we must COW.
1853 */
1854 if (btrfs_file_extent_generation(leaf, fi) <=
1855 btrfs_root_last_snapshot(&root->root_item))
1856 goto out;
1857
1858 /* An explicit hole, must COW. */
1859 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
1860 goto out;
1861
1862 /* Compressed/encrypted/encoded extents must be COWed. */
1863 if (btrfs_file_extent_compression(leaf, fi) ||
1864 btrfs_file_extent_encryption(leaf, fi) ||
1865 btrfs_file_extent_other_encoding(leaf, fi))
1866 goto out;
1867
1868 extent_end = btrfs_file_extent_end(path);
1869
1870 args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1871 args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1872 args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1873 args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
1874 args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
1875
1876 /*
1877 * The following checks can be expensive, as they need to take other
1878 * locks and do btree or rbtree searches, so release the path to avoid
1879 * blocking other tasks for too long.
1880 */
1881 btrfs_release_path(path);
1882
1883 ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
1884 args->file_extent.disk_bytenr, path);
1885 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1886 if (ret != 0)
1887 goto out;
1888
1889 if (args->free_path) {
1890 /*
1891 * We don't need the path anymore, plus through the
1892 * btrfs_lookup_csums_list() call below we will end up allocating
1893 * another path. So free the path to avoid unnecessary extra
1894 * memory usage.
1895 */
1896 btrfs_free_path(path);
1897 path = NULL;
1898 }
1899
1900 /* If there are pending snapshots for this root, we must COW. */
1901 if (args->writeback_path && !is_freespace_inode &&
1902 atomic_read(&root->snapshot_force_cow))
1903 goto out;
1904
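	/*
	 * Clamp the NOCOW length to the requested range and compute the
	 * physical start of the I/O, needed for the csum lookup below.
	 */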
1905 args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
1906 args->file_extent.offset += args->start - key->offset;
1907 io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
1908
1909 /*
1910 * Force COW if csums exist in the range. This ensures that csums for a
1911 * given extent are either valid or do not exist.
1912 */
1913
1914 csum_root = btrfs_csum_root(root->fs_info, io_start);
1915 ret = btrfs_lookup_csums_list(csum_root, io_start,
1916 io_start + args->file_extent.num_bytes - 1,
1917 NULL, nowait);
1918 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1919 if (ret != 0)
1920 goto out;
1921
1922 can_nocow = 1;
1923 out:
1924 if (args->free_path && path)
1925 btrfs_free_path(path);
1926
1927 return ret < 0 ? ret : can_nocow;
1928 }
1929
1930 static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
1931 struct extent_state **cached,
1932 struct can_nocow_file_extent_args *nocow_args,
1933 u64 file_pos, bool is_prealloc)
1934 {
1935 struct btrfs_ordered_extent *ordered;
1936 const u64 len = nocow_args->file_extent.num_bytes;
1937 const u64 end = file_pos + len - 1;
1938 int ret = 0;
1939
1940 btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
1941
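	/*
	 * Writing into a preallocated extent converts it to a regular one, so
	 * create an extent map reflecting the write now; for plain NOCOW
	 * writes the existing mapping is already correct.
	 */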
1942 if (is_prealloc) {
1943 struct extent_map *em;
1944
1945 em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
1946 BTRFS_ORDERED_PREALLOC);
1947 if (IS_ERR(em)) {
1948 ret = PTR_ERR(em);
1949 goto error;
1950 }
1951 btrfs_free_extent_map(em);
1952 }
1953
1954 ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
1955 is_prealloc
1956 ? (1U << BTRFS_ORDERED_PREALLOC)
1957 : (1U << BTRFS_ORDERED_NOCOW));
1958 if (IS_ERR(ordered)) {
1959 if (is_prealloc)
1960 btrfs_drop_extent_map_range(inode, file_pos, end, false);
1961 ret = PTR_ERR(ordered);
1962 goto error;
1963 }
1964
1965 if (btrfs_is_data_reloc_root(inode->root))
1966 /*
1967 * Errors are handled later, as we must prevent
1968 * extent_clear_unlock_delalloc() in error handler from freeing
1969 * metadata of the created ordered extent.
1970 */
1971 ret = btrfs_reloc_clone_csums(ordered);
1972 btrfs_put_ordered_extent(ordered);
1973
1974 if (ret < 0)
1975 goto error;
1976 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
1977 EXTENT_LOCKED | EXTENT_DELALLOC |
1978 EXTENT_CLEAR_DATA_RESV,
1979 PAGE_SET_ORDERED);
1980 return ret;
1981
1982 error:
1983 btrfs_cleanup_ordered_extents(inode, file_pos, len);
1984 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
1985 EXTENT_LOCKED | EXTENT_DELALLOC |
1986 EXTENT_CLEAR_DATA_RESV,
1987 PAGE_UNLOCK | PAGE_START_WRITEBACK |
1988 PAGE_END_WRITEBACK);
1989 btrfs_err(inode->root->fs_info,
1990 "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d",
1991 __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
1992 file_pos, len, ret);
1993 return ret;
1994 }
1995
1996 /*
1997 * Called for NOCOW writeback. This checks for snapshots or COW copies
1998 * of the extents that exist in the file, and COWs the file as required.
1999 *
2000 * If no COW copies or snapshots exist, we write directly to the existing
2001 * blocks on disk.
2002 */
2003 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
2004 struct folio *locked_folio,
2005 const u64 start, const u64 end)
2006 {
2007 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2008 struct btrfs_root *root = inode->root;
2009 struct btrfs_path *path;
2010 u64 cow_start = (u64)-1;
2011 /*
2012 * If not 0, represents the inclusive end of the last fallback_to_cow()
2013 * range. Only for error handling.
2014 *
2015 * The same applies to nocow_end, which avoids a double cleanup of a range
2016 * already cleaned by nocow_one_range().
2017 */
2018 u64 cow_end = 0;
2019 u64 nocow_end = 0;
2020 u64 cur_offset = start;
2021 int ret;
2022 bool check_prev = true;
2023 u64 ino = btrfs_ino(inode);
2024 struct can_nocow_file_extent_args nocow_args = { 0 };
2025 /* The range that has ordered extent(s). */
2026 u64 oe_cleanup_start;
2027 u64 oe_cleanup_len = 0;
2028 /* The range that is untouched. */
2029 u64 untouched_start;
2030 u64 untouched_len = 0;
2031
2032 /*
2033 * Normally on a zoned device we're only doing COW writes, but relocation
2034 * on a zoned filesystem serializes I/O so that we're only
2035 * writing sequentially and can end up here as well.
2036 */
2037 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
2038
2039 path = btrfs_alloc_path();
2040 if (!path) {
2041 ret = -ENOMEM;
2042 goto error;
2043 }
2044
2045 nocow_args.end = end;
2046 nocow_args.writeback_path = true;
2047
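	/*
	 * Walk the file extent items covering the range and decide, for each
	 * of them, whether it can be written NOCOW or must fall back to COW.
	 */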
2048 while (cur_offset <= end) {
2049 struct btrfs_block_group *nocow_bg = NULL;
2050 struct btrfs_key found_key;
2051 struct btrfs_file_extent_item *fi;
2052 struct extent_buffer *leaf;
2053 struct extent_state *cached_state = NULL;
2054 u64 extent_end;
2055 int extent_type;
2056
2057 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2058 cur_offset, 0);
2059 if (ret < 0)
2060 goto error;
2061
2062 /*
2063 * If there is no extent for our range when doing the initial
2064 * search, then go back to the previous slot as it will be the
2065 * one containing the search offset
2066 */
2067 if (ret > 0 && path->slots[0] > 0 && check_prev) {
2068 leaf = path->nodes[0];
2069 btrfs_item_key_to_cpu(leaf, &found_key,
2070 path->slots[0] - 1);
2071 if (found_key.objectid == ino &&
2072 found_key.type == BTRFS_EXTENT_DATA_KEY)
2073 path->slots[0]--;
2074 }
2075 check_prev = false;
2076 next_slot:
2077 /* Go to next leaf if we have exhausted the current one */
2078 leaf = path->nodes[0];
2079 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2080 ret = btrfs_next_leaf(root, path);
2081 if (ret < 0)
2082 goto error;
2083 if (ret > 0)
2084 break;
2085 leaf = path->nodes[0];
2086 }
2087
2088 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2089
2090 /* Didn't find anything for our INO */
2091 if (found_key.objectid > ino)
2092 break;
2093 /*
2094 * Keep searching until we find an EXTENT_ITEM or there are no
2095 * more extents for this inode
2096 */
2097 if (WARN_ON_ONCE(found_key.objectid < ino) ||
2098 found_key.type < BTRFS_EXTENT_DATA_KEY) {
2099 path->slots[0]++;
2100 goto next_slot;
2101 }
2102
2103 /* Found key is not EXTENT_DATA_KEY or starts after req range */
2104 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2105 found_key.offset > end)
2106 break;
2107
2108 /*
2109 * If the found extent starts after the requested offset, then the gap
2110 * before it needs COW. Record it and move cur_offset to the extent's start.
2111 */
2112 if (found_key.offset > cur_offset) {
2113 if (cow_start == (u64)-1)
2114 cow_start = cur_offset;
2115 cur_offset = found_key.offset;
2116 goto next_slot;
2117 }
2118
2119 /*
2120 * Found extent which begins before our range and potentially
2121 * intersect it
2122 */
2123 fi = btrfs_item_ptr(leaf, path->slots[0],
2124 struct btrfs_file_extent_item);
2125 extent_type = btrfs_file_extent_type(leaf, fi);
2126 /* If this is triggered then we have a memory corruption. */
2127 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2128 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2129 ret = -EUCLEAN;
2130 goto error;
2131 }
2132 extent_end = btrfs_file_extent_end(path);
2133
2134 /*
2135 * If the extent we got ends before our current offset, skip to
2136 * the next extent.
2137 */
2138 if (extent_end <= cur_offset) {
2139 path->slots[0]++;
2140 goto next_slot;
2141 }
2142
2143 nocow_args.start = cur_offset;
2144 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2145 if (ret < 0)
2146 goto error;
2147 if (ret == 0)
2148 goto must_cow;
2149
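		/* can_nocow_file_extent() returned 1, reset ret before continuing with the NOCOW path. */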
2150 ret = 0;
2151 nocow_bg = btrfs_inc_nocow_writers(fs_info,
2152 nocow_args.file_extent.disk_bytenr +
2153 nocow_args.file_extent.offset);
2154 if (!nocow_bg) {
2155 must_cow:
2156 /*
2157 * If we can't perform NOCOW writeback for the range,
2158 * then record the beginning of the range that needs to
2159 * be COWed. It will be written out before the next
2160 * NOCOW range if we find one, or when exiting this
2161 * loop.
2162 */
2163 if (cow_start == (u64)-1)
2164 cow_start = cur_offset;
2165 cur_offset = extent_end;
2166 if (cur_offset > end)
2167 break;
2168 if (!path->nodes[0])
2169 continue;
2170 path->slots[0]++;
2171 goto next_slot;
2172 }
2173
2174 /*
2175 * COW the range from cow_start to found_key.offset - 1, as the key
2176 * contains the beginning of the first extent that can be NOCOWed,
2177 * which follows a range that needs to be COWed.
2178 */
2179 if (cow_start != (u64)-1) {
2180 ret = fallback_to_cow(inode, locked_folio, cow_start,
2181 found_key.offset - 1);
2182 if (ret) {
2183 cow_end = found_key.offset - 1;
2184 btrfs_dec_nocow_writers(nocow_bg);
2185 goto error;
2186 }
2187 cow_start = (u64)-1;
2188 }
2189
2190 ret = nocow_one_range(inode, locked_folio, &cached_state,
2191 &nocow_args, cur_offset,
2192 extent_type == BTRFS_FILE_EXTENT_PREALLOC);
2193 btrfs_dec_nocow_writers(nocow_bg);
2194 if (ret < 0) {
2195 nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
2196 goto error;
2197 }
2198 cur_offset = extent_end;
2199 }
2200 btrfs_release_path(path);
2201
2202 if (cur_offset <= end && cow_start == (u64)-1)
2203 cow_start = cur_offset;
2204
2205 if (cow_start != (u64)-1) {
2206 ret = fallback_to_cow(inode, locked_folio, cow_start, end);
2207 if (ret) {
2208 cow_end = end;
2209 goto error;
2210 }
2211 cow_start = (u64)-1;
2212 }
2213
2214 /*
2215 * Everything is finished without an error, can unlock the folios now.
2216 *
2217 * No need to touch the io tree range nor set folio ordered flag, as
2218 * fallback_to_cow() and nocow_one_range() have already handled them.
2219 */
2220 extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK);
2221
2222 btrfs_free_path(path);
2223 return 0;
2224
2225 error:
2226 if (cow_start == (u64)-1) {
2227 /*
2228 * case a)
2229 * start cur_offset end
2230 * | OE cleanup | Untouched |
2231 *
2232 * We finished a fallback_to_cow() or nocow_one_range() call,
2233 * but failed to check the next range.
2234 *
2235 * or
2236 * start cur_offset nocow_end end
2237 * | OE cleanup | Skip | Untouched |
2238 *
2239 * nocow_one_range() failed, the range [cur_offset, nocow_end] is
2240 * already cleaned up.
2241 */
2242 oe_cleanup_start = start;
2243 oe_cleanup_len = cur_offset - start;
2244 if (nocow_end)
2245 untouched_start = nocow_end + 1;
2246 else
2247 untouched_start = cur_offset;
2248 untouched_len = end + 1 - untouched_start;
2249 } else if (cow_start != (u64)-1 && cow_end == 0) {
2250 /*
2251 * case b)
2252 * start cow_start cur_offset end
2253 * | OE cleanup | Untouched |
2254 *
2255 * We found a range that needs COW but failed before reaching the next
2256 * NOCOW range, thus [cow_start, cur_offset) doesn't yet have any OE.
2257 */
2258 oe_cleanup_start = start;
2259 oe_cleanup_len = cow_start - start;
2260 untouched_start = cow_start;
2261 untouched_len = end + 1 - untouched_start;
2262 } else {
2263 /*
2264 * case c)
2265 * start cow_start cow_end end
2266 * | OE cleanup | Skip | Untouched |
2267 *
2268 * fallback_to_cow() failed, and fallback_to_cow() will do the
2269 * cleanup for its range, we shouldn't touch the range
2270 * [cow_start, cow_end].
2271 */
2272 ASSERT(cow_start != (u64)-1 && cow_end != 0);
2273 oe_cleanup_start = start;
2274 oe_cleanup_len = cow_start - start;
2275 untouched_start = cow_end + 1;
2276 untouched_len = end + 1 - untouched_start;
2277 }
2278
2279 if (oe_cleanup_len) {
2280 const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1;
2281 btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len);
2282 extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end,
2283 locked_folio, NULL,
2284 EXTENT_LOCKED | EXTENT_DELALLOC,
2285 PAGE_UNLOCK | PAGE_START_WRITEBACK |
2286 PAGE_END_WRITEBACK);
2287 }
2288
2289 if (untouched_len) {
2290 struct extent_state *cached = NULL;
2291 const u64 untouched_end = untouched_start + untouched_len - 1;
2292
2293 /*
2294 * We need to lock the extent here because we're clearing DELALLOC and
2295 * we're not locked at this point.
2296 */
2297 btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached);
2298 extent_clear_unlock_delalloc(inode, untouched_start, untouched_end,
2299 locked_folio, &cached,
2300 EXTENT_LOCKED | EXTENT_DELALLOC |
2301 EXTENT_DEFRAG |
2302 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2303 PAGE_START_WRITEBACK |
2304 PAGE_END_WRITEBACK);
2305 btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL);
2306 }
2307 btrfs_free_path(path);
2308 btrfs_err(fs_info,
2309 "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d",
2310 __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
2311 start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len,
2312 untouched_start, untouched_len, ret);
2313 return ret;
2314 }
2315
2316 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2317 {
2318 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
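		/*
		 * Ranges marked for defrag must be COWed, otherwise defrag
		 * would have no effect on NOCOW/prealloc files.
		 */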
2319 if (inode->defrag_bytes &&
2320 btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
2321 return false;
2322 return true;
2323 }
2324 return false;
2325 }
2326
2327 /*
2328 * Function to process delayed allocation (create CoW) for ranges which are
2329 * being touched for the first time.
2330 */
2331 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
2332 u64 start, u64 end, struct writeback_control *wbc)
2333 {
2334 const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2335 int ret;
2336
2337 /*
2338 * The range must cover part of the @locked_folio, or a return of 1
2339 * can confuse the caller.
2340 */
2341 ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_end(locked_folio)));
2342
2343 if (should_nocow(inode, start, end)) {
2344 ret = run_delalloc_nocow(inode, locked_folio, start, end);
2345 return ret;
2346 }
2347
2348 if (btrfs_inode_can_compress(inode) &&
2349 inode_need_compress(inode, start, end) &&
2350 run_delalloc_compressed(inode, locked_folio, start, end, wbc))
2351 return 1;
2352
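	/*
	 * Zoned devices require sequential writes, so use the helper that
	 * writes back each allocated extent before allocating the next one.
	 */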
2353 if (zoned)
2354 ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
2355 true);
2356 else
2357 ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
2358 return ret;
2359 }
2360
2361 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2362 struct extent_state *orig, u64 split)
2363 {
2364 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2365 u64 size;
2366
2367 lockdep_assert_held(&inode->io_tree.lock);
2368
2369 /* not delalloc, ignore it */
2370 if (!(orig->state & EXTENT_DELALLOC))
2371 return;
2372
2373 size = orig->end - orig->start + 1;
2374 if (size > fs_info->max_extent_size) {
2375 u32 num_extents;
2376 u64 new_size;
2377
2378 /*
2379 * See the explanation in btrfs_merge_delalloc_extent, the same
2380 * applies here, just in reverse.
2381 */
2382 new_size = orig->end - split + 1;
2383 num_extents = count_max_extents(fs_info, new_size);
2384 new_size = split - orig->start;
2385 num_extents += count_max_extents(fs_info, new_size);
2386 if (count_max_extents(fs_info, size) >= num_extents)
2387 return;
2388 }
2389
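	/* The split results in one more outstanding extent than before, account for it. */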
2390 spin_lock(&inode->lock);
2391 btrfs_mod_outstanding_extents(inode, 1);
2392 spin_unlock(&inode->lock);
2393 }
2394
2395 /*
2396 * Handle merged delayed allocation extents so we can keep track of new extents
2397 * that are just merged onto old extents, such as when we are doing sequential
2398 * writes, so we can properly account for the metadata space we'll need.
2399 */
2400 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2401 struct extent_state *other)
2402 {
2403 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2404 u64 new_size, old_size;
2405 u32 num_extents;
2406
2407 lockdep_assert_held(&inode->io_tree.lock);
2408
2409 /* not delalloc, ignore it */
2410 if (!(other->state & EXTENT_DELALLOC))
2411 return;
2412
2413 if (new->start > other->start)
2414 new_size = new->end - other->start + 1;
2415 else
2416 new_size = other->end - new->start + 1;
2417
2418 /* we're not bigger than the max, unreserve the space and go */
2419 if (new_size <= fs_info->max_extent_size) {
2420 spin_lock(&inode->lock);
2421 btrfs_mod_outstanding_extents(inode, -1);
2422 spin_unlock(&inode->lock);
2423 return;
2424 }
2425
2426 /*
2427 * We have to add up either side to figure out how many extents were
2428 * accounted for before we merged into one big extent. If the number of
2429 * extents we accounted for is <= the amount we need for the new range
2430 * then we can return, otherwise drop. Think of it like this
2431 *
2432 * [ 4k][MAX_SIZE]
2433 *
2434 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2435 * need 2 outstanding extents, on one side we have 1 and the other side
2436 * we have 1 so they are == and we can return. But in this case
2437 *
2438 * [MAX_SIZE+4k][MAX_SIZE+4k]
2439 *
2440 * Each range on their own accounts for 2 extents, but merged together
2441 * they are only 3 extents worth of accounting, so we need to drop in
2442 * this case.
2443 */
2444 old_size = other->end - other->start + 1;
2445 num_extents = count_max_extents(fs_info, old_size);
2446 old_size = new->end - new->start + 1;
2447 num_extents += count_max_extents(fs_info, old_size);
2448 if (count_max_extents(fs_info, new_size) >= num_extents)
2449 return;
2450
2451 spin_lock(&inode->lock);
2452 btrfs_mod_outstanding_extents(inode, -1);
2453 spin_unlock(&inode->lock);
2454 }
2455
2456 static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
2457 {
2458 struct btrfs_root *root = inode->root;
2459 struct btrfs_fs_info *fs_info = root->fs_info;
2460
2461 spin_lock(&root->delalloc_lock);
2462 ASSERT(list_empty(&inode->delalloc_inodes));
2463 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2464 root->nr_delalloc_inodes++;
2465 if (root->nr_delalloc_inodes == 1) {
2466 spin_lock(&fs_info->delalloc_root_lock);
2467 ASSERT(list_empty(&root->delalloc_root));
2468 list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
2469 spin_unlock(&fs_info->delalloc_root_lock);
2470 }
2471 spin_unlock(&root->delalloc_lock);
2472 }
2473
2474 void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
2475 {
2476 struct btrfs_root *root = inode->root;
2477 struct btrfs_fs_info *fs_info = root->fs_info;
2478
2479 lockdep_assert_held(&root->delalloc_lock);
2480
2481 /*
2482 * We may be called after the inode was already deleted from the list,
2483 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
2484 * and then later through btrfs_clear_delalloc_extent() while the inode
2485 * still has ->delalloc_bytes > 0.
2486 */
2487 if (!list_empty(&inode->delalloc_inodes)) {
2488 list_del_init(&inode->delalloc_inodes);
2489 root->nr_delalloc_inodes--;
2490 if (!root->nr_delalloc_inodes) {
2491 ASSERT(list_empty(&root->delalloc_inodes));
2492 spin_lock(&fs_info->delalloc_root_lock);
2493 ASSERT(!list_empty(&root->delalloc_root));
2494 list_del_init(&root->delalloc_root);
2495 spin_unlock(&fs_info->delalloc_root_lock);
2496 }
2497 }
2498 }
2499
2500 /*
2501 * Properly track delayed allocation bytes in the inode and maintain the
2502 * list of inodes that have pending delalloc work to be done.
2503 */
2504 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2505 u32 bits)
2506 {
2507 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2508
2509 lockdep_assert_held(&inode->io_tree.lock);
2510
2511 if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2512 WARN_ON(1);
2513 /*
2514 * set_bit and clear bit hooks normally require _irqsave/restore
2515 * but in this case, we are only testing for the DELALLOC
2516 * bit, which is only set or cleared with irqs on
2517 */
2518 if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2519 u64 len = state->end + 1 - state->start;
2520 u64 prev_delalloc_bytes;
2521 u32 num_extents = count_max_extents(fs_info, len);
2522
2523 spin_lock(&inode->lock);
2524 btrfs_mod_outstanding_extents(inode, num_extents);
2525 spin_unlock(&inode->lock);
2526
2527 /* For sanity tests */
2528 if (btrfs_is_testing(fs_info))
2529 return;
2530
2531 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2532 fs_info->delalloc_batch);
2533 spin_lock(&inode->lock);
2534 prev_delalloc_bytes = inode->delalloc_bytes;
2535 inode->delalloc_bytes += len;
2536 if (bits & EXTENT_DEFRAG)
2537 inode->defrag_bytes += len;
2538 spin_unlock(&inode->lock);
2539
2540 /*
2541 * We don't need to be under the protection of the inode's lock,
2542 * because we are called while holding the inode's io_tree lock
2543 * and are therefore protected against concurrent calls of this
2544 * function and btrfs_clear_delalloc_extent().
2545 */
2546 if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
2547 btrfs_add_delalloc_inode(inode);
2548 }
2549
2550 if (!(state->state & EXTENT_DELALLOC_NEW) &&
2551 (bits & EXTENT_DELALLOC_NEW)) {
2552 spin_lock(&inode->lock);
2553 inode->new_delalloc_bytes += state->end + 1 - state->start;
2554 spin_unlock(&inode->lock);
2555 }
2556 }
2557
2558 /*
2559 * Once a range is no longer delalloc this function ensures that proper
2560 * accounting happens.
2561 */
2562 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2563 struct extent_state *state, u32 bits)
2564 {
2565 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2566 u64 len = state->end + 1 - state->start;
2567 u32 num_extents = count_max_extents(fs_info, len);
2568
2569 lockdep_assert_held(&inode->io_tree.lock);
2570
2571 if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2572 spin_lock(&inode->lock);
2573 inode->defrag_bytes -= len;
2574 spin_unlock(&inode->lock);
2575 }
2576
2577 /*
2578 * set_bit and clear bit hooks normally require _irqsave/restore
2579 * but in this case, we are only testing for the DELALLOC
2580 * bit, which is only set or cleared with irqs on
2581 */
2582 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2583 struct btrfs_root *root = inode->root;
2584 u64 new_delalloc_bytes;
2585
2586 spin_lock(&inode->lock);
2587 btrfs_mod_outstanding_extents(inode, -num_extents);
2588 spin_unlock(&inode->lock);
2589
2590 /*
2591 * We don't reserve metadata space for space cache inodes so we
2592 * don't need to call delalloc_release_metadata if there is an
2593 * error.
2594 */
2595 if (bits & EXTENT_CLEAR_META_RESV &&
2596 root != fs_info->tree_root)
2597 btrfs_delalloc_release_metadata(inode, len, true);
2598
2599 /* For sanity tests. */
2600 if (btrfs_is_testing(fs_info))
2601 return;
2602
2603 if (!btrfs_is_data_reloc_root(root) &&
2604 !btrfs_is_free_space_inode(inode) &&
2605 !(state->state & EXTENT_NORESERVE) &&
2606 (bits & EXTENT_CLEAR_DATA_RESV))
2607 btrfs_free_reserved_data_space_noquota(inode, len);
2608
2609 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2610 fs_info->delalloc_batch);
2611 spin_lock(&inode->lock);
2612 inode->delalloc_bytes -= len;
2613 new_delalloc_bytes = inode->delalloc_bytes;
2614 spin_unlock(&inode->lock);
2615
2616 /*
2617 * We don't need to be under the protection of the inode's lock,
2618 * because we are called while holding the inode's io_tree lock
2619 * and are therefore protected against concurrent calls of this
2620 * function and btrfs_set_delalloc_extent().
2621 */
2622 if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
2623 spin_lock(&root->delalloc_lock);
2624 btrfs_del_delalloc_inode(inode);
2625 spin_unlock(&root->delalloc_lock);
2626 }
2627 }
2628
2629 if ((state->state & EXTENT_DELALLOC_NEW) &&
2630 (bits & EXTENT_DELALLOC_NEW)) {
2631 spin_lock(&inode->lock);
2632 ASSERT(inode->new_delalloc_bytes >= len);
2633 inode->new_delalloc_bytes -= len;
2634 if (bits & EXTENT_ADD_INODE_BYTES)
2635 inode_add_bytes(&inode->vfs_inode, len);
2636 spin_unlock(&inode->lock);
2637 }
2638 }
2639
2640 /*
2641 * Given a list of ordered sums, record them in the inode. This happens
2642 * at IO completion time based on sums calculated at bio submission time.
2643 */
2644 static int add_pending_csums(struct btrfs_trans_handle *trans,
2645 struct list_head *list)
2646 {
2647 struct btrfs_ordered_sum *sum;
2648 struct btrfs_root *csum_root = NULL;
2649 int ret;
2650
2651 list_for_each_entry(sum, list, list) {
2652 trans->adding_csums = true;
2653 if (!csum_root)
2654 csum_root = btrfs_csum_root(trans->fs_info,
2655 sum->logical);
2656 ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2657 trans->adding_csums = false;
2658 if (ret)
2659 return ret;
2660 }
2661 return 0;
2662 }
2663
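/*
 * Mark the holes in the given range with EXTENT_DELALLOC_NEW, so that the
 * inode's number of bytes gets updated when the ordered extents covering
 * those previously non-existing ranges complete.
 */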
2664 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2665 const u64 start,
2666 const u64 len,
2667 struct extent_state **cached_state)
2668 {
2669 u64 search_start = start;
2670 const u64 end = start + len - 1;
2671
2672 while (search_start < end) {
2673 const u64 search_len = end - search_start + 1;
2674 struct extent_map *em;
2675 u64 em_len;
2676 int ret = 0;
2677
2678 em = btrfs_get_extent(inode, NULL, search_start, search_len);
2679 if (IS_ERR(em))
2680 return PTR_ERR(em);
2681
2682 if (em->disk_bytenr != EXTENT_MAP_HOLE)
2683 goto next;
2684
2685 em_len = em->len;
2686 if (em->start < search_start)
2687 em_len -= search_start - em->start;
2688 if (em_len > search_len)
2689 em_len = search_len;
2690
2691 ret = btrfs_set_extent_bit(&inode->io_tree, search_start,
2692 search_start + em_len - 1,
2693 EXTENT_DELALLOC_NEW, cached_state);
2694 next:
2695 search_start = btrfs_extent_map_end(em);
2696 btrfs_free_extent_map(em);
2697 if (ret)
2698 return ret;
2699 }
2700 return 0;
2701 }
2702
2703 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2704 unsigned int extra_bits,
2705 struct extent_state **cached_state)
2706 {
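	/* @end is inclusive, so it should never be page aligned. */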
2707 WARN_ON(PAGE_ALIGNED(end));
2708
2709 if (start >= i_size_read(&inode->vfs_inode) &&
2710 !(inode->flags & BTRFS_INODE_PREALLOC)) {
2711 /*
2712 * There can't be any extents following eof in this case so just
2713 * set the delalloc new bit for the range directly.
2714 */
2715 extra_bits |= EXTENT_DELALLOC_NEW;
2716 } else {
2717 int ret;
2718
2719 ret = btrfs_find_new_delalloc_bytes(inode, start,
2720 end + 1 - start,
2721 cached_state);
2722 if (ret)
2723 return ret;
2724 }
2725
2726 return btrfs_set_extent_bit(&inode->io_tree, start, end,
2727 EXTENT_DELALLOC | extra_bits, cached_state);
2728 }
2729
2730 /* See btrfs_writepage_cow_fixup() for details on why this is required. */
2731 struct btrfs_writepage_fixup {
2732 struct folio *folio;
2733 struct btrfs_inode *inode;
2734 struct btrfs_work work;
2735 };
2736
2737 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2738 {
2739 struct btrfs_writepage_fixup *fixup =
2740 container_of(work, struct btrfs_writepage_fixup, work);
2741 struct btrfs_ordered_extent *ordered;
2742 struct extent_state *cached_state = NULL;
2743 struct extent_changeset *data_reserved = NULL;
2744 struct folio *folio = fixup->folio;
2745 struct btrfs_inode *inode = fixup->inode;
2746 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2747 u64 page_start = folio_pos(folio);
2748 u64 page_end = folio_end(folio) - 1;
2749 int ret = 0;
2750 bool free_delalloc_space = true;
2751
2752 /*
2753 * This is similar to page_mkwrite, we need to reserve the space before
2754 * we take the folio lock.
2755 */
2756 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2757 folio_size(folio));
2758 again:
2759 folio_lock(folio);
2760
2761 /*
2762 * Before we queued this fixup, we took a reference on the folio.
2763 * folio->mapping may go NULL, but it shouldn't be moved to a different
2764 * address space.
2765 */
2766 if (!folio->mapping || !folio_test_dirty(folio) ||
2767 !folio_test_checked(folio)) {
2768 /*
2769 * Unfortunately this is a little tricky, either
2770 *
2771 * 1) We got here and our folio had already been dealt with and
2772 * we reserved our space, thus ret == 0, so we need to just
2773 * drop our space reservation and bail. This can happen the
2774 * first time we come into the fixup worker, or could happen
2775 * while waiting for the ordered extent.
2776 * 2) Our folio was already dealt with, but we happened to get an
2777 * ENOSPC above from the btrfs_delalloc_reserve_space. In
2778 * this case we obviously don't have anything to release, but
2779 * because the folio was already dealt with we don't want to
2780 * mark the folio with an error, so make sure we're resetting
2781 * ret to 0. This is why we have this check _before_ the ret
2782 * check, because we do not want to have a surprise ENOSPC
2783 * when the folio was already properly dealt with.
2784 */
2785 if (!ret) {
2786 btrfs_delalloc_release_extents(inode, folio_size(folio));
2787 btrfs_delalloc_release_space(inode, data_reserved,
2788 page_start, folio_size(folio),
2789 true);
2790 }
2791 ret = 0;
2792 goto out_page;
2793 }
2794
2795 /*
2796 * We can't mess with the folio state unless it is locked, so now that
2797 * it is locked bail if we failed to make our space reservation.
2798 */
2799 if (ret)
2800 goto out_page;
2801
2802 btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2803
2804 /* already ordered? We're done */
2805 if (folio_test_ordered(folio))
2806 goto out_reserved;
2807
2808 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2809 if (ordered) {
2810 btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
2811 &cached_state);
2812 folio_unlock(folio);
2813 btrfs_start_ordered_extent(ordered);
2814 btrfs_put_ordered_extent(ordered);
2815 goto again;
2816 }
2817
2818 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2819 &cached_state);
2820 if (ret)
2821 goto out_reserved;
2822
2823 /*
2824 * Everything went as planned, we're now the owner of a dirty page with
2825 * delayed allocation bits set and space reserved for our COW
2826 * destination.
2827 *
2828 * The page was dirty when we started, nothing should have cleaned it.
2829 */
2830 BUG_ON(!folio_test_dirty(folio));
2831 free_delalloc_space = false;
2832 out_reserved:
2833 btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2834 if (free_delalloc_space)
2835 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2836 PAGE_SIZE, true);
2837 btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2838 out_page:
2839 if (ret) {
2840 /*
2841 * We hit ENOSPC or other errors. Update the mapping and page
2842 * to reflect the errors and clean the page.
2843 */
2844 mapping_set_error(folio->mapping, ret);
2845 btrfs_mark_ordered_io_finished(inode, folio, page_start,
2846 folio_size(folio), !ret);
2847 folio_clear_dirty_for_io(folio);
2848 }
2849 btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
2850 folio_unlock(folio);
2851 folio_put(folio);
2852 kfree(fixup);
2853 extent_changeset_free(data_reserved);
2854 /*
2855 * As a precaution, do a delayed iput in case it would be the last iput
2856 * that could need flushing space. Recursing back to fixup worker would
2857 * deadlock.
2858 */
2859 btrfs_add_delayed_iput(inode);
2860 }
2861
2862 /*
2863 * There are a few paths in the higher layers of the kernel that directly
2864 * set the folio dirty bit without asking the filesystem if it is a
2865 * good idea. This causes problems because we want to make sure COW
2866 * properly happens and the data=ordered rules are followed.
2867 *
2868 * In our case any range that doesn't have the ORDERED bit set
2869 * hasn't been properly setup for IO. We kick off an async process
2870 * to fix it up. The async helper will wait for ordered extents, set
2871 * the delalloc bit and make it safe to write the folio.
2872 */
2873 int btrfs_writepage_cow_fixup(struct folio *folio)
2874 {
2875 struct inode *inode = folio->mapping->host;
2876 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2877 struct btrfs_writepage_fixup *fixup;
2878
2879 /* This folio has ordered extent covering it already */
2880 if (folio_test_ordered(folio))
2881 return 0;
2882
2883 /*
2884 * For experimental builds, we error out instead of EAGAIN.
2885 *
2886 * We should not hit such out-of-band dirty folios anymore.
2887 */
2888 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
2889 DEBUG_WARN();
2890 btrfs_err_rl(fs_info,
2891 "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
2892 btrfs_root_id(BTRFS_I(inode)->root),
2893 btrfs_ino(BTRFS_I(inode)),
2894 folio_pos(folio));
2895 return -EUCLEAN;
2896 }
2897
2898 /*
2899 * folio_checked is set below when we create a fixup worker for this
2900 * folio; don't try to create another one if the folio is already
2901 * checked (folio_test_checked).
2902 *
2903 * The extent_io writepage code will redirty the folio if we send back
2904 * EAGAIN.
2905 */
2906 if (folio_test_checked(folio))
2907 return -EAGAIN;
2908
2909 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2910 if (!fixup)
2911 return -EAGAIN;
2912
2913 /*
2914 * We are already holding a reference to this inode from
2915 * write_cache_pages. We need to hold it because the space reservation
2916 * takes place outside of the folio lock, and we can't trust
2917 * folio->mapping outside of the folio lock.
2918 */
2919 ihold(inode);
2920 btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
2921 folio_get(folio);
2922 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
2923 fixup->folio = folio;
2924 fixup->inode = BTRFS_I(inode);
2925 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2926
2927 return -EAGAIN;
2928 }
2929
2930 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2931 struct btrfs_inode *inode, u64 file_pos,
2932 struct btrfs_file_extent_item *stack_fi,
2933 const bool update_inode_bytes,
2934 u64 qgroup_reserved)
2935 {
2936 struct btrfs_root *root = inode->root;
2937 const u64 sectorsize = root->fs_info->sectorsize;
2938 BTRFS_PATH_AUTO_FREE(path);
2939 struct extent_buffer *leaf;
2940 struct btrfs_key ins;
2941 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2942 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2943 u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2944 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2945 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2946 struct btrfs_drop_extents_args drop_args = { 0 };
2947 int ret;
2948
2949 path = btrfs_alloc_path();
2950 if (!path)
2951 return -ENOMEM;
2952
2953 /*
2954 * We may be replacing one extent in the tree with another.
2955 * The new extent is pinned in the extent map, and we don't want
2956 * to drop it from the cache until it is completely in the btree.
2957 *
2958 * So, tell btrfs_drop_extents to leave this extent in the cache.
2959 * The caller is expected to unpin it and allow it to be merged
2960 * with the others.
2961 */
2962 drop_args.path = path;
2963 drop_args.start = file_pos;
2964 drop_args.end = file_pos + num_bytes;
2965 drop_args.replace_extent = true;
2966 drop_args.extent_item_size = sizeof(*stack_fi);
2967 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2968 if (ret)
2969 goto out;
2970
2971 if (!drop_args.extent_inserted) {
2972 ins.objectid = btrfs_ino(inode);
2973 ins.type = BTRFS_EXTENT_DATA_KEY;
2974 ins.offset = file_pos;
2975
2976 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2977 sizeof(*stack_fi));
2978 if (ret)
2979 goto out;
2980 }
2981 leaf = path->nodes[0];
2982 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2983 write_extent_buffer(leaf, stack_fi,
2984 btrfs_item_ptr_offset(leaf, path->slots[0]),
2985 sizeof(struct btrfs_file_extent_item));
2986
2987 btrfs_release_path(path);
2988
2989 /*
2990 * If we dropped an inline extent here, we know the range it covered
2991 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2992 * number of bytes only for that range containing the inline extent.
2993 * The rest of the range will be processed when clearing the
2994 * EXTENT_DELALLOC bit through the ordered extent completion.
2995 */
2996 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2997 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
2998
2999 inline_size = drop_args.bytes_found - inline_size;
3000 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
3001 drop_args.bytes_found -= inline_size;
3002 num_bytes -= sectorsize;
3003 }
3004
3005 if (update_inode_bytes)
3006 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
3007
3008 ins.objectid = disk_bytenr;
3009 ins.type = BTRFS_EXTENT_ITEM_KEY;
3010 ins.offset = disk_num_bytes;
3011
3012 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
3013 if (ret)
3014 goto out;
3015
3016 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
3017 file_pos - offset,
3018 qgroup_reserved, &ins);
3019 out:
3020 return ret;
3021 }
3022
3023 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
3024 u64 start, u64 len)
3025 {
3026 struct btrfs_block_group *cache;
3027
3028 cache = btrfs_lookup_block_group(fs_info, start);
3029 ASSERT(cache);
3030
3031 spin_lock(&cache->lock);
3032 cache->delalloc_bytes -= len;
3033 spin_unlock(&cache->lock);
3034
3035 btrfs_put_block_group(cache);
3036 }
3037
3038 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
3039 struct btrfs_ordered_extent *oe)
3040 {
3041 struct btrfs_file_extent_item stack_fi;
3042 bool update_inode_bytes;
3043 u64 num_bytes = oe->num_bytes;
3044 u64 ram_bytes = oe->ram_bytes;
3045
3046 memset(&stack_fi, 0, sizeof(stack_fi));
3047 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
3048 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
3049 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
3050 oe->disk_num_bytes);
3051 btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
3052 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
3053 num_bytes = oe->truncated_len;
3054 btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
3055 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
3056 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
3057 /* Encryption and other encoding is reserved and all 0 */
3058
3059 /*
3060 * For delalloc, when completing an ordered extent we update the inode's
3061 * bytes when clearing the range in the inode's io tree, so pass false
3062 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3063 * except if the ordered extent was truncated.
3064 */
3065 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3066 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3067 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3068
3069 return insert_reserved_file_extent(trans, oe->inode,
3070 oe->file_offset, &stack_fi,
3071 update_inode_bytes, oe->qgroup_rsv);
3072 }
3073
3074 /*
3075 * As ordered data IO finishes, this gets called so we can finish
3076 * an ordered extent if the range of bytes in the file it covers is
3077 * fully written.
3078 */
3079 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3080 {
3081 struct btrfs_inode *inode = ordered_extent->inode;
3082 struct btrfs_root *root = inode->root;
3083 struct btrfs_fs_info *fs_info = root->fs_info;
3084 struct btrfs_trans_handle *trans = NULL;
3085 struct extent_io_tree *io_tree = &inode->io_tree;
3086 struct extent_state *cached_state = NULL;
3087 u64 start, end;
3088 int compress_type = 0;
3089 int ret = 0;
3090 u64 logical_len = ordered_extent->num_bytes;
3091 bool freespace_inode;
3092 bool truncated = false;
3093 bool clear_reserved_extent = true;
3094 unsigned int clear_bits = EXTENT_DEFRAG;
3095
3096 start = ordered_extent->file_offset;
3097 end = start + ordered_extent->num_bytes - 1;
3098
3099 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3100 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3101 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3102 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3103 clear_bits |= EXTENT_DELALLOC_NEW;
3104
3105 freespace_inode = btrfs_is_free_space_inode(inode);
3106 if (!freespace_inode)
3107 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3108
3109 if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) {
3110 ret = -EIO;
3111 goto out;
3112 }
3113
3114 ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3115 ordered_extent->disk_num_bytes);
3116 if (ret)
3117 goto out;
3118
3119 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3120 truncated = true;
3121 logical_len = ordered_extent->truncated_len;
3122 /* Truncated the entire extent, don't bother adding */
3123 if (!logical_len)
3124 goto out;
3125 }
3126
3127 /*
3128 * If it's a COW write we need to lock the extent range as we will be
3129 * inserting/replacing file extent items and unpinning an extent map.
3130 * This must be taken before joining a transaction, as it's a higher
3131 * level lock (like the inode's VFS lock), otherwise we can run into an
3132 * ABBA deadlock with other tasks (transactions work like a lock,
3133 * depending on their current state).
3134 */
3135 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3136 clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
3137 btrfs_lock_extent_bits(io_tree, start, end,
3138 EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
3139 &cached_state);
3140 }
3141
3142 if (freespace_inode)
3143 trans = btrfs_join_transaction_spacecache(root);
3144 else
3145 trans = btrfs_join_transaction(root);
3146 if (IS_ERR(trans)) {
3147 ret = PTR_ERR(trans);
3148 trans = NULL;
3149 goto out;
3150 }
3151
3152 trans->block_rsv = &inode->block_rsv;
3153
3154 ret = btrfs_insert_raid_extent(trans, ordered_extent);
3155 if (unlikely(ret)) {
3156 btrfs_abort_transaction(trans, ret);
3157 goto out;
3158 }
3159
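	/*
	 * For NOCOW writes the file extent items already exist, so there is
	 * nothing to insert, just update the on-disk i_size and inode item.
	 */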
3160 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3161 /* Logic error */
3162 ASSERT(list_empty(&ordered_extent->list));
3163 if (unlikely(!list_empty(&ordered_extent->list))) {
3164 ret = -EINVAL;
3165 btrfs_abort_transaction(trans, ret);
3166 goto out;
3167 }
3168
3169 btrfs_inode_safe_disk_i_size_write(inode, 0);
3170 ret = btrfs_update_inode_fallback(trans, inode);
3171 if (unlikely(ret)) {
3172 /* -ENOMEM or corruption */
3173 btrfs_abort_transaction(trans, ret);
3174 }
3175 goto out;
3176 }
3177
3178 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3179 compress_type = ordered_extent->compress_type;
3180 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3181 BUG_ON(compress_type);
3182 ret = btrfs_mark_extent_written(trans, inode,
3183 ordered_extent->file_offset,
3184 ordered_extent->file_offset +
3185 logical_len);
3186 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3187 ordered_extent->disk_num_bytes);
3188 } else {
3189 BUG_ON(root == fs_info->tree_root);
3190 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3191 if (!ret) {
3192 clear_reserved_extent = false;
3193 btrfs_release_delalloc_bytes(fs_info,
3194 ordered_extent->disk_bytenr,
3195 ordered_extent->disk_num_bytes);
3196 }
3197 }
3198 if (unlikely(ret < 0)) {
3199 btrfs_abort_transaction(trans, ret);
3200 goto out;
3201 }
3202
3203 ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
3204 ordered_extent->num_bytes, trans->transid);
3205 if (unlikely(ret < 0)) {
3206 btrfs_abort_transaction(trans, ret);
3207 goto out;
3208 }
3209
3210 ret = add_pending_csums(trans, &ordered_extent->list);
3211 if (unlikely(ret)) {
3212 btrfs_abort_transaction(trans, ret);
3213 goto out;
3214 }
3215
3216 /*
3217 * If this is a new delalloc range, clear its new delalloc flag to
3218 * update the inode's number of bytes. This needs to be done first
3219 * before updating the inode item.
3220 */
3221 if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3222 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3223 btrfs_clear_extent_bit(&inode->io_tree, start, end,
3224 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3225 &cached_state);
3226
3227 btrfs_inode_safe_disk_i_size_write(inode, 0);
3228 ret = btrfs_update_inode_fallback(trans, inode);
3229 if (unlikely(ret)) { /* -ENOMEM or corruption */
3230 btrfs_abort_transaction(trans, ret);
3231 goto out;
3232 }
3233 out:
3234 btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3235 &cached_state);
3236
3237 if (trans)
3238 btrfs_end_transaction(trans);
3239
3240 if (ret || truncated) {
3241 /*
3242 * If we failed to finish this ordered extent for any reason we
3243 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3244 * extent, and mark the inode with the error if it wasn't
3245 * already set. Any error during writeback would have already
3246 * set the mapping error, so we need to set it if we're the ones
3247 * marking this ordered extent as failed.
3248 */
3249 if (ret)
3250 btrfs_mark_ordered_extent_error(ordered_extent);
3251
3252 /*
3253 * Drop extent maps for the part of the extent we didn't write.
3254 *
3255 * We have an exception here for the free_space_inode, this is
3256 * because when we do btrfs_get_extent() on the free space inode
3257 * we will search the commit root. If this is a new block group
3258 * we won't find anything, and we will trip over the assert in
3259 * writepage where we do ASSERT(em->block_start !=
3260 * EXTENT_MAP_HOLE).
3261 *
3262 * Theoretically we could also skip this for any NOCOW extent as
3263 * we don't mess with the extent map tree in the NOCOW case, but
3264 * for now simply skip this if we are the free space inode.
3265 */
3266 if (!btrfs_is_free_space_inode(inode)) {
3267 u64 unwritten_start = start;
3268
3269 if (truncated)
3270 unwritten_start += logical_len;
3271
3272 btrfs_drop_extent_map_range(inode, unwritten_start,
3273 end, false);
3274 }
3275
3276 /*
3277 * If the ordered extent had an IOERR or something else went
3278 * wrong we need to return the space for this ordered extent
3279 * back to the allocator. We only free the extent in the
3280 * truncated case if we didn't write out the extent at all.
3281 *
3282 * If we made it past insert_reserved_file_extent before we
3283 * errored out then we don't need to do this as the accounting
3284 * has already been done.
3285 */
3286 if ((ret || !logical_len) &&
3287 clear_reserved_extent &&
3288 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3289 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3290 /*
3291 * Discard the range before returning it back to the
3292 * free space pool
3293 */
3294 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3295 btrfs_discard_extent(fs_info,
3296 ordered_extent->disk_bytenr,
3297 ordered_extent->disk_num_bytes,
3298 NULL);
3299 btrfs_free_reserved_extent(fs_info,
3300 ordered_extent->disk_bytenr,
3301 ordered_extent->disk_num_bytes, true);
3302 /*
3303 * Actually free the qgroup rsv which was released when
3304 * the ordered extent was created.
3305 */
3306 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
3307 ordered_extent->qgroup_rsv,
3308 BTRFS_QGROUP_RSV_DATA);
3309 }
3310 }
3311
3312 /*
3313 * This needs to be done to make sure anybody waiting knows we are done
3314 * updating everything for this ordered extent.
3315 */
3316 btrfs_remove_ordered_extent(inode, ordered_extent);
3317
3318 /* once for us */
3319 btrfs_put_ordered_extent(ordered_extent);
3320 /* once for the tree */
3321 btrfs_put_ordered_extent(ordered_extent);
3322
3323 return ret;
3324 }
3325
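/*
 * Finish an ordered extent once all its I/O has completed.
 *
 * On zoned filesystems, if the ordered extent completed without an I/O error
 * and has no bios left on its bioc_list, do the zoned-specific finishing
 * first, then run the generic completion.
 */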
3326 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3327 {
3328 if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
3329 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3330 list_empty(&ordered->bioc_list))
3331 btrfs_finish_ordered_zoned(ordered);
3332 return btrfs_finish_one_ordered(ordered);
3333 }
3334
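/*
 * Calculate the checksum of a single block.
 *
 * @paddr:	physical address of the block, which must be fully contained
 *		in a single folio
 * @dest:	buffer to store the resulting checksum
 *
 * If the folio can only be partially kmapped (e.g. a highmem folio), hash the
 * block one page at a time using kmap_local_folio(), otherwise hash it in one
 * go from its directly mapped address.
 */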
3335 void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr,
3336 u8 *dest)
3337 {
3338 struct folio *folio = page_folio(phys_to_page(paddr));
3339 const u32 blocksize = fs_info->sectorsize;
3340 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3341
3342 shash->tfm = fs_info->csum_shash;
3343 /* The full block must be inside the folio. */
3344 ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
3345
3346 if (folio_test_partial_kmap(folio)) {
3347 size_t cur = paddr;
3348
3349 crypto_shash_init(shash);
3350 while (cur < paddr + blocksize) {
3351 void *kaddr;
3352 size_t len = min(paddr + blocksize - cur,
3353 PAGE_SIZE - offset_in_page(cur));
3354
3355 kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur));
3356 crypto_shash_update(shash, kaddr, len);
3357 kunmap_local(kaddr);
3358 cur += len;
3359 }
3360 crypto_shash_final(shash, dest);
3361 } else {
3362 crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest);
3363 }
3364 }

3365 /*
3366  * Verify the checksum of a single block without any extra actions that depend
3367  * on the type of I/O.
3368  *
3369  * @paddr must be the physical address of the block, fully contained in one folio.
3370  */
3371 int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
3372 const u8 * const csum_expected)
3373 {
3374 btrfs_calculate_block_csum(fs_info, paddr, csum);
3375 if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
3376 return -EIO;
3377 return 0;
3378 }
3379
3380 /*
3381 * Verify the checksum of a single data sector.
3382 *
3383  * @bbio:	btrfs_bio which contains the csum
3384  * @dev:	device the sector is on
3385  * @bio_offset:	offset to the beginning of the bio (in bytes)
3386  * @paddr:	physical address of the block to check
3387 *
3388 * Check if the checksum on a data block is valid. When a checksum mismatch is
3389 * detected, report the error and fill the corrupted range with zero.
3390 *
3391 * Return %true if the sector is ok or had no checksum to start with, else %false.
3392 */
3393 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3394 u32 bio_offset, phys_addr_t paddr)
3395 {
3396 struct btrfs_inode *inode = bbio->inode;
3397 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3398 const u32 blocksize = fs_info->sectorsize;
3399 struct folio *folio;
3400 u64 file_offset = bbio->file_offset + bio_offset;
3401 u64 end = file_offset + blocksize - 1;
3402 u8 *csum_expected;
3403 u8 csum[BTRFS_CSUM_SIZE];
3404
3405 if (!bbio->csum)
3406 return true;
3407
3408 if (btrfs_is_data_reloc_root(inode->root) &&
3409 btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3410 NULL)) {
3411 /* Skip the range without csum for data reloc inode */
3412 btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
3413 EXTENT_NODATASUM, NULL);
3414 return true;
3415 }
3416
3417 csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3418 fs_info->csum_size;
3419 if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected))
3420 goto zeroit;
3421 return true;
3422
3423 zeroit:
3424 btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3425 bbio->mirror_num);
3426 if (dev)
3427 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3428 folio = page_folio(phys_to_page(paddr));
3429 ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
3430 folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize);
3431 return false;
3432 }
3433
3434 /*
3435 * Perform a delayed iput on @inode.
3436 *
3437 * @inode: The inode we want to perform iput on
3438 *
3439 * This function uses the generic vfs_inode::i_count to track whether we should
3440 * just decrement it (in case it's > 1) or if this is the last iput then link
3441 * the inode to the delayed iput machinery. Delayed iputs are processed at
3442 * transaction commit time/superblock commit/cleaner kthread.
3443 */
3444 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3445 {
3446 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3447 unsigned long flags;
3448
3449 if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3450 return;
3451
3452 WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
3453 atomic_inc(&fs_info->nr_delayed_iputs);
3454 /*
3455 * Need to be irq safe here because we can be called from either an irq
3456 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3457 * context.
3458 */
3459 spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3460 ASSERT(list_empty(&inode->delayed_iput));
3461 list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3462 spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3463 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3464 wake_up_process(fs_info->cleaner_kthread);
3465 }
3466
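/*
 * Run a single delayed iput.
 *
 * Must be called with fs_info->delayed_iput_lock held. The lock is dropped
 * around the iput() call and reacquired before returning.
 */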
3467 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3468 struct btrfs_inode *inode)
3469 {
3470 list_del_init(&inode->delayed_iput);
3471 spin_unlock_irq(&fs_info->delayed_iput_lock);
3472 iput(&inode->vfs_inode);
3473 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3474 wake_up(&fs_info->delayed_iputs_wait);
3475 spin_lock_irq(&fs_info->delayed_iput_lock);
3476 }
3477
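/* Run the delayed iput of the given inode now, if one is queued. */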
3478 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3479 struct btrfs_inode *inode)
3480 {
3481 if (!list_empty(&inode->delayed_iput)) {
3482 spin_lock_irq(&fs_info->delayed_iput_lock);
3483 if (!list_empty(&inode->delayed_iput))
3484 run_delayed_iput_locked(fs_info, inode);
3485 spin_unlock_irq(&fs_info->delayed_iput_lock);
3486 }
3487 }
3488
3489 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3490 {
3491 /*
3492 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3493 * calls btrfs_add_delayed_iput() and that needs to lock
3494 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3495 * prevent a deadlock.
3496 */
3497 spin_lock_irq(&fs_info->delayed_iput_lock);
3498 while (!list_empty(&fs_info->delayed_iputs)) {
3499 struct btrfs_inode *inode;
3500
3501 inode = list_first_entry(&fs_info->delayed_iputs,
3502 struct btrfs_inode, delayed_iput);
3503 run_delayed_iput_locked(fs_info, inode);
3504 if (need_resched()) {
3505 spin_unlock_irq(&fs_info->delayed_iput_lock);
3506 cond_resched();
3507 spin_lock_irq(&fs_info->delayed_iput_lock);
3508 }
3509 }
3510 spin_unlock_irq(&fs_info->delayed_iput_lock);
3511 }
3512
3513 /*
3514 * Wait for flushing all delayed iputs
3515 *
3516 * @fs_info: the filesystem
3517 *
3518  * This waits, in killable mode, for all currently pending delayed iputs to
3519  * finish running. Once they are all done we return, unless we are killed, in
3520 * which case we return EINTR. This helps in user operations like fallocate etc
3521 * that might get blocked on the iputs.
3522 *
3523 * Return EINTR if we were killed, 0 if nothing's pending
3524 */
3525 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3526 {
3527 int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3528 atomic_read(&fs_info->nr_delayed_iputs) == 0);
3529 if (ret)
3530 return -EINTR;
3531 return 0;
3532 }
3533
3534 /*
3535 * This creates an orphan entry for the given inode in case something goes wrong
3536 * in the middle of an unlink.
3537 */
3538 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3539 struct btrfs_inode *inode)
3540 {
3541 int ret;
3542
3543 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3544 if (unlikely(ret && ret != -EEXIST)) {
3545 btrfs_abort_transaction(trans, ret);
3546 return ret;
3547 }
3548
3549 return 0;
3550 }
3551
3552 /*
3553 * We have done the delete so we can go ahead and remove the orphan item for
3554 * this particular inode.
3555 */
3556 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3557 struct btrfs_inode *inode)
3558 {
3559 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3560 }
3561
3562 /*
3563 * this cleans up any orphans that may be left on the list from the last use
3564 * of this root.
3565 */
3566 int btrfs_orphan_cleanup(struct btrfs_root *root)
3567 {
3568 struct btrfs_fs_info *fs_info = root->fs_info;
3569 BTRFS_PATH_AUTO_FREE(path);
3570 struct extent_buffer *leaf;
3571 struct btrfs_key key, found_key;
3572 struct btrfs_trans_handle *trans;
3573 u64 last_objectid = 0;
3574 int ret = 0, nr_unlink = 0;
3575
3576 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3577 return 0;
3578
3579 path = btrfs_alloc_path();
3580 if (!path) {
3581 ret = -ENOMEM;
3582 goto out;
3583 }
3584 path->reada = READA_BACK;
3585
3586 key.objectid = BTRFS_ORPHAN_OBJECTID;
3587 key.type = BTRFS_ORPHAN_ITEM_KEY;
3588 key.offset = (u64)-1;
3589
3590 while (1) {
3591 struct btrfs_inode *inode;
3592
3593 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3594 if (ret < 0)
3595 goto out;
3596
3597 /*
3598 		 * If ret == 0 we found exactly what we were searching for, which
3599 		 * is weird but possible. So only adjust the path if we didn't
3600 		 * find the key, and check whether we have an item that matches.
3601 */
3602 if (ret > 0) {
3603 ret = 0;
3604 if (path->slots[0] == 0)
3605 break;
3606 path->slots[0]--;
3607 }
3608
3609 /* pull out the item */
3610 leaf = path->nodes[0];
3611 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3612
3613 /* make sure the item matches what we want */
3614 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3615 break;
3616 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3617 break;
3618
3619 /* release the path since we're done with it */
3620 btrfs_release_path(path);
3621
3622 /*
3623 * this is where we are basically btrfs_lookup, without the
3624 * crossing root thing. we store the inode number in the
3625 * offset of the orphan item.
3626 */
3627
3628 if (found_key.offset == last_objectid) {
3629 /*
3630 * We found the same inode as before. This means we were
3631 * not able to remove its items via eviction triggered
3632 * by an iput(). A transaction abort may have happened,
3633 * due to -ENOSPC for example, so try to grab the error
3634 			 * that led to a transaction abort, if any.
3635 */
3636 btrfs_err(fs_info,
3637 "Error removing orphan entry, stopping orphan cleanup");
3638 ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3639 goto out;
3640 }
3641
3642 last_objectid = found_key.offset;
3643
3644 found_key.objectid = found_key.offset;
3645 found_key.type = BTRFS_INODE_ITEM_KEY;
3646 found_key.offset = 0;
3647 inode = btrfs_iget(last_objectid, root);
3648 if (IS_ERR(inode)) {
3649 ret = PTR_ERR(inode);
3650 inode = NULL;
3651 if (ret != -ENOENT)
3652 goto out;
3653 }
3654
3655 if (!inode && root == fs_info->tree_root) {
3656 struct btrfs_root *dead_root;
3657 int is_dead_root = 0;
3658
3659 /*
3660 * This is an orphan in the tree root. Currently these
3661 * could come from 2 sources:
3662 * a) a root (snapshot/subvolume) deletion in progress
3663 * b) a free space cache inode
3664 * We need to distinguish those two, as the orphan item
3665 * for a root must not get deleted before the deletion
3666 * of the snapshot/subvolume's tree completes.
3667 *
3668 * btrfs_find_orphan_roots() ran before us, which has
3669 * found all deleted roots and loaded them into
3670 * fs_info->fs_roots_radix. So here we can find if an
3671 * orphan item corresponds to a deleted root by looking
3672 * up the root from that radix tree.
3673 */
3674
3675 spin_lock(&fs_info->fs_roots_radix_lock);
3676 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3677 (unsigned long)found_key.objectid);
3678 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3679 is_dead_root = 1;
3680 spin_unlock(&fs_info->fs_roots_radix_lock);
3681
3682 if (is_dead_root) {
3683 /* prevent this orphan from being found again */
3684 key.offset = found_key.objectid - 1;
3685 continue;
3686 }
3687
3688 }
3689
3690 /*
3691 * If we have an inode with links, there are a couple of
3692 * possibilities:
3693 *
3694 * 1. We were halfway through creating fsverity metadata for the
3695 * file. In that case, the orphan item represents incomplete
3696 * fsverity metadata which must be cleaned up with
3697 * btrfs_drop_verity_items and deleting the orphan item.
3698 		 *
3699 * 2. Old kernels (before v3.12) used to create an
3700 * orphan item for truncate indicating that there were possibly
3701 * extent items past i_size that needed to be deleted. In v3.12,
3702 * truncate was changed to update i_size in sync with the extent
3703 * items, but the (useless) orphan item was still created. Since
3704 * v4.18, we don't create the orphan item for truncate at all.
3705 *
3706 * So, this item could mean that we need to do a truncate, but
3707 * only if this filesystem was last used on a pre-v3.12 kernel
3708 * and was not cleanly unmounted. The odds of that are quite
3709 * slim, and it's a pain to do the truncate now, so just delete
3710 * the orphan item.
3711 *
3712 * It's also possible that this orphan item was supposed to be
3713 * deleted but wasn't. The inode number may have been reused,
3714 * but either way, we can delete the orphan item.
3715 */
3716 if (!inode || inode->vfs_inode.i_nlink) {
3717 if (inode) {
3718 ret = btrfs_drop_verity_items(inode);
3719 iput(&inode->vfs_inode);
3720 inode = NULL;
3721 if (ret)
3722 goto out;
3723 }
3724 trans = btrfs_start_transaction(root, 1);
3725 if (IS_ERR(trans)) {
3726 ret = PTR_ERR(trans);
3727 goto out;
3728 }
3729 btrfs_debug(fs_info, "auto deleting %Lu",
3730 found_key.objectid);
3731 ret = btrfs_del_orphan_item(trans, root,
3732 found_key.objectid);
3733 btrfs_end_transaction(trans);
3734 if (ret)
3735 goto out;
3736 continue;
3737 }
3738
3739 nr_unlink++;
3740
3741 /* this will do delete_inode and everything for us */
3742 iput(&inode->vfs_inode);
3743 }
3744 /* release the path since we're done with it */
3745 btrfs_release_path(path);
3746
3747 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3748 trans = btrfs_join_transaction(root);
3749 if (!IS_ERR(trans))
3750 btrfs_end_transaction(trans);
3751 }
3752
3753 if (nr_unlink)
3754 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3755
3756 out:
3757 if (ret)
3758 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3759 return ret;
3760 }
3761
3762 /*
3763 * Look ahead in the leaf for xattrs. If we don't find any then we know there
3764 * can't be any ACLs.
3765 *
3766  * @leaf:	the extent buffer leaf to search
3767 * @slot: the slot the inode is in
3768 * @objectid: the objectid of the inode
3769 *
3770 * Return true if there is xattr/ACL, false otherwise.
3771 */
3772 static noinline bool acls_after_inode_item(struct extent_buffer *leaf,
3773 int slot, u64 objectid,
3774 int *first_xattr_slot)
3775 {
3776 u32 nritems = btrfs_header_nritems(leaf);
3777 struct btrfs_key found_key;
3778 static u64 xattr_access = 0;
3779 static u64 xattr_default = 0;
3780 int scanned = 0;
3781
3782 if (!xattr_access) {
3783 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3784 strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3785 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3786 strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3787 }
3788
3789 slot++;
3790 *first_xattr_slot = -1;
3791 while (slot < nritems) {
3792 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3793
3794 /* We found a different objectid, there must be no ACLs. */
3795 if (found_key.objectid != objectid)
3796 return false;
3797
3798 /* We found an xattr, assume we've got an ACL. */
3799 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3800 if (*first_xattr_slot == -1)
3801 *first_xattr_slot = slot;
3802 if (found_key.offset == xattr_access ||
3803 found_key.offset == xattr_default)
3804 return true;
3805 }
3806
3807 /*
3808 * We found a key greater than an xattr key, there can't be any
3809 * ACLs later on.
3810 */
3811 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3812 return false;
3813
3814 slot++;
3815 scanned++;
3816
3817 /*
3818 * The item order goes like:
3819 * - inode
3820 * - inode backrefs
3821 * - xattrs
3822 * - extents,
3823 *
3824 * so if there are lots of hard links to an inode there can be
3825 * a lot of backrefs. Don't waste time searching too hard,
3826 * this is just an optimization.
3827 */
3828 if (scanned >= 8)
3829 break;
3830 }
3831 /*
3832 * We hit the end of the leaf before we found an xattr or something
3833 * larger than an xattr. We have to assume the inode has ACLs.
3834 */
3835 if (*first_xattr_slot == -1)
3836 *first_xattr_slot = slot;
3837 return true;
3838 }
3839
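/*
 * Allocate and initialize the inode's file extent tree, used to track which
 * ranges are covered by file extent items so disk_i_size can be updated
 * safely. Not needed on filesystems with the NO_HOLES feature, for
 * non-regular files or for the free space inode.
 */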
3840 static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
3841 {
3842 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3843
3844 if (WARN_ON_ONCE(inode->file_extent_tree))
3845 return 0;
3846 if (btrfs_fs_incompat(fs_info, NO_HOLES))
3847 return 0;
3848 if (!S_ISREG(inode->vfs_inode.i_mode))
3849 return 0;
3850 if (btrfs_is_free_space_inode(inode))
3851 return 0;
3852
3853 inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
3854 if (!inode->file_extent_tree)
3855 return -ENOMEM;
3856
3857 btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
3858 IO_TREE_INODE_FILE_EXTENT);
3859 /* Lockdep class is set only for the file extent tree. */
3860 lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
3861
3862 return 0;
3863 }
3864
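/*
 * Add the inode to its root's xarray of inodes, keyed by inode number. When
 * @prealloc is true, reserve the xarray entry first so that the final store
 * cannot fail with -ENOMEM.
 */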
3865 static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
3866 {
3867 struct btrfs_root *root = inode->root;
3868 struct btrfs_inode *existing;
3869 const u64 ino = btrfs_ino(inode);
3870 int ret;
3871
3872 if (inode_unhashed(&inode->vfs_inode))
3873 return 0;
3874
3875 if (prealloc) {
3876 ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
3877 if (ret)
3878 return ret;
3879 }
3880
3881 existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
3882
3883 if (xa_is_err(existing)) {
3884 ret = xa_err(existing);
3885 ASSERT(ret != -EINVAL);
3886 ASSERT(ret != -ENOMEM);
3887 return ret;
3888 } else if (existing) {
3889 WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
3890 }
3891
3892 return 0;
3893 }
3894
3895 /*
3896 * Read a locked inode from the btree into the in-memory inode and add it to
3897 * its root list/tree.
3898 *
3899 * On failure clean up the inode.
3900 */
3901 static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
3902 {
3903 struct btrfs_root *root = inode->root;
3904 struct btrfs_fs_info *fs_info = root->fs_info;
3905 struct extent_buffer *leaf;
3906 struct btrfs_inode_item *inode_item;
3907 struct inode *vfs_inode = &inode->vfs_inode;
3908 struct btrfs_key location;
3909 unsigned long ptr;
3910 int maybe_acls;
3911 u32 rdev;
3912 int ret;
3913 bool filled = false;
3914 int first_xattr_slot;
3915
3916 ret = btrfs_fill_inode(inode, &rdev);
3917 if (!ret)
3918 filled = true;
3919
3920 ASSERT(path);
3921
3922 btrfs_get_inode_key(inode, &location);
3923
3924 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3925 if (ret) {
3926 /*
3927 * ret > 0 can come from btrfs_search_slot called by
3928 * btrfs_lookup_inode(), this means the inode was not found.
3929 */
3930 if (ret > 0)
3931 ret = -ENOENT;
3932 goto out;
3933 }
3934
3935 leaf = path->nodes[0];
3936
3937 if (filled)
3938 goto cache_index;
3939
3940 inode_item = btrfs_item_ptr(leaf, path->slots[0],
3941 struct btrfs_inode_item);
3942 vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3943 set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
3944 i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
3945 i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
3946 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3947
3948 inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
3949 btrfs_timespec_nsec(leaf, &inode_item->atime));
3950
3951 inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
3952 btrfs_timespec_nsec(leaf, &inode_item->mtime));
3953
3954 inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3955 btrfs_timespec_nsec(leaf, &inode_item->ctime));
3956
3957 inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
3958 inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
3959
3960 inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
3961 inode->generation = btrfs_inode_generation(leaf, inode_item);
3962 inode->last_trans = btrfs_inode_transid(leaf, inode_item);
3963
3964 inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
3965 vfs_inode->i_generation = inode->generation;
3966 vfs_inode->i_rdev = 0;
3967 rdev = btrfs_inode_rdev(leaf, inode_item);
3968
3969 if (S_ISDIR(vfs_inode->i_mode))
3970 inode->index_cnt = (u64)-1;
3971
3972 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3973 &inode->flags, &inode->ro_flags);
3974 btrfs_update_inode_mapping_flags(inode);
3975 btrfs_set_inode_mapping_order(inode);
3976
3977 cache_index:
3978 ret = btrfs_init_file_extent_tree(inode);
3979 if (ret)
3980 goto out;
3981 btrfs_inode_set_file_extent_range(inode, 0,
3982 round_up(i_size_read(vfs_inode), fs_info->sectorsize));
3983 /*
3984 * If we were modified in the current generation and evicted from memory
3985 * and then re-read we need to do a full sync since we don't have any
3986 * idea about which extents were modified before we were evicted from
3987 * cache.
3988 *
3989 * This is required for both inode re-read from disk and delayed inode
3990 * in the delayed_nodes xarray.
3991 */
3992 if (inode->last_trans == btrfs_get_fs_generation(fs_info))
3993 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
3994
3995 /*
3996 * We don't persist the id of the transaction where an unlink operation
3997 * against the inode was last made. So here we assume the inode might
3998 * have been evicted, and therefore the exact value of last_unlink_trans
3999 * lost, and set it to last_trans to avoid metadata inconsistencies
4000 * between the inode and its parent if the inode is fsync'ed and the log
4001 * replayed. For example, in the scenario:
4002 *
4003 * touch mydir/foo
4004 * ln mydir/foo mydir/bar
4005 * sync
4006 * unlink mydir/bar
4007 * echo 2 > /proc/sys/vm/drop_caches # evicts inode
4008 * xfs_io -c fsync mydir/foo
4009 * <power failure>
4010 * mount fs, triggers fsync log replay
4011 *
4012 * We must make sure that when we fsync our inode foo we also log its
4013 * parent inode, otherwise after log replay the parent still has the
4014 * dentry with the "bar" name but our inode foo has a link count of 1
4015 * and doesn't have an inode ref with the name "bar" anymore.
4016 *
4017 * Setting last_unlink_trans to last_trans is a pessimistic approach,
4018 * but it guarantees correctness at the expense of occasional full
4019 * transaction commits on fsync if our inode is a directory, or if our
4020 * inode is not a directory, logging its parent unnecessarily.
4021 */
4022 inode->last_unlink_trans = inode->last_trans;
4023
4024 /*
4025 * Same logic as for last_unlink_trans. We don't persist the generation
4026 * of the last transaction where this inode was used for a reflink
4027 * operation, so after eviction and reloading the inode we must be
4028 * pessimistic and assume the last transaction that modified the inode.
4029 */
4030 inode->last_reflink_trans = inode->last_trans;
4031
4032 path->slots[0]++;
4033 if (vfs_inode->i_nlink != 1 ||
4034 path->slots[0] >= btrfs_header_nritems(leaf))
4035 goto cache_acl;
4036
4037 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
4038 if (location.objectid != btrfs_ino(inode))
4039 goto cache_acl;
4040
4041 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4042 if (location.type == BTRFS_INODE_REF_KEY) {
4043 struct btrfs_inode_ref *ref;
4044
4045 ref = (struct btrfs_inode_ref *)ptr;
4046 inode->dir_index = btrfs_inode_ref_index(leaf, ref);
4047 } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
4048 struct btrfs_inode_extref *extref;
4049
4050 extref = (struct btrfs_inode_extref *)ptr;
4051 inode->dir_index = btrfs_inode_extref_index(leaf, extref);
4052 }
4053 cache_acl:
4054 /*
4055 * try to precache a NULL acl entry for files that don't have
4056 * any xattrs or acls
4057 */
4058 maybe_acls = acls_after_inode_item(leaf, path->slots[0],
4059 btrfs_ino(inode), &first_xattr_slot);
4060 if (first_xattr_slot != -1) {
4061 path->slots[0] = first_xattr_slot;
4062 ret = btrfs_load_inode_props(inode, path);
4063 if (ret)
4064 btrfs_err(fs_info,
4065 "error loading props for ino %llu (root %llu): %d",
4066 btrfs_ino(inode), btrfs_root_id(root), ret);
4067 }
4068
4069 if (!maybe_acls)
4070 cache_no_acl(vfs_inode);
4071
4072 switch (vfs_inode->i_mode & S_IFMT) {
4073 case S_IFREG:
4074 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4075 vfs_inode->i_fop = &btrfs_file_operations;
4076 vfs_inode->i_op = &btrfs_file_inode_operations;
4077 break;
4078 case S_IFDIR:
4079 vfs_inode->i_fop = &btrfs_dir_file_operations;
4080 vfs_inode->i_op = &btrfs_dir_inode_operations;
4081 break;
4082 case S_IFLNK:
4083 vfs_inode->i_op = &btrfs_symlink_inode_operations;
4084 inode_nohighmem(vfs_inode);
4085 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4086 break;
4087 default:
4088 vfs_inode->i_op = &btrfs_special_inode_operations;
4089 init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
4090 break;
4091 }
4092
4093 btrfs_sync_inode_flags_to_i_flags(inode);
4094
4095 ret = btrfs_add_inode_to_root(inode, true);
4096 if (ret)
4097 goto out;
4098
4099 return 0;
4100 out:
4101 iget_failed(vfs_inode);
4102 return ret;
4103 }
4104
4105 /*
4106 * given a leaf and an inode, copy the inode fields into the leaf
4107 */
4108 static void fill_inode_item(struct btrfs_trans_handle *trans,
4109 struct extent_buffer *leaf,
4110 struct btrfs_inode_item *item,
4111 struct inode *inode)
4112 {
4113 u64 flags;
4114
4115 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
4116 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
4117 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
4118 btrfs_set_inode_mode(leaf, item, inode->i_mode);
4119 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
4120
4121 btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
4122 btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
4123
4124 btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
4125 btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
4126
4127 btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
4128 btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
4129
4130 btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
4131 btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4132
4133 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
4134 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
4135 btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
4136 btrfs_set_inode_transid(leaf, item, trans->transid);
4137 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
4138 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4139 BTRFS_I(inode)->ro_flags);
4140 btrfs_set_inode_flags(leaf, item, flags);
4141 btrfs_set_inode_block_group(leaf, item, 0);
4142 }
4143
4144 /*
4145 * copy everything in the in-memory inode into the btree.
4146 */
4147 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
4148 struct btrfs_inode *inode)
4149 {
4150 struct btrfs_inode_item *inode_item;
4151 BTRFS_PATH_AUTO_FREE(path);
4152 struct extent_buffer *leaf;
4153 struct btrfs_key key;
4154 int ret;
4155
4156 path = btrfs_alloc_path();
4157 if (!path)
4158 return -ENOMEM;
4159
4160 btrfs_get_inode_key(inode, &key);
4161 ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
4162 if (ret) {
4163 if (ret > 0)
4164 ret = -ENOENT;
4165 return ret;
4166 }
4167
4168 leaf = path->nodes[0];
4169 inode_item = btrfs_item_ptr(leaf, path->slots[0],
4170 struct btrfs_inode_item);
4171
4172 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4173 btrfs_set_inode_last_trans(trans, inode);
4174 return 0;
4175 }
4176
4177 /*
4178 * copy everything in the in-memory inode into the btree.
4179 */
4180 int btrfs_update_inode(struct btrfs_trans_handle *trans,
4181 struct btrfs_inode *inode)
4182 {
4183 struct btrfs_root *root = inode->root;
4184 struct btrfs_fs_info *fs_info = root->fs_info;
4185 int ret;
4186
4187 /*
4188 * If the inode is a free space inode, we can deadlock during commit
4189 * if we put it into the delayed code.
4190 *
4191 * The data relocation inode should also be directly updated
4192 * without delay
4193 */
4194 if (!btrfs_is_free_space_inode(inode)
4195 && !btrfs_is_data_reloc_root(root)
4196 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4197 btrfs_update_root_times(trans, root);
4198
4199 ret = btrfs_delayed_update_inode(trans, inode);
4200 if (!ret)
4201 btrfs_set_inode_last_trans(trans, inode);
4202 return ret;
4203 }
4204
4205 return btrfs_update_inode_item(trans, inode);
4206 }
4207
4208 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4209 struct btrfs_inode *inode)
4210 {
4211 int ret;
4212
4213 ret = btrfs_update_inode(trans, inode);
4214 if (ret == -ENOSPC)
4215 return btrfs_update_inode_item(trans, inode);
4216 return ret;
4217 }
4218
4219 static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
4220 {
4221 struct timespec64 now;
4222
4223 /*
4224 * If we are replaying a log tree, we do not want to update the mtime
4225 * and ctime of the parent directory with the current time, since the
4226 * log replay procedure is responsible for setting them to their correct
4227 * values (the ones it had when the fsync was done).
4228 */
4229 if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
4230 return;
4231
4232 now = inode_set_ctime_current(&dir->vfs_inode);
4233 inode_set_mtime_to_ts(&dir->vfs_inode, now);
4234 }
4235
4236 /*
4237 * unlink helper that gets used here in inode.c and in the tree logging
4238  * Unlink helper used here in inode.c and in the tree logging recovery code.
4239  * It removes a link in a directory with a given name, and also drops the
4240  * back refs from the inode to the directory.
4241 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4242 struct btrfs_inode *dir,
4243 struct btrfs_inode *inode,
4244 const struct fscrypt_str *name,
4245 struct btrfs_rename_ctx *rename_ctx)
4246 {
4247 struct btrfs_root *root = dir->root;
4248 struct btrfs_fs_info *fs_info = root->fs_info;
4249 struct btrfs_path *path;
4250 int ret = 0;
4251 struct btrfs_dir_item *di;
4252 u64 index;
4253 u64 ino = btrfs_ino(inode);
4254 u64 dir_ino = btrfs_ino(dir);
4255
4256 path = btrfs_alloc_path();
4257 if (!path)
4258 return -ENOMEM;
4259
4260 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4261 if (IS_ERR_OR_NULL(di)) {
4262 btrfs_free_path(path);
4263 return di ? PTR_ERR(di) : -ENOENT;
4264 }
4265 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4266 /*
4267 * Down the call chains below we'll also need to allocate a path, so no
4268 * need to hold on to this one for longer than necessary.
4269 */
4270 btrfs_free_path(path);
4271 if (ret)
4272 return ret;
4273
4274 /*
4275 	 * If we don't have the dir index cached, we have to get it by looking
4276 	 * up the inode ref. Since we then have the inode ref at hand, remove it
4277 	 * directly, there is no need for a delayed deletion.
4278 	 *
4279 	 * But if we do have the dir index cached, we don't need to search the
4280 	 * inode ref to get it. Since the inode ref is close to the inode item,
4281 	 * it is better to delay its deletion and do it when we update the
4282 	 * inode item.
4283 */
4284 if (inode->dir_index) {
4285 ret = btrfs_delayed_delete_inode_ref(inode);
4286 if (!ret) {
4287 index = inode->dir_index;
4288 goto skip_backref;
4289 }
4290 }
4291
4292 ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4293 if (unlikely(ret)) {
4294 btrfs_crit(fs_info,
4295 "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
4296 name->len, name->name, btrfs_root_id(root), ino, dir_ino);
4297 btrfs_abort_transaction(trans, ret);
4298 return ret;
4299 }
4300 skip_backref:
4301 if (rename_ctx)
4302 rename_ctx->index = index;
4303
4304 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4305 if (unlikely(ret)) {
4306 btrfs_abort_transaction(trans, ret);
4307 return ret;
4308 }
4309
4310 /*
4311 * If we are in a rename context, we don't need to update anything in the
4312 * log. That will be done later during the rename by btrfs_log_new_name().
4313 * Besides that, doing it here would only cause extra unnecessary btree
4314 * operations on the log tree, increasing latency for applications.
4315 */
4316 if (!rename_ctx) {
4317 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4318 btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4319 }
4320
4321 /*
4322 * If we have a pending delayed iput we could end up with the final iput
4323 * being run in btrfs-cleaner context. If we have enough of these built
4324 * up we can end up burning a lot of time in btrfs-cleaner without any
4325 * way to throttle the unlinks. Since we're currently holding a ref on
4326 * the inode we can run the delayed iput here without any issues as the
4327 * final iput won't be done until after we drop the ref we're currently
4328 * holding.
4329 */
4330 btrfs_run_delayed_iput(fs_info, inode);
4331
4332 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4333 inode_inc_iversion(&inode->vfs_inode);
4334 inode_set_ctime_current(&inode->vfs_inode);
4335 inode_inc_iversion(&dir->vfs_inode);
4336 update_time_after_link_or_unlink(dir);
4337
4338 return btrfs_update_inode(trans, dir);
4339 }
4340
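/*
 * Unlink an inode from a directory and, on success, drop the inode's link
 * count and update its inode item.
 */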
4341 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4342 struct btrfs_inode *dir, struct btrfs_inode *inode,
4343 const struct fscrypt_str *name)
4344 {
4345 int ret;
4346
4347 ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4348 if (!ret) {
4349 drop_nlink(&inode->vfs_inode);
4350 ret = btrfs_update_inode(trans, inode);
4351 }
4352 return ret;
4353 }
4354
4355 /*
4356 * helper to start transaction for unlink and rmdir.
4357 *
4358 * unlink and rmdir are special in btrfs, they do not always free space, so
4359 * if we cannot make our reservations the normal way try and see if there is
4360 * plenty of slack room in the global reserve to migrate, otherwise we cannot
4361 * allow the unlink to occur.
4362 */
4363 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4364 {
4365 struct btrfs_root *root = dir->root;
4366
4367 return btrfs_start_transaction_fallback_global_rsv(root,
4368 BTRFS_UNLINK_METADATA_UNITS);
4369 }
4370
4371 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4372 {
4373 struct btrfs_trans_handle *trans;
4374 struct inode *inode = d_inode(dentry);
4375 int ret;
4376 struct fscrypt_name fname;
4377
4378 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4379 if (ret)
4380 return ret;
4381
4382 /* This needs to handle no-key deletions later on */
4383
4384 trans = __unlink_start_trans(BTRFS_I(dir));
4385 if (IS_ERR(trans)) {
4386 ret = PTR_ERR(trans);
4387 goto fscrypt_free;
4388 }
4389
4390 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4391 false);
4392
4393 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4394 &fname.disk_name);
4395 if (ret)
4396 goto end_trans;
4397
4398 if (inode->i_nlink == 0) {
4399 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4400 if (ret)
4401 goto end_trans;
4402 }
4403
4404 end_trans:
4405 btrfs_end_transaction(trans);
4406 btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4407 fscrypt_free:
4408 fscrypt_free_filename(&fname);
4409 return ret;
4410 }
4411
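/*
 * Remove the directory entry pointing to a subvolume, or to the empty
 * placeholder directory of a snapshotted subvolume, together with the
 * matching root ref (or dir index item for the placeholder case).
 */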
4412 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4413 struct btrfs_inode *dir, struct dentry *dentry)
4414 {
4415 struct btrfs_root *root = dir->root;
4416 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4417 struct btrfs_path *path;
4418 struct extent_buffer *leaf;
4419 struct btrfs_dir_item *di;
4420 struct btrfs_key key;
4421 u64 index;
4422 int ret;
4423 u64 objectid;
4424 u64 dir_ino = btrfs_ino(dir);
4425 struct fscrypt_name fname;
4426
4427 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4428 if (ret)
4429 return ret;
4430
4431 /* This needs to handle no-key deletions later on */
4432
4433 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4434 objectid = btrfs_root_id(inode->root);
4435 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4436 objectid = inode->ref_root_id;
4437 } else {
4438 WARN_ON(1);
4439 fscrypt_free_filename(&fname);
4440 return -EINVAL;
4441 }
4442
4443 path = btrfs_alloc_path();
4444 if (!path) {
4445 ret = -ENOMEM;
4446 goto out;
4447 }
4448
4449 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4450 &fname.disk_name, -1);
4451 if (IS_ERR_OR_NULL(di)) {
4452 ret = di ? PTR_ERR(di) : -ENOENT;
4453 goto out;
4454 }
4455
4456 leaf = path->nodes[0];
4457 btrfs_dir_item_key_to_cpu(leaf, di, &key);
4458 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4459 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4460 if (unlikely(ret)) {
4461 btrfs_abort_transaction(trans, ret);
4462 goto out;
4463 }
4464 btrfs_release_path(path);
4465
4466 /*
4467 * This is a placeholder inode for a subvolume we didn't have a
4468 * reference to at the time of the snapshot creation. In the meantime
4469 * we could have renamed the real subvol link into our snapshot, so
4470 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4471 * Instead simply lookup the dir_index_item for this entry so we can
4472 * remove it. Otherwise we know we have a ref to the root and we can
4473 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4474 */
4475 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4476 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4477 if (IS_ERR(di)) {
4478 ret = PTR_ERR(di);
4479 btrfs_abort_transaction(trans, ret);
4480 goto out;
4481 }
4482
4483 leaf = path->nodes[0];
4484 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4485 index = key.offset;
4486 btrfs_release_path(path);
4487 } else {
4488 ret = btrfs_del_root_ref(trans, objectid,
4489 btrfs_root_id(root), dir_ino,
4490 &index, &fname.disk_name);
4491 if (unlikely(ret)) {
4492 btrfs_abort_transaction(trans, ret);
4493 goto out;
4494 }
4495 }
4496
4497 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4498 if (unlikely(ret)) {
4499 btrfs_abort_transaction(trans, ret);
4500 goto out;
4501 }
4502
4503 btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4504 inode_inc_iversion(&dir->vfs_inode);
4505 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4506 ret = btrfs_update_inode_fallback(trans, dir);
4507 if (ret)
4508 btrfs_abort_transaction(trans, ret);
4509 out:
4510 btrfs_free_path(path);
4511 fscrypt_free_filename(&fname);
4512 return ret;
4513 }
4514
4515 /*
4516 * Helper to check if the subvolume references other subvolumes or if it's
4517 * default.
4518 */
4519 static noinline int may_destroy_subvol(struct btrfs_root *root)
4520 {
4521 struct btrfs_fs_info *fs_info = root->fs_info;
4522 BTRFS_PATH_AUTO_FREE(path);
4523 struct btrfs_dir_item *di;
4524 struct btrfs_key key;
4525 struct fscrypt_str name = FSTR_INIT("default", 7);
4526 u64 dir_id;
4527 int ret;
4528
4529 path = btrfs_alloc_path();
4530 if (!path)
4531 return -ENOMEM;
4532
4533 /* Make sure this root isn't set as the default subvol */
4534 dir_id = btrfs_super_root_dir(fs_info->super_copy);
4535 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4536 dir_id, &name, 0);
4537 if (di && !IS_ERR(di)) {
4538 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4539 if (key.objectid == btrfs_root_id(root)) {
4540 ret = -EPERM;
4541 btrfs_err(fs_info,
4542 "deleting default subvolume %llu is not allowed",
4543 key.objectid);
4544 return ret;
4545 }
4546 btrfs_release_path(path);
4547 }
4548
4549 key.objectid = btrfs_root_id(root);
4550 key.type = BTRFS_ROOT_REF_KEY;
4551 key.offset = (u64)-1;
4552
4553 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4554 if (ret < 0)
4555 return ret;
4556 if (unlikely(ret == 0)) {
4557 /*
4558 * Key with offset -1 found, there would have to exist a root
4559 * with such id, but this is out of valid range.
4560 */
4561 return -EUCLEAN;
4562 }
4563
4564 ret = 0;
4565 if (path->slots[0] > 0) {
4566 path->slots[0]--;
4567 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4568 if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
4569 ret = -ENOTEMPTY;
4570 }
4571
4572 return ret;
4573 }
4574
4575 /* Delete all dentries for inodes belonging to the root */
4576 static void btrfs_prune_dentries(struct btrfs_root *root)
4577 {
4578 struct btrfs_fs_info *fs_info = root->fs_info;
4579 struct btrfs_inode *inode;
4580 u64 min_ino = 0;
4581
4582 if (!BTRFS_FS_ERROR(fs_info))
4583 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4584
4585 inode = btrfs_find_first_inode(root, min_ino);
4586 while (inode) {
4587 if (icount_read(&inode->vfs_inode) > 1)
4588 d_prune_aliases(&inode->vfs_inode);
4589
4590 min_ino = btrfs_ino(inode) + 1;
4591 /*
4592 * btrfs_drop_inode() will have it removed from the inode
4593 * cache when its usage count hits zero.
4594 */
4595 iput(&inode->vfs_inode);
4596 cond_resched();
4597 inode = btrfs_find_first_inode(root, min_ino);
4598 }
4599 }
4600
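/*
 * Delete a subvolume or snapshot: mark the root as dead, remove its directory
 * entry, set its refs to zero and insert an orphan item so that the cleaner
 * thread can drop the tree later.
 */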
4601 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4602 {
4603 struct btrfs_root *root = dir->root;
4604 struct btrfs_fs_info *fs_info = root->fs_info;
4605 struct inode *inode = d_inode(dentry);
4606 struct btrfs_root *dest = BTRFS_I(inode)->root;
4607 struct btrfs_trans_handle *trans;
4608 struct btrfs_block_rsv block_rsv;
4609 u64 root_flags;
4610 u64 qgroup_reserved = 0;
4611 int ret;
4612
4613 down_write(&fs_info->subvol_sem);
4614
4615 /*
4616 	 * Don't allow deleting a subvolume while a send is in progress. This is
4617 * inside the inode lock so the error handling that has to drop the bit
4618 * again is not run concurrently.
4619 */
4620 spin_lock(&dest->root_item_lock);
4621 if (dest->send_in_progress) {
4622 spin_unlock(&dest->root_item_lock);
4623 btrfs_warn(fs_info,
4624 "attempt to delete subvolume %llu during send",
4625 btrfs_root_id(dest));
4626 ret = -EPERM;
4627 goto out_up_write;
4628 }
4629 if (atomic_read(&dest->nr_swapfiles)) {
4630 spin_unlock(&dest->root_item_lock);
4631 btrfs_warn(fs_info,
4632 "attempt to delete subvolume %llu with active swapfile",
4633 btrfs_root_id(root));
4634 ret = -EPERM;
4635 goto out_up_write;
4636 }
4637 root_flags = btrfs_root_flags(&dest->root_item);
4638 btrfs_set_root_flags(&dest->root_item,
4639 root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4640 spin_unlock(&dest->root_item_lock);
4641
4642 ret = may_destroy_subvol(dest);
4643 if (ret)
4644 goto out_undead;
4645
4646 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4647 /*
4648 * One for dir inode,
4649 * two for dir entries,
4650 * two for root ref/backref.
4651 */
4652 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4653 if (ret)
4654 goto out_undead;
4655 qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4656
4657 trans = btrfs_start_transaction(root, 0);
4658 if (IS_ERR(trans)) {
4659 ret = PTR_ERR(trans);
4660 goto out_release;
4661 }
4662 btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4663 qgroup_reserved = 0;
4664 trans->block_rsv = &block_rsv;
4665 trans->bytes_reserved = block_rsv.size;
4666
4667 btrfs_record_snapshot_destroy(trans, dir);
4668
4669 ret = btrfs_unlink_subvol(trans, dir, dentry);
4670 if (unlikely(ret)) {
4671 btrfs_abort_transaction(trans, ret);
4672 goto out_end_trans;
4673 }
4674
4675 ret = btrfs_record_root_in_trans(trans, dest);
4676 if (unlikely(ret)) {
4677 btrfs_abort_transaction(trans, ret);
4678 goto out_end_trans;
4679 }
4680
4681 memset(&dest->root_item.drop_progress, 0,
4682 sizeof(dest->root_item.drop_progress));
4683 btrfs_set_root_drop_level(&dest->root_item, 0);
4684 btrfs_set_root_refs(&dest->root_item, 0);
4685
4686 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4687 ret = btrfs_insert_orphan_item(trans,
4688 fs_info->tree_root,
4689 btrfs_root_id(dest));
4690 if (unlikely(ret)) {
4691 btrfs_abort_transaction(trans, ret);
4692 goto out_end_trans;
4693 }
4694 }
4695
4696 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4697 BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
4698 if (unlikely(ret && ret != -ENOENT)) {
4699 btrfs_abort_transaction(trans, ret);
4700 goto out_end_trans;
4701 }
4702 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4703 ret = btrfs_uuid_tree_remove(trans,
4704 dest->root_item.received_uuid,
4705 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4706 btrfs_root_id(dest));
4707 if (unlikely(ret && ret != -ENOENT)) {
4708 btrfs_abort_transaction(trans, ret);
4709 goto out_end_trans;
4710 }
4711 }
4712
4713 free_anon_bdev(dest->anon_dev);
4714 dest->anon_dev = 0;
4715 out_end_trans:
4716 trans->block_rsv = NULL;
4717 trans->bytes_reserved = 0;
4718 ret = btrfs_end_transaction(trans);
4719 inode->i_flags |= S_DEAD;
4720 out_release:
4721 btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4722 if (qgroup_reserved)
4723 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4724 out_undead:
4725 if (ret) {
4726 spin_lock(&dest->root_item_lock);
4727 root_flags = btrfs_root_flags(&dest->root_item);
4728 btrfs_set_root_flags(&dest->root_item,
4729 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4730 spin_unlock(&dest->root_item_lock);
4731 }
4732 out_up_write:
4733 up_write(&fs_info->subvol_sem);
4734 if (!ret) {
4735 d_invalidate(dentry);
4736 btrfs_prune_dentries(dest);
4737 ASSERT(dest->send_in_progress == 0);
4738 }
4739
4740 return ret;
4741 }
4742
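/* The rmdir callback: either delete a whole subvolume or unlink an empty directory. */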
4743 static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
4744 {
4745 struct btrfs_inode *dir = BTRFS_I(vfs_dir);
4746 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4747 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4748 int ret = 0;
4749 struct btrfs_trans_handle *trans;
4750 struct fscrypt_name fname;
4751
4752 if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE)
4753 return -ENOTEMPTY;
4754 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4755 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4756 btrfs_err(fs_info,
4757 "extent tree v2 doesn't support snapshot deletion yet");
4758 return -EOPNOTSUPP;
4759 }
4760 return btrfs_delete_subvolume(dir, dentry);
4761 }
4762
4763 ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname);
4764 if (ret)
4765 return ret;
4766
4767 /* This needs to handle no-key deletions later on */
4768
4769 trans = __unlink_start_trans(dir);
4770 if (IS_ERR(trans)) {
4771 ret = PTR_ERR(trans);
4772 goto out_notrans;
4773 }
4774
4775 /*
4776 * Propagate the last_unlink_trans value of the deleted dir to its
4777 * parent directory. This is to prevent an unrecoverable log tree in the
4778 * case we do something like this:
4779 * 1) create dir foo
4780 * 2) create snapshot under dir foo
4781 * 3) delete the snapshot
4782 * 4) rmdir foo
4783 * 5) mkdir foo
4784 * 6) fsync foo or some file inside foo
4785 *
4786 * This is because we can't unlink other roots when replaying the dir
4787 * deletes for directory foo.
4788 */
4789 if (inode->last_unlink_trans >= trans->transid)
4790 btrfs_record_snapshot_destroy(trans, dir);
4791
4792 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4793 ret = btrfs_unlink_subvol(trans, dir, dentry);
4794 goto out;
4795 }
4796
4797 ret = btrfs_orphan_add(trans, inode);
4798 if (ret)
4799 goto out;
4800
4801 /* now the directory is empty */
4802 ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
4803 if (!ret)
4804 btrfs_i_size_write(inode, 0);
4805 out:
4806 btrfs_end_transaction(trans);
4807 out_notrans:
4808 btrfs_btree_balance_dirty(fs_info);
4809 fscrypt_free_filename(&fname);
4810
4811 return ret;
4812 }
4813
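/* Return true if @bytenr falls inside the block starting at @blockstart. */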
4814 static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize)
4815 {
4816 ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u",
4817 blockstart, blocksize);
4818
4819 if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1)
4820 return true;
4821 return false;
4822 }
4823
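/*
 * Zero the part of the folio containing @start, from @start to the end of the
 * folio. Used when the block size is smaller than the page size to clear any
 * blocks beyond EOF that may contain stale data.
 */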
4824 static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start)
4825 {
4826 const pgoff_t index = (start >> PAGE_SHIFT);
4827 struct address_space *mapping = inode->vfs_inode.i_mapping;
4828 struct folio *folio;
4829 u64 zero_start;
4830 u64 zero_end;
4831 int ret = 0;
4832
4833 again:
4834 folio = filemap_lock_folio(mapping, index);
4835 /* No folio present. */
4836 if (IS_ERR(folio))
4837 return 0;
4838
4839 if (!folio_test_uptodate(folio)) {
4840 ret = btrfs_read_folio(NULL, folio);
4841 folio_lock(folio);
4842 if (folio->mapping != mapping) {
4843 folio_unlock(folio);
4844 folio_put(folio);
4845 goto again;
4846 }
4847 if (unlikely(!folio_test_uptodate(folio))) {
4848 ret = -EIO;
4849 goto out_unlock;
4850 }
4851 }
4852 folio_wait_writeback(folio);
4853
4854 /*
4855 * We do not need to lock extents nor wait for OE, as it's already
4856 * beyond EOF.
4857 */
4858
4859 zero_start = max_t(u64, folio_pos(folio), start);
4860 zero_end = folio_end(folio);
4861 folio_zero_range(folio, zero_start - folio_pos(folio),
4862 zero_end - zero_start);
4863
4864 out_unlock:
4865 folio_unlock(folio);
4866 folio_put(folio);
4867 return ret;
4868 }
4869
4870 /*
4871 * Handle the truncation of a fs block.
4872 *
4873 * @inode - inode that we're zeroing
4874 * @offset - the file offset of the block to truncate
4875 * The value must be inside [@start, @end], and the function will do
4876 * extra checks if the block that covers @offset needs to be zeroed.
4877 * @start - the start file offset of the range we want to zero
4878 * @end - the end (inclusive) file offset of the range we want to zero.
4879 *
4880 * If the range is not block aligned, read out the folio that covers @offset,
4881  * and if needed zero blocks that are inside the folio and covered by [@start, @end].
4882 * If @start or @end + 1 lands inside a block, that block will be marked dirty
4883 * for writeback.
4884 *
4885 * This is utilized by hole punch, zero range, file expansion.
4886 */
4887 int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
4888 {
4889 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4890 struct address_space *mapping = inode->vfs_inode.i_mapping;
4891 struct extent_io_tree *io_tree = &inode->io_tree;
4892 struct btrfs_ordered_extent *ordered;
4893 struct extent_state *cached_state = NULL;
4894 struct extent_changeset *data_reserved = NULL;
4895 bool only_release_metadata = false;
4896 u32 blocksize = fs_info->sectorsize;
4897 pgoff_t index = (offset >> PAGE_SHIFT);
4898 struct folio *folio;
4899 gfp_t mask = btrfs_alloc_write_mask(mapping);
4900 int ret = 0;
4901 const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
4902 blocksize);
4903 const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
4904 blocksize);
4905 bool need_truncate_head = false;
4906 bool need_truncate_tail = false;
4907 u64 zero_start;
4908 u64 zero_end;
4909 u64 block_start;
4910 u64 block_end;
4911
4912 /* @offset should be inside the range. */
4913 ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
4914 offset, start, end);
4915
4916 /* The range is aligned at both ends. */
4917 if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) {
4918 /*
4919 * For the block size < page size case, we may have polluted blocks
4920 * beyond EOF. So we also need to zero them out.
4921 */
4922 if (end == (u64)-1 && blocksize < PAGE_SIZE)
4923 ret = truncate_block_zero_beyond_eof(inode, start);
4924 goto out;
4925 }
4926
4927 /*
4928 * @offset may not be inside the head nor tail block. In that case we
4929 * don't need to do anything.
4930 */
4931 if (!in_head_block && !in_tail_block)
4932 goto out;
4933
4934 /*
4935 * Skip the truncation if the range in the target block is already aligned.
4936 * The seemingly complex check will also handle the same block case.
4937 */
4938 if (in_head_block && !IS_ALIGNED(start, blocksize))
4939 need_truncate_head = true;
4940 if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
4941 need_truncate_tail = true;
4942 if (!need_truncate_head && !need_truncate_tail)
4943 goto out;
4944
4945 block_start = round_down(offset, blocksize);
4946 block_end = block_start + blocksize - 1;
4947
4948 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4949 blocksize, false);
4950 if (ret < 0) {
4951 size_t write_bytes = blocksize;
4952
4953 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4954 /* For nocow case, no need to reserve data space. */
4955 ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
4956 write_bytes, blocksize);
4957 only_release_metadata = true;
4958 } else {
4959 goto out;
4960 }
4961 }
4962 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4963 if (ret < 0) {
4964 if (!only_release_metadata)
4965 btrfs_free_reserved_data_space(inode, data_reserved,
4966 block_start, blocksize);
4967 goto out;
4968 }
4969 again:
4970 folio = __filemap_get_folio(mapping, index,
4971 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
4972 if (IS_ERR(folio)) {
4973 if (only_release_metadata)
4974 btrfs_delalloc_release_metadata(inode, blocksize, true);
4975 else
4976 btrfs_delalloc_release_space(inode, data_reserved,
4977 block_start, blocksize, true);
4978 btrfs_delalloc_release_extents(inode, blocksize);
4979 ret = PTR_ERR(folio);
4980 goto out;
4981 }
4982
4983 if (!folio_test_uptodate(folio)) {
4984 ret = btrfs_read_folio(NULL, folio);
4985 folio_lock(folio);
4986 if (folio->mapping != mapping) {
4987 folio_unlock(folio);
4988 folio_put(folio);
4989 goto again;
4990 }
4991 if (unlikely(!folio_test_uptodate(folio))) {
4992 ret = -EIO;
4993 goto out_unlock;
4994 }
4995 }
4996
4997 /*
4998 * We unlock the page after the io is completed and then re-lock it
4999 * above. release_folio() could have come in between that and cleared
5000 * folio private, but left the page in the mapping. Set the page mapped
5001 * here to make sure it's properly set for the subpage stuff.
5002 */
5003 ret = set_folio_extent_mapped(folio);
5004 if (ret < 0)
5005 goto out_unlock;
5006
5007 folio_wait_writeback(folio);
5008
5009 btrfs_lock_extent(io_tree, block_start, block_end, &cached_state);
5010
5011 ordered = btrfs_lookup_ordered_extent(inode, block_start);
5012 if (ordered) {
5013 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
5014 folio_unlock(folio);
5015 folio_put(folio);
5016 btrfs_start_ordered_extent(ordered);
5017 btrfs_put_ordered_extent(ordered);
5018 goto again;
5019 }
5020
5021 btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
5022 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
5023 &cached_state);
5024
5025 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
5026 &cached_state);
5027 if (ret) {
5028 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
5029 goto out_unlock;
5030 }
5031
5032 if (end == (u64)-1) {
5033 /*
5034 * We're truncating beyond EOF, so the remaining blocks are normally
5035 * already holes and there is no need to zero them again. However, when
5036 * the fs block size is smaller than the page size, memory mapped writes
5037 * can pollute ranges beyond EOF.
5038 *
5039 * Although such polluted blocks beyond EOF will never reach disk, they
5040 * still affect our page cache.
5041 */
5042 zero_start = max_t(u64, folio_pos(folio), start);
5043 zero_end = min_t(u64, folio_end(folio) - 1, end);
5044 } else {
5045 zero_start = max_t(u64, block_start, start);
5046 zero_end = min_t(u64, block_end, end);
5047 }
5048 folio_zero_range(folio, zero_start - folio_pos(folio),
5049 zero_end - zero_start + 1);
5050
5051 btrfs_folio_clear_checked(fs_info, folio, block_start,
5052 block_end + 1 - block_start);
5053 btrfs_folio_set_dirty(fs_info, folio, block_start,
5054 block_end + 1 - block_start);
5055
5056 if (only_release_metadata)
5057 btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
5058 EXTENT_NORESERVE, &cached_state);
5059
5060 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
5061
5062 out_unlock:
5063 if (ret) {
5064 if (only_release_metadata)
5065 btrfs_delalloc_release_metadata(inode, blocksize, true);
5066 else
5067 btrfs_delalloc_release_space(inode, data_reserved,
5068 block_start, blocksize, true);
5069 }
5070 btrfs_delalloc_release_extents(inode, blocksize);
5071 folio_unlock(folio);
5072 folio_put(folio);
5073 out:
5074 if (only_release_metadata)
5075 btrfs_check_nocow_unlock(inode);
5076 extent_changeset_free(data_reserved);
5077 return ret;
5078 }
5079
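/*
 * Insert a hole file extent item for [offset, offset + len) in the subvolume
 * tree, dropping whatever currently overlaps that range, and update the
 * inode. With the NO_HOLES incompat feature there is no on-disk hole item to
 * insert, so this is a no-op.
 */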
5080 static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
5081 {
5082 struct btrfs_root *root = inode->root;
5083 struct btrfs_fs_info *fs_info = root->fs_info;
5084 struct btrfs_trans_handle *trans;
5085 struct btrfs_drop_extents_args drop_args = { 0 };
5086 int ret;
5087
5088 /*
5089 * If NO_HOLES is enabled, we don't need to do anything.
5090 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
5091 * or btrfs_update_inode() will be called, which guarantees that the next
5092 * fsync will know this inode was changed and needs to be logged.
5093 */
5094 if (btrfs_fs_incompat(fs_info, NO_HOLES))
5095 return 0;
5096
5097 /*
5098 * 1 - for the one we're dropping
5099 * 1 - for the one we're adding
5100 * 1 - for updating the inode.
5101 */
5102 trans = btrfs_start_transaction(root, 3);
5103 if (IS_ERR(trans))
5104 return PTR_ERR(trans);
5105
5106 drop_args.start = offset;
5107 drop_args.end = offset + len;
5108 drop_args.drop_cache = true;
5109
5110 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
5111 if (unlikely(ret)) {
5112 btrfs_abort_transaction(trans, ret);
5113 btrfs_end_transaction(trans);
5114 return ret;
5115 }
5116
5117 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
5118 if (ret) {
5119 btrfs_abort_transaction(trans, ret);
5120 } else {
5121 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
5122 btrfs_update_inode(trans, inode);
5123 }
5124 btrfs_end_transaction(trans);
5125 return ret;
5126 }
5127
5128 /*
5129 * This function puts in dummy file extents for the area we're creating a hole
5130 * for. So if we are truncating this file to a larger size we need to insert
5131 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
5132 * the range between oldsize and size.
5133 */
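/*
 * Illustrative example with assumed values (sectorsize 4096): expanding a
 * file from oldsize = 5000 to size = 20000 first zeroes from offset 5000 to
 * the end of the block [4096, 8191] via btrfs_truncate_block(), then inserts
 * hole file extent items (unless NO_HOLES is enabled) and hole extent maps
 * covering [8192, 20480), since hole_start = ALIGN(5000, 4096) = 8192 and
 * block_end = ALIGN(20000, 4096) = 20480.
 */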
5134 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
5135 {
5136 struct btrfs_root *root = inode->root;
5137 struct btrfs_fs_info *fs_info = root->fs_info;
5138 struct extent_io_tree *io_tree = &inode->io_tree;
5139 struct extent_map *em = NULL;
5140 struct extent_state *cached_state = NULL;
5141 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
5142 u64 block_end = ALIGN(size, fs_info->sectorsize);
5143 u64 last_byte;
5144 u64 cur_offset;
5145 u64 hole_size;
5146 int ret = 0;
5147
5148 /*
5149 * If our size started in the middle of a block we need to zero out the
5150 * rest of the block before we expand the i_size, otherwise we could
5151 * expose stale data.
5152 */
5153 ret = btrfs_truncate_block(inode, oldsize, oldsize, -1);
5154 if (ret)
5155 return ret;
5156
5157 if (size <= hole_start)
5158 return 0;
5159
5160 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
5161 &cached_state);
5162 cur_offset = hole_start;
5163 while (1) {
5164 em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
5165 if (IS_ERR(em)) {
5166 ret = PTR_ERR(em);
5167 em = NULL;
5168 break;
5169 }
5170 last_byte = min(btrfs_extent_map_end(em), block_end);
5171 last_byte = ALIGN(last_byte, fs_info->sectorsize);
5172 hole_size = last_byte - cur_offset;
5173
5174 if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
5175 struct extent_map *hole_em;
5176
5177 ret = maybe_insert_hole(inode, cur_offset, hole_size);
5178 if (ret)
5179 break;
5180
5181 ret = btrfs_inode_set_file_extent_range(inode,
5182 cur_offset, hole_size);
5183 if (ret)
5184 break;
5185
5186 hole_em = btrfs_alloc_extent_map();
5187 if (!hole_em) {
5188 btrfs_drop_extent_map_range(inode, cur_offset,
5189 cur_offset + hole_size - 1,
5190 false);
5191 btrfs_set_inode_full_sync(inode);
5192 goto next;
5193 }
5194 hole_em->start = cur_offset;
5195 hole_em->len = hole_size;
5196
5197 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
5198 hole_em->disk_num_bytes = 0;
5199 hole_em->ram_bytes = hole_size;
5200 hole_em->generation = btrfs_get_fs_generation(fs_info);
5201
5202 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
5203 btrfs_free_extent_map(hole_em);
5204 } else {
5205 ret = btrfs_inode_set_file_extent_range(inode,
5206 cur_offset, hole_size);
5207 if (ret)
5208 break;
5209 }
5210 next:
5211 btrfs_free_extent_map(em);
5212 em = NULL;
5213 cur_offset = last_byte;
5214 if (cur_offset >= block_end)
5215 break;
5216 }
5217 btrfs_free_extent_map(em);
5218 btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
5219 return ret;
5220 }
5221
5222 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5223 {
5224 struct btrfs_root *root = BTRFS_I(inode)->root;
5225 struct btrfs_trans_handle *trans;
5226 loff_t oldsize = i_size_read(inode);
5227 loff_t newsize = attr->ia_size;
5228 int mask = attr->ia_valid;
5229 int ret;
5230
5231 /*
5232 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5233 * special case where we need to update the times despite not having
5234 * these flags set. For all other operations the VFS sets these flags
5235 * explicitly if it wants a timestamp update.
5236 */
5237 if (newsize != oldsize) {
5238 inode_inc_iversion(inode);
5239 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
5240 inode_set_mtime_to_ts(inode,
5241 inode_set_ctime_current(inode));
5242 }
5243 }
5244
5245 if (newsize > oldsize) {
5246 /*
5247 * Don't do an expanding truncate while snapshotting is ongoing.
5248 * This is to ensure the snapshot captures a fully consistent
5249 * state of this file - if the snapshot captures this expanding
5250 * truncation, it must capture all writes that happened before
5251 * this truncation.
5252 */
5253 btrfs_drew_write_lock(&root->snapshot_lock);
5254 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5255 if (ret) {
5256 btrfs_drew_write_unlock(&root->snapshot_lock);
5257 return ret;
5258 }
5259
5260 trans = btrfs_start_transaction(root, 1);
5261 if (IS_ERR(trans)) {
5262 btrfs_drew_write_unlock(&root->snapshot_lock);
5263 return PTR_ERR(trans);
5264 }
5265
5266 i_size_write(inode, newsize);
5267 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5268 pagecache_isize_extended(inode, oldsize, newsize);
5269 ret = btrfs_update_inode(trans, BTRFS_I(inode));
5270 btrfs_drew_write_unlock(&root->snapshot_lock);
5271 btrfs_end_transaction(trans);
5272 } else {
5273 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
5274
5275 if (btrfs_is_zoned(fs_info)) {
5276 ret = btrfs_wait_ordered_range(BTRFS_I(inode),
5277 ALIGN(newsize, fs_info->sectorsize),
5278 (u64)-1);
5279 if (ret)
5280 return ret;
5281 }
5282
5283 /*
5284 * We're truncating a file that used to have good data down to
5285 * zero. Make sure any new writes to the file get on disk
5286 * on close.
5287 */
5288 if (newsize == 0)
5289 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5290 &BTRFS_I(inode)->runtime_flags);
5291
5292 truncate_setsize(inode, newsize);
5293
5294 inode_dio_wait(inode);
5295
5296 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5297 if (ret && inode->i_nlink) {
5298 int ret2;
5299
5300 /*
5301 * Truncate failed, so fix up the in-memory size. We
5302 * adjusted disk_i_size down as we removed extents, so
5303 * wait for disk_i_size to be stable and then update the
5304 * in-memory size to match.
5305 */
5306 ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
5307 if (ret2)
5308 return ret2;
5309 i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5310 }
5311 }
5312
5313 return ret;
5314 }
5315
5316 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5317 struct iattr *attr)
5318 {
5319 struct inode *inode = d_inode(dentry);
5320 struct btrfs_root *root = BTRFS_I(inode)->root;
5321 int ret;
5322
5323 if (btrfs_root_readonly(root))
5324 return -EROFS;
5325
5326 ret = setattr_prepare(idmap, dentry, attr);
5327 if (ret)
5328 return ret;
5329
5330 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5331 ret = btrfs_setsize(inode, attr);
5332 if (ret)
5333 return ret;
5334 }
5335
5336 if (attr->ia_valid) {
5337 setattr_copy(idmap, inode, attr);
5338 inode_inc_iversion(inode);
5339 ret = btrfs_dirty_inode(BTRFS_I(inode));
5340
5341 if (!ret && attr->ia_valid & ATTR_MODE)
5342 ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
5343 }
5344
5345 return ret;
5346 }
5347
5348 /*
5349 * While truncating the inode pages during eviction, we get the VFS
5350 * calling btrfs_invalidate_folio() against each folio of the inode. This
5351 * is slow because the calls to btrfs_invalidate_folio() result in a
5352 * huge amount of calls to lock_extent() and clear_extent_bit(),
5353 * which keep merging and splitting extent_state structures over and over,
5354 * wasting lots of time.
5355 *
5356 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5357 * skip all those expensive operations on a per folio basis and do only
5358 * the ordered io finishing, while we release here the extent_map and
5359 * extent_state structures, without the excessive merging and splitting.
5360 */
5361 static void evict_inode_truncate_pages(struct inode *inode)
5362 {
5363 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5364 struct rb_node *node;
5365
5366 ASSERT(inode->i_state & I_FREEING);
5367 truncate_inode_pages_final(&inode->i_data);
5368
5369 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5370
5371 /*
5372 * Keep looping until we have no more ranges in the io tree.
5373 * We can have ongoing bios started by readahead that have
5374 * their endio callback (extent_io.c:end_bio_extent_readpage)
5375 * still in progress (they have unlocked the pages in the bio but have not
5376 * yet unlocked the ranges in the io tree). This means some
5377 * ranges can still be locked and eviction started because before
5378 * submitting those bios, which are executed by a separate task (work
5379 * queue kthread), inode references (inode->i_count) were not taken
5380 * (which would be dropped in the end io callback of each bio).
5381 * Therefore here we effectively end up waiting for those bios and
5382 * anyone else holding locked ranges without having bumped the inode's
5383 * reference count - if we don't do it, when they access the inode's
5384 * io_tree to unlock a range it may be too late, leading to a
5385 * use-after-free issue.
5386 */
5387 spin_lock(&io_tree->lock);
5388 while (!RB_EMPTY_ROOT(&io_tree->state)) {
5389 struct extent_state *state;
5390 struct extent_state *cached_state = NULL;
5391 u64 start;
5392 u64 end;
5393 unsigned state_flags;
5394
5395 node = rb_first(&io_tree->state);
5396 state = rb_entry(node, struct extent_state, rb_node);
5397 start = state->start;
5398 end = state->end;
5399 state_flags = state->state;
5400 spin_unlock(&io_tree->lock);
5401
5402 btrfs_lock_extent(io_tree, start, end, &cached_state);
5403
5404 /*
5405 * If the extent still has the DELALLOC flag, it didn't reach disk,
5406 * and its reserved space won't be freed by delayed_ref.
5407 * So we need to free its reserved space here.
5408 * (Refer to comment in btrfs_invalidate_folio, case 2)
5409 *
5410 * Note, end is the bytenr of last byte, so we need + 1 here.
5411 */
5412 if (state_flags & EXTENT_DELALLOC)
5413 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5414 end - start + 1, NULL);
5415
5416 btrfs_clear_extent_bit(io_tree, start, end,
5417 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5418 &cached_state);
5419
5420 cond_resched();
5421 spin_lock(&io_tree->lock);
5422 }
5423 spin_unlock(&io_tree->lock);
5424 }
5425
5426 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5427 struct btrfs_block_rsv *rsv)
5428 {
5429 struct btrfs_fs_info *fs_info = root->fs_info;
5430 struct btrfs_trans_handle *trans;
5431 u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5432 int ret;
5433
5434 /*
5435 * Eviction should be taking place at some place safe because of our
5436 * delayed iputs. However the normal flushing code will run delayed
5437 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5438 *
5439 * We reserve the delayed_refs_extra here again because we can't use
5440 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5441 * above. We reserve our extra bit here because we generate a ton of
5442 * delayed refs activity by truncating.
5443 *
5444 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5445 * if we fail to make this reservation we can re-try without the
5446 * delayed_refs_extra so we can make some forward progress.
5447 */
5448 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5449 BTRFS_RESERVE_FLUSH_EVICT);
5450 if (ret) {
5451 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5452 BTRFS_RESERVE_FLUSH_EVICT);
5453 if (ret) {
5454 btrfs_warn(fs_info,
5455 "could not allocate space for delete; will truncate on mount");
5456 return ERR_PTR(-ENOSPC);
5457 }
5458 delayed_refs_extra = 0;
5459 }
5460
5461 trans = btrfs_join_transaction(root);
5462 if (IS_ERR(trans))
5463 return trans;
5464
5465 if (delayed_refs_extra) {
5466 trans->block_rsv = &fs_info->trans_block_rsv;
5467 trans->bytes_reserved = delayed_refs_extra;
5468 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5469 delayed_refs_extra, true);
5470 }
5471 return trans;
5472 }
5473
5474 void btrfs_evict_inode(struct inode *inode)
5475 {
5476 struct btrfs_fs_info *fs_info;
5477 struct btrfs_trans_handle *trans;
5478 struct btrfs_root *root = BTRFS_I(inode)->root;
5479 struct btrfs_block_rsv rsv;
5480 int ret;
5481
5482 trace_btrfs_inode_evict(inode);
5483
5484 if (!root) {
5485 fsverity_cleanup_inode(inode);
5486 clear_inode(inode);
5487 return;
5488 }
5489
5490 fs_info = inode_to_fs_info(inode);
5491 evict_inode_truncate_pages(inode);
5492
5493 if (inode->i_nlink &&
5494 ((btrfs_root_refs(&root->root_item) != 0 &&
5495 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
5496 btrfs_is_free_space_inode(BTRFS_I(inode))))
5497 goto out;
5498
5499 if (is_bad_inode(inode))
5500 goto out;
5501
5502 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5503 goto out;
5504
5505 if (inode->i_nlink > 0) {
5506 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5507 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
5508 goto out;
5509 }
5510
5511 /*
5512 * This makes sure the inode item in the tree is uptodate and the space for
5513 * the inode update is released.
5514 */
5515 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5516 if (ret)
5517 goto out;
5518
5519 /*
5520 * This drops any pending insert or delete operations we have for this
5521 * inode. We could have a delayed dir index deletion queued up, but
5522 * we're removing the inode completely so that'll be taken care of in
5523 * the truncate.
5524 */
5525 btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5526
5527 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
5528 rsv.size = btrfs_calc_metadata_size(fs_info, 1);
5529 rsv.failfast = true;
5530
5531 btrfs_i_size_write(BTRFS_I(inode), 0);
5532
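/*
 * Delete the inode items a chunk at a time: as long as
 * btrfs_truncate_inode_items() returns -ENOSPC or -EAGAIN we end the
 * transaction and loop to start a fresh one with a refilled reservation,
 * stopping on success or on any other error.
 */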
5533 while (1) {
5534 struct btrfs_truncate_control control = {
5535 .inode = BTRFS_I(inode),
5536 .ino = btrfs_ino(BTRFS_I(inode)),
5537 .new_size = 0,
5538 .min_type = 0,
5539 };
5540
5541 trans = evict_refill_and_join(root, &rsv);
5542 if (IS_ERR(trans))
5543 goto out_release;
5544
5545 trans->block_rsv = &rsv;
5546
5547 ret = btrfs_truncate_inode_items(trans, root, &control);
5548 trans->block_rsv = &fs_info->trans_block_rsv;
5549 btrfs_end_transaction(trans);
5550 /*
5551 * We have not added new delayed items for our inode after we
5552 * have flushed its delayed items, so no need to throttle on
5553 * delayed items. However we have modified extent buffers.
5554 */
5555 btrfs_btree_balance_dirty_nodelay(fs_info);
5556 if (ret && ret != -ENOSPC && ret != -EAGAIN)
5557 goto out_release;
5558 else if (!ret)
5559 break;
5560 }
5561
5562 /*
5563 * Errors here aren't a big deal, it just means we leave orphan items in
5564 * the tree. They will be cleaned up on the next mount. If the inode
5565 * number gets reused, cleanup deletes the orphan item without doing
5566 * anything, and unlink reuses the existing orphan item.
5567 *
5568 * If it turns out that we are dropping too many of these, we might want
5569 * to add a mechanism for retrying these after a commit.
5570 */
5571 trans = evict_refill_and_join(root, &rsv);
5572 if (!IS_ERR(trans)) {
5573 trans->block_rsv = &rsv;
5574 btrfs_orphan_del(trans, BTRFS_I(inode));
5575 trans->block_rsv = &fs_info->trans_block_rsv;
5576 btrfs_end_transaction(trans);
5577 }
5578
5579 out_release:
5580 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
5581 out:
5582 /*
5583 * If we didn't successfully delete, the orphan item will still be in
5584 * the tree and we'll retry on the next mount. Again, we might also want
5585 * to retry these periodically in the future.
5586 */
5587 btrfs_remove_delayed_node(BTRFS_I(inode));
5588 fsverity_cleanup_inode(inode);
5589 clear_inode(inode);
5590 }
5591
5592 /*
5593 * Return the key found in the dir entry in the location pointer, fill @type
5594 * with BTRFS_FT_*, and return 0.
5595 *
5596 * If no dir entries were found, returns -ENOENT.
5597 * If a corrupted location is found in the dir entry, returns -EUCLEAN.
5598 */
5599 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5600 struct btrfs_key *location, u8 *type)
5601 {
5602 struct btrfs_dir_item *di;
5603 BTRFS_PATH_AUTO_FREE(path);
5604 struct btrfs_root *root = dir->root;
5605 int ret = 0;
5606 struct fscrypt_name fname;
5607
5608 path = btrfs_alloc_path();
5609 if (!path)
5610 return -ENOMEM;
5611
5612 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5613 if (ret < 0)
5614 return ret;
5615 /*
5616 * fscrypt_setup_filename() should never return a positive value, but
5617 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5618 */
5619 ASSERT(ret == 0);
5620
5621 /* This needs to handle no-key deletions later on */
5622
5623 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5624 &fname.disk_name, 0);
5625 if (IS_ERR_OR_NULL(di)) {
5626 ret = di ? PTR_ERR(di) : -ENOENT;
5627 goto out;
5628 }
5629
5630 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5631 if (unlikely(location->type != BTRFS_INODE_ITEM_KEY &&
5632 location->type != BTRFS_ROOT_ITEM_KEY)) {
5633 ret = -EUCLEAN;
5634 btrfs_warn(root->fs_info,
5635 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5636 __func__, fname.disk_name.name, btrfs_ino(dir),
5637 location->objectid, location->type, location->offset);
5638 }
5639 if (!ret)
5640 *type = btrfs_dir_ftype(path->nodes[0], di);
5641 out:
5642 fscrypt_free_filename(&fname);
5643 return ret;
5644 }
5645
5646 /*
5647 * when we hit a tree root in a directory, the btrfs part of the inode
5648 * needs to be changed to reflect the root directory of the tree root. This
5649 * is kind of like crossing a mount point.
5650 */
5651 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5652 struct btrfs_inode *dir,
5653 struct dentry *dentry,
5654 struct btrfs_key *location,
5655 struct btrfs_root **sub_root)
5656 {
5657 BTRFS_PATH_AUTO_FREE(path);
5658 struct btrfs_root *new_root;
5659 struct btrfs_root_ref *ref;
5660 struct extent_buffer *leaf;
5661 struct btrfs_key key;
5662 int ret;
5663 int err = 0;
5664 struct fscrypt_name fname;
5665
5666 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5667 if (ret)
5668 return ret;
5669
5670 path = btrfs_alloc_path();
5671 if (!path) {
5672 err = -ENOMEM;
5673 goto out;
5674 }
5675
5676 err = -ENOENT;
5677 key.objectid = btrfs_root_id(dir->root);
5678 key.type = BTRFS_ROOT_REF_KEY;
5679 key.offset = location->objectid;
5680
5681 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5682 if (ret) {
5683 if (ret < 0)
5684 err = ret;
5685 goto out;
5686 }
5687
5688 leaf = path->nodes[0];
5689 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5690 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5691 btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5692 goto out;
5693
5694 ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5695 (unsigned long)(ref + 1), fname.disk_name.len);
5696 if (ret)
5697 goto out;
5698
5699 btrfs_release_path(path);
5700
5701 new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5702 if (IS_ERR(new_root)) {
5703 err = PTR_ERR(new_root);
5704 goto out;
5705 }
5706
5707 *sub_root = new_root;
5708 location->objectid = btrfs_root_dirid(&new_root->root_item);
5709 location->type = BTRFS_INODE_ITEM_KEY;
5710 location->offset = 0;
5711 err = 0;
5712 out:
5713 fscrypt_free_filename(&fname);
5714 return err;
5715 }
5716
5717
5718
5719 static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
5720 {
5721 struct btrfs_root *root = inode->root;
5722 struct btrfs_inode *entry;
5723 bool empty = false;
5724
5725 xa_lock(&root->inodes);
5726 /*
5727 * This btrfs_inode is being freed and has already been unhashed at this
5728 * point. It's possible that another btrfs_inode has already been
5729 * allocated for the same inode and inserted itself into the root, so
5730 * don't delete it in that case.
5731 *
5732 * Note that this shouldn't need to allocate memory, so the gfp flags
5733 * don't really matter.
5734 */
5735 entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
5736 GFP_ATOMIC);
5737 if (entry == inode)
5738 empty = xa_empty(&root->inodes);
5739 xa_unlock(&root->inodes);
5740
5741 if (empty && btrfs_root_refs(&root->root_item) == 0) {
5742 xa_lock(&root->inodes);
5743 empty = xa_empty(&root->inodes);
5744 xa_unlock(&root->inodes);
5745 if (empty)
5746 btrfs_add_dead_root(root);
5747 }
5748 }
5749
5750
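/*
 * Callbacks used by iget5_locked_rcu() and insert_inode_locked4():
 * btrfs_init_locked_inode() initializes a freshly allocated VFS inode with
 * the target inode number and root, marking inodes that live in the tree
 * root (other than the btree inode itself) as free space inodes, while
 * btrfs_find_actor() checks whether an existing hashed inode matches the
 * requested (ino, root) pair.
 */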
5751 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5752 {
5753 struct btrfs_iget_args *args = p;
5754
5755 btrfs_set_inode_number(BTRFS_I(inode), args->ino);
5756 BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5757
5758 if (args->root && args->root == args->root->fs_info->tree_root &&
5759 args->ino != BTRFS_BTREE_INODE_OBJECTID)
5760 set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5761 &BTRFS_I(inode)->runtime_flags);
5762 return 0;
5763 }
5764
5765 static int btrfs_find_actor(struct inode *inode, void *opaque)
5766 {
5767 struct btrfs_iget_args *args = opaque;
5768
5769 return args->ino == btrfs_ino(BTRFS_I(inode)) &&
5770 args->root == BTRFS_I(inode)->root;
5771 }
5772
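/*
 * Look up or allocate the VFS inode for the given inode number and root.
 * The returned inode may still be in the I_NEW state, in which case the
 * caller must read it in and call unlock_new_inode() (or iget_failed() on
 * error). Returns NULL on allocation failure.
 */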
5773 static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
5774 {
5775 struct inode *inode;
5776 struct btrfs_iget_args args;
5777 unsigned long hashval = btrfs_inode_hash(ino, root);
5778
5779 args.ino = ino;
5780 args.root = root;
5781
5782 inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
5783 btrfs_init_locked_inode,
5784 (void *)&args);
5785 if (!inode)
5786 return NULL;
5787 return BTRFS_I(inode);
5788 }
5789
5790 /*
5791 * Get an inode object given its inode number and corresponding root. Path is
5792 * preallocated to prevent recursing back to iget through the allocator.
5793 */
5794 struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
5795 struct btrfs_path *path)
5796 {
5797 struct btrfs_inode *inode;
5798 int ret;
5799
5800 inode = btrfs_iget_locked(ino, root);
5801 if (!inode)
5802 return ERR_PTR(-ENOMEM);
5803
5804 if (!(inode->vfs_inode.i_state & I_NEW))
5805 return inode;
5806
5807 ret = btrfs_read_locked_inode(inode, path);
5808 if (ret)
5809 return ERR_PTR(ret);
5810
5811 unlock_new_inode(&inode->vfs_inode);
5812 return inode;
5813 }
5814
5815 /*
5816 * Get an inode object given its inode number and corresponding root.
5817 */
5818 struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
5819 {
5820 struct btrfs_inode *inode;
5821 struct btrfs_path *path;
5822 int ret;
5823
5824 inode = btrfs_iget_locked(ino, root);
5825 if (!inode)
5826 return ERR_PTR(-ENOMEM);
5827
5828 if (!(inode->vfs_inode.i_state & I_NEW))
5829 return inode;
5830
5831 path = btrfs_alloc_path();
5832 if (!path) {
5833 iget_failed(&inode->vfs_inode);
5834 return ERR_PTR(-ENOMEM);
5835 }
5836
5837 ret = btrfs_read_locked_inode(inode, path);
5838 btrfs_free_path(path);
5839 if (ret)
5840 return ERR_PTR(ret);
5841
5842 unlock_new_inode(&inode->vfs_inode);
5843 return inode;
5844 }
5845
5846 static struct btrfs_inode *new_simple_dir(struct inode *dir,
5847 struct btrfs_key *key,
5848 struct btrfs_root *root)
5849 {
5850 struct timespec64 ts;
5851 struct inode *vfs_inode;
5852 struct btrfs_inode *inode;
5853
5854 vfs_inode = new_inode(dir->i_sb);
5855 if (!vfs_inode)
5856 return ERR_PTR(-ENOMEM);
5857
5858 inode = BTRFS_I(vfs_inode);
5859 inode->root = btrfs_grab_root(root);
5860 inode->ref_root_id = key->objectid;
5861 set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
5862 set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
5863
5864 btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
5865 /*
5866 * We only need lookup; the rest is read-only and there's no inode
5867 * associated with the dentry.
5868 */
5869 vfs_inode->i_op = &simple_dir_inode_operations;
5870 vfs_inode->i_opflags &= ~IOP_XATTR;
5871 vfs_inode->i_fop = &simple_dir_operations;
5872 vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5873
5874 ts = inode_set_ctime_current(vfs_inode);
5875 inode_set_mtime_to_ts(vfs_inode, ts);
5876 inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
5877 inode->i_otime_sec = ts.tv_sec;
5878 inode->i_otime_nsec = ts.tv_nsec;
5879
5880 vfs_inode->i_uid = dir->i_uid;
5881 vfs_inode->i_gid = dir->i_gid;
5882
5883 return inode;
5884 }
5885
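/*
 * btrfs reuses the generic file type values, so the on-disk BTRFS_FT_*
 * constants must stay numerically identical to the VFS FT_* values that
 * fs_umode_to_ftype() and fs_ftype_to_dtype() operate on.
 */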
5886 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5887 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5888 static_assert(BTRFS_FT_DIR == FT_DIR);
5889 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5890 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5891 static_assert(BTRFS_FT_FIFO == FT_FIFO);
5892 static_assert(BTRFS_FT_SOCK == FT_SOCK);
5893 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5894
5895 static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
5896 {
5897 return fs_umode_to_ftype(inode->vfs_inode.i_mode);
5898 }
5899
5900 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5901 {
5902 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
5903 struct btrfs_inode *inode;
5904 struct btrfs_root *root = BTRFS_I(dir)->root;
5905 struct btrfs_root *sub_root = root;
5906 struct btrfs_key location = { 0 };
5907 u8 di_type = 0;
5908 int ret = 0;
5909
5910 if (dentry->d_name.len > BTRFS_NAME_LEN)
5911 return ERR_PTR(-ENAMETOOLONG);
5912
5913 ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5914 if (ret < 0)
5915 return ERR_PTR(ret);
5916
5917 if (location.type == BTRFS_INODE_ITEM_KEY) {
5918 inode = btrfs_iget(location.objectid, root);
5919 if (IS_ERR(inode))
5920 return ERR_CAST(inode);
5921
5922 /* Do extra check against inode mode with di_type */
5923 if (unlikely(btrfs_inode_type(inode) != di_type)) {
5924 btrfs_crit(fs_info,
5925 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5926 inode->vfs_inode.i_mode, btrfs_inode_type(inode),
5927 di_type);
5928 iput(&inode->vfs_inode);
5929 return ERR_PTR(-EUCLEAN);
5930 }
5931 return &inode->vfs_inode;
5932 }
5933
5934 ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5935 &location, &sub_root);
5936 if (ret < 0) {
5937 if (ret != -ENOENT)
5938 inode = ERR_PTR(ret);
5939 else
5940 inode = new_simple_dir(dir, &location, root);
5941 } else {
5942 inode = btrfs_iget(location.objectid, sub_root);
5943 btrfs_put_root(sub_root);
5944
5945 if (IS_ERR(inode))
5946 return ERR_CAST(inode);
5947
5948 down_read(&fs_info->cleanup_work_sem);
5949 if (!sb_rdonly(inode->vfs_inode.i_sb))
5950 ret = btrfs_orphan_cleanup(sub_root);
5951 up_read(&fs_info->cleanup_work_sem);
5952 if (ret) {
5953 iput(&inode->vfs_inode);
5954 inode = ERR_PTR(ret);
5955 }
5956 }
5957
5958 if (IS_ERR(inode))
5959 return ERR_CAST(inode);
5960
5961 return &inode->vfs_inode;
5962 }
5963
5964 static int btrfs_dentry_delete(const struct dentry *dentry)
5965 {
5966 struct btrfs_root *root;
5967 struct inode *inode = d_inode(dentry);
5968
5969 if (!inode && !IS_ROOT(dentry))
5970 inode = d_inode(dentry->d_parent);
5971
5972 if (inode) {
5973 root = BTRFS_I(inode)->root;
5974 if (btrfs_root_refs(&root->root_item) == 0)
5975 return 1;
5976
5977 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5978 return 1;
5979 }
5980 return 0;
5981 }
5982
5983 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5984 unsigned int flags)
5985 {
5986 struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5987
5988 if (inode == ERR_PTR(-ENOENT))
5989 inode = NULL;
5990 return d_splice_alias(inode, dentry);
5991 }
5992
5993 /*
5994 * Find the highest existing sequence number in a directory and then set the
5995 * in-memory index_cnt variable to the first free sequence number.
5996 */
5997 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5998 {
5999 struct btrfs_root *root = inode->root;
6000 struct btrfs_key key, found_key;
6001 BTRFS_PATH_AUTO_FREE(path);
6002 struct extent_buffer *leaf;
6003 int ret;
6004
6005 key.objectid = btrfs_ino(inode);
6006 key.type = BTRFS_DIR_INDEX_KEY;
6007 key.offset = (u64)-1;
6008
6009 path = btrfs_alloc_path();
6010 if (!path)
6011 return -ENOMEM;
6012
6013 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6014 if (ret < 0)
6015 return ret;
6016 /* FIXME: we should be able to handle this */
6017 if (ret == 0)
6018 return ret;
6019
6020 if (path->slots[0] == 0) {
6021 inode->index_cnt = BTRFS_DIR_START_INDEX;
6022 return 0;
6023 }
6024
6025 path->slots[0]--;
6026
6027 leaf = path->nodes[0];
6028 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6029
6030 if (found_key.objectid != btrfs_ino(inode) ||
6031 found_key.type != BTRFS_DIR_INDEX_KEY) {
6032 inode->index_cnt = BTRFS_DIR_START_INDEX;
6033 return 0;
6034 }
6035
6036 inode->index_cnt = found_key.offset + 1;
6037
6038 return 0;
6039 }
6040
6041 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
6042 {
6043 int ret = 0;
6044
6045 btrfs_inode_lock(dir, 0);
6046 if (dir->index_cnt == (u64)-1) {
6047 ret = btrfs_inode_delayed_dir_index_count(dir);
6048 if (ret) {
6049 ret = btrfs_set_inode_index_count(dir);
6050 if (ret)
6051 goto out;
6052 }
6053 }
6054
6055 /* index_cnt is the index number of next new entry, so decrement it. */
6056 *index = dir->index_cnt - 1;
6057 out:
6058 btrfs_inode_unlock(dir, 0);
6059
6060 return ret;
6061 }
6062
6063 /*
6064 * All this infrastructure exists because dir_emit can fault, and we are holding
6065 * the tree lock when doing readdir. For now just allocate a buffer and copy
6066 * our information into that, and then dir_emit from the buffer. This is
6067 * similar to what NFS does, only we don't keep the buffer around in pagecache
6068 * because I'm afraid I'll mess that up. Long term we need to make filldir do
6069 * copy_to_user_inatomic so we don't have to worry about page faulting under the
6070 * tree lock.
6071 */
6072 static int btrfs_opendir(struct inode *inode, struct file *file)
6073 {
6074 struct btrfs_file_private *private;
6075 u64 last_index;
6076 int ret;
6077
6078 ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
6079 if (ret)
6080 return ret;
6081
6082 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
6083 if (!private)
6084 return -ENOMEM;
6085 private->last_index = last_index;
6086 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
6087 if (!private->filldir_buf) {
6088 kfree(private);
6089 return -ENOMEM;
6090 }
6091 file->private_data = private;
6092 return 0;
6093 }
6094
6095 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
6096 {
6097 struct btrfs_file_private *private = file->private_data;
6098 int ret;
6099
6100 ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
6101 &private->last_index);
6102 if (ret)
6103 return ret;
6104
6105 return generic_file_llseek(file, offset, whence);
6106 }
6107
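/*
 * Entries copied into the readdir filldir_buf are packed back to back as a
 * struct dir_entry header immediately followed by the (unterminated) name
 * bytes. That is why btrfs_filldir() advances by
 * sizeof(struct dir_entry) + name_len and uses get_unaligned() accessors.
 */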
6108 struct dir_entry {
6109 u64 ino;
6110 u64 offset;
6111 unsigned type;
6112 int name_len;
6113 };
6114
6115 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
6116 {
6117 while (entries--) {
6118 struct dir_entry *entry = addr;
6119 char *name = (char *)(entry + 1);
6120
6121 ctx->pos = get_unaligned(&entry->offset);
6122 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
6123 get_unaligned(&entry->ino),
6124 get_unaligned(&entry->type)))
6125 return 1;
6126 addr += sizeof(struct dir_entry) +
6127 get_unaligned(&entry->name_len);
6128 ctx->pos++;
6129 }
6130 return 0;
6131 }
6132
6133 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
6134 {
6135 struct inode *inode = file_inode(file);
6136 struct btrfs_root *root = BTRFS_I(inode)->root;
6137 struct btrfs_file_private *private = file->private_data;
6138 struct btrfs_dir_item *di;
6139 struct btrfs_key key;
6140 struct btrfs_key found_key;
6141 BTRFS_PATH_AUTO_FREE(path);
6142 void *addr;
6143 LIST_HEAD(ins_list);
6144 LIST_HEAD(del_list);
6145 int ret;
6146 char *name_ptr;
6147 int name_len;
6148 int entries = 0;
6149 int total_len = 0;
6150 bool put = false;
6151 struct btrfs_key location;
6152
6153 if (!dir_emit_dots(file, ctx))
6154 return 0;
6155
6156 path = btrfs_alloc_path();
6157 if (!path)
6158 return -ENOMEM;
6159
6160 addr = private->filldir_buf;
6161 path->reada = READA_FORWARD;
6162
6163 put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
6164 &ins_list, &del_list);
6165
6166 again:
6167 key.type = BTRFS_DIR_INDEX_KEY;
6168 key.offset = ctx->pos;
6169 key.objectid = btrfs_ino(BTRFS_I(inode));
6170
6171 btrfs_for_each_slot(root, &key, &found_key, path, ret) {
6172 struct dir_entry *entry;
6173 struct extent_buffer *leaf = path->nodes[0];
6174 u8 ftype;
6175
6176 if (found_key.objectid != key.objectid)
6177 break;
6178 if (found_key.type != BTRFS_DIR_INDEX_KEY)
6179 break;
6180 if (found_key.offset < ctx->pos)
6181 continue;
6182 if (found_key.offset > private->last_index)
6183 break;
6184 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
6185 continue;
6186 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
6187 name_len = btrfs_dir_name_len(leaf, di);
6188 if ((total_len + sizeof(struct dir_entry) + name_len) >=
6189 PAGE_SIZE) {
6190 btrfs_release_path(path);
6191 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6192 if (ret)
6193 goto nopos;
6194 addr = private->filldir_buf;
6195 entries = 0;
6196 total_len = 0;
6197 goto again;
6198 }
6199
6200 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
6201 entry = addr;
6202 name_ptr = (char *)(entry + 1);
6203 read_extent_buffer(leaf, name_ptr,
6204 (unsigned long)(di + 1), name_len);
6205 put_unaligned(name_len, &entry->name_len);
6206 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
6207 btrfs_dir_item_key_to_cpu(leaf, di, &location);
6208 put_unaligned(location.objectid, &entry->ino);
6209 put_unaligned(found_key.offset, &entry->offset);
6210 entries++;
6211 addr += sizeof(struct dir_entry) + name_len;
6212 total_len += sizeof(struct dir_entry) + name_len;
6213 }
6214 /* Catch error encountered during iteration */
6215 if (ret < 0)
6216 goto err;
6217
6218 btrfs_release_path(path);
6219
6220 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6221 if (ret)
6222 goto nopos;
6223
6224 if (btrfs_readdir_delayed_dir_index(ctx, &ins_list))
6225 goto nopos;
6226
6227 /*
6228 * Stop new entries from being returned after we return the last
6229 * entry.
6230 *
6231 * New directory entries are assigned a strictly increasing
6232 * offset. This means that new entries created during readdir
6233 * are *guaranteed* to be seen in the future by that readdir.
6234 * This has broken buggy programs which operate on names as
6235 * they're returned by readdir. Until we reuse freed offsets
6236 * we have this hack to stop new entries from being returned
6237 * under the assumption that they'll never reach this huge
6238 * offset.
6239 *
6240 * This is being careful not to overflow 32bit loff_t unless the
6241 * last entry requires it because doing so has broken 32bit apps
6242 * in the past.
6243 */
6244 if (ctx->pos >= INT_MAX)
6245 ctx->pos = LLONG_MAX;
6246 else
6247 ctx->pos = INT_MAX;
6248 nopos:
6249 ret = 0;
6250 err:
6251 if (put)
6252 btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
6253 return ret;
6254 }
6255
6256 /*
6257 * This is somewhat expensive, updating the tree every time the
6258 * inode changes. But it is most likely to find the inode in cache.
6259 * FIXME: needs more benchmarking... there are no reasons other than performance
6260 * to keep or drop this code.
6261 */
6262 static int btrfs_dirty_inode(struct btrfs_inode *inode)
6263 {
6264 struct btrfs_root *root = inode->root;
6265 struct btrfs_fs_info *fs_info = root->fs_info;
6266 struct btrfs_trans_handle *trans;
6267 int ret;
6268
6269 if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6270 return 0;
6271
6272 trans = btrfs_join_transaction(root);
6273 if (IS_ERR(trans))
6274 return PTR_ERR(trans);
6275
6276 ret = btrfs_update_inode(trans, inode);
6277 if (ret == -ENOSPC || ret == -EDQUOT) {
6278 /* whoops, lets try again with the full transaction */
6279 btrfs_end_transaction(trans);
6280 trans = btrfs_start_transaction(root, 1);
6281 if (IS_ERR(trans))
6282 return PTR_ERR(trans);
6283
6284 ret = btrfs_update_inode(trans, inode);
6285 }
6286 btrfs_end_transaction(trans);
6287 if (inode->delayed_node)
6288 btrfs_balance_delayed_items(fs_info);
6289
6290 return ret;
6291 }
6292
6293 /*
6294 * This is a copy of file_update_time. We need this so we can return error on
6295 * ENOSPC for updating the inode in the case of file write and mmap writes.
6296 */
6297 static int btrfs_update_time(struct inode *inode, int flags)
6298 {
6299 struct btrfs_root *root = BTRFS_I(inode)->root;
6300 bool dirty;
6301
6302 if (btrfs_root_readonly(root))
6303 return -EROFS;
6304
6305 dirty = inode_update_timestamps(inode, flags);
6306 return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6307 }
6308
6309 /*
6310 * Helper to find a free sequence number in a given directory. The current
6311 * code is very simple; later versions will do smarter things in the btree.
6312 */
6313 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6314 {
6315 int ret = 0;
6316
6317 if (dir->index_cnt == (u64)-1) {
6318 ret = btrfs_inode_delayed_dir_index_count(dir);
6319 if (ret) {
6320 ret = btrfs_set_inode_index_count(dir);
6321 if (ret)
6322 return ret;
6323 }
6324 }
6325
6326 *index = dir->index_cnt;
6327 dir->index_cnt++;
6328
6329 return ret;
6330 }
6331
6332 static int btrfs_insert_inode_locked(struct inode *inode)
6333 {
6334 struct btrfs_iget_args args;
6335
6336 args.ino = btrfs_ino(BTRFS_I(inode));
6337 args.root = BTRFS_I(inode)->root;
6338
6339 return insert_inode_locked4(inode,
6340 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6341 btrfs_find_actor, &args);
6342 }
6343
6344 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6345 unsigned int *trans_num_items)
6346 {
6347 struct inode *dir = args->dir;
6348 struct inode *inode = args->inode;
6349 int ret;
6350
6351 if (!args->orphan) {
6352 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6353 &args->fname);
6354 if (ret)
6355 return ret;
6356 }
6357
6358 ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6359 if (ret) {
6360 fscrypt_free_filename(&args->fname);
6361 return ret;
6362 }
6363
6364 /* 1 to add inode item */
6365 *trans_num_items = 1;
6366 /* 1 to add compression property */
6367 if (BTRFS_I(dir)->prop_compress)
6368 (*trans_num_items)++;
6369 /* 1 to add default ACL xattr */
6370 if (args->default_acl)
6371 (*trans_num_items)++;
6372 /* 1 to add access ACL xattr */
6373 if (args->acl)
6374 (*trans_num_items)++;
6375 #ifdef CONFIG_SECURITY
6376 /* 1 to add LSM xattr */
6377 if (dir->i_security)
6378 (*trans_num_items)++;
6379 #endif
6380 if (args->orphan) {
6381 /* 1 to add orphan item */
6382 (*trans_num_items)++;
6383 } else {
6384 /*
6385 * 1 to add dir item
6386 * 1 to add dir index
6387 * 1 to update parent inode item
6388 *
6389 * No need for 1 unit for the inode ref item because it is
6390 * inserted in a batch together with the inode item at
6391 * btrfs_create_new_inode().
6392 */
6393 *trans_num_items += 3;
6394 }
6395 return 0;
6396 }
6397
6398 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6399 {
6400 posix_acl_release(args->acl);
6401 posix_acl_release(args->default_acl);
6402 fscrypt_free_filename(&args->fname);
6403 }
6404
6405 /*
6406 * Inherit flags from the parent inode.
6407 *
6408 * Currently only the compression flags and the cow flags are inherited.
6409 */
6410 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6411 {
6412 unsigned int flags;
6413
6414 flags = dir->flags;
6415
6416 if (flags & BTRFS_INODE_NOCOMPRESS) {
6417 inode->flags &= ~BTRFS_INODE_COMPRESS;
6418 inode->flags |= BTRFS_INODE_NOCOMPRESS;
6419 } else if (flags & BTRFS_INODE_COMPRESS) {
6420 inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6421 inode->flags |= BTRFS_INODE_COMPRESS;
6422 }
6423
6424 if (flags & BTRFS_INODE_NODATACOW) {
6425 inode->flags |= BTRFS_INODE_NODATACOW;
6426 if (S_ISREG(inode->vfs_inode.i_mode))
6427 inode->flags |= BTRFS_INODE_NODATASUM;
6428 }
6429
6430 btrfs_sync_inode_flags_to_i_flags(inode);
6431 }
6432
6433 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6434 struct btrfs_new_inode_args *args)
6435 {
6436 struct timespec64 ts;
6437 struct inode *dir = args->dir;
6438 struct inode *inode = args->inode;
6439 const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6440 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6441 struct btrfs_root *root;
6442 struct btrfs_inode_item *inode_item;
6443 struct btrfs_path *path;
6444 u64 objectid;
6445 struct btrfs_inode_ref *ref;
6446 struct btrfs_key key[2];
6447 u32 sizes[2];
6448 struct btrfs_item_batch batch;
6449 unsigned long ptr;
6450 int ret;
6451 bool xa_reserved = false;
6452
6453 path = btrfs_alloc_path();
6454 if (!path)
6455 return -ENOMEM;
6456
6457 if (!args->subvol)
6458 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6459 root = BTRFS_I(inode)->root;
6460
6461 ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
6462 if (ret)
6463 goto out;
6464
6465 ret = btrfs_get_free_objectid(root, &objectid);
6466 if (ret)
6467 goto out;
6468 btrfs_set_inode_number(BTRFS_I(inode), objectid);
6469
6470 ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
6471 if (ret)
6472 goto out;
6473 xa_reserved = true;
6474
6475 if (args->orphan) {
6476 /*
6477 * O_TMPFILE, set link count to 0, so that after this point, we
6478 * fill in an inode item with the correct link count.
6479 */
6480 set_nlink(inode, 0);
6481 } else {
6482 trace_btrfs_inode_request(dir);
6483
6484 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6485 if (ret)
6486 goto out;
6487 }
6488
6489 if (S_ISDIR(inode->i_mode))
6490 BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6491
6492 BTRFS_I(inode)->generation = trans->transid;
6493 inode->i_generation = BTRFS_I(inode)->generation;
6494
6495 /*
6496 * We don't have any capability xattrs set here yet, shortcut any
6497 * queries for the xattrs here. If we add them later via the inode
6498 * security init path or any other path this flag will be cleared.
6499 */
6500 set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
6501
6502 /*
6503 * Subvolumes don't inherit flags from their parent directory.
6504 * Originally this was probably by accident, but we probably can't
6505 * change it now without compatibility issues.
6506 */
6507 if (!args->subvol)
6508 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6509
6510 btrfs_set_inode_mapping_order(BTRFS_I(inode));
6511 if (S_ISREG(inode->i_mode)) {
6512 if (btrfs_test_opt(fs_info, NODATASUM))
6513 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6514 if (btrfs_test_opt(fs_info, NODATACOW))
6515 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6516 BTRFS_INODE_NODATASUM;
6517 btrfs_update_inode_mapping_flags(BTRFS_I(inode));
6518 }
6519
6520 ret = btrfs_insert_inode_locked(inode);
6521 if (ret < 0) {
6522 if (!args->orphan)
6523 BTRFS_I(dir)->index_cnt--;
6524 goto out;
6525 }
6526
6527 /*
6528 * We could have gotten an inode number from somebody who was fsynced
6529 * and then removed in this same transaction, so let's just set full
6530 * sync since it will be a full sync anyway and this will blow away the
6531 * old info in the log.
6532 */
6533 btrfs_set_inode_full_sync(BTRFS_I(inode));
6534
6535 key[0].objectid = objectid;
6536 key[0].type = BTRFS_INODE_ITEM_KEY;
6537 key[0].offset = 0;
6538
6539 sizes[0] = sizeof(struct btrfs_inode_item);
6540
6541 if (!args->orphan) {
6542 /*
6543 * Start new inodes with an inode_ref. This is slightly more
6544 * efficient for small numbers of hard links since they will
6545 * be packed into one item. Extended refs will kick in if we
6546 * add more hard links than can fit in the ref item.
6547 */
6548 key[1].objectid = objectid;
6549 key[1].type = BTRFS_INODE_REF_KEY;
6550 if (args->subvol) {
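/*
 * For a new subvolume the inode ref points back at itself and stores
 * the literal name ".." (2 bytes), hence the extra 2 bytes here and
 * the matching write_extent_buffer() of ".." further below.
 */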
6551 key[1].offset = objectid;
6552 sizes[1] = 2 + sizeof(*ref);
6553 } else {
6554 key[1].offset = btrfs_ino(BTRFS_I(dir));
6555 sizes[1] = name->len + sizeof(*ref);
6556 }
6557 }
6558
6559 batch.keys = &key[0];
6560 batch.data_sizes = &sizes[0];
6561 batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6562 batch.nr = args->orphan ? 1 : 2;
6563 ret = btrfs_insert_empty_items(trans, root, path, &batch);
6564 if (unlikely(ret != 0)) {
6565 btrfs_abort_transaction(trans, ret);
6566 goto discard;
6567 }
6568
6569 ts = simple_inode_init_ts(inode);
6570 BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
6571 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
6572
6573 /*
6574 * We're going to fill the inode item now, so at this point the inode
6575 * must be fully initialized.
6576 */
6577
6578 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6579 struct btrfs_inode_item);
6580 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6581 sizeof(*inode_item));
6582 fill_inode_item(trans, path->nodes[0], inode_item, inode);
6583
6584 if (!args->orphan) {
6585 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6586 struct btrfs_inode_ref);
6587 ptr = (unsigned long)(ref + 1);
6588 if (args->subvol) {
6589 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6590 btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6591 write_extent_buffer(path->nodes[0], "..", ptr, 2);
6592 } else {
6593 btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6594 name->len);
6595 btrfs_set_inode_ref_index(path->nodes[0], ref,
6596 BTRFS_I(inode)->dir_index);
6597 write_extent_buffer(path->nodes[0], name->name, ptr,
6598 name->len);
6599 }
6600 }
6601
6602 /*
6603 * We don't need the path anymore. Moreover, inheriting properties, adding
6604 * ACLs, security xattrs, the orphan item or adding the link will result in
6605 * allocating yet another path, so just free our path now.
6606 */
6607 btrfs_free_path(path);
6608 path = NULL;
6609
6610 if (args->subvol) {
6611 struct btrfs_inode *parent;
6612
6613 /*
6614 * Subvolumes inherit properties from their parent subvolume,
6615 * not the directory they were created in.
6616 */
6617 parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
6618 if (IS_ERR(parent)) {
6619 ret = PTR_ERR(parent);
6620 } else {
6621 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6622 parent);
6623 iput(&parent->vfs_inode);
6624 }
6625 } else {
6626 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6627 BTRFS_I(dir));
6628 }
6629 if (ret) {
6630 btrfs_err(fs_info,
6631 "error inheriting props for ino %llu (root %llu): %d",
6632 btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
6633 }
6634
6635 /*
6636 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6637 * probably a bug.
6638 */
6639 if (!args->subvol) {
6640 ret = btrfs_init_inode_security(trans, args);
6641 if (unlikely(ret)) {
6642 btrfs_abort_transaction(trans, ret);
6643 goto discard;
6644 }
6645 }
6646
6647 ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
6648 if (WARN_ON(ret)) {
6649 /* Shouldn't happen, we used xa_reserve() before. */
6650 btrfs_abort_transaction(trans, ret);
6651 goto discard;
6652 }
6653
6654 trace_btrfs_inode_new(inode);
6655 btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6656
6657 btrfs_update_root_times(trans, root);
6658
6659 if (args->orphan) {
6660 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6661 if (unlikely(ret)) {
6662 btrfs_abort_transaction(trans, ret);
6663 goto discard;
6664 }
6665 } else {
6666 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6667 0, BTRFS_I(inode)->dir_index);
6668 if (unlikely(ret)) {
6669 btrfs_abort_transaction(trans, ret);
6670 goto discard;
6671 }
6672 }
6673
6674 return 0;
6675
6676 discard:
6677 /*
6678 * discard_new_inode() calls iput(), but the caller owns the reference
6679 * to the inode.
6680 */
6681 ihold(inode);
6682 discard_new_inode(inode);
6683 out:
6684 if (xa_reserved)
6685 xa_release(&root->inodes, objectid);
6686
6687 btrfs_free_path(path);
6688 return ret;
6689 }
6690
6691 /*
6692  * Utility function to add 'inode' into 'parent_inode' with
6693  * a given name and a given sequence number.
6694  * If 'add_backref' is true, also insert a backref from the
6695  * inode to the parent directory.
6696 */
6697 int btrfs_add_link(struct btrfs_trans_handle *trans,
6698 struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6699 const struct fscrypt_str *name, bool add_backref, u64 index)
6700 {
6701 int ret = 0;
6702 struct btrfs_key key;
6703 struct btrfs_root *root = parent_inode->root;
6704 u64 ino = btrfs_ino(inode);
6705 u64 parent_ino = btrfs_ino(parent_inode);
6706
6707 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6708 memcpy(&key, &inode->root->root_key, sizeof(key));
6709 } else {
6710 key.objectid = ino;
6711 key.type = BTRFS_INODE_ITEM_KEY;
6712 key.offset = 0;
6713 }
6714
6715 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6716 ret = btrfs_add_root_ref(trans, key.objectid,
6717 btrfs_root_id(root), parent_ino,
6718 index, name);
6719 } else if (add_backref) {
6720 ret = btrfs_insert_inode_ref(trans, root, name,
6721 ino, parent_ino, index);
6722 }
6723
6724 /* Nothing to clean up yet */
6725 if (ret)
6726 return ret;
6727
6728 ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6729 btrfs_inode_type(inode), index);
6730 if (ret == -EEXIST || ret == -EOVERFLOW)
6731 goto fail_dir_item;
6732 else if (unlikely(ret)) {
6733 btrfs_abort_transaction(trans, ret);
6734 return ret;
6735 }
6736
6737 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6738 name->len * 2);
6739 inode_inc_iversion(&parent_inode->vfs_inode);
6740 update_time_after_link_or_unlink(parent_inode);
6741
6742 ret = btrfs_update_inode(trans, parent_inode);
6743 if (ret)
6744 btrfs_abort_transaction(trans, ret);
6745 return ret;
6746
6747 fail_dir_item:
6748 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6749 u64 local_index;
6750 int ret2;
6751
6752 ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root),
6753 parent_ino, &local_index, name);
6754 if (ret2)
6755 btrfs_abort_transaction(trans, ret2);
6756 } else if (add_backref) {
6757 int ret2;
6758
6759 ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL);
6760 if (ret2)
6761 btrfs_abort_transaction(trans, ret2);
6762 }
6763
6764 /* Return the original error code */
6765 return ret;
6766 }
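
/*
 * Typical call pattern (see btrfs_link() below): the caller reserves a
 * directory index first and then adds the link inside an already started
 * transaction, roughly:
 *
 *	ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
 *	...
 *	ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
 *			     &fname.disk_name, 1, index);
 *	...
 *	ret = btrfs_update_inode(trans, BTRFS_I(inode));
 */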
6767
6768 static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6769 struct inode *inode)
6770 {
6771 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6772 struct btrfs_root *root = BTRFS_I(dir)->root;
6773 struct btrfs_new_inode_args new_inode_args = {
6774 .dir = dir,
6775 .dentry = dentry,
6776 .inode = inode,
6777 };
6778 unsigned int trans_num_items;
6779 struct btrfs_trans_handle *trans;
6780 int ret;
6781
6782 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6783 if (ret)
6784 goto out_inode;
6785
6786 trans = btrfs_start_transaction(root, trans_num_items);
6787 if (IS_ERR(trans)) {
6788 ret = PTR_ERR(trans);
6789 goto out_new_inode_args;
6790 }
6791
6792 ret = btrfs_create_new_inode(trans, &new_inode_args);
6793 if (!ret)
6794 d_instantiate_new(dentry, inode);
6795
6796 btrfs_end_transaction(trans);
6797 btrfs_btree_balance_dirty(fs_info);
6798 out_new_inode_args:
6799 btrfs_new_inode_args_destroy(&new_inode_args);
6800 out_inode:
6801 if (ret)
6802 iput(inode);
6803 return ret;
6804 }
6805
6806 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6807 struct dentry *dentry, umode_t mode, dev_t rdev)
6808 {
6809 struct inode *inode;
6810
6811 inode = new_inode(dir->i_sb);
6812 if (!inode)
6813 return -ENOMEM;
6814 inode_init_owner(idmap, inode, dir, mode);
6815 inode->i_op = &btrfs_special_inode_operations;
6816 init_special_inode(inode, inode->i_mode, rdev);
6817 return btrfs_create_common(dir, dentry, inode);
6818 }
6819
6820 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6821 struct dentry *dentry, umode_t mode, bool excl)
6822 {
6823 struct inode *inode;
6824
6825 inode = new_inode(dir->i_sb);
6826 if (!inode)
6827 return -ENOMEM;
6828 inode_init_owner(idmap, inode, dir, mode);
6829 inode->i_fop = &btrfs_file_operations;
6830 inode->i_op = &btrfs_file_inode_operations;
6831 inode->i_mapping->a_ops = &btrfs_aops;
6832 return btrfs_create_common(dir, dentry, inode);
6833 }
6834
6835 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6836 struct dentry *dentry)
6837 {
6838 struct btrfs_trans_handle *trans = NULL;
6839 struct btrfs_root *root = BTRFS_I(dir)->root;
6840 struct inode *inode = d_inode(old_dentry);
6841 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
6842 struct fscrypt_name fname;
6843 u64 index;
6844 int ret;
6845
6846 	/* Do not allow sys_link()s across subvolumes of the same device. */
6847 if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
6848 return -EXDEV;
6849
6850 if (inode->i_nlink >= BTRFS_LINK_MAX)
6851 return -EMLINK;
6852
6853 ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6854 if (ret)
6855 goto fail;
6856
6857 ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
6858 if (ret)
6859 goto fail;
6860
6861 /*
6862 * 2 items for inode and inode ref
6863 * 2 items for dir items
6864 * 1 item for parent inode
6865 * 1 item for orphan item deletion if O_TMPFILE
6866 */
6867 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6868 if (IS_ERR(trans)) {
6869 ret = PTR_ERR(trans);
6870 trans = NULL;
6871 goto fail;
6872 }
6873
6874 /* There are several dir indexes for this inode, clear the cache. */
6875 BTRFS_I(inode)->dir_index = 0ULL;
6876 inode_inc_iversion(inode);
6877 inode_set_ctime_current(inode);
6878
6879 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6880 &fname.disk_name, 1, index);
6881 if (ret)
6882 goto fail;
6883
6884 	/* Link added, now we update the inode item with the new link count. */
6885 inc_nlink(inode);
6886 ret = btrfs_update_inode(trans, BTRFS_I(inode));
6887 if (unlikely(ret)) {
6888 btrfs_abort_transaction(trans, ret);
6889 goto fail;
6890 }
6891
6892 if (inode->i_nlink == 1) {
6893 /*
6894 * If the new hard link count is 1, it's a file created with the
6895 * open(2) O_TMPFILE flag.
6896 */
6897 ret = btrfs_orphan_del(trans, BTRFS_I(inode));
6898 if (unlikely(ret)) {
6899 btrfs_abort_transaction(trans, ret);
6900 goto fail;
6901 }
6902 }
6903
6904 /* Grab reference for the new dentry passed to d_instantiate(). */
6905 ihold(inode);
6906 d_instantiate(dentry, inode);
6907 btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
6908
6909 fail:
6910 fscrypt_free_filename(&fname);
6911 if (trans)
6912 btrfs_end_transaction(trans);
6913 btrfs_btree_balance_dirty(fs_info);
6914 return ret;
6915 }
6916
6917 static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6918 struct dentry *dentry, umode_t mode)
6919 {
6920 struct inode *inode;
6921
6922 inode = new_inode(dir->i_sb);
6923 if (!inode)
6924 return ERR_PTR(-ENOMEM);
6925 inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6926 inode->i_op = &btrfs_dir_inode_operations;
6927 inode->i_fop = &btrfs_dir_file_operations;
6928 return ERR_PTR(btrfs_create_common(dir, dentry, inode));
6929 }
6930
6931 static noinline int uncompress_inline(struct btrfs_path *path,
6932 struct folio *folio,
6933 struct btrfs_file_extent_item *item)
6934 {
6935 int ret;
6936 struct extent_buffer *leaf = path->nodes[0];
6937 const u32 blocksize = leaf->fs_info->sectorsize;
6938 char *tmp;
6939 size_t max_size;
6940 unsigned long inline_size;
6941 unsigned long ptr;
6942 int compress_type;
6943
6944 compress_type = btrfs_file_extent_compression(leaf, item);
6945 max_size = btrfs_file_extent_ram_bytes(leaf, item);
6946 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6947 tmp = kmalloc(inline_size, GFP_NOFS);
6948 if (!tmp)
6949 return -ENOMEM;
6950 ptr = btrfs_file_extent_inline_start(item);
6951
6952 read_extent_buffer(leaf, tmp, ptr, inline_size);
6953
6954 max_size = min_t(unsigned long, blocksize, max_size);
6955 ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
6956 max_size);
6957
6958 /*
6959 	 * The decompression code contains a memset to fill in any space between the
6960 	 * end of the uncompressed data and the end of max_size, in case the
6961 	 * decompressed data ends up shorter than ram_bytes. That doesn't cover the
6962 	 * hole between the end of an inline extent and the beginning of the next
6963 	 * block, so we cover that region here.
6964 */
6965
6966 if (max_size < blocksize)
6967 folio_zero_range(folio, max_size, blocksize - max_size);
6968 kfree(tmp);
6969 return ret;
6970 }
6971
6972 static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
6973 {
6974 const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
6975 struct btrfs_file_extent_item *fi;
6976 void *kaddr;
6977 size_t copy_size;
6978
6979 if (!folio || folio_test_uptodate(folio))
6980 return 0;
6981
6982 ASSERT(folio_pos(folio) == 0);
6983
6984 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6985 struct btrfs_file_extent_item);
6986 if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6987 return uncompress_inline(path, folio, fi);
6988
6989 copy_size = min_t(u64, blocksize,
6990 btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6991 kaddr = kmap_local_folio(folio, 0);
6992 read_extent_buffer(path->nodes[0], kaddr,
6993 btrfs_file_extent_inline_start(fi), copy_size);
6994 kunmap_local(kaddr);
6995 if (copy_size < blocksize)
6996 folio_zero_range(folio, copy_size, blocksize - copy_size);
6997 return 0;
6998 }
6999
7000 /*
7001 * Lookup the first extent overlapping a range in a file.
7002 *
7003 * @inode: file to search in
7004  * @folio:	folio to read extent data into if the extent is inline
7005 * @start: file offset
7006 * @len: length of range starting at @start
7007 *
7008 * Return the first &struct extent_map which overlaps the given range, reading
7009 * it from the B-tree and caching it if necessary. Note that there may be more
7010 * extents which overlap the given range after the returned extent_map.
7011 *
7012  * If @folio is not NULL and the extent is inline, this also reads the extent
7013  * data directly into the folio and marks the extent up to date in the io_tree.
7014 *
7015 * Return: ERR_PTR on error, non-NULL extent_map on success.
7016 */
7017 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
7018 struct folio *folio, u64 start, u64 len)
7019 {
7020 struct btrfs_fs_info *fs_info = inode->root->fs_info;
7021 int ret = 0;
7022 u64 extent_start = 0;
7023 u64 extent_end = 0;
7024 u64 objectid = btrfs_ino(inode);
7025 int extent_type = -1;
7026 struct btrfs_path *path = NULL;
7027 struct btrfs_root *root = inode->root;
7028 struct btrfs_file_extent_item *item;
7029 struct extent_buffer *leaf;
7030 struct btrfs_key found_key;
7031 struct extent_map *em = NULL;
7032 struct extent_map_tree *em_tree = &inode->extent_tree;
7033
7034 read_lock(&em_tree->lock);
7035 em = btrfs_lookup_extent_mapping(em_tree, start, len);
7036 read_unlock(&em_tree->lock);
7037
7038 if (em) {
7039 if (em->start > start || em->start + em->len <= start)
7040 btrfs_free_extent_map(em);
7041 else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
7042 btrfs_free_extent_map(em);
7043 else
7044 goto out;
7045 }
7046 em = btrfs_alloc_extent_map();
7047 if (!em) {
7048 ret = -ENOMEM;
7049 goto out;
7050 }
7051 em->start = EXTENT_MAP_HOLE;
7052 em->disk_bytenr = EXTENT_MAP_HOLE;
7053 em->len = (u64)-1;
7054
7055 path = btrfs_alloc_path();
7056 if (!path) {
7057 ret = -ENOMEM;
7058 goto out;
7059 }
7060
7061 /* Chances are we'll be called again, so go ahead and do readahead */
7062 path->reada = READA_FORWARD;
7063
7064 /*
7065 * The same explanation in load_free_space_cache applies here as well,
7066 * we only read when we're loading the free space cache, and at that
7067 * point the commit_root has everything we need.
7068 */
7069 if (btrfs_is_free_space_inode(inode)) {
7070 path->search_commit_root = 1;
7071 path->skip_locking = 1;
7072 }
7073
7074 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
7075 if (ret < 0) {
7076 goto out;
7077 } else if (ret > 0) {
7078 if (path->slots[0] == 0)
7079 goto not_found;
7080 path->slots[0]--;
7081 ret = 0;
7082 }
7083
7084 leaf = path->nodes[0];
7085 item = btrfs_item_ptr(leaf, path->slots[0],
7086 struct btrfs_file_extent_item);
7087 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7088 if (found_key.objectid != objectid ||
7089 found_key.type != BTRFS_EXTENT_DATA_KEY) {
7090 /*
7091 		 * If we back up past the first extent we want to move forward
7092 * and see if there is an extent in front of us, otherwise we'll
7093 * say there is a hole for our whole search range which can
7094 * cause problems.
7095 */
7096 extent_end = start;
7097 goto next;
7098 }
7099
7100 extent_type = btrfs_file_extent_type(leaf, item);
7101 extent_start = found_key.offset;
7102 extent_end = btrfs_file_extent_end(path);
7103 if (extent_type == BTRFS_FILE_EXTENT_REG ||
7104 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7105 		/* Only a regular file can have regular/prealloc extents. */
7106 if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) {
7107 ret = -EUCLEAN;
7108 btrfs_crit(fs_info,
7109 "regular/prealloc extent found for non-regular inode %llu",
7110 btrfs_ino(inode));
7111 goto out;
7112 }
7113 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
7114 extent_start);
7115 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7116 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
7117 path->slots[0],
7118 extent_start);
7119 }
7120 next:
7121 if (start >= extent_end) {
7122 path->slots[0]++;
7123 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
7124 ret = btrfs_next_leaf(root, path);
7125 if (ret < 0)
7126 goto out;
7127 else if (ret > 0)
7128 goto not_found;
7129
7130 leaf = path->nodes[0];
7131 }
7132 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7133 if (found_key.objectid != objectid ||
7134 found_key.type != BTRFS_EXTENT_DATA_KEY)
7135 goto not_found;
7136 if (start + len <= found_key.offset)
7137 goto not_found;
7138 if (start > found_key.offset)
7139 goto next;
7140
7141 /* New extent overlaps with existing one */
7142 em->start = start;
7143 em->len = found_key.offset - start;
7144 em->disk_bytenr = EXTENT_MAP_HOLE;
7145 goto insert;
7146 }
7147
7148 btrfs_extent_item_to_extent_map(inode, path, item, em);
7149
7150 if (extent_type == BTRFS_FILE_EXTENT_REG ||
7151 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7152 goto insert;
7153 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7154 /*
7155 * Inline extent can only exist at file offset 0. This is
7156 * ensured by tree-checker and inline extent creation path.
7157 * Thus all members representing file offsets should be zero.
7158 */
7159 ASSERT(extent_start == 0);
7160 ASSERT(em->start == 0);
7161
7162 /*
7163 * btrfs_extent_item_to_extent_map() should have properly
7164 * initialized em members already.
7165 *
7166 * Other members are not utilized for inline extents.
7167 */
7168 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
7169 ASSERT(em->len == fs_info->sectorsize);
7170
7171 ret = read_inline_extent(path, folio);
7172 if (ret < 0)
7173 goto out;
7174 goto insert;
7175 }
7176 not_found:
7177 em->start = start;
7178 em->len = len;
7179 em->disk_bytenr = EXTENT_MAP_HOLE;
7180 insert:
7181 ret = 0;
7182 btrfs_release_path(path);
7183 if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) {
7184 btrfs_err(fs_info,
7185 "bad extent! em: [%llu %llu] passed [%llu %llu]",
7186 em->start, em->len, start, len);
7187 ret = -EIO;
7188 goto out;
7189 }
7190
7191 write_lock(&em_tree->lock);
7192 ret = btrfs_add_extent_mapping(inode, &em, start, len);
7193 write_unlock(&em_tree->lock);
7194 out:
7195 btrfs_free_path(path);
7196
7197 trace_btrfs_get_extent(root, inode, em);
7198
7199 if (ret) {
7200 btrfs_free_extent_map(em);
7201 return ERR_PTR(ret);
7202 }
7203 return em;
7204 }
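
/*
 * Usage sketch (hedged, not a canonical example): the returned extent_map
 * carries a reference, so a lookup is expected to look roughly like:
 *
 *	em = btrfs_get_extent(inode, NULL, start, len);
 *	if (IS_ERR(em))
 *		return PTR_ERR(em);
 *	... inspect em->start, em->len, em->disk_bytenr ...
 *	btrfs_free_extent_map(em);
 */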
7205
7206 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7207 {
7208 struct btrfs_block_group *block_group;
7209 bool readonly = false;
7210
7211 block_group = btrfs_lookup_block_group(fs_info, bytenr);
7212 if (!block_group || block_group->ro)
7213 readonly = true;
7214 if (block_group)
7215 btrfs_put_block_group(block_group);
7216 return readonly;
7217 }
7218
7219 /*
7220 * Check if we can do nocow write into the range [@offset, @offset + @len)
7221 *
7222 * @offset: File offset
7223 * @len: The length to write, will be updated to the nocow writeable
7224 * range
7225  * @file_extent:	(optional) Return the file extent found for the NOCOW
7226  *			range, including its disk_bytenr, num_bytes and ram_bytes
7227  * @nowait:		Whether to do the lookup in non-blocking (nowait) mode
7228 *
7229 * Return:
7230 * >0 and update @len if we can do nocow write
7231 * 0 if we can't do nocow write
7232 * <0 if error happened
7233 *
7234  * NOTE: This only checks the file extents, the caller is responsible for
7235  * waiting for any ordered extents.
7236 */
7237 noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
7238 struct btrfs_file_extent *file_extent,
7239 bool nowait)
7240 {
7241 struct btrfs_root *root = inode->root;
7242 struct btrfs_fs_info *fs_info = root->fs_info;
7243 struct can_nocow_file_extent_args nocow_args = { 0 };
7244 BTRFS_PATH_AUTO_FREE(path);
7245 int ret;
7246 struct extent_buffer *leaf;
7247 struct extent_io_tree *io_tree = &inode->io_tree;
7248 struct btrfs_file_extent_item *fi;
7249 struct btrfs_key key;
7250 int found_type;
7251
7252 path = btrfs_alloc_path();
7253 if (!path)
7254 return -ENOMEM;
7255 path->nowait = nowait;
7256
7257 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
7258 offset, 0);
7259 if (ret < 0)
7260 return ret;
7261
7262 if (ret == 1) {
7263 if (path->slots[0] == 0) {
7264 /* Can't find the item, must COW. */
7265 return 0;
7266 }
7267 path->slots[0]--;
7268 }
7269 ret = 0;
7270 leaf = path->nodes[0];
7271 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7272 if (key.objectid != btrfs_ino(inode) ||
7273 key.type != BTRFS_EXTENT_DATA_KEY) {
7274 /* Not our file or wrong item type, must COW. */
7275 return 0;
7276 }
7277
7278 if (key.offset > offset) {
7279 /* Wrong offset, must COW. */
7280 return 0;
7281 }
7282
7283 if (btrfs_file_extent_end(path) <= offset)
7284 return 0;
7285
7286 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7287 found_type = btrfs_file_extent_type(leaf, fi);
7288
7289 nocow_args.start = offset;
7290 nocow_args.end = offset + *len - 1;
7291 nocow_args.free_path = true;
7292
7293 ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
7294 /* can_nocow_file_extent() has freed the path. */
7295 path = NULL;
7296
7297 if (ret != 1) {
7298 /* Treat errors as not being able to NOCOW. */
7299 return 0;
7300 }
7301
7302 if (btrfs_extent_readonly(fs_info,
7303 nocow_args.file_extent.disk_bytenr +
7304 nocow_args.file_extent.offset))
7305 return 0;
7306
7307 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
7308 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7309 u64 range_end;
7310
7311 range_end = round_up(offset + nocow_args.file_extent.num_bytes,
7312 root->fs_info->sectorsize) - 1;
7313 ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
7314 EXTENT_DELALLOC);
7315 if (ret)
7316 return -EAGAIN;
7317 }
7318
7319 if (file_extent)
7320 memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
7321
7322 *len = nocow_args.file_extent.num_bytes;
7323
7324 return 1;
7325 }
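
/*
 * Usage sketch (hedged): per the NOTE above, a write path that wants to skip
 * COW first waits for ordered extents in the range and then does roughly:
 *
 *	u64 len = write_bytes;
 *
 *	ret = can_nocow_extent(inode, pos, &len, &file_extent, nowait);
 *	if (ret <= 0)
 *		fall back to a COW write (or propagate the error);
 *	else
 *		do the NOCOW write for the (possibly shrunken) len bytes;
 */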
7326
7327 /* The callers of this must take lock_extent() */
7328 struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
7329 const struct btrfs_file_extent *file_extent,
7330 int type)
7331 {
7332 struct extent_map *em;
7333 int ret;
7334
7335 /*
7336 * Note the missing NOCOW type.
7337 *
7338 	 * For pure NOCOW writes, we should not create an io extent map, but
7339 	 * just reuse the existing one.
7340 * Only PREALLOC writes (NOCOW write into preallocated range) can
7341 * create an io extent map.
7342 */
7343 ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7344 type == BTRFS_ORDERED_COMPRESSED ||
7345 type == BTRFS_ORDERED_REGULAR);
7346
7347 switch (type) {
7348 case BTRFS_ORDERED_PREALLOC:
7349 		/* We're only referring to part of a larger preallocated extent. */
7350 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7351 break;
7352 case BTRFS_ORDERED_REGULAR:
7353 		/* COW results in a new extent matching our file extent size. */
7354 ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
7355 ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
7356
7357 /* Since it's a new extent, we should not have any offset. */
7358 ASSERT(file_extent->offset == 0);
7359 break;
7360 case BTRFS_ORDERED_COMPRESSED:
7361 /* Must be compressed. */
7362 ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
7363
7364 /*
7365 		 * Encoded write can make us refer to part of the
7366 * uncompressed extent.
7367 */
7368 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7369 break;
7370 }
7371
7372 em = btrfs_alloc_extent_map();
7373 if (!em)
7374 return ERR_PTR(-ENOMEM);
7375
7376 em->start = start;
7377 em->len = file_extent->num_bytes;
7378 em->disk_bytenr = file_extent->disk_bytenr;
7379 em->disk_num_bytes = file_extent->disk_num_bytes;
7380 em->ram_bytes = file_extent->ram_bytes;
7381 em->generation = -1;
7382 em->offset = file_extent->offset;
7383 em->flags |= EXTENT_FLAG_PINNED;
7384 if (type == BTRFS_ORDERED_COMPRESSED)
7385 btrfs_extent_map_set_compression(em, file_extent->compression);
7386
7387 ret = btrfs_replace_extent_map_range(inode, em, true);
7388 if (ret) {
7389 btrfs_free_extent_map(em);
7390 return ERR_PTR(ret);
7391 }
7392
7393 	/* The em has 2 refs now, the caller needs to do btrfs_free_extent_map() once. */
7394 return em;
7395 }
7396
7397 /*
7398 * For release_folio() and invalidate_folio() we have a race window where
7399 * folio_end_writeback() is called but the subpage spinlock is not yet released.
7400  * If we continue to release/invalidate the folio, we could cause a
7401  * use-after-free of the subpage spinlock. So this function spins and waits
7402  * for the subpage spinlock to be released.
7403 */
7404 static void wait_subpage_spinlock(struct folio *folio)
7405 {
7406 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
7407 struct btrfs_folio_state *bfs;
7408
7409 if (!btrfs_is_subpage(fs_info, folio))
7410 return;
7411
7412 ASSERT(folio_test_private(folio) && folio_get_private(folio));
7413 bfs = folio_get_private(folio);
7414
7415 /*
7416 * This may look insane as we just acquire the spinlock and release it,
7417 * without doing anything. But we just want to make sure no one is
7418 * still holding the subpage spinlock.
7419 	 * And since the folio is neither dirty nor under writeback, and we have
7420 	 * the folio locked, the only possible way to hold the spinlock is from
7421 	 * the endio function clearing folio writeback.
7422 *
7423 * Here we just acquire the spinlock so that all existing callers
7424 * should exit and we're safe to release/invalidate the page.
7425 */
7426 spin_lock_irq(&bfs->lock);
7427 spin_unlock_irq(&bfs->lock);
7428 }
7429
7430 static int btrfs_launder_folio(struct folio *folio)
7431 {
7432 return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
7433 folio_size(folio), NULL);
7434 }
7435
7436 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7437 {
7438 if (try_release_extent_mapping(folio, gfp_flags)) {
7439 wait_subpage_spinlock(folio);
7440 clear_folio_extent_mapped(folio);
7441 return true;
7442 }
7443 return false;
7444 }
7445
7446 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7447 {
7448 if (folio_test_writeback(folio) || folio_test_dirty(folio))
7449 return false;
7450 return __btrfs_release_folio(folio, gfp_flags);
7451 }
7452
7453 #ifdef CONFIG_MIGRATION
7454 static int btrfs_migrate_folio(struct address_space *mapping,
7455 struct folio *dst, struct folio *src,
7456 enum migrate_mode mode)
7457 {
7458 int ret = filemap_migrate_folio(mapping, dst, src, mode);
7459
7460 if (ret)
7461 return ret;
7462
7463 if (folio_test_ordered(src)) {
7464 folio_clear_ordered(src);
7465 folio_set_ordered(dst);
7466 }
7467
7468 return 0;
7469 }
7470 #else
7471 #define btrfs_migrate_folio NULL
7472 #endif
7473
7474 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7475 size_t length)
7476 {
7477 struct btrfs_inode *inode = folio_to_inode(folio);
7478 struct btrfs_fs_info *fs_info = inode->root->fs_info;
7479 struct extent_io_tree *tree = &inode->io_tree;
7480 struct extent_state *cached_state = NULL;
7481 u64 page_start = folio_pos(folio);
7482 u64 page_end = page_start + folio_size(folio) - 1;
7483 u64 cur;
7484 int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7485
7486 /*
7487 	 * We have the folio locked so no new ordered extent can be created on
7488 	 * this folio, nor can a bio be submitted for this folio.
7489 	 *
7490 	 * But an already submitted bio can still be finished on this folio.
7491 	 * Furthermore, the endio function won't skip a folio that already has
7492 	 * Ordered cleared, so it's possible for endio and
7493 * invalidate_folio to do the same ordered extent accounting twice
7494 * on one folio.
7495 *
7496 * So here we wait for any submitted bios to finish, so that we won't
7497 * do double ordered extent accounting on the same folio.
7498 */
7499 folio_wait_writeback(folio);
7500 wait_subpage_spinlock(folio);
7501
7502 /*
7503 	 * For the subpage case, we have call sites like
7504 	 * btrfs_punch_hole_lock_range() which pass a range not aligned to the
7505 	 * sectorsize.
7506 	 * If the range doesn't cover the full folio, we don't need to and
7507 	 * shouldn't clear the folio's extent mapped state, as folio->private can
7508 	 * still record subpage dirty bits for other parts of the range.
7509 	 *
7510 	 * For cases that invalidate the full folio even though the range doesn't
7511 	 * cover the full folio, like invalidating the last folio, we're
7512 	 * still safe to wait for the ordered extents to finish.
7513 */
7514 if (!(offset == 0 && length == folio_size(folio))) {
7515 btrfs_release_folio(folio, GFP_NOFS);
7516 return;
7517 }
7518
7519 if (!inode_evicting)
7520 btrfs_lock_extent(tree, page_start, page_end, &cached_state);
7521
7522 cur = page_start;
7523 while (cur < page_end) {
7524 struct btrfs_ordered_extent *ordered;
7525 u64 range_end;
7526 u32 range_len;
7527 u32 extra_flags = 0;
7528
7529 ordered = btrfs_lookup_first_ordered_range(inode, cur,
7530 page_end + 1 - cur);
7531 if (!ordered) {
7532 range_end = page_end;
7533 /*
7534 * No ordered extent covering this range, we are safe
7535 * to delete all extent states in the range.
7536 */
7537 extra_flags = EXTENT_CLEAR_ALL_BITS;
7538 goto next;
7539 }
7540 if (ordered->file_offset > cur) {
7541 /*
7542 * There is a range between [cur, oe->file_offset) not
7543 * covered by any ordered extent.
7544 * We are safe to delete all extent states, and handle
7545 * the ordered extent in the next iteration.
7546 */
7547 range_end = ordered->file_offset - 1;
7548 extra_flags = EXTENT_CLEAR_ALL_BITS;
7549 goto next;
7550 }
7551
7552 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
7553 page_end);
7554 ASSERT(range_end + 1 - cur < U32_MAX);
7555 range_len = range_end + 1 - cur;
7556 if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
7557 /*
7558 * If Ordered is cleared, it means endio has
7559 * already been executed for the range.
7560 * We can't delete the extent states as
7561 * btrfs_finish_ordered_io() may still use some of them.
7562 */
7563 goto next;
7564 }
7565 btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
7566
7567 /*
7568 * IO on this page will never be started, so we need to account
7569 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
7570 * here, must leave that up for the ordered extent completion.
7571 *
7572 * This will also unlock the range for incoming
7573 * btrfs_finish_ordered_io().
7574 */
7575 if (!inode_evicting)
7576 btrfs_clear_extent_bit(tree, cur, range_end,
7577 EXTENT_DELALLOC |
7578 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7579 EXTENT_DEFRAG, &cached_state);
7580
7581 spin_lock_irq(&inode->ordered_tree_lock);
7582 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7583 ordered->truncated_len = min(ordered->truncated_len,
7584 cur - ordered->file_offset);
7585 spin_unlock_irq(&inode->ordered_tree_lock);
7586
7587 /*
7588 * If the ordered extent has finished, we're safe to delete all
7589 * the extent states of the range, otherwise
7590 * btrfs_finish_ordered_io() will get executed by endio for
7591 * other pages, so we can't delete extent states.
7592 */
7593 if (btrfs_dec_test_ordered_pending(inode, &ordered,
7594 cur, range_end + 1 - cur)) {
7595 btrfs_finish_ordered_io(ordered);
7596 /*
7597 * The ordered extent has finished, now we're again
7598 * safe to delete all extent states of the range.
7599 */
7600 extra_flags = EXTENT_CLEAR_ALL_BITS;
7601 }
7602 next:
7603 if (ordered)
7604 btrfs_put_ordered_extent(ordered);
7605 /*
7606 * Qgroup reserved space handler
7607 * Sector(s) here will be either:
7608 *
7609 * 1) Already written to disk or bio already finished
7610 * Then its QGROUP_RESERVED bit in io_tree is already cleared.
7611 * Qgroup will be handled by its qgroup_record then.
7612 * btrfs_qgroup_free_data() call will do nothing here.
7613 *
7614 * 2) Not written to disk yet
7615 * Then btrfs_qgroup_free_data() call will clear the
7616 * QGROUP_RESERVED bit of its io_tree, and free the qgroup
7617 * reserved data space.
7618 * Since the IO will never happen for this page.
7619 */
7620 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
7621 if (!inode_evicting)
7622 btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
7623 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7624 EXTENT_DEFRAG | extra_flags,
7625 &cached_state);
7626 cur = range_end + 1;
7627 }
7628 /*
7629 	 * We have iterated through all ordered extents of the folio, so the folio
7630 	 * should not have Ordered set anymore, or the above iteration
7631 	 * did something wrong.
7632 */
7633 ASSERT(!folio_test_ordered(folio));
7634 btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
7635 if (!inode_evicting)
7636 __btrfs_release_folio(folio, GFP_NOFS);
7637 clear_folio_extent_mapped(folio);
7638 }
7639
7640 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
7641 {
7642 struct btrfs_truncate_control control = {
7643 .inode = inode,
7644 .ino = btrfs_ino(inode),
7645 .min_type = BTRFS_EXTENT_DATA_KEY,
7646 .clear_extent_range = true,
7647 };
7648 struct btrfs_root *root = inode->root;
7649 struct btrfs_fs_info *fs_info = root->fs_info;
7650 struct btrfs_block_rsv rsv;
7651 int ret;
7652 struct btrfs_trans_handle *trans;
7653 u64 mask = fs_info->sectorsize - 1;
7654 const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
7655
7656 if (!skip_writeback) {
7657 ret = btrfs_wait_ordered_range(inode,
7658 inode->vfs_inode.i_size & (~mask),
7659 (u64)-1);
7660 if (ret)
7661 return ret;
7662 }
7663
7664 /*
7665 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
7666 * things going on here:
7667 *
7668 * 1) We need to reserve space to update our inode.
7669 *
7670 * 2) We need to have something to cache all the space that is going to
7671 * be free'd up by the truncate operation, but also have some slack
7672 * space reserved in case it uses space during the truncate (thank you
7673 * very much snapshotting).
7674 *
7675 * And we need these to be separate. The fact is we can use a lot of
7676 * space doing the truncate, and we have no earthly idea how much space
7677 * we will use, so we need the truncate reservation to be separate so it
7678 * doesn't end up using space reserved for updating the inode. We also
7679 * need to be able to stop the transaction and start a new one, which
7680 * means we need to be able to update the inode several times, and we
7681 * have no idea of knowing how many times that will be, so we can't just
7682 * reserve 1 item for the entirety of the operation, so that has to be
7683 * done separately as well.
7684 *
7685 * So that leaves us with
7686 *
7687 * 1) rsv - for the truncate reservation, which we will steal from the
7688 * transaction reservation.
7689 	 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
7690 * updating the inode.
7691 */
7692 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
7693 rsv.size = min_size;
7694 rsv.failfast = true;
7695
7696 /*
7697 * 1 for the truncate slack space
7698 * 1 for updating the inode.
7699 */
7700 trans = btrfs_start_transaction(root, 2);
7701 if (IS_ERR(trans)) {
7702 ret = PTR_ERR(trans);
7703 goto out;
7704 }
7705
7706 /* Migrate the slack space for the truncate to our reserve */
7707 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
7708 min_size, false);
7709 /*
7710 * We have reserved 2 metadata units when we started the transaction and
7711 * min_size matches 1 unit, so this should never fail, but if it does,
7712 * it's not critical we just fail truncation.
7713 */
7714 if (WARN_ON(ret)) {
7715 btrfs_end_transaction(trans);
7716 goto out;
7717 }
7718
7719 trans->block_rsv = &rsv;
7720
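	/*
	 * Drop the items in batches: btrfs_truncate_inode_items() may return
	 * -ENOSPC or -EAGAIN, in which case we update the inode, end the
	 * transaction, start a new one and refill the reservation before
	 * retrying.
	 */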
7721 while (1) {
7722 struct extent_state *cached_state = NULL;
7723 const u64 new_size = inode->vfs_inode.i_size;
7724 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
7725
7726 control.new_size = new_size;
7727 btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7728 /*
7729 * We want to drop from the next block forward in case this new
7730 * size is not block aligned since we will be keeping the last
7731 * block of the extent just the way it is.
7732 */
7733 btrfs_drop_extent_map_range(inode,
7734 ALIGN(new_size, fs_info->sectorsize),
7735 (u64)-1, false);
7736
7737 ret = btrfs_truncate_inode_items(trans, root, &control);
7738
7739 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
7740 btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
7741
7742 btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7743
7744 trans->block_rsv = &fs_info->trans_block_rsv;
7745 if (ret != -ENOSPC && ret != -EAGAIN)
7746 break;
7747
7748 ret = btrfs_update_inode(trans, inode);
7749 if (ret)
7750 break;
7751
7752 btrfs_end_transaction(trans);
7753 btrfs_btree_balance_dirty(fs_info);
7754
7755 trans = btrfs_start_transaction(root, 2);
7756 if (IS_ERR(trans)) {
7757 ret = PTR_ERR(trans);
7758 trans = NULL;
7759 break;
7760 }
7761
7762 btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
7763 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
7764 &rsv, min_size, false);
7765 /*
7766 * We have reserved 2 metadata units when we started the
7767 * transaction and min_size matches 1 unit, so this should never
7768 		 * fail, but if it does, it's not critical, we just fail the truncation.
7769 */
7770 if (WARN_ON(ret))
7771 break;
7772
7773 trans->block_rsv = &rsv;
7774 }
7775
7776 /*
7777 	 * We can't call btrfs_truncate_block() inside a trans handle as we could
7778 	 * deadlock with freeze. If we got BTRFS_NEED_TRUNCATE_BLOCK then we
7779 	 * know we've truncated everything except the last little bit, and can
7780 	 * do btrfs_truncate_block() and then update the disk_i_size.
7781 */
7782 if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
7783 btrfs_end_transaction(trans);
7784 btrfs_btree_balance_dirty(fs_info);
7785
7786 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
7787 inode->vfs_inode.i_size, (u64)-1);
7788 if (ret)
7789 goto out;
7790 trans = btrfs_start_transaction(root, 1);
7791 if (IS_ERR(trans)) {
7792 ret = PTR_ERR(trans);
7793 goto out;
7794 }
7795 btrfs_inode_safe_disk_i_size_write(inode, 0);
7796 }
7797
7798 if (trans) {
7799 int ret2;
7800
7801 trans->block_rsv = &fs_info->trans_block_rsv;
7802 ret2 = btrfs_update_inode(trans, inode);
7803 if (ret2 && !ret)
7804 ret = ret2;
7805
7806 ret2 = btrfs_end_transaction(trans);
7807 if (ret2 && !ret)
7808 ret = ret2;
7809 btrfs_btree_balance_dirty(fs_info);
7810 }
7811 out:
7812 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
7813 /*
7814 * So if we truncate and then write and fsync we normally would just
7815 * write the extents that changed, which is a problem if we need to
7816 * first truncate that entire inode. So set this flag so we write out
7817 * all of the extents in the inode to the sync log so we're completely
7818 * safe.
7819 *
7820 * If no extents were dropped or trimmed we don't need to force the next
7821 * fsync to truncate all the inode's items from the log and re-log them
7822 * all. This means the truncate operation did not change the file size,
7823 * or changed it to a smaller size but there was only an implicit hole
7824 * between the old i_size and the new i_size, and there were no prealloc
7825 * extents beyond i_size to drop.
7826 */
7827 if (control.extents_found > 0)
7828 btrfs_set_inode_full_sync(inode);
7829
7830 return ret;
7831 }
7832
7833 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
7834 struct inode *dir)
7835 {
7836 struct inode *inode;
7837
7838 inode = new_inode(dir->i_sb);
7839 if (inode) {
7840 /*
7841 * Subvolumes don't inherit the sgid bit or the parent's gid if
7842 * the parent's sgid bit is set. This is probably a bug.
7843 */
7844 inode_init_owner(idmap, inode, NULL,
7845 S_IFDIR | (~current_umask() & S_IRWXUGO));
7846 inode->i_op = &btrfs_dir_inode_operations;
7847 inode->i_fop = &btrfs_dir_file_operations;
7848 }
7849 return inode;
7850 }
7851
7852 struct inode *btrfs_alloc_inode(struct super_block *sb)
7853 {
7854 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
7855 struct btrfs_inode *ei;
7856 struct inode *inode;
7857
7858 ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
7859 if (!ei)
7860 return NULL;
7861
7862 ei->root = NULL;
7863 ei->generation = 0;
7864 ei->last_trans = 0;
7865 ei->last_sub_trans = 0;
7866 ei->logged_trans = 0;
7867 ei->delalloc_bytes = 0;
7868 /* new_delalloc_bytes and last_dir_index_offset are in a union. */
7869 ei->new_delalloc_bytes = 0;
7870 ei->defrag_bytes = 0;
7871 ei->disk_i_size = 0;
7872 ei->flags = 0;
7873 ei->ro_flags = 0;
7874 /*
7875 * ->index_cnt will be properly initialized later when creating a new
7876 * inode (btrfs_create_new_inode()) or when reading an existing inode
7877 * from disk (btrfs_read_locked_inode()).
7878 */
7879 ei->csum_bytes = 0;
7880 ei->dir_index = 0;
7881 ei->last_unlink_trans = 0;
7882 ei->last_reflink_trans = 0;
7883 ei->last_log_commit = 0;
7884
7885 spin_lock_init(&ei->lock);
7886 ei->outstanding_extents = 0;
7887 if (sb->s_magic != BTRFS_TEST_MAGIC)
7888 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
7889 BTRFS_BLOCK_RSV_DELALLOC);
7890 ei->runtime_flags = 0;
7891 ei->prop_compress = BTRFS_COMPRESS_NONE;
7892 ei->defrag_compress = BTRFS_COMPRESS_NONE;
7893
7894 ei->delayed_node = NULL;
7895
7896 ei->i_otime_sec = 0;
7897 ei->i_otime_nsec = 0;
7898
7899 inode = &ei->vfs_inode;
7900 btrfs_extent_map_tree_init(&ei->extent_tree);
7901
7902 /* This io tree sets the valid inode. */
7903 btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
7904 ei->io_tree.inode = ei;
7905
7906 ei->file_extent_tree = NULL;
7907
7908 mutex_init(&ei->log_mutex);
7909 spin_lock_init(&ei->ordered_tree_lock);
7910 ei->ordered_tree = RB_ROOT;
7911 ei->ordered_tree_last = NULL;
7912 INIT_LIST_HEAD(&ei->delalloc_inodes);
7913 INIT_LIST_HEAD(&ei->delayed_iput);
7914 init_rwsem(&ei->i_mmap_lock);
7915
7916 return inode;
7917 }
7918
7919 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7920 void btrfs_test_destroy_inode(struct inode *inode)
7921 {
7922 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
7923 kfree(BTRFS_I(inode)->file_extent_tree);
7924 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7925 }
7926 #endif
7927
7928 void btrfs_free_inode(struct inode *inode)
7929 {
7930 kfree(BTRFS_I(inode)->file_extent_tree);
7931 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7932 }
7933
7934 void btrfs_destroy_inode(struct inode *vfs_inode)
7935 {
7936 struct btrfs_ordered_extent *ordered;
7937 struct btrfs_inode *inode = BTRFS_I(vfs_inode);
7938 struct btrfs_root *root = inode->root;
7939 bool freespace_inode;
7940
7941 WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
7942 WARN_ON(vfs_inode->i_data.nrpages);
7943 WARN_ON(inode->block_rsv.reserved);
7944 WARN_ON(inode->block_rsv.size);
7945 WARN_ON(inode->outstanding_extents);
7946 if (!S_ISDIR(vfs_inode->i_mode)) {
7947 WARN_ON(inode->delalloc_bytes);
7948 WARN_ON(inode->new_delalloc_bytes);
7949 WARN_ON(inode->csum_bytes);
7950 }
7951 if (!root || !btrfs_is_data_reloc_root(root))
7952 WARN_ON(inode->defrag_bytes);
7953
7954 /*
7955 	 * This can happen when we create an inode, but somebody else also
7956 * created the same inode and we need to destroy the one we already
7957 * created.
7958 */
7959 if (!root)
7960 return;
7961
7962 /*
7963 * If this is a free space inode do not take the ordered extents lockdep
7964 * map.
7965 */
7966 freespace_inode = btrfs_is_free_space_inode(inode);
7967
7968 while (1) {
7969 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7970 if (!ordered)
7971 break;
7972 else {
7973 btrfs_err(root->fs_info,
7974 "found ordered extent %llu %llu on inode cleanup",
7975 ordered->file_offset, ordered->num_bytes);
7976
7977 if (!freespace_inode)
7978 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
7979
7980 btrfs_remove_ordered_extent(inode, ordered);
7981 btrfs_put_ordered_extent(ordered);
7982 btrfs_put_ordered_extent(ordered);
7983 }
7984 }
7985 btrfs_qgroup_check_reserved_leak(inode);
7986 btrfs_del_inode_from_root(inode);
7987 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
7988 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
7989 btrfs_put_root(inode->root);
7990 }
7991
7992 int btrfs_drop_inode(struct inode *inode)
7993 {
7994 struct btrfs_root *root = BTRFS_I(inode)->root;
7995
7996 if (root == NULL)
7997 return 1;
7998
7999 	/* The snap/subvol tree is being deleted. */
8000 if (btrfs_root_refs(&root->root_item) == 0)
8001 return 1;
8002 else
8003 return inode_generic_drop(inode);
8004 }
8005
8006 static void init_once(void *foo)
8007 {
8008 struct btrfs_inode *ei = foo;
8009
8010 inode_init_once(&ei->vfs_inode);
8011 #ifdef CONFIG_FS_VERITY
8012 ei->i_verity_info = NULL;
8013 #endif
8014 }
8015
8016 void __cold btrfs_destroy_cachep(void)
8017 {
8018 /*
8019 * Make sure all delayed rcu free inodes are flushed before we
8020 	 * destroy the cache.
8021 */
8022 rcu_barrier();
8023 kmem_cache_destroy(btrfs_inode_cachep);
8024 }
8025
8026 int __init btrfs_init_cachep(void)
8027 {
8028 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8029 sizeof(struct btrfs_inode), 0,
8030 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
8031 init_once);
8032 if (!btrfs_inode_cachep)
8033 return -ENOMEM;
8034
8035 return 0;
8036 }
8037
8038 static int btrfs_getattr(struct mnt_idmap *idmap,
8039 const struct path *path, struct kstat *stat,
8040 u32 request_mask, unsigned int flags)
8041 {
8042 u64 delalloc_bytes;
8043 u64 inode_bytes;
8044 struct inode *inode = d_inode(path->dentry);
8045 u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
8046 u32 bi_flags = BTRFS_I(inode)->flags;
8047 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
8048
8049 stat->result_mask |= STATX_BTIME;
8050 stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
8051 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
8052 if (bi_flags & BTRFS_INODE_APPEND)
8053 stat->attributes |= STATX_ATTR_APPEND;
8054 if (bi_flags & BTRFS_INODE_COMPRESS)
8055 stat->attributes |= STATX_ATTR_COMPRESSED;
8056 if (bi_flags & BTRFS_INODE_IMMUTABLE)
8057 stat->attributes |= STATX_ATTR_IMMUTABLE;
8058 if (bi_flags & BTRFS_INODE_NODUMP)
8059 stat->attributes |= STATX_ATTR_NODUMP;
8060 if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
8061 stat->attributes |= STATX_ATTR_VERITY;
8062
8063 stat->attributes_mask |= (STATX_ATTR_APPEND |
8064 STATX_ATTR_COMPRESSED |
8065 STATX_ATTR_IMMUTABLE |
8066 STATX_ATTR_NODUMP);
8067
8068 generic_fillattr(idmap, request_mask, inode, stat);
8069 stat->dev = BTRFS_I(inode)->root->anon_dev;
8070
8071 stat->subvol = btrfs_root_id(BTRFS_I(inode)->root);
8072 stat->result_mask |= STATX_SUBVOL;
8073
8074 spin_lock(&BTRFS_I(inode)->lock);
8075 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
8076 inode_bytes = inode_get_bytes(inode);
8077 spin_unlock(&BTRFS_I(inode)->lock);
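	/*
	 * Report blocks in 512-byte units: bytes already accounted to the inode
	 * plus bytes of new delalloc, each rounded up to the fs block size.
	 */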
8078 stat->blocks = (ALIGN(inode_bytes, blocksize) +
8079 ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
8080 return 0;
8081 }
8082
8083 static int btrfs_rename_exchange(struct inode *old_dir,
8084 struct dentry *old_dentry,
8085 struct inode *new_dir,
8086 struct dentry *new_dentry)
8087 {
8088 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
8089 struct btrfs_trans_handle *trans;
8090 unsigned int trans_num_items;
8091 struct btrfs_root *root = BTRFS_I(old_dir)->root;
8092 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8093 struct inode *new_inode = new_dentry->d_inode;
8094 struct inode *old_inode = old_dentry->d_inode;
8095 struct btrfs_rename_ctx old_rename_ctx;
8096 struct btrfs_rename_ctx new_rename_ctx;
8097 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8098 u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
8099 u64 old_idx = 0;
8100 u64 new_idx = 0;
8101 int ret;
8102 int ret2;
8103 bool need_abort = false;
8104 bool logs_pinned = false;
8105 struct fscrypt_name old_fname, new_fname;
8106 struct fscrypt_str *old_name, *new_name;
8107
8108 /*
8109 * For non-subvolumes allow exchange only within one subvolume, in the
8110 	 * same inode namespace. Two subvolumes (each represented as a directory) can
8111 * be exchanged as they're a logical link and have a fixed inode number.
8112 */
8113 if (root != dest &&
8114 (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
8115 new_ino != BTRFS_FIRST_FREE_OBJECTID))
8116 return -EXDEV;
8117
8118 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8119 if (ret)
8120 return ret;
8121
8122 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8123 if (ret) {
8124 fscrypt_free_filename(&old_fname);
8125 return ret;
8126 }
8127
8128 old_name = &old_fname.disk_name;
8129 new_name = &new_fname.disk_name;
8130
8131 /* close the race window with snapshot create/destroy ioctl */
8132 if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8133 new_ino == BTRFS_FIRST_FREE_OBJECTID)
8134 down_read(&fs_info->subvol_sem);
8135
8136 /*
8137 * For each inode:
8138 * 1 to remove old dir item
8139 * 1 to remove old dir index
8140 * 1 to add new dir item
8141 * 1 to add new dir index
8142 * 1 to update parent inode
8143 *
8144 * If the parents are the same, we only need to account for one
8145 */
8146 trans_num_items = (old_dir == new_dir ? 9 : 10);
8147 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8148 /*
8149 * 1 to remove old root ref
8150 * 1 to remove old root backref
8151 * 1 to add new root ref
8152 * 1 to add new root backref
8153 */
8154 trans_num_items += 4;
8155 } else {
8156 /*
8157 * 1 to update inode item
8158 * 1 to remove old inode ref
8159 * 1 to add new inode ref
8160 */
8161 trans_num_items += 3;
8162 }
8163 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8164 trans_num_items += 4;
8165 else
8166 trans_num_items += 3;
8167 trans = btrfs_start_transaction(root, trans_num_items);
8168 if (IS_ERR(trans)) {
8169 ret = PTR_ERR(trans);
8170 goto out_notrans;
8171 }
8172
8173 if (dest != root) {
8174 ret = btrfs_record_root_in_trans(trans, dest);
8175 if (ret)
8176 goto out_fail;
8177 }
8178
8179 /*
8180 * We need to find a free sequence number both in the source and
8181 * in the destination directory for the exchange.
8182 */
8183 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8184 if (ret)
8185 goto out_fail;
8186 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8187 if (ret)
8188 goto out_fail;
8189
8190 BTRFS_I(old_inode)->dir_index = 0ULL;
8191 BTRFS_I(new_inode)->dir_index = 0ULL;
8192
8193 /* Reference for the source. */
8194 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8195 /* force full log commit if subvolume involved. */
8196 btrfs_set_log_full_commit(trans);
8197 } else {
8198 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8199 btrfs_ino(BTRFS_I(new_dir)),
8200 old_idx);
8201 if (ret)
8202 goto out_fail;
8203 need_abort = true;
8204 }
8205
8206 /* And now for the dest. */
8207 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8208 /* force full log commit if subvolume involved. */
8209 btrfs_set_log_full_commit(trans);
8210 } else {
8211 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8212 btrfs_ino(BTRFS_I(old_dir)),
8213 new_idx);
8214 if (ret) {
8215 if (unlikely(need_abort))
8216 btrfs_abort_transaction(trans, ret);
8217 goto out_fail;
8218 }
8219 }
8220
8221 /* Update inode version and ctime/mtime. */
8222 inode_inc_iversion(old_dir);
8223 inode_inc_iversion(new_dir);
8224 inode_inc_iversion(old_inode);
8225 inode_inc_iversion(new_inode);
8226 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8227
8228 if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
8229 new_ino != BTRFS_FIRST_FREE_OBJECTID) {
8230 /*
8231 * If we are renaming in the same directory (and it's not for
8232 * root entries) pin the log early to prevent any concurrent
8233 * task from logging the directory after we removed the old
8234 * entries and before we add the new entries, otherwise that
8235 * task can sync a log without any entry for the inodes we are
8236 * renaming and therefore replaying that log, if a power failure
8237 * happens after syncing the log, would result in deleting the
8238 * inodes.
8239 *
8240 * If the rename affects two different directories, we want to
8241 		 * make sure that there's no log commit that contains
8242 * updates for only one of the directories but not for the
8243 * other.
8244 *
8245 * If we are renaming an entry for a root, we don't care about
8246 * log updates since we called btrfs_set_log_full_commit().
8247 */
8248 btrfs_pin_log_trans(root);
8249 btrfs_pin_log_trans(dest);
8250 logs_pinned = true;
8251 }
8252
8253 if (old_dentry->d_parent != new_dentry->d_parent) {
8254 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8255 BTRFS_I(old_inode), true);
8256 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8257 BTRFS_I(new_inode), true);
8258 }
8259
8260 /* src is a subvolume */
8261 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8262 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8263 if (unlikely(ret)) {
8264 btrfs_abort_transaction(trans, ret);
8265 goto out_fail;
8266 }
8267 } else { /* src is an inode */
8268 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8269 BTRFS_I(old_dentry->d_inode),
8270 old_name, &old_rename_ctx);
8271 if (unlikely(ret)) {
8272 btrfs_abort_transaction(trans, ret);
8273 goto out_fail;
8274 }
8275 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8276 if (unlikely(ret)) {
8277 btrfs_abort_transaction(trans, ret);
8278 goto out_fail;
8279 }
8280 }
8281
8282 /* dest is a subvolume */
8283 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8284 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8285 if (unlikely(ret)) {
8286 btrfs_abort_transaction(trans, ret);
8287 goto out_fail;
8288 }
8289 } else { /* dest is an inode */
8290 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8291 BTRFS_I(new_dentry->d_inode),
8292 new_name, &new_rename_ctx);
8293 if (unlikely(ret)) {
8294 btrfs_abort_transaction(trans, ret);
8295 goto out_fail;
8296 }
8297 ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
8298 if (unlikely(ret)) {
8299 btrfs_abort_transaction(trans, ret);
8300 goto out_fail;
8301 }
8302 }
8303
8304 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8305 new_name, 0, old_idx);
8306 if (unlikely(ret)) {
8307 btrfs_abort_transaction(trans, ret);
8308 goto out_fail;
8309 }
8310
8311 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8312 old_name, 0, new_idx);
8313 if (unlikely(ret)) {
8314 btrfs_abort_transaction(trans, ret);
8315 goto out_fail;
8316 }
8317
8318 if (old_inode->i_nlink == 1)
8319 BTRFS_I(old_inode)->dir_index = old_idx;
8320 if (new_inode->i_nlink == 1)
8321 BTRFS_I(new_inode)->dir_index = new_idx;
8322
8323 /*
8324 * Do the log updates for all inodes.
8325 *
8326 * If either entry is for a root we don't need to update the logs since
8327 * we've called btrfs_set_log_full_commit() before.
8328 */
8329 if (logs_pinned) {
8330 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8331 old_rename_ctx.index, new_dentry->d_parent);
8332 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8333 new_rename_ctx.index, old_dentry->d_parent);
8334 }
8335
8336 out_fail:
8337 if (logs_pinned) {
8338 btrfs_end_log_trans(root);
8339 btrfs_end_log_trans(dest);
8340 }
8341 ret2 = btrfs_end_transaction(trans);
8342 ret = ret ? ret : ret2;
8343 out_notrans:
8344 if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8345 old_ino == BTRFS_FIRST_FREE_OBJECTID)
8346 up_read(&fs_info->subvol_sem);
8347
8348 fscrypt_free_filename(&new_fname);
8349 fscrypt_free_filename(&old_fname);
8350 return ret;
8351 }
8352
8353 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8354 struct inode *dir)
8355 {
8356 struct inode *inode;
8357
8358 inode = new_inode(dir->i_sb);
8359 if (inode) {
8360 inode_init_owner(idmap, inode, dir,
8361 S_IFCHR | WHITEOUT_MODE);
8362 inode->i_op = &btrfs_special_inode_operations;
8363 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8364 }
8365 return inode;
8366 }
8367
8368 static int btrfs_rename(struct mnt_idmap *idmap,
8369 struct inode *old_dir, struct dentry *old_dentry,
8370 struct inode *new_dir, struct dentry *new_dentry,
8371 unsigned int flags)
8372 {
8373 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
8374 struct btrfs_new_inode_args whiteout_args = {
8375 .dir = old_dir,
8376 .dentry = old_dentry,
8377 };
8378 struct btrfs_trans_handle *trans;
8379 unsigned int trans_num_items;
8380 struct btrfs_root *root = BTRFS_I(old_dir)->root;
8381 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8382 struct inode *new_inode = d_inode(new_dentry);
8383 struct inode *old_inode = d_inode(old_dentry);
8384 struct btrfs_rename_ctx rename_ctx;
8385 u64 index = 0;
8386 int ret;
8387 int ret2;
8388 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8389 struct fscrypt_name old_fname, new_fname;
8390 bool logs_pinned = false;
8391
8392 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8393 return -EPERM;
8394
8395 /* we only allow rename subvolume link between subvolumes */
8396 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8397 return -EXDEV;
8398
8399 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8400 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
8401 return -ENOTEMPTY;
8402
8403 if (S_ISDIR(old_inode->i_mode) && new_inode &&
8404 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8405 return -ENOTEMPTY;
8406
8407 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8408 if (ret)
8409 return ret;
8410
8411 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8412 if (ret) {
8413 fscrypt_free_filename(&old_fname);
8414 return ret;
8415 }
8416
8417 /* check for collisions, even if the name isn't there */
8418 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
8419 if (ret) {
8420 if (ret == -EEXIST) {
8421 			/* We shouldn't get -EEXIST without a new_inode. */
8423 if (WARN_ON(!new_inode)) {
8424 goto out_fscrypt_names;
8425 }
8426 } else {
8427 /* maybe -EOVERFLOW */
8428 goto out_fscrypt_names;
8429 }
8430 }
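	/*
	 * -EEXIST with an existing target inode is the normal "replace the
	 * destination" case, so clear the error and carry on.
	 */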
8431 ret = 0;
8432
8433 /*
8434 	 * We're using rename to replace one file with another. Start IO on it
8435 	 * now so we don't add too much work to the end of the transaction.
8436 */
8437 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8438 filemap_flush(old_inode->i_mapping);
8439
8440 if (flags & RENAME_WHITEOUT) {
8441 whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
8442 if (!whiteout_args.inode) {
8443 ret = -ENOMEM;
8444 goto out_fscrypt_names;
8445 }
8446 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
8447 if (ret)
8448 goto out_whiteout_inode;
8449 } else {
8450 /* 1 to update the old parent inode. */
8451 trans_num_items = 1;
8452 }
8453
8454 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8455 /* Close the race window with snapshot create/destroy ioctl */
8456 down_read(&fs_info->subvol_sem);
8457 /*
8458 * 1 to remove old root ref
8459 * 1 to remove old root backref
8460 * 1 to add new root ref
8461 * 1 to add new root backref
8462 */
8463 trans_num_items += 4;
8464 } else {
8465 /*
8466 * 1 to update inode
8467 * 1 to remove old inode ref
8468 * 1 to add new inode ref
8469 */
8470 trans_num_items += 3;
8471 }
8472 /*
8473 * 1 to remove old dir item
8474 * 1 to remove old dir index
8475 * 1 to add new dir item
8476 * 1 to add new dir index
8477 */
8478 trans_num_items += 4;
8479 /* 1 to update new parent inode if it's not the same as the old parent */
8480 if (new_dir != old_dir)
8481 trans_num_items++;
8482 if (new_inode) {
8483 /*
8484 * 1 to update inode
8485 * 1 to remove inode ref
8486 * 1 to remove dir item
8487 * 1 to remove dir index
8488 * 1 to possibly add orphan item
8489 */
8490 trans_num_items += 5;
8491 }
8492 trans = btrfs_start_transaction(root, trans_num_items);
8493 if (IS_ERR(trans)) {
8494 ret = PTR_ERR(trans);
8495 goto out_notrans;
8496 }
8497
8498 if (dest != root) {
8499 ret = btrfs_record_root_in_trans(trans, dest);
8500 if (ret)
8501 goto out_fail;
8502 }
8503
8504 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
8505 if (ret)
8506 goto out_fail;
8507
8508 BTRFS_I(old_inode)->dir_index = 0ULL;
8509 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8510 /* force full log commit if subvolume involved. */
8511 btrfs_set_log_full_commit(trans);
8512 } else {
8513 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
8514 old_ino, btrfs_ino(BTRFS_I(new_dir)),
8515 index);
8516 if (ret)
8517 goto out_fail;
8518 }
8519
8520 inode_inc_iversion(old_dir);
8521 inode_inc_iversion(new_dir);
8522 inode_inc_iversion(old_inode);
8523 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8524
8525 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8526 /*
8527 * If we are renaming in the same directory (and it's not a
8528 * root entry) pin the log to prevent any concurrent task from
8529 * logging the directory after we removed the old entry and
8530 * before we add the new entry, otherwise that task can sync
8531 * a log without any entry for the inode we are renaming and
8532 * therefore replaying that log, if a power failure happens
8533 * after syncing the log, would result in deleting the inode.
8534 *
8535 * If the rename affects two different directories, we want to
8536 		 * make sure that there's no log commit that contains
8537 * updates for only one of the directories but not for the
8538 * other.
8539 *
8540 * If we are renaming an entry for a root, we don't care about
8541 * log updates since we called btrfs_set_log_full_commit().
8542 */
8543 btrfs_pin_log_trans(root);
8544 btrfs_pin_log_trans(dest);
8545 logs_pinned = true;
8546 }
8547
8548 if (old_dentry->d_parent != new_dentry->d_parent)
8549 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8550 BTRFS_I(old_inode), true);
8551
8552 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8553 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8554 if (unlikely(ret)) {
8555 btrfs_abort_transaction(trans, ret);
8556 goto out_fail;
8557 }
8558 } else {
8559 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8560 BTRFS_I(d_inode(old_dentry)),
8561 &old_fname.disk_name, &rename_ctx);
8562 if (unlikely(ret)) {
8563 btrfs_abort_transaction(trans, ret);
8564 goto out_fail;
8565 }
8566 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8567 if (unlikely(ret)) {
8568 btrfs_abort_transaction(trans, ret);
8569 goto out_fail;
8570 }
8571 }
8572
8573 if (new_inode) {
8574 inode_inc_iversion(new_inode);
8575 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
8576 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8577 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8578 if (unlikely(ret)) {
8579 btrfs_abort_transaction(trans, ret);
8580 goto out_fail;
8581 }
8582 BUG_ON(new_inode->i_nlink == 0);
8583 } else {
8584 ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8585 BTRFS_I(d_inode(new_dentry)),
8586 &new_fname.disk_name);
8587 if (unlikely(ret)) {
8588 btrfs_abort_transaction(trans, ret);
8589 goto out_fail;
8590 }
8591 }
8592 if (new_inode->i_nlink == 0) {
8593 ret = btrfs_orphan_add(trans,
8594 BTRFS_I(d_inode(new_dentry)));
8595 if (unlikely(ret)) {
8596 btrfs_abort_transaction(trans, ret);
8597 goto out_fail;
8598 }
8599 }
8600 }
8601
8602 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8603 &new_fname.disk_name, 0, index);
8604 if (unlikely(ret)) {
8605 btrfs_abort_transaction(trans, ret);
8606 goto out_fail;
8607 }
8608
8609 if (old_inode->i_nlink == 1)
8610 BTRFS_I(old_inode)->dir_index = index;
8611
8612 if (logs_pinned)
8613 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8614 rename_ctx.index, new_dentry->d_parent);
8615
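	/*
	 * With RENAME_WHITEOUT, create the whiteout inode at the old location
	 * only after the old entry has been removed and the new one added.
	 */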
8616 if (flags & RENAME_WHITEOUT) {
8617 ret = btrfs_create_new_inode(trans, &whiteout_args);
8618 if (unlikely(ret)) {
8619 btrfs_abort_transaction(trans, ret);
8620 goto out_fail;
8621 } else {
8622 unlock_new_inode(whiteout_args.inode);
8623 iput(whiteout_args.inode);
8624 whiteout_args.inode = NULL;
8625 }
8626 }
8627 out_fail:
8628 if (logs_pinned) {
8629 btrfs_end_log_trans(root);
8630 btrfs_end_log_trans(dest);
8631 }
8632 ret2 = btrfs_end_transaction(trans);
8633 ret = ret ? ret : ret2;
8634 out_notrans:
8635 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8636 up_read(&fs_info->subvol_sem);
8637 if (flags & RENAME_WHITEOUT)
8638 btrfs_new_inode_args_destroy(&whiteout_args);
8639 out_whiteout_inode:
8640 if (flags & RENAME_WHITEOUT)
8641 iput(whiteout_args.inode);
8642 out_fscrypt_names:
8643 fscrypt_free_filename(&old_fname);
8644 fscrypt_free_filename(&new_fname);
8645 return ret;
8646 }
8647
8648 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
8649 struct dentry *old_dentry, struct inode *new_dir,
8650 struct dentry *new_dentry, unsigned int flags)
8651 {
8652 int ret;
8653
8654 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
8655 return -EINVAL;
8656
8657 if (flags & RENAME_EXCHANGE)
8658 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
8659 new_dentry);
8660 else
8661 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
8662 new_dentry, flags);
8663
8664 btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
8665
8666 return ret;
8667 }
8668
8669 struct btrfs_delalloc_work {
8670 struct inode *inode;
8671 struct completion completion;
8672 struct list_head list;
8673 struct btrfs_work work;
8674 };
8675
8676 static void btrfs_run_delalloc_work(struct btrfs_work *work)
8677 {
8678 struct btrfs_delalloc_work *delalloc_work;
8679 struct inode *inode;
8680
8681 delalloc_work = container_of(work, struct btrfs_delalloc_work,
8682 work);
8683 inode = delalloc_work->inode;
8684 filemap_flush(inode->i_mapping);
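	/*
	 * If async (compressed) extents were queued for this inode, the first
	 * flush may only have kicked off async compression, so flush a second
	 * time to make sure everything gets submitted.
	 */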
8685 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8686 &BTRFS_I(inode)->runtime_flags))
8687 filemap_flush(inode->i_mapping);
8688
8689 iput(inode);
8690 complete(&delalloc_work->completion);
8691 }
8692
8693 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
8694 {
8695 struct btrfs_delalloc_work *work;
8696
8697 work = kmalloc(sizeof(*work), GFP_NOFS);
8698 if (!work)
8699 return NULL;
8700
8701 init_completion(&work->completion);
8702 INIT_LIST_HEAD(&work->list);
8703 work->inode = inode;
8704 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
8705
8706 return work;
8707 }
8708
8709 /*
8710  * Some fairly slow code that needs optimization. This walks the list
8711 * of all the inodes with pending delalloc and forces them to disk.
8712 */
8713 static int start_delalloc_inodes(struct btrfs_root *root,
8714 struct writeback_control *wbc, bool snapshot,
8715 bool in_reclaim_context)
8716 {
8717 struct btrfs_delalloc_work *work, *next;
8718 LIST_HEAD(works);
8719 LIST_HEAD(splice);
8720 int ret = 0;
8721 bool full_flush = wbc->nr_to_write == LONG_MAX;
8722
8723 mutex_lock(&root->delalloc_mutex);
8724 spin_lock(&root->delalloc_lock);
8725 list_splice_init(&root->delalloc_inodes, &splice);
8726 while (!list_empty(&splice)) {
8727 struct btrfs_inode *inode;
8728 struct inode *tmp_inode;
8729
8730 inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes);
8731
8732 list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
8733
8734 if (in_reclaim_context &&
8735 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
8736 continue;
8737
8738 tmp_inode = igrab(&inode->vfs_inode);
8739 if (!tmp_inode) {
8740 cond_resched_lock(&root->delalloc_lock);
8741 continue;
8742 }
8743 spin_unlock(&root->delalloc_lock);
8744
8745 if (snapshot)
8746 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
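		/*
		 * For a full flush, queue the writeback to the flush workers so
		 * all inodes are flushed in parallel; otherwise write directly
		 * with the caller's wbc, which bounds how much we write.
		 */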
8747 if (full_flush) {
8748 work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
8749 if (!work) {
8750 iput(&inode->vfs_inode);
8751 ret = -ENOMEM;
8752 goto out;
8753 }
8754 list_add_tail(&work->list, &works);
8755 btrfs_queue_work(root->fs_info->flush_workers,
8756 &work->work);
8757 } else {
8758 ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
8759 btrfs_add_delayed_iput(inode);
8760 if (ret || wbc->nr_to_write <= 0)
8761 goto out;
8762 }
8763 cond_resched();
8764 spin_lock(&root->delalloc_lock);
8765 }
8766 spin_unlock(&root->delalloc_lock);
8767
8768 out:
8769 list_for_each_entry_safe(work, next, &works, list) {
8770 list_del_init(&work->list);
8771 wait_for_completion(&work->completion);
8772 kfree(work);
8773 }
8774
8775 if (!list_empty(&splice)) {
8776 spin_lock(&root->delalloc_lock);
8777 list_splice_tail(&splice, &root->delalloc_inodes);
8778 spin_unlock(&root->delalloc_lock);
8779 }
8780 mutex_unlock(&root->delalloc_mutex);
8781 return ret;
8782 }
8783
8784 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
8785 {
8786 struct writeback_control wbc = {
8787 .nr_to_write = LONG_MAX,
8788 .sync_mode = WB_SYNC_NONE,
8789 .range_start = 0,
8790 .range_end = LLONG_MAX,
8791 };
8792 struct btrfs_fs_info *fs_info = root->fs_info;
8793
8794 if (BTRFS_FS_ERROR(fs_info))
8795 return -EROFS;
8796
8797 return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
8798 }
8799
8800 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
8801 bool in_reclaim_context)
8802 {
8803 struct writeback_control wbc = {
8804 .nr_to_write = nr,
8805 .sync_mode = WB_SYNC_NONE,
8806 .range_start = 0,
8807 .range_end = LLONG_MAX,
8808 };
8809 struct btrfs_root *root;
8810 LIST_HEAD(splice);
8811 int ret;
8812
8813 if (BTRFS_FS_ERROR(fs_info))
8814 return -EROFS;
8815
8816 mutex_lock(&fs_info->delalloc_root_mutex);
8817 spin_lock(&fs_info->delalloc_root_lock);
8818 list_splice_init(&fs_info->delalloc_roots, &splice);
8819 while (!list_empty(&splice)) {
8820 /*
8821 * Reset nr_to_write here so we know that we're doing a full
8822 * flush.
8823 */
8824 if (nr == LONG_MAX)
8825 wbc.nr_to_write = LONG_MAX;
8826
8827 root = list_first_entry(&splice, struct btrfs_root,
8828 delalloc_root);
8829 root = btrfs_grab_root(root);
8830 BUG_ON(!root);
8831 list_move_tail(&root->delalloc_root,
8832 &fs_info->delalloc_roots);
8833 spin_unlock(&fs_info->delalloc_root_lock);
8834
8835 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
8836 btrfs_put_root(root);
8837 if (ret < 0 || wbc.nr_to_write <= 0)
8838 goto out;
8839 spin_lock(&fs_info->delalloc_root_lock);
8840 }
8841 spin_unlock(&fs_info->delalloc_root_lock);
8842
8843 ret = 0;
8844 out:
8845 if (!list_empty(&splice)) {
8846 spin_lock(&fs_info->delalloc_root_lock);
8847 list_splice_tail(&splice, &fs_info->delalloc_roots);
8848 spin_unlock(&fs_info->delalloc_root_lock);
8849 }
8850 mutex_unlock(&fs_info->delalloc_root_mutex);
8851 return ret;
8852 }
8853
8854 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
8855 struct dentry *dentry, const char *symname)
8856 {
8857 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
8858 struct btrfs_trans_handle *trans;
8859 struct btrfs_root *root = BTRFS_I(dir)->root;
8860 struct btrfs_path *path;
8861 struct btrfs_key key;
8862 struct inode *inode;
8863 struct btrfs_new_inode_args new_inode_args = {
8864 .dir = dir,
8865 .dentry = dentry,
8866 };
8867 unsigned int trans_num_items;
8868 int ret;
8869 int name_len;
8870 int datasize;
8871 unsigned long ptr;
8872 struct btrfs_file_extent_item *ei;
8873 struct extent_buffer *leaf;
8874
8875 name_len = strlen(symname);
8876 /*
8877 * Symlinks utilize uncompressed inline extent data, which should not
8878 * reach block size.
8879 */
8880 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
8881 name_len >= fs_info->sectorsize)
8882 return -ENAMETOOLONG;
8883
8884 inode = new_inode(dir->i_sb);
8885 if (!inode)
8886 return -ENOMEM;
8887 inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
8888 inode->i_op = &btrfs_symlink_inode_operations;
8889 inode_nohighmem(inode);
8890 inode->i_mapping->a_ops = &btrfs_aops;
8891 btrfs_i_size_write(BTRFS_I(inode), name_len);
8892 inode_set_bytes(inode, name_len);
8893
8894 new_inode_args.inode = inode;
8895 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
8896 if (ret)
8897 goto out_inode;
8898 /* 1 additional item for the inline extent */
8899 trans_num_items++;
8900
8901 trans = btrfs_start_transaction(root, trans_num_items);
8902 if (IS_ERR(trans)) {
8903 ret = PTR_ERR(trans);
8904 goto out_new_inode_args;
8905 }
8906
8907 ret = btrfs_create_new_inode(trans, &new_inode_args);
8908 if (ret)
8909 goto out;
8910
8911 path = btrfs_alloc_path();
8912 if (unlikely(!path)) {
8913 ret = -ENOMEM;
8914 btrfs_abort_transaction(trans, ret);
8915 discard_new_inode(inode);
8916 inode = NULL;
8917 goto out;
8918 }
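	/*
	 * Store the symlink target as an uncompressed inline file extent at
	 * file offset 0.
	 */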
8919 key.objectid = btrfs_ino(BTRFS_I(inode));
8920 key.type = BTRFS_EXTENT_DATA_KEY;
8921 key.offset = 0;
8922 datasize = btrfs_file_extent_calc_inline_size(name_len);
8923 ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
8924 if (unlikely(ret)) {
8925 btrfs_abort_transaction(trans, ret);
8926 btrfs_free_path(path);
8927 discard_new_inode(inode);
8928 inode = NULL;
8929 goto out;
8930 }
8931 leaf = path->nodes[0];
8932 ei = btrfs_item_ptr(leaf, path->slots[0],
8933 struct btrfs_file_extent_item);
8934 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8935 btrfs_set_file_extent_type(leaf, ei,
8936 BTRFS_FILE_EXTENT_INLINE);
8937 btrfs_set_file_extent_encryption(leaf, ei, 0);
8938 btrfs_set_file_extent_compression(leaf, ei, 0);
8939 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8940 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8941
8942 ptr = btrfs_file_extent_inline_start(ei);
8943 write_extent_buffer(leaf, symname, ptr, name_len);
8944 btrfs_free_path(path);
8945
8946 d_instantiate_new(dentry, inode);
8947 ret = 0;
8948 out:
8949 btrfs_end_transaction(trans);
8950 btrfs_btree_balance_dirty(fs_info);
8951 out_new_inode_args:
8952 btrfs_new_inode_args_destroy(&new_inode_args);
8953 out_inode:
8954 if (ret)
8955 iput(inode);
8956 return ret;
8957 }
8958
8959 static struct btrfs_trans_handle *insert_prealloc_file_extent(
8960 struct btrfs_trans_handle *trans_in,
8961 struct btrfs_inode *inode,
8962 struct btrfs_key *ins,
8963 u64 file_offset)
8964 {
8965 struct btrfs_file_extent_item stack_fi;
8966 struct btrfs_replace_extent_info extent_info;
8967 struct btrfs_trans_handle *trans = trans_in;
8968 struct btrfs_path *path;
8969 u64 start = ins->objectid;
8970 u64 len = ins->offset;
8971 u64 qgroup_released = 0;
8972 int ret;
8973
8974 memset(&stack_fi, 0, sizeof(stack_fi));
8975
8976 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
8977 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
8978 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
8979 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
8980 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
8981 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
8982 	/* Encryption and other encoding are reserved and all 0. */
8983
8984 ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
8985 if (ret < 0)
8986 return ERR_PTR(ret);
8987
8988 if (trans) {
8989 ret = insert_reserved_file_extent(trans, inode,
8990 file_offset, &stack_fi,
8991 true, qgroup_released);
8992 if (ret)
8993 goto free_qgroup;
8994 return trans;
8995 }
8996
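	/*
	 * No transaction was passed in: let btrfs_replace_file_extents() drop
	 * any existing extents in the range, start a transaction for us and
	 * hand it back so the caller can keep using it.
	 */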
8997 extent_info.disk_offset = start;
8998 extent_info.disk_len = len;
8999 extent_info.data_offset = 0;
9000 extent_info.data_len = len;
9001 extent_info.file_offset = file_offset;
9002 extent_info.extent_buf = (char *)&stack_fi;
9003 extent_info.is_new_extent = true;
9004 extent_info.update_times = true;
9005 extent_info.qgroup_reserved = qgroup_released;
9006 extent_info.insertions = 0;
9007
9008 path = btrfs_alloc_path();
9009 if (!path) {
9010 ret = -ENOMEM;
9011 goto free_qgroup;
9012 }
9013
9014 ret = btrfs_replace_file_extents(inode, path, file_offset,
9015 file_offset + len - 1, &extent_info,
9016 &trans);
9017 btrfs_free_path(path);
9018 if (ret)
9019 goto free_qgroup;
9020 return trans;
9021
9022 free_qgroup:
9023 /*
9024 	 * We released the qgroup data range at the beginning of the function,
9025 	 * and normally the qgroup_released bytes will be freed when the
9026 	 * transaction commits.
9027 	 * But if we error out early, we have to free what we released
9028 	 * ourselves, or we leak the qgroup data reservation.
9029 */
9030 btrfs_qgroup_free_refroot(inode->root->fs_info,
9031 btrfs_root_id(inode->root), qgroup_released,
9032 BTRFS_QGROUP_RSV_DATA);
9033 return ERR_PTR(ret);
9034 }
9035
9036 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9037 u64 start, u64 num_bytes, u64 min_size,
9038 loff_t actual_len, u64 *alloc_hint,
9039 struct btrfs_trans_handle *trans)
9040 {
9041 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
9042 struct extent_map *em;
9043 struct btrfs_root *root = BTRFS_I(inode)->root;
9044 struct btrfs_key ins;
9045 u64 cur_offset = start;
9046 u64 clear_offset = start;
9047 u64 i_size;
9048 u64 cur_bytes;
9049 u64 last_alloc = (u64)-1;
9050 int ret = 0;
9051 bool own_trans = true;
9052 u64 end = start + num_bytes - 1;
9053
9054 if (trans)
9055 own_trans = false;
9056 while (num_bytes > 0) {
9057 cur_bytes = min_t(u64, num_bytes, SZ_256M);
9058 cur_bytes = max(cur_bytes, min_size);
9059 /*
9060 * If we are severely fragmented we could end up with really
9061 * small allocations, so if the allocator is returning small
9062 		 * chunks, let's make its job easier by only searching for those
9063 * sized chunks.
9064 */
9065 cur_bytes = min(cur_bytes, last_alloc);
9066 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9067 min_size, 0, *alloc_hint, &ins, 1, 0);
9068 if (ret)
9069 break;
9070
9071 /*
9072 * We've reserved this space, and thus converted it from
9073 * ->bytes_may_use to ->bytes_reserved. Any error that happens
9074 * from here on out we will only need to clear our reservation
9075 * for the remaining unreserved area, so advance our
9076 * clear_offset by our extent size.
9077 */
9078 clear_offset += ins.offset;
9079
9080 last_alloc = ins.offset;
9081 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9082 &ins, cur_offset);
9083 /*
9084 * Now that we inserted the prealloc extent we can finally
9085 * decrement the number of reservations in the block group.
9086 * If we did it before, we could race with relocation and have
9087 * relocation miss the reserved extent, making it fail later.
9088 */
9089 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9090 if (IS_ERR(trans)) {
9091 ret = PTR_ERR(trans);
9092 btrfs_free_reserved_extent(fs_info, ins.objectid,
9093 ins.offset, false);
9094 break;
9095 }
9096
9097 em = btrfs_alloc_extent_map();
9098 if (!em) {
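			/*
			 * Without an extent map we can't keep the cached
			 * mapping coherent, so drop the range and flag the
			 * inode for a full fsync; the prealloc extent itself is
			 * already on disk, so carry on.
			 */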
9099 btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
9100 cur_offset + ins.offset - 1, false);
9101 btrfs_set_inode_full_sync(BTRFS_I(inode));
9102 goto next;
9103 }
9104
9105 em->start = cur_offset;
9106 em->len = ins.offset;
9107 em->disk_bytenr = ins.objectid;
9108 em->offset = 0;
9109 em->disk_num_bytes = ins.offset;
9110 em->ram_bytes = ins.offset;
9111 em->flags |= EXTENT_FLAG_PREALLOC;
9112 em->generation = trans->transid;
9113
9114 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
9115 btrfs_free_extent_map(em);
9116 next:
9117 num_bytes -= ins.offset;
9118 cur_offset += ins.offset;
9119 *alloc_hint = ins.objectid + ins.offset;
9120
9121 inode_inc_iversion(inode);
9122 inode_set_ctime_current(inode);
9123 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9124 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9125 (actual_len > inode->i_size) &&
9126 (cur_offset > inode->i_size)) {
9127 if (cur_offset > actual_len)
9128 i_size = actual_len;
9129 else
9130 i_size = cur_offset;
9131 i_size_write(inode, i_size);
9132 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9133 }
9134
9135 ret = btrfs_update_inode(trans, BTRFS_I(inode));
9136
9137 if (unlikely(ret)) {
9138 btrfs_abort_transaction(trans, ret);
9139 if (own_trans)
9140 btrfs_end_transaction(trans);
9141 break;
9142 }
9143
9144 if (own_trans) {
9145 btrfs_end_transaction(trans);
9146 trans = NULL;
9147 }
9148 }
9149 if (clear_offset < end)
9150 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
9151 end - clear_offset + 1);
9152 return ret;
9153 }
9154
9155 int btrfs_prealloc_file_range(struct inode *inode, int mode,
9156 u64 start, u64 num_bytes, u64 min_size,
9157 loff_t actual_len, u64 *alloc_hint)
9158 {
9159 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9160 min_size, actual_len, alloc_hint,
9161 NULL);
9162 }
9163
9164 int btrfs_prealloc_file_range_trans(struct inode *inode,
9165 struct btrfs_trans_handle *trans, int mode,
9166 u64 start, u64 num_bytes, u64 min_size,
9167 loff_t actual_len, u64 *alloc_hint)
9168 {
9169 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9170 min_size, actual_len, alloc_hint, trans);
9171 }
9172
9173 static int btrfs_permission(struct mnt_idmap *idmap,
9174 struct inode *inode, int mask)
9175 {
9176 struct btrfs_root *root = BTRFS_I(inode)->root;
9177 umode_t mode = inode->i_mode;
9178
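	/*
	 * Refuse write access on read-only subvolumes and on inodes carrying
	 * the read-only flag before falling back to the generic checks.
	 */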
9179 if (mask & MAY_WRITE &&
9180 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9181 if (btrfs_root_readonly(root))
9182 return -EROFS;
9183 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9184 return -EACCES;
9185 }
9186 return generic_permission(idmap, inode, mask);
9187 }
9188
9189 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9190 struct file *file, umode_t mode)
9191 {
9192 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
9193 struct btrfs_trans_handle *trans;
9194 struct btrfs_root *root = BTRFS_I(dir)->root;
9195 struct inode *inode;
9196 struct btrfs_new_inode_args new_inode_args = {
9197 .dir = dir,
9198 .dentry = file->f_path.dentry,
9199 .orphan = true,
9200 };
9201 unsigned int trans_num_items;
9202 int ret;
9203
9204 inode = new_inode(dir->i_sb);
9205 if (!inode)
9206 return -ENOMEM;
9207 inode_init_owner(idmap, inode, dir, mode);
9208 inode->i_fop = &btrfs_file_operations;
9209 inode->i_op = &btrfs_file_inode_operations;
9210 inode->i_mapping->a_ops = &btrfs_aops;
9211
9212 new_inode_args.inode = inode;
9213 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9214 if (ret)
9215 goto out_inode;
9216
9217 trans = btrfs_start_transaction(root, trans_num_items);
9218 if (IS_ERR(trans)) {
9219 ret = PTR_ERR(trans);
9220 goto out_new_inode_args;
9221 }
9222
9223 ret = btrfs_create_new_inode(trans, &new_inode_args);
9224
9225 /*
9226 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9227 * set it to 1 because d_tmpfile() will issue a warning if the count is
9228 * 0, through:
9229 *
9230 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9231 */
9232 set_nlink(inode, 1);
9233
9234 if (!ret) {
9235 d_tmpfile(file, inode);
9236 unlock_new_inode(inode);
9237 mark_inode_dirty(inode);
9238 }
9239
9240 btrfs_end_transaction(trans);
9241 btrfs_btree_balance_dirty(fs_info);
9242 out_new_inode_args:
9243 btrfs_new_inode_args_destroy(&new_inode_args);
9244 out_inode:
9245 if (ret)
9246 iput(inode);
9247 return finish_open_simple(file, ret);
9248 }
9249
9250 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9251 int compress_type)
9252 {
9253 switch (compress_type) {
9254 case BTRFS_COMPRESS_NONE:
9255 return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9256 case BTRFS_COMPRESS_ZLIB:
9257 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9258 case BTRFS_COMPRESS_LZO:
9259 /*
9260 * The LZO format depends on the sector size. 64K is the maximum
9261 * sector size that we support.
9262 */
9263 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9264 return -EINVAL;
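		/*
		 * 4K is 2^12, so sectorsize_bits - 12 selects the matching
		 * LZO_4K .. LZO_64K value.
		 */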
9265 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9266 (fs_info->sectorsize_bits - 12);
9267 case BTRFS_COMPRESS_ZSTD:
9268 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9269 default:
9270 return -EUCLEAN;
9271 }
9272 }
9273
9274 static ssize_t btrfs_encoded_read_inline(
9275 struct kiocb *iocb,
9276 struct iov_iter *iter, u64 start,
9277 u64 lockend,
9278 struct extent_state **cached_state,
9279 u64 extent_start, size_t count,
9280 struct btrfs_ioctl_encoded_io_args *encoded,
9281 bool *unlocked)
9282 {
9283 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9284 struct btrfs_root *root = inode->root;
9285 struct btrfs_fs_info *fs_info = root->fs_info;
9286 struct extent_io_tree *io_tree = &inode->io_tree;
9287 BTRFS_PATH_AUTO_FREE(path);
9288 struct extent_buffer *leaf;
9289 struct btrfs_file_extent_item *item;
9290 u64 ram_bytes;
9291 unsigned long ptr;
9292 void *tmp;
9293 ssize_t ret;
9294 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9295
9296 path = btrfs_alloc_path();
9297 if (!path)
9298 return -ENOMEM;
9299
9300 path->nowait = nowait;
9301
9302 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9303 extent_start, 0);
9304 if (ret) {
9305 if (unlikely(ret > 0)) {
9306 /* The extent item disappeared? */
9307 return -EIO;
9308 }
9309 return ret;
9310 }
9311 leaf = path->nodes[0];
9312 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9313
9314 ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9315 ptr = btrfs_file_extent_inline_start(item);
9316
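	/* Cap the returned length at both the end of the inline extent and i_size. */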
9317 encoded->len = min_t(u64, extent_start + ram_bytes,
9318 inode->vfs_inode.i_size) - iocb->ki_pos;
9319 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9320 btrfs_file_extent_compression(leaf, item));
9321 if (ret < 0)
9322 return ret;
9323 encoded->compression = ret;
9324 if (encoded->compression) {
9325 size_t inline_size;
9326
9327 inline_size = btrfs_file_extent_inline_item_len(leaf,
9328 path->slots[0]);
9329 if (inline_size > count)
9330 return -ENOBUFS;
9331
9332 count = inline_size;
9333 encoded->unencoded_len = ram_bytes;
9334 encoded->unencoded_offset = iocb->ki_pos - extent_start;
9335 } else {
9336 count = min_t(u64, count, encoded->len);
9337 encoded->len = count;
9338 encoded->unencoded_len = count;
9339 ptr += iocb->ki_pos - extent_start;
9340 }
9341
9342 tmp = kmalloc(count, GFP_NOFS);
9343 if (!tmp)
9344 return -ENOMEM;
9345
9346 read_extent_buffer(leaf, tmp, ptr, count);
9347 btrfs_release_path(path);
9348 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9349 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9350 *unlocked = true;
9351
9352 ret = copy_to_iter(tmp, count, iter);
9353 if (ret != count)
9354 ret = -EFAULT;
9355 kfree(tmp);
9356
9357 return ret;
9358 }
9359
9360 struct btrfs_encoded_read_private {
9361 struct completion *sync_reads;
9362 void *uring_ctx;
9363 refcount_t pending_refs;
9364 blk_status_t status;
9365 };
9366
9367 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9368 {
9369 struct btrfs_encoded_read_private *priv = bbio->private;
9370
9371 if (bbio->bio.bi_status) {
9372 /*
9373 * The memory barrier implied by the refcount_dec_and_test() here
9374 * pairs with the memory barrier implied by the refcount_dec_and_test()
9375 * in btrfs_encoded_read_regular_fill_pages() to ensure that
9376 * this write is observed before the load of status in
9377 * btrfs_encoded_read_regular_fill_pages().
9378 */
9379 WRITE_ONCE(priv->status, bbio->bio.bi_status);
9380 }
9381 if (refcount_dec_and_test(&priv->pending_refs)) {
9382 int err = blk_status_to_errno(READ_ONCE(priv->status));
9383
9384 if (priv->uring_ctx) {
9385 btrfs_uring_read_extent_endio(priv->uring_ctx, err);
9386 kfree(priv);
9387 } else {
9388 complete(priv->sync_reads);
9389 }
9390 }
9391 bio_put(&bbio->bio);
9392 }
9393
9394 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9395 u64 disk_bytenr, u64 disk_io_size,
9396 struct page **pages, void *uring_ctx)
9397 {
9398 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9399 struct btrfs_encoded_read_private *priv, sync_priv;
9400 struct completion sync_reads;
9401 unsigned long i = 0;
9402 struct btrfs_bio *bbio;
9403 int ret;
9404
9405 /*
9406 	 * Synchronous reads complete within this call, so the private data can
9407 	 * live on the stack; io_uring reads outlive it and need a heap allocation.
9408 */
9409 if (uring_ctx) {
9410 priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
9411 if (!priv)
9412 return -ENOMEM;
9413 } else {
9414 priv = &sync_priv;
9415 init_completion(&sync_reads);
9416 priv->sync_reads = &sync_reads;
9417 }
9418
9419 refcount_set(&priv->pending_refs, 1);
9420 priv->status = 0;
9421 priv->uring_ctx = uring_ctx;
9422
9423 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9424 btrfs_encoded_read_endio, priv);
9425 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9426 bbio->inode = inode;
9427
9428 do {
9429 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
9430
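		/*
		 * If the current bio is full, submit it and start a new one at
		 * the current disk_bytenr; the page is retried on the next
		 * loop iteration.
		 */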
9431 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
9432 refcount_inc(&priv->pending_refs);
9433 btrfs_submit_bbio(bbio, 0);
9434
9435 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9436 btrfs_encoded_read_endio, priv);
9437 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9438 bbio->inode = inode;
9439 continue;
9440 }
9441
9442 i++;
9443 disk_bytenr += bytes;
9444 disk_io_size -= bytes;
9445 } while (disk_io_size);
9446
9447 refcount_inc(&priv->pending_refs);
9448 btrfs_submit_bbio(bbio, 0);
9449
9450 if (uring_ctx) {
9451 if (refcount_dec_and_test(&priv->pending_refs)) {
9452 ret = blk_status_to_errno(READ_ONCE(priv->status));
9453 btrfs_uring_read_extent_endio(uring_ctx, ret);
9454 kfree(priv);
9455 return ret;
9456 }
9457
9458 return -EIOCBQUEUED;
9459 } else {
9460 if (!refcount_dec_and_test(&priv->pending_refs))
9461 wait_for_completion_io(&sync_reads);
9462 /* See btrfs_encoded_read_endio() for ordering. */
9463 return blk_status_to_errno(READ_ONCE(priv->status));
9464 }
9465 }
9466
9467 ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
9468 u64 start, u64 lockend,
9469 struct extent_state **cached_state,
9470 u64 disk_bytenr, u64 disk_io_size,
9471 size_t count, bool compressed, bool *unlocked)
9472 {
9473 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9474 struct extent_io_tree *io_tree = &inode->io_tree;
9475 struct page **pages;
9476 unsigned long nr_pages, i;
9477 u64 cur;
9478 size_t page_offset;
9479 ssize_t ret;
9480
9481 nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
9482 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
9483 if (!pages)
9484 return -ENOMEM;
9485 ret = btrfs_alloc_page_array(nr_pages, pages, false);
9486 if (ret) {
9487 ret = -ENOMEM;
9488 goto out;
9489 }
9490
9491 ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
9492 disk_io_size, pages, NULL);
9493 if (ret)
9494 goto out;
9495
9496 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9497 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9498 *unlocked = true;
9499
9500 if (compressed) {
9501 i = 0;
9502 page_offset = 0;
9503 } else {
9504 i = (iocb->ki_pos - start) >> PAGE_SHIFT;
9505 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
9506 }
9507 cur = 0;
9508 while (cur < count) {
9509 size_t bytes = min_t(size_t, count - cur,
9510 PAGE_SIZE - page_offset);
9511
9512 if (copy_page_to_iter(pages[i], page_offset, bytes,
9513 iter) != bytes) {
9514 ret = -EFAULT;
9515 goto out;
9516 }
9517 i++;
9518 cur += bytes;
9519 page_offset = 0;
9520 }
9521 ret = count;
9522 out:
9523 for (i = 0; i < nr_pages; i++) {
9524 if (pages[i])
9525 __free_page(pages[i]);
9526 }
9527 kfree(pages);
9528 return ret;
9529 }
9530
9531 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
9532 struct btrfs_ioctl_encoded_io_args *encoded,
9533 struct extent_state **cached_state,
9534 u64 *disk_bytenr, u64 *disk_io_size)
9535 {
9536 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9537 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9538 struct extent_io_tree *io_tree = &inode->io_tree;
9539 ssize_t ret;
9540 size_t count = iov_iter_count(iter);
9541 u64 start, lockend;
9542 struct extent_map *em;
9543 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9544 bool unlocked = false;
9545
9546 file_accessed(iocb->ki_filp);
9547
9548 ret = btrfs_inode_lock(inode,
9549 BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
9550 if (ret)
9551 return ret;
9552
9553 if (iocb->ki_pos >= inode->vfs_inode.i_size) {
9554 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9555 return 0;
9556 }
9557 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
9558 /*
9559 * We don't know how long the extent containing iocb->ki_pos is, but if
9560 * it's compressed we know that it won't be longer than this.
9561 */
9562 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
9563
9564 if (nowait) {
9565 struct btrfs_ordered_extent *ordered;
9566
9567 if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
9568 start, lockend)) {
9569 ret = -EAGAIN;
9570 goto out_unlock_inode;
9571 }
9572
9573 if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
9574 ret = -EAGAIN;
9575 goto out_unlock_inode;
9576 }
9577
9578 ordered = btrfs_lookup_ordered_range(inode, start,
9579 lockend - start + 1);
9580 if (ordered) {
9581 btrfs_put_ordered_extent(ordered);
9582 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9583 ret = -EAGAIN;
9584 goto out_unlock_inode;
9585 }
9586 } else {
9587 for (;;) {
9588 struct btrfs_ordered_extent *ordered;
9589
9590 ret = btrfs_wait_ordered_range(inode, start,
9591 lockend - start + 1);
9592 if (ret)
9593 goto out_unlock_inode;
9594
9595 btrfs_lock_extent(io_tree, start, lockend, cached_state);
9596 ordered = btrfs_lookup_ordered_range(inode, start,
9597 lockend - start + 1);
9598 if (!ordered)
9599 break;
9600 btrfs_put_ordered_extent(ordered);
9601 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9602 cond_resched();
9603 }
9604 }
9605
9606 em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
9607 if (IS_ERR(em)) {
9608 ret = PTR_ERR(em);
9609 goto out_unlock_extent;
9610 }
9611
9612 if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9613 u64 extent_start = em->start;
9614
9615 /*
9616 * For inline extents we get everything we need out of the
9617 * extent item.
9618 */
9619 btrfs_free_extent_map(em);
9620 em = NULL;
9621 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
9622 cached_state, extent_start,
9623 count, encoded, &unlocked);
9624 goto out_unlock_extent;
9625 }
9626
9627 /*
9628 * We only want to return up to EOF even if the extent extends beyond
9629 * that.
9630 */
9631 encoded->len = min_t(u64, btrfs_extent_map_end(em),
9632 inode->vfs_inode.i_size) - iocb->ki_pos;
9633 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
9634 (em->flags & EXTENT_FLAG_PREALLOC)) {
9635 *disk_bytenr = EXTENT_MAP_HOLE;
9636 count = min_t(u64, count, encoded->len);
9637 encoded->len = count;
9638 encoded->unencoded_len = count;
9639 } else if (btrfs_extent_map_is_compressed(em)) {
9640 *disk_bytenr = em->disk_bytenr;
9641 /*
9642 * Bail if the buffer isn't large enough to return the whole
9643 * compressed extent.
9644 */
9645 if (em->disk_num_bytes > count) {
9646 ret = -ENOBUFS;
9647 goto out_em;
9648 }
9649 *disk_io_size = em->disk_num_bytes;
9650 count = em->disk_num_bytes;
9651 encoded->unencoded_len = em->ram_bytes;
9652 encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
9653 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9654 btrfs_extent_map_compression(em));
9655 if (ret < 0)
9656 goto out_em;
9657 encoded->compression = ret;
9658 } else {
9659 *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start);
9660 if (encoded->len > count)
9661 encoded->len = count;
9662 /*
9663 * Don't read beyond what we locked. This also limits the page
9664 * allocations that we'll do.
9665 */
9666 *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
9667 count = start + *disk_io_size - iocb->ki_pos;
9668 encoded->len = count;
9669 encoded->unencoded_len = count;
9670 *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
9671 }
9672 btrfs_free_extent_map(em);
9673 em = NULL;
9674
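	/* Holes and preallocated extents read back as zeroes. */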
9675 if (*disk_bytenr == EXTENT_MAP_HOLE) {
9676 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9677 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9678 unlocked = true;
9679 ret = iov_iter_zero(count, iter);
9680 if (ret != count)
9681 ret = -EFAULT;
9682 } else {
9683 ret = -EIOCBQUEUED;
9684 goto out_unlock_extent;
9685 }
9686
9687 out_em:
9688 btrfs_free_extent_map(em);
9689 out_unlock_extent:
9690 /* Leave inode and extent locked if we need to do a read. */
9691 if (!unlocked && ret != -EIOCBQUEUED)
9692 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9693 out_unlock_inode:
9694 if (!unlocked && ret != -EIOCBQUEUED)
9695 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9696 return ret;
9697 }
9698
9699 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
9700 const struct btrfs_ioctl_encoded_io_args *encoded)
9701 {
9702 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9703 struct btrfs_root *root = inode->root;
9704 struct btrfs_fs_info *fs_info = root->fs_info;
9705 struct extent_io_tree *io_tree = &inode->io_tree;
9706 struct extent_changeset *data_reserved = NULL;
9707 struct extent_state *cached_state = NULL;
9708 struct btrfs_ordered_extent *ordered;
9709 struct btrfs_file_extent file_extent;
9710 int compression;
9711 size_t orig_count;
9712 u64 start, end;
9713 u64 num_bytes, ram_bytes, disk_num_bytes;
9714 unsigned long nr_folios, i;
9715 struct folio **folios;
9716 struct btrfs_key ins;
9717 bool extent_reserved = false;
9718 struct extent_map *em;
9719 ssize_t ret;
9720
9721 switch (encoded->compression) {
9722 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
9723 compression = BTRFS_COMPRESS_ZLIB;
9724 break;
9725 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
9726 compression = BTRFS_COMPRESS_ZSTD;
9727 break;
9728 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
9729 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
9730 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
9731 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
9732 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
9733 /* The sector size must match for LZO. */
9734 if (encoded->compression -
9735 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
9736 fs_info->sectorsize_bits)
9737 return -EINVAL;
9738 compression = BTRFS_COMPRESS_LZO;
9739 break;
9740 default:
9741 return -EINVAL;
9742 }
9743 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
9744 return -EINVAL;
9745
9746 /*
9747 * Compressed extents should always have checksums, so error out if we
9748 	 * have a NOCOW file or the inode was created while mounted with NODATASUM.
9749 */
9750 if (inode->flags & BTRFS_INODE_NODATASUM)
9751 return -EINVAL;
9752
9753 orig_count = iov_iter_count(from);
9754
9755 /* The extent size must be sane. */
9756 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
9757 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
9758 return -EINVAL;
9759
9760 /*
9761 * The compressed data must be smaller than the decompressed data.
9762 *
9763 * It's of course possible for data to compress to larger or the same
9764 * size, but the buffered I/O path falls back to no compression for such
9765 * data, and we don't want to break any assumptions by creating these
9766 * extents.
9767 *
9768 * Note that this is less strict than the current check we have that the
9769 * compressed data must be at least one sector smaller than the
9770 * decompressed data. We only want to enforce the weaker requirement
9771 * from old kernels that it is at least one byte smaller.
9772 */
9773 if (orig_count >= encoded->unencoded_len)
9774 return -EINVAL;
9775
9776 /* The extent must start on a sector boundary. */
9777 start = iocb->ki_pos;
9778 if (!IS_ALIGNED(start, fs_info->sectorsize))
9779 return -EINVAL;
9780
9781 /*
9782 * The extent must end on a sector boundary. However, we allow a write
9783 * which ends at or extends i_size to have an unaligned length; we round
9784 * up the extent size and set i_size to the unaligned end.
9785 */
9786 if (start + encoded->len < inode->vfs_inode.i_size &&
9787 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
9788 return -EINVAL;
9789
9790 /* Finally, the offset in the unencoded data must be sector-aligned. */
9791 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
9792 return -EINVAL;
9793
9794 num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
9795 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
9796 end = start + num_bytes - 1;
9797
9798 /*
9799 * If the extent cannot be inline, the compressed data on disk must be
9800 * sector-aligned. For convenience, we extend it with zeroes if it
9801 * isn't.
9802 */
9803 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
9804 nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
9805 folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
9806 if (!folios)
9807 return -ENOMEM;
9808 for (i = 0; i < nr_folios; i++) {
9809 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
9810 char *kaddr;
9811
9812 folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
9813 if (!folios[i]) {
9814 ret = -ENOMEM;
9815 goto out_folios;
9816 }
9817 kaddr = kmap_local_folio(folios[i], 0);
9818 if (copy_from_iter(kaddr, bytes, from) != bytes) {
9819 kunmap_local(kaddr);
9820 ret = -EFAULT;
9821 goto out_folios;
9822 }
9823 if (bytes < PAGE_SIZE)
9824 memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
9825 kunmap_local(kaddr);
9826 }
9827
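	/*
	 * Make sure there is no ordered I/O and no page cache left for the
	 * range once we hold the extent lock; otherwise drop the lock and
	 * retry.
	 */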
9828 for (;;) {
9829 struct btrfs_ordered_extent *ordered;
9830
9831 ret = btrfs_wait_ordered_range(inode, start, num_bytes);
9832 if (ret)
9833 goto out_folios;
9834 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
9835 start >> PAGE_SHIFT,
9836 end >> PAGE_SHIFT);
9837 if (ret)
9838 goto out_folios;
9839 btrfs_lock_extent(io_tree, start, end, &cached_state);
9840 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
9841 if (!ordered &&
9842 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
9843 break;
9844 if (ordered)
9845 btrfs_put_ordered_extent(ordered);
9846 btrfs_unlock_extent(io_tree, start, end, &cached_state);
9847 cond_resched();
9848 }
9849
9850 /*
9851 * We don't use the higher-level delalloc space functions because our
9852 * num_bytes and disk_num_bytes are different.
9853 */
9854 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
9855 if (ret)
9856 goto out_unlock;
9857 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
9858 if (ret)
9859 goto out_free_data_space;
9860 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
9861 false);
9862 if (ret)
9863 goto out_qgroup_free_data;
9864
9865 /* Try an inline extent first. */
9866 if (encoded->unencoded_len == encoded->len &&
9867 encoded->unencoded_offset == 0 &&
9868 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
9869 ret = __cow_file_range_inline(inode, encoded->len,
9870 orig_count, compression, folios[0],
9871 true);
9872 if (ret <= 0) {
9873 if (ret == 0)
9874 ret = orig_count;
9875 goto out_delalloc_release;
9876 }
9877 }
9878
9879 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
9880 disk_num_bytes, 0, 0, &ins, 1, 1);
9881 if (ret)
9882 goto out_delalloc_release;
9883 extent_reserved = true;
9884
9885 file_extent.disk_bytenr = ins.objectid;
9886 file_extent.disk_num_bytes = ins.offset;
9887 file_extent.num_bytes = num_bytes;
9888 file_extent.ram_bytes = ram_bytes;
9889 file_extent.offset = encoded->unencoded_offset;
9890 file_extent.compression = compression;
9891 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
9892 if (IS_ERR(em)) {
9893 ret = PTR_ERR(em);
9894 goto out_free_reserved;
9895 }
9896 btrfs_free_extent_map(em);
9897
9898 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
9899 (1U << BTRFS_ORDERED_ENCODED) |
9900 (1U << BTRFS_ORDERED_COMPRESSED));
9901 if (IS_ERR(ordered)) {
9902 btrfs_drop_extent_map_range(inode, start, end, false);
9903 ret = PTR_ERR(ordered);
9904 goto out_free_reserved;
9905 }
9906 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9907
9908 if (start + encoded->len > inode->vfs_inode.i_size)
9909 i_size_write(&inode->vfs_inode, start + encoded->len);
9910
9911 btrfs_unlock_extent(io_tree, start, end, &cached_state);
9912
9913 btrfs_delalloc_release_extents(inode, num_bytes);
9914
9915 btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
9916 ret = orig_count;
9917 goto out;
9918
9919 out_free_reserved:
9920 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9921 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
9922 out_delalloc_release:
9923 btrfs_delalloc_release_extents(inode, num_bytes);
9924 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
9925 out_qgroup_free_data:
9926 if (ret < 0)
9927 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
9928 out_free_data_space:
9929 /*
9930 * If btrfs_reserve_extent() succeeded, then we already decremented
9931 * bytes_may_use.
9932 */
9933 if (!extent_reserved)
9934 btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
9935 out_unlock:
9936 btrfs_unlock_extent(io_tree, start, end, &cached_state);
9937 out_folios:
9938 for (i = 0; i < nr_folios; i++) {
9939 if (folios[i])
9940 folio_put(folios[i]);
9941 }
9942 kvfree(folios);
9943 out:
9944 if (ret >= 0)
9945 iocb->ki_pos += encoded->len;
9946 return ret;
9947 }
9948
9949 #ifdef CONFIG_SWAP
9950 /*
9951 * Add an entry indicating a block group or device which is pinned by a
9952 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
9953 * negative errno on failure.
9954 */
9955 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
9956 bool is_block_group)
9957 {
9958 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9959 struct btrfs_swapfile_pin *sp, *entry;
9960 struct rb_node **p;
9961 struct rb_node *parent = NULL;
9962
9963 sp = kmalloc(sizeof(*sp), GFP_NOFS);
9964 if (!sp)
9965 return -ENOMEM;
9966 sp->ptr = ptr;
9967 sp->inode = inode;
9968 sp->is_block_group = is_block_group;
9969 sp->bg_extent_count = 1;
9970
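	/* The tree is keyed on (ptr, inode), so each pin is unique per swapfile. */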
9971 spin_lock(&fs_info->swapfile_pins_lock);
9972 p = &fs_info->swapfile_pins.rb_node;
9973 while (*p) {
9974 parent = *p;
9975 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
9976 if (sp->ptr < entry->ptr ||
9977 (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
9978 p = &(*p)->rb_left;
9979 } else if (sp->ptr > entry->ptr ||
9980 (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
9981 p = &(*p)->rb_right;
9982 } else {
9983 if (is_block_group)
9984 entry->bg_extent_count++;
9985 spin_unlock(&fs_info->swapfile_pins_lock);
9986 kfree(sp);
9987 return 1;
9988 }
9989 }
9990 rb_link_node(&sp->node, parent, p);
9991 rb_insert_color(&sp->node, &fs_info->swapfile_pins);
9992 spin_unlock(&fs_info->swapfile_pins_lock);
9993 return 0;
9994 }
9995
9996 /* Free all of the entries pinned by this swapfile. */
9997 static void btrfs_free_swapfile_pins(struct inode *inode)
9998 {
9999 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
10000 struct btrfs_swapfile_pin *sp;
10001 struct rb_node *node, *next;
10002
10003 spin_lock(&fs_info->swapfile_pins_lock);
10004 node = rb_first(&fs_info->swapfile_pins);
10005 while (node) {
10006 next = rb_next(node);
10007 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10008 if (sp->inode == inode) {
10009 rb_erase(&sp->node, &fs_info->swapfile_pins);
10010 if (sp->is_block_group) {
10011 btrfs_dec_block_group_swap_extents(sp->ptr,
10012 sp->bg_extent_count);
10013 btrfs_put_block_group(sp->ptr);
10014 }
10015 kfree(sp);
10016 }
10017 node = next;
10018 }
10019 spin_unlock(&fs_info->swapfile_pins_lock);
10020 }
10021
10022 struct btrfs_swap_info {
10023 u64 start;
10024 u64 block_start;
10025 u64 block_len;
10026 u64 lowest_ppage;
10027 u64 highest_ppage;
10028 unsigned long nr_pages;
10029 int nr_extents;
10030 };
10031
10032 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10033 struct btrfs_swap_info *bsi)
10034 {
10035 unsigned long nr_pages;
10036 unsigned long max_pages;
10037 u64 first_ppage, first_ppage_reported, next_ppage;
10038 int ret;
10039
10040 /*
10041 * Our swapfile may have had its size extended after the swap header was
10042 * written. In that case activating the swapfile should not go beyond
10043 * the max size set in the swap header.
10044 */
10045 if (bsi->nr_pages >= sis->max)
10046 return 0;
10047
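	/*
	 * Only whole pages fully contained in the extent can be used for swap,
	 * so round the start up and the end down to page boundaries.
	 */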
10048 max_pages = sis->max - bsi->nr_pages;
10049 first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10050 next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10051
10052 if (first_ppage >= next_ppage)
10053 return 0;
10054 nr_pages = next_ppage - first_ppage;
10055 nr_pages = min(nr_pages, max_pages);
10056
10057 first_ppage_reported = first_ppage;
10058 if (bsi->start == 0)
10059 first_ppage_reported++;
10060 if (bsi->lowest_ppage > first_ppage_reported)
10061 bsi->lowest_ppage = first_ppage_reported;
10062 if (bsi->highest_ppage < (next_ppage - 1))
10063 bsi->highest_ppage = next_ppage - 1;
10064
10065 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10066 if (ret < 0)
10067 return ret;
10068 bsi->nr_extents += ret;
10069 bsi->nr_pages += nr_pages;
10070 return 0;
10071 }
10072
10073 static void btrfs_swap_deactivate(struct file *file)
10074 {
10075 struct inode *inode = file_inode(file);
10076
10077 btrfs_free_swapfile_pins(inode);
10078 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10079 }
10080
10081 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10082 sector_t *span)
10083 {
10084 struct inode *inode = file_inode(file);
10085 struct btrfs_root *root = BTRFS_I(inode)->root;
10086 struct btrfs_fs_info *fs_info = root->fs_info;
10087 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10088 struct extent_state *cached_state = NULL;
10089 struct btrfs_chunk_map *map = NULL;
10090 struct btrfs_device *device = NULL;
10091 struct btrfs_swap_info bsi = {
10092 .lowest_ppage = (sector_t)-1ULL,
10093 };
10094 struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
10095 struct btrfs_path *path = NULL;
10096 int ret = 0;
10097 u64 isize;
10098 u64 prev_extent_end = 0;
10099
10100 /*
10101 * Acquire the inode's mmap lock to prevent races with memory mapped
10102 * writes, as they could happen after we flush delalloc below and before
10103 * we lock the extent range further below. The inode was already locked
10104 * up in the call chain.
10105 */
10106 btrfs_assert_inode_locked(BTRFS_I(inode));
10107 down_write(&BTRFS_I(inode)->i_mmap_lock);
10108
10109 /*
10110 * If the swap file was just created, make sure delalloc is done. If the
10111 * file changes again after this, the user is doing something stupid and
10112 * we don't really care.
10113 */
10114 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
10115 if (ret)
10116 goto out_unlock_mmap;
10117
10118 /*
10119 * The inode is locked, so these flags won't change after we check them.
10120 */
10121 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10122 btrfs_warn(fs_info, "swapfile must not be compressed");
10123 ret = -EINVAL;
10124 goto out_unlock_mmap;
10125 }
10126 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10127 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10128 ret = -EINVAL;
10129 goto out_unlock_mmap;
10130 }
10131 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10132 btrfs_warn(fs_info, "swapfile must not be checksummed");
10133 ret = -EINVAL;
10134 goto out_unlock_mmap;
10135 }
10136
10137 path = btrfs_alloc_path();
10138 backref_ctx = btrfs_alloc_backref_share_check_ctx();
10139 if (!path || !backref_ctx) {
10140 ret = -ENOMEM;
10141 goto out_unlock_mmap;
10142 }
10143
10144 /*
10145 * Balance or device remove/replace/resize can move stuff around from
10146 * under us. The exclop protection makes sure they aren't running/won't
10147 * run concurrently while we are mapping the swap extents, and
10148 * fs_info->swapfile_pins prevents them from running while the swap
10149 * file is active and moving the extents. Note that this also prevents
10150 * a concurrent device add which isn't actually necessary, but it's not
10151 * really worth the trouble to allow it.
10152 */
10153 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10154 btrfs_warn(fs_info,
10155 "cannot activate swapfile while exclusive operation is running");
10156 ret = -EBUSY;
10157 goto out_unlock_mmap;
10158 }
10159
10160 /*
10161 * Prevent snapshot creation while we are activating the swap file.
10162 * We do not want to race with snapshot creation. If snapshot creation
10163 * already started before we bumped nr_swapfiles from 0 to 1 and
10164 * completes before the first write into the swap file after it is
10165 * activated, then that write would fall back to COW.
10166 */
10167 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10168 btrfs_exclop_finish(fs_info);
10169 btrfs_warn(fs_info,
10170 "cannot activate swapfile because snapshot creation is in progress");
10171 ret = -EINVAL;
10172 goto out_unlock_mmap;
10173 }
10174 /*
10175 * Snapshots can create extents which require COW even if NODATACOW is
10176 * set. We use this counter to prevent snapshots. We must increment it
10177 * before walking the extents because we don't want a concurrent
10178 * snapshot to run after we've already checked the extents.
10179 *
10180 * It is possible that the subvolume is marked for deletion but has not
10181 * been removed yet. To prevent this race, we check the root status before
10182 * activating the swapfile.
10183 */
10184 spin_lock(&root->root_item_lock);
10185 if (btrfs_root_dead(root)) {
10186 spin_unlock(&root->root_item_lock);
10187
10188 btrfs_drew_write_unlock(&root->snapshot_lock);
10189 btrfs_exclop_finish(fs_info);
10190 btrfs_warn(fs_info,
10191 "cannot activate swapfile because subvolume %llu is being deleted",
10192 btrfs_root_id(root));
10193 ret = -EPERM;
10194 goto out_unlock_mmap;
10195 }
10196 atomic_inc(&root->nr_swapfiles);
10197 spin_unlock(&root->root_item_lock);
10198
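	/*
	 * Ignore any trailing partial sector: round i_size down to the sector
	 * size (e.g. with a 4 KiB sector size, an i_size of 8195 bytes is
	 * treated as 8192).
	 */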
10199 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10200
10201 btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state);
10202 while (prev_extent_end < isize) {
10203 struct btrfs_key key;
10204 struct extent_buffer *leaf;
10205 struct btrfs_file_extent_item *ei;
10206 struct btrfs_block_group *bg;
10207 u64 logical_block_start;
10208 u64 physical_block_start;
10209 u64 extent_gen;
10210 u64 disk_bytenr;
10211 u64 len;
10212
10213 key.objectid = btrfs_ino(BTRFS_I(inode));
10214 key.type = BTRFS_EXTENT_DATA_KEY;
10215 key.offset = prev_extent_end;
10216
10217 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
10218 if (ret < 0)
10219 goto out;
10220
10221 /*
10222 * If the key is not found it means we have an implicit hole (NO_HOLES
10223 * is enabled).
10224 */
10225 if (ret > 0) {
10226 btrfs_warn(fs_info, "swapfile must not have holes");
10227 ret = -EINVAL;
10228 goto out;
10229 }
10230
10231 leaf = path->nodes[0];
10232 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
10233
10234 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
10235 /*
10236 * It's unlikely we'll ever actually find ourselves
10237 * here, as a file small enough to fit inline won't be
10238 * big enough to store more than the swap header, but in
10239 * case something changes in the future, let's catch it
10240 * here rather than later.
10241 */
10242 btrfs_warn(fs_info, "swapfile must not be inline");
10243 ret = -EINVAL;
10244 goto out;
10245 }
10246
10247 if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
10248 btrfs_warn(fs_info, "swapfile must not be compressed");
10249 ret = -EINVAL;
10250 goto out;
10251 }
10252
10253 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
10254 if (disk_bytenr == 0) {
10255 btrfs_warn(fs_info, "swapfile must not have holes");
10256 ret = -EINVAL;
10257 goto out;
10258 }
10259
10260 logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
10261 extent_gen = btrfs_file_extent_generation(leaf, ei);
10262 prev_extent_end = btrfs_file_extent_end(path);
10263
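		/*
		 * Clamp the mapped length so we never go past the (sector
		 * aligned) i_size when the last extent extends beyond it.
		 */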
10264 if (prev_extent_end > isize)
10265 len = isize - key.offset;
10266 else
10267 len = btrfs_file_extent_num_bytes(leaf, ei);
10268
10269 backref_ctx->curr_leaf_bytenr = leaf->start;
10270
10271 /*
10272 * We don't need the path anymore, so release it to avoid deadlocks
10273 * when calling btrfs_is_data_extent_shared(): when joining a
10274 * transaction it can block waiting for the current one's commit,
10275 * which in turn may be trying to lock the same leaf, for example to
10276 * flush delayed items.
10277 */
10278 btrfs_release_path(path);
10279
10280 ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
10281 extent_gen, backref_ctx);
10282 if (ret < 0) {
10283 goto out;
10284 } else if (ret > 0) {
10285 btrfs_warn(fs_info,
10286 "swapfile must not be copy-on-write");
10287 ret = -EINVAL;
10288 goto out;
10289 }
10290
10291 map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10292 if (IS_ERR(map)) {
10293 ret = PTR_ERR(map);
10294 goto out;
10295 }
10296
10297 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10298 btrfs_warn(fs_info,
10299 "swapfile must have single data profile");
10300 ret = -EINVAL;
10301 goto out;
10302 }
10303
10304 if (device == NULL) {
10305 device = map->stripes[0].dev;
10306 ret = btrfs_add_swapfile_pin(inode, device, false);
10307 if (ret == 1)
10308 ret = 0;
10309 else if (ret)
10310 goto out;
10311 } else if (device != map->stripes[0].dev) {
10312 btrfs_warn(fs_info, "swapfile must be on one device");
10313 ret = -EINVAL;
10314 goto out;
10315 }
10316
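		/*
		 * With a single (non-striped) data profile the chunk maps its
		 * logical range, starting at map->start, onto one device
		 * stripe, so the physical address is just an offset from the
		 * stripe's physical start.
		 */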
10317 physical_block_start = (map->stripes[0].physical +
10318 (logical_block_start - map->start));
10319 btrfs_free_chunk_map(map);
10320 map = NULL;
10321
10322 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10323 if (!bg) {
10324 btrfs_warn(fs_info,
10325 "could not find block group containing swapfile");
10326 ret = -EINVAL;
10327 goto out;
10328 }
10329
10330 if (!btrfs_inc_block_group_swap_extents(bg)) {
10331 btrfs_warn(fs_info,
10332 "block group for swapfile at %llu is read-only%s",
10333 bg->start,
10334 atomic_read(&fs_info->scrubs_running) ?
10335 " (scrub running)" : "");
10336 btrfs_put_block_group(bg);
10337 ret = -EINVAL;
10338 goto out;
10339 }
10340
10341 ret = btrfs_add_swapfile_pin(inode, bg, true);
10342 if (ret) {
10343 btrfs_put_block_group(bg);
10344 if (ret == 1)
10345 ret = 0;
10346 else
10347 goto out;
10348 }
10349
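		/*
		 * Physically contiguous file extents are merged into a single
		 * run: e.g. two 1 MiB extents whose device ranges are back to
		 * back become one 2 MiB run in bsi, and only when a
		 * discontiguity is hit is the accumulated run handed to
		 * btrfs_add_swap_extent().
		 */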
10350 if (bsi.block_len &&
10351 bsi.block_start + bsi.block_len == physical_block_start) {
10352 bsi.block_len += len;
10353 } else {
10354 if (bsi.block_len) {
10355 ret = btrfs_add_swap_extent(sis, &bsi);
10356 if (ret)
10357 goto out;
10358 }
10359 bsi.start = key.offset;
10360 bsi.block_start = physical_block_start;
10361 bsi.block_len = len;
10362 }
10363
10364 if (fatal_signal_pending(current)) {
10365 ret = -EINTR;
10366 goto out;
10367 }
10368
10369 cond_resched();
10370 }
10371
10372 if (bsi.block_len)
10373 ret = btrfs_add_swap_extent(sis, &bsi);
10374
10375 out:
10376 if (!IS_ERR_OR_NULL(map))
10377 btrfs_free_chunk_map(map);
10378
10379 btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state);
10380
10381 if (ret)
10382 btrfs_swap_deactivate(file);
10383
10384 btrfs_drew_write_unlock(&root->snapshot_lock);
10385
10386 btrfs_exclop_finish(fs_info);
10387
10388 out_unlock_mmap:
10389 up_write(&BTRFS_I(inode)->i_mmap_lock);
10390 btrfs_free_backref_share_ctx(backref_ctx);
10391 btrfs_free_path(path);
10392 if (ret)
10393 return ret;
10394
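	/*
	 * On success report the activation results: the device backing the
	 * swapfile, the span of device pages used (highest - lowest + 1), and
	 * the number of usable swap pages (total pages minus the header page).
	 */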
10395 if (device)
10396 sis->bdev = device->bdev;
10397 *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10398 sis->max = bsi.nr_pages;
10399 sis->pages = bsi.nr_pages - 1;
10400 return bsi.nr_extents;
10401 }
10402 #else
10403 static void btrfs_swap_deactivate(struct file *file)
10404 {
10405 }
10406
10407 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10408 sector_t *span)
10409 {
10410 return -EOPNOTSUPP;
10411 }
10412 #endif
10413
10414 /*
10415 * Update the number of bytes used in the VFS' inode. When we replace extents in
10416 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10417 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10418 * always get a correct value.
10419 */
10420 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10421 const u64 add_bytes,
10422 const u64 del_bytes)
10423 {
10424 if (add_bytes == del_bytes)
10425 return;
10426
10427 spin_lock(&inode->lock);
10428 if (del_bytes > 0)
10429 inode_sub_bytes(&inode->vfs_inode, del_bytes);
10430 if (add_bytes > 0)
10431 inode_add_bytes(&inode->vfs_inode, add_bytes);
10432 spin_unlock(&inode->lock);
10433 }
10434
10435 /*
10436 * Verify that there are no ordered extents for a given file range.
10437 *
10438 * @inode: The target inode.
10439 * @start: Start offset of the file range, should be sector size aligned.
10440 * @end: End offset (inclusive) of the file range, its value +1 should be
10441 * sector size aligned.
10442 *
10443 * This should typically be used for cases where we have locked the inode's
10444 * VFS lock in exclusive mode, also locked its i_mmap_lock in exclusive
10445 * mode, flushed all delalloc in the range, waited for all ordered extents
10446 * in the range to complete, and finally locked the file range in the
10447 * inode's io_tree.
10448 */
10449 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10450 {
10451 struct btrfs_root *root = inode->root;
10452 struct btrfs_ordered_extent *ordered;
10453
10454 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10455 return;
10456
10457 ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10458 if (ordered) {
10459 btrfs_err(root->fs_info,
10460 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10461 start, end, btrfs_ino(inode), btrfs_root_id(root),
10462 ordered->file_offset,
10463 ordered->file_offset + ordered->num_bytes - 1);
10464 btrfs_put_ordered_extent(ordered);
10465 }
10466
10467 ASSERT(ordered == NULL);
10468 }
10469
10470 /*
10471 * Find the first inode with a minimum number.
10472 *
10473 * @root: The root to search in.
10474 * @min_ino: The minimum inode number.
10475 *
10476 * Find the first inode in the @root with a number >= @min_ino and return it.
10477 * Returns NULL if no such inode is found.
10478 */
10479 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
10480 {
10481 struct btrfs_inode *inode;
10482 unsigned long from = min_ino;
10483
10484 xa_lock(&root->inodes);
10485 while (true) {
10486 inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
10487 if (!inode)
10488 break;
10489 if (igrab(&inode->vfs_inode))
10490 break;
10491
10492 from = btrfs_ino(inode) + 1;
10493 cond_resched_lock(&root->inodes.xa_lock);
10494 }
10495 xa_unlock(&root->inodes);
10496
10497 return inode;
10498 }
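/*
 * A typical caller iterates over all inodes of a root roughly like this
 * (illustrative sketch only):
 *
 *	u64 min_ino = 0;
 *	struct btrfs_inode *inode;
 *
 *	while ((inode = btrfs_find_first_inode(root, min_ino))) {
 *		... use inode ...
 *		min_ino = btrfs_ino(inode) + 1;
 *		iput(&inode->vfs_inode);
 *	}
 */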
10499
10500 static const struct inode_operations btrfs_dir_inode_operations = {
10501 .getattr = btrfs_getattr,
10502 .lookup = btrfs_lookup,
10503 .create = btrfs_create,
10504 .unlink = btrfs_unlink,
10505 .link = btrfs_link,
10506 .mkdir = btrfs_mkdir,
10507 .rmdir = btrfs_rmdir,
10508 .rename = btrfs_rename2,
10509 .symlink = btrfs_symlink,
10510 .setattr = btrfs_setattr,
10511 .mknod = btrfs_mknod,
10512 .listxattr = btrfs_listxattr,
10513 .permission = btrfs_permission,
10514 .get_inode_acl = btrfs_get_acl,
10515 .set_acl = btrfs_set_acl,
10516 .update_time = btrfs_update_time,
10517 .tmpfile = btrfs_tmpfile,
10518 .fileattr_get = btrfs_fileattr_get,
10519 .fileattr_set = btrfs_fileattr_set,
10520 };
10521
10522 static const struct file_operations btrfs_dir_file_operations = {
10523 .llseek = btrfs_dir_llseek,
10524 .read = generic_read_dir,
10525 .iterate_shared = btrfs_real_readdir,
10526 .open = btrfs_opendir,
10527 .unlocked_ioctl = btrfs_ioctl,
10528 #ifdef CONFIG_COMPAT
10529 .compat_ioctl = btrfs_compat_ioctl,
10530 #endif
10531 .release = btrfs_release_file,
10532 .fsync = btrfs_sync_file,
10533 };
10534
10535 /*
10536 * btrfs doesn't support the bmap operation because swapfiles
10537 * use bmap to make a mapping of extents in the file. They assume
10538 * these extents won't change over the life of the file and they
10539 * use the bmap result to do IO directly to the drive.
10540 *
10541 * the btrfs bmap call would return logical addresses that aren't
10542 * suitable for IO and they also will change frequently as COW
10543 * operations happen. So, swapfile + btrfs == corruption.
10544 *
10545 * For now we're avoiding this by dropping bmap.
10546 */
10547 static const struct address_space_operations btrfs_aops = {
10548 .read_folio = btrfs_read_folio,
10549 .writepages = btrfs_writepages,
10550 .readahead = btrfs_readahead,
10551 .invalidate_folio = btrfs_invalidate_folio,
10552 .launder_folio = btrfs_launder_folio,
10553 .release_folio = btrfs_release_folio,
10554 .migrate_folio = btrfs_migrate_folio,
10555 .dirty_folio = filemap_dirty_folio,
10556 .error_remove_folio = generic_error_remove_folio,
10557 .swap_activate = btrfs_swap_activate,
10558 .swap_deactivate = btrfs_swap_deactivate,
10559 };
10560
10561 static const struct inode_operations btrfs_file_inode_operations = {
10562 .getattr = btrfs_getattr,
10563 .setattr = btrfs_setattr,
10564 .listxattr = btrfs_listxattr,
10565 .permission = btrfs_permission,
10566 .fiemap = btrfs_fiemap,
10567 .get_inode_acl = btrfs_get_acl,
10568 .set_acl = btrfs_set_acl,
10569 .update_time = btrfs_update_time,
10570 .fileattr_get = btrfs_fileattr_get,
10571 .fileattr_set = btrfs_fileattr_set,
10572 };
10573 static const struct inode_operations btrfs_special_inode_operations = {
10574 .getattr = btrfs_getattr,
10575 .setattr = btrfs_setattr,
10576 .permission = btrfs_permission,
10577 .listxattr = btrfs_listxattr,
10578 .get_inode_acl = btrfs_get_acl,
10579 .set_acl = btrfs_set_acl,
10580 .update_time = btrfs_update_time,
10581 };
10582 static const struct inode_operations btrfs_symlink_inode_operations = {
10583 .get_link = page_get_link,
10584 .getattr = btrfs_getattr,
10585 .setattr = btrfs_setattr,
10586 .permission = btrfs_permission,
10587 .listxattr = btrfs_listxattr,
10588 .update_time = btrfs_update_time,
10589 };
10590
10591 const struct dentry_operations btrfs_dentry_operations = {
10592 .d_delete = btrfs_dentry_delete,
10593 };
10594