1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <crypto/hash.h>
7 #include <linux/kernel.h>
8 #include <linux/bio.h>
9 #include <linux/blk-cgroup.h>
10 #include <linux/file.h>
11 #include <linux/fs.h>
12 #include <linux/fs_struct.h>
13 #include <linux/pagemap.h>
14 #include <linux/highmem.h>
15 #include <linux/time.h>
16 #include <linux/init.h>
17 #include <linux/string.h>
18 #include <linux/backing-dev.h>
19 #include <linux/writeback.h>
20 #include <linux/compat.h>
21 #include <linux/xattr.h>
22 #include <linux/posix_acl.h>
23 #include <linux/falloc.h>
24 #include <linux/slab.h>
25 #include <linux/ratelimit.h>
26 #include <linux/btrfs.h>
27 #include <linux/blkdev.h>
28 #include <linux/posix_acl_xattr.h>
29 #include <linux/uio.h>
30 #include <linux/magic.h>
31 #include <linux/iversion.h>
32 #include <linux/swap.h>
33 #include <linux/migrate.h>
34 #include <linux/sched/mm.h>
35 #include <linux/iomap.h>
36 #include <linux/unaligned.h>
37 #include <linux/fsverity.h>
38 #include "misc.h"
39 #include "ctree.h"
40 #include "disk-io.h"
41 #include "transaction.h"
42 #include "btrfs_inode.h"
43 #include "ordered-data.h"
44 #include "xattr.h"
45 #include "tree-log.h"
46 #include "bio.h"
47 #include "compression.h"
48 #include "locking.h"
49 #include "props.h"
50 #include "qgroup.h"
51 #include "delalloc-space.h"
52 #include "block-group.h"
53 #include "space-info.h"
54 #include "zoned.h"
55 #include "subpage.h"
56 #include "inode-item.h"
57 #include "fs.h"
58 #include "accessors.h"
59 #include "extent-tree.h"
60 #include "root-tree.h"
61 #include "defrag.h"
62 #include "dir-item.h"
63 #include "file-item.h"
64 #include "uuid-tree.h"
65 #include "ioctl.h"
66 #include "file.h"
67 #include "acl.h"
68 #include "relocation.h"
69 #include "verity.h"
70 #include "super.h"
71 #include "orphan.h"
72 #include "backref.h"
73 #include "raid-stripe-tree.h"
74 #include "fiemap.h"
75
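/*
 * Behaviour flags for cow_file_range(); see the comment above that function
 * further below. KEEP_LOCKED keeps all folios in the range locked on return,
 * NO_INLINE skips the attempt to create an inline extent.
 */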
76 #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0)
77 #define COW_FILE_RANGE_NO_INLINE (1UL << 1)
78
79 struct btrfs_iget_args {
80 u64 ino;
81 struct btrfs_root *root;
82 };
83
84 struct btrfs_rename_ctx {
85 /* Output field. Stores the index number of the old directory entry. */
86 u64 index;
87 };
88
89 /*
90 * Used by data_reloc_print_warning_inode() to pass needed info for filename
91 * resolution and output of error message.
92 */
93 struct data_reloc_warn {
94 struct btrfs_path path;
95 struct btrfs_fs_info *fs_info;
96 u64 extent_item_size;
97 u64 logical;
98 int mirror_num;
99 };
100
101 /*
102 * For the file_extent_tree, we want to hold the inode lock when we lookup and
103 * update the disk_i_size, but lockdep will complain because for our io_tree we
104 * hold the tree lock and then take the inode lock when setting delalloc. These two things
105 * are unrelated, so make a class for the file_extent_tree so we don't get the
106 * two locking patterns mixed up.
107 */
108 static struct lock_class_key file_extent_tree_class;
109
110 static const struct inode_operations btrfs_dir_inode_operations;
111 static const struct inode_operations btrfs_symlink_inode_operations;
112 static const struct inode_operations btrfs_special_inode_operations;
113 static const struct inode_operations btrfs_file_inode_operations;
114 static const struct address_space_operations btrfs_aops;
115 static const struct file_operations btrfs_dir_file_operations;
116
117 static struct kmem_cache *btrfs_inode_cachep;
118
119 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
120 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
121
122 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
123 struct folio *locked_folio, u64 start,
124 u64 end, struct writeback_control *wbc,
125 bool pages_dirty);
126
127 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
128 u64 root, void *warn_ctx)
129 {
130 struct data_reloc_warn *warn = warn_ctx;
131 struct btrfs_fs_info *fs_info = warn->fs_info;
132 struct extent_buffer *eb;
133 struct btrfs_inode_item *inode_item;
134 struct inode_fs_paths *ipath = NULL;
135 struct btrfs_root *local_root;
136 struct btrfs_key key;
137 unsigned int nofs_flag;
138 u32 nlink;
139 int ret;
140
141 local_root = btrfs_get_fs_root(fs_info, root, true);
142 if (IS_ERR(local_root)) {
143 ret = PTR_ERR(local_root);
144 goto err;
145 }
146
147 /* This makes the path point to (inum INODE_ITEM ioff). */
148 key.objectid = inum;
149 key.type = BTRFS_INODE_ITEM_KEY;
150 key.offset = 0;
151
152 ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
153 if (ret) {
154 btrfs_put_root(local_root);
155 btrfs_release_path(&warn->path);
156 goto err;
157 }
158
159 eb = warn->path.nodes[0];
160 inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
161 nlink = btrfs_inode_nlink(eb, inode_item);
162 btrfs_release_path(&warn->path);
163
164 nofs_flag = memalloc_nofs_save();
165 ipath = init_ipath(4096, local_root, &warn->path);
166 memalloc_nofs_restore(nofs_flag);
167 if (IS_ERR(ipath)) {
168 btrfs_put_root(local_root);
169 ret = PTR_ERR(ipath);
170 ipath = NULL;
171 /*
172 * -ENOMEM is not a critical error, just output a generic error
173 * without the filename.
174 */
175 btrfs_warn(fs_info,
176 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
177 warn->logical, warn->mirror_num, root, inum, offset);
178 return ret;
179 }
180 ret = paths_from_inode(inum, ipath);
181 if (ret < 0) {
182 btrfs_put_root(local_root);
183 goto err;
184 }
185
186 /*
187 * We deliberately ignore the fact that ipath might have been too small
188 * to hold all of the paths here.
189 */
190 for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
191 btrfs_warn(fs_info,
192 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
193 warn->logical, warn->mirror_num, root, inum, offset,
194 fs_info->sectorsize, nlink,
195 (char *)(unsigned long)ipath->fspath->val[i]);
196 }
197
198 btrfs_put_root(local_root);
199 free_ipath(ipath);
200 return 0;
201
202 err:
203 btrfs_warn(fs_info,
204 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
205 warn->logical, warn->mirror_num, root, inum, offset, ret);
206
207 free_ipath(ipath);
208 return ret;
209 }
210
211 /*
212 * Do extra user-friendly error output (e.g. lookup all the affected files).
213 *
214 * If the backref lookup fails we fall back to the old, generic error message
215 * without the resolved file names.
216 */
217 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
218 const u8 *csum, const u8 *csum_expected,
219 int mirror_num)
220 {
221 struct btrfs_fs_info *fs_info = inode->root->fs_info;
222 struct btrfs_path path = { 0 };
223 struct btrfs_key found_key = { 0 };
224 struct extent_buffer *eb;
225 struct btrfs_extent_item *ei;
226 const u32 csum_size = fs_info->csum_size;
227 u64 logical;
228 u64 flags;
229 u32 item_size;
230 int ret;
231
232 mutex_lock(&fs_info->reloc_mutex);
233 logical = btrfs_get_reloc_bg_bytenr(fs_info);
234 mutex_unlock(&fs_info->reloc_mutex);
235
236 if (logical == U64_MAX) {
237 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
238 btrfs_warn_rl(fs_info,
239 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
240 btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
241 CSUM_FMT_VALUE(csum_size, csum),
242 CSUM_FMT_VALUE(csum_size, csum_expected),
243 mirror_num);
244 return;
245 }
246
247 logical += file_off;
248 btrfs_warn_rl(fs_info,
249 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
250 btrfs_root_id(inode->root),
251 btrfs_ino(inode), file_off, logical,
252 CSUM_FMT_VALUE(csum_size, csum),
253 CSUM_FMT_VALUE(csum_size, csum_expected),
254 mirror_num);
255
256 ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
257 if (ret < 0) {
258 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
259 logical, ret);
260 return;
261 }
262 eb = path.nodes[0];
263 ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
264 item_size = btrfs_item_size(eb, path.slots[0]);
265 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
266 unsigned long ptr = 0;
267 u64 ref_root;
268 u8 ref_level;
269
270 while (true) {
271 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
272 item_size, &ref_root,
273 &ref_level);
274 if (ret < 0) {
275 btrfs_warn_rl(fs_info,
276 "failed to resolve tree backref for logical %llu: %d",
277 logical, ret);
278 break;
279 }
280 if (ret > 0)
281 break;
282
283 btrfs_warn_rl(fs_info,
284 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
285 logical, mirror_num,
286 (ref_level ? "node" : "leaf"),
287 ref_level, ref_root);
288 }
289 btrfs_release_path(&path);
290 } else {
291 struct btrfs_backref_walk_ctx ctx = { 0 };
292 struct data_reloc_warn reloc_warn = { 0 };
293
294 btrfs_release_path(&path);
295
296 ctx.bytenr = found_key.objectid;
297 ctx.extent_item_pos = logical - found_key.objectid;
298 ctx.fs_info = fs_info;
299
300 reloc_warn.logical = logical;
301 reloc_warn.extent_item_size = found_key.offset;
302 reloc_warn.mirror_num = mirror_num;
303 reloc_warn.fs_info = fs_info;
304
305 iterate_extent_inodes(&ctx, true,
306 data_reloc_print_warning_inode, &reloc_warn);
307 }
308 }
309
310 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
311 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
312 {
313 struct btrfs_root *root = inode->root;
314 const u32 csum_size = root->fs_info->csum_size;
315
316 /* For data reloc tree, it's better to do a backref lookup instead. */
317 if (btrfs_is_data_reloc_root(root))
318 return print_data_reloc_error(inode, logical_start, csum,
319 csum_expected, mirror_num);
320
321 /* Output without objectid, which is more meaningful */
322 if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
323 btrfs_warn_rl(root->fs_info,
324 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
325 btrfs_root_id(root), btrfs_ino(inode),
326 logical_start,
327 CSUM_FMT_VALUE(csum_size, csum),
328 CSUM_FMT_VALUE(csum_size, csum_expected),
329 mirror_num);
330 } else {
331 btrfs_warn_rl(root->fs_info,
332 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
333 btrfs_root_id(root), btrfs_ino(inode),
334 logical_start,
335 CSUM_FMT_VALUE(csum_size, csum),
336 CSUM_FMT_VALUE(csum_size, csum_expected),
337 mirror_num);
338 }
339 }
340
341 /*
342 * Lock inode i_rwsem based on arguments passed.
343 *
344 * ilock_flags can have the following bits set:
345 *
346 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
347 * BTRFS_ILOCK_TRY - try to acquire the lock, if it fails on the first
348 * attempt then return -EAGAIN
349 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
350 */
351 int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
352 {
353 if (ilock_flags & BTRFS_ILOCK_SHARED) {
354 if (ilock_flags & BTRFS_ILOCK_TRY) {
355 if (!inode_trylock_shared(&inode->vfs_inode))
356 return -EAGAIN;
357 else
358 return 0;
359 }
360 inode_lock_shared(&inode->vfs_inode);
361 } else {
362 if (ilock_flags & BTRFS_ILOCK_TRY) {
363 if (!inode_trylock(&inode->vfs_inode))
364 return -EAGAIN;
365 else
366 return 0;
367 }
368 inode_lock(&inode->vfs_inode);
369 }
370 if (ilock_flags & BTRFS_ILOCK_MMAP)
371 down_write(&inode->i_mmap_lock);
372 return 0;
373 }
374
375 /*
376 * Unlock inode i_rwsem.
377 *
378 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
379 * to decide whether the lock acquired is shared or exclusive.
380 */
381 void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
382 {
383 if (ilock_flags & BTRFS_ILOCK_MMAP)
384 up_write(&inode->i_mmap_lock);
385 if (ilock_flags & BTRFS_ILOCK_SHARED)
386 inode_unlock_shared(&inode->vfs_inode);
387 else
388 inode_unlock(&inode->vfs_inode);
389 }
390
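/*
 * Illustrative usage sketch (editorial, not from the original source): a
 * caller wanting a non-blocking shared lock pairs the two helpers with the
 * same flags, e.g.:
 *
 *	ret = btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED | BTRFS_ILOCK_TRY);
 *	if (ret == -EAGAIN)
 *		return ret;
 *	... read-only work on the inode ...
 *	btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
 */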
391 /*
392 * Clean up all submitted ordered extents in the specified range to handle errors
393 * from the btrfs_run_delalloc_range() callback.
394 *
395 * NOTE: the caller must ensure that when an error happens, it does not call
396 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
397 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
398 * to be released, which we want to happen only when finishing the ordered
399 * extent (btrfs_finish_ordered_io()).
400 */
401 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
402 u64 offset, u64 bytes)
403 {
404 pgoff_t index = offset >> PAGE_SHIFT;
405 const pgoff_t end_index = (offset + bytes - 1) >> PAGE_SHIFT;
406 struct folio *folio;
407
408 while (index <= end_index) {
409 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
410 if (IS_ERR(folio)) {
411 index++;
412 continue;
413 }
414
415 index = folio_next_index(folio);
416 /*
417 * Here we just clear all Ordered bits for every page in the
418 * range, then btrfs_mark_ordered_io_finished() will handle
419 * the ordered extent accounting for the range.
420 */
421 btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
422 offset, bytes);
423 folio_put(folio);
424 }
425
426 return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
427 }
428
429 static int btrfs_dirty_inode(struct btrfs_inode *inode);
430
431 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
432 struct btrfs_new_inode_args *args)
433 {
434 int ret;
435
436 if (args->default_acl) {
437 ret = __btrfs_set_acl(trans, args->inode, args->default_acl,
438 ACL_TYPE_DEFAULT);
439 if (ret)
440 return ret;
441 }
442 if (args->acl) {
443 ret = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
444 if (ret)
445 return ret;
446 }
447 if (!args->default_acl && !args->acl)
448 cache_no_acl(args->inode);
449 return btrfs_xattr_security_init(trans, args->inode, args->dir,
450 &args->dentry->d_name);
451 }
452
453 /*
454 * This does all the hard work of inserting an inline extent into
455 * the btree. The caller should have already done a btrfs_drop_extents() so
456 * that no overlapping inline items exist in the btree.
457 */
458 static int insert_inline_extent(struct btrfs_trans_handle *trans,
459 struct btrfs_path *path,
460 struct btrfs_inode *inode, bool extent_inserted,
461 size_t size, size_t compressed_size,
462 int compress_type,
463 struct folio *compressed_folio,
464 bool update_i_size)
465 {
466 struct btrfs_root *root = inode->root;
467 struct extent_buffer *leaf;
468 const u32 sectorsize = trans->fs_info->sectorsize;
469 char *kaddr;
470 unsigned long ptr;
471 struct btrfs_file_extent_item *ei;
472 int ret;
473 size_t cur_size = size;
474 u64 i_size;
475
476 /*
477 * The decompressed size must still be no larger than a sector. Under
478 * heavy race, we can have size == 0 passed in, but that shouldn't be a
479 * big deal and we can continue the insertion.
480 */
481 ASSERT(size <= sectorsize);
482
483 /*
484 * The compressed size also needs to be no larger than a sector.
485 * That's also why we only need one page as the parameter.
486 */
487 if (compressed_folio)
488 ASSERT(compressed_size <= sectorsize);
489 else
490 ASSERT(compressed_size == 0);
491
492 if (compressed_size && compressed_folio)
493 cur_size = compressed_size;
494
495 if (!extent_inserted) {
496 struct btrfs_key key;
497 size_t datasize;
498
499 key.objectid = btrfs_ino(inode);
500 key.type = BTRFS_EXTENT_DATA_KEY;
501 key.offset = 0;
502
503 datasize = btrfs_file_extent_calc_inline_size(cur_size);
504 ret = btrfs_insert_empty_item(trans, root, path, &key,
505 datasize);
506 if (ret)
507 goto fail;
508 }
509 leaf = path->nodes[0];
510 ei = btrfs_item_ptr(leaf, path->slots[0],
511 struct btrfs_file_extent_item);
512 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
513 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
514 btrfs_set_file_extent_encryption(leaf, ei, 0);
515 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
516 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
517 ptr = btrfs_file_extent_inline_start(ei);
518
519 if (compress_type != BTRFS_COMPRESS_NONE) {
520 kaddr = kmap_local_folio(compressed_folio, 0);
521 write_extent_buffer(leaf, kaddr, ptr, compressed_size);
522 kunmap_local(kaddr);
523
524 btrfs_set_file_extent_compression(leaf, ei,
525 compress_type);
526 } else {
527 struct folio *folio;
528
529 folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
530 ASSERT(!IS_ERR(folio));
531 btrfs_set_file_extent_compression(leaf, ei, 0);
532 kaddr = kmap_local_folio(folio, 0);
533 write_extent_buffer(leaf, kaddr, ptr, size);
534 kunmap_local(kaddr);
535 folio_put(folio);
536 }
537 btrfs_release_path(path);
538
539 /*
540 * We align size to sectorsize for inline extents just for simplicity's
541 * sake.
542 */
543 ret = btrfs_inode_set_file_extent_range(inode, 0,
544 ALIGN(size, root->fs_info->sectorsize));
545 if (ret)
546 goto fail;
547
548 /*
549 * We're an inline extent, so nobody can extend the file past i_size
550 * without locking a page we already have locked.
551 *
552 * We must do any i_size and inode updates before we unlock the pages.
553 * Otherwise we could end up racing with unlink.
554 */
555 i_size = i_size_read(&inode->vfs_inode);
556 if (update_i_size && size > i_size) {
557 i_size_write(&inode->vfs_inode, size);
558 i_size = size;
559 }
560 inode->disk_i_size = i_size;
561
562 fail:
563 return ret;
564 }
565
566 static bool can_cow_file_range_inline(struct btrfs_inode *inode,
567 u64 offset, u64 size,
568 size_t compressed_size)
569 {
570 struct btrfs_fs_info *fs_info = inode->root->fs_info;
571 u64 data_len = (compressed_size ?: size);
572
573 /* Inline extents must start at offset 0. */
574 if (offset != 0)
575 return false;
576
577 /* Inline extents are limited to sectorsize. */
578 if (size > fs_info->sectorsize)
579 return false;
580
581 /* We do not allow a non-compressed extent to be as large as block size. */
582 if (data_len >= fs_info->sectorsize)
583 return false;
584
585 /* We cannot exceed the maximum inline data size. */
586 if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
587 return false;
588
589 /* We cannot exceed the user specified max_inline size. */
590 if (data_len > fs_info->max_inline)
591 return false;
592
593 /* Inline extents must be the entirety of the file. */
594 if (size < i_size_read(&inode->vfs_inode))
595 return false;
596
597 return true;
598 }
599
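/*
 * Worked example (editorial, assuming a 4K sectorsize and the default 2K
 * max_inline mount option): a 1000 byte file written at offset 0 passes every
 * check above (1000 <= 4096, 1000 < 4096, 1000 <= max_inline, and the write
 * covers the whole i_size), so it may be inlined. A 3000 byte uncompressed
 * file fails the max_inline check and goes through the regular COW path.
 */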
600 /*
601 * Conditionally insert an inline extent into the file. This
602 * does the checks required to make sure the data is small enough
603 * to fit as an inline extent.
604 *
605 * If being used directly, you must have already checked we're allowed to cow
606 * the range by getting true from can_cow_file_range_inline().
607 */
608 static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
609 u64 size, size_t compressed_size,
610 int compress_type,
611 struct folio *compressed_folio,
612 bool update_i_size)
613 {
614 struct btrfs_drop_extents_args drop_args = { 0 };
615 struct btrfs_root *root = inode->root;
616 struct btrfs_fs_info *fs_info = root->fs_info;
617 struct btrfs_trans_handle *trans;
618 u64 data_len = (compressed_size ?: size);
619 int ret;
620 struct btrfs_path *path;
621
622 path = btrfs_alloc_path();
623 if (!path)
624 return -ENOMEM;
625
626 trans = btrfs_join_transaction(root);
627 if (IS_ERR(trans)) {
628 btrfs_free_path(path);
629 return PTR_ERR(trans);
630 }
631 trans->block_rsv = &inode->block_rsv;
632
633 drop_args.path = path;
634 drop_args.start = 0;
635 drop_args.end = fs_info->sectorsize;
636 drop_args.drop_cache = true;
637 drop_args.replace_extent = true;
638 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
639 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
640 if (unlikely(ret)) {
641 btrfs_abort_transaction(trans, ret);
642 goto out;
643 }
644
645 ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
646 size, compressed_size, compress_type,
647 compressed_folio, update_i_size);
648 if (unlikely(ret && ret != -ENOSPC)) {
649 btrfs_abort_transaction(trans, ret);
650 goto out;
651 } else if (ret == -ENOSPC) {
652 ret = 1;
653 goto out;
654 }
655
656 btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
657 ret = btrfs_update_inode(trans, inode);
658 if (unlikely(ret && ret != -ENOSPC)) {
659 btrfs_abort_transaction(trans, ret);
660 goto out;
661 } else if (ret == -ENOSPC) {
662 ret = 1;
663 goto out;
664 }
665
666 btrfs_set_inode_full_sync(inode);
667 out:
668 /*
669 * Don't forget to free the reserved space, as for an inlined extent
670 * it won't count as a data extent, so free it directly here.
671 * And at reserve time it's always aligned to the sector size, so
672 * just free one sector's worth here.
673 */
674 btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
675 btrfs_free_path(path);
676 btrfs_end_transaction(trans);
677 return ret;
678 }
679
680 static noinline int cow_file_range_inline(struct btrfs_inode *inode,
681 struct folio *locked_folio,
682 u64 offset, u64 end,
683 size_t compressed_size,
684 int compress_type,
685 struct folio *compressed_folio,
686 bool update_i_size)
687 {
688 struct extent_state *cached = NULL;
689 unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
690 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
691 u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
692 int ret;
693
694 if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
695 return 1;
696
697 btrfs_lock_extent(&inode->io_tree, offset, end, &cached);
698 ret = __cow_file_range_inline(inode, size, compressed_size,
699 compress_type, compressed_folio,
700 update_i_size);
701 if (ret > 0) {
702 btrfs_unlock_extent(&inode->io_tree, offset, end, &cached);
703 return ret;
704 }
705
706 /*
707 * In the successful case (ret == 0 here), cow_file_range will return 1.
708 *
709 * Quite a bit further up the callstack in extent_writepage(), ret == 1
710 * is treated as a short-circuited success and does not unlock the folio,
711 * so we must do it here.
712 *
713 * In the failure case, the locked_folio does get unlocked by
714 * btrfs_folio_end_all_writers, which asserts that it is still locked
715 * at that point, so we must *not* unlock it here.
716 *
717 * The other two callsites in compress_file_range do not have a
718 * locked_folio, so they are not relevant to this logic.
719 */
720 if (ret == 0)
721 locked_folio = NULL;
722
723 extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
724 clear_flags, PAGE_UNLOCK |
725 PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
726 return ret;
727 }
728
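/*
 * State for asynchronous compressed writeback. An async_cow covers a whole
 * delalloc range split into async_chunks (see run_delalloc_compressed()); each
 * async_chunk collects the async_extents produced by compress_file_range()
 * until submit_compressed_extents() writes them out.
 */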
729 struct async_extent {
730 u64 start;
731 u64 ram_size;
732 u64 compressed_size;
733 struct folio **folios;
734 unsigned long nr_folios;
735 int compress_type;
736 struct list_head list;
737 };
738
739 struct async_chunk {
740 struct btrfs_inode *inode;
741 struct folio *locked_folio;
742 u64 start;
743 u64 end;
744 blk_opf_t write_flags;
745 struct list_head extents;
746 struct cgroup_subsys_state *blkcg_css;
747 struct btrfs_work work;
748 struct async_cow *async_cow;
749 };
750
751 struct async_cow {
752 atomic_t num_chunks;
753 struct async_chunk chunks[];
754 };
755
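/*
 * Queue one extent produced by compress_file_range() (compressed or not) on
 * the async_chunk's list for later submission by submit_compressed_extents().
 */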
756 static noinline int add_async_extent(struct async_chunk *cow,
757 u64 start, u64 ram_size,
758 u64 compressed_size,
759 struct folio **folios,
760 unsigned long nr_folios,
761 int compress_type)
762 {
763 struct async_extent *async_extent;
764
765 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
766 if (!async_extent)
767 return -ENOMEM;
768 async_extent->start = start;
769 async_extent->ram_size = ram_size;
770 async_extent->compressed_size = compressed_size;
771 async_extent->folios = folios;
772 async_extent->nr_folios = nr_folios;
773 async_extent->compress_type = compress_type;
774 list_add_tail(&async_extent->list, &cow->extents);
775 return 0;
776 }
777
778 /*
779 * Check if the inode needs to be submitted to compression, based on mount
780 * options, defragmentation, properties or heuristics.
781 */
782 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
783 u64 end)
784 {
785 struct btrfs_fs_info *fs_info = inode->root->fs_info;
786
787 if (!btrfs_inode_can_compress(inode)) {
788 DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode));
789 return 0;
790 }
791
792 /* Defrag ioctl takes precedence over mount options and properties. */
793 if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS)
794 return 0;
795 if (BTRFS_COMPRESS_NONE < inode->defrag_compress &&
796 inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES)
797 return 1;
798 /* force compress */
799 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
800 return 1;
801 /* bad compression ratios */
802 if (inode->flags & BTRFS_INODE_NOCOMPRESS)
803 return 0;
804 if (btrfs_test_opt(fs_info, COMPRESS) ||
805 inode->flags & BTRFS_INODE_COMPRESS ||
806 inode->prop_compress)
807 return btrfs_compress_heuristic(inode, start, end);
808 return 0;
809 }
810
811 static inline void inode_should_defrag(struct btrfs_inode *inode,
812 u64 start, u64 end, u64 num_bytes, u32 small_write)
813 {
814 /* If this is a small write inside eof, kick off a defrag */
815 if (num_bytes < small_write &&
816 (start > 0 || end + 1 < inode->disk_i_size))
817 btrfs_add_inode_defrag(inode, small_write);
818 }
819
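/*
 * Clear the dirty flag on every folio covering [start, end] so that mmap
 * writers cannot change the contents while we compress them; returns the
 * first error from filemap_get_folio(), if any.
 */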
820 static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
821 {
822 const pgoff_t end_index = end >> PAGE_SHIFT;
823 struct folio *folio;
824 int ret = 0;
825
826 for (pgoff_t index = start >> PAGE_SHIFT; index <= end_index; index++) {
827 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
828 if (IS_ERR(folio)) {
829 if (!ret)
830 ret = PTR_ERR(folio);
831 continue;
832 }
833 btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
834 end + 1 - start);
835 folio_put(folio);
836 }
837 return ret;
838 }
839
840 /*
841 * Work queue callback to start compression on a file and its pages.
842 *
843 * This is done inside an ordered work queue, and the compression is spread
844 * across many cpus. The actual IO submission is step two, and the ordered work
845 * queue takes care of making sure that happens in the same order things were
846 * put onto the queue by writepages and friends.
847 *
848 * If this code finds it can't get good compression, it puts an entry onto the
849 * work queue to write the uncompressed bytes. This makes sure that both
850 * compressed inodes and uncompressed inodes are written in the same order that
851 * the flusher thread sent them down.
852 */
853 static void compress_file_range(struct btrfs_work *work)
854 {
855 struct async_chunk *async_chunk =
856 container_of(work, struct async_chunk, work);
857 struct btrfs_inode *inode = async_chunk->inode;
858 struct btrfs_fs_info *fs_info = inode->root->fs_info;
859 struct address_space *mapping = inode->vfs_inode.i_mapping;
860 const u32 min_folio_shift = PAGE_SHIFT + fs_info->block_min_order;
861 const u32 min_folio_size = btrfs_min_folio_size(fs_info);
862 u64 blocksize = fs_info->sectorsize;
863 u64 start = async_chunk->start;
864 u64 end = async_chunk->end;
865 u64 actual_end;
866 u64 i_size;
867 int ret = 0;
868 struct folio **folios;
869 unsigned long nr_folios;
870 unsigned long total_compressed = 0;
871 unsigned long total_in = 0;
872 unsigned int loff;
873 int i;
874 int compress_type = fs_info->compress_type;
875 int compress_level = fs_info->compress_level;
876
877 inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
878
879 /*
880 * We need to call clear_page_dirty_for_io on each page in the range.
881 * Otherwise applications with the file mmap'd can wander in and change
882 * the page contents while we are compressing them.
883 */
884 ret = extent_range_clear_dirty_for_io(inode, start, end);
885
886 /*
887 * All the folios should have been locked, thus there should be no failure.
888 *
889 * And even if some folios are missing, btrfs_compress_folios()
890 * would handle them correctly, so here we just do an ASSERT() check to
891 * catch logic errors early.
892 */
893 ASSERT(ret == 0);
894
895 /*
896 * We need to save i_size before now because it could change in between
897 * us evaluating the size and assigning it. This is because we lock and
898 * unlock the page in truncate and fallocate, and then modify the i_size
899 * later on.
900 *
901 * The barriers are to emulate READ_ONCE, remove that once i_size_read
902 * does that for us.
903 */
904 barrier();
905 i_size = i_size_read(&inode->vfs_inode);
906 barrier();
907 actual_end = min_t(u64, i_size, end + 1);
908 again:
909 folios = NULL;
910 nr_folios = (end >> min_folio_shift) - (start >> min_folio_shift) + 1;
911 nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED >> min_folio_shift);
912
913 /*
914 * we don't want to send crud past the end of i_size through
915 * compression, that's just a waste of CPU time. So, if the
916 * end of the file is before the start of our current
917 * requested range of bytes, we bail out to the uncompressed
918 * cleanup code that can deal with all of this.
919 *
920 * It isn't really the fastest way to fix things, but this is a
921 * very uncommon corner.
922 */
923 if (actual_end <= start)
924 goto cleanup_and_bail_uncompressed;
925
926 total_compressed = actual_end - start;
927
928 /*
929 * Skip compression for a small file range (<= blocksize) that
930 * isn't an inline extent, since it doesn't save disk space at all.
931 */
932 if (total_compressed <= blocksize &&
933 (start > 0 || end + 1 < inode->disk_i_size))
934 goto cleanup_and_bail_uncompressed;
935
936 total_compressed = min_t(unsigned long, total_compressed,
937 BTRFS_MAX_UNCOMPRESSED);
938 total_in = 0;
939 ret = 0;
940
941 /*
942 * We do compression for mount -o compress and when the inode has not
943 * been flagged as NOCOMPRESS. This flag can change at any time if we
944 * discover bad compression ratios.
945 */
946 if (!inode_need_compress(inode, start, end))
947 goto cleanup_and_bail_uncompressed;
948
949 folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
950 if (!folios) {
951 /*
952 * Memory allocation failure is not a fatal error, we can fall
953 * back to uncompressed code.
954 */
955 goto cleanup_and_bail_uncompressed;
956 }
957
958 if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) {
959 compress_type = inode->defrag_compress;
960 compress_level = inode->defrag_compress_level;
961 } else if (inode->prop_compress) {
962 compress_type = inode->prop_compress;
963 }
964
965 /* Compression level is applied here. */
966 ret = btrfs_compress_folios(compress_type, compress_level,
967 inode, start, folios, &nr_folios, &total_in,
968 &total_compressed);
969 if (ret)
970 goto mark_incompressible;
971
972 /*
973 * Zero the tail end of the last folio, as we might be sending it down
974 * to disk.
975 */
976 loff = (total_compressed & (min_folio_size - 1));
977 if (loff)
978 folio_zero_range(folios[nr_folios - 1], loff, min_folio_size - loff);
979
980 /*
981 * Try to create an inline extent.
982 *
983 * If we didn't compress the entire range, try to create an uncompressed
984 * inline extent, else a compressed one.
985 *
986 * Check cow_file_range() for why we don't even try to create inline
987 * extent for the subpage case.
988 */
989 if (total_in < actual_end)
990 ret = cow_file_range_inline(inode, NULL, start, end, 0,
991 BTRFS_COMPRESS_NONE, NULL, false);
992 else
993 ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
994 compress_type, folios[0], false);
995 if (ret <= 0) {
996 if (ret < 0)
997 mapping_set_error(mapping, -EIO);
998 goto free_pages;
999 }
1000
1001 /*
1002 * We aren't doing an inline extent. Round the compressed size up to a
1003 * block size boundary so the allocator does sane things.
1004 */
1005 total_compressed = ALIGN(total_compressed, blocksize);
1006
1007 /*
1008 * One last check to make sure the compression is really a win: compare
1009 * the amount of data read with the space needed on disk; compression
1010 * must free at least one sector.
1011 */
1012 total_in = round_up(total_in, fs_info->sectorsize);
1013 if (total_compressed + blocksize > total_in)
1014 goto mark_incompressible;
1015
1016 /*
1017 * The async work queues will take care of doing actual allocation on
1018 * disk for these compressed pages, and will submit the bios.
1019 */
1020 ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
1021 nr_folios, compress_type);
1022 BUG_ON(ret);
1023 if (start + total_in < end) {
1024 start += total_in;
1025 cond_resched();
1026 goto again;
1027 }
1028 return;
1029
1030 mark_incompressible:
1031 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1032 inode->flags |= BTRFS_INODE_NOCOMPRESS;
1033 cleanup_and_bail_uncompressed:
1034 ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1035 BTRFS_COMPRESS_NONE);
1036 BUG_ON(ret);
1037 free_pages:
1038 if (folios) {
1039 for (i = 0; i < nr_folios; i++) {
1040 WARN_ON(folios[i]->mapping);
1041 btrfs_free_compr_folio(folios[i]);
1042 }
1043 kfree(folios);
1044 }
1045 }
1046
1047 static void free_async_extent_pages(struct async_extent *async_extent)
1048 {
1049 int i;
1050
1051 if (!async_extent->folios)
1052 return;
1053
1054 for (i = 0; i < async_extent->nr_folios; i++) {
1055 WARN_ON(async_extent->folios[i]->mapping);
1056 btrfs_free_compr_folio(async_extent->folios[i]);
1057 }
1058 kfree(async_extent->folios);
1059 async_extent->nr_folios = 0;
1060 async_extent->folios = NULL;
1061 }
1062
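/*
 * Fallback path for an async extent that was not (or could not be)
 * compressed: run the regular COW allocation over the range and write the
 * folios out synchronously via extent_write_locked_range().
 */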
1063 static void submit_uncompressed_range(struct btrfs_inode *inode,
1064 struct async_extent *async_extent,
1065 struct folio *locked_folio)
1066 {
1067 u64 start = async_extent->start;
1068 u64 end = async_extent->start + async_extent->ram_size - 1;
1069 int ret;
1070 struct writeback_control wbc = {
1071 .sync_mode = WB_SYNC_ALL,
1072 .range_start = start,
1073 .range_end = end,
1074 .no_cgroup_owner = 1,
1075 };
1076
1077 wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1078 ret = run_delalloc_cow(inode, locked_folio, start, end,
1079 &wbc, false);
1080 wbc_detach_inode(&wbc);
1081 if (ret < 0) {
1082 if (locked_folio)
1083 btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
1084 start, async_extent->ram_size);
1085 btrfs_err_rl(inode->root->fs_info,
1086 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
1087 __func__, btrfs_root_id(inode->root),
1088 btrfs_ino(inode), start, async_extent->ram_size, ret);
1089 }
1090 }
1091
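/*
 * Allocate the on-disk extent for one compressed async extent, create its
 * extent map and ordered extent, and submit the compressed write. If the
 * contiguous reservation fails, fall back to uncompressed writeback of the
 * same range.
 */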
1092 static void submit_one_async_extent(struct async_chunk *async_chunk,
1093 struct async_extent *async_extent,
1094 u64 *alloc_hint)
1095 {
1096 struct btrfs_inode *inode = async_chunk->inode;
1097 struct extent_io_tree *io_tree = &inode->io_tree;
1098 struct btrfs_root *root = inode->root;
1099 struct btrfs_fs_info *fs_info = root->fs_info;
1100 struct btrfs_ordered_extent *ordered;
1101 struct btrfs_file_extent file_extent;
1102 struct btrfs_key ins;
1103 struct folio *locked_folio = NULL;
1104 struct extent_state *cached = NULL;
1105 struct extent_map *em;
1106 int ret = 0;
1107 bool free_pages = false;
1108 u64 start = async_extent->start;
1109 u64 end = async_extent->start + async_extent->ram_size - 1;
1110
1111 if (async_chunk->blkcg_css)
1112 kthread_associate_blkcg(async_chunk->blkcg_css);
1113
1114 /*
1115 * If async_chunk->locked_folio is in the async_extent range, we need to
1116 * handle it.
1117 */
1118 if (async_chunk->locked_folio) {
1119 u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
1120 u64 locked_folio_end = locked_folio_start +
1121 folio_size(async_chunk->locked_folio) - 1;
1122
1123 if (!(start >= locked_folio_end || end <= locked_folio_start))
1124 locked_folio = async_chunk->locked_folio;
1125 }
1126
1127 if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1128 ASSERT(!async_extent->folios);
1129 ASSERT(async_extent->nr_folios == 0);
1130 submit_uncompressed_range(inode, async_extent, locked_folio);
1131 free_pages = true;
1132 goto done;
1133 }
1134
1135 ret = btrfs_reserve_extent(root, async_extent->ram_size,
1136 async_extent->compressed_size,
1137 async_extent->compressed_size,
1138 0, *alloc_hint, &ins, 1, 1);
1139 if (ret) {
1140 /*
1141 * We can't reserve contiguous space for the compressed size.
1142 * Unlikely, but it's possible that we could have enough
1143 * non-contiguous space for the uncompressed size instead. So
1144 * fall back to uncompressed.
1145 */
1146 submit_uncompressed_range(inode, async_extent, locked_folio);
1147 free_pages = true;
1148 goto done;
1149 }
1150
1151 btrfs_lock_extent(io_tree, start, end, &cached);
1152
1153 /* Here we're doing allocation and writeback of the compressed pages */
1154 file_extent.disk_bytenr = ins.objectid;
1155 file_extent.disk_num_bytes = ins.offset;
1156 file_extent.ram_bytes = async_extent->ram_size;
1157 file_extent.num_bytes = async_extent->ram_size;
1158 file_extent.offset = 0;
1159 file_extent.compression = async_extent->compress_type;
1160
1161 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
1162 if (IS_ERR(em)) {
1163 ret = PTR_ERR(em);
1164 goto out_free_reserve;
1165 }
1166 btrfs_free_extent_map(em);
1167
1168 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1169 1U << BTRFS_ORDERED_COMPRESSED);
1170 if (IS_ERR(ordered)) {
1171 btrfs_drop_extent_map_range(inode, start, end, false);
1172 ret = PTR_ERR(ordered);
1173 goto out_free_reserve;
1174 }
1175 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1176
1177 /* Clear dirty, set writeback and unlock the pages. */
1178 extent_clear_unlock_delalloc(inode, start, end,
1179 NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
1180 PAGE_UNLOCK | PAGE_START_WRITEBACK);
1181 btrfs_submit_compressed_write(ordered,
1182 async_extent->folios, /* compressed_folios */
1183 async_extent->nr_folios,
1184 async_chunk->write_flags, true);
1185 *alloc_hint = ins.objectid + ins.offset;
1186 done:
1187 if (async_chunk->blkcg_css)
1188 kthread_associate_blkcg(NULL);
1189 if (free_pages)
1190 free_async_extent_pages(async_extent);
1191 kfree(async_extent);
1192 return;
1193
1194 out_free_reserve:
1195 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1196 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
1197 mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1198 extent_clear_unlock_delalloc(inode, start, end,
1199 NULL, &cached,
1200 EXTENT_LOCKED | EXTENT_DELALLOC |
1201 EXTENT_DELALLOC_NEW |
1202 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1203 PAGE_UNLOCK | PAGE_START_WRITEBACK |
1204 PAGE_END_WRITEBACK);
1205 free_async_extent_pages(async_extent);
1206 if (async_chunk->blkcg_css)
1207 kthread_associate_blkcg(NULL);
1208 btrfs_debug(fs_info,
1209 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1210 btrfs_root_id(root), btrfs_ino(inode), start,
1211 async_extent->ram_size, ret);
1212 kfree(async_extent);
1213 }
1214
1215 u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1216 u64 num_bytes)
1217 {
1218 struct extent_map_tree *em_tree = &inode->extent_tree;
1219 struct extent_map *em;
1220 u64 alloc_hint = 0;
1221
1222 read_lock(&em_tree->lock);
1223 em = btrfs_search_extent_mapping(em_tree, start, num_bytes);
1224 if (em) {
1225 /*
1226 * if block start isn't an actual block number then find the
1227 * first block in this inode and use that as a hint. If that
1228 * block is also bogus then just don't worry about it.
1229 */
1230 if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
1231 btrfs_free_extent_map(em);
1232 em = btrfs_search_extent_mapping(em_tree, 0, 0);
1233 if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
1234 alloc_hint = btrfs_extent_map_block_start(em);
1235 if (em)
1236 btrfs_free_extent_map(em);
1237 } else {
1238 alloc_hint = btrfs_extent_map_block_start(em);
1239 btrfs_free_extent_map(em);
1240 }
1241 }
1242 read_unlock(&em_tree->lock);
1243
1244 return alloc_hint;
1245 }
1246
1247 /*
1248 * When extent_io.c finds a delayed allocation range in the file,
1249 * the callbacks end up in this code. The basic idea is to
1250 * allocate extents on disk for the range, and create ordered data structs
1251 * in ram to track those extents.
1252 *
1253 * locked_folio is the folio that writepage had locked already. We use
1254 * it to make sure we don't do extra locks or unlocks.
1255 *
1256 * When this function fails, it unlocks all folios except @locked_folio.
1257 *
1258 * When this function successfully creates an inline extent, it returns 1 and
1259 * unlocks all folios including locked_folio and starts I/O on them.
1260 * (In reality inline extents are limited to a single block, so locked_folio is
1261 * the only folio handled anyway).
1262 *
1263 * When this function succeeds and creates a normal extent, the folio locking
1264 * status depends on the passed in flags:
1265 *
1266 * - If COW_FILE_RANGE_KEEP_LOCKED flag is set, all folios are kept locked.
1267 * - Else all folios except for @locked_folio are unlocked.
1268 *
1269 * When a failure happens in the second or later iteration of the
1270 * while-loop, the ordered extents created in previous iterations are cleaned up.
1271 */
1272 static noinline int cow_file_range(struct btrfs_inode *inode,
1273 struct folio *locked_folio, u64 start,
1274 u64 end, u64 *done_offset,
1275 unsigned long flags)
1276 {
1277 struct btrfs_root *root = inode->root;
1278 struct btrfs_fs_info *fs_info = root->fs_info;
1279 struct extent_state *cached = NULL;
1280 u64 alloc_hint = 0;
1281 u64 orig_start = start;
1282 u64 num_bytes;
1283 u64 cur_alloc_size = 0;
1284 u64 min_alloc_size;
1285 u64 blocksize = fs_info->sectorsize;
1286 struct btrfs_key ins;
1287 struct extent_map *em;
1288 unsigned clear_bits;
1289 unsigned long page_ops;
1290 int ret = 0;
1291
1292 if (btrfs_is_free_space_inode(inode)) {
1293 ret = -EINVAL;
1294 goto out_unlock;
1295 }
1296
1297 num_bytes = ALIGN(end - start + 1, blocksize);
1298 num_bytes = max(blocksize, num_bytes);
1299 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1300
1301 inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1302
1303 if (!(flags & COW_FILE_RANGE_NO_INLINE)) {
1304 /* Let's try to make an inline extent. */
1305 ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
1306 BTRFS_COMPRESS_NONE, NULL, false);
1307 if (ret <= 0) {
1308 /*
1309 * We succeeded, return 1 so the caller knows we're done
1310 * with this page and already handled the IO.
1311 *
1312 * If there was an error then cow_file_range_inline() has
1313 * already done the cleanup.
1314 */
1315 if (ret == 0)
1316 ret = 1;
1317 goto done;
1318 }
1319 }
1320
1321 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
1322
1323 /*
1324 * We're not doing compressed IO, don't unlock the first page (which
1325 * the caller expects to stay locked), don't clear any dirty bits and
1326 * don't set any writeback bits.
1327 *
1328 * Do set the Ordered (Private2) bit so we know this page was properly
1329 * setup for writepage.
1330 */
1331 page_ops = ((flags & COW_FILE_RANGE_KEEP_LOCKED) ? 0 : PAGE_UNLOCK);
1332 page_ops |= PAGE_SET_ORDERED;
1333
1334 /*
1335 * Relocation relies on the relocated extents to have exactly the same
1336 * size as the original extents. Normally writeback for relocation data
1337 * extents follows a NOCOW path because relocation preallocates the
1338 * extents. However, due to an operation such as scrub turning a block
1339 * group to RO mode, it may fallback to COW mode, so we must make sure
1340 * an extent allocated during COW has exactly the requested size and can
1341 * not be split into smaller extents, otherwise relocation breaks and
1342 * fails during the stage where it updates the bytenr of file extent
1343 * items.
1344 */
1345 if (btrfs_is_data_reloc_root(root))
1346 min_alloc_size = num_bytes;
1347 else
1348 min_alloc_size = fs_info->sectorsize;
1349
1350 while (num_bytes > 0) {
1351 struct btrfs_ordered_extent *ordered;
1352 struct btrfs_file_extent file_extent;
1353
1354 ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
1355 min_alloc_size, 0, alloc_hint,
1356 &ins, 1, 1);
1357 if (ret == -EAGAIN) {
1358 /*
1359 * btrfs_reserve_extent only returns -EAGAIN for zoned
1360 * file systems, which is an indication that there are
1361 * no active zones to allocate from at the moment.
1362 *
1363 * If this is the first loop iteration, wait for at
1364 * least one zone to finish before retrying the
1365 * allocation. Otherwise ask the caller to write out
1366 * the already allocated blocks before coming back to
1367 * us, or return -ENOSPC if it can't handle retries.
1368 */
1369 ASSERT(btrfs_is_zoned(fs_info));
1370 if (start == orig_start) {
1371 wait_on_bit_io(&inode->root->fs_info->flags,
1372 BTRFS_FS_NEED_ZONE_FINISH,
1373 TASK_UNINTERRUPTIBLE);
1374 continue;
1375 }
1376 if (done_offset) {
1377 /*
1378 * Move @end to the end of the processed range,
1379 * and exit the loop to unlock the processed extents.
1380 */
1381 end = start - 1;
1382 ret = 0;
1383 break;
1384 }
1385 ret = -ENOSPC;
1386 }
1387 if (ret < 0)
1388 goto out_unlock;
1389 cur_alloc_size = ins.offset;
1390
1391 file_extent.disk_bytenr = ins.objectid;
1392 file_extent.disk_num_bytes = ins.offset;
1393 file_extent.num_bytes = ins.offset;
1394 file_extent.ram_bytes = ins.offset;
1395 file_extent.offset = 0;
1396 file_extent.compression = BTRFS_COMPRESS_NONE;
1397
1398 /*
1399 * Locked range will be released either during error clean up or
1400 * after the whole range is finished.
1401 */
1402 btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
1403 &cached);
1404
1405 em = btrfs_create_io_em(inode, start, &file_extent,
1406 BTRFS_ORDERED_REGULAR);
1407 if (IS_ERR(em)) {
1408 btrfs_unlock_extent(&inode->io_tree, start,
1409 start + cur_alloc_size - 1, &cached);
1410 ret = PTR_ERR(em);
1411 goto out_reserve;
1412 }
1413 btrfs_free_extent_map(em);
1414
1415 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1416 1U << BTRFS_ORDERED_REGULAR);
1417 if (IS_ERR(ordered)) {
1418 btrfs_unlock_extent(&inode->io_tree, start,
1419 start + cur_alloc_size - 1, &cached);
1420 ret = PTR_ERR(ordered);
1421 goto out_drop_extent_cache;
1422 }
1423
1424 if (btrfs_is_data_reloc_root(root)) {
1425 ret = btrfs_reloc_clone_csums(ordered);
1426
1427 /*
1428 * Only drop cache here, and process as normal.
1429 *
1430 * We must not allow extent_clear_unlock_delalloc()
1431 * at the out_unlock label to free the metadata of this
1432 * ordered extent, as its metadata should be freed by
1433 * btrfs_finish_ordered_io().
1434 *
1435 * So we must continue until @start is increased to
1436 * skip the current ordered extent.
1437 */
1438 if (ret)
1439 btrfs_drop_extent_map_range(inode, start,
1440 start + cur_alloc_size - 1,
1441 false);
1442 }
1443 btrfs_put_ordered_extent(ordered);
1444
1445 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1446
1447 if (num_bytes < cur_alloc_size)
1448 num_bytes = 0;
1449 else
1450 num_bytes -= cur_alloc_size;
1451 alloc_hint = ins.objectid + ins.offset;
1452 start += cur_alloc_size;
1453 cur_alloc_size = 0;
1454
1455 /*
1456 * On a btrfs_reloc_clone_csums() error: since start has been increased,
1457 * extent_clear_unlock_delalloc() at the out_unlock label won't free the
1458 * metadata of the current ordered extent, so we're OK to exit.
1459 */
1460 if (ret)
1461 goto out_unlock;
1462 }
1463 extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
1464 EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
1465 done:
1466 if (done_offset)
1467 *done_offset = end;
1468 return ret;
1469
1470 out_drop_extent_cache:
1471 btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
1472 out_reserve:
1473 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1474 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
1475 out_unlock:
1476 /*
1477 * Now, we have three regions to clean up:
1478 *
1479 * |-------(1)----|---(2)---|-------------(3)----------|
1480 * `- orig_start `- start `- start + cur_alloc_size `- end
1481 *
1482 * We process each region below.
1483 */
1484
1485 /*
1486 * For the range (1). We have already instantiated the ordered extents
1487 * for this region, thus we need to clean up those ordered extents.
1488 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
1489 * are also handled by the ordered extents cleanup.
1490 *
1491 * So here we only clear the EXTENT_LOCKED and EXTENT_DELALLOC flags, and
1492 * finish the writeback of the involved folios, which will never be submitted.
1493 */
1494 if (orig_start < start) {
1495 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
1496 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1497
1498 if (!locked_folio)
1499 mapping_set_error(inode->vfs_inode.i_mapping, ret);
1500
1501 btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
1502 extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1503 locked_folio, NULL, clear_bits, page_ops);
1504 }
1505
1506 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1507 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1508 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1509
1510 /*
1511 * For the range (2). If we reserved an extent for our delalloc range
1512 * (or a subrange) and failed to create the respective ordered extent,
1513 * then it means that when we reserved the extent we decremented the
1514 * extent's size from the data space_info's bytes_may_use counter and
1515 * incremented the space_info's bytes_reserved counter by the same
1516 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1517 * to decrement again the data space_info's bytes_may_use counter,
1518 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1519 */
1520 if (cur_alloc_size) {
1521 extent_clear_unlock_delalloc(inode, start,
1522 start + cur_alloc_size - 1,
1523 locked_folio, &cached, clear_bits,
1524 page_ops);
1525 btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
1526 }
1527
1528 /*
1529 * For the range (3). We never touched the region. In addition to the
1530 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1531 * space_info's bytes_may_use counter, reserved in
1532 * btrfs_check_data_free_space().
1533 */
1534 if (start + cur_alloc_size < end) {
1535 clear_bits |= EXTENT_CLEAR_DATA_RESV;
1536 extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
1537 end, locked_folio,
1538 &cached, clear_bits, page_ops);
1539 btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
1540 end - start - cur_alloc_size + 1, NULL);
1541 }
1542 btrfs_err(fs_info,
1543 "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu cur_alloc_size=%llu: %d",
1544 __func__, btrfs_root_id(inode->root),
1545 btrfs_ino(inode), orig_start, end + 1 - orig_start,
1546 start, cur_alloc_size, ret);
1547 return ret;
1548 }
1549
1550 /*
1551 * Phase two of compressed writeback. This is the ordered portion of the code,
1552 * which only gets called in the order the work was queued. We walk all the
1553 * async extents created by compress_file_range and send them down to the disk.
1554 *
1555 * If called with @do_free == true then it'll try to finish the work and free
1556 * the work struct eventually.
1557 */
1558 static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
1559 {
1560 struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1561 work);
1562 struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1563 struct async_extent *async_extent;
1564 unsigned long nr_pages;
1565 u64 alloc_hint = 0;
1566
1567 if (do_free) {
1568 struct async_cow *async_cow;
1569
1570 btrfs_add_delayed_iput(async_chunk->inode);
1571 if (async_chunk->blkcg_css)
1572 css_put(async_chunk->blkcg_css);
1573
1574 async_cow = async_chunk->async_cow;
1575 if (atomic_dec_and_test(&async_cow->num_chunks))
1576 kvfree(async_cow);
1577 return;
1578 }
1579
1580 nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1581 PAGE_SHIFT;
1582
1583 while (!list_empty(&async_chunk->extents)) {
1584 async_extent = list_first_entry(&async_chunk->extents,
1585 struct async_extent, list);
1586 list_del(&async_extent->list);
1587 submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1588 }
1589
1590 /* atomic_sub_return implies a barrier */
1591 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1592 5 * SZ_1M)
1593 cond_wake_up_nomb(&fs_info->async_submit_wait);
1594 }
1595
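/*
 * Kick off asynchronous compressed writeback for the delalloc range by
 * splitting it into 512K async chunks and queueing them on the delalloc
 * workers. Returns true once all chunks have been queued, or false if the
 * async context could not be allocated and the range was left untouched.
 */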
1596 static bool run_delalloc_compressed(struct btrfs_inode *inode,
1597 struct folio *locked_folio, u64 start,
1598 u64 end, struct writeback_control *wbc)
1599 {
1600 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1601 struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1602 struct async_cow *ctx;
1603 struct async_chunk *async_chunk;
1604 unsigned long nr_pages;
1605 u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1606 int i;
1607 unsigned nofs_flag;
1608 const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1609
1610 nofs_flag = memalloc_nofs_save();
1611 ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1612 memalloc_nofs_restore(nofs_flag);
1613 if (!ctx)
1614 return false;
1615
1616 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1617
1618 async_chunk = ctx->chunks;
1619 atomic_set(&ctx->num_chunks, num_chunks);
1620
1621 for (i = 0; i < num_chunks; i++) {
1622 u64 cur_end = min(end, start + SZ_512K - 1);
1623
1624 /*
1625 * igrab is called higher up in the call chain, take only the
1626 * lightweight reference for the callback lifetime
1627 */
1628 ihold(&inode->vfs_inode);
1629 async_chunk[i].async_cow = ctx;
1630 async_chunk[i].inode = inode;
1631 async_chunk[i].start = start;
1632 async_chunk[i].end = cur_end;
1633 async_chunk[i].write_flags = write_flags;
1634 INIT_LIST_HEAD(&async_chunk[i].extents);
1635
1636 /*
1637 * The locked_folio comes all the way from writepage and it's
1638 * the original folio we were actually given. As we spread
1639 * this large delalloc region across multiple async_chunk
1640 * structs, only the first struct needs a pointer to
1641 * locked_folio.
1642 *
1643 * This way we don't need racy decisions about who is supposed
1644 * to unlock it.
1645 */
1646 if (locked_folio) {
1647 /*
1648 * Depending on the compressibility, the pages might or
1649 * might not go through async. We want all of them to
1650 * be accounted against wbc once. Let's do it here
1651 * before the paths diverge. wbc accounting is used
1652 * only for foreign writeback detection and doesn't
1653 * need full accuracy. Just account the whole thing
1654 * against the first page.
1655 */
1656 wbc_account_cgroup_owner(wbc, locked_folio,
1657 cur_end - start);
1658 async_chunk[i].locked_folio = locked_folio;
1659 locked_folio = NULL;
1660 } else {
1661 async_chunk[i].locked_folio = NULL;
1662 }
1663
1664 if (blkcg_css != blkcg_root_css) {
1665 css_get(blkcg_css);
1666 async_chunk[i].blkcg_css = blkcg_css;
1667 async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1668 } else {
1669 async_chunk[i].blkcg_css = NULL;
1670 }
1671
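/*
 * Added commentary (an assumption based on the call below, not from the
 * original source): compress_file_range() is queued as the normal work
 * function and submit_compressed_extents() as the ordered one, so the
 * submission phase runs in the order the chunks were queued; see the
 * comment above submit_compressed_extents() for the final
 * @do_free == true invocation that releases the chunk.
 */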
1672 btrfs_init_work(&async_chunk[i].work, compress_file_range,
1673 submit_compressed_extents);
1674
1675 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1676 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1677
1678 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1679
1680 start = cur_end + 1;
1681 }
1682 return true;
1683 }
1684
1685 /*
1686 * Run the delalloc range from start to end, and write back any dirty pages
1687 * covered by the range.
1688 */
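/*
 * Added commentary (hedged, not from the original source): cow_file_range()
 * below may make only partial progress and reports how far it got through
 * @done_offset (useful e.g. on zoned filesystems where a single allocation
 * can be capped), so the loop keeps allocating and writing back the already
 * locked range until the whole [start, end] region is covered.
 */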
1689 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1690 struct folio *locked_folio, u64 start,
1691 u64 end, struct writeback_control *wbc,
1692 bool pages_dirty)
1693 {
1694 u64 done_offset = end;
1695 int ret;
1696
1697 while (start <= end) {
1698 ret = cow_file_range(inode, locked_folio, start, end,
1699 &done_offset, COW_FILE_RANGE_KEEP_LOCKED);
1700 if (ret)
1701 return ret;
1702 extent_write_locked_range(&inode->vfs_inode, locked_folio,
1703 start, done_offset, wbc, pages_dirty);
1704 start = done_offset + 1;
1705 }
1706
1707 return 1;
1708 }
1709
1710 static int fallback_to_cow(struct btrfs_inode *inode,
1711 struct folio *locked_folio, const u64 start,
1712 const u64 end)
1713 {
1714 const bool is_space_ino = btrfs_is_free_space_inode(inode);
1715 const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1716 const u64 range_bytes = end + 1 - start;
1717 struct extent_io_tree *io_tree = &inode->io_tree;
1718 struct extent_state *cached_state = NULL;
1719 u64 range_start = start;
1720 u64 count;
1721 int ret;
1722
1723 /*
1724 * If EXTENT_NORESERVE is set it means that when the buffered write was
1725 * made we did not have enough available data space and therefore we did
1726 * not reserve data space for it, since we thought we could do NOCOW for
1727 * the respective file range (either there is a prealloc extent or the
1728 * inode has the NOCOW bit set).
1729 *
1730 * However, when we need to fall back to COW mode (because for example the
1731 * block group for the corresponding extent was turned to RO mode by a
1732 * scrub or relocation) we need to do the following:
1733 *
1734 * 1) We increment the bytes_may_use counter of the data space info.
1735 * If COW succeeds, it allocates a new data extent and after doing
1736 * that it decrements the space info's bytes_may_use counter and
1737 * increments its bytes_reserved counter by the same amount (we do
1738 * this at btrfs_add_reserved_bytes()). So we need to increment the
1739 * bytes_may_use counter to compensate (when space is reserved at
1740 * buffered write time, the bytes_may_use counter is incremented);
1741 *
1742 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1743 * that if the COW path fails for any reason, it decrements (through
1744 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1745 * data space info, which we incremented in the step above.
1746 *
1747 * If we need to fall back to COW and the inode corresponds to a free
1748 * space cache inode or an inode of the data relocation tree, we must
1749 * also increment bytes_may_use of the data space_info for the same
1750 * reason. Space caches and relocated data extents always get a prealloc
1751 * extent for them, however scrub or balance may have set the block
1752 * group that contains that extent to RO mode and therefore force COW
1753 * when starting writeback.
1754 */
1755 btrfs_lock_extent(io_tree, start, end, &cached_state);
1756 count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes,
1757 EXTENT_NORESERVE, 0, NULL);
1758 if (count > 0 || is_space_ino || is_reloc_ino) {
1759 u64 bytes = count;
1760 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1761 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1762
1763 if (is_space_ino || is_reloc_ino)
1764 bytes = range_bytes;
1765
1766 spin_lock(&sinfo->lock);
1767 btrfs_space_info_update_bytes_may_use(sinfo, bytes);
1768 spin_unlock(&sinfo->lock);
1769
1770 if (count > 0)
1771 btrfs_clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1772 &cached_state);
1773 }
1774 btrfs_unlock_extent(io_tree, start, end, &cached_state);
1775
1776 /*
1777 * Don't try to create inline extents, as a mix of inline extent that
1778 * is written out and unlocked directly and a normal NOCOW extent
1779 * doesn't work.
1780 *
1781 * Also, we do not unlock the folios after a successful run here. They
1782 * will be unlocked after everything is finished, or by error handling.
1783 *
1784 * This ensures error handling never needs to clear dirty/ordered flags
1785 * on an unlocked folio, which could race with writeback.
1786 */
1787 ret = cow_file_range(inode, locked_folio, start, end, NULL,
1788 COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED);
1789 ASSERT(ret != 1);
1790 return ret;
1791 }
1792
1793 struct can_nocow_file_extent_args {
1794 /* Input fields. */
1795
1796 /* Start file offset of the range we want to NOCOW. */
1797 u64 start;
1798 /* End file offset (inclusive) of the range we want to NOCOW. */
1799 u64 end;
1800 bool writeback_path;
1801 /*
1802 * Free the path passed to can_nocow_file_extent() once it's not needed
1803 * anymore.
1804 */
1805 bool free_path;
1806
1807 /*
1808 * Output fields. Only set when can_nocow_file_extent() returns 1.
1809 * The expected file extent for the NOCOW write.
1810 */
1811 struct btrfs_file_extent file_extent;
1812 };
1813
1814 /*
1815 * Check if we can NOCOW the file extent that the path points to.
1816 * This function may return with the path released, so the caller should check
1817 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1818 *
1819 * Returns: < 0 on error
1820 * 0 if we can not NOCOW
1821 * 1 if we can NOCOW
1822 */
1823 static int can_nocow_file_extent(struct btrfs_path *path,
1824 struct btrfs_key *key,
1825 struct btrfs_inode *inode,
1826 struct can_nocow_file_extent_args *args)
1827 {
1828 const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1829 struct extent_buffer *leaf = path->nodes[0];
1830 struct btrfs_root *root = inode->root;
1831 struct btrfs_file_extent_item *fi;
1832 struct btrfs_root *csum_root;
1833 u64 io_start;
1834 u64 extent_end;
1835 u8 extent_type;
1836 int can_nocow = 0;
1837 int ret = 0;
1838 bool nowait = path->nowait;
1839
1840 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1841 extent_type = btrfs_file_extent_type(leaf, fi);
1842
1843 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1844 goto out;
1845
1846 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1847 extent_type == BTRFS_FILE_EXTENT_REG)
1848 goto out;
1849
1850 /*
1851 * If the extent was created before the generation where the last snapshot
1852 * for its subvolume was created, then this implies the extent is shared,
1853 * hence we must COW.
1854 */
1855 if (btrfs_file_extent_generation(leaf, fi) <=
1856 btrfs_root_last_snapshot(&root->root_item))
1857 goto out;
1858
1859 /* An explicit hole, must COW. */
1860 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
1861 goto out;
1862
1863 /* Compressed/encrypted/encoded extents must be COWed. */
1864 if (btrfs_file_extent_compression(leaf, fi) ||
1865 btrfs_file_extent_encryption(leaf, fi) ||
1866 btrfs_file_extent_other_encoding(leaf, fi))
1867 goto out;
1868
1869 extent_end = btrfs_file_extent_end(path);
1870
1871 args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1872 args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1873 args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1874 args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
1875 args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
1876
1877 /*
1878 * The following checks can be expensive, as they need to take other
1879 * locks and do btree or rbtree searches, so release the path to avoid
1880 * blocking other tasks for too long.
1881 */
1882 btrfs_release_path(path);
1883
1884 ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
1885 args->file_extent.disk_bytenr, path);
1886 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1887 if (ret != 0)
1888 goto out;
1889
1890 if (args->free_path) {
1891 /*
1892 * We don't need the path anymore, plus through the
1893 * btrfs_lookup_csums_list() call below we will end up allocating
1894 * another path. So free the path to avoid unnecessary extra
1895 * memory usage.
1896 */
1897 btrfs_free_path(path);
1898 path = NULL;
1899 }
1900
1901 /* If there are pending snapshots for this root, we must COW. */
1902 if (args->writeback_path && !is_freespace_inode &&
1903 atomic_read(&root->snapshot_force_cow))
1904 goto out;
1905
1906 args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
1907 args->file_extent.offset += args->start - key->offset;
1908 io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
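/*
 * Worked example (illustrative, not from the original source): for a file
 * extent item keyed at offset 0 with file_extent.offset == 0, a NOCOW range
 * starting 8K into that extent ends up with file_extent.offset == 8K, so
 * io_start points 8K past disk_bytenr, i.e. exactly at the on-disk bytes
 * that would be overwritten in place.
 */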
1909
1910 /*
1911 * Force COW if csums exist in the range. This ensures that csums for a
1912 * given extent are either valid or do not exist.
1913 */
1914
1915 csum_root = btrfs_csum_root(root->fs_info, io_start);
1916 ret = btrfs_lookup_csums_list(csum_root, io_start,
1917 io_start + args->file_extent.num_bytes - 1,
1918 NULL, nowait);
1919 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1920 if (ret != 0)
1921 goto out;
1922
1923 can_nocow = 1;
1924 out:
1925 if (args->free_path && path)
1926 btrfs_free_path(path);
1927
1928 return ret < 0 ? ret : can_nocow;
1929 }
1930
1931 static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
1932 struct extent_state **cached,
1933 struct can_nocow_file_extent_args *nocow_args,
1934 u64 file_pos, bool is_prealloc)
1935 {
1936 struct btrfs_ordered_extent *ordered;
1937 const u64 len = nocow_args->file_extent.num_bytes;
1938 const u64 end = file_pos + len - 1;
1939 int ret = 0;
1940
1941 btrfs_lock_extent(&inode->io_tree, file_pos, end, cached);
1942
1943 if (is_prealloc) {
1944 struct extent_map *em;
1945
1946 em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
1947 BTRFS_ORDERED_PREALLOC);
1948 if (IS_ERR(em)) {
1949 ret = PTR_ERR(em);
1950 goto error;
1951 }
1952 btrfs_free_extent_map(em);
1953 }
1954
1955 ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
1956 is_prealloc
1957 ? (1U << BTRFS_ORDERED_PREALLOC)
1958 : (1U << BTRFS_ORDERED_NOCOW));
1959 if (IS_ERR(ordered)) {
1960 if (is_prealloc)
1961 btrfs_drop_extent_map_range(inode, file_pos, end, false);
1962 ret = PTR_ERR(ordered);
1963 goto error;
1964 }
1965
1966 if (btrfs_is_data_reloc_root(inode->root))
1967 /*
1968 * Errors are handled later, as we must prevent
1969 * extent_clear_unlock_delalloc() in error handler from freeing
1970 * metadata of the created ordered extent.
1971 */
1972 ret = btrfs_reloc_clone_csums(ordered);
1973 btrfs_put_ordered_extent(ordered);
1974
1975 if (ret < 0)
1976 goto error;
1977 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
1978 EXTENT_LOCKED | EXTENT_DELALLOC |
1979 EXTENT_CLEAR_DATA_RESV,
1980 PAGE_SET_ORDERED);
1981 return ret;
1982
1983 error:
1984 btrfs_cleanup_ordered_extents(inode, file_pos, len);
1985 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
1986 EXTENT_LOCKED | EXTENT_DELALLOC |
1987 EXTENT_CLEAR_DATA_RESV,
1988 PAGE_UNLOCK | PAGE_START_WRITEBACK |
1989 PAGE_END_WRITEBACK);
1990 btrfs_err(inode->root->fs_info,
1991 "%s failed, root=%lld inode=%llu start=%llu len=%llu: %d",
1992 __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
1993 file_pos, len, ret);
1994 return ret;
1995 }
1996
1997 /*
1998 * Called for NOCOW writeback. This checks for snapshots or COW copies
1999 * of the extents that exist in the file, and COWs the file as required.
2000 *
2001 * If no COW copies or snapshots exist, we write directly to the existing
2002 * blocks on disk.
2003 */
2004 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
2005 struct folio *locked_folio,
2006 const u64 start, const u64 end)
2007 {
2008 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2009 struct btrfs_root *root = inode->root;
2010 struct btrfs_path *path;
2011 u64 cow_start = (u64)-1;
2012 /*
2013 * If not 0, represents the inclusive end of the last fallback_to_cow()
2014 * range. Only for error handling.
2015 *
2016 * The same for nocow_end, it's to avoid double cleaning up the range
2017 * already cleaned by nocow_one_range().
2018 */
2019 u64 cow_end = 0;
2020 u64 nocow_end = 0;
2021 u64 cur_offset = start;
2022 int ret;
2023 bool check_prev = true;
2024 u64 ino = btrfs_ino(inode);
2025 struct can_nocow_file_extent_args nocow_args = { 0 };
2026 /* The range that has ordered extent(s). */
2027 u64 oe_cleanup_start;
2028 u64 oe_cleanup_len = 0;
2029 /* The range that is untouched. */
2030 u64 untouched_start;
2031 u64 untouched_len = 0;
2032
2033 /*
2034 * Normally on a zoned device we only do COW writes, but relocation on a
2035 * zoned filesystem serializes I/O so that we only write sequentially and
2036 * can end up here as well.
2037 */
2038 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
2039
2040 path = btrfs_alloc_path();
2041 if (!path) {
2042 ret = -ENOMEM;
2043 goto error;
2044 }
2045
2046 nocow_args.end = end;
2047 nocow_args.writeback_path = true;
2048
2049 while (cur_offset <= end) {
2050 struct btrfs_block_group *nocow_bg = NULL;
2051 struct btrfs_key found_key;
2052 struct btrfs_file_extent_item *fi;
2053 struct extent_buffer *leaf;
2054 struct extent_state *cached_state = NULL;
2055 u64 extent_end;
2056 int extent_type;
2057
2058 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2059 cur_offset, 0);
2060 if (ret < 0)
2061 goto error;
2062
2063 /*
2064 * If there is no extent for our range when doing the initial
2065 * search, then go back to the previous slot as it will be the
2066 * one containing the search offset
2067 */
2068 if (ret > 0 && path->slots[0] > 0 && check_prev) {
2069 leaf = path->nodes[0];
2070 btrfs_item_key_to_cpu(leaf, &found_key,
2071 path->slots[0] - 1);
2072 if (found_key.objectid == ino &&
2073 found_key.type == BTRFS_EXTENT_DATA_KEY)
2074 path->slots[0]--;
2075 }
2076 check_prev = false;
2077 next_slot:
2078 /* Go to next leaf if we have exhausted the current one */
2079 leaf = path->nodes[0];
2080 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2081 ret = btrfs_next_leaf(root, path);
2082 if (ret < 0)
2083 goto error;
2084 if (ret > 0)
2085 break;
2086 leaf = path->nodes[0];
2087 }
2088
2089 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2090
2091 /* Didn't find anything for our INO */
2092 if (found_key.objectid > ino)
2093 break;
2094 /*
2095 * Keep searching until we find an EXTENT_ITEM or there are no
2096 * more extents for this inode
2097 */
2098 if (WARN_ON_ONCE(found_key.objectid < ino) ||
2099 found_key.type < BTRFS_EXTENT_DATA_KEY) {
2100 path->slots[0]++;
2101 goto next_slot;
2102 }
2103
2104 /* Found key is not EXTENT_DATA_KEY or starts after req range */
2105 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2106 found_key.offset > end)
2107 break;
2108
2109 /*
2110 * If the found extent starts after requested offset, then
2111 * adjust cur_offset to be right before this extent begins.
2112 */
2113 if (found_key.offset > cur_offset) {
2114 if (cow_start == (u64)-1)
2115 cow_start = cur_offset;
2116 cur_offset = found_key.offset;
2117 goto next_slot;
2118 }
2119
2120 /*
2121 * Found an extent which begins before our range and potentially
2122 * intersects it.
2123 */
2124 fi = btrfs_item_ptr(leaf, path->slots[0],
2125 struct btrfs_file_extent_item);
2126 extent_type = btrfs_file_extent_type(leaf, fi);
2127 /* If this is triggered then we have a memory corruption. */
2128 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2129 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2130 ret = -EUCLEAN;
2131 goto error;
2132 }
2133 extent_end = btrfs_file_extent_end(path);
2134
2135 /*
2136 * If the extent we got ends before our current offset, skip to
2137 * the next extent.
2138 */
2139 if (extent_end <= cur_offset) {
2140 path->slots[0]++;
2141 goto next_slot;
2142 }
2143
2144 nocow_args.start = cur_offset;
2145 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2146 if (ret < 0)
2147 goto error;
2148 if (ret == 0)
2149 goto must_cow;
2150
2151 ret = 0;
2152 nocow_bg = btrfs_inc_nocow_writers(fs_info,
2153 nocow_args.file_extent.disk_bytenr +
2154 nocow_args.file_extent.offset);
2155 if (!nocow_bg) {
2156 must_cow:
2157 /*
2158 * If we can't perform NOCOW writeback for the range,
2159 * then record the beginning of the range that needs to
2160 * be COWed. It will be written out before the next
2161 * NOCOW range if we find one, or when exiting this
2162 * loop.
2163 */
2164 if (cow_start == (u64)-1)
2165 cow_start = cur_offset;
2166 cur_offset = extent_end;
2167 if (cur_offset > end)
2168 break;
2169 if (!path->nodes[0])
2170 continue;
2171 path->slots[0]++;
2172 goto next_slot;
2173 }
2174
2175 /*
2176 * COW the range from cow_start to found_key.offset - 1, as the key
2177 * marks the beginning of the first extent that can be NOCOW'ed,
2178 * which follows a range that needs to be COW'ed.
2179 */
2180 if (cow_start != (u64)-1) {
2181 ret = fallback_to_cow(inode, locked_folio, cow_start,
2182 found_key.offset - 1);
2183 if (ret) {
2184 cow_end = found_key.offset - 1;
2185 btrfs_dec_nocow_writers(nocow_bg);
2186 goto error;
2187 }
2188 cow_start = (u64)-1;
2189 }
2190
2191 ret = nocow_one_range(inode, locked_folio, &cached_state,
2192 &nocow_args, cur_offset,
2193 extent_type == BTRFS_FILE_EXTENT_PREALLOC);
2194 btrfs_dec_nocow_writers(nocow_bg);
2195 if (ret < 0) {
2196 nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1;
2197 goto error;
2198 }
2199 cur_offset = extent_end;
2200 }
2201 btrfs_release_path(path);
2202
2203 if (cur_offset <= end && cow_start == (u64)-1)
2204 cow_start = cur_offset;
2205
2206 if (cow_start != (u64)-1) {
2207 ret = fallback_to_cow(inode, locked_folio, cow_start, end);
2208 if (ret) {
2209 cow_end = end;
2210 goto error;
2211 }
2212 cow_start = (u64)-1;
2213 }
2214
2215 /*
2216 * Everything is finished without an error, can unlock the folios now.
2217 *
2218 * No need to touch the io tree range nor set folio ordered flag, as
2219 * fallback_to_cow() and nocow_one_range() have already handled them.
2220 */
2221 extent_clear_unlock_delalloc(inode, start, end, locked_folio, NULL, 0, PAGE_UNLOCK);
2222
2223 btrfs_free_path(path);
2224 return 0;
2225
2226 error:
2227 if (cow_start == (u64)-1) {
2228 /*
2229 * case a)
2230 * start cur_offset end
2231 * | OE cleanup | Untouched |
2232 *
2233 * We finished a fallback_to_cow() or nocow_one_range() call,
2234 * but failed to check the next range.
2235 *
2236 * or
2237 * start cur_offset nocow_end end
2238 * | OE cleanup | Skip | Untouched |
2239 *
2240 * nocow_one_range() failed, the range [cur_offset, nocow_end] is
2241 * already cleaned up.
2242 */
2243 oe_cleanup_start = start;
2244 oe_cleanup_len = cur_offset - start;
2245 if (nocow_end)
2246 untouched_start = nocow_end + 1;
2247 else
2248 untouched_start = cur_offset;
2249 untouched_len = end + 1 - untouched_start;
2250 } else if (cow_start != (u64)-1 && cow_end == 0) {
2251 /*
2252 * case b)
2253 * start cow_start cur_offset end
2254 * | OE cleanup | Untouched |
2255 *
2256 * We got a range that needs COW, but failed before reaching the next
2257 * NOCOW range, thus [cow_start, cur_offset) doesn't yet have any OE.
2258 */
2259 oe_cleanup_start = start;
2260 oe_cleanup_len = cow_start - start;
2261 untouched_start = cow_start;
2262 untouched_len = end + 1 - untouched_start;
2263 } else {
2264 /*
2265 * case c)
2266 * start cow_start cow_end end
2267 * | OE cleanup | Skip | Untouched |
2268 *
2269 * fallback_to_cow() failed, and fallback_to_cow() will do the
2270 * cleanup for its range, we shouldn't touch the range
2271 * [cow_start, cow_end].
2272 */
2273 ASSERT(cow_start != (u64)-1 && cow_end != 0);
2274 oe_cleanup_start = start;
2275 oe_cleanup_len = cow_start - start;
2276 untouched_start = cow_end + 1;
2277 untouched_len = end + 1 - untouched_start;
2278 }
2279
2280 if (oe_cleanup_len) {
2281 const u64 oe_cleanup_end = oe_cleanup_start + oe_cleanup_len - 1;
2282 btrfs_cleanup_ordered_extents(inode, oe_cleanup_start, oe_cleanup_len);
2283 extent_clear_unlock_delalloc(inode, oe_cleanup_start, oe_cleanup_end,
2284 locked_folio, NULL,
2285 EXTENT_LOCKED | EXTENT_DELALLOC,
2286 PAGE_UNLOCK | PAGE_START_WRITEBACK |
2287 PAGE_END_WRITEBACK);
2288 }
2289
2290 if (untouched_len) {
2291 struct extent_state *cached = NULL;
2292 const u64 untouched_end = untouched_start + untouched_len - 1;
2293
2294 /*
2295 * We need to lock the extent here because we're clearing DELALLOC and
2296 * we're not locked at this point.
2297 */
2298 btrfs_lock_extent(&inode->io_tree, untouched_start, untouched_end, &cached);
2299 extent_clear_unlock_delalloc(inode, untouched_start, untouched_end,
2300 locked_folio, &cached,
2301 EXTENT_LOCKED | EXTENT_DELALLOC |
2302 EXTENT_DEFRAG |
2303 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2304 PAGE_START_WRITEBACK |
2305 PAGE_END_WRITEBACK);
2306 btrfs_qgroup_free_data(inode, NULL, untouched_start, untouched_len, NULL);
2307 }
2308 btrfs_free_path(path);
2309 btrfs_err(fs_info,
2310 "%s failed, root=%llu inode=%llu start=%llu len=%llu cur_offset=%llu oe_cleanup=%llu oe_cleanup_len=%llu untouched_start=%llu untouched_len=%llu: %d",
2311 __func__, btrfs_root_id(inode->root), btrfs_ino(inode),
2312 start, end + 1 - start, cur_offset, oe_cleanup_start, oe_cleanup_len,
2313 untouched_start, untouched_len, ret);
2314 return ret;
2315 }
2316
2317 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2318 {
2319 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2320 if (inode->defrag_bytes &&
2321 btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
2322 return false;
2323 return true;
2324 }
2325 return false;
2326 }
2327
2328 /*
2329 * Function to process delayed allocation (create CoW) for ranges which are
2330 * being touched for the first time.
2331 */
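/*
 * Added commentary summarizing the dispatch below: NOCOW-capable ranges go
 * through run_delalloc_nocow(), compressible ranges are handed to the async
 * compression machinery, and everything else is plain COW, using the
 * locked-range variant on zoned filesystems.
 */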
2332 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
2333 u64 start, u64 end, struct writeback_control *wbc)
2334 {
2335 const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2336 int ret;
2337
2338 /*
2339 * The range must cover part of the @locked_folio, or a return of 1
2340 * can confuse the caller.
2341 */
2342 ASSERT(!(end <= folio_pos(locked_folio) ||
2343 start >= folio_next_pos(locked_folio)));
2344
2345 if (should_nocow(inode, start, end)) {
2346 ret = run_delalloc_nocow(inode, locked_folio, start, end);
2347 return ret;
2348 }
2349
2350 if (btrfs_inode_can_compress(inode) &&
2351 inode_need_compress(inode, start, end) &&
2352 run_delalloc_compressed(inode, locked_folio, start, end, wbc))
2353 return 1;
2354
2355 if (zoned)
2356 ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
2357 true);
2358 else
2359 ret = cow_file_range(inode, locked_folio, start, end, NULL, 0);
2360 return ret;
2361 }
2362
2363 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2364 struct extent_state *orig, u64 split)
2365 {
2366 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2367 u64 size;
2368
2369 lockdep_assert_held(&inode->io_tree.lock);
2370
2371 /* not delalloc, ignore it */
2372 if (!(orig->state & EXTENT_DELALLOC))
2373 return;
2374
2375 size = orig->end - orig->start + 1;
2376 if (size > fs_info->max_extent_size) {
2377 u32 num_extents;
2378 u64 new_size;
2379
2380 /*
2381 * See the explanation in btrfs_merge_delalloc_extent, the same
2382 * applies here, just in reverse.
2383 */
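/*
 * Illustrative case (assuming a max_extent_size of 128M): a 256M delalloc
 * extent accounts for two outstanding extents; split into 128M + 4K and
 * 128M - 4K pieces it needs 2 + 1 = 3, so one extra outstanding extent is
 * added below. A split whose pieces still add up to no more than the
 * original count returns early instead.
 */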
2384 new_size = orig->end - split + 1;
2385 num_extents = count_max_extents(fs_info, new_size);
2386 new_size = split - orig->start;
2387 num_extents += count_max_extents(fs_info, new_size);
2388 if (count_max_extents(fs_info, size) >= num_extents)
2389 return;
2390 }
2391
2392 spin_lock(&inode->lock);
2393 btrfs_mod_outstanding_extents(inode, 1);
2394 spin_unlock(&inode->lock);
2395 }
2396
2397 /*
2398 * Handle merged delayed allocation extents so we can keep track of new extents
2399 * that are just merged onto old extents, such as when we are doing sequential
2400 * writes, so we can properly account for the metadata space we'll need.
2401 */
2402 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2403 struct extent_state *other)
2404 {
2405 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2406 u64 new_size, old_size;
2407 u32 num_extents;
2408
2409 lockdep_assert_held(&inode->io_tree.lock);
2410
2411 /* not delalloc, ignore it */
2412 if (!(other->state & EXTENT_DELALLOC))
2413 return;
2414
2415 if (new->start > other->start)
2416 new_size = new->end - other->start + 1;
2417 else
2418 new_size = other->end - new->start + 1;
2419
2420 /* we're not bigger than the max, unreserve the space and go */
2421 if (new_size <= fs_info->max_extent_size) {
2422 spin_lock(&inode->lock);
2423 btrfs_mod_outstanding_extents(inode, -1);
2424 spin_unlock(&inode->lock);
2425 return;
2426 }
2427
2428 /*
2429 * We have to add up either side to figure out how many extents were
2430 * accounted for before we merged into one big extent. If the number of
2431 * extents we accounted for is <= the amount we need for the new range
2432 * then we can return, otherwise drop. Think of it like this
2433 *
2434 * [ 4k][MAX_SIZE]
2435 *
2436 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2437 * need 2 outstanding extents, on one side we have 1 and the other side
2438 * we have 1 so they are == and we can return. But in this case
2439 *
2440 * [MAX_SIZE+4k][MAX_SIZE+4k]
2441 *
2442 * Each range on their own accounts for 2 extents, but merged together
2443 * they are only 3 extents worth of accounting, so we need to drop in
2444 * this case.
2445 */
2446 old_size = other->end - other->start + 1;
2447 num_extents = count_max_extents(fs_info, old_size);
2448 old_size = new->end - new->start + 1;
2449 num_extents += count_max_extents(fs_info, old_size);
2450 if (count_max_extents(fs_info, new_size) >= num_extents)
2451 return;
2452
2453 spin_lock(&inode->lock);
2454 btrfs_mod_outstanding_extents(inode, -1);
2455 spin_unlock(&inode->lock);
2456 }
2457
2458 static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
2459 {
2460 struct btrfs_root *root = inode->root;
2461 struct btrfs_fs_info *fs_info = root->fs_info;
2462
2463 spin_lock(&root->delalloc_lock);
2464 ASSERT(list_empty(&inode->delalloc_inodes));
2465 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2466 root->nr_delalloc_inodes++;
2467 if (root->nr_delalloc_inodes == 1) {
2468 spin_lock(&fs_info->delalloc_root_lock);
2469 ASSERT(list_empty(&root->delalloc_root));
2470 list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
2471 spin_unlock(&fs_info->delalloc_root_lock);
2472 }
2473 spin_unlock(&root->delalloc_lock);
2474 }
2475
2476 void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
2477 {
2478 struct btrfs_root *root = inode->root;
2479 struct btrfs_fs_info *fs_info = root->fs_info;
2480
2481 lockdep_assert_held(&root->delalloc_lock);
2482
2483 /*
2484 * We may be called after the inode was already deleted from the list,
2485 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
2486 * and then later through btrfs_clear_delalloc_extent() while the inode
2487 * still has ->delalloc_bytes > 0.
2488 */
2489 if (!list_empty(&inode->delalloc_inodes)) {
2490 list_del_init(&inode->delalloc_inodes);
2491 root->nr_delalloc_inodes--;
2492 if (!root->nr_delalloc_inodes) {
2493 ASSERT(list_empty(&root->delalloc_inodes));
2494 spin_lock(&fs_info->delalloc_root_lock);
2495 ASSERT(!list_empty(&root->delalloc_root));
2496 list_del_init(&root->delalloc_root);
2497 spin_unlock(&fs_info->delalloc_root_lock);
2498 }
2499 }
2500 }
2501
2502 /*
2503 * Properly track delayed allocation bytes in the inode and maintain the
2504 * list of inodes that have pending delalloc work to be done.
2505 */
2506 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2507 u32 bits)
2508 {
2509 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2510
2511 lockdep_assert_held(&inode->io_tree.lock);
2512
2513 if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2514 WARN_ON(1);
2515 /*
2516 * set_bit and clear bit hooks normally require _irqsave/restore
2517 * but in this case, we are only testing for the DELALLOC
2518 * bit, which is only set or cleared with irqs on
2519 */
2520 if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2521 u64 len = state->end + 1 - state->start;
2522 u64 prev_delalloc_bytes;
2523 u32 num_extents = count_max_extents(fs_info, len);
2524
2525 spin_lock(&inode->lock);
2526 btrfs_mod_outstanding_extents(inode, num_extents);
2527 spin_unlock(&inode->lock);
2528
2529 /* For sanity tests */
2530 if (btrfs_is_testing(fs_info))
2531 return;
2532
2533 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2534 fs_info->delalloc_batch);
2535 spin_lock(&inode->lock);
2536 prev_delalloc_bytes = inode->delalloc_bytes;
2537 inode->delalloc_bytes += len;
2538 if (bits & EXTENT_DEFRAG)
2539 inode->defrag_bytes += len;
2540 spin_unlock(&inode->lock);
2541
2542 /*
2543 * We don't need to be under the protection of the inode's lock,
2544 * because we are called while holding the inode's io_tree lock
2545 * and are therefore protected against concurrent calls of this
2546 * function and btrfs_clear_delalloc_extent().
2547 */
2548 if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
2549 btrfs_add_delalloc_inode(inode);
2550 }
2551
2552 if (!(state->state & EXTENT_DELALLOC_NEW) &&
2553 (bits & EXTENT_DELALLOC_NEW)) {
2554 spin_lock(&inode->lock);
2555 inode->new_delalloc_bytes += state->end + 1 - state->start;
2556 spin_unlock(&inode->lock);
2557 }
2558 }
2559
2560 /*
2561 * Once a range is no longer delalloc this function ensures that proper
2562 * accounting happens.
2563 */
2564 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2565 struct extent_state *state, u32 bits)
2566 {
2567 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2568 u64 len = state->end + 1 - state->start;
2569 u32 num_extents = count_max_extents(fs_info, len);
2570
2571 lockdep_assert_held(&inode->io_tree.lock);
2572
2573 if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2574 spin_lock(&inode->lock);
2575 inode->defrag_bytes -= len;
2576 spin_unlock(&inode->lock);
2577 }
2578
2579 /*
2580 * set_bit and clear bit hooks normally require _irqsave/restore
2581 * but in this case, we are only testing for the DELALLOC
2582 * bit, which is only set or cleared with irqs on
2583 */
2584 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2585 struct btrfs_root *root = inode->root;
2586 u64 new_delalloc_bytes;
2587
2588 spin_lock(&inode->lock);
2589 btrfs_mod_outstanding_extents(inode, -num_extents);
2590 spin_unlock(&inode->lock);
2591
2592 /*
2593 * We don't reserve metadata space for space cache inodes so we
2594 * don't need to call delalloc_release_metadata if there is an
2595 * error.
2596 */
2597 if (bits & EXTENT_CLEAR_META_RESV &&
2598 root != fs_info->tree_root)
2599 btrfs_delalloc_release_metadata(inode, len, true);
2600
2601 /* For sanity tests. */
2602 if (btrfs_is_testing(fs_info))
2603 return;
2604
2605 if (!btrfs_is_data_reloc_root(root) &&
2606 !btrfs_is_free_space_inode(inode) &&
2607 !(state->state & EXTENT_NORESERVE) &&
2608 (bits & EXTENT_CLEAR_DATA_RESV))
2609 btrfs_free_reserved_data_space_noquota(inode, len);
2610
2611 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2612 fs_info->delalloc_batch);
2613 spin_lock(&inode->lock);
2614 inode->delalloc_bytes -= len;
2615 new_delalloc_bytes = inode->delalloc_bytes;
2616 spin_unlock(&inode->lock);
2617
2618 /*
2619 * We don't need to be under the protection of the inode's lock,
2620 * because we are called while holding the inode's io_tree lock
2621 * and are therefore protected against concurrent calls of this
2622 * function and btrfs_set_delalloc_extent().
2623 */
2624 if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
2625 spin_lock(&root->delalloc_lock);
2626 btrfs_del_delalloc_inode(inode);
2627 spin_unlock(&root->delalloc_lock);
2628 }
2629 }
2630
2631 if ((state->state & EXTENT_DELALLOC_NEW) &&
2632 (bits & EXTENT_DELALLOC_NEW)) {
2633 spin_lock(&inode->lock);
2634 ASSERT(inode->new_delalloc_bytes >= len);
2635 inode->new_delalloc_bytes -= len;
2636 if (bits & EXTENT_ADD_INODE_BYTES)
2637 inode_add_bytes(&inode->vfs_inode, len);
2638 spin_unlock(&inode->lock);
2639 }
2640 }
2641
2642 /*
2643 * Given a list of ordered sums, record them in the inode. This happens
2644 * at IO completion time based on sums calculated at bio submission time.
2645 */
2646 static int add_pending_csums(struct btrfs_trans_handle *trans,
2647 struct list_head *list)
2648 {
2649 struct btrfs_ordered_sum *sum;
2650 struct btrfs_root *csum_root = NULL;
2651 int ret;
2652
2653 list_for_each_entry(sum, list, list) {
2654 trans->adding_csums = true;
2655 if (!csum_root)
2656 csum_root = btrfs_csum_root(trans->fs_info,
2657 sum->logical);
2658 ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2659 trans->adding_csums = false;
2660 if (ret)
2661 return ret;
2662 }
2663 return 0;
2664 }
2665
2666 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2667 const u64 start,
2668 const u64 len,
2669 struct extent_state **cached_state)
2670 {
2671 u64 search_start = start;
2672 const u64 end = start + len - 1;
2673
2674 while (search_start < end) {
2675 const u64 search_len = end - search_start + 1;
2676 struct extent_map *em;
2677 u64 em_len;
2678 int ret = 0;
2679
2680 em = btrfs_get_extent(inode, NULL, search_start, search_len);
2681 if (IS_ERR(em))
2682 return PTR_ERR(em);
2683
2684 if (em->disk_bytenr != EXTENT_MAP_HOLE)
2685 goto next;
2686
2687 em_len = em->len;
2688 if (em->start < search_start)
2689 em_len -= search_start - em->start;
2690 if (em_len > search_len)
2691 em_len = search_len;
2692
2693 ret = btrfs_set_extent_bit(&inode->io_tree, search_start,
2694 search_start + em_len - 1,
2695 EXTENT_DELALLOC_NEW, cached_state);
2696 next:
2697 search_start = btrfs_extent_map_end(em);
2698 btrfs_free_extent_map(em);
2699 if (ret)
2700 return ret;
2701 }
2702 return 0;
2703 }
2704
2705 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2706 unsigned int extra_bits,
2707 struct extent_state **cached_state)
2708 {
2709 WARN_ON(PAGE_ALIGNED(end));
2710
2711 if (start >= i_size_read(&inode->vfs_inode) &&
2712 !(inode->flags & BTRFS_INODE_PREALLOC)) {
2713 /*
2714 * There can't be any extents following eof in this case so just
2715 * set the delalloc new bit for the range directly.
2716 */
2717 extra_bits |= EXTENT_DELALLOC_NEW;
2718 } else {
2719 int ret;
2720
2721 ret = btrfs_find_new_delalloc_bytes(inode, start,
2722 end + 1 - start,
2723 cached_state);
2724 if (ret)
2725 return ret;
2726 }
2727
2728 return btrfs_set_extent_bit(&inode->io_tree, start, end,
2729 EXTENT_DELALLOC | extra_bits, cached_state);
2730 }
2731
2732 /* See btrfs_writepage_cow_fixup() for details on why this is required. */
2733 struct btrfs_writepage_fixup {
2734 struct folio *folio;
2735 struct btrfs_inode *inode;
2736 struct btrfs_work work;
2737 };
2738
2739 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2740 {
2741 struct btrfs_writepage_fixup *fixup =
2742 container_of(work, struct btrfs_writepage_fixup, work);
2743 struct btrfs_ordered_extent *ordered;
2744 struct extent_state *cached_state = NULL;
2745 struct extent_changeset *data_reserved = NULL;
2746 struct folio *folio = fixup->folio;
2747 struct btrfs_inode *inode = fixup->inode;
2748 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2749 u64 page_start = folio_pos(folio);
2750 u64 page_end = folio_next_pos(folio) - 1;
2751 int ret = 0;
2752 bool free_delalloc_space = true;
2753
2754 /*
2755 * This is similar to page_mkwrite; we need to reserve the space before
2756 * we take the folio lock.
2757 */
2758 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2759 folio_size(folio));
2760 again:
2761 folio_lock(folio);
2762
2763 /*
2764 * Before we queued this fixup, we took a reference on the folio.
2765 * folio->mapping may go NULL, but it shouldn't be moved to a different
2766 * address space.
2767 */
2768 if (!folio->mapping || !folio_test_dirty(folio) ||
2769 !folio_test_checked(folio)) {
2770 /*
2771 * Unfortunately this is a little tricky, either
2772 *
2773 * 1) We got here and our folio had already been dealt with and
2774 * we reserved our space, thus ret == 0, so we need to just
2775 * drop our space reservation and bail. This can happen the
2776 * first time we come into the fixup worker, or could happen
2777 * while waiting for the ordered extent.
2778 * 2) Our folio was already dealt with, but we happened to get an
2779 * ENOSPC above from the btrfs_delalloc_reserve_space. In
2780 * this case we obviously don't have anything to release, but
2781 * because the folio was already dealt with we don't want to
2782 * mark the folio with an error, so make sure we're resetting
2783 * ret to 0. This is why we have this check _before_ the ret
2784 * check, because we do not want to have a surprise ENOSPC
2785 * when the folio was already properly dealt with.
2786 */
2787 if (!ret) {
2788 btrfs_delalloc_release_extents(inode, folio_size(folio));
2789 btrfs_delalloc_release_space(inode, data_reserved,
2790 page_start, folio_size(folio),
2791 true);
2792 }
2793 ret = 0;
2794 goto out_page;
2795 }
2796
2797 /*
2798 * We can't mess with the folio state unless it is locked, so now that
2799 * it is locked bail if we failed to make our space reservation.
2800 */
2801 if (ret)
2802 goto out_page;
2803
2804 btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2805
2806 /* already ordered? We're done */
2807 if (folio_test_ordered(folio))
2808 goto out_reserved;
2809
2810 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2811 if (ordered) {
2812 btrfs_unlock_extent(&inode->io_tree, page_start, page_end,
2813 &cached_state);
2814 folio_unlock(folio);
2815 btrfs_start_ordered_extent(ordered);
2816 btrfs_put_ordered_extent(ordered);
2817 goto again;
2818 }
2819
2820 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2821 &cached_state);
2822 if (ret)
2823 goto out_reserved;
2824
2825 /*
2826 * Everything went as planned, we're now the owner of a dirty page with
2827 * delayed allocation bits set and space reserved for our COW
2828 * destination.
2829 *
2830 * The page was dirty when we started, nothing should have cleaned it.
2831 */
2832 BUG_ON(!folio_test_dirty(folio));
2833 free_delalloc_space = false;
2834 out_reserved:
2835 btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2836 if (free_delalloc_space)
2837 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2838 PAGE_SIZE, true);
2839 btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2840 out_page:
2841 if (ret) {
2842 /*
2843 * We hit ENOSPC or other errors. Update the mapping and page
2844 * to reflect the errors and clean the page.
2845 */
2846 mapping_set_error(folio->mapping, ret);
2847 btrfs_mark_ordered_io_finished(inode, folio, page_start,
2848 folio_size(folio), !ret);
2849 folio_clear_dirty_for_io(folio);
2850 }
2851 btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
2852 folio_unlock(folio);
2853 folio_put(folio);
2854 kfree(fixup);
2855 extent_changeset_free(data_reserved);
2856 /*
2857 * As a precaution, do a delayed iput in case it would be the last iput
2858 * that could need flushing space. Recursing back to fixup worker would
2859 * deadlock.
2860 */
2861 btrfs_add_delayed_iput(inode);
2862 }
2863
2864 /*
2865 * There are a few paths in the higher layers of the kernel that directly
2866 * set the folio dirty bit without asking the filesystem if it is a
2867 * good idea. This causes problems because we want to make sure COW
2868 * properly happens and the data=ordered rules are followed.
2869 *
2870 * In our case any range that doesn't have the ORDERED bit set
2871 * hasn't been properly setup for IO. We kick off an async process
2872 * to fix it up. The async helper will wait for ordered extents, set
2873 * the delalloc bit and make it safe to write the folio.
2874 */
2875 int btrfs_writepage_cow_fixup(struct folio *folio)
2876 {
2877 struct inode *inode = folio->mapping->host;
2878 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2879 struct btrfs_writepage_fixup *fixup;
2880
2881 /* This folio has ordered extent covering it already */
2882 if (folio_test_ordered(folio))
2883 return 0;
2884
2885 /*
2886 * For experimental build, we error out instead of EAGAIN.
2887 *
2888 * We should not hit such out-of-band dirty folios anymore.
2889 */
2890 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
2891 DEBUG_WARN();
2892 btrfs_err_rl(fs_info,
2893 "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
2894 btrfs_root_id(BTRFS_I(inode)->root),
2895 btrfs_ino(BTRFS_I(inode)),
2896 folio_pos(folio));
2897 return -EUCLEAN;
2898 }
2899
2900 /*
2901 * folio_checked is set below when we create a fixup worker for this
2902 * folio; don't try to create another one if we're already
2903 * folio_test_checked.
2904 *
2905 * The extent_io writepage code will redirty the folio if we send back
2906 * EAGAIN.
2907 */
2908 if (folio_test_checked(folio))
2909 return -EAGAIN;
2910
2911 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2912 if (!fixup)
2913 return -EAGAIN;
2914
2915 /*
2916 * We are already holding a reference to this inode from
2917 * write_cache_pages. We need to hold it because the space reservation
2918 * takes place outside of the folio lock, and we can't trust
2919 * folio->mapping outside of the folio lock.
2920 */
2921 ihold(inode);
2922 btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
2923 folio_get(folio);
2924 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
2925 fixup->folio = folio;
2926 fixup->inode = BTRFS_I(inode);
2927 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2928
2929 return -EAGAIN;
2930 }
2931
2932 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2933 struct btrfs_inode *inode, u64 file_pos,
2934 struct btrfs_file_extent_item *stack_fi,
2935 const bool update_inode_bytes,
2936 u64 qgroup_reserved)
2937 {
2938 struct btrfs_root *root = inode->root;
2939 const u64 sectorsize = root->fs_info->sectorsize;
2940 BTRFS_PATH_AUTO_FREE(path);
2941 struct extent_buffer *leaf;
2942 struct btrfs_key ins;
2943 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2944 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2945 u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2946 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2947 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2948 struct btrfs_drop_extents_args drop_args = { 0 };
2949 int ret;
2950
2951 path = btrfs_alloc_path();
2952 if (!path)
2953 return -ENOMEM;
2954
2955 /*
2956 * we may be replacing one extent in the tree with another.
2957 * The new extent is pinned in the extent map, and we don't want
2958 * to drop it from the cache until it is completely in the btree.
2959 *
2960 * So, tell btrfs_drop_extents to leave this extent in the cache.
2961 * The caller is expected to unpin it and allow it to be merged
2962 * with the others.
2963 */
2964 drop_args.path = path;
2965 drop_args.start = file_pos;
2966 drop_args.end = file_pos + num_bytes;
2967 drop_args.replace_extent = true;
2968 drop_args.extent_item_size = sizeof(*stack_fi);
2969 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2970 if (ret)
2971 goto out;
2972
2973 if (!drop_args.extent_inserted) {
2974 ins.objectid = btrfs_ino(inode);
2975 ins.type = BTRFS_EXTENT_DATA_KEY;
2976 ins.offset = file_pos;
2977
2978 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2979 sizeof(*stack_fi));
2980 if (ret)
2981 goto out;
2982 }
2983 leaf = path->nodes[0];
2984 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2985 write_extent_buffer(leaf, stack_fi,
2986 btrfs_item_ptr_offset(leaf, path->slots[0]),
2987 sizeof(struct btrfs_file_extent_item));
2988
2989 btrfs_release_path(path);
2990
2991 /*
2992 * If we dropped an inline extent here, we know the range where it is
2993 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2994 * number of bytes only for that range containing the inline extent.
2995 * The remainder of the range will be processed when clearing the
2996 * EXTENT_DELALLOC bit through the ordered extent completion.
2997 */
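/*
 * Worked example (illustrative, assuming a 4K sectorsize): if dropping the
 * old extents found a 500 byte inline extent, round_down(500, 4K) is 0, so
 * inline_size becomes 500; one full sector is added to and the 500 inline
 * bytes are removed from the inode's byte count, bytes_found drops to 0 and
 * num_bytes shrinks by one sector so that sector isn't counted again below.
 */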
2998 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2999 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
3000
3001 inline_size = drop_args.bytes_found - inline_size;
3002 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
3003 drop_args.bytes_found -= inline_size;
3004 num_bytes -= sectorsize;
3005 }
3006
3007 if (update_inode_bytes)
3008 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
3009
3010 ins.objectid = disk_bytenr;
3011 ins.type = BTRFS_EXTENT_ITEM_KEY;
3012 ins.offset = disk_num_bytes;
3013
3014 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
3015 if (ret)
3016 goto out;
3017
3018 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
3019 file_pos - offset,
3020 qgroup_reserved, &ins);
3021 out:
3022 return ret;
3023 }
3024
3025 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
3026 u64 start, u64 len)
3027 {
3028 struct btrfs_block_group *cache;
3029
3030 cache = btrfs_lookup_block_group(fs_info, start);
3031 ASSERT(cache);
3032
3033 spin_lock(&cache->lock);
3034 cache->delalloc_bytes -= len;
3035 spin_unlock(&cache->lock);
3036
3037 btrfs_put_block_group(cache);
3038 }
3039
3040 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
3041 struct btrfs_ordered_extent *oe)
3042 {
3043 struct btrfs_file_extent_item stack_fi;
3044 bool update_inode_bytes;
3045 u64 num_bytes = oe->num_bytes;
3046 u64 ram_bytes = oe->ram_bytes;
3047
3048 memset(&stack_fi, 0, sizeof(stack_fi));
3049 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
3050 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
3051 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
3052 oe->disk_num_bytes);
3053 btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
3054 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
3055 num_bytes = oe->truncated_len;
3056 btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
3057 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
3058 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
3059 /* Encryption and other encoding is reserved and all 0 */
3060
3061 /*
3062 * For delalloc, when completing an ordered extent we update the inode's
3063 * bytes when clearing the range in the inode's io tree, so pass false
3064 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3065 * except if the ordered extent was truncated.
3066 */
3067 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3068 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3069 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3070
3071 return insert_reserved_file_extent(trans, oe->inode,
3072 oe->file_offset, &stack_fi,
3073 update_inode_bytes, oe->qgroup_rsv);
3074 }
3075
3076 /*
3077 * As ordered data IO finishes, this gets called so we can finish
3078 * an ordered extent if the range of bytes in the file it covers is
3079 * fully written.
3080 */
3081 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3082 {
3083 struct btrfs_inode *inode = ordered_extent->inode;
3084 struct btrfs_root *root = inode->root;
3085 struct btrfs_fs_info *fs_info = root->fs_info;
3086 struct btrfs_trans_handle *trans = NULL;
3087 struct extent_io_tree *io_tree = &inode->io_tree;
3088 struct extent_state *cached_state = NULL;
3089 u64 start, end;
3090 int compress_type = 0;
3091 int ret = 0;
3092 u64 logical_len = ordered_extent->num_bytes;
3093 bool freespace_inode;
3094 bool truncated = false;
3095 bool clear_reserved_extent = true;
3096 unsigned int clear_bits = EXTENT_DEFRAG;
3097
3098 start = ordered_extent->file_offset;
3099 end = start + ordered_extent->num_bytes - 1;
3100
3101 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3102 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3103 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3104 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3105 clear_bits |= EXTENT_DELALLOC_NEW;
3106
3107 freespace_inode = btrfs_is_free_space_inode(inode);
3108 if (!freespace_inode)
3109 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3110
3111 if (unlikely(test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags))) {
3112 ret = -EIO;
3113 goto out;
3114 }
3115
3116 ret = btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3117 ordered_extent->disk_num_bytes);
3118 if (ret)
3119 goto out;
3120
3121 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3122 truncated = true;
3123 logical_len = ordered_extent->truncated_len;
3124 /* Truncated the entire extent, don't bother adding */
3125 if (!logical_len)
3126 goto out;
3127 }
3128
3129 /*
3130 * If it's a COW write we need to lock the extent range as we will be
3131 * inserting/replacing file extent items and unpinning an extent map.
3132 * This must be taken before joining a transaction, as it's a higher
3133 * level lock (like the inode's VFS lock), otherwise we can run into an
3134 * ABBA deadlock with other tasks (transactions work like a lock,
3135 * depending on their current state).
3136 */
3137 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3138 clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED;
3139 btrfs_lock_extent_bits(io_tree, start, end,
3140 EXTENT_LOCKED | EXTENT_FINISHING_ORDERED,
3141 &cached_state);
3142 }
3143
3144 if (freespace_inode)
3145 trans = btrfs_join_transaction_spacecache(root);
3146 else
3147 trans = btrfs_join_transaction(root);
3148 if (IS_ERR(trans)) {
3149 ret = PTR_ERR(trans);
3150 trans = NULL;
3151 goto out;
3152 }
3153
3154 trans->block_rsv = &inode->block_rsv;
3155
3156 ret = btrfs_insert_raid_extent(trans, ordered_extent);
3157 if (unlikely(ret)) {
3158 btrfs_abort_transaction(trans, ret);
3159 goto out;
3160 }
3161
3162 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3163 /* Logic error */
3164 ASSERT(list_empty(&ordered_extent->list));
3165 if (unlikely(!list_empty(&ordered_extent->list))) {
3166 ret = -EINVAL;
3167 btrfs_abort_transaction(trans, ret);
3168 goto out;
3169 }
3170
3171 btrfs_inode_safe_disk_i_size_write(inode, 0);
3172 ret = btrfs_update_inode_fallback(trans, inode);
3173 if (unlikely(ret)) {
3174 /* -ENOMEM or corruption */
3175 btrfs_abort_transaction(trans, ret);
3176 }
3177 goto out;
3178 }
3179
3180 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3181 compress_type = ordered_extent->compress_type;
3182 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3183 BUG_ON(compress_type);
3184 ret = btrfs_mark_extent_written(trans, inode,
3185 ordered_extent->file_offset,
3186 ordered_extent->file_offset +
3187 logical_len);
3188 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3189 ordered_extent->disk_num_bytes);
3190 } else {
3191 BUG_ON(root == fs_info->tree_root);
3192 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3193 if (!ret) {
3194 clear_reserved_extent = false;
3195 btrfs_release_delalloc_bytes(fs_info,
3196 ordered_extent->disk_bytenr,
3197 ordered_extent->disk_num_bytes);
3198 }
3199 }
3200 if (unlikely(ret < 0)) {
3201 btrfs_abort_transaction(trans, ret);
3202 goto out;
3203 }
3204
3205 ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset,
3206 ordered_extent->num_bytes, trans->transid);
3207 if (unlikely(ret < 0)) {
3208 btrfs_abort_transaction(trans, ret);
3209 goto out;
3210 }
3211
3212 ret = add_pending_csums(trans, &ordered_extent->list);
3213 if (unlikely(ret)) {
3214 btrfs_abort_transaction(trans, ret);
3215 goto out;
3216 }
3217
3218 /*
3219 * If this is a new delalloc range, clear its new delalloc flag to
3220 * update the inode's number of bytes. This needs to be done first
3221 * before updating the inode item.
3222 */
3223 if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3224 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3225 btrfs_clear_extent_bit(&inode->io_tree, start, end,
3226 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3227 &cached_state);
3228
3229 btrfs_inode_safe_disk_i_size_write(inode, 0);
3230 ret = btrfs_update_inode_fallback(trans, inode);
3231 if (unlikely(ret)) { /* -ENOMEM or corruption */
3232 btrfs_abort_transaction(trans, ret);
3233 goto out;
3234 }
3235 out:
3236 btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3237 &cached_state);
3238
3239 if (trans)
3240 btrfs_end_transaction(trans);
3241
3242 if (ret || truncated) {
3243 /*
3244 * If we failed to finish this ordered extent for any reason we
3245 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3246 * extent, and mark the inode with the error if it wasn't
3247 * already set. Any error during writeback would have already
3248 * set the mapping error, so we need to set it if we're the ones
3249 * marking this ordered extent as failed.
3250 */
3251 if (ret)
3252 btrfs_mark_ordered_extent_error(ordered_extent);
3253
3254 /*
3255 * Drop extent maps for the part of the extent we didn't write.
3256 *
3257 * We have an exception here for the free_space_inode, this is
3258 * because when we do btrfs_get_extent() on the free space inode
3259 * we will search the commit root. If this is a new block group
3260 * we won't find anything, and we will trip over the assert in
3261 * writepage where we do ASSERT(em->block_start !=
3262 * EXTENT_MAP_HOLE).
3263 *
3264 * Theoretically we could also skip this for any NOCOW extent as
3265 * we don't mess with the extent map tree in the NOCOW case, but
3266 * for now simply skip this if we are the free space inode.
3267 */
3268 if (!btrfs_is_free_space_inode(inode)) {
3269 u64 unwritten_start = start;
3270
3271 if (truncated)
3272 unwritten_start += logical_len;
3273
3274 btrfs_drop_extent_map_range(inode, unwritten_start,
3275 end, false);
3276 }
3277
3278 /*
3279 * If the ordered extent had an IOERR or something else went
3280 * wrong we need to return the space for this ordered extent
3281 * back to the allocator. We only free the extent in the
3282 * truncated case if we didn't write out the extent at all.
3283 *
3284 * If we made it past insert_reserved_file_extent before we
3285 * errored out then we don't need to do this as the accounting
3286 * has already been done.
3287 */
3288 if ((ret || !logical_len) &&
3289 clear_reserved_extent &&
3290 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3291 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3292 /*
3293 * Discard the range before returning it back to the
3294 * free space pool
3295 */
3296 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3297 btrfs_discard_extent(fs_info,
3298 ordered_extent->disk_bytenr,
3299 ordered_extent->disk_num_bytes,
3300 NULL);
3301 btrfs_free_reserved_extent(fs_info,
3302 ordered_extent->disk_bytenr,
3303 ordered_extent->disk_num_bytes, true);
3304 /*
3305 * Actually free the qgroup rsv which was released when
3306 * the ordered extent was created.
3307 */
3308 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
3309 ordered_extent->qgroup_rsv,
3310 BTRFS_QGROUP_RSV_DATA);
3311 }
3312 }
3313
3314 /*
3315 * This needs to be done to make sure anybody waiting knows we are done
3316 * updating everything for this ordered extent.
3317 */
3318 btrfs_remove_ordered_extent(inode, ordered_extent);
3319
3320 /* once for us */
3321 btrfs_put_ordered_extent(ordered_extent);
3322 /* once for the tree */
3323 btrfs_put_ordered_extent(ordered_extent);
3324
3325 return ret;
3326 }
3327
3328 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3329 {
3330 if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
3331 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3332 list_empty(&ordered->bioc_list))
3333 btrfs_finish_ordered_zoned(ordered);
3334 return btrfs_finish_one_ordered(ordered);
3335 }
3336
3337 void btrfs_calculate_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr,
3338 u8 *dest)
3339 {
3340 struct folio *folio = page_folio(phys_to_page(paddr));
3341 const u32 blocksize = fs_info->sectorsize;
3342 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3343
3344 shash->tfm = fs_info->csum_shash;
3345 /* The full block must be inside the folio. */
3346 ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
3347
3348 if (folio_test_partial_kmap(folio)) {
3349 size_t cur = paddr;
3350
3351 crypto_shash_init(shash);
3352 while (cur < paddr + blocksize) {
3353 void *kaddr;
3354 size_t len = min(paddr + blocksize - cur,
3355 PAGE_SIZE - offset_in_page(cur));
3356
3357 kaddr = kmap_local_folio(folio, offset_in_folio(folio, cur));
3358 crypto_shash_update(shash, kaddr, len);
3359 kunmap_local(kaddr);
3360 cur += len;
3361 }
3362 crypto_shash_final(shash, dest);
3363 } else {
3364 crypto_shash_digest(shash, phys_to_virt(paddr), blocksize, dest);
3365 }
3366 }
3367 /*
3368 * Verify the checksum for a single sector without any extra actions that depend
3369 * on the type of I/O.
3370 *
3371 * @paddr must be the physical address of a block fully contained in one folio.
3372 */
3373 int btrfs_check_block_csum(struct btrfs_fs_info *fs_info, phys_addr_t paddr, u8 *csum,
3374 const u8 * const csum_expected)
3375 {
3376 btrfs_calculate_block_csum(fs_info, paddr, csum);
3377 if (unlikely(memcmp(csum, csum_expected, fs_info->csum_size) != 0))
3378 return -EIO;
3379 return 0;
3380 }
3381
3382 /*
3383 * Verify the checksum of a single data sector.
3384 *
3385 * @bbio: btrfs_bio which contains the csum
3386 * @dev: device the sector is on
3387 * @bio_offset: offset of the sector from the beginning of the bio (in bytes)
3388 * @paddr: physical address of the sector to check
3389 *
3390 * Check if the checksum on a data block is valid. When a checksum mismatch is
3391 * detected, report the error and fill the corrupted range with zero.
3392 *
3393 * Return %true if the sector is ok or had no checksum to start with, else %false.
3394 */
3395 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3396 u32 bio_offset, phys_addr_t paddr)
3397 {
3398 struct btrfs_inode *inode = bbio->inode;
3399 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3400 const u32 blocksize = fs_info->sectorsize;
3401 struct folio *folio;
3402 u64 file_offset = bbio->file_offset + bio_offset;
3403 u64 end = file_offset + blocksize - 1;
3404 u8 *csum_expected;
3405 u8 csum[BTRFS_CSUM_SIZE];
3406
3407 if (!bbio->csum)
3408 return true;
3409
3410 if (btrfs_is_data_reloc_root(inode->root) &&
3411 btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3412 NULL)) {
3413 /* Skip the range without csum for data reloc inode */
3414 btrfs_clear_extent_bit(&inode->io_tree, file_offset, end,
3415 EXTENT_NODATASUM, NULL);
3416 return true;
3417 }
3418
3419 csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3420 fs_info->csum_size;
3421 if (btrfs_check_block_csum(fs_info, paddr, csum, csum_expected))
3422 goto zeroit;
3423 return true;
3424
3425 zeroit:
3426 btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3427 bbio->mirror_num);
3428 if (dev)
3429 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3430 folio = page_folio(phys_to_page(paddr));
3431 ASSERT(offset_in_folio(folio, paddr) + blocksize <= folio_size(folio));
3432 folio_zero_range(folio, offset_in_folio(folio, paddr), blocksize);
3433 return false;
3434 }
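
/*
 * Illustrative sketch (hypothetical, not the real caller, which lives in the
 * bio completion code): a read completion path would typically walk the bio
 * block by block and hand each block's physical address to
 * btrfs_data_csum_ok(). first_paddr, nr_blocks and uptodate are assumptions.
 *
 *	for (u32 i = 0; i < nr_blocks; i++) {
 *		u32 bio_offset = i << fs_info->sectorsize_bits;
 *
 *		if (!btrfs_data_csum_ok(bbio, dev, bio_offset,
 *					first_paddr + bio_offset))
 *			uptodate = false;
 *	}
 */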
3435
3436 /*
3437 * Perform a delayed iput on @inode.
3438 *
3439 * @inode: The inode we want to perform iput on
3440 *
3441 * This function uses the generic vfs_inode::i_count to track whether we should
3442 * just decrement it (in case it's > 1) or if this is the last iput then link
3443 * the inode to the delayed iput machinery. Delayed iputs are processed at
3444 * transaction commit time, at superblock commit, or by the cleaner kthread.
3445 */
3446 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3447 {
3448 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3449 unsigned long flags;
3450
3451 if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3452 return;
3453
3454 WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
3455 atomic_inc(&fs_info->nr_delayed_iputs);
3456 /*
3457 * Need to be irq safe here because we can be called from either an irq
3458 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3459 * context.
3460 */
3461 spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3462 ASSERT(list_empty(&inode->delayed_iput));
3463 list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3464 spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3465 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3466 wake_up_process(fs_info->cleaner_kthread);
3467 }
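
/*
 * Illustrative sketch (hypothetical caller, not part of btrfs): code that may
 * run in irq context, such as an ordered extent or bio completion path, must
 * not call iput() directly because the final iput can block (eviction,
 * truncation). Instead of calling iput(&inode->vfs_inode), such a path defers
 * the drop:
 *
 *	static void example_complete(struct btrfs_inode *inode)
 *	{
 *		btrfs_add_delayed_iput(inode);
 *	}
 *
 * The cleaner kthread, or a transaction/superblock commit, later calls
 * btrfs_run_delayed_iputs() to perform the real iput()s in process context.
 */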
3468
3469 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3470 struct btrfs_inode *inode)
3471 {
3472 list_del_init(&inode->delayed_iput);
3473 spin_unlock_irq(&fs_info->delayed_iput_lock);
3474 iput(&inode->vfs_inode);
3475 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3476 wake_up(&fs_info->delayed_iputs_wait);
3477 spin_lock_irq(&fs_info->delayed_iput_lock);
3478 }
3479
3480 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3481 struct btrfs_inode *inode)
3482 {
3483 if (!list_empty(&inode->delayed_iput)) {
3484 spin_lock_irq(&fs_info->delayed_iput_lock);
3485 if (!list_empty(&inode->delayed_iput))
3486 run_delayed_iput_locked(fs_info, inode);
3487 spin_unlock_irq(&fs_info->delayed_iput_lock);
3488 }
3489 }
3490
3491 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3492 {
3493 /*
3494 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3495 * calls btrfs_add_delayed_iput() and that needs to lock
3496 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3497 * prevent a deadlock.
3498 */
3499 spin_lock_irq(&fs_info->delayed_iput_lock);
3500 while (!list_empty(&fs_info->delayed_iputs)) {
3501 struct btrfs_inode *inode;
3502
3503 inode = list_first_entry(&fs_info->delayed_iputs,
3504 struct btrfs_inode, delayed_iput);
3505 run_delayed_iput_locked(fs_info, inode);
3506 if (need_resched()) {
3507 spin_unlock_irq(&fs_info->delayed_iput_lock);
3508 cond_resched();
3509 spin_lock_irq(&fs_info->delayed_iput_lock);
3510 }
3511 }
3512 spin_unlock_irq(&fs_info->delayed_iput_lock);
3513 }
3514
3515 /*
3516 * Wait for all delayed iputs to be flushed.
3517 *
3518 * @fs_info: the filesystem
3519 *
3520 * This waits, in a killable manner, for all delayed iputs that are currently
3521 * pending to finish running. Once they are all done we return, unless we are
3522 * killed, in which case we return -EINTR. This helps user operations like
3523 * fallocate that might otherwise get blocked behind the iputs.
3524 *
3525 * Return -EINTR if we were killed, 0 once nothing is pending.
3526 */
3527 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3528 {
3529 int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3530 atomic_read(&fs_info->nr_delayed_iputs) == 0);
3531 if (ret)
3532 return -EINTR;
3533 return 0;
3534 }
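
/*
 * Illustrative sketch (hypothetical helper, not part of btrfs): space flushing
 * code can force the pending delayed iputs to run and then wait for them,
 * since the final iputs may release space that a blocked operation (e.g.
 * fallocate) is waiting for:
 *
 *	static int example_flush_delayed_iputs(struct btrfs_fs_info *fs_info)
 *	{
 *		btrfs_run_delayed_iputs(fs_info);
 *		return btrfs_wait_on_delayed_iputs(fs_info);
 *	}
 *
 * The return value is 0 once nothing is pending, or -EINTR if the waiting
 * task was killed.
 */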
3535
3536 /*
3537 * This creates an orphan entry for the given inode in case something goes wrong
3538 * in the middle of an unlink.
3539 */
3540 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3541 struct btrfs_inode *inode)
3542 {
3543 int ret;
3544
3545 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3546 if (unlikely(ret && ret != -EEXIST)) {
3547 btrfs_abort_transaction(trans, ret);
3548 return ret;
3549 }
3550
3551 return 0;
3552 }
3553
3554 /*
3555 * We have done the delete so we can go ahead and remove the orphan item for
3556 * this particular inode.
3557 */
3558 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3559 struct btrfs_inode *inode)
3560 {
3561 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3562 }
3563
3564 /*
3565 * this cleans up any orphans that may be left on the list from the last use
3566 * of this root.
3567 */
3568 int btrfs_orphan_cleanup(struct btrfs_root *root)
3569 {
3570 struct btrfs_fs_info *fs_info = root->fs_info;
3571 BTRFS_PATH_AUTO_FREE(path);
3572 struct extent_buffer *leaf;
3573 struct btrfs_key key, found_key;
3574 struct btrfs_trans_handle *trans;
3575 u64 last_objectid = 0;
3576 int ret = 0, nr_unlink = 0;
3577
3578 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3579 return 0;
3580
3581 path = btrfs_alloc_path();
3582 if (!path) {
3583 ret = -ENOMEM;
3584 goto out;
3585 }
3586 path->reada = READA_BACK;
3587
3588 key.objectid = BTRFS_ORPHAN_OBJECTID;
3589 key.type = BTRFS_ORPHAN_ITEM_KEY;
3590 key.offset = (u64)-1;
3591
3592 while (1) {
3593 struct btrfs_inode *inode;
3594
3595 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3596 if (ret < 0)
3597 goto out;
3598
3599 /*
3600 * ret == 0 means we found what we were searching for, which is
3601 * weird, but possible. So only adjust the path if we didn't find
3602 * the key, and check whether we have something that matches.
3603 */
3604 if (ret > 0) {
3605 ret = 0;
3606 if (path->slots[0] == 0)
3607 break;
3608 path->slots[0]--;
3609 }
3610
3611 /* pull out the item */
3612 leaf = path->nodes[0];
3613 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3614
3615 /* make sure the item matches what we want */
3616 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3617 break;
3618 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3619 break;
3620
3621 /* release the path since we're done with it */
3622 btrfs_release_path(path);
3623
3624 /*
3625 * this is where we are basically btrfs_lookup, without the
3626 * crossing root thing. we store the inode number in the
3627 * offset of the orphan item.
3628 */
3629
3630 if (found_key.offset == last_objectid) {
3631 /*
3632 * We found the same inode as before. This means we were
3633 * not able to remove its items via eviction triggered
3634 * by an iput(). A transaction abort may have happened,
3635 * due to -ENOSPC for example, so try to grab the error
3636 * that led to a transaction abort, if any.
3637 */
3638 btrfs_err(fs_info,
3639 "Error removing orphan entry, stopping orphan cleanup");
3640 ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3641 goto out;
3642 }
3643
3644 last_objectid = found_key.offset;
3645
3646 found_key.objectid = found_key.offset;
3647 found_key.type = BTRFS_INODE_ITEM_KEY;
3648 found_key.offset = 0;
3649 inode = btrfs_iget(last_objectid, root);
3650 if (IS_ERR(inode)) {
3651 ret = PTR_ERR(inode);
3652 inode = NULL;
3653 if (ret != -ENOENT)
3654 goto out;
3655 }
3656
3657 if (!inode && root == fs_info->tree_root) {
3658 struct btrfs_root *dead_root;
3659 int is_dead_root = 0;
3660
3661 /*
3662 * This is an orphan in the tree root. Currently these
3663 * could come from 2 sources:
3664 * a) a root (snapshot/subvolume) deletion in progress
3665 * b) a free space cache inode
3666 * We need to distinguish those two, as the orphan item
3667 * for a root must not get deleted before the deletion
3668 * of the snapshot/subvolume's tree completes.
3669 *
3670 * btrfs_find_orphan_roots() ran before us, which has
3671 * found all deleted roots and loaded them into
3672 * fs_info->fs_roots_radix. So here we can find if an
3673 * orphan item corresponds to a deleted root by looking
3674 * up the root from that radix tree.
3675 */
3676
3677 spin_lock(&fs_info->fs_roots_radix_lock);
3678 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3679 (unsigned long)found_key.objectid);
3680 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3681 is_dead_root = 1;
3682 spin_unlock(&fs_info->fs_roots_radix_lock);
3683
3684 if (is_dead_root) {
3685 /* prevent this orphan from being found again */
3686 key.offset = found_key.objectid - 1;
3687 continue;
3688 }
3689
3690 }
3691
3692 /*
3693 * If we have an inode with links, there are a couple of
3694 * possibilities:
3695 *
3696 * 1. We were halfway through creating fsverity metadata for the
3697 * file. In that case, the orphan item represents incomplete
3698 * fsverity metadata which must be cleaned up with
3699 * btrfs_drop_verity_items and deleting the orphan item.
3700 *
3701 * 2. Old kernels (before v3.12) used to create an
3702 * orphan item for truncate indicating that there were possibly
3703 * extent items past i_size that needed to be deleted. In v3.12,
3704 * truncate was changed to update i_size in sync with the extent
3705 * items, but the (useless) orphan item was still created. Since
3706 * v4.18, we don't create the orphan item for truncate at all.
3707 *
3708 * So, this item could mean that we need to do a truncate, but
3709 * only if this filesystem was last used on a pre-v3.12 kernel
3710 * and was not cleanly unmounted. The odds of that are quite
3711 * slim, and it's a pain to do the truncate now, so just delete
3712 * the orphan item.
3713 *
3714 * It's also possible that this orphan item was supposed to be
3715 * deleted but wasn't. The inode number may have been reused,
3716 * but either way, we can delete the orphan item.
3717 */
3718 if (!inode || inode->vfs_inode.i_nlink) {
3719 if (inode) {
3720 ret = btrfs_drop_verity_items(inode);
3721 iput(&inode->vfs_inode);
3722 inode = NULL;
3723 if (ret)
3724 goto out;
3725 }
3726 trans = btrfs_start_transaction(root, 1);
3727 if (IS_ERR(trans)) {
3728 ret = PTR_ERR(trans);
3729 goto out;
3730 }
3731 btrfs_debug(fs_info, "auto deleting %llu",
3732 found_key.objectid);
3733 ret = btrfs_del_orphan_item(trans, root,
3734 found_key.objectid);
3735 btrfs_end_transaction(trans);
3736 if (ret)
3737 goto out;
3738 continue;
3739 }
3740
3741 nr_unlink++;
3742
3743 /* this will do delete_inode and everything for us */
3744 iput(&inode->vfs_inode);
3745 }
3746 /* release the path since we're done with it */
3747 btrfs_release_path(path);
3748
3749 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3750 trans = btrfs_join_transaction(root);
3751 if (!IS_ERR(trans))
3752 btrfs_end_transaction(trans);
3753 }
3754
3755 if (nr_unlink)
3756 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3757
3758 out:
3759 if (ret)
3760 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3761 return ret;
3762 }
3763
3764 /*
3765 * Look ahead in the leaf for xattrs. If we don't find any then we know there
3766 * can't be any ACLs.
3767 *
3768 * @leaf: the eb leaf where to search
3769 * @slot: the slot the inode is in
3770 * @objectid: the objectid of the inode
3771 *
3772 * Return true if there is an xattr/ACL, false otherwise.
3773 */
3774 static noinline bool acls_after_inode_item(struct extent_buffer *leaf,
3775 int slot, u64 objectid,
3776 int *first_xattr_slot)
3777 {
3778 u32 nritems = btrfs_header_nritems(leaf);
3779 struct btrfs_key found_key;
3780 static u64 xattr_access = 0;
3781 static u64 xattr_default = 0;
3782 int scanned = 0;
3783
3784 if (!xattr_access) {
3785 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3786 strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3787 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3788 strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3789 }
3790
3791 slot++;
3792 *first_xattr_slot = -1;
3793 while (slot < nritems) {
3794 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3795
3796 /* We found a different objectid, there must be no ACLs. */
3797 if (found_key.objectid != objectid)
3798 return false;
3799
3800 /* We found an xattr, assume we've got an ACL. */
3801 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3802 if (*first_xattr_slot == -1)
3803 *first_xattr_slot = slot;
3804 if (found_key.offset == xattr_access ||
3805 found_key.offset == xattr_default)
3806 return true;
3807 }
3808
3809 /*
3810 * We found a key greater than an xattr key, there can't be any
3811 * ACLs later on.
3812 */
3813 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3814 return false;
3815
3816 slot++;
3817 scanned++;
3818
3819 /*
3820 * The item order goes like:
3821 * - inode
3822 * - inode backrefs
3823 * - xattrs
3824 * - extents,
3825 *
3826 * so if there are lots of hard links to an inode there can be
3827 * a lot of backrefs. Don't waste time searching too hard,
3828 * this is just an optimization.
3829 */
3830 if (scanned >= 8)
3831 break;
3832 }
3833 /*
3834 * We hit the end of the leaf before we found an xattr or something
3835 * larger than an xattr. We have to assume the inode has ACLs.
3836 */
3837 if (*first_xattr_slot == -1)
3838 *first_xattr_slot = slot;
3839 return true;
3840 }
3841
3842 static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
3843 {
3844 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3845
3846 if (WARN_ON_ONCE(inode->file_extent_tree))
3847 return 0;
3848 if (btrfs_fs_incompat(fs_info, NO_HOLES))
3849 return 0;
3850 if (!S_ISREG(inode->vfs_inode.i_mode))
3851 return 0;
3852 if (btrfs_is_free_space_inode(inode))
3853 return 0;
3854
3855 inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
3856 if (!inode->file_extent_tree)
3857 return -ENOMEM;
3858
3859 btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree,
3860 IO_TREE_INODE_FILE_EXTENT);
3861 /* Lockdep class is set only for the file extent tree. */
3862 lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
3863
3864 return 0;
3865 }
3866
3867 static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
3868 {
3869 struct btrfs_root *root = inode->root;
3870 struct btrfs_inode *existing;
3871 const u64 ino = btrfs_ino(inode);
3872 int ret;
3873
3874 if (inode_unhashed(&inode->vfs_inode))
3875 return 0;
3876
3877 if (prealloc) {
3878 ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
3879 if (ret)
3880 return ret;
3881 }
3882
3883 existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
3884
3885 if (xa_is_err(existing)) {
3886 ret = xa_err(existing);
3887 ASSERT(ret != -EINVAL);
3888 ASSERT(ret != -ENOMEM);
3889 return ret;
3890 } else if (existing) {
3891 WARN_ON(!(inode_state_read_once(&existing->vfs_inode) & (I_WILL_FREE | I_FREEING)));
3892 }
3893
3894 return 0;
3895 }
3896
3897 /*
3898 * Read a locked inode from the btree into the in-memory inode and add it to
3899 * its root list/tree.
3900 *
3901 * On failure clean up the inode.
3902 */
3903 static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
3904 {
3905 struct btrfs_root *root = inode->root;
3906 struct btrfs_fs_info *fs_info = root->fs_info;
3907 struct extent_buffer *leaf;
3908 struct btrfs_inode_item *inode_item;
3909 struct inode *vfs_inode = &inode->vfs_inode;
3910 struct btrfs_key location;
3911 unsigned long ptr;
3912 int maybe_acls;
3913 u32 rdev;
3914 int ret;
3915 bool filled = false;
3916 int first_xattr_slot;
3917
3918 ret = btrfs_fill_inode(inode, &rdev);
3919 if (!ret)
3920 filled = true;
3921
3922 ASSERT(path);
3923
3924 btrfs_get_inode_key(inode, &location);
3925
3926 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3927 if (ret) {
3928 /*
3929 * ret > 0 can come from btrfs_search_slot called by
3930 * btrfs_lookup_inode(), this means the inode was not found.
3931 */
3932 if (ret > 0)
3933 ret = -ENOENT;
3934 goto out;
3935 }
3936
3937 leaf = path->nodes[0];
3938
3939 if (filled)
3940 goto cache_index;
3941
3942 inode_item = btrfs_item_ptr(leaf, path->slots[0],
3943 struct btrfs_inode_item);
3944 vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3945 set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
3946 i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
3947 i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
3948 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3949
3950 inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
3951 btrfs_timespec_nsec(leaf, &inode_item->atime));
3952
3953 inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
3954 btrfs_timespec_nsec(leaf, &inode_item->mtime));
3955
3956 inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3957 btrfs_timespec_nsec(leaf, &inode_item->ctime));
3958
3959 inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
3960 inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
3961
3962 inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
3963 inode->generation = btrfs_inode_generation(leaf, inode_item);
3964 inode->last_trans = btrfs_inode_transid(leaf, inode_item);
3965
3966 inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
3967 vfs_inode->i_generation = inode->generation;
3968 vfs_inode->i_rdev = 0;
3969 rdev = btrfs_inode_rdev(leaf, inode_item);
3970
3971 if (S_ISDIR(vfs_inode->i_mode))
3972 inode->index_cnt = (u64)-1;
3973
3974 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3975 &inode->flags, &inode->ro_flags);
3976 btrfs_update_inode_mapping_flags(inode);
3977 btrfs_set_inode_mapping_order(inode);
3978
3979 cache_index:
3980 ret = btrfs_init_file_extent_tree(inode);
3981 if (ret)
3982 goto out;
3983 btrfs_inode_set_file_extent_range(inode, 0,
3984 round_up(i_size_read(vfs_inode), fs_info->sectorsize));
3985 /*
3986 * If we were modified in the current generation and evicted from memory
3987 * and then re-read we need to do a full sync since we don't have any
3988 * idea about which extents were modified before we were evicted from
3989 * cache.
3990 *
3991 * This is required for both inode re-read from disk and delayed inode
3992 * in the delayed_nodes xarray.
3993 */
3994 if (inode->last_trans == btrfs_get_fs_generation(fs_info))
3995 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
3996
3997 /*
3998 * We don't persist the id of the transaction where an unlink operation
3999 * against the inode was last made. So here we assume the inode might
4000 * have been evicted, and therefore the exact value of last_unlink_trans
4001 * lost, and set it to last_trans to avoid metadata inconsistencies
4002 * between the inode and its parent if the inode is fsync'ed and the log
4003 * replayed. For example, in the scenario:
4004 *
4005 * touch mydir/foo
4006 * ln mydir/foo mydir/bar
4007 * sync
4008 * unlink mydir/bar
4009 * echo 2 > /proc/sys/vm/drop_caches # evicts inode
4010 * xfs_io -c fsync mydir/foo
4011 * <power failure>
4012 * mount fs, triggers fsync log replay
4013 *
4014 * We must make sure that when we fsync our inode foo we also log its
4015 * parent inode, otherwise after log replay the parent still has the
4016 * dentry with the "bar" name but our inode foo has a link count of 1
4017 * and doesn't have an inode ref with the name "bar" anymore.
4018 *
4019 * Setting last_unlink_trans to last_trans is a pessimistic approach,
4020 * but it guarantees correctness at the expense of occasional full
4021 * transaction commits on fsync if our inode is a directory, or if our
4022 * inode is not a directory, logging its parent unnecessarily.
4023 */
4024 inode->last_unlink_trans = inode->last_trans;
4025
4026 /*
4027 * Same logic as for last_unlink_trans. We don't persist the generation
4028 * of the last transaction where this inode was used for a reflink
4029 * operation, so after eviction and reloading the inode we must be
4030 * pessimistic and assume the last transaction that modified the inode.
4031 */
4032 inode->last_reflink_trans = inode->last_trans;
4033
4034 path->slots[0]++;
4035 if (vfs_inode->i_nlink != 1 ||
4036 path->slots[0] >= btrfs_header_nritems(leaf))
4037 goto cache_acl;
4038
4039 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
4040 if (location.objectid != btrfs_ino(inode))
4041 goto cache_acl;
4042
4043 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4044 if (location.type == BTRFS_INODE_REF_KEY) {
4045 struct btrfs_inode_ref *ref;
4046
4047 ref = (struct btrfs_inode_ref *)ptr;
4048 inode->dir_index = btrfs_inode_ref_index(leaf, ref);
4049 } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
4050 struct btrfs_inode_extref *extref;
4051
4052 extref = (struct btrfs_inode_extref *)ptr;
4053 inode->dir_index = btrfs_inode_extref_index(leaf, extref);
4054 }
4055 cache_acl:
4056 /*
4057 * try to precache a NULL acl entry for files that don't have
4058 * any xattrs or acls
4059 */
4060 maybe_acls = acls_after_inode_item(leaf, path->slots[0],
4061 btrfs_ino(inode), &first_xattr_slot);
4062 if (first_xattr_slot != -1) {
4063 path->slots[0] = first_xattr_slot;
4064 ret = btrfs_load_inode_props(inode, path);
4065 if (ret)
4066 btrfs_err(fs_info,
4067 "error loading props for ino %llu (root %llu): %d",
4068 btrfs_ino(inode), btrfs_root_id(root), ret);
4069 }
4070
4071 if (!maybe_acls)
4072 cache_no_acl(vfs_inode);
4073
4074 switch (vfs_inode->i_mode & S_IFMT) {
4075 case S_IFREG:
4076 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4077 vfs_inode->i_fop = &btrfs_file_operations;
4078 vfs_inode->i_op = &btrfs_file_inode_operations;
4079 break;
4080 case S_IFDIR:
4081 vfs_inode->i_fop = &btrfs_dir_file_operations;
4082 vfs_inode->i_op = &btrfs_dir_inode_operations;
4083 break;
4084 case S_IFLNK:
4085 vfs_inode->i_op = &btrfs_symlink_inode_operations;
4086 inode_nohighmem(vfs_inode);
4087 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4088 break;
4089 default:
4090 vfs_inode->i_op = &btrfs_special_inode_operations;
4091 init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
4092 break;
4093 }
4094
4095 btrfs_sync_inode_flags_to_i_flags(inode);
4096
4097 ret = btrfs_add_inode_to_root(inode, true);
4098 if (ret)
4099 goto out;
4100
4101 return 0;
4102 out:
4103 iget_failed(vfs_inode);
4104 return ret;
4105 }
4106
4107 /*
4108 * given a leaf and an inode, copy the inode fields into the leaf
4109 */
4110 static void fill_inode_item(struct btrfs_trans_handle *trans,
4111 struct extent_buffer *leaf,
4112 struct btrfs_inode_item *item,
4113 struct inode *inode)
4114 {
4115 u64 flags;
4116
4117 btrfs_set_inode_uid(leaf, item, i_uid_read(inode));
4118 btrfs_set_inode_gid(leaf, item, i_gid_read(inode));
4119 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
4120 btrfs_set_inode_mode(leaf, item, inode->i_mode);
4121 btrfs_set_inode_nlink(leaf, item, inode->i_nlink);
4122
4123 btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode));
4124 btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode));
4125
4126 btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode));
4127 btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode));
4128
4129 btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode));
4130 btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode));
4131
4132 btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec);
4133 btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4134
4135 btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode));
4136 btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation);
4137 btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode));
4138 btrfs_set_inode_transid(leaf, item, trans->transid);
4139 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
4140 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4141 BTRFS_I(inode)->ro_flags);
4142 btrfs_set_inode_flags(leaf, item, flags);
4143 btrfs_set_inode_block_group(leaf, item, 0);
4144 }
4145
4146 /*
4147 * copy everything in the in-memory inode into the btree.
4148 */
4149 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
4150 struct btrfs_inode *inode)
4151 {
4152 struct btrfs_inode_item *inode_item;
4153 BTRFS_PATH_AUTO_FREE(path);
4154 struct extent_buffer *leaf;
4155 struct btrfs_key key;
4156 int ret;
4157
4158 path = btrfs_alloc_path();
4159 if (!path)
4160 return -ENOMEM;
4161
4162 btrfs_get_inode_key(inode, &key);
4163 ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
4164 if (ret) {
4165 if (ret > 0)
4166 ret = -ENOENT;
4167 return ret;
4168 }
4169
4170 leaf = path->nodes[0];
4171 inode_item = btrfs_item_ptr(leaf, path->slots[0],
4172 struct btrfs_inode_item);
4173
4174 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4175 btrfs_set_inode_last_trans(trans, inode);
4176 return 0;
4177 }
4178
4179 /*
4180 * copy everything in the in-memory inode into the btree.
4181 */
4182 int btrfs_update_inode(struct btrfs_trans_handle *trans,
4183 struct btrfs_inode *inode)
4184 {
4185 struct btrfs_root *root = inode->root;
4186 struct btrfs_fs_info *fs_info = root->fs_info;
4187 int ret;
4188
4189 /*
4190 * If the inode is a free space inode, we can deadlock during commit
4191 * if we put it into the delayed code.
4192 *
4193 * The data relocation inode should also be directly updated
4194 * without delay
4195 */
4196 if (!btrfs_is_free_space_inode(inode)
4197 && !btrfs_is_data_reloc_root(root)
4198 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4199 btrfs_update_root_times(trans, root);
4200
4201 ret = btrfs_delayed_update_inode(trans, inode);
4202 if (!ret)
4203 btrfs_set_inode_last_trans(trans, inode);
4204 return ret;
4205 }
4206
4207 return btrfs_update_inode_item(trans, inode);
4208 }
4209
4210 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4211 struct btrfs_inode *inode)
4212 {
4213 int ret;
4214
4215 ret = btrfs_update_inode(trans, inode);
4216 if (ret == -ENOSPC)
4217 return btrfs_update_inode_item(trans, inode);
4218 return ret;
4219 }
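
/*
 * Note on the two update helpers above (illustrative, hypothetical call site):
 * btrfs_update_inode() prefers the delayed inode machinery, which can fail
 * with -ENOSPC, typically because the delayed item reservation cannot be made.
 * btrfs_update_inode_fallback() retries such a failure with a direct inode
 * item update, and is used by paths that must make forward progress anyway,
 * such as ordered extent completion above. A typical call site looks like:
 *
 *	btrfs_i_size_write(inode, new_size);
 *	inode_inc_iversion(&inode->vfs_inode);
 *	ret = btrfs_update_inode_fallback(trans, inode);
 *	if (ret)
 *		btrfs_abort_transaction(trans, ret);
 */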
4220
4221 static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
4222 {
4223 struct timespec64 now;
4224
4225 /*
4226 * If we are replaying a log tree, we do not want to update the mtime
4227 * and ctime of the parent directory with the current time, since the
4228 * log replay procedure is responsible for setting them to their correct
4229 * values (the ones it had when the fsync was done).
4230 */
4231 if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
4232 return;
4233
4234 now = inode_set_ctime_current(&dir->vfs_inode);
4235 inode_set_mtime_to_ts(&dir->vfs_inode, now);
4236 }
4237
4238 /*
4239 * unlink helper that gets used here in inode.c and in the tree logging
4240 * recovery code. It removes a link in a directory with a given name, and
4241 * also drops the back refs in the inode to the directory
4242 */
4243 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4244 struct btrfs_inode *dir,
4245 struct btrfs_inode *inode,
4246 const struct fscrypt_str *name,
4247 struct btrfs_rename_ctx *rename_ctx)
4248 {
4249 struct btrfs_root *root = dir->root;
4250 struct btrfs_fs_info *fs_info = root->fs_info;
4251 struct btrfs_path *path;
4252 int ret = 0;
4253 struct btrfs_dir_item *di;
4254 u64 index;
4255 u64 ino = btrfs_ino(inode);
4256 u64 dir_ino = btrfs_ino(dir);
4257
4258 path = btrfs_alloc_path();
4259 if (!path)
4260 return -ENOMEM;
4261
4262 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4263 if (IS_ERR_OR_NULL(di)) {
4264 btrfs_free_path(path);
4265 return di ? PTR_ERR(di) : -ENOENT;
4266 }
4267 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4268 /*
4269 * Down the call chains below we'll also need to allocate a path, so no
4270 * need to hold on to this one for longer than necessary.
4271 */
4272 btrfs_free_path(path);
4273 if (ret)
4274 return ret;
4275
4276 /*
4277 * If we don't have a dir index, we have to get it by looking up
4278 * the inode ref. Since we then hold the inode ref anyway, remove it
4279 * directly; there is no point in doing a delayed deletion.
4280 *
4281 * But if we have the dir index, there is no need to search for the
4282 * inode ref. Since the inode ref is close to the inode item, it is
4283 * better to delay its deletion and do it when we update the inode
4284 * item.
4285 */
4286 if (inode->dir_index) {
4287 ret = btrfs_delayed_delete_inode_ref(inode);
4288 if (!ret) {
4289 index = inode->dir_index;
4290 goto skip_backref;
4291 }
4292 }
4293
4294 ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4295 if (unlikely(ret)) {
4296 btrfs_crit(fs_info,
4297 "failed to delete reference to %.*s, root %llu inode %llu parent %llu",
4298 name->len, name->name, btrfs_root_id(root), ino, dir_ino);
4299 btrfs_abort_transaction(trans, ret);
4300 return ret;
4301 }
4302 skip_backref:
4303 if (rename_ctx)
4304 rename_ctx->index = index;
4305
4306 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4307 if (unlikely(ret)) {
4308 btrfs_abort_transaction(trans, ret);
4309 return ret;
4310 }
4311
4312 /*
4313 * If we are in a rename context, we don't need to update anything in the
4314 * log. That will be done later during the rename by btrfs_log_new_name().
4315 * Besides that, doing it here would only cause extra unnecessary btree
4316 * operations on the log tree, increasing latency for applications.
4317 */
4318 if (!rename_ctx) {
4319 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4320 btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4321 }
4322
4323 /*
4324 * If we have a pending delayed iput we could end up with the final iput
4325 * being run in btrfs-cleaner context. If we have enough of these built
4326 * up we can end up burning a lot of time in btrfs-cleaner without any
4327 * way to throttle the unlinks. Since we're currently holding a ref on
4328 * the inode we can run the delayed iput here without any issues as the
4329 * final iput won't be done until after we drop the ref we're currently
4330 * holding.
4331 */
4332 btrfs_run_delayed_iput(fs_info, inode);
4333
4334 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4335 inode_inc_iversion(&inode->vfs_inode);
4336 inode_set_ctime_current(&inode->vfs_inode);
4337 inode_inc_iversion(&dir->vfs_inode);
4338 update_time_after_link_or_unlink(dir);
4339
4340 return btrfs_update_inode(trans, dir);
4341 }
4342
4343 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4344 struct btrfs_inode *dir, struct btrfs_inode *inode,
4345 const struct fscrypt_str *name)
4346 {
4347 int ret;
4348
4349 ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4350 if (!ret) {
4351 drop_nlink(&inode->vfs_inode);
4352 ret = btrfs_update_inode(trans, inode);
4353 }
4354 return ret;
4355 }
4356
4357 /*
4358 * helper to start transaction for unlink and rmdir.
4359 *
4360 * unlink and rmdir are special in btrfs: they do not always free space, so
4361 * if we cannot make our reservations the normal way, try and see if there is
4362 * plenty of slack room in the global reserve to migrate; otherwise we cannot
4363 * allow the unlink to occur.
4364 */
4365 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4366 {
4367 struct btrfs_root *root = dir->root;
4368
4369 return btrfs_start_transaction_fallback_global_rsv(root,
4370 BTRFS_UNLINK_METADATA_UNITS);
4371 }
4372
4373 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4374 {
4375 struct btrfs_trans_handle *trans;
4376 struct inode *inode = d_inode(dentry);
4377 int ret;
4378 struct fscrypt_name fname;
4379
4380 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4381 if (ret)
4382 return ret;
4383
4384 /* This needs to handle no-key deletions later on */
4385
4386 trans = __unlink_start_trans(BTRFS_I(dir));
4387 if (IS_ERR(trans)) {
4388 ret = PTR_ERR(trans);
4389 goto fscrypt_free;
4390 }
4391
4392 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4393 false);
4394
4395 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4396 &fname.disk_name);
4397 if (ret)
4398 goto end_trans;
4399
4400 if (inode->i_nlink == 0) {
4401 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4402 if (ret)
4403 goto end_trans;
4404 }
4405
4406 end_trans:
4407 btrfs_end_transaction(trans);
4408 btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4409 fscrypt_free:
4410 fscrypt_free_filename(&fname);
4411 return ret;
4412 }
4413
4414 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4415 struct btrfs_inode *dir, struct dentry *dentry)
4416 {
4417 struct btrfs_root *root = dir->root;
4418 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4419 struct btrfs_path *path;
4420 struct extent_buffer *leaf;
4421 struct btrfs_dir_item *di;
4422 struct btrfs_key key;
4423 u64 index;
4424 int ret;
4425 u64 objectid;
4426 u64 dir_ino = btrfs_ino(dir);
4427 struct fscrypt_name fname;
4428
4429 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4430 if (ret)
4431 return ret;
4432
4433 /* This needs to handle no-key deletions later on */
4434
4435 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4436 objectid = btrfs_root_id(inode->root);
4437 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4438 objectid = inode->ref_root_id;
4439 } else {
4440 WARN_ON(1);
4441 fscrypt_free_filename(&fname);
4442 return -EINVAL;
4443 }
4444
4445 path = btrfs_alloc_path();
4446 if (!path) {
4447 ret = -ENOMEM;
4448 goto out;
4449 }
4450
4451 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4452 &fname.disk_name, -1);
4453 if (IS_ERR_OR_NULL(di)) {
4454 ret = di ? PTR_ERR(di) : -ENOENT;
4455 goto out;
4456 }
4457
4458 leaf = path->nodes[0];
4459 btrfs_dir_item_key_to_cpu(leaf, di, &key);
4460 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4461 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4462 if (unlikely(ret)) {
4463 btrfs_abort_transaction(trans, ret);
4464 goto out;
4465 }
4466 btrfs_release_path(path);
4467
4468 /*
4469 * This is a placeholder inode for a subvolume we didn't have a
4470 * reference to at the time of the snapshot creation. In the meantime
4471 * we could have renamed the real subvol link into our snapshot, so
4472 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4473 * Instead simply lookup the dir_index_item for this entry so we can
4474 * remove it. Otherwise we know we have a ref to the root and we can
4475 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4476 */
4477 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4478 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4479 if (IS_ERR(di)) {
4480 ret = PTR_ERR(di);
4481 btrfs_abort_transaction(trans, ret);
4482 goto out;
4483 }
4484
4485 leaf = path->nodes[0];
4486 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4487 index = key.offset;
4488 btrfs_release_path(path);
4489 } else {
4490 ret = btrfs_del_root_ref(trans, objectid,
4491 btrfs_root_id(root), dir_ino,
4492 &index, &fname.disk_name);
4493 if (unlikely(ret)) {
4494 btrfs_abort_transaction(trans, ret);
4495 goto out;
4496 }
4497 }
4498
4499 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4500 if (unlikely(ret)) {
4501 btrfs_abort_transaction(trans, ret);
4502 goto out;
4503 }
4504
4505 btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4506 inode_inc_iversion(&dir->vfs_inode);
4507 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4508 ret = btrfs_update_inode_fallback(trans, dir);
4509 if (ret)
4510 btrfs_abort_transaction(trans, ret);
4511 out:
4512 btrfs_free_path(path);
4513 fscrypt_free_filename(&fname);
4514 return ret;
4515 }
4516
4517 /*
4518 * Helper to check if the subvolume references other subvolumes or if it's
4519 * the default subvolume.
4520 */
4521 static noinline int may_destroy_subvol(struct btrfs_root *root)
4522 {
4523 struct btrfs_fs_info *fs_info = root->fs_info;
4524 BTRFS_PATH_AUTO_FREE(path);
4525 struct btrfs_dir_item *di;
4526 struct btrfs_key key;
4527 struct fscrypt_str name = FSTR_INIT("default", 7);
4528 u64 dir_id;
4529 int ret;
4530
4531 path = btrfs_alloc_path();
4532 if (!path)
4533 return -ENOMEM;
4534
4535 /* Make sure this root isn't set as the default subvol */
4536 dir_id = btrfs_super_root_dir(fs_info->super_copy);
4537 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4538 dir_id, &name, 0);
4539 if (di && !IS_ERR(di)) {
4540 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4541 if (key.objectid == btrfs_root_id(root)) {
4542 ret = -EPERM;
4543 btrfs_err(fs_info,
4544 "deleting default subvolume %llu is not allowed",
4545 key.objectid);
4546 return ret;
4547 }
4548 btrfs_release_path(path);
4549 }
4550
4551 key.objectid = btrfs_root_id(root);
4552 key.type = BTRFS_ROOT_REF_KEY;
4553 key.offset = (u64)-1;
4554
4555 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4556 if (ret < 0)
4557 return ret;
4558 if (unlikely(ret == 0)) {
4559 /*
4560 * Key with offset -1 found, there would have to exist a root
4561 * with such id, but this is out of valid range.
4562 */
4563 return -EUCLEAN;
4564 }
4565
4566 ret = 0;
4567 if (path->slots[0] > 0) {
4568 path->slots[0]--;
4569 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4570 if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
4571 ret = -ENOTEMPTY;
4572 }
4573
4574 return ret;
4575 }
4576
4577 /* Delete all dentries for inodes belonging to the root */
4578 static void btrfs_prune_dentries(struct btrfs_root *root)
4579 {
4580 struct btrfs_fs_info *fs_info = root->fs_info;
4581 struct btrfs_inode *inode;
4582 u64 min_ino = 0;
4583
4584 if (!BTRFS_FS_ERROR(fs_info))
4585 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4586
4587 inode = btrfs_find_first_inode(root, min_ino);
4588 while (inode) {
4589 if (icount_read(&inode->vfs_inode) > 1)
4590 d_prune_aliases(&inode->vfs_inode);
4591
4592 min_ino = btrfs_ino(inode) + 1;
4593 /*
4594 * btrfs_drop_inode() will have it removed from the inode
4595 * cache when its usage count hits zero.
4596 */
4597 iput(&inode->vfs_inode);
4598 cond_resched();
4599 inode = btrfs_find_first_inode(root, min_ino);
4600 }
4601 }
4602
4603 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4604 {
4605 struct btrfs_root *root = dir->root;
4606 struct btrfs_fs_info *fs_info = root->fs_info;
4607 struct inode *inode = d_inode(dentry);
4608 struct btrfs_root *dest = BTRFS_I(inode)->root;
4609 struct btrfs_trans_handle *trans;
4610 struct btrfs_block_rsv block_rsv;
4611 u64 root_flags;
4612 u64 qgroup_reserved = 0;
4613 int ret;
4614
4615 down_write(&fs_info->subvol_sem);
4616
4617 /*
4618 * Don't allow to delete a subvolume with send in progress. This is
4619 * inside the inode lock so the error handling that has to drop the bit
4620 * again is not run concurrently.
4621 */
4622 spin_lock(&dest->root_item_lock);
4623 if (dest->send_in_progress) {
4624 spin_unlock(&dest->root_item_lock);
4625 btrfs_warn(fs_info,
4626 "attempt to delete subvolume %llu during send",
4627 btrfs_root_id(dest));
4628 ret = -EPERM;
4629 goto out_up_write;
4630 }
4631 if (atomic_read(&dest->nr_swapfiles)) {
4632 spin_unlock(&dest->root_item_lock);
4633 btrfs_warn(fs_info,
4634 "attempt to delete subvolume %llu with active swapfile",
4635 btrfs_root_id(root));
4636 ret = -EPERM;
4637 goto out_up_write;
4638 }
4639 root_flags = btrfs_root_flags(&dest->root_item);
4640 btrfs_set_root_flags(&dest->root_item,
4641 root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4642 spin_unlock(&dest->root_item_lock);
4643
4644 ret = may_destroy_subvol(dest);
4645 if (ret)
4646 goto out_undead;
4647
4648 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4649 /*
4650 * One for dir inode,
4651 * two for dir entries,
4652 * two for root ref/backref.
4653 */
4654 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4655 if (ret)
4656 goto out_undead;
4657 qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4658
4659 trans = btrfs_start_transaction(root, 0);
4660 if (IS_ERR(trans)) {
4661 ret = PTR_ERR(trans);
4662 goto out_release;
4663 }
4664 btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4665 qgroup_reserved = 0;
4666 trans->block_rsv = &block_rsv;
4667 trans->bytes_reserved = block_rsv.size;
4668
4669 btrfs_record_snapshot_destroy(trans, dir);
4670
4671 ret = btrfs_unlink_subvol(trans, dir, dentry);
4672 if (unlikely(ret)) {
4673 btrfs_abort_transaction(trans, ret);
4674 goto out_end_trans;
4675 }
4676
4677 ret = btrfs_record_root_in_trans(trans, dest);
4678 if (unlikely(ret)) {
4679 btrfs_abort_transaction(trans, ret);
4680 goto out_end_trans;
4681 }
4682
4683 memset(&dest->root_item.drop_progress, 0,
4684 sizeof(dest->root_item.drop_progress));
4685 btrfs_set_root_drop_level(&dest->root_item, 0);
4686 btrfs_set_root_refs(&dest->root_item, 0);
4687
4688 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4689 ret = btrfs_insert_orphan_item(trans,
4690 fs_info->tree_root,
4691 btrfs_root_id(dest));
4692 if (unlikely(ret)) {
4693 btrfs_abort_transaction(trans, ret);
4694 goto out_end_trans;
4695 }
4696 }
4697
4698 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4699 BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
4700 if (unlikely(ret && ret != -ENOENT)) {
4701 btrfs_abort_transaction(trans, ret);
4702 goto out_end_trans;
4703 }
4704 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4705 ret = btrfs_uuid_tree_remove(trans,
4706 dest->root_item.received_uuid,
4707 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4708 btrfs_root_id(dest));
4709 if (unlikely(ret && ret != -ENOENT)) {
4710 btrfs_abort_transaction(trans, ret);
4711 goto out_end_trans;
4712 }
4713 }
4714
4715 free_anon_bdev(dest->anon_dev);
4716 dest->anon_dev = 0;
4717 out_end_trans:
4718 trans->block_rsv = NULL;
4719 trans->bytes_reserved = 0;
4720 ret = btrfs_end_transaction(trans);
4721 inode->i_flags |= S_DEAD;
4722 out_release:
4723 btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4724 if (qgroup_reserved)
4725 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4726 out_undead:
4727 if (ret) {
4728 spin_lock(&dest->root_item_lock);
4729 root_flags = btrfs_root_flags(&dest->root_item);
4730 btrfs_set_root_flags(&dest->root_item,
4731 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4732 spin_unlock(&dest->root_item_lock);
4733 }
4734 out_up_write:
4735 up_write(&fs_info->subvol_sem);
4736 if (!ret) {
4737 d_invalidate(dentry);
4738 btrfs_prune_dentries(dest);
4739 ASSERT(dest->send_in_progress == 0);
4740 }
4741
4742 return ret;
4743 }
4744
4745 static int btrfs_rmdir(struct inode *vfs_dir, struct dentry *dentry)
4746 {
4747 struct btrfs_inode *dir = BTRFS_I(vfs_dir);
4748 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4749 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4750 int ret = 0;
4751 struct btrfs_trans_handle *trans;
4752 struct fscrypt_name fname;
4753
4754 if (inode->vfs_inode.i_size > BTRFS_EMPTY_DIR_SIZE)
4755 return -ENOTEMPTY;
4756 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4757 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4758 btrfs_err(fs_info,
4759 "extent tree v2 doesn't support snapshot deletion yet");
4760 return -EOPNOTSUPP;
4761 }
4762 return btrfs_delete_subvolume(dir, dentry);
4763 }
4764
4765 ret = fscrypt_setup_filename(vfs_dir, &dentry->d_name, 1, &fname);
4766 if (ret)
4767 return ret;
4768
4769 /* This needs to handle no-key deletions later on */
4770
4771 trans = __unlink_start_trans(dir);
4772 if (IS_ERR(trans)) {
4773 ret = PTR_ERR(trans);
4774 goto out_notrans;
4775 }
4776
4777 /*
4778 * Propagate the last_unlink_trans value of the deleted dir to its
4779 * parent directory. This is to prevent an unrecoverable log tree in the
4780 * case we do something like this:
4781 * 1) create dir foo
4782 * 2) create snapshot under dir foo
4783 * 3) delete the snapshot
4784 * 4) rmdir foo
4785 * 5) mkdir foo
4786 * 6) fsync foo or some file inside foo
4787 *
4788 * This is because we can't unlink other roots when replaying the dir
4789 * deletes for directory foo.
4790 */
4791 if (inode->last_unlink_trans >= trans->transid)
4792 btrfs_record_snapshot_destroy(trans, dir);
4793
4794 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4795 ret = btrfs_unlink_subvol(trans, dir, dentry);
4796 goto out;
4797 }
4798
4799 ret = btrfs_orphan_add(trans, inode);
4800 if (ret)
4801 goto out;
4802
4803 /* now the directory is empty */
4804 ret = btrfs_unlink_inode(trans, dir, inode, &fname.disk_name);
4805 if (!ret)
4806 btrfs_i_size_write(inode, 0);
4807 out:
4808 btrfs_end_transaction(trans);
4809 out_notrans:
4810 btrfs_btree_balance_dirty(fs_info);
4811 fscrypt_free_filename(&fname);
4812
4813 return ret;
4814 }
4815
4816 static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize)
4817 {
4818 ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u",
4819 blockstart, blocksize);
4820
4821 if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1)
4822 return true;
4823 return false;
4824 }
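
/*
 * Example (illustrative): with a 4K block size, is_inside_block(5000, 4096, 4096)
 * returns true because 5000 falls inside the block [4096, 8191], while
 * is_inside_block(8192, 4096, 4096) returns false. btrfs_truncate_block() below
 * uses this to decide whether @offset lands in the head or the tail block of
 * the range being zeroed.
 */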
4825
4826 static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start)
4827 {
4828 const pgoff_t index = (start >> PAGE_SHIFT);
4829 struct address_space *mapping = inode->vfs_inode.i_mapping;
4830 struct folio *folio;
4831 u64 zero_start;
4832 u64 zero_end;
4833 int ret = 0;
4834
4835 again:
4836 folio = filemap_lock_folio(mapping, index);
4837 /* No folio present. */
4838 if (IS_ERR(folio))
4839 return 0;
4840
4841 if (!folio_test_uptodate(folio)) {
4842 ret = btrfs_read_folio(NULL, folio);
4843 folio_lock(folio);
4844 if (folio->mapping != mapping) {
4845 folio_unlock(folio);
4846 folio_put(folio);
4847 goto again;
4848 }
4849 if (unlikely(!folio_test_uptodate(folio))) {
4850 ret = -EIO;
4851 goto out_unlock;
4852 }
4853 }
4854 folio_wait_writeback(folio);
4855
4856 /*
4857 * We do not need to lock extents nor wait for OE, as it's already
4858 * beyond EOF.
4859 */
4860
4861 zero_start = max_t(u64, folio_pos(folio), start);
4862 zero_end = folio_next_pos(folio);
4863 folio_zero_range(folio, zero_start - folio_pos(folio),
4864 zero_end - zero_start);
4865
4866 out_unlock:
4867 folio_unlock(folio);
4868 folio_put(folio);
4869 return ret;
4870 }
4871
4872 /*
4873 * Handle the truncation of a fs block.
4874 *
4875 * @inode - inode that we're zeroing
4876 * @offset - the file offset of the block to truncate
4877 * The value must be inside [@start, @end], and the function will do
4878 * extra checks if the block that covers @offset needs to be zeroed.
4879 * @start - the start file offset of the range we want to zero
4880 * @end - the end (inclusive) file offset of the range we want to zero.
4881 *
4882 * If the range is not block aligned, read out the folio that covers @offset,
4883 * and if needed zero blocks that are inside the folio and covered by [@start, @end].
4884 * If @start or @end + 1 lands inside a block, that block will be marked dirty
4885 * for writeback.
4886 *
4887 * This is utilized by hole punch, zero range, file expansion.
4888 */
4889 int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end)
4890 {
4891 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4892 struct address_space *mapping = inode->vfs_inode.i_mapping;
4893 struct extent_io_tree *io_tree = &inode->io_tree;
4894 struct btrfs_ordered_extent *ordered;
4895 struct extent_state *cached_state = NULL;
4896 struct extent_changeset *data_reserved = NULL;
4897 bool only_release_metadata = false;
4898 u32 blocksize = fs_info->sectorsize;
4899 pgoff_t index = (offset >> PAGE_SHIFT);
4900 struct folio *folio;
4901 gfp_t mask = btrfs_alloc_write_mask(mapping);
4902 int ret = 0;
4903 const bool in_head_block = is_inside_block(offset, round_down(start, blocksize),
4904 blocksize);
4905 const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize),
4906 blocksize);
4907 bool need_truncate_head = false;
4908 bool need_truncate_tail = false;
4909 u64 zero_start;
4910 u64 zero_end;
4911 u64 block_start;
4912 u64 block_end;
4913
4914 /* @offset should be inside the range. */
4915 ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu",
4916 offset, start, end);
4917
4918 /* The range is aligned at both ends. */
4919 if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) {
4920 /*
4921 * For the block size < page size case, we may have polluted blocks
4922 * beyond EOF, so we also need to zero them out.
4923 */
4924 if (end == (u64)-1 && blocksize < PAGE_SIZE)
4925 ret = truncate_block_zero_beyond_eof(inode, start);
4926 goto out;
4927 }
4928
4929 /*
4930 * @offset may be inside neither the head nor the tail block. In that
4931 * case we don't need to do anything.
4932 */
4933 if (!in_head_block && !in_tail_block)
4934 goto out;
4935
4936 /*
4937 * Skip the truncation if the range in the target block is already aligned.
4938 * The seemingly complex checks also handle the case where the head and tail fall in the same block.
4939 */
4940 if (in_head_block && !IS_ALIGNED(start, blocksize))
4941 need_truncate_head = true;
4942 if (in_tail_block && !IS_ALIGNED(end + 1, blocksize))
4943 need_truncate_tail = true;
4944 if (!need_truncate_head && !need_truncate_tail)
4945 goto out;
4946
4947 block_start = round_down(offset, blocksize);
4948 block_end = block_start + blocksize - 1;
4949
4950 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4951 blocksize, false);
4952 if (ret < 0) {
4953 size_t write_bytes = blocksize;
4954
4955 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4956 /* For nocow case, no need to reserve data space. */
4957 ASSERT(write_bytes == blocksize, "write_bytes=%zu blocksize=%u",
4958 write_bytes, blocksize);
4959 only_release_metadata = true;
4960 } else {
4961 goto out;
4962 }
4963 }
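	/*
	 * If only_release_metadata is set at this point the block can be
	 * written NOCOW, so no data space was reserved and only the metadata
	 * reservation below is needed.  The nocow lock taken above is dropped
	 * at the "out" label via btrfs_check_nocow_unlock().
	 */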
4964 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4965 if (ret < 0) {
4966 if (!only_release_metadata)
4967 btrfs_free_reserved_data_space(inode, data_reserved,
4968 block_start, blocksize);
4969 goto out;
4970 }
4971 again:
4972 folio = __filemap_get_folio(mapping, index,
4973 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
4974 if (IS_ERR(folio)) {
4975 if (only_release_metadata)
4976 btrfs_delalloc_release_metadata(inode, blocksize, true);
4977 else
4978 btrfs_delalloc_release_space(inode, data_reserved,
4979 block_start, blocksize, true);
4980 btrfs_delalloc_release_extents(inode, blocksize);
4981 ret = PTR_ERR(folio);
4982 goto out;
4983 }
4984
4985 if (!folio_test_uptodate(folio)) {
4986 ret = btrfs_read_folio(NULL, folio);
4987 folio_lock(folio);
4988 if (folio->mapping != mapping) {
4989 folio_unlock(folio);
4990 folio_put(folio);
4991 goto again;
4992 }
4993 if (unlikely(!folio_test_uptodate(folio))) {
4994 ret = -EIO;
4995 goto out_unlock;
4996 }
4997 }
4998
4999 /*
5000 * We unlock the folio after the I/O is completed and then re-lock it
5001 * above. release_folio() could have come in between that and cleared
5002 * folio private, but left the folio in the mapping. Set the folio mapped
5003 * here to make sure it's properly set for the subpage stuff.
5004 */
5005 ret = set_folio_extent_mapped(folio);
5006 if (ret < 0)
5007 goto out_unlock;
5008
5009 folio_wait_writeback(folio);
5010
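	/*
	 * Lock the extent range and check for a pending ordered extent
	 * covering this block.  If one exists, drop the locks, wait for it to
	 * complete and retry from the top, so the zeroing below does not run
	 * while ordered I/O for this range is still in flight.
	 */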
5011 btrfs_lock_extent(io_tree, block_start, block_end, &cached_state);
5012
5013 ordered = btrfs_lookup_ordered_extent(inode, block_start);
5014 if (ordered) {
5015 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
5016 folio_unlock(folio);
5017 folio_put(folio);
5018 btrfs_start_ordered_extent(ordered);
5019 btrfs_put_ordered_extent(ordered);
5020 goto again;
5021 }
5022
5023 btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end,
5024 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
5025 &cached_state);
5026
5027 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
5028 &cached_state);
5029 if (ret) {
5030 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
5031 goto out_unlock;
5032 }
5033
5034 if (end == (u64)-1) {
5035 /*
5036 * We're truncating beyond EOF. The remaining blocks are normally
5037 * already holes, so there is no need to zero them again, but for
5038 * fs block size < page size cases memory mapped writes can
5039 * pollute ranges beyond EOF.
5040 *
5041 * In that case, although such polluted blocks beyond EOF will
5042 * never reach disk, they still affect our page cache.
5043 */
5044 zero_start = max_t(u64, folio_pos(folio), start);
5045 zero_end = min_t(u64, folio_next_pos(folio) - 1, end);
5046 } else {
5047 zero_start = max_t(u64, block_start, start);
5048 zero_end = min_t(u64, block_end, end);
5049 }
5050 folio_zero_range(folio, zero_start - folio_pos(folio),
5051 zero_end - zero_start + 1);
5052
5053 btrfs_folio_clear_checked(fs_info, folio, block_start,
5054 block_end + 1 - block_start);
5055 btrfs_folio_set_dirty(fs_info, folio, block_start,
5056 block_end + 1 - block_start);
5057
5058 if (only_release_metadata)
5059 btrfs_set_extent_bit(&inode->io_tree, block_start, block_end,
5060 EXTENT_NORESERVE, &cached_state);
5061
5062 btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state);
5063
5064 out_unlock:
5065 if (ret) {
5066 if (only_release_metadata)
5067 btrfs_delalloc_release_metadata(inode, blocksize, true);
5068 else
5069 btrfs_delalloc_release_space(inode, data_reserved,
5070 block_start, blocksize, true);
5071 }
5072 btrfs_delalloc_release_extents(inode, blocksize);
5073 folio_unlock(folio);
5074 folio_put(folio);
5075 out:
5076 if (only_release_metadata)
5077 btrfs_check_nocow_unlock(inode);
5078 extent_changeset_free(data_reserved);
5079 return ret;
5080 }
5081
5082 static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
5083 {
5084 struct btrfs_root *root = inode->root;
5085 struct btrfs_fs_info *fs_info = root->fs_info;
5086 struct btrfs_trans_handle *trans;
5087 struct btrfs_drop_extents_args drop_args = { 0 };
5088 int ret;
5089
5090 /*
5091 * If NO_HOLES is enabled, we don't need to do anything.
5092 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
5093 * or btrfs_update_inode() will be called, which guarantees that the next
5094 * fsync will know this inode was changed and needs to be logged.
5095 */
5096 if (btrfs_fs_incompat(fs_info, NO_HOLES))
5097 return 0;
5098
5099 /*
5100 * 1 - for the one we're dropping
5101 * 1 - for the one we're adding
5102 * 1 - for updating the inode.
5103 */
5104 trans = btrfs_start_transaction(root, 3);
5105 if (IS_ERR(trans))
5106 return PTR_ERR(trans);
5107
5108 drop_args.start = offset;
5109 drop_args.end = offset + len;
5110 drop_args.drop_cache = true;
5111
5112 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
5113 if (unlikely(ret)) {
5114 btrfs_abort_transaction(trans, ret);
5115 btrfs_end_transaction(trans);
5116 return ret;
5117 }
5118
5119 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
5120 if (ret) {
5121 btrfs_abort_transaction(trans, ret);
5122 } else {
5123 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
5124 btrfs_update_inode(trans, inode);
5125 }
5126 btrfs_end_transaction(trans);
5127 return ret;
5128 }
5129
5130 /*
5131 * This function puts in dummy file extents for the area we're creating a hole
5132 * for. So if we are truncating this file to a larger size we need to insert
5133 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
5134 * the range between oldsize and size.
5135 */
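/*
 * In rough terms: first zero the tail of the old last block with
 * btrfs_truncate_block(), then walk the extent maps from the block aligned
 * old size up to the block aligned new size.  Every range that is not a
 * preallocated extent gets a hole file extent item (unless the NO_HOLES
 * feature makes that unnecessary, see maybe_insert_hole()) plus a matching
 * EXTENT_MAP_HOLE extent map in memory.
 */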
5136 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
5137 {
5138 struct btrfs_root *root = inode->root;
5139 struct btrfs_fs_info *fs_info = root->fs_info;
5140 struct extent_io_tree *io_tree = &inode->io_tree;
5141 struct extent_map *em = NULL;
5142 struct extent_state *cached_state = NULL;
5143 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
5144 u64 block_end = ALIGN(size, fs_info->sectorsize);
5145 u64 last_byte;
5146 u64 cur_offset;
5147 u64 hole_size;
5148 int ret = 0;
5149
5150 /*
5151 * If our size started in the middle of a block we need to zero out the
5152 * rest of the block before we expand the i_size, otherwise we could
5153 * expose stale data.
5154 */
5155 ret = btrfs_truncate_block(inode, oldsize, oldsize, -1);
5156 if (ret)
5157 return ret;
5158
5159 if (size <= hole_start)
5160 return 0;
5161
5162 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
5163 &cached_state);
5164 cur_offset = hole_start;
5165 while (1) {
5166 em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
5167 if (IS_ERR(em)) {
5168 ret = PTR_ERR(em);
5169 em = NULL;
5170 break;
5171 }
5172 last_byte = min(btrfs_extent_map_end(em), block_end);
5173 last_byte = ALIGN(last_byte, fs_info->sectorsize);
5174 hole_size = last_byte - cur_offset;
5175
5176 if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
5177 struct extent_map *hole_em;
5178
5179 ret = maybe_insert_hole(inode, cur_offset, hole_size);
5180 if (ret)
5181 break;
5182
5183 ret = btrfs_inode_set_file_extent_range(inode,
5184 cur_offset, hole_size);
5185 if (ret)
5186 break;
5187
5188 hole_em = btrfs_alloc_extent_map();
5189 if (!hole_em) {
5190 btrfs_drop_extent_map_range(inode, cur_offset,
5191 cur_offset + hole_size - 1,
5192 false);
5193 btrfs_set_inode_full_sync(inode);
5194 goto next;
5195 }
5196 hole_em->start = cur_offset;
5197 hole_em->len = hole_size;
5198
5199 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
5200 hole_em->disk_num_bytes = 0;
5201 hole_em->ram_bytes = hole_size;
5202 hole_em->generation = btrfs_get_fs_generation(fs_info);
5203
5204 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
5205 btrfs_free_extent_map(hole_em);
5206 } else {
5207 ret = btrfs_inode_set_file_extent_range(inode,
5208 cur_offset, hole_size);
5209 if (ret)
5210 break;
5211 }
5212 next:
5213 btrfs_free_extent_map(em);
5214 em = NULL;
5215 cur_offset = last_byte;
5216 if (cur_offset >= block_end)
5217 break;
5218 }
5219 btrfs_free_extent_map(em);
5220 btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
5221 return ret;
5222 }
5223
5224 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5225 {
5226 struct btrfs_root *root = BTRFS_I(inode)->root;
5227 struct btrfs_trans_handle *trans;
5228 loff_t oldsize = i_size_read(inode);
5229 loff_t newsize = attr->ia_size;
5230 int mask = attr->ia_valid;
5231 int ret;
5232
5233 /*
5234 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5235 * special case where we need to update the times despite not having
5236 * these flags set. For all other operations the VFS set these flags
5237 * explicitly if it wants a timestamp update.
5238 */
5239 if (newsize != oldsize) {
5240 inode_inc_iversion(inode);
5241 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
5242 inode_set_mtime_to_ts(inode,
5243 inode_set_ctime_current(inode));
5244 }
5245 }
5246
5247 if (newsize > oldsize) {
5248 /*
5249 * Don't do an expanding truncate while snapshotting is ongoing.
5250 * This is to ensure the snapshot captures a fully consistent
5251 * state of this file - if the snapshot captures this expanding
5252 * truncation, it must capture all writes that happened before
5253 * this truncation.
5254 */
5255 btrfs_drew_write_lock(&root->snapshot_lock);
5256 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5257 if (ret) {
5258 btrfs_drew_write_unlock(&root->snapshot_lock);
5259 return ret;
5260 }
5261
5262 trans = btrfs_start_transaction(root, 1);
5263 if (IS_ERR(trans)) {
5264 btrfs_drew_write_unlock(&root->snapshot_lock);
5265 return PTR_ERR(trans);
5266 }
5267
5268 i_size_write(inode, newsize);
5269 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5270 pagecache_isize_extended(inode, oldsize, newsize);
5271 ret = btrfs_update_inode(trans, BTRFS_I(inode));
5272 btrfs_drew_write_unlock(&root->snapshot_lock);
5273 btrfs_end_transaction(trans);
5274 } else {
5275 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
5276
5277 if (btrfs_is_zoned(fs_info)) {
5278 ret = btrfs_wait_ordered_range(BTRFS_I(inode),
5279 ALIGN(newsize, fs_info->sectorsize),
5280 (u64)-1);
5281 if (ret)
5282 return ret;
5283 }
5284
5285 /*
5286 * We're truncating a file that used to have good data down to
5287 * zero. Make sure any new writes to the file get on disk
5288 * on close.
5289 */
5290 if (newsize == 0)
5291 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5292 &BTRFS_I(inode)->runtime_flags);
5293
5294 truncate_setsize(inode, newsize);
5295
5296 inode_dio_wait(inode);
5297
5298 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5299 if (ret && inode->i_nlink) {
5300 int ret2;
5301
5302 /*
5303 * Truncate failed, so fix up the in-memory size. We
5304 * adjusted disk_i_size down as we removed extents, so
5305 * wait for disk_i_size to be stable and then update the
5306 * in-memory size to match.
5307 */
5308 ret2 = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
5309 if (ret2)
5310 return ret2;
5311 i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5312 }
5313 }
5314
5315 return ret;
5316 }
5317
5318 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5319 struct iattr *attr)
5320 {
5321 struct inode *inode = d_inode(dentry);
5322 struct btrfs_root *root = BTRFS_I(inode)->root;
5323 int ret;
5324
5325 if (btrfs_root_readonly(root))
5326 return -EROFS;
5327
5328 ret = setattr_prepare(idmap, dentry, attr);
5329 if (ret)
5330 return ret;
5331
5332 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5333 ret = btrfs_setsize(inode, attr);
5334 if (ret)
5335 return ret;
5336 }
5337
5338 if (attr->ia_valid) {
5339 setattr_copy(idmap, inode, attr);
5340 inode_inc_iversion(inode);
5341 ret = btrfs_dirty_inode(BTRFS_I(inode));
5342
5343 if (!ret && attr->ia_valid & ATTR_MODE)
5344 ret = posix_acl_chmod(idmap, dentry, inode->i_mode);
5345 }
5346
5347 return ret;
5348 }
5349
5350 /*
5351 * While truncating the inode pages during eviction, we get the VFS
5352 * calling btrfs_invalidate_folio() against each folio of the inode. This
5353 * is slow because the calls to btrfs_invalidate_folio() result in a
5354 * huge amount of calls to lock_extent() and clear_extent_bit(),
5355 * which keep merging and splitting extent_state structures over and over,
5356 * wasting lots of time.
5357 *
5358 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5359 * skip all those expensive operations on a per folio basis and do only
5360 * the ordered io finishing, while we release here the extent_map and
5361 * extent_state structures, without the excessive merging and splitting.
5362 */
5363 static void evict_inode_truncate_pages(struct inode *inode)
5364 {
5365 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5366 struct rb_node *node;
5367
5368 ASSERT(inode_state_read_once(inode) & I_FREEING);
5369 truncate_inode_pages_final(&inode->i_data);
5370
5371 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5372
5373 /*
5374 * Keep looping until we have no more ranges in the io tree.
5375 * We can have ongoing bios started by readahead that have
5376 * their endio callback (extent_io.c:end_bio_extent_readpage)
5377 * still in progress (they unlocked the pages in the bio but did not yet
5378 * unlock the ranges in the io tree). This means some ranges can still
5379 * be locked while eviction has started, because before submitting those
5380 * bios, which are executed by a separate task (work queue kthread),
5381 * inode references (inode->i_count) were not taken (they would be
5382 * dropped in the end io callback of each bio).
5383 * Therefore here we effectively end up waiting for those bios and for
5384 * anyone else holding locked ranges without having bumped the inode's
5385 * reference count - if we don't do it, when they access the inode's
5386 * io_tree to unlock a range it may be too late, leading to a
5387 * use-after-free issue.
5388 */
5389 spin_lock(&io_tree->lock);
5390 while (!RB_EMPTY_ROOT(&io_tree->state)) {
5391 struct extent_state *state;
5392 struct extent_state *cached_state = NULL;
5393 u64 start;
5394 u64 end;
5395 unsigned state_flags;
5396
5397 node = rb_first(&io_tree->state);
5398 state = rb_entry(node, struct extent_state, rb_node);
5399 start = state->start;
5400 end = state->end;
5401 state_flags = state->state;
5402 spin_unlock(&io_tree->lock);
5403
5404 btrfs_lock_extent(io_tree, start, end, &cached_state);
5405
5406 /*
5407 * If the range still has the DELALLOC flag, the extent didn't reach disk,
5408 * and its reserved space won't be freed by delayed_ref.
5409 * So we need to free its reserved space here.
5410 * (Refer to comment in btrfs_invalidate_folio, case 2)
5411 *
5412 * Note, end is the bytenr of last byte, so we need + 1 here.
5413 */
5414 if (state_flags & EXTENT_DELALLOC)
5415 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5416 end - start + 1, NULL);
5417
5418 btrfs_clear_extent_bit(io_tree, start, end,
5419 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5420 &cached_state);
5421
5422 cond_resched();
5423 spin_lock(&io_tree->lock);
5424 }
5425 spin_unlock(&io_tree->lock);
5426 }
5427
5428 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5429 struct btrfs_block_rsv *rsv)
5430 {
5431 struct btrfs_fs_info *fs_info = root->fs_info;
5432 struct btrfs_trans_handle *trans;
5433 u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5434 int ret;
5435
5436 /*
5437 * Eviction should be taking place at some place safe because of our
5438 * delayed iputs. However the normal flushing code will run delayed
5439 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5440 *
5441 * We reserve the delayed_refs_extra here again because we can't use
5442 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5443 * above. We reserve our extra bit here because we generate a ton of
5444 * delayed refs activity by truncating.
5445 *
5446 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5447 * if we fail to make this reservation we can re-try without the
5448 * delayed_refs_extra so we can make some forward progress.
5449 */
5450 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5451 BTRFS_RESERVE_FLUSH_EVICT);
5452 if (ret) {
5453 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5454 BTRFS_RESERVE_FLUSH_EVICT);
5455 if (ret) {
5456 btrfs_warn(fs_info,
5457 "could not allocate space for delete; will truncate on mount");
5458 return ERR_PTR(-ENOSPC);
5459 }
5460 delayed_refs_extra = 0;
5461 }
5462
5463 trans = btrfs_join_transaction(root);
5464 if (IS_ERR(trans))
5465 return trans;
5466
5467 if (delayed_refs_extra) {
5468 trans->block_rsv = &fs_info->trans_block_rsv;
5469 trans->bytes_reserved = delayed_refs_extra;
5470 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5471 delayed_refs_extra, true);
5472 }
5473 return trans;
5474 }
5475
5476 void btrfs_evict_inode(struct inode *inode)
5477 {
5478 struct btrfs_fs_info *fs_info;
5479 struct btrfs_trans_handle *trans;
5480 struct btrfs_root *root = BTRFS_I(inode)->root;
5481 struct btrfs_block_rsv rsv;
5482 int ret;
5483
5484 trace_btrfs_inode_evict(inode);
5485
5486 if (!root) {
5487 fsverity_cleanup_inode(inode);
5488 clear_inode(inode);
5489 return;
5490 }
5491
5492 fs_info = inode_to_fs_info(inode);
5493 evict_inode_truncate_pages(inode);
5494
5495 if (inode->i_nlink &&
5496 ((btrfs_root_refs(&root->root_item) != 0 &&
5497 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
5498 btrfs_is_free_space_inode(BTRFS_I(inode))))
5499 goto out;
5500
5501 if (is_bad_inode(inode))
5502 goto out;
5503
5504 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5505 goto out;
5506
5507 if (inode->i_nlink > 0) {
5508 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5509 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
5510 goto out;
5511 }
5512
5513 /*
5514 * This makes sure the inode item in the tree is up to date and the space for
5515 * the inode update is released.
5516 */
5517 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5518 if (ret)
5519 goto out;
5520
5521 /*
5522 * This drops any pending insert or delete operations we have for this
5523 * inode. We could have a delayed dir index deletion queued up, but
5524 * we're removing the inode completely so that'll be taken care of in
5525 * the truncate.
5526 */
5527 btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5528
5529 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
5530 rsv.size = btrfs_calc_metadata_size(fs_info, 1);
5531 rsv.failfast = true;
5532
5533 btrfs_i_size_write(BTRFS_I(inode), 0);
5534
5535 while (1) {
5536 struct btrfs_truncate_control control = {
5537 .inode = BTRFS_I(inode),
5538 .ino = btrfs_ino(BTRFS_I(inode)),
5539 .new_size = 0,
5540 .min_type = 0,
5541 };
5542
5543 trans = evict_refill_and_join(root, &rsv);
5544 if (IS_ERR(trans))
5545 goto out_release;
5546
5547 trans->block_rsv = &rsv;
5548
5549 ret = btrfs_truncate_inode_items(trans, root, &control);
5550 trans->block_rsv = &fs_info->trans_block_rsv;
5551 btrfs_end_transaction(trans);
5552 /*
5553 * We have not added new delayed items for our inode after we
5554 * have flushed its delayed items, so no need to throttle on
5555 * delayed items. However we have modified extent buffers.
5556 */
5557 btrfs_btree_balance_dirty_nodelay(fs_info);
5558 if (ret && ret != -ENOSPC && ret != -EAGAIN)
5559 goto out_release;
5560 else if (!ret)
5561 break;
5562 }
5563
5564 /*
5565 * Errors here aren't a big deal, it just means we leave orphan items in
5566 * the tree. They will be cleaned up on the next mount. If the inode
5567 * number gets reused, cleanup deletes the orphan item without doing
5568 * anything, and unlink reuses the existing orphan item.
5569 *
5570 * If it turns out that we are dropping too many of these, we might want
5571 * to add a mechanism for retrying these after a commit.
5572 */
5573 trans = evict_refill_and_join(root, &rsv);
5574 if (!IS_ERR(trans)) {
5575 trans->block_rsv = &rsv;
5576 btrfs_orphan_del(trans, BTRFS_I(inode));
5577 trans->block_rsv = &fs_info->trans_block_rsv;
5578 btrfs_end_transaction(trans);
5579 }
5580
5581 out_release:
5582 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
5583 out:
5584 /*
5585 * If we didn't successfully delete, the orphan item will still be in
5586 * the tree and we'll retry on the next mount. Again, we might also want
5587 * to retry these periodically in the future.
5588 */
5589 btrfs_remove_delayed_node(BTRFS_I(inode));
5590 fsverity_cleanup_inode(inode);
5591 clear_inode(inode);
5592 }
5593
5594 /*
5595 * Return the key found in the dir entry in the location pointer, fill @type
5596 * with BTRFS_FT_*, and return 0.
5597 *
5598 * If no dir entries were found, returns -ENOENT.
5599 * If a corrupted location is found in the dir entry, returns -EUCLEAN.
5600 */
5601 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5602 struct btrfs_key *location, u8 *type)
5603 {
5604 struct btrfs_dir_item *di;
5605 BTRFS_PATH_AUTO_FREE(path);
5606 struct btrfs_root *root = dir->root;
5607 int ret = 0;
5608 struct fscrypt_name fname;
5609
5610 path = btrfs_alloc_path();
5611 if (!path)
5612 return -ENOMEM;
5613
5614 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5615 if (ret < 0)
5616 return ret;
5617 /*
5618 * fscrypt_setup_filename() should never return a positive value, but
5619 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5620 */
5621 ASSERT(ret == 0);
5622
5623 /* This needs to handle no-key deletions later on */
5624
5625 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5626 &fname.disk_name, 0);
5627 if (IS_ERR_OR_NULL(di)) {
5628 ret = di ? PTR_ERR(di) : -ENOENT;
5629 goto out;
5630 }
5631
5632 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5633 if (unlikely(location->type != BTRFS_INODE_ITEM_KEY &&
5634 location->type != BTRFS_ROOT_ITEM_KEY)) {
5635 ret = -EUCLEAN;
5636 btrfs_warn(root->fs_info,
5637 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5638 __func__, fname.disk_name.name, btrfs_ino(dir),
5639 location->objectid, location->type, location->offset);
5640 }
5641 if (!ret)
5642 *type = btrfs_dir_ftype(path->nodes[0], di);
5643 out:
5644 fscrypt_free_filename(&fname);
5645 return ret;
5646 }
5647
5648 /*
5649 * when we hit a tree root in a directory, the btrfs part of the inode
5650 * needs to be changed to reflect the root directory of the tree root. This
5651 * is kind of like crossing a mount point.
5652 */
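/*
 * Roughly: look up the ROOT_REF item (parent root id, BTRFS_ROOT_REF_KEY,
 * subvolume id) in the root tree, verify that its dirid and name match the
 * directory and dentry we came from, then grab the referenced root and
 * rewrite @location to point at that root's top level directory inode.
 */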
5653 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5654 struct btrfs_inode *dir,
5655 struct dentry *dentry,
5656 struct btrfs_key *location,
5657 struct btrfs_root **sub_root)
5658 {
5659 BTRFS_PATH_AUTO_FREE(path);
5660 struct btrfs_root *new_root;
5661 struct btrfs_root_ref *ref;
5662 struct extent_buffer *leaf;
5663 struct btrfs_key key;
5664 int ret;
5665 int err = 0;
5666 struct fscrypt_name fname;
5667
5668 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5669 if (ret)
5670 return ret;
5671
5672 path = btrfs_alloc_path();
5673 if (!path) {
5674 err = -ENOMEM;
5675 goto out;
5676 }
5677
5678 err = -ENOENT;
5679 key.objectid = btrfs_root_id(dir->root);
5680 key.type = BTRFS_ROOT_REF_KEY;
5681 key.offset = location->objectid;
5682
5683 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5684 if (ret) {
5685 if (ret < 0)
5686 err = ret;
5687 goto out;
5688 }
5689
5690 leaf = path->nodes[0];
5691 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5692 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5693 btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5694 goto out;
5695
5696 ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5697 (unsigned long)(ref + 1), fname.disk_name.len);
5698 if (ret)
5699 goto out;
5700
5701 btrfs_release_path(path);
5702
5703 new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5704 if (IS_ERR(new_root)) {
5705 err = PTR_ERR(new_root);
5706 goto out;
5707 }
5708
5709 *sub_root = new_root;
5710 location->objectid = btrfs_root_dirid(&new_root->root_item);
5711 location->type = BTRFS_INODE_ITEM_KEY;
5712 location->offset = 0;
5713 err = 0;
5714 out:
5715 fscrypt_free_filename(&fname);
5716 return err;
5717 }
5718
5719
5720
5721 static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
5722 {
5723 struct btrfs_root *root = inode->root;
5724 struct btrfs_inode *entry;
5725 bool empty = false;
5726
5727 xa_lock(&root->inodes);
5728 /*
5729 * This btrfs_inode is being freed and has already been unhashed at this
5730 * point. It's possible that another btrfs_inode has already been
5731 * allocated for the same inode and inserted itself into the root, so
5732 * don't delete it in that case.
5733 *
5734 * Note that this shouldn't need to allocate memory, so the gfp flags
5735 * don't really matter.
5736 */
5737 entry = __xa_cmpxchg(&root->inodes, btrfs_ino(inode), inode, NULL,
5738 GFP_ATOMIC);
5739 if (entry == inode)
5740 empty = xa_empty(&root->inodes);
5741 xa_unlock(&root->inodes);
5742
5743 if (empty && btrfs_root_refs(&root->root_item) == 0) {
5744 xa_lock(&root->inodes);
5745 empty = xa_empty(&root->inodes);
5746 xa_unlock(&root->inodes);
5747 if (empty)
5748 btrfs_add_dead_root(root);
5749 }
5750 }
5751
5752
5753 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5754 {
5755 struct btrfs_iget_args *args = p;
5756
5757 btrfs_set_inode_number(BTRFS_I(inode), args->ino);
5758 BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5759
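	/*
	 * Apart from the btree inode, the only inodes that live in the tree
	 * root are the free space cache inodes, hence the flag set below.
	 */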
5760 if (args->root && args->root == args->root->fs_info->tree_root &&
5761 args->ino != BTRFS_BTREE_INODE_OBJECTID)
5762 set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5763 &BTRFS_I(inode)->runtime_flags);
5764 return 0;
5765 }
5766
5767 static int btrfs_find_actor(struct inode *inode, void *opaque)
5768 {
5769 struct btrfs_iget_args *args = opaque;
5770
5771 return args->ino == btrfs_ino(BTRFS_I(inode)) &&
5772 args->root == BTRFS_I(inode)->root;
5773 }
5774
5775 static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
5776 {
5777 struct inode *inode;
5778 struct btrfs_iget_args args;
5779 unsigned long hashval = btrfs_inode_hash(ino, root);
5780
5781 args.ino = ino;
5782 args.root = root;
5783
5784 inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
5785 btrfs_init_locked_inode,
5786 (void *)&args);
5787 if (!inode)
5788 return NULL;
5789 return BTRFS_I(inode);
5790 }
5791
5792 /*
5793 * Get an inode object given its inode number and corresponding root. Path is
5794 * preallocated to prevent recursing back to iget through allocator.
5795 */
5796 struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
5797 struct btrfs_path *path)
5798 {
5799 struct btrfs_inode *inode;
5800 int ret;
5801
5802 inode = btrfs_iget_locked(ino, root);
5803 if (!inode)
5804 return ERR_PTR(-ENOMEM);
5805
5806 if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
5807 return inode;
5808
5809 ret = btrfs_read_locked_inode(inode, path);
5810 if (ret)
5811 return ERR_PTR(ret);
5812
5813 unlock_new_inode(&inode->vfs_inode);
5814 return inode;
5815 }
5816
5817 /*
5818 * Get an inode object given its inode number and corresponding root.
5819 */
5820 struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
5821 {
5822 struct btrfs_inode *inode;
5823 struct btrfs_path *path;
5824 int ret;
5825
5826 inode = btrfs_iget_locked(ino, root);
5827 if (!inode)
5828 return ERR_PTR(-ENOMEM);
5829
5830 if (!(inode_state_read_once(&inode->vfs_inode) & I_NEW))
5831 return inode;
5832
5833 path = btrfs_alloc_path();
5834 if (!path) {
5835 iget_failed(&inode->vfs_inode);
5836 return ERR_PTR(-ENOMEM);
5837 }
5838
5839 ret = btrfs_read_locked_inode(inode, path);
5840 btrfs_free_path(path);
5841 if (ret)
5842 return ERR_PTR(ret);
5843
5844 if (S_ISDIR(inode->vfs_inode.i_mode))
5845 inode->vfs_inode.i_opflags |= IOP_FASTPERM_MAY_EXEC;
5846 unlock_new_inode(&inode->vfs_inode);
5847 return inode;
5848 }
5849
5850 static struct btrfs_inode *new_simple_dir(struct inode *dir,
5851 struct btrfs_key *key,
5852 struct btrfs_root *root)
5853 {
5854 struct timespec64 ts;
5855 struct inode *vfs_inode;
5856 struct btrfs_inode *inode;
5857
5858 vfs_inode = new_inode(dir->i_sb);
5859 if (!vfs_inode)
5860 return ERR_PTR(-ENOMEM);
5861
5862 inode = BTRFS_I(vfs_inode);
5863 inode->root = btrfs_grab_root(root);
5864 inode->ref_root_id = key->objectid;
5865 set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
5866 set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
5867
5868 btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
5869 /*
5870 * We only need lookup, the rest is read-only and there's no inode
5871 * associated with the dentry
5872 */
5873 vfs_inode->i_op = &simple_dir_inode_operations;
5874 vfs_inode->i_opflags &= ~IOP_XATTR;
5875 vfs_inode->i_fop = &simple_dir_operations;
5876 vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5877
5878 ts = inode_set_ctime_current(vfs_inode);
5879 inode_set_mtime_to_ts(vfs_inode, ts);
5880 inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
5881 inode->i_otime_sec = ts.tv_sec;
5882 inode->i_otime_nsec = ts.tv_nsec;
5883
5884 vfs_inode->i_uid = dir->i_uid;
5885 vfs_inode->i_gid = dir->i_gid;
5886
5887 return inode;
5888 }
5889
5890 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5891 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5892 static_assert(BTRFS_FT_DIR == FT_DIR);
5893 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5894 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5895 static_assert(BTRFS_FT_FIFO == FT_FIFO);
5896 static_assert(BTRFS_FT_SOCK == FT_SOCK);
5897 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5898
5899 static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
5900 {
5901 return fs_umode_to_ftype(inode->vfs_inode.i_mode);
5902 }
5903
5904 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5905 {
5906 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
5907 struct btrfs_inode *inode;
5908 struct btrfs_root *root = BTRFS_I(dir)->root;
5909 struct btrfs_root *sub_root = root;
5910 struct btrfs_key location = { 0 };
5911 u8 di_type = 0;
5912 int ret = 0;
5913
5914 if (dentry->d_name.len > BTRFS_NAME_LEN)
5915 return ERR_PTR(-ENAMETOOLONG);
5916
5917 ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5918 if (ret < 0)
5919 return ERR_PTR(ret);
5920
5921 if (location.type == BTRFS_INODE_ITEM_KEY) {
5922 inode = btrfs_iget(location.objectid, root);
5923 if (IS_ERR(inode))
5924 return ERR_CAST(inode);
5925
5926 /* Do extra check against inode mode with di_type */
5927 if (unlikely(btrfs_inode_type(inode) != di_type)) {
5928 btrfs_crit(fs_info,
5929 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5930 inode->vfs_inode.i_mode, btrfs_inode_type(inode),
5931 di_type);
5932 iput(&inode->vfs_inode);
5933 return ERR_PTR(-EUCLEAN);
5934 }
5935 return &inode->vfs_inode;
5936 }
5937
5938 ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5939 &location, &sub_root);
5940 if (ret < 0) {
5941 if (ret != -ENOENT)
5942 inode = ERR_PTR(ret);
5943 else
5944 inode = new_simple_dir(dir, &location, root);
5945 } else {
5946 inode = btrfs_iget(location.objectid, sub_root);
5947 btrfs_put_root(sub_root);
5948
5949 if (IS_ERR(inode))
5950 return ERR_CAST(inode);
5951
5952 down_read(&fs_info->cleanup_work_sem);
5953 if (!sb_rdonly(inode->vfs_inode.i_sb))
5954 ret = btrfs_orphan_cleanup(sub_root);
5955 up_read(&fs_info->cleanup_work_sem);
5956 if (ret) {
5957 iput(&inode->vfs_inode);
5958 inode = ERR_PTR(ret);
5959 }
5960 }
5961
5962 if (IS_ERR(inode))
5963 return ERR_CAST(inode);
5964
5965 return &inode->vfs_inode;
5966 }
5967
5968 static int btrfs_dentry_delete(const struct dentry *dentry)
5969 {
5970 struct btrfs_root *root;
5971 struct inode *inode = d_inode(dentry);
5972
5973 if (!inode && !IS_ROOT(dentry))
5974 inode = d_inode(dentry->d_parent);
5975
5976 if (inode) {
5977 root = BTRFS_I(inode)->root;
5978 if (btrfs_root_refs(&root->root_item) == 0)
5979 return 1;
5980
5981 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5982 return 1;
5983 }
5984 return 0;
5985 }
5986
5987 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5988 unsigned int flags)
5989 {
5990 struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5991
5992 if (inode == ERR_PTR(-ENOENT))
5993 inode = NULL;
5994 return d_splice_alias(inode, dentry);
5995 }
5996
5997 /*
5998 * Find the highest existing sequence number in a directory and then set the
5999 * in-memory index_cnt variable to the first free sequence number.
6000 */
6001 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
6002 {
6003 struct btrfs_root *root = inode->root;
6004 struct btrfs_key key, found_key;
6005 BTRFS_PATH_AUTO_FREE(path);
6006 struct extent_buffer *leaf;
6007 int ret;
6008
6009 key.objectid = btrfs_ino(inode);
6010 key.type = BTRFS_DIR_INDEX_KEY;
6011 key.offset = (u64)-1;
6012
6013 path = btrfs_alloc_path();
6014 if (!path)
6015 return -ENOMEM;
6016
6017 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
6018 if (ret < 0)
6019 return ret;
6020 /* FIXME: we should be able to handle this */
6021 if (ret == 0)
6022 return ret;
6023
6024 if (path->slots[0] == 0) {
6025 inode->index_cnt = BTRFS_DIR_START_INDEX;
6026 return 0;
6027 }
6028
6029 path->slots[0]--;
6030
6031 leaf = path->nodes[0];
6032 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6033
6034 if (found_key.objectid != btrfs_ino(inode) ||
6035 found_key.type != BTRFS_DIR_INDEX_KEY) {
6036 inode->index_cnt = BTRFS_DIR_START_INDEX;
6037 return 0;
6038 }
6039
6040 inode->index_cnt = found_key.offset + 1;
6041
6042 return 0;
6043 }
6044
6045 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
6046 {
6047 int ret = 0;
6048
6049 btrfs_inode_lock(dir, 0);
6050 if (dir->index_cnt == (u64)-1) {
6051 ret = btrfs_inode_delayed_dir_index_count(dir);
6052 if (ret) {
6053 ret = btrfs_set_inode_index_count(dir);
6054 if (ret)
6055 goto out;
6056 }
6057 }
6058
6059 /* index_cnt is the index number of the next new entry, so decrement it. */
6060 *index = dir->index_cnt - 1;
6061 out:
6062 btrfs_inode_unlock(dir, 0);
6063
6064 return ret;
6065 }
6066
6067 /*
6068 * All this infrastructure exists because dir_emit can fault, and we are holding
6069 * the tree lock when doing readdir. For now just allocate a buffer and copy
6070 * our information into that, and then dir_emit from the buffer. This is
6071 * similar to what NFS does, only we don't keep the buffer around in pagecache
6072 * because I'm afraid I'll mess that up. Long term we need to make filldir do
6073 * copy_to_user_inatomic so we don't have to worry about page faulting under the
6074 * tree lock.
6075 */
6076 static int btrfs_opendir(struct inode *inode, struct file *file)
6077 {
6078 struct btrfs_file_private *private;
6079 u64 last_index;
6080 int ret;
6081
6082 ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
6083 if (ret)
6084 return ret;
6085
6086 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
6087 if (!private)
6088 return -ENOMEM;
6089 private->last_index = last_index;
6090 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
6091 if (!private->filldir_buf) {
6092 kfree(private);
6093 return -ENOMEM;
6094 }
6095 file->private_data = private;
6096 return 0;
6097 }
6098
6099 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
6100 {
6101 struct btrfs_file_private *private = file->private_data;
6102 int ret;
6103
6104 ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
6105 &private->last_index);
6106 if (ret)
6107 return ret;
6108
6109 return generic_file_llseek(file, offset, whence);
6110 }
6111
6112 struct dir_entry {
6113 u64 ino;
6114 u64 offset;
6115 unsigned type;
6116 int name_len;
6117 };
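/*
 * Entries are packed back to back into the PAGE_SIZE filldir_buf: each
 * struct dir_entry header is immediately followed by its (unterminated)
 * name bytes, so the next entry starts at
 * addr + sizeof(struct dir_entry) + name_len.  For example, an entry for
 * the 5 byte name "hello" consumes sizeof(struct dir_entry) + 5 bytes.
 */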
6118
6119 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
6120 {
6121 while (entries--) {
6122 struct dir_entry *entry = addr;
6123 char *name = (char *)(entry + 1);
6124
6125 ctx->pos = get_unaligned(&entry->offset);
6126 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
6127 get_unaligned(&entry->ino),
6128 get_unaligned(&entry->type)))
6129 return 1;
6130 addr += sizeof(struct dir_entry) +
6131 get_unaligned(&entry->name_len);
6132 ctx->pos++;
6133 }
6134 return 0;
6135 }
6136
6137 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
6138 {
6139 struct inode *inode = file_inode(file);
6140 struct btrfs_root *root = BTRFS_I(inode)->root;
6141 struct btrfs_file_private *private = file->private_data;
6142 struct btrfs_dir_item *di;
6143 struct btrfs_key key;
6144 struct btrfs_key found_key;
6145 BTRFS_PATH_AUTO_FREE(path);
6146 void *addr;
6147 LIST_HEAD(ins_list);
6148 LIST_HEAD(del_list);
6149 int ret;
6150 char *name_ptr;
6151 int name_len;
6152 int entries = 0;
6153 int total_len = 0;
6154 bool put = false;
6155 struct btrfs_key location;
6156
6157 if (!dir_emit_dots(file, ctx))
6158 return 0;
6159
6160 path = btrfs_alloc_path();
6161 if (!path)
6162 return -ENOMEM;
6163
6164 addr = private->filldir_buf;
6165 path->reada = READA_FORWARD;
6166
6167 put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
6168 &ins_list, &del_list);
6169
6170 again:
6171 key.type = BTRFS_DIR_INDEX_KEY;
6172 key.offset = ctx->pos;
6173 key.objectid = btrfs_ino(BTRFS_I(inode));
6174
6175 btrfs_for_each_slot(root, &key, &found_key, path, ret) {
6176 struct dir_entry *entry;
6177 struct extent_buffer *leaf = path->nodes[0];
6178 u8 ftype;
6179
6180 if (found_key.objectid != key.objectid)
6181 break;
6182 if (found_key.type != BTRFS_DIR_INDEX_KEY)
6183 break;
6184 if (found_key.offset < ctx->pos)
6185 continue;
6186 if (found_key.offset > private->last_index)
6187 break;
6188 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
6189 continue;
6190 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
6191 name_len = btrfs_dir_name_len(leaf, di);
6192 if ((total_len + sizeof(struct dir_entry) + name_len) >=
6193 PAGE_SIZE) {
6194 btrfs_release_path(path);
6195 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6196 if (ret)
6197 goto nopos;
6198 addr = private->filldir_buf;
6199 entries = 0;
6200 total_len = 0;
6201 goto again;
6202 }
6203
6204 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
6205 entry = addr;
6206 name_ptr = (char *)(entry + 1);
6207 read_extent_buffer(leaf, name_ptr,
6208 (unsigned long)(di + 1), name_len);
6209 put_unaligned(name_len, &entry->name_len);
6210 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
6211 btrfs_dir_item_key_to_cpu(leaf, di, &location);
6212 put_unaligned(location.objectid, &entry->ino);
6213 put_unaligned(found_key.offset, &entry->offset);
6214 entries++;
6215 addr += sizeof(struct dir_entry) + name_len;
6216 total_len += sizeof(struct dir_entry) + name_len;
6217 }
6218 /* Catch error encountered during iteration */
6219 if (ret < 0)
6220 goto err;
6221
6222 btrfs_release_path(path);
6223
6224 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6225 if (ret)
6226 goto nopos;
6227
6228 if (btrfs_readdir_delayed_dir_index(ctx, &ins_list))
6229 goto nopos;
6230
6231 /*
6232 * Stop new entries from being returned after we return the last
6233 * entry.
6234 *
6235 * New directory entries are assigned a strictly increasing
6236 * offset. This means that new entries created during readdir
6237 * are *guaranteed* to be seen in the future by that readdir.
6238 * This has broken buggy programs which operate on names as
6239 * they're returned by readdir. Until we reuse freed offsets
6240 * we have this hack to stop new entries from being returned
6241 * under the assumption that they'll never reach this huge
6242 * offset.
6243 *
6244 * This is being careful not to overflow 32bit loff_t unless the
6245 * last entry requires it because doing so has broken 32bit apps
6246 * in the past.
6247 */
6248 if (ctx->pos >= INT_MAX)
6249 ctx->pos = LLONG_MAX;
6250 else
6251 ctx->pos = INT_MAX;
6252 nopos:
6253 ret = 0;
6254 err:
6255 if (put)
6256 btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
6257 return ret;
6258 }
6259
6260 /*
6261 * This is somewhat expensive, updating the tree every time the
6262 * inode changes. But, it is most likely to find the inode in cache.
6263 * FIXME, needs more benchmarking...there are no reasons other than performance
6264 * to keep or drop this code.
6265 */
6266 static int btrfs_dirty_inode(struct btrfs_inode *inode)
6267 {
6268 struct btrfs_root *root = inode->root;
6269 struct btrfs_fs_info *fs_info = root->fs_info;
6270 struct btrfs_trans_handle *trans;
6271 int ret;
6272
6273 if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6274 return 0;
6275
6276 trans = btrfs_join_transaction(root);
6277 if (IS_ERR(trans))
6278 return PTR_ERR(trans);
6279
6280 ret = btrfs_update_inode(trans, inode);
6281 if (ret == -ENOSPC || ret == -EDQUOT) {
6282 /* whoops, let's try again with the full transaction */
6283 btrfs_end_transaction(trans);
6284 trans = btrfs_start_transaction(root, 1);
6285 if (IS_ERR(trans))
6286 return PTR_ERR(trans);
6287
6288 ret = btrfs_update_inode(trans, inode);
6289 }
6290 btrfs_end_transaction(trans);
6291 if (inode->delayed_node)
6292 btrfs_balance_delayed_items(fs_info);
6293
6294 return ret;
6295 }
6296
6297 /*
6298 * We need our own ->update_time so that we can return error on ENOSPC for
6299 * updating the inode in the case of file write and mmap writes.
6300 */
6301 static int btrfs_update_time(struct inode *inode, int flags)
6302 {
6303 struct btrfs_root *root = BTRFS_I(inode)->root;
6304 bool dirty;
6305
6306 if (btrfs_root_readonly(root))
6307 return -EROFS;
6308
6309 dirty = inode_update_timestamps(inode, flags);
6310 return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6311 }
6312
6313 /*
6314 * Helper to find a free sequence number in a given directory. The current
6315 * code is very simple; later versions will do smarter things in the btree.
6316 */
6317 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6318 {
6319 int ret = 0;
6320
6321 if (dir->index_cnt == (u64)-1) {
6322 ret = btrfs_inode_delayed_dir_index_count(dir);
6323 if (ret) {
6324 ret = btrfs_set_inode_index_count(dir);
6325 if (ret)
6326 return ret;
6327 }
6328 }
6329
6330 *index = dir->index_cnt;
6331 dir->index_cnt++;
6332
6333 return ret;
6334 }
6335
6336 static int btrfs_insert_inode_locked(struct inode *inode)
6337 {
6338 struct btrfs_iget_args args;
6339
6340 args.ino = btrfs_ino(BTRFS_I(inode));
6341 args.root = BTRFS_I(inode)->root;
6342
6343 return insert_inode_locked4(inode,
6344 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6345 btrfs_find_actor, &args);
6346 }
6347
6348 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6349 unsigned int *trans_num_items)
6350 {
6351 struct inode *dir = args->dir;
6352 struct inode *inode = args->inode;
6353 int ret;
6354
6355 if (!args->orphan) {
6356 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6357 &args->fname);
6358 if (ret)
6359 return ret;
6360 }
6361
6362 ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6363 if (ret) {
6364 fscrypt_free_filename(&args->fname);
6365 return ret;
6366 }
6367
6368 /* 1 to add inode item */
6369 *trans_num_items = 1;
6370 /* 1 to add compression property */
6371 if (BTRFS_I(dir)->prop_compress)
6372 (*trans_num_items)++;
6373 /* 1 to add default ACL xattr */
6374 if (args->default_acl)
6375 (*trans_num_items)++;
6376 /* 1 to add access ACL xattr */
6377 if (args->acl)
6378 (*trans_num_items)++;
6379 #ifdef CONFIG_SECURITY
6380 /* 1 to add LSM xattr */
6381 if (dir->i_security)
6382 (*trans_num_items)++;
6383 #endif
6384 if (args->orphan) {
6385 /* 1 to add orphan item */
6386 (*trans_num_items)++;
6387 } else {
6388 /*
6389 * 1 to add dir item
6390 * 1 to add dir index
6391 * 1 to update parent inode item
6392 *
6393 * No need for 1 unit for the inode ref item because it is
6394 * inserted in a batch together with the inode item at
6395 * btrfs_create_new_inode().
6396 */
6397 *trans_num_items += 3;
6398 }
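	/*
	 * Worked example with illustrative values: creating a subdirectory
	 * whose parent has a compression property and a default ACL, with
	 * CONFIG_SECURITY enabled, needs 1 (inode item) + 1 (compression
	 * xattr) + 1 (default ACL) + 1 (access ACL) + 1 (LSM xattr) +
	 * 3 (dir item, dir index, parent inode update) = 8 units.
	 */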
6399 return 0;
6400 }
6401
6402 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6403 {
6404 posix_acl_release(args->acl);
6405 posix_acl_release(args->default_acl);
6406 fscrypt_free_filename(&args->fname);
6407 }
6408
6409 /*
6410 * Inherit flags from the parent inode.
6411 *
6412 * Currently only the compression flags and the cow flags are inherited.
6413 */
6414 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6415 {
6416 unsigned int flags;
6417
6418 flags = dir->flags;
6419
6420 if (flags & BTRFS_INODE_NOCOMPRESS) {
6421 inode->flags &= ~BTRFS_INODE_COMPRESS;
6422 inode->flags |= BTRFS_INODE_NOCOMPRESS;
6423 } else if (flags & BTRFS_INODE_COMPRESS) {
6424 inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6425 inode->flags |= BTRFS_INODE_COMPRESS;
6426 }
6427
6428 if (flags & BTRFS_INODE_NODATACOW) {
6429 inode->flags |= BTRFS_INODE_NODATACOW;
6430 if (S_ISREG(inode->vfs_inode.i_mode))
6431 inode->flags |= BTRFS_INODE_NODATASUM;
6432 }
6433
6434 btrfs_sync_inode_flags_to_i_flags(inode);
6435 }
6436
6437 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6438 struct btrfs_new_inode_args *args)
6439 {
6440 struct timespec64 ts;
6441 struct inode *dir = args->dir;
6442 struct inode *inode = args->inode;
6443 const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6444 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6445 struct btrfs_root *root;
6446 struct btrfs_inode_item *inode_item;
6447 struct btrfs_path *path;
6448 u64 objectid;
6449 struct btrfs_inode_ref *ref;
6450 struct btrfs_key key[2];
6451 u32 sizes[2];
6452 struct btrfs_item_batch batch;
6453 unsigned long ptr;
6454 int ret;
6455 bool xa_reserved = false;
6456
6457 path = btrfs_alloc_path();
6458 if (!path)
6459 return -ENOMEM;
6460
6461 if (!args->subvol)
6462 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6463 root = BTRFS_I(inode)->root;
6464
6465 ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
6466 if (ret)
6467 goto out;
6468
6469 ret = btrfs_get_free_objectid(root, &objectid);
6470 if (ret)
6471 goto out;
6472 btrfs_set_inode_number(BTRFS_I(inode), objectid);
6473
6474 ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
6475 if (ret)
6476 goto out;
6477 xa_reserved = true;
6478
6479 if (args->orphan) {
6480 /*
6481 * O_TMPFILE, set link count to 0, so that after this point, we
6482 * fill in an inode item with the correct link count.
6483 */
6484 set_nlink(inode, 0);
6485 } else {
6486 trace_btrfs_inode_request(dir);
6487
6488 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6489 if (ret)
6490 goto out;
6491 }
6492
6493 if (S_ISDIR(inode->i_mode))
6494 BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6495
6496 BTRFS_I(inode)->generation = trans->transid;
6497 inode->i_generation = BTRFS_I(inode)->generation;
6498
6499 /*
6500 * We don't have any capability xattrs set here yet, shortcut any
6501 * queries for the xattrs here. If we add them later via the inode
6502 * security init path or any other path this flag will be cleared.
6503 */
6504 set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
6505
6506 /*
6507 * Subvolumes don't inherit flags from their parent directory.
6508 * Originally this was probably by accident, but we probably can't
6509 * change it now without compatibility issues.
6510 */
6511 if (!args->subvol)
6512 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6513
6514 btrfs_set_inode_mapping_order(BTRFS_I(inode));
6515 if (S_ISREG(inode->i_mode)) {
6516 if (btrfs_test_opt(fs_info, NODATASUM))
6517 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6518 if (btrfs_test_opt(fs_info, NODATACOW))
6519 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6520 BTRFS_INODE_NODATASUM;
6521 btrfs_update_inode_mapping_flags(BTRFS_I(inode));
6522 }
6523
6524 ret = btrfs_insert_inode_locked(inode);
6525 if (ret < 0) {
6526 if (!args->orphan)
6527 BTRFS_I(dir)->index_cnt--;
6528 goto out;
6529 }
6530
6531 /*
6532 * We could have gotten an inode number from somebody who was fsynced
6533 * and then removed in this same transaction, so let's just set full
6534 * sync since it will be a full sync anyway and this will blow away the
6535 * old info in the log.
6536 */
6537 btrfs_set_inode_full_sync(BTRFS_I(inode));
6538
6539 key[0].objectid = objectid;
6540 key[0].type = BTRFS_INODE_ITEM_KEY;
6541 key[0].offset = 0;
6542
6543 sizes[0] = sizeof(struct btrfs_inode_item);
6544
6545 if (!args->orphan) {
6546 /*
6547 * Start new inodes with an inode_ref. This is slightly more
6548 * efficient for small numbers of hard links since they will
6549 * be packed into one item. Extended refs will kick in if we
6550 * add more hard links than can fit in the ref item.
6551 */
6552 key[1].objectid = objectid;
6553 key[1].type = BTRFS_INODE_REF_KEY;
6554 if (args->subvol) {
6555 key[1].offset = objectid;
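/* The 2 is the length of the ".." name used for subvolume roots (written below). */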
6556 sizes[1] = 2 + sizeof(*ref);
6557 } else {
6558 key[1].offset = btrfs_ino(BTRFS_I(dir));
6559 sizes[1] = name->len + sizeof(*ref);
6560 }
6561 }
6562
6563 batch.keys = &key[0];
6564 batch.data_sizes = &sizes[0];
6565 batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6566 batch.nr = args->orphan ? 1 : 2;
6567 ret = btrfs_insert_empty_items(trans, root, path, &batch);
6568 if (unlikely(ret != 0)) {
6569 btrfs_abort_transaction(trans, ret);
6570 goto discard;
6571 }
6572
6573 ts = simple_inode_init_ts(inode);
6574 BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
6575 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
6576
6577 /*
6578 * We're going to fill the inode item now, so at this point the inode
6579 * must be fully initialized.
6580 */
6581
6582 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6583 struct btrfs_inode_item);
6584 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6585 sizeof(*inode_item));
6586 fill_inode_item(trans, path->nodes[0], inode_item, inode);
6587
6588 if (!args->orphan) {
6589 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6590 struct btrfs_inode_ref);
6591 ptr = (unsigned long)(ref + 1);
6592 if (args->subvol) {
6593 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6594 btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6595 write_extent_buffer(path->nodes[0], "..", ptr, 2);
6596 } else {
6597 btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6598 name->len);
6599 btrfs_set_inode_ref_index(path->nodes[0], ref,
6600 BTRFS_I(inode)->dir_index);
6601 write_extent_buffer(path->nodes[0], name->name, ptr,
6602 name->len);
6603 }
6604 }
6605
6606 /*
6607 * We don't need the path anymore, plus inheriting properties, adding
6608 * ACLs, security xattrs, orphan item or adding the link, will result in
6609 * allocating yet another path. So just free our path.
6610 */
6611 btrfs_free_path(path);
6612 path = NULL;
6613
6614 if (args->subvol) {
6615 struct btrfs_inode *parent;
6616
6617 /*
6618 * Subvolumes inherit properties from their parent subvolume,
6619 * not the directory they were created in.
6620 */
6621 parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
6622 if (IS_ERR(parent)) {
6623 ret = PTR_ERR(parent);
6624 } else {
6625 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6626 parent);
6627 iput(&parent->vfs_inode);
6628 }
6629 } else {
6630 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6631 BTRFS_I(dir));
6632 }
6633 if (ret) {
6634 btrfs_err(fs_info,
6635 "error inheriting props for ino %llu (root %llu): %d",
6636 btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
6637 }
6638
6639 /*
6640 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6641 * probably a bug.
6642 */
6643 if (!args->subvol) {
6644 ret = btrfs_init_inode_security(trans, args);
6645 if (unlikely(ret)) {
6646 btrfs_abort_transaction(trans, ret);
6647 goto discard;
6648 }
6649 }
6650
6651 ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
6652 if (WARN_ON(ret)) {
6653 /* Shouldn't happen, we used xa_reserve() before. */
6654 btrfs_abort_transaction(trans, ret);
6655 goto discard;
6656 }
6657
6658 trace_btrfs_inode_new(inode);
6659 btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6660
6661 btrfs_update_root_times(trans, root);
6662
6663 if (args->orphan) {
6664 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6665 if (unlikely(ret)) {
6666 btrfs_abort_transaction(trans, ret);
6667 goto discard;
6668 }
6669 } else {
6670 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6671 0, BTRFS_I(inode)->dir_index);
6672 if (unlikely(ret)) {
6673 btrfs_abort_transaction(trans, ret);
6674 goto discard;
6675 }
6676 }
6677
6678 return 0;
6679
6680 discard:
6681 /*
6682 * discard_new_inode() calls iput(), but the caller owns the reference
6683 * to the inode.
6684 */
6685 ihold(inode);
6686 discard_new_inode(inode);
6687 out:
6688 if (xa_reserved)
6689 xa_release(&root->inodes, objectid);
6690
6691 btrfs_free_path(path);
6692 return ret;
6693 }
6694
6695 /*
6696 * utility function to add 'inode' into 'parent_inode' with
6697 * a given name and a given sequence number.
6698 * if 'add_backref' is true, also insert a backref from the
6699 * inode to the parent directory.
6700 */
6701 int btrfs_add_link(struct btrfs_trans_handle *trans,
6702 struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6703 const struct fscrypt_str *name, bool add_backref, u64 index)
6704 {
6705 int ret = 0;
6706 struct btrfs_key key;
6707 struct btrfs_root *root = parent_inode->root;
6708 u64 ino = btrfs_ino(inode);
6709 u64 parent_ino = btrfs_ino(parent_inode);
6710
6711 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6712 memcpy(&key, &inode->root->root_key, sizeof(key));
6713 } else {
6714 key.objectid = ino;
6715 key.type = BTRFS_INODE_ITEM_KEY;
6716 key.offset = 0;
6717 }
6718
6719 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6720 ret = btrfs_add_root_ref(trans, key.objectid,
6721 btrfs_root_id(root), parent_ino,
6722 index, name);
6723 } else if (add_backref) {
6724 ret = btrfs_insert_inode_ref(trans, root, name,
6725 ino, parent_ino, index);
6726 }
6727
6728 /* Nothing to clean up yet */
6729 if (ret)
6730 return ret;
6731
6732 ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6733 btrfs_inode_type(inode), index);
6734 if (ret == -EEXIST || ret == -EOVERFLOW)
6735 goto fail_dir_item;
6736 else if (unlikely(ret)) {
6737 btrfs_abort_transaction(trans, ret);
6738 return ret;
6739 }
6740
6741 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6742 name->len * 2);
6743 inode_inc_iversion(&parent_inode->vfs_inode);
6744 update_time_after_link_or_unlink(parent_inode);
6745
6746 ret = btrfs_update_inode(trans, parent_inode);
6747 if (ret)
6748 btrfs_abort_transaction(trans, ret);
6749 return ret;
6750
6751 fail_dir_item:
6752 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6753 u64 local_index;
6754 int ret2;
6755
6756 ret2 = btrfs_del_root_ref(trans, key.objectid, btrfs_root_id(root),
6757 parent_ino, &local_index, name);
6758 if (ret2)
6759 btrfs_abort_transaction(trans, ret2);
6760 } else if (add_backref) {
6761 int ret2;
6762
6763 ret2 = btrfs_del_inode_ref(trans, root, name, ino, parent_ino, NULL);
6764 if (ret2)
6765 btrfs_abort_transaction(trans, ret2);
6766 }
6767
6768 /* Return the original error code */
6769 return ret;
6770 }
6771
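/*
 * Common helper for the create/mknod/mkdir paths: prepare the new inode
 * args, start an appropriately sized transaction, create the inode and
 * instantiate the dentry. On failure the caller's inode reference is put.
 */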
6772 static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6773 struct inode *inode)
6774 {
6775 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6776 struct btrfs_root *root = BTRFS_I(dir)->root;
6777 struct btrfs_new_inode_args new_inode_args = {
6778 .dir = dir,
6779 .dentry = dentry,
6780 .inode = inode,
6781 };
6782 unsigned int trans_num_items;
6783 struct btrfs_trans_handle *trans;
6784 int ret;
6785
6786 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6787 if (ret)
6788 goto out_inode;
6789
6790 trans = btrfs_start_transaction(root, trans_num_items);
6791 if (IS_ERR(trans)) {
6792 ret = PTR_ERR(trans);
6793 goto out_new_inode_args;
6794 }
6795
6796 ret = btrfs_create_new_inode(trans, &new_inode_args);
6797 if (!ret)
6798 d_instantiate_new(dentry, inode);
6802
6803 btrfs_end_transaction(trans);
6804 btrfs_btree_balance_dirty(fs_info);
6805 out_new_inode_args:
6806 btrfs_new_inode_args_destroy(&new_inode_args);
6807 out_inode:
6808 if (ret)
6809 iput(inode);
6810 return ret;
6811 }
6812
6813 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6814 struct dentry *dentry, umode_t mode, dev_t rdev)
6815 {
6816 struct inode *inode;
6817
6818 inode = new_inode(dir->i_sb);
6819 if (!inode)
6820 return -ENOMEM;
6821 inode_init_owner(idmap, inode, dir, mode);
6822 inode->i_op = &btrfs_special_inode_operations;
6823 init_special_inode(inode, inode->i_mode, rdev);
6824 return btrfs_create_common(dir, dentry, inode);
6825 }
6826
6827 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6828 struct dentry *dentry, umode_t mode, bool excl)
6829 {
6830 struct inode *inode;
6831
6832 inode = new_inode(dir->i_sb);
6833 if (!inode)
6834 return -ENOMEM;
6835 inode_init_owner(idmap, inode, dir, mode);
6836 inode->i_fop = &btrfs_file_operations;
6837 inode->i_op = &btrfs_file_inode_operations;
6838 inode->i_mapping->a_ops = &btrfs_aops;
6839 return btrfs_create_common(dir, dentry, inode);
6840 }
6841
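/*
 * link(2) entry point. Hard links are only allowed within one subvolume and
 * only up to BTRFS_LINK_MAX links. For an O_TMPFILE inode (link count going
 * from 0 to 1) the orphan item is removed as well.
 */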
6842 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6843 struct dentry *dentry)
6844 {
6845 struct btrfs_trans_handle *trans = NULL;
6846 struct btrfs_root *root = BTRFS_I(dir)->root;
6847 struct inode *inode = d_inode(old_dentry);
6848 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
6849 struct fscrypt_name fname;
6850 u64 index;
6851 int ret;
6852
6853 /* Do not allow hard links across different subvolumes of the same device. */
6854 if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
6855 return -EXDEV;
6856
6857 if (inode->i_nlink >= BTRFS_LINK_MAX)
6858 return -EMLINK;
6859
6860 ret = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6861 if (ret)
6862 goto fail;
6863
6864 ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
6865 if (ret)
6866 goto fail;
6867
6868 /*
6869 * 2 items for inode and inode ref
6870 * 2 items for dir items
6871 * 1 item for parent inode
6872 * 1 item for orphan item deletion if O_TMPFILE
6873 */
6874 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6875 if (IS_ERR(trans)) {
6876 ret = PTR_ERR(trans);
6877 trans = NULL;
6878 goto fail;
6879 }
6880
6881 /* There are several dir indexes for this inode, clear the cache. */
6882 BTRFS_I(inode)->dir_index = 0ULL;
6883 inode_inc_iversion(inode);
6884 inode_set_ctime_current(inode);
6885
6886 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6887 &fname.disk_name, 1, index);
6888 if (ret)
6889 goto fail;
6890
6891 /* Link added now we update the inode item with the new link count. */
6892 inc_nlink(inode);
6893 ret = btrfs_update_inode(trans, BTRFS_I(inode));
6894 if (unlikely(ret)) {
6895 btrfs_abort_transaction(trans, ret);
6896 goto fail;
6897 }
6898
6899 if (inode->i_nlink == 1) {
6900 /*
6901 * If the new hard link count is 1, it's a file created with the
6902 * open(2) O_TMPFILE flag.
6903 */
6904 ret = btrfs_orphan_del(trans, BTRFS_I(inode));
6905 if (unlikely(ret)) {
6906 btrfs_abort_transaction(trans, ret);
6907 goto fail;
6908 }
6909 }
6910
6911 /* Grab reference for the new dentry passed to d_instantiate(). */
6912 ihold(inode);
6913 d_instantiate(dentry, inode);
6914 btrfs_log_new_name(trans, old_dentry, NULL, 0, dentry->d_parent);
6915
6916 fail:
6917 fscrypt_free_filename(&fname);
6918 if (trans)
6919 btrfs_end_transaction(trans);
6920 btrfs_btree_balance_dirty(fs_info);
6921 return ret;
6922 }
6923
6924 static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6925 struct dentry *dentry, umode_t mode)
6926 {
6927 struct inode *inode;
6928
6929 inode = new_inode(dir->i_sb);
6930 if (!inode)
6931 return ERR_PTR(-ENOMEM);
6932 inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6933 inode->i_op = &btrfs_dir_inode_operations;
6934 inode->i_fop = &btrfs_dir_file_operations;
6935 return ERR_PTR(btrfs_create_common(dir, dentry, inode));
6936 }
6937
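/*
 * Read a compressed inline extent from the leaf, decompress it into the
 * first block of @folio and zero out the rest of that block.
 */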
6938 static noinline int uncompress_inline(struct btrfs_path *path,
6939 struct folio *folio,
6940 struct btrfs_file_extent_item *item)
6941 {
6942 int ret;
6943 struct extent_buffer *leaf = path->nodes[0];
6944 const u32 blocksize = leaf->fs_info->sectorsize;
6945 char *tmp;
6946 size_t max_size;
6947 unsigned long inline_size;
6948 unsigned long ptr;
6949 int compress_type;
6950
6951 compress_type = btrfs_file_extent_compression(leaf, item);
6952 max_size = btrfs_file_extent_ram_bytes(leaf, item);
6953 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6954 tmp = kmalloc(inline_size, GFP_NOFS);
6955 if (!tmp)
6956 return -ENOMEM;
6957 ptr = btrfs_file_extent_inline_start(item);
6958
6959 read_extent_buffer(leaf, tmp, ptr, inline_size);
6960
6961 max_size = min_t(unsigned long, blocksize, max_size);
6962 ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
6963 max_size);
6964
6965 /*
6966 * decompression code contains a memset to fill in any space between the end
6967 * of the uncompressed data and the end of max_size in case the decompressed
6968 * data ends up shorter than ram_bytes. That doesn't cover the hole between
6969 * the end of an inline extent and the beginning of the next block, so we
6970 * cover that region here.
6971 */
6972
6973 if (max_size < blocksize)
6974 folio_zero_range(folio, max_size, blocksize - max_size);
6975 kfree(tmp);
6976 return ret;
6977 }
6978
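/*
 * Copy the data of an inline extent into the first block of @folio,
 * decompressing it if needed, and zero the remainder of the block. Does
 * nothing if @folio is NULL or already uptodate.
 */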
6979 static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
6980 {
6981 const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
6982 struct btrfs_file_extent_item *fi;
6983 void *kaddr;
6984 size_t copy_size;
6985
6986 if (!folio || folio_test_uptodate(folio))
6987 return 0;
6988
6989 ASSERT(folio_pos(folio) == 0);
6990
6991 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6992 struct btrfs_file_extent_item);
6993 if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6994 return uncompress_inline(path, folio, fi);
6995
6996 copy_size = min_t(u64, blocksize,
6997 btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6998 kaddr = kmap_local_folio(folio, 0);
6999 read_extent_buffer(path->nodes[0], kaddr,
7000 btrfs_file_extent_inline_start(fi), copy_size);
7001 kunmap_local(kaddr);
7002 if (copy_size < blocksize)
7003 folio_zero_range(folio, copy_size, blocksize - copy_size);
7004 return 0;
7005 }
7006
7007 /*
7008 * Lookup the first extent overlapping a range in a file.
7009 *
7010 * @inode: file to search in
7011 * @folio: folio to read extent data into if the extent is inline
7012 * @start: file offset
7013 * @len: length of range starting at @start
7014 *
7015 * Return the first &struct extent_map which overlaps the given range, reading
7016 * it from the B-tree and caching it if necessary. Note that there may be more
7017 * extents which overlap the given range after the returned extent_map.
7018 *
7019 * If @folio is not NULL and the extent is inline, this also reads the extent
7020 * data directly into the folio and marks the extent up to date in the io_tree.
7021 *
7022 * Return: ERR_PTR on error, non-NULL extent_map on success.
7023 */
7024 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
7025 struct folio *folio, u64 start, u64 len)
7026 {
7027 struct btrfs_fs_info *fs_info = inode->root->fs_info;
7028 int ret = 0;
7029 u64 extent_start = 0;
7030 u64 extent_end = 0;
7031 u64 objectid = btrfs_ino(inode);
7032 int extent_type = -1;
7033 struct btrfs_path *path = NULL;
7034 struct btrfs_root *root = inode->root;
7035 struct btrfs_file_extent_item *item;
7036 struct extent_buffer *leaf;
7037 struct btrfs_key found_key;
7038 struct extent_map *em = NULL;
7039 struct extent_map_tree *em_tree = &inode->extent_tree;
7040
7041 read_lock(&em_tree->lock);
7042 em = btrfs_lookup_extent_mapping(em_tree, start, len);
7043 read_unlock(&em_tree->lock);
7044
7045 if (em) {
7046 if (em->start > start || em->start + em->len <= start)
7047 btrfs_free_extent_map(em);
7048 else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
7049 btrfs_free_extent_map(em);
7050 else
7051 goto out;
7052 }
7053 em = btrfs_alloc_extent_map();
7054 if (!em) {
7055 ret = -ENOMEM;
7056 goto out;
7057 }
7058 em->start = EXTENT_MAP_HOLE;
7059 em->disk_bytenr = EXTENT_MAP_HOLE;
7060 em->len = (u64)-1;
7061
7062 path = btrfs_alloc_path();
7063 if (!path) {
7064 ret = -ENOMEM;
7065 goto out;
7066 }
7067
7068 /* Chances are we'll be called again, so go ahead and do readahead */
7069 path->reada = READA_FORWARD;
7070
7071 /*
7072 * The same explanation in load_free_space_cache applies here as well,
7073 * we only read when we're loading the free space cache, and at that
7074 * point the commit_root has everything we need.
7075 */
7076 if (btrfs_is_free_space_inode(inode)) {
7077 path->search_commit_root = 1;
7078 path->skip_locking = 1;
7079 }
7080
7081 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
7082 if (ret < 0) {
7083 goto out;
7084 } else if (ret > 0) {
7085 if (path->slots[0] == 0)
7086 goto not_found;
7087 path->slots[0]--;
7088 ret = 0;
7089 }
7090
7091 leaf = path->nodes[0];
7092 item = btrfs_item_ptr(leaf, path->slots[0],
7093 struct btrfs_file_extent_item);
7094 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7095 if (found_key.objectid != objectid ||
7096 found_key.type != BTRFS_EXTENT_DATA_KEY) {
7097 /*
7098 * If we back up past the first extent we want to move forward
7099 * and see if there is an extent in front of us, otherwise we'll
7100 * say there is a hole for our whole search range which can
7101 * cause problems.
7102 */
7103 extent_end = start;
7104 goto next;
7105 }
7106
7107 extent_type = btrfs_file_extent_type(leaf, item);
7108 extent_start = found_key.offset;
7109 extent_end = btrfs_file_extent_end(path);
7110 if (extent_type == BTRFS_FILE_EXTENT_REG ||
7111 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7112 /* Only a regular file can have a regular/prealloc extent. */
7113 if (unlikely(!S_ISREG(inode->vfs_inode.i_mode))) {
7114 ret = -EUCLEAN;
7115 btrfs_crit(fs_info,
7116 "regular/prealloc extent found for non-regular inode %llu",
7117 btrfs_ino(inode));
7118 goto out;
7119 }
7120 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
7121 extent_start);
7122 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7123 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
7124 path->slots[0],
7125 extent_start);
7126 }
7127 next:
7128 if (start >= extent_end) {
7129 path->slots[0]++;
7130 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
7131 ret = btrfs_next_leaf(root, path);
7132 if (ret < 0)
7133 goto out;
7134 else if (ret > 0)
7135 goto not_found;
7136
7137 leaf = path->nodes[0];
7138 }
7139 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7140 if (found_key.objectid != objectid ||
7141 found_key.type != BTRFS_EXTENT_DATA_KEY)
7142 goto not_found;
7143 if (start + len <= found_key.offset)
7144 goto not_found;
7145 if (start > found_key.offset)
7146 goto next;
7147
7148 /* New extent overlaps with existing one */
7149 em->start = start;
7150 em->len = found_key.offset - start;
7151 em->disk_bytenr = EXTENT_MAP_HOLE;
7152 goto insert;
7153 }
7154
7155 btrfs_extent_item_to_extent_map(inode, path, item, em);
7156
7157 if (extent_type == BTRFS_FILE_EXTENT_REG ||
7158 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7159 goto insert;
7160 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7161 /*
7162 * Inline extent can only exist at file offset 0. This is
7163 * ensured by tree-checker and inline extent creation path.
7164 * Thus all members representing file offsets should be zero.
7165 */
7166 ASSERT(extent_start == 0);
7167 ASSERT(em->start == 0);
7168
7169 /*
7170 * btrfs_extent_item_to_extent_map() should have properly
7171 * initialized em members already.
7172 *
7173 * Other members are not utilized for inline extents.
7174 */
7175 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
7176 ASSERT(em->len == fs_info->sectorsize);
7177
7178 ret = read_inline_extent(path, folio);
7179 if (ret < 0)
7180 goto out;
7181 goto insert;
7182 }
7183 not_found:
7184 em->start = start;
7185 em->len = len;
7186 em->disk_bytenr = EXTENT_MAP_HOLE;
7187 insert:
7188 ret = 0;
7189 btrfs_release_path(path);
7190 if (unlikely(em->start > start || btrfs_extent_map_end(em) <= start)) {
7191 btrfs_err(fs_info,
7192 "bad extent! em: [%llu %llu] passed [%llu %llu]",
7193 em->start, em->len, start, len);
7194 ret = -EIO;
7195 goto out;
7196 }
7197
7198 write_lock(&em_tree->lock);
7199 ret = btrfs_add_extent_mapping(inode, &em, start, len);
7200 write_unlock(&em_tree->lock);
7201 out:
7202 btrfs_free_path(path);
7203
7204 trace_btrfs_get_extent(root, inode, em);
7205
7206 if (ret) {
7207 btrfs_free_extent_map(em);
7208 return ERR_PTR(ret);
7209 }
7210 return em;
7211 }
7212
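/*
 * Return true if @bytenr lives in a read-only block group (or if no block
 * group can be found for it).
 */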
7213 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7214 {
7215 struct btrfs_block_group *block_group;
7216 bool readonly = false;
7217
7218 block_group = btrfs_lookup_block_group(fs_info, bytenr);
7219 if (!block_group || block_group->ro)
7220 readonly = true;
7221 if (block_group)
7222 btrfs_put_block_group(block_group);
7223 return readonly;
7224 }
7225
7226 /*
7227 * Check if we can do nocow write into the range [@offset, @offset + @len)
7228 *
7229 * @offset: File offset
7230 * @len: The length to write, will be updated to the nocow writeable
7231 * range
7232 * @file_extent: (optional) If not NULL, return the details of the file
7233 * extent that can be written to without COW
7234 * @nowait: Whether the search must not block (the path is set to nowait mode)
7235 *
7236 * Return:
7237 * >0 and update @len if we can do nocow write
7238 * 0 if we can't do nocow write
7239 * <0 if error happened
7240 *
7241 * NOTE: This only checks the file extents, caller is responsible to wait for
7242 * any ordered extents.
7243 */
7244 noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
7245 struct btrfs_file_extent *file_extent,
7246 bool nowait)
7247 {
7248 struct btrfs_root *root = inode->root;
7249 struct btrfs_fs_info *fs_info = root->fs_info;
7250 struct can_nocow_file_extent_args nocow_args = { 0 };
7251 BTRFS_PATH_AUTO_FREE(path);
7252 int ret;
7253 struct extent_buffer *leaf;
7254 struct extent_io_tree *io_tree = &inode->io_tree;
7255 struct btrfs_file_extent_item *fi;
7256 struct btrfs_key key;
7257 int found_type;
7258
7259 path = btrfs_alloc_path();
7260 if (!path)
7261 return -ENOMEM;
7262 path->nowait = nowait;
7263
7264 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
7265 offset, 0);
7266 if (ret < 0)
7267 return ret;
7268
7269 if (ret == 1) {
7270 if (path->slots[0] == 0) {
7271 /* Can't find the item, must COW. */
7272 return 0;
7273 }
7274 path->slots[0]--;
7275 }
7276 ret = 0;
7277 leaf = path->nodes[0];
7278 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7279 if (key.objectid != btrfs_ino(inode) ||
7280 key.type != BTRFS_EXTENT_DATA_KEY) {
7281 /* Not our file or wrong item type, must COW. */
7282 return 0;
7283 }
7284
7285 if (key.offset > offset) {
7286 /* Wrong offset, must COW. */
7287 return 0;
7288 }
7289
7290 if (btrfs_file_extent_end(path) <= offset)
7291 return 0;
7292
7293 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7294 found_type = btrfs_file_extent_type(leaf, fi);
7295
7296 nocow_args.start = offset;
7297 nocow_args.end = offset + *len - 1;
7298 nocow_args.free_path = true;
7299
7300 ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
7301 /* can_nocow_file_extent() has freed the path. */
7302 path = NULL;
7303
7304 if (ret != 1) {
7305 /* Treat errors as not being able to NOCOW. */
7306 return 0;
7307 }
7308
7309 if (btrfs_extent_readonly(fs_info,
7310 nocow_args.file_extent.disk_bytenr +
7311 nocow_args.file_extent.offset))
7312 return 0;
7313
7314 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
7315 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7316 u64 range_end;
7317
7318 range_end = round_up(offset + nocow_args.file_extent.num_bytes,
7319 root->fs_info->sectorsize) - 1;
7320 ret = btrfs_test_range_bit_exists(io_tree, offset, range_end,
7321 EXTENT_DELALLOC);
7322 if (ret)
7323 return -EAGAIN;
7324 }
7325
7326 if (file_extent)
7327 memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
7328
7329 *len = nocow_args.file_extent.num_bytes;
7330
7331 return 1;
7332 }
7333
7334 /* The callers of this must take lock_extent() */
7335 struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
7336 const struct btrfs_file_extent *file_extent,
7337 int type)
7338 {
7339 struct extent_map *em;
7340 int ret;
7341
7342 /*
7343 * Note the missing NOCOW type.
7344 *
7345 * For pure NOCOW writes, we should not create an io extent map, but
7346 * just reuse the existing one.
7347 * Only PREALLOC writes (NOCOW write into preallocated range) can
7348 * create an io extent map.
7349 */
7350 ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7351 type == BTRFS_ORDERED_COMPRESSED ||
7352 type == BTRFS_ORDERED_REGULAR);
7353
7354 switch (type) {
7355 case BTRFS_ORDERED_PREALLOC:
7356 /* We're only referring to part of a larger preallocated extent. */
7357 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7358 break;
7359 case BTRFS_ORDERED_REGULAR:
7360 /* COW results in a new extent matching our file extent size. */
7361 ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
7362 ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
7363
7364 /* Since it's a new extent, we should not have any offset. */
7365 ASSERT(file_extent->offset == 0);
7366 break;
7367 case BTRFS_ORDERED_COMPRESSED:
7368 /* Must be compressed. */
7369 ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
7370
7371 /*
7372 * An encoded write can make us refer to part of the
7373 * uncompressed extent.
7374 */
7375 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7376 break;
7377 }
7378
7379 em = btrfs_alloc_extent_map();
7380 if (!em)
7381 return ERR_PTR(-ENOMEM);
7382
7383 em->start = start;
7384 em->len = file_extent->num_bytes;
7385 em->disk_bytenr = file_extent->disk_bytenr;
7386 em->disk_num_bytes = file_extent->disk_num_bytes;
7387 em->ram_bytes = file_extent->ram_bytes;
7388 em->generation = -1;
7389 em->offset = file_extent->offset;
7390 em->flags |= EXTENT_FLAG_PINNED;
7391 if (type == BTRFS_ORDERED_COMPRESSED)
7392 btrfs_extent_map_set_compression(em, file_extent->compression);
7393
7394 ret = btrfs_replace_extent_map_range(inode, em, true);
7395 if (ret) {
7396 btrfs_free_extent_map(em);
7397 return ERR_PTR(ret);
7398 }
7399
7400 /* The em now has 2 refs, the caller needs to do btrfs_free_extent_map() once. */
7401 return em;
7402 }
7403
7404 /*
7405 * For release_folio() and invalidate_folio() we have a race window where
7406 * folio_end_writeback() is called but the subpage spinlock is not yet released.
7407 * If we continue to release/invalidate the page, we could cause use-after-free
7408 * for subpage spinlock. So this function is to spin and wait for subpage
7409 * spinlock.
7410 */
7411 static void wait_subpage_spinlock(struct folio *folio)
7412 {
7413 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
7414 struct btrfs_folio_state *bfs;
7415
7416 if (!btrfs_is_subpage(fs_info, folio))
7417 return;
7418
7419 ASSERT(folio_test_private(folio) && folio_get_private(folio));
7420 bfs = folio_get_private(folio);
7421
7422 /*
7423 * This may look insane as we just acquire the spinlock and release it,
7424 * without doing anything. But we just want to make sure no one is
7425 * still holding the subpage spinlock.
7426 * And since the folio is neither dirty nor under writeback, and we have
7427 * the folio locked, the only possible way to hold a spinlock is from the
7428 * endio function to clear the folio writeback flag.
7429 *
7430 * Here we just acquire the spinlock so that all existing callers
7431 * should exit and we're safe to release/invalidate the page.
7432 */
7433 spin_lock_irq(&bfs->lock);
7434 spin_unlock_irq(&bfs->lock);
7435 }
7436
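/*
 * launder_folio callback, invoked when a dirty folio is about to be
 * invalidated: release the qgroup reserved data space for the folio range.
 */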
7437 static int btrfs_launder_folio(struct folio *folio)
7438 {
7439 return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
7440 folio_size(folio), NULL);
7441 }
7442
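/*
 * Try to release the extent mappings attached to @folio. On success, also
 * wait for any subpage spinlock holder and detach the folio private state.
 */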
7443 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7444 {
7445 if (try_release_extent_mapping(folio, gfp_flags)) {
7446 wait_subpage_spinlock(folio);
7447 clear_folio_extent_mapped(folio);
7448 return true;
7449 }
7450 return false;
7451 }
7452
7453 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7454 {
7455 if (folio_test_writeback(folio) || folio_test_dirty(folio))
7456 return false;
7457 return __btrfs_release_folio(folio, gfp_flags);
7458 }
7459
7460 #ifdef CONFIG_MIGRATION
7461 static int btrfs_migrate_folio(struct address_space *mapping,
7462 struct folio *dst, struct folio *src,
7463 enum migrate_mode mode)
7464 {
7465 int ret = filemap_migrate_folio(mapping, dst, src, mode);
7466
7467 if (ret)
7468 return ret;
7469
7470 if (folio_test_ordered(src)) {
7471 folio_clear_ordered(src);
7472 folio_set_ordered(dst);
7473 }
7474
7475 return 0;
7476 }
7477 #else
7478 #define btrfs_migrate_folio NULL
7479 #endif
7480
7481 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7482 size_t length)
7483 {
7484 struct btrfs_inode *inode = folio_to_inode(folio);
7485 struct btrfs_fs_info *fs_info = inode->root->fs_info;
7486 struct extent_io_tree *tree = &inode->io_tree;
7487 struct extent_state *cached_state = NULL;
7488 u64 page_start = folio_pos(folio);
7489 u64 page_end = page_start + folio_size(folio) - 1;
7490 u64 cur;
7491 int inode_evicting = inode_state_read_once(&inode->vfs_inode) & I_FREEING;
7492
7493 /*
7494 * We have folio locked so no new ordered extent can be created on this
7495 * page, nor bio can be submitted for this folio.
7496 *
7497 * But already submitted bio can still be finished on this folio.
7498 * Furthermore, endio function won't skip folio which has Ordered
7499 * already cleared, so it's possible for endio and
7500 * invalidate_folio to do the same ordered extent accounting twice
7501 * on one folio.
7502 *
7503 * So here we wait for any submitted bios to finish, so that we won't
7504 * do double ordered extent accounting on the same folio.
7505 */
7506 folio_wait_writeback(folio);
7507 wait_subpage_spinlock(folio);
7508
7509 /*
7510 * For the subpage case, we have call sites like
7511 * btrfs_punch_hole_lock_range() which pass a range that is not aligned
7512 * to the sectorsize.
7513 * If the range doesn't cover the full folio, we don't need to and
7514 * shouldn't clear page extent mapped, as folio->private can still
7515 * record subpage dirty bits for other part of the range.
7516 *
7517 * For cases that invalidate the full folio even the range doesn't
7518 * cover the full folio, like invalidating the last folio, we're
7519 * still safe to wait for ordered extent to finish.
7520 */
7521 if (!(offset == 0 && length == folio_size(folio))) {
7522 btrfs_release_folio(folio, GFP_NOFS);
7523 return;
7524 }
7525
7526 if (!inode_evicting)
7527 btrfs_lock_extent(tree, page_start, page_end, &cached_state);
7528
7529 cur = page_start;
7530 while (cur < page_end) {
7531 struct btrfs_ordered_extent *ordered;
7532 u64 range_end;
7533 u32 range_len;
7534 u32 extra_flags = 0;
7535
7536 ordered = btrfs_lookup_first_ordered_range(inode, cur,
7537 page_end + 1 - cur);
7538 if (!ordered) {
7539 range_end = page_end;
7540 /*
7541 * No ordered extent covering this range, we are safe
7542 * to delete all extent states in the range.
7543 */
7544 extra_flags = EXTENT_CLEAR_ALL_BITS;
7545 goto next;
7546 }
7547 if (ordered->file_offset > cur) {
7548 /*
7549 * There is a range between [cur, oe->file_offset) not
7550 * covered by any ordered extent.
7551 * We are safe to delete all extent states, and handle
7552 * the ordered extent in the next iteration.
7553 */
7554 range_end = ordered->file_offset - 1;
7555 extra_flags = EXTENT_CLEAR_ALL_BITS;
7556 goto next;
7557 }
7558
7559 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
7560 page_end);
7561 ASSERT(range_end + 1 - cur < U32_MAX);
7562 range_len = range_end + 1 - cur;
7563 if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
7564 /*
7565 * If Ordered is cleared, it means endio has
7566 * already been executed for the range.
7567 * We can't delete the extent states as
7568 * btrfs_finish_ordered_io() may still use some of them.
7569 */
7570 goto next;
7571 }
7572 btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
7573
7574 /*
7575 * IO on this page will never be started, so we need to account
7576 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
7577 * here, must leave that up for the ordered extent completion.
7578 *
7579 * This will also unlock the range for incoming
7580 * btrfs_finish_ordered_io().
7581 */
7582 if (!inode_evicting)
7583 btrfs_clear_extent_bit(tree, cur, range_end,
7584 EXTENT_DELALLOC |
7585 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7586 EXTENT_DEFRAG, &cached_state);
7587
7588 spin_lock_irq(&inode->ordered_tree_lock);
7589 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7590 ordered->truncated_len = min(ordered->truncated_len,
7591 cur - ordered->file_offset);
7592 spin_unlock_irq(&inode->ordered_tree_lock);
7593
7594 /*
7595 * If the ordered extent has finished, we're safe to delete all
7596 * the extent states of the range, otherwise
7597 * btrfs_finish_ordered_io() will get executed by endio for
7598 * other pages, so we can't delete extent states.
7599 */
7600 if (btrfs_dec_test_ordered_pending(inode, &ordered,
7601 cur, range_end + 1 - cur)) {
7602 btrfs_finish_ordered_io(ordered);
7603 /*
7604 * The ordered extent has finished, now we're again
7605 * safe to delete all extent states of the range.
7606 */
7607 extra_flags = EXTENT_CLEAR_ALL_BITS;
7608 }
7609 next:
7610 if (ordered)
7611 btrfs_put_ordered_extent(ordered);
7612 /*
7613 * Qgroup reserved space handler
7614 * Sector(s) here will be either:
7615 *
7616 * 1) Already written to disk or bio already finished
7617 * Then its QGROUP_RESERVED bit in io_tree is already cleared.
7618 * Qgroup will be handled by its qgroup_record then.
7619 * btrfs_qgroup_free_data() call will do nothing here.
7620 *
7621 * 2) Not written to disk yet
7622 * Then btrfs_qgroup_free_data() call will clear the
7623 * QGROUP_RESERVED bit of its io_tree, and free the qgroup
7624 * reserved data space.
7625 * Since the IO will never happen for this page.
7626 */
7627 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
7628 if (!inode_evicting)
7629 btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
7630 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
7631 EXTENT_DEFRAG | extra_flags,
7632 &cached_state);
7633 cur = range_end + 1;
7634 }
7635 /*
7636 * We have iterated through all ordered extents of the folio, so the
7637 * folio should not have the Ordered flag anymore, otherwise the above
7638 * iteration did something wrong.
7639 */
7640 ASSERT(!folio_test_ordered(folio));
7641 btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
7642 if (!inode_evicting)
7643 __btrfs_release_folio(folio, GFP_NOFS);
7644 clear_folio_extent_mapped(folio);
7645 }
7646
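/*
 * Truncate away all the file extent items beyond the inode's i_size, using a
 * dedicated block reservation and restarting the transaction whenever the
 * reservation runs out, then update the inode item and the on-disk i_size.
 */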
7647 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
7648 {
7649 struct btrfs_truncate_control control = {
7650 .inode = inode,
7651 .ino = btrfs_ino(inode),
7652 .min_type = BTRFS_EXTENT_DATA_KEY,
7653 .clear_extent_range = true,
7654 };
7655 struct btrfs_root *root = inode->root;
7656 struct btrfs_fs_info *fs_info = root->fs_info;
7657 struct btrfs_block_rsv rsv;
7658 int ret;
7659 struct btrfs_trans_handle *trans;
7660 u64 mask = fs_info->sectorsize - 1;
7661 const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
7662
7663 if (!skip_writeback) {
7664 ret = btrfs_wait_ordered_range(inode,
7665 inode->vfs_inode.i_size & (~mask),
7666 (u64)-1);
7667 if (ret)
7668 return ret;
7669 }
7670
7671 /*
7672 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
7673 * things going on here:
7674 *
7675 * 1) We need to reserve space to update our inode.
7676 *
7677 * 2) We need to have something to cache all the space that is going to
7678 * be free'd up by the truncate operation, but also have some slack
7679 * space reserved in case it uses space during the truncate (thank you
7680 * very much snapshotting).
7681 *
7682 * And we need these to be separate. The fact is we can use a lot of
7683 * space doing the truncate, and we have no earthly idea how much space
7684 * we will use, so we need the truncate reservation to be separate so it
7685 * doesn't end up using space reserved for updating the inode. We also
7686 * need to be able to stop the transaction and start a new one, which
7687 * means we need to be able to update the inode several times, and we
7688 * have no idea of knowing how many times that will be, so we can't just
7689 * reserve 1 item for the entirety of the operation, so that has to be
7690 * done separately as well.
7691 *
7692 * So that leaves us with
7693 *
7694 * 1) rsv - for the truncate reservation, which we will steal from the
7695 * transaction reservation.
7696 * 2) fs_info->trans_block_rsv - this will have 1 items worth left for
7697 * updating the inode.
7698 */
7699 btrfs_init_metadata_block_rsv(fs_info, &rsv, BTRFS_BLOCK_RSV_TEMP);
7700 rsv.size = min_size;
7701 rsv.failfast = true;
7702
7703 /*
7704 * 1 for the truncate slack space
7705 * 1 for updating the inode.
7706 */
7707 trans = btrfs_start_transaction(root, 2);
7708 if (IS_ERR(trans)) {
7709 ret = PTR_ERR(trans);
7710 goto out;
7711 }
7712
7713 /* Migrate the slack space for the truncate to our reserve */
7714 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, &rsv,
7715 min_size, false);
7716 /*
7717 * We have reserved 2 metadata units when we started the transaction and
7718 * min_size matches 1 unit, so this should never fail, but if it does,
7719 * it's not critical, we just fail the truncation.
7720 */
7721 if (WARN_ON(ret)) {
7722 btrfs_end_transaction(trans);
7723 goto out;
7724 }
7725
7726 trans->block_rsv = &rsv;
7727
7728 while (1) {
7729 struct extent_state *cached_state = NULL;
7730 const u64 new_size = inode->vfs_inode.i_size;
7731 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
7732
7733 control.new_size = new_size;
7734 btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7735 /*
7736 * We want to drop from the next block forward in case this new
7737 * size is not block aligned since we will be keeping the last
7738 * block of the extent just the way it is.
7739 */
7740 btrfs_drop_extent_map_range(inode,
7741 ALIGN(new_size, fs_info->sectorsize),
7742 (u64)-1, false);
7743
7744 ret = btrfs_truncate_inode_items(trans, root, &control);
7745
7746 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
7747 btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
7748
7749 btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7750
7751 trans->block_rsv = &fs_info->trans_block_rsv;
7752 if (ret != -ENOSPC && ret != -EAGAIN)
7753 break;
7754
7755 ret = btrfs_update_inode(trans, inode);
7756 if (ret)
7757 break;
7758
7759 btrfs_end_transaction(trans);
7760 btrfs_btree_balance_dirty(fs_info);
7761
7762 trans = btrfs_start_transaction(root, 2);
7763 if (IS_ERR(trans)) {
7764 ret = PTR_ERR(trans);
7765 trans = NULL;
7766 break;
7767 }
7768
7769 btrfs_block_rsv_release(fs_info, &rsv, -1, NULL);
7770 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
7771 &rsv, min_size, false);
7772 /*
7773 * We have reserved 2 metadata units when we started the
7774 * transaction and min_size matches 1 unit, so this should never
7775 * fail, but if it does, it's not critical, we just fail the truncation.
7776 */
7777 if (WARN_ON(ret))
7778 break;
7779
7780 trans->block_rsv = &rsv;
7781 }
7782
7783 /*
7784 * We can't call btrfs_truncate_block inside a trans handle as we could
7785 * deadlock with freeze, if we got BTRFS_NEED_TRUNCATE_BLOCK then we
7786 * know we've truncated everything except the last little bit, and can
7787 * do btrfs_truncate_block and then update the disk_i_size.
7788 */
7789 if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
7790 btrfs_end_transaction(trans);
7791 btrfs_btree_balance_dirty(fs_info);
7792
7793 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size,
7794 inode->vfs_inode.i_size, (u64)-1);
7795 if (ret)
7796 goto out;
7797 trans = btrfs_start_transaction(root, 1);
7798 if (IS_ERR(trans)) {
7799 ret = PTR_ERR(trans);
7800 goto out;
7801 }
7802 btrfs_inode_safe_disk_i_size_write(inode, 0);
7803 }
7804
7805 if (trans) {
7806 int ret2;
7807
7808 trans->block_rsv = &fs_info->trans_block_rsv;
7809 ret2 = btrfs_update_inode(trans, inode);
7810 if (ret2 && !ret)
7811 ret = ret2;
7812
7813 ret2 = btrfs_end_transaction(trans);
7814 if (ret2 && !ret)
7815 ret = ret2;
7816 btrfs_btree_balance_dirty(fs_info);
7817 }
7818 out:
7819 btrfs_block_rsv_release(fs_info, &rsv, (u64)-1, NULL);
7820 /*
7821 * So if we truncate and then write and fsync we normally would just
7822 * write the extents that changed, which is a problem if we need to
7823 * first truncate that entire inode. So set this flag so we write out
7824 * all of the extents in the inode to the sync log so we're completely
7825 * safe.
7826 *
7827 * If no extents were dropped or trimmed we don't need to force the next
7828 * fsync to truncate all the inode's items from the log and re-log them
7829 * all. This means the truncate operation did not change the file size,
7830 * or changed it to a smaller size but there was only an implicit hole
7831 * between the old i_size and the new i_size, and there were no prealloc
7832 * extents beyond i_size to drop.
7833 */
7834 if (control.extents_found > 0)
7835 btrfs_set_inode_full_sync(inode);
7836
7837 return ret;
7838 }
7839
7840 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
7841 struct inode *dir)
7842 {
7843 struct inode *inode;
7844
7845 inode = new_inode(dir->i_sb);
7846 if (inode) {
7847 /*
7848 * Subvolumes don't inherit the sgid bit or the parent's gid if
7849 * the parent's sgid bit is set. This is probably a bug.
7850 */
7851 inode_init_owner(idmap, inode, NULL,
7852 S_IFDIR | (~current_umask() & S_IRWXUGO));
7853 inode->i_op = &btrfs_dir_inode_operations;
7854 inode->i_fop = &btrfs_dir_file_operations;
7855 }
7856 return inode;
7857 }
7858
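/*
 * super_operations::alloc_inode callback: allocate a btrfs_inode from the
 * slab cache and initialize its in-memory only fields.
 */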
7859 struct inode *btrfs_alloc_inode(struct super_block *sb)
7860 {
7861 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
7862 struct btrfs_inode *ei;
7863 struct inode *inode;
7864
7865 ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
7866 if (!ei)
7867 return NULL;
7868
7869 ei->root = NULL;
7870 ei->generation = 0;
7871 ei->last_trans = 0;
7872 ei->last_sub_trans = 0;
7873 ei->logged_trans = 0;
7874 ei->delalloc_bytes = 0;
7875 /* new_delalloc_bytes and last_dir_index_offset are in a union. */
7876 ei->new_delalloc_bytes = 0;
7877 ei->defrag_bytes = 0;
7878 ei->disk_i_size = 0;
7879 ei->flags = 0;
7880 ei->ro_flags = 0;
7881 /*
7882 * ->index_cnt will be properly initialized later when creating a new
7883 * inode (btrfs_create_new_inode()) or when reading an existing inode
7884 * from disk (btrfs_read_locked_inode()).
7885 */
7886 ei->csum_bytes = 0;
7887 ei->dir_index = 0;
7888 ei->last_unlink_trans = 0;
7889 ei->last_reflink_trans = 0;
7890 ei->last_log_commit = 0;
7891
7892 spin_lock_init(&ei->lock);
7893 ei->outstanding_extents = 0;
7894 if (sb->s_magic != BTRFS_TEST_MAGIC)
7895 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
7896 BTRFS_BLOCK_RSV_DELALLOC);
7897 ei->runtime_flags = 0;
7898 ei->prop_compress = BTRFS_COMPRESS_NONE;
7899 ei->defrag_compress = BTRFS_COMPRESS_NONE;
7900
7901 ei->delayed_node = NULL;
7902
7903 ei->i_otime_sec = 0;
7904 ei->i_otime_nsec = 0;
7905
7906 inode = &ei->vfs_inode;
7907 btrfs_extent_map_tree_init(&ei->extent_tree);
7908
7909 /* This io tree sets the valid inode. */
7910 btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
7911 ei->io_tree.inode = ei;
7912
7913 ei->file_extent_tree = NULL;
7914
7915 mutex_init(&ei->log_mutex);
7916 spin_lock_init(&ei->ordered_tree_lock);
7917 ei->ordered_tree = RB_ROOT;
7918 ei->ordered_tree_last = NULL;
7919 INIT_LIST_HEAD(&ei->delalloc_inodes);
7920 INIT_LIST_HEAD(&ei->delayed_iput);
7921 init_rwsem(&ei->i_mmap_lock);
7922
7923 return inode;
7924 }
7925
7926 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7927 void btrfs_test_destroy_inode(struct inode *inode)
7928 {
7929 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
7930 kfree(BTRFS_I(inode)->file_extent_tree);
7931 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7932 }
7933 #endif
7934
7935 void btrfs_free_inode(struct inode *inode)
7936 {
7937 kfree(BTRFS_I(inode)->file_extent_tree);
7938 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7939 }
7940
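/*
 * super_operations::destroy_inode callback: warn about any leaked
 * reservations or delalloc, clean up ordered extents that were left behind
 * and drop our reference on the root.
 */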
7941 void btrfs_destroy_inode(struct inode *vfs_inode)
7942 {
7943 struct btrfs_ordered_extent *ordered;
7944 struct btrfs_inode *inode = BTRFS_I(vfs_inode);
7945 struct btrfs_root *root = inode->root;
7946 bool freespace_inode;
7947
7948 WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
7949 WARN_ON(vfs_inode->i_data.nrpages);
7950 WARN_ON(inode->block_rsv.reserved);
7951 WARN_ON(inode->block_rsv.size);
7952 WARN_ON(inode->outstanding_extents);
7953 if (!S_ISDIR(vfs_inode->i_mode)) {
7954 WARN_ON(inode->delalloc_bytes);
7955 WARN_ON(inode->new_delalloc_bytes);
7956 WARN_ON(inode->csum_bytes);
7957 }
7958 if (!root || !btrfs_is_data_reloc_root(root))
7959 WARN_ON(inode->defrag_bytes);
7960
7961 /*
7962 * This can happen where we create an inode, but somebody else also
7963 * created the same inode and we need to destroy the one we already
7964 * created.
7965 */
7966 if (!root)
7967 return;
7968
7969 /*
7970 * If this is a free space inode do not take the ordered extents lockdep
7971 * map.
7972 */
7973 freespace_inode = btrfs_is_free_space_inode(inode);
7974
7975 while (1) {
7976 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7977 if (!ordered)
7978 break;
7979 else {
7980 btrfs_err(root->fs_info,
7981 "found ordered extent %llu %llu on inode cleanup",
7982 ordered->file_offset, ordered->num_bytes);
7983
7984 if (!freespace_inode)
7985 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
7986
7987 btrfs_remove_ordered_extent(inode, ordered);
7988 btrfs_put_ordered_extent(ordered);
7989 btrfs_put_ordered_extent(ordered);
7990 }
7991 }
7992 btrfs_qgroup_check_reserved_leak(inode);
7993 btrfs_del_inode_from_root(inode);
7994 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
7995 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
7996 btrfs_put_root(inode->root);
7997 }
7998
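/*
 * super_operations::drop_inode callback: always drop inodes that belong to a
 * root which is being deleted, otherwise use the generic policy.
 */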
7999 int btrfs_drop_inode(struct inode *inode)
8000 {
8001 struct btrfs_root *root = BTRFS_I(inode)->root;
8002
8003 if (root == NULL)
8004 return 1;
8005
8006 /* The snapshot/subvolume tree is being deleted. */
8007 if (btrfs_root_refs(&root->root_item) == 0)
8008 return 1;
8009 else
8010 return inode_generic_drop(inode);
8011 }
8012
8013 static void init_once(void *foo)
8014 {
8015 struct btrfs_inode *ei = foo;
8016
8017 inode_init_once(&ei->vfs_inode);
8018 #ifdef CONFIG_FS_VERITY
8019 ei->i_verity_info = NULL;
8020 #endif
8021 }
8022
8023 void __cold btrfs_destroy_cachep(void)
8024 {
8025 /*
8026 * Make sure all delayed rcu free inodes are flushed before we
8027 * destroy the cache.
8028 */
8029 rcu_barrier();
8030 kmem_cache_destroy(btrfs_inode_cachep);
8031 }
8032
8033 int __init btrfs_init_cachep(void)
8034 {
8035 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
8036 sizeof(struct btrfs_inode), 0,
8037 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
8038 init_once);
8039 if (!btrfs_inode_cachep)
8040 return -ENOMEM;
8041
8042 return 0;
8043 }
8044
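/*
 * getattr: besides the generic attributes, report the inode's otime as
 * btime, the statx attribute flags derived from the btrfs inode flags, the
 * subvolume id and a block count that also accounts for not yet flushed
 * delalloc.
 */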
8045 static int btrfs_getattr(struct mnt_idmap *idmap,
8046 const struct path *path, struct kstat *stat,
8047 u32 request_mask, unsigned int flags)
8048 {
8049 u64 delalloc_bytes;
8050 u64 inode_bytes;
8051 struct inode *inode = d_inode(path->dentry);
8052 u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
8053 u32 bi_flags = BTRFS_I(inode)->flags;
8054 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
8055
8056 stat->result_mask |= STATX_BTIME;
8057 stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
8058 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
8059 if (bi_flags & BTRFS_INODE_APPEND)
8060 stat->attributes |= STATX_ATTR_APPEND;
8061 if (bi_flags & BTRFS_INODE_COMPRESS)
8062 stat->attributes |= STATX_ATTR_COMPRESSED;
8063 if (bi_flags & BTRFS_INODE_IMMUTABLE)
8064 stat->attributes |= STATX_ATTR_IMMUTABLE;
8065 if (bi_flags & BTRFS_INODE_NODUMP)
8066 stat->attributes |= STATX_ATTR_NODUMP;
8067 if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
8068 stat->attributes |= STATX_ATTR_VERITY;
8069
8070 stat->attributes_mask |= (STATX_ATTR_APPEND |
8071 STATX_ATTR_COMPRESSED |
8072 STATX_ATTR_IMMUTABLE |
8073 STATX_ATTR_NODUMP);
8074
8075 generic_fillattr(idmap, request_mask, inode, stat);
8076 stat->dev = BTRFS_I(inode)->root->anon_dev;
8077
8078 stat->subvol = btrfs_root_id(BTRFS_I(inode)->root);
8079 stat->result_mask |= STATX_SUBVOL;
8080
8081 spin_lock(&BTRFS_I(inode)->lock);
8082 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
8083 inode_bytes = inode_get_bytes(inode);
8084 spin_unlock(&BTRFS_I(inode)->lock);
8085 stat->blocks = (ALIGN(inode_bytes, blocksize) +
8086 ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
8087 return 0;
8088 }
8089
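/*
 * Handle RENAME_EXCHANGE: swap the two directory entries in one transaction,
 * pinning the log (or forcing a full log commit when subvolumes are
 * involved) so a power failure can not leave only half of the exchange in
 * the log.
 */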
8090 static int btrfs_rename_exchange(struct inode *old_dir,
8091 struct dentry *old_dentry,
8092 struct inode *new_dir,
8093 struct dentry *new_dentry)
8094 {
8095 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
8096 struct btrfs_trans_handle *trans;
8097 unsigned int trans_num_items;
8098 struct btrfs_root *root = BTRFS_I(old_dir)->root;
8099 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8100 struct inode *new_inode = new_dentry->d_inode;
8101 struct inode *old_inode = old_dentry->d_inode;
8102 struct btrfs_rename_ctx old_rename_ctx;
8103 struct btrfs_rename_ctx new_rename_ctx;
8104 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8105 u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
8106 u64 old_idx = 0;
8107 u64 new_idx = 0;
8108 int ret;
8109 int ret2;
8110 bool need_abort = false;
8111 bool logs_pinned = false;
8112 struct fscrypt_name old_fname, new_fname;
8113 struct fscrypt_str *old_name, *new_name;
8114
8115 /*
8116 * For non-subvolumes allow exchange only within one subvolume, in the
8117 * same inode namespace. Two subvolumes (represented as directories) can
8118 * be exchanged as they're a logical link and have a fixed inode number.
8119 */
8120 if (root != dest &&
8121 (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
8122 new_ino != BTRFS_FIRST_FREE_OBJECTID))
8123 return -EXDEV;
8124
8125 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8126 if (ret)
8127 return ret;
8128
8129 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8130 if (ret) {
8131 fscrypt_free_filename(&old_fname);
8132 return ret;
8133 }
8134
8135 old_name = &old_fname.disk_name;
8136 new_name = &new_fname.disk_name;
8137
8138 /* close the race window with snapshot create/destroy ioctl */
8139 if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
8140 new_ino == BTRFS_FIRST_FREE_OBJECTID)
8141 down_read(&fs_info->subvol_sem);
8142
8143 /*
8144 * For each inode:
8145 * 1 to remove old dir item
8146 * 1 to remove old dir index
8147 * 1 to add new dir item
8148 * 1 to add new dir index
8149 * 1 to update parent inode
8150 *
8151 * If the parents are the same, we only need to account for one
8152 */
8153 trans_num_items = (old_dir == new_dir ? 9 : 10);
8154 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8155 /*
8156 * 1 to remove old root ref
8157 * 1 to remove old root backref
8158 * 1 to add new root ref
8159 * 1 to add new root backref
8160 */
8161 trans_num_items += 4;
8162 } else {
8163 /*
8164 * 1 to update inode item
8165 * 1 to remove old inode ref
8166 * 1 to add new inode ref
8167 */
8168 trans_num_items += 3;
8169 }
8170 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8171 trans_num_items += 4;
8172 else
8173 trans_num_items += 3;
8174 trans = btrfs_start_transaction(root, trans_num_items);
8175 if (IS_ERR(trans)) {
8176 ret = PTR_ERR(trans);
8177 goto out_notrans;
8178 }
8179
8180 if (dest != root) {
8181 ret = btrfs_record_root_in_trans(trans, dest);
8182 if (ret)
8183 goto out_fail;
8184 }
8185
8186 /*
8187 * We need to find a free sequence number both in the source and
8188 * in the destination directory for the exchange.
8189 */
8190 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8191 if (ret)
8192 goto out_fail;
8193 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8194 if (ret)
8195 goto out_fail;
8196
8197 BTRFS_I(old_inode)->dir_index = 0ULL;
8198 BTRFS_I(new_inode)->dir_index = 0ULL;
8199
8200 /* Reference for the source. */
8201 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8202 /* force full log commit if subvolume involved. */
8203 btrfs_set_log_full_commit(trans);
8204 } else {
8205 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8206 btrfs_ino(BTRFS_I(new_dir)),
8207 old_idx);
8208 if (ret)
8209 goto out_fail;
8210 need_abort = true;
8211 }
8212
8213 /* And now for the dest. */
8214 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8215 /* force full log commit if subvolume involved. */
8216 btrfs_set_log_full_commit(trans);
8217 } else {
8218 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8219 btrfs_ino(BTRFS_I(old_dir)),
8220 new_idx);
8221 if (ret) {
8222 if (unlikely(need_abort))
8223 btrfs_abort_transaction(trans, ret);
8224 goto out_fail;
8225 }
8226 }
8227
8228 /* Update inode version and ctime/mtime. */
8229 inode_inc_iversion(old_dir);
8230 inode_inc_iversion(new_dir);
8231 inode_inc_iversion(old_inode);
8232 inode_inc_iversion(new_inode);
8233 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8234
8235 if (old_ino != BTRFS_FIRST_FREE_OBJECTID &&
8236 new_ino != BTRFS_FIRST_FREE_OBJECTID) {
8237 /*
8238 * If we are renaming in the same directory (and it's not for
8239 * root entries) pin the log early to prevent any concurrent
8240 * task from logging the directory after we removed the old
8241 * entries and before we add the new entries, otherwise that
8242 * task can sync a log without any entry for the inodes we are
8243 * renaming and therefore replaying that log, if a power failure
8244 * happens after syncing the log, would result in deleting the
8245 * inodes.
8246 *
8247 * If the rename affects two different directories, we want to
8248 * make sure that there's no log commit that contains
8249 * updates for only one of the directories but not for the
8250 * other.
8251 *
8252 * If we are renaming an entry for a root, we don't care about
8253 * log updates since we called btrfs_set_log_full_commit().
8254 */
8255 btrfs_pin_log_trans(root);
8256 btrfs_pin_log_trans(dest);
8257 logs_pinned = true;
8258 }
8259
8260 if (old_dentry->d_parent != new_dentry->d_parent) {
8261 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8262 BTRFS_I(old_inode), true);
8263 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8264 BTRFS_I(new_inode), true);
8265 }
8266
8267 /* src is a subvolume */
8268 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8269 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8270 if (unlikely(ret)) {
8271 btrfs_abort_transaction(trans, ret);
8272 goto out_fail;
8273 }
8274 } else { /* src is an inode */
8275 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8276 BTRFS_I(old_dentry->d_inode),
8277 old_name, &old_rename_ctx);
8278 if (unlikely(ret)) {
8279 btrfs_abort_transaction(trans, ret);
8280 goto out_fail;
8281 }
8282 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8283 if (unlikely(ret)) {
8284 btrfs_abort_transaction(trans, ret);
8285 goto out_fail;
8286 }
8287 }
8288
8289 /* dest is a subvolume */
8290 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8291 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8292 if (unlikely(ret)) {
8293 btrfs_abort_transaction(trans, ret);
8294 goto out_fail;
8295 }
8296 } else { /* dest is an inode */
8297 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8298 BTRFS_I(new_dentry->d_inode),
8299 new_name, &new_rename_ctx);
8300 if (unlikely(ret)) {
8301 btrfs_abort_transaction(trans, ret);
8302 goto out_fail;
8303 }
8304 ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
8305 if (unlikely(ret)) {
8306 btrfs_abort_transaction(trans, ret);
8307 goto out_fail;
8308 }
8309 }
8310
8311 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8312 new_name, 0, old_idx);
8313 if (unlikely(ret)) {
8314 btrfs_abort_transaction(trans, ret);
8315 goto out_fail;
8316 }
8317
8318 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8319 old_name, 0, new_idx);
8320 if (unlikely(ret)) {
8321 btrfs_abort_transaction(trans, ret);
8322 goto out_fail;
8323 }
8324
8325 if (old_inode->i_nlink == 1)
8326 BTRFS_I(old_inode)->dir_index = old_idx;
8327 if (new_inode->i_nlink == 1)
8328 BTRFS_I(new_inode)->dir_index = new_idx;
8329
8330 /*
8331 * Do the log updates for all inodes.
8332 *
8333 * If either entry is for a root we don't need to update the logs since
8334 * we've called btrfs_set_log_full_commit() before.
8335 */
8336 if (logs_pinned) {
8337 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8338 old_rename_ctx.index, new_dentry->d_parent);
8339 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8340 new_rename_ctx.index, old_dentry->d_parent);
8341 }
8342
8343 out_fail:
8344 if (logs_pinned) {
8345 btrfs_end_log_trans(root);
8346 btrfs_end_log_trans(dest);
8347 }
8348 ret2 = btrfs_end_transaction(trans);
8349 ret = ret ? ret : ret2;
8350 out_notrans:
8351 if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8352 old_ino == BTRFS_FIRST_FREE_OBJECTID)
8353 up_read(&fs_info->subvol_sem);
8354
8355 fscrypt_free_filename(&new_fname);
8356 fscrypt_free_filename(&old_fname);
8357 return ret;
8358 }
8359
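/*
 * Allocate the special character device inode (WHITEOUT_DEV) that a
 * RENAME_WHITEOUT operation leaves behind at the old location.
 */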
8360 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8361 struct inode *dir)
8362 {
8363 struct inode *inode;
8364
8365 inode = new_inode(dir->i_sb);
8366 if (inode) {
8367 inode_init_owner(idmap, inode, dir,
8368 S_IFCHR | WHITEOUT_MODE);
8369 inode->i_op = &btrfs_special_inode_operations;
8370 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8371 }
8372 return inode;
8373 }
8374
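/*
 * Regular rename path, also handling the RENAME_WHITEOUT flag by creating a
 * whiteout inode to take the place of the old entry.
 */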
8375 static int btrfs_rename(struct mnt_idmap *idmap,
8376 struct inode *old_dir, struct dentry *old_dentry,
8377 struct inode *new_dir, struct dentry *new_dentry,
8378 unsigned int flags)
8379 {
8380 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
8381 struct btrfs_new_inode_args whiteout_args = {
8382 .dir = old_dir,
8383 .dentry = old_dentry,
8384 };
8385 struct btrfs_trans_handle *trans;
8386 unsigned int trans_num_items;
8387 struct btrfs_root *root = BTRFS_I(old_dir)->root;
8388 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8389 struct inode *new_inode = d_inode(new_dentry);
8390 struct inode *old_inode = d_inode(old_dentry);
8391 struct btrfs_rename_ctx rename_ctx;
8392 u64 index = 0;
8393 int ret;
8394 int ret2;
8395 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8396 struct fscrypt_name old_fname, new_fname;
8397 bool logs_pinned = false;
8398
8399 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8400 return -EPERM;
8401
8402 /* Renames across subvolumes are only allowed for subvolume links. */
8403 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8404 return -EXDEV;
8405
8406 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8407 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
8408 return -ENOTEMPTY;
8409
8410 if (S_ISDIR(old_inode->i_mode) && new_inode &&
8411 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8412 return -ENOTEMPTY;
8413
8414 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8415 if (ret)
8416 return ret;
8417
8418 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8419 if (ret) {
8420 fscrypt_free_filename(&old_fname);
8421 return ret;
8422 }
8423
8424 /* check for collisions, even if the name isn't there */
8425 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
8426 if (ret) {
8427 if (ret == -EEXIST) {
8428 /* We shouldn't get
8429 * -EEXIST without a new_inode. */
8430 if (WARN_ON(!new_inode)) {
8431 goto out_fscrypt_names;
8432 }
8433 } else {
8434 /* maybe -EOVERFLOW */
8435 goto out_fscrypt_names;
8436 }
8437 }
8438 ret = 0;
8439
8440 /*
8441 * we're using rename to replace one file with another. Start IO on it
8442 * now so we don't add too much work to the end of the transaction
8443 */
8444 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8445 filemap_flush(old_inode->i_mapping);
8446
8447 if (flags & RENAME_WHITEOUT) {
8448 whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
8449 if (!whiteout_args.inode) {
8450 ret = -ENOMEM;
8451 goto out_fscrypt_names;
8452 }
8453 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
8454 if (ret)
8455 goto out_whiteout_inode;
8456 } else {
8457 /* 1 to update the old parent inode. */
8458 trans_num_items = 1;
8459 }
8460
8461 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8462 /* Close the race window with snapshot create/destroy ioctl */
8463 down_read(&fs_info->subvol_sem);
8464 /*
8465 * 1 to remove old root ref
8466 * 1 to remove old root backref
8467 * 1 to add new root ref
8468 * 1 to add new root backref
8469 */
8470 trans_num_items += 4;
8471 } else {
8472 /*
8473 * 1 to update inode
8474 * 1 to remove old inode ref
8475 * 1 to add new inode ref
8476 */
8477 trans_num_items += 3;
8478 }
8479 /*
8480 * 1 to remove old dir item
8481 * 1 to remove old dir index
8482 * 1 to add new dir item
8483 * 1 to add new dir index
8484 */
8485 trans_num_items += 4;
8486 /* 1 to update new parent inode if it's not the same as the old parent */
8487 if (new_dir != old_dir)
8488 trans_num_items++;
8489 if (new_inode) {
8490 /*
8491 * 1 to update inode
8492 * 1 to remove inode ref
8493 * 1 to remove dir item
8494 * 1 to remove dir index
8495 * 1 to possibly add orphan item
8496 */
8497 trans_num_items += 5;
8498 }
8499 trans = btrfs_start_transaction(root, trans_num_items);
8500 if (IS_ERR(trans)) {
8501 ret = PTR_ERR(trans);
8502 goto out_notrans;
8503 }
8504
8505 if (dest != root) {
8506 ret = btrfs_record_root_in_trans(trans, dest);
8507 if (ret)
8508 goto out_fail;
8509 }
8510
8511 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
8512 if (ret)
8513 goto out_fail;
8514
8515 BTRFS_I(old_inode)->dir_index = 0ULL;
8516 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8517 /* force full log commit if subvolume involved. */
8518 btrfs_set_log_full_commit(trans);
8519 } else {
8520 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
8521 old_ino, btrfs_ino(BTRFS_I(new_dir)),
8522 index);
8523 if (ret)
8524 goto out_fail;
8525 }
8526
8527 inode_inc_iversion(old_dir);
8528 inode_inc_iversion(new_dir);
8529 inode_inc_iversion(old_inode);
8530 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8531
8532 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
8533 /*
8534 * If we are renaming in the same directory (and it's not a
8535 * root entry) pin the log to prevent any concurrent task from
8536 * logging the directory after we removed the old entry and
8537 * before we add the new entry, otherwise that task can sync
8538 * a log without any entry for the inode we are renaming and
8539 * therefore replaying that log, if a power failure happens
8540 * after syncing the log, would result in deleting the inode.
8541 *
8542 * If the rename affects two different directories, we want to
8543 * make sure that there's no log commit that contains
8544 * updates for only one of the directories but not for the
8545 * other.
8546 *
8547 * If we are renaming an entry for a root, we don't care about
8548 * log updates since we called btrfs_set_log_full_commit().
8549 */
8550 btrfs_pin_log_trans(root);
8551 btrfs_pin_log_trans(dest);
8552 logs_pinned = true;
8553 }
8554
8555 if (old_dentry->d_parent != new_dentry->d_parent)
8556 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8557 BTRFS_I(old_inode), true);
8558
8559 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8560 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8561 if (unlikely(ret)) {
8562 btrfs_abort_transaction(trans, ret);
8563 goto out_fail;
8564 }
8565 } else {
8566 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8567 BTRFS_I(d_inode(old_dentry)),
8568 &old_fname.disk_name, &rename_ctx);
8569 if (unlikely(ret)) {
8570 btrfs_abort_transaction(trans, ret);
8571 goto out_fail;
8572 }
8573 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8574 if (unlikely(ret)) {
8575 btrfs_abort_transaction(trans, ret);
8576 goto out_fail;
8577 }
8578 }
8579
8580 if (new_inode) {
8581 inode_inc_iversion(new_inode);
8582 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
8583 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8584 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8585 if (unlikely(ret)) {
8586 btrfs_abort_transaction(trans, ret);
8587 goto out_fail;
8588 }
8589 BUG_ON(new_inode->i_nlink == 0);
8590 } else {
8591 ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8592 BTRFS_I(d_inode(new_dentry)),
8593 &new_fname.disk_name);
8594 if (unlikely(ret)) {
8595 btrfs_abort_transaction(trans, ret);
8596 goto out_fail;
8597 }
8598 }
8599 if (new_inode->i_nlink == 0) {
8600 ret = btrfs_orphan_add(trans,
8601 BTRFS_I(d_inode(new_dentry)));
8602 if (unlikely(ret)) {
8603 btrfs_abort_transaction(trans, ret);
8604 goto out_fail;
8605 }
8606 }
8607 }
8608
8609 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8610 &new_fname.disk_name, 0, index);
8611 if (unlikely(ret)) {
8612 btrfs_abort_transaction(trans, ret);
8613 goto out_fail;
8614 }
8615
8616 if (old_inode->i_nlink == 1)
8617 BTRFS_I(old_inode)->dir_index = index;
8618
8619 if (logs_pinned)
8620 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8621 rename_ctx.index, new_dentry->d_parent);
8622
8623 if (flags & RENAME_WHITEOUT) {
8624 ret = btrfs_create_new_inode(trans, &whiteout_args);
8625 if (unlikely(ret)) {
8626 btrfs_abort_transaction(trans, ret);
8627 goto out_fail;
8628 } else {
8629 unlock_new_inode(whiteout_args.inode);
8630 iput(whiteout_args.inode);
8631 whiteout_args.inode = NULL;
8632 }
8633 }
8634 out_fail:
8635 if (logs_pinned) {
8636 btrfs_end_log_trans(root);
8637 btrfs_end_log_trans(dest);
8638 }
8639 ret2 = btrfs_end_transaction(trans);
8640 ret = ret ? ret : ret2;
8641 out_notrans:
8642 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8643 up_read(&fs_info->subvol_sem);
8644 if (flags & RENAME_WHITEOUT)
8645 btrfs_new_inode_args_destroy(&whiteout_args);
8646 out_whiteout_inode:
8647 if (flags & RENAME_WHITEOUT)
8648 iput(whiteout_args.inode);
8649 out_fscrypt_names:
8650 fscrypt_free_filename(&old_fname);
8651 fscrypt_free_filename(&new_fname);
8652 return ret;
8653 }
8654
8655 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
8656 struct dentry *old_dentry, struct inode *new_dir,
8657 struct dentry *new_dentry, unsigned int flags)
8658 {
8659 int ret;
8660
8661 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
8662 return -EINVAL;
8663
8664 if (flags & RENAME_EXCHANGE)
8665 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
8666 new_dentry);
8667 else
8668 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
8669 new_dentry, flags);
8670
8671 btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
8672
8673 return ret;
8674 }
8675
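/*
 * Work item queued on fs_info->flush_workers to flush the delalloc pages of
 * one inode; the submitter in start_delalloc_inodes() waits on @completion
 * after queueing all items.
 */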
8676 struct btrfs_delalloc_work {
8677 struct inode *inode;
8678 struct completion completion;
8679 struct list_head list;
8680 struct btrfs_work work;
8681 };
8682
8683 static void btrfs_run_delalloc_work(struct btrfs_work *work)
8684 {
8685 struct btrfs_delalloc_work *delalloc_work;
8686 struct inode *inode;
8687
8688 delalloc_work = container_of(work, struct btrfs_delalloc_work,
8689 work);
8690 inode = delalloc_work->inode;
8691 filemap_flush(inode->i_mapping);
8692 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8693 &BTRFS_I(inode)->runtime_flags))
8694 filemap_flush(inode->i_mapping);
8695
8696 iput(inode);
8697 complete(&delalloc_work->completion);
8698 }
8699
8700 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
8701 {
8702 struct btrfs_delalloc_work *work;
8703
8704 work = kmalloc(sizeof(*work), GFP_NOFS);
8705 if (!work)
8706 return NULL;
8707
8708 init_completion(&work->completion);
8709 INIT_LIST_HEAD(&work->list);
8710 work->inode = inode;
8711 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
8712
8713 return work;
8714 }
8715
8716 /*
8717 * some fairly slow code that needs optimization. This walks the list
8718 * of all the inodes with pending delalloc and forces them to disk.
8719 */
8720 static int start_delalloc_inodes(struct btrfs_root *root, long *nr_to_write,
8721 bool snapshot, bool in_reclaim_context)
8722 {
8723 struct btrfs_delalloc_work *work, *next;
8724 LIST_HEAD(works);
8725 LIST_HEAD(splice);
8726 int ret = 0;
8727
8728 mutex_lock(&root->delalloc_mutex);
8729 spin_lock(&root->delalloc_lock);
8730 list_splice_init(&root->delalloc_inodes, &splice);
8731 while (!list_empty(&splice)) {
8732 struct btrfs_inode *inode;
8733 struct inode *tmp_inode;
8734
8735 inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes);
8736
8737 list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
8738
8739 if (in_reclaim_context &&
8740 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
8741 continue;
8742
8743 tmp_inode = igrab(&inode->vfs_inode);
8744 if (!tmp_inode) {
8745 cond_resched_lock(&root->delalloc_lock);
8746 continue;
8747 }
8748 spin_unlock(&root->delalloc_lock);
8749
8750 if (snapshot)
8751 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
8752 if (nr_to_write == NULL) {
8753 work = btrfs_alloc_delalloc_work(tmp_inode);
8754 if (!work) {
8755 iput(tmp_inode);
8756 ret = -ENOMEM;
8757 goto out;
8758 }
8759 list_add_tail(&work->list, &works);
8760 btrfs_queue_work(root->fs_info->flush_workers,
8761 &work->work);
8762 } else {
8763 ret = filemap_flush_nr(tmp_inode->i_mapping,
8764 nr_to_write);
8765 btrfs_add_delayed_iput(inode);
8766
8767 if (ret || *nr_to_write <= 0)
8768 goto out;
8769 }
8770 cond_resched();
8771 spin_lock(&root->delalloc_lock);
8772 }
8773 spin_unlock(&root->delalloc_lock);
8774
8775 out:
8776 list_for_each_entry_safe(work, next, &works, list) {
8777 list_del_init(&work->list);
8778 wait_for_completion(&work->completion);
8779 kfree(work);
8780 }
8781
8782 if (!list_empty(&splice)) {
8783 spin_lock(&root->delalloc_lock);
8784 list_splice_tail(&splice, &root->delalloc_inodes);
8785 spin_unlock(&root->delalloc_lock);
8786 }
8787 mutex_unlock(&root->delalloc_mutex);
8788 return ret;
8789 }
8790
8791 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
8792 {
8793 struct btrfs_fs_info *fs_info = root->fs_info;
8794
8795 if (BTRFS_FS_ERROR(fs_info))
8796 return -EROFS;
8797 return start_delalloc_inodes(root, NULL, true, in_reclaim_context);
8798 }
8799
8800 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
8801 bool in_reclaim_context)
8802 {
8803 long *nr_to_write = nr == LONG_MAX ? NULL : &nr;
8804 struct btrfs_root *root;
8805 LIST_HEAD(splice);
8806 int ret;
8807
8808 if (BTRFS_FS_ERROR(fs_info))
8809 return -EROFS;
8810
8811 mutex_lock(&fs_info->delalloc_root_mutex);
8812 spin_lock(&fs_info->delalloc_root_lock);
8813 list_splice_init(&fs_info->delalloc_roots, &splice);
8814 while (!list_empty(&splice)) {
8815 root = list_first_entry(&splice, struct btrfs_root,
8816 delalloc_root);
8817 root = btrfs_grab_root(root);
8818 BUG_ON(!root);
8819 list_move_tail(&root->delalloc_root,
8820 &fs_info->delalloc_roots);
8821 spin_unlock(&fs_info->delalloc_root_lock);
8822
8823 ret = start_delalloc_inodes(root, nr_to_write, false,
8824 in_reclaim_context);
8825 btrfs_put_root(root);
8826 if (ret < 0 || nr <= 0)
8827 goto out;
8828 spin_lock(&fs_info->delalloc_root_lock);
8829 }
8830 spin_unlock(&fs_info->delalloc_root_lock);
8831
8832 ret = 0;
8833 out:
8834 if (!list_empty(&splice)) {
8835 spin_lock(&fs_info->delalloc_root_lock);
8836 list_splice_tail(&splice, &fs_info->delalloc_roots);
8837 spin_unlock(&fs_info->delalloc_root_lock);
8838 }
8839 mutex_unlock(&fs_info->delalloc_root_mutex);
8840 return ret;
8841 }
8842
8843 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
8844 struct dentry *dentry, const char *symname)
8845 {
8846 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
8847 struct btrfs_trans_handle *trans;
8848 struct btrfs_root *root = BTRFS_I(dir)->root;
8849 struct btrfs_path *path;
8850 struct btrfs_key key;
8851 struct inode *inode;
8852 struct btrfs_new_inode_args new_inode_args = {
8853 .dir = dir,
8854 .dentry = dentry,
8855 };
8856 unsigned int trans_num_items;
8857 int ret;
8858 int name_len;
8859 int datasize;
8860 unsigned long ptr;
8861 struct btrfs_file_extent_item *ei;
8862 struct extent_buffer *leaf;
8863
8864 name_len = strlen(symname);
8865 /*
8866 * Symlinks utilize uncompressed inline extent data, which should not
8867 * reach block size.
8868 */
8869 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
8870 name_len >= fs_info->sectorsize)
8871 return -ENAMETOOLONG;
8872
8873 inode = new_inode(dir->i_sb);
8874 if (!inode)
8875 return -ENOMEM;
8876 inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
8877 inode->i_op = &btrfs_symlink_inode_operations;
8878 inode_nohighmem(inode);
8879 inode->i_mapping->a_ops = &btrfs_aops;
8880 btrfs_i_size_write(BTRFS_I(inode), name_len);
8881 inode_set_bytes(inode, name_len);
8882
8883 new_inode_args.inode = inode;
8884 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
8885 if (ret)
8886 goto out_inode;
8887 /* 1 additional item for the inline extent */
8888 trans_num_items++;
8889
8890 trans = btrfs_start_transaction(root, trans_num_items);
8891 if (IS_ERR(trans)) {
8892 ret = PTR_ERR(trans);
8893 goto out_new_inode_args;
8894 }
8895
8896 ret = btrfs_create_new_inode(trans, &new_inode_args);
8897 if (ret)
8898 goto out;
8899
8900 path = btrfs_alloc_path();
8901 if (unlikely(!path)) {
8902 ret = -ENOMEM;
8903 btrfs_abort_transaction(trans, ret);
8904 discard_new_inode(inode);
8905 inode = NULL;
8906 goto out;
8907 }
8908 key.objectid = btrfs_ino(BTRFS_I(inode));
8909 key.type = BTRFS_EXTENT_DATA_KEY;
8910 key.offset = 0;
8911 datasize = btrfs_file_extent_calc_inline_size(name_len);
8912 ret = btrfs_insert_empty_item(trans, root, path, &key, datasize);
8913 if (unlikely(ret)) {
8914 btrfs_abort_transaction(trans, ret);
8915 btrfs_free_path(path);
8916 discard_new_inode(inode);
8917 inode = NULL;
8918 goto out;
8919 }
8920 leaf = path->nodes[0];
8921 ei = btrfs_item_ptr(leaf, path->slots[0],
8922 struct btrfs_file_extent_item);
8923 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8924 btrfs_set_file_extent_type(leaf, ei,
8925 BTRFS_FILE_EXTENT_INLINE);
8926 btrfs_set_file_extent_encryption(leaf, ei, 0);
8927 btrfs_set_file_extent_compression(leaf, ei, 0);
8928 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8929 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8930
8931 ptr = btrfs_file_extent_inline_start(ei);
8932 write_extent_buffer(leaf, symname, ptr, name_len);
8933 btrfs_free_path(path);
8934
8935 d_instantiate_new(dentry, inode);
8936 ret = 0;
8937 out:
8938 btrfs_end_transaction(trans);
8939 btrfs_btree_balance_dirty(fs_info);
8940 out_new_inode_args:
8941 btrfs_new_inode_args_destroy(&new_inode_args);
8942 out_inode:
8943 if (ret)
8944 iput(inode);
8945 return ret;
8946 }
8947
8948 static struct btrfs_trans_handle *insert_prealloc_file_extent(
8949 struct btrfs_trans_handle *trans_in,
8950 struct btrfs_inode *inode,
8951 struct btrfs_key *ins,
8952 u64 file_offset)
8953 {
8954 struct btrfs_file_extent_item stack_fi;
8955 struct btrfs_replace_extent_info extent_info;
8956 struct btrfs_trans_handle *trans = trans_in;
8957 struct btrfs_path *path;
8958 u64 start = ins->objectid;
8959 u64 len = ins->offset;
8960 u64 qgroup_released = 0;
8961 int ret;
8962
8963 memset(&stack_fi, 0, sizeof(stack_fi));
8964
8965 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
8966 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
8967 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
8968 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
8969 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
8970 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
8971 /* Encryption and other encoding is reserved and all 0 */
8972
8973 ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
8974 if (ret < 0)
8975 return ERR_PTR(ret);
8976
8977 if (trans) {
8978 ret = insert_reserved_file_extent(trans, inode,
8979 file_offset, &stack_fi,
8980 true, qgroup_released);
8981 if (ret)
8982 goto free_qgroup;
8983 return trans;
8984 }
8985
8986 extent_info.disk_offset = start;
8987 extent_info.disk_len = len;
8988 extent_info.data_offset = 0;
8989 extent_info.data_len = len;
8990 extent_info.file_offset = file_offset;
8991 extent_info.extent_buf = (char *)&stack_fi;
8992 extent_info.is_new_extent = true;
8993 extent_info.update_times = true;
8994 extent_info.qgroup_reserved = qgroup_released;
8995 extent_info.insertions = 0;
8996
8997 path = btrfs_alloc_path();
8998 if (!path) {
8999 ret = -ENOMEM;
9000 goto free_qgroup;
9001 }
9002
9003 ret = btrfs_replace_file_extents(inode, path, file_offset,
9004 file_offset + len - 1, &extent_info,
9005 &trans);
9006 btrfs_free_path(path);
9007 if (ret)
9008 goto free_qgroup;
9009 return trans;
9010
9011 free_qgroup:
9012 /*
9013 * We released the qgroup data range at the beginning of the function,
9014 * and normally the qgroup_released bytes are freed when the
9015 * transaction commits.
9016 * But if we error out early, we have to free what we have released
9017 * or we leak the qgroup data reservation.
9018 */
9019 btrfs_qgroup_free_refroot(inode->root->fs_info,
9020 btrfs_root_id(inode->root), qgroup_released,
9021 BTRFS_QGROUP_RSV_DATA);
9022 return ERR_PTR(ret);
9023 }
9024
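/*
 * Allocate preallocated (unwritten) extents covering [start, start + num_bytes).
 * Extents are reserved in chunks of at most 256M and inserted as PREALLOC file
 * extent items. i_size is only extended when FALLOC_FL_KEEP_SIZE is not set,
 * and when @trans is NULL each iteration runs in its own transaction.
 */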
9025 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
9026 u64 start, u64 num_bytes, u64 min_size,
9027 loff_t actual_len, u64 *alloc_hint,
9028 struct btrfs_trans_handle *trans)
9029 {
9030 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
9031 struct extent_map *em;
9032 struct btrfs_root *root = BTRFS_I(inode)->root;
9033 struct btrfs_key ins;
9034 u64 cur_offset = start;
9035 u64 clear_offset = start;
9036 u64 i_size;
9037 u64 cur_bytes;
9038 u64 last_alloc = (u64)-1;
9039 int ret = 0;
9040 bool own_trans = true;
9041 u64 end = start + num_bytes - 1;
9042
9043 if (trans)
9044 own_trans = false;
9045 while (num_bytes > 0) {
9046 cur_bytes = min_t(u64, num_bytes, SZ_256M);
9047 cur_bytes = max(cur_bytes, min_size);
9048 /*
9049 * If we are severely fragmented we could end up with really
9050 * small allocations, so if the allocator is returning small
9051 * chunks, let's make its job easier by only searching for
9052 * chunks of that size.
9053 */
9054 cur_bytes = min(cur_bytes, last_alloc);
9055 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
9056 min_size, 0, *alloc_hint, &ins, 1, 0);
9057 if (ret)
9058 break;
9059
9060 /*
9061 * We've reserved this space, and thus converted it from
9062 * ->bytes_may_use to ->bytes_reserved. Any error that happens
9063 * from here on out we will only need to clear our reservation
9064 * for the remaining unreserved area, so advance our
9065 * clear_offset by our extent size.
9066 */
9067 clear_offset += ins.offset;
9068
9069 last_alloc = ins.offset;
9070 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
9071 &ins, cur_offset);
9072 /*
9073 * Now that we inserted the prealloc extent we can finally
9074 * decrement the number of reservations in the block group.
9075 * If we did it before, we could race with relocation and have
9076 * relocation miss the reserved extent, making it fail later.
9077 */
9078 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9079 if (IS_ERR(trans)) {
9080 ret = PTR_ERR(trans);
9081 btrfs_free_reserved_extent(fs_info, ins.objectid,
9082 ins.offset, false);
9083 break;
9084 }
9085
9086 em = btrfs_alloc_extent_map();
9087 if (!em) {
9088 btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
9089 cur_offset + ins.offset - 1, false);
9090 btrfs_set_inode_full_sync(BTRFS_I(inode));
9091 goto next;
9092 }
9093
9094 em->start = cur_offset;
9095 em->len = ins.offset;
9096 em->disk_bytenr = ins.objectid;
9097 em->offset = 0;
9098 em->disk_num_bytes = ins.offset;
9099 em->ram_bytes = ins.offset;
9100 em->flags |= EXTENT_FLAG_PREALLOC;
9101 em->generation = trans->transid;
9102
9103 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
9104 btrfs_free_extent_map(em);
9105 next:
9106 num_bytes -= ins.offset;
9107 cur_offset += ins.offset;
9108 *alloc_hint = ins.objectid + ins.offset;
9109
9110 inode_inc_iversion(inode);
9111 inode_set_ctime_current(inode);
9112 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
9113 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
9114 (actual_len > inode->i_size) &&
9115 (cur_offset > inode->i_size)) {
9116 if (cur_offset > actual_len)
9117 i_size = actual_len;
9118 else
9119 i_size = cur_offset;
9120 i_size_write(inode, i_size);
9121 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
9122 }
9123
9124 ret = btrfs_update_inode(trans, BTRFS_I(inode));
9125
9126 if (unlikely(ret)) {
9127 btrfs_abort_transaction(trans, ret);
9128 if (own_trans)
9129 btrfs_end_transaction(trans);
9130 break;
9131 }
9132
9133 if (own_trans) {
9134 btrfs_end_transaction(trans);
9135 trans = NULL;
9136 }
9137 }
9138 if (clear_offset < end)
9139 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
9140 end - clear_offset + 1);
9141 return ret;
9142 }
9143
9144 int btrfs_prealloc_file_range(struct inode *inode, int mode,
9145 u64 start, u64 num_bytes, u64 min_size,
9146 loff_t actual_len, u64 *alloc_hint)
9147 {
9148 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9149 min_size, actual_len, alloc_hint,
9150 NULL);
9151 }
9152
9153 int btrfs_prealloc_file_range_trans(struct inode *inode,
9154 struct btrfs_trans_handle *trans, int mode,
9155 u64 start, u64 num_bytes, u64 min_size,
9156 loff_t actual_len, u64 *alloc_hint)
9157 {
9158 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
9159 min_size, actual_len, alloc_hint, trans);
9160 }
9161
9162 /*
9163 * NOTE: in case you are adding MAY_EXEC check for directories:
9164 * we are marking them with IOP_FASTPERM_MAY_EXEC, allowing path lookup to
9165 * elide calls here.
9166 */
9167 static int btrfs_permission(struct mnt_idmap *idmap,
9168 struct inode *inode, int mask)
9169 {
9170 struct btrfs_root *root = BTRFS_I(inode)->root;
9171 umode_t mode = inode->i_mode;
9172
9173 if (mask & MAY_WRITE &&
9174 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9175 if (btrfs_root_readonly(root))
9176 return -EROFS;
9177 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9178 return -EACCES;
9179 }
9180 return generic_permission(idmap, inode, mask);
9181 }
9182
9183 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9184 struct file *file, umode_t mode)
9185 {
9186 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
9187 struct btrfs_trans_handle *trans;
9188 struct btrfs_root *root = BTRFS_I(dir)->root;
9189 struct inode *inode;
9190 struct btrfs_new_inode_args new_inode_args = {
9191 .dir = dir,
9192 .dentry = file->f_path.dentry,
9193 .orphan = true,
9194 };
9195 unsigned int trans_num_items;
9196 int ret;
9197
9198 inode = new_inode(dir->i_sb);
9199 if (!inode)
9200 return -ENOMEM;
9201 inode_init_owner(idmap, inode, dir, mode);
9202 inode->i_fop = &btrfs_file_operations;
9203 inode->i_op = &btrfs_file_inode_operations;
9204 inode->i_mapping->a_ops = &btrfs_aops;
9205
9206 new_inode_args.inode = inode;
9207 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9208 if (ret)
9209 goto out_inode;
9210
9211 trans = btrfs_start_transaction(root, trans_num_items);
9212 if (IS_ERR(trans)) {
9213 ret = PTR_ERR(trans);
9214 goto out_new_inode_args;
9215 }
9216
9217 ret = btrfs_create_new_inode(trans, &new_inode_args);
9218
9219 /*
9220 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9221 * set it to 1 because d_tmpfile() will issue a warning if the count is
9222 * 0, through:
9223 *
9224 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9225 */
9226 set_nlink(inode, 1);
9227
9228 if (!ret) {
9229 d_tmpfile(file, inode);
9230 unlock_new_inode(inode);
9231 mark_inode_dirty(inode);
9232 }
9233
9234 btrfs_end_transaction(trans);
9235 btrfs_btree_balance_dirty(fs_info);
9236 out_new_inode_args:
9237 btrfs_new_inode_args_destroy(&new_inode_args);
9238 out_inode:
9239 if (ret)
9240 iput(inode);
9241 return finish_open_simple(file, ret);
9242 }
9243
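/*
 * Map an on-disk compression type to the encoded I/O compression value used by
 * the encoded read/write ioctls. LZO depends on the sector size, so the result
 * is LZO_4K + (sectorsize_bits - 12), i.e. LZO_4K for 4K sectors up to LZO_64K
 * for 64K sectors.
 */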
9244 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9245 int compress_type)
9246 {
9247 switch (compress_type) {
9248 case BTRFS_COMPRESS_NONE:
9249 return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9250 case BTRFS_COMPRESS_ZLIB:
9251 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9252 case BTRFS_COMPRESS_LZO:
9253 /*
9254 * The LZO format depends on the sector size. 64K is the maximum
9255 * sector size that we support.
9256 */
9257 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9258 return -EINVAL;
9259 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9260 (fs_info->sectorsize_bits - 12);
9261 case BTRFS_COMPRESS_ZSTD:
9262 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9263 default:
9264 return -EUCLEAN;
9265 }
9266 }
9267
9268 static ssize_t btrfs_encoded_read_inline(
9269 struct kiocb *iocb,
9270 struct iov_iter *iter, u64 start,
9271 u64 lockend,
9272 struct extent_state **cached_state,
9273 u64 extent_start, size_t count,
9274 struct btrfs_ioctl_encoded_io_args *encoded,
9275 bool *unlocked)
9276 {
9277 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9278 struct btrfs_root *root = inode->root;
9279 struct btrfs_fs_info *fs_info = root->fs_info;
9280 struct extent_io_tree *io_tree = &inode->io_tree;
9281 BTRFS_PATH_AUTO_FREE(path);
9282 struct extent_buffer *leaf;
9283 struct btrfs_file_extent_item *item;
9284 u64 ram_bytes;
9285 unsigned long ptr;
9286 void *tmp;
9287 ssize_t ret;
9288 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9289
9290 path = btrfs_alloc_path();
9291 if (!path)
9292 return -ENOMEM;
9293
9294 path->nowait = nowait;
9295
9296 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9297 extent_start, 0);
9298 if (ret) {
9299 if (unlikely(ret > 0)) {
9300 /* The extent item disappeared? */
9301 return -EIO;
9302 }
9303 return ret;
9304 }
9305 leaf = path->nodes[0];
9306 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9307
9308 ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9309 ptr = btrfs_file_extent_inline_start(item);
9310
9311 encoded->len = min_t(u64, extent_start + ram_bytes,
9312 inode->vfs_inode.i_size) - iocb->ki_pos;
9313 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9314 btrfs_file_extent_compression(leaf, item));
9315 if (ret < 0)
9316 return ret;
9317 encoded->compression = ret;
9318 if (encoded->compression) {
9319 size_t inline_size;
9320
9321 inline_size = btrfs_file_extent_inline_item_len(leaf,
9322 path->slots[0]);
9323 if (inline_size > count)
9324 return -ENOBUFS;
9325
9326 count = inline_size;
9327 encoded->unencoded_len = ram_bytes;
9328 encoded->unencoded_offset = iocb->ki_pos - extent_start;
9329 } else {
9330 count = min_t(u64, count, encoded->len);
9331 encoded->len = count;
9332 encoded->unencoded_len = count;
9333 ptr += iocb->ki_pos - extent_start;
9334 }
9335
9336 tmp = kmalloc(count, GFP_NOFS);
9337 if (!tmp)
9338 return -ENOMEM;
9339
9340 read_extent_buffer(leaf, tmp, ptr, count);
9341 btrfs_release_path(path);
9342 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9343 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9344 *unlocked = true;
9345
9346 ret = copy_to_iter(tmp, count, iter);
9347 if (ret != count)
9348 ret = -EFAULT;
9349 kfree(tmp);
9350
9351 return ret;
9352 }
9353
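/*
 * Completion state shared by all bios of one encoded read: @pending_refs holds
 * one reference per in-flight bio plus one for the submitter, @status records
 * a bio error if one occurred, and either @sync_reads (synchronous reads) or
 * @uring_ctx (io_uring reads) is used to report completion.
 */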
9354 struct btrfs_encoded_read_private {
9355 struct completion *sync_reads;
9356 void *uring_ctx;
9357 refcount_t pending_refs;
9358 blk_status_t status;
9359 };
9360
9361 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9362 {
9363 struct btrfs_encoded_read_private *priv = bbio->private;
9364
9365 if (bbio->bio.bi_status) {
9366 /*
9367 * The memory barrier implied by the refcount_dec_and_test() here
9368 * pairs with the memory barrier implied by the refcount_dec_and_test()
9369 * in btrfs_encoded_read_regular_fill_pages() to ensure that
9370 * this write is observed before the load of status in
9371 * btrfs_encoded_read_regular_fill_pages().
9372 */
9373 WRITE_ONCE(priv->status, bbio->bio.bi_status);
9374 }
9375 if (refcount_dec_and_test(&priv->pending_refs)) {
9376 int err = blk_status_to_errno(READ_ONCE(priv->status));
9377
9378 if (priv->uring_ctx) {
9379 btrfs_uring_read_extent_endio(priv->uring_ctx, err);
9380 kfree(priv);
9381 } else {
9382 complete(priv->sync_reads);
9383 }
9384 }
9385 bio_put(&bbio->bio);
9386 }
9387
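/*
 * Read @disk_io_size bytes starting at @disk_bytenr into @pages. With a NULL
 * @uring_ctx the read is synchronous and its status is returned directly;
 * otherwise completion is reported via btrfs_uring_read_extent_endio() and
 * -EIOCBQUEUED is returned while bios are still in flight.
 */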
9388 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9389 u64 disk_bytenr, u64 disk_io_size,
9390 struct page **pages, void *uring_ctx)
9391 {
9392 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9393 struct btrfs_encoded_read_private *priv, sync_priv;
9394 struct completion sync_reads;
9395 unsigned long i = 0;
9396 struct btrfs_bio *bbio;
9397 int ret;
9398
9399 /*
9400 * Synchronous reads complete within this call, so their private data can
9401 * live on the stack; io_uring reads outlive the call and need a heap allocation.
9402 */
9403 if (uring_ctx) {
9404 priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
9405 if (!priv)
9406 return -ENOMEM;
9407 } else {
9408 priv = &sync_priv;
9409 init_completion(&sync_reads);
9410 priv->sync_reads = &sync_reads;
9411 }
9412
9413 refcount_set(&priv->pending_refs, 1);
9414 priv->status = 0;
9415 priv->uring_ctx = uring_ctx;
9416
9417 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9418 btrfs_encoded_read_endio, priv);
9419 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9420 bbio->inode = inode;
9421
9422 do {
9423 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
9424
9425 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
9426 refcount_inc(&priv->pending_refs);
9427 btrfs_submit_bbio(bbio, 0);
9428
9429 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9430 btrfs_encoded_read_endio, priv);
9431 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9432 bbio->inode = inode;
9433 continue;
9434 }
9435
9436 i++;
9437 disk_bytenr += bytes;
9438 disk_io_size -= bytes;
9439 } while (disk_io_size);
9440
9441 refcount_inc(&priv->pending_refs);
9442 btrfs_submit_bbio(bbio, 0);
9443
9444 if (uring_ctx) {
9445 if (refcount_dec_and_test(&priv->pending_refs)) {
9446 ret = blk_status_to_errno(READ_ONCE(priv->status));
9447 btrfs_uring_read_extent_endio(uring_ctx, ret);
9448 kfree(priv);
9449 return ret;
9450 }
9451
9452 return -EIOCBQUEUED;
9453 } else {
9454 if (!refcount_dec_and_test(&priv->pending_refs))
9455 wait_for_completion_io(&sync_reads);
9456 /* See btrfs_encoded_read_endio() for ordering. */
9457 return blk_status_to_errno(READ_ONCE(priv->status));
9458 }
9459 }
9460
9461 ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
9462 u64 start, u64 lockend,
9463 struct extent_state **cached_state,
9464 u64 disk_bytenr, u64 disk_io_size,
9465 size_t count, bool compressed, bool *unlocked)
9466 {
9467 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9468 struct extent_io_tree *io_tree = &inode->io_tree;
9469 struct page **pages;
9470 unsigned long nr_pages, i;
9471 u64 cur;
9472 size_t page_offset;
9473 ssize_t ret;
9474
9475 nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
9476 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
9477 if (!pages)
9478 return -ENOMEM;
9479 ret = btrfs_alloc_page_array(nr_pages, pages, false);
9480 if (ret) {
9481 ret = -ENOMEM;
9482 goto out;
9483 }
9484
9485 ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
9486 disk_io_size, pages, NULL);
9487 if (ret)
9488 goto out;
9489
9490 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9491 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9492 *unlocked = true;
9493
9494 if (compressed) {
9495 i = 0;
9496 page_offset = 0;
9497 } else {
9498 i = (iocb->ki_pos - start) >> PAGE_SHIFT;
9499 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
9500 }
9501 cur = 0;
9502 while (cur < count) {
9503 size_t bytes = min_t(size_t, count - cur,
9504 PAGE_SIZE - page_offset);
9505
9506 if (copy_page_to_iter(pages[i], page_offset, bytes,
9507 iter) != bytes) {
9508 ret = -EFAULT;
9509 goto out;
9510 }
9511 i++;
9512 cur += bytes;
9513 page_offset = 0;
9514 }
9515 ret = count;
9516 out:
9517 for (i = 0; i < nr_pages; i++) {
9518 if (pages[i])
9519 __free_page(pages[i]);
9520 }
9521 kfree(pages);
9522 return ret;
9523 }
9524
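/*
 * Look up the extent at iocb->ki_pos for an encoded read. Inline extents,
 * holes and preallocated ranges are served directly from here. For regular
 * extents the disk location is returned through @disk_bytenr/@disk_io_size and
 * -EIOCBQUEUED is returned with the inode and extent range still locked, so
 * the caller can perform the actual read.
 */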
9525 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
9526 struct btrfs_ioctl_encoded_io_args *encoded,
9527 struct extent_state **cached_state,
9528 u64 *disk_bytenr, u64 *disk_io_size)
9529 {
9530 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9531 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9532 struct extent_io_tree *io_tree = &inode->io_tree;
9533 ssize_t ret;
9534 size_t count = iov_iter_count(iter);
9535 u64 start, lockend;
9536 struct extent_map *em;
9537 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9538 bool unlocked = false;
9539
9540 file_accessed(iocb->ki_filp);
9541
9542 ret = btrfs_inode_lock(inode,
9543 BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
9544 if (ret)
9545 return ret;
9546
9547 if (iocb->ki_pos >= inode->vfs_inode.i_size) {
9548 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9549 return 0;
9550 }
9551 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
9552 /*
9553 * We don't know how long the extent containing iocb->ki_pos is, but if
9554 * it's compressed we know that it won't be longer than this.
9555 */
9556 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
9557
9558 if (nowait) {
9559 struct btrfs_ordered_extent *ordered;
9560
9561 if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
9562 start, lockend)) {
9563 ret = -EAGAIN;
9564 goto out_unlock_inode;
9565 }
9566
9567 if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) {
9568 ret = -EAGAIN;
9569 goto out_unlock_inode;
9570 }
9571
9572 ordered = btrfs_lookup_ordered_range(inode, start,
9573 lockend - start + 1);
9574 if (ordered) {
9575 btrfs_put_ordered_extent(ordered);
9576 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9577 ret = -EAGAIN;
9578 goto out_unlock_inode;
9579 }
9580 } else {
9581 for (;;) {
9582 struct btrfs_ordered_extent *ordered;
9583
9584 ret = btrfs_wait_ordered_range(inode, start,
9585 lockend - start + 1);
9586 if (ret)
9587 goto out_unlock_inode;
9588
9589 btrfs_lock_extent(io_tree, start, lockend, cached_state);
9590 ordered = btrfs_lookup_ordered_range(inode, start,
9591 lockend - start + 1);
9592 if (!ordered)
9593 break;
9594 btrfs_put_ordered_extent(ordered);
9595 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9596 cond_resched();
9597 }
9598 }
9599
9600 em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
9601 if (IS_ERR(em)) {
9602 ret = PTR_ERR(em);
9603 goto out_unlock_extent;
9604 }
9605
9606 if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9607 u64 extent_start = em->start;
9608
9609 /*
9610 * For inline extents we get everything we need out of the
9611 * extent item.
9612 */
9613 btrfs_free_extent_map(em);
9614 em = NULL;
9615 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
9616 cached_state, extent_start,
9617 count, encoded, &unlocked);
9618 goto out_unlock_extent;
9619 }
9620
9621 /*
9622 * We only want to return up to EOF even if the extent extends beyond
9623 * that.
9624 */
9625 encoded->len = min_t(u64, btrfs_extent_map_end(em),
9626 inode->vfs_inode.i_size) - iocb->ki_pos;
9627 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
9628 (em->flags & EXTENT_FLAG_PREALLOC)) {
9629 *disk_bytenr = EXTENT_MAP_HOLE;
9630 count = min_t(u64, count, encoded->len);
9631 encoded->len = count;
9632 encoded->unencoded_len = count;
9633 } else if (btrfs_extent_map_is_compressed(em)) {
9634 *disk_bytenr = em->disk_bytenr;
9635 /*
9636 * Bail if the buffer isn't large enough to return the whole
9637 * compressed extent.
9638 */
9639 if (em->disk_num_bytes > count) {
9640 ret = -ENOBUFS;
9641 goto out_em;
9642 }
9643 *disk_io_size = em->disk_num_bytes;
9644 count = em->disk_num_bytes;
9645 encoded->unencoded_len = em->ram_bytes;
9646 encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
9647 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9648 btrfs_extent_map_compression(em));
9649 if (ret < 0)
9650 goto out_em;
9651 encoded->compression = ret;
9652 } else {
9653 *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start);
9654 if (encoded->len > count)
9655 encoded->len = count;
9656 /*
9657 * Don't read beyond what we locked. This also limits the page
9658 * allocations that we'll do.
9659 */
9660 *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
9661 count = start + *disk_io_size - iocb->ki_pos;
9662 encoded->len = count;
9663 encoded->unencoded_len = count;
9664 *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
9665 }
9666 btrfs_free_extent_map(em);
9667 em = NULL;
9668
9669 if (*disk_bytenr == EXTENT_MAP_HOLE) {
9670 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9671 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9672 unlocked = true;
9673 ret = iov_iter_zero(count, iter);
9674 if (ret != count)
9675 ret = -EFAULT;
9676 } else {
9677 ret = -EIOCBQUEUED;
9678 goto out_unlock_extent;
9679 }
9680
9681 out_em:
9682 btrfs_free_extent_map(em);
9683 out_unlock_extent:
9684 /* Leave inode and extent locked if we need to do a read. */
9685 if (!unlocked && ret != -EIOCBQUEUED)
9686 btrfs_unlock_extent(io_tree, start, lockend, cached_state);
9687 out_unlock_inode:
9688 if (!unlocked && ret != -EIOCBQUEUED)
9689 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9690 return ret;
9691 }
9692
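/*
 * Write already-compressed data as one extent: validate the encoding against
 * the sector size, copy the caller's data into zero-padded folios, reserve
 * data and metadata space, then either create an inline extent or allocate a
 * disk extent, create a compressed ordered extent for it and submit the write.
 */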
9693 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
9694 const struct btrfs_ioctl_encoded_io_args *encoded)
9695 {
9696 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9697 struct btrfs_root *root = inode->root;
9698 struct btrfs_fs_info *fs_info = root->fs_info;
9699 struct extent_io_tree *io_tree = &inode->io_tree;
9700 struct extent_changeset *data_reserved = NULL;
9701 struct extent_state *cached_state = NULL;
9702 struct btrfs_ordered_extent *ordered;
9703 struct btrfs_file_extent file_extent;
9704 int compression;
9705 size_t orig_count;
9706 u64 start, end;
9707 u64 num_bytes, ram_bytes, disk_num_bytes;
9708 unsigned long nr_folios, i;
9709 struct folio **folios;
9710 struct btrfs_key ins;
9711 bool extent_reserved = false;
9712 struct extent_map *em;
9713 ssize_t ret;
9714
9715 switch (encoded->compression) {
9716 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
9717 compression = BTRFS_COMPRESS_ZLIB;
9718 break;
9719 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
9720 compression = BTRFS_COMPRESS_ZSTD;
9721 break;
9722 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
9723 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
9724 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
9725 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
9726 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
9727 /* The sector size must match for LZO. */
9728 if (encoded->compression -
9729 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
9730 fs_info->sectorsize_bits)
9731 return -EINVAL;
9732 compression = BTRFS_COMPRESS_LZO;
9733 break;
9734 default:
9735 return -EINVAL;
9736 }
9737 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
9738 return -EINVAL;
9739
9740 /*
9741 * Compressed extents should always have checksums, so error out if we
9742 * have a NOCOW file or inode was created while mounted with NODATASUM.
9743 */
9744 if (inode->flags & BTRFS_INODE_NODATASUM)
9745 return -EINVAL;
9746
9747 orig_count = iov_iter_count(from);
9748
9749 /* The extent size must be sane. */
9750 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
9751 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
9752 return -EINVAL;
9753
9754 /*
9755 * The compressed data must be smaller than the decompressed data.
9756 *
9757 * It's of course possible for data to compress to larger or the same
9758 * size, but the buffered I/O path falls back to no compression for such
9759 * data, and we don't want to break any assumptions by creating these
9760 * extents.
9761 *
9762 * Note that this is less strict than the current check we have that the
9763 * compressed data must be at least one sector smaller than the
9764 * decompressed data. We only want to enforce the weaker requirement
9765 * from old kernels that it is at least one byte smaller.
9766 */
9767 if (orig_count >= encoded->unencoded_len)
9768 return -EINVAL;
9769
9770 /* The extent must start on a sector boundary. */
9771 start = iocb->ki_pos;
9772 if (!IS_ALIGNED(start, fs_info->sectorsize))
9773 return -EINVAL;
9774
9775 /*
9776 * The extent must end on a sector boundary. However, we allow a write
9777 * which ends at or extends i_size to have an unaligned length; we round
9778 * up the extent size and set i_size to the unaligned end.
9779 */
9780 if (start + encoded->len < inode->vfs_inode.i_size &&
9781 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
9782 return -EINVAL;
9783
9784 /* Finally, the offset in the unencoded data must be sector-aligned. */
9785 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
9786 return -EINVAL;
9787
9788 num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
9789 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
9790 end = start + num_bytes - 1;
9791
9792 /*
9793 * If the extent cannot be inline, the compressed data on disk must be
9794 * sector-aligned. For convenience, we extend it with zeroes if it
9795 * isn't.
9796 */
9797 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
9798 nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
9799 folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
9800 if (!folios)
9801 return -ENOMEM;
9802 for (i = 0; i < nr_folios; i++) {
9803 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
9804 char *kaddr;
9805
9806 folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
9807 if (!folios[i]) {
9808 ret = -ENOMEM;
9809 goto out_folios;
9810 }
9811 kaddr = kmap_local_folio(folios[i], 0);
9812 if (copy_from_iter(kaddr, bytes, from) != bytes) {
9813 kunmap_local(kaddr);
9814 ret = -EFAULT;
9815 goto out_folios;
9816 }
9817 if (bytes < PAGE_SIZE)
9818 memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
9819 kunmap_local(kaddr);
9820 }
9821
9822 for (;;) {
9823 struct btrfs_ordered_extent *ordered;
9824
9825 ret = btrfs_wait_ordered_range(inode, start, num_bytes);
9826 if (ret)
9827 goto out_folios;
9828 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
9829 start >> PAGE_SHIFT,
9830 end >> PAGE_SHIFT);
9831 if (ret)
9832 goto out_folios;
9833 btrfs_lock_extent(io_tree, start, end, &cached_state);
9834 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
9835 if (!ordered &&
9836 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
9837 break;
9838 if (ordered)
9839 btrfs_put_ordered_extent(ordered);
9840 btrfs_unlock_extent(io_tree, start, end, &cached_state);
9841 cond_resched();
9842 }
9843
9844 /*
9845 * We don't use the higher-level delalloc space functions because our
9846 * num_bytes and disk_num_bytes are different.
9847 */
9848 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
9849 if (ret)
9850 goto out_unlock;
9851 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
9852 if (ret)
9853 goto out_free_data_space;
9854 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
9855 false);
9856 if (ret)
9857 goto out_qgroup_free_data;
9858
9859 /* Try an inline extent first. */
9860 if (encoded->unencoded_len == encoded->len &&
9861 encoded->unencoded_offset == 0 &&
9862 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
9863 ret = __cow_file_range_inline(inode, encoded->len,
9864 orig_count, compression, folios[0],
9865 true);
9866 if (ret <= 0) {
9867 if (ret == 0)
9868 ret = orig_count;
9869 goto out_delalloc_release;
9870 }
9871 }
9872
9873 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
9874 disk_num_bytes, 0, 0, &ins, 1, 1);
9875 if (ret)
9876 goto out_delalloc_release;
9877 extent_reserved = true;
9878
9879 file_extent.disk_bytenr = ins.objectid;
9880 file_extent.disk_num_bytes = ins.offset;
9881 file_extent.num_bytes = num_bytes;
9882 file_extent.ram_bytes = ram_bytes;
9883 file_extent.offset = encoded->unencoded_offset;
9884 file_extent.compression = compression;
9885 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
9886 if (IS_ERR(em)) {
9887 ret = PTR_ERR(em);
9888 goto out_free_reserved;
9889 }
9890 btrfs_free_extent_map(em);
9891
9892 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
9893 (1U << BTRFS_ORDERED_ENCODED) |
9894 (1U << BTRFS_ORDERED_COMPRESSED));
9895 if (IS_ERR(ordered)) {
9896 btrfs_drop_extent_map_range(inode, start, end, false);
9897 ret = PTR_ERR(ordered);
9898 goto out_free_reserved;
9899 }
9900 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9901
9902 if (start + encoded->len > inode->vfs_inode.i_size)
9903 i_size_write(&inode->vfs_inode, start + encoded->len);
9904
9905 btrfs_unlock_extent(io_tree, start, end, &cached_state);
9906
9907 btrfs_delalloc_release_extents(inode, num_bytes);
9908
9909 btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
9910 ret = orig_count;
9911 goto out;
9912
9913 out_free_reserved:
9914 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9915 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true);
9916 out_delalloc_release:
9917 btrfs_delalloc_release_extents(inode, num_bytes);
9918 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
9919 out_qgroup_free_data:
9920 if (ret < 0)
9921 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
9922 out_free_data_space:
9923 /*
9924 * If btrfs_reserve_extent() succeeded, then we already decremented
9925 * bytes_may_use.
9926 */
9927 if (!extent_reserved)
9928 btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes);
9929 out_unlock:
9930 btrfs_unlock_extent(io_tree, start, end, &cached_state);
9931 out_folios:
9932 for (i = 0; i < nr_folios; i++) {
9933 if (folios[i])
9934 folio_put(folios[i]);
9935 }
9936 kvfree(folios);
9937 out:
9938 if (ret >= 0)
9939 iocb->ki_pos += encoded->len;
9940 return ret;
9941 }
9942
9943 #ifdef CONFIG_SWAP
9944 /*
9945 * Add an entry indicating a block group or device which is pinned by a
9946 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
9947 * negative errno on failure.
9948 */
9949 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
9950 bool is_block_group)
9951 {
9952 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9953 struct btrfs_swapfile_pin *sp, *entry;
9954 struct rb_node **p;
9955 struct rb_node *parent = NULL;
9956
9957 sp = kmalloc(sizeof(*sp), GFP_NOFS);
9958 if (!sp)
9959 return -ENOMEM;
9960 sp->ptr = ptr;
9961 sp->inode = inode;
9962 sp->is_block_group = is_block_group;
9963 sp->bg_extent_count = 1;
9964
9965 spin_lock(&fs_info->swapfile_pins_lock);
9966 p = &fs_info->swapfile_pins.rb_node;
9967 while (*p) {
9968 parent = *p;
9969 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
9970 if (sp->ptr < entry->ptr ||
9971 (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
9972 p = &(*p)->rb_left;
9973 } else if (sp->ptr > entry->ptr ||
9974 (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
9975 p = &(*p)->rb_right;
9976 } else {
9977 if (is_block_group)
9978 entry->bg_extent_count++;
9979 spin_unlock(&fs_info->swapfile_pins_lock);
9980 kfree(sp);
9981 return 1;
9982 }
9983 }
9984 rb_link_node(&sp->node, parent, p);
9985 rb_insert_color(&sp->node, &fs_info->swapfile_pins);
9986 spin_unlock(&fs_info->swapfile_pins_lock);
9987 return 0;
9988 }
9989
9990 /* Free all of the entries pinned by this swapfile. */
9991 static void btrfs_free_swapfile_pins(struct inode *inode)
9992 {
9993 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9994 struct btrfs_swapfile_pin *sp;
9995 struct rb_node *node, *next;
9996
9997 spin_lock(&fs_info->swapfile_pins_lock);
9998 node = rb_first(&fs_info->swapfile_pins);
9999 while (node) {
10000 next = rb_next(node);
10001 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
10002 if (sp->inode == inode) {
10003 rb_erase(&sp->node, &fs_info->swapfile_pins);
10004 if (sp->is_block_group) {
10005 btrfs_dec_block_group_swap_extents(sp->ptr,
10006 sp->bg_extent_count);
10007 btrfs_put_block_group(sp->ptr);
10008 }
10009 kfree(sp);
10010 }
10011 node = next;
10012 }
10013 spin_unlock(&fs_info->swapfile_pins_lock);
10014 }
10015
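/*
 * State carried while walking a swapfile's extents in btrfs_swap_activate():
 * the current physical run (block_start/block_len), the lowest/highest
 * physical pages seen, and how many pages and swap extents have been added.
 */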
10016 struct btrfs_swap_info {
10017 u64 start;
10018 u64 block_start;
10019 u64 block_len;
10020 u64 lowest_ppage;
10021 u64 highest_ppage;
10022 unsigned long nr_pages;
10023 int nr_extents;
10024 };
10025
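/*
 * Hand the physical run accumulated in @bsi to the swap code. The run is
 * trimmed to whole pages, capped at the size recorded in the swap header
 * (sis->max), and the lowest/highest physical pages are updated for the
 * reported span.
 */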
10026 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
10027 struct btrfs_swap_info *bsi)
10028 {
10029 unsigned long nr_pages;
10030 unsigned long max_pages;
10031 u64 first_ppage, first_ppage_reported, next_ppage;
10032 int ret;
10033
10034 /*
10035 * Our swapfile may have had its size extended after the swap header was
10036 * written. In that case activating the swapfile should not go beyond
10037 * the max size set in the swap header.
10038 */
10039 if (bsi->nr_pages >= sis->max)
10040 return 0;
10041
10042 max_pages = sis->max - bsi->nr_pages;
10043 first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
10044 next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
10045
10046 if (first_ppage >= next_ppage)
10047 return 0;
10048 nr_pages = next_ppage - first_ppage;
10049 nr_pages = min(nr_pages, max_pages);
10050
10051 first_ppage_reported = first_ppage;
10052 if (bsi->start == 0)
10053 first_ppage_reported++;
10054 if (bsi->lowest_ppage > first_ppage_reported)
10055 bsi->lowest_ppage = first_ppage_reported;
10056 if (bsi->highest_ppage < (next_ppage - 1))
10057 bsi->highest_ppage = next_ppage - 1;
10058
10059 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
10060 if (ret < 0)
10061 return ret;
10062 bsi->nr_extents += ret;
10063 bsi->nr_pages += nr_pages;
10064 return 0;
10065 }
10066
10067 static void btrfs_swap_deactivate(struct file *file)
10068 {
10069 struct inode *inode = file_inode(file);
10070
10071 btrfs_free_swapfile_pins(inode);
10072 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
10073 }
10074
10075 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10076 sector_t *span)
10077 {
10078 struct inode *inode = file_inode(file);
10079 struct btrfs_root *root = BTRFS_I(inode)->root;
10080 struct btrfs_fs_info *fs_info = root->fs_info;
10081 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
10082 struct extent_state *cached_state = NULL;
10083 struct btrfs_chunk_map *map = NULL;
10084 struct btrfs_device *device = NULL;
10085 struct btrfs_swap_info bsi = {
10086 .lowest_ppage = (sector_t)-1ULL,
10087 };
10088 struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
10089 struct btrfs_path *path = NULL;
10090 int ret = 0;
10091 u64 isize;
10092 u64 prev_extent_end = 0;
10093
10094 /*
10095 * Acquire the inode's mmap lock to prevent races with memory mapped
10096 * writes, as they could happen after we flush delalloc below and before
10097 * we lock the extent range further below. The inode was already locked
10098 * up in the call chain.
10099 */
10100 btrfs_assert_inode_locked(BTRFS_I(inode));
10101 down_write(&BTRFS_I(inode)->i_mmap_lock);
10102
10103 /*
10104 * If the swap file was just created, make sure delalloc is done. If the
10105 * file changes again after this, the user is doing something stupid and
10106 * we don't really care.
10107 */
10108 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
10109 if (ret)
10110 goto out_unlock_mmap;
10111
10112 /*
10113 * The inode is locked, so these flags won't change after we check them.
10114 */
10115 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
10116 btrfs_warn(fs_info, "swapfile must not be compressed");
10117 ret = -EINVAL;
10118 goto out_unlock_mmap;
10119 }
10120 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
10121 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
10122 ret = -EINVAL;
10123 goto out_unlock_mmap;
10124 }
10125 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
10126 btrfs_warn(fs_info, "swapfile must not be checksummed");
10127 ret = -EINVAL;
10128 goto out_unlock_mmap;
10129 }
10130
10131 path = btrfs_alloc_path();
10132 backref_ctx = btrfs_alloc_backref_share_check_ctx();
10133 if (!path || !backref_ctx) {
10134 ret = -ENOMEM;
10135 goto out_unlock_mmap;
10136 }
10137
10138 /*
10139 * Balance or device remove/replace/resize can move stuff around from
10140 * under us. The exclop protection makes sure they aren't running/won't
10141 * run concurrently while we are mapping the swap extents, and
10142 * fs_info->swapfile_pins prevents them from running while the swap
10143 * file is active and moving the extents. Note that this also prevents
10144 * a concurrent device add which isn't actually necessary, but it's not
10145 * really worth the trouble to allow it.
10146 */
10147 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
10148 btrfs_warn(fs_info,
10149 "cannot activate swapfile while exclusive operation is running");
10150 ret = -EBUSY;
10151 goto out_unlock_mmap;
10152 }
10153
10154 /*
10155 * Prevent snapshot creation while we are activating the swap file.
10156 * We do not want to race with snapshot creation. If snapshot creation
10157 * already started before we bumped nr_swapfiles from 0 to 1 and
10158 * completes before the first write into the swap file after it is
10159 * activated, then that write would fall back to COW.
10160 */
10161 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
10162 btrfs_exclop_finish(fs_info);
10163 btrfs_warn(fs_info,
10164 "cannot activate swapfile because snapshot creation is in progress");
10165 ret = -EINVAL;
10166 goto out_unlock_mmap;
10167 }
10168 /*
10169 * Snapshots can create extents which require COW even if NODATACOW is
10170 * set. We use this counter to prevent snapshots. We must increment it
10171 * before walking the extents because we don't want a concurrent
10172 * snapshot to run after we've already checked the extents.
10173 *
10174 * It is possible that the subvolume is marked for deletion but not
10175 * removed yet. To prevent this race, we check the root status before
10176 * activating the swapfile.
10177 */
10178 spin_lock(&root->root_item_lock);
10179 if (btrfs_root_dead(root)) {
10180 spin_unlock(&root->root_item_lock);
10181
10182 btrfs_drew_write_unlock(&root->snapshot_lock);
10183 btrfs_exclop_finish(fs_info);
10184 btrfs_warn(fs_info,
10185 "cannot activate swapfile because subvolume %llu is being deleted",
10186 btrfs_root_id(root));
10187 ret = -EPERM;
10188 goto out_unlock_mmap;
10189 }
10190 atomic_inc(&root->nr_swapfiles);
10191 spin_unlock(&root->root_item_lock);
10192
10193 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10194
10195 btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state);
10196 while (prev_extent_end < isize) {
10197 struct btrfs_key key;
10198 struct extent_buffer *leaf;
10199 struct btrfs_file_extent_item *ei;
10200 struct btrfs_block_group *bg;
10201 u64 logical_block_start;
10202 u64 physical_block_start;
10203 u64 extent_gen;
10204 u64 disk_bytenr;
10205 u64 len;
10206
10207 key.objectid = btrfs_ino(BTRFS_I(inode));
10208 key.type = BTRFS_EXTENT_DATA_KEY;
10209 key.offset = prev_extent_end;
10210
10211 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
10212 if (ret < 0)
10213 goto out;
10214
10215 /*
10216 * If the key is not found it means we have an implicit hole (NO_HOLES
10217 * is enabled).
10218 */
10219 if (ret > 0) {
10220 btrfs_warn(fs_info, "swapfile must not have holes");
10221 ret = -EINVAL;
10222 goto out;
10223 }
10224
10225 leaf = path->nodes[0];
10226 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
10227
10228 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
10229 /*
10230 * It's unlikely we'll ever actually find ourselves
10231 * here, as a file small enough to fit inline won't be
10232 * big enough to store more than the swap header, but in
10233 * case something changes in the future, let's catch it
10234 * here rather than later.
10235 */
10236 btrfs_warn(fs_info, "swapfile must not be inline");
10237 ret = -EINVAL;
10238 goto out;
10239 }
10240
10241 if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
10242 btrfs_warn(fs_info, "swapfile must not be compressed");
10243 ret = -EINVAL;
10244 goto out;
10245 }
10246
10247 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
10248 if (disk_bytenr == 0) {
10249 btrfs_warn(fs_info, "swapfile must not have holes");
10250 ret = -EINVAL;
10251 goto out;
10252 }
10253
10254 logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
10255 extent_gen = btrfs_file_extent_generation(leaf, ei);
10256 prev_extent_end = btrfs_file_extent_end(path);
10257
10258 if (prev_extent_end > isize)
10259 len = isize - key.offset;
10260 else
10261 len = btrfs_file_extent_num_bytes(leaf, ei);
10262
10263 backref_ctx->curr_leaf_bytenr = leaf->start;
10264
10265 /*
10266 * Don't need the path anymore, release to avoid deadlocks when
10267 * calling btrfs_is_data_extent_shared() because when joining a
10268 * transaction it can block waiting for the current one's commit
10269 * which in turn may be trying to lock the same leaf to flush
10270 * delayed items for example.
10271 */
10272 btrfs_release_path(path);
10273
10274 ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
10275 extent_gen, backref_ctx);
10276 if (ret < 0) {
10277 goto out;
10278 } else if (ret > 0) {
10279 btrfs_warn(fs_info,
10280 "swapfile must not be copy-on-write");
10281 ret = -EINVAL;
10282 goto out;
10283 }
10284
10285 map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10286 if (IS_ERR(map)) {
10287 ret = PTR_ERR(map);
10288 goto out;
10289 }
10290
10291 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10292 btrfs_warn(fs_info,
10293 "swapfile must have single data profile");
10294 ret = -EINVAL;
10295 goto out;
10296 }
10297
10298 if (device == NULL) {
10299 device = map->stripes[0].dev;
10300 ret = btrfs_add_swapfile_pin(inode, device, false);
10301 if (ret == 1)
10302 ret = 0;
10303 else if (ret)
10304 goto out;
10305 } else if (device != map->stripes[0].dev) {
10306 btrfs_warn(fs_info, "swapfile must be on one device");
10307 ret = -EINVAL;
10308 goto out;
10309 }
10310
10311 physical_block_start = (map->stripes[0].physical +
10312 (logical_block_start - map->start));
10313 btrfs_free_chunk_map(map);
10314 map = NULL;
10315
10316 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10317 if (!bg) {
10318 btrfs_warn(fs_info,
10319 "could not find block group containing swapfile");
10320 ret = -EINVAL;
10321 goto out;
10322 }
10323
10324 if (!btrfs_inc_block_group_swap_extents(bg)) {
10325 btrfs_warn(fs_info,
10326 "block group for swapfile at %llu is read-only%s",
10327 bg->start,
10328 atomic_read(&fs_info->scrubs_running) ?
10329 " (scrub running)" : "");
10330 btrfs_put_block_group(bg);
10331 ret = -EINVAL;
10332 goto out;
10333 }
10334
10335 ret = btrfs_add_swapfile_pin(inode, bg, true);
10336 if (ret) {
10337 btrfs_put_block_group(bg);
10338 if (ret == 1)
10339 ret = 0;
10340 else
10341 goto out;
10342 }
10343
10344 if (bsi.block_len &&
10345 bsi.block_start + bsi.block_len == physical_block_start) {
10346 bsi.block_len += len;
10347 } else {
10348 if (bsi.block_len) {
10349 ret = btrfs_add_swap_extent(sis, &bsi);
10350 if (ret)
10351 goto out;
10352 }
10353 bsi.start = key.offset;
10354 bsi.block_start = physical_block_start;
10355 bsi.block_len = len;
10356 }
10357
10358 if (fatal_signal_pending(current)) {
10359 ret = -EINTR;
10360 goto out;
10361 }
10362
10363 cond_resched();
10364 }
10365
10366 if (bsi.block_len)
10367 ret = btrfs_add_swap_extent(sis, &bsi);
10368
10369 out:
10370 if (!IS_ERR_OR_NULL(map))
10371 btrfs_free_chunk_map(map);
10372
10373 btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state);
10374
10375 if (ret)
10376 btrfs_swap_deactivate(file);
10377
10378 btrfs_drew_write_unlock(&root->snapshot_lock);
10379
10380 btrfs_exclop_finish(fs_info);
10381
10382 out_unlock_mmap:
10383 up_write(&BTRFS_I(inode)->i_mmap_lock);
10384 btrfs_free_backref_share_ctx(backref_ctx);
10385 btrfs_free_path(path);
10386 if (ret)
10387 return ret;
10388
10389 if (device)
10390 sis->bdev = device->bdev;
10391 *span = bsi.highest_ppage - bsi.lowest_ppage + 1;
10392 sis->max = bsi.nr_pages;
10393 sis->pages = bsi.nr_pages - 1;
10394 return bsi.nr_extents;
10395 }
10396 #else
10397 static void btrfs_swap_deactivate(struct file *file)
10398 {
10399 }
10400
10401 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
10402 sector_t *span)
10403 {
10404 return -EOPNOTSUPP;
10405 }
10406 #endif
10407
10408 /*
10409 * Update the number of bytes used in the VFS inode. When we replace extents in
10410 * a range (clone, dedupe, fallocate's zero range), we must update the number of
10411 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
10412 * always get a correct value.
10413 */
10414 void btrfs_update_inode_bytes(struct btrfs_inode *inode,
10415 const u64 add_bytes,
10416 const u64 del_bytes)
10417 {
10418 if (add_bytes == del_bytes)
10419 return;
10420
10421 spin_lock(&inode->lock);
10422 if (del_bytes > 0)
10423 inode_sub_bytes(&inode->vfs_inode, del_bytes);
10424 if (add_bytes > 0)
10425 inode_add_bytes(&inode->vfs_inode, add_bytes);
10426 spin_unlock(&inode->lock);
10427 }
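
/*
 * For example, a clone that replaces a 1 MiB extent with a 64 KiB extent
 * in the same file range would (with purely illustrative sizes) call:
 *
 *	btrfs_update_inode_bytes(inode, SZ_64K, SZ_1M);
 *
 * so that a concurrent stat(2) never observes an intermediate state where
 * the dropped extent has been subtracted but the new one not yet added,
 * or vice versa.
 */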
10428
10429 /*
10430 * Verify that there are no ordered extents for a given file range.
10431 *
10432 * @inode: The target inode.
10433 * @start: Start offset of the file range, should be sector size aligned.
10434 * @end: End offset (inclusive) of the file range, its value +1 should be
10435 * sector size aligned.
10436 *
10437 * This should typically be used for cases where we locked an inode's VFS lock in
10438 * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode,
10439 * we have flushed all delalloc in the range, we have waited for all ordered
10440 * extents in the range to complete and finally we have locked the file range in
10441 * the inode's io_tree.
10442 */
10443 void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
10444 {
10445 struct btrfs_root *root = inode->root;
10446 struct btrfs_ordered_extent *ordered;
10447
10448 if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
10449 return;
10450
10451 ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
10452 if (ordered) {
10453 btrfs_err(root->fs_info,
10454 "found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
10455 start, end, btrfs_ino(inode), btrfs_root_id(root),
10456 ordered->file_offset,
10457 ordered->file_offset + ordered->num_bytes - 1);
10458 btrfs_put_ordered_extent(ordered);
10459 }
10460
10461 ASSERT(ordered == NULL);
10462 }
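
/*
 * A caller is expected to have set up roughly the following before using
 * this assertion (a sketch only, error handling omitted):
 *
 *	btrfs_inode_lock(inode, 0);
 *	down_write(&inode->i_mmap_lock);
 *	... flush delalloc and wait for ordered extents in [start, end] ...
 *	btrfs_lock_extent(&inode->io_tree, start, end, &cached_state);
 *	btrfs_assert_inode_range_clean(inode, start, end);
 */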
10463
10464 /*
10465 * Find the first inode with a minimum number.
10466 *
10467 * @root: The root to search in.
10468 * @min_ino: The minimum inode number.
10469 *
10470 * Find the first inode in the @root with a number >= @min_ino and return it.
10471 * Returns NULL if no such inode is found.
10472 */
10473 struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
10474 {
10475 struct btrfs_inode *inode;
10476 unsigned long from = min_ino;
10477
10478 xa_lock(&root->inodes);
10479 while (true) {
10480 inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
10481 if (!inode)
10482 break;
10483 if (igrab(&inode->vfs_inode))
10484 break;
10485
10486 from = btrfs_ino(inode) + 1;
10487 cond_resched_lock(&root->inodes.xa_lock);
10488 }
10489 xa_unlock(&root->inodes);
10490
10491 return inode;
10492 }
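
/*
 * The returned inode holds a reference taken with igrab(), so the caller
 * must drop it with iput(). A typical iteration over all inodes of a root
 * looks roughly like this (sketch):
 *
 *	u64 min_ino = 0;
 *	struct btrfs_inode *inode;
 *
 *	while ((inode = btrfs_find_first_inode(root, min_ino))) {
 *		min_ino = btrfs_ino(inode) + 1;
 *		... work on the inode ...
 *		iput(&inode->vfs_inode);
 *	}
 */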
10493
10494 static const struct inode_operations btrfs_dir_inode_operations = {
10495 .getattr = btrfs_getattr,
10496 .lookup = btrfs_lookup,
10497 .create = btrfs_create,
10498 .unlink = btrfs_unlink,
10499 .link = btrfs_link,
10500 .mkdir = btrfs_mkdir,
10501 .rmdir = btrfs_rmdir,
10502 .rename = btrfs_rename2,
10503 .symlink = btrfs_symlink,
10504 .setattr = btrfs_setattr,
10505 .mknod = btrfs_mknod,
10506 .listxattr = btrfs_listxattr,
10507 .permission = btrfs_permission,
10508 .get_inode_acl = btrfs_get_acl,
10509 .set_acl = btrfs_set_acl,
10510 .update_time = btrfs_update_time,
10511 .tmpfile = btrfs_tmpfile,
10512 .fileattr_get = btrfs_fileattr_get,
10513 .fileattr_set = btrfs_fileattr_set,
10514 };
10515
10516 static const struct file_operations btrfs_dir_file_operations = {
10517 .llseek = btrfs_dir_llseek,
10518 .read = generic_read_dir,
10519 .iterate_shared = btrfs_real_readdir,
10520 .open = btrfs_opendir,
10521 .unlocked_ioctl = btrfs_ioctl,
10522 #ifdef CONFIG_COMPAT
10523 .compat_ioctl = btrfs_compat_ioctl,
10524 #endif
10525 .release = btrfs_release_file,
10526 .fsync = btrfs_sync_file,
10527 };
10528
10529 /*
10530 * btrfs doesn't support the bmap operation because swapfiles
10531 * use bmap to make a mapping of extents in the file. They assume
10532 * these extents won't change over the life of the file and they
10533 * use the bmap result to do IO directly to the drive.
10534 *
10535 * The btrfs bmap call would return logical addresses that aren't
10536 * suitable for IO and they will also change frequently as COW
10537 * operations happen. So, swapfile + btrfs == corruption.
10538 *
10539 * For now we're avoiding this by dropping bmap.
10540 */
10541 static const struct address_space_operations btrfs_aops = {
10542 .read_folio = btrfs_read_folio,
10543 .writepages = btrfs_writepages,
10544 .readahead = btrfs_readahead,
10545 .invalidate_folio = btrfs_invalidate_folio,
10546 .launder_folio = btrfs_launder_folio,
10547 .release_folio = btrfs_release_folio,
10548 .migrate_folio = btrfs_migrate_folio,
10549 .dirty_folio = filemap_dirty_folio,
10550 .error_remove_folio = generic_error_remove_folio,
10551 .swap_activate = btrfs_swap_activate,
10552 .swap_deactivate = btrfs_swap_deactivate,
10553 };
10554
10555 static const struct inode_operations btrfs_file_inode_operations = {
10556 .getattr = btrfs_getattr,
10557 .setattr = btrfs_setattr,
10558 .listxattr = btrfs_listxattr,
10559 .permission = btrfs_permission,
10560 .fiemap = btrfs_fiemap,
10561 .get_inode_acl = btrfs_get_acl,
10562 .set_acl = btrfs_set_acl,
10563 .update_time = btrfs_update_time,
10564 .fileattr_get = btrfs_fileattr_get,
10565 .fileattr_set = btrfs_fileattr_set,
10566 };
10567 static const struct inode_operations btrfs_special_inode_operations = {
10568 .getattr = btrfs_getattr,
10569 .setattr = btrfs_setattr,
10570 .permission = btrfs_permission,
10571 .listxattr = btrfs_listxattr,
10572 .get_inode_acl = btrfs_get_acl,
10573 .set_acl = btrfs_set_acl,
10574 .update_time = btrfs_update_time,
10575 };
10576 static const struct inode_operations btrfs_symlink_inode_operations = {
10577 .get_link = page_get_link,
10578 .getattr = btrfs_getattr,
10579 .setattr = btrfs_setattr,
10580 .permission = btrfs_permission,
10581 .listxattr = btrfs_listxattr,
10582 .update_time = btrfs_update_time,
10583 };
10584
10585 const struct dentry_operations btrfs_dentry_operations = {
10586 .d_delete = btrfs_dentry_delete,
10587 };
10588