1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <crypto/hash.h>
7 #include <linux/kernel.h>
8 #include <linux/bio.h>
9 #include <linux/blk-cgroup.h>
10 #include <linux/file.h>
11 #include <linux/fs.h>
12 #include <linux/pagemap.h>
13 #include <linux/highmem.h>
14 #include <linux/time.h>
15 #include <linux/init.h>
16 #include <linux/string.h>
17 #include <linux/backing-dev.h>
18 #include <linux/writeback.h>
19 #include <linux/compat.h>
20 #include <linux/xattr.h>
21 #include <linux/posix_acl.h>
22 #include <linux/falloc.h>
23 #include <linux/slab.h>
24 #include <linux/ratelimit.h>
25 #include <linux/btrfs.h>
26 #include <linux/blkdev.h>
27 #include <linux/posix_acl_xattr.h>
28 #include <linux/uio.h>
29 #include <linux/magic.h>
30 #include <linux/iversion.h>
31 #include <linux/swap.h>
32 #include <linux/migrate.h>
33 #include <linux/sched/mm.h>
34 #include <linux/iomap.h>
35 #include <linux/unaligned.h>
36 #include <linux/fsverity.h>
37 #include "misc.h"
38 #include "ctree.h"
39 #include "disk-io.h"
40 #include "transaction.h"
41 #include "btrfs_inode.h"
42 #include "ordered-data.h"
43 #include "xattr.h"
44 #include "tree-log.h"
45 #include "bio.h"
46 #include "compression.h"
47 #include "locking.h"
48 #include "props.h"
49 #include "qgroup.h"
50 #include "delalloc-space.h"
51 #include "block-group.h"
52 #include "space-info.h"
53 #include "zoned.h"
54 #include "subpage.h"
55 #include "inode-item.h"
56 #include "fs.h"
57 #include "accessors.h"
58 #include "extent-tree.h"
59 #include "root-tree.h"
60 #include "defrag.h"
61 #include "dir-item.h"
62 #include "file-item.h"
63 #include "uuid-tree.h"
64 #include "ioctl.h"
65 #include "file.h"
66 #include "acl.h"
67 #include "relocation.h"
68 #include "verity.h"
69 #include "super.h"
70 #include "orphan.h"
71 #include "backref.h"
72 #include "raid-stripe-tree.h"
73 #include "fiemap.h"
74
75 struct btrfs_iget_args {
76 u64 ino;
77 struct btrfs_root *root;
78 };
79
80 struct btrfs_rename_ctx {
81 /* Output field. Stores the index number of the old directory entry. */
82 u64 index;
83 };
84
85 /*
86 * Used by data_reloc_print_warning_inode() to pass needed info for filename
87 * resolution and output of error message.
88 */
89 struct data_reloc_warn {
90 struct btrfs_path path;
91 struct btrfs_fs_info *fs_info;
92 u64 extent_item_size;
93 u64 logical;
94 int mirror_num;
95 };
96
97 /*
98 * For the file_extent_tree, we want to hold the inode lock when we lookup and
99 * update the disk_i_size, but lockdep will complain because for the io_tree we
100 * hold the tree lock and then take the inode lock when setting delalloc. These two things
101 * are unrelated, so make a class for the file_extent_tree so we don't get the
102 * two locking patterns mixed up.
103 */
104 static struct lock_class_key file_extent_tree_class;
105
106 static const struct inode_operations btrfs_dir_inode_operations;
107 static const struct inode_operations btrfs_symlink_inode_operations;
108 static const struct inode_operations btrfs_special_inode_operations;
109 static const struct inode_operations btrfs_file_inode_operations;
110 static const struct address_space_operations btrfs_aops;
111 static const struct file_operations btrfs_dir_file_operations;
112
113 static struct kmem_cache *btrfs_inode_cachep;
114
115 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
116 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback);
117
118 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
119 struct folio *locked_folio, u64 start,
120 u64 end, struct writeback_control *wbc,
121 bool pages_dirty);
122
123 static int data_reloc_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
124 u64 root, void *warn_ctx)
125 {
126 struct data_reloc_warn *warn = warn_ctx;
127 struct btrfs_fs_info *fs_info = warn->fs_info;
128 struct extent_buffer *eb;
129 struct btrfs_inode_item *inode_item;
130 struct inode_fs_paths *ipath = NULL;
131 struct btrfs_root *local_root;
132 struct btrfs_key key;
133 unsigned int nofs_flag;
134 u32 nlink;
135 int ret;
136
137 local_root = btrfs_get_fs_root(fs_info, root, true);
138 if (IS_ERR(local_root)) {
139 ret = PTR_ERR(local_root);
140 goto err;
141 }
142
143 /* This makes the path point to (inum INODE_ITEM ioff). */
144 key.objectid = inum;
145 key.type = BTRFS_INODE_ITEM_KEY;
146 key.offset = 0;
147
148 ret = btrfs_search_slot(NULL, local_root, &key, &warn->path, 0, 0);
149 if (ret) {
150 btrfs_put_root(local_root);
151 btrfs_release_path(&warn->path);
152 goto err;
153 }
154
155 eb = warn->path.nodes[0];
156 inode_item = btrfs_item_ptr(eb, warn->path.slots[0], struct btrfs_inode_item);
157 nlink = btrfs_inode_nlink(eb, inode_item);
158 btrfs_release_path(&warn->path);
159
160 nofs_flag = memalloc_nofs_save();
161 ipath = init_ipath(4096, local_root, &warn->path);
162 memalloc_nofs_restore(nofs_flag);
163 if (IS_ERR(ipath)) {
164 btrfs_put_root(local_root);
165 ret = PTR_ERR(ipath);
166 ipath = NULL;
167 /*
168 * -ENOMEM, not a critical error, just output a generic error
169 * without the filename.
170 */
171 btrfs_warn(fs_info,
172 "checksum error at logical %llu mirror %u root %llu, inode %llu offset %llu",
173 warn->logical, warn->mirror_num, root, inum, offset);
174 return ret;
175 }
176 ret = paths_from_inode(inum, ipath);
177 if (ret < 0)
178 goto err;
179
180 /*
181 * We deliberately ignore the fact that ipath might have been too small to
182 * hold all of the paths here
183 */
184 for (int i = 0; i < ipath->fspath->elem_cnt; i++) {
185 btrfs_warn(fs_info,
186 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu length %u links %u (path: %s)",
187 warn->logical, warn->mirror_num, root, inum, offset,
188 fs_info->sectorsize, nlink,
189 (char *)(unsigned long)ipath->fspath->val[i]);
190 }
191
192 btrfs_put_root(local_root);
193 free_ipath(ipath);
194 return 0;
195
196 err:
197 btrfs_warn(fs_info,
198 "checksum error at logical %llu mirror %u root %llu inode %llu offset %llu, path resolving failed with ret=%d",
199 warn->logical, warn->mirror_num, root, inum, offset, ret);
200
201 free_ipath(ipath);
202 return ret;
203 }
204
205 /*
206 * Do extra user-friendly error output (e.g. lookup all the affected files).
207 *
208 * Return true if we succeeded doing the backref lookup.
209 * Return false if such lookup failed, and we have to fall back to the old error message.
210 */
211 static void print_data_reloc_error(const struct btrfs_inode *inode, u64 file_off,
212 const u8 *csum, const u8 *csum_expected,
213 int mirror_num)
214 {
215 struct btrfs_fs_info *fs_info = inode->root->fs_info;
216 struct btrfs_path path = { 0 };
217 struct btrfs_key found_key = { 0 };
218 struct extent_buffer *eb;
219 struct btrfs_extent_item *ei;
220 const u32 csum_size = fs_info->csum_size;
221 u64 logical;
222 u64 flags;
223 u32 item_size;
224 int ret;
225
226 mutex_lock(&fs_info->reloc_mutex);
227 logical = btrfs_get_reloc_bg_bytenr(fs_info);
228 mutex_unlock(&fs_info->reloc_mutex);
229
230 if (logical == U64_MAX) {
231 btrfs_warn_rl(fs_info, "has data reloc tree but no running relocation");
232 btrfs_warn_rl(fs_info,
233 "csum failed root %lld ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
234 btrfs_root_id(inode->root), btrfs_ino(inode), file_off,
235 CSUM_FMT_VALUE(csum_size, csum),
236 CSUM_FMT_VALUE(csum_size, csum_expected),
237 mirror_num);
238 return;
239 }
240
241 logical += file_off;
242 btrfs_warn_rl(fs_info,
243 "csum failed root %lld ino %llu off %llu logical %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
244 btrfs_root_id(inode->root),
245 btrfs_ino(inode), file_off, logical,
246 CSUM_FMT_VALUE(csum_size, csum),
247 CSUM_FMT_VALUE(csum_size, csum_expected),
248 mirror_num);
249
250 ret = extent_from_logical(fs_info, logical, &path, &found_key, &flags);
251 if (ret < 0) {
252 btrfs_err_rl(fs_info, "failed to lookup extent item for logical %llu: %d",
253 logical, ret);
254 return;
255 }
256 eb = path.nodes[0];
257 ei = btrfs_item_ptr(eb, path.slots[0], struct btrfs_extent_item);
258 item_size = btrfs_item_size(eb, path.slots[0]);
259 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
260 unsigned long ptr = 0;
261 u64 ref_root;
262 u8 ref_level;
263
264 while (true) {
265 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
266 item_size, &ref_root,
267 &ref_level);
268 if (ret < 0) {
269 btrfs_warn_rl(fs_info,
270 "failed to resolve tree backref for logical %llu: %d",
271 logical, ret);
272 break;
273 }
274 if (ret > 0)
275 break;
276
277 btrfs_warn_rl(fs_info,
278 "csum error at logical %llu mirror %u: metadata %s (level %d) in tree %llu",
279 logical, mirror_num,
280 (ref_level ? "node" : "leaf"),
281 ref_level, ref_root);
282 }
283 btrfs_release_path(&path);
284 } else {
285 struct btrfs_backref_walk_ctx ctx = { 0 };
286 struct data_reloc_warn reloc_warn = { 0 };
287
288 btrfs_release_path(&path);
289
290 ctx.bytenr = found_key.objectid;
291 ctx.extent_item_pos = logical - found_key.objectid;
292 ctx.fs_info = fs_info;
293
294 reloc_warn.logical = logical;
295 reloc_warn.extent_item_size = found_key.offset;
296 reloc_warn.mirror_num = mirror_num;
297 reloc_warn.fs_info = fs_info;
298
299 iterate_extent_inodes(&ctx, true,
300 data_reloc_print_warning_inode, &reloc_warn);
301 }
302 }
303
304 static void __cold btrfs_print_data_csum_error(struct btrfs_inode *inode,
305 u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
306 {
307 struct btrfs_root *root = inode->root;
308 const u32 csum_size = root->fs_info->csum_size;
309
310 /* For data reloc tree, it's better to do a backref lookup instead. */
311 if (btrfs_root_id(root) == BTRFS_DATA_RELOC_TREE_OBJECTID)
312 return print_data_reloc_error(inode, logical_start, csum,
313 csum_expected, mirror_num);
314
315 /* Output without objectid, which is more meaningful */
316 if (btrfs_root_id(root) >= BTRFS_LAST_FREE_OBJECTID) {
317 btrfs_warn_rl(root->fs_info,
318 "csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
319 btrfs_root_id(root), btrfs_ino(inode),
320 logical_start,
321 CSUM_FMT_VALUE(csum_size, csum),
322 CSUM_FMT_VALUE(csum_size, csum_expected),
323 mirror_num);
324 } else {
325 btrfs_warn_rl(root->fs_info,
326 "csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
327 btrfs_root_id(root), btrfs_ino(inode),
328 logical_start,
329 CSUM_FMT_VALUE(csum_size, csum),
330 CSUM_FMT_VALUE(csum_size, csum_expected),
331 mirror_num);
332 }
333 }
334
335 /*
336 * Lock inode i_rwsem based on arguments passed.
337 *
338 * ilock_flags can have the following bits set:
339 *
340 * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
341 * BTRFS_ILOCK_TRY - try to acquire the lock, if it fails on the first attempt
342 * return -EAGAIN
343 * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
344 */
345 int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags)
346 {
347 if (ilock_flags & BTRFS_ILOCK_SHARED) {
348 if (ilock_flags & BTRFS_ILOCK_TRY) {
349 if (!inode_trylock_shared(&inode->vfs_inode))
350 return -EAGAIN;
351 else
352 return 0;
353 }
354 inode_lock_shared(&inode->vfs_inode);
355 } else {
356 if (ilock_flags & BTRFS_ILOCK_TRY) {
357 if (!inode_trylock(&inode->vfs_inode))
358 return -EAGAIN;
359 else
360 return 0;
361 }
362 inode_lock(&inode->vfs_inode);
363 }
364 if (ilock_flags & BTRFS_ILOCK_MMAP)
365 down_write(&inode->i_mmap_lock);
366 return 0;
367 }
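/*
 * Illustrative usage sketch (added for clarity, not part of the original
 * source): a caller that needs an exclusive lock which also excludes page
 * faults pairs the same flags on lock and unlock:
 *
 *	btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
 *	... modify the file ...
 *	btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
 *
 * With BTRFS_ILOCK_TRY the caller must also be prepared to handle -EAGAIN.
 */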
368
369 /*
370 * Unlock inode i_rwsem.
371 *
372 * ilock_flags should contain the same bits set as passed to btrfs_inode_lock()
373 * to decide whether the lock acquired is shared or exclusive.
374 */
375 void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags)
376 {
377 if (ilock_flags & BTRFS_ILOCK_MMAP)
378 up_write(&inode->i_mmap_lock);
379 if (ilock_flags & BTRFS_ILOCK_SHARED)
380 inode_unlock_shared(&inode->vfs_inode);
381 else
382 inode_unlock(&inode->vfs_inode);
383 }
384
385 /*
386 * Cleanup all submitted ordered extents in specified range to handle errors
387 * from the btrfs_run_delalloc_range() callback.
388 *
389 * NOTE: the caller must ensure that when an error happens, it does not call
390 * extent_clear_unlock_delalloc() to clear both the bits EXTENT_DO_ACCOUNTING
391 * and EXTENT_DELALLOC simultaneously, because that causes the reserved metadata
392 * to be released, which we want to happen only when finishing the ordered
393 * extent (btrfs_finish_ordered_io()).
394 */
395 static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
396 u64 offset, u64 bytes)
397 {
398 unsigned long index = offset >> PAGE_SHIFT;
399 unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
400 struct folio *folio;
401
402 while (index <= end_index) {
403 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
404 index++;
405 if (IS_ERR(folio))
406 continue;
407
408 /*
409 * Here we just clear all Ordered bits for every page in the
410 * range, then btrfs_mark_ordered_io_finished() will handle
411 * the ordered extent accounting for the range.
412 */
413 btrfs_folio_clamp_clear_ordered(inode->root->fs_info, folio,
414 offset, bytes);
415 folio_put(folio);
416 }
417
418 return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
419 }
420
421 static int btrfs_dirty_inode(struct btrfs_inode *inode);
422
423 static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
424 struct btrfs_new_inode_args *args)
425 {
426 int err;
427
428 if (args->default_acl) {
429 err = __btrfs_set_acl(trans, args->inode, args->default_acl,
430 ACL_TYPE_DEFAULT);
431 if (err)
432 return err;
433 }
434 if (args->acl) {
435 err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS);
436 if (err)
437 return err;
438 }
439 if (!args->default_acl && !args->acl)
440 cache_no_acl(args->inode);
441 return btrfs_xattr_security_init(trans, args->inode, args->dir,
442 &args->dentry->d_name);
443 }
444
445 /*
446 * This does all the hard work for inserting an inline extent into
447 * the btree. The caller should have done a btrfs_drop_extents() so that
448 * no overlapping inline items exist in the btree.
449 */
450 static int insert_inline_extent(struct btrfs_trans_handle *trans,
451 struct btrfs_path *path,
452 struct btrfs_inode *inode, bool extent_inserted,
453 size_t size, size_t compressed_size,
454 int compress_type,
455 struct folio *compressed_folio,
456 bool update_i_size)
457 {
458 struct btrfs_root *root = inode->root;
459 struct extent_buffer *leaf;
460 const u32 sectorsize = trans->fs_info->sectorsize;
461 char *kaddr;
462 unsigned long ptr;
463 struct btrfs_file_extent_item *ei;
464 int ret;
465 size_t cur_size = size;
466 u64 i_size;
467
468 /*
469 * The decompressed size must still be no larger than a sector. Under
470 * heavy race, we can have size == 0 passed in, but that shouldn't be a
471 * big deal and we can continue the insertion.
472 */
473 ASSERT(size <= sectorsize);
474
475 /*
476 * The compressed size also needs to be no larger than a sector.
477 * That's also why we only need one page as the parameter.
478 */
479 if (compressed_folio)
480 ASSERT(compressed_size <= sectorsize);
481 else
482 ASSERT(compressed_size == 0);
483
484 if (compressed_size && compressed_folio)
485 cur_size = compressed_size;
486
487 if (!extent_inserted) {
488 struct btrfs_key key;
489 size_t datasize;
490
491 key.objectid = btrfs_ino(inode);
492 key.type = BTRFS_EXTENT_DATA_KEY;
493 key.offset = 0;
494
495 datasize = btrfs_file_extent_calc_inline_size(cur_size);
496 ret = btrfs_insert_empty_item(trans, root, path, &key,
497 datasize);
498 if (ret)
499 goto fail;
500 }
501 leaf = path->nodes[0];
502 ei = btrfs_item_ptr(leaf, path->slots[0],
503 struct btrfs_file_extent_item);
504 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
505 btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE);
506 btrfs_set_file_extent_encryption(leaf, ei, 0);
507 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
508 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
509 ptr = btrfs_file_extent_inline_start(ei);
510
511 if (compress_type != BTRFS_COMPRESS_NONE) {
512 kaddr = kmap_local_folio(compressed_folio, 0);
513 write_extent_buffer(leaf, kaddr, ptr, compressed_size);
514 kunmap_local(kaddr);
515
516 btrfs_set_file_extent_compression(leaf, ei,
517 compress_type);
518 } else {
519 struct folio *folio;
520
521 folio = filemap_get_folio(inode->vfs_inode.i_mapping, 0);
522 ASSERT(!IS_ERR(folio));
523 btrfs_set_file_extent_compression(leaf, ei, 0);
524 kaddr = kmap_local_folio(folio, 0);
525 write_extent_buffer(leaf, kaddr, ptr, size);
526 kunmap_local(kaddr);
527 folio_put(folio);
528 }
529 btrfs_release_path(path);
530
531 /*
532 * We align size to sectorsize for inline extents just for simplicity's
533 * sake.
534 */
535 ret = btrfs_inode_set_file_extent_range(inode, 0,
536 ALIGN(size, root->fs_info->sectorsize));
537 if (ret)
538 goto fail;
539
540 /*
541 * We're an inline extent, so nobody can extend the file past i_size
542 * without locking a page we already have locked.
543 *
544 * We must do any i_size and inode updates before we unlock the pages.
545 * Otherwise we could end up racing with unlink.
546 */
547 i_size = i_size_read(&inode->vfs_inode);
548 if (update_i_size && size > i_size) {
549 i_size_write(&inode->vfs_inode, size);
550 i_size = size;
551 }
552 inode->disk_i_size = i_size;
553
554 fail:
555 return ret;
556 }
557
558 static bool can_cow_file_range_inline(struct btrfs_inode *inode,
559 u64 offset, u64 size,
560 size_t compressed_size)
561 {
562 struct btrfs_fs_info *fs_info = inode->root->fs_info;
563 u64 data_len = (compressed_size ?: size);
564
565 /* Inline extents must start at offset 0. */
566 if (offset != 0)
567 return false;
568
569 /* Inline extents are limited to sectorsize. */
570 if (size > fs_info->sectorsize)
571 return false;
572
573 /* We do not allow a non-compressed extent to be as large as block size. */
574 if (data_len >= fs_info->sectorsize)
575 return false;
576
577 /* We cannot exceed the maximum inline data size. */
578 if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info))
579 return false;
580
581 /* We cannot exceed the user specified max_inline size. */
582 if (data_len > fs_info->max_inline)
583 return false;
584
585 /* Inline extents must be the entirety of the file. */
586 if (size < i_size_read(&inode->vfs_inode))
587 return false;
588
589 return true;
590 }
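/*
 * Worked example for the checks above (assumed values, not from the
 * original source): with a 4K sectorsize and the default max_inline of
 * 2048, an uncompressed 1000 byte write at offset 0 of an empty file can
 * be inlined, while the same data at offset 4096, or a full 4096 byte
 * uncompressed range, fails the checks and goes through the regular COW
 * path instead.
 */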
591
592 /*
593 * conditionally insert an inline extent into the file. This
594 * does the checks required to make sure the data is small enough
595 * to fit as an inline extent.
596 *
597 * If being used directly, you must have already checked we're allowed to cow
598 * the range by getting true from can_cow_file_range_inline().
599 */
600 static noinline int __cow_file_range_inline(struct btrfs_inode *inode,
601 u64 size, size_t compressed_size,
602 int compress_type,
603 struct folio *compressed_folio,
604 bool update_i_size)
605 {
606 struct btrfs_drop_extents_args drop_args = { 0 };
607 struct btrfs_root *root = inode->root;
608 struct btrfs_fs_info *fs_info = root->fs_info;
609 struct btrfs_trans_handle *trans;
610 u64 data_len = (compressed_size ?: size);
611 int ret;
612 struct btrfs_path *path;
613
614 path = btrfs_alloc_path();
615 if (!path)
616 return -ENOMEM;
617
618 trans = btrfs_join_transaction(root);
619 if (IS_ERR(trans)) {
620 btrfs_free_path(path);
621 return PTR_ERR(trans);
622 }
623 trans->block_rsv = &inode->block_rsv;
624
625 drop_args.path = path;
626 drop_args.start = 0;
627 drop_args.end = fs_info->sectorsize;
628 drop_args.drop_cache = true;
629 drop_args.replace_extent = true;
630 drop_args.extent_item_size = btrfs_file_extent_calc_inline_size(data_len);
631 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
632 if (ret) {
633 btrfs_abort_transaction(trans, ret);
634 goto out;
635 }
636
637 ret = insert_inline_extent(trans, path, inode, drop_args.extent_inserted,
638 size, compressed_size, compress_type,
639 compressed_folio, update_i_size);
640 if (ret && ret != -ENOSPC) {
641 btrfs_abort_transaction(trans, ret);
642 goto out;
643 } else if (ret == -ENOSPC) {
644 ret = 1;
645 goto out;
646 }
647
648 btrfs_update_inode_bytes(inode, size, drop_args.bytes_found);
649 ret = btrfs_update_inode(trans, inode);
650 if (ret && ret != -ENOSPC) {
651 btrfs_abort_transaction(trans, ret);
652 goto out;
653 } else if (ret == -ENOSPC) {
654 ret = 1;
655 goto out;
656 }
657
658 btrfs_set_inode_full_sync(inode);
659 out:
660 /*
661 * Don't forget to free the reserved space; an inlined extent
662 * won't count as a data extent, so free the space directly here.
663 * And at reserve time, it's always aligned to page size, so
664 * just free one page here.
665 */
666 btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL);
667 btrfs_free_path(path);
668 btrfs_end_transaction(trans);
669 return ret;
670 }
671
672 static noinline int cow_file_range_inline(struct btrfs_inode *inode,
673 struct folio *locked_folio,
674 u64 offset, u64 end,
675 size_t compressed_size,
676 int compress_type,
677 struct folio *compressed_folio,
678 bool update_i_size)
679 {
680 struct extent_state *cached = NULL;
681 unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
682 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED;
683 u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1);
684 int ret;
685
686 if (!can_cow_file_range_inline(inode, offset, size, compressed_size))
687 return 1;
688
689 lock_extent(&inode->io_tree, offset, end, &cached);
690 ret = __cow_file_range_inline(inode, size, compressed_size,
691 compress_type, compressed_folio,
692 update_i_size);
693 if (ret > 0) {
694 unlock_extent(&inode->io_tree, offset, end, &cached);
695 return ret;
696 }
697
698 /*
699 * In the successful case (ret == 0 here), cow_file_range will return 1.
700 *
701 * Quite a bit further up the callstack in extent_writepage(), ret == 1
702 * is treated as a short circuited success and does not unlock the folio,
703 * so we must do it here.
704 *
705 * In the failure case, the locked_folio does get unlocked by
706 * btrfs_folio_end_all_writers, which asserts that it is still locked
707 * at that point, so we must *not* unlock it here.
708 *
709 * The other two callsites in compress_file_range do not have a
710 * locked_folio, so they are not relevant to this logic.
711 */
712 if (ret == 0)
713 locked_folio = NULL;
714
715 extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached,
716 clear_flags, PAGE_UNLOCK |
717 PAGE_START_WRITEBACK | PAGE_END_WRITEBACK);
718 return ret;
719 }
720
721 struct async_extent {
722 u64 start;
723 u64 ram_size;
724 u64 compressed_size;
725 struct folio **folios;
726 unsigned long nr_folios;
727 int compress_type;
728 struct list_head list;
729 };
730
731 struct async_chunk {
732 struct btrfs_inode *inode;
733 struct folio *locked_folio;
734 u64 start;
735 u64 end;
736 blk_opf_t write_flags;
737 struct list_head extents;
738 struct cgroup_subsys_state *blkcg_css;
739 struct btrfs_work work;
740 struct async_cow *async_cow;
741 };
742
743 struct async_cow {
744 atomic_t num_chunks;
745 struct async_chunk chunks[];
746 };
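/*
 * Illustrative note (added for clarity): run_delalloc_compressed() below
 * carves the delalloc range into 512K chunks, so e.g. a 1M range yields a
 * single async_cow with num_chunks == 2 and two async_chunk entries. The
 * last chunk to finish drops the final num_chunks reference and frees the
 * whole async_cow allocation.
 */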
747
748 static noinline int add_async_extent(struct async_chunk *cow,
749 u64 start, u64 ram_size,
750 u64 compressed_size,
751 struct folio **folios,
752 unsigned long nr_folios,
753 int compress_type)
754 {
755 struct async_extent *async_extent;
756
757 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
758 if (!async_extent)
759 return -ENOMEM;
760 async_extent->start = start;
761 async_extent->ram_size = ram_size;
762 async_extent->compressed_size = compressed_size;
763 async_extent->folios = folios;
764 async_extent->nr_folios = nr_folios;
765 async_extent->compress_type = compress_type;
766 list_add_tail(&async_extent->list, &cow->extents);
767 return 0;
768 }
769
770 /*
771 * Check if the inode needs to be submitted to compression, based on mount
772 * options, defragmentation, properties or heuristics.
773 */
774 static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
775 u64 end)
776 {
777 struct btrfs_fs_info *fs_info = inode->root->fs_info;
778
779 if (!btrfs_inode_can_compress(inode)) {
780 WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
781 KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
782 btrfs_ino(inode));
783 return 0;
784 }
785 /*
786 * Only enable sector perfect compression for experimental builds.
787 *
788 * This is a big feature change for subpage cases, and can hit
789 * different corner cases, so limit this feature to
790 * experimental builds for now.
791 *
792 * ETA for moving this out of experimental builds is 6.15.
793 */
794 if (fs_info->sectorsize < PAGE_SIZE &&
795 !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
796 if (!PAGE_ALIGNED(start) ||
797 !PAGE_ALIGNED(end + 1))
798 return 0;
799 }
800
801 /* force compress */
802 if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
803 return 1;
804 /* defrag ioctl */
805 if (inode->defrag_compress)
806 return 1;
807 /* bad compression ratios */
808 if (inode->flags & BTRFS_INODE_NOCOMPRESS)
809 return 0;
810 if (btrfs_test_opt(fs_info, COMPRESS) ||
811 inode->flags & BTRFS_INODE_COMPRESS ||
812 inode->prop_compress)
813 return btrfs_compress_heuristic(inode, start, end);
814 return 0;
815 }
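/*
 * Illustrative example (added for clarity): with "mount -o compress=zstd"
 * and no per-inode flags, a range is only compressed when
 * btrfs_compress_heuristic() predicts a win; with compress-force
 * (FORCE_COMPRESS) compression is attempted unconditionally, and an inode
 * flagged BTRFS_INODE_NOCOMPRESS after bad ratios is otherwise skipped.
 */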
816
817 static inline void inode_should_defrag(struct btrfs_inode *inode,
818 u64 start, u64 end, u64 num_bytes, u32 small_write)
819 {
820 /* If this is a small write inside eof, kick off a defrag */
821 if (num_bytes < small_write &&
822 (start > 0 || end + 1 < inode->disk_i_size))
823 btrfs_add_inode_defrag(inode, small_write);
824 }
825
826 static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end)
827 {
828 unsigned long end_index = end >> PAGE_SHIFT;
829 struct folio *folio;
830 int ret = 0;
831
832 for (unsigned long index = start >> PAGE_SHIFT;
833 index <= end_index; index++) {
834 folio = filemap_get_folio(inode->vfs_inode.i_mapping, index);
835 if (IS_ERR(folio)) {
836 if (!ret)
837 ret = PTR_ERR(folio);
838 continue;
839 }
840 btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start,
841 end + 1 - start);
842 folio_put(folio);
843 }
844 return ret;
845 }
846
847 /*
848 * Work queue callback to start compression on a file and its pages.
849 *
850 * This is done inside an ordered work queue, and the compression is spread
851 * across many cpus. The actual IO submission is step two, and the ordered work
852 * queue takes care of making sure that happens in the same order things were
853 * put onto the queue by writepages and friends.
854 *
855 * If this code finds it can't get good compression, it puts an entry onto the
856 * work queue to write the uncompressed bytes. This makes sure that both
857 * compressed inodes and uncompressed inodes are written in the same order that
858 * the flusher thread sent them down.
859 */
860 static void compress_file_range(struct btrfs_work *work)
861 {
862 struct async_chunk *async_chunk =
863 container_of(work, struct async_chunk, work);
864 struct btrfs_inode *inode = async_chunk->inode;
865 struct btrfs_fs_info *fs_info = inode->root->fs_info;
866 struct address_space *mapping = inode->vfs_inode.i_mapping;
867 u64 blocksize = fs_info->sectorsize;
868 u64 start = async_chunk->start;
869 u64 end = async_chunk->end;
870 u64 actual_end;
871 u64 i_size;
872 int ret = 0;
873 struct folio **folios;
874 unsigned long nr_folios;
875 unsigned long total_compressed = 0;
876 unsigned long total_in = 0;
877 unsigned int poff;
878 int i;
879 int compress_type = fs_info->compress_type;
880 int compress_level = fs_info->compress_level;
881
882 inode_should_defrag(inode, start, end, end - start + 1, SZ_16K);
883
884 /*
885 * We need to call clear_page_dirty_for_io on each page in the range.
886 * Otherwise applications with the file mmap'd can wander in and change
887 * the page contents while we are compressing them.
888 */
889 ret = extent_range_clear_dirty_for_io(inode, start, end);
890
891 /*
892 * All the folios should have been locked thus no failure.
893 *
894 * And even if some folios are missing, btrfs_compress_folios()
895 * would handle them correctly, so here just do an ASSERT() check for
896 * early logic errors.
897 */
898 ASSERT(ret == 0);
899
900 /*
901 * We need to save i_size before now because it could change in between
902 * us evaluating the size and assigning it. This is because we lock and
903 * unlock the page in truncate and fallocate, and then modify the i_size
904 * later on.
905 *
906 * The barriers are to emulate READ_ONCE, remove that once i_size_read
907 * does that for us.
908 */
909 barrier();
910 i_size = i_size_read(&inode->vfs_inode);
911 barrier();
912 actual_end = min_t(u64, i_size, end + 1);
913 again:
914 folios = NULL;
915 nr_folios = (end >> PAGE_SHIFT) - (start >> PAGE_SHIFT) + 1;
916 nr_folios = min_t(unsigned long, nr_folios, BTRFS_MAX_COMPRESSED_PAGES);
917
918 /*
919 * we don't want to send crud past the end of i_size through
920 * compression, that's just a waste of CPU time. So, if the
921 * end of the file is before the start of our current
922 * requested range of bytes, we bail out to the uncompressed
923 * cleanup code that can deal with all of this.
924 *
925 * It isn't really the fastest way to fix things, but this is a
926 * very uncommon corner.
927 */
928 if (actual_end <= start)
929 goto cleanup_and_bail_uncompressed;
930
931 total_compressed = actual_end - start;
932
933 /*
934 * Skip compression for a small file range (<= blocksize) that
935 * isn't an inline extent, since it doesn't save disk space at all.
936 */
937 if (total_compressed <= blocksize &&
938 (start > 0 || end + 1 < inode->disk_i_size))
939 goto cleanup_and_bail_uncompressed;
940
941 total_compressed = min_t(unsigned long, total_compressed,
942 BTRFS_MAX_UNCOMPRESSED);
943 total_in = 0;
944 ret = 0;
945
946 /*
947 * We do compression for mount -o compress and when the inode has not
948 * been flagged as NOCOMPRESS. This flag can change at any time if we
949 * discover bad compression ratios.
950 */
951 if (!inode_need_compress(inode, start, end))
952 goto cleanup_and_bail_uncompressed;
953
954 folios = kcalloc(nr_folios, sizeof(struct folio *), GFP_NOFS);
955 if (!folios) {
956 /*
957 * Memory allocation failure is not a fatal error, we can fall
958 * back to uncompressed code.
959 */
960 goto cleanup_and_bail_uncompressed;
961 }
962
963 if (inode->defrag_compress) {
964 compress_type = inode->defrag_compress;
965 compress_level = inode->defrag_compress_level;
966 } else if (inode->prop_compress) {
967 compress_type = inode->prop_compress;
968 }
969
970 /* Compression level is applied here. */
971 ret = btrfs_compress_folios(compress_type, compress_level,
972 mapping, start, folios, &nr_folios, &total_in,
973 &total_compressed);
974 if (ret)
975 goto mark_incompressible;
976
977 /*
978 * Zero the tail end of the last page, as we might be sending it down
979 * to disk.
980 */
981 poff = offset_in_page(total_compressed);
982 if (poff)
983 folio_zero_range(folios[nr_folios - 1], poff, PAGE_SIZE - poff);
984
985 /*
986 * Try to create an inline extent.
987 *
988 * If we didn't compress the entire range, try to create an uncompressed
989 * inline extent, else a compressed one.
990 *
991 * Check cow_file_range() for why we don't even try to create inline
992 * extent for the subpage case.
993 */
994 if (total_in < actual_end)
995 ret = cow_file_range_inline(inode, NULL, start, end, 0,
996 BTRFS_COMPRESS_NONE, NULL, false);
997 else
998 ret = cow_file_range_inline(inode, NULL, start, end, total_compressed,
999 compress_type, folios[0], false);
1000 if (ret <= 0) {
1001 if (ret < 0)
1002 mapping_set_error(mapping, -EIO);
1003 goto free_pages;
1004 }
1005
1006 /*
1007 * We aren't doing an inline extent. Round the compressed size up to a
1008 * block size boundary so the allocator does sane things.
1009 */
1010 total_compressed = ALIGN(total_compressed, blocksize);
1011
1012 /*
1013 * One last check to make sure the compression is really a win: compare
1014 * the amount of data read with the blocks needed on disk; compression must
1015 * free at least one sector.
1016 */
1017 total_in = round_up(total_in, fs_info->sectorsize);
1018 if (total_compressed + blocksize > total_in)
1019 goto mark_incompressible;
1020
1021 /*
1022 * The async work queues will take care of doing actual allocation on
1023 * disk for these compressed pages, and will submit the bios.
1024 */
1025 ret = add_async_extent(async_chunk, start, total_in, total_compressed, folios,
1026 nr_folios, compress_type);
1027 BUG_ON(ret);
1028 if (start + total_in < end) {
1029 start += total_in;
1030 cond_resched();
1031 goto again;
1032 }
1033 return;
1034
1035 mark_incompressible:
1036 if (!btrfs_test_opt(fs_info, FORCE_COMPRESS) && !inode->prop_compress)
1037 inode->flags |= BTRFS_INODE_NOCOMPRESS;
1038 cleanup_and_bail_uncompressed:
1039 ret = add_async_extent(async_chunk, start, end - start + 1, 0, NULL, 0,
1040 BTRFS_COMPRESS_NONE);
1041 BUG_ON(ret);
1042 free_pages:
1043 if (folios) {
1044 for (i = 0; i < nr_folios; i++) {
1045 WARN_ON(folios[i]->mapping);
1046 btrfs_free_compr_folio(folios[i]);
1047 }
1048 kfree(folios);
1049 }
1050 }
1051
1052 static void free_async_extent_pages(struct async_extent *async_extent)
1053 {
1054 int i;
1055
1056 if (!async_extent->folios)
1057 return;
1058
1059 for (i = 0; i < async_extent->nr_folios; i++) {
1060 WARN_ON(async_extent->folios[i]->mapping);
1061 btrfs_free_compr_folio(async_extent->folios[i]);
1062 }
1063 kfree(async_extent->folios);
1064 async_extent->nr_folios = 0;
1065 async_extent->folios = NULL;
1066 }
1067
1068 static void submit_uncompressed_range(struct btrfs_inode *inode,
1069 struct async_extent *async_extent,
1070 struct folio *locked_folio)
1071 {
1072 u64 start = async_extent->start;
1073 u64 end = async_extent->start + async_extent->ram_size - 1;
1074 int ret;
1075 struct writeback_control wbc = {
1076 .sync_mode = WB_SYNC_ALL,
1077 .range_start = start,
1078 .range_end = end,
1079 .no_cgroup_owner = 1,
1080 };
1081
1082 wbc_attach_fdatawrite_inode(&wbc, &inode->vfs_inode);
1083 ret = run_delalloc_cow(inode, locked_folio, start, end,
1084 &wbc, false);
1085 wbc_detach_inode(&wbc);
1086 if (ret < 0) {
1087 if (locked_folio)
1088 btrfs_folio_end_lock(inode->root->fs_info, locked_folio,
1089 start, async_extent->ram_size);
1090 btrfs_err_rl(inode->root->fs_info,
1091 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
1092 __func__, btrfs_root_id(inode->root),
1093 btrfs_ino(inode), start, async_extent->ram_size, ret);
1094 }
1095 }
1096
1097 static void submit_one_async_extent(struct async_chunk *async_chunk,
1098 struct async_extent *async_extent,
1099 u64 *alloc_hint)
1100 {
1101 struct btrfs_inode *inode = async_chunk->inode;
1102 struct extent_io_tree *io_tree = &inode->io_tree;
1103 struct btrfs_root *root = inode->root;
1104 struct btrfs_fs_info *fs_info = root->fs_info;
1105 struct btrfs_ordered_extent *ordered;
1106 struct btrfs_file_extent file_extent;
1107 struct btrfs_key ins;
1108 struct folio *locked_folio = NULL;
1109 struct extent_state *cached = NULL;
1110 struct extent_map *em;
1111 int ret = 0;
1112 u64 start = async_extent->start;
1113 u64 end = async_extent->start + async_extent->ram_size - 1;
1114
1115 if (async_chunk->blkcg_css)
1116 kthread_associate_blkcg(async_chunk->blkcg_css);
1117
1118 /*
1119 * If async_chunk->locked_folio is in the async_extent range, we need to
1120 * handle it.
1121 */
1122 if (async_chunk->locked_folio) {
1123 u64 locked_folio_start = folio_pos(async_chunk->locked_folio);
1124 u64 locked_folio_end = locked_folio_start +
1125 folio_size(async_chunk->locked_folio) - 1;
1126
1127 if (!(start >= locked_folio_end || end <= locked_folio_start))
1128 locked_folio = async_chunk->locked_folio;
1129 }
1130
1131 if (async_extent->compress_type == BTRFS_COMPRESS_NONE) {
1132 submit_uncompressed_range(inode, async_extent, locked_folio);
1133 goto done;
1134 }
1135
1136 ret = btrfs_reserve_extent(root, async_extent->ram_size,
1137 async_extent->compressed_size,
1138 async_extent->compressed_size,
1139 0, *alloc_hint, &ins, 1, 1);
1140 if (ret) {
1141 /*
1142 * We can't reserve contiguous space for the compressed size.
1143 * Unlikely, but it's possible that we could have enough
1144 * non-contiguous space for the uncompressed size instead. So
1145 * fall back to uncompressed.
1146 */
1147 submit_uncompressed_range(inode, async_extent, locked_folio);
1148 goto done;
1149 }
1150
1151 lock_extent(io_tree, start, end, &cached);
1152
1153 /* Here we're doing allocation and writeback of the compressed pages */
1154 file_extent.disk_bytenr = ins.objectid;
1155 file_extent.disk_num_bytes = ins.offset;
1156 file_extent.ram_bytes = async_extent->ram_size;
1157 file_extent.num_bytes = async_extent->ram_size;
1158 file_extent.offset = 0;
1159 file_extent.compression = async_extent->compress_type;
1160
1161 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
1162 if (IS_ERR(em)) {
1163 ret = PTR_ERR(em);
1164 goto out_free_reserve;
1165 }
1166 free_extent_map(em);
1167
1168 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1169 1 << BTRFS_ORDERED_COMPRESSED);
1170 if (IS_ERR(ordered)) {
1171 btrfs_drop_extent_map_range(inode, start, end, false);
1172 ret = PTR_ERR(ordered);
1173 goto out_free_reserve;
1174 }
1175 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1176
1177 /* Clear dirty, set writeback and unlock the pages. */
1178 extent_clear_unlock_delalloc(inode, start, end,
1179 NULL, &cached, EXTENT_LOCKED | EXTENT_DELALLOC,
1180 PAGE_UNLOCK | PAGE_START_WRITEBACK);
1181 btrfs_submit_compressed_write(ordered,
1182 async_extent->folios, /* compressed_folios */
1183 async_extent->nr_folios,
1184 async_chunk->write_flags, true);
1185 *alloc_hint = ins.objectid + ins.offset;
1186 done:
1187 if (async_chunk->blkcg_css)
1188 kthread_associate_blkcg(NULL);
1189 kfree(async_extent);
1190 return;
1191
1192 out_free_reserve:
1193 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1194 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1195 mapping_set_error(inode->vfs_inode.i_mapping, -EIO);
1196 extent_clear_unlock_delalloc(inode, start, end,
1197 NULL, &cached,
1198 EXTENT_LOCKED | EXTENT_DELALLOC |
1199 EXTENT_DELALLOC_NEW |
1200 EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
1201 PAGE_UNLOCK | PAGE_START_WRITEBACK |
1202 PAGE_END_WRITEBACK);
1203 free_async_extent_pages(async_extent);
1204 if (async_chunk->blkcg_css)
1205 kthread_associate_blkcg(NULL);
1206 btrfs_debug(fs_info,
1207 "async extent submission failed root=%lld inode=%llu start=%llu len=%llu ret=%d",
1208 btrfs_root_id(root), btrfs_ino(inode), start,
1209 async_extent->ram_size, ret);
1210 kfree(async_extent);
1211 }
1212
1213 u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
1214 u64 num_bytes)
1215 {
1216 struct extent_map_tree *em_tree = &inode->extent_tree;
1217 struct extent_map *em;
1218 u64 alloc_hint = 0;
1219
1220 read_lock(&em_tree->lock);
1221 em = search_extent_mapping(em_tree, start, num_bytes);
1222 if (em) {
1223 /*
1224 * if block start isn't an actual block number then find the
1225 * first block in this inode and use that as a hint. If that
1226 * block is also bogus then just don't worry about it.
1227 */
1228 if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) {
1229 free_extent_map(em);
1230 em = search_extent_mapping(em_tree, 0, 0);
1231 if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE)
1232 alloc_hint = extent_map_block_start(em);
1233 if (em)
1234 free_extent_map(em);
1235 } else {
1236 alloc_hint = extent_map_block_start(em);
1237 free_extent_map(em);
1238 }
1239 }
1240 read_unlock(&em_tree->lock);
1241
1242 return alloc_hint;
1243 }
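/*
 * Illustrative example (hypothetical values, added for clarity): if the
 * extent map covering @start maps to disk bytenr 1G, that block start is
 * returned and later fed to btrfs_reserve_extent() as the allocation
 * hint, so new extents for this inode tend to be placed physically close
 * to its existing ones.
 */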
1244
1245 /*
1246 * When extent_io.c finds a delayed allocation range in the file,
1247 * the callbacks end up in this code. The basic idea is to
1248 * allocate extents on disk for the range, and create ordered data structs
1249 * in ram to track those extents.
1250 *
1251 * locked_folio is the folio that writepage had locked already. We use
1252 * it to make sure we don't do extra locks or unlocks.
1253 *
1254 * When this function fails, it unlocks all pages except @locked_folio.
1255 *
1256 * When this function successfully creates an inline extent, it returns 1 and
1257 * unlocks all pages including locked_folio and starts I/O on them.
1258 * (In reality inline extents are limited to a single page, so locked_folio is
1259 * the only page handled anyway).
1260 *
1261 * When this function succeeds and creates a normal extent, the page locking
1262 * status depends on the passed in flags:
1263 *
1264 * - If @keep_locked is set, all pages are kept locked.
1265 * - Else all pages except for @locked_folio are unlocked.
1266 *
1267 * When a failure happens in the second or later iteration of the
1268 * while-loop, the ordered extents created in previous iterations are cleaned up.
1269 */
1270 static noinline int cow_file_range(struct btrfs_inode *inode,
1271 struct folio *locked_folio, u64 start,
1272 u64 end, u64 *done_offset,
1273 bool keep_locked, bool no_inline)
1274 {
1275 struct btrfs_root *root = inode->root;
1276 struct btrfs_fs_info *fs_info = root->fs_info;
1277 struct extent_state *cached = NULL;
1278 u64 alloc_hint = 0;
1279 u64 orig_start = start;
1280 u64 num_bytes;
1281 u64 cur_alloc_size = 0;
1282 u64 min_alloc_size;
1283 u64 blocksize = fs_info->sectorsize;
1284 struct btrfs_key ins;
1285 struct extent_map *em;
1286 unsigned clear_bits;
1287 unsigned long page_ops;
1288 int ret = 0;
1289
1290 if (btrfs_is_free_space_inode(inode)) {
1291 ret = -EINVAL;
1292 goto out_unlock;
1293 }
1294
1295 num_bytes = ALIGN(end - start + 1, blocksize);
1296 num_bytes = max(blocksize, num_bytes);
1297 ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy));
1298
1299 inode_should_defrag(inode, start, end, num_bytes, SZ_64K);
1300
1301 if (!no_inline) {
1302 /* lets try to make an inline extent */
1303 ret = cow_file_range_inline(inode, locked_folio, start, end, 0,
1304 BTRFS_COMPRESS_NONE, NULL, false);
1305 if (ret <= 0) {
1306 /*
1307 * We succeeded, return 1 so the caller knows we're done
1308 * with this page and already handled the IO.
1309 *
1310 * If there was an error then cow_file_range_inline() has
1311 * already done the cleanup.
1312 */
1313 if (ret == 0)
1314 ret = 1;
1315 goto done;
1316 }
1317 }
1318
1319 alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes);
1320
1321 /*
1322 * We're not doing compressed IO, don't unlock the first page (which
1323 * the caller expects to stay locked), don't clear any dirty bits and
1324 * don't set any writeback bits.
1325 *
1326 * Do set the Ordered (Private2) bit so we know this page was properly
1327 * setup for writepage.
1328 */
1329 page_ops = (keep_locked ? 0 : PAGE_UNLOCK);
1330 page_ops |= PAGE_SET_ORDERED;
1331
1332 /*
1333 * Relocation relies on the relocated extents to have exactly the same
1334 * size as the original extents. Normally writeback for relocation data
1335 * extents follows a NOCOW path because relocation preallocates the
1336 * extents. However, due to an operation such as scrub turning a block
1337 * group to RO mode, it may fallback to COW mode, so we must make sure
1338 * an extent allocated during COW has exactly the requested size and can
1339 * not be split into smaller extents, otherwise relocation breaks and
1340 * fails during the stage where it updates the bytenr of file extent
1341 * items.
1342 */
1343 if (btrfs_is_data_reloc_root(root))
1344 min_alloc_size = num_bytes;
1345 else
1346 min_alloc_size = fs_info->sectorsize;
1347
1348 while (num_bytes > 0) {
1349 struct btrfs_ordered_extent *ordered;
1350 struct btrfs_file_extent file_extent;
1351
1352 ret = btrfs_reserve_extent(root, num_bytes, num_bytes,
1353 min_alloc_size, 0, alloc_hint,
1354 &ins, 1, 1);
1355 if (ret == -EAGAIN) {
1356 /*
1357 * btrfs_reserve_extent only returns -EAGAIN for zoned
1358 * file systems, which is an indication that there are
1359 * no active zones to allocate from at the moment.
1360 *
1361 * If this is the first loop iteration, wait for at
1362 * least one zone to finish before retrying the
1363 * allocation. Otherwise ask the caller to write out
1364 * the already allocated blocks before coming back to
1365 * us, or return -ENOSPC if it can't handle retries.
1366 */
1367 ASSERT(btrfs_is_zoned(fs_info));
1368 if (start == orig_start) {
1369 wait_on_bit_io(&inode->root->fs_info->flags,
1370 BTRFS_FS_NEED_ZONE_FINISH,
1371 TASK_UNINTERRUPTIBLE);
1372 continue;
1373 }
1374 if (done_offset) {
1375 /*
1376 * Move @end to the end of the processed range,
1377 * and exit the loop to unlock the processed extents.
1378 */
1379 end = start - 1;
1380 ret = 0;
1381 break;
1382 }
1383 ret = -ENOSPC;
1384 }
1385 if (ret < 0)
1386 goto out_unlock;
1387 cur_alloc_size = ins.offset;
1388
1389 file_extent.disk_bytenr = ins.objectid;
1390 file_extent.disk_num_bytes = ins.offset;
1391 file_extent.num_bytes = ins.offset;
1392 file_extent.ram_bytes = ins.offset;
1393 file_extent.offset = 0;
1394 file_extent.compression = BTRFS_COMPRESS_NONE;
1395
1396 /*
1397 * Locked range will be released either during error clean up or
1398 * after the whole range is finished.
1399 */
1400 lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1,
1401 &cached);
1402
1403 em = btrfs_create_io_em(inode, start, &file_extent,
1404 BTRFS_ORDERED_REGULAR);
1405 if (IS_ERR(em)) {
1406 unlock_extent(&inode->io_tree, start,
1407 start + cur_alloc_size - 1, &cached);
1408 ret = PTR_ERR(em);
1409 goto out_reserve;
1410 }
1411 free_extent_map(em);
1412
1413 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
1414 1 << BTRFS_ORDERED_REGULAR);
1415 if (IS_ERR(ordered)) {
1416 unlock_extent(&inode->io_tree, start,
1417 start + cur_alloc_size - 1, &cached);
1418 ret = PTR_ERR(ordered);
1419 goto out_drop_extent_cache;
1420 }
1421
1422 if (btrfs_is_data_reloc_root(root)) {
1423 ret = btrfs_reloc_clone_csums(ordered);
1424
1425 /*
1426 * Only drop cache here, and process as normal.
1427 *
1428 * We must not allow extent_clear_unlock_delalloc()
1429 * at the out_unlock label to free the metadata of this
1430 * ordered extent, as its metadata should be freed by
1431 * btrfs_finish_ordered_io().
1432 *
1433 * So we must continue until @start is increased to
1434 * skip the current ordered extent.
1435 */
1436 if (ret)
1437 btrfs_drop_extent_map_range(inode, start,
1438 start + cur_alloc_size - 1,
1439 false);
1440 }
1441 btrfs_put_ordered_extent(ordered);
1442
1443 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1444
1445 if (num_bytes < cur_alloc_size)
1446 num_bytes = 0;
1447 else
1448 num_bytes -= cur_alloc_size;
1449 alloc_hint = ins.objectid + ins.offset;
1450 start += cur_alloc_size;
1451 cur_alloc_size = 0;
1452
1453 /*
1454 * On a btrfs_reloc_clone_csums() error: since start has been increased,
1455 * extent_clear_unlock_delalloc() at the out_unlock label won't free the
1456 * metadata of the current ordered extent, so we're OK to exit.
1457 */
1458 if (ret)
1459 goto out_unlock;
1460 }
1461 extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached,
1462 EXTENT_LOCKED | EXTENT_DELALLOC, page_ops);
1463 done:
1464 if (done_offset)
1465 *done_offset = end;
1466 return ret;
1467
1468 out_drop_extent_cache:
1469 btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false);
1470 out_reserve:
1471 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
1472 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
1473 out_unlock:
1474 /*
1475 * Now, we have three regions to clean up:
1476 *
1477 * |-------(1)----|---(2)---|-------------(3)----------|
1478 * `- orig_start `- start `- start + cur_alloc_size `- end
1479 *
1480 * We process each region below.
1481 */
1482
1483 /*
1484 * For the range (1). We have already instantiated the ordered extents
1485 * for this region, thus we need to clean up those ordered extents.
1486 * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV
1487 * are also handled by the ordered extents cleanup.
1488 *
1489 * So here we only clear the EXTENT_LOCKED and EXTENT_DELALLOC flags, and
1490 * finish the writeback of the involved folios, which will never be submitted.
1491 */
1492 if (orig_start < start) {
1493 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC;
1494 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1495
1496 if (!locked_folio)
1497 mapping_set_error(inode->vfs_inode.i_mapping, ret);
1498
1499 btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start);
1500 extent_clear_unlock_delalloc(inode, orig_start, start - 1,
1501 locked_folio, NULL, clear_bits, page_ops);
1502 }
1503
1504 clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
1505 EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
1506 page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
1507
1508 /*
1509 * For the range (2). If we reserved an extent for our delalloc range
1510 * (or a subrange) and failed to create the respective ordered extent,
1511 * then it means that when we reserved the extent we decremented the
1512 * extent's size from the data space_info's bytes_may_use counter and
1513 * incremented the space_info's bytes_reserved counter by the same
1514 * amount. We must make sure extent_clear_unlock_delalloc() does not try
1515 * to decrement again the data space_info's bytes_may_use counter,
1516 * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
1517 */
1518 if (cur_alloc_size) {
1519 extent_clear_unlock_delalloc(inode, start,
1520 start + cur_alloc_size - 1,
1521 locked_folio, &cached, clear_bits,
1522 page_ops);
1523 btrfs_qgroup_free_data(inode, NULL, start, cur_alloc_size, NULL);
1524 }
1525
1526 /*
1527 * For the range (3). We never touched the region. In addition to the
1528 * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
1529 * space_info's bytes_may_use counter, reserved in
1530 * btrfs_check_data_free_space().
1531 */
1532 if (start + cur_alloc_size < end) {
1533 clear_bits |= EXTENT_CLEAR_DATA_RESV;
1534 extent_clear_unlock_delalloc(inode, start + cur_alloc_size,
1535 end, locked_folio,
1536 &cached, clear_bits, page_ops);
1537 btrfs_qgroup_free_data(inode, NULL, start + cur_alloc_size,
1538 end - start - cur_alloc_size + 1, NULL);
1539 }
1540 btrfs_err_rl(fs_info,
1541 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
1542 __func__, btrfs_root_id(inode->root),
1543 btrfs_ino(inode), orig_start, end + 1 - orig_start, ret);
1544 return ret;
1545 }
1546
1547 /*
1548 * Phase two of compressed writeback. This is the ordered portion of the code,
1549 * which only gets called in the order the work was queued. We walk all the
1550 * async extents created by compress_file_range and send them down to the disk.
1551 *
1552 * If called with @do_free == true then it'll try to finish the work and free
1553 * the work struct eventually.
1554 */
1555 static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_free)
1556 {
1557 struct async_chunk *async_chunk = container_of(work, struct async_chunk,
1558 work);
1559 struct btrfs_fs_info *fs_info = btrfs_work_owner(work);
1560 struct async_extent *async_extent;
1561 unsigned long nr_pages;
1562 u64 alloc_hint = 0;
1563
1564 if (do_free) {
1565 struct async_cow *async_cow;
1566
1567 btrfs_add_delayed_iput(async_chunk->inode);
1568 if (async_chunk->blkcg_css)
1569 css_put(async_chunk->blkcg_css);
1570
1571 async_cow = async_chunk->async_cow;
1572 if (atomic_dec_and_test(&async_cow->num_chunks))
1573 kvfree(async_cow);
1574 return;
1575 }
1576
1577 nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
1578 PAGE_SHIFT;
1579
1580 while (!list_empty(&async_chunk->extents)) {
1581 async_extent = list_entry(async_chunk->extents.next,
1582 struct async_extent, list);
1583 list_del(&async_extent->list);
1584 submit_one_async_extent(async_chunk, async_extent, &alloc_hint);
1585 }
1586
1587 /* atomic_sub_return implies a barrier */
1588 if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
1589 5 * SZ_1M)
1590 cond_wake_up_nomb(&fs_info->async_submit_wait);
1591 }
1592
1593 static bool run_delalloc_compressed(struct btrfs_inode *inode,
1594 struct folio *locked_folio, u64 start,
1595 u64 end, struct writeback_control *wbc)
1596 {
1597 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1598 struct cgroup_subsys_state *blkcg_css = wbc_blkcg_css(wbc);
1599 struct async_cow *ctx;
1600 struct async_chunk *async_chunk;
1601 unsigned long nr_pages;
1602 u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K);
1603 int i;
1604 unsigned nofs_flag;
1605 const blk_opf_t write_flags = wbc_to_write_flags(wbc);
1606
1607 nofs_flag = memalloc_nofs_save();
1608 ctx = kvmalloc(struct_size(ctx, chunks, num_chunks), GFP_KERNEL);
1609 memalloc_nofs_restore(nofs_flag);
1610 if (!ctx)
1611 return false;
1612
1613 set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
1614
1615 async_chunk = ctx->chunks;
1616 atomic_set(&ctx->num_chunks, num_chunks);
1617
1618 for (i = 0; i < num_chunks; i++) {
1619 u64 cur_end = min(end, start + SZ_512K - 1);
1620
1621 /*
1622 * igrab is called higher up in the call chain, take only the
1623 * lightweight reference for the callback lifetime
1624 */
1625 ihold(&inode->vfs_inode);
1626 async_chunk[i].async_cow = ctx;
1627 async_chunk[i].inode = inode;
1628 async_chunk[i].start = start;
1629 async_chunk[i].end = cur_end;
1630 async_chunk[i].write_flags = write_flags;
1631 INIT_LIST_HEAD(&async_chunk[i].extents);
1632
1633 /*
1634 * The locked_folio comes all the way from writepage and it's
1635 * the original folio we were actually given. As we spread
1636 * this large delalloc region across multiple async_chunk
1637 * structs, only the first struct needs a pointer to
1638 * locked_folio.
1639 *
1640 * This way we don't need racy decisions about who is supposed
1641 * to unlock it.
1642 */
1643 if (locked_folio) {
1644 /*
1645 * Depending on the compressibility, the pages might or
1646 * might not go through async. We want all of them to
1647 * be accounted against wbc once. Let's do it here
1648 * before the paths diverge. wbc accounting is used
1649 * only for foreign writeback detection and doesn't
1650 * need full accuracy. Just account the whole thing
1651 * against the first page.
1652 */
1653 wbc_account_cgroup_owner(wbc, locked_folio,
1654 cur_end - start);
1655 async_chunk[i].locked_folio = locked_folio;
1656 locked_folio = NULL;
1657 } else {
1658 async_chunk[i].locked_folio = NULL;
1659 }
1660
1661 if (blkcg_css != blkcg_root_css) {
1662 css_get(blkcg_css);
1663 async_chunk[i].blkcg_css = blkcg_css;
1664 async_chunk[i].write_flags |= REQ_BTRFS_CGROUP_PUNT;
1665 } else {
1666 async_chunk[i].blkcg_css = NULL;
1667 }
1668
1669 btrfs_init_work(&async_chunk[i].work, compress_file_range,
1670 submit_compressed_extents);
1671
1672 nr_pages = DIV_ROUND_UP(cur_end - start, PAGE_SIZE);
1673 atomic_add(nr_pages, &fs_info->async_delalloc_pages);
1674
1675 btrfs_queue_work(fs_info->delalloc_workers, &async_chunk[i].work);
1676
1677 start = cur_end + 1;
1678 }
1679 return true;
1680 }
1681
1682 /*
1683 * Run the delalloc range from start to end, and write back any dirty pages
1684 * covered by the range.
1685 */
1686 static noinline int run_delalloc_cow(struct btrfs_inode *inode,
1687 struct folio *locked_folio, u64 start,
1688 u64 end, struct writeback_control *wbc,
1689 bool pages_dirty)
1690 {
1691 u64 done_offset = end;
1692 int ret;
1693
1694 while (start <= end) {
1695 ret = cow_file_range(inode, locked_folio, start, end,
1696 &done_offset, true, false);
1697 if (ret)
1698 return ret;
1699 extent_write_locked_range(&inode->vfs_inode, locked_folio,
1700 start, done_offset, wbc, pages_dirty);
1701 start = done_offset + 1;
1702 }
1703
1704 return 1;
1705 }
1706
1707 static int fallback_to_cow(struct btrfs_inode *inode,
1708 struct folio *locked_folio, const u64 start,
1709 const u64 end)
1710 {
1711 const bool is_space_ino = btrfs_is_free_space_inode(inode);
1712 const bool is_reloc_ino = btrfs_is_data_reloc_root(inode->root);
1713 const u64 range_bytes = end + 1 - start;
1714 struct extent_io_tree *io_tree = &inode->io_tree;
1715 struct extent_state *cached_state = NULL;
1716 u64 range_start = start;
1717 u64 count;
1718 int ret;
1719
1720 /*
1721 * If EXTENT_NORESERVE is set it means that when the buffered write was
1722 * made we did not have enough available data space and therefore we did not
1723 * reserve data space for it, since we thought we could do NOCOW for the
1724 * respective file range (either there is prealloc extent or the inode
1725 * has the NOCOW bit set).
1726 *
1727 * However when we need to fallback to COW mode (because for example the
1728 * block group for the corresponding extent was turned to RO mode by a
1729 * scrub or relocation) we need to do the following:
1730 *
1731 * 1) We increment the bytes_may_use counter of the data space info.
1732 * If COW succeeds, it allocates a new data extent and after doing
1733 * that it decrements the space info's bytes_may_use counter and
1734 * increments its bytes_reserved counter by the same amount (we do
1735 * this at btrfs_add_reserved_bytes()). So we need to increment the
1736 * bytes_may_use counter to compensate (when space is reserved at
1737 * buffered write time, the bytes_may_use counter is incremented);
1738 *
1739 * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
1740 * that if the COW path fails for any reason, it decrements (through
1741 * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
1742 * data space info, which we incremented in the step above.
1743 *
1744 * If we need to fallback to cow and the inode corresponds to a free
1745 * space cache inode or an inode of the data relocation tree, we must
1746 * also increment bytes_may_use of the data space_info for the same
1747 * reason. Space caches and relocated data extents always get a prealloc
1748 * extent for them, however scrub or balance may have set the block
1749 * group that contains that extent to RO mode and therefore force COW
1750 * when starting writeback.
1751 */
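/*
 * Illustration with made up numbers: a 128K buffered write went the NOCOW
 * route and reserved no data space, so bytes_may_use was never incremented
 * for it.  If the target block group has since gone read-only and we must
 * COW after all, the code below adds those 128K to bytes_may_use so that
 * btrfs_add_reserved_bytes() can later move them to bytes_reserved, exactly
 * as if the space had been reserved at buffered write time.
 */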
1752 lock_extent(io_tree, start, end, &cached_state);
1753 count = count_range_bits(io_tree, &range_start, end, range_bytes,
1754 EXTENT_NORESERVE, 0, NULL);
1755 if (count > 0 || is_space_ino || is_reloc_ino) {
1756 u64 bytes = count;
1757 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1758 struct btrfs_space_info *sinfo = fs_info->data_sinfo;
1759
1760 if (is_space_ino || is_reloc_ino)
1761 bytes = range_bytes;
1762
1763 spin_lock(&sinfo->lock);
1764 btrfs_space_info_update_bytes_may_use(sinfo, bytes);
1765 spin_unlock(&sinfo->lock);
1766
1767 if (count > 0)
1768 clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
1769 NULL);
1770 }
1771 unlock_extent(io_tree, start, end, &cached_state);
1772
1773 /*
1774 * Don't try to create inline extents, as a mix of inline extent that
1775 * is written out and unlocked directly and a normal NOCOW extent
1776 * doesn't work.
1777 */
1778 ret = cow_file_range(inode, locked_folio, start, end, NULL, false,
1779 true);
1780 ASSERT(ret != 1);
1781 return ret;
1782 }
1783
1784 struct can_nocow_file_extent_args {
1785 /* Input fields. */
1786
1787 /* Start file offset of the range we want to NOCOW. */
1788 u64 start;
1789 /* End file offset (inclusive) of the range we want to NOCOW. */
1790 u64 end;
1791 bool writeback_path;
1792 /*
1793 * Free the path passed to can_nocow_file_extent() once it's not needed
1794 * anymore.
1795 */
1796 bool free_path;
1797
1798 /*
1799 * Output fields. Only set when can_nocow_file_extent() returns 1.
1800 * The expected file extent for the NOCOW write.
1801 */
1802 struct btrfs_file_extent file_extent;
1803 };
1804
1805 /*
1806 * Check if we can NOCOW the file extent that the path points to.
1807 * This function may return with the path released, so the caller should check
1808 * if path->nodes[0] is NULL or not if it needs to use the path afterwards.
1809 *
1810 * Returns: < 0 on error
1811 * 0 if we can not NOCOW
1812 * 1 if we can NOCOW
1813 */
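/*
 * In short, NOCOW is refused for: inline extents, regular extents on an
 * inode without NODATACOW, extents created before the subvolume's last
 * snapshot, holes, compressed/encrypted/encoded extents, extents with
 * other references (cross refs), roots with pending snapshots, and ranges
 * that already have checksums.
 */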
1814 static int can_nocow_file_extent(struct btrfs_path *path,
1815 struct btrfs_key *key,
1816 struct btrfs_inode *inode,
1817 struct can_nocow_file_extent_args *args)
1818 {
1819 const bool is_freespace_inode = btrfs_is_free_space_inode(inode);
1820 struct extent_buffer *leaf = path->nodes[0];
1821 struct btrfs_root *root = inode->root;
1822 struct btrfs_file_extent_item *fi;
1823 struct btrfs_root *csum_root;
1824 u64 io_start;
1825 u64 extent_end;
1826 u8 extent_type;
1827 int can_nocow = 0;
1828 int ret = 0;
1829 bool nowait = path->nowait;
1830
1831 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
1832 extent_type = btrfs_file_extent_type(leaf, fi);
1833
1834 if (extent_type == BTRFS_FILE_EXTENT_INLINE)
1835 goto out;
1836
1837 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
1838 extent_type == BTRFS_FILE_EXTENT_REG)
1839 goto out;
1840
1841 /*
1842 * If the extent was created before the generation where the last snapshot
1843 * for its subvolume was created, then this implies the extent is shared,
1844 * hence we must COW.
1845 */
1846 if (btrfs_file_extent_generation(leaf, fi) <=
1847 btrfs_root_last_snapshot(&root->root_item))
1848 goto out;
1849
1850 /* An explicit hole, must COW. */
1851 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
1852 goto out;
1853
1854 /* Compressed/encrypted/encoded extents must be COWed. */
1855 if (btrfs_file_extent_compression(leaf, fi) ||
1856 btrfs_file_extent_encryption(leaf, fi) ||
1857 btrfs_file_extent_other_encoding(leaf, fi))
1858 goto out;
1859
1860 extent_end = btrfs_file_extent_end(path);
1861
1862 args->file_extent.disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1863 args->file_extent.disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1864 args->file_extent.ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
1865 args->file_extent.offset = btrfs_file_extent_offset(leaf, fi);
1866 args->file_extent.compression = btrfs_file_extent_compression(leaf, fi);
1867
1868 /*
1869 * The following checks can be expensive, as they need to take other
1870 * locks and do btree or rbtree searches, so release the path to avoid
1871 * blocking other tasks for too long.
1872 */
1873 btrfs_release_path(path);
1874
1875 ret = btrfs_cross_ref_exist(inode, key->offset - args->file_extent.offset,
1876 args->file_extent.disk_bytenr, path);
1877 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1878 if (ret != 0)
1879 goto out;
1880
1881 if (args->free_path) {
1882 /*
1883 * We don't need the path anymore, plus through the
1884 * btrfs_lookup_csums_list() call below we will end up allocating
1885 * another path. So free the path to avoid unnecessary extra
1886 * memory usage.
1887 */
1888 btrfs_free_path(path);
1889 path = NULL;
1890 }
1891
1892 /* If there are pending snapshots for this root, we must COW. */
1893 if (args->writeback_path && !is_freespace_inode &&
1894 atomic_read(&root->snapshot_force_cow))
1895 goto out;
1896
1897 args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start;
1898 args->file_extent.offset += args->start - key->offset;
1899 io_start = args->file_extent.disk_bytenr + args->file_extent.offset;
1900
1901 /*
1902 * Force COW if csums exist in the range. This ensures that csums for a
1903 * given extent are either valid or do not exist.
1904 */
1905
1906 csum_root = btrfs_csum_root(root->fs_info, io_start);
1907 ret = btrfs_lookup_csums_list(csum_root, io_start,
1908 io_start + args->file_extent.num_bytes - 1,
1909 NULL, nowait);
1910 WARN_ON_ONCE(ret > 0 && is_freespace_inode);
1911 if (ret != 0)
1912 goto out;
1913
1914 can_nocow = 1;
1915 out:
1916 if (args->free_path && path)
1917 btrfs_free_path(path);
1918
1919 return ret < 0 ? ret : can_nocow;
1920 }
1921
1922 /*
1923 * Cleanup the dirty folios which will never be submitted due to error.
1924 *
1925 * When running a delalloc range, we may need to split the ranges (due to
1926 * fragmentation or NOCOW). If we hit an error in the later part, we will error
1927 * out and the previously processed ranges will never be submitted, thus
1928 * we have to cleanup those folios by clearing their dirty flag, starting and
1929 * finishing the writeback.
1930 */
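/*
 * Example: run_delalloc_nocow() splits [start, end] into three sub-ranges,
 * finishes the first two (folios unlocked but still dirty, waiting for bio
 * submission) and then fails on the third.  Writeback bails out with an
 * error, so without this cleanup the dirty folios of the first two
 * sub-ranges would never be written back nor cleaned.
 */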
1931 static void cleanup_dirty_folios(struct btrfs_inode *inode,
1932 struct folio *locked_folio,
1933 u64 start, u64 end, int error)
1934 {
1935 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1936 struct address_space *mapping = inode->vfs_inode.i_mapping;
1937 pgoff_t start_index = start >> PAGE_SHIFT;
1938 pgoff_t end_index = end >> PAGE_SHIFT;
1939 u32 len;
1940
1941 ASSERT(end + 1 - start < U32_MAX);
1942 ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
1943 IS_ALIGNED(end + 1, fs_info->sectorsize));
1944 len = end + 1 - start;
1945
1946 /*
1947 * Handle the locked folio first.
1948 * The btrfs_folio_clamp_*() helpers can handle ranges outside the folio.
1949 */
1950 btrfs_folio_clamp_finish_io(fs_info, locked_folio, start, len);
1951
1952 for (pgoff_t index = start_index; index <= end_index; index++) {
1953 struct folio *folio;
1954
1955 /* Already handled at the beginning. */
1956 if (index == locked_folio->index)
1957 continue;
1958 folio = __filemap_get_folio(mapping, index, FGP_LOCK, GFP_NOFS);
1959 /* Cache already dropped, no need to do any cleanup. */
1960 if (IS_ERR(folio))
1961 continue;
1962 btrfs_folio_clamp_finish_io(fs_info, folio, start, len);
1963 folio_unlock(folio);
1964 folio_put(folio);
1965 }
1966 mapping_set_error(mapping, error);
1967 }
1968
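/*
 * Run NOCOW (or preallocated) writeback for a single contiguous range:
 * lock the extent range, create the extent map for a prealloc extent,
 * allocate the NOCOW/PREALLOC ordered extent, clone csums when writing
 * for the data relocation root, then clear delalloc and unlock.
 */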
1969 static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio,
1970 struct extent_state **cached,
1971 struct can_nocow_file_extent_args *nocow_args,
1972 u64 file_pos, bool is_prealloc)
1973 {
1974 struct btrfs_ordered_extent *ordered;
1975 u64 len = nocow_args->file_extent.num_bytes;
1976 u64 end = file_pos + len - 1;
1977 int ret = 0;
1978
1979 lock_extent(&inode->io_tree, file_pos, end, cached);
1980
1981 if (is_prealloc) {
1982 struct extent_map *em;
1983
1984 em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent,
1985 BTRFS_ORDERED_PREALLOC);
1986 if (IS_ERR(em)) {
1987 unlock_extent(&inode->io_tree, file_pos, end, cached);
1988 return PTR_ERR(em);
1989 }
1990 free_extent_map(em);
1991 }
1992
1993 ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent,
1994 is_prealloc
1995 ? (1 << BTRFS_ORDERED_PREALLOC)
1996 : (1 << BTRFS_ORDERED_NOCOW));
1997 if (IS_ERR(ordered)) {
1998 if (is_prealloc)
1999 btrfs_drop_extent_map_range(inode, file_pos, end, false);
2000 unlock_extent(&inode->io_tree, file_pos, end, cached);
2001 return PTR_ERR(ordered);
2002 }
2003
2004 if (btrfs_is_data_reloc_root(inode->root))
2005 /*
2006 * Errors are handled later, as we must prevent
2007 * extent_clear_unlock_delalloc() in error handler from freeing
2008 * metadata of the created ordered extent.
2009 */
2010 ret = btrfs_reloc_clone_csums(ordered);
2011 btrfs_put_ordered_extent(ordered);
2012
2013 extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached,
2014 EXTENT_LOCKED | EXTENT_DELALLOC |
2015 EXTENT_CLEAR_DATA_RESV,
2016 PAGE_UNLOCK | PAGE_SET_ORDERED);
2017 /*
2018 * On error, we need to cleanup the ordered extents we created.
2019 *
2020 * We do not clear the folio Dirty flags because they are set and
2021 * cleared by the caller.
2022 */
2023 if (ret < 0)
2024 btrfs_cleanup_ordered_extents(inode, file_pos, end);
2025 return ret;
2026 }
2027
2028 /*
2029 * Called for NOCOW writeback. This checks for snapshots or COW copies
2030 * of the extents that exist in the file, and COWs the file as required.
2031 *
2032 * If no cow copies or snapshots exist, we write directly to the existing
2033 * blocks on disk
2034 */
2035 static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
2036 struct folio *locked_folio,
2037 const u64 start, const u64 end)
2038 {
2039 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2040 struct btrfs_root *root = inode->root;
2041 struct btrfs_path *path;
2042 u64 cow_start = (u64)-1;
2043 /*
2044 * If not 0, represents the inclusive end of the last fallback_to_cow()
2045 * range. Only for error handling.
2046 */
2047 u64 cow_end = 0;
2048 u64 cur_offset = start;
2049 int ret;
2050 bool check_prev = true;
2051 u64 ino = btrfs_ino(inode);
2052 struct can_nocow_file_extent_args nocow_args = { 0 };
2053
2054 /*
2055 * Normally on a zoned device we're only doing COW writes, but relocation
2056 * on a zoned filesystem serializes I/O so that we're only writing
2057 * sequentially and can end up here as well.
2058 */
2059 ASSERT(!btrfs_is_zoned(fs_info) || btrfs_is_data_reloc_root(root));
2060
2061 path = btrfs_alloc_path();
2062 if (!path) {
2063 ret = -ENOMEM;
2064 goto error;
2065 }
2066
2067 nocow_args.end = end;
2068 nocow_args.writeback_path = true;
2069
2070 while (cur_offset <= end) {
2071 struct btrfs_block_group *nocow_bg = NULL;
2072 struct btrfs_key found_key;
2073 struct btrfs_file_extent_item *fi;
2074 struct extent_buffer *leaf;
2075 struct extent_state *cached_state = NULL;
2076 u64 extent_end;
2077 int extent_type;
2078
2079 ret = btrfs_lookup_file_extent(NULL, root, path, ino,
2080 cur_offset, 0);
2081 if (ret < 0)
2082 goto error;
2083
2084 /*
2085 * If there is no extent for our range when doing the initial
2086 * search, then go back to the previous slot as it will be the
2087 * one containing the search offset
2088 */
2089 if (ret > 0 && path->slots[0] > 0 && check_prev) {
2090 leaf = path->nodes[0];
2091 btrfs_item_key_to_cpu(leaf, &found_key,
2092 path->slots[0] - 1);
2093 if (found_key.objectid == ino &&
2094 found_key.type == BTRFS_EXTENT_DATA_KEY)
2095 path->slots[0]--;
2096 }
2097 check_prev = false;
2098 next_slot:
2099 /* Go to next leaf if we have exhausted the current one */
2100 leaf = path->nodes[0];
2101 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2102 ret = btrfs_next_leaf(root, path);
2103 if (ret < 0)
2104 goto error;
2105 if (ret > 0)
2106 break;
2107 leaf = path->nodes[0];
2108 }
2109
2110 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2111
2112 /* Didn't find anything for our INO */
2113 if (found_key.objectid > ino)
2114 break;
2115 /*
2116 * Keep searching until we find an EXTENT_ITEM or there are no
2117 * more extents for this inode
2118 */
2119 if (WARN_ON_ONCE(found_key.objectid < ino) ||
2120 found_key.type < BTRFS_EXTENT_DATA_KEY) {
2121 path->slots[0]++;
2122 goto next_slot;
2123 }
2124
2125 /* Found key is not EXTENT_DATA_KEY or starts after req range */
2126 if (found_key.type > BTRFS_EXTENT_DATA_KEY ||
2127 found_key.offset > end)
2128 break;
2129
2130 /*
2131 * If the found extent starts after requested offset, then
2132 * adjust extent_end to be right before this extent begins
2133 */
2134 if (found_key.offset > cur_offset) {
2135 extent_end = found_key.offset;
2136 extent_type = 0;
2137 goto must_cow;
2138 }
2139
2140 /*
2141 * Found extent which begins before our range and potentially
2142 * intersect it
2143 */
2144 fi = btrfs_item_ptr(leaf, path->slots[0],
2145 struct btrfs_file_extent_item);
2146 extent_type = btrfs_file_extent_type(leaf, fi);
2147 /* If this is triggered then we have a memory corruption. */
2148 ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES);
2149 if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) {
2150 ret = -EUCLEAN;
2151 goto error;
2152 }
2153 extent_end = btrfs_file_extent_end(path);
2154
2155 /*
2156 * If the extent we got ends before our current offset, skip to
2157 * the next extent.
2158 */
2159 if (extent_end <= cur_offset) {
2160 path->slots[0]++;
2161 goto next_slot;
2162 }
2163
2164 nocow_args.start = cur_offset;
2165 ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args);
2166 if (ret < 0)
2167 goto error;
2168 if (ret == 0)
2169 goto must_cow;
2170
2171 ret = 0;
2172 nocow_bg = btrfs_inc_nocow_writers(fs_info,
2173 nocow_args.file_extent.disk_bytenr +
2174 nocow_args.file_extent.offset);
2175 if (!nocow_bg) {
2176 must_cow:
2177 /*
2178 * If we can't perform NOCOW writeback for the range,
2179 * then record the beginning of the range that needs to
2180 * be COWed. It will be written out before the next
2181 * NOCOW range if we find one, or when exiting this
2182 * loop.
2183 */
2184 if (cow_start == (u64)-1)
2185 cow_start = cur_offset;
2186 cur_offset = extent_end;
2187 if (cur_offset > end)
2188 break;
2189 if (!path->nodes[0])
2190 continue;
2191 path->slots[0]++;
2192 goto next_slot;
2193 }
2194
2195 /*
2196 * COW the range from cow_start to found_key.offset - 1. The key marks
2197 * the beginning of the first extent that can be NOCOWed, which follows
2198 * the region that still needs to be COWed.
2199 */
2200 if (cow_start != (u64)-1) {
2201 ret = fallback_to_cow(inode, locked_folio, cow_start,
2202 found_key.offset - 1);
2203 if (ret) {
2204 cow_end = found_key.offset - 1;
2205 btrfs_dec_nocow_writers(nocow_bg);
2206 goto error;
2207 }
2208 cow_start = (u64)-1;
2209 }
2210
2211 ret = nocow_one_range(inode, locked_folio, &cached_state,
2212 &nocow_args, cur_offset,
2213 extent_type == BTRFS_FILE_EXTENT_PREALLOC);
2214 btrfs_dec_nocow_writers(nocow_bg);
2215 if (ret < 0)
2216 goto error;
2217 cur_offset = extent_end;
2218 }
2219 btrfs_release_path(path);
2220
2221 if (cur_offset <= end && cow_start == (u64)-1)
2222 cow_start = cur_offset;
2223
2224 if (cow_start != (u64)-1) {
2225 ret = fallback_to_cow(inode, locked_folio, cow_start, end);
2226 if (ret) {
2227 cow_end = end;
2228 goto error;
2229 }
2230 cow_start = (u64)-1;
2231 }
2232
2233 btrfs_free_path(path);
2234 return 0;
2235
2236 error:
2237 /*
2238 * There are several error cases:
2239 *
2240 * 1) Failed without falling back to COW
2241 * start cur_offset end
2242 * |/////////////| |
2243 *
2244 * In this case, cow_start should be (u64)-1.
2245 *
2246 * For range [start, cur_offset) the folios are already unlocked (except
2247 * @locked_folio), EXTENT_DELALLOC already removed.
2248 * Need to clear the dirty flags and finish the ordered extents.
2249 *
2250 * 2) Failed with error before calling fallback_to_cow()
2251 *
2252 * start cow_start end
2253 * |/////////////| |
2254 *
2255 * In this case, only @cow_start is set, @cur_offset is between
2256 * [cow_start, end)
2257 *
2258 * It's mostly the same as case 1), just replace @cur_offset with
2259 * @cow_start.
2260 *
2261 * 3) Failed with error from fallback_to_cow()
2262 *
2263 * start cow_start cow_end end
2264 * |/////////////|-----------| |
2265 *
2266 * In this case, both @cow_start and @cow_end are set.
2267 *
2268 * For range [start, cow_start) it's the same as case 1).
2269 * But for range [cow_start, cow_end), all the cleanup is handled by
2270 * cow_file_range(), we should not touch anything in that range.
2271 *
2272 * So for all above cases, if @cow_start is set, cleanup ordered extents
2273 * for range [start, @cow_start), otherwise cleanup range [start, @cur_offset).
2274 */
2275 if (cow_start != (u64)-1)
2276 cur_offset = cow_start;
2277
2278 if (cur_offset > start) {
2279 btrfs_cleanup_ordered_extents(inode, start, cur_offset - start);
2280 cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret);
2281 }
2282
2283 /*
2284 * If an error happened while a COW region is outstanding, cur_offset
2285 * needs to be reset to @cow_end + 1 to skip the COW range, as
2286 * cow_file_range() will do the proper cleanup at error.
2287 */
2288 if (cow_end)
2289 cur_offset = cow_end + 1;
2290
2291 /*
2292 * We need to lock the extent here because we're clearing DELALLOC and
2293 * we're not locked at this point.
2294 */
2295 if (cur_offset < end) {
2296 struct extent_state *cached = NULL;
2297
2298 lock_extent(&inode->io_tree, cur_offset, end, &cached);
2299 extent_clear_unlock_delalloc(inode, cur_offset, end,
2300 locked_folio, &cached,
2301 EXTENT_LOCKED | EXTENT_DELALLOC |
2302 EXTENT_DEFRAG |
2303 EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
2304 PAGE_START_WRITEBACK |
2305 PAGE_END_WRITEBACK);
2306 btrfs_qgroup_free_data(inode, NULL, cur_offset, end - cur_offset + 1, NULL);
2307 }
2308 btrfs_free_path(path);
2309 btrfs_err_rl(fs_info,
2310 "%s failed, root=%llu inode=%llu start=%llu len=%llu: %d",
2311 __func__, btrfs_root_id(inode->root),
2312 btrfs_ino(inode), start, end + 1 - start, ret);
2313 return ret;
2314 }
2315
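/*
 * NOCOW is only attempted for inodes flagged NODATACOW or with prealloc
 * extents, and even then not when the range was explicitly marked for
 * defrag (defrag must rewrite the data, i.e. COW it).
 */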
2316 static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
2317 {
2318 if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
2319 if (inode->defrag_bytes &&
2320 test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG))
2321 return false;
2322 return true;
2323 }
2324 return false;
2325 }
2326
2327 /*
2328 * Function to process delayed allocation (create CoW) for ranges which are
2329 * being touched for the first time.
2330 */
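/*
 * Dispatch order below: NOCOW/prealloc inodes go through
 * run_delalloc_nocow(), compressible ranges are handed to the async
 * compression machinery, zoned filesystems use run_delalloc_cow() to keep
 * writes sequential, and everything else falls back to plain
 * cow_file_range().
 */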
2331 int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_folio,
2332 u64 start, u64 end, struct writeback_control *wbc)
2333 {
2334 const bool zoned = btrfs_is_zoned(inode->root->fs_info);
2335 int ret;
2336
2337 /*
2338 * The range must cover part of the @locked_folio, or a return of 1
2339 * can confuse the caller.
2340 */
2341 ASSERT(!(end <= folio_pos(locked_folio) ||
2342 start >= folio_pos(locked_folio) + folio_size(locked_folio)));
2343
2344 if (should_nocow(inode, start, end)) {
2345 ret = run_delalloc_nocow(inode, locked_folio, start, end);
2346 return ret;
2347 }
2348
2349 if (btrfs_inode_can_compress(inode) &&
2350 inode_need_compress(inode, start, end) &&
2351 run_delalloc_compressed(inode, locked_folio, start, end, wbc))
2352 return 1;
2353
2354 if (zoned)
2355 ret = run_delalloc_cow(inode, locked_folio, start, end, wbc,
2356 true);
2357 else
2358 ret = cow_file_range(inode, locked_folio, start, end, NULL,
2359 false, false);
2360 return ret;
2361 }
2362
2363 void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
2364 struct extent_state *orig, u64 split)
2365 {
2366 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2367 u64 size;
2368
2369 lockdep_assert_held(&inode->io_tree.lock);
2370
2371 /* not delalloc, ignore it */
2372 if (!(orig->state & EXTENT_DELALLOC))
2373 return;
2374
2375 size = orig->end - orig->start + 1;
2376 if (size > fs_info->max_extent_size) {
2377 u32 num_extents;
2378 u64 new_size;
2379
2380 /*
2381 * See the explanation in btrfs_merge_delalloc_extent, the same
2382 * applies here, just in reverse.
2383 */
2384 new_size = orig->end - split + 1;
2385 num_extents = count_max_extents(fs_info, new_size);
2386 new_size = split - orig->start;
2387 num_extents += count_max_extents(fs_info, new_size);
2388 if (count_max_extents(fs_info, size) >= num_extents)
2389 return;
2390 }
2391
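/*
 * Example assuming a 128M max_extent_size: splitting a 130M delalloc
 * extent (2 outstanding extents) at 1M from its start leaves a 1M and a
 * 129M piece needing 1 + 2 = 3 extents, so one more must be added below.
 * Had the split been right down the middle (two 65M pieces needing
 * 1 + 1 = 2), we would have returned above instead.
 */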
2392 spin_lock(&inode->lock);
2393 btrfs_mod_outstanding_extents(inode, 1);
2394 spin_unlock(&inode->lock);
2395 }
2396
2397 /*
2398 * Handle merged delayed allocation extents so we can keep track of new extents
2399 * that are just merged onto old extents, such as when we are doing sequential
2400 * writes, so we can properly account for the metadata space we'll need.
2401 */
2402 void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
2403 struct extent_state *other)
2404 {
2405 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2406 u64 new_size, old_size;
2407 u32 num_extents;
2408
2409 lockdep_assert_held(&inode->io_tree.lock);
2410
2411 /* not delalloc, ignore it */
2412 if (!(other->state & EXTENT_DELALLOC))
2413 return;
2414
2415 if (new->start > other->start)
2416 new_size = new->end - other->start + 1;
2417 else
2418 new_size = other->end - new->start + 1;
2419
2420 /* we're not bigger than the max, unreserve the space and go */
2421 if (new_size <= fs_info->max_extent_size) {
2422 spin_lock(&inode->lock);
2423 btrfs_mod_outstanding_extents(inode, -1);
2424 spin_unlock(&inode->lock);
2425 return;
2426 }
2427
2428 /*
2429 * We have to add up either side to figure out how many extents were
2430 * accounted for before we merged into one big extent. If the number of
2431 * extents we accounted for is <= the amount we need for the new range
2432 * then we can return, otherwise drop. Think of it like this
2433 *
2434 * [ 4k][MAX_SIZE]
2435 *
2436 * So we've grown the extent by a MAX_SIZE extent, this would mean we
2437 * need 2 outstanding extents, on one side we have 1 and the other side
2438 * we have 1 so they are == and we can return. But in this case
2439 *
2440 * [MAX_SIZE+4k][MAX_SIZE+4k]
2441 *
2442 * Each range on their own accounts for 2 extents, but merged together
2443 * they are only 3 extents worth of accounting, so we need to drop in
2444 * this case.
2445 */
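/*
 * Assuming a 128M max_extent_size, the second example above works out to:
 * each [MAX_SIZE + 4K] range accounted for 2 extents (4 in total), but the
 * merged 256M + 8K range only needs count_max_extents() = 3, so one
 * outstanding extent is dropped below.
 */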
2446 old_size = other->end - other->start + 1;
2447 num_extents = count_max_extents(fs_info, old_size);
2448 old_size = new->end - new->start + 1;
2449 num_extents += count_max_extents(fs_info, old_size);
2450 if (count_max_extents(fs_info, new_size) >= num_extents)
2451 return;
2452
2453 spin_lock(&inode->lock);
2454 btrfs_mod_outstanding_extents(inode, -1);
2455 spin_unlock(&inode->lock);
2456 }
2457
2458 static void btrfs_add_delalloc_inode(struct btrfs_inode *inode)
2459 {
2460 struct btrfs_root *root = inode->root;
2461 struct btrfs_fs_info *fs_info = root->fs_info;
2462
2463 spin_lock(&root->delalloc_lock);
2464 ASSERT(list_empty(&inode->delalloc_inodes));
2465 list_add_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
2466 root->nr_delalloc_inodes++;
2467 if (root->nr_delalloc_inodes == 1) {
2468 spin_lock(&fs_info->delalloc_root_lock);
2469 ASSERT(list_empty(&root->delalloc_root));
2470 list_add_tail(&root->delalloc_root, &fs_info->delalloc_roots);
2471 spin_unlock(&fs_info->delalloc_root_lock);
2472 }
2473 spin_unlock(&root->delalloc_lock);
2474 }
2475
2476 void btrfs_del_delalloc_inode(struct btrfs_inode *inode)
2477 {
2478 struct btrfs_root *root = inode->root;
2479 struct btrfs_fs_info *fs_info = root->fs_info;
2480
2481 lockdep_assert_held(&root->delalloc_lock);
2482
2483 /*
2484 * We may be called after the inode was already deleted from the list,
2485 * namely in the transaction abort path btrfs_destroy_delalloc_inodes(),
2486 * and then later through btrfs_clear_delalloc_extent() while the inode
2487 * still has ->delalloc_bytes > 0.
2488 */
2489 if (!list_empty(&inode->delalloc_inodes)) {
2490 list_del_init(&inode->delalloc_inodes);
2491 root->nr_delalloc_inodes--;
2492 if (!root->nr_delalloc_inodes) {
2493 ASSERT(list_empty(&root->delalloc_inodes));
2494 spin_lock(&fs_info->delalloc_root_lock);
2495 ASSERT(!list_empty(&root->delalloc_root));
2496 list_del_init(&root->delalloc_root);
2497 spin_unlock(&fs_info->delalloc_root_lock);
2498 }
2499 }
2500 }
2501
2502 /*
2503 * Properly track delayed allocation bytes in the inode and maintain the
2504 * list of inodes that have pending delalloc work to be done.
2505 */
2506 void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
2507 u32 bits)
2508 {
2509 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2510
2511 lockdep_assert_held(&inode->io_tree.lock);
2512
2513 if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
2514 WARN_ON(1);
2515 /*
2516 * set_bit and clear bit hooks normally require _irqsave/restore
2517 * but in this case, we are only testing for the DELALLOC
2518 * bit, which is only set or cleared with irqs on
2519 */
2520 if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2521 u64 len = state->end + 1 - state->start;
2522 u64 prev_delalloc_bytes;
2523 u32 num_extents = count_max_extents(fs_info, len);
2524
2525 spin_lock(&inode->lock);
2526 btrfs_mod_outstanding_extents(inode, num_extents);
2527 spin_unlock(&inode->lock);
2528
2529 /* For sanity tests */
2530 if (btrfs_is_testing(fs_info))
2531 return;
2532
2533 percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
2534 fs_info->delalloc_batch);
2535 spin_lock(&inode->lock);
2536 prev_delalloc_bytes = inode->delalloc_bytes;
2537 inode->delalloc_bytes += len;
2538 if (bits & EXTENT_DEFRAG)
2539 inode->defrag_bytes += len;
2540 spin_unlock(&inode->lock);
2541
2542 /*
2543 * We don't need to be under the protection of the inode's lock,
2544 * because we are called while holding the inode's io_tree lock
2545 * and are therefore protected against concurrent calls of this
2546 * function and btrfs_clear_delalloc_extent().
2547 */
2548 if (!btrfs_is_free_space_inode(inode) && prev_delalloc_bytes == 0)
2549 btrfs_add_delalloc_inode(inode);
2550 }
2551
2552 if (!(state->state & EXTENT_DELALLOC_NEW) &&
2553 (bits & EXTENT_DELALLOC_NEW)) {
2554 spin_lock(&inode->lock);
2555 inode->new_delalloc_bytes += state->end + 1 - state->start;
2556 spin_unlock(&inode->lock);
2557 }
2558 }
2559
2560 /*
2561 * Once a range is no longer delalloc this function ensures that proper
2562 * accounting happens.
2563 */
2564 void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
2565 struct extent_state *state, u32 bits)
2566 {
2567 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2568 u64 len = state->end + 1 - state->start;
2569 u32 num_extents = count_max_extents(fs_info, len);
2570
2571 lockdep_assert_held(&inode->io_tree.lock);
2572
2573 if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
2574 spin_lock(&inode->lock);
2575 inode->defrag_bytes -= len;
2576 spin_unlock(&inode->lock);
2577 }
2578
2579 /*
2580 * set_bit and clear bit hooks normally require _irqsave/restore
2581 * but in this case, we are only testing for the DELALLOC
2582 * bit, which is only set or cleared with irqs on
2583 */
2584 if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
2585 struct btrfs_root *root = inode->root;
2586 u64 new_delalloc_bytes;
2587
2588 spin_lock(&inode->lock);
2589 btrfs_mod_outstanding_extents(inode, -num_extents);
2590 spin_unlock(&inode->lock);
2591
2592 /*
2593 * We don't reserve metadata space for space cache inodes so we
2594 * don't need to call delalloc_release_metadata if there is an
2595 * error.
2596 */
2597 if (bits & EXTENT_CLEAR_META_RESV &&
2598 root != fs_info->tree_root)
2599 btrfs_delalloc_release_metadata(inode, len, true);
2600
2601 /* For sanity tests. */
2602 if (btrfs_is_testing(fs_info))
2603 return;
2604
2605 if (!btrfs_is_data_reloc_root(root) &&
2606 !btrfs_is_free_space_inode(inode) &&
2607 !(state->state & EXTENT_NORESERVE) &&
2608 (bits & EXTENT_CLEAR_DATA_RESV))
2609 btrfs_free_reserved_data_space_noquota(fs_info, len);
2610
2611 percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
2612 fs_info->delalloc_batch);
2613 spin_lock(&inode->lock);
2614 inode->delalloc_bytes -= len;
2615 new_delalloc_bytes = inode->delalloc_bytes;
2616 spin_unlock(&inode->lock);
2617
2618 /*
2619 * We don't need to be under the protection of the inode's lock,
2620 * because we are called while holding the inode's io_tree lock
2621 * and are therefore protected against concurrent calls of this
2622 * function and btrfs_set_delalloc_extent().
2623 */
2624 if (!btrfs_is_free_space_inode(inode) && new_delalloc_bytes == 0) {
2625 spin_lock(&root->delalloc_lock);
2626 btrfs_del_delalloc_inode(inode);
2627 spin_unlock(&root->delalloc_lock);
2628 }
2629 }
2630
2631 if ((state->state & EXTENT_DELALLOC_NEW) &&
2632 (bits & EXTENT_DELALLOC_NEW)) {
2633 spin_lock(&inode->lock);
2634 ASSERT(inode->new_delalloc_bytes >= len);
2635 inode->new_delalloc_bytes -= len;
2636 if (bits & EXTENT_ADD_INODE_BYTES)
2637 inode_add_bytes(&inode->vfs_inode, len);
2638 spin_unlock(&inode->lock);
2639 }
2640 }
2641
2642 /*
2643 * Given a list of ordered sums, record them in the inode. This happens
2644 * at IO completion time based on sums calculated at bio submission time.
2645 */
2646 static int add_pending_csums(struct btrfs_trans_handle *trans,
2647 struct list_head *list)
2648 {
2649 struct btrfs_ordered_sum *sum;
2650 struct btrfs_root *csum_root = NULL;
2651 int ret;
2652
2653 list_for_each_entry(sum, list, list) {
2654 trans->adding_csums = true;
2655 if (!csum_root)
2656 csum_root = btrfs_csum_root(trans->fs_info,
2657 sum->logical);
2658 ret = btrfs_csum_file_blocks(trans, csum_root, sum);
2659 trans->adding_csums = false;
2660 if (ret)
2661 return ret;
2662 }
2663 return 0;
2664 }
2665
2666 static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode,
2667 const u64 start,
2668 const u64 len,
2669 struct extent_state **cached_state)
2670 {
2671 u64 search_start = start;
2672 const u64 end = start + len - 1;
2673
2674 while (search_start < end) {
2675 const u64 search_len = end - search_start + 1;
2676 struct extent_map *em;
2677 u64 em_len;
2678 int ret = 0;
2679
2680 em = btrfs_get_extent(inode, NULL, search_start, search_len);
2681 if (IS_ERR(em))
2682 return PTR_ERR(em);
2683
2684 if (em->disk_bytenr != EXTENT_MAP_HOLE)
2685 goto next;
2686
2687 em_len = em->len;
2688 if (em->start < search_start)
2689 em_len -= search_start - em->start;
2690 if (em_len > search_len)
2691 em_len = search_len;
2692
2693 ret = set_extent_bit(&inode->io_tree, search_start,
2694 search_start + em_len - 1,
2695 EXTENT_DELALLOC_NEW, cached_state);
2696 next:
2697 search_start = extent_map_end(em);
2698 free_extent_map(em);
2699 if (ret)
2700 return ret;
2701 }
2702 return 0;
2703 }
2704
2705 int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
2706 unsigned int extra_bits,
2707 struct extent_state **cached_state)
2708 {
2709 WARN_ON(PAGE_ALIGNED(end));
2710
2711 if (start >= i_size_read(&inode->vfs_inode) &&
2712 !(inode->flags & BTRFS_INODE_PREALLOC)) {
2713 /*
2714 * There can't be any extents following eof in this case so just
2715 * set the delalloc new bit for the range directly.
2716 */
2717 extra_bits |= EXTENT_DELALLOC_NEW;
2718 } else {
2719 int ret;
2720
2721 ret = btrfs_find_new_delalloc_bytes(inode, start,
2722 end + 1 - start,
2723 cached_state);
2724 if (ret)
2725 return ret;
2726 }
2727
2728 return set_extent_bit(&inode->io_tree, start, end,
2729 EXTENT_DELALLOC | extra_bits, cached_state);
2730 }
2731
2732 /* see btrfs_writepage_start_hook for details on why this is required */
2733 struct btrfs_writepage_fixup {
2734 struct folio *folio;
2735 struct btrfs_inode *inode;
2736 struct btrfs_work work;
2737 };
2738
2739 static void btrfs_writepage_fixup_worker(struct btrfs_work *work)
2740 {
2741 struct btrfs_writepage_fixup *fixup =
2742 container_of(work, struct btrfs_writepage_fixup, work);
2743 struct btrfs_ordered_extent *ordered;
2744 struct extent_state *cached_state = NULL;
2745 struct extent_changeset *data_reserved = NULL;
2746 struct folio *folio = fixup->folio;
2747 struct btrfs_inode *inode = fixup->inode;
2748 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2749 u64 page_start = folio_pos(folio);
2750 u64 page_end = folio_pos(folio) + folio_size(folio) - 1;
2751 int ret = 0;
2752 bool free_delalloc_space = true;
2753
2754 /*
2755 * This is similar to page_mkwrite, we need to reserve the space before
2756 * we take the folio lock.
2757 */
2758 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start,
2759 folio_size(folio));
2760 again:
2761 folio_lock(folio);
2762
2763 /*
2764 * Before we queued this fixup, we took a reference on the folio.
2765 * folio->mapping may go NULL, but it shouldn't be moved to a different
2766 * address space.
2767 */
2768 if (!folio->mapping || !folio_test_dirty(folio) ||
2769 !folio_test_checked(folio)) {
2770 /*
2771 * Unfortunately this is a little tricky, either
2772 *
2773 * 1) We got here and our folio had already been dealt with and
2774 * we reserved our space, thus ret == 0, so we need to just
2775 * drop our space reservation and bail. This can happen the
2776 * first time we come into the fixup worker, or could happen
2777 * while waiting for the ordered extent.
2778 * 2) Our folio was already dealt with, but we happened to get an
2779 * ENOSPC above from the btrfs_delalloc_reserve_space. In
2780 * this case we obviously don't have anything to release, but
2781 * because the folio was already dealt with we don't want to
2782 * mark the folio with an error, so make sure we're resetting
2783 * ret to 0. This is why we have this check _before_ the ret
2784 * check, because we do not want to have a surprise ENOSPC
2785 * when the folio was already properly dealt with.
2786 */
2787 if (!ret) {
2788 btrfs_delalloc_release_extents(inode, folio_size(folio));
2789 btrfs_delalloc_release_space(inode, data_reserved,
2790 page_start, folio_size(folio),
2791 true);
2792 }
2793 ret = 0;
2794 goto out_page;
2795 }
2796
2797 /*
2798 * We can't mess with the folio state unless it is locked, so now that
2799 * it is locked bail if we failed to make our space reservation.
2800 */
2801 if (ret)
2802 goto out_page;
2803
2804 lock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2805
2806 /* already ordered? We're done */
2807 if (folio_test_ordered(folio))
2808 goto out_reserved;
2809
2810 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
2811 if (ordered) {
2812 unlock_extent(&inode->io_tree, page_start, page_end,
2813 &cached_state);
2814 folio_unlock(folio);
2815 btrfs_start_ordered_extent(ordered);
2816 btrfs_put_ordered_extent(ordered);
2817 goto again;
2818 }
2819
2820 ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0,
2821 &cached_state);
2822 if (ret)
2823 goto out_reserved;
2824
2825 /*
2826 * Everything went as planned, we're now the owner of a dirty page with
2827 * delayed allocation bits set and space reserved for our COW
2828 * destination.
2829 *
2830 * The page was dirty when we started, nothing should have cleaned it.
2831 */
2832 BUG_ON(!folio_test_dirty(folio));
2833 free_delalloc_space = false;
2834 out_reserved:
2835 btrfs_delalloc_release_extents(inode, PAGE_SIZE);
2836 if (free_delalloc_space)
2837 btrfs_delalloc_release_space(inode, data_reserved, page_start,
2838 PAGE_SIZE, true);
2839 unlock_extent(&inode->io_tree, page_start, page_end, &cached_state);
2840 out_page:
2841 if (ret) {
2842 /*
2843 * We hit ENOSPC or other errors. Update the mapping and page
2844 * to reflect the errors and clean the page.
2845 */
2846 mapping_set_error(folio->mapping, ret);
2847 btrfs_mark_ordered_io_finished(inode, folio, page_start,
2848 folio_size(folio), !ret);
2849 folio_clear_dirty_for_io(folio);
2850 }
2851 btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE);
2852 folio_unlock(folio);
2853 folio_put(folio);
2854 kfree(fixup);
2855 extent_changeset_free(data_reserved);
2856 /*
2857 * As a precaution, do a delayed iput in case it would be the last iput
2858 * that could need flushing space. Recursing back to fixup worker would
2859 * deadlock.
2860 */
2861 btrfs_add_delayed_iput(inode);
2862 }
2863
2864 /*
2865 * There are a few paths in the higher layers of the kernel that directly
2866 * set the folio dirty bit without asking the filesystem if it is a
2867 * good idea. This causes problems because we want to make sure COW
2868 * properly happens and the data=ordered rules are followed.
2869 *
2870 * In our case any range that doesn't have the ORDERED bit set
2871 * hasn't been properly setup for IO. We kick off an async process
2872 * to fix it up. The async helper will wait for ordered extents, set
2873 * the delalloc bit and make it safe to write the folio.
2874 */
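/*
 * Rough sequence of the fixup worker: reserve delalloc space, lock the
 * folio, wait out any ordered extent covering it, mark the range delalloc
 * again and leave the folio dirty so a later writepage pass handles it
 * through the normal COW path.
 */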
2875 int btrfs_writepage_cow_fixup(struct folio *folio)
2876 {
2877 struct inode *inode = folio->mapping->host;
2878 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2879 struct btrfs_writepage_fixup *fixup;
2880
2881 /* This folio has ordered extent covering it already */
2882 if (folio_test_ordered(folio))
2883 return 0;
2884
2885 /*
2886 * For experimental builds, we error out instead of returning EAGAIN.
2887 *
2888 * We should not hit such out-of-band dirty folios anymore.
2889 */
2890 if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) {
2891 WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
2892 btrfs_err_rl(fs_info,
2893 "root %lld ino %llu folio %llu is marked dirty without notifying the fs",
2894 BTRFS_I(inode)->root->root_key.objectid,
2895 btrfs_ino(BTRFS_I(inode)),
2896 folio_pos(folio));
2897 return -EUCLEAN;
2898 }
2899
2900 /*
2901 * folio_checked is set below when we create a fixup worker for this
2902 * folio, don't try to create another one if we're already
2903 * folio_test_checked.
2904 *
2905 * The extent_io writepage code will redirty the folio if we send back
2906 * EAGAIN.
2907 */
2908 if (folio_test_checked(folio))
2909 return -EAGAIN;
2910
2911 fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
2912 if (!fixup)
2913 return -EAGAIN;
2914
2915 /*
2916 * We are already holding a reference to this inode from
2917 * write_cache_pages. We need to hold it because the space reservation
2918 * takes place outside of the folio lock, and we can't trust
2919 * folio->mapping outside of the folio lock.
2920 */
2921 ihold(inode);
2922 btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
2923 folio_get(folio);
2924 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL);
2925 fixup->folio = folio;
2926 fixup->inode = BTRFS_I(inode);
2927 btrfs_queue_work(fs_info->fixup_workers, &fixup->work);
2928
2929 return -EAGAIN;
2930 }
2931
2932 static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
2933 struct btrfs_inode *inode, u64 file_pos,
2934 struct btrfs_file_extent_item *stack_fi,
2935 const bool update_inode_bytes,
2936 u64 qgroup_reserved)
2937 {
2938 struct btrfs_root *root = inode->root;
2939 const u64 sectorsize = root->fs_info->sectorsize;
2940 struct btrfs_path *path;
2941 struct extent_buffer *leaf;
2942 struct btrfs_key ins;
2943 u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi);
2944 u64 disk_bytenr = btrfs_stack_file_extent_disk_bytenr(stack_fi);
2945 u64 offset = btrfs_stack_file_extent_offset(stack_fi);
2946 u64 num_bytes = btrfs_stack_file_extent_num_bytes(stack_fi);
2947 u64 ram_bytes = btrfs_stack_file_extent_ram_bytes(stack_fi);
2948 struct btrfs_drop_extents_args drop_args = { 0 };
2949 int ret;
2950
2951 path = btrfs_alloc_path();
2952 if (!path)
2953 return -ENOMEM;
2954
2955 /*
2956 * we may be replacing one extent in the tree with another.
2957 * The new extent is pinned in the extent map, and we don't want
2958 * to drop it from the cache until it is completely in the btree.
2959 *
2960 * So, tell btrfs_drop_extents to leave this extent in the cache.
2961 * the caller is expected to unpin it and allow it to be merged
2962 * with the others.
2963 */
2964 drop_args.path = path;
2965 drop_args.start = file_pos;
2966 drop_args.end = file_pos + num_bytes;
2967 drop_args.replace_extent = true;
2968 drop_args.extent_item_size = sizeof(*stack_fi);
2969 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2970 if (ret)
2971 goto out;
2972
2973 if (!drop_args.extent_inserted) {
2974 ins.objectid = btrfs_ino(inode);
2975 ins.type = BTRFS_EXTENT_DATA_KEY;
2976 ins.offset = file_pos;
2977
2978 ret = btrfs_insert_empty_item(trans, root, path, &ins,
2979 sizeof(*stack_fi));
2980 if (ret)
2981 goto out;
2982 }
2983 leaf = path->nodes[0];
2984 btrfs_set_stack_file_extent_generation(stack_fi, trans->transid);
2985 write_extent_buffer(leaf, stack_fi,
2986 btrfs_item_ptr_offset(leaf, path->slots[0]),
2987 sizeof(struct btrfs_file_extent_item));
2988
2989 btrfs_release_path(path);
2990
2991 /*
2992 * If we dropped an inline extent here, we know the range where it is
2993 * was not marked with the EXTENT_DELALLOC_NEW bit, so we update the
2994 * number of bytes only for that range containing the inline extent.
2995 * The remainder of the range will be processed when clearing the
2996 * EXTENT_DELALLOC_NEW bit through ordered extent completion.
2997 */
2998 if (file_pos == 0 && !IS_ALIGNED(drop_args.bytes_found, sectorsize)) {
2999 u64 inline_size = round_down(drop_args.bytes_found, sectorsize);
3000
3001 inline_size = drop_args.bytes_found - inline_size;
3002 btrfs_update_inode_bytes(inode, sectorsize, inline_size);
3003 drop_args.bytes_found -= inline_size;
3004 num_bytes -= sectorsize;
3005 }
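/*
 * Example with a 4K sectorsize: replacing a 500 byte inline extent means
 * drop_args.bytes_found is not sector aligned, so above we add one full
 * sector (4096) and subtract the 500 inline bytes from the inode's byte
 * count, then trim num_bytes by one sector so that part of the range is
 * not counted again later.
 */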
3006
3007 if (update_inode_bytes)
3008 btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found);
3009
3010 ins.objectid = disk_bytenr;
3011 ins.type = BTRFS_EXTENT_ITEM_KEY;
3012 ins.offset = disk_num_bytes;
3013
3014 ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes);
3015 if (ret)
3016 goto out;
3017
3018 ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(inode),
3019 file_pos - offset,
3020 qgroup_reserved, &ins);
3021 out:
3022 btrfs_free_path(path);
3023
3024 return ret;
3025 }
3026
3027 static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info,
3028 u64 start, u64 len)
3029 {
3030 struct btrfs_block_group *cache;
3031
3032 cache = btrfs_lookup_block_group(fs_info, start);
3033 ASSERT(cache);
3034
3035 spin_lock(&cache->lock);
3036 cache->delalloc_bytes -= len;
3037 spin_unlock(&cache->lock);
3038
3039 btrfs_put_block_group(cache);
3040 }
3041
3042 static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
3043 struct btrfs_ordered_extent *oe)
3044 {
3045 struct btrfs_file_extent_item stack_fi;
3046 bool update_inode_bytes;
3047 u64 num_bytes = oe->num_bytes;
3048 u64 ram_bytes = oe->ram_bytes;
3049
3050 memset(&stack_fi, 0, sizeof(stack_fi));
3051 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_REG);
3052 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, oe->disk_bytenr);
3053 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
3054 oe->disk_num_bytes);
3055 btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
3056 if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
3057 num_bytes = oe->truncated_len;
3058 btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
3059 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
3060 btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
3061 /* Encryption and other encoding is reserved and all 0 */
3062
3063 /*
3064 * For delalloc, when completing an ordered extent we update the inode's
3065 * bytes when clearing the range in the inode's io tree, so pass false
3066 * as the argument 'update_inode_bytes' to insert_reserved_file_extent(),
3067 * except if the ordered extent was truncated.
3068 */
3069 update_inode_bytes = test_bit(BTRFS_ORDERED_DIRECT, &oe->flags) ||
3070 test_bit(BTRFS_ORDERED_ENCODED, &oe->flags) ||
3071 test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags);
3072
3073 return insert_reserved_file_extent(trans, oe->inode,
3074 oe->file_offset, &stack_fi,
3075 update_inode_bytes, oe->qgroup_rsv);
3076 }
3077
3078 /*
3079 * As ordered data IO finishes, this gets called so we can finish
3080 * an ordered extent if the range of bytes in the file it covers are
3081 * fully written.
3082 */
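/*
 * High level flow: join a transaction, insert the file extent item (or
 * mark a prealloc extent written), unpin the extent map, add the data
 * checksums, update the inode item, and on failure hand the reserved
 * extent and the qgroup data reservation back.
 */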
3083 int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent)
3084 {
3085 struct btrfs_inode *inode = ordered_extent->inode;
3086 struct btrfs_root *root = inode->root;
3087 struct btrfs_fs_info *fs_info = root->fs_info;
3088 struct btrfs_trans_handle *trans = NULL;
3089 struct extent_io_tree *io_tree = &inode->io_tree;
3090 struct extent_state *cached_state = NULL;
3091 u64 start, end;
3092 int compress_type = 0;
3093 int ret = 0;
3094 u64 logical_len = ordered_extent->num_bytes;
3095 bool freespace_inode;
3096 bool truncated = false;
3097 bool clear_reserved_extent = true;
3098 unsigned int clear_bits = EXTENT_DEFRAG;
3099
3100 start = ordered_extent->file_offset;
3101 end = start + ordered_extent->num_bytes - 1;
3102
3103 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3104 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) &&
3105 !test_bit(BTRFS_ORDERED_DIRECT, &ordered_extent->flags) &&
3106 !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags))
3107 clear_bits |= EXTENT_DELALLOC_NEW;
3108
3109 freespace_inode = btrfs_is_free_space_inode(inode);
3110 if (!freespace_inode)
3111 btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent);
3112
3113 if (test_bit(BTRFS_ORDERED_IOERR, &ordered_extent->flags)) {
3114 ret = -EIO;
3115 goto out;
3116 }
3117
3118 if (btrfs_is_zoned(fs_info))
3119 btrfs_zone_finish_endio(fs_info, ordered_extent->disk_bytenr,
3120 ordered_extent->disk_num_bytes);
3121
3122 if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
3123 truncated = true;
3124 logical_len = ordered_extent->truncated_len;
3125 /* Truncated the entire extent, don't bother adding */
3126 if (!logical_len)
3127 goto out;
3128 }
3129
3130 /*
3131 * If it's a COW write we need to lock the extent range as we will be
3132 * inserting/replacing file extent items and unpinning an extent map.
3133 * This must be taken before joining a transaction, as it's a higher
3134 * level lock (like the inode's VFS lock), otherwise we can run into an
3135 * ABBA deadlock with other tasks (transactions work like a lock,
3136 * depending on their current state).
3137 */
3138 if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3139 clear_bits |= EXTENT_LOCKED;
3140 lock_extent(io_tree, start, end, &cached_state);
3141 }
3142
3143 if (freespace_inode)
3144 trans = btrfs_join_transaction_spacecache(root);
3145 else
3146 trans = btrfs_join_transaction(root);
3147 if (IS_ERR(trans)) {
3148 ret = PTR_ERR(trans);
3149 trans = NULL;
3150 goto out;
3151 }
3152
3153 trans->block_rsv = &inode->block_rsv;
3154
3155 ret = btrfs_insert_raid_extent(trans, ordered_extent);
3156 if (ret) {
3157 btrfs_abort_transaction(trans, ret);
3158 goto out;
3159 }
3160
3161 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
3162 /* Logic error */
3163 ASSERT(list_empty(&ordered_extent->list));
3164 if (!list_empty(&ordered_extent->list)) {
3165 ret = -EINVAL;
3166 btrfs_abort_transaction(trans, ret);
3167 goto out;
3168 }
3169
3170 btrfs_inode_safe_disk_i_size_write(inode, 0);
3171 ret = btrfs_update_inode_fallback(trans, inode);
3172 if (ret) {
3173 /* -ENOMEM or corruption */
3174 btrfs_abort_transaction(trans, ret);
3175 }
3176 goto out;
3177 }
3178
3179 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
3180 compress_type = ordered_extent->compress_type;
3181 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3182 BUG_ON(compress_type);
3183 ret = btrfs_mark_extent_written(trans, inode,
3184 ordered_extent->file_offset,
3185 ordered_extent->file_offset +
3186 logical_len);
3187 btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr,
3188 ordered_extent->disk_num_bytes);
3189 } else {
3190 BUG_ON(root == fs_info->tree_root);
3191 ret = insert_ordered_extent_file_extent(trans, ordered_extent);
3192 if (!ret) {
3193 clear_reserved_extent = false;
3194 btrfs_release_delalloc_bytes(fs_info,
3195 ordered_extent->disk_bytenr,
3196 ordered_extent->disk_num_bytes);
3197 }
3198 }
3199 if (ret < 0) {
3200 btrfs_abort_transaction(trans, ret);
3201 goto out;
3202 }
3203
3204 ret = unpin_extent_cache(inode, ordered_extent->file_offset,
3205 ordered_extent->num_bytes, trans->transid);
3206 if (ret < 0) {
3207 btrfs_abort_transaction(trans, ret);
3208 goto out;
3209 }
3210
3211 ret = add_pending_csums(trans, &ordered_extent->list);
3212 if (ret) {
3213 btrfs_abort_transaction(trans, ret);
3214 goto out;
3215 }
3216
3217 /*
3218 * If this is a new delalloc range, clear its new delalloc flag to
3219 * update the inode's number of bytes. This needs to be done first
3220 * before updating the inode item.
3221 */
3222 if ((clear_bits & EXTENT_DELALLOC_NEW) &&
3223 !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags))
3224 clear_extent_bit(&inode->io_tree, start, end,
3225 EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES,
3226 &cached_state);
3227
3228 btrfs_inode_safe_disk_i_size_write(inode, 0);
3229 ret = btrfs_update_inode_fallback(trans, inode);
3230 if (ret) { /* -ENOMEM or corruption */
3231 btrfs_abort_transaction(trans, ret);
3232 goto out;
3233 }
3234 out:
3235 clear_extent_bit(&inode->io_tree, start, end, clear_bits,
3236 &cached_state);
3237
3238 if (trans)
3239 btrfs_end_transaction(trans);
3240
3241 if (ret || truncated) {
3242 u64 unwritten_start = start;
3243
3244 /*
3245 * If we failed to finish this ordered extent for any reason we
3246 * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
3247 * extent, and mark the inode with the error if it wasn't
3248 * already set. Any error during writeback would have already
3249 * set the mapping error, so we need to set it if we're the ones
3250 * marking this ordered extent as failed.
3251 */
3252 if (ret)
3253 btrfs_mark_ordered_extent_error(ordered_extent);
3254
3255 if (truncated)
3256 unwritten_start += logical_len;
3257 clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
3258
3259 /*
3260 * Drop extent maps for the part of the extent we didn't write.
3261 *
3262 * We have an exception here for the free_space_inode, this is
3263 * because when we do btrfs_get_extent() on the free space inode
3264 * we will search the commit root. If this is a new block group
3265 * we won't find anything, and we will trip over the assert in
3266 * writepage where we do ASSERT(em->block_start !=
3267 * EXTENT_MAP_HOLE).
3268 *
3269 * Theoretically we could also skip this for any NOCOW extent as
3270 * we don't mess with the extent map tree in the NOCOW case, but
3271 * for now simply skip this if we are the free space inode.
3272 */
3273 if (!btrfs_is_free_space_inode(inode))
3274 btrfs_drop_extent_map_range(inode, unwritten_start,
3275 end, false);
3276
3277 /*
3278 * If the ordered extent had an IOERR or something else went
3279 * wrong we need to return the space for this ordered extent
3280 * back to the allocator. We only free the extent in the
3281 * truncated case if we didn't write out the extent at all.
3282 *
3283 * If we made it past insert_reserved_file_extent before we
3284 * errored out then we don't need to do this as the accounting
3285 * has already been done.
3286 */
3287 if ((ret || !logical_len) &&
3288 clear_reserved_extent &&
3289 !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
3290 !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
3291 /*
3292 * Discard the range before returning it back to the
3293 * free space pool
3294 */
3295 if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC))
3296 btrfs_discard_extent(fs_info,
3297 ordered_extent->disk_bytenr,
3298 ordered_extent->disk_num_bytes,
3299 NULL);
3300 btrfs_free_reserved_extent(fs_info,
3301 ordered_extent->disk_bytenr,
3302 ordered_extent->disk_num_bytes, 1);
3303 /*
3304 * Actually free the qgroup rsv which was released when
3305 * the ordered extent was created.
3306 */
3307 btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(inode->root),
3308 ordered_extent->qgroup_rsv,
3309 BTRFS_QGROUP_RSV_DATA);
3310 }
3311 }
3312
3313 /*
3314 * This needs to be done to make sure anybody waiting knows we are done
3315 * updating everything for this ordered extent.
3316 */
3317 btrfs_remove_ordered_extent(inode, ordered_extent);
3318
3319 /* once for us */
3320 btrfs_put_ordered_extent(ordered_extent);
3321 /* once for the tree */
3322 btrfs_put_ordered_extent(ordered_extent);
3323
3324 return ret;
3325 }
3326
3327 int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered)
3328 {
3329 if (btrfs_is_zoned(ordered->inode->root->fs_info) &&
3330 !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) &&
3331 list_empty(&ordered->bioc_list))
3332 btrfs_finish_ordered_zoned(ordered);
3333 return btrfs_finish_one_ordered(ordered);
3334 }
3335
3336 /*
3337  * Verify the checksum for a single sector without any extra action that
3338  * depends on the type of I/O.
3339 */
3340 int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
3341 u32 pgoff, u8 *csum, const u8 * const csum_expected)
3342 {
3343 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
3344 char *kaddr;
3345
3346 ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
3347
3348 shash->tfm = fs_info->csum_shash;
3349
3350 kaddr = kmap_local_page(page) + pgoff;
3351 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
3352 kunmap_local(kaddr);
3353
3354 if (memcmp(csum, csum_expected, fs_info->csum_size))
3355 return -EIO;
3356 return 0;
3357 }
3358
3359 /*
3360 * Verify the checksum of a single data sector.
3361 *
3362 * @bbio: btrfs_io_bio which contains the csum
3363 * @dev: device the sector is on
3364 * @bio_offset: offset to the beginning of the bio (in bytes)
3365 * @bv: bio_vec to check
3366 *
3367 * Check if the checksum on a data block is valid. When a checksum mismatch is
3368 * detected, report the error and fill the corrupted range with zero.
3369 *
3370 * Return %true if the sector is ok or had no checksum to start with, else %false.
3371 */
3372 bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev,
3373 u32 bio_offset, struct bio_vec *bv)
3374 {
3375 struct btrfs_inode *inode = bbio->inode;
3376 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3377 u64 file_offset = bbio->file_offset + bio_offset;
3378 u64 end = file_offset + bv->bv_len - 1;
3379 u8 *csum_expected;
3380 u8 csum[BTRFS_CSUM_SIZE];
3381
3382 ASSERT(bv->bv_len == fs_info->sectorsize);
3383
3384 if (!bbio->csum)
3385 return true;
3386
3387 if (btrfs_is_data_reloc_root(inode->root) &&
3388 test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM,
3389 NULL)) {
3390 /* Skip the range without csum for data reloc inode */
3391 clear_extent_bits(&inode->io_tree, file_offset, end,
3392 EXTENT_NODATASUM);
3393 return true;
3394 }
3395
3396 csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) *
3397 fs_info->csum_size;
3398 if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum,
3399 csum_expected))
3400 goto zeroit;
3401 return true;
3402
3403 zeroit:
3404 btrfs_print_data_csum_error(inode, file_offset, csum, csum_expected,
3405 bbio->mirror_num);
3406 if (dev)
3407 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS);
3408 memzero_bvec(bv);
3409 return false;
3410 }
3411
3412 /*
3413 * Perform a delayed iput on @inode.
3414 *
3415 * @inode: The inode we want to perform iput on
3416 *
3417 * This function uses the generic vfs_inode::i_count to track whether we should
3418 * just decrement it (in case it's > 1) or if this is the last iput then link
3419 * the inode to the delayed iput machinery. Delayed iputs are processed at
3420 * transaction commit time/superblock commit/cleaner kthread.
3421 */
3422 void btrfs_add_delayed_iput(struct btrfs_inode *inode)
3423 {
3424 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3425 unsigned long flags;
3426
3427 if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1))
3428 return;
3429
3430 WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state));
3431 atomic_inc(&fs_info->nr_delayed_iputs);
3432 /*
3433 * Need to be irq safe here because we can be called from either an irq
3434 * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq
3435 * context.
3436 */
3437 spin_lock_irqsave(&fs_info->delayed_iput_lock, flags);
3438 ASSERT(list_empty(&inode->delayed_iput));
3439 list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs);
3440 spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags);
3441 if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags))
3442 wake_up_process(fs_info->cleaner_kthread);
3443 }
3444
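/*
 * Run a single delayed iput. Caller must hold fs_info->delayed_iput_lock;
 * the lock is dropped around the iput() and re-acquired before returning.
 */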
3445 static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info,
3446 struct btrfs_inode *inode)
3447 {
3448 list_del_init(&inode->delayed_iput);
3449 spin_unlock_irq(&fs_info->delayed_iput_lock);
3450 iput(&inode->vfs_inode);
3451 if (atomic_dec_and_test(&fs_info->nr_delayed_iputs))
3452 wake_up(&fs_info->delayed_iputs_wait);
3453 spin_lock_irq(&fs_info->delayed_iput_lock);
3454 }
3455
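/* Run the delayed iput for @inode now, if it is still queued. */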
3456 static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info,
3457 struct btrfs_inode *inode)
3458 {
3459 if (!list_empty(&inode->delayed_iput)) {
3460 spin_lock_irq(&fs_info->delayed_iput_lock);
3461 if (!list_empty(&inode->delayed_iput))
3462 run_delayed_iput_locked(fs_info, inode);
3463 spin_unlock_irq(&fs_info->delayed_iput_lock);
3464 }
3465 }
3466
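/* Process every inode currently queued on the delayed iputs list. */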
3467 void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
3468 {
3469 /*
3470 * btrfs_put_ordered_extent() can run in irq context (see bio.c), which
3471 * calls btrfs_add_delayed_iput() and that needs to lock
3472 * fs_info->delayed_iput_lock. So we need to disable irqs here to
3473 * prevent a deadlock.
3474 */
3475 spin_lock_irq(&fs_info->delayed_iput_lock);
3476 while (!list_empty(&fs_info->delayed_iputs)) {
3477 struct btrfs_inode *inode;
3478
3479 inode = list_first_entry(&fs_info->delayed_iputs,
3480 struct btrfs_inode, delayed_iput);
3481 run_delayed_iput_locked(fs_info, inode);
3482 if (need_resched()) {
3483 spin_unlock_irq(&fs_info->delayed_iput_lock);
3484 cond_resched();
3485 spin_lock_irq(&fs_info->delayed_iput_lock);
3486 }
3487 }
3488 spin_unlock_irq(&fs_info->delayed_iput_lock);
3489 }
3490
3491 /*
3492  * Wait for all delayed iputs to finish
3493 *
3494 * @fs_info: the filesystem
3495 *
3496 * This will wait on any delayed iputs that are currently running with KILLABLE
3497 * set. Once they are all done running we will return, unless we are killed in
3498 * which case we return EINTR. This helps in user operations like fallocate etc
3499 * that might get blocked on the iputs.
3500 *
3501 * Return EINTR if we were killed, 0 if nothing's pending
3502 */
3503 int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info)
3504 {
3505 int ret = wait_event_killable(fs_info->delayed_iputs_wait,
3506 atomic_read(&fs_info->nr_delayed_iputs) == 0);
3507 if (ret)
3508 return -EINTR;
3509 return 0;
3510 }
3511
3512 /*
3513 * This creates an orphan entry for the given inode in case something goes wrong
3514 * in the middle of an unlink.
3515 */
3516 int btrfs_orphan_add(struct btrfs_trans_handle *trans,
3517 struct btrfs_inode *inode)
3518 {
3519 int ret;
3520
3521 ret = btrfs_insert_orphan_item(trans, inode->root, btrfs_ino(inode));
3522 if (ret && ret != -EEXIST) {
3523 btrfs_abort_transaction(trans, ret);
3524 return ret;
3525 }
3526
3527 return 0;
3528 }
3529
3530 /*
3531 * We have done the delete so we can go ahead and remove the orphan item for
3532 * this particular inode.
3533 */
3534 static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
3535 struct btrfs_inode *inode)
3536 {
3537 return btrfs_del_orphan_item(trans, inode->root, btrfs_ino(inode));
3538 }
3539
3540 /*
3541 * this cleans up any orphans that may be left on the list from the last use
3542 * of this root.
3543 */
3544 int btrfs_orphan_cleanup(struct btrfs_root *root)
3545 {
3546 struct btrfs_fs_info *fs_info = root->fs_info;
3547 struct btrfs_path *path;
3548 struct extent_buffer *leaf;
3549 struct btrfs_key key, found_key;
3550 struct btrfs_trans_handle *trans;
3551 u64 last_objectid = 0;
3552 int ret = 0, nr_unlink = 0;
3553
3554 if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
3555 return 0;
3556
3557 path = btrfs_alloc_path();
3558 if (!path) {
3559 ret = -ENOMEM;
3560 goto out;
3561 }
3562 path->reada = READA_BACK;
3563
3564 key.objectid = BTRFS_ORPHAN_OBJECTID;
3565 key.type = BTRFS_ORPHAN_ITEM_KEY;
3566 key.offset = (u64)-1;
3567
3568 while (1) {
3569 struct btrfs_inode *inode;
3570
3571 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3572 if (ret < 0)
3573 goto out;
3574
3575 /*
3576 		 * ret == 0 means we found what we were searching for, which is
3577 		 * weird, but possible, so only screw with the path if we didn't
3578 * find the key and see if we have stuff that matches
3579 */
3580 if (ret > 0) {
3581 ret = 0;
3582 if (path->slots[0] == 0)
3583 break;
3584 path->slots[0]--;
3585 }
3586
3587 /* pull out the item */
3588 leaf = path->nodes[0];
3589 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3590
3591 /* make sure the item matches what we want */
3592 if (found_key.objectid != BTRFS_ORPHAN_OBJECTID)
3593 break;
3594 if (found_key.type != BTRFS_ORPHAN_ITEM_KEY)
3595 break;
3596
3597 /* release the path since we're done with it */
3598 btrfs_release_path(path);
3599
3600 /*
3601 * this is where we are basically btrfs_lookup, without the
3602 * crossing root thing. we store the inode number in the
3603 * offset of the orphan item.
3604 */
3605
3606 if (found_key.offset == last_objectid) {
3607 /*
3608 * We found the same inode as before. This means we were
3609 * not able to remove its items via eviction triggered
3610 * by an iput(). A transaction abort may have happened,
3611 * due to -ENOSPC for example, so try to grab the error
3612 			 * that led to the transaction abort, if any.
3613 */
3614 btrfs_err(fs_info,
3615 "Error removing orphan entry, stopping orphan cleanup");
3616 ret = BTRFS_FS_ERROR(fs_info) ?: -EINVAL;
3617 goto out;
3618 }
3619
3620 last_objectid = found_key.offset;
3621
3622 found_key.objectid = found_key.offset;
3623 found_key.type = BTRFS_INODE_ITEM_KEY;
3624 found_key.offset = 0;
3625 inode = btrfs_iget(last_objectid, root);
3626 if (IS_ERR(inode)) {
3627 ret = PTR_ERR(inode);
3628 inode = NULL;
3629 if (ret != -ENOENT)
3630 goto out;
3631 }
3632
3633 if (!inode && root == fs_info->tree_root) {
3634 struct btrfs_root *dead_root;
3635 int is_dead_root = 0;
3636
3637 /*
3638 * This is an orphan in the tree root. Currently these
3639 * could come from 2 sources:
3640 * a) a root (snapshot/subvolume) deletion in progress
3641 * b) a free space cache inode
3642 * We need to distinguish those two, as the orphan item
3643 * for a root must not get deleted before the deletion
3644 * of the snapshot/subvolume's tree completes.
3645 *
3646 * btrfs_find_orphan_roots() ran before us, which has
3647 * found all deleted roots and loaded them into
3648 * fs_info->fs_roots_radix. So here we can find if an
3649 * orphan item corresponds to a deleted root by looking
3650 * up the root from that radix tree.
3651 */
3652
3653 spin_lock(&fs_info->fs_roots_radix_lock);
3654 dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
3655 (unsigned long)found_key.objectid);
3656 if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
3657 is_dead_root = 1;
3658 spin_unlock(&fs_info->fs_roots_radix_lock);
3659
3660 if (is_dead_root) {
3661 /* prevent this orphan from being found again */
3662 key.offset = found_key.objectid - 1;
3663 continue;
3664 }
3665
3666 }
3667
3668 /*
3669 * If we have an inode with links, there are a couple of
3670 * possibilities:
3671 *
3672 * 1. We were halfway through creating fsverity metadata for the
3673 * file. In that case, the orphan item represents incomplete
3674 * fsverity metadata which must be cleaned up with
3675 * btrfs_drop_verity_items and deleting the orphan item.
3676 		 *
3677 * 2. Old kernels (before v3.12) used to create an
3678 * orphan item for truncate indicating that there were possibly
3679 * extent items past i_size that needed to be deleted. In v3.12,
3680 * truncate was changed to update i_size in sync with the extent
3681 * items, but the (useless) orphan item was still created. Since
3682 * v4.18, we don't create the orphan item for truncate at all.
3683 *
3684 * So, this item could mean that we need to do a truncate, but
3685 * only if this filesystem was last used on a pre-v3.12 kernel
3686 * and was not cleanly unmounted. The odds of that are quite
3687 * slim, and it's a pain to do the truncate now, so just delete
3688 * the orphan item.
3689 *
3690 * It's also possible that this orphan item was supposed to be
3691 * deleted but wasn't. The inode number may have been reused,
3692 * but either way, we can delete the orphan item.
3693 */
3694 if (!inode || inode->vfs_inode.i_nlink) {
3695 if (inode) {
3696 ret = btrfs_drop_verity_items(inode);
3697 iput(&inode->vfs_inode);
3698 inode = NULL;
3699 if (ret)
3700 goto out;
3701 }
3702 trans = btrfs_start_transaction(root, 1);
3703 if (IS_ERR(trans)) {
3704 ret = PTR_ERR(trans);
3705 goto out;
3706 }
3707 btrfs_debug(fs_info, "auto deleting %Lu",
3708 found_key.objectid);
3709 ret = btrfs_del_orphan_item(trans, root,
3710 found_key.objectid);
3711 btrfs_end_transaction(trans);
3712 if (ret)
3713 goto out;
3714 continue;
3715 }
3716
3717 nr_unlink++;
3718
3719 /* this will do delete_inode and everything for us */
3720 iput(&inode->vfs_inode);
3721 }
3722 /* release the path since we're done with it */
3723 btrfs_release_path(path);
3724
3725 if (test_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &root->state)) {
3726 trans = btrfs_join_transaction(root);
3727 if (!IS_ERR(trans))
3728 btrfs_end_transaction(trans);
3729 }
3730
3731 if (nr_unlink)
3732 btrfs_debug(fs_info, "unlinked %d orphans", nr_unlink);
3733
3734 out:
3735 if (ret)
3736 btrfs_err(fs_info, "could not do orphan cleanup %d", ret);
3737 btrfs_free_path(path);
3738 return ret;
3739 }
3740
3741 /*
3742 * very simple check to peek ahead in the leaf looking for xattrs. If we
3743 * don't find any xattrs, we know there can't be any acls.
3744 *
3745 * slot is the slot the inode is in, objectid is the objectid of the inode
3746 */
3747 static noinline int acls_after_inode_item(struct extent_buffer *leaf,
3748 int slot, u64 objectid,
3749 int *first_xattr_slot)
3750 {
3751 u32 nritems = btrfs_header_nritems(leaf);
3752 struct btrfs_key found_key;
3753 static u64 xattr_access = 0;
3754 static u64 xattr_default = 0;
3755 int scanned = 0;
3756
3757 if (!xattr_access) {
3758 xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
3759 strlen(XATTR_NAME_POSIX_ACL_ACCESS));
3760 xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
3761 strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
3762 }
3763
3764 slot++;
3765 *first_xattr_slot = -1;
3766 while (slot < nritems) {
3767 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3768
3769 /* we found a different objectid, there must not be acls */
3770 if (found_key.objectid != objectid)
3771 return 0;
3772
3773 /* we found an xattr, assume we've got an acl */
3774 if (found_key.type == BTRFS_XATTR_ITEM_KEY) {
3775 if (*first_xattr_slot == -1)
3776 *first_xattr_slot = slot;
3777 if (found_key.offset == xattr_access ||
3778 found_key.offset == xattr_default)
3779 return 1;
3780 }
3781
3782 /*
3783 * we found a key greater than an xattr key, there can't
3784 * be any acls later on
3785 */
3786 if (found_key.type > BTRFS_XATTR_ITEM_KEY)
3787 return 0;
3788
3789 slot++;
3790 scanned++;
3791
3792 /*
3793 * it goes inode, inode backrefs, xattrs, extents,
3794 * so if there are a ton of hard links to an inode there can
3795 * be a lot of backrefs. Don't waste time searching too hard,
3796 * this is just an optimization
3797 */
3798 if (scanned >= 8)
3799 break;
3800 }
3801 /* we hit the end of the leaf before we found an xattr or
3802 * something larger than an xattr. We have to assume the inode
3803 * has acls
3804 */
3805 if (*first_xattr_slot == -1)
3806 *first_xattr_slot = slot;
3807 return 1;
3808 }
3809
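/*
 * Allocate and initialize the per-inode file extent io tree. It is only
 * needed for regular files on filesystems without the NO_HOLES feature,
 * and is skipped for free space inodes.
 */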
3810 static int btrfs_init_file_extent_tree(struct btrfs_inode *inode)
3811 {
3812 struct btrfs_fs_info *fs_info = inode->root->fs_info;
3813
3814 if (WARN_ON_ONCE(inode->file_extent_tree))
3815 return 0;
3816 if (btrfs_fs_incompat(fs_info, NO_HOLES))
3817 return 0;
3818 if (!S_ISREG(inode->vfs_inode.i_mode))
3819 return 0;
3820 if (btrfs_is_free_space_inode(inode))
3821 return 0;
3822
3823 inode->file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL);
3824 if (!inode->file_extent_tree)
3825 return -ENOMEM;
3826
3827 extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT);
3828 /* Lockdep class is set only for the file extent tree. */
3829 lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class);
3830
3831 return 0;
3832 }
3833
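/*
 * Add the inode to its root's xarray of in-memory inodes. With @prealloc
 * set, the slot is reserved first so the GFP_ATOMIC store below cannot
 * fail for lack of memory.
 */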
3834 static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc)
3835 {
3836 struct btrfs_root *root = inode->root;
3837 struct btrfs_inode *existing;
3838 const u64 ino = btrfs_ino(inode);
3839 int ret;
3840
3841 if (inode_unhashed(&inode->vfs_inode))
3842 return 0;
3843
3844 if (prealloc) {
3845 ret = xa_reserve(&root->inodes, ino, GFP_NOFS);
3846 if (ret)
3847 return ret;
3848 }
3849
3850 existing = xa_store(&root->inodes, ino, inode, GFP_ATOMIC);
3851
3852 if (xa_is_err(existing)) {
3853 ret = xa_err(existing);
3854 ASSERT(ret != -EINVAL);
3855 ASSERT(ret != -ENOMEM);
3856 return ret;
3857 } else if (existing) {
3858 WARN_ON(!(existing->vfs_inode.i_state & (I_WILL_FREE | I_FREEING)));
3859 }
3860
3861 return 0;
3862 }
3863
3864 /*
3865 * Read a locked inode from the btree into the in-memory inode and add it to
3866 * its root list/tree.
3867 *
3868 * On failure clean up the inode.
3869 */
3870 static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path)
3871 {
3872 struct btrfs_root *root = inode->root;
3873 struct btrfs_fs_info *fs_info = root->fs_info;
3874 struct extent_buffer *leaf;
3875 struct btrfs_inode_item *inode_item;
3876 struct inode *vfs_inode = &inode->vfs_inode;
3877 struct btrfs_key location;
3878 unsigned long ptr;
3879 int maybe_acls;
3880 u32 rdev;
3881 int ret;
3882 bool filled = false;
3883 int first_xattr_slot;
3884
3885 ret = btrfs_init_file_extent_tree(inode);
3886 if (ret)
3887 goto out;
3888
3889 ret = btrfs_fill_inode(inode, &rdev);
3890 if (!ret)
3891 filled = true;
3892
3893 ASSERT(path);
3894
3895 btrfs_get_inode_key(inode, &location);
3896
3897 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
3898 if (ret) {
3899 /*
3900 * ret > 0 can come from btrfs_search_slot called by
3901 * btrfs_lookup_inode(), this means the inode was not found.
3902 */
3903 if (ret > 0)
3904 ret = -ENOENT;
3905 goto out;
3906 }
3907
3908 leaf = path->nodes[0];
3909
3910 if (filled)
3911 goto cache_index;
3912
3913 inode_item = btrfs_item_ptr(leaf, path->slots[0],
3914 struct btrfs_inode_item);
3915 vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item);
3916 set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item));
3917 i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item));
3918 i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item));
3919 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3920 btrfs_inode_set_file_extent_range(inode, 0,
3921 round_up(i_size_read(vfs_inode), fs_info->sectorsize));
3922
3923 inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime),
3924 btrfs_timespec_nsec(leaf, &inode_item->atime));
3925
3926 inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime),
3927 btrfs_timespec_nsec(leaf, &inode_item->mtime));
3928
3929 inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime),
3930 btrfs_timespec_nsec(leaf, &inode_item->ctime));
3931
3932 inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime);
3933 inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime);
3934
3935 inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item));
3936 inode->generation = btrfs_inode_generation(leaf, inode_item);
3937 inode->last_trans = btrfs_inode_transid(leaf, inode_item);
3938
3939 inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item));
3940 vfs_inode->i_generation = inode->generation;
3941 vfs_inode->i_rdev = 0;
3942 rdev = btrfs_inode_rdev(leaf, inode_item);
3943
3944 if (S_ISDIR(vfs_inode->i_mode))
3945 inode->index_cnt = (u64)-1;
3946
3947 btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item),
3948 &inode->flags, &inode->ro_flags);
3949 btrfs_update_inode_mapping_flags(inode);
3950
3951 cache_index:
3952 /*
3953 * If we were modified in the current generation and evicted from memory
3954 * and then re-read we need to do a full sync since we don't have any
3955 * idea about which extents were modified before we were evicted from
3956 * cache.
3957 *
3958 * This is required for both inode re-read from disk and delayed inode
3959 * in the delayed_nodes xarray.
3960 */
3961 if (inode->last_trans == btrfs_get_fs_generation(fs_info))
3962 set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
3963
3964 /*
3965 * We don't persist the id of the transaction where an unlink operation
3966 * against the inode was last made. So here we assume the inode might
3967 * have been evicted, and therefore the exact value of last_unlink_trans
3968 * lost, and set it to last_trans to avoid metadata inconsistencies
3969 * between the inode and its parent if the inode is fsync'ed and the log
3970 * replayed. For example, in the scenario:
3971 *
3972 * touch mydir/foo
3973 * ln mydir/foo mydir/bar
3974 * sync
3975 * unlink mydir/bar
3976 * echo 2 > /proc/sys/vm/drop_caches # evicts inode
3977 * xfs_io -c fsync mydir/foo
3978 * <power failure>
3979 * mount fs, triggers fsync log replay
3980 *
3981 * We must make sure that when we fsync our inode foo we also log its
3982 * parent inode, otherwise after log replay the parent still has the
3983 * dentry with the "bar" name but our inode foo has a link count of 1
3984 * and doesn't have an inode ref with the name "bar" anymore.
3985 *
3986 * Setting last_unlink_trans to last_trans is a pessimistic approach,
3987 * but it guarantees correctness at the expense of occasional full
3988 * transaction commits on fsync if our inode is a directory, or if our
3989 * inode is not a directory, logging its parent unnecessarily.
3990 */
3991 inode->last_unlink_trans = inode->last_trans;
3992
3993 /*
3994 * Same logic as for last_unlink_trans. We don't persist the generation
3995 * of the last transaction where this inode was used for a reflink
3996 * operation, so after eviction and reloading the inode we must be
3997 * pessimistic and assume the last transaction that modified the inode.
3998 */
3999 inode->last_reflink_trans = inode->last_trans;
4000
4001 path->slots[0]++;
4002 if (vfs_inode->i_nlink != 1 ||
4003 path->slots[0] >= btrfs_header_nritems(leaf))
4004 goto cache_acl;
4005
4006 btrfs_item_key_to_cpu(leaf, &location, path->slots[0]);
4007 if (location.objectid != btrfs_ino(inode))
4008 goto cache_acl;
4009
4010 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4011 if (location.type == BTRFS_INODE_REF_KEY) {
4012 struct btrfs_inode_ref *ref;
4013
4014 ref = (struct btrfs_inode_ref *)ptr;
4015 inode->dir_index = btrfs_inode_ref_index(leaf, ref);
4016 } else if (location.type == BTRFS_INODE_EXTREF_KEY) {
4017 struct btrfs_inode_extref *extref;
4018
4019 extref = (struct btrfs_inode_extref *)ptr;
4020 inode->dir_index = btrfs_inode_extref_index(leaf, extref);
4021 }
4022 cache_acl:
4023 /*
4024 * try to precache a NULL acl entry for files that don't have
4025 * any xattrs or acls
4026 */
4027 maybe_acls = acls_after_inode_item(leaf, path->slots[0],
4028 btrfs_ino(inode), &first_xattr_slot);
4029 if (first_xattr_slot != -1) {
4030 path->slots[0] = first_xattr_slot;
4031 ret = btrfs_load_inode_props(inode, path);
4032 if (ret)
4033 btrfs_err(fs_info,
4034 "error loading props for ino %llu (root %llu): %d",
4035 btrfs_ino(inode), btrfs_root_id(root), ret);
4036 }
4037
4038 if (!maybe_acls)
4039 cache_no_acl(vfs_inode);
4040
4041 switch (vfs_inode->i_mode & S_IFMT) {
4042 case S_IFREG:
4043 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4044 vfs_inode->i_fop = &btrfs_file_operations;
4045 vfs_inode->i_op = &btrfs_file_inode_operations;
4046 break;
4047 case S_IFDIR:
4048 vfs_inode->i_fop = &btrfs_dir_file_operations;
4049 vfs_inode->i_op = &btrfs_dir_inode_operations;
4050 break;
4051 case S_IFLNK:
4052 vfs_inode->i_op = &btrfs_symlink_inode_operations;
4053 inode_nohighmem(vfs_inode);
4054 vfs_inode->i_mapping->a_ops = &btrfs_aops;
4055 break;
4056 default:
4057 vfs_inode->i_op = &btrfs_special_inode_operations;
4058 init_special_inode(vfs_inode, vfs_inode->i_mode, rdev);
4059 break;
4060 }
4061
4062 btrfs_sync_inode_flags_to_i_flags(inode);
4063
4064 ret = btrfs_add_inode_to_root(inode, true);
4065 if (ret)
4066 goto out;
4067
4068 return 0;
4069 out:
4070 iget_failed(vfs_inode);
4071 return ret;
4072 }
4073
4074 /*
4075 * given a leaf and an inode, copy the inode fields into the leaf
4076 */
4077 static void fill_inode_item(struct btrfs_trans_handle *trans,
4078 struct extent_buffer *leaf,
4079 struct btrfs_inode_item *item,
4080 struct inode *inode)
4081 {
4082 struct btrfs_map_token token;
4083 u64 flags;
4084
4085 btrfs_init_map_token(&token, leaf);
4086
4087 btrfs_set_token_inode_uid(&token, item, i_uid_read(inode));
4088 btrfs_set_token_inode_gid(&token, item, i_gid_read(inode));
4089 btrfs_set_token_inode_size(&token, item, BTRFS_I(inode)->disk_i_size);
4090 btrfs_set_token_inode_mode(&token, item, inode->i_mode);
4091 btrfs_set_token_inode_nlink(&token, item, inode->i_nlink);
4092
4093 btrfs_set_token_timespec_sec(&token, &item->atime,
4094 inode_get_atime_sec(inode));
4095 btrfs_set_token_timespec_nsec(&token, &item->atime,
4096 inode_get_atime_nsec(inode));
4097
4098 btrfs_set_token_timespec_sec(&token, &item->mtime,
4099 inode_get_mtime_sec(inode));
4100 btrfs_set_token_timespec_nsec(&token, &item->mtime,
4101 inode_get_mtime_nsec(inode));
4102
4103 btrfs_set_token_timespec_sec(&token, &item->ctime,
4104 inode_get_ctime_sec(inode));
4105 btrfs_set_token_timespec_nsec(&token, &item->ctime,
4106 inode_get_ctime_nsec(inode));
4107
4108 btrfs_set_token_timespec_sec(&token, &item->otime, BTRFS_I(inode)->i_otime_sec);
4109 btrfs_set_token_timespec_nsec(&token, &item->otime, BTRFS_I(inode)->i_otime_nsec);
4110
4111 btrfs_set_token_inode_nbytes(&token, item, inode_get_bytes(inode));
4112 btrfs_set_token_inode_generation(&token, item,
4113 BTRFS_I(inode)->generation);
4114 btrfs_set_token_inode_sequence(&token, item, inode_peek_iversion(inode));
4115 btrfs_set_token_inode_transid(&token, item, trans->transid);
4116 btrfs_set_token_inode_rdev(&token, item, inode->i_rdev);
4117 flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags,
4118 BTRFS_I(inode)->ro_flags);
4119 btrfs_set_token_inode_flags(&token, item, flags);
4120 btrfs_set_token_inode_block_group(&token, item, 0);
4121 }
4122
4123 /*
4124 * copy everything in the in-memory inode into the btree.
4125 */
4126 static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans,
4127 struct btrfs_inode *inode)
4128 {
4129 struct btrfs_inode_item *inode_item;
4130 struct btrfs_path *path;
4131 struct extent_buffer *leaf;
4132 struct btrfs_key key;
4133 int ret;
4134
4135 path = btrfs_alloc_path();
4136 if (!path)
4137 return -ENOMEM;
4138
4139 btrfs_get_inode_key(inode, &key);
4140 ret = btrfs_lookup_inode(trans, inode->root, path, &key, 1);
4141 if (ret) {
4142 if (ret > 0)
4143 ret = -ENOENT;
4144 goto failed;
4145 }
4146
4147 leaf = path->nodes[0];
4148 inode_item = btrfs_item_ptr(leaf, path->slots[0],
4149 struct btrfs_inode_item);
4150
4151 fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode);
4152 btrfs_set_inode_last_trans(trans, inode);
4153 ret = 0;
4154 failed:
4155 btrfs_free_path(path);
4156 return ret;
4157 }
4158
4159 /*
4160 * copy everything in the in-memory inode into the btree.
4161 */
4162 int btrfs_update_inode(struct btrfs_trans_handle *trans,
4163 struct btrfs_inode *inode)
4164 {
4165 struct btrfs_root *root = inode->root;
4166 struct btrfs_fs_info *fs_info = root->fs_info;
4167 int ret;
4168
4169 /*
4170 * If the inode is a free space inode, we can deadlock during commit
4171 * if we put it into the delayed code.
4172 *
4173 * The data relocation inode should also be directly updated
4174 * without delay
4175 */
4176 if (!btrfs_is_free_space_inode(inode)
4177 && !btrfs_is_data_reloc_root(root)
4178 && !test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) {
4179 btrfs_update_root_times(trans, root);
4180
4181 ret = btrfs_delayed_update_inode(trans, inode);
4182 if (!ret)
4183 btrfs_set_inode_last_trans(trans, inode);
4184 return ret;
4185 }
4186
4187 return btrfs_update_inode_item(trans, inode);
4188 }
4189
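/*
 * Like btrfs_update_inode(), but on -ENOSPC fall back to updating the inode
 * item directly instead of going through the delayed inode machinery.
 */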
4190 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
4191 struct btrfs_inode *inode)
4192 {
4193 int ret;
4194
4195 ret = btrfs_update_inode(trans, inode);
4196 if (ret == -ENOSPC)
4197 return btrfs_update_inode_item(trans, inode);
4198 return ret;
4199 }
4200
4201 /*
4202 * unlink helper that gets used here in inode.c and in the tree logging
4203  * recovery code. It removes a link in a directory with a given name, and
4204 * also drops the back refs in the inode to the directory
4205 */
4206 static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4207 struct btrfs_inode *dir,
4208 struct btrfs_inode *inode,
4209 const struct fscrypt_str *name,
4210 struct btrfs_rename_ctx *rename_ctx)
4211 {
4212 struct btrfs_root *root = dir->root;
4213 struct btrfs_fs_info *fs_info = root->fs_info;
4214 struct btrfs_path *path;
4215 int ret = 0;
4216 struct btrfs_dir_item *di;
4217 u64 index;
4218 u64 ino = btrfs_ino(inode);
4219 u64 dir_ino = btrfs_ino(dir);
4220
4221 path = btrfs_alloc_path();
4222 if (!path) {
4223 ret = -ENOMEM;
4224 goto out;
4225 }
4226
4227 di = btrfs_lookup_dir_item(trans, root, path, dir_ino, name, -1);
4228 if (IS_ERR_OR_NULL(di)) {
4229 ret = di ? PTR_ERR(di) : -ENOENT;
4230 goto err;
4231 }
4232 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4233 if (ret)
4234 goto err;
4235 btrfs_release_path(path);
4236
4237 /*
4238 	 * If we don't have the dir index cached, we have to get it by looking
4239 	 * up the inode ref. Since we read the inode ref anyway, remove it
4240 	 * directly, there is no point in delaying its deletion.
4241 	 *
4242 	 * But if we do have the dir index, there is no need to search the
4243 	 * inode ref to get it. Since the inode ref is close to the inode
4244 	 * item, it is better to delay its deletion and do it when we update
4245 	 * the inode item.
4246 */
4247 if (inode->dir_index) {
4248 ret = btrfs_delayed_delete_inode_ref(inode);
4249 if (!ret) {
4250 index = inode->dir_index;
4251 goto skip_backref;
4252 }
4253 }
4254
4255 ret = btrfs_del_inode_ref(trans, root, name, ino, dir_ino, &index);
4256 if (ret) {
4257 btrfs_info(fs_info,
4258 "failed to delete reference to %.*s, inode %llu parent %llu",
4259 name->len, name->name, ino, dir_ino);
4260 btrfs_abort_transaction(trans, ret);
4261 goto err;
4262 }
4263 skip_backref:
4264 if (rename_ctx)
4265 rename_ctx->index = index;
4266
4267 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4268 if (ret) {
4269 btrfs_abort_transaction(trans, ret);
4270 goto err;
4271 }
4272
4273 /*
4274 * If we are in a rename context, we don't need to update anything in the
4275 * log. That will be done later during the rename by btrfs_log_new_name().
4276 * Besides that, doing it here would only cause extra unnecessary btree
4277 * operations on the log tree, increasing latency for applications.
4278 */
4279 if (!rename_ctx) {
4280 btrfs_del_inode_ref_in_log(trans, root, name, inode, dir_ino);
4281 btrfs_del_dir_entries_in_log(trans, root, name, dir, index);
4282 }
4283
4284 /*
4285 * If we have a pending delayed iput we could end up with the final iput
4286 * being run in btrfs-cleaner context. If we have enough of these built
4287 * up we can end up burning a lot of time in btrfs-cleaner without any
4288 * way to throttle the unlinks. Since we're currently holding a ref on
4289 * the inode we can run the delayed iput here without any issues as the
4290 * final iput won't be done until after we drop the ref we're currently
4291 * holding.
4292 */
4293 btrfs_run_delayed_iput(fs_info, inode);
4294 err:
4295 btrfs_free_path(path);
4296 if (ret)
4297 goto out;
4298
4299 btrfs_i_size_write(dir, dir->vfs_inode.i_size - name->len * 2);
4300 inode_inc_iversion(&inode->vfs_inode);
4301 inode_set_ctime_current(&inode->vfs_inode);
4302 inode_inc_iversion(&dir->vfs_inode);
4303 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4304 ret = btrfs_update_inode(trans, dir);
4305 out:
4306 return ret;
4307 }
4308
4309 int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
4310 struct btrfs_inode *dir, struct btrfs_inode *inode,
4311 const struct fscrypt_str *name)
4312 {
4313 int ret;
4314
4315 ret = __btrfs_unlink_inode(trans, dir, inode, name, NULL);
4316 if (!ret) {
4317 drop_nlink(&inode->vfs_inode);
4318 ret = btrfs_update_inode(trans, inode);
4319 }
4320 return ret;
4321 }
4322
4323 /*
4324 * helper to start transaction for unlink and rmdir.
4325 *
4326 * unlink and rmdir are special in btrfs, they do not always free space, so
4327  * if we cannot make our reservations the normal way, try to see if there is
4328 * plenty of slack room in the global reserve to migrate, otherwise we cannot
4329 * allow the unlink to occur.
4330 */
4331 static struct btrfs_trans_handle *__unlink_start_trans(struct btrfs_inode *dir)
4332 {
4333 struct btrfs_root *root = dir->root;
4334
4335 return btrfs_start_transaction_fallback_global_rsv(root,
4336 BTRFS_UNLINK_METADATA_UNITS);
4337 }
4338
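/*
 * Unlink a directory entry: drop the name and its references and, if the
 * inode's link count reaches zero, add an orphan item so the inode can be
 * fully removed on the final iput.
 */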
4339 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
4340 {
4341 struct btrfs_trans_handle *trans;
4342 struct inode *inode = d_inode(dentry);
4343 int ret;
4344 struct fscrypt_name fname;
4345
4346 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4347 if (ret)
4348 return ret;
4349
4350 /* This needs to handle no-key deletions later on */
4351
4352 trans = __unlink_start_trans(BTRFS_I(dir));
4353 if (IS_ERR(trans)) {
4354 ret = PTR_ERR(trans);
4355 goto fscrypt_free;
4356 }
4357
4358 btrfs_record_unlink_dir(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4359 false);
4360
4361 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4362 &fname.disk_name);
4363 if (ret)
4364 goto end_trans;
4365
4366 if (inode->i_nlink == 0) {
4367 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4368 if (ret)
4369 goto end_trans;
4370 }
4371
4372 end_trans:
4373 btrfs_end_transaction(trans);
4374 btrfs_btree_balance_dirty(BTRFS_I(dir)->root->fs_info);
4375 fscrypt_free:
4376 fscrypt_free_filename(&fname);
4377 return ret;
4378 }
4379
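/*
 * Remove a directory entry that points at a subvolume: delete the dir item,
 * drop the root ref (or just look up the dir index for a snapshot
 * placeholder inode), delete the delayed dir index and update the parent
 * directory inode.
 */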
4380 static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
4381 struct btrfs_inode *dir, struct dentry *dentry)
4382 {
4383 struct btrfs_root *root = dir->root;
4384 struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
4385 struct btrfs_path *path;
4386 struct extent_buffer *leaf;
4387 struct btrfs_dir_item *di;
4388 struct btrfs_key key;
4389 u64 index;
4390 int ret;
4391 u64 objectid;
4392 u64 dir_ino = btrfs_ino(dir);
4393 struct fscrypt_name fname;
4394
4395 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
4396 if (ret)
4397 return ret;
4398
4399 /* This needs to handle no-key deletions later on */
4400
4401 if (btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID) {
4402 objectid = btrfs_root_id(inode->root);
4403 } else if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4404 objectid = inode->ref_root_id;
4405 } else {
4406 WARN_ON(1);
4407 fscrypt_free_filename(&fname);
4408 return -EINVAL;
4409 }
4410
4411 path = btrfs_alloc_path();
4412 if (!path) {
4413 ret = -ENOMEM;
4414 goto out;
4415 }
4416
4417 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
4418 &fname.disk_name, -1);
4419 if (IS_ERR_OR_NULL(di)) {
4420 ret = di ? PTR_ERR(di) : -ENOENT;
4421 goto out;
4422 }
4423
4424 leaf = path->nodes[0];
4425 btrfs_dir_item_key_to_cpu(leaf, di, &key);
4426 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
4427 ret = btrfs_delete_one_dir_name(trans, root, path, di);
4428 if (ret) {
4429 btrfs_abort_transaction(trans, ret);
4430 goto out;
4431 }
4432 btrfs_release_path(path);
4433
4434 /*
4435 * This is a placeholder inode for a subvolume we didn't have a
4436 * reference to at the time of the snapshot creation. In the meantime
4437 * we could have renamed the real subvol link into our snapshot, so
4438 * depending on btrfs_del_root_ref to return -ENOENT here is incorrect.
4439 * Instead simply lookup the dir_index_item for this entry so we can
4440 * remove it. Otherwise we know we have a ref to the root and we can
4441 * call btrfs_del_root_ref, and it _shouldn't_ fail.
4442 */
4443 if (btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) {
4444 di = btrfs_search_dir_index_item(root, path, dir_ino, &fname.disk_name);
4445 if (IS_ERR(di)) {
4446 ret = PTR_ERR(di);
4447 btrfs_abort_transaction(trans, ret);
4448 goto out;
4449 }
4450
4451 leaf = path->nodes[0];
4452 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4453 index = key.offset;
4454 btrfs_release_path(path);
4455 } else {
4456 ret = btrfs_del_root_ref(trans, objectid,
4457 btrfs_root_id(root), dir_ino,
4458 &index, &fname.disk_name);
4459 if (ret) {
4460 btrfs_abort_transaction(trans, ret);
4461 goto out;
4462 }
4463 }
4464
4465 ret = btrfs_delete_delayed_dir_index(trans, dir, index);
4466 if (ret) {
4467 btrfs_abort_transaction(trans, ret);
4468 goto out;
4469 }
4470
4471 btrfs_i_size_write(dir, dir->vfs_inode.i_size - fname.disk_name.len * 2);
4472 inode_inc_iversion(&dir->vfs_inode);
4473 inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
4474 ret = btrfs_update_inode_fallback(trans, dir);
4475 if (ret)
4476 btrfs_abort_transaction(trans, ret);
4477 out:
4478 btrfs_free_path(path);
4479 fscrypt_free_filename(&fname);
4480 return ret;
4481 }
4482
4483 /*
4484 * Helper to check if the subvolume references other subvolumes or if it's
4485  * the default subvolume.
4486 */
4487 static noinline int may_destroy_subvol(struct btrfs_root *root)
4488 {
4489 struct btrfs_fs_info *fs_info = root->fs_info;
4490 struct btrfs_path *path;
4491 struct btrfs_dir_item *di;
4492 struct btrfs_key key;
4493 struct fscrypt_str name = FSTR_INIT("default", 7);
4494 u64 dir_id;
4495 int ret;
4496
4497 path = btrfs_alloc_path();
4498 if (!path)
4499 return -ENOMEM;
4500
4501 /* Make sure this root isn't set as the default subvol */
4502 dir_id = btrfs_super_root_dir(fs_info->super_copy);
4503 di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path,
4504 dir_id, &name, 0);
4505 if (di && !IS_ERR(di)) {
4506 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
4507 if (key.objectid == btrfs_root_id(root)) {
4508 ret = -EPERM;
4509 btrfs_err(fs_info,
4510 "deleting default subvolume %llu is not allowed",
4511 key.objectid);
4512 goto out;
4513 }
4514 btrfs_release_path(path);
4515 }
4516
4517 key.objectid = btrfs_root_id(root);
4518 key.type = BTRFS_ROOT_REF_KEY;
4519 key.offset = (u64)-1;
4520
4521 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4522 if (ret < 0)
4523 goto out;
4524 if (ret == 0) {
4525 /*
4526 * Key with offset -1 found, there would have to exist a root
4527 * with such id, but this is out of valid range.
4528 */
4529 ret = -EUCLEAN;
4530 goto out;
4531 }
4532
4533 ret = 0;
4534 if (path->slots[0] > 0) {
4535 path->slots[0]--;
4536 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
4537 if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY)
4538 ret = -ENOTEMPTY;
4539 }
4540 out:
4541 btrfs_free_path(path);
4542 return ret;
4543 }
4544
4545 /* Delete all dentries for inodes belonging to the root */
4546 static void btrfs_prune_dentries(struct btrfs_root *root)
4547 {
4548 struct btrfs_fs_info *fs_info = root->fs_info;
4549 struct btrfs_inode *inode;
4550 u64 min_ino = 0;
4551
4552 if (!BTRFS_FS_ERROR(fs_info))
4553 WARN_ON(btrfs_root_refs(&root->root_item) != 0);
4554
4555 inode = btrfs_find_first_inode(root, min_ino);
4556 while (inode) {
4557 if (atomic_read(&inode->vfs_inode.i_count) > 1)
4558 d_prune_aliases(&inode->vfs_inode);
4559
4560 min_ino = btrfs_ino(inode) + 1;
4561 /*
4562 * btrfs_drop_inode() will have it removed from the inode
4563 * cache when its usage count hits zero.
4564 */
4565 iput(&inode->vfs_inode);
4566 cond_resched();
4567 inode = btrfs_find_first_inode(root, min_ino);
4568 }
4569 }
4570
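/*
 * Delete a subvolume: unlink it from its parent directory, set the root's
 * refs to zero and insert an orphan item for it, so the actual tree
 * deletion can happen later.
 */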
4571 int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry)
4572 {
4573 struct btrfs_root *root = dir->root;
4574 struct btrfs_fs_info *fs_info = root->fs_info;
4575 struct inode *inode = d_inode(dentry);
4576 struct btrfs_root *dest = BTRFS_I(inode)->root;
4577 struct btrfs_trans_handle *trans;
4578 struct btrfs_block_rsv block_rsv;
4579 u64 root_flags;
4580 u64 qgroup_reserved = 0;
4581 int ret;
4582
4583 down_write(&fs_info->subvol_sem);
4584
4585 /*
4586 	 * Don't allow deleting a subvolume with a send operation in progress. This is
4587 * inside the inode lock so the error handling that has to drop the bit
4588 * again is not run concurrently.
4589 */
4590 spin_lock(&dest->root_item_lock);
4591 if (dest->send_in_progress) {
4592 spin_unlock(&dest->root_item_lock);
4593 btrfs_warn(fs_info,
4594 "attempt to delete subvolume %llu during send",
4595 btrfs_root_id(dest));
4596 ret = -EPERM;
4597 goto out_up_write;
4598 }
4599 if (atomic_read(&dest->nr_swapfiles)) {
4600 spin_unlock(&dest->root_item_lock);
4601 btrfs_warn(fs_info,
4602 "attempt to delete subvolume %llu with active swapfile",
4603 btrfs_root_id(root));
4604 ret = -EPERM;
4605 goto out_up_write;
4606 }
4607 root_flags = btrfs_root_flags(&dest->root_item);
4608 btrfs_set_root_flags(&dest->root_item,
4609 root_flags | BTRFS_ROOT_SUBVOL_DEAD);
4610 spin_unlock(&dest->root_item_lock);
4611
4612 ret = may_destroy_subvol(dest);
4613 if (ret)
4614 goto out_undead;
4615
4616 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
4617 /*
4618 * One for dir inode,
4619 * two for dir entries,
4620 * two for root ref/backref.
4621 */
4622 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 5, true);
4623 if (ret)
4624 goto out_undead;
4625 qgroup_reserved = block_rsv.qgroup_rsv_reserved;
4626
4627 trans = btrfs_start_transaction(root, 0);
4628 if (IS_ERR(trans)) {
4629 ret = PTR_ERR(trans);
4630 goto out_release;
4631 }
4632 btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
4633 qgroup_reserved = 0;
4634 trans->block_rsv = &block_rsv;
4635 trans->bytes_reserved = block_rsv.size;
4636
4637 btrfs_record_snapshot_destroy(trans, dir);
4638
4639 ret = btrfs_unlink_subvol(trans, dir, dentry);
4640 if (ret) {
4641 btrfs_abort_transaction(trans, ret);
4642 goto out_end_trans;
4643 }
4644
4645 ret = btrfs_record_root_in_trans(trans, dest);
4646 if (ret) {
4647 btrfs_abort_transaction(trans, ret);
4648 goto out_end_trans;
4649 }
4650
4651 memset(&dest->root_item.drop_progress, 0,
4652 sizeof(dest->root_item.drop_progress));
4653 btrfs_set_root_drop_level(&dest->root_item, 0);
4654 btrfs_set_root_refs(&dest->root_item, 0);
4655
4656 if (!test_and_set_bit(BTRFS_ROOT_ORPHAN_ITEM_INSERTED, &dest->state)) {
4657 ret = btrfs_insert_orphan_item(trans,
4658 fs_info->tree_root,
4659 btrfs_root_id(dest));
4660 if (ret) {
4661 btrfs_abort_transaction(trans, ret);
4662 goto out_end_trans;
4663 }
4664 }
4665
4666 ret = btrfs_uuid_tree_remove(trans, dest->root_item.uuid,
4667 BTRFS_UUID_KEY_SUBVOL, btrfs_root_id(dest));
4668 if (ret && ret != -ENOENT) {
4669 btrfs_abort_transaction(trans, ret);
4670 goto out_end_trans;
4671 }
4672 if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
4673 ret = btrfs_uuid_tree_remove(trans,
4674 dest->root_item.received_uuid,
4675 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4676 btrfs_root_id(dest));
4677 if (ret && ret != -ENOENT) {
4678 btrfs_abort_transaction(trans, ret);
4679 goto out_end_trans;
4680 }
4681 }
4682
4683 free_anon_bdev(dest->anon_dev);
4684 dest->anon_dev = 0;
4685 out_end_trans:
4686 trans->block_rsv = NULL;
4687 trans->bytes_reserved = 0;
4688 ret = btrfs_end_transaction(trans);
4689 inode->i_flags |= S_DEAD;
4690 out_release:
4691 btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
4692 if (qgroup_reserved)
4693 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
4694 out_undead:
4695 if (ret) {
4696 spin_lock(&dest->root_item_lock);
4697 root_flags = btrfs_root_flags(&dest->root_item);
4698 btrfs_set_root_flags(&dest->root_item,
4699 root_flags & ~BTRFS_ROOT_SUBVOL_DEAD);
4700 spin_unlock(&dest->root_item_lock);
4701 }
4702 out_up_write:
4703 up_write(&fs_info->subvol_sem);
4704 if (!ret) {
4705 d_invalidate(dentry);
4706 btrfs_prune_dentries(dest);
4707 ASSERT(dest->send_in_progress == 0);
4708 }
4709
4710 return ret;
4711 }
4712
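/*
 * rmdir: non-empty directories are rejected, subvolumes and snapshot
 * placeholders are handled by btrfs_delete_subvolume()/btrfs_unlink_subvol(),
 * and regular directories are orphan-added and unlinked.
 */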
4713 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
4714 {
4715 struct inode *inode = d_inode(dentry);
4716 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
4717 int ret = 0;
4718 struct btrfs_trans_handle *trans;
4719 u64 last_unlink_trans;
4720 struct fscrypt_name fname;
4721
4722 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE)
4723 return -ENOTEMPTY;
4724 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_FIRST_FREE_OBJECTID) {
4725 if (unlikely(btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))) {
4726 btrfs_err(fs_info,
4727 "extent tree v2 doesn't support snapshot deletion yet");
4728 return -EOPNOTSUPP;
4729 }
4730 return btrfs_delete_subvolume(BTRFS_I(dir), dentry);
4731 }
4732
4733 ret = fscrypt_setup_filename(dir, &dentry->d_name, 1, &fname);
4734 if (ret)
4735 return ret;
4736
4737 /* This needs to handle no-key deletions later on */
4738
4739 trans = __unlink_start_trans(BTRFS_I(dir));
4740 if (IS_ERR(trans)) {
4741 ret = PTR_ERR(trans);
4742 goto out_notrans;
4743 }
4744
4745 if (unlikely(btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
4746 ret = btrfs_unlink_subvol(trans, BTRFS_I(dir), dentry);
4747 goto out;
4748 }
4749
4750 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
4751 if (ret)
4752 goto out;
4753
4754 last_unlink_trans = BTRFS_I(inode)->last_unlink_trans;
4755
4756 /* now the directory is empty */
4757 ret = btrfs_unlink_inode(trans, BTRFS_I(dir), BTRFS_I(d_inode(dentry)),
4758 &fname.disk_name);
4759 if (!ret) {
4760 btrfs_i_size_write(BTRFS_I(inode), 0);
4761 /*
4762 * Propagate the last_unlink_trans value of the deleted dir to
4763 * its parent directory. This is to prevent an unrecoverable
4764 * log tree in the case we do something like this:
4765 * 1) create dir foo
4766 * 2) create snapshot under dir foo
4767 * 3) delete the snapshot
4768 * 4) rmdir foo
4769 * 5) mkdir foo
4770 * 6) fsync foo or some file inside foo
4771 */
4772 if (last_unlink_trans >= trans->transid)
4773 BTRFS_I(dir)->last_unlink_trans = last_unlink_trans;
4774 }
4775 out:
4776 btrfs_end_transaction(trans);
4777 out_notrans:
4778 btrfs_btree_balance_dirty(fs_info);
4779 fscrypt_free_filename(&fname);
4780
4781 return ret;
4782 }
4783
4784 /*
4785 * Read, zero a chunk and write a block.
4786 *
4787 * @inode - inode that we're zeroing
4788 * @from - the offset to start zeroing
4789  * @len - the length to zero, 0 to zero the entire range relative to the
4790 * offset
4791 * @front - zero up to the offset instead of from the offset on
4792 *
4793 * This will find the block for the "from" offset and cow the block and zero the
4794 * part we want to zero. This is used with truncate and hole punching.
4795 */
4796 int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
4797 int front)
4798 {
4799 struct btrfs_fs_info *fs_info = inode->root->fs_info;
4800 struct address_space *mapping = inode->vfs_inode.i_mapping;
4801 struct extent_io_tree *io_tree = &inode->io_tree;
4802 struct btrfs_ordered_extent *ordered;
4803 struct extent_state *cached_state = NULL;
4804 struct extent_changeset *data_reserved = NULL;
4805 bool only_release_metadata = false;
4806 u32 blocksize = fs_info->sectorsize;
4807 pgoff_t index = from >> PAGE_SHIFT;
4808 unsigned offset = from & (blocksize - 1);
4809 struct folio *folio;
4810 gfp_t mask = btrfs_alloc_write_mask(mapping);
4811 size_t write_bytes = blocksize;
4812 int ret = 0;
4813 u64 block_start;
4814 u64 block_end;
4815
4816 if (IS_ALIGNED(offset, blocksize) &&
4817 (!len || IS_ALIGNED(len, blocksize)))
4818 goto out;
4819
4820 block_start = round_down(from, blocksize);
4821 block_end = block_start + blocksize - 1;
4822
4823 ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
4824 blocksize, false);
4825 if (ret < 0) {
4826 if (btrfs_check_nocow_lock(inode, block_start, &write_bytes, false) > 0) {
4827 /* For nocow case, no need to reserve data space */
4828 only_release_metadata = true;
4829 } else {
4830 goto out;
4831 }
4832 }
4833 ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false);
4834 if (ret < 0) {
4835 if (!only_release_metadata)
4836 btrfs_free_reserved_data_space(inode, data_reserved,
4837 block_start, blocksize);
4838 goto out;
4839 }
4840 again:
4841 folio = __filemap_get_folio(mapping, index,
4842 FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask);
4843 if (IS_ERR(folio)) {
4844 btrfs_delalloc_release_space(inode, data_reserved, block_start,
4845 blocksize, true);
4846 btrfs_delalloc_release_extents(inode, blocksize);
4847 ret = -ENOMEM;
4848 goto out;
4849 }
4850
4851 if (!folio_test_uptodate(folio)) {
4852 ret = btrfs_read_folio(NULL, folio);
4853 folio_lock(folio);
4854 if (folio->mapping != mapping) {
4855 folio_unlock(folio);
4856 folio_put(folio);
4857 goto again;
4858 }
4859 if (!folio_test_uptodate(folio)) {
4860 ret = -EIO;
4861 goto out_unlock;
4862 }
4863 }
4864
4865 /*
4866 * We unlock the page after the io is completed and then re-lock it
4867 * above. release_folio() could have come in between that and cleared
4868 * folio private, but left the page in the mapping. Set the page mapped
4869 * here to make sure it's properly set for the subpage stuff.
4870 */
4871 ret = set_folio_extent_mapped(folio);
4872 if (ret < 0)
4873 goto out_unlock;
4874
4875 folio_wait_writeback(folio);
4876
4877 lock_extent(io_tree, block_start, block_end, &cached_state);
4878
4879 ordered = btrfs_lookup_ordered_extent(inode, block_start);
4880 if (ordered) {
4881 unlock_extent(io_tree, block_start, block_end, &cached_state);
4882 folio_unlock(folio);
4883 folio_put(folio);
4884 btrfs_start_ordered_extent(ordered);
4885 btrfs_put_ordered_extent(ordered);
4886 goto again;
4887 }
4888
4889 clear_extent_bit(&inode->io_tree, block_start, block_end,
4890 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
4891 &cached_state);
4892
4893 ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0,
4894 &cached_state);
4895 if (ret) {
4896 unlock_extent(io_tree, block_start, block_end, &cached_state);
4897 goto out_unlock;
4898 }
4899
4900 if (offset != blocksize) {
4901 if (!len)
4902 len = blocksize - offset;
4903 if (front)
4904 folio_zero_range(folio, block_start - folio_pos(folio),
4905 offset);
4906 else
4907 folio_zero_range(folio,
4908 (block_start - folio_pos(folio)) + offset,
4909 len);
4910 }
4911 btrfs_folio_clear_checked(fs_info, folio, block_start,
4912 block_end + 1 - block_start);
4913 btrfs_folio_set_dirty(fs_info, folio, block_start,
4914 block_end + 1 - block_start);
4915 unlock_extent(io_tree, block_start, block_end, &cached_state);
4916
4917 if (only_release_metadata)
4918 set_extent_bit(&inode->io_tree, block_start, block_end,
4919 EXTENT_NORESERVE, NULL);
4920
4921 out_unlock:
4922 if (ret) {
4923 if (only_release_metadata)
4924 btrfs_delalloc_release_metadata(inode, blocksize, true);
4925 else
4926 btrfs_delalloc_release_space(inode, data_reserved,
4927 block_start, blocksize, true);
4928 }
4929 btrfs_delalloc_release_extents(inode, blocksize);
4930 folio_unlock(folio);
4931 folio_put(folio);
4932 out:
4933 if (only_release_metadata)
4934 btrfs_check_nocow_unlock(inode);
4935 extent_changeset_free(data_reserved);
4936 return ret;
4937 }
4938
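/*
 * Drop any extents in the given range and insert an explicit hole file
 * extent item for it. Nothing to do when the NO_HOLES feature is enabled.
 */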
4939 static int maybe_insert_hole(struct btrfs_inode *inode, u64 offset, u64 len)
4940 {
4941 struct btrfs_root *root = inode->root;
4942 struct btrfs_fs_info *fs_info = root->fs_info;
4943 struct btrfs_trans_handle *trans;
4944 struct btrfs_drop_extents_args drop_args = { 0 };
4945 int ret;
4946
4947 /*
4948 * If NO_HOLES is enabled, we don't need to do anything.
4949 * Later, up in the call chain, either btrfs_set_inode_last_sub_trans()
4950 * or btrfs_update_inode() will be called, which guarantee that the next
4951 * fsync will know this inode was changed and needs to be logged.
4952 */
4953 if (btrfs_fs_incompat(fs_info, NO_HOLES))
4954 return 0;
4955
4956 /*
4957 * 1 - for the one we're dropping
4958 * 1 - for the one we're adding
4959 * 1 - for updating the inode.
4960 */
4961 trans = btrfs_start_transaction(root, 3);
4962 if (IS_ERR(trans))
4963 return PTR_ERR(trans);
4964
4965 drop_args.start = offset;
4966 drop_args.end = offset + len;
4967 drop_args.drop_cache = true;
4968
4969 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
4970 if (ret) {
4971 btrfs_abort_transaction(trans, ret);
4972 btrfs_end_transaction(trans);
4973 return ret;
4974 }
4975
4976 ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset, len);
4977 if (ret) {
4978 btrfs_abort_transaction(trans, ret);
4979 } else {
4980 btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found);
4981 btrfs_update_inode(trans, inode);
4982 }
4983 btrfs_end_transaction(trans);
4984 return ret;
4985 }
4986
4987 /*
4988 * This function puts in dummy file extents for the area we're creating a hole
4989 * for. So if we are truncating this file to a larger size we need to insert
4990 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
4991 * the range between oldsize and size.
4992 */
4993 int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size)
4994 {
4995 struct btrfs_root *root = inode->root;
4996 struct btrfs_fs_info *fs_info = root->fs_info;
4997 struct extent_io_tree *io_tree = &inode->io_tree;
4998 struct extent_map *em = NULL;
4999 struct extent_state *cached_state = NULL;
5000 u64 hole_start = ALIGN(oldsize, fs_info->sectorsize);
5001 u64 block_end = ALIGN(size, fs_info->sectorsize);
5002 u64 last_byte;
5003 u64 cur_offset;
5004 u64 hole_size;
5005 int ret = 0;
5006
5007 /*
5008 * If our size started in the middle of a block we need to zero out the
5009 * rest of the block before we expand the i_size, otherwise we could
5010 * expose stale data.
5011 */
5012 ret = btrfs_truncate_block(inode, oldsize, 0, 0);
5013 if (ret)
5014 return ret;
5015
5016 if (size <= hole_start)
5017 return 0;
5018
5019 btrfs_lock_and_flush_ordered_range(inode, hole_start, block_end - 1,
5020 &cached_state);
5021 cur_offset = hole_start;
5022 while (1) {
5023 em = btrfs_get_extent(inode, NULL, cur_offset, block_end - cur_offset);
5024 if (IS_ERR(em)) {
5025 ret = PTR_ERR(em);
5026 em = NULL;
5027 break;
5028 }
5029 last_byte = min(extent_map_end(em), block_end);
5030 last_byte = ALIGN(last_byte, fs_info->sectorsize);
5031 hole_size = last_byte - cur_offset;
5032
5033 if (!(em->flags & EXTENT_FLAG_PREALLOC)) {
5034 struct extent_map *hole_em;
5035
5036 ret = maybe_insert_hole(inode, cur_offset, hole_size);
5037 if (ret)
5038 break;
5039
5040 ret = btrfs_inode_set_file_extent_range(inode,
5041 cur_offset, hole_size);
5042 if (ret)
5043 break;
5044
5045 hole_em = alloc_extent_map();
5046 if (!hole_em) {
5047 btrfs_drop_extent_map_range(inode, cur_offset,
5048 cur_offset + hole_size - 1,
5049 false);
5050 btrfs_set_inode_full_sync(inode);
5051 goto next;
5052 }
5053 hole_em->start = cur_offset;
5054 hole_em->len = hole_size;
5055
5056 hole_em->disk_bytenr = EXTENT_MAP_HOLE;
5057 hole_em->disk_num_bytes = 0;
5058 hole_em->ram_bytes = hole_size;
5059 hole_em->generation = btrfs_get_fs_generation(fs_info);
5060
5061 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
5062 free_extent_map(hole_em);
5063 } else {
5064 ret = btrfs_inode_set_file_extent_range(inode,
5065 cur_offset, hole_size);
5066 if (ret)
5067 break;
5068 }
5069 next:
5070 free_extent_map(em);
5071 em = NULL;
5072 cur_offset = last_byte;
5073 if (cur_offset >= block_end)
5074 break;
5075 }
5076 free_extent_map(em);
5077 unlock_extent(io_tree, hole_start, block_end - 1, &cached_state);
5078 return ret;
5079 }
5080
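/*
 * Handle the size change requested by setattr: either expand the file (zero
 * the tail block and insert hole extents up to the new size) or shrink it
 * with a truncate.
 */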
5081 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
5082 {
5083 struct btrfs_root *root = BTRFS_I(inode)->root;
5084 struct btrfs_trans_handle *trans;
5085 loff_t oldsize = i_size_read(inode);
5086 loff_t newsize = attr->ia_size;
5087 int mask = attr->ia_valid;
5088 int ret;
5089
5090 /*
5091 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
5092 * special case where we need to update the times despite not having
5093 * these flags set. For all other operations the VFS set these flags
5094 * explicitly if it wants a timestamp update.
5095 */
5096 if (newsize != oldsize) {
5097 inode_inc_iversion(inode);
5098 if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
5099 inode_set_mtime_to_ts(inode,
5100 inode_set_ctime_current(inode));
5101 }
5102 }
5103
5104 if (newsize > oldsize) {
5105 /*
5106 * Don't do an expanding truncate while snapshotting is ongoing.
5107 * This is to ensure the snapshot captures a fully consistent
5108 * state of this file - if the snapshot captures this expanding
5109 * truncation, it must capture all writes that happened before
5110 * this truncation.
5111 */
5112 btrfs_drew_write_lock(&root->snapshot_lock);
5113 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, newsize);
5114 if (ret) {
5115 btrfs_drew_write_unlock(&root->snapshot_lock);
5116 return ret;
5117 }
5118
5119 trans = btrfs_start_transaction(root, 1);
5120 if (IS_ERR(trans)) {
5121 btrfs_drew_write_unlock(&root->snapshot_lock);
5122 return PTR_ERR(trans);
5123 }
5124
5125 i_size_write(inode, newsize);
5126 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
5127 pagecache_isize_extended(inode, oldsize, newsize);
5128 ret = btrfs_update_inode(trans, BTRFS_I(inode));
5129 btrfs_drew_write_unlock(&root->snapshot_lock);
5130 btrfs_end_transaction(trans);
5131 } else {
5132 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
5133
5134 if (btrfs_is_zoned(fs_info)) {
5135 ret = btrfs_wait_ordered_range(BTRFS_I(inode),
5136 ALIGN(newsize, fs_info->sectorsize),
5137 (u64)-1);
5138 if (ret)
5139 return ret;
5140 }
5141
5142 /*
5143 * We're truncating a file that used to have good data down to
5144 * zero. Make sure any new writes to the file get on disk
5145 * on close.
5146 */
5147 if (newsize == 0)
5148 set_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
5149 &BTRFS_I(inode)->runtime_flags);
5150
5151 truncate_setsize(inode, newsize);
5152
5153 inode_dio_wait(inode);
5154
5155 ret = btrfs_truncate(BTRFS_I(inode), newsize == oldsize);
5156 if (ret && inode->i_nlink) {
5157 int err;
5158
5159 /*
5160 * Truncate failed, so fix up the in-memory size. We
5161 * adjusted disk_i_size down as we removed extents, so
5162 * wait for disk_i_size to be stable and then update the
5163 * in-memory size to match.
5164 */
5165 err = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
5166 if (err)
5167 return err;
5168 i_size_write(inode, BTRFS_I(inode)->disk_i_size);
5169 }
5170 }
5171
5172 return ret;
5173 }
5174
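/* The ->setattr callback: apply any size change first, then the remaining attributes. */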
5175 static int btrfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
5176 struct iattr *attr)
5177 {
5178 struct inode *inode = d_inode(dentry);
5179 struct btrfs_root *root = BTRFS_I(inode)->root;
5180 int err;
5181
5182 if (btrfs_root_readonly(root))
5183 return -EROFS;
5184
5185 err = setattr_prepare(idmap, dentry, attr);
5186 if (err)
5187 return err;
5188
5189 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
5190 err = btrfs_setsize(inode, attr);
5191 if (err)
5192 return err;
5193 }
5194
5195 if (attr->ia_valid) {
5196 setattr_copy(idmap, inode, attr);
5197 inode_inc_iversion(inode);
5198 err = btrfs_dirty_inode(BTRFS_I(inode));
5199
5200 if (!err && attr->ia_valid & ATTR_MODE)
5201 err = posix_acl_chmod(idmap, dentry, inode->i_mode);
5202 }
5203
5204 return err;
5205 }
5206
5207 /*
5208 * While truncating the inode pages during eviction, we get the VFS
5209 * calling btrfs_invalidate_folio() against each folio of the inode. This
5210 * is slow because the calls to btrfs_invalidate_folio() result in a
5211 * huge amount of calls to lock_extent() and clear_extent_bit(),
5212 * which keep merging and splitting extent_state structures over and over,
5213 * wasting lots of time.
5214 *
5215 * Therefore if the inode is being evicted, let btrfs_invalidate_folio()
5216 * skip all those expensive operations on a per folio basis and do only
5217 * the ordered io finishing, while we release here the extent_map and
5218 * extent_state structures, without the excessive merging and splitting.
5219 */
5220 static void evict_inode_truncate_pages(struct inode *inode)
5221 {
5222 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
5223 struct rb_node *node;
5224
5225 ASSERT(inode->i_state & I_FREEING);
5226 truncate_inode_pages_final(&inode->i_data);
5227
5228 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
5229
5230 /*
5231 * Keep looping until we have no more ranges in the io tree.
5232 * We can have ongoing bios started by readahead that have
5233 * their endio callback (extent_io.c:end_bio_extent_readpage)
5234 * still in progress (unlocked the pages in the bio but did not yet
5235 * unlock the ranges in the io tree). Therefore this means some
5236 * ranges can still be locked and eviction started because before
5237 * submitting those bios, which are executed by a separate task (work
5238 * queue kthread), inode references (inode->i_count) were not taken
5239 * (which would be dropped in the end io callback of each bio).
5240 * Therefore here we effectively end up waiting for those bios and
5241 * anyone else holding locked ranges without having bumped the inode's
5242 * reference count - if we don't do it, when they access the inode's
5243 * io_tree to unlock a range it may be too late, leading to a
5244 * use-after-free issue.
5245 */
5246 spin_lock(&io_tree->lock);
5247 while (!RB_EMPTY_ROOT(&io_tree->state)) {
5248 struct extent_state *state;
5249 struct extent_state *cached_state = NULL;
5250 u64 start;
5251 u64 end;
5252 unsigned state_flags;
5253
5254 node = rb_first(&io_tree->state);
5255 state = rb_entry(node, struct extent_state, rb_node);
5256 start = state->start;
5257 end = state->end;
5258 state_flags = state->state;
5259 spin_unlock(&io_tree->lock);
5260
5261 lock_extent(io_tree, start, end, &cached_state);
5262
5263 /*
5264 * If still has DELALLOC flag, the extent didn't reach disk,
5265 * and its reserved space won't be freed by delayed_ref.
5266 * So we need to free its reserved space here.
5267 * (Refer to comment in btrfs_invalidate_folio, case 2)
5268 *
5269 * Note, end is the bytenr of last byte, so we need + 1 here.
5270 */
5271 if (state_flags & EXTENT_DELALLOC)
5272 btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start,
5273 end - start + 1, NULL);
5274
5275 clear_extent_bit(io_tree, start, end,
5276 EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING,
5277 &cached_state);
5278
5279 cond_resched();
5280 spin_lock(&io_tree->lock);
5281 }
5282 spin_unlock(&io_tree->lock);
5283 }
5284
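/*
 * Refill the temporary block reserve used during eviction, adding extra room
 * for delayed refs when possible, and join a transaction for the next
 * truncation step.
 */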
5285 static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
5286 struct btrfs_block_rsv *rsv)
5287 {
5288 struct btrfs_fs_info *fs_info = root->fs_info;
5289 struct btrfs_trans_handle *trans;
5290 u64 delayed_refs_extra = btrfs_calc_delayed_ref_bytes(fs_info, 1);
5291 int ret;
5292
5293 /*
5294 * Eviction should be taking place somewhere safe because of our
5295 * delayed iputs. However the normal flushing code will run delayed
5296 * iputs, so we cannot use FLUSH_ALL otherwise we'll deadlock.
5297 *
5298 * We reserve the delayed_refs_extra here again because we can't use
5299 * btrfs_start_transaction(root, 0) for the same deadlocky reason as
5300 * above. We reserve our extra bit here because we generate a ton of
5301 * delayed refs activity by truncating.
5302 *
5303 * BTRFS_RESERVE_FLUSH_EVICT will steal from the global_rsv if it can,
5304 * if we fail to make this reservation we can re-try without the
5305 * delayed_refs_extra so we can make some forward progress.
5306 */
5307 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size + delayed_refs_extra,
5308 BTRFS_RESERVE_FLUSH_EVICT);
5309 if (ret) {
5310 ret = btrfs_block_rsv_refill(fs_info, rsv, rsv->size,
5311 BTRFS_RESERVE_FLUSH_EVICT);
5312 if (ret) {
5313 btrfs_warn(fs_info,
5314 "could not allocate space for delete; will truncate on mount");
5315 return ERR_PTR(-ENOSPC);
5316 }
5317 delayed_refs_extra = 0;
5318 }
5319
5320 trans = btrfs_join_transaction(root);
5321 if (IS_ERR(trans))
5322 return trans;
5323
5324 if (delayed_refs_extra) {
5325 trans->block_rsv = &fs_info->trans_block_rsv;
5326 trans->bytes_reserved = delayed_refs_extra;
5327 btrfs_block_rsv_migrate(rsv, trans->block_rsv,
5328 delayed_refs_extra, true);
5329 }
5330 return trans;
5331 }
5332
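/*
 * Final stage of inode eviction: drop the page cache and extent state, and
 * for inodes with no remaining links truncate away all of their items and
 * remove the orphan item.
 */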
5333 void btrfs_evict_inode(struct inode *inode)
5334 {
5335 struct btrfs_fs_info *fs_info;
5336 struct btrfs_trans_handle *trans;
5337 struct btrfs_root *root = BTRFS_I(inode)->root;
5338 struct btrfs_block_rsv *rsv = NULL;
5339 int ret;
5340
5341 trace_btrfs_inode_evict(inode);
5342
5343 if (!root) {
5344 fsverity_cleanup_inode(inode);
5345 clear_inode(inode);
5346 return;
5347 }
5348
5349 fs_info = inode_to_fs_info(inode);
5350 evict_inode_truncate_pages(inode);
5351
5352 if (inode->i_nlink &&
5353 ((btrfs_root_refs(&root->root_item) != 0 &&
5354 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID) ||
5355 btrfs_is_free_space_inode(BTRFS_I(inode))))
5356 goto out;
5357
5358 if (is_bad_inode(inode))
5359 goto out;
5360
5361 if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags))
5362 goto out;
5363
5364 if (inode->i_nlink > 0) {
5365 BUG_ON(btrfs_root_refs(&root->root_item) != 0 &&
5366 btrfs_root_id(root) != BTRFS_ROOT_TREE_OBJECTID);
5367 goto out;
5368 }
5369
5370 /*
5371 * This makes sure the inode item in tree is uptodate and the space for
5372 * the inode update is released.
5373 */
5374 ret = btrfs_commit_inode_delayed_inode(BTRFS_I(inode));
5375 if (ret)
5376 goto out;
5377
5378 /*
5379 * This drops any pending insert or delete operations we have for this
5380 * inode. We could have a delayed dir index deletion queued up, but
5381 * we're removing the inode completely so that'll be taken care of in
5382 * the truncate.
5383 */
5384 btrfs_kill_delayed_inode_items(BTRFS_I(inode));
5385
5386 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
5387 if (!rsv)
5388 goto out;
5389 rsv->size = btrfs_calc_metadata_size(fs_info, 1);
5390 rsv->failfast = true;
5391
5392 btrfs_i_size_write(BTRFS_I(inode), 0);
5393
5394 while (1) {
5395 struct btrfs_truncate_control control = {
5396 .inode = BTRFS_I(inode),
5397 .ino = btrfs_ino(BTRFS_I(inode)),
5398 .new_size = 0,
5399 .min_type = 0,
5400 };
5401
5402 trans = evict_refill_and_join(root, rsv);
5403 if (IS_ERR(trans))
5404 goto out;
5405
5406 trans->block_rsv = rsv;
5407
5408 ret = btrfs_truncate_inode_items(trans, root, &control);
5409 trans->block_rsv = &fs_info->trans_block_rsv;
5410 btrfs_end_transaction(trans);
5411 /*
5412 * We have not added new delayed items for our inode after we
5413 * have flushed its delayed items, so no need to throttle on
5414 * delayed items. However we have modified extent buffers.
5415 */
5416 btrfs_btree_balance_dirty_nodelay(fs_info);
5417 if (ret && ret != -ENOSPC && ret != -EAGAIN)
5418 goto out;
5419 else if (!ret)
5420 break;
5421 }
5422
5423 /*
5424 * Errors here aren't a big deal, it just means we leave orphan items in
5425 * the tree. They will be cleaned up on the next mount. If the inode
5426 * number gets reused, cleanup deletes the orphan item without doing
5427 * anything, and unlink reuses the existing orphan item.
5428 *
5429 * If it turns out that we are dropping too many of these, we might want
5430 * to add a mechanism for retrying these after a commit.
5431 */
5432 trans = evict_refill_and_join(root, rsv);
5433 if (!IS_ERR(trans)) {
5434 trans->block_rsv = rsv;
5435 btrfs_orphan_del(trans, BTRFS_I(inode));
5436 trans->block_rsv = &fs_info->trans_block_rsv;
5437 btrfs_end_transaction(trans);
5438 }
5439
5440 out:
5441 btrfs_free_block_rsv(fs_info, rsv);
5442 /*
5443 * If we didn't successfully delete, the orphan item will still be in
5444 * the tree and we'll retry on the next mount. Again, we might also want
5445 * to retry these periodically in the future.
5446 */
5447 btrfs_remove_delayed_node(BTRFS_I(inode));
5448 fsverity_cleanup_inode(inode);
5449 clear_inode(inode);
5450 }
5451
5452 /*
5453 * Return the key found in the dir entry in the location pointer, fill @type
5454 * with BTRFS_FT_*, and return 0.
5455 *
5456 * If no dir entries were found, returns -ENOENT.
5457 * If found a corrupted location in dir entry, returns -EUCLEAN.
5458 */
5459 static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry,
5460 struct btrfs_key *location, u8 *type)
5461 {
5462 struct btrfs_dir_item *di;
5463 struct btrfs_path *path;
5464 struct btrfs_root *root = dir->root;
5465 int ret = 0;
5466 struct fscrypt_name fname;
5467
5468 path = btrfs_alloc_path();
5469 if (!path)
5470 return -ENOMEM;
5471
5472 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname);
5473 if (ret < 0)
5474 goto out;
5475 /*
5476 * fscrypt_setup_filename() should never return a positive value, but
5477 * gcc on sparc/parisc thinks it can, so assert that doesn't happen.
5478 */
5479 ASSERT(ret == 0);
5480
5481 /* This needs to handle no-key deletions later on */
5482
5483 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir),
5484 &fname.disk_name, 0);
5485 if (IS_ERR_OR_NULL(di)) {
5486 ret = di ? PTR_ERR(di) : -ENOENT;
5487 goto out;
5488 }
5489
5490 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
5491 if (location->type != BTRFS_INODE_ITEM_KEY &&
5492 location->type != BTRFS_ROOT_ITEM_KEY) {
5493 ret = -EUCLEAN;
5494 btrfs_warn(root->fs_info,
5495 "%s gets something invalid in DIR_ITEM (name %s, directory ino %llu, location(%llu %u %llu))",
5496 __func__, fname.disk_name.name, btrfs_ino(dir),
5497 location->objectid, location->type, location->offset);
5498 }
5499 if (!ret)
5500 *type = btrfs_dir_ftype(path->nodes[0], di);
5501 out:
5502 fscrypt_free_filename(&fname);
5503 btrfs_free_path(path);
5504 return ret;
5505 }
5506
5507 /*
5508 * When we hit a tree root in a directory, the btrfs part of the inode
5509 * needs to be changed to reflect the root directory of the tree root. This
5510 * is kind of like crossing a mount point.
5511 */
5512 static int fixup_tree_root_location(struct btrfs_fs_info *fs_info,
5513 struct btrfs_inode *dir,
5514 struct dentry *dentry,
5515 struct btrfs_key *location,
5516 struct btrfs_root **sub_root)
5517 {
5518 struct btrfs_path *path;
5519 struct btrfs_root *new_root;
5520 struct btrfs_root_ref *ref;
5521 struct extent_buffer *leaf;
5522 struct btrfs_key key;
5523 int ret;
5524 int err = 0;
5525 struct fscrypt_name fname;
5526
5527 ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 0, &fname);
5528 if (ret)
5529 return ret;
5530
5531 path = btrfs_alloc_path();
5532 if (!path) {
5533 err = -ENOMEM;
5534 goto out;
5535 }
5536
5537 err = -ENOENT;
5538 key.objectid = btrfs_root_id(dir->root);
5539 key.type = BTRFS_ROOT_REF_KEY;
5540 key.offset = location->objectid;
5541
5542 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
5543 if (ret) {
5544 if (ret < 0)
5545 err = ret;
5546 goto out;
5547 }
5548
5549 leaf = path->nodes[0];
5550 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
5551 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
5552 btrfs_root_ref_name_len(leaf, ref) != fname.disk_name.len)
5553 goto out;
5554
5555 ret = memcmp_extent_buffer(leaf, fname.disk_name.name,
5556 (unsigned long)(ref + 1), fname.disk_name.len);
5557 if (ret)
5558 goto out;
5559
5560 btrfs_release_path(path);
5561
5562 new_root = btrfs_get_fs_root(fs_info, location->objectid, true);
5563 if (IS_ERR(new_root)) {
5564 err = PTR_ERR(new_root);
5565 goto out;
5566 }
5567
5568 *sub_root = new_root;
5569 location->objectid = btrfs_root_dirid(&new_root->root_item);
5570 location->type = BTRFS_INODE_ITEM_KEY;
5571 location->offset = 0;
5572 err = 0;
5573 out:
5574 btrfs_free_path(path);
5575 fscrypt_free_filename(&fname);
5576 return err;
5577 }
5578
5579
5580
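/*
 * Remove the inode from its root's xarray of in-core inodes and, if that
 * leaves a dead root (no remaining references) with no inodes, queue the root
 * for cleanup.
 */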
5581 static void btrfs_del_inode_from_root(struct btrfs_inode *inode)
5582 {
5583 struct btrfs_root *root = inode->root;
5584 struct btrfs_inode *entry;
5585 bool empty = false;
5586
5587 xa_lock(&root->inodes);
5588 entry = __xa_erase(&root->inodes, btrfs_ino(inode));
5589 if (entry == inode)
5590 empty = xa_empty(&root->inodes);
5591 xa_unlock(&root->inodes);
5592
5593 if (empty && btrfs_root_refs(&root->root_item) == 0) {
5594 xa_lock(&root->inodes);
5595 empty = xa_empty(&root->inodes);
5596 xa_unlock(&root->inodes);
5597 if (empty)
5598 btrfs_add_dead_root(root);
5599 }
5600 }
5601
5602
5603 static int btrfs_init_locked_inode(struct inode *inode, void *p)
5604 {
5605 struct btrfs_iget_args *args = p;
5606
5607 btrfs_set_inode_number(BTRFS_I(inode), args->ino);
5608 BTRFS_I(inode)->root = btrfs_grab_root(args->root);
5609
5610 if (args->root && args->root == args->root->fs_info->tree_root &&
5611 args->ino != BTRFS_BTREE_INODE_OBJECTID)
5612 set_bit(BTRFS_INODE_FREE_SPACE_INODE,
5613 &BTRFS_I(inode)->runtime_flags);
5614 return 0;
5615 }
5616
5617 static int btrfs_find_actor(struct inode *inode, void *opaque)
5618 {
5619 struct btrfs_iget_args *args = opaque;
5620
5621 return args->ino == btrfs_ino(BTRFS_I(inode)) &&
5622 args->root == BTRFS_I(inode)->root;
5623 }
5624
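/*
 * Find or allocate the in-core inode for (ino, root) via the inode hash. A
 * newly allocated inode is returned locked with I_NEW set; NULL is returned
 * on allocation failure.
 */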
5625 static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root)
5626 {
5627 struct inode *inode;
5628 struct btrfs_iget_args args;
5629 unsigned long hashval = btrfs_inode_hash(ino, root);
5630
5631 args.ino = ino;
5632 args.root = root;
5633
5634 inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor,
5635 btrfs_init_locked_inode,
5636 (void *)&args);
5637 if (!inode)
5638 return NULL;
5639 return BTRFS_I(inode);
5640 }
5641
5642 /*
5643 * Get an inode object given its inode number and corresponding root. Path is
5644 * preallocated to prevent recursing back to iget through the allocator.
5645 */
5646 struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root,
5647 struct btrfs_path *path)
5648 {
5649 struct btrfs_inode *inode;
5650 int ret;
5651
5652 inode = btrfs_iget_locked(ino, root);
5653 if (!inode)
5654 return ERR_PTR(-ENOMEM);
5655
5656 if (!(inode->vfs_inode.i_state & I_NEW))
5657 return inode;
5658
5659 ret = btrfs_read_locked_inode(inode, path);
5660 if (ret)
5661 return ERR_PTR(ret);
5662
5663 unlock_new_inode(&inode->vfs_inode);
5664 return inode;
5665 }
5666
5667 /*
5668 * Get an inode object given its inode number and corresponding root.
5669 */
5670 struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root)
5671 {
5672 struct btrfs_inode *inode;
5673 struct btrfs_path *path;
5674 int ret;
5675
5676 inode = btrfs_iget_locked(ino, root);
5677 if (!inode)
5678 return ERR_PTR(-ENOMEM);
5679
5680 if (!(inode->vfs_inode.i_state & I_NEW))
5681 return inode;
5682
5683 path = btrfs_alloc_path();
5684 if (!path)
5685 return ERR_PTR(-ENOMEM);
5686
5687 ret = btrfs_read_locked_inode(inode, path);
5688 btrfs_free_path(path);
5689 if (ret)
5690 return ERR_PTR(ret);
5691
5692 unlock_new_inode(&inode->vfs_inode);
5693 return inode;
5694 }
5695
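/*
 * Create an in-memory stub directory inode for a subvolume reference whose
 * root ref could not be found. It supports lookups only and is never written
 * to disk.
 */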
5696 static struct btrfs_inode *new_simple_dir(struct inode *dir,
5697 struct btrfs_key *key,
5698 struct btrfs_root *root)
5699 {
5700 struct timespec64 ts;
5701 struct inode *vfs_inode;
5702 struct btrfs_inode *inode;
5703
5704 vfs_inode = new_inode(dir->i_sb);
5705 if (!vfs_inode)
5706 return ERR_PTR(-ENOMEM);
5707
5708 inode = BTRFS_I(vfs_inode);
5709 inode->root = btrfs_grab_root(root);
5710 inode->ref_root_id = key->objectid;
5711 set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags);
5712 set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags);
5713
5714 btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID);
5715 /*
5716 * We only need lookup; the rest is read-only and there's no inode
5717 * associated with the dentry.
5718 */
5719 vfs_inode->i_op = &simple_dir_inode_operations;
5720 vfs_inode->i_opflags &= ~IOP_XATTR;
5721 vfs_inode->i_fop = &simple_dir_operations;
5722 vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5723
5724 ts = inode_set_ctime_current(vfs_inode);
5725 inode_set_mtime_to_ts(vfs_inode, ts);
5726 inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir));
5727 inode->i_otime_sec = ts.tv_sec;
5728 inode->i_otime_nsec = ts.tv_nsec;
5729
5730 vfs_inode->i_uid = dir->i_uid;
5731 vfs_inode->i_gid = dir->i_gid;
5732
5733 return inode;
5734 }
5735
5736 static_assert(BTRFS_FT_UNKNOWN == FT_UNKNOWN);
5737 static_assert(BTRFS_FT_REG_FILE == FT_REG_FILE);
5738 static_assert(BTRFS_FT_DIR == FT_DIR);
5739 static_assert(BTRFS_FT_CHRDEV == FT_CHRDEV);
5740 static_assert(BTRFS_FT_BLKDEV == FT_BLKDEV);
5741 static_assert(BTRFS_FT_FIFO == FT_FIFO);
5742 static_assert(BTRFS_FT_SOCK == FT_SOCK);
5743 static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK);
5744
5745 static inline u8 btrfs_inode_type(const struct btrfs_inode *inode)
5746 {
5747 return fs_umode_to_ftype(inode->vfs_inode.i_mode);
5748 }
5749
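/*
 * Core of ->lookup: resolve a name in @dir to an inode, crossing into a
 * subvolume root when the directory entry points at one.
 */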
5750 struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
5751 {
5752 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
5753 struct btrfs_inode *inode;
5754 struct btrfs_root *root = BTRFS_I(dir)->root;
5755 struct btrfs_root *sub_root = root;
5756 struct btrfs_key location = { 0 };
5757 u8 di_type = 0;
5758 int ret = 0;
5759
5760 if (dentry->d_name.len > BTRFS_NAME_LEN)
5761 return ERR_PTR(-ENAMETOOLONG);
5762
5763 ret = btrfs_inode_by_name(BTRFS_I(dir), dentry, &location, &di_type);
5764 if (ret < 0)
5765 return ERR_PTR(ret);
5766
5767 if (location.type == BTRFS_INODE_ITEM_KEY) {
5768 inode = btrfs_iget(location.objectid, root);
5769 if (IS_ERR(inode))
5770 return ERR_CAST(inode);
5771
5772 /* Do extra check against inode mode with di_type */
5773 if (btrfs_inode_type(inode) != di_type) {
5774 btrfs_crit(fs_info,
5775 "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u",
5776 inode->vfs_inode.i_mode, btrfs_inode_type(inode),
5777 di_type);
5778 iput(&inode->vfs_inode);
5779 return ERR_PTR(-EUCLEAN);
5780 }
5781 return &inode->vfs_inode;
5782 }
5783
5784 ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry,
5785 &location, &sub_root);
5786 if (ret < 0) {
5787 if (ret != -ENOENT)
5788 inode = ERR_PTR(ret);
5789 else
5790 inode = new_simple_dir(dir, &location, root);
5791 } else {
5792 inode = btrfs_iget(location.objectid, sub_root);
5793 btrfs_put_root(sub_root);
5794
5795 if (IS_ERR(inode))
5796 return ERR_CAST(inode);
5797
5798 down_read(&fs_info->cleanup_work_sem);
5799 if (!sb_rdonly(inode->vfs_inode.i_sb))
5800 ret = btrfs_orphan_cleanup(sub_root);
5801 up_read(&fs_info->cleanup_work_sem);
5802 if (ret) {
5803 iput(&inode->vfs_inode);
5804 inode = ERR_PTR(ret);
5805 }
5806 }
5807
5808 if (IS_ERR(inode))
5809 return ERR_CAST(inode);
5810
5811 return &inode->vfs_inode;
5812 }
5813
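/*
 * Have the dcache drop dentries that belong to deleted subvolumes or to the
 * stub directories created by new_simple_dir().
 */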
5814 static int btrfs_dentry_delete(const struct dentry *dentry)
5815 {
5816 struct btrfs_root *root;
5817 struct inode *inode = d_inode(dentry);
5818
5819 if (!inode && !IS_ROOT(dentry))
5820 inode = d_inode(dentry->d_parent);
5821
5822 if (inode) {
5823 root = BTRFS_I(inode)->root;
5824 if (btrfs_root_refs(&root->root_item) == 0)
5825 return 1;
5826
5827 if (btrfs_ino(BTRFS_I(inode)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
5828 return 1;
5829 }
5830 return 0;
5831 }
5832
5833 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
5834 unsigned int flags)
5835 {
5836 struct inode *inode = btrfs_lookup_dentry(dir, dentry);
5837
5838 if (inode == ERR_PTR(-ENOENT))
5839 inode = NULL;
5840 return d_splice_alias(inode, dentry);
5841 }
5842
5843 /*
5844 * Find the highest existing sequence number in a directory and then set the
5845 * in-memory index_cnt variable to the first free sequence number.
5846 */
5847 static int btrfs_set_inode_index_count(struct btrfs_inode *inode)
5848 {
5849 struct btrfs_root *root = inode->root;
5850 struct btrfs_key key, found_key;
5851 struct btrfs_path *path;
5852 struct extent_buffer *leaf;
5853 int ret;
5854
5855 key.objectid = btrfs_ino(inode);
5856 key.type = BTRFS_DIR_INDEX_KEY;
5857 key.offset = (u64)-1;
5858
5859 path = btrfs_alloc_path();
5860 if (!path)
5861 return -ENOMEM;
5862
5863 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5864 if (ret < 0)
5865 goto out;
5866 /* FIXME: we should be able to handle this */
5867 if (ret == 0)
5868 goto out;
5869 ret = 0;
5870
5871 if (path->slots[0] == 0) {
5872 inode->index_cnt = BTRFS_DIR_START_INDEX;
5873 goto out;
5874 }
5875
5876 path->slots[0]--;
5877
5878 leaf = path->nodes[0];
5879 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5880
5881 if (found_key.objectid != btrfs_ino(inode) ||
5882 found_key.type != BTRFS_DIR_INDEX_KEY) {
5883 inode->index_cnt = BTRFS_DIR_START_INDEX;
5884 goto out;
5885 }
5886
5887 inode->index_cnt = found_key.offset + 1;
5888 out:
5889 btrfs_free_path(path);
5890 return ret;
5891 }
5892
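/*
 * Return the highest directory index currently in use (index_cnt - 1),
 * initializing index_cnt from delayed items or the tree if it is not yet
 * known. Used as the upper bound for readdir.
 */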
5893 static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index)
5894 {
5895 int ret = 0;
5896
5897 btrfs_inode_lock(dir, 0);
5898 if (dir->index_cnt == (u64)-1) {
5899 ret = btrfs_inode_delayed_dir_index_count(dir);
5900 if (ret) {
5901 ret = btrfs_set_inode_index_count(dir);
5902 if (ret)
5903 goto out;
5904 }
5905 }
5906
5907 /* index_cnt is the index number of next new entry, so decrement it. */
5908 *index = dir->index_cnt - 1;
5909 out:
5910 btrfs_inode_unlock(dir, 0);
5911
5912 return ret;
5913 }
5914
5915 /*
5916 * All this infrastructure exists because dir_emit can fault, and we are holding
5917 * the tree lock when doing readdir. For now just allocate a buffer and copy
5918 * our information into that, and then dir_emit from the buffer. This is
5919 * similar to what NFS does, only we don't keep the buffer around in pagecache
5920 * because I'm afraid I'll mess that up. Long term we need to make filldir do
5921 * copy_to_user_inatomic so we don't have to worry about page faulting under the
5922 * tree lock.
5923 */
5924 static int btrfs_opendir(struct inode *inode, struct file *file)
5925 {
5926 struct btrfs_file_private *private;
5927 u64 last_index;
5928 int ret;
5929
5930 ret = btrfs_get_dir_last_index(BTRFS_I(inode), &last_index);
5931 if (ret)
5932 return ret;
5933
5934 private = kzalloc(sizeof(struct btrfs_file_private), GFP_KERNEL);
5935 if (!private)
5936 return -ENOMEM;
5937 private->last_index = last_index;
5938 private->filldir_buf = kzalloc(PAGE_SIZE, GFP_KERNEL);
5939 if (!private->filldir_buf) {
5940 kfree(private);
5941 return -ENOMEM;
5942 }
5943 file->private_data = private;
5944 return 0;
5945 }
5946
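/* Refresh the cached last directory index before letting the generic llseek run. */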
5947 static loff_t btrfs_dir_llseek(struct file *file, loff_t offset, int whence)
5948 {
5949 struct btrfs_file_private *private = file->private_data;
5950 int ret;
5951
5952 ret = btrfs_get_dir_last_index(BTRFS_I(file_inode(file)),
5953 &private->last_index);
5954 if (ret)
5955 return ret;
5956
5957 return generic_file_llseek(file, offset, whence);
5958 }
5959
5960 struct dir_entry {
5961 u64 ino;
5962 u64 offset;
5963 unsigned type;
5964 int name_len;
5965 };
5966
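/*
 * Replay the packed dir_entry records from the filldir buffer into the
 * dir_context, now that no tree locks are held and dir_emit() is allowed to
 * fault.
 */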
5967 static int btrfs_filldir(void *addr, int entries, struct dir_context *ctx)
5968 {
5969 while (entries--) {
5970 struct dir_entry *entry = addr;
5971 char *name = (char *)(entry + 1);
5972
5973 ctx->pos = get_unaligned(&entry->offset);
5974 if (!dir_emit(ctx, name, get_unaligned(&entry->name_len),
5975 get_unaligned(&entry->ino),
5976 get_unaligned(&entry->type)))
5977 return 1;
5978 addr += sizeof(struct dir_entry) +
5979 get_unaligned(&entry->name_len);
5980 ctx->pos++;
5981 }
5982 return 0;
5983 }
5984
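/*
 * The readdir implementation: walk DIR_INDEX items (merged with delayed
 * items), batching entries into the preallocated buffer before emitting them
 * with btrfs_filldir().
 */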
5985 static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
5986 {
5987 struct inode *inode = file_inode(file);
5988 struct btrfs_root *root = BTRFS_I(inode)->root;
5989 struct btrfs_file_private *private = file->private_data;
5990 struct btrfs_dir_item *di;
5991 struct btrfs_key key;
5992 struct btrfs_key found_key;
5993 struct btrfs_path *path;
5994 void *addr;
5995 LIST_HEAD(ins_list);
5996 LIST_HEAD(del_list);
5997 int ret;
5998 char *name_ptr;
5999 int name_len;
6000 int entries = 0;
6001 int total_len = 0;
6002 bool put = false;
6003 struct btrfs_key location;
6004
6005 if (!dir_emit_dots(file, ctx))
6006 return 0;
6007
6008 path = btrfs_alloc_path();
6009 if (!path)
6010 return -ENOMEM;
6011
6012 addr = private->filldir_buf;
6013 path->reada = READA_FORWARD;
6014
6015 put = btrfs_readdir_get_delayed_items(BTRFS_I(inode), private->last_index,
6016 &ins_list, &del_list);
6017
6018 again:
6019 key.type = BTRFS_DIR_INDEX_KEY;
6020 key.offset = ctx->pos;
6021 key.objectid = btrfs_ino(BTRFS_I(inode));
6022
6023 btrfs_for_each_slot(root, &key, &found_key, path, ret) {
6024 struct dir_entry *entry;
6025 struct extent_buffer *leaf = path->nodes[0];
6026 u8 ftype;
6027
6028 if (found_key.objectid != key.objectid)
6029 break;
6030 if (found_key.type != BTRFS_DIR_INDEX_KEY)
6031 break;
6032 if (found_key.offset < ctx->pos)
6033 continue;
6034 if (found_key.offset > private->last_index)
6035 break;
6036 if (btrfs_should_delete_dir_index(&del_list, found_key.offset))
6037 continue;
6038 di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
6039 name_len = btrfs_dir_name_len(leaf, di);
6040 if ((total_len + sizeof(struct dir_entry) + name_len) >=
6041 PAGE_SIZE) {
6042 btrfs_release_path(path);
6043 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6044 if (ret)
6045 goto nopos;
6046 addr = private->filldir_buf;
6047 entries = 0;
6048 total_len = 0;
6049 goto again;
6050 }
6051
6052 ftype = btrfs_dir_flags_to_ftype(btrfs_dir_flags(leaf, di));
6053 entry = addr;
6054 name_ptr = (char *)(entry + 1);
6055 read_extent_buffer(leaf, name_ptr,
6056 (unsigned long)(di + 1), name_len);
6057 put_unaligned(name_len, &entry->name_len);
6058 put_unaligned(fs_ftype_to_dtype(ftype), &entry->type);
6059 btrfs_dir_item_key_to_cpu(leaf, di, &location);
6060 put_unaligned(location.objectid, &entry->ino);
6061 put_unaligned(found_key.offset, &entry->offset);
6062 entries++;
6063 addr += sizeof(struct dir_entry) + name_len;
6064 total_len += sizeof(struct dir_entry) + name_len;
6065 }
6066 /* Catch error encountered during iteration */
6067 if (ret < 0)
6068 goto err;
6069
6070 btrfs_release_path(path);
6071
6072 ret = btrfs_filldir(private->filldir_buf, entries, ctx);
6073 if (ret)
6074 goto nopos;
6075
6076 ret = btrfs_readdir_delayed_dir_index(ctx, &ins_list);
6077 if (ret)
6078 goto nopos;
6079
6080 /*
6081 * Stop new entries from being returned after we return the last
6082 * entry.
6083 *
6084 * New directory entries are assigned a strictly increasing
6085 * offset. This means that new entries created during readdir
6086 * are *guaranteed* to be seen in the future by that readdir.
6087 * This has broken buggy programs which operate on names as
6088 * they're returned by readdir. Until we reuse freed offsets
6089 * we have this hack to stop new entries from being returned
6090 * under the assumption that they'll never reach this huge
6091 * offset.
6092 *
6093 * This is being careful not to overflow 32bit loff_t unless the
6094 * last entry requires it because doing so has broken 32bit apps
6095 * in the past.
6096 */
6097 if (ctx->pos >= INT_MAX)
6098 ctx->pos = LLONG_MAX;
6099 else
6100 ctx->pos = INT_MAX;
6101 nopos:
6102 ret = 0;
6103 err:
6104 if (put)
6105 btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list);
6106 btrfs_free_path(path);
6107 return ret;
6108 }
6109
6110 /*
6111 * This is somewhat expensive, updating the tree every time the
6112 * inode changes. But, it is most likely to find the inode in cache.
6113 * FIXME, needs more benchmarking...there are no reasons other than performance
6114 * to keep or drop this code.
6115 */
6116 static int btrfs_dirty_inode(struct btrfs_inode *inode)
6117 {
6118 struct btrfs_root *root = inode->root;
6119 struct btrfs_fs_info *fs_info = root->fs_info;
6120 struct btrfs_trans_handle *trans;
6121 int ret;
6122
6123 if (test_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags))
6124 return 0;
6125
6126 trans = btrfs_join_transaction(root);
6127 if (IS_ERR(trans))
6128 return PTR_ERR(trans);
6129
6130 ret = btrfs_update_inode(trans, inode);
6131 if (ret == -ENOSPC || ret == -EDQUOT) {
6132 /* whoops, lets try again with the full transaction */
6133 btrfs_end_transaction(trans);
6134 trans = btrfs_start_transaction(root, 1);
6135 if (IS_ERR(trans))
6136 return PTR_ERR(trans);
6137
6138 ret = btrfs_update_inode(trans, inode);
6139 }
6140 btrfs_end_transaction(trans);
6141 if (inode->delayed_node)
6142 btrfs_balance_delayed_items(fs_info);
6143
6144 return ret;
6145 }
6146
6147 /*
6148 * This is a copy of file_update_time. We need this so we can return error on
6149 * ENOSPC for updating the inode in the case of file write and mmap writes.
6150 */
6151 static int btrfs_update_time(struct inode *inode, int flags)
6152 {
6153 struct btrfs_root *root = BTRFS_I(inode)->root;
6154 bool dirty;
6155
6156 if (btrfs_root_readonly(root))
6157 return -EROFS;
6158
6159 dirty = inode_update_timestamps(inode, flags);
6160 return dirty ? btrfs_dirty_inode(BTRFS_I(inode)) : 0;
6161 }
6162
6163 /*
6164 * Helper to find a free sequence number in a given directory. The current
6165 * code is very simple; later versions will do smarter things in the btree.
6166 */
6167 int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index)
6168 {
6169 int ret = 0;
6170
6171 if (dir->index_cnt == (u64)-1) {
6172 ret = btrfs_inode_delayed_dir_index_count(dir);
6173 if (ret) {
6174 ret = btrfs_set_inode_index_count(dir);
6175 if (ret)
6176 return ret;
6177 }
6178 }
6179
6180 *index = dir->index_cnt;
6181 dir->index_cnt++;
6182
6183 return ret;
6184 }
6185
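/* Insert the new inode into the inode hash, locked, keyed by inode number and root. */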
6186 static int btrfs_insert_inode_locked(struct inode *inode)
6187 {
6188 struct btrfs_iget_args args;
6189
6190 args.ino = btrfs_ino(BTRFS_I(inode));
6191 args.root = BTRFS_I(inode)->root;
6192
6193 return insert_inode_locked4(inode,
6194 btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
6195 btrfs_find_actor, &args);
6196 }
6197
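/*
 * Set up the fscrypt name and ACLs for a new inode and compute how many tree
 * items the creation transaction needs to reserve.
 */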
6198 int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
6199 unsigned int *trans_num_items)
6200 {
6201 struct inode *dir = args->dir;
6202 struct inode *inode = args->inode;
6203 int ret;
6204
6205 if (!args->orphan) {
6206 ret = fscrypt_setup_filename(dir, &args->dentry->d_name, 0,
6207 &args->fname);
6208 if (ret)
6209 return ret;
6210 }
6211
6212 ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl);
6213 if (ret) {
6214 fscrypt_free_filename(&args->fname);
6215 return ret;
6216 }
6217
6218 /* 1 to add inode item */
6219 *trans_num_items = 1;
6220 /* 1 to add compression property */
6221 if (BTRFS_I(dir)->prop_compress)
6222 (*trans_num_items)++;
6223 /* 1 to add default ACL xattr */
6224 if (args->default_acl)
6225 (*trans_num_items)++;
6226 /* 1 to add access ACL xattr */
6227 if (args->acl)
6228 (*trans_num_items)++;
6229 #ifdef CONFIG_SECURITY
6230 /* 1 to add LSM xattr */
6231 if (dir->i_security)
6232 (*trans_num_items)++;
6233 #endif
6234 if (args->orphan) {
6235 /* 1 to add orphan item */
6236 (*trans_num_items)++;
6237 } else {
6238 /*
6239 * 1 to add dir item
6240 * 1 to add dir index
6241 * 1 to update parent inode item
6242 *
6243 * No need for 1 unit for the inode ref item because it is
6244 * inserted in a batch together with the inode item at
6245 * btrfs_create_new_inode().
6246 */
6247 *trans_num_items += 3;
6248 }
6249 return 0;
6250 }
6251
6252 void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args)
6253 {
6254 posix_acl_release(args->acl);
6255 posix_acl_release(args->default_acl);
6256 fscrypt_free_filename(&args->fname);
6257 }
6258
6259 /*
6260 * Inherit flags from the parent inode.
6261 *
6262 * Currently only the compression flags and the cow flags are inherited.
6263 */
6264 static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode *dir)
6265 {
6266 unsigned int flags;
6267
6268 flags = dir->flags;
6269
6270 if (flags & BTRFS_INODE_NOCOMPRESS) {
6271 inode->flags &= ~BTRFS_INODE_COMPRESS;
6272 inode->flags |= BTRFS_INODE_NOCOMPRESS;
6273 } else if (flags & BTRFS_INODE_COMPRESS) {
6274 inode->flags &= ~BTRFS_INODE_NOCOMPRESS;
6275 inode->flags |= BTRFS_INODE_COMPRESS;
6276 }
6277
6278 if (flags & BTRFS_INODE_NODATACOW) {
6279 inode->flags |= BTRFS_INODE_NODATACOW;
6280 if (S_ISREG(inode->vfs_inode.i_mode))
6281 inode->flags |= BTRFS_INODE_NODATASUM;
6282 }
6283
6284 btrfs_sync_inode_flags_to_i_flags(inode);
6285 }
6286
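/*
 * Fill in and insert the items for a freshly allocated inode: the inode item,
 * its first inode ref, inherited properties and security xattrs, and either an
 * orphan item (O_TMPFILE) or a directory link in the parent.
 */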
6287 int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
6288 struct btrfs_new_inode_args *args)
6289 {
6290 struct timespec64 ts;
6291 struct inode *dir = args->dir;
6292 struct inode *inode = args->inode;
6293 const struct fscrypt_str *name = args->orphan ? NULL : &args->fname.disk_name;
6294 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6295 struct btrfs_root *root;
6296 struct btrfs_inode_item *inode_item;
6297 struct btrfs_path *path;
6298 u64 objectid;
6299 struct btrfs_inode_ref *ref;
6300 struct btrfs_key key[2];
6301 u32 sizes[2];
6302 struct btrfs_item_batch batch;
6303 unsigned long ptr;
6304 int ret;
6305 bool xa_reserved = false;
6306
6307 path = btrfs_alloc_path();
6308 if (!path)
6309 return -ENOMEM;
6310
6311 if (!args->subvol)
6312 BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root);
6313 root = BTRFS_I(inode)->root;
6314
6315 ret = btrfs_init_file_extent_tree(BTRFS_I(inode));
6316 if (ret)
6317 goto out;
6318
6319 ret = btrfs_get_free_objectid(root, &objectid);
6320 if (ret)
6321 goto out;
6322 btrfs_set_inode_number(BTRFS_I(inode), objectid);
6323
6324 ret = xa_reserve(&root->inodes, objectid, GFP_NOFS);
6325 if (ret)
6326 goto out;
6327 xa_reserved = true;
6328
6329 if (args->orphan) {
6330 /*
6331 * O_TMPFILE, set link count to 0, so that after this point, we
6332 * fill in an inode item with the correct link count.
6333 */
6334 set_nlink(inode, 0);
6335 } else {
6336 trace_btrfs_inode_request(dir);
6337
6338 ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index);
6339 if (ret)
6340 goto out;
6341 }
6342
6343 if (S_ISDIR(inode->i_mode))
6344 BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX;
6345
6346 BTRFS_I(inode)->generation = trans->transid;
6347 inode->i_generation = BTRFS_I(inode)->generation;
6348
6349 /*
6350 * We don't have any capability xattrs set here yet, shortcut any
6351 * queries for the xattrs here. If we add them later via the inode
6352 * security init path or any other path this flag will be cleared.
6353 */
6354 set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags);
6355
6356 /*
6357 * Subvolumes don't inherit flags from their parent directory.
6358 * Originally this was probably by accident, but we probably can't
6359 * change it now without compatibility issues.
6360 */
6361 if (!args->subvol)
6362 btrfs_inherit_iflags(BTRFS_I(inode), BTRFS_I(dir));
6363
6364 if (S_ISREG(inode->i_mode)) {
6365 if (btrfs_test_opt(fs_info, NODATASUM))
6366 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
6367 if (btrfs_test_opt(fs_info, NODATACOW))
6368 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW |
6369 BTRFS_INODE_NODATASUM;
6370 btrfs_update_inode_mapping_flags(BTRFS_I(inode));
6371 }
6372
6373 ret = btrfs_insert_inode_locked(inode);
6374 if (ret < 0) {
6375 if (!args->orphan)
6376 BTRFS_I(dir)->index_cnt--;
6377 goto out;
6378 }
6379
6380 /*
6381 * We could have gotten an inode number from somebody who was fsynced
6382 * and then removed in this same transaction, so let's just set full
6383 * sync since it will be a full sync anyway and this will blow away the
6384 * old info in the log.
6385 */
6386 btrfs_set_inode_full_sync(BTRFS_I(inode));
6387
6388 key[0].objectid = objectid;
6389 key[0].type = BTRFS_INODE_ITEM_KEY;
6390 key[0].offset = 0;
6391
6392 sizes[0] = sizeof(struct btrfs_inode_item);
6393
6394 if (!args->orphan) {
6395 /*
6396 * Start new inodes with an inode_ref. This is slightly more
6397 * efficient for small numbers of hard links since they will
6398 * be packed into one item. Extended refs will kick in if we
6399 * add more hard links than can fit in the ref item.
6400 */
6401 key[1].objectid = objectid;
6402 key[1].type = BTRFS_INODE_REF_KEY;
6403 if (args->subvol) {
6404 key[1].offset = objectid;
6405 sizes[1] = 2 + sizeof(*ref);
6406 } else {
6407 key[1].offset = btrfs_ino(BTRFS_I(dir));
6408 sizes[1] = name->len + sizeof(*ref);
6409 }
6410 }
6411
6412 batch.keys = &key[0];
6413 batch.data_sizes = &sizes[0];
6414 batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]);
6415 batch.nr = args->orphan ? 1 : 2;
6416 ret = btrfs_insert_empty_items(trans, root, path, &batch);
6417 if (ret != 0) {
6418 btrfs_abort_transaction(trans, ret);
6419 goto discard;
6420 }
6421
6422 ts = simple_inode_init_ts(inode);
6423 BTRFS_I(inode)->i_otime_sec = ts.tv_sec;
6424 BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec;
6425
6426 /*
6427 * We're going to fill the inode item now, so at this point the inode
6428 * must be fully initialized.
6429 */
6430
6431 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
6432 struct btrfs_inode_item);
6433 memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item,
6434 sizeof(*inode_item));
6435 fill_inode_item(trans, path->nodes[0], inode_item, inode);
6436
6437 if (!args->orphan) {
6438 ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
6439 struct btrfs_inode_ref);
6440 ptr = (unsigned long)(ref + 1);
6441 if (args->subvol) {
6442 btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2);
6443 btrfs_set_inode_ref_index(path->nodes[0], ref, 0);
6444 write_extent_buffer(path->nodes[0], "..", ptr, 2);
6445 } else {
6446 btrfs_set_inode_ref_name_len(path->nodes[0], ref,
6447 name->len);
6448 btrfs_set_inode_ref_index(path->nodes[0], ref,
6449 BTRFS_I(inode)->dir_index);
6450 write_extent_buffer(path->nodes[0], name->name, ptr,
6451 name->len);
6452 }
6453 }
6454
6455 /*
6456 * We don't need the path anymore, plus inheriting properties, adding
6457 * ACLs, security xattrs, orphan item or adding the link, will result in
6458 * allocating yet another path. So just free our path.
6459 */
6460 btrfs_free_path(path);
6461 path = NULL;
6462
6463 if (args->subvol) {
6464 struct btrfs_inode *parent;
6465
6466 /*
6467 * Subvolumes inherit properties from their parent subvolume,
6468 * not the directory they were created in.
6469 */
6470 parent = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, BTRFS_I(dir)->root);
6471 if (IS_ERR(parent)) {
6472 ret = PTR_ERR(parent);
6473 } else {
6474 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6475 parent);
6476 iput(&parent->vfs_inode);
6477 }
6478 } else {
6479 ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode),
6480 BTRFS_I(dir));
6481 }
6482 if (ret) {
6483 btrfs_err(fs_info,
6484 "error inheriting props for ino %llu (root %llu): %d",
6485 btrfs_ino(BTRFS_I(inode)), btrfs_root_id(root), ret);
6486 }
6487
6488 /*
6489 * Subvolumes don't inherit ACLs or get passed to the LSM. This is
6490 * probably a bug.
6491 */
6492 if (!args->subvol) {
6493 ret = btrfs_init_inode_security(trans, args);
6494 if (ret) {
6495 btrfs_abort_transaction(trans, ret);
6496 goto discard;
6497 }
6498 }
6499
6500 ret = btrfs_add_inode_to_root(BTRFS_I(inode), false);
6501 if (WARN_ON(ret)) {
6502 /* Shouldn't happen, we used xa_reserve() before. */
6503 btrfs_abort_transaction(trans, ret);
6504 goto discard;
6505 }
6506
6507 trace_btrfs_inode_new(inode);
6508 btrfs_set_inode_last_trans(trans, BTRFS_I(inode));
6509
6510 btrfs_update_root_times(trans, root);
6511
6512 if (args->orphan) {
6513 ret = btrfs_orphan_add(trans, BTRFS_I(inode));
6514 } else {
6515 ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name,
6516 0, BTRFS_I(inode)->dir_index);
6517 }
6518 if (ret) {
6519 btrfs_abort_transaction(trans, ret);
6520 goto discard;
6521 }
6522
6523 return 0;
6524
6525 discard:
6526 /*
6527 * discard_new_inode() calls iput(), but the caller owns the reference
6528 * to the inode.
6529 */
6530 ihold(inode);
6531 discard_new_inode(inode);
6532 out:
6533 if (xa_reserved)
6534 xa_release(&root->inodes, objectid);
6535
6536 btrfs_free_path(path);
6537 return ret;
6538 }
6539
6540 /*
6541 * Utility function to add 'inode' into 'parent_inode' with
6542 * a given name and a given sequence number.
6543 * If 'add_backref' is true, also insert a backref from the
6544 * inode to the parent directory.
6545 */
6546 int btrfs_add_link(struct btrfs_trans_handle *trans,
6547 struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
6548 const struct fscrypt_str *name, int add_backref, u64 index)
6549 {
6550 int ret = 0;
6551 struct btrfs_key key;
6552 struct btrfs_root *root = parent_inode->root;
6553 u64 ino = btrfs_ino(inode);
6554 u64 parent_ino = btrfs_ino(parent_inode);
6555
6556 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6557 memcpy(&key, &inode->root->root_key, sizeof(key));
6558 } else {
6559 key.objectid = ino;
6560 key.type = BTRFS_INODE_ITEM_KEY;
6561 key.offset = 0;
6562 }
6563
6564 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6565 ret = btrfs_add_root_ref(trans, key.objectid,
6566 btrfs_root_id(root), parent_ino,
6567 index, name);
6568 } else if (add_backref) {
6569 ret = btrfs_insert_inode_ref(trans, root, name,
6570 ino, parent_ino, index);
6571 }
6572
6573 /* Nothing to clean up yet */
6574 if (ret)
6575 return ret;
6576
6577 ret = btrfs_insert_dir_item(trans, name, parent_inode, &key,
6578 btrfs_inode_type(inode), index);
6579 if (ret == -EEXIST || ret == -EOVERFLOW)
6580 goto fail_dir_item;
6581 else if (ret) {
6582 btrfs_abort_transaction(trans, ret);
6583 return ret;
6584 }
6585
6586 btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
6587 name->len * 2);
6588 inode_inc_iversion(&parent_inode->vfs_inode);
6589 /*
6590 * If we are replaying a log tree, we do not want to update the mtime
6591 * and ctime of the parent directory with the current time, since the
6592 * log replay procedure is responsible for setting them to their correct
6593 * values (the ones it had when the fsync was done).
6594 */
6595 if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
6596 inode_set_mtime_to_ts(&parent_inode->vfs_inode,
6597 inode_set_ctime_current(&parent_inode->vfs_inode));
6598
6599 ret = btrfs_update_inode(trans, parent_inode);
6600 if (ret)
6601 btrfs_abort_transaction(trans, ret);
6602 return ret;
6603
6604 fail_dir_item:
6605 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
6606 u64 local_index;
6607 int err;
6608 err = btrfs_del_root_ref(trans, key.objectid,
6609 btrfs_root_id(root), parent_ino,
6610 &local_index, name);
6611 if (err)
6612 btrfs_abort_transaction(trans, err);
6613 } else if (add_backref) {
6614 u64 local_index;
6615 int err;
6616
6617 err = btrfs_del_inode_ref(trans, root, name, ino, parent_ino,
6618 &local_index);
6619 if (err)
6620 btrfs_abort_transaction(trans, err);
6621 }
6622
6623 /* Return the original error code */
6624 return ret;
6625 }
6626
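/*
 * Shared tail of mknod/create/mkdir: prepare the new inode args, start a
 * transaction sized by btrfs_new_inode_prepare(), create the inode and
 * instantiate the dentry.
 */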
6627 static int btrfs_create_common(struct inode *dir, struct dentry *dentry,
6628 struct inode *inode)
6629 {
6630 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
6631 struct btrfs_root *root = BTRFS_I(dir)->root;
6632 struct btrfs_new_inode_args new_inode_args = {
6633 .dir = dir,
6634 .dentry = dentry,
6635 .inode = inode,
6636 };
6637 unsigned int trans_num_items;
6638 struct btrfs_trans_handle *trans;
6639 int err;
6640
6641 err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
6642 if (err)
6643 goto out_inode;
6644
6645 trans = btrfs_start_transaction(root, trans_num_items);
6646 if (IS_ERR(trans)) {
6647 err = PTR_ERR(trans);
6648 goto out_new_inode_args;
6649 }
6650
6651 err = btrfs_create_new_inode(trans, &new_inode_args);
6652 if (!err)
6653 d_instantiate_new(dentry, inode);
6654
6655 btrfs_end_transaction(trans);
6656 btrfs_btree_balance_dirty(fs_info);
6657 out_new_inode_args:
6658 btrfs_new_inode_args_destroy(&new_inode_args);
6659 out_inode:
6660 if (err)
6661 iput(inode);
6662 return err;
6663 }
6664
6665 static int btrfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
6666 struct dentry *dentry, umode_t mode, dev_t rdev)
6667 {
6668 struct inode *inode;
6669
6670 inode = new_inode(dir->i_sb);
6671 if (!inode)
6672 return -ENOMEM;
6673 inode_init_owner(idmap, inode, dir, mode);
6674 inode->i_op = &btrfs_special_inode_operations;
6675 init_special_inode(inode, inode->i_mode, rdev);
6676 return btrfs_create_common(dir, dentry, inode);
6677 }
6678
6679 static int btrfs_create(struct mnt_idmap *idmap, struct inode *dir,
6680 struct dentry *dentry, umode_t mode, bool excl)
6681 {
6682 struct inode *inode;
6683
6684 inode = new_inode(dir->i_sb);
6685 if (!inode)
6686 return -ENOMEM;
6687 inode_init_owner(idmap, inode, dir, mode);
6688 inode->i_fop = &btrfs_file_operations;
6689 inode->i_op = &btrfs_file_inode_operations;
6690 inode->i_mapping->a_ops = &btrfs_aops;
6691 return btrfs_create_common(dir, dentry, inode);
6692 }
6693
6694 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
6695 struct dentry *dentry)
6696 {
6697 struct btrfs_trans_handle *trans = NULL;
6698 struct btrfs_root *root = BTRFS_I(dir)->root;
6699 struct inode *inode = d_inode(old_dentry);
6700 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
6701 struct fscrypt_name fname;
6702 u64 index;
6703 int err;
6704 int drop_inode = 0;
6705
6706 /* do not allow sys_link's with other subvols of the same device */
6707 if (btrfs_root_id(root) != btrfs_root_id(BTRFS_I(inode)->root))
6708 return -EXDEV;
6709
6710 if (inode->i_nlink >= BTRFS_LINK_MAX)
6711 return -EMLINK;
6712
6713 err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &fname);
6714 if (err)
6715 goto fail;
6716
6717 err = btrfs_set_inode_index(BTRFS_I(dir), &index);
6718 if (err)
6719 goto fail;
6720
6721 /*
6722 * 2 items for inode and inode ref
6723 * 2 items for dir items
6724 * 1 item for parent inode
6725 * 1 item for orphan item deletion if O_TMPFILE
6726 */
6727 trans = btrfs_start_transaction(root, inode->i_nlink ? 5 : 6);
6728 if (IS_ERR(trans)) {
6729 err = PTR_ERR(trans);
6730 trans = NULL;
6731 goto fail;
6732 }
6733
6734 /* There are several dir indexes for this inode, clear the cache. */
6735 BTRFS_I(inode)->dir_index = 0ULL;
6736 inc_nlink(inode);
6737 inode_inc_iversion(inode);
6738 inode_set_ctime_current(inode);
6739 ihold(inode);
6740 set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags);
6741
6742 err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode),
6743 &fname.disk_name, 1, index);
6744
6745 if (err) {
6746 drop_inode = 1;
6747 } else {
6748 struct dentry *parent = dentry->d_parent;
6749
6750 err = btrfs_update_inode(trans, BTRFS_I(inode));
6751 if (err)
6752 goto fail;
6753 if (inode->i_nlink == 1) {
6754 /*
6755 * If new hard link count is 1, it's a file created
6756 * with open(2) O_TMPFILE flag.
6757 */
6758 err = btrfs_orphan_del(trans, BTRFS_I(inode));
6759 if (err)
6760 goto fail;
6761 }
6762 d_instantiate(dentry, inode);
6763 btrfs_log_new_name(trans, old_dentry, NULL, 0, parent);
6764 }
6765
6766 fail:
6767 fscrypt_free_filename(&fname);
6768 if (trans)
6769 btrfs_end_transaction(trans);
6770 if (drop_inode) {
6771 inode_dec_link_count(inode);
6772 iput(inode);
6773 }
6774 btrfs_btree_balance_dirty(fs_info);
6775 return err;
6776 }
6777
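/*
 * mkdir(2) entry point: same pattern as file creation, but with the directory
 * inode and file operations, delegating the transactional work to
 * btrfs_create_common().
 */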
6778 static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
6779 struct dentry *dentry, umode_t mode)
6780 {
6781 struct inode *inode;
6782
6783 inode = new_inode(dir->i_sb);
6784 if (!inode)
6785 return ERR_PTR(-ENOMEM);
6786 inode_init_owner(idmap, inode, dir, S_IFDIR | mode);
6787 inode->i_op = &btrfs_dir_inode_operations;
6788 inode->i_fop = &btrfs_dir_file_operations;
6789 return ERR_PTR(btrfs_create_common(dir, dentry, inode));
6790 }
6791
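/*
 * Read a compressed inline extent from the leaf into a temporary buffer,
 * decompress it into @folio and zero any tail between the end of the
 * decompressed data and the end of the block.
 */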
6792 static noinline int uncompress_inline(struct btrfs_path *path,
6793 struct folio *folio,
6794 struct btrfs_file_extent_item *item)
6795 {
6796 int ret;
6797 struct extent_buffer *leaf = path->nodes[0];
6798 const u32 blocksize = leaf->fs_info->sectorsize;
6799 char *tmp;
6800 size_t max_size;
6801 unsigned long inline_size;
6802 unsigned long ptr;
6803 int compress_type;
6804
6805 compress_type = btrfs_file_extent_compression(leaf, item);
6806 max_size = btrfs_file_extent_ram_bytes(leaf, item);
6807 inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]);
6808 tmp = kmalloc(inline_size, GFP_NOFS);
6809 if (!tmp)
6810 return -ENOMEM;
6811 ptr = btrfs_file_extent_inline_start(item);
6812
6813 read_extent_buffer(leaf, tmp, ptr, inline_size);
6814
6815 max_size = min_t(unsigned long, blocksize, max_size);
6816 ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size,
6817 max_size);
6818
6819 /*
6820 * The decompression code contains a memset to fill in any space between the end
6821 * of the uncompressed data and the end of max_size in case the decompressed
6822 * data ends up shorter than ram_bytes. That doesn't cover the hole between
6823 * the end of an inline extent and the beginning of the next block, so we
6824 * cover that region here.
6825 */
6826
6827 if (max_size < blocksize)
6828 folio_zero_range(folio, max_size, blocksize - max_size);
6829 kfree(tmp);
6830 return ret;
6831 }
6832
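/*
 * Copy (or decompress) an inline file extent into @folio. The folio must be
 * the first one of the inode, as inline extents only exist at file offset 0.
 */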
6833 static int read_inline_extent(struct btrfs_path *path, struct folio *folio)
6834 {
6835 const u32 blocksize = path->nodes[0]->fs_info->sectorsize;
6836 struct btrfs_file_extent_item *fi;
6837 void *kaddr;
6838 size_t copy_size;
6839
6840 if (!folio || folio_test_uptodate(folio))
6841 return 0;
6842
6843 ASSERT(folio_pos(folio) == 0);
6844
6845 fi = btrfs_item_ptr(path->nodes[0], path->slots[0],
6846 struct btrfs_file_extent_item);
6847 if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE)
6848 return uncompress_inline(path, folio, fi);
6849
6850 copy_size = min_t(u64, blocksize,
6851 btrfs_file_extent_ram_bytes(path->nodes[0], fi));
6852 kaddr = kmap_local_folio(folio, 0);
6853 read_extent_buffer(path->nodes[0], kaddr,
6854 btrfs_file_extent_inline_start(fi), copy_size);
6855 kunmap_local(kaddr);
6856 if (copy_size < blocksize)
6857 folio_zero_range(folio, copy_size, blocksize - copy_size);
6858 return 0;
6859 }
6860
6861 /*
6862 * Lookup the first extent overlapping a range in a file.
6863 *
6864 * @inode: file to search in
6865 * @folio: folio to read extent data into if the extent is inline
6866 * @start: file offset
6867 * @len: length of range starting at @start
6868 *
6869 * Return the first &struct extent_map which overlaps the given range, reading
6870 * it from the B-tree and caching it if necessary. Note that there may be more
6871 * extents which overlap the given range after the returned extent_map.
6872 *
6873 * If @folio is not NULL and the extent is inline, this also reads the extent
6874 * data directly into the folio and marks the extent up to date in the io_tree.
6875 *
6876 * Return: ERR_PTR on error, non-NULL extent_map on success.
6877 */
6878 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
6879 struct folio *folio, u64 start, u64 len)
6880 {
6881 struct btrfs_fs_info *fs_info = inode->root->fs_info;
6882 int ret = 0;
6883 u64 extent_start = 0;
6884 u64 extent_end = 0;
6885 u64 objectid = btrfs_ino(inode);
6886 int extent_type = -1;
6887 struct btrfs_path *path = NULL;
6888 struct btrfs_root *root = inode->root;
6889 struct btrfs_file_extent_item *item;
6890 struct extent_buffer *leaf;
6891 struct btrfs_key found_key;
6892 struct extent_map *em = NULL;
6893 struct extent_map_tree *em_tree = &inode->extent_tree;
6894
6895 read_lock(&em_tree->lock);
6896 em = lookup_extent_mapping(em_tree, start, len);
6897 read_unlock(&em_tree->lock);
6898
6899 if (em) {
6900 if (em->start > start || em->start + em->len <= start)
6901 free_extent_map(em);
6902 else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio)
6903 free_extent_map(em);
6904 else
6905 goto out;
6906 }
6907 em = alloc_extent_map();
6908 if (!em) {
6909 ret = -ENOMEM;
6910 goto out;
6911 }
6912 em->start = EXTENT_MAP_HOLE;
6913 em->disk_bytenr = EXTENT_MAP_HOLE;
6914 em->len = (u64)-1;
6915
6916 path = btrfs_alloc_path();
6917 if (!path) {
6918 ret = -ENOMEM;
6919 goto out;
6920 }
6921
6922 /* Chances are we'll be called again, so go ahead and do readahead */
6923 path->reada = READA_FORWARD;
6924
6925 /*
6926 * The same explanation in load_free_space_cache applies here as well,
6927 * we only read when we're loading the free space cache, and at that
6928 * point the commit_root has everything we need.
6929 */
6930 if (btrfs_is_free_space_inode(inode)) {
6931 path->search_commit_root = 1;
6932 path->skip_locking = 1;
6933 }
6934
6935 ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
6936 if (ret < 0) {
6937 goto out;
6938 } else if (ret > 0) {
6939 if (path->slots[0] == 0)
6940 goto not_found;
6941 path->slots[0]--;
6942 ret = 0;
6943 }
6944
6945 leaf = path->nodes[0];
6946 item = btrfs_item_ptr(leaf, path->slots[0],
6947 struct btrfs_file_extent_item);
6948 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6949 if (found_key.objectid != objectid ||
6950 found_key.type != BTRFS_EXTENT_DATA_KEY) {
6951 /*
6952 * If we back up past the first extent we want to move forward
6953 * and see if there is an extent in front of us, otherwise we'll
6954 * say there is a hole for our whole search range which can
6955 * cause problems.
6956 */
6957 extent_end = start;
6958 goto next;
6959 }
6960
6961 extent_type = btrfs_file_extent_type(leaf, item);
6962 extent_start = found_key.offset;
6963 extent_end = btrfs_file_extent_end(path);
6964 if (extent_type == BTRFS_FILE_EXTENT_REG ||
6965 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
6966 /* Only regular file could have regular/prealloc extent */
6967 if (!S_ISREG(inode->vfs_inode.i_mode)) {
6968 ret = -EUCLEAN;
6969 btrfs_crit(fs_info,
6970 "regular/prealloc extent found for non-regular inode %llu",
6971 btrfs_ino(inode));
6972 goto out;
6973 }
6974 trace_btrfs_get_extent_show_fi_regular(inode, leaf, item,
6975 extent_start);
6976 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
6977 trace_btrfs_get_extent_show_fi_inline(inode, leaf, item,
6978 path->slots[0],
6979 extent_start);
6980 }
6981 next:
6982 if (start >= extent_end) {
6983 path->slots[0]++;
6984 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
6985 ret = btrfs_next_leaf(root, path);
6986 if (ret < 0)
6987 goto out;
6988 else if (ret > 0)
6989 goto not_found;
6990
6991 leaf = path->nodes[0];
6992 }
6993 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6994 if (found_key.objectid != objectid ||
6995 found_key.type != BTRFS_EXTENT_DATA_KEY)
6996 goto not_found;
6997 if (start + len <= found_key.offset)
6998 goto not_found;
6999 if (start > found_key.offset)
7000 goto next;
7001
7002 /* New extent overlaps with existing one */
7003 em->start = start;
7004 em->len = found_key.offset - start;
7005 em->disk_bytenr = EXTENT_MAP_HOLE;
7006 goto insert;
7007 }
7008
7009 btrfs_extent_item_to_extent_map(inode, path, item, em);
7010
7011 if (extent_type == BTRFS_FILE_EXTENT_REG ||
7012 extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
7013 goto insert;
7014 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
7015 /*
7016 * Inline extent can only exist at file offset 0. This is
7017 * ensured by tree-checker and inline extent creation path.
7018 * Thus all members representing file offsets should be zero.
7019 */
7020 ASSERT(extent_start == 0);
7021 ASSERT(em->start == 0);
7022
7023 /*
7024 * btrfs_extent_item_to_extent_map() should have properly
7025 * initialized em members already.
7026 *
7027 * Other members are not utilized for inline extents.
7028 */
7029 ASSERT(em->disk_bytenr == EXTENT_MAP_INLINE);
7030 ASSERT(em->len == fs_info->sectorsize);
7031
7032 ret = read_inline_extent(path, folio);
7033 if (ret < 0)
7034 goto out;
7035 goto insert;
7036 }
7037 not_found:
7038 em->start = start;
7039 em->len = len;
7040 em->disk_bytenr = EXTENT_MAP_HOLE;
7041 insert:
7042 ret = 0;
7043 btrfs_release_path(path);
7044 if (em->start > start || extent_map_end(em) <= start) {
7045 btrfs_err(fs_info,
7046 "bad extent! em: [%llu %llu] passed [%llu %llu]",
7047 em->start, em->len, start, len);
7048 ret = -EIO;
7049 goto out;
7050 }
7051
7052 write_lock(&em_tree->lock);
7053 ret = btrfs_add_extent_mapping(inode, &em, start, len);
7054 write_unlock(&em_tree->lock);
7055 out:
7056 btrfs_free_path(path);
7057
7058 trace_btrfs_get_extent(root, inode, em);
7059
7060 if (ret) {
7061 free_extent_map(em);
7062 return ERR_PTR(ret);
7063 }
7064 return em;
7065 }
7066
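/*
 * Return true if the block group containing @bytenr is read-only (or cannot
 * be found), in which case we must not NOCOW into it.
 */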
7067 static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
7068 {
7069 struct btrfs_block_group *block_group;
7070 bool readonly = false;
7071
7072 block_group = btrfs_lookup_block_group(fs_info, bytenr);
7073 if (!block_group || block_group->ro)
7074 readonly = true;
7075 if (block_group)
7076 btrfs_put_block_group(block_group);
7077 return readonly;
7078 }
7079
7080 /*
7081 * Check if we can do nocow write into the range [@offset, @offset + @len)
7082 *
7083 * @offset: File offset
7084 * @len: The length to write, will be updated to the nocow writeable
7085 * range
7086 * @file_extent: (optional) Return the file extent (disk_bytenr, offset,
7087 *               num_bytes, ram_bytes, compression) that allows the nocow write
7088 * @nowait:      Do not block when searching for the file extent (fail instead)
7089 *
7090 * Return:
7091 * >0 and update @len if we can do nocow write
7092 * 0 if we can't do nocow write
7093 * <0 if error happened
7094 *
7095 * NOTE: This only checks the file extents; the caller is responsible for
7096 * waiting for any ordered extents.
7097 */
7098 noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len,
7099 struct btrfs_file_extent *file_extent,
7100 bool nowait)
7101 {
7102 struct btrfs_root *root = inode->root;
7103 struct btrfs_fs_info *fs_info = root->fs_info;
7104 struct can_nocow_file_extent_args nocow_args = { 0 };
7105 struct btrfs_path *path;
7106 int ret;
7107 struct extent_buffer *leaf;
7108 struct extent_io_tree *io_tree = &inode->io_tree;
7109 struct btrfs_file_extent_item *fi;
7110 struct btrfs_key key;
7111 int found_type;
7112
7113 path = btrfs_alloc_path();
7114 if (!path)
7115 return -ENOMEM;
7116 path->nowait = nowait;
7117
7118 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
7119 offset, 0);
7120 if (ret < 0)
7121 goto out;
7122
7123 if (ret == 1) {
7124 if (path->slots[0] == 0) {
7125 /* can't find the item, must cow */
7126 ret = 0;
7127 goto out;
7128 }
7129 path->slots[0]--;
7130 }
7131 ret = 0;
7132 leaf = path->nodes[0];
7133 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
7134 if (key.objectid != btrfs_ino(inode) ||
7135 key.type != BTRFS_EXTENT_DATA_KEY) {
7136 /* not our file or wrong item type, must cow */
7137 goto out;
7138 }
7139
7140 if (key.offset > offset) {
7141 /* Wrong offset, must cow */
7142 goto out;
7143 }
7144
7145 if (btrfs_file_extent_end(path) <= offset)
7146 goto out;
7147
7148 fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
7149 found_type = btrfs_file_extent_type(leaf, fi);
7150
7151 nocow_args.start = offset;
7152 nocow_args.end = offset + *len - 1;
7153 nocow_args.free_path = true;
7154
7155 ret = can_nocow_file_extent(path, &key, inode, &nocow_args);
7156 /* can_nocow_file_extent() has freed the path. */
7157 path = NULL;
7158
7159 if (ret != 1) {
7160 /* Treat errors as not being able to NOCOW. */
7161 ret = 0;
7162 goto out;
7163 }
7164
7165 ret = 0;
7166 if (btrfs_extent_readonly(fs_info,
7167 nocow_args.file_extent.disk_bytenr +
7168 nocow_args.file_extent.offset))
7169 goto out;
7170
7171 if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
7172 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
7173 u64 range_end;
7174
7175 range_end = round_up(offset + nocow_args.file_extent.num_bytes,
7176 root->fs_info->sectorsize) - 1;
7177 ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC);
7178 if (ret) {
7179 ret = -EAGAIN;
7180 goto out;
7181 }
7182 }
7183
7184 if (file_extent)
7185 memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent));
7186
7187 *len = nocow_args.file_extent.num_bytes;
7188 ret = 1;
7189 out:
7190 btrfs_free_path(path);
7191 return ret;
7192 }
7193
7194 /* The callers of this must take lock_extent() */
7195 struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start,
7196 const struct btrfs_file_extent *file_extent,
7197 int type)
7198 {
7199 struct extent_map *em;
7200 int ret;
7201
7202 /*
7203 * Note the missing NOCOW type.
7204 *
7205 * For pure NOCOW writes, we should not create an io extent map, but
7206 * just reuse the existing one.
7207 * Only PREALLOC writes (NOCOW write into preallocated range) can
7208 * create an io extent map.
7209 */
7210 ASSERT(type == BTRFS_ORDERED_PREALLOC ||
7211 type == BTRFS_ORDERED_COMPRESSED ||
7212 type == BTRFS_ORDERED_REGULAR);
7213
7214 switch (type) {
7215 case BTRFS_ORDERED_PREALLOC:
7216 /* We're only referring to part of a larger preallocated extent. */
7217 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7218 break;
7219 case BTRFS_ORDERED_REGULAR:
7220 /* COW results in a new extent matching our file extent size. */
7221 ASSERT(file_extent->disk_num_bytes == file_extent->num_bytes);
7222 ASSERT(file_extent->ram_bytes == file_extent->num_bytes);
7223
7224 /* Since it's a new extent, we should not have any offset. */
7225 ASSERT(file_extent->offset == 0);
7226 break;
7227 case BTRFS_ORDERED_COMPRESSED:
7228 /* Must be compressed. */
7229 ASSERT(file_extent->compression != BTRFS_COMPRESS_NONE);
7230
7231 /*
7232 * An encoded write can make us refer to only part of the
7233 * uncompressed extent.
7234 */
7235 ASSERT(file_extent->num_bytes <= file_extent->ram_bytes);
7236 break;
7237 }
7238
7239 em = alloc_extent_map();
7240 if (!em)
7241 return ERR_PTR(-ENOMEM);
7242
7243 em->start = start;
7244 em->len = file_extent->num_bytes;
7245 em->disk_bytenr = file_extent->disk_bytenr;
7246 em->disk_num_bytes = file_extent->disk_num_bytes;
7247 em->ram_bytes = file_extent->ram_bytes;
7248 em->generation = -1;
7249 em->offset = file_extent->offset;
7250 em->flags |= EXTENT_FLAG_PINNED;
7251 if (type == BTRFS_ORDERED_COMPRESSED)
7252 extent_map_set_compression(em, file_extent->compression);
7253
7254 ret = btrfs_replace_extent_map_range(inode, em, true);
7255 if (ret) {
7256 free_extent_map(em);
7257 return ERR_PTR(ret);
7258 }
7259
7260 /* The em now has 2 refs, the caller needs to do free_extent_map() once. */
7261 return em;
7262 }
7263
7264 /*
7265 * For release_folio() and invalidate_folio() we have a race window where
7266 * folio_end_writeback() is called but the subpage spinlock is not yet released.
7267 * If we continue to release/invalidate the folio, we could cause a
7268 * use-after-free on the subpage spinlock. So this function spins and waits
7269 * for the subpage spinlock to be released.
7270 */
7271 static void wait_subpage_spinlock(struct folio *folio)
7272 {
7273 struct btrfs_fs_info *fs_info = folio_to_fs_info(folio);
7274 struct btrfs_subpage *subpage;
7275
7276 if (!btrfs_is_subpage(fs_info, folio))
7277 return;
7278
7279 ASSERT(folio_test_private(folio) && folio_get_private(folio));
7280 subpage = folio_get_private(folio);
7281
7282 /*
7283 * This may look insane as we just acquire the spinlock and release it,
7284 * without doing anything. But we just want to make sure no one is
7285 * still holding the subpage spinlock.
7286 * And since the folio is neither dirty nor under writeback, and we have
7287 * the folio locked, the only possible way to hold the spinlock is from the
7288 * endio function clearing the folio writeback flag.
7289 *
7290 * Here we just acquire the spinlock so that all existing callers
7291 * should exit and we're safe to release/invalidate the page.
7292 */
7293 spin_lock_irq(&subpage->lock);
7294 spin_unlock_irq(&subpage->lock);
7295 }
7296
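/*
 * ->launder_folio callback, invoked for a dirty folio that is about to be
 * invalidated (e.g. by invalidate_inode_pages2()). Release any qgroup
 * reserved data space for the folio's range.
 */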
7297 static int btrfs_launder_folio(struct folio *folio)
7298 {
7299 return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio),
7300 folio_size(folio), NULL);
7301 }
7302
7303 static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7304 {
7305 if (try_release_extent_mapping(folio, gfp_flags)) {
7306 wait_subpage_spinlock(folio);
7307 clear_folio_extent_mapped(folio);
7308 return true;
7309 }
7310 return false;
7311 }
7312
7313 static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
7314 {
7315 if (folio_test_writeback(folio) || folio_test_dirty(folio))
7316 return false;
7317 return __btrfs_release_folio(folio, gfp_flags);
7318 }
7319
7320 #ifdef CONFIG_MIGRATION
7321 static int btrfs_migrate_folio(struct address_space *mapping,
7322 struct folio *dst, struct folio *src,
7323 enum migrate_mode mode)
7324 {
7325 int ret = filemap_migrate_folio(mapping, dst, src, mode);
7326
7327 if (ret != MIGRATEPAGE_SUCCESS)
7328 return ret;
7329
7330 if (folio_test_ordered(src)) {
7331 folio_clear_ordered(src);
7332 folio_set_ordered(dst);
7333 }
7334
7335 return MIGRATEPAGE_SUCCESS;
7336 }
7337 #else
7338 #define btrfs_migrate_folio NULL
7339 #endif
7340
7341 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
7342 size_t length)
7343 {
7344 struct btrfs_inode *inode = folio_to_inode(folio);
7345 struct btrfs_fs_info *fs_info = inode->root->fs_info;
7346 struct extent_io_tree *tree = &inode->io_tree;
7347 struct extent_state *cached_state = NULL;
7348 u64 page_start = folio_pos(folio);
7349 u64 page_end = page_start + folio_size(folio) - 1;
7350 u64 cur;
7351 int inode_evicting = inode->vfs_inode.i_state & I_FREEING;
7352
7353 /*
7354 * We have the folio locked, so no new ordered extent can be created on
7355 * it, nor can any bio be submitted for this folio.
7356 *
7357 * But an already submitted bio can still be finished on this folio.
7358 * Furthermore, the endio function won't skip a folio that already has
7359 * Ordered cleared, so it's possible for endio and
7360 * invalidate_folio to do the same ordered extent accounting twice
7361 * on one folio.
7362 *
7363 * So here we wait for any submitted bios to finish, so that we won't
7364 * do double ordered extent accounting on the same folio.
7365 */
7366 folio_wait_writeback(folio);
7367 wait_subpage_spinlock(folio);
7368
7369 /*
7370 * For the subpage case, we have call sites like
7371 * btrfs_punch_hole_lock_range() which pass a range that is not aligned
7372 * to the sectorsize.
7373 * If the range doesn't cover the full folio, we don't need to and
7374 * shouldn't clear the folio's extent mapped flag, as folio->private can
7375 * still record subpage dirty bits for other parts of the range.
7376 *
7377 * For cases that invalidate the full folio even though the range doesn't
7378 * cover the full folio, like invalidating the last folio, we're
7379 * still safe to wait for the ordered extent to finish.
7380 */
7381 if (!(offset == 0 && length == folio_size(folio))) {
7382 btrfs_release_folio(folio, GFP_NOFS);
7383 return;
7384 }
7385
7386 if (!inode_evicting)
7387 lock_extent(tree, page_start, page_end, &cached_state);
7388
7389 cur = page_start;
7390 while (cur < page_end) {
7391 struct btrfs_ordered_extent *ordered;
7392 u64 range_end;
7393 u32 range_len;
7394 u32 extra_flags = 0;
7395
7396 ordered = btrfs_lookup_first_ordered_range(inode, cur,
7397 page_end + 1 - cur);
7398 if (!ordered) {
7399 range_end = page_end;
7400 /*
7401 * No ordered extent covering this range, we are safe
7402 * to delete all extent states in the range.
7403 */
7404 extra_flags = EXTENT_CLEAR_ALL_BITS;
7405 goto next;
7406 }
7407 if (ordered->file_offset > cur) {
7408 /*
7409 * There is a range between [cur, oe->file_offset) not
7410 * covered by any ordered extent.
7411 * We are safe to delete all extent states, and handle
7412 * the ordered extent in the next iteration.
7413 */
7414 range_end = ordered->file_offset - 1;
7415 extra_flags = EXTENT_CLEAR_ALL_BITS;
7416 goto next;
7417 }
7418
7419 range_end = min(ordered->file_offset + ordered->num_bytes - 1,
7420 page_end);
7421 ASSERT(range_end + 1 - cur < U32_MAX);
7422 range_len = range_end + 1 - cur;
7423 if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) {
7424 /*
7425 * If Ordered is cleared, it means endio has
7426 * already been executed for the range.
7427 * We can't delete the extent states as
7428 * btrfs_finish_ordered_io() may still use some of them.
7429 */
7430 goto next;
7431 }
7432 btrfs_folio_clear_ordered(fs_info, folio, cur, range_len);
7433
7434 /*
7435 * IO on this page will never be started, so we need to account
7436 * for any ordered extents now. Don't clear EXTENT_DELALLOC_NEW
7437 * here, we must leave that to the ordered extent completion.
7438 *
7439 * This will also unlock the range for incoming
7440 * btrfs_finish_ordered_io().
7441 */
7442 if (!inode_evicting)
7443 clear_extent_bit(tree, cur, range_end,
7444 EXTENT_DELALLOC |
7445 EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
7446 EXTENT_DEFRAG, &cached_state);
7447
7448 spin_lock_irq(&inode->ordered_tree_lock);
7449 set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
7450 ordered->truncated_len = min(ordered->truncated_len,
7451 cur - ordered->file_offset);
7452 spin_unlock_irq(&inode->ordered_tree_lock);
7453
7454 /*
7455 * If the ordered extent has finished, we're safe to delete all
7456 * the extent states of the range, otherwise
7457 * btrfs_finish_ordered_io() will get executed by endio for
7458 * other pages, so we can't delete extent states.
7459 */
7460 if (btrfs_dec_test_ordered_pending(inode, &ordered,
7461 cur, range_end + 1 - cur)) {
7462 btrfs_finish_ordered_io(ordered);
7463 /*
7464 * The ordered extent has finished, now we're again
7465 * safe to delete all extent states of the range.
7466 */
7467 extra_flags = EXTENT_CLEAR_ALL_BITS;
7468 }
7469 next:
7470 if (ordered)
7471 btrfs_put_ordered_extent(ordered);
7472 /*
7473 * Qgroup reserved space handler
7474 * Sector(s) here will be either:
7475 *
7476 * 1) Already written to disk or bio already finished
7477 * Then its QGROUP_RESERVED bit in io_tree is already cleared.
7478 * Qgroup will be handled by its qgroup_record then.
7479 * btrfs_qgroup_free_data() call will do nothing here.
7480 *
7481 * 2) Not written to disk yet
7482 * Then btrfs_qgroup_free_data() call will clear the
7483 * QGROUP_RESERVED bit of its io_tree, and free the qgroup
7484 * reserved data space, since the IO will never happen for
7485 * this folio.
7486 */
7487 btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL);
7488 if (!inode_evicting) {
7489 clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED |
7490 EXTENT_DELALLOC | EXTENT_UPTODATE |
7491 EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG |
7492 extra_flags, &cached_state);
7493 }
7494 cur = range_end + 1;
7495 }
7496 /*
7497 * We have iterated through all ordered extents of the folio; the folio
7498 * should not have Ordered set anymore, otherwise the above iteration
7499 * did something wrong.
7500 */
7501 ASSERT(!folio_test_ordered(folio));
7502 btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio));
7503 if (!inode_evicting)
7504 __btrfs_release_folio(folio, GFP_NOFS);
7505 clear_folio_extent_mapped(folio);
7506 }
7507
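/*
 * Truncate the inode's items down to the current i_size. Waits for ordered
 * extents past the new size (unless @skip_writeback is set), then repeatedly
 * drops file extent items in small transactions, refilling a temporary block
 * reserve between iterations so the truncate cannot exhaust the transaction's
 * metadata reservation.
 */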
7508 static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback)
7509 {
7510 struct btrfs_truncate_control control = {
7511 .inode = inode,
7512 .ino = btrfs_ino(inode),
7513 .min_type = BTRFS_EXTENT_DATA_KEY,
7514 .clear_extent_range = true,
7515 };
7516 struct btrfs_root *root = inode->root;
7517 struct btrfs_fs_info *fs_info = root->fs_info;
7518 struct btrfs_block_rsv *rsv;
7519 int ret;
7520 struct btrfs_trans_handle *trans;
7521 u64 mask = fs_info->sectorsize - 1;
7522 const u64 min_size = btrfs_calc_metadata_size(fs_info, 1);
7523
7524 if (!skip_writeback) {
7525 ret = btrfs_wait_ordered_range(inode,
7526 inode->vfs_inode.i_size & (~mask),
7527 (u64)-1);
7528 if (ret)
7529 return ret;
7530 }
7531
7532 /*
7533 * Yes ladies and gentlemen, this is indeed ugly. We have a couple of
7534 * things going on here:
7535 *
7536 * 1) We need to reserve space to update our inode.
7537 *
7538 * 2) We need to have something to cache all the space that is going to
7539 * be freed up by the truncate operation, but also have some slack
7540 * space reserved in case it uses space during the truncate (thank you
7541 * very much snapshotting).
7542 *
7543 * And we need these to be separate. The fact is we can use a lot of
7544 * space doing the truncate, and we have no earthly idea how much space
7545 * we will use, so we need the truncate reservation to be separate so it
7546 * doesn't end up using space reserved for updating the inode. We also
7547 * need to be able to stop the transaction and start a new one, which
7548 * means we need to be able to update the inode several times, and we
7549 * have no way of knowing how many times that will be, so we can't just
7550 * reserve 1 item for the entirety of the operation, so that has to be
7551 * done separately as well.
7552 *
7553 * So that leaves us with
7554 *
7555 * 1) rsv - for the truncate reservation, which we will steal from the
7556 * transaction reservation.
7557 * 2) fs_info->trans_block_rsv - this will have 1 item's worth left for
7558 * updating the inode.
7559 */
7560 rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
7561 if (!rsv)
7562 return -ENOMEM;
7563 rsv->size = min_size;
7564 rsv->failfast = true;
7565
7566 /*
7567 * 1 for the truncate slack space
7568 * 1 for updating the inode.
7569 */
7570 trans = btrfs_start_transaction(root, 2);
7571 if (IS_ERR(trans)) {
7572 ret = PTR_ERR(trans);
7573 goto out;
7574 }
7575
7576 /* Migrate the slack space for the truncate to our reserve */
7577 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
7578 min_size, false);
7579 /*
7580 * We have reserved 2 metadata units when we started the transaction and
7581 * min_size matches 1 unit, so this should never fail, but if it does,
7582 * it's not critical, we just fail the truncation.
7583 */
7584 if (WARN_ON(ret)) {
7585 btrfs_end_transaction(trans);
7586 goto out;
7587 }
7588
7589 trans->block_rsv = rsv;
7590
7591 while (1) {
7592 struct extent_state *cached_state = NULL;
7593 const u64 new_size = inode->vfs_inode.i_size;
7594 const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize);
7595
7596 control.new_size = new_size;
7597 lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7598 /*
7599 * We want to drop from the next block forward in case this new
7600 * size is not block aligned since we will be keeping the last
7601 * block of the extent just the way it is.
7602 */
7603 btrfs_drop_extent_map_range(inode,
7604 ALIGN(new_size, fs_info->sectorsize),
7605 (u64)-1, false);
7606
7607 ret = btrfs_truncate_inode_items(trans, root, &control);
7608
7609 inode_sub_bytes(&inode->vfs_inode, control.sub_bytes);
7610 btrfs_inode_safe_disk_i_size_write(inode, control.last_size);
7611
7612 unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state);
7613
7614 trans->block_rsv = &fs_info->trans_block_rsv;
7615 if (ret != -ENOSPC && ret != -EAGAIN)
7616 break;
7617
7618 ret = btrfs_update_inode(trans, inode);
7619 if (ret)
7620 break;
7621
7622 btrfs_end_transaction(trans);
7623 btrfs_btree_balance_dirty(fs_info);
7624
7625 trans = btrfs_start_transaction(root, 2);
7626 if (IS_ERR(trans)) {
7627 ret = PTR_ERR(trans);
7628 trans = NULL;
7629 break;
7630 }
7631
7632 btrfs_block_rsv_release(fs_info, rsv, -1, NULL);
7633 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
7634 rsv, min_size, false);
7635 /*
7636 * We have reserved 2 metadata units when we started the
7637 * transaction and min_size matches 1 unit, so this should never
7638 * fail, but if it does, it's not critical, we just fail the truncation.
7639 */
7640 if (WARN_ON(ret))
7641 break;
7642
7643 trans->block_rsv = rsv;
7644 }
7645
7646 /*
7647 * We can't call btrfs_truncate_block() inside a trans handle as we could
7648 * deadlock with freeze. If we got BTRFS_NEED_TRUNCATE_BLOCK then we
7649 * know we've truncated everything except the last little bit, and can
7650 * do btrfs_truncate_block() and then update the disk_i_size.
7651 */
7652 if (ret == BTRFS_NEED_TRUNCATE_BLOCK) {
7653 btrfs_end_transaction(trans);
7654 btrfs_btree_balance_dirty(fs_info);
7655
7656 ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0);
7657 if (ret)
7658 goto out;
7659 trans = btrfs_start_transaction(root, 1);
7660 if (IS_ERR(trans)) {
7661 ret = PTR_ERR(trans);
7662 goto out;
7663 }
7664 btrfs_inode_safe_disk_i_size_write(inode, 0);
7665 }
7666
7667 if (trans) {
7668 int ret2;
7669
7670 trans->block_rsv = &fs_info->trans_block_rsv;
7671 ret2 = btrfs_update_inode(trans, inode);
7672 if (ret2 && !ret)
7673 ret = ret2;
7674
7675 ret2 = btrfs_end_transaction(trans);
7676 if (ret2 && !ret)
7677 ret = ret2;
7678 btrfs_btree_balance_dirty(fs_info);
7679 }
7680 out:
7681 btrfs_free_block_rsv(fs_info, rsv);
7682 /*
7683 * So if we truncate and then write and fsync we normally would just
7684 * write the extents that changed, which is a problem if we need to
7685 * first truncate that entire inode. So set this flag so we write out
7686 * all of the extents in the inode to the sync log so we're completely
7687 * safe.
7688 *
7689 * If no extents were dropped or trimmed we don't need to force the next
7690 * fsync to truncate all the inode's items from the log and re-log them
7691 * all. This means the truncate operation did not change the file size,
7692 * or changed it to a smaller size but there was only an implicit hole
7693 * between the old i_size and the new i_size, and there were no prealloc
7694 * extents beyond i_size to drop.
7695 */
7696 if (control.extents_found > 0)
7697 btrfs_set_inode_full_sync(inode);
7698
7699 return ret;
7700 }
7701
7702 struct inode *btrfs_new_subvol_inode(struct mnt_idmap *idmap,
7703 struct inode *dir)
7704 {
7705 struct inode *inode;
7706
7707 inode = new_inode(dir->i_sb);
7708 if (inode) {
7709 /*
7710 * Subvolumes don't inherit the sgid bit or the parent's gid if
7711 * the parent's sgid bit is set. This is probably a bug.
7712 */
7713 inode_init_owner(idmap, inode, NULL,
7714 S_IFDIR | (~current_umask() & S_IRWXUGO));
7715 inode->i_op = &btrfs_dir_inode_operations;
7716 inode->i_fop = &btrfs_dir_file_operations;
7717 }
7718 return inode;
7719 }
7720
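/*
 * ->alloc_inode callback: allocate a btrfs_inode from the inode slab cache
 * and initialize the in-memory only fields (extent and io trees, ordered
 * extent tree, locks and list heads). On-disk state is filled in later by
 * the inode read or create paths.
 */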
7721 struct inode *btrfs_alloc_inode(struct super_block *sb)
7722 {
7723 struct btrfs_fs_info *fs_info = btrfs_sb(sb);
7724 struct btrfs_inode *ei;
7725 struct inode *inode;
7726
7727 ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL);
7728 if (!ei)
7729 return NULL;
7730
7731 ei->root = NULL;
7732 ei->generation = 0;
7733 ei->last_trans = 0;
7734 ei->last_sub_trans = 0;
7735 ei->logged_trans = 0;
7736 ei->delalloc_bytes = 0;
7737 ei->new_delalloc_bytes = 0;
7738 ei->defrag_bytes = 0;
7739 ei->disk_i_size = 0;
7740 ei->flags = 0;
7741 ei->ro_flags = 0;
7742 /*
7743 * ->index_cnt will be properly initialized later when creating a new
7744 * inode (btrfs_create_new_inode()) or when reading an existing inode
7745 * from disk (btrfs_read_locked_inode()).
7746 */
7747 ei->csum_bytes = 0;
7748 ei->dir_index = 0;
7749 ei->last_unlink_trans = 0;
7750 ei->last_reflink_trans = 0;
7751 ei->last_log_commit = 0;
7752
7753 spin_lock_init(&ei->lock);
7754 ei->outstanding_extents = 0;
7755 if (sb->s_magic != BTRFS_TEST_MAGIC)
7756 btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
7757 BTRFS_BLOCK_RSV_DELALLOC);
7758 ei->runtime_flags = 0;
7759 ei->prop_compress = BTRFS_COMPRESS_NONE;
7760 ei->defrag_compress = BTRFS_COMPRESS_NONE;
7761
7762 ei->delayed_node = NULL;
7763
7764 ei->i_otime_sec = 0;
7765 ei->i_otime_nsec = 0;
7766
7767 inode = &ei->vfs_inode;
7768 extent_map_tree_init(&ei->extent_tree);
7769
7770 /* This io tree sets the valid inode. */
7771 extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO);
7772 ei->io_tree.inode = ei;
7773
7774 ei->file_extent_tree = NULL;
7775
7776 mutex_init(&ei->log_mutex);
7777 spin_lock_init(&ei->ordered_tree_lock);
7778 ei->ordered_tree = RB_ROOT;
7779 ei->ordered_tree_last = NULL;
7780 INIT_LIST_HEAD(&ei->delalloc_inodes);
7781 INIT_LIST_HEAD(&ei->delayed_iput);
7782 init_rwsem(&ei->i_mmap_lock);
7783
7784 return inode;
7785 }
7786
7787 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
7788 void btrfs_test_destroy_inode(struct inode *inode)
7789 {
7790 btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false);
7791 kfree(BTRFS_I(inode)->file_extent_tree);
7792 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7793 }
7794 #endif
7795
7796 void btrfs_free_inode(struct inode *inode)
7797 {
7798 kfree(BTRFS_I(inode)->file_extent_tree);
7799 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
7800 }
7801
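/*
 * Final per-inode teardown: warn about any leaked reservations, clean up
 * ordered extents that should not exist at this point, drop cached extent
 * maps and release the root reference taken when the inode was set up.
 */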
7802 void btrfs_destroy_inode(struct inode *vfs_inode)
7803 {
7804 struct btrfs_ordered_extent *ordered;
7805 struct btrfs_inode *inode = BTRFS_I(vfs_inode);
7806 struct btrfs_root *root = inode->root;
7807 bool freespace_inode;
7808
7809 WARN_ON(!hlist_empty(&vfs_inode->i_dentry));
7810 WARN_ON(vfs_inode->i_data.nrpages);
7811 WARN_ON(inode->block_rsv.reserved);
7812 WARN_ON(inode->block_rsv.size);
7813 WARN_ON(inode->outstanding_extents);
7814 if (!S_ISDIR(vfs_inode->i_mode)) {
7815 WARN_ON(inode->delalloc_bytes);
7816 WARN_ON(inode->new_delalloc_bytes);
7817 WARN_ON(inode->csum_bytes);
7818 }
7819 if (!root || !btrfs_is_data_reloc_root(root))
7820 WARN_ON(inode->defrag_bytes);
7821
7822 /*
7823 * This can happen when we create an inode, but somebody else also
7824 * created the same inode and we need to destroy the one we already
7825 * created.
7826 */
7827 if (!root)
7828 return;
7829
7830 /*
7831 * If this is a free space inode do not take the ordered extents lockdep
7832 * map.
7833 */
7834 freespace_inode = btrfs_is_free_space_inode(inode);
7835
7836 while (1) {
7837 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
7838 if (!ordered)
7839 break;
7840 else {
7841 btrfs_err(root->fs_info,
7842 "found ordered extent %llu %llu on inode cleanup",
7843 ordered->file_offset, ordered->num_bytes);
7844
7845 if (!freespace_inode)
7846 btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent);
7847
7848 btrfs_remove_ordered_extent(inode, ordered);
7849 btrfs_put_ordered_extent(ordered);
7850 btrfs_put_ordered_extent(ordered);
7851 }
7852 }
7853 btrfs_qgroup_check_reserved_leak(inode);
7854 btrfs_del_inode_from_root(inode);
7855 btrfs_drop_extent_map_range(inode, 0, (u64)-1, false);
7856 btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1);
7857 btrfs_put_root(inode->root);
7858 }
7859
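/*
 * ->drop_inode callback: tell the VFS to evict the inode immediately when it
 * has no root (it was never fully set up) or when its subvolume is being
 * deleted, otherwise fall back to the generic behaviour.
 */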
7860 int btrfs_drop_inode(struct inode *inode)
7861 {
7862 struct btrfs_root *root = BTRFS_I(inode)->root;
7863
7864 if (root == NULL)
7865 return 1;
7866
7867 /* The snapshot/subvolume tree is being deleted. */
7868 if (btrfs_root_refs(&root->root_item) == 0)
7869 return 1;
7870 else
7871 return generic_drop_inode(inode);
7872 }
7873
7874 static void init_once(void *foo)
7875 {
7876 struct btrfs_inode *ei = foo;
7877
7878 inode_init_once(&ei->vfs_inode);
7879 }
7880
7881 void __cold btrfs_destroy_cachep(void)
7882 {
7883 /*
7884 * Make sure all delayed rcu free inodes are flushed before we
7885 * destroy the cache.
7886 */
7887 rcu_barrier();
7888 kmem_cache_destroy(btrfs_inode_cachep);
7889 }
7890
7891 int __init btrfs_init_cachep(void)
7892 {
7893 btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
7894 sizeof(struct btrfs_inode), 0,
7895 SLAB_RECLAIM_ACCOUNT | SLAB_ACCOUNT,
7896 init_once);
7897 if (!btrfs_inode_cachep)
7898 return -ENOMEM;
7899
7900 return 0;
7901 }
7902
7903 static int btrfs_getattr(struct mnt_idmap *idmap,
7904 const struct path *path, struct kstat *stat,
7905 u32 request_mask, unsigned int flags)
7906 {
7907 u64 delalloc_bytes;
7908 u64 inode_bytes;
7909 struct inode *inode = d_inode(path->dentry);
7910 u32 blocksize = btrfs_sb(inode->i_sb)->sectorsize;
7911 u32 bi_flags = BTRFS_I(inode)->flags;
7912 u32 bi_ro_flags = BTRFS_I(inode)->ro_flags;
7913
7914 stat->result_mask |= STATX_BTIME;
7915 stat->btime.tv_sec = BTRFS_I(inode)->i_otime_sec;
7916 stat->btime.tv_nsec = BTRFS_I(inode)->i_otime_nsec;
7917 if (bi_flags & BTRFS_INODE_APPEND)
7918 stat->attributes |= STATX_ATTR_APPEND;
7919 if (bi_flags & BTRFS_INODE_COMPRESS)
7920 stat->attributes |= STATX_ATTR_COMPRESSED;
7921 if (bi_flags & BTRFS_INODE_IMMUTABLE)
7922 stat->attributes |= STATX_ATTR_IMMUTABLE;
7923 if (bi_flags & BTRFS_INODE_NODUMP)
7924 stat->attributes |= STATX_ATTR_NODUMP;
7925 if (bi_ro_flags & BTRFS_INODE_RO_VERITY)
7926 stat->attributes |= STATX_ATTR_VERITY;
7927
7928 stat->attributes_mask |= (STATX_ATTR_APPEND |
7929 STATX_ATTR_COMPRESSED |
7930 STATX_ATTR_IMMUTABLE |
7931 STATX_ATTR_NODUMP);
7932
7933 generic_fillattr(idmap, request_mask, inode, stat);
7934 stat->dev = BTRFS_I(inode)->root->anon_dev;
7935
7936 stat->subvol = BTRFS_I(inode)->root->root_key.objectid;
7937 stat->result_mask |= STATX_SUBVOL;
7938
7939 spin_lock(&BTRFS_I(inode)->lock);
7940 delalloc_bytes = BTRFS_I(inode)->new_delalloc_bytes;
7941 inode_bytes = inode_get_bytes(inode);
7942 spin_unlock(&BTRFS_I(inode)->lock);
7943 stat->blocks = (ALIGN(inode_bytes, blocksize) +
7944 ALIGN(delalloc_bytes, blocksize)) >> SECTOR_SHIFT;
7945 return 0;
7946 }
7947
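/*
 * Handle the RENAME_EXCHANGE flag of renameat2(): atomically swap the two
 * directory entries. Both entries must live in the same subvolume unless
 * both of them are subvolume roots, which may be exchanged across roots
 * since a subvolume is only a logical link with a fixed inode number.
 */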
7948 static int btrfs_rename_exchange(struct inode *old_dir,
7949 struct dentry *old_dentry,
7950 struct inode *new_dir,
7951 struct dentry *new_dentry)
7952 {
7953 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
7954 struct btrfs_trans_handle *trans;
7955 unsigned int trans_num_items;
7956 struct btrfs_root *root = BTRFS_I(old_dir)->root;
7957 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
7958 struct inode *new_inode = new_dentry->d_inode;
7959 struct inode *old_inode = old_dentry->d_inode;
7960 struct btrfs_rename_ctx old_rename_ctx;
7961 struct btrfs_rename_ctx new_rename_ctx;
7962 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
7963 u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
7964 u64 old_idx = 0;
7965 u64 new_idx = 0;
7966 int ret;
7967 int ret2;
7968 bool need_abort = false;
7969 struct fscrypt_name old_fname, new_fname;
7970 struct fscrypt_str *old_name, *new_name;
7971
7972 /*
7973 * For non-subvolumes allow exchange only within one subvolume, in the
7974 * same inode namespace. Two subvolumes (represented as directories) can
7975 * be exchanged as they're logical links and have fixed inode numbers.
7976 */
7977 if (root != dest &&
7978 (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
7979 new_ino != BTRFS_FIRST_FREE_OBJECTID))
7980 return -EXDEV;
7981
7982 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
7983 if (ret)
7984 return ret;
7985
7986 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
7987 if (ret) {
7988 fscrypt_free_filename(&old_fname);
7989 return ret;
7990 }
7991
7992 old_name = &old_fname.disk_name;
7993 new_name = &new_fname.disk_name;
7994
7995 /* close the race window with snapshot create/destroy ioctl */
7996 if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
7997 new_ino == BTRFS_FIRST_FREE_OBJECTID)
7998 down_read(&fs_info->subvol_sem);
7999
8000 /*
8001 * For each inode:
8002 * 1 to remove old dir item
8003 * 1 to remove old dir index
8004 * 1 to add new dir item
8005 * 1 to add new dir index
8006 * 1 to update parent inode
8007 *
8008 * If the parents are the same, we only need to account for one
8009 */
8010 trans_num_items = (old_dir == new_dir ? 9 : 10);
8011 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8012 /*
8013 * 1 to remove old root ref
8014 * 1 to remove old root backref
8015 * 1 to add new root ref
8016 * 1 to add new root backref
8017 */
8018 trans_num_items += 4;
8019 } else {
8020 /*
8021 * 1 to update inode item
8022 * 1 to remove old inode ref
8023 * 1 to add new inode ref
8024 */
8025 trans_num_items += 3;
8026 }
8027 if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
8028 trans_num_items += 4;
8029 else
8030 trans_num_items += 3;
8031 trans = btrfs_start_transaction(root, trans_num_items);
8032 if (IS_ERR(trans)) {
8033 ret = PTR_ERR(trans);
8034 goto out_notrans;
8035 }
8036
8037 if (dest != root) {
8038 ret = btrfs_record_root_in_trans(trans, dest);
8039 if (ret)
8040 goto out_fail;
8041 }
8042
8043 /*
8044 * We need to find a free sequence number both in the source and
8045 * in the destination directory for the exchange.
8046 */
8047 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &old_idx);
8048 if (ret)
8049 goto out_fail;
8050 ret = btrfs_set_inode_index(BTRFS_I(old_dir), &new_idx);
8051 if (ret)
8052 goto out_fail;
8053
8054 BTRFS_I(old_inode)->dir_index = 0ULL;
8055 BTRFS_I(new_inode)->dir_index = 0ULL;
8056
8057 /* Reference for the source. */
8058 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8059 /* force full log commit if subvolume involved. */
8060 btrfs_set_log_full_commit(trans);
8061 } else {
8062 ret = btrfs_insert_inode_ref(trans, dest, new_name, old_ino,
8063 btrfs_ino(BTRFS_I(new_dir)),
8064 old_idx);
8065 if (ret)
8066 goto out_fail;
8067 need_abort = true;
8068 }
8069
8070 /* And now for the dest. */
8071 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8072 /* force full log commit if subvolume involved. */
8073 btrfs_set_log_full_commit(trans);
8074 } else {
8075 ret = btrfs_insert_inode_ref(trans, root, old_name, new_ino,
8076 btrfs_ino(BTRFS_I(old_dir)),
8077 new_idx);
8078 if (ret) {
8079 if (need_abort)
8080 btrfs_abort_transaction(trans, ret);
8081 goto out_fail;
8082 }
8083 }
8084
8085 /* Update inode version and ctime/mtime. */
8086 inode_inc_iversion(old_dir);
8087 inode_inc_iversion(new_dir);
8088 inode_inc_iversion(old_inode);
8089 inode_inc_iversion(new_inode);
8090 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8091
8092 if (old_dentry->d_parent != new_dentry->d_parent) {
8093 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8094 BTRFS_I(old_inode), true);
8095 btrfs_record_unlink_dir(trans, BTRFS_I(new_dir),
8096 BTRFS_I(new_inode), true);
8097 }
8098
8099 /* src is a subvolume */
8100 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8101 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8102 if (ret) {
8103 btrfs_abort_transaction(trans, ret);
8104 goto out_fail;
8105 }
8106 } else { /* src is an inode */
8107 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8108 BTRFS_I(old_dentry->d_inode),
8109 old_name, &old_rename_ctx);
8110 if (ret) {
8111 btrfs_abort_transaction(trans, ret);
8112 goto out_fail;
8113 }
8114 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8115 if (ret) {
8116 btrfs_abort_transaction(trans, ret);
8117 goto out_fail;
8118 }
8119 }
8120
8121 /* dest is a subvolume */
8122 if (new_ino == BTRFS_FIRST_FREE_OBJECTID) {
8123 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8124 if (ret) {
8125 btrfs_abort_transaction(trans, ret);
8126 goto out_fail;
8127 }
8128 } else { /* dest is an inode */
8129 ret = __btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8130 BTRFS_I(new_dentry->d_inode),
8131 new_name, &new_rename_ctx);
8132 if (ret) {
8133 btrfs_abort_transaction(trans, ret);
8134 goto out_fail;
8135 }
8136 ret = btrfs_update_inode(trans, BTRFS_I(new_inode));
8137 if (ret) {
8138 btrfs_abort_transaction(trans, ret);
8139 goto out_fail;
8140 }
8141 }
8142
8143 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8144 new_name, 0, old_idx);
8145 if (ret) {
8146 btrfs_abort_transaction(trans, ret);
8147 goto out_fail;
8148 }
8149
8150 ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode),
8151 old_name, 0, new_idx);
8152 if (ret) {
8153 btrfs_abort_transaction(trans, ret);
8154 goto out_fail;
8155 }
8156
8157 if (old_inode->i_nlink == 1)
8158 BTRFS_I(old_inode)->dir_index = old_idx;
8159 if (new_inode->i_nlink == 1)
8160 BTRFS_I(new_inode)->dir_index = new_idx;
8161
8162 /*
8163 * Now pin the logs of the roots. We do it to ensure that no other task
8164 * can sync the logs while we are in progress with the rename, because
8165 * that could result in an inconsistency in case any of the inodes that
8166 * are part of this rename operation were logged before.
8167 */
8168 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8169 btrfs_pin_log_trans(root);
8170 if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8171 btrfs_pin_log_trans(dest);
8172
8173 /* Do the log updates for all inodes. */
8174 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8175 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8176 old_rename_ctx.index, new_dentry->d_parent);
8177 if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8178 btrfs_log_new_name(trans, new_dentry, BTRFS_I(new_dir),
8179 new_rename_ctx.index, old_dentry->d_parent);
8180
8181 /* Now unpin the logs. */
8182 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8183 btrfs_end_log_trans(root);
8184 if (new_ino != BTRFS_FIRST_FREE_OBJECTID)
8185 btrfs_end_log_trans(dest);
8186 out_fail:
8187 ret2 = btrfs_end_transaction(trans);
8188 ret = ret ? ret : ret2;
8189 out_notrans:
8190 if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
8191 old_ino == BTRFS_FIRST_FREE_OBJECTID)
8192 up_read(&fs_info->subvol_sem);
8193
8194 fscrypt_free_filename(&new_fname);
8195 fscrypt_free_filename(&old_fname);
8196 return ret;
8197 }
8198
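/*
 * Allocate the in-memory inode for a RENAME_WHITEOUT replacement: a character
 * device node with WHITEOUT_MODE and WHITEOUT_DEV, which is how the VFS
 * represents whiteouts. The on-disk inode is created later in btrfs_rename().
 */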
8199 static struct inode *new_whiteout_inode(struct mnt_idmap *idmap,
8200 struct inode *dir)
8201 {
8202 struct inode *inode;
8203
8204 inode = new_inode(dir->i_sb);
8205 if (inode) {
8206 inode_init_owner(idmap, inode, dir,
8207 S_IFCHR | WHITEOUT_MODE);
8208 inode->i_op = &btrfs_special_inode_operations;
8209 init_special_inode(inode, inode->i_mode, WHITEOUT_DEV);
8210 }
8211 return inode;
8212 }
8213
8214 static int btrfs_rename(struct mnt_idmap *idmap,
8215 struct inode *old_dir, struct dentry *old_dentry,
8216 struct inode *new_dir, struct dentry *new_dentry,
8217 unsigned int flags)
8218 {
8219 struct btrfs_fs_info *fs_info = inode_to_fs_info(old_dir);
8220 struct btrfs_new_inode_args whiteout_args = {
8221 .dir = old_dir,
8222 .dentry = old_dentry,
8223 };
8224 struct btrfs_trans_handle *trans;
8225 unsigned int trans_num_items;
8226 struct btrfs_root *root = BTRFS_I(old_dir)->root;
8227 struct btrfs_root *dest = BTRFS_I(new_dir)->root;
8228 struct inode *new_inode = d_inode(new_dentry);
8229 struct inode *old_inode = d_inode(old_dentry);
8230 struct btrfs_rename_ctx rename_ctx;
8231 u64 index = 0;
8232 int ret;
8233 int ret2;
8234 u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
8235 struct fscrypt_name old_fname, new_fname;
8236
8237 if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
8238 return -EPERM;
8239
8240 /* we only allow rename subvolume link between subvolumes */
8241 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
8242 return -EXDEV;
8243
8244 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
8245 (new_inode && btrfs_ino(BTRFS_I(new_inode)) == BTRFS_FIRST_FREE_OBJECTID))
8246 return -ENOTEMPTY;
8247
8248 if (S_ISDIR(old_inode->i_mode) && new_inode &&
8249 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
8250 return -ENOTEMPTY;
8251
8252 ret = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_fname);
8253 if (ret)
8254 return ret;
8255
8256 ret = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_fname);
8257 if (ret) {
8258 fscrypt_free_filename(&old_fname);
8259 return ret;
8260 }
8261
8262 /* check for collisions, even if the name isn't there */
8263 ret = btrfs_check_dir_item_collision(dest, new_dir->i_ino, &new_fname.disk_name);
8264 if (ret) {
8265 if (ret == -EEXIST) {
8266 /* We shouldn't get -EEXIST without a new_inode. */
8268 if (WARN_ON(!new_inode)) {
8269 goto out_fscrypt_names;
8270 }
8271 } else {
8272 /* maybe -EOVERFLOW */
8273 goto out_fscrypt_names;
8274 }
8275 }
8276 ret = 0;
8277
8278 /*
8279 * we're using rename to replace one file with another. Start IO on it
8280 * now so we don't add too much work to the end of the transaction
8281 */
8282 if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size)
8283 filemap_flush(old_inode->i_mapping);
8284
8285 if (flags & RENAME_WHITEOUT) {
8286 whiteout_args.inode = new_whiteout_inode(idmap, old_dir);
8287 if (!whiteout_args.inode) {
8288 ret = -ENOMEM;
8289 goto out_fscrypt_names;
8290 }
8291 ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items);
8292 if (ret)
8293 goto out_whiteout_inode;
8294 } else {
8295 /* 1 to update the old parent inode. */
8296 trans_num_items = 1;
8297 }
8298
8299 if (old_ino == BTRFS_FIRST_FREE_OBJECTID) {
8300 /* Close the race window with snapshot create/destroy ioctl */
8301 down_read(&fs_info->subvol_sem);
8302 /*
8303 * 1 to remove old root ref
8304 * 1 to remove old root backref
8305 * 1 to add new root ref
8306 * 1 to add new root backref
8307 */
8308 trans_num_items += 4;
8309 } else {
8310 /*
8311 * 1 to update inode
8312 * 1 to remove old inode ref
8313 * 1 to add new inode ref
8314 */
8315 trans_num_items += 3;
8316 }
8317 /*
8318 * 1 to remove old dir item
8319 * 1 to remove old dir index
8320 * 1 to add new dir item
8321 * 1 to add new dir index
8322 */
8323 trans_num_items += 4;
8324 /* 1 to update new parent inode if it's not the same as the old parent */
8325 if (new_dir != old_dir)
8326 trans_num_items++;
8327 if (new_inode) {
8328 /*
8329 * 1 to update inode
8330 * 1 to remove inode ref
8331 * 1 to remove dir item
8332 * 1 to remove dir index
8333 * 1 to possibly add orphan item
8334 */
8335 trans_num_items += 5;
8336 }
8337 trans = btrfs_start_transaction(root, trans_num_items);
8338 if (IS_ERR(trans)) {
8339 ret = PTR_ERR(trans);
8340 goto out_notrans;
8341 }
8342
8343 if (dest != root) {
8344 ret = btrfs_record_root_in_trans(trans, dest);
8345 if (ret)
8346 goto out_fail;
8347 }
8348
8349 ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
8350 if (ret)
8351 goto out_fail;
8352
8353 BTRFS_I(old_inode)->dir_index = 0ULL;
8354 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8355 /* force full log commit if subvolume involved. */
8356 btrfs_set_log_full_commit(trans);
8357 } else {
8358 ret = btrfs_insert_inode_ref(trans, dest, &new_fname.disk_name,
8359 old_ino, btrfs_ino(BTRFS_I(new_dir)),
8360 index);
8361 if (ret)
8362 goto out_fail;
8363 }
8364
8365 inode_inc_iversion(old_dir);
8366 inode_inc_iversion(new_dir);
8367 inode_inc_iversion(old_inode);
8368 simple_rename_timestamp(old_dir, old_dentry, new_dir, new_dentry);
8369
8370 if (old_dentry->d_parent != new_dentry->d_parent)
8371 btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
8372 BTRFS_I(old_inode), true);
8373
8374 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
8375 ret = btrfs_unlink_subvol(trans, BTRFS_I(old_dir), old_dentry);
8376 if (ret) {
8377 btrfs_abort_transaction(trans, ret);
8378 goto out_fail;
8379 }
8380 } else {
8381 ret = __btrfs_unlink_inode(trans, BTRFS_I(old_dir),
8382 BTRFS_I(d_inode(old_dentry)),
8383 &old_fname.disk_name, &rename_ctx);
8384 if (ret) {
8385 btrfs_abort_transaction(trans, ret);
8386 goto out_fail;
8387 }
8388 ret = btrfs_update_inode(trans, BTRFS_I(old_inode));
8389 if (ret) {
8390 btrfs_abort_transaction(trans, ret);
8391 goto out_fail;
8392 }
8393 }
8394
8395 if (new_inode) {
8396 inode_inc_iversion(new_inode);
8397 if (unlikely(btrfs_ino(BTRFS_I(new_inode)) ==
8398 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
8399 ret = btrfs_unlink_subvol(trans, BTRFS_I(new_dir), new_dentry);
8400 if (ret) {
8401 btrfs_abort_transaction(trans, ret);
8402 goto out_fail;
8403 }
8404 BUG_ON(new_inode->i_nlink == 0);
8405 } else {
8406 ret = btrfs_unlink_inode(trans, BTRFS_I(new_dir),
8407 BTRFS_I(d_inode(new_dentry)),
8408 &new_fname.disk_name);
8409 if (ret) {
8410 btrfs_abort_transaction(trans, ret);
8411 goto out_fail;
8412 }
8413 }
8414 if (new_inode->i_nlink == 0) {
8415 ret = btrfs_orphan_add(trans,
8416 BTRFS_I(d_inode(new_dentry)));
8417 if (ret) {
8418 btrfs_abort_transaction(trans, ret);
8419 goto out_fail;
8420 }
8421 }
8422 }
8423
8424 ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode),
8425 &new_fname.disk_name, 0, index);
8426 if (ret) {
8427 btrfs_abort_transaction(trans, ret);
8428 goto out_fail;
8429 }
8430
8431 if (old_inode->i_nlink == 1)
8432 BTRFS_I(old_inode)->dir_index = index;
8433
8434 if (old_ino != BTRFS_FIRST_FREE_OBJECTID)
8435 btrfs_log_new_name(trans, old_dentry, BTRFS_I(old_dir),
8436 rename_ctx.index, new_dentry->d_parent);
8437
8438 if (flags & RENAME_WHITEOUT) {
8439 ret = btrfs_create_new_inode(trans, &whiteout_args);
8440 if (ret) {
8441 btrfs_abort_transaction(trans, ret);
8442 goto out_fail;
8443 } else {
8444 unlock_new_inode(whiteout_args.inode);
8445 iput(whiteout_args.inode);
8446 whiteout_args.inode = NULL;
8447 }
8448 }
8449 out_fail:
8450 ret2 = btrfs_end_transaction(trans);
8451 ret = ret ? ret : ret2;
8452 out_notrans:
8453 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
8454 up_read(&fs_info->subvol_sem);
8455 if (flags & RENAME_WHITEOUT)
8456 btrfs_new_inode_args_destroy(&whiteout_args);
8457 out_whiteout_inode:
8458 if (flags & RENAME_WHITEOUT)
8459 iput(whiteout_args.inode);
8460 out_fscrypt_names:
8461 fscrypt_free_filename(&old_fname);
8462 fscrypt_free_filename(&new_fname);
8463 return ret;
8464 }
8465
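/*
 * ->rename callback: validate the flags and dispatch to the exchange or
 * regular rename implementation. Userspace reaches this through renameat2(2),
 * for example (illustrative call, not from this file):
 *
 *	renameat2(AT_FDCWD, "old", AT_FDCWD, "new", RENAME_WHITEOUT);
 */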
8466 static int btrfs_rename2(struct mnt_idmap *idmap, struct inode *old_dir,
8467 struct dentry *old_dentry, struct inode *new_dir,
8468 struct dentry *new_dentry, unsigned int flags)
8469 {
8470 int ret;
8471
8472 if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
8473 return -EINVAL;
8474
8475 if (flags & RENAME_EXCHANGE)
8476 ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
8477 new_dentry);
8478 else
8479 ret = btrfs_rename(idmap, old_dir, old_dentry, new_dir,
8480 new_dentry, flags);
8481
8482 btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
8483
8484 return ret;
8485 }
8486
8487 struct btrfs_delalloc_work {
8488 struct inode *inode;
8489 struct completion completion;
8490 struct list_head list;
8491 struct btrfs_work work;
8492 };
8493
8494 static void btrfs_run_delalloc_work(struct btrfs_work *work)
8495 {
8496 struct btrfs_delalloc_work *delalloc_work;
8497 struct inode *inode;
8498
8499 delalloc_work = container_of(work, struct btrfs_delalloc_work,
8500 work);
8501 inode = delalloc_work->inode;
8502 filemap_flush(inode->i_mapping);
8503 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
8504 &BTRFS_I(inode)->runtime_flags))
8505 filemap_flush(inode->i_mapping);
8506
8507 iput(inode);
8508 complete(&delalloc_work->completion);
8509 }
8510
8511 static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode)
8512 {
8513 struct btrfs_delalloc_work *work;
8514
8515 work = kmalloc(sizeof(*work), GFP_NOFS);
8516 if (!work)
8517 return NULL;
8518
8519 init_completion(&work->completion);
8520 INIT_LIST_HEAD(&work->list);
8521 work->inode = inode;
8522 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL);
8523
8524 return work;
8525 }
8526
8527 /*
8528 * Some fairly slow code that needs optimization. This walks the list
8529 * of all the inodes with pending delalloc and forces them to disk.
8530 */
8531 static int start_delalloc_inodes(struct btrfs_root *root,
8532 struct writeback_control *wbc, bool snapshot,
8533 bool in_reclaim_context)
8534 {
8535 struct btrfs_delalloc_work *work, *next;
8536 LIST_HEAD(works);
8537 LIST_HEAD(splice);
8538 int ret = 0;
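/*
 * nr_to_write == LONG_MAX means a full flush: queue an async work item
 * for every inode. Otherwise write back inodes directly here until the
 * nr_to_write budget runs out.
 */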
8539 bool full_flush = wbc->nr_to_write == LONG_MAX;
8540
8541 mutex_lock(&root->delalloc_mutex);
8542 spin_lock(&root->delalloc_lock);
8543 list_splice_init(&root->delalloc_inodes, &splice);
8544 while (!list_empty(&splice)) {
8545 struct btrfs_inode *inode;
8546 struct inode *tmp_inode;
8547
8548 inode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes);
8549
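/*
 * Move the inode back onto the root's list right away; anything still
 * left on the local splice list when we bail out is re-spliced at the
 * end of this function.
 */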
8550 list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes);
8551
8552 if (in_reclaim_context &&
8553 test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags))
8554 continue;
8555
8556 tmp_inode = igrab(&inode->vfs_inode);
8557 if (!tmp_inode) {
8558 cond_resched_lock(&root->delalloc_lock);
8559 continue;
8560 }
8561 spin_unlock(&root->delalloc_lock);
8562
8563 if (snapshot)
8564 set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags);
8565 if (full_flush) {
8566 work = btrfs_alloc_delalloc_work(&inode->vfs_inode);
8567 if (!work) {
8568 iput(&inode->vfs_inode);
8569 ret = -ENOMEM;
8570 goto out;
8571 }
8572 list_add_tail(&work->list, &works);
8573 btrfs_queue_work(root->fs_info->flush_workers,
8574 &work->work);
8575 } else {
8576 ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc);
8577 btrfs_add_delayed_iput(inode);
8578 if (ret || wbc->nr_to_write <= 0)
8579 goto out;
8580 }
8581 cond_resched();
8582 spin_lock(&root->delalloc_lock);
8583 }
8584 spin_unlock(&root->delalloc_lock);
8585
8586 out:
8587 list_for_each_entry_safe(work, next, &works, list) {
8588 list_del_init(&work->list);
8589 wait_for_completion(&work->completion);
8590 kfree(work);
8591 }
8592
8593 if (!list_empty(&splice)) {
8594 spin_lock(&root->delalloc_lock);
8595 list_splice_tail(&splice, &root->delalloc_inodes);
8596 spin_unlock(&root->delalloc_lock);
8597 }
8598 mutex_unlock(&root->delalloc_mutex);
8599 return ret;
8600 }
8601
8602 int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
8603 {
8604 struct writeback_control wbc = {
8605 .nr_to_write = LONG_MAX,
8606 .sync_mode = WB_SYNC_NONE,
8607 .range_start = 0,
8608 .range_end = LLONG_MAX,
8609 };
8610 struct btrfs_fs_info *fs_info = root->fs_info;
8611
8612 if (BTRFS_FS_ERROR(fs_info))
8613 return -EROFS;
8614
8615 return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
8616 }
8617
8618 int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
8619 bool in_reclaim_context)
8620 {
8621 struct writeback_control wbc = {
8622 .nr_to_write = nr,
8623 .sync_mode = WB_SYNC_NONE,
8624 .range_start = 0,
8625 .range_end = LLONG_MAX,
8626 };
8627 struct btrfs_root *root;
8628 LIST_HEAD(splice);
8629 int ret;
8630
8631 if (BTRFS_FS_ERROR(fs_info))
8632 return -EROFS;
8633
8634 mutex_lock(&fs_info->delalloc_root_mutex);
8635 spin_lock(&fs_info->delalloc_root_lock);
8636 list_splice_init(&fs_info->delalloc_roots, &splice);
8637 while (!list_empty(&splice)) {
8638 /*
8639 * Reset nr_to_write here so we know that we're doing a full
8640 * flush.
8641 */
8642 if (nr == LONG_MAX)
8643 wbc.nr_to_write = LONG_MAX;
8644
8645 root = list_first_entry(&splice, struct btrfs_root,
8646 delalloc_root);
8647 root = btrfs_grab_root(root);
8648 BUG_ON(!root);
8649 list_move_tail(&root->delalloc_root,
8650 &fs_info->delalloc_roots);
8651 spin_unlock(&fs_info->delalloc_root_lock);
8652
8653 ret = start_delalloc_inodes(root, &wbc, false, in_reclaim_context);
8654 btrfs_put_root(root);
8655 if (ret < 0 || wbc.nr_to_write <= 0)
8656 goto out;
8657 spin_lock(&fs_info->delalloc_root_lock);
8658 }
8659 spin_unlock(&fs_info->delalloc_root_lock);
8660
8661 ret = 0;
8662 out:
8663 if (!list_empty(&splice)) {
8664 spin_lock(&fs_info->delalloc_root_lock);
8665 list_splice_tail(&splice, &fs_info->delalloc_roots);
8666 spin_unlock(&fs_info->delalloc_root_lock);
8667 }
8668 mutex_unlock(&fs_info->delalloc_root_mutex);
8669 return ret;
8670 }
8671
8672 static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
8673 struct dentry *dentry, const char *symname)
8674 {
8675 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
8676 struct btrfs_trans_handle *trans;
8677 struct btrfs_root *root = BTRFS_I(dir)->root;
8678 struct btrfs_path *path;
8679 struct btrfs_key key;
8680 struct inode *inode;
8681 struct btrfs_new_inode_args new_inode_args = {
8682 .dir = dir,
8683 .dentry = dentry,
8684 };
8685 unsigned int trans_num_items;
8686 int err;
8687 int name_len;
8688 int datasize;
8689 unsigned long ptr;
8690 struct btrfs_file_extent_item *ei;
8691 struct extent_buffer *leaf;
8692
8693 name_len = strlen(symname);
8694 /*
8695 * Symlinks utilize uncompressed inline extent data, which should not
8696 * reach block size.
8697 */
8698 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) ||
8699 name_len >= fs_info->sectorsize)
8700 return -ENAMETOOLONG;
8701
8702 inode = new_inode(dir->i_sb);
8703 if (!inode)
8704 return -ENOMEM;
8705 inode_init_owner(idmap, inode, dir, S_IFLNK | S_IRWXUGO);
8706 inode->i_op = &btrfs_symlink_inode_operations;
8707 inode_nohighmem(inode);
8708 inode->i_mapping->a_ops = &btrfs_aops;
8709 btrfs_i_size_write(BTRFS_I(inode), name_len);
8710 inode_set_bytes(inode, name_len);
8711
8712 new_inode_args.inode = inode;
8713 err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
8714 if (err)
8715 goto out_inode;
8716 /* 1 additional item for the inline extent */
8717 trans_num_items++;
8718
8719 trans = btrfs_start_transaction(root, trans_num_items);
8720 if (IS_ERR(trans)) {
8721 err = PTR_ERR(trans);
8722 goto out_new_inode_args;
8723 }
8724
8725 err = btrfs_create_new_inode(trans, &new_inode_args);
8726 if (err)
8727 goto out;
8728
8729 path = btrfs_alloc_path();
8730 if (!path) {
8731 err = -ENOMEM;
8732 btrfs_abort_transaction(trans, err);
8733 discard_new_inode(inode);
8734 inode = NULL;
8735 goto out;
8736 }
8737 key.objectid = btrfs_ino(BTRFS_I(inode));
8738 key.type = BTRFS_EXTENT_DATA_KEY;
8739 key.offset = 0;
8740 datasize = btrfs_file_extent_calc_inline_size(name_len);
8741 err = btrfs_insert_empty_item(trans, root, path, &key,
8742 datasize);
8743 if (err) {
8744 btrfs_abort_transaction(trans, err);
8745 btrfs_free_path(path);
8746 discard_new_inode(inode);
8747 inode = NULL;
8748 goto out;
8749 }
8750 leaf = path->nodes[0];
8751 ei = btrfs_item_ptr(leaf, path->slots[0],
8752 struct btrfs_file_extent_item);
8753 btrfs_set_file_extent_generation(leaf, ei, trans->transid);
8754 btrfs_set_file_extent_type(leaf, ei,
8755 BTRFS_FILE_EXTENT_INLINE);
8756 btrfs_set_file_extent_encryption(leaf, ei, 0);
8757 btrfs_set_file_extent_compression(leaf, ei, 0);
8758 btrfs_set_file_extent_other_encoding(leaf, ei, 0);
8759 btrfs_set_file_extent_ram_bytes(leaf, ei, name_len);
8760
8761 ptr = btrfs_file_extent_inline_start(ei);
8762 write_extent_buffer(leaf, symname, ptr, name_len);
8763 btrfs_free_path(path);
8764
8765 d_instantiate_new(dentry, inode);
8766 err = 0;
8767 out:
8768 btrfs_end_transaction(trans);
8769 btrfs_btree_balance_dirty(fs_info);
8770 out_new_inode_args:
8771 btrfs_new_inode_args_destroy(&new_inode_args);
8772 out_inode:
8773 if (err)
8774 iput(inode);
8775 return err;
8776 }
8777
8778 static struct btrfs_trans_handle *insert_prealloc_file_extent(
8779 struct btrfs_trans_handle *trans_in,
8780 struct btrfs_inode *inode,
8781 struct btrfs_key *ins,
8782 u64 file_offset)
8783 {
8784 struct btrfs_file_extent_item stack_fi;
8785 struct btrfs_replace_extent_info extent_info;
8786 struct btrfs_trans_handle *trans = trans_in;
8787 struct btrfs_path *path;
8788 u64 start = ins->objectid;
8789 u64 len = ins->offset;
8790 u64 qgroup_released = 0;
8791 int ret;
8792
8793 memset(&stack_fi, 0, sizeof(stack_fi));
8794
8795 btrfs_set_stack_file_extent_type(&stack_fi, BTRFS_FILE_EXTENT_PREALLOC);
8796 btrfs_set_stack_file_extent_disk_bytenr(&stack_fi, start);
8797 btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, len);
8798 btrfs_set_stack_file_extent_num_bytes(&stack_fi, len);
8799 btrfs_set_stack_file_extent_ram_bytes(&stack_fi, len);
8800 btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE);
8801 /* Encryption and other encoding is reserved and all 0 */
8802
8803 ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released);
8804 if (ret < 0)
8805 return ERR_PTR(ret);
8806
8807 if (trans) {
8808 ret = insert_reserved_file_extent(trans, inode,
8809 file_offset, &stack_fi,
8810 true, qgroup_released);
8811 if (ret)
8812 goto free_qgroup;
8813 return trans;
8814 }
8815
8816 extent_info.disk_offset = start;
8817 extent_info.disk_len = len;
8818 extent_info.data_offset = 0;
8819 extent_info.data_len = len;
8820 extent_info.file_offset = file_offset;
8821 extent_info.extent_buf = (char *)&stack_fi;
8822 extent_info.is_new_extent = true;
8823 extent_info.update_times = true;
8824 extent_info.qgroup_reserved = qgroup_released;
8825 extent_info.insertions = 0;
8826
8827 path = btrfs_alloc_path();
8828 if (!path) {
8829 ret = -ENOMEM;
8830 goto free_qgroup;
8831 }
8832
8833 ret = btrfs_replace_file_extents(inode, path, file_offset,
8834 file_offset + len - 1, &extent_info,
8835 &trans);
8836 btrfs_free_path(path);
8837 if (ret)
8838 goto free_qgroup;
8839 return trans;
8840
8841 free_qgroup:
8842 /*
8843 * We released the qgroup data range at the beginning of the function,
8844 * and normally the qgroup_released bytes will be freed when committing
8845 * the transaction.
8846 * But if we error out early, we have to free what we have released
8847 * or we leak the qgroup data reservation.
8848 */
8849 btrfs_qgroup_free_refroot(inode->root->fs_info,
8850 btrfs_root_id(inode->root), qgroup_released,
8851 BTRFS_QGROUP_RSV_DATA);
8852 return ERR_PTR(ret);
8853 }
8854
8855 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
8856 u64 start, u64 num_bytes, u64 min_size,
8857 loff_t actual_len, u64 *alloc_hint,
8858 struct btrfs_trans_handle *trans)
8859 {
8860 struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
8861 struct extent_map *em;
8862 struct btrfs_root *root = BTRFS_I(inode)->root;
8863 struct btrfs_key ins;
8864 u64 cur_offset = start;
8865 u64 clear_offset = start;
8866 u64 i_size;
8867 u64 cur_bytes;
8868 u64 last_alloc = (u64)-1;
8869 int ret = 0;
8870 bool own_trans = true;
8871 u64 end = start + num_bytes - 1;
8872
8873 if (trans)
8874 own_trans = false;
8875 while (num_bytes > 0) {
8876 cur_bytes = min_t(u64, num_bytes, SZ_256M);
8877 cur_bytes = max(cur_bytes, min_size);
8878 /*
8879 * If we are severely fragmented we could end up with really
8880 * small allocations, so if the allocator is returning small
8881 * chunks, let's make its job easier by only searching for those
8882 * sized chunks.
8883 */
8884 cur_bytes = min(cur_bytes, last_alloc);
8885 ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes,
8886 min_size, 0, *alloc_hint, &ins, 1, 0);
8887 if (ret)
8888 break;
8889
8890 /*
8891 * We've reserved this space, and thus converted it from
8892 * ->bytes_may_use to ->bytes_reserved. Any error that happens
8893 * from here on out we will only need to clear our reservation
8894 * for the remaining unreserved area, so advance our
8895 * clear_offset by our extent size.
8896 */
8897 clear_offset += ins.offset;
8898
8899 last_alloc = ins.offset;
8900 trans = insert_prealloc_file_extent(trans, BTRFS_I(inode),
8901 &ins, cur_offset);
8902 /*
8903 * Now that we inserted the prealloc extent we can finally
8904 * decrement the number of reservations in the block group.
8905 * If we did it before, we could race with relocation and have
8906 * relocation miss the reserved extent, making it fail later.
8907 */
8908 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
8909 if (IS_ERR(trans)) {
8910 ret = PTR_ERR(trans);
8911 btrfs_free_reserved_extent(fs_info, ins.objectid,
8912 ins.offset, 0);
8913 break;
8914 }
8915
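/*
 * The prealloc file extent item is already inserted, so failing to
 * allocate an extent map is not fatal: drop the range from the extent
 * map cache and force a full fsync for this inode instead.
 */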
8916 em = alloc_extent_map();
8917 if (!em) {
8918 btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset,
8919 cur_offset + ins.offset - 1, false);
8920 btrfs_set_inode_full_sync(BTRFS_I(inode));
8921 goto next;
8922 }
8923
8924 em->start = cur_offset;
8925 em->len = ins.offset;
8926 em->disk_bytenr = ins.objectid;
8927 em->offset = 0;
8928 em->disk_num_bytes = ins.offset;
8929 em->ram_bytes = ins.offset;
8930 em->flags |= EXTENT_FLAG_PREALLOC;
8931 em->generation = trans->transid;
8932
8933 ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true);
8934 free_extent_map(em);
8935 next:
8936 num_bytes -= ins.offset;
8937 cur_offset += ins.offset;
8938 *alloc_hint = ins.objectid + ins.offset;
8939
8940 inode_inc_iversion(inode);
8941 inode_set_ctime_current(inode);
8942 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
8943 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
8944 (actual_len > inode->i_size) &&
8945 (cur_offset > inode->i_size)) {
8946 if (cur_offset > actual_len)
8947 i_size = actual_len;
8948 else
8949 i_size = cur_offset;
8950 i_size_write(inode, i_size);
8951 btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
8952 }
8953
8954 ret = btrfs_update_inode(trans, BTRFS_I(inode));
8955
8956 if (ret) {
8957 btrfs_abort_transaction(trans, ret);
8958 if (own_trans)
8959 btrfs_end_transaction(trans);
8960 break;
8961 }
8962
8963 if (own_trans) {
8964 btrfs_end_transaction(trans);
8965 trans = NULL;
8966 }
8967 }
8968 if (clear_offset < end)
8969 btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset,
8970 end - clear_offset + 1);
8971 return ret;
8972 }
8973
8974 int btrfs_prealloc_file_range(struct inode *inode, int mode,
8975 u64 start, u64 num_bytes, u64 min_size,
8976 loff_t actual_len, u64 *alloc_hint)
8977 {
8978 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8979 min_size, actual_len, alloc_hint,
8980 NULL);
8981 }
8982
8983 int btrfs_prealloc_file_range_trans(struct inode *inode,
8984 struct btrfs_trans_handle *trans, int mode,
8985 u64 start, u64 num_bytes, u64 min_size,
8986 loff_t actual_len, u64 *alloc_hint)
8987 {
8988 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
8989 min_size, actual_len, alloc_hint, trans);
8990 }
8991
8992 static int btrfs_permission(struct mnt_idmap *idmap,
8993 struct inode *inode, int mask)
8994 {
8995 struct btrfs_root *root = BTRFS_I(inode)->root;
8996 umode_t mode = inode->i_mode;
8997
8998 if (mask & MAY_WRITE &&
8999 (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) {
9000 if (btrfs_root_readonly(root))
9001 return -EROFS;
9002 if (BTRFS_I(inode)->flags & BTRFS_INODE_READONLY)
9003 return -EACCES;
9004 }
9005 return generic_permission(idmap, inode, mask);
9006 }
9007
9008 static int btrfs_tmpfile(struct mnt_idmap *idmap, struct inode *dir,
9009 struct file *file, umode_t mode)
9010 {
9011 struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
9012 struct btrfs_trans_handle *trans;
9013 struct btrfs_root *root = BTRFS_I(dir)->root;
9014 struct inode *inode;
9015 struct btrfs_new_inode_args new_inode_args = {
9016 .dir = dir,
9017 .dentry = file->f_path.dentry,
9018 .orphan = true,
9019 };
9020 unsigned int trans_num_items;
9021 int ret;
9022
9023 inode = new_inode(dir->i_sb);
9024 if (!inode)
9025 return -ENOMEM;
9026 inode_init_owner(idmap, inode, dir, mode);
9027 inode->i_fop = &btrfs_file_operations;
9028 inode->i_op = &btrfs_file_inode_operations;
9029 inode->i_mapping->a_ops = &btrfs_aops;
9030
9031 new_inode_args.inode = inode;
9032 ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
9033 if (ret)
9034 goto out_inode;
9035
9036 trans = btrfs_start_transaction(root, trans_num_items);
9037 if (IS_ERR(trans)) {
9038 ret = PTR_ERR(trans);
9039 goto out_new_inode_args;
9040 }
9041
9042 ret = btrfs_create_new_inode(trans, &new_inode_args);
9043
9044 /*
9045 * We set number of links to 0 in btrfs_create_new_inode(), and here we
9046 * set it to 1 because d_tmpfile() will issue a warning if the count is
9047 * 0, through:
9048 *
9049 * d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
9050 */
9051 set_nlink(inode, 1);
9052
9053 if (!ret) {
9054 d_tmpfile(file, inode);
9055 unlock_new_inode(inode);
9056 mark_inode_dirty(inode);
9057 }
9058
9059 btrfs_end_transaction(trans);
9060 btrfs_btree_balance_dirty(fs_info);
9061 out_new_inode_args:
9062 btrfs_new_inode_args_destroy(&new_inode_args);
9063 out_inode:
9064 if (ret)
9065 iput(inode);
9066 return finish_open_simple(file, ret);
9067 }
9068
9069 int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
9070 int compress_type)
9071 {
9072 switch (compress_type) {
9073 case BTRFS_COMPRESS_NONE:
9074 return BTRFS_ENCODED_IO_COMPRESSION_NONE;
9075 case BTRFS_COMPRESS_ZLIB:
9076 return BTRFS_ENCODED_IO_COMPRESSION_ZLIB;
9077 case BTRFS_COMPRESS_LZO:
9078 /*
9079 * The LZO format depends on the sector size. 64K is the maximum
9080 * sector size that we support.
9081 */
9082 if (fs_info->sectorsize < SZ_4K || fs_info->sectorsize > SZ_64K)
9083 return -EINVAL;
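/*
 * Map the sector size to an LZO level, e.g. a 4K sector size
 * (sectorsize_bits == 12) maps to LZO_4K and a 64K sector size
 * (sectorsize_bits == 16) maps to LZO_64K.
 */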
9084 return BTRFS_ENCODED_IO_COMPRESSION_LZO_4K +
9085 (fs_info->sectorsize_bits - 12);
9086 case BTRFS_COMPRESS_ZSTD:
9087 return BTRFS_ENCODED_IO_COMPRESSION_ZSTD;
9088 default:
9089 return -EUCLEAN;
9090 }
9091 }
9092
9093 static ssize_t btrfs_encoded_read_inline(
9094 struct kiocb *iocb,
9095 struct iov_iter *iter, u64 start,
9096 u64 lockend,
9097 struct extent_state **cached_state,
9098 u64 extent_start, size_t count,
9099 struct btrfs_ioctl_encoded_io_args *encoded,
9100 bool *unlocked)
9101 {
9102 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9103 struct btrfs_root *root = inode->root;
9104 struct btrfs_fs_info *fs_info = root->fs_info;
9105 struct extent_io_tree *io_tree = &inode->io_tree;
9106 struct btrfs_path *path;
9107 struct extent_buffer *leaf;
9108 struct btrfs_file_extent_item *item;
9109 u64 ram_bytes;
9110 unsigned long ptr;
9111 void *tmp;
9112 ssize_t ret;
9113 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9114
9115 path = btrfs_alloc_path();
9116 if (!path) {
9117 ret = -ENOMEM;
9118 goto out;
9119 }
9120
9121 path->nowait = nowait;
9122
9123 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
9124 extent_start, 0);
9125 if (ret) {
9126 if (ret > 0) {
9127 /* The extent item disappeared? */
9128 ret = -EIO;
9129 }
9130 goto out;
9131 }
9132 leaf = path->nodes[0];
9133 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
9134
9135 ram_bytes = btrfs_file_extent_ram_bytes(leaf, item);
9136 ptr = btrfs_file_extent_inline_start(item);
9137
9138 encoded->len = min_t(u64, extent_start + ram_bytes,
9139 inode->vfs_inode.i_size) - iocb->ki_pos;
9140 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9141 btrfs_file_extent_compression(leaf, item));
9142 if (ret < 0)
9143 goto out;
9144 encoded->compression = ret;
9145 if (encoded->compression) {
9146 size_t inline_size;
9147
9148 inline_size = btrfs_file_extent_inline_item_len(leaf,
9149 path->slots[0]);
9150 if (inline_size > count) {
9151 ret = -ENOBUFS;
9152 goto out;
9153 }
9154 count = inline_size;
9155 encoded->unencoded_len = ram_bytes;
9156 encoded->unencoded_offset = iocb->ki_pos - extent_start;
9157 } else {
9158 count = min_t(u64, count, encoded->len);
9159 encoded->len = count;
9160 encoded->unencoded_len = count;
9161 ptr += iocb->ki_pos - extent_start;
9162 }
9163
9164 tmp = kmalloc(count, GFP_NOFS);
9165 if (!tmp) {
9166 ret = -ENOMEM;
9167 goto out;
9168 }
9169 read_extent_buffer(leaf, tmp, ptr, count);
9170 btrfs_release_path(path);
9171 unlock_extent(io_tree, start, lockend, cached_state);
9172 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9173 *unlocked = true;
9174
9175 ret = copy_to_iter(tmp, count, iter);
9176 if (ret != count)
9177 ret = -EFAULT;
9178 kfree(tmp);
9179 out:
9180 btrfs_free_path(path);
9181 return ret;
9182 }
9183
9184 struct btrfs_encoded_read_private {
9185 struct completion *sync_reads;
9186 void *uring_ctx;
9187 refcount_t pending_refs;
9188 blk_status_t status;
9189 };
9190
9191 static void btrfs_encoded_read_endio(struct btrfs_bio *bbio)
9192 {
9193 struct btrfs_encoded_read_private *priv = bbio->private;
9194
9195 if (bbio->bio.bi_status) {
9196 /*
9197 * The memory barrier implied by the refcount_dec_and_test() here
9198 * pairs with the memory barrier implied by the refcount_dec_and_test()
9199 * in btrfs_encoded_read_regular_fill_pages() to ensure that
9200 * this write is observed before the load of status in
9201 * btrfs_encoded_read_regular_fill_pages().
9202 */
9203 WRITE_ONCE(priv->status, bbio->bio.bi_status);
9204 }
9205 if (refcount_dec_and_test(&priv->pending_refs)) {
9206 int err = blk_status_to_errno(READ_ONCE(priv->status));
9207
9208 if (priv->uring_ctx) {
9209 btrfs_uring_read_extent_endio(priv->uring_ctx, err);
9210 kfree(priv);
9211 } else {
9212 complete(priv->sync_reads);
9213 }
9214 }
9215 bio_put(&bbio->bio);
9216 }
9217
9218 int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
9219 u64 disk_bytenr, u64 disk_io_size,
9220 struct page **pages, void *uring_ctx)
9221 {
9222 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9223 struct btrfs_encoded_read_private *priv, sync_priv;
9224 struct completion sync_reads;
9225 unsigned long i = 0;
9226 struct btrfs_bio *bbio;
9227 int ret;
9228
9229 /*
9230 * Fast path for synchronous reads, which complete within this call; io_uring
9231 * reads outlive it, so their private data must be allocated.
9232 */
9233 if (uring_ctx) {
9234 priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS);
9235 if (!priv)
9236 return -ENOMEM;
9237 } else {
9238 priv = &sync_priv;
9239 init_completion(&sync_reads);
9240 priv->sync_reads = &sync_reads;
9241 }
9242
9243 refcount_set(&priv->pending_refs, 1);
9244 priv->status = 0;
9245 priv->uring_ctx = uring_ctx;
9246
9247 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9248 btrfs_encoded_read_endio, priv);
9249 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9250 bbio->inode = inode;
9251
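/*
 * Add the pages one by one; if a page does not fit into the current bio,
 * submit the bio and retry the same page with a fresh bio starting at
 * the current disk_bytenr.
 */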
9252 do {
9253 size_t bytes = min_t(u64, disk_io_size, PAGE_SIZE);
9254
9255 if (bio_add_page(&bbio->bio, pages[i], bytes, 0) < bytes) {
9256 refcount_inc(&priv->pending_refs);
9257 btrfs_submit_bbio(bbio, 0);
9258
9259 bbio = btrfs_bio_alloc(BIO_MAX_VECS, REQ_OP_READ, fs_info,
9260 btrfs_encoded_read_endio, priv);
9261 bbio->bio.bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT;
9262 bbio->inode = inode;
9263 continue;
9264 }
9265
9266 i++;
9267 disk_bytenr += bytes;
9268 disk_io_size -= bytes;
9269 } while (disk_io_size);
9270
9271 refcount_inc(&priv->pending_refs);
9272 btrfs_submit_bbio(bbio, 0);
9273
9274 if (uring_ctx) {
9275 if (refcount_dec_and_test(&priv->pending_refs)) {
9276 ret = blk_status_to_errno(READ_ONCE(priv->status));
9277 btrfs_uring_read_extent_endio(uring_ctx, ret);
9278 kfree(priv);
9279 return ret;
9280 }
9281
9282 return -EIOCBQUEUED;
9283 } else {
9284 if (!refcount_dec_and_test(&priv->pending_refs))
9285 wait_for_completion_io(&sync_reads);
9286 /* See btrfs_encoded_read_endio() for ordering. */
9287 return blk_status_to_errno(READ_ONCE(priv->status));
9288 }
9289 }
9290
9291 ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter,
9292 u64 start, u64 lockend,
9293 struct extent_state **cached_state,
9294 u64 disk_bytenr, u64 disk_io_size,
9295 size_t count, bool compressed, bool *unlocked)
9296 {
9297 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9298 struct extent_io_tree *io_tree = &inode->io_tree;
9299 struct page **pages;
9300 unsigned long nr_pages, i;
9301 u64 cur;
9302 size_t page_offset;
9303 ssize_t ret;
9304
9305 nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
9306 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
9307 if (!pages)
9308 return -ENOMEM;
9309 ret = btrfs_alloc_page_array(nr_pages, pages, false);
9310 if (ret) {
9311 ret = -ENOMEM;
9312 goto out;
9313 }
9314
9315 ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
9316 disk_io_size, pages, NULL);
9317 if (ret)
9318 goto out;
9319
9320 unlock_extent(io_tree, start, lockend, cached_state);
9321 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9322 *unlocked = true;
9323
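/*
 * For a compressed extent the caller gets the whole on-disk data, so copy
 * from the start of the buffer. Otherwise skip to the page and offset
 * that correspond to iocb->ki_pos within the locked range.
 */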
9324 if (compressed) {
9325 i = 0;
9326 page_offset = 0;
9327 } else {
9328 i = (iocb->ki_pos - start) >> PAGE_SHIFT;
9329 page_offset = (iocb->ki_pos - start) & (PAGE_SIZE - 1);
9330 }
9331 cur = 0;
9332 while (cur < count) {
9333 size_t bytes = min_t(size_t, count - cur,
9334 PAGE_SIZE - page_offset);
9335
9336 if (copy_page_to_iter(pages[i], page_offset, bytes,
9337 iter) != bytes) {
9338 ret = -EFAULT;
9339 goto out;
9340 }
9341 i++;
9342 cur += bytes;
9343 page_offset = 0;
9344 }
9345 ret = count;
9346 out:
9347 for (i = 0; i < nr_pages; i++) {
9348 if (pages[i])
9349 __free_page(pages[i]);
9350 }
9351 kfree(pages);
9352 return ret;
9353 }
9354
9355 ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
9356 struct btrfs_ioctl_encoded_io_args *encoded,
9357 struct extent_state **cached_state,
9358 u64 *disk_bytenr, u64 *disk_io_size)
9359 {
9360 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9361 struct btrfs_fs_info *fs_info = inode->root->fs_info;
9362 struct extent_io_tree *io_tree = &inode->io_tree;
9363 ssize_t ret;
9364 size_t count = iov_iter_count(iter);
9365 u64 start, lockend;
9366 struct extent_map *em;
9367 const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
9368 bool unlocked = false;
9369
9370 file_accessed(iocb->ki_filp);
9371
9372 ret = btrfs_inode_lock(inode,
9373 BTRFS_ILOCK_SHARED | (nowait ? BTRFS_ILOCK_TRY : 0));
9374 if (ret)
9375 return ret;
9376
9377 if (iocb->ki_pos >= inode->vfs_inode.i_size) {
9378 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9379 return 0;
9380 }
9381 start = ALIGN_DOWN(iocb->ki_pos, fs_info->sectorsize);
9382 /*
9383 * We don't know how long the extent containing iocb->ki_pos is, but if
9384 * it's compressed we know that it won't be longer than this.
9385 */
9386 lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;
9387
9388 if (nowait) {
9389 struct btrfs_ordered_extent *ordered;
9390
9391 if (filemap_range_needs_writeback(inode->vfs_inode.i_mapping,
9392 start, lockend)) {
9393 ret = -EAGAIN;
9394 goto out_unlock_inode;
9395 }
9396
9397 if (!try_lock_extent(io_tree, start, lockend, cached_state)) {
9398 ret = -EAGAIN;
9399 goto out_unlock_inode;
9400 }
9401
9402 ordered = btrfs_lookup_ordered_range(inode, start,
9403 lockend - start + 1);
9404 if (ordered) {
9405 btrfs_put_ordered_extent(ordered);
9406 unlock_extent(io_tree, start, lockend, cached_state);
9407 ret = -EAGAIN;
9408 goto out_unlock_inode;
9409 }
9410 } else {
9411 for (;;) {
9412 struct btrfs_ordered_extent *ordered;
9413
9414 ret = btrfs_wait_ordered_range(inode, start,
9415 lockend - start + 1);
9416 if (ret)
9417 goto out_unlock_inode;
9418
9419 lock_extent(io_tree, start, lockend, cached_state);
9420 ordered = btrfs_lookup_ordered_range(inode, start,
9421 lockend - start + 1);
9422 if (!ordered)
9423 break;
9424 btrfs_put_ordered_extent(ordered);
9425 unlock_extent(io_tree, start, lockend, cached_state);
9426 cond_resched();
9427 }
9428 }
9429
9430 em = btrfs_get_extent(inode, NULL, start, lockend - start + 1);
9431 if (IS_ERR(em)) {
9432 ret = PTR_ERR(em);
9433 goto out_unlock_extent;
9434 }
9435
9436 if (em->disk_bytenr == EXTENT_MAP_INLINE) {
9437 u64 extent_start = em->start;
9438
9439 /*
9440 * For inline extents we get everything we need out of the
9441 * extent item.
9442 */
9443 free_extent_map(em);
9444 em = NULL;
9445 ret = btrfs_encoded_read_inline(iocb, iter, start, lockend,
9446 cached_state, extent_start,
9447 count, encoded, &unlocked);
9448 goto out_unlock_extent;
9449 }
9450
9451 /*
9452 * We only want to return up to EOF even if the extent extends beyond
9453 * that.
9454 */
9455 encoded->len = min_t(u64, extent_map_end(em),
9456 inode->vfs_inode.i_size) - iocb->ki_pos;
9457 if (em->disk_bytenr == EXTENT_MAP_HOLE ||
9458 (em->flags & EXTENT_FLAG_PREALLOC)) {
9459 *disk_bytenr = EXTENT_MAP_HOLE;
9460 count = min_t(u64, count, encoded->len);
9461 encoded->len = count;
9462 encoded->unencoded_len = count;
9463 } else if (extent_map_is_compressed(em)) {
9464 *disk_bytenr = em->disk_bytenr;
9465 /*
9466 * Bail if the buffer isn't large enough to return the whole
9467 * compressed extent.
9468 */
9469 if (em->disk_num_bytes > count) {
9470 ret = -ENOBUFS;
9471 goto out_em;
9472 }
9473 *disk_io_size = em->disk_num_bytes;
9474 count = em->disk_num_bytes;
9475 encoded->unencoded_len = em->ram_bytes;
9476 encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset);
9477 ret = btrfs_encoded_io_compression_from_extent(fs_info,
9478 extent_map_compression(em));
9479 if (ret < 0)
9480 goto out_em;
9481 encoded->compression = ret;
9482 } else {
9483 *disk_bytenr = extent_map_block_start(em) + (start - em->start);
9484 if (encoded->len > count)
9485 encoded->len = count;
9486 /*
9487 * Don't read beyond what we locked. This also limits the page
9488 * allocations that we'll do.
9489 */
9490 *disk_io_size = min(lockend + 1, iocb->ki_pos + encoded->len) - start;
9491 count = start + *disk_io_size - iocb->ki_pos;
9492 encoded->len = count;
9493 encoded->unencoded_len = count;
9494 *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize);
9495 }
9496 free_extent_map(em);
9497 em = NULL;
9498
9499 if (*disk_bytenr == EXTENT_MAP_HOLE) {
9500 unlock_extent(io_tree, start, lockend, cached_state);
9501 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9502 unlocked = true;
9503 ret = iov_iter_zero(count, iter);
9504 if (ret != count)
9505 ret = -EFAULT;
9506 } else {
9507 ret = -EIOCBQUEUED;
9508 goto out_unlock_extent;
9509 }
9510
9511 out_em:
9512 free_extent_map(em);
9513 out_unlock_extent:
9514 /* Leave inode and extent locked if we need to do a read. */
9515 if (!unlocked && ret != -EIOCBQUEUED)
9516 unlock_extent(io_tree, start, lockend, cached_state);
9517 out_unlock_inode:
9518 if (!unlocked && ret != -EIOCBQUEUED)
9519 btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
9520 return ret;
9521 }
9522
9523 ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
9524 const struct btrfs_ioctl_encoded_io_args *encoded)
9525 {
9526 struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
9527 struct btrfs_root *root = inode->root;
9528 struct btrfs_fs_info *fs_info = root->fs_info;
9529 struct extent_io_tree *io_tree = &inode->io_tree;
9530 struct extent_changeset *data_reserved = NULL;
9531 struct extent_state *cached_state = NULL;
9532 struct btrfs_ordered_extent *ordered;
9533 struct btrfs_file_extent file_extent;
9534 int compression;
9535 size_t orig_count;
9536 u64 start, end;
9537 u64 num_bytes, ram_bytes, disk_num_bytes;
9538 unsigned long nr_folios, i;
9539 struct folio **folios;
9540 struct btrfs_key ins;
9541 bool extent_reserved = false;
9542 struct extent_map *em;
9543 ssize_t ret;
9544
9545 switch (encoded->compression) {
9546 case BTRFS_ENCODED_IO_COMPRESSION_ZLIB:
9547 compression = BTRFS_COMPRESS_ZLIB;
9548 break;
9549 case BTRFS_ENCODED_IO_COMPRESSION_ZSTD:
9550 compression = BTRFS_COMPRESS_ZSTD;
9551 break;
9552 case BTRFS_ENCODED_IO_COMPRESSION_LZO_4K:
9553 case BTRFS_ENCODED_IO_COMPRESSION_LZO_8K:
9554 case BTRFS_ENCODED_IO_COMPRESSION_LZO_16K:
9555 case BTRFS_ENCODED_IO_COMPRESSION_LZO_32K:
9556 case BTRFS_ENCODED_IO_COMPRESSION_LZO_64K:
9557 /* The sector size must match for LZO. */
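/*
 * E.g. LZO_4K is only accepted when sectorsize_bits == 12 (4K sectors)
 * and LZO_64K only when sectorsize_bits == 16.
 */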
9558 if (encoded->compression -
9559 BTRFS_ENCODED_IO_COMPRESSION_LZO_4K + 12 !=
9560 fs_info->sectorsize_bits)
9561 return -EINVAL;
9562 compression = BTRFS_COMPRESS_LZO;
9563 break;
9564 default:
9565 return -EINVAL;
9566 }
9567 if (encoded->encryption != BTRFS_ENCODED_IO_ENCRYPTION_NONE)
9568 return -EINVAL;
9569
9570 /*
9571 * Compressed extents should always have checksums, so error out if we
9572 * have a NOCOW file or inode was created while mounted with NODATASUM.
9573 */
9574 if (inode->flags & BTRFS_INODE_NODATASUM)
9575 return -EINVAL;
9576
9577 orig_count = iov_iter_count(from);
9578
9579 /* The extent size must be sane. */
9580 if (encoded->unencoded_len > BTRFS_MAX_UNCOMPRESSED ||
9581 orig_count > BTRFS_MAX_COMPRESSED || orig_count == 0)
9582 return -EINVAL;
9583
9584 /*
9585 * The compressed data must be smaller than the decompressed data.
9586 *
9587 * It's of course possible for data to compress to larger or the same
9588 * size, but the buffered I/O path falls back to no compression for such
9589 * data, and we don't want to break any assumptions by creating these
9590 * extents.
9591 *
9592 * Note that this is less strict than the current check we have that the
9593 * compressed data must be at least one sector smaller than the
9594 * decompressed data. We only want to enforce the weaker requirement
9595 * from old kernels that it is at least one byte smaller.
9596 */
9597 if (orig_count >= encoded->unencoded_len)
9598 return -EINVAL;
9599
9600 /* The extent must start on a sector boundary. */
9601 start = iocb->ki_pos;
9602 if (!IS_ALIGNED(start, fs_info->sectorsize))
9603 return -EINVAL;
9604
9605 /*
9606 * The extent must end on a sector boundary. However, we allow a write
9607 * which ends at or extends i_size to have an unaligned length; we round
9608 * up the extent size and set i_size to the unaligned end.
9609 */
9610 if (start + encoded->len < inode->vfs_inode.i_size &&
9611 !IS_ALIGNED(start + encoded->len, fs_info->sectorsize))
9612 return -EINVAL;
9613
9614 /* Finally, the offset in the unencoded data must be sector-aligned. */
9615 if (!IS_ALIGNED(encoded->unencoded_offset, fs_info->sectorsize))
9616 return -EINVAL;
9617
9618 num_bytes = ALIGN(encoded->len, fs_info->sectorsize);
9619 ram_bytes = ALIGN(encoded->unencoded_len, fs_info->sectorsize);
9620 end = start + num_bytes - 1;
9621
9622 /*
9623 * If the extent cannot be inline, the compressed data on disk must be
9624 * sector-aligned. For convenience, we extend it with zeroes if it
9625 * isn't.
9626 */
9627 disk_num_bytes = ALIGN(orig_count, fs_info->sectorsize);
9628 nr_folios = DIV_ROUND_UP(disk_num_bytes, PAGE_SIZE);
9629 folios = kvcalloc(nr_folios, sizeof(struct folio *), GFP_KERNEL_ACCOUNT);
9630 if (!folios)
9631 return -ENOMEM;
9632 for (i = 0; i < nr_folios; i++) {
9633 size_t bytes = min_t(size_t, PAGE_SIZE, iov_iter_count(from));
9634 char *kaddr;
9635
9636 folios[i] = folio_alloc(GFP_KERNEL_ACCOUNT, 0);
9637 if (!folios[i]) {
9638 ret = -ENOMEM;
9639 goto out_folios;
9640 }
9641 kaddr = kmap_local_folio(folios[i], 0);
9642 if (copy_from_iter(kaddr, bytes, from) != bytes) {
9643 kunmap_local(kaddr);
9644 ret = -EFAULT;
9645 goto out_folios;
9646 }
9647 if (bytes < PAGE_SIZE)
9648 memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
9649 kunmap_local(kaddr);
9650 }
9651
9652 for (;;) {
9653 struct btrfs_ordered_extent *ordered;
9654
9655 ret = btrfs_wait_ordered_range(inode, start, num_bytes);
9656 if (ret)
9657 goto out_folios;
9658 ret = invalidate_inode_pages2_range(inode->vfs_inode.i_mapping,
9659 start >> PAGE_SHIFT,
9660 end >> PAGE_SHIFT);
9661 if (ret)
9662 goto out_folios;
9663 lock_extent(io_tree, start, end, &cached_state);
9664 ordered = btrfs_lookup_ordered_range(inode, start, num_bytes);
9665 if (!ordered &&
9666 !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end))
9667 break;
9668 if (ordered)
9669 btrfs_put_ordered_extent(ordered);
9670 unlock_extent(io_tree, start, end, &cached_state);
9671 cond_resched();
9672 }
9673
9674 /*
9675 * We don't use the higher-level delalloc space functions because our
9676 * num_bytes and disk_num_bytes are different.
9677 */
9678 ret = btrfs_alloc_data_chunk_ondemand(inode, disk_num_bytes);
9679 if (ret)
9680 goto out_unlock;
9681 ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes);
9682 if (ret)
9683 goto out_free_data_space;
9684 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes,
9685 false);
9686 if (ret)
9687 goto out_qgroup_free_data;
9688
9689 /* Try an inline extent first. */
9690 if (encoded->unencoded_len == encoded->len &&
9691 encoded->unencoded_offset == 0 &&
9692 can_cow_file_range_inline(inode, start, encoded->len, orig_count)) {
9693 ret = __cow_file_range_inline(inode, encoded->len,
9694 orig_count, compression, folios[0],
9695 true);
9696 if (ret <= 0) {
9697 if (ret == 0)
9698 ret = orig_count;
9699 goto out_delalloc_release;
9700 }
9701 }
9702
9703 ret = btrfs_reserve_extent(root, disk_num_bytes, disk_num_bytes,
9704 disk_num_bytes, 0, 0, &ins, 1, 1);
9705 if (ret)
9706 goto out_delalloc_release;
9707 extent_reserved = true;
9708
9709 file_extent.disk_bytenr = ins.objectid;
9710 file_extent.disk_num_bytes = ins.offset;
9711 file_extent.num_bytes = num_bytes;
9712 file_extent.ram_bytes = ram_bytes;
9713 file_extent.offset = encoded->unencoded_offset;
9714 file_extent.compression = compression;
9715 em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_COMPRESSED);
9716 if (IS_ERR(em)) {
9717 ret = PTR_ERR(em);
9718 goto out_free_reserved;
9719 }
9720 free_extent_map(em);
9721
9722 ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent,
9723 (1 << BTRFS_ORDERED_ENCODED) |
9724 (1 << BTRFS_ORDERED_COMPRESSED));
9725 if (IS_ERR(ordered)) {
9726 btrfs_drop_extent_map_range(inode, start, end, false);
9727 ret = PTR_ERR(ordered);
9728 goto out_free_reserved;
9729 }
9730 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9731
9732 if (start + encoded->len > inode->vfs_inode.i_size)
9733 i_size_write(&inode->vfs_inode, start + encoded->len);
9734
9735 unlock_extent(io_tree, start, end, &cached_state);
9736
9737 btrfs_delalloc_release_extents(inode, num_bytes);
9738
9739 btrfs_submit_compressed_write(ordered, folios, nr_folios, 0, false);
9740 ret = orig_count;
9741 goto out;
9742
9743 out_free_reserved:
9744 btrfs_dec_block_group_reservations(fs_info, ins.objectid);
9745 btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
9746 out_delalloc_release:
9747 btrfs_delalloc_release_extents(inode, num_bytes);
9748 btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0);
9749 out_qgroup_free_data:
9750 if (ret < 0)
9751 btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL);
9752 out_free_data_space:
9753 /*
9754 * If btrfs_reserve_extent() succeeded, then we already decremented
9755 * bytes_may_use.
9756 */
9757 if (!extent_reserved)
9758 btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes);
9759 out_unlock:
9760 unlock_extent(io_tree, start, end, &cached_state);
9761 out_folios:
9762 for (i = 0; i < nr_folios; i++) {
9763 if (folios[i])
9764 folio_put(folios[i]);
9765 }
9766 kvfree(folios);
9767 out:
9768 if (ret >= 0)
9769 iocb->ki_pos += encoded->len;
9770 return ret;
9771 }
9772
9773 #ifdef CONFIG_SWAP
9774 /*
9775 * Add an entry indicating a block group or device which is pinned by a
9776 * swapfile. Returns 0 on success, 1 if there is already an entry for it, or a
9777 * negative errno on failure.
9778 */
9779 static int btrfs_add_swapfile_pin(struct inode *inode, void *ptr,
9780 bool is_block_group)
9781 {
9782 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9783 struct btrfs_swapfile_pin *sp, *entry;
9784 struct rb_node **p;
9785 struct rb_node *parent = NULL;
9786
9787 sp = kmalloc(sizeof(*sp), GFP_NOFS);
9788 if (!sp)
9789 return -ENOMEM;
9790 sp->ptr = ptr;
9791 sp->inode = inode;
9792 sp->is_block_group = is_block_group;
9793 sp->bg_extent_count = 1;
9794
9795 spin_lock(&fs_info->swapfile_pins_lock);
9796 p = &fs_info->swapfile_pins.rb_node;
9797 while (*p) {
9798 parent = *p;
9799 entry = rb_entry(parent, struct btrfs_swapfile_pin, node);
9800 if (sp->ptr < entry->ptr ||
9801 (sp->ptr == entry->ptr && sp->inode < entry->inode)) {
9802 p = &(*p)->rb_left;
9803 } else if (sp->ptr > entry->ptr ||
9804 (sp->ptr == entry->ptr && sp->inode > entry->inode)) {
9805 p = &(*p)->rb_right;
9806 } else {
9807 if (is_block_group)
9808 entry->bg_extent_count++;
9809 spin_unlock(&fs_info->swapfile_pins_lock);
9810 kfree(sp);
9811 return 1;
9812 }
9813 }
9814 rb_link_node(&sp->node, parent, p);
9815 rb_insert_color(&sp->node, &fs_info->swapfile_pins);
9816 spin_unlock(&fs_info->swapfile_pins_lock);
9817 return 0;
9818 }
9819
9820 /* Free all of the entries pinned by this swapfile. */
9821 static void btrfs_free_swapfile_pins(struct inode *inode)
9822 {
9823 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
9824 struct btrfs_swapfile_pin *sp;
9825 struct rb_node *node, *next;
9826
9827 spin_lock(&fs_info->swapfile_pins_lock);
9828 node = rb_first(&fs_info->swapfile_pins);
9829 while (node) {
9830 next = rb_next(node);
9831 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
9832 if (sp->inode == inode) {
9833 rb_erase(&sp->node, &fs_info->swapfile_pins);
9834 if (sp->is_block_group) {
9835 btrfs_dec_block_group_swap_extents(sp->ptr,
9836 sp->bg_extent_count);
9837 btrfs_put_block_group(sp->ptr);
9838 }
9839 kfree(sp);
9840 }
9841 node = next;
9842 }
9843 spin_unlock(&fs_info->swapfile_pins_lock);
9844 }
9845
9846 struct btrfs_swap_info {
9847 u64 start;
9848 u64 block_start;
9849 u64 block_len;
9850 u64 lowest_ppage;
9851 u64 highest_ppage;
9852 unsigned long nr_pages;
9853 int nr_extents;
9854 };
9855
9856 static int btrfs_add_swap_extent(struct swap_info_struct *sis,
9857 struct btrfs_swap_info *bsi)
9858 {
9859 unsigned long nr_pages;
9860 unsigned long max_pages;
9861 u64 first_ppage, first_ppage_reported, next_ppage;
9862 int ret;
9863
9864 /*
9865 * Our swapfile may have had its size extended after the swap header was
9866 * written. In that case activating the swapfile should not go beyond
9867 * the max size set in the swap header.
9868 */
9869 if (bsi->nr_pages >= sis->max)
9870 return 0;
9871
9872 max_pages = sis->max - bsi->nr_pages;
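/*
 * Round the physical start up and the physical end down to whole pages;
 * an extent that does not cover at least one full page contributes
 * nothing.
 */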
9873 first_ppage = PAGE_ALIGN(bsi->block_start) >> PAGE_SHIFT;
9874 next_ppage = PAGE_ALIGN_DOWN(bsi->block_start + bsi->block_len) >> PAGE_SHIFT;
9875
9876 if (first_ppage >= next_ppage)
9877 return 0;
9878 nr_pages = next_ppage - first_ppage;
9879 nr_pages = min(nr_pages, max_pages);
9880
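/*
 * The first page of the swapfile holds the swap header and is never
 * handed out, so don't report it as the lowest usable physical page.
 */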
9881 first_ppage_reported = first_ppage;
9882 if (bsi->start == 0)
9883 first_ppage_reported++;
9884 if (bsi->lowest_ppage > first_ppage_reported)
9885 bsi->lowest_ppage = first_ppage_reported;
9886 if (bsi->highest_ppage < (next_ppage - 1))
9887 bsi->highest_ppage = next_ppage - 1;
9888
9889 ret = add_swap_extent(sis, bsi->nr_pages, nr_pages, first_ppage);
9890 if (ret < 0)
9891 return ret;
9892 bsi->nr_extents += ret;
9893 bsi->nr_pages += nr_pages;
9894 return 0;
9895 }
9896
9897 static void btrfs_swap_deactivate(struct file *file)
9898 {
9899 struct inode *inode = file_inode(file);
9900
9901 btrfs_free_swapfile_pins(inode);
9902 atomic_dec(&BTRFS_I(inode)->root->nr_swapfiles);
9903 }
9904
9905 static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
9906 sector_t *span)
9907 {
9908 struct inode *inode = file_inode(file);
9909 struct btrfs_root *root = BTRFS_I(inode)->root;
9910 struct btrfs_fs_info *fs_info = root->fs_info;
9911 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
9912 struct extent_state *cached_state = NULL;
9913 struct btrfs_chunk_map *map = NULL;
9914 struct btrfs_device *device = NULL;
9915 struct btrfs_swap_info bsi = {
9916 .lowest_ppage = (sector_t)-1ULL,
9917 };
9918 struct btrfs_backref_share_check_ctx *backref_ctx = NULL;
9919 struct btrfs_path *path = NULL;
9920 int ret = 0;
9921 u64 isize;
9922 u64 prev_extent_end = 0;
9923
9924 /*
9925 * Acquire the inode's mmap lock to prevent races with memory mapped
9926 * writes, as they could happen after we flush delalloc below and before
9927 * we lock the extent range further below. The inode was already locked
9928 * up in the call chain.
9929 */
9930 btrfs_assert_inode_locked(BTRFS_I(inode));
9931 down_write(&BTRFS_I(inode)->i_mmap_lock);
9932
9933 /*
9934 * If the swap file was just created, make sure delalloc is done. If the
9935 * file changes again after this, the user is doing something stupid and
9936 * we don't really care.
9937 */
9938 ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1);
9939 if (ret)
9940 goto out_unlock_mmap;
9941
9942 /*
9943 * The inode is locked, so these flags won't change after we check them.
9944 */
9945 if (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS) {
9946 btrfs_warn(fs_info, "swapfile must not be compressed");
9947 ret = -EINVAL;
9948 goto out_unlock_mmap;
9949 }
9950 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW)) {
9951 btrfs_warn(fs_info, "swapfile must not be copy-on-write");
9952 ret = -EINVAL;
9953 goto out_unlock_mmap;
9954 }
9955 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
9956 btrfs_warn(fs_info, "swapfile must not be checksummed");
9957 ret = -EINVAL;
9958 goto out_unlock_mmap;
9959 }
9960
9961 path = btrfs_alloc_path();
9962 backref_ctx = btrfs_alloc_backref_share_check_ctx();
9963 if (!path || !backref_ctx) {
9964 ret = -ENOMEM;
9965 goto out_unlock_mmap;
9966 }
9967
9968 /*
9969 * Balance or device remove/replace/resize can move stuff around from
9970 * under us. The exclop protection makes sure they aren't running/won't
9971 * run concurrently while we are mapping the swap extents, and
9972 * fs_info->swapfile_pins prevents them from running while the swap
9973 * file is active and moving the extents. Note that this also prevents
9974 * a concurrent device add which isn't actually necessary, but it's not
9975 * really worth the trouble to allow it.
9976 */
9977 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) {
9978 btrfs_warn(fs_info,
9979 "cannot activate swapfile while exclusive operation is running");
9980 ret = -EBUSY;
9981 goto out_unlock_mmap;
9982 }
9983
9984 /*
9985 * Prevent snapshot creation while we are activating the swap file.
9986 * We do not want to race with snapshot creation. If snapshot creation
9987 * already started before we bumped nr_swapfiles from 0 to 1 and
9988 * completes before the first write into the swap file after it is
9989 * activated, then that write would fall back to COW.
9990 */
9991 if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) {
9992 btrfs_exclop_finish(fs_info);
9993 btrfs_warn(fs_info,
9994 "cannot activate swapfile because snapshot creation is in progress");
9995 ret = -EINVAL;
9996 goto out_unlock_mmap;
9997 }
9998 /*
9999 * Snapshots can create extents which require COW even if NODATACOW is
10000 * set. We use this counter to prevent snapshots. We must increment it
10001 * before walking the extents because we don't want a concurrent
10002 * snapshot to run after we've already checked the extents.
10003 *
10004 * It is possible that the subvolume is marked for deletion but not yet
10005 * removed. To prevent this race, we check the root status before
10006 * activating the swapfile.
10007 */
10008 spin_lock(&root->root_item_lock);
10009 if (btrfs_root_dead(root)) {
10010 spin_unlock(&root->root_item_lock);
10011
10012 btrfs_drew_write_unlock(&root->snapshot_lock);
10013 btrfs_exclop_finish(fs_info);
10014 btrfs_warn(fs_info,
10015 "cannot activate swapfile because subvolume %llu is being deleted",
10016 btrfs_root_id(root));
10017 ret = -EPERM;
10018 goto out_unlock_mmap;
10019 }
10020 atomic_inc(&root->nr_swapfiles);
10021 spin_unlock(&root->root_item_lock);
10022
10023 isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
10024
10025 lock_extent(io_tree, 0, isize - 1, &cached_state);
10026 while (prev_extent_end < isize) {
10027 struct btrfs_key key;
10028 struct extent_buffer *leaf;
10029 struct btrfs_file_extent_item *ei;
10030 struct btrfs_block_group *bg;
10031 u64 logical_block_start;
10032 u64 physical_block_start;
10033 u64 extent_gen;
10034 u64 disk_bytenr;
10035 u64 len;
10036
10037 key.objectid = btrfs_ino(BTRFS_I(inode));
10038 key.type = BTRFS_EXTENT_DATA_KEY;
10039 key.offset = prev_extent_end;
10040
10041 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
10042 if (ret < 0)
10043 goto out;
10044
10045 /*
10046 * If key not found it means we have an implicit hole (NO_HOLES
10047 * is enabled).
10048 */
10049 if (ret > 0) {
10050 btrfs_warn(fs_info, "swapfile must not have holes");
10051 ret = -EINVAL;
10052 goto out;
10053 }
10054
10055 leaf = path->nodes[0];
10056 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item);
10057
10058 if (btrfs_file_extent_type(leaf, ei) == BTRFS_FILE_EXTENT_INLINE) {
10059 /*
10060 * It's unlikely we'll ever actually find ourselves
10061 * here, as a file small enough to fit inline won't be
10062 * big enough to store more than the swap header, but in
10063 * case something changes in the future, let's catch it
10064 * here rather than later.
10065 */
10066 btrfs_warn(fs_info, "swapfile must not be inline");
10067 ret = -EINVAL;
10068 goto out;
10069 }
10070
10071 if (btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) {
10072 btrfs_warn(fs_info, "swapfile must not be compressed");
10073 ret = -EINVAL;
10074 goto out;
10075 }
10076
10077 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei);
10078 if (disk_bytenr == 0) {
10079 btrfs_warn(fs_info, "swapfile must not have holes");
10080 ret = -EINVAL;
10081 goto out;
10082 }
10083
10084 logical_block_start = disk_bytenr + btrfs_file_extent_offset(leaf, ei);
10085 extent_gen = btrfs_file_extent_generation(leaf, ei);
10086 prev_extent_end = btrfs_file_extent_end(path);
10087
10088 if (prev_extent_end > isize)
10089 len = isize - key.offset;
10090 else
10091 len = btrfs_file_extent_num_bytes(leaf, ei);
10092
10093 backref_ctx->curr_leaf_bytenr = leaf->start;
10094
10095 /*
10096 * Don't need the path anymore, release to avoid deadlocks when
10097 * calling btrfs_is_data_extent_shared() because when joining a
10098 * transaction it can block waiting for the current one's commit
10099 * which in turn may be trying to lock the same leaf to flush
10100 * delayed items for example.
10101 */
10102 btrfs_release_path(path);
10103
10104 ret = btrfs_is_data_extent_shared(BTRFS_I(inode), disk_bytenr,
10105 extent_gen, backref_ctx);
10106 if (ret < 0) {
10107 goto out;
10108 } else if (ret > 0) {
10109 btrfs_warn(fs_info,
10110 "swapfile must not be copy-on-write");
10111 ret = -EINVAL;
10112 goto out;
10113 }
10114
10115 map = btrfs_get_chunk_map(fs_info, logical_block_start, len);
10116 if (IS_ERR(map)) {
10117 ret = PTR_ERR(map);
10118 goto out;
10119 }
10120
10121 if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
10122 btrfs_warn(fs_info,
10123 "swapfile must have single data profile");
10124 ret = -EINVAL;
10125 goto out;
10126 }
10127
10128 if (device == NULL) {
10129 device = map->stripes[0].dev;
10130 ret = btrfs_add_swapfile_pin(inode, device, false);
10131 if (ret == 1)
10132 ret = 0;
10133 else if (ret)
10134 goto out;
10135 } else if (device != map->stripes[0].dev) {
10136 btrfs_warn(fs_info, "swapfile must be on one device");
10137 ret = -EINVAL;
10138 goto out;
10139 }
10140
10141 physical_block_start = (map->stripes[0].physical +
10142 (logical_block_start - map->start));
10143 btrfs_free_chunk_map(map);
10144 map = NULL;
10145
10146 bg = btrfs_lookup_block_group(fs_info, logical_block_start);
10147 if (!bg) {
10148 btrfs_warn(fs_info,
10149 "could not find block group containing swapfile");
10150 ret = -EINVAL;
10151 goto out;
10152 }
10153
10154 if (!btrfs_inc_block_group_swap_extents(bg)) {
10155 btrfs_warn(fs_info,
10156 "block group for swapfile at %llu is read-only%s",
10157 bg->start,
10158 atomic_read(&fs_info->scrubs_running) ?
10159 " (scrub running)" : "");
10160 btrfs_put_block_group(bg);
10161 ret = -EINVAL;
10162 goto out;
10163 }
10164
10165 ret = btrfs_add_swapfile_pin(inode, bg, true);
10166 if (ret) {
10167 btrfs_put_block_group(bg);
10168 if (ret == 1)
10169 ret = 0;
10170 else
10171 goto out;
10172 }
10173
10174 if (bsi.block_len &&
10175 bsi.block_start + bsi.block_len == physical_block_start) {
10176 bsi.block_len += len;
10177 } else {
10178 if (bsi.block_len) {
10179 ret = btrfs_add_swap_extent(sis, &bsi);
10180 if (ret)
10181 goto out;
10182 }
10183 bsi.start = key.offset;
10184 bsi.block_start = physical_block_start;
10185 bsi.block_len = len;
10186 }
10187
10188 if (fatal_signal_pending(current)) {
10189 ret = -EINTR;
10190 goto out;
10191 }
10192
10193 cond_resched();
10194 }
10195
	if (bsi.block_len)
		ret = btrfs_add_swap_extent(sis, &bsi);

out:
	if (!IS_ERR_OR_NULL(map))
		btrfs_free_chunk_map(map);

	unlock_extent(io_tree, 0, isize - 1, &cached_state);

	if (ret)
		btrfs_swap_deactivate(file);

	btrfs_drew_write_unlock(&root->snapshot_lock);

	btrfs_exclop_finish(fs_info);

out_unlock_mmap:
	up_write(&BTRFS_I(inode)->i_mmap_lock);
	btrfs_free_backref_share_ctx(backref_ctx);
	btrfs_free_path(path);
	if (ret)
		return ret;

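	/*
	 * On success, report the backing device and the page/extent
	 * accounting gathered above to the swap subsystem.
	 */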
	if (device)
		sis->bdev = device->bdev;
	*span = bsi.highest_ppage - bsi.lowest_ppage + 1;
	sis->max = bsi.nr_pages;
	sis->pages = bsi.nr_pages - 1;
	return bsi.nr_extents;
}
#else
static void btrfs_swap_deactivate(struct file *file)
{
}

static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
			       sector_t *span)
{
	return -EOPNOTSUPP;
}
#endif

/*
 * Update the number of bytes used in the VFS' inode. When we replace extents in
 * a range (clone, dedupe, fallocate's zero range), we must update the number of
 * bytes used by the inode in an atomic manner, so that concurrent stat(2) calls
 * always get a correct value.
 */
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
			      const u64 add_bytes,
			      const u64 del_bytes)
{
	if (add_bytes == del_bytes)
		return;

	spin_lock(&inode->lock);
	if (del_bytes > 0)
		inode_sub_bytes(&inode->vfs_inode, del_bytes);
	if (add_bytes > 0)
		inode_add_bytes(&inode->vfs_inode, add_bytes);
	spin_unlock(&inode->lock);
}

/*
 * Verify that there are no ordered extents for a given file range.
 *
 * @inode:   The target inode.
 * @start:   Start offset of the file range, should be sector size aligned.
 * @end:     End offset (inclusive) of the file range, its value plus 1 should
 *           be sector size aligned.
 *
 * This should typically be used in cases where we have locked the inode's VFS
 * lock in exclusive mode, we have also locked the inode's i_mmap_lock in
 * exclusive mode, we have flushed all delalloc in the range, we have waited
 * for all ordered extents in the range to complete and finally we have locked
 * the file range in the inode's io_tree.
 */
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end)
{
	struct btrfs_root *root = inode->root;
	struct btrfs_ordered_extent *ordered;

	if (!IS_ENABLED(CONFIG_BTRFS_ASSERT))
		return;

	ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start);
	if (ordered) {
		btrfs_err(root->fs_info,
"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])",
			  start, end, btrfs_ino(inode), btrfs_root_id(root),
			  ordered->file_offset,
			  ordered->file_offset + ordered->num_bytes - 1);
		btrfs_put_ordered_extent(ordered);
	}

	ASSERT(ordered == NULL);
}

/*
 * Find the first inode with a minimum number.
 *
 * @root:    The root to search in.
 * @min_ino: The minimum inode number.
 *
 * Find the first inode in the @root with a number >= @min_ino and return it.
 * Returns NULL if no such inode is found.
 */
struct btrfs_inode *btrfs_find_first_inode(struct btrfs_root *root, u64 min_ino)
{
	struct btrfs_inode *inode;
	unsigned long from = min_ino;

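	/*
	 * Scan the root's inode xarray for the first present inode with a
	 * number >= min_ino. If an inode is already being evicted, igrab()
	 * fails, so skip it and continue the search from the next inode
	 * number.
	 */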
	xa_lock(&root->inodes);
	while (true) {
		inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT);
		if (!inode)
			break;
		if (igrab(&inode->vfs_inode))
			break;

		from = btrfs_ino(inode) + 1;
		cond_resched_lock(&root->inodes.xa_lock);
	}
	xa_unlock(&root->inodes);

	return inode;
}

static const struct inode_operations btrfs_dir_inode_operations = {
	.getattr = btrfs_getattr,
	.lookup = btrfs_lookup,
	.create = btrfs_create,
	.unlink = btrfs_unlink,
	.link = btrfs_link,
	.mkdir = btrfs_mkdir,
	.rmdir = btrfs_rmdir,
	.rename = btrfs_rename2,
	.symlink = btrfs_symlink,
	.setattr = btrfs_setattr,
	.mknod = btrfs_mknod,
	.listxattr = btrfs_listxattr,
	.permission = btrfs_permission,
	.get_inode_acl = btrfs_get_acl,
	.set_acl = btrfs_set_acl,
	.update_time = btrfs_update_time,
	.tmpfile = btrfs_tmpfile,
	.fileattr_get = btrfs_fileattr_get,
	.fileattr_set = btrfs_fileattr_set,
};

static const struct file_operations btrfs_dir_file_operations = {
	.llseek = btrfs_dir_llseek,
	.read = generic_read_dir,
	.iterate_shared = btrfs_real_readdir,
	.open = btrfs_opendir,
	.unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = btrfs_compat_ioctl,
#endif
	.release = btrfs_release_file,
	.fsync = btrfs_sync_file,
};

/*
 * btrfs doesn't support the bmap operation because swapfiles
 * use bmap to make a mapping of extents in the file. They assume
 * these extents won't change over the life of the file and they
 * use the bmap result to do IO directly to the drive.
 *
 * The btrfs bmap call would return logical addresses that aren't
 * suitable for IO and they also would change frequently as COW
 * operations happen. So, swapfile + btrfs == corruption.
 *
 * For now we're avoiding this by dropping bmap.
 */
static const struct address_space_operations btrfs_aops = {
	.read_folio = btrfs_read_folio,
	.writepages = btrfs_writepages,
	.readahead = btrfs_readahead,
	.invalidate_folio = btrfs_invalidate_folio,
	.launder_folio = btrfs_launder_folio,
	.release_folio = btrfs_release_folio,
	.migrate_folio = btrfs_migrate_folio,
	.dirty_folio = filemap_dirty_folio,
	.error_remove_folio = generic_error_remove_folio,
	.swap_activate = btrfs_swap_activate,
	.swap_deactivate = btrfs_swap_deactivate,
};

static const struct inode_operations btrfs_file_inode_operations = {
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.listxattr = btrfs_listxattr,
	.permission = btrfs_permission,
	.fiemap = btrfs_fiemap,
	.get_inode_acl = btrfs_get_acl,
	.set_acl = btrfs_set_acl,
	.update_time = btrfs_update_time,
	.fileattr_get = btrfs_fileattr_get,
	.fileattr_set = btrfs_fileattr_set,
};
static const struct inode_operations btrfs_special_inode_operations = {
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.permission = btrfs_permission,
	.listxattr = btrfs_listxattr,
	.get_inode_acl = btrfs_get_acl,
	.set_acl = btrfs_set_acl,
	.update_time = btrfs_update_time,
};
static const struct inode_operations btrfs_symlink_inode_operations = {
	.get_link = page_get_link,
	.getattr = btrfs_getattr,
	.setattr = btrfs_setattr,
	.permission = btrfs_permission,
	.listxattr = btrfs_listxattr,
	.update_time = btrfs_update_time,
};

const struct dentry_operations btrfs_dentry_operations = {
	.d_delete = btrfs_dentry_delete,
};