// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"

/*
 * Lockdep class keys for extent_buffer->lock in this root.  For a given
 * eb, the lockdep key is determined by the btrfs_root it belongs to and
 * the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes and may nest inside each
 * other, so they require separate keysets.  As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid.  This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked, thus preventing deadlock.  As lockdep doesn't know this, use
 * a subclass to avoid triggering lockdep warnings in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked.  It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here.  If BTRFS_MAX_LEVEL changes, this code
 * needs updating as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#if BTRFS_MAX_LEVEL != 8
#error "BTRFS_MAX_LEVEL changed, update the lockdep keyset definitions below"
#endif

#define DEFINE_LEVEL(stem, level)					\
	.names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem)						\
	DEFINE_LEVEL(stem, 0)						\
	DEFINE_LEVEL(stem, 1)						\
	DEFINE_LEVEL(stem, 2)						\
	DEFINE_LEVEL(stem, 3)						\
	DEFINE_LEVEL(stem, 4)						\
	DEFINE_LEVEL(stem, 5)						\
	DEFINE_LEVEL(stem, 6)						\
	DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
	u64			id;		/* root objectid */
	/* Longest entry: btrfs-block-group-00 */
	char			names[BTRFS_MAX_LEVEL][24];
	struct lock_class_key	keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	DEFINE_NAME("root")	},
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	DEFINE_NAME("extent")	},
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	DEFINE_NAME("chunk")	},
	{ .id = BTRFS_DEV_TREE_OBJECTID,	DEFINE_NAME("dev")	},
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	DEFINE_NAME("csum")	},
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	DEFINE_NAME("quota")	},
	{ .id = BTRFS_TREE_LOG_OBJECTID,	DEFINE_NAME("log")	},
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	DEFINE_NAME("treloc")	},
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	DEFINE_NAME("dreloc")	},
	{ .id = BTRFS_UUID_TREE_OBJECTID,	DEFINE_NAME("uuid")	},
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	DEFINE_NAME("free-space") },
	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
	{ .id = 0,				DEFINE_NAME("tree")	},
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME
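
/*
 * For illustration, DEFINE_NAME("root") in the table above expands through
 * DEFINE_LEVEL() to the per-level initializers (a sketch of the preprocessor
 * output):
 *
 *	.names[0] = "btrfs-root-00",
 *	.names[1] = "btrfs-root-01",
 *	...
 *	.names[7] = "btrfs-root-07",
 */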

void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
	struct btrfs_lockdep_keyset *ks;

	ASSERT(level < ARRAY_SIZE(ks->keys));

	/* Find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}

void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
	if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
		btrfs_set_buffer_lockdep_class(btrfs_root_id(root),
					       eb, btrfs_header_level(eb));
}

#endif
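
/*
 * An example of how the lookup above resolves (the values are made up for
 * illustration): a level 1 node of the csum tree gets the class keys[1] of
 * the BTRFS_CSUM_TREE_OBJECTID entry and the lockdep name "btrfs-csum-01";
 * an objectid with no dedicated entry falls through to the terminating
 * id 0 entry, e.g. "btrfs-tree-00" at level 0.
 */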

#ifdef CONFIG_BTRFS_DEBUG
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
{
	eb->lock_owner = owner;
}
#else
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
#endif

/*
 * Extent buffer locking
 * =====================
 *
 * We use a rw_semaphore for tree locking, and the semantics are exactly the
 * same:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - try-lock semantics for readers and writers
 *
 * The rwsem implementation does opportunistic spinning, which reduces the
 * number of times the locking task needs to sleep.
 */
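
/*
 * A minimal usage sketch (eb is any extent buffer the caller already holds
 * a reference on; both flavours may sleep):
 *
 *	btrfs_tree_read_lock(eb);
 *	... read-only access to the buffer ...
 *	btrfs_tree_read_unlock(eb);
 *
 *	btrfs_tree_lock(eb);
 *	... modify the buffer ...
 *	btrfs_tree_unlock(eb);
 */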

/*
 * btrfs_tree_read_lock_nested - lock extent buffer for read
 * @eb:		the eb to be locked
 * @nest:	the nesting level to be used for lockdep
 *
 * This takes the read lock on the extent buffer, using the specified nesting
 * level for lockdep purposes.
 */
void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_read_lock_enabled())
		start_ns = ktime_get_ns();

	down_read_nested(&eb->lock, nest);
	trace_btrfs_tree_read_lock(eb, start_ns);
}

/*
 * Try-lock for read.
 *
 * Return 1 if the read lock was taken, 0 otherwise.
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
	if (down_read_trylock(&eb->lock)) {
		trace_btrfs_try_tree_read_lock(eb);
		return 1;
	}
	return 0;
}

/*
 * Try-lock for write.
 *
 * Return 1 if the write lock was taken, 0 otherwise.
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
	if (down_write_trylock(&eb->lock)) {
		btrfs_set_eb_lock_owner(eb, current->pid);
		trace_btrfs_try_tree_write_lock(eb);
		return 1;
	}
	return 0;
}
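
/*
 * A sketch of the try-lock pattern, e.g. on paths that must not block (the
 * label is hypothetical):
 *
 *	if (!btrfs_try_tree_write_lock(eb))
 *		goto out_would_block;	(fall back to blocking or -EAGAIN)
 */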

/*
 * Release read lock.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock(eb);
	up_read(&eb->lock);
}

/*
 * Lock eb for write.
 *
 * @eb:		the eb to lock
 * @nest:	the nesting to use for the lock
 *
 * Returns with the eb->lock write locked.
 */
void btrfs_tree_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
	__acquires(&eb->lock)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_lock_enabled())
		start_ns = ktime_get_ns();

	down_write_nested(&eb->lock, nest);
	btrfs_set_eb_lock_owner(eb, current->pid);
	trace_btrfs_tree_lock(eb, start_ns);
}

/*
 * Release the write lock.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_unlock(eb);
	btrfs_set_eb_lock_owner(eb, 0);
	up_write(&eb->lock);
}

/*
 * This releases any locks held in the path starting at level and going all the
 * way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node.  This ignores
 * those rules, and it should only be called when there are no more updates to
 * be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
		path->locks[i] = 0;
	}
}
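
/*
 * For example (a sketch; the path is assumed to hold locks on every level
 * after a search with keep_locks not set):
 *
 *	btrfs_unlock_up_safe(path, 1);
 *
 * drops the locks on levels 1 and above, leaving only the leaf (level 0)
 * locked for the remaining modifications.
 */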

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_lock(eb);
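		/*
		 * The root may have been swapped out by a COW while we slept
		 * on the lock, in which case eb is stale; drop it and retry
		 * with the new root node.
		 */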
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree in
 * nowait mode until we either end up with a lock on the root node or bail out
 * to avoid blocking.
 *
 * Return: root extent buffer with read lock held, or ERR_PTR(-EAGAIN).
 */
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		if (!btrfs_try_tree_read_lock(eb)) {
			free_extent_buffer(eb);
			return ERR_PTR(-EAGAIN);
		}
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}
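
/*
 * A caller would typically check the result with IS_ERR(), e.g. (sketch):
 *
 *	eb = btrfs_try_read_lock_root_node(root);
 *	if (IS_ERR(eb))
 *		return PTR_ERR(eb);	(-EAGAIN, retry in blocking mode)
 */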

/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock. It's used in situations
 * where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives more priority to readers. If a reader and a
 * writer both race to acquire their respective sides of the lock, the writer
 * yields its lock as soon as it detects a concurrent reader. Additionally, if
 * there are pending readers, no new writers are allowed to come in and acquire
 * the lock.
 */
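
/*
 * A minimal usage sketch (the lock instance is a placeholder; any number of
 * tasks may hold the same side at once, only the two sides exclude each
 * other):
 *
 *	struct btrfs_drew_lock lock;
 *
 *	btrfs_drew_lock_init(&lock);
 *
 *	side A:					side B:
 *	btrfs_drew_write_lock(&lock);		btrfs_drew_read_lock(&lock);
 *	... no B-side holders here ...		... no A-side holders here ...
 *	btrfs_drew_write_unlock(&lock);		btrfs_drew_read_unlock(&lock);
 */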

void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
	atomic_set(&lock->readers, 0);
	atomic_set(&lock->writers, 0);
	init_waitqueue_head(&lock->pending_readers);
	init_waitqueue_head(&lock->pending_writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
	if (atomic_read(&lock->readers))
		return false;

	atomic_inc(&lock->writers);

	/* Ensure writers count is updated before we check for pending readers */
	smp_mb__after_atomic();
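	/*
	 * This pairs with the smp_mb__after_atomic() in btrfs_drew_read_lock():
	 * with full barriers on both sides, at least one of two racing tasks
	 * is guaranteed to see the other side's counter update.
	 */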
	if (atomic_read(&lock->readers)) {
		btrfs_drew_write_unlock(lock);
		return false;
	}

	return true;
}

void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
	while (true) {
		if (btrfs_drew_try_write_lock(lock))
			return;
		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
	}
}

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
	/*
	 * atomic_dec_and_test() implies a full barrier, so woken up readers are
	 * guaranteed to see the decrement.
	 */
	if (atomic_dec_and_test(&lock->writers))
		wake_up(&lock->pending_readers);
}

void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
	atomic_inc(&lock->readers);

	/*
	 * Ensure the pending reader count is perceived BEFORE this reader
	 * goes to sleep in case of active writers. This guarantees new writers
	 * won't be allowed and that the current reader will be woken up when
	 * the last active writer finishes its job.
	 */
	smp_mb__after_atomic();

	wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
	/*
	 * atomic_dec_and_test() implies a full barrier, so woken up writers
	 * are guaranteed to see the decrement.
	 */
	if (atomic_dec_and_test(&lock->readers))
		wake_up(&lock->pending_writers);
}
399