// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2008 Oracle. All rights reserved.
 */

#include <linux/sched.h>
#include <linux/pagemap.h>
#include <linux/spinlock.h>
#include <linux/page-flags.h>
#include <asm/bug.h>
#include <trace/events/btrfs.h>
#include "misc.h"
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
#include "accessors.h"

/*
 * Lockdep class keys for the extent_buffer->lock of each node in a root.
 * For a given eb, the lockdep key is determined by the btrfs_root it belongs
 * to and the level the eb occupies in the tree.
 *
 * Different roots are used for different purposes, may nest inside each
 * other, and therefore require separate keysets. As lockdep keys should be
 * static, assign keysets according to the purpose of the root as indicated
 * by btrfs_root->root_key.objectid. This ensures that all special purpose
 * roots have separate keysets.
 *
 * Lock-nesting across peer nodes is always done with the immediate parent
 * node locked, thus preventing deadlock. As lockdep doesn't know this, use
 * a subclass to avoid triggering a lockdep warning in such cases.
 *
 * The key is set by the readpage_end_io_hook after the buffer has passed
 * csum validation but before the pages are unlocked. It is also set by
 * btrfs_init_new_buffer on freshly allocated blocks.
 *
 * We also add a check to make sure the highest level of the tree is the
 * same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this code
 * needs to be updated as well.
 */
#ifdef CONFIG_DEBUG_LOCK_ALLOC
#if BTRFS_MAX_LEVEL != 8
#error "BTRFS_MAX_LEVEL has changed, update the lockdep level definitions below"
#endif

#define DEFINE_LEVEL(stem, level) \
	.names[level] = "btrfs-" stem "-0" #level,

#define DEFINE_NAME(stem) \
	DEFINE_LEVEL(stem, 0) \
	DEFINE_LEVEL(stem, 1) \
	DEFINE_LEVEL(stem, 2) \
	DEFINE_LEVEL(stem, 3) \
	DEFINE_LEVEL(stem, 4) \
	DEFINE_LEVEL(stem, 5) \
	DEFINE_LEVEL(stem, 6) \
	DEFINE_LEVEL(stem, 7)

static struct btrfs_lockdep_keyset {
	u64 id;				/* root objectid */
	/* Longest entry: btrfs-block-group-00 */
	char names[BTRFS_MAX_LEVEL][24];
	struct lock_class_key keys[BTRFS_MAX_LEVEL];
} btrfs_lockdep_keysets[] = {
	{ .id = BTRFS_ROOT_TREE_OBJECTID,	 DEFINE_NAME("root")	    },
	{ .id = BTRFS_EXTENT_TREE_OBJECTID,	 DEFINE_NAME("extent")	    },
	{ .id = BTRFS_CHUNK_TREE_OBJECTID,	 DEFINE_NAME("chunk")	    },
	{ .id = BTRFS_DEV_TREE_OBJECTID,	 DEFINE_NAME("dev")	    },
	{ .id = BTRFS_CSUM_TREE_OBJECTID,	 DEFINE_NAME("csum")	    },
	{ .id = BTRFS_QUOTA_TREE_OBJECTID,	 DEFINE_NAME("quota")	    },
	{ .id = BTRFS_TREE_LOG_OBJECTID,	 DEFINE_NAME("log")	    },
	{ .id = BTRFS_TREE_RELOC_OBJECTID,	 DEFINE_NAME("treloc")	    },
	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	 DEFINE_NAME("dreloc")	    },
	{ .id = BTRFS_UUID_TREE_OBJECTID,	 DEFINE_NAME("uuid")	    },
	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	 DEFINE_NAME("free-space")  },
	{ .id = BTRFS_BLOCK_GROUP_TREE_OBJECTID, DEFINE_NAME("block-group") },
	{ .id = BTRFS_RAID_STRIPE_TREE_OBJECTID, DEFINE_NAME("raid-stripe") },
	{ .id = 0,				 DEFINE_NAME("tree")	    },
};

#undef DEFINE_LEVEL
#undef DEFINE_NAME

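/*
 * Illustration (added note, not generated code): DEFINE_NAME("root") above
 * expands to the per-level entries
 *
 *	.names[0] = "btrfs-root-00",
 *	...
 *	.names[7] = "btrfs-root-07",
 *
 * so every level of every special purpose root gets its own class name,
 * paired with its own lock_class_key from the keys[] array.
 */
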
void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, int level)
{
	struct btrfs_lockdep_keyset *ks;

	BUG_ON(level >= ARRAY_SIZE(ks->keys));

	/* Find the matching keyset, id 0 is the default entry */
	for (ks = btrfs_lockdep_keysets; ks->id; ks++)
		if (ks->id == objectid)
			break;

	lockdep_set_class_and_name(&eb->lock, &ks->keys[level], ks->names[level]);
}

void btrfs_maybe_reset_lockdep_class(struct btrfs_root *root, struct extent_buffer *eb)
{
	if (test_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &root->state))
		btrfs_set_buffer_lockdep_class(root->root_key.objectid,
					       eb, btrfs_header_level(eb));
}

#endif

#ifdef CONFIG_BTRFS_DEBUG
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner)
{
	eb->lock_owner = owner;
}
#else
static void btrfs_set_eb_lock_owner(struct extent_buffer *eb, pid_t owner) { }
#endif

/*
 * Extent buffer locking
 * =====================
 *
 * We use a rw_semaphore for tree locking, and the semantics are exactly the
 * same:
 *
 * - reader/writer exclusion
 * - writer/writer exclusion
 * - reader/reader sharing
 * - try-lock semantics for readers and writers
 *
 * The rwsem implementation does opportunistic spinning, which reduces the
 * number of times the locking task needs to sleep.
 */

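/*
 * Usage sketch (illustrative only, nothing in this file is called in exactly
 * this shape; the eb pointer and the work done under the lock are assumed):
 *
 *	btrfs_tree_lock(eb);
 *	... modify the extent buffer while holding the write lock ...
 *	btrfs_tree_unlock(eb);
 *
 *	if (btrfs_try_tree_read_lock(eb)) {
 *		... read-only access, shared with other readers ...
 *		btrfs_tree_read_unlock(eb);
 *	}
 */
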
/*
 * __btrfs_tree_read_lock - lock extent buffer for read
 * @eb:   the eb to be locked
 * @nest: the nesting level to be used for lockdep
 *
 * This takes the read lock on the extent buffer, using the specified nesting
 * level for lockdep purposes.
 */
void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_read_lock_enabled())
		start_ns = ktime_get_ns();

	down_read_nested(&eb->lock, nest);
	trace_btrfs_tree_read_lock(eb, start_ns);
}

void btrfs_tree_read_lock(struct extent_buffer *eb)
{
	__btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Try-lock for read.
 *
 * Return 1 if the lock has been taken, 0 otherwise.
 */
int btrfs_try_tree_read_lock(struct extent_buffer *eb)
{
	if (down_read_trylock(&eb->lock)) {
		trace_btrfs_try_tree_read_lock(eb);
		return 1;
	}
	return 0;
}

/*
 * Try-lock for write.
 *
 * Return 1 if the lock has been taken, 0 otherwise.
 */
int btrfs_try_tree_write_lock(struct extent_buffer *eb)
{
	if (down_write_trylock(&eb->lock)) {
		btrfs_set_eb_lock_owner(eb, current->pid);
		trace_btrfs_try_tree_write_lock(eb);
		return 1;
	}
	return 0;
}

/*
 * Release the read lock.
 */
void btrfs_tree_read_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_read_unlock(eb);
	up_read(&eb->lock);
}

/*
 * Lock eb for write.
 *
 * @eb:   the eb to lock
 * @nest: the nesting to use for the lock
 *
 * Returns with the eb->lock write locked.
 */
void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest)
	__acquires(&eb->lock)
{
	u64 start_ns = 0;

	if (trace_btrfs_tree_lock_enabled())
		start_ns = ktime_get_ns();

	down_write_nested(&eb->lock, nest);
	btrfs_set_eb_lock_owner(eb, current->pid);
	trace_btrfs_tree_lock(eb, start_ns);
}

void btrfs_tree_lock(struct extent_buffer *eb)
{
	__btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL);
}

/*
 * Release the write lock.
 */
void btrfs_tree_unlock(struct extent_buffer *eb)
{
	trace_btrfs_tree_unlock(eb);
	btrfs_set_eb_lock_owner(eb, 0);
	up_write(&eb->lock);
}

/*
 * This releases any locks held in the path starting at level and going all the
 * way up to the root.
 *
 * btrfs_search_slot will keep the lock held on higher nodes in a few corner
 * cases, such as COW of the block at slot zero in the node. This ignores
 * those rules, and it should only be called when there are no more updates to
 * be done higher up in the tree.
 */
void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
{
	int i;

	if (path->keep_locks)
		return;

	for (i = level; i < BTRFS_MAX_LEVEL; i++) {
		if (!path->nodes[i])
			continue;
		if (!path->locks[i])
			continue;
		btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
		path->locks[i] = 0;
	}
}

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with write lock held
 */
struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree until
 * we end up with a lock on the root node.
 *
 * Return: root extent buffer with read lock held
 */
struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);

		btrfs_maybe_reset_lockdep_class(root, eb);
		btrfs_tree_read_lock(eb);
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * Loop around taking references on and locking the root node of the tree in
 * nowait mode until we end up with a lock on the root node, or bail out to
 * avoid blocking.
 *
 * Return: root extent buffer with read lock held, or ERR_PTR(-EAGAIN).
 */
struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		eb = btrfs_root_node(root);
		if (!btrfs_try_tree_read_lock(eb)) {
			free_extent_buffer(eb);
			return ERR_PTR(-EAGAIN);
		}
		if (eb == root->node)
			break;
		btrfs_tree_read_unlock(eb);
		free_extent_buffer(eb);
	}
	return eb;
}

/*
 * DREW locks
 * ==========
 *
 * DREW stands for double-reader-writer-exclusion lock. It's used in
 * situations where you want to provide A-B exclusion but not AA or BB.
 *
 * The current implementation gives more priority to readers. If a reader and
 * a writer both race to acquire their respective sides of the lock, the
 * writer yields as soon as it detects a concurrent reader. Additionally, if
 * there are pending readers, no new writers are allowed to come in and
 * acquire the lock.
 */

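/*
 * Usage sketch (illustrative only, nothing in btrfs takes the lock in exactly
 * this shape; the lock instance and the "A"/"B" work are assumed):
 *
 *	A side (shared among A users):
 *		btrfs_drew_read_lock(&lock);
 *		... do "A" work, excluded from all "B" work ...
 *		btrfs_drew_read_unlock(&lock);
 *
 *	B side (shared among B users):
 *		btrfs_drew_write_lock(&lock);
 *		... do "B" work, excluded from all "A" work ...
 *		btrfs_drew_write_unlock(&lock);
 */
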
void btrfs_drew_lock_init(struct btrfs_drew_lock *lock)
{
	atomic_set(&lock->readers, 0);
	atomic_set(&lock->writers, 0);
	init_waitqueue_head(&lock->pending_readers);
	init_waitqueue_head(&lock->pending_writers);
}

/* Return true if acquisition is successful, false otherwise */
bool btrfs_drew_try_write_lock(struct btrfs_drew_lock *lock)
{
	if (atomic_read(&lock->readers))
		return false;

	atomic_inc(&lock->writers);

	/* Ensure the writers count is updated before we check for pending readers */
	smp_mb__after_atomic();
	if (atomic_read(&lock->readers)) {
		btrfs_drew_write_unlock(lock);
		return false;
	}

	return true;
}

void btrfs_drew_write_lock(struct btrfs_drew_lock *lock)
{
	while (true) {
		if (btrfs_drew_try_write_lock(lock))
			return;
		wait_event(lock->pending_writers, !atomic_read(&lock->readers));
	}
}

void btrfs_drew_write_unlock(struct btrfs_drew_lock *lock)
{
	atomic_dec(&lock->writers);
	cond_wake_up(&lock->pending_readers);
}

void btrfs_drew_read_lock(struct btrfs_drew_lock *lock)
{
	atomic_inc(&lock->readers);

	/*
	 * Ensure the pending reader count is perceived BEFORE this reader
	 * goes to sleep in case of active writers. This guarantees new
	 * writers won't be allowed and that the current reader will be
	 * woken up when the last active writer finishes its job.
	 */
	smp_mb__after_atomic();

	wait_event(lock->pending_readers, atomic_read(&lock->writers) == 0);
}

void btrfs_drew_read_unlock(struct btrfs_drew_lock *lock)
{
	/*
	 * atomic_dec_and_test() implies a full barrier, so woken up writers
	 * are guaranteed to see the decrement.
	 */
	if (atomic_dec_and_test(&lock->readers))
		wake_up(&lock->pending_writers);
}
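
/*
 * Note on ordering (added summary of the barriers above): the
 * smp_mb__after_atomic() in btrfs_drew_try_write_lock() pairs with the one in
 * btrfs_drew_read_lock(). Each side first publishes its own counter increment
 * and only then reads the opposite counter, so at least one of a racing
 * reader/writer pair is guaranteed to observe the other and either back off
 * (the writer) or wait (the reader).
 */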