// SPDX-License-Identifier: GPL-2.0+
/*
 * Sleepable Read-Copy Update mechanism for mutual exclusion.
 *
 * Copyright (C) IBM Corporation, 2006
 * Copyright (C) Fujitsu, 2012
 *
 * Authors: Paul McKenney <paulmck@linux.ibm.com>
 *	   Lai Jiangshan <laijs@cn.fujitsu.com>
 *
 * For detailed explanation of Read-Copy Update mechanism see -
 *		Documentation/RCU/ *.txt
 *
 */

#define pr_fmt(fmt) "rcu: " fmt

#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/delay.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/srcu.h>

#include "rcu.h"
#include "rcu_segcblist.h"

/* Holdoff in nanoseconds for auto-expediting. */
#define DEFAULT_SRCU_EXP_HOLDOFF (25 * 1000)
static ulong exp_holdoff = DEFAULT_SRCU_EXP_HOLDOFF;
module_param(exp_holdoff, ulong, 0444);

/* Overflow-check frequency.  N bits roughly says every 2**N grace periods. */
static ulong counter_wrap_check = (ULONG_MAX >> 2);
module_param(counter_wrap_check, ulong, 0444);

/*
 * Control conversion to SRCU_SIZE_BIG:
 *    0: Don't convert at all.
 *    1: Convert at init_srcu_struct() time.
 *    2: Convert when rcutorture invokes srcu_torture_stats_print().
 *    3: Decide at boot time based on system shape (default).
 * 0x1x: Convert when excessive contention encountered.
 */
#define SRCU_SIZING_NONE	0
#define SRCU_SIZING_INIT	1
#define SRCU_SIZING_TORTURE	2
#define SRCU_SIZING_AUTO	3
#define SRCU_SIZING_CONTEND	0x10
#define SRCU_SIZING_IS(x) ((convert_to_big & ~SRCU_SIZING_CONTEND) == x)
#define SRCU_SIZING_IS_NONE() (SRCU_SIZING_IS(SRCU_SIZING_NONE))
#define SRCU_SIZING_IS_INIT() (SRCU_SIZING_IS(SRCU_SIZING_INIT))
#define SRCU_SIZING_IS_TORTURE() (SRCU_SIZING_IS(SRCU_SIZING_TORTURE))
#define SRCU_SIZING_IS_CONTEND() (convert_to_big & SRCU_SIZING_CONTEND)
static int convert_to_big = SRCU_SIZING_AUTO;
module_param(convert_to_big, int, 0444);

/* Number of CPUs to trigger init_srcu_struct()-time transition to big. */
static int big_cpu_lim __read_mostly = 128;
module_param(big_cpu_lim, int, 0444);

/* Contention events per jiffy to initiate transition to big. */
static int small_contention_lim __read_mostly = 100;
module_param(small_contention_lim, int, 0444);

/* Early-boot callback-management, so early that no lock is required! */
static LIST_HEAD(srcu_boot_list);
static bool __read_mostly srcu_init_done;

static void srcu_invoke_callbacks(struct work_struct *work);
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay);
static void process_srcu(struct work_struct *work);
static void srcu_delay_timer(struct timer_list *t);
/* Wrappers for lock acquisition and release, see raw_spin_lock_rcu_node(). */
#define spin_lock_rcu_node(p)						\
do {									\
	spin_lock(&ACCESS_PRIVATE(p, lock));				\
	smp_mb__after_unlock_lock();					\
} while (0)

#define spin_unlock_rcu_node(p) spin_unlock(&ACCESS_PRIVATE(p, lock))

#define spin_lock_irq_rcu_node(p)					\
do {									\
	spin_lock_irq(&ACCESS_PRIVATE(p, lock));			\
	smp_mb__after_unlock_lock();					\
} while (0)

#define spin_unlock_irq_rcu_node(p)					\
	spin_unlock_irq(&ACCESS_PRIVATE(p, lock))

#define spin_lock_irqsave_rcu_node(p, flags)				\
do {									\
	spin_lock_irqsave(&ACCESS_PRIVATE(p, lock), flags);		\
	smp_mb__after_unlock_lock();					\
} while (0)

#define spin_trylock_irqsave_rcu_node(p, flags)				\
({									\
	bool ___locked = spin_trylock_irqsave(&ACCESS_PRIVATE(p, lock), flags); \
									\
	if (___locked)							\
		smp_mb__after_unlock_lock();				\
	___locked;							\
})

#define spin_unlock_irqrestore_rcu_node(p, flags)			\
	spin_unlock_irqrestore(&ACCESS_PRIVATE(p, lock), flags)

/*
 * Initialize SRCU per-CPU data.  Note that statically allocated
 * srcu_struct structures might already have srcu_read_lock() and
 * srcu_read_unlock() running against them.  So if the is_static parameter
 * is set, don't initialize ->srcu_lock_count[] and ->srcu_unlock_count[].
 */
static void init_srcu_struct_data(struct srcu_struct *ssp)
{
	int cpu;
	struct srcu_data *sdp;

	/*
	 * Initialize the per-CPU srcu_data array, which feeds into the
	 * leaves of the srcu_node tree.
	 */
	BUILD_BUG_ON(ARRAY_SIZE(sdp->srcu_lock_count) !=
		     ARRAY_SIZE(sdp->srcu_unlock_count));
	for_each_possible_cpu(cpu) {
		sdp = per_cpu_ptr(ssp->sda, cpu);
		spin_lock_init(&ACCESS_PRIVATE(sdp, lock));
		rcu_segcblist_init(&sdp->srcu_cblist);
		sdp->srcu_cblist_invoking = false;
		sdp->srcu_gp_seq_needed = ssp->srcu_sup->srcu_gp_seq;
		sdp->srcu_gp_seq_needed_exp = ssp->srcu_sup->srcu_gp_seq;
		sdp->srcu_barrier_head.next = &sdp->srcu_barrier_head;
		sdp->mynode = NULL;
		sdp->cpu = cpu;
		INIT_WORK(&sdp->work, srcu_invoke_callbacks);
		timer_setup(&sdp->delay_work, srcu_delay_timer, 0);
		sdp->ssp = ssp;
	}
}

/* Invalid seq state, used during snp node initialization */
#define SRCU_SNP_INIT_SEQ		0x2

/*
 * Check whether the sequence number corresponding to the snp node
 * is invalid.
 */
static inline bool srcu_invl_snp_seq(unsigned long s)
{
	return s == SRCU_SNP_INIT_SEQ;
}

/*
 * Allocate and initialize the SRCU combining tree.  Returns @true if
 * allocation succeeded and @false otherwise.
 */
static bool init_srcu_struct_nodes(struct srcu_struct *ssp, gfp_t gfp_flags)
{
	int cpu;
	int i;
	int level = 0;
	int levelspread[RCU_NUM_LVLS];
	struct srcu_data *sdp;
	struct srcu_node *snp;
	struct srcu_node *snp_first;

	/* Initialize geometry if it has not already been initialized. */
	rcu_init_geometry();
	ssp->srcu_sup->node = kcalloc(rcu_num_nodes, sizeof(*ssp->srcu_sup->node), gfp_flags);
	if (!ssp->srcu_sup->node)
		return false;

	/* Work out the overall tree geometry. */
	ssp->srcu_sup->level[0] = &ssp->srcu_sup->node[0];
	for (i = 1; i < rcu_num_lvls; i++)
		ssp->srcu_sup->level[i] = ssp->srcu_sup->level[i - 1] + num_rcu_lvl[i - 1];
	rcu_init_levelspread(levelspread, num_rcu_lvl);
	/* Each pass through this loop initializes one srcu_node structure. */
	srcu_for_each_node_breadth_first(ssp, snp) {
		spin_lock_init(&ACCESS_PRIVATE(snp, lock));
		BUILD_BUG_ON(ARRAY_SIZE(snp->srcu_have_cbs) !=
			     ARRAY_SIZE(snp->srcu_data_have_cbs));
		for (i = 0; i < ARRAY_SIZE(snp->srcu_have_cbs); i++) {
			snp->srcu_have_cbs[i] = SRCU_SNP_INIT_SEQ;
			snp->srcu_data_have_cbs[i] = 0;
		}
		snp->srcu_gp_seq_needed_exp = SRCU_SNP_INIT_SEQ;
		snp->grplo = -1;
		snp->grphi = -1;
		if (snp == &ssp->srcu_sup->node[0]) {
			/* Root node, special case. */
			snp->srcu_parent = NULL;
			continue;
		}

		/* Non-root node. */
		if (snp == ssp->srcu_sup->level[level + 1])
			level++;
		snp->srcu_parent = ssp->srcu_sup->level[level - 1] +
			(snp - ssp->srcu_sup->level[level]) /
			levelspread[level - 1];
	}

	/*
	 * Initialize the per-CPU srcu_data array, which feeds into the
	 * leaves of the srcu_node tree.
	 */
	level = rcu_num_lvls - 1;
	snp_first = ssp->srcu_sup->level[level];
	for_each_possible_cpu(cpu) {
		sdp = per_cpu_ptr(ssp->sda, cpu);
		sdp->mynode = &snp_first[cpu / levelspread[level]];
		for (snp = sdp->mynode; snp != NULL; snp = snp->srcu_parent) {
			if (snp->grplo < 0)
				snp->grplo = cpu;
			snp->grphi = cpu;
		}
		sdp->grpmask = 1UL << (cpu - sdp->mynode->grplo);
	}
	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_WAIT_BARRIER);
	return true;
}
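/*
 * Worked example (editorial illustration, not part of the original file):
 * suppose rcu_init_geometry() produced a two-level tree with
 * levelspread[1] == 16, so that each leaf srcu_node covers 16 CPUs.  Then
 * CPU 37 gets sdp->mynode = &snp_first[37 / 16] = &snp_first[2], that leaf
 * ends up with grplo = 32 and grphi = 47, and CPU 37's bit in the leaf's
 * masks is sdp->grpmask = 1UL << (37 - 32) = 0x20.  The actual levelspread
 * values depend on rcu_fanout_leaf and the number of possible CPUs.
 */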
/*
 * Initialize non-compile-time initialized fields, including the
 * associated srcu_node and srcu_data structures.  The is_static parameter
 * tells us that ->sda has already been wired up to srcu_data.
 */
static int init_srcu_struct_fields(struct srcu_struct *ssp, bool is_static)
{
	if (!is_static)
		ssp->srcu_sup = kzalloc(sizeof(*ssp->srcu_sup), GFP_KERNEL);
	if (!ssp->srcu_sup)
		return -ENOMEM;
	if (!is_static)
		spin_lock_init(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
	ssp->srcu_sup->srcu_size_state = SRCU_SIZE_SMALL;
	ssp->srcu_sup->node = NULL;
	mutex_init(&ssp->srcu_sup->srcu_cb_mutex);
	mutex_init(&ssp->srcu_sup->srcu_gp_mutex);
	ssp->srcu_idx = 0;
	ssp->srcu_sup->srcu_gp_seq = SRCU_GP_SEQ_INITIAL_VAL;
	ssp->srcu_sup->srcu_barrier_seq = 0;
	mutex_init(&ssp->srcu_sup->srcu_barrier_mutex);
	atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 0);
	INIT_DELAYED_WORK(&ssp->srcu_sup->work, process_srcu);
	ssp->srcu_sup->sda_is_static = is_static;
	if (!is_static)
		ssp->sda = alloc_percpu(struct srcu_data);
	if (!ssp->sda)
		goto err_free_sup;
	init_srcu_struct_data(ssp);
	ssp->srcu_sup->srcu_gp_seq_needed_exp = SRCU_GP_SEQ_INITIAL_VAL;
	ssp->srcu_sup->srcu_last_gp_end = ktime_get_mono_fast_ns();
	if (READ_ONCE(ssp->srcu_sup->srcu_size_state) == SRCU_SIZE_SMALL && SRCU_SIZING_IS_INIT()) {
		if (!init_srcu_struct_nodes(ssp, GFP_ATOMIC))
			goto err_free_sda;
		WRITE_ONCE(ssp->srcu_sup->srcu_size_state, SRCU_SIZE_BIG);
	}
	ssp->srcu_sup->srcu_ssp = ssp;
	smp_store_release(&ssp->srcu_sup->srcu_gp_seq_needed,
			  SRCU_GP_SEQ_INITIAL_VAL); /* Init done. */
	return 0;

err_free_sda:
	if (!is_static) {
		free_percpu(ssp->sda);
		ssp->sda = NULL;
	}
err_free_sup:
	if (!is_static) {
		kfree(ssp->srcu_sup);
		ssp->srcu_sup = NULL;
	}
	return -ENOMEM;
}

#ifdef CONFIG_DEBUG_LOCK_ALLOC

int __init_srcu_struct(struct srcu_struct *ssp, const char *name,
		       struct lock_class_key *key)
{
	/* Don't re-initialize a lock while it is held. */
	debug_check_no_locks_freed((void *)ssp, sizeof(*ssp));
	lockdep_init_map(&ssp->dep_map, name, key, 0);
	return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(__init_srcu_struct);

#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */

/**
 * init_srcu_struct - initialize a sleep-RCU structure
 * @ssp: structure to initialize.
 *
 * Must invoke this on a given srcu_struct before passing that srcu_struct
 * to any other function.  Each srcu_struct represents a separate domain
 * of SRCU protection.
 */
int init_srcu_struct(struct srcu_struct *ssp)
{
	return init_srcu_struct_fields(ssp, false);
}
EXPORT_SYMBOL_GPL(init_srcu_struct);

#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
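/*
 * Editorial usage sketch (not part of the original file): a subsystem
 * typically either declares its SRCU domain statically with DEFINE_SRCU()
 * or DEFINE_STATIC_SRCU(), or embeds a struct srcu_struct and initializes
 * it at runtime as shown below.  The names used here are hypothetical.
 */
static struct srcu_struct example_srcu;

static int __maybe_unused example_subsystem_init(void)
{
	/* Dynamic initialization must precede any other use of example_srcu. */
	return init_srcu_struct(&example_srcu);
}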
/*
 * Initiate a transition to SRCU_SIZE_BIG with lock held.
 */
static void __srcu_transition_to_big(struct srcu_struct *ssp)
{
	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
	smp_store_release(&ssp->srcu_sup->srcu_size_state, SRCU_SIZE_ALLOC);
}

/*
 * Initiate an idempotent transition to SRCU_SIZE_BIG.
 */
static void srcu_transition_to_big(struct srcu_struct *ssp)
{
	unsigned long flags;

	/* Double-checked locking on ->srcu_size_state. */
	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL)
		return;
	spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) != SRCU_SIZE_SMALL) {
		spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
		return;
	}
	__srcu_transition_to_big(ssp);
	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}

/*
 * Check to see if the just-encountered contention event justifies
 * a transition to SRCU_SIZE_BIG.
 */
static void spin_lock_irqsave_check_contention(struct srcu_struct *ssp)
{
	unsigned long j;

	if (!SRCU_SIZING_IS_CONTEND() || ssp->srcu_sup->srcu_size_state)
		return;
	j = jiffies;
	if (ssp->srcu_sup->srcu_size_jiffies != j) {
		ssp->srcu_sup->srcu_size_jiffies = j;
		ssp->srcu_sup->srcu_n_lock_retries = 0;
	}
	if (++ssp->srcu_sup->srcu_n_lock_retries <= small_contention_lim)
		return;
	__srcu_transition_to_big(ssp);
}

/*
 * Acquire the specified srcu_data structure's ->lock, but check for
 * excessive contention, which results in initiation of a transition
 * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
 * parameter permits this.
 */
static void spin_lock_irqsave_sdp_contention(struct srcu_data *sdp, unsigned long *flags)
{
	struct srcu_struct *ssp = sdp->ssp;

	if (spin_trylock_irqsave_rcu_node(sdp, *flags))
		return;
	spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
	spin_lock_irqsave_check_contention(ssp);
	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, *flags);
	spin_lock_irqsave_rcu_node(sdp, *flags);
}

/*
 * Acquire the specified srcu_struct structure's ->lock, but check for
 * excessive contention, which results in initiation of a transition
 * to SRCU_SIZE_BIG.  But only if the srcutree.convert_to_big module
 * parameter permits this.
 */
static void spin_lock_irqsave_ssp_contention(struct srcu_struct *ssp, unsigned long *flags)
{
	if (spin_trylock_irqsave_rcu_node(ssp->srcu_sup, *flags))
		return;
	spin_lock_irqsave_rcu_node(ssp->srcu_sup, *flags);
	spin_lock_irqsave_check_contention(ssp);
}

/*
 * First-use initialization of statically allocated srcu_struct
 * structure.  Wiring up the combining tree is more than can be
 * done with compile-time initialization, so this check is added
 * to each update-side SRCU primitive.  Use ssp->lock, which -is-
 * compile-time initialized, to resolve races involving multiple
 * CPUs trying to garner first-use privileges.
 */
static void check_init_srcu_struct(struct srcu_struct *ssp)
{
	unsigned long flags;

	/* The smp_load_acquire() pairs with the smp_store_release(). */
	if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed))) /*^^^*/
		return; /* Already initialized. */
	spin_lock_irqsave_rcu_node(ssp->srcu_sup, flags);
	if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq_needed)) {
		spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
		return;
	}
	init_srcu_struct_fields(ssp, true);
	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}

/*
 * Is the current or any upcoming grace period to be expedited?
 */
static bool srcu_gp_is_expedited(struct srcu_struct *ssp)
{
	struct srcu_usage *sup = ssp->srcu_sup;

	return ULONG_CMP_LT(READ_ONCE(sup->srcu_gp_seq), READ_ONCE(sup->srcu_gp_seq_needed_exp));
}

/*
 * Computes approximate total of the readers' ->srcu_lock_count[] values
 * for the rank of per-CPU counters specified by idx, and returns true if
 * the caller did the proper barrier (gp), and if the count of the locks
 * matches that of the unlocks passed in.
 */
static bool srcu_readers_lock_idx(struct srcu_struct *ssp, int idx, bool gp, unsigned long unlocks)
{
	int cpu;
	unsigned long mask = 0;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu) {
		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);

		sum += atomic_long_read(&sdp->srcu_lock_count[idx]);
		if (IS_ENABLED(CONFIG_PROVE_RCU))
			mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
	}
	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
		  "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
	if (mask & SRCU_READ_FLAVOR_LITE && !gp)
		return false;
	return sum == unlocks;
}

/*
 * Returns approximate total of the readers' ->srcu_unlock_count[] values
 * for the rank of per-CPU counters specified by idx.
 */
static unsigned long srcu_readers_unlock_idx(struct srcu_struct *ssp, int idx, unsigned long *rdm)
{
	int cpu;
	unsigned long mask = 0;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu) {
		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);

		sum += atomic_long_read(&sdp->srcu_unlock_count[idx]);
		mask = mask | READ_ONCE(sdp->srcu_reader_flavor);
	}
	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && (mask & (mask - 1)),
		  "Mixed reader flavors for srcu_struct at %ps.\n", ssp);
	*rdm = mask;
	return sum;
}
/*
 * Return true if the number of pre-existing readers is determined to
 * be zero.
 */
static bool srcu_readers_active_idx_check(struct srcu_struct *ssp, int idx)
{
	bool did_gp;
	unsigned long rdm;
	unsigned long unlocks;

	unlocks = srcu_readers_unlock_idx(ssp, idx, &rdm);
	did_gp = !!(rdm & SRCU_READ_FLAVOR_LITE);

	/*
	 * Make sure that a lock is always counted if the corresponding
	 * unlock is counted.  Needs to be a smp_mb() as the read side may
	 * contain a read from a variable that is written to before the
	 * synchronize_srcu() in the write side.  In this case smp_mb()s
	 * A and B (or X and Y) act like the store buffering pattern.
	 *
	 * This smp_mb() also pairs with smp_mb() C (or, in the case of X,
	 * Z) to prevent accesses after the synchronize_srcu() from being
	 * executed before the grace period ends.
	 */
	if (!did_gp)
		smp_mb(); /* A */
	else
		synchronize_rcu(); /* X */

	/*
	 * If the locks are the same as the unlocks, then there must have
	 * been no readers on this index at some point in this function.
	 * But there might be more readers, as a task might have read
	 * the current ->srcu_idx but not yet have incremented its CPU's
	 * ->srcu_lock_count[idx] counter.  In fact, it is possible
	 * that most of the tasks have been preempted between fetching
	 * ->srcu_idx and incrementing ->srcu_lock_count[idx].  And there
	 * could be almost (ULONG_MAX / sizeof(struct task_struct)) tasks
	 * in a system whose address space was fully populated with memory.
	 * Call this quantity Nt.
	 *
	 * So suppose that the updater is preempted at this point in the
	 * code for a long time.  That now-preempted updater has already
	 * flipped ->srcu_idx (possibly during the preceding grace period),
	 * done an smp_mb() (again, possibly during the preceding grace
	 * period), and summed up the ->srcu_unlock_count[idx] counters.
	 * How many times can a given one of the aforementioned Nt tasks
	 * increment the old ->srcu_idx value's ->srcu_lock_count[idx]
	 * counter, in the absence of nesting?
	 *
	 * It can clearly do so once, given that it has already fetched
	 * the old value of ->srcu_idx and is just about to use that value
	 * to index its increment of ->srcu_lock_count[idx].  But as soon as
	 * it leaves that SRCU read-side critical section, it will increment
	 * ->srcu_unlock_count[idx], which must follow the updater's above
	 * read from that same value.  Thus, as soon as the reading task does
	 * an smp_mb() and a later fetch from ->srcu_idx, that task will be
	 * guaranteed to get the new index.  Except that the increment of
	 * ->srcu_unlock_count[idx] in __srcu_read_unlock() is after the
	 * smp_mb(), and the fetch from ->srcu_idx in __srcu_read_lock()
	 * is before the smp_mb().  Thus, that task might not see the new
	 * value of ->srcu_idx until the -second- __srcu_read_lock(),
	 * which in turn means that this task might well increment
	 * ->srcu_lock_count[idx] for the old value of ->srcu_idx twice,
	 * not just once.
	 *
	 * However, it is important to note that a given smp_mb() takes
	 * effect not just for the task executing it, but also for any
	 * later task running on that same CPU.
	 *
	 * That is, there can be almost Nt + Nc further increments of
	 * ->srcu_lock_count[idx] for the old index, where Nc is the number
	 * of CPUs.  But this is OK because the size of the task_struct
	 * structure limits the value of Nt and current systems limit Nc
	 * to a few thousand.
	 *
	 * OK, but what about nesting?  This does impose a limit on
	 * nesting of half of the size of the task_struct structure
	 * (measured in bytes), which should be sufficient.  A late 2022
	 * TREE01 rcutorture run reported this size to be no less than
	 * 9408 bytes, allowing up to 4704 levels of nesting, which is
	 * comfortably beyond excessive.  Especially on 64-bit systems,
	 * which are unlikely to be configured with an address space fully
	 * populated with memory, at least not anytime soon.
	 */
	return srcu_readers_lock_idx(ssp, idx, did_gp, unlocks);
}
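/*
 * Editorial note (not part of the original file): the "store buffering"
 * reference above can be read as the following two-CPU sketch, with x
 * standing for some reader-accessed data and smp_mb() B coming from
 * __srcu_read_lock():
 *
 *	Reader				Updater
 *	------				-------
 *	increments ->srcu_lock_count	stores to x
 *	smp_mb()  B			smp_mb()  A
 *	reads x				sums ->srcu_lock_count
 *
 * With full barriers on both sides, at least one of the two reads must
 * observe the other side's store: either the updater's sum includes the
 * reader's increment (and the grace period waits for it), or the reader
 * is guaranteed to see the updater's store to x.  Both cannot miss.
 */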
/**
 * srcu_readers_active - returns true if there are readers, and false
 * otherwise
 * @ssp: which srcu_struct to count active readers (holding srcu_read_lock).
 *
 * Note that this is not an atomic primitive, and can therefore suffer
 * severe errors when invoked on an active srcu_struct.  That said, it
 * can be useful as an error check at cleanup time.
 */
static bool srcu_readers_active(struct srcu_struct *ssp)
{
	int cpu;
	unsigned long sum = 0;

	for_each_possible_cpu(cpu) {
		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);

		sum += atomic_long_read(&sdp->srcu_lock_count[0]);
		sum += atomic_long_read(&sdp->srcu_lock_count[1]);
		sum -= atomic_long_read(&sdp->srcu_unlock_count[0]);
		sum -= atomic_long_read(&sdp->srcu_unlock_count[1]);
	}
	return sum;
}

/*
 * We use an adaptive strategy for synchronize_srcu() and especially for
 * synchronize_srcu_expedited().  We spin for a fixed time period
 * (defined below, boot time configurable) to allow SRCU readers to exit
 * their read-side critical sections.  If there are still some readers
 * after one jiffy, we repeatedly block for one-jiffy time periods.
 * The blocking time is increased as the grace-period age increases,
 * with max blocking time capped at 10 jiffies.
 */
#define SRCU_DEFAULT_RETRY_CHECK_DELAY		5

static ulong srcu_retry_check_delay = SRCU_DEFAULT_RETRY_CHECK_DELAY;
module_param(srcu_retry_check_delay, ulong, 0444);

#define SRCU_INTERVAL		1		// Base delay if no expedited GPs pending.
#define SRCU_MAX_INTERVAL	10		// Maximum incremental delay from slow readers.

#define SRCU_DEFAULT_MAX_NODELAY_PHASE_LO	3UL	// Lowmark on default per-GP-phase
							// no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_HI	1000UL	// Highmark on default per-GP-phase
							// no-delay instances.

#define SRCU_UL_CLAMP_LO(val, low)	((val) > (low) ? (val) : (low))
#define SRCU_UL_CLAMP_HI(val, high)	((val) < (high) ? (val) : (high))
#define SRCU_UL_CLAMP(val, low, high)	SRCU_UL_CLAMP_HI(SRCU_UL_CLAMP_LO((val), (low)), (high))
// Per-GP-phase no-delay instances adjusted to allow non-sleeping poll of up
// to one jiffy's worth of time.  The multiplication by 2 factors in the
// srcu_get_delay() call from process_srcu().
#define SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED	\
	(2UL * USEC_PER_SEC / HZ / SRCU_DEFAULT_RETRY_CHECK_DELAY)

// Maximum per-GP-phase consecutive no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY_PHASE	\
	SRCU_UL_CLAMP(SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED,	\
		      SRCU_DEFAULT_MAX_NODELAY_PHASE_LO,	\
		      SRCU_DEFAULT_MAX_NODELAY_PHASE_HI)

static ulong srcu_max_nodelay_phase = SRCU_DEFAULT_MAX_NODELAY_PHASE;
module_param(srcu_max_nodelay_phase, ulong, 0444);
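/*
 * Worked example (editorial, not part of the original file): with HZ=1000
 * and the default SRCU_DEFAULT_RETRY_CHECK_DELAY of 5 microseconds,
 * SRCU_DEFAULT_MAX_NODELAY_PHASE_ADJUSTED is 2 * 1000000 / 1000 / 5 = 400,
 * which the clamp to [3, 1000] leaves unchanged, so each grace-period phase
 * may poll up to 400 times without sleeping.  With HZ=100 the adjusted
 * value would instead be 4000, which the clamp caps at 1000.
 */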
// Maximum consecutive no-delay instances.
#define SRCU_DEFAULT_MAX_NODELAY	(SRCU_DEFAULT_MAX_NODELAY_PHASE > 100 ?	\
					 SRCU_DEFAULT_MAX_NODELAY_PHASE : 100)

static ulong srcu_max_nodelay = SRCU_DEFAULT_MAX_NODELAY;
module_param(srcu_max_nodelay, ulong, 0444);

/*
 * Return grace-period delay, zero if there are expedited grace
 * periods pending, SRCU_INTERVAL otherwise.
 */
static unsigned long srcu_get_delay(struct srcu_struct *ssp)
{
	unsigned long gpstart;
	unsigned long j;
	unsigned long jbase = SRCU_INTERVAL;
	struct srcu_usage *sup = ssp->srcu_sup;

	if (srcu_gp_is_expedited(ssp))
		jbase = 0;
	if (rcu_seq_state(READ_ONCE(sup->srcu_gp_seq))) {
		j = jiffies - 1;
		gpstart = READ_ONCE(sup->srcu_gp_start);
		if (time_after(j, gpstart))
			jbase += j - gpstart;
		if (!jbase) {
			ASSERT_EXCLUSIVE_WRITER(sup->srcu_n_exp_nodelay);
			WRITE_ONCE(sup->srcu_n_exp_nodelay, READ_ONCE(sup->srcu_n_exp_nodelay) + 1);
			if (READ_ONCE(sup->srcu_n_exp_nodelay) > srcu_max_nodelay_phase)
				jbase = 1;
		}
	}
	return jbase > SRCU_MAX_INTERVAL ? SRCU_MAX_INTERVAL : jbase;
}

/**
 * cleanup_srcu_struct - deconstruct a sleep-RCU structure
 * @ssp: structure to clean up.
 *
 * Must invoke this after you are finished using a given srcu_struct that
 * was initialized via init_srcu_struct(), else you leak memory.
 */
void cleanup_srcu_struct(struct srcu_struct *ssp)
{
	int cpu;
	struct srcu_usage *sup = ssp->srcu_sup;

	if (WARN_ON(!srcu_get_delay(ssp)))
		return; /* Just leak it! */
	if (WARN_ON(srcu_readers_active(ssp)))
		return; /* Just leak it! */
	flush_delayed_work(&sup->work);
	for_each_possible_cpu(cpu) {
		struct srcu_data *sdp = per_cpu_ptr(ssp->sda, cpu);

		del_timer_sync(&sdp->delay_work);
		flush_work(&sdp->work);
		if (WARN_ON(rcu_segcblist_n_cbs(&sdp->srcu_cblist)))
			return; /* Forgot srcu_barrier(), so just leak it! */
	}
	if (WARN_ON(rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)) != SRCU_STATE_IDLE) ||
	    WARN_ON(rcu_seq_current(&sup->srcu_gp_seq) != sup->srcu_gp_seq_needed) ||
	    WARN_ON(srcu_readers_active(ssp))) {
		pr_info("%s: Active srcu_struct %p read state: %d gp state: %lu/%lu\n",
			__func__, ssp, rcu_seq_state(READ_ONCE(sup->srcu_gp_seq)),
			rcu_seq_current(&sup->srcu_gp_seq), sup->srcu_gp_seq_needed);
		return; // Caller forgot to stop doing call_srcu()?
			// Or caller invoked start_poll_synchronize_srcu()
			// and then cleanup_srcu_struct() before that grace
			// period ended?
	}
	kfree(sup->node);
	sup->node = NULL;
	sup->srcu_size_state = SRCU_SIZE_SMALL;
	if (!sup->sda_is_static) {
		free_percpu(ssp->sda);
		ssp->sda = NULL;
		kfree(sup);
		ssp->srcu_sup = NULL;
	}
}
EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
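/*
 * Editorial usage sketch (not part of the original file): teardown must
 * wait for all callbacks queued by call_srcu() before the srcu_struct
 * itself may be cleaned up, otherwise cleanup_srcu_struct() warns and
 * leaks the structure.  Names here are hypothetical.
 */
static void __maybe_unused example_subsystem_exit(void)
{
	/* Wait for all previously queued example_srcu callbacks. */
	srcu_barrier(&example_srcu);
	/* No callbacks or readers remain, so teardown is now safe. */
	cleanup_srcu_struct(&example_srcu);
}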
/*
 * Check for consistent reader flavor.
 */
void __srcu_check_read_flavor(struct srcu_struct *ssp, int read_flavor)
{
	int old_read_flavor;
	struct srcu_data *sdp;

	/* NMI-unsafe use in NMI is a bad sign, as are multi-bit read_flavor values. */
	WARN_ON_ONCE((read_flavor != SRCU_READ_FLAVOR_NMI) && in_nmi());
	WARN_ON_ONCE(read_flavor & (read_flavor - 1));

	sdp = raw_cpu_ptr(ssp->sda);
	old_read_flavor = READ_ONCE(sdp->srcu_reader_flavor);
	if (!old_read_flavor) {
		old_read_flavor = cmpxchg(&sdp->srcu_reader_flavor, 0, read_flavor);
		if (!old_read_flavor)
			return;
	}
	WARN_ONCE(old_read_flavor != read_flavor, "CPU %d old state %d new state %d\n", sdp->cpu, old_read_flavor, read_flavor);
}
EXPORT_SYMBOL_GPL(__srcu_check_read_flavor);

/*
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct.
 * Returns an index that must be passed to the matching srcu_read_unlock().
 */
int __srcu_read_lock(struct srcu_struct *ssp)
{
	int idx;

	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
	this_cpu_inc(ssp->sda->srcu_lock_count[idx].counter);
	smp_mb(); /* B */  /* Avoid leaking the critical section. */
	return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock);

/*
 * Removes the count for the old reader from the appropriate per-CPU
 * element of the srcu_struct.  Note that this may well be a different
 * CPU than that which was incremented by the corresponding srcu_read_lock().
 */
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
	smp_mb(); /* C */  /* Avoid leaking the critical section. */
	this_cpu_inc(ssp->sda->srcu_unlock_count[idx].counter);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock);

#ifdef CONFIG_NEED_SRCU_NMI_SAFE

/*
 * Counts the new reader in the appropriate per-CPU element of the
 * srcu_struct, but in an NMI-safe manner using RMW atomics.
 * Returns an index that must be passed to the matching srcu_read_unlock().
 */
int __srcu_read_lock_nmisafe(struct srcu_struct *ssp)
{
	int idx;
	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);

	idx = READ_ONCE(ssp->srcu_idx) & 0x1;
	atomic_long_inc(&sdp->srcu_lock_count[idx]);
	smp_mb__after_atomic(); /* B */  /* Avoid leaking the critical section. */
	return idx;
}
EXPORT_SYMBOL_GPL(__srcu_read_lock_nmisafe);

/*
 * Removes the count for the old reader from the appropriate per-CPU
 * element of the srcu_struct.  Note that this may well be a different
 * CPU than that which was incremented by the corresponding srcu_read_lock().
 */
void __srcu_read_unlock_nmisafe(struct srcu_struct *ssp, int idx)
{
	struct srcu_data *sdp = raw_cpu_ptr(ssp->sda);

	smp_mb__before_atomic(); /* C */  /* Avoid leaking the critical section. */
	atomic_long_inc(&sdp->srcu_unlock_count[idx]);
}
EXPORT_SYMBOL_GPL(__srcu_read_unlock_nmisafe);

#endif // CONFIG_NEED_SRCU_NMI_SAFE
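/*
 * Editorial usage sketch (not part of the original file): a typical reader
 * brackets its accesses with srcu_read_lock()/srcu_read_unlock() and
 * fetches SRCU-protected pointers via srcu_dereference().  The
 * example_srcu domain, example_data type, and example_ptr pointer are
 * hypothetical.
 */
struct example_data {
	int val;
};

static struct example_data __rcu *example_ptr;

static int __maybe_unused example_reader(void)
{
	struct example_data *p;
	int ret = 0;
	int idx;

	idx = srcu_read_lock(&example_srcu);
	p = srcu_dereference(example_ptr, &example_srcu);
	if (p)
		ret = p->val;
	srcu_read_unlock(&example_srcu, idx);
	return ret;
}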
/*
 * Start an SRCU grace period.
 */
static void srcu_gp_start(struct srcu_struct *ssp)
{
	int state;

	lockdep_assert_held(&ACCESS_PRIVATE(ssp->srcu_sup, lock));
	WARN_ON_ONCE(ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed));
	WRITE_ONCE(ssp->srcu_sup->srcu_gp_start, jiffies);
	WRITE_ONCE(ssp->srcu_sup->srcu_n_exp_nodelay, 0);
	smp_mb(); /* Order prior store to ->srcu_gp_seq_needed vs. GP start. */
	rcu_seq_start(&ssp->srcu_sup->srcu_gp_seq);
	state = rcu_seq_state(ssp->srcu_sup->srcu_gp_seq);
	WARN_ON_ONCE(state != SRCU_STATE_SCAN1);
}


static void srcu_delay_timer(struct timer_list *t)
{
	struct srcu_data *sdp = container_of(t, struct srcu_data, delay_work);

	queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
}

static void srcu_queue_delayed_work_on(struct srcu_data *sdp,
				       unsigned long delay)
{
	if (!delay) {
		queue_work_on(sdp->cpu, rcu_gp_wq, &sdp->work);
		return;
	}

	timer_reduce(&sdp->delay_work, jiffies + delay);
}

/*
 * Schedule callback invocation for the specified srcu_data structure,
 * if possible, on the corresponding CPU.
 */
static void srcu_schedule_cbs_sdp(struct srcu_data *sdp, unsigned long delay)
{
	srcu_queue_delayed_work_on(sdp, delay);
}

/*
 * Schedule callback invocation for all srcu_data structures associated
 * with the specified srcu_node structure that have callbacks for the
 * just-completed grace period, the one corresponding to idx.  If possible,
 * schedule this invocation on the corresponding CPUs.
 */
static void srcu_schedule_cbs_snp(struct srcu_struct *ssp, struct srcu_node *snp,
				  unsigned long mask, unsigned long delay)
{
	int cpu;

	for (cpu = snp->grplo; cpu <= snp->grphi; cpu++) {
		if (!(mask & (1UL << (cpu - snp->grplo))))
			continue;
		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, cpu), delay);
	}
}

/*
 * Note the end of an SRCU grace period.  Initiates callback invocation
 * and starts a new grace period if needed.
 *
 * The ->srcu_cb_mutex acquisition does not protect any data, but
 * instead prevents more than one grace period from starting while we
 * are initiating callback invocation.  This allows the ->srcu_have_cbs[]
 * array to have a finite number of elements.
 */
static void srcu_gp_end(struct srcu_struct *ssp)
{
	unsigned long cbdelay = 1;
	bool cbs;
	bool last_lvl;
	int cpu;
	unsigned long gpseq;
	int idx;
	unsigned long mask;
	struct srcu_data *sdp;
	unsigned long sgsne;
	struct srcu_node *snp;
	int ss_state;
	struct srcu_usage *sup = ssp->srcu_sup;

	/* Prevent more than one additional grace period. */
	mutex_lock(&sup->srcu_cb_mutex);

	/* End the current grace period. */
	spin_lock_irq_rcu_node(sup);
	idx = rcu_seq_state(sup->srcu_gp_seq);
	WARN_ON_ONCE(idx != SRCU_STATE_SCAN2);
	if (srcu_gp_is_expedited(ssp))
		cbdelay = 0;

	WRITE_ONCE(sup->srcu_last_gp_end, ktime_get_mono_fast_ns());
	rcu_seq_end(&sup->srcu_gp_seq);
	gpseq = rcu_seq_current(&sup->srcu_gp_seq);
	if (ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, gpseq))
		WRITE_ONCE(sup->srcu_gp_seq_needed_exp, gpseq);
	spin_unlock_irq_rcu_node(sup);
	mutex_unlock(&sup->srcu_gp_mutex);
	/* A new grace period can start at this point.  But only one. */

	/* Initiate callback invocation as needed. */
	ss_state = smp_load_acquire(&sup->srcu_size_state);
	if (ss_state < SRCU_SIZE_WAIT_BARRIER) {
		srcu_schedule_cbs_sdp(per_cpu_ptr(ssp->sda, get_boot_cpu_id()),
				      cbdelay);
	} else {
		idx = rcu_seq_ctr(gpseq) % ARRAY_SIZE(snp->srcu_have_cbs);
		srcu_for_each_node_breadth_first(ssp, snp) {
			spin_lock_irq_rcu_node(snp);
			cbs = false;
			last_lvl = snp >= sup->level[rcu_num_lvls - 1];
			if (last_lvl)
				cbs = ss_state < SRCU_SIZE_BIG || snp->srcu_have_cbs[idx] == gpseq;
			snp->srcu_have_cbs[idx] = gpseq;
			rcu_seq_set_state(&snp->srcu_have_cbs[idx], 1);
			sgsne = snp->srcu_gp_seq_needed_exp;
			if (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, gpseq))
				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, gpseq);
			if (ss_state < SRCU_SIZE_BIG)
				mask = ~0;
			else
				mask = snp->srcu_data_have_cbs[idx];
			snp->srcu_data_have_cbs[idx] = 0;
			spin_unlock_irq_rcu_node(snp);
			if (cbs)
				srcu_schedule_cbs_snp(ssp, snp, mask, cbdelay);
		}
	}

	/* Occasionally prevent srcu_data counter wrap. */
	if (!(gpseq & counter_wrap_check))
		for_each_possible_cpu(cpu) {
			sdp = per_cpu_ptr(ssp->sda, cpu);
			spin_lock_irq_rcu_node(sdp);
			if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed + 100))
				sdp->srcu_gp_seq_needed = gpseq;
			if (ULONG_CMP_GE(gpseq, sdp->srcu_gp_seq_needed_exp + 100))
				sdp->srcu_gp_seq_needed_exp = gpseq;
			spin_unlock_irq_rcu_node(sdp);
		}

	/* Callback initiation done, allow grace periods after next. */
	mutex_unlock(&sup->srcu_cb_mutex);

	/* Start a new grace period if needed. */
	spin_lock_irq_rcu_node(sup);
	gpseq = rcu_seq_current(&sup->srcu_gp_seq);
	if (!rcu_seq_state(gpseq) &&
	    ULONG_CMP_LT(gpseq, sup->srcu_gp_seq_needed)) {
		srcu_gp_start(ssp);
		spin_unlock_irq_rcu_node(sup);
		srcu_reschedule(ssp, 0);
	} else {
		spin_unlock_irq_rcu_node(sup);
	}

	/* Transition to big if needed. */
	if (ss_state != SRCU_SIZE_SMALL && ss_state != SRCU_SIZE_BIG) {
		if (ss_state == SRCU_SIZE_ALLOC)
			init_srcu_struct_nodes(ssp, GFP_KERNEL);
		else
			smp_store_release(&sup->srcu_size_state, ss_state + 1);
	}
}

/*
 * Funnel-locking scheme to scalably mediate many concurrent expedited
 * grace-period requests.  This function is invoked for the first known
 * expedited request for a grace period that has already been requested,
 * but without expediting.  To start a completely new grace period,
 * whether expedited or not, use srcu_funnel_gp_start() instead.
 */
static void srcu_funnel_exp_start(struct srcu_struct *ssp, struct srcu_node *snp,
				  unsigned long s)
{
	unsigned long flags;
	unsigned long sgsne;

	if (snp)
		for (; snp != NULL; snp = snp->srcu_parent) {
			sgsne = READ_ONCE(snp->srcu_gp_seq_needed_exp);
			if (WARN_ON_ONCE(rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, s)) ||
			    (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)))
				return;
			spin_lock_irqsave_rcu_node(snp, flags);
			sgsne = snp->srcu_gp_seq_needed_exp;
			if (!srcu_invl_snp_seq(sgsne) && ULONG_CMP_GE(sgsne, s)) {
				spin_unlock_irqrestore_rcu_node(snp, flags);
				return;
			}
			WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
			spin_unlock_irqrestore_rcu_node(snp, flags);
		}
	spin_lock_irqsave_ssp_contention(ssp, &flags);
	if (ULONG_CMP_LT(ssp->srcu_sup->srcu_gp_seq_needed_exp, s))
		WRITE_ONCE(ssp->srcu_sup->srcu_gp_seq_needed_exp, s);
	spin_unlock_irqrestore_rcu_node(ssp->srcu_sup, flags);
}

/*
 * Funnel-locking scheme to scalably mediate many concurrent grace-period
 * requests.  The winner has to do the work of actually starting grace
 * period s.  Losers must either ensure that their desired grace-period
 * number is recorded on at least their leaf srcu_node structure, or they
 * must take steps to invoke their own callbacks.
 *
 * Note that this function also does the work of srcu_funnel_exp_start(),
 * in some cases by directly invoking it.
 *
 * The srcu read lock should be held across this function, and s is a
 * grace-period sequence number snapshot taken while holding that lock.
 */
static void srcu_funnel_gp_start(struct srcu_struct *ssp, struct srcu_data *sdp,
				 unsigned long s, bool do_norm)
{
	unsigned long flags;
	int idx = rcu_seq_ctr(s) % ARRAY_SIZE(sdp->mynode->srcu_have_cbs);
	unsigned long sgsne;
	struct srcu_node *snp;
	struct srcu_node *snp_leaf;
	unsigned long snp_seq;
	struct srcu_usage *sup = ssp->srcu_sup;

	/* Ensure that snp node tree is fully initialized before traversing it */
	if (smp_load_acquire(&sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
		snp_leaf = NULL;
	else
		snp_leaf = sdp->mynode;

	if (snp_leaf)
		/* Each pass through the loop does one level of the srcu_node tree. */
		for (snp = snp_leaf; snp != NULL; snp = snp->srcu_parent) {
			if (WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) && snp != snp_leaf)
				return; /* GP already done and CBs recorded. */
			spin_lock_irqsave_rcu_node(snp, flags);
			snp_seq = snp->srcu_have_cbs[idx];
			if (!srcu_invl_snp_seq(snp_seq) && ULONG_CMP_GE(snp_seq, s)) {
				if (snp == snp_leaf && snp_seq == s)
					snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
				spin_unlock_irqrestore_rcu_node(snp, flags);
				if (snp == snp_leaf && snp_seq != s) {
					srcu_schedule_cbs_sdp(sdp, do_norm ? SRCU_INTERVAL : 0);
					return;
				}
				if (!do_norm)
					srcu_funnel_exp_start(ssp, snp, s);
				return;
			}
			snp->srcu_have_cbs[idx] = s;
			if (snp == snp_leaf)
				snp->srcu_data_have_cbs[idx] |= sdp->grpmask;
			sgsne = snp->srcu_gp_seq_needed_exp;
			if (!do_norm && (srcu_invl_snp_seq(sgsne) || ULONG_CMP_LT(sgsne, s)))
				WRITE_ONCE(snp->srcu_gp_seq_needed_exp, s);
			spin_unlock_irqrestore_rcu_node(snp, flags);
		}

	/* Top of tree, must ensure the grace period will be started. */
	spin_lock_irqsave_ssp_contention(ssp, &flags);
	if (ULONG_CMP_LT(sup->srcu_gp_seq_needed, s)) {
		/*
		 * Record need for grace period s.  Pair with load
		 * acquire setting up for initialization.
		 */
		smp_store_release(&sup->srcu_gp_seq_needed, s); /*^^^*/
	}
	if (!do_norm && ULONG_CMP_LT(sup->srcu_gp_seq_needed_exp, s))
		WRITE_ONCE(sup->srcu_gp_seq_needed_exp, s);

	/* If grace period not already in progress, start it. */
	if (!WARN_ON_ONCE(rcu_seq_done(&sup->srcu_gp_seq, s)) &&
	    rcu_seq_state(sup->srcu_gp_seq) == SRCU_STATE_IDLE) {
		WARN_ON_ONCE(ULONG_CMP_GE(sup->srcu_gp_seq, sup->srcu_gp_seq_needed));
		srcu_gp_start(ssp);

		// And how can that list_add() in the "else" clause
		// possibly be safe for concurrent execution?  Well,
		// it isn't.  And it does not have to be.  After all, it
		// can only be executed during early boot when there is only
		// the one boot CPU running with interrupts still disabled.
		if (likely(srcu_init_done))
			queue_delayed_work(rcu_gp_wq, &sup->work,
					   !!srcu_get_delay(ssp));
		else if (list_empty(&sup->work.work.entry))
			list_add(&sup->work.work.entry, &srcu_boot_list);
	}
	spin_unlock_irqrestore_rcu_node(sup, flags);
}

/*
 * Wait until all readers counted by array index idx complete, but
 * loop an additional time if there is an expedited grace period pending.
 * The caller must ensure that ->srcu_idx is not changed while checking.
 */
static bool try_check_zero(struct srcu_struct *ssp, int idx, int trycount)
{
	unsigned long curdelay;

	curdelay = !srcu_get_delay(ssp);

	for (;;) {
		if (srcu_readers_active_idx_check(ssp, idx))
			return true;
		if ((--trycount + curdelay) <= 0)
			return false;
		udelay(srcu_retry_check_delay);
	}
}

/*
 * Increment the ->srcu_idx counter so that future SRCU readers will
 * use the other rank of the ->srcu_(un)lock_count[] arrays.  This allows
 * us to wait for pre-existing readers in a starvation-free manner.
 */
static void srcu_flip(struct srcu_struct *ssp)
{
	/*
	 * Because the flip of ->srcu_idx is executed only if the
	 * preceding call to srcu_readers_active_idx_check() found that
	 * the ->srcu_unlock_count[] and ->srcu_lock_count[] sums matched
	 * and because that summing uses atomic_long_read(), there is
	 * ordering due to a control dependency between that summing and
	 * the WRITE_ONCE() in this call to srcu_flip().  This ordering
	 * ensures that if this updater saw a given reader's increment from
	 * __srcu_read_lock(), that reader was using a value of ->srcu_idx
	 * from before the previous call to srcu_flip(), which should be
	 * quite rare.  This ordering thus helps forward progress because
	 * the grace period could otherwise be delayed by additional
	 * calls to __srcu_read_lock() using that old (soon to be new)
	 * value of ->srcu_idx.
	 *
	 * This sum-equality check and ordering also ensures that if
	 * a given call to __srcu_read_lock() uses the new value of
	 * ->srcu_idx, this updater's earlier scans cannot have seen
	 * that reader's increments, which is all to the good, because
	 * this grace period need not wait on that reader.  After all,
	 * if those earlier scans had seen that reader, there would have
	 * been a sum mismatch and this code would not be reached.
	 *
	 * This means that the following smp_mb() is redundant, but
	 * it stays until either (1) Compilers learn about this sort of
	 * control dependency or (2) Some production workload running on
	 * a production system is unduly delayed by this slowpath smp_mb().
	 * Except for _lite() readers, where it is inoperative, which
	 * means that it is a good thing that it is redundant.
	 */
	smp_mb(); /* E */  /* Pairs with B and C. */

	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); // Flip the counter.

	/*
	 * Ensure that if the updater misses an __srcu_read_unlock()
	 * increment, that task's __srcu_read_lock() following its next
	 * __srcu_read_lock() or __srcu_read_unlock() will see the above
	 * counter update.  Note that both this memory barrier and the
	 * one in srcu_readers_active_idx_check() provide the guarantee
	 * for __srcu_read_lock().
	 */
	smp_mb(); /* D */  /* Pairs with C. */
}

/*
 * If SRCU is likely idle, in other words, the next SRCU grace period
 * should be expedited, return true, otherwise return false.  Except that
 * in the presence of _lite() readers, always return false.
 *
 * Note that it is OK for several current from-idle requests for a new
 * grace period to specify expediting, because they will all end up
 * requesting the same grace period anyhow.  So no loss.
 *
 * Note also that if any CPU (including the current one) is still invoking
 * callbacks, this function will nevertheless say "idle".  This is not
 * ideal, but the overhead of checking all CPUs' callback lists is even
 * less ideal, especially on large systems.  Furthermore, the wakeup
 * can happen before the callback is fully removed, so we have no choice
 * but to accept this type of error.
 *
 * This function is also subject to counter-wrap errors, but let's face
 * it, if this function was preempted for enough time for the counters
 * to wrap, it really doesn't matter whether or not we expedite the grace
 * period.  The extra overhead of a needlessly expedited grace period is
 * negligible when amortized over that time period, and the extra latency
 * of a needlessly non-expedited grace period is similarly negligible.
 */
static bool srcu_should_expedite(struct srcu_struct *ssp)
{
	unsigned long curseq;
	unsigned long flags;
	struct srcu_data *sdp;
	unsigned long t;
	unsigned long tlast;

	check_init_srcu_struct(ssp);
	/* If _lite() readers, don't do unsolicited expediting. */
	if (this_cpu_read(ssp->sda->srcu_reader_flavor) & SRCU_READ_FLAVOR_LITE)
		return false;
	/* If the local srcu_data structure has callbacks, not idle. */
	sdp = raw_cpu_ptr(ssp->sda);
	spin_lock_irqsave_rcu_node(sdp, flags);
	if (rcu_segcblist_pend_cbs(&sdp->srcu_cblist)) {
		spin_unlock_irqrestore_rcu_node(sdp, flags);
		return false; /* Callbacks already present, so not idle. */
	}
	spin_unlock_irqrestore_rcu_node(sdp, flags);

	/*
	 * No local callbacks, so probabilistically probe global state.
	 * Exact information would require acquiring locks, which would
	 * kill scalability, hence the probabilistic nature of the probe.
	 */

	/* First, see if enough time has passed since the last GP. */
	t = ktime_get_mono_fast_ns();
	tlast = READ_ONCE(ssp->srcu_sup->srcu_last_gp_end);
	if (exp_holdoff == 0 ||
	    time_in_range_open(t, tlast, tlast + exp_holdoff))
		return false; /* Too soon after last GP. */

	/* Next, check for probable idleness. */
	curseq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
	smp_mb(); /* Order ->srcu_gp_seq with ->srcu_gp_seq_needed. */
	if (ULONG_CMP_LT(curseq, READ_ONCE(ssp->srcu_sup->srcu_gp_seq_needed)))
		return false; /* Grace period in progress, so not idle. */
	smp_mb(); /* Order ->srcu_gp_seq with prior access. */
	if (curseq != rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq))
		return false; /* GP # changed, so not idle. */
	return true; /* With reasonable probability, idle! */
}

/*
 * SRCU callback function to leak a callback.
 */
static void srcu_leak_callback(struct rcu_head *rhp)
{
}

/*
 * Start an SRCU grace period, and also queue the callback if non-NULL.
 */
static unsigned long srcu_gp_start_if_needed(struct srcu_struct *ssp,
					     struct rcu_head *rhp, bool do_norm)
{
	unsigned long flags;
	int idx;
	bool needexp = false;
	bool needgp = false;
	unsigned long s;
	struct srcu_data *sdp;
	struct srcu_node *sdp_mynode;
	int ss_state;

	check_init_srcu_struct(ssp);
	/*
	 * While starting a new grace period, make sure we are in an
	 * SRCU read-side critical section so that the grace-period
	 * sequence number cannot wrap around in the meantime.
	 */
	idx = __srcu_read_lock_nmisafe(ssp);
	ss_state = smp_load_acquire(&ssp->srcu_sup->srcu_size_state);
	if (ss_state < SRCU_SIZE_WAIT_CALL)
		sdp = per_cpu_ptr(ssp->sda, get_boot_cpu_id());
	else
		sdp = raw_cpu_ptr(ssp->sda);
	spin_lock_irqsave_sdp_contention(sdp, &flags);
	if (rhp)
		rcu_segcblist_enqueue(&sdp->srcu_cblist, rhp);
	/*
	 * It's crucial to capture the snapshot 's' for acceleration before
	 * reading the current gp_seq that is used for advancing.  This is
	 * essential because if the acceleration snapshot is taken after a
	 * failed advancement attempt, there's a risk that a grace period may
	 * conclude and a new one may start in the interim.  If the snapshot is
	 * captured after this sequence of events, the acceleration snapshot 's'
	 * could be excessively advanced, leading to acceleration failure.
	 * In such a scenario, an 'acceleration leak' can occur, where new
	 * callbacks become indefinitely stuck in the RCU_NEXT_TAIL segment.
	 * Also note that encountering advancing failures is a normal
	 * occurrence when the grace period for RCU_WAIT_TAIL is in progress.
	 *
	 * To see this, consider the following events which occur if
	 * rcu_seq_snap() were to be called after advance:
	 *
	 * 1) The RCU_WAIT_TAIL segment has callbacks (gp_num = X + 4) and the
	 *    RCU_NEXT_READY_TAIL also has callbacks (gp_num = X + 8).
	 *
	 * 2) The grace period for RCU_WAIT_TAIL is seen as started but not
	 *    completed so rcu_seq_current() returns X + SRCU_STATE_SCAN1.
	 *
	 * 3) This value is passed to rcu_segcblist_advance() which can't move
	 *    any segment forward and fails.
	 *
	 * 4) srcu_gp_start_if_needed() still proceeds with callback acceleration.
	 *    But then the call to rcu_seq_snap() observes the grace period for the
	 *    RCU_WAIT_TAIL segment as completed and the subsequent one for the
	 *    RCU_NEXT_READY_TAIL segment as started (ie: X + 4 + SRCU_STATE_SCAN1)
	 *    so it returns a snapshot of the next grace period, which is X + 12.
	 *
	 * 5) The value of X + 12 is passed to rcu_segcblist_accelerate() but the
	 *    freshly enqueued callback in RCU_NEXT_TAIL can't move to
	 *    RCU_NEXT_READY_TAIL which already has callbacks for a previous grace
	 *    period (gp_num = X + 8).  So acceleration fails.
	 */
	s = rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
	if (rhp) {
		rcu_segcblist_advance(&sdp->srcu_cblist,
				      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
		/*
		 * Acceleration can never fail because the base current gp_seq
		 * used for acceleration is <= the value of gp_seq used for
		 * advancing.  This means that RCU_NEXT_TAIL segment will
		 * always be able to be emptied by the acceleration into the
		 * RCU_NEXT_READY_TAIL or RCU_WAIT_TAIL segments.
		 */
		WARN_ON_ONCE(!rcu_segcblist_accelerate(&sdp->srcu_cblist, s));
	}
	if (ULONG_CMP_LT(sdp->srcu_gp_seq_needed, s)) {
		sdp->srcu_gp_seq_needed = s;
		needgp = true;
	}
	if (!do_norm && ULONG_CMP_LT(sdp->srcu_gp_seq_needed_exp, s)) {
		sdp->srcu_gp_seq_needed_exp = s;
		needexp = true;
	}
	spin_unlock_irqrestore_rcu_node(sdp, flags);

	/* Ensure that snp node tree is fully initialized before traversing it */
	if (ss_state < SRCU_SIZE_WAIT_BARRIER)
		sdp_mynode = NULL;
	else
		sdp_mynode = sdp->mynode;

	if (needgp)
		srcu_funnel_gp_start(ssp, sdp, s, do_norm);
	else if (needexp)
		srcu_funnel_exp_start(ssp, sdp_mynode, s);
	__srcu_read_unlock_nmisafe(ssp, idx);
	return s;
}

/*
 * Enqueue an SRCU callback on the srcu_data structure associated with
 * the current CPU and the specified srcu_struct structure, initiating
 * grace-period processing if it is not already running.
 *
 * Note that all CPUs must agree that the grace period extended beyond
 * all pre-existing SRCU read-side critical sections.  On systems with
 * more than one CPU, this means that when "func()" is invoked, each CPU
 * is guaranteed to have executed a full memory barrier since the end of
 * its last corresponding SRCU read-side critical section whose beginning
 * preceded the call to call_srcu().  It also means that each CPU executing
 * an SRCU read-side critical section that continues beyond the start of
 * "func()" must have executed a memory barrier after the call_srcu()
 * but before the beginning of that SRCU read-side critical section.
 * Note that these guarantees include CPUs that are offline, idle, or
 * executing in user mode, as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked call_srcu() and CPU B invoked the
 * resulting SRCU callback function "func()", then both CPU A and CPU
 * B are guaranteed to execute a full memory barrier during the time
 * interval between the call to call_srcu() and the invocation of "func()".
 * This guarantee applies even if CPU A and CPU B are the same CPU (but
 * again only if the system has more than one CPU).
 *
 * Of course, these guarantees apply only for invocations of call_srcu(),
 * srcu_read_lock(), and srcu_read_unlock() that are all passed the same
 * srcu_struct structure.
 */
static void __call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
			rcu_callback_t func, bool do_norm)
{
	if (debug_rcu_head_queue(rhp)) {
		/* Probable double call_srcu(), so leak the callback. */
		WRITE_ONCE(rhp->func, srcu_leak_callback);
		WARN_ONCE(1, "call_srcu(): Leaked duplicate callback\n");
		return;
	}
	rhp->func = func;
	(void)srcu_gp_start_if_needed(ssp, rhp, do_norm);
}

/**
 * call_srcu() - Queue a callback for invocation after an SRCU grace period
 * @ssp: srcu_struct on which to queue the callback
 * @rhp: structure to be used for queueing the SRCU callback.
 * @func: function to be invoked after the SRCU grace period
 *
 * The callback function will be invoked some time after a full SRCU
 * grace period elapses, in other words after all pre-existing SRCU
 * read-side critical sections have completed.  However, the callback
 * function might well execute concurrently with other SRCU read-side
 * critical sections that started after call_srcu() was invoked.  SRCU
 * read-side critical sections are delimited by srcu_read_lock() and
 * srcu_read_unlock(), and may be nested.
 *
 * The callback will be invoked from process context, but must nevertheless
 * be fast and must not block.
 */
void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
	       rcu_callback_t func)
{
	__call_srcu(ssp, rhp, func, true);
}
EXPORT_SYMBOL_GPL(call_srcu);
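/*
 * Editorial usage sketch (not part of the original file): the usual
 * call_srcu() pattern embeds an rcu_head in the protected object and
 * frees the object from the callback once all pre-existing readers of
 * the example_srcu domain are done.  Names here are hypothetical.
 */
struct example_node {
	int val;
	struct rcu_head rh;
};

static void example_free_cb(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct example_node, rh));
}

static void __maybe_unused example_async_free(struct example_node *node)
{
	/* Unpublish the node first (not shown), then defer the free. */
	call_srcu(&example_srcu, &node->rh, example_free_cb);
}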
/*
 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
 */
static void __synchronize_srcu(struct srcu_struct *ssp, bool do_norm)
{
	struct rcu_synchronize rcu;

	srcu_lock_sync(&ssp->dep_map);

	RCU_LOCKDEP_WARN(lockdep_is_held(ssp) ||
			 lock_is_held(&rcu_bh_lock_map) ||
			 lock_is_held(&rcu_lock_map) ||
			 lock_is_held(&rcu_sched_lock_map),
			 "Illegal synchronize_srcu() in same-type SRCU (or in RCU) read-side critical section");

	if (rcu_scheduler_active == RCU_SCHEDULER_INACTIVE)
		return;
	might_sleep();
	check_init_srcu_struct(ssp);
	init_completion(&rcu.completion);
	init_rcu_head_on_stack(&rcu.head);
	__call_srcu(ssp, &rcu.head, wakeme_after_rcu, do_norm);
	wait_for_completion(&rcu.completion);
	destroy_rcu_head_on_stack(&rcu.head);

	/*
	 * Make sure that later code is ordered after the SRCU grace
	 * period.  This pairs with the spin_lock_irq_rcu_node()
	 * in srcu_invoke_callbacks().  Unlike Tree RCU, this is needed
	 * because the current CPU might have been totally uninvolved with
	 * (and thus unordered against) that grace period.
	 */
	smp_mb();
}

/**
 * synchronize_srcu_expedited - Brute-force SRCU grace period
 * @ssp: srcu_struct with which to synchronize.
 *
 * Wait for an SRCU grace period to elapse, but be more aggressive about
 * spinning rather than blocking when waiting.
 *
 * Note that synchronize_srcu_expedited() has the same deadlock and
 * memory-ordering properties as does synchronize_srcu().
 */
void synchronize_srcu_expedited(struct srcu_struct *ssp)
{
	__synchronize_srcu(ssp, rcu_gp_is_normal());
}
EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);

/**
 * synchronize_srcu - wait for prior SRCU read-side critical-section completion
 * @ssp: srcu_struct with which to synchronize.
 *
 * Wait for the counts of both indexes to drain to zero.  To avoid possible
 * starvation of synchronize_srcu(), it first waits for the count of
 * index=((->srcu_idx & 1) ^ 1) to drain to zero, then flips ->srcu_idx and
 * waits for the count of the other index to drain.
 *
 * Can block; must be called from process context.
 *
 * Note that it is illegal to call synchronize_srcu() from the corresponding
 * SRCU read-side critical section; doing so will result in deadlock.
 * However, it is perfectly legal to call synchronize_srcu() on one
 * srcu_struct from some other srcu_struct's read-side critical section,
 * as long as the resulting graph of srcu_structs is acyclic.
 *
 * There are memory-ordering constraints implied by synchronize_srcu().
 * On systems with more than one CPU, when synchronize_srcu() returns,
 * each CPU is guaranteed to have executed a full memory barrier since
 * the end of its last corresponding SRCU read-side critical section
 * whose beginning preceded the call to synchronize_srcu().  In addition,
 * each CPU having an SRCU read-side critical section that extends beyond
 * the return from synchronize_srcu() is guaranteed to have executed a
 * full memory barrier after the beginning of synchronize_srcu() and before
 * the beginning of that SRCU read-side critical section.  Note that these
 * guarantees include CPUs that are offline, idle, or executing in user mode,
 * as well as CPUs that are executing in the kernel.
 *
 * Furthermore, if CPU A invoked synchronize_srcu(), which returned
 * to its caller on CPU B, then both CPU A and CPU B are guaranteed
 * to have executed a full memory barrier during the execution of
 * synchronize_srcu().  This guarantee applies even if CPU A and CPU B
 * are the same CPU, but again only if the system has more than one CPU.
 *
 * Of course, these memory-ordering guarantees apply only when
 * synchronize_srcu(), srcu_read_lock(), and srcu_read_unlock() are
 * passed the same srcu_struct structure.
 *
 * Implementation of these memory-ordering guarantees is similar to
 * that of synchronize_rcu().
 *
 * If SRCU is likely idle as determined by srcu_should_expedite(),
 * expedite the first request.  This semantic was provided by Classic SRCU,
 * and is relied upon by its users, so TREE SRCU must also provide it.
 * Note that detecting idleness is heuristic and subject to both false
 * positives and negatives.
 */
void synchronize_srcu(struct srcu_struct *ssp)
{
	if (srcu_should_expedite(ssp) || rcu_gp_is_expedited())
		synchronize_srcu_expedited(ssp);
	else
		__synchronize_srcu(ssp, true);
}
EXPORT_SYMBOL_GPL(synchronize_srcu);
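/*
 * Editorial usage sketch (not part of the original file): the synchronous
 * update-side pattern unpublishes an object, waits for pre-existing readers
 * of the example_srcu domain, and only then frees the object.  Names here
 * are hypothetical; a real caller would pass its update-side lock's lockdep
 * expression to rcu_dereference_protected() instead of the constant 1.
 */
static void __maybe_unused example_sync_replace(struct example_data *newp)
{
	struct example_data *old;

	old = rcu_dereference_protected(example_ptr, 1);
	rcu_assign_pointer(example_ptr, newp);
	synchronize_srcu(&example_srcu);	/* Wait for readers still using "old". */
	kfree(old);				/* No reader can still reference "old". */
}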
/**
 * get_state_synchronize_srcu - Provide an end-of-grace-period cookie
 * @ssp: srcu_struct to provide cookie for.
 *
 * This function returns a cookie that can be passed to
 * poll_state_synchronize_srcu(), which will return true if a full grace
 * period has elapsed in the meantime.  It is the caller's responsibility
 * to make sure that grace period happens, for example, by invoking
 * call_srcu() after return from get_state_synchronize_srcu().
 */
unsigned long get_state_synchronize_srcu(struct srcu_struct *ssp)
{
	// Any prior manipulation of SRCU-protected data must happen
	// before the load from ->srcu_gp_seq.
	smp_mb();
	return rcu_seq_snap(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);

/**
 * start_poll_synchronize_srcu - Provide cookie and start grace period
 * @ssp: srcu_struct to provide cookie for.
 *
 * This function returns a cookie that can be passed to
 * poll_state_synchronize_srcu(), which will return true if a full grace
 * period has elapsed in the meantime.  Unlike get_state_synchronize_srcu(),
 * this function also ensures that any needed SRCU grace period will be
 * started.  This convenience does come at a cost in terms of CPU overhead.
 */
unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
{
	return srcu_gp_start_if_needed(ssp, NULL, true);
}
EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);

/**
 * poll_state_synchronize_srcu - Has cookie's grace period ended?
 * @ssp: srcu_struct to provide cookie for.
 * @cookie: Return value from get_state_synchronize_srcu() or start_poll_synchronize_srcu().
 *
 * This function takes the cookie that was returned from either
 * get_state_synchronize_srcu() or start_poll_synchronize_srcu(), and
 * returns @true if an SRCU grace period elapsed since the time that the
 * cookie was created.
 *
 * Because cookies are finite in size, wrapping/overflow is possible.
 * This is more pronounced on 32-bit systems where cookies are 32 bits,
 * where in theory wrapping could happen in about 14 hours assuming
 * 25-microsecond expedited SRCU grace periods.  However, a more likely
 * overflow lower bound is on the order of 24 days in the case of
 * one-millisecond SRCU grace periods.  Of course, wrapping in a 64-bit
 * system requires geologic timespans, as in more than seven million years
 * even for expedited SRCU grace periods.
 *
 * Wrapping/overflow is much more of an issue for CONFIG_SMP=n systems
 * that also have CONFIG_PREEMPTION=n, which selects Tiny SRCU.  This uses
 * a 16-bit cookie, which rcutorture routinely wraps in a matter of a
 * few minutes.  If this proves to be a problem, this counter will be
 * expanded to the same size as for Tree SRCU.
 */
bool poll_state_synchronize_srcu(struct srcu_struct *ssp, unsigned long cookie)
{
	if (cookie != SRCU_GET_STATE_COMPLETED &&
	    !rcu_seq_done(&ssp->srcu_sup->srcu_gp_seq, cookie))
		return false;
	// Ensure that the end of the SRCU grace period happens before
	// any subsequent code that the caller might execute.
	smp_mb(); // ^^^
	return true;
}
EXPORT_SYMBOL_GPL(poll_state_synchronize_srcu);
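
/*
 * Illustrative sketch (not part of the SRCU implementation): how a caller
 * might use the polling interfaces above to avoid blocking.  The
 * srcu_struct "foo_srcu" and the per-object "srcu_cookie" field are
 * hypothetical names chosen for this example.
 *
 *	// At removal time: record a cookie and make sure that a grace
 *	// period will be started.
 *	obj->srcu_cookie = start_poll_synchronize_srcu(&foo_srcu);
 *
 *	// Later, from a context that must not block: free the object only
 *	// if its grace period has already elapsed, otherwise try again on
 *	// a subsequent pass.
 *	if (poll_state_synchronize_srcu(&foo_srcu, obj->srcu_cookie))
 *		kfree(obj);
 */
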
/*
 * Callback function for srcu_barrier() use.
 */
static void srcu_barrier_cb(struct rcu_head *rhp)
{
	struct srcu_data *sdp;
	struct srcu_struct *ssp;

	rhp->next = rhp; // Mark the callback as having been invoked.
	sdp = container_of(rhp, struct srcu_data, srcu_barrier_head);
	ssp = sdp->ssp;
	if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
		complete(&ssp->srcu_sup->srcu_barrier_completion);
}

/*
 * Enqueue an srcu_barrier() callback on the specified srcu_data
 * structure's ->cblist, but only if that ->cblist already has at least one
 * callback enqueued.  Note that if a CPU already has callbacks enqueued,
 * it must have already registered the need for a future grace period,
 * so all we need do is enqueue a callback that will use the same grace
 * period as the last callback already in the queue.
 */
static void srcu_barrier_one_cpu(struct srcu_struct *ssp, struct srcu_data *sdp)
{
	spin_lock_irq_rcu_node(sdp);
	atomic_inc(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
	sdp->srcu_barrier_head.func = srcu_barrier_cb;
	debug_rcu_head_queue(&sdp->srcu_barrier_head);
	if (!rcu_segcblist_entrain(&sdp->srcu_cblist,
				   &sdp->srcu_barrier_head)) {
		debug_rcu_head_unqueue(&sdp->srcu_barrier_head);
		atomic_dec(&ssp->srcu_sup->srcu_barrier_cpu_cnt);
	}
	spin_unlock_irq_rcu_node(sdp);
}

/**
 * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete.
 * @ssp: srcu_struct on which to wait for in-flight callbacks.
 */
void srcu_barrier(struct srcu_struct *ssp)
{
	int cpu;
	int idx;
	unsigned long s = rcu_seq_snap(&ssp->srcu_sup->srcu_barrier_seq);

	check_init_srcu_struct(ssp);
	mutex_lock(&ssp->srcu_sup->srcu_barrier_mutex);
	if (rcu_seq_done(&ssp->srcu_sup->srcu_barrier_seq, s)) {
		smp_mb(); /* Force ordering following return. */
		mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
		return; /* Someone else did our work for us. */
	}
	rcu_seq_start(&ssp->srcu_sup->srcu_barrier_seq);
	init_completion(&ssp->srcu_sup->srcu_barrier_completion);

	/* Initial count prevents reaching zero until all CBs are posted. */
	atomic_set(&ssp->srcu_sup->srcu_barrier_cpu_cnt, 1);

	idx = __srcu_read_lock_nmisafe(ssp);
	if (smp_load_acquire(&ssp->srcu_sup->srcu_size_state) < SRCU_SIZE_WAIT_BARRIER)
		srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, get_boot_cpu_id()));
	else
		for_each_possible_cpu(cpu)
			srcu_barrier_one_cpu(ssp, per_cpu_ptr(ssp->sda, cpu));
	__srcu_read_unlock_nmisafe(ssp, idx);

	/* Remove the initial count, at which point reaching zero can happen. */
	if (atomic_dec_and_test(&ssp->srcu_sup->srcu_barrier_cpu_cnt))
		complete(&ssp->srcu_sup->srcu_barrier_completion);
	wait_for_completion(&ssp->srcu_sup->srcu_barrier_completion);

	rcu_seq_end(&ssp->srcu_sup->srcu_barrier_seq);
	mutex_unlock(&ssp->srcu_sup->srcu_barrier_mutex);
}
EXPORT_SYMBOL_GPL(srcu_barrier);

/**
 * srcu_batches_completed - return batches completed.
 * @ssp: srcu_struct on which to report batch completion.
 *
 * Report the number of batches, correlated with, but not necessarily
 * precisely the same as, the number of grace periods that have elapsed.
 */
unsigned long srcu_batches_completed(struct srcu_struct *ssp)
{
	return READ_ONCE(ssp->srcu_idx);
}
EXPORT_SYMBOL_GPL(srcu_batches_completed);
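
/*
 * Illustrative sketch (not part of the SRCU implementation): a typical
 * teardown sequence that needs srcu_barrier(), assuming "foo_srcu" was
 * set up with init_srcu_struct().  The names foo_srcu and foo_shutdown()
 * are hypothetical.
 *
 *	static void foo_shutdown(void)
 *	{
 *		// Stop posting new callbacks via call_srcu() first, then:
 *		srcu_barrier(&foo_srcu);	// Wait for in-flight callbacks.
 *		cleanup_srcu_struct(&foo_srcu);	// Now safe to clean up.
 *	}
 */
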
/*
 * Core SRCU state machine.  Push state bits of ->srcu_gp_seq
 * to SRCU_STATE_SCAN2, and invoke srcu_gp_end() when scan has
 * completed in that state.
 */
static void srcu_advance_state(struct srcu_struct *ssp)
{
	int idx;

	mutex_lock(&ssp->srcu_sup->srcu_gp_mutex);

	/*
	 * Because readers might be delayed for an extended period after
	 * fetching ->srcu_idx for their index, at any point in time there
	 * might well be readers using both idx=0 and idx=1.  We therefore
	 * need to wait for readers to clear from both index values before
	 * invoking a callback.
	 *
	 * The load-acquire ensures that we see the accesses performed
	 * by the prior grace period.
	 */
	idx = rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq)); /* ^^^ */
	if (idx == SRCU_STATE_IDLE) {
		spin_lock_irq_rcu_node(ssp->srcu_sup);
		if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
			WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq));
			spin_unlock_irq_rcu_node(ssp->srcu_sup);
			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
			return;
		}
		idx = rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq));
		if (idx == SRCU_STATE_IDLE)
			srcu_gp_start(ssp);
		spin_unlock_irq_rcu_node(ssp->srcu_sup);
		if (idx != SRCU_STATE_IDLE) {
			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
			return; /* Someone else started the grace period. */
		}
	}

	if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN1) {
		idx = 1 ^ (ssp->srcu_idx & 1);
		if (!try_check_zero(ssp, idx, 1)) {
			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
			return; /* readers present, retry later. */
		}
		srcu_flip(ssp);
		spin_lock_irq_rcu_node(ssp->srcu_sup);
		rcu_seq_set_state(&ssp->srcu_sup->srcu_gp_seq, SRCU_STATE_SCAN2);
		ssp->srcu_sup->srcu_n_exp_nodelay = 0;
		spin_unlock_irq_rcu_node(ssp->srcu_sup);
	}

	if (rcu_seq_state(READ_ONCE(ssp->srcu_sup->srcu_gp_seq)) == SRCU_STATE_SCAN2) {

		/*
		 * SRCU read-side critical sections are normally short,
		 * so check at least twice in quick succession after a flip.
		 */
		idx = 1 ^ (ssp->srcu_idx & 1);
		if (!try_check_zero(ssp, idx, 2)) {
			mutex_unlock(&ssp->srcu_sup->srcu_gp_mutex);
			return; /* readers present, retry later. */
		}
		ssp->srcu_sup->srcu_n_exp_nodelay = 0;
		srcu_gp_end(ssp);  /* Releases ->srcu_gp_mutex. */
	}
}
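
/*
 * Rough model (not the exact kernel helpers) of the ->srcu_gp_seq value
 * consumed above: the two low-order bits encode the state-machine phase
 * (idle, scan1, or scan2) and the remaining bits count grace periods, so
 * rcu_seq_state() masks off the phase while the sequence comparisons
 * operate on the full value.
 *
 *	#define EXAMPLE_GP_SEQ_STATE(s)	((s) & 0x3)	// Phase bits.
 *	#define EXAMPLE_GP_SEQ_CTR(s)	((s) >> 2)	// Grace-period count.
 */
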
/*
 * Invoke a limited number of SRCU callbacks that have passed through
 * their grace period.  If there are more to do, SRCU will reschedule
 * the workqueue.  Note that needed memory barriers have been executed
 * in this task's context by srcu_readers_active_idx_check().
 */
static void srcu_invoke_callbacks(struct work_struct *work)
{
	long len;
	bool more;
	struct rcu_cblist ready_cbs;
	struct rcu_head *rhp;
	struct srcu_data *sdp;
	struct srcu_struct *ssp;

	sdp = container_of(work, struct srcu_data, work);

	ssp = sdp->ssp;
	rcu_cblist_init(&ready_cbs);
	spin_lock_irq_rcu_node(sdp);
	WARN_ON_ONCE(!rcu_segcblist_segempty(&sdp->srcu_cblist, RCU_NEXT_TAIL));
	rcu_segcblist_advance(&sdp->srcu_cblist,
			      rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq));
	/*
	 * Although this function is theoretically re-entrant, concurrent
	 * callbacks invocation is disallowed to avoid executing an SRCU barrier
	 * too early.
	 */
	if (sdp->srcu_cblist_invoking ||
	    !rcu_segcblist_ready_cbs(&sdp->srcu_cblist)) {
		spin_unlock_irq_rcu_node(sdp);
		return; /* Someone else on the job or nothing to do. */
	}

	/* We are on the job!  Extract and invoke ready callbacks. */
	sdp->srcu_cblist_invoking = true;
	rcu_segcblist_extract_done_cbs(&sdp->srcu_cblist, &ready_cbs);
	len = ready_cbs.len;
	spin_unlock_irq_rcu_node(sdp);
	rhp = rcu_cblist_dequeue(&ready_cbs);
	for (; rhp != NULL; rhp = rcu_cblist_dequeue(&ready_cbs)) {
		debug_rcu_head_unqueue(rhp);
		debug_rcu_head_callback(rhp);
		local_bh_disable();
		rhp->func(rhp);
		local_bh_enable();
	}
	WARN_ON_ONCE(ready_cbs.len);

	/*
	 * Update counts, accelerate new callbacks, and if needed,
	 * schedule another round of callback invocation.
	 */
	spin_lock_irq_rcu_node(sdp);
	rcu_segcblist_add_len(&sdp->srcu_cblist, -len);
	sdp->srcu_cblist_invoking = false;
	more = rcu_segcblist_ready_cbs(&sdp->srcu_cblist);
	spin_unlock_irq_rcu_node(sdp);
	/* An srcu_barrier() callback or callbacks from a previous round of work are still pending. */
	if (more)
		srcu_schedule_cbs_sdp(sdp, 0);
}

/*
 * Finished one round of SRCU grace period.  Start another if there are
 * more SRCU callbacks queued, otherwise put SRCU into not-running state.
 */
static void srcu_reschedule(struct srcu_struct *ssp, unsigned long delay)
{
	bool pushgp = true;

	spin_lock_irq_rcu_node(ssp->srcu_sup);
	if (ULONG_CMP_GE(ssp->srcu_sup->srcu_gp_seq, ssp->srcu_sup->srcu_gp_seq_needed)) {
		if (!WARN_ON_ONCE(rcu_seq_state(ssp->srcu_sup->srcu_gp_seq))) {
			/* All requests fulfilled, time to go idle. */
			pushgp = false;
		}
	} else if (!rcu_seq_state(ssp->srcu_sup->srcu_gp_seq)) {
		/* Outstanding request and no GP.  Start one. */
		srcu_gp_start(ssp);
	}
	spin_unlock_irq_rcu_node(ssp->srcu_sup);

	if (pushgp)
		queue_delayed_work(rcu_gp_wq, &ssp->srcu_sup->work, delay);
}

/*
 * This is the work-queue function that handles SRCU grace periods.
 */
static void process_srcu(struct work_struct *work)
{
	unsigned long curdelay;
	unsigned long j;
	struct srcu_struct *ssp;
	struct srcu_usage *sup;

	sup = container_of(work, struct srcu_usage, work.work);
	ssp = sup->srcu_ssp;

	srcu_advance_state(ssp);
	curdelay = srcu_get_delay(ssp);
	if (curdelay) {
		WRITE_ONCE(sup->reschedule_count, 0);
	} else {
		j = jiffies;
		if (READ_ONCE(sup->reschedule_jiffies) == j) {
			ASSERT_EXCLUSIVE_WRITER(sup->reschedule_count);
			WRITE_ONCE(sup->reschedule_count, READ_ONCE(sup->reschedule_count) + 1);
			if (READ_ONCE(sup->reschedule_count) > srcu_max_nodelay)
				curdelay = 1;
		} else {
			WRITE_ONCE(sup->reschedule_count, 1);
			WRITE_ONCE(sup->reschedule_jiffies, j);
		}
	}
	srcu_reschedule(ssp, curdelay);
}

void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags,
			     unsigned long *gp_seq)
{
	*flags = 0;
	*gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq);
}
EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);

static const char * const srcu_size_state_name[] = {
	"SRCU_SIZE_SMALL",
	"SRCU_SIZE_ALLOC",
	"SRCU_SIZE_WAIT_BARRIER",
	"SRCU_SIZE_WAIT_CALL",
	"SRCU_SIZE_WAIT_CBS1",
	"SRCU_SIZE_WAIT_CBS2",
	"SRCU_SIZE_WAIT_CBS3",
	"SRCU_SIZE_WAIT_CBS4",
	"SRCU_SIZE_BIG",
	"SRCU_SIZE_???",
};

void srcu_torture_stats_print(struct srcu_struct *ssp, char *tt, char *tf)
{
	int cpu;
	int idx;
	unsigned long s0 = 0, s1 = 0;
	int ss_state = READ_ONCE(ssp->srcu_sup->srcu_size_state);
	int ss_state_idx = ss_state;

	idx = ssp->srcu_idx & 0x1;
	if (ss_state < 0 || ss_state >= ARRAY_SIZE(srcu_size_state_name))
		ss_state_idx = ARRAY_SIZE(srcu_size_state_name) - 1;
	pr_alert("%s%s Tree SRCU g%ld state %d (%s)",
		 tt, tf, rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq), ss_state,
		 srcu_size_state_name[ss_state_idx]);
	if (!ssp->sda) {
		// Called after cleanup_srcu_struct(), perhaps.
		pr_cont(" No per-CPU srcu_data structures (->sda == NULL).\n");
	} else {
		pr_cont(" per-CPU(idx=%d):", idx);
		for_each_possible_cpu(cpu) {
			unsigned long l0, l1;
			unsigned long u0, u1;
			long c0, c1;
			struct srcu_data *sdp;

			sdp = per_cpu_ptr(ssp->sda, cpu);
			u0 = data_race(atomic_long_read(&sdp->srcu_unlock_count[!idx]));
			u1 = data_race(atomic_long_read(&sdp->srcu_unlock_count[idx]));

			/*
			 * Make sure that a lock is always counted if the corresponding
			 * unlock is counted.
			 */
			smp_rmb();

			l0 = data_race(atomic_long_read(&sdp->srcu_lock_count[!idx]));
			l1 = data_race(atomic_long_read(&sdp->srcu_lock_count[idx]));

			c0 = l0 - u0;
			c1 = l1 - u1;
			pr_cont(" %d(%ld,%ld %c)",
				cpu, c0, c1,
				"C."[rcu_segcblist_empty(&sdp->srcu_cblist)]);
			s0 += c0;
			s1 += c1;
		}
		pr_cont(" T(%ld,%ld)\n", s0, s1);
	}
	if (SRCU_SIZING_IS_TORTURE())
		srcu_transition_to_big(ssp);
}
EXPORT_SYMBOL_GPL(srcu_torture_stats_print);

static int __init srcu_bootup_announce(void)
{
	pr_info("Hierarchical SRCU implementation.\n");
	if (exp_holdoff != DEFAULT_SRCU_EXP_HOLDOFF)
		pr_info("\tNon-default auto-expedite holdoff of %lu ns.\n", exp_holdoff);
	if (srcu_retry_check_delay != SRCU_DEFAULT_RETRY_CHECK_DELAY)
		pr_info("\tNon-default retry check delay of %lu us.\n", srcu_retry_check_delay);
	if (srcu_max_nodelay != SRCU_DEFAULT_MAX_NODELAY)
		pr_info("\tNon-default max no-delay of %lu.\n", srcu_max_nodelay);
	pr_info("\tMax phase no-delay instances is %lu.\n", srcu_max_nodelay_phase);
	return 0;
}
early_initcall(srcu_bootup_announce);

void __init srcu_init(void)
{
	struct srcu_usage *sup;

	/* Decide on srcu_struct-size strategy. */
	if (SRCU_SIZING_IS(SRCU_SIZING_AUTO)) {
		if (nr_cpu_ids >= big_cpu_lim) {
			convert_to_big = SRCU_SIZING_INIT; // Don't bother waiting for contention.
			pr_info("%s: Setting srcu_struct sizes to big.\n", __func__);
		} else {
			convert_to_big = SRCU_SIZING_NONE | SRCU_SIZING_CONTEND;
			pr_info("%s: Setting srcu_struct sizes based on contention.\n", __func__);
		}
	}

	/*
	 * Once that is set, call_srcu() can follow the normal path and
	 * queue delayed work.  This must follow RCU workqueue creation
	 * and timer initialization.
	 */
	srcu_init_done = true;
	while (!list_empty(&srcu_boot_list)) {
		sup = list_first_entry(&srcu_boot_list, struct srcu_usage,
				       work.work.entry);
		list_del_init(&sup->work.work.entry);
		if (SRCU_SIZING_IS(SRCU_SIZING_INIT) &&
		    sup->srcu_size_state == SRCU_SIZE_SMALL)
			sup->srcu_size_state = SRCU_SIZE_ALLOC;
		queue_work(rcu_gp_wq, &sup->work.work);
	}
}

#ifdef CONFIG_MODULES

/* Initialize any global-scope srcu_struct structures used by this module. */
static int srcu_module_coming(struct module *mod)
{
	int i;
	struct srcu_struct *ssp;
	struct srcu_struct **sspp = mod->srcu_struct_ptrs;

	for (i = 0; i < mod->num_srcu_structs; i++) {
		ssp = *(sspp++);
		ssp->sda = alloc_percpu(struct srcu_data);
		if (WARN_ON_ONCE(!ssp->sda))
			return -ENOMEM;
	}
	return 0;
}

/* Clean up any global-scope srcu_struct structures used by this module. */
static void srcu_module_going(struct module *mod)
{
	int i;
	struct srcu_struct *ssp;
	struct srcu_struct **sspp = mod->srcu_struct_ptrs;

	for (i = 0; i < mod->num_srcu_structs; i++) {
		ssp = *(sspp++);
		if (!rcu_seq_state(smp_load_acquire(&ssp->srcu_sup->srcu_gp_seq_needed)) &&
		    !WARN_ON_ONCE(!ssp->srcu_sup->sda_is_static))
			cleanup_srcu_struct(ssp);
		if (!WARN_ON(srcu_readers_active(ssp)))
			free_percpu(ssp->sda);
	}
}
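
/*
 * Illustrative sketch (not part of the SRCU implementation): a module
 * that statically defines an srcu_struct.  The module notifier below
 * allocates the per-CPU ->sda storage for such structures when the
 * module is coming and frees it when the module is going.  The names
 * "foo_srcu" and foo_read() are hypothetical.
 *
 *	DEFINE_STATIC_SRCU(foo_srcu);
 *
 *	static int foo_read(void)
 *	{
 *		int idx = srcu_read_lock(&foo_srcu);
 *
 *		// ... access module data protected by foo_srcu ...
 *		srcu_read_unlock(&foo_srcu, idx);
 *		return 0;
 *	}
 */
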
/* Handle one module, either coming or going. */
static int srcu_module_notify(struct notifier_block *self,
			      unsigned long val, void *data)
{
	struct module *mod = data;
	int ret = 0;

	switch (val) {
	case MODULE_STATE_COMING:
		ret = srcu_module_coming(mod);
		break;
	case MODULE_STATE_GOING:
		srcu_module_going(mod);
		break;
	default:
		break;
	}
	return ret;
}

static struct notifier_block srcu_module_nb = {
	.notifier_call = srcu_module_notify,
	.priority = 0,
};

static __init int init_srcu_module_notifier(void)
{
	int ret;

	ret = register_module_notifier(&srcu_module_nb);
	if (ret)
		pr_warn("Failed to register srcu module notifier\n");
	return ret;
}
late_initcall(init_srcu_module_notifier);

#endif /* #ifdef CONFIG_MODULES */