1 /* 2 * Read-Copy Update mechanism for mutual exclusion 3 * 4 * This program is free software; you can redistribute it and/or modify 5 * it under the terms of the GNU General Public License as published by 6 * the Free Software Foundation; either version 2 of the License, or 7 * (at your option) any later version. 8 * 9 * This program is distributed in the hope that it will be useful, 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 12 * GNU General Public License for more details. 13 * 14 * You should have received a copy of the GNU General Public License 15 * along with this program; if not, you can access it online at 16 * http://www.gnu.org/licenses/gpl-2.0.html. 17 * 18 * Copyright IBM Corporation, 2008 19 * 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 21 * Manfred Spraul <manfred@colorfullife.com> 22 * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version 23 * 24 * Based on the original work by Paul McKenney <paulmck@us.ibm.com> 25 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. 26 * 27 * For detailed explanation of Read-Copy Update mechanism see - 28 * Documentation/RCU 29 */ 30 #include <linux/types.h> 31 #include <linux/kernel.h> 32 #include <linux/init.h> 33 #include <linux/spinlock.h> 34 #include <linux/smp.h> 35 #include <linux/rcupdate_wait.h> 36 #include <linux/interrupt.h> 37 #include <linux/sched.h> 38 #include <linux/sched/debug.h> 39 #include <linux/nmi.h> 40 #include <linux/atomic.h> 41 #include <linux/bitops.h> 42 #include <linux/export.h> 43 #include <linux/completion.h> 44 #include <linux/moduleparam.h> 45 #include <linux/percpu.h> 46 #include <linux/notifier.h> 47 #include <linux/cpu.h> 48 #include <linux/mutex.h> 49 #include <linux/time.h> 50 #include <linux/kernel_stat.h> 51 #include <linux/wait.h> 52 #include <linux/kthread.h> 53 #include <uapi/linux/sched/types.h> 54 #include <linux/prefetch.h> 55 #include <linux/delay.h> 56 #include <linux/stop_machine.h> 57 #include <linux/random.h> 58 #include <linux/trace_events.h> 59 #include <linux/suspend.h> 60 #include <linux/ftrace.h> 61 62 #include "tree.h" 63 #include "rcu.h" 64 65 #ifdef MODULE_PARAM_PREFIX 66 #undef MODULE_PARAM_PREFIX 67 #endif 68 #define MODULE_PARAM_PREFIX "rcutree." 69 70 /* Data structures. */ 71 72 /* 73 * In order to export the rcu_state name to the tracing tools, it 74 * needs to be added in the __tracepoint_string section. 75 * This requires defining a separate variable tp_<sname>_varname 76 * that points to the string being used, and this will allow 77 * the tracing userspace tools to be able to decipher the string 78 * address to the matching string. 
79 */ 80 #ifdef CONFIG_TRACING 81 # define DEFINE_RCU_TPS(sname) \ 82 static char sname##_varname[] = #sname; \ 83 static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; 84 # define RCU_STATE_NAME(sname) sname##_varname 85 #else 86 # define DEFINE_RCU_TPS(sname) 87 # define RCU_STATE_NAME(sname) __stringify(sname) 88 #endif 89 90 #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ 91 DEFINE_RCU_TPS(sname) \ 92 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \ 93 struct rcu_state sname##_state = { \ 94 .level = { &sname##_state.node[0] }, \ 95 .rda = &sname##_data, \ 96 .call = cr, \ 97 .gp_state = RCU_GP_IDLE, \ 98 .gpnum = 0UL - 300UL, \ 99 .completed = 0UL - 300UL, \ 100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ 101 .orphan_nxttail = &sname##_state.orphan_nxtlist, \ 102 .orphan_donetail = &sname##_state.orphan_donelist, \ 103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 104 .name = RCU_STATE_NAME(sname), \ 105 .abbr = sabbr, \ 106 .exp_mutex = __MUTEX_INITIALIZER(sname##_state.exp_mutex), \ 107 .exp_wake_mutex = __MUTEX_INITIALIZER(sname##_state.exp_wake_mutex), \ 108 } 109 110 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); 111 RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); 112 113 static struct rcu_state *const rcu_state_p; 114 LIST_HEAD(rcu_struct_flavors); 115 116 /* Dump rcu_node combining tree at boot to verify correct setup. */ 117 static bool dump_tree; 118 module_param(dump_tree, bool, 0444); 119 /* Control rcu_node-tree auto-balancing at boot time. */ 120 static bool rcu_fanout_exact; 121 module_param(rcu_fanout_exact, bool, 0444); 122 /* Increase (but not decrease) the RCU_FANOUT_LEAF at boot time. */ 123 static int rcu_fanout_leaf = RCU_FANOUT_LEAF; 124 module_param(rcu_fanout_leaf, int, 0444); 125 int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; 126 /* Number of rcu_nodes at specified level. */ 127 static int num_rcu_lvl[] = NUM_RCU_LVL_INIT; 128 int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ 129 /* panic() on RCU Stall sysctl. */ 130 int sysctl_panic_on_rcu_stall __read_mostly; 131 132 /* 133 * The rcu_scheduler_active variable is initialized to the value 134 * RCU_SCHEDULER_INACTIVE and transitions RCU_SCHEDULER_INIT just before the 135 * first task is spawned. So when this variable is RCU_SCHEDULER_INACTIVE, 136 * RCU can assume that there is but one task, allowing RCU to (for example) 137 * optimize synchronize_rcu() to a simple barrier(). When this variable 138 * is RCU_SCHEDULER_INIT, RCU must actually do all the hard work required 139 * to detect real grace periods. This variable is also used to suppress 140 * boot-time false positives from lockdep-RCU error checking. Finally, it 141 * transitions from RCU_SCHEDULER_INIT to RCU_SCHEDULER_RUNNING after RCU 142 * is fully initialized, including all of its kthreads having been spawned. 143 */ 144 int rcu_scheduler_active __read_mostly; 145 EXPORT_SYMBOL_GPL(rcu_scheduler_active); 146 147 /* 148 * The rcu_scheduler_fully_active variable transitions from zero to one 149 * during the early_initcall() processing, which is after the scheduler 150 * is capable of creating new tasks. So RCU processing (for example, 151 * creating tasks for RCU priority boosting) must be delayed until after 152 * rcu_scheduler_fully_active transitions from zero to one. We also 153 * currently delay invocation of any RCU callbacks until after this point. 
 *
 * It might later prove better for people registering RCU callbacks during
 * early boot to take responsibility for these callbacks, but one step at
 * a time.
 */
static int rcu_scheduler_fully_active __read_mostly;

static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
static void invoke_rcu_core(void);
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
static void rcu_report_exp_rdp(struct rcu_state *rsp,
			       struct rcu_data *rdp, bool wake);
static void sync_sched_exp_online_cleanup(int cpu);

/* rcuc/rcub kthread realtime priority */
#ifdef CONFIG_RCU_KTHREAD_PRIO
static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
#else /* #ifdef CONFIG_RCU_KTHREAD_PRIO */
static int kthread_prio = IS_ENABLED(CONFIG_RCU_BOOST) ? 1 : 0;
#endif /* #else #ifdef CONFIG_RCU_KTHREAD_PRIO */
module_param(kthread_prio, int, 0644);

/* Delay in jiffies for grace-period initialization delays, debug only. */

#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT
static int gp_preinit_delay = CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT_DELAY;
module_param(gp_preinit_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */
static const int gp_preinit_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_PREINIT */

#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT
static int gp_init_delay = CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY;
module_param(gp_init_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */
static const int gp_init_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_INIT */

#ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP
static int gp_cleanup_delay = CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP_DELAY;
module_param(gp_cleanup_delay, int, 0644);
#else /* #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */
static const int gp_cleanup_delay;
#endif /* #else #ifdef CONFIG_RCU_TORTURE_TEST_SLOW_CLEANUP */

/*
 * Number of grace periods between delays, normalized by the duration of
 * the delay. The longer the delay, the more grace periods between each
 * delay. The reason for this normalization is that it means that,
 * for non-zero delays, the overall slowdown of grace periods is constant
 * regardless of the duration of the delay. This arrangement balances
 * the need for long delays to increase some race probabilities with the
 * need for fast grace periods to increase other race probabilities.
 */
#define PER_RCU_NODE_PERIOD 3	/* Number of grace periods between delays. */

/*
 * Track the rcutorture test sequence number and the update version
 * number within a given test. The rcutorture_testseq is incremented
 * on every rcutorture module load and unload, so has an odd value
 * when a test is running. The rcutorture_vernum is set to zero
 * when rcutorture starts and is incremented on each rcutorture update.
 * These variables enable correlating rcutorture output with the
 * RCU tracing information.
 */
unsigned long rcutorture_testseq;
unsigned long rcutorture_vernum;

/*
 * Compute the mask of online CPUs for the specified rcu_node structure.
 * This will not be stable unless the rcu_node structure's ->lock is
 * held, but the bit corresponding to the current CPU will be stable
 * in most contexts.
 */
unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
{
	return READ_ONCE(rnp->qsmaskinitnext);
}

/*
 * Return true if an RCU grace period is in progress. The READ_ONCE()s
 * permit this function to be invoked without holding the root rcu_node
 * structure's ->lock, but of course results can be subject to change.
 */
static int rcu_gp_in_progress(struct rcu_state *rsp)
{
	return READ_ONCE(rsp->completed) != READ_ONCE(rsp->gpnum);
}

/*
 * Note a quiescent state. Because we do not need to know
 * how many quiescent states passed, just if there was at least
 * one since the start of the grace period, this just sets a flag.
 * The caller must have disabled preemption.
 */
void rcu_sched_qs(void)
{
	if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s))
		return;
	trace_rcu_grace_period(TPS("rcu_sched"),
			       __this_cpu_read(rcu_sched_data.gpnum),
			       TPS("cpuqs"));
	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false);
	if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))
		return;
	__this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false);
	rcu_report_exp_rdp(&rcu_sched_state,
			   this_cpu_ptr(&rcu_sched_data), true);
}

void rcu_bh_qs(void)
{
	if (__this_cpu_read(rcu_bh_data.cpu_no_qs.s)) {
		trace_rcu_grace_period(TPS("rcu_bh"),
				       __this_cpu_read(rcu_bh_data.gpnum),
				       TPS("cpuqs"));
		__this_cpu_write(rcu_bh_data.cpu_no_qs.b.norm, false);
	}
}

static DEFINE_PER_CPU(int, rcu_sched_qs_mask);

static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
	.dynticks_nesting = DYNTICK_TASK_EXIT_IDLE,
	.dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_NO_HZ_FULL_SYSIDLE
	.dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE,
	.dynticks_idle = ATOMIC_INIT(1),
#endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
};

/*
 * There are a few places, currently just in the tracing infrastructure,
 * that use rcu_irq_enter() to make sure RCU is watching. But there is
 * a small window where that will not work. In those cases,
 * rcu_irq_enter_disabled() needs to be checked to make sure rcu_irq_enter()
 * can be called.
 */
static DEFINE_PER_CPU(bool, disable_rcu_irq_enter);

bool rcu_irq_enter_disabled(void)
{
	return this_cpu_read(disable_rcu_irq_enter);
}

/*
 * Record entry into an extended quiescent state. This is only to be
 * called when not already in an extended quiescent state.
 */
static void rcu_dynticks_eqs_enter(void)
{
	struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);
	int special;

	/*
	 * CPUs seeing atomic_inc_return() must see prior RCU read-side
	 * critical sections, and we also must force ordering with the
	 * next idle sojourn.
	 */
	special = atomic_inc_return(&rdtp->dynticks);
	WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && special & 0x1);
}

/*
 * Record exit from an extended quiescent state. This is only to be
 * called from an extended quiescent state.
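 *
 * As the WARN_ON_ONCE() checks in this function and in
 * rcu_dynticks_eqs_enter() above suggest, ->dynticks is even while the
 * CPU is in an extended quiescent state and odd while it is not, so each
 * of these functions flips the parity with a single increment, for
 * example: 4 (idle) -> rcu_dynticks_eqs_exit() -> 5 (non-idle) ->
 * rcu_dynticks_eqs_enter() -> 6 (idle). (Illustrative values only.)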
322 */ 323 static void rcu_dynticks_eqs_exit(void) 324 { 325 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 326 int special; 327 328 /* 329 * CPUs seeing atomic_inc_return() must see prior idle sojourns, 330 * and we also must force ordering with the next RCU read-side 331 * critical section. 332 */ 333 special = atomic_inc_return(&rdtp->dynticks); 334 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(special & 0x1)); 335 } 336 337 /* 338 * Reset the current CPU's ->dynticks counter to indicate that the 339 * newly onlined CPU is no longer in an extended quiescent state. 340 * This will either leave the counter unchanged, or increment it 341 * to the next non-quiescent value. 342 * 343 * The non-atomic test/increment sequence works because the upper bits 344 * of the ->dynticks counter are manipulated only by the corresponding CPU, 345 * or when the corresponding CPU is offline. 346 */ 347 static void rcu_dynticks_eqs_online(void) 348 { 349 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 350 351 if (atomic_read(&rdtp->dynticks) & 0x1) 352 return; 353 atomic_add(0x1, &rdtp->dynticks); 354 } 355 356 /* 357 * Is the current CPU in an extended quiescent state? 358 * 359 * No ordering, as we are sampling CPU-local information. 360 */ 361 bool rcu_dynticks_curr_cpu_in_eqs(void) 362 { 363 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 364 365 return !(atomic_read(&rdtp->dynticks) & 0x1); 366 } 367 368 /* 369 * Snapshot the ->dynticks counter with full ordering so as to allow 370 * stable comparison of this counter with past and future snapshots. 371 */ 372 int rcu_dynticks_snap(struct rcu_dynticks *rdtp) 373 { 374 int snap = atomic_add_return(0, &rdtp->dynticks); 375 376 return snap; 377 } 378 379 /* 380 * Return true if the snapshot returned from rcu_dynticks_snap() 381 * indicates that RCU is in an extended quiescent state. 382 */ 383 static bool rcu_dynticks_in_eqs(int snap) 384 { 385 return !(snap & 0x1); 386 } 387 388 /* 389 * Return true if the CPU corresponding to the specified rcu_dynticks 390 * structure has spent some time in an extended quiescent state since 391 * rcu_dynticks_snap() returned the specified snapshot. 392 */ 393 static bool rcu_dynticks_in_eqs_since(struct rcu_dynticks *rdtp, int snap) 394 { 395 return snap != rcu_dynticks_snap(rdtp); 396 } 397 398 /* 399 * Do a double-increment of the ->dynticks counter to emulate a 400 * momentary idle-CPU quiescent state. 401 */ 402 static void rcu_dynticks_momentary_idle(void) 403 { 404 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 405 int special = atomic_add_return(2, &rdtp->dynticks); 406 407 /* It is illegal to call this from idle state. */ 408 WARN_ON_ONCE(!(special & 0x1)); 409 } 410 411 DEFINE_PER_CPU_SHARED_ALIGNED(unsigned long, rcu_qs_ctr); 412 EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); 413 414 /* 415 * Let the RCU core know that this CPU has gone through the scheduler, 416 * which is a quiescent state. This is called when the need for a 417 * quiescent state is urgent, so we burn an atomic operation and full 418 * memory barriers to let the RCU core know about it, regardless of what 419 * this CPU might (or might not) do in the near future. 420 * 421 * We inform the RCU core by emulating a zero-duration dyntick-idle 422 * period, which we in turn do by incrementing the ->dynticks counter 423 * by two. 424 * 425 * The caller must have disabled interrupts. 
426 */ 427 static void rcu_momentary_dyntick_idle(void) 428 { 429 struct rcu_data *rdp; 430 int resched_mask; 431 struct rcu_state *rsp; 432 433 /* 434 * Yes, we can lose flag-setting operations. This is OK, because 435 * the flag will be set again after some delay. 436 */ 437 resched_mask = raw_cpu_read(rcu_sched_qs_mask); 438 raw_cpu_write(rcu_sched_qs_mask, 0); 439 440 /* Find the flavor that needs a quiescent state. */ 441 for_each_rcu_flavor(rsp) { 442 rdp = raw_cpu_ptr(rsp->rda); 443 if (!(resched_mask & rsp->flavor_mask)) 444 continue; 445 smp_mb(); /* rcu_sched_qs_mask before cond_resched_completed. */ 446 if (READ_ONCE(rdp->mynode->completed) != 447 READ_ONCE(rdp->cond_resched_completed)) 448 continue; 449 450 /* 451 * Pretend to be momentarily idle for the quiescent state. 452 * This allows the grace-period kthread to record the 453 * quiescent state, with no need for this CPU to do anything 454 * further. 455 */ 456 rcu_dynticks_momentary_idle(); 457 break; 458 } 459 } 460 461 /* 462 * Note a context switch. This is a quiescent state for RCU-sched, 463 * and requires special handling for preemptible RCU. 464 * The caller must have disabled interrupts. 465 */ 466 void rcu_note_context_switch(void) 467 { 468 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 469 trace_rcu_utilization(TPS("Start context switch")); 470 rcu_sched_qs(); 471 rcu_preempt_note_context_switch(); 472 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) 473 rcu_momentary_dyntick_idle(); 474 trace_rcu_utilization(TPS("End context switch")); 475 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 476 } 477 EXPORT_SYMBOL_GPL(rcu_note_context_switch); 478 479 /* 480 * Register a quiescent state for all RCU flavors. If there is an 481 * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight 482 * dyntick-idle quiescent state visible to other CPUs (but only for those 483 * RCU flavors in desperate need of a quiescent state, which will normally 484 * be none of them). Either way, do a lightweight quiescent state for 485 * all RCU flavors. 486 * 487 * The barrier() calls are redundant in the common case when this is 488 * called externally, but just in case this is called from within this 489 * file. 490 * 491 */ 492 void rcu_all_qs(void) 493 { 494 unsigned long flags; 495 496 barrier(); /* Avoid RCU read-side critical sections leaking down. */ 497 if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { 498 local_irq_save(flags); 499 rcu_momentary_dyntick_idle(); 500 local_irq_restore(flags); 501 } 502 if (unlikely(raw_cpu_read(rcu_sched_data.cpu_no_qs.b.exp))) { 503 /* 504 * Yes, we just checked a per-CPU variable with preemption 505 * enabled, so we might be migrated to some other CPU at 506 * this point. That is OK because in that case, the 507 * migration will supply the needed quiescent state. 508 * We might end up needlessly disabling preemption and 509 * invoking rcu_sched_qs() on the destination CPU, but 510 * the probability and cost are both quite low, so this 511 * should not be a problem in practice. 512 */ 513 preempt_disable(); 514 rcu_sched_qs(); 515 preempt_enable(); 516 } 517 this_cpu_inc(rcu_qs_ctr); 518 barrier(); /* Avoid RCU read-side critical sections leaking up. */ 519 } 520 EXPORT_SYMBOL_GPL(rcu_all_qs); 521 522 static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ 523 static long qhimark = 10000; /* If this many pending, ignore blimit. */ 524 static long qlowmark = 100; /* Once only this many pending, use blimit. 
*/ 525 526 module_param(blimit, long, 0444); 527 module_param(qhimark, long, 0444); 528 module_param(qlowmark, long, 0444); 529 530 static ulong jiffies_till_first_fqs = ULONG_MAX; 531 static ulong jiffies_till_next_fqs = ULONG_MAX; 532 static bool rcu_kick_kthreads; 533 534 module_param(jiffies_till_first_fqs, ulong, 0644); 535 module_param(jiffies_till_next_fqs, ulong, 0644); 536 module_param(rcu_kick_kthreads, bool, 0644); 537 538 /* 539 * How long the grace period must be before we start recruiting 540 * quiescent-state help from rcu_note_context_switch(). 541 */ 542 static ulong jiffies_till_sched_qs = HZ / 20; 543 module_param(jiffies_till_sched_qs, ulong, 0644); 544 545 static bool rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 546 struct rcu_data *rdp); 547 static void force_qs_rnp(struct rcu_state *rsp, 548 int (*f)(struct rcu_data *rsp, bool *isidle, 549 unsigned long *maxj), 550 bool *isidle, unsigned long *maxj); 551 static void force_quiescent_state(struct rcu_state *rsp); 552 static int rcu_pending(void); 553 554 /* 555 * Return the number of RCU batches started thus far for debug & stats. 556 */ 557 unsigned long rcu_batches_started(void) 558 { 559 return rcu_state_p->gpnum; 560 } 561 EXPORT_SYMBOL_GPL(rcu_batches_started); 562 563 /* 564 * Return the number of RCU-sched batches started thus far for debug & stats. 565 */ 566 unsigned long rcu_batches_started_sched(void) 567 { 568 return rcu_sched_state.gpnum; 569 } 570 EXPORT_SYMBOL_GPL(rcu_batches_started_sched); 571 572 /* 573 * Return the number of RCU BH batches started thus far for debug & stats. 574 */ 575 unsigned long rcu_batches_started_bh(void) 576 { 577 return rcu_bh_state.gpnum; 578 } 579 EXPORT_SYMBOL_GPL(rcu_batches_started_bh); 580 581 /* 582 * Return the number of RCU batches completed thus far for debug & stats. 583 */ 584 unsigned long rcu_batches_completed(void) 585 { 586 return rcu_state_p->completed; 587 } 588 EXPORT_SYMBOL_GPL(rcu_batches_completed); 589 590 /* 591 * Return the number of RCU-sched batches completed thus far for debug & stats. 592 */ 593 unsigned long rcu_batches_completed_sched(void) 594 { 595 return rcu_sched_state.completed; 596 } 597 EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); 598 599 /* 600 * Return the number of RCU BH batches completed thus far for debug & stats. 601 */ 602 unsigned long rcu_batches_completed_bh(void) 603 { 604 return rcu_bh_state.completed; 605 } 606 EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); 607 608 /* 609 * Return the number of RCU expedited batches completed thus far for 610 * debug & stats. Odd numbers mean that a batch is in progress, even 611 * numbers mean idle. The value returned will thus be roughly double 612 * the cumulative batches since boot. 613 */ 614 unsigned long rcu_exp_batches_completed(void) 615 { 616 return rcu_state_p->expedited_sequence; 617 } 618 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed); 619 620 /* 621 * Return the number of RCU-sched expedited batches completed thus far 622 * for debug & stats. Similar to rcu_exp_batches_completed(). 623 */ 624 unsigned long rcu_exp_batches_completed_sched(void) 625 { 626 return rcu_sched_state.expedited_sequence; 627 } 628 EXPORT_SYMBOL_GPL(rcu_exp_batches_completed_sched); 629 630 /* 631 * Force a quiescent state. 632 */ 633 void rcu_force_quiescent_state(void) 634 { 635 force_quiescent_state(rcu_state_p); 636 } 637 EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); 638 639 /* 640 * Force a quiescent state for RCU BH. 
641 */ 642 void rcu_bh_force_quiescent_state(void) 643 { 644 force_quiescent_state(&rcu_bh_state); 645 } 646 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); 647 648 /* 649 * Force a quiescent state for RCU-sched. 650 */ 651 void rcu_sched_force_quiescent_state(void) 652 { 653 force_quiescent_state(&rcu_sched_state); 654 } 655 EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); 656 657 /* 658 * Show the state of the grace-period kthreads. 659 */ 660 void show_rcu_gp_kthreads(void) 661 { 662 struct rcu_state *rsp; 663 664 for_each_rcu_flavor(rsp) { 665 pr_info("%s: wait state: %d ->state: %#lx\n", 666 rsp->name, rsp->gp_state, rsp->gp_kthread->state); 667 /* sched_show_task(rsp->gp_kthread); */ 668 } 669 } 670 EXPORT_SYMBOL_GPL(show_rcu_gp_kthreads); 671 672 /* 673 * Record the number of times rcutorture tests have been initiated and 674 * terminated. This information allows the debugfs tracing stats to be 675 * correlated to the rcutorture messages, even when the rcutorture module 676 * is being repeatedly loaded and unloaded. In other words, we cannot 677 * store this state in rcutorture itself. 678 */ 679 void rcutorture_record_test_transition(void) 680 { 681 rcutorture_testseq++; 682 rcutorture_vernum = 0; 683 } 684 EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); 685 686 /* 687 * Send along grace-period-related data for rcutorture diagnostics. 688 */ 689 void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, 690 unsigned long *gpnum, unsigned long *completed) 691 { 692 struct rcu_state *rsp = NULL; 693 694 switch (test_type) { 695 case RCU_FLAVOR: 696 rsp = rcu_state_p; 697 break; 698 case RCU_BH_FLAVOR: 699 rsp = &rcu_bh_state; 700 break; 701 case RCU_SCHED_FLAVOR: 702 rsp = &rcu_sched_state; 703 break; 704 default: 705 break; 706 } 707 if (rsp != NULL) { 708 *flags = READ_ONCE(rsp->gp_flags); 709 *gpnum = READ_ONCE(rsp->gpnum); 710 *completed = READ_ONCE(rsp->completed); 711 return; 712 } 713 *flags = 0; 714 *gpnum = 0; 715 *completed = 0; 716 } 717 EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); 718 719 /* 720 * Record the number of writer passes through the current rcutorture test. 721 * This is also used to correlate debugfs tracing stats with the rcutorture 722 * messages. 723 */ 724 void rcutorture_record_progress(unsigned long vernum) 725 { 726 rcutorture_vernum++; 727 } 728 EXPORT_SYMBOL_GPL(rcutorture_record_progress); 729 730 /* 731 * Does the CPU have callbacks ready to be invoked? 732 */ 733 static int 734 cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) 735 { 736 return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && 737 rdp->nxttail[RCU_NEXT_TAIL] != NULL; 738 } 739 740 /* 741 * Return the root node of the specified rcu_state structure. 742 */ 743 static struct rcu_node *rcu_get_root(struct rcu_state *rsp) 744 { 745 return &rsp->node[0]; 746 } 747 748 /* 749 * Is there any need for future grace periods? 750 * Interrupts must be disabled. If the caller does not hold the root 751 * rnp_node structure's ->lock, the results are advisory only. 752 */ 753 static int rcu_future_needs_gp(struct rcu_state *rsp) 754 { 755 struct rcu_node *rnp = rcu_get_root(rsp); 756 int idx = (READ_ONCE(rnp->completed) + 1) & 0x1; 757 int *fp = &rnp->need_future_gp[idx]; 758 759 return READ_ONCE(*fp); 760 } 761 762 /* 763 * Does the current CPU require a not-yet-started grace period? 764 * The caller must have disabled interrupts to prevent races with 765 * normal callback registry. 
766 */ 767 static bool 768 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) 769 { 770 int i; 771 772 if (rcu_gp_in_progress(rsp)) 773 return false; /* No, a grace period is already in progress. */ 774 if (rcu_future_needs_gp(rsp)) 775 return true; /* Yes, a no-CBs CPU needs one. */ 776 if (!rdp->nxttail[RCU_NEXT_TAIL]) 777 return false; /* No, this is a no-CBs (or offline) CPU. */ 778 if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) 779 return true; /* Yes, CPU has newly registered callbacks. */ 780 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) 781 if (rdp->nxttail[i - 1] != rdp->nxttail[i] && 782 ULONG_CMP_LT(READ_ONCE(rsp->completed), 783 rdp->nxtcompleted[i])) 784 return true; /* Yes, CBs for future grace period. */ 785 return false; /* No grace period needed. */ 786 } 787 788 /* 789 * rcu_eqs_enter_common - current CPU is entering an extended quiescent state 790 * 791 * Enter idle, doing appropriate accounting. The caller must have 792 * disabled interrupts. 793 */ 794 static void rcu_eqs_enter_common(bool user) 795 { 796 struct rcu_state *rsp; 797 struct rcu_data *rdp; 798 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 799 800 trace_rcu_dyntick(TPS("Start"), rdtp->dynticks_nesting, 0); 801 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 802 !user && !is_idle_task(current)) { 803 struct task_struct *idle __maybe_unused = 804 idle_task(smp_processor_id()); 805 806 trace_rcu_dyntick(TPS("Error on entry: not idle task"), rdtp->dynticks_nesting, 0); 807 rcu_ftrace_dump(DUMP_ORIG); 808 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 809 current->pid, current->comm, 810 idle->pid, idle->comm); /* must be idle task! */ 811 } 812 for_each_rcu_flavor(rsp) { 813 rdp = this_cpu_ptr(rsp->rda); 814 do_nocb_deferred_wakeup(rdp); 815 } 816 rcu_prepare_for_idle(); 817 __this_cpu_inc(disable_rcu_irq_enter); 818 rdtp->dynticks_nesting = 0; /* Breaks tracing momentarily. */ 819 rcu_dynticks_eqs_enter(); /* After this, tracing works again. */ 820 __this_cpu_dec(disable_rcu_irq_enter); 821 rcu_dynticks_task_enter(); 822 823 /* 824 * It is illegal to enter an extended quiescent state while 825 * in an RCU read-side critical section. 826 */ 827 RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), 828 "Illegal idle entry in RCU read-side critical section."); 829 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), 830 "Illegal idle entry in RCU-bh read-side critical section."); 831 RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), 832 "Illegal idle entry in RCU-sched read-side critical section."); 833 } 834 835 /* 836 * Enter an RCU extended quiescent state, which can be either the 837 * idle loop or adaptive-tickless usermode execution. 838 */ 839 static void rcu_eqs_enter(bool user) 840 { 841 struct rcu_dynticks *rdtp; 842 843 rdtp = this_cpu_ptr(&rcu_dynticks); 844 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 845 (rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); 846 if ((rdtp->dynticks_nesting & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) 847 rcu_eqs_enter_common(user); 848 else 849 rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; 850 } 851 852 /** 853 * rcu_idle_enter - inform RCU that current CPU is entering idle 854 * 855 * Enter idle mode, in other words, -leave- the mode in which RCU 856 * read-side critical sections can occur. (Though RCU read-side 857 * critical sections can occur in irq handlers in idle, a possibility 858 * handled by irq_enter() and irq_exit().) 
859 * 860 * We crowbar the ->dynticks_nesting field to zero to allow for 861 * the possibility of usermode upcalls having messed up our count 862 * of interrupt nesting level during the prior busy period. 863 */ 864 void rcu_idle_enter(void) 865 { 866 unsigned long flags; 867 868 local_irq_save(flags); 869 rcu_eqs_enter(false); 870 rcu_sysidle_enter(0); 871 local_irq_restore(flags); 872 } 873 EXPORT_SYMBOL_GPL(rcu_idle_enter); 874 875 #ifdef CONFIG_NO_HZ_FULL 876 /** 877 * rcu_user_enter - inform RCU that we are resuming userspace. 878 * 879 * Enter RCU idle mode right before resuming userspace. No use of RCU 880 * is permitted between this call and rcu_user_exit(). This way the 881 * CPU doesn't need to maintain the tick for RCU maintenance purposes 882 * when the CPU runs in userspace. 883 */ 884 void rcu_user_enter(void) 885 { 886 rcu_eqs_enter(1); 887 } 888 #endif /* CONFIG_NO_HZ_FULL */ 889 890 /** 891 * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle 892 * 893 * Exit from an interrupt handler, which might possibly result in entering 894 * idle mode, in other words, leaving the mode in which read-side critical 895 * sections can occur. The caller must have disabled interrupts. 896 * 897 * This code assumes that the idle loop never does anything that might 898 * result in unbalanced calls to irq_enter() and irq_exit(). If your 899 * architecture violates this assumption, RCU will give you what you 900 * deserve, good and hard. But very infrequently and irreproducibly. 901 * 902 * Use things like work queues to work around this limitation. 903 * 904 * You have been warned. 905 */ 906 void rcu_irq_exit(void) 907 { 908 struct rcu_dynticks *rdtp; 909 910 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); 911 rdtp = this_cpu_ptr(&rcu_dynticks); 912 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 913 rdtp->dynticks_nesting < 1); 914 if (rdtp->dynticks_nesting <= 1) { 915 rcu_eqs_enter_common(true); 916 } else { 917 trace_rcu_dyntick(TPS("--="), rdtp->dynticks_nesting, rdtp->dynticks_nesting - 1); 918 rdtp->dynticks_nesting--; 919 } 920 rcu_sysidle_enter(1); 921 } 922 923 /* 924 * Wrapper for rcu_irq_exit() where interrupts are enabled. 925 */ 926 void rcu_irq_exit_irqson(void) 927 { 928 unsigned long flags; 929 930 local_irq_save(flags); 931 rcu_irq_exit(); 932 local_irq_restore(flags); 933 } 934 935 /* 936 * rcu_eqs_exit_common - current CPU moving away from extended quiescent state 937 * 938 * If the new value of the ->dynticks_nesting counter was previously zero, 939 * we really have exited idle, and must do the appropriate accounting. 940 * The caller must have disabled interrupts. 941 */ 942 static void rcu_eqs_exit_common(long long oldval, int user) 943 { 944 RCU_TRACE(struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks);) 945 946 rcu_dynticks_task_exit(); 947 rcu_dynticks_eqs_exit(); 948 rcu_cleanup_after_idle(); 949 trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); 950 if (IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 951 !user && !is_idle_task(current)) { 952 struct task_struct *idle __maybe_unused = 953 idle_task(smp_processor_id()); 954 955 trace_rcu_dyntick(TPS("Error on exit: not idle task"), 956 oldval, rdtp->dynticks_nesting); 957 rcu_ftrace_dump(DUMP_ORIG); 958 WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", 959 current->pid, current->comm, 960 idle->pid, idle->comm); /* must be idle task! 
*/ 961 } 962 } 963 964 /* 965 * Exit an RCU extended quiescent state, which can be either the 966 * idle loop or adaptive-tickless usermode execution. 967 */ 968 static void rcu_eqs_exit(bool user) 969 { 970 struct rcu_dynticks *rdtp; 971 long long oldval; 972 973 rdtp = this_cpu_ptr(&rcu_dynticks); 974 oldval = rdtp->dynticks_nesting; 975 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); 976 if (oldval & DYNTICK_TASK_NEST_MASK) { 977 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 978 } else { 979 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 980 rcu_eqs_exit_common(oldval, user); 981 } 982 } 983 984 /** 985 * rcu_idle_exit - inform RCU that current CPU is leaving idle 986 * 987 * Exit idle mode, in other words, -enter- the mode in which RCU 988 * read-side critical sections can occur. 989 * 990 * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_NEST to 991 * allow for the possibility of usermode upcalls messing up our count 992 * of interrupt nesting level during the busy period that is just 993 * now starting. 994 */ 995 void rcu_idle_exit(void) 996 { 997 unsigned long flags; 998 999 local_irq_save(flags); 1000 rcu_eqs_exit(false); 1001 rcu_sysidle_exit(0); 1002 local_irq_restore(flags); 1003 } 1004 EXPORT_SYMBOL_GPL(rcu_idle_exit); 1005 1006 #ifdef CONFIG_NO_HZ_FULL 1007 /** 1008 * rcu_user_exit - inform RCU that we are exiting userspace. 1009 * 1010 * Exit RCU idle mode while entering the kernel because it can 1011 * run a RCU read side critical section anytime. 1012 */ 1013 void rcu_user_exit(void) 1014 { 1015 rcu_eqs_exit(1); 1016 } 1017 #endif /* CONFIG_NO_HZ_FULL */ 1018 1019 /** 1020 * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle 1021 * 1022 * Enter an interrupt handler, which might possibly result in exiting 1023 * idle mode, in other words, entering the mode in which read-side critical 1024 * sections can occur. The caller must have disabled interrupts. 1025 * 1026 * Note that the Linux kernel is fully capable of entering an interrupt 1027 * handler that it never exits, for example when doing upcalls to 1028 * user mode! This code assumes that the idle loop never does upcalls to 1029 * user mode. If your architecture does do upcalls from the idle loop (or 1030 * does anything else that results in unbalanced calls to the irq_enter() 1031 * and irq_exit() functions), RCU will give you what you deserve, good 1032 * and hard. But very infrequently and irreproducibly. 1033 * 1034 * Use things like work queues to work around this limitation. 1035 * 1036 * You have been warned. 1037 */ 1038 void rcu_irq_enter(void) 1039 { 1040 struct rcu_dynticks *rdtp; 1041 long long oldval; 1042 1043 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); 1044 rdtp = this_cpu_ptr(&rcu_dynticks); 1045 oldval = rdtp->dynticks_nesting; 1046 rdtp->dynticks_nesting++; 1047 WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && 1048 rdtp->dynticks_nesting == 0); 1049 if (oldval) 1050 trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); 1051 else 1052 rcu_eqs_exit_common(oldval, true); 1053 rcu_sysidle_exit(1); 1054 } 1055 1056 /* 1057 * Wrapper for rcu_irq_enter() where interrupts are enabled. 
1058 */ 1059 void rcu_irq_enter_irqson(void) 1060 { 1061 unsigned long flags; 1062 1063 local_irq_save(flags); 1064 rcu_irq_enter(); 1065 local_irq_restore(flags); 1066 } 1067 1068 /** 1069 * rcu_nmi_enter - inform RCU of entry to NMI context 1070 * 1071 * If the CPU was idle from RCU's viewpoint, update rdtp->dynticks and 1072 * rdtp->dynticks_nmi_nesting to let the RCU grace-period handling know 1073 * that the CPU is active. This implementation permits nested NMIs, as 1074 * long as the nesting level does not overflow an int. (You will probably 1075 * run out of stack space first.) 1076 */ 1077 void rcu_nmi_enter(void) 1078 { 1079 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 1080 int incby = 2; 1081 1082 /* Complain about underflow. */ 1083 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting < 0); 1084 1085 /* 1086 * If idle from RCU viewpoint, atomically increment ->dynticks 1087 * to mark non-idle and increment ->dynticks_nmi_nesting by one. 1088 * Otherwise, increment ->dynticks_nmi_nesting by two. This means 1089 * if ->dynticks_nmi_nesting is equal to one, we are guaranteed 1090 * to be in the outermost NMI handler that interrupted an RCU-idle 1091 * period (observation due to Andy Lutomirski). 1092 */ 1093 if (rcu_dynticks_curr_cpu_in_eqs()) { 1094 rcu_dynticks_eqs_exit(); 1095 incby = 1; 1096 } 1097 rdtp->dynticks_nmi_nesting += incby; 1098 barrier(); 1099 } 1100 1101 /** 1102 * rcu_nmi_exit - inform RCU of exit from NMI context 1103 * 1104 * If we are returning from the outermost NMI handler that interrupted an 1105 * RCU-idle period, update rdtp->dynticks and rdtp->dynticks_nmi_nesting 1106 * to let the RCU grace-period handling know that the CPU is back to 1107 * being RCU-idle. 1108 */ 1109 void rcu_nmi_exit(void) 1110 { 1111 struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); 1112 1113 /* 1114 * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. 1115 * (We are exiting an NMI handler, so RCU better be paying attention 1116 * to us!) 1117 */ 1118 WARN_ON_ONCE(rdtp->dynticks_nmi_nesting <= 0); 1119 WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); 1120 1121 /* 1122 * If the nesting level is not 1, the CPU wasn't RCU-idle, so 1123 * leave it in non-RCU-idle state. 1124 */ 1125 if (rdtp->dynticks_nmi_nesting != 1) { 1126 rdtp->dynticks_nmi_nesting -= 2; 1127 return; 1128 } 1129 1130 /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ 1131 rdtp->dynticks_nmi_nesting = 0; 1132 rcu_dynticks_eqs_enter(); 1133 } 1134 1135 /** 1136 * __rcu_is_watching - are RCU read-side critical sections safe? 1137 * 1138 * Return true if RCU is watching the running CPU, which means that 1139 * this CPU can safely enter RCU read-side critical sections. Unlike 1140 * rcu_is_watching(), the caller of __rcu_is_watching() must have at 1141 * least disabled preemption. 1142 */ 1143 bool notrace __rcu_is_watching(void) 1144 { 1145 return !rcu_dynticks_curr_cpu_in_eqs(); 1146 } 1147 1148 /** 1149 * rcu_is_watching - see if RCU thinks that the current CPU is idle 1150 * 1151 * If the current CPU is in its idle loop and is neither in an interrupt 1152 * or NMI handler, return true. 1153 */ 1154 bool notrace rcu_is_watching(void) 1155 { 1156 bool ret; 1157 1158 preempt_disable_notrace(); 1159 ret = __rcu_is_watching(); 1160 preempt_enable_notrace(); 1161 return ret; 1162 } 1163 EXPORT_SYMBOL_GPL(rcu_is_watching); 1164 1165 #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) 1166 1167 /* 1168 * Is the current CPU online? 
Disable preemption to avoid false positives
 * that could otherwise happen due to the current CPU number being sampled,
 * this task being preempted, its old CPU being taken offline, resuming
 * on some other CPU, then determining that its old CPU is now offline.
 * It is OK to use RCU on an offline processor during initial boot, hence
 * the check for rcu_scheduler_fully_active. Note also that it is OK
 * for a CPU coming online to use RCU for one jiffy prior to marking itself
 * online in the cpu_online_mask. Similarly, it is OK for a CPU going
 * offline to continue to use RCU for one jiffy after marking itself
 * offline in the cpu_online_mask. This leniency is necessary given the
 * non-atomic nature of the online and offline processing, for example,
 * the fact that a CPU enters the scheduler after completing the teardown
 * of the CPU.
 *
 * This is also why RCU internally marks CPUs online during the
 * preparation phase and offline after the CPU has been taken down.
 *
 * Disable checking if in an NMI handler because we cannot safely report
 * errors from NMI handlers anyway.
 */
bool rcu_lockdep_current_cpu_online(void)
{
	struct rcu_data *rdp;
	struct rcu_node *rnp;
	bool ret;

	if (in_nmi())
		return true;
	preempt_disable();
	rdp = this_cpu_ptr(&rcu_sched_data);
	rnp = rdp->mynode;
	ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
	      !rcu_scheduler_fully_active;
	preempt_enable();
	return ret;
}
EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);

#endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */

/**
 * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle
 *
 * If the current CPU is idle or running at a first-level (not nested)
 * interrupt from idle, return true. The caller must have at least
 * disabled preemption.
 */
static int rcu_is_cpu_rrupt_from_idle(void)
{
	return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1;
}

/*
 * Snapshot the specified CPU's dynticks counter so that we can later
 * credit them with an implicit quiescent state. Return 1 if this CPU
 * is in dynticks idle mode, which is an extended quiescent state.
 */
static int dyntick_save_progress_counter(struct rcu_data *rdp,
					 bool *isidle, unsigned long *maxj)
{
	rdp->dynticks_snap = rcu_dynticks_snap(rdp->dynticks);
	rcu_sysidle_check_cpu(rdp, isidle, maxj);
	if (rcu_dynticks_in_eqs(rdp->dynticks_snap)) {
		trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti"));
		if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4,
				 rdp->mynode->gpnum))
			WRITE_ONCE(rdp->gpwrap, true);
		return 1;
	}
	return 0;
}

/*
 * Return true if the specified CPU has passed through a quiescent
 * state by virtue of being in or having passed through a dynticks
 * idle state since the last call to dyntick_save_progress_counter()
 * for this same CPU, or by virtue of having been offline.
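 *
 * Roughly speaking, the force-quiescent-state scan uses this function
 * and dyntick_save_progress_counter() as a pair (a simplified sketch of
 * the logic, not the exact code):
 *
 *	if (first_fqs_scan_of_this_grace_period)
 *		force_qs_rnp(rsp, dyntick_save_progress_counter, &isidle, &maxj);
 *	else
 *		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);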
1245 */ 1246 static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, 1247 bool *isidle, unsigned long *maxj) 1248 { 1249 unsigned long jtsq; 1250 int *rcrmp; 1251 unsigned long rjtsc; 1252 struct rcu_node *rnp; 1253 1254 /* 1255 * If the CPU passed through or entered a dynticks idle phase with 1256 * no active irq/NMI handlers, then we can safely pretend that the CPU 1257 * already acknowledged the request to pass through a quiescent 1258 * state. Either way, that CPU cannot possibly be in an RCU 1259 * read-side critical section that started before the beginning 1260 * of the current RCU grace period. 1261 */ 1262 if (rcu_dynticks_in_eqs_since(rdp->dynticks, rdp->dynticks_snap)) { 1263 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); 1264 rdp->dynticks_fqs++; 1265 return 1; 1266 } 1267 1268 /* Compute and saturate jiffies_till_sched_qs. */ 1269 jtsq = jiffies_till_sched_qs; 1270 rjtsc = rcu_jiffies_till_stall_check(); 1271 if (jtsq > rjtsc / 2) { 1272 WRITE_ONCE(jiffies_till_sched_qs, rjtsc); 1273 jtsq = rjtsc / 2; 1274 } else if (jtsq < 1) { 1275 WRITE_ONCE(jiffies_till_sched_qs, 1); 1276 jtsq = 1; 1277 } 1278 1279 /* 1280 * Has this CPU encountered a cond_resched_rcu_qs() since the 1281 * beginning of the grace period? For this to be the case, 1282 * the CPU has to have noticed the current grace period. This 1283 * might not be the case for nohz_full CPUs looping in the kernel. 1284 */ 1285 rnp = rdp->mynode; 1286 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) && 1287 READ_ONCE(rdp->rcu_qs_ctr_snap) != per_cpu(rcu_qs_ctr, rdp->cpu) && 1288 READ_ONCE(rdp->gpnum) == rnp->gpnum && !rdp->gpwrap) { 1289 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("rqc")); 1290 return 1; 1291 } 1292 1293 /* Check for the CPU being offline. */ 1294 if (!(rdp->grpmask & rcu_rnp_online_cpus(rnp))) { 1295 trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); 1296 rdp->offline_fqs++; 1297 return 1; 1298 } 1299 1300 /* 1301 * A CPU running for an extended time within the kernel can 1302 * delay RCU grace periods. When the CPU is in NO_HZ_FULL mode, 1303 * even context-switching back and forth between a pair of 1304 * in-kernel CPU-bound tasks cannot advance grace periods. 1305 * So if the grace period is old enough, make the CPU pay attention. 1306 * Note that the unsynchronized assignments to the per-CPU 1307 * rcu_sched_qs_mask variable are safe. Yes, setting of 1308 * bits can be lost, but they will be set again on the next 1309 * force-quiescent-state pass. So lost bit sets do not result 1310 * in incorrect behavior, merely in a grace period lasting 1311 * a few jiffies longer than it might otherwise. Because 1312 * there are at most four threads involved, and because the 1313 * updates are only once every few jiffies, the probability of 1314 * lossage (and thus of slight grace-period extension) is 1315 * quite low. 1316 * 1317 * Note that if the jiffies_till_sched_qs boot/sysfs parameter 1318 * is set too high, we override with half of the RCU CPU stall 1319 * warning delay. 1320 */ 1321 rcrmp = &per_cpu(rcu_sched_qs_mask, rdp->cpu); 1322 if (time_after(jiffies, rdp->rsp->gp_start + jtsq) || 1323 time_after(jiffies, rdp->rsp->jiffies_resched)) { 1324 if (!(READ_ONCE(*rcrmp) & rdp->rsp->flavor_mask)) { 1325 WRITE_ONCE(rdp->cond_resched_completed, 1326 READ_ONCE(rdp->mynode->completed)); 1327 smp_mb(); /* ->cond_resched_completed before *rcrmp. 
*/ 1328 WRITE_ONCE(*rcrmp, 1329 READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); 1330 } 1331 rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ 1332 } 1333 1334 /* 1335 * If more than halfway to RCU CPU stall-warning time, do 1336 * a resched_cpu() to try to loosen things up a bit. 1337 */ 1338 if (jiffies - rdp->rsp->gp_start > rcu_jiffies_till_stall_check() / 2) 1339 resched_cpu(rdp->cpu); 1340 1341 return 0; 1342 } 1343 1344 static void record_gp_stall_check_time(struct rcu_state *rsp) 1345 { 1346 unsigned long j = jiffies; 1347 unsigned long j1; 1348 1349 rsp->gp_start = j; 1350 smp_wmb(); /* Record start time before stall time. */ 1351 j1 = rcu_jiffies_till_stall_check(); 1352 WRITE_ONCE(rsp->jiffies_stall, j + j1); 1353 rsp->jiffies_resched = j + j1 / 2; 1354 rsp->n_force_qs_gpstart = READ_ONCE(rsp->n_force_qs); 1355 } 1356 1357 /* 1358 * Convert a ->gp_state value to a character string. 1359 */ 1360 static const char *gp_state_getname(short gs) 1361 { 1362 if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) 1363 return "???"; 1364 return gp_state_names[gs]; 1365 } 1366 1367 /* 1368 * Complain about starvation of grace-period kthread. 1369 */ 1370 static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) 1371 { 1372 unsigned long gpa; 1373 unsigned long j; 1374 1375 j = jiffies; 1376 gpa = READ_ONCE(rsp->gp_activity); 1377 if (j - gpa > 2 * HZ) { 1378 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", 1379 rsp->name, j - gpa, 1380 rsp->gpnum, rsp->completed, 1381 rsp->gp_flags, 1382 gp_state_getname(rsp->gp_state), rsp->gp_state, 1383 rsp->gp_kthread ? rsp->gp_kthread->state : ~0); 1384 if (rsp->gp_kthread) { 1385 sched_show_task(rsp->gp_kthread); 1386 wake_up_process(rsp->gp_kthread); 1387 } 1388 } 1389 } 1390 1391 /* 1392 * Dump stacks of all tasks running on stalled CPUs. First try using 1393 * NMIs, but fall back to manual remote stack tracing on architectures 1394 * that don't support NMI-based stack dumps. The NMI-triggered stack 1395 * traces are more accurate because they are printed by the target CPU. 1396 */ 1397 static void rcu_dump_cpu_stacks(struct rcu_state *rsp) 1398 { 1399 int cpu; 1400 unsigned long flags; 1401 struct rcu_node *rnp; 1402 1403 rcu_for_each_leaf_node(rsp, rnp) { 1404 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1405 for_each_leaf_node_possible_cpu(rnp, cpu) 1406 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) 1407 if (!trigger_single_cpu_backtrace(cpu)) 1408 dump_cpu_task(cpu); 1409 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1410 } 1411 } 1412 1413 /* 1414 * If too much time has passed in the current grace period, and if 1415 * so configured, go kick the relevant kthreads. 
1416 */ 1417 static void rcu_stall_kick_kthreads(struct rcu_state *rsp) 1418 { 1419 unsigned long j; 1420 1421 if (!rcu_kick_kthreads) 1422 return; 1423 j = READ_ONCE(rsp->jiffies_kick_kthreads); 1424 if (time_after(jiffies, j) && rsp->gp_kthread && 1425 (rcu_gp_in_progress(rsp) || READ_ONCE(rsp->gp_flags))) { 1426 WARN_ONCE(1, "Kicking %s grace-period kthread\n", rsp->name); 1427 rcu_ftrace_dump(DUMP_ALL); 1428 wake_up_process(rsp->gp_kthread); 1429 WRITE_ONCE(rsp->jiffies_kick_kthreads, j + HZ); 1430 } 1431 } 1432 1433 static inline void panic_on_rcu_stall(void) 1434 { 1435 if (sysctl_panic_on_rcu_stall) 1436 panic("RCU Stall\n"); 1437 } 1438 1439 static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) 1440 { 1441 int cpu; 1442 long delta; 1443 unsigned long flags; 1444 unsigned long gpa; 1445 unsigned long j; 1446 int ndetected = 0; 1447 struct rcu_node *rnp = rcu_get_root(rsp); 1448 long totqlen = 0; 1449 1450 /* Kick and suppress, if so configured. */ 1451 rcu_stall_kick_kthreads(rsp); 1452 if (rcu_cpu_stall_suppress) 1453 return; 1454 1455 /* Only let one CPU complain about others per time interval. */ 1456 1457 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1458 delta = jiffies - READ_ONCE(rsp->jiffies_stall); 1459 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { 1460 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1461 return; 1462 } 1463 WRITE_ONCE(rsp->jiffies_stall, 1464 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1465 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1466 1467 /* 1468 * OK, time to rat on our buddy... 1469 * See Documentation/RCU/stallwarn.txt for info on how to debug 1470 * RCU CPU stall warnings. 1471 */ 1472 pr_err("INFO: %s detected stalls on CPUs/tasks:", 1473 rsp->name); 1474 print_cpu_stall_info_begin(); 1475 rcu_for_each_leaf_node(rsp, rnp) { 1476 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1477 ndetected += rcu_print_task_stall(rnp); 1478 if (rnp->qsmask != 0) { 1479 for_each_leaf_node_possible_cpu(rnp, cpu) 1480 if (rnp->qsmask & leaf_node_cpu_bit(rnp, cpu)) { 1481 print_cpu_stall_info(rsp, cpu); 1482 ndetected++; 1483 } 1484 } 1485 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 1486 } 1487 1488 print_cpu_stall_info_end(); 1489 for_each_possible_cpu(cpu) 1490 totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; 1491 pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", 1492 smp_processor_id(), (long)(jiffies - rsp->gp_start), 1493 (long)rsp->gpnum, (long)rsp->completed, totqlen); 1494 if (ndetected) { 1495 rcu_dump_cpu_stacks(rsp); 1496 1497 /* Complain about tasks blocking the grace period. */ 1498 rcu_print_detail_task_stall(rsp); 1499 } else { 1500 if (READ_ONCE(rsp->gpnum) != gpnum || 1501 READ_ONCE(rsp->completed) == gpnum) { 1502 pr_err("INFO: Stall ended before state dump start\n"); 1503 } else { 1504 j = jiffies; 1505 gpa = READ_ONCE(rsp->gp_activity); 1506 pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n", 1507 rsp->name, j - gpa, j, gpa, 1508 jiffies_till_next_fqs, 1509 rcu_get_root(rsp)->qsmask); 1510 /* In this case, the current CPU might be at fault. */ 1511 sched_show_task(current); 1512 } 1513 } 1514 1515 rcu_check_gp_kthread_starvation(rsp); 1516 1517 panic_on_rcu_stall(); 1518 1519 force_quiescent_state(rsp); /* Kick them all. 
 */
}

static void print_cpu_stall(struct rcu_state *rsp)
{
	int cpu;
	unsigned long flags;
	struct rcu_node *rnp = rcu_get_root(rsp);
	long totqlen = 0;

	/* Kick and suppress, if so configured. */
	rcu_stall_kick_kthreads(rsp);
	if (rcu_cpu_stall_suppress)
		return;

	/*
	 * OK, time to rat on ourselves...
	 * See Documentation/RCU/stallwarn.txt for info on how to debug
	 * RCU CPU stall warnings.
	 */
	pr_err("INFO: %s self-detected stall on CPU", rsp->name);
	print_cpu_stall_info_begin();
	print_cpu_stall_info(rsp, smp_processor_id());
	print_cpu_stall_info_end();
	for_each_possible_cpu(cpu)
		totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen;
	pr_cont(" (t=%lu jiffies g=%ld c=%ld q=%lu)\n",
		jiffies - rsp->gp_start,
		(long)rsp->gpnum, (long)rsp->completed, totqlen);

	rcu_check_gp_kthread_starvation(rsp);

	rcu_dump_cpu_stacks(rsp);

	raw_spin_lock_irqsave_rcu_node(rnp, flags);
	if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
		WRITE_ONCE(rsp->jiffies_stall,
			   jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);

	panic_on_rcu_stall();

	/*
	 * Attempt to revive the RCU machinery by forcing a context switch.
	 *
	 * A context switch would normally allow the RCU state machine to make
	 * progress and it could be we're stuck in kernel space without context
	 * switches for an entirely unreasonable amount of time.
	 */
	resched_cpu(smp_processor_id());
}

static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
	unsigned long completed;
	unsigned long gpnum;
	unsigned long gps;
	unsigned long j;
	unsigned long js;
	struct rcu_node *rnp;

	if ((rcu_cpu_stall_suppress && !rcu_kick_kthreads) ||
	    !rcu_gp_in_progress(rsp))
		return;
	rcu_stall_kick_kthreads(rsp);
	j = jiffies;

	/*
	 * Lots of memory barriers to reject false positives.
	 *
	 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
	 * then rsp->gp_start, and finally rsp->completed. These values
	 * are updated in the opposite order with memory barriers (or
	 * equivalent) during grace-period initialization and cleanup.
	 * Now, a false positive can occur if we get a new value of
	 * rsp->gp_start and an old value of rsp->jiffies_stall. But given
	 * the memory barriers, the only way that this can happen is if one
	 * grace period ends and another starts between these two fetches.
	 * Detect this by comparing rsp->completed with the previous fetch
	 * from rsp->gpnum.
	 *
	 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
	 * and rsp->gp_start suffice to forestall false positives.
	 */
	gpnum = READ_ONCE(rsp->gpnum);
	smp_rmb(); /* Pick up ->gpnum first... */
	js = READ_ONCE(rsp->jiffies_stall);
	smp_rmb(); /* ...then ->jiffies_stall before the rest... */
	gps = READ_ONCE(rsp->gp_start);
	smp_rmb(); /* ...and finally ->gp_start before ->completed. */
	completed = READ_ONCE(rsp->completed);
	if (ULONG_CMP_GE(completed, gpnum) ||
	    ULONG_CMP_LT(j, js) ||
	    ULONG_CMP_GE(gps, js))
		return; /* No stall or GP completed since entering function.
*/ 1614 rnp = rdp->mynode; 1615 if (rcu_gp_in_progress(rsp) && 1616 (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { 1617 1618 /* We haven't checked in, so go dump stack. */ 1619 print_cpu_stall(rsp); 1620 1621 } else if (rcu_gp_in_progress(rsp) && 1622 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1623 1624 /* They had a few time units to dump stack, so complain. */ 1625 print_other_cpu_stall(rsp, gpnum); 1626 } 1627 } 1628 1629 /** 1630 * rcu_cpu_stall_reset - prevent further stall warnings in current grace period 1631 * 1632 * Set the stall-warning timeout way off into the future, thus preventing 1633 * any RCU CPU stall-warning messages from appearing in the current set of 1634 * RCU grace periods. 1635 * 1636 * The caller must disable hard irqs. 1637 */ 1638 void rcu_cpu_stall_reset(void) 1639 { 1640 struct rcu_state *rsp; 1641 1642 for_each_rcu_flavor(rsp) 1643 WRITE_ONCE(rsp->jiffies_stall, jiffies + ULONG_MAX / 2); 1644 } 1645 1646 /* 1647 * Initialize the specified rcu_data structure's default callback list 1648 * to empty. The default callback list is the one that is not used by 1649 * no-callbacks CPUs. 1650 */ 1651 static void init_default_callback_list(struct rcu_data *rdp) 1652 { 1653 int i; 1654 1655 rdp->nxtlist = NULL; 1656 for (i = 0; i < RCU_NEXT_SIZE; i++) 1657 rdp->nxttail[i] = &rdp->nxtlist; 1658 } 1659 1660 /* 1661 * Initialize the specified rcu_data structure's callback list to empty. 1662 */ 1663 static void init_callback_list(struct rcu_data *rdp) 1664 { 1665 if (init_nocb_callback_list(rdp)) 1666 return; 1667 init_default_callback_list(rdp); 1668 } 1669 1670 /* 1671 * Determine the value that ->completed will have at the end of the 1672 * next subsequent grace period. This is used to tag callbacks so that 1673 * a CPU can invoke callbacks in a timely fashion even if that CPU has 1674 * been dyntick-idle for an extended period with callbacks under the 1675 * influence of RCU_FAST_NO_HZ. 1676 * 1677 * The caller must hold rnp->lock with interrupts disabled. 1678 */ 1679 static unsigned long rcu_cbs_completed(struct rcu_state *rsp, 1680 struct rcu_node *rnp) 1681 { 1682 /* 1683 * If RCU is idle, we just wait for the next grace period. 1684 * But we can only be sure that RCU is idle if we are looking 1685 * at the root rcu_node structure -- otherwise, a new grace 1686 * period might have started, but just not yet gotten around 1687 * to initializing the current non-root rcu_node structure. 1688 */ 1689 if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) 1690 return rnp->completed + 1; 1691 1692 /* 1693 * Otherwise, wait for a possible partial grace period and 1694 * then the subsequent full grace period. 1695 */ 1696 return rnp->completed + 2; 1697 } 1698 1699 /* 1700 * Trace-event helper function for rcu_start_future_gp() and 1701 * rcu_nocb_wait_gp(). 1702 */ 1703 static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1704 unsigned long c, const char *s) 1705 { 1706 trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, 1707 rnp->completed, c, rnp->level, 1708 rnp->grplo, rnp->grphi, s); 1709 } 1710 1711 /* 1712 * Start some future grace period, as needed to handle newly arrived 1713 * callbacks. The required future grace periods are recorded in each 1714 * rcu_node structure's ->need_future_gp field. Returns true if there 1715 * is reason to awaken the grace-period kthread. 1716 * 1717 * The caller must hold the specified rcu_node structure's ->lock. 
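 *
 * For example, with illustrative numbers only: if this rcu_node structure
 * is the root, rnp->completed is 4, and no grace period is in progress,
 * then rcu_cbs_completed() returns c = 5, so the request is recorded in
 * ->need_future_gp[5 & 0x1], that is, slot 1. When grace period 5 later
 * ends, rcu_future_gp_cleanup() clears slot 1 and checks slot
 * (5 + 1) & 0x1 == 0 for any follow-on requests.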
1718 */ 1719 static bool __maybe_unused 1720 rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, 1721 unsigned long *c_out) 1722 { 1723 unsigned long c; 1724 int i; 1725 bool ret = false; 1726 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); 1727 1728 /* 1729 * Pick up grace-period number for new callbacks. If this 1730 * grace period is already marked as needed, return to the caller. 1731 */ 1732 c = rcu_cbs_completed(rdp->rsp, rnp); 1733 trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); 1734 if (rnp->need_future_gp[c & 0x1]) { 1735 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); 1736 goto out; 1737 } 1738 1739 /* 1740 * If either this rcu_node structure or the root rcu_node structure 1741 * believes that a grace period is in progress, then we must wait 1742 * for the one following, which is in "c". Because our request 1743 * will be noticed at the end of the current grace period, we don't 1744 * need to explicitly start one. We only do the lockless check 1745 * of rnp_root's fields if the current rcu_node structure thinks 1746 * there is no grace period in flight, and because we hold rnp->lock, 1747 * the only possible change is when rnp_root's two fields are 1748 * equal, in which case rnp_root->gpnum might be concurrently 1749 * incremented. But that is OK, as it will just result in our 1750 * doing some extra useless work. 1751 */ 1752 if (rnp->gpnum != rnp->completed || 1753 READ_ONCE(rnp_root->gpnum) != READ_ONCE(rnp_root->completed)) { 1754 rnp->need_future_gp[c & 0x1]++; 1755 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); 1756 goto out; 1757 } 1758 1759 /* 1760 * There might be no grace period in progress. If we don't already 1761 * hold it, acquire the root rcu_node structure's lock in order to 1762 * start one (if needed). 1763 */ 1764 if (rnp != rnp_root) 1765 raw_spin_lock_rcu_node(rnp_root); 1766 1767 /* 1768 * Get a new grace-period number. If there really is no grace 1769 * period in progress, it will be smaller than the one we obtained 1770 * earlier. Adjust callbacks as needed. Note that even no-CBs 1771 * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. 1772 */ 1773 c = rcu_cbs_completed(rdp->rsp, rnp_root); 1774 for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) 1775 if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) 1776 rdp->nxtcompleted[i] = c; 1777 1778 /* 1779 * If the need for the required grace period is already 1780 * recorded, trace and leave. 1781 */ 1782 if (rnp_root->need_future_gp[c & 0x1]) { 1783 trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); 1784 goto unlock_out; 1785 } 1786 1787 /* Record the need for the future grace period. */ 1788 rnp_root->need_future_gp[c & 0x1]++; 1789 1790 /* If a grace period is not already in progress, start one. */ 1791 if (rnp_root->gpnum != rnp_root->completed) { 1792 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); 1793 } else { 1794 trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); 1795 ret = rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); 1796 } 1797 unlock_out: 1798 if (rnp != rnp_root) 1799 raw_spin_unlock_rcu_node(rnp_root); 1800 out: 1801 if (c_out != NULL) 1802 *c_out = c; 1803 return ret; 1804 } 1805 1806 /* 1807 * Clean up any old requests for the just-ended grace period. Also return 1808 * whether any additional grace periods have been requested. Also invoke 1809 * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads 1810 * waiting for this grace period to complete. 
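 *
 * Note that the just-ended grace period's requests live in slot
 * rnp->completed & 0x1, which is zeroed below, while any request for
 * the next grace period lives in the other slot, (c + 1) & 0x1, which
 * is what the "needmore" return value reports.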
1811 */ 1812 static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) 1813 { 1814 int c = rnp->completed; 1815 int needmore; 1816 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 1817 1818 rnp->need_future_gp[c & 0x1] = 0; 1819 needmore = rnp->need_future_gp[(c + 1) & 0x1]; 1820 trace_rcu_future_gp(rnp, rdp, c, 1821 needmore ? TPS("CleanupMore") : TPS("Cleanup")); 1822 return needmore; 1823 } 1824 1825 /* 1826 * Awaken the grace-period kthread for the specified flavor of RCU. 1827 * Don't do a self-awaken, and don't bother awakening when there is 1828 * nothing for the grace-period kthread to do (as in several CPUs 1829 * raced to awaken, and we lost), and finally don't try to awaken 1830 * a kthread that has not yet been created. 1831 */ 1832 static void rcu_gp_kthread_wake(struct rcu_state *rsp) 1833 { 1834 if (current == rsp->gp_kthread || 1835 !READ_ONCE(rsp->gp_flags) || 1836 !rsp->gp_kthread) 1837 return; 1838 swake_up(&rsp->gp_wq); 1839 } 1840 1841 /* 1842 * If there is room, assign a ->completed number to any callbacks on 1843 * this CPU that have not already been assigned. Also accelerate any 1844 * callbacks that were previously assigned a ->completed number that has 1845 * since proven to be too conservative, which can happen if callbacks get 1846 * assigned a ->completed number while RCU is idle, but with reference to 1847 * a non-root rcu_node structure. This function is idempotent, so it does 1848 * not hurt to call it repeatedly. Returns a flag saying that we should 1849 * awaken the RCU grace-period kthread. 1850 * 1851 * The caller must hold rnp->lock with interrupts disabled. 1852 */ 1853 static bool rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1854 struct rcu_data *rdp) 1855 { 1856 unsigned long c; 1857 int i; 1858 bool ret; 1859 1860 /* If the CPU has no callbacks, nothing to do. */ 1861 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1862 return false; 1863 1864 /* 1865 * Starting from the sublist containing the callbacks most 1866 * recently assigned a ->completed number and working down, find the 1867 * first sublist that is not assignable to an upcoming grace period. 1868 * Such a sublist has something in it (first two tests) and has 1869 * a ->completed number assigned that will complete sooner than 1870 * the ->completed number for newly arrived callbacks (last test). 1871 * 1872 * The key point is that any later sublist can be assigned the 1873 * same ->completed number as the newly arrived callbacks, which 1874 * means that the callbacks in any of these later sublists can be 1875 * grouped into a single sublist, whether or not they have already 1876 * been assigned a ->completed number. 1877 */ 1878 c = rcu_cbs_completed(rsp, rnp); 1879 for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) 1880 if (rdp->nxttail[i] != rdp->nxttail[i - 1] && 1881 !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) 1882 break; 1883 1884 /* 1885 * If there is no sublist for unassigned callbacks, leave. 1886 * At the same time, advance "i" one sublist, so that "i" will 1887 * index into the sublist where all the remaining callbacks should 1888 * be grouped. 1889 */ 1890 if (++i >= RCU_NEXT_TAIL) 1891 return false; 1892 1893 /* 1894 * Assign all subsequent callbacks' ->completed number to the next 1895 * full grace period and group them all in the sublist initially 1896 * indexed by "i". 
1897 */ 1898 for (; i <= RCU_NEXT_TAIL; i++) { 1899 rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; 1900 rdp->nxtcompleted[i] = c; 1901 } 1902 /* Record any needed additional grace periods. */ 1903 ret = rcu_start_future_gp(rnp, rdp, NULL); 1904 1905 /* Trace depending on how much we were able to accelerate. */ 1906 if (!*rdp->nxttail[RCU_WAIT_TAIL]) 1907 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); 1908 else 1909 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); 1910 return ret; 1911 } 1912 1913 /* 1914 * Move any callbacks whose grace period has completed to the 1915 * RCU_DONE_TAIL sublist, then compact the remaining sublists and 1916 * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL 1917 * sublist. This function is idempotent, so it does not hurt to 1918 * invoke it repeatedly. As long as it is not invoked -too- often... 1919 * Returns true if the RCU grace-period kthread needs to be awakened. 1920 * 1921 * The caller must hold rnp->lock with interrupts disabled. 1922 */ 1923 static bool rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, 1924 struct rcu_data *rdp) 1925 { 1926 int i, j; 1927 1928 /* If the CPU has no callbacks, nothing to do. */ 1929 if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) 1930 return false; 1931 1932 /* 1933 * Find all callbacks whose ->completed numbers indicate that they 1934 * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. 1935 */ 1936 for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { 1937 if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) 1938 break; 1939 rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; 1940 } 1941 /* Clean up any sublist tail pointers that were misordered above. */ 1942 for (j = RCU_WAIT_TAIL; j < i; j++) 1943 rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; 1944 1945 /* Copy down callbacks to fill in empty sublists. */ 1946 for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { 1947 if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) 1948 break; 1949 rdp->nxttail[j] = rdp->nxttail[i]; 1950 rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; 1951 } 1952 1953 /* Classify any remaining callbacks. */ 1954 return rcu_accelerate_cbs(rsp, rnp, rdp); 1955 } 1956 1957 /* 1958 * Update CPU-local rcu_data state to record the beginnings and ends of 1959 * grace periods. The caller must hold the ->lock of the leaf rcu_node 1960 * structure corresponding to the current CPU, and must have irqs disabled. 1961 * Returns true if the grace-period kthread needs to be awakened. 1962 */ 1963 static bool __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, 1964 struct rcu_data *rdp) 1965 { 1966 bool ret; 1967 bool need_gp; 1968 1969 /* Handle the ends of any preceding grace periods first. */ 1970 if (rdp->completed == rnp->completed && 1971 !unlikely(READ_ONCE(rdp->gpwrap))) { 1972 1973 /* No grace period end, so just accelerate recent callbacks. */ 1974 ret = rcu_accelerate_cbs(rsp, rnp, rdp); 1975 1976 } else { 1977 1978 /* Advance callbacks. */ 1979 ret = rcu_advance_cbs(rsp, rnp, rdp); 1980 1981 /* Remember that we saw this grace-period completion. */ 1982 rdp->completed = rnp->completed; 1983 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); 1984 } 1985 1986 if (rdp->gpnum != rnp->gpnum || unlikely(READ_ONCE(rdp->gpwrap))) { 1987 /* 1988 * If the current grace period is waiting for this CPU, 1989 * set up to detect a quiescent state, otherwise don't 1990 * go looking for one. 
1991 */ 1992 rdp->gpnum = rnp->gpnum; 1993 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); 1994 need_gp = !!(rnp->qsmask & rdp->grpmask); 1995 rdp->cpu_no_qs.b.norm = need_gp; 1996 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 1997 rdp->core_needs_qs = need_gp; 1998 zero_cpu_stall_ticks(rdp); 1999 WRITE_ONCE(rdp->gpwrap, false); 2000 } 2001 return ret; 2002 } 2003 2004 static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) 2005 { 2006 unsigned long flags; 2007 bool needwake; 2008 struct rcu_node *rnp; 2009 2010 local_irq_save(flags); 2011 rnp = rdp->mynode; 2012 if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && 2013 rdp->completed == READ_ONCE(rnp->completed) && 2014 !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ 2015 !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ 2016 local_irq_restore(flags); 2017 return; 2018 } 2019 needwake = __note_gp_changes(rsp, rnp, rdp); 2020 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2021 if (needwake) 2022 rcu_gp_kthread_wake(rsp); 2023 } 2024 2025 static void rcu_gp_slow(struct rcu_state *rsp, int delay) 2026 { 2027 if (delay > 0 && 2028 !(rsp->gpnum % (rcu_num_nodes * PER_RCU_NODE_PERIOD * delay))) 2029 schedule_timeout_uninterruptible(delay); 2030 } 2031 2032 /* 2033 * Initialize a new grace period. Return false if no grace period required. 2034 */ 2035 static bool rcu_gp_init(struct rcu_state *rsp) 2036 { 2037 unsigned long oldmask; 2038 struct rcu_data *rdp; 2039 struct rcu_node *rnp = rcu_get_root(rsp); 2040 2041 WRITE_ONCE(rsp->gp_activity, jiffies); 2042 raw_spin_lock_irq_rcu_node(rnp); 2043 if (!READ_ONCE(rsp->gp_flags)) { 2044 /* Spurious wakeup, tell caller to go back to sleep. */ 2045 raw_spin_unlock_irq_rcu_node(rnp); 2046 return false; 2047 } 2048 WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ 2049 2050 if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { 2051 /* 2052 * Grace period already in progress, don't start another. 2053 * Not supposed to be able to happen. 2054 */ 2055 raw_spin_unlock_irq_rcu_node(rnp); 2056 return false; 2057 } 2058 2059 /* Advance to a new grace period and initialize state. */ 2060 record_gp_stall_check_time(rsp); 2061 /* Record GP times before starting GP, hence smp_store_release(). */ 2062 smp_store_release(&rsp->gpnum, rsp->gpnum + 1); 2063 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 2064 raw_spin_unlock_irq_rcu_node(rnp); 2065 2066 /* 2067 * Apply per-leaf buffered online and offline operations to the 2068 * rcu_node tree. Note that this new grace period need not wait 2069 * for subsequent online CPUs, and that quiescent-state forcing 2070 * will handle subsequent offline CPUs. 2071 */ 2072 rcu_for_each_leaf_node(rsp, rnp) { 2073 rcu_gp_slow(rsp, gp_preinit_delay); 2074 raw_spin_lock_irq_rcu_node(rnp); 2075 if (rnp->qsmaskinit == rnp->qsmaskinitnext && 2076 !rnp->wait_blkd_tasks) { 2077 /* Nothing to do on this leaf rcu_node structure. */ 2078 raw_spin_unlock_irq_rcu_node(rnp); 2079 continue; 2080 } 2081 2082 /* Record old state, apply changes to ->qsmaskinit field. */ 2083 oldmask = rnp->qsmaskinit; 2084 rnp->qsmaskinit = rnp->qsmaskinitnext; 2085 2086 /* If zero-ness of ->qsmaskinit changed, propagate up tree. */ 2087 if (!oldmask != !rnp->qsmaskinit) { 2088 if (!oldmask) /* First online CPU for this rcu_node. */ 2089 rcu_init_new_rnp(rnp); 2090 else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */ 2091 rnp->wait_blkd_tasks = true; 2092 else /* Last offline CPU and can propagate. 
*/ 2093 rcu_cleanup_dead_rnp(rnp); 2094 } 2095 2096 /* 2097 * If all waited-on tasks from prior grace period are 2098 * done, and if all this rcu_node structure's CPUs are 2099 * still offline, propagate up the rcu_node tree and 2100 * clear ->wait_blkd_tasks. Otherwise, if one of this 2101 * rcu_node structure's CPUs has since come back online, 2102 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp() 2103 * checks for this, so just call it unconditionally). 2104 */ 2105 if (rnp->wait_blkd_tasks && 2106 (!rcu_preempt_has_tasks(rnp) || 2107 rnp->qsmaskinit)) { 2108 rnp->wait_blkd_tasks = false; 2109 rcu_cleanup_dead_rnp(rnp); 2110 } 2111 2112 raw_spin_unlock_irq_rcu_node(rnp); 2113 } 2114 2115 /* 2116 * Set the quiescent-state-needed bits in all the rcu_node 2117 * structures for all currently online CPUs in breadth-first order, 2118 * starting from the root rcu_node structure, relying on the layout 2119 * of the tree within the rsp->node[] array. Note that other CPUs 2120 * will access only the leaves of the hierarchy, thus seeing that no 2121 * grace period is in progress, at least until the corresponding 2122 * leaf node has been initialized. 2123 * 2124 * The grace period cannot complete until the initialization 2125 * process finishes, because this kthread handles both. 2126 */ 2127 rcu_for_each_node_breadth_first(rsp, rnp) { 2128 rcu_gp_slow(rsp, gp_init_delay); 2129 raw_spin_lock_irq_rcu_node(rnp); 2130 rdp = this_cpu_ptr(rsp->rda); 2131 rcu_preempt_check_blocked_tasks(rnp); 2132 rnp->qsmask = rnp->qsmaskinit; 2133 WRITE_ONCE(rnp->gpnum, rsp->gpnum); 2134 if (WARN_ON_ONCE(rnp->completed != rsp->completed)) 2135 WRITE_ONCE(rnp->completed, rsp->completed); 2136 if (rnp == rdp->mynode) 2137 (void)__note_gp_changes(rsp, rnp, rdp); 2138 rcu_preempt_boost_start_gp(rnp); 2139 trace_rcu_grace_period_init(rsp->name, rnp->gpnum, 2140 rnp->level, rnp->grplo, 2141 rnp->grphi, rnp->qsmask); 2142 raw_spin_unlock_irq_rcu_node(rnp); 2143 cond_resched_rcu_qs(); 2144 WRITE_ONCE(rsp->gp_activity, jiffies); 2145 } 2146 2147 return true; 2148 } 2149 2150 /* 2151 * Helper function for wait_event_interruptible_timeout() wakeup 2152 * at force-quiescent-state time. 2153 */ 2154 static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) 2155 { 2156 struct rcu_node *rnp = rcu_get_root(rsp); 2157 2158 /* Someone like call_rcu() requested a force-quiescent-state scan. */ 2159 *gfp = READ_ONCE(rsp->gp_flags); 2160 if (*gfp & RCU_GP_FLAG_FQS) 2161 return true; 2162 2163 /* The current grace period has completed. */ 2164 if (!READ_ONCE(rnp->qsmask) && !rcu_preempt_blocked_readers_cgp(rnp)) 2165 return true; 2166 2167 return false; 2168 } 2169 2170 /* 2171 * Do one round of quiescent-state forcing. 2172 */ 2173 static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) 2174 { 2175 bool isidle = false; 2176 unsigned long maxj; 2177 struct rcu_node *rnp = rcu_get_root(rsp); 2178 2179 WRITE_ONCE(rsp->gp_activity, jiffies); 2180 rsp->n_force_qs++; 2181 if (first_time) { 2182 /* Collect dyntick-idle snapshots. */ 2183 if (is_sysidle_rcu_state(rsp)) { 2184 isidle = true; 2185 maxj = jiffies - ULONG_MAX / 4; 2186 } 2187 force_qs_rnp(rsp, dyntick_save_progress_counter, 2188 &isidle, &maxj); 2189 rcu_sysidle_report_gp(rsp, isidle, maxj); 2190 } else { 2191 /* Handle dyntick-idle and offline CPUs. */ 2192 isidle = true; 2193 force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); 2194 } 2195 /* Clear flag to prevent immediate re-entry. 
*/ 2196 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 2197 raw_spin_lock_irq_rcu_node(rnp); 2198 WRITE_ONCE(rsp->gp_flags, 2199 READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); 2200 raw_spin_unlock_irq_rcu_node(rnp); 2201 } 2202 } 2203 2204 /* 2205 * Clean up after the old grace period. 2206 */ 2207 static void rcu_gp_cleanup(struct rcu_state *rsp) 2208 { 2209 unsigned long gp_duration; 2210 bool needgp = false; 2211 int nocb = 0; 2212 struct rcu_data *rdp; 2213 struct rcu_node *rnp = rcu_get_root(rsp); 2214 struct swait_queue_head *sq; 2215 2216 WRITE_ONCE(rsp->gp_activity, jiffies); 2217 raw_spin_lock_irq_rcu_node(rnp); 2218 gp_duration = jiffies - rsp->gp_start; 2219 if (gp_duration > rsp->gp_max) 2220 rsp->gp_max = gp_duration; 2221 2222 /* 2223 * We know the grace period is complete, but to everyone else 2224 * it appears to still be ongoing. But it is also the case 2225 * that to everyone else it looks like there is nothing that 2226 * they can do to advance the grace period. It is therefore 2227 * safe for us to drop the lock in order to mark the grace 2228 * period as completed in all of the rcu_node structures. 2229 */ 2230 raw_spin_unlock_irq_rcu_node(rnp); 2231 2232 /* 2233 * Propagate new ->completed value to rcu_node structures so 2234 * that other CPUs don't have to wait until the start of the next 2235 * grace period to process their callbacks. This also avoids 2236 * some nasty RCU grace-period initialization races by forcing 2237 * the end of the current grace period to be completely recorded in 2238 * all of the rcu_node structures before the beginning of the next 2239 * grace period is recorded in any of the rcu_node structures. 2240 */ 2241 rcu_for_each_node_breadth_first(rsp, rnp) { 2242 raw_spin_lock_irq_rcu_node(rnp); 2243 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 2244 WARN_ON_ONCE(rnp->qsmask); 2245 WRITE_ONCE(rnp->completed, rsp->gpnum); 2246 rdp = this_cpu_ptr(rsp->rda); 2247 if (rnp == rdp->mynode) 2248 needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; 2249 /* smp_mb() provided by prior unlock-lock pair. */ 2250 nocb += rcu_future_gp_cleanup(rsp, rnp); 2251 sq = rcu_nocb_gp_get(rnp); 2252 raw_spin_unlock_irq_rcu_node(rnp); 2253 rcu_nocb_gp_cleanup(sq); 2254 cond_resched_rcu_qs(); 2255 WRITE_ONCE(rsp->gp_activity, jiffies); 2256 rcu_gp_slow(rsp, gp_cleanup_delay); 2257 } 2258 rnp = rcu_get_root(rsp); 2259 raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ 2260 rcu_nocb_gp_set(rnp, nocb); 2261 2262 /* Declare grace period done. */ 2263 WRITE_ONCE(rsp->completed, rsp->gpnum); 2264 trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); 2265 rsp->gp_state = RCU_GP_IDLE; 2266 rdp = this_cpu_ptr(rsp->rda); 2267 /* Advance CBs to reduce false positives below. */ 2268 needgp = rcu_advance_cbs(rsp, rnp, rdp) || needgp; 2269 if (needgp || cpu_needs_another_gp(rsp, rdp)) { 2270 WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); 2271 trace_rcu_grace_period(rsp->name, 2272 READ_ONCE(rsp->gpnum), 2273 TPS("newreq")); 2274 } 2275 raw_spin_unlock_irq_rcu_node(rnp); 2276 } 2277 2278 /* 2279 * Body of kthread that handles grace periods. 2280 */ 2281 static int __noreturn rcu_gp_kthread(void *arg) 2282 { 2283 bool first_gp_fqs; 2284 int gf; 2285 unsigned long j; 2286 int ret; 2287 struct rcu_state *rsp = arg; 2288 struct rcu_node *rnp = rcu_get_root(rsp); 2289 2290 rcu_bind_gp_kthread(); 2291 for (;;) { 2292 2293 /* Handle grace-period start. 
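 *
 * (Overall flow of this kthread, for reference: sleep in
 * RCU_GP_WAIT_GPS until RCU_GP_FLAG_INIT is set, initialize the new
 * grace period via rcu_gp_init(), then alternate between
 * RCU_GP_WAIT_FQS and RCU_GP_DOING_FQS doing force-quiescent-state
 * passes until the grace period completes, and finally clean up via
 * rcu_gp_cleanup() in RCU_GP_CLEANUP before starting over.)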
*/ 2294 for (;;) { 2295 trace_rcu_grace_period(rsp->name, 2296 READ_ONCE(rsp->gpnum), 2297 TPS("reqwait")); 2298 rsp->gp_state = RCU_GP_WAIT_GPS; 2299 swait_event_interruptible(rsp->gp_wq, 2300 READ_ONCE(rsp->gp_flags) & 2301 RCU_GP_FLAG_INIT); 2302 rsp->gp_state = RCU_GP_DONE_GPS; 2303 /* Locking provides needed memory barrier. */ 2304 if (rcu_gp_init(rsp)) 2305 break; 2306 cond_resched_rcu_qs(); 2307 WRITE_ONCE(rsp->gp_activity, jiffies); 2308 WARN_ON(signal_pending(current)); 2309 trace_rcu_grace_period(rsp->name, 2310 READ_ONCE(rsp->gpnum), 2311 TPS("reqwaitsig")); 2312 } 2313 2314 /* Handle quiescent-state forcing. */ 2315 first_gp_fqs = true; 2316 j = jiffies_till_first_fqs; 2317 if (j > HZ) { 2318 j = HZ; 2319 jiffies_till_first_fqs = HZ; 2320 } 2321 ret = 0; 2322 for (;;) { 2323 if (!ret) { 2324 rsp->jiffies_force_qs = jiffies + j; 2325 WRITE_ONCE(rsp->jiffies_kick_kthreads, 2326 jiffies + 3 * j); 2327 } 2328 trace_rcu_grace_period(rsp->name, 2329 READ_ONCE(rsp->gpnum), 2330 TPS("fqswait")); 2331 rsp->gp_state = RCU_GP_WAIT_FQS; 2332 ret = swait_event_interruptible_timeout(rsp->gp_wq, 2333 rcu_gp_fqs_check_wake(rsp, &gf), j); 2334 rsp->gp_state = RCU_GP_DOING_FQS; 2335 /* Locking provides needed memory barriers. */ 2336 /* If grace period done, leave loop. */ 2337 if (!READ_ONCE(rnp->qsmask) && 2338 !rcu_preempt_blocked_readers_cgp(rnp)) 2339 break; 2340 /* If time for quiescent-state forcing, do it. */ 2341 if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || 2342 (gf & RCU_GP_FLAG_FQS)) { 2343 trace_rcu_grace_period(rsp->name, 2344 READ_ONCE(rsp->gpnum), 2345 TPS("fqsstart")); 2346 rcu_gp_fqs(rsp, first_gp_fqs); 2347 first_gp_fqs = false; 2348 trace_rcu_grace_period(rsp->name, 2349 READ_ONCE(rsp->gpnum), 2350 TPS("fqsend")); 2351 cond_resched_rcu_qs(); 2352 WRITE_ONCE(rsp->gp_activity, jiffies); 2353 ret = 0; /* Force full wait till next FQS. */ 2354 j = jiffies_till_next_fqs; 2355 if (j > HZ) { 2356 j = HZ; 2357 jiffies_till_next_fqs = HZ; 2358 } else if (j < 1) { 2359 j = 1; 2360 jiffies_till_next_fqs = 1; 2361 } 2362 } else { 2363 /* Deal with stray signal. */ 2364 cond_resched_rcu_qs(); 2365 WRITE_ONCE(rsp->gp_activity, jiffies); 2366 WARN_ON(signal_pending(current)); 2367 trace_rcu_grace_period(rsp->name, 2368 READ_ONCE(rsp->gpnum), 2369 TPS("fqswaitsig")); 2370 ret = 1; /* Keep old FQS timing. */ 2371 j = jiffies; 2372 if (time_after(jiffies, rsp->jiffies_force_qs)) 2373 j = 1; 2374 else 2375 j = rsp->jiffies_force_qs - j; 2376 } 2377 } 2378 2379 /* Handle grace-period end. */ 2380 rsp->gp_state = RCU_GP_CLEANUP; 2381 rcu_gp_cleanup(rsp); 2382 rsp->gp_state = RCU_GP_CLEANED; 2383 } 2384 } 2385 2386 /* 2387 * Start a new RCU grace period if warranted, re-initializing the hierarchy 2388 * in preparation for detecting the next grace period. The caller must hold 2389 * the root node's ->lock and hard irqs must be disabled. 2390 * 2391 * Note that it is legal for a dying CPU (which is marked as offline) to 2392 * invoke this function. This can happen when the dying CPU reports its 2393 * quiescent state. 2394 * 2395 * Returns true if the grace-period kthread must be awakened. 2396 */ 2397 static bool 2398 rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, 2399 struct rcu_data *rdp) 2400 { 2401 if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { 2402 /* 2403 * Either we have not yet spawned the grace-period 2404 * task, this CPU does not need another grace period, 2405 * or a grace period is already in progress. 
2406 * Either way, don't start a new grace period. 2407 */ 2408 return false; 2409 } 2410 WRITE_ONCE(rsp->gp_flags, RCU_GP_FLAG_INIT); 2411 trace_rcu_grace_period(rsp->name, READ_ONCE(rsp->gpnum), 2412 TPS("newreq")); 2413 2414 /* 2415 * We can't do wakeups while holding the rnp->lock, as that 2416 * could cause possible deadlocks with the rq->lock. Defer 2417 * the wakeup to our caller. 2418 */ 2419 return true; 2420 } 2421 2422 /* 2423 * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's 2424 * callbacks. Note that rcu_start_gp_advanced() cannot do this because it 2425 * is invoked indirectly from rcu_advance_cbs(), which would result in 2426 * endless recursion -- or would do so if it wasn't for the self-deadlock 2427 * that is encountered beforehand. 2428 * 2429 * Returns true if the grace-period kthread needs to be awakened. 2430 */ 2431 static bool rcu_start_gp(struct rcu_state *rsp) 2432 { 2433 struct rcu_data *rdp = this_cpu_ptr(rsp->rda); 2434 struct rcu_node *rnp = rcu_get_root(rsp); 2435 bool ret = false; 2436 2437 /* 2438 * If there is no grace period in progress right now, any 2439 * callbacks we have up to this point will be satisfied by the 2440 * next grace period. Also, advancing the callbacks reduces the 2441 * probability of false positives from cpu_needs_another_gp() 2442 * resulting in pointless grace periods. So, advance callbacks 2443 * then start the grace period! 2444 */ 2445 ret = rcu_advance_cbs(rsp, rnp, rdp) || ret; 2446 ret = rcu_start_gp_advanced(rsp, rnp, rdp) || ret; 2447 return ret; 2448 } 2449 2450 /* 2451 * Report a full set of quiescent states to the specified rcu_state data 2452 * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period 2453 * kthread if another grace period is required. Whether we wake 2454 * the grace-period kthread or it awakens itself for the next round 2455 * of quiescent-state forcing, that kthread will clean up after the 2456 * just-completed grace period. Note that the caller must hold rnp->lock, 2457 * which is released before return. 2458 */ 2459 static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) 2460 __releases(rcu_get_root(rsp)->lock) 2461 { 2462 WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); 2463 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 2464 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); 2465 rcu_gp_kthread_wake(rsp); 2466 } 2467 2468 /* 2469 * Similar to rcu_report_qs_rdp(), for which it is a helper function. 2470 * Allows quiescent states for a group of CPUs to be reported at one go 2471 * to the specified rcu_node structure, though all the CPUs in the group 2472 * must be represented by the same rcu_node structure (which need not be a 2473 * leaf rcu_node structure, though it often will be). The gps parameter 2474 * is the grace-period snapshot, which means that the quiescent states 2475 * are valid only if rnp->gpnum is equal to gps. That structure's lock 2476 * must be held upon entry, and it is released before return. 2477 */ 2478 static void 2479 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, 2480 struct rcu_node *rnp, unsigned long gps, unsigned long flags) 2481 __releases(rnp->lock) 2482 { 2483 unsigned long oldmask = 0; 2484 struct rcu_node *rnp_c; 2485 2486 /* Walk up the rcu_node hierarchy. */ 2487 for (;;) { 2488 if (!(rnp->qsmask & mask) || rnp->gpnum != gps) { 2489 2490 /* 2491 * Our bit has already been cleared, or the 2492 * relevant grace period is already over, so done. 
2493 */ 2494 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2495 return; 2496 } 2497 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ 2498 rnp->qsmask &= ~mask; 2499 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 2500 mask, rnp->qsmask, rnp->level, 2501 rnp->grplo, rnp->grphi, 2502 !!rnp->gp_tasks); 2503 if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2504 2505 /* Other bits still set at this level, so done. */ 2506 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2507 return; 2508 } 2509 mask = rnp->grpmask; 2510 if (rnp->parent == NULL) { 2511 2512 /* No more levels. Exit loop holding root lock. */ 2513 2514 break; 2515 } 2516 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2517 rnp_c = rnp; 2518 rnp = rnp->parent; 2519 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2520 oldmask = rnp_c->qsmask; 2521 } 2522 2523 /* 2524 * Get here if we are the last CPU to pass through a quiescent 2525 * state for this grace period. Invoke rcu_report_qs_rsp() 2526 * to clean up and start the next grace period if one is needed. 2527 */ 2528 rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ 2529 } 2530 2531 /* 2532 * Record a quiescent state for all tasks that were previously queued 2533 * on the specified rcu_node structure and that were blocking the current 2534 * RCU grace period. The caller must hold the specified rnp->lock with 2535 * irqs disabled, and this lock is released upon return, but irqs remain 2536 * disabled. 2537 */ 2538 static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, 2539 struct rcu_node *rnp, unsigned long flags) 2540 __releases(rnp->lock) 2541 { 2542 unsigned long gps; 2543 unsigned long mask; 2544 struct rcu_node *rnp_p; 2545 2546 if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || 2547 rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { 2548 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2549 return; /* Still need more quiescent states! */ 2550 } 2551 2552 rnp_p = rnp->parent; 2553 if (rnp_p == NULL) { 2554 /* 2555 * Only one rcu_node structure in the tree, so don't 2556 * try to report up to its nonexistent parent! 2557 */ 2558 rcu_report_qs_rsp(rsp, flags); 2559 return; 2560 } 2561 2562 /* Report up the rest of the hierarchy, tracking current ->gpnum. */ 2563 gps = rnp->gpnum; 2564 mask = rnp->grpmask; 2565 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 2566 raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ 2567 rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); 2568 } 2569 2570 /* 2571 * Record a quiescent state for the specified CPU to that CPU's rcu_data 2572 * structure. This must be called from the specified CPU. 2573 */ 2574 static void 2575 rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) 2576 { 2577 unsigned long flags; 2578 unsigned long mask; 2579 bool needwake; 2580 struct rcu_node *rnp; 2581 2582 rnp = rdp->mynode; 2583 raw_spin_lock_irqsave_rcu_node(rnp, flags); 2584 if (rdp->cpu_no_qs.b.norm || rdp->gpnum != rnp->gpnum || 2585 rnp->completed == rnp->gpnum || rdp->gpwrap) { 2586 2587 /* 2588 * The grace period in which this quiescent state was 2589 * recorded has ended, so don't report it upwards. 2590 * We will instead need a new quiescent state that lies 2591 * within the current grace period. 2592 */ 2593 rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. 
*/ 2594 rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); 2595 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2596 return; 2597 } 2598 mask = rdp->grpmask; 2599 if ((rnp->qsmask & mask) == 0) { 2600 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 2601 } else { 2602 rdp->core_needs_qs = false; 2603 2604 /* 2605 * This GP can't end until cpu checks in, so all of our 2606 * callbacks can be processed during the next GP. 2607 */ 2608 needwake = rcu_accelerate_cbs(rsp, rnp, rdp); 2609 2610 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); 2611 /* ^^^ Released rnp->lock */ 2612 if (needwake) 2613 rcu_gp_kthread_wake(rsp); 2614 } 2615 } 2616 2617 /* 2618 * Check to see if there is a new grace period of which this CPU 2619 * is not yet aware, and if so, set up local rcu_data state for it. 2620 * Otherwise, see if this CPU has just passed through its first 2621 * quiescent state for this grace period, and record that fact if so. 2622 */ 2623 static void 2624 rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) 2625 { 2626 /* Check for grace-period ends and beginnings. */ 2627 note_gp_changes(rsp, rdp); 2628 2629 /* 2630 * Does this CPU still need to do its part for current grace period? 2631 * If no, return and let the other CPUs do their part as well. 2632 */ 2633 if (!rdp->core_needs_qs) 2634 return; 2635 2636 /* 2637 * Was there a quiescent state since the beginning of the grace 2638 * period? If no, then exit and wait for the next call. 2639 */ 2640 if (rdp->cpu_no_qs.b.norm) 2641 return; 2642 2643 /* 2644 * Tell RCU we are done (but rcu_report_qs_rdp() will be the 2645 * judge of that). 2646 */ 2647 rcu_report_qs_rdp(rdp->cpu, rsp, rdp); 2648 } 2649 2650 /* 2651 * Send the specified CPU's RCU callbacks to the orphanage. The 2652 * specified CPU must be offline, and the caller must hold the 2653 * ->orphan_lock. 2654 */ 2655 static void 2656 rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, 2657 struct rcu_node *rnp, struct rcu_data *rdp) 2658 { 2659 /* No-CBs CPUs do not have orphanable callbacks. */ 2660 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu)) 2661 return; 2662 2663 /* 2664 * Orphan the callbacks. First adjust the counts. This is safe 2665 * because _rcu_barrier() excludes CPU-hotplug operations, so it 2666 * cannot be running now. Thus no memory barrier is required. 2667 */ 2668 if (rdp->nxtlist != NULL) { 2669 rsp->qlen_lazy += rdp->qlen_lazy; 2670 rsp->qlen += rdp->qlen; 2671 rdp->n_cbs_orphaned += rdp->qlen; 2672 rdp->qlen_lazy = 0; 2673 WRITE_ONCE(rdp->qlen, 0); 2674 } 2675 2676 /* 2677 * Next, move those callbacks still needing a grace period to 2678 * the orphanage, where some other CPU will pick them up. 2679 * Some of the callbacks might have gone partway through a grace 2680 * period, but that is too bad. They get to start over because we 2681 * cannot assume that grace periods are synchronized across CPUs. 2682 * We don't bother updating the ->nxttail[] array yet, instead 2683 * we just reset the whole thing later on. 2684 */ 2685 if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { 2686 *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; 2687 rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; 2688 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 2689 } 2690 2691 /* 2692 * Then move the ready-to-invoke callbacks to the orphanage, 2693 * where some other CPU will pick them up. These will not be 2694 * required to pass though another grace period: They are done. 
2695 */ 2696 if (rdp->nxtlist != NULL) { 2697 *rsp->orphan_donetail = rdp->nxtlist; 2698 rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; 2699 } 2700 2701 /* 2702 * Finally, initialize the rcu_data structure's list to empty and 2703 * disallow further callbacks on this CPU. 2704 */ 2705 init_callback_list(rdp); 2706 rdp->nxttail[RCU_NEXT_TAIL] = NULL; 2707 } 2708 2709 /* 2710 * Adopt the RCU callbacks from the specified rcu_state structure's 2711 * orphanage. The caller must hold the ->orphan_lock. 2712 */ 2713 static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags) 2714 { 2715 int i; 2716 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 2717 2718 /* No-CBs CPUs are handled specially. */ 2719 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2720 rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags)) 2721 return; 2722 2723 /* Do the accounting first. */ 2724 rdp->qlen_lazy += rsp->qlen_lazy; 2725 rdp->qlen += rsp->qlen; 2726 rdp->n_cbs_adopted += rsp->qlen; 2727 if (rsp->qlen_lazy != rsp->qlen) 2728 rcu_idle_count_callbacks_posted(); 2729 rsp->qlen_lazy = 0; 2730 rsp->qlen = 0; 2731 2732 /* 2733 * We do not need a memory barrier here because the only way we 2734 * can get here if there is an rcu_barrier() in flight is if 2735 * we are the task doing the rcu_barrier(). 2736 */ 2737 2738 /* First adopt the ready-to-invoke callbacks. */ 2739 if (rsp->orphan_donelist != NULL) { 2740 *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; 2741 *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; 2742 for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) 2743 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2744 rdp->nxttail[i] = rsp->orphan_donetail; 2745 rsp->orphan_donelist = NULL; 2746 rsp->orphan_donetail = &rsp->orphan_donelist; 2747 } 2748 2749 /* And then adopt the callbacks that still need a grace period. */ 2750 if (rsp->orphan_nxtlist != NULL) { 2751 *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; 2752 rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; 2753 rsp->orphan_nxtlist = NULL; 2754 rsp->orphan_nxttail = &rsp->orphan_nxtlist; 2755 } 2756 } 2757 2758 /* 2759 * Trace the fact that this CPU is going offline. 2760 */ 2761 static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) 2762 { 2763 RCU_TRACE(unsigned long mask); 2764 RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); 2765 RCU_TRACE(struct rcu_node *rnp = rdp->mynode); 2766 2767 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2768 return; 2769 2770 RCU_TRACE(mask = rdp->grpmask); 2771 trace_rcu_grace_period(rsp->name, 2772 rnp->gpnum + 1 - !!(rnp->qsmask & mask), 2773 TPS("cpuofl")); 2774 } 2775 2776 /* 2777 * All CPUs for the specified rcu_node structure have gone offline, 2778 * and all tasks that were preempted within an RCU read-side critical 2779 * section while running on one of those CPUs have since exited their RCU 2780 * read-side critical section. Some other CPU is reporting this fact with 2781 * the specified rcu_node structure's ->lock held and interrupts disabled. 2782 * This function therefore goes up the tree of rcu_node structures, 2783 * clearing the corresponding bits in the ->qsmaskinit fields. Note that 2784 * the leaf rcu_node structure's ->qsmaskinit field has already been 2785 * updated 2786 * 2787 * This function does check that the specified rcu_node structure has 2788 * all CPUs offline and no blocked tasks, so it is OK to invoke it 2789 * prematurely. That said, invoking it after the fact will cost you 2790 * a needless lock acquisition. So once it has done its work, don't 2791 * invoke it again. 
2792 */ 2793 static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) 2794 { 2795 long mask; 2796 struct rcu_node *rnp = rnp_leaf; 2797 2798 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || 2799 rnp->qsmaskinit || rcu_preempt_has_tasks(rnp)) 2800 return; 2801 for (;;) { 2802 mask = rnp->grpmask; 2803 rnp = rnp->parent; 2804 if (!rnp) 2805 break; 2806 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 2807 rnp->qsmaskinit &= ~mask; 2808 rnp->qsmask &= ~mask; 2809 if (rnp->qsmaskinit) { 2810 raw_spin_unlock_rcu_node(rnp); 2811 /* irqs remain disabled. */ 2812 return; 2813 } 2814 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ 2815 } 2816 } 2817 2818 /* 2819 * The CPU has been completely removed, and some other CPU is reporting 2820 * this fact from process context. Do the remainder of the cleanup, 2821 * including orphaning the outgoing CPU's RCU callbacks, and also 2822 * adopting them. There can only be one CPU hotplug operation at a time, 2823 * so no other CPU can be attempting to update rcu_cpu_kthread_task. 2824 */ 2825 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) 2826 { 2827 unsigned long flags; 2828 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 2829 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 2830 2831 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) 2832 return; 2833 2834 /* Adjust any no-longer-needed kthreads. */ 2835 rcu_boost_kthread_setaffinity(rnp, -1); 2836 2837 /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ 2838 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 2839 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 2840 rcu_adopt_orphan_cbs(rsp, flags); 2841 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 2842 2843 WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, 2844 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", 2845 cpu, rdp->qlen, rdp->nxtlist); 2846 } 2847 2848 /* 2849 * Invoke any RCU callbacks that have made it to the end of their grace 2850 * period. Throttle as specified by rdp->blimit. 2851 */ 2852 static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) 2853 { 2854 unsigned long flags; 2855 struct rcu_head *next, *list, **tail; 2856 long bl, count, count_lazy; 2857 int i; 2858 2859 /* If no callbacks are ready, just return. */ 2860 if (!cpu_has_callbacks_ready_to_invoke(rdp)) { 2861 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); 2862 trace_rcu_batch_end(rsp->name, 0, !!READ_ONCE(rdp->nxtlist), 2863 need_resched(), is_idle_task(current), 2864 rcu_is_callbacks_kthread()); 2865 return; 2866 } 2867 2868 /* 2869 * Extract the list of ready callbacks, disabling interrupts to prevent 2870 * races with call_rcu() from interrupt handlers. 2871 */ 2872 local_irq_save(flags); 2873 WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); 2874 bl = rdp->blimit; 2875 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); 2876 list = rdp->nxtlist; 2877 rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; 2878 *rdp->nxttail[RCU_DONE_TAIL] = NULL; 2879 tail = rdp->nxttail[RCU_DONE_TAIL]; 2880 for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) 2881 if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) 2882 rdp->nxttail[i] = &rdp->nxtlist; 2883 local_irq_restore(flags); 2884 2885 /* Invoke callbacks. */ 2886 count = count_lazy = 0; 2887 while (list) { 2888 next = list->next; 2889 prefetch(next); 2890 debug_rcu_head_unqueue(list); 2891 if (__rcu_reclaim(rsp->name, list)) 2892 count_lazy++; 2893 list = next; 2894 /* Stop only if limit reached and CPU has something to do. 
*/ 2895 if (++count >= bl && 2896 (need_resched() || 2897 (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) 2898 break; 2899 } 2900 2901 local_irq_save(flags); 2902 trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), 2903 is_idle_task(current), 2904 rcu_is_callbacks_kthread()); 2905 2906 /* Update count, and requeue any remaining callbacks. */ 2907 if (list != NULL) { 2908 *tail = rdp->nxtlist; 2909 rdp->nxtlist = list; 2910 for (i = 0; i < RCU_NEXT_SIZE; i++) 2911 if (&rdp->nxtlist == rdp->nxttail[i]) 2912 rdp->nxttail[i] = tail; 2913 else 2914 break; 2915 } 2916 smp_mb(); /* List handling before counting for rcu_barrier(). */ 2917 rdp->qlen_lazy -= count_lazy; 2918 WRITE_ONCE(rdp->qlen, rdp->qlen - count); 2919 rdp->n_cbs_invoked += count; 2920 2921 /* Reinstate batch limit if we have worked down the excess. */ 2922 if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) 2923 rdp->blimit = blimit; 2924 2925 /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ 2926 if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { 2927 rdp->qlen_last_fqs_check = 0; 2928 rdp->n_force_qs_snap = rsp->n_force_qs; 2929 } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) 2930 rdp->qlen_last_fqs_check = rdp->qlen; 2931 WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); 2932 2933 local_irq_restore(flags); 2934 2935 /* Re-invoke RCU core processing if there are callbacks remaining. */ 2936 if (cpu_has_callbacks_ready_to_invoke(rdp)) 2937 invoke_rcu_core(); 2938 } 2939 2940 /* 2941 * Check to see if this CPU is in a non-context-switch quiescent state 2942 * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). 2943 * Also schedule RCU core processing. 2944 * 2945 * This function must be called from hardirq context. It is normally 2946 * invoked from the scheduling-clock interrupt. 2947 */ 2948 void rcu_check_callbacks(int user) 2949 { 2950 trace_rcu_utilization(TPS("Start scheduler-tick")); 2951 increment_cpu_stall_ticks(); 2952 if (user || rcu_is_cpu_rrupt_from_idle()) { 2953 2954 /* 2955 * Get here if this CPU took its interrupt from user 2956 * mode or from the idle loop, and if this is not a 2957 * nested interrupt. In this case, the CPU is in 2958 * a quiescent state, so note it. 2959 * 2960 * No memory barrier is required here because both 2961 * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local 2962 * variables that other CPUs neither access nor modify, 2963 * at least not while the corresponding CPU is online. 2964 */ 2965 2966 rcu_sched_qs(); 2967 rcu_bh_qs(); 2968 2969 } else if (!in_softirq()) { 2970 2971 /* 2972 * Get here if this CPU did not take its interrupt from 2973 * softirq, in other words, if it is not interrupting 2974 * a rcu_bh read-side critical section. This is an _bh 2975 * critical section, so note it. 2976 */ 2977 2978 rcu_bh_qs(); 2979 } 2980 rcu_preempt_check_callbacks(); 2981 if (rcu_pending()) 2982 invoke_rcu_core(); 2983 if (user) 2984 rcu_note_voluntary_context_switch(current); 2985 trace_rcu_utilization(TPS("End scheduler-tick")); 2986 } 2987 2988 /* 2989 * Scan the leaf rcu_node structures, processing dyntick state for any that 2990 * have not yet encountered a quiescent state, using the function specified. 2991 * Also initiate boosting for any threads blocked on the root rcu_node. 2992 * 2993 * The caller must have suppressed start of new grace periods. 
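 *
 * For reference, rcu_gp_fqs() passes dyntick_save_progress_counter()
 * as "f" on the first invocation in a given grace period and
 * rcu_implicit_dynticks_qs() on subsequent invocations.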
2994 */ 2995 static void force_qs_rnp(struct rcu_state *rsp, 2996 int (*f)(struct rcu_data *rsp, bool *isidle, 2997 unsigned long *maxj), 2998 bool *isidle, unsigned long *maxj) 2999 { 3000 int cpu; 3001 unsigned long flags; 3002 unsigned long mask; 3003 struct rcu_node *rnp; 3004 3005 rcu_for_each_leaf_node(rsp, rnp) { 3006 cond_resched_rcu_qs(); 3007 mask = 0; 3008 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3009 if (rnp->qsmask == 0) { 3010 if (rcu_state_p == &rcu_sched_state || 3011 rsp != rcu_state_p || 3012 rcu_preempt_blocked_readers_cgp(rnp)) { 3013 /* 3014 * No point in scanning bits because they 3015 * are all zero. But we might need to 3016 * priority-boost blocked readers. 3017 */ 3018 rcu_initiate_boost(rnp, flags); 3019 /* rcu_initiate_boost() releases rnp->lock */ 3020 continue; 3021 } 3022 if (rnp->parent && 3023 (rnp->parent->qsmask & rnp->grpmask)) { 3024 /* 3025 * Race between grace-period 3026 * initialization and task exiting RCU 3027 * read-side critical section: Report. 3028 */ 3029 rcu_report_unblock_qs_rnp(rsp, rnp, flags); 3030 /* rcu_report_unblock_qs_rnp() rlses ->lock */ 3031 continue; 3032 } 3033 } 3034 for_each_leaf_node_possible_cpu(rnp, cpu) { 3035 unsigned long bit = leaf_node_cpu_bit(rnp, cpu); 3036 if ((rnp->qsmask & bit) != 0) { 3037 if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) 3038 mask |= bit; 3039 } 3040 } 3041 if (mask != 0) { 3042 /* Idle/offline CPUs, report (releases rnp->lock. */ 3043 rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); 3044 } else { 3045 /* Nothing to do here, so just drop the lock. */ 3046 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3047 } 3048 } 3049 } 3050 3051 /* 3052 * Force quiescent states on reluctant CPUs, and also detect which 3053 * CPUs are in dyntick-idle mode. 3054 */ 3055 static void force_quiescent_state(struct rcu_state *rsp) 3056 { 3057 unsigned long flags; 3058 bool ret; 3059 struct rcu_node *rnp; 3060 struct rcu_node *rnp_old = NULL; 3061 3062 /* Funnel through hierarchy to reduce memory contention. */ 3063 rnp = __this_cpu_read(rsp->rda->mynode); 3064 for (; rnp != NULL; rnp = rnp->parent) { 3065 ret = (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || 3066 !raw_spin_trylock(&rnp->fqslock); 3067 if (rnp_old != NULL) 3068 raw_spin_unlock(&rnp_old->fqslock); 3069 if (ret) { 3070 rsp->n_force_qs_lh++; 3071 return; 3072 } 3073 rnp_old = rnp; 3074 } 3075 /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ 3076 3077 /* Reached the root of the rcu_node tree, acquire lock. */ 3078 raw_spin_lock_irqsave_rcu_node(rnp_old, flags); 3079 raw_spin_unlock(&rnp_old->fqslock); 3080 if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { 3081 rsp->n_force_qs_lh++; 3082 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 3083 return; /* Someone beat us to it. */ 3084 } 3085 WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); 3086 raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); 3087 rcu_gp_kthread_wake(rsp); 3088 } 3089 3090 /* 3091 * This does the RCU core processing work for the specified rcu_state 3092 * and rcu_data structures. This may be called only from the CPU to 3093 * whom the rdp belongs. 3094 */ 3095 static void 3096 __rcu_process_callbacks(struct rcu_state *rsp) 3097 { 3098 unsigned long flags; 3099 bool needwake; 3100 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3101 3102 WARN_ON_ONCE(rdp->beenonline == 0); 3103 3104 /* Update RCU state based on any recent quiescent states. */ 3105 rcu_check_quiescent_state(rsp, rdp); 3106 3107 /* Does this CPU require a not-yet-started grace period? 
*/ 3108 local_irq_save(flags); 3109 if (cpu_needs_another_gp(rsp, rdp)) { 3110 raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ 3111 needwake = rcu_start_gp(rsp); 3112 raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); 3113 if (needwake) 3114 rcu_gp_kthread_wake(rsp); 3115 } else { 3116 local_irq_restore(flags); 3117 } 3118 3119 /* If there are callbacks ready, invoke them. */ 3120 if (cpu_has_callbacks_ready_to_invoke(rdp)) 3121 invoke_rcu_callbacks(rsp, rdp); 3122 3123 /* Do any needed deferred wakeups of rcuo kthreads. */ 3124 do_nocb_deferred_wakeup(rdp); 3125 } 3126 3127 /* 3128 * Do RCU core processing for the current CPU. 3129 */ 3130 static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) 3131 { 3132 struct rcu_state *rsp; 3133 3134 if (cpu_is_offline(smp_processor_id())) 3135 return; 3136 trace_rcu_utilization(TPS("Start RCU core")); 3137 for_each_rcu_flavor(rsp) 3138 __rcu_process_callbacks(rsp); 3139 trace_rcu_utilization(TPS("End RCU core")); 3140 } 3141 3142 /* 3143 * Schedule RCU callback invocation. If the specified type of RCU 3144 * does not support RCU priority boosting, just do a direct call, 3145 * otherwise wake up the per-CPU kernel kthread. Note that because we 3146 * are running on the current CPU with softirqs disabled, the 3147 * rcu_cpu_kthread_task cannot disappear out from under us. 3148 */ 3149 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) 3150 { 3151 if (unlikely(!READ_ONCE(rcu_scheduler_fully_active))) 3152 return; 3153 if (likely(!rsp->boost)) { 3154 rcu_do_batch(rsp, rdp); 3155 return; 3156 } 3157 invoke_rcu_callbacks_kthread(); 3158 } 3159 3160 static void invoke_rcu_core(void) 3161 { 3162 if (cpu_online(smp_processor_id())) 3163 raise_softirq(RCU_SOFTIRQ); 3164 } 3165 3166 /* 3167 * Handle any core-RCU processing required by a call_rcu() invocation. 3168 */ 3169 static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, 3170 struct rcu_head *head, unsigned long flags) 3171 { 3172 bool needwake; 3173 3174 /* 3175 * If called from an extended quiescent state, invoke the RCU 3176 * core in order to force a re-evaluation of RCU's idleness. 3177 */ 3178 if (!rcu_is_watching()) 3179 invoke_rcu_core(); 3180 3181 /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ 3182 if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) 3183 return; 3184 3185 /* 3186 * Force the grace period if too many callbacks or too long waiting. 3187 * Enforce hysteresis, and don't invoke force_quiescent_state() 3188 * if some other CPU has recently done so. Also, don't bother 3189 * invoking force_quiescent_state() if the newly enqueued callback 3190 * is the only one waiting for a grace period to complete. 3191 */ 3192 if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { 3193 3194 /* Are we ignoring a completed grace period? */ 3195 note_gp_changes(rsp, rdp); 3196 3197 /* Start a new grace period if one not already started. */ 3198 if (!rcu_gp_in_progress(rsp)) { 3199 struct rcu_node *rnp_root = rcu_get_root(rsp); 3200 3201 raw_spin_lock_rcu_node(rnp_root); 3202 needwake = rcu_start_gp(rsp); 3203 raw_spin_unlock_rcu_node(rnp_root); 3204 if (needwake) 3205 rcu_gp_kthread_wake(rsp); 3206 } else { 3207 /* Give the grace period a kick. 
*/ 3208 rdp->blimit = LONG_MAX; 3209 if (rsp->n_force_qs == rdp->n_force_qs_snap && 3210 *rdp->nxttail[RCU_DONE_TAIL] != head) 3211 force_quiescent_state(rsp); 3212 rdp->n_force_qs_snap = rsp->n_force_qs; 3213 rdp->qlen_last_fqs_check = rdp->qlen; 3214 } 3215 } 3216 } 3217 3218 /* 3219 * RCU callback function to leak a callback. 3220 */ 3221 static void rcu_leak_callback(struct rcu_head *rhp) 3222 { 3223 } 3224 3225 /* 3226 * Helper function for call_rcu() and friends. The cpu argument will 3227 * normally be -1, indicating "currently running CPU". It may specify 3228 * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() 3229 * is expected to specify a CPU. 3230 */ 3231 static void 3232 __call_rcu(struct rcu_head *head, rcu_callback_t func, 3233 struct rcu_state *rsp, int cpu, bool lazy) 3234 { 3235 unsigned long flags; 3236 struct rcu_data *rdp; 3237 3238 /* Misaligned rcu_head! */ 3239 WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1)); 3240 3241 if (debug_rcu_head_queue(head)) { 3242 /* Probable double call_rcu(), so leak the callback. */ 3243 WRITE_ONCE(head->func, rcu_leak_callback); 3244 WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); 3245 return; 3246 } 3247 head->func = func; 3248 head->next = NULL; 3249 local_irq_save(flags); 3250 rdp = this_cpu_ptr(rsp->rda); 3251 3252 /* Add the callback to our list. */ 3253 if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { 3254 int offline; 3255 3256 if (cpu != -1) 3257 rdp = per_cpu_ptr(rsp->rda, cpu); 3258 if (likely(rdp->mynode)) { 3259 /* Post-boot, so this should be for a no-CBs CPU. */ 3260 offline = !__call_rcu_nocb(rdp, head, lazy, flags); 3261 WARN_ON_ONCE(offline); 3262 /* Offline CPU, _call_rcu() illegal, leak callback. */ 3263 local_irq_restore(flags); 3264 return; 3265 } 3266 /* 3267 * Very early boot, before rcu_init(). Initialize if needed 3268 * and then drop through to queue the callback. 3269 */ 3270 BUG_ON(cpu != -1); 3271 WARN_ON_ONCE(!rcu_is_watching()); 3272 if (!likely(rdp->nxtlist)) 3273 init_default_callback_list(rdp); 3274 } 3275 WRITE_ONCE(rdp->qlen, rdp->qlen + 1); 3276 if (lazy) 3277 rdp->qlen_lazy++; 3278 else 3279 rcu_idle_count_callbacks_posted(); 3280 smp_mb(); /* Count before adding callback for rcu_barrier(). */ 3281 *rdp->nxttail[RCU_NEXT_TAIL] = head; 3282 rdp->nxttail[RCU_NEXT_TAIL] = &head->next; 3283 3284 if (__is_kfree_rcu_offset((unsigned long)func)) 3285 trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, 3286 rdp->qlen_lazy, rdp->qlen); 3287 else 3288 trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); 3289 3290 /* Go handle any RCU core processing required. */ 3291 __call_rcu_core(rsp, rdp, head, flags); 3292 local_irq_restore(flags); 3293 } 3294 3295 /* 3296 * Queue an RCU-sched callback for invocation after a grace period. 3297 */ 3298 void call_rcu_sched(struct rcu_head *head, rcu_callback_t func) 3299 { 3300 __call_rcu(head, func, &rcu_sched_state, -1, 0); 3301 } 3302 EXPORT_SYMBOL_GPL(call_rcu_sched); 3303 3304 /* 3305 * Queue an RCU callback for invocation after a quicker grace period. 3306 */ 3307 void call_rcu_bh(struct rcu_head *head, rcu_callback_t func) 3308 { 3309 __call_rcu(head, func, &rcu_bh_state, -1, 0); 3310 } 3311 EXPORT_SYMBOL_GPL(call_rcu_bh); 3312 3313 /* 3314 * Queue an RCU callback for lazy invocation after a grace period. 
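 *
 * (Callers normally arrive here via the kfree_rcu() macro, as in the
 * purely illustrative
 *
 *	kfree_rcu(old_entry, rcu);
 *
 * where "old_entry" points to a caller-defined structure containing a
 * struct rcu_head member named "rcu"; the macro expands to a
 * __kfree_rcu() invocation that ends up here.)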
3315 * This will likely be later named something like "call_rcu_lazy()", 3316 * but this change will require some way of tagging the lazy RCU 3317 * callbacks in the list of pending callbacks. Until then, this 3318 * function may only be called from __kfree_rcu(). 3319 */ 3320 void kfree_call_rcu(struct rcu_head *head, 3321 rcu_callback_t func) 3322 { 3323 __call_rcu(head, func, rcu_state_p, -1, 1); 3324 } 3325 EXPORT_SYMBOL_GPL(kfree_call_rcu); 3326 3327 /* 3328 * Because a context switch is a grace period for RCU-sched and RCU-bh, 3329 * any blocking grace-period wait automatically implies a grace period 3330 * if there is only one CPU online at any point time during execution 3331 * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to 3332 * occasionally incorrectly indicate that there are multiple CPUs online 3333 * when there was in fact only one the whole time, as this just adds 3334 * some overhead: RCU still operates correctly. 3335 */ 3336 static inline int rcu_blocking_is_gp(void) 3337 { 3338 int ret; 3339 3340 might_sleep(); /* Check for RCU read-side critical section. */ 3341 preempt_disable(); 3342 ret = num_online_cpus() <= 1; 3343 preempt_enable(); 3344 return ret; 3345 } 3346 3347 /** 3348 * synchronize_sched - wait until an rcu-sched grace period has elapsed. 3349 * 3350 * Control will return to the caller some time after a full rcu-sched 3351 * grace period has elapsed, in other words after all currently executing 3352 * rcu-sched read-side critical sections have completed. These read-side 3353 * critical sections are delimited by rcu_read_lock_sched() and 3354 * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), 3355 * local_irq_disable(), and so on may be used in place of 3356 * rcu_read_lock_sched(). 3357 * 3358 * This means that all preempt_disable code sequences, including NMI and 3359 * non-threaded hardware-interrupt handlers, in progress on entry will 3360 * have completed before this primitive returns. However, this does not 3361 * guarantee that softirq handlers will have completed, since in some 3362 * kernels, these handlers can run in process context, and can block. 3363 * 3364 * Note that this guarantee implies further memory-ordering guarantees. 3365 * On systems with more than one CPU, when synchronize_sched() returns, 3366 * each CPU is guaranteed to have executed a full memory barrier since the 3367 * end of its last RCU-sched read-side critical section whose beginning 3368 * preceded the call to synchronize_sched(). In addition, each CPU having 3369 * an RCU read-side critical section that extends beyond the return from 3370 * synchronize_sched() is guaranteed to have executed a full memory barrier 3371 * after the beginning of synchronize_sched() and before the beginning of 3372 * that RCU read-side critical section. Note that these guarantees include 3373 * CPUs that are offline, idle, or executing in user mode, as well as CPUs 3374 * that are executing in the kernel. 3375 * 3376 * Furthermore, if CPU A invoked synchronize_sched(), which returned 3377 * to its caller on CPU B, then both CPU A and CPU B are guaranteed 3378 * to have executed a full memory barrier during the execution of 3379 * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but 3380 * again only if the system has more than one CPU). 3381 * 3382 * This primitive provides the guarantees made by the (now removed) 3383 * synchronize_kernel() API. 
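 *
 * A minimal update-side usage sketch follows; it is purely illustrative,
 * with "p" and "gp" standing for a local pointer and an RCU-sched-protected
 * global pointer respectively, and "gp_lock" for the update-side lock;
 * none of these is a kernel symbol:
 *
 *	spin_lock(&gp_lock);
 *	p = rcu_dereference_protected(gp, lockdep_is_held(&gp_lock));
 *	rcu_assign_pointer(gp, NULL);
 *	spin_unlock(&gp_lock);
 *	synchronize_sched();
 *	kfree(p);
 *
 * Once synchronize_sched() returns, any preempt-disabled reader that
 * might still have been referencing "p" is guaranteed to have finished.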
In contrast, synchronize_rcu() only 3384 * guarantees that rcu_read_lock() sections will have completed. 3385 * In "classic RCU", these two guarantees happen to be one and 3386 * the same, but can differ in realtime RCU implementations. 3387 */ 3388 void synchronize_sched(void) 3389 { 3390 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || 3391 lock_is_held(&rcu_lock_map) || 3392 lock_is_held(&rcu_sched_lock_map), 3393 "Illegal synchronize_sched() in RCU-sched read-side critical section"); 3394 if (rcu_blocking_is_gp()) 3395 return; 3396 if (rcu_gp_is_expedited()) 3397 synchronize_sched_expedited(); 3398 else 3399 wait_rcu_gp(call_rcu_sched); 3400 } 3401 EXPORT_SYMBOL_GPL(synchronize_sched); 3402 3403 /** 3404 * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. 3405 * 3406 * Control will return to the caller some time after a full rcu_bh grace 3407 * period has elapsed, in other words after all currently executing rcu_bh 3408 * read-side critical sections have completed. RCU read-side critical 3409 * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), 3410 * and may be nested. 3411 * 3412 * See the description of synchronize_sched() for more detailed information 3413 * on memory ordering guarantees. 3414 */ 3415 void synchronize_rcu_bh(void) 3416 { 3417 RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) || 3418 lock_is_held(&rcu_lock_map) || 3419 lock_is_held(&rcu_sched_lock_map), 3420 "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); 3421 if (rcu_blocking_is_gp()) 3422 return; 3423 if (rcu_gp_is_expedited()) 3424 synchronize_rcu_bh_expedited(); 3425 else 3426 wait_rcu_gp(call_rcu_bh); 3427 } 3428 EXPORT_SYMBOL_GPL(synchronize_rcu_bh); 3429 3430 /** 3431 * get_state_synchronize_rcu - Snapshot current RCU state 3432 * 3433 * Returns a cookie that is used by a later call to cond_synchronize_rcu() 3434 * to determine whether or not a full grace period has elapsed in the 3435 * meantime. 3436 */ 3437 unsigned long get_state_synchronize_rcu(void) 3438 { 3439 /* 3440 * Any prior manipulation of RCU-protected data must happen 3441 * before the load from ->gpnum. 3442 */ 3443 smp_mb(); /* ^^^ */ 3444 3445 /* 3446 * Make sure this load happens before the purportedly 3447 * time-consuming work between get_state_synchronize_rcu() 3448 * and cond_synchronize_rcu(). 3449 */ 3450 return smp_load_acquire(&rcu_state_p->gpnum); 3451 } 3452 EXPORT_SYMBOL_GPL(get_state_synchronize_rcu); 3453 3454 /** 3455 * cond_synchronize_rcu - Conditionally wait for an RCU grace period 3456 * 3457 * @oldstate: return value from earlier call to get_state_synchronize_rcu() 3458 * 3459 * If a full RCU grace period has elapsed since the earlier call to 3460 * get_state_synchronize_rcu(), just return. Otherwise, invoke 3461 * synchronize_rcu() to wait for a full grace period. 3462 * 3463 * Yes, this function does not take counter wrap into account. But 3464 * counter wrap is harmless. If the counter wraps, we have waited for 3465 * more than 2 billion grace periods (and way more on a 64-bit system!), 3466 * so waiting for one additional grace period should be just fine. 3467 */ 3468 void cond_synchronize_rcu(unsigned long oldstate) 3469 { 3470 unsigned long newstate; 3471 3472 /* 3473 * Ensure that this load happens before any RCU-destructive 3474 * actions the caller might carry out after we return. 
3475 */ 3476 newstate = smp_load_acquire(&rcu_state_p->completed); 3477 if (ULONG_CMP_GE(oldstate, newstate)) 3478 synchronize_rcu(); 3479 } 3480 EXPORT_SYMBOL_GPL(cond_synchronize_rcu); 3481 3482 /** 3483 * get_state_synchronize_sched - Snapshot current RCU-sched state 3484 * 3485 * Returns a cookie that is used by a later call to cond_synchronize_sched() 3486 * to determine whether or not a full grace period has elapsed in the 3487 * meantime. 3488 */ 3489 unsigned long get_state_synchronize_sched(void) 3490 { 3491 /* 3492 * Any prior manipulation of RCU-protected data must happen 3493 * before the load from ->gpnum. 3494 */ 3495 smp_mb(); /* ^^^ */ 3496 3497 /* 3498 * Make sure this load happens before the purportedly 3499 * time-consuming work between get_state_synchronize_sched() 3500 * and cond_synchronize_sched(). 3501 */ 3502 return smp_load_acquire(&rcu_sched_state.gpnum); 3503 } 3504 EXPORT_SYMBOL_GPL(get_state_synchronize_sched); 3505 3506 /** 3507 * cond_synchronize_sched - Conditionally wait for an RCU-sched grace period 3508 * 3509 * @oldstate: return value from earlier call to get_state_synchronize_sched() 3510 * 3511 * If a full RCU-sched grace period has elapsed since the earlier call to 3512 * get_state_synchronize_sched(), just return. Otherwise, invoke 3513 * synchronize_sched() to wait for a full grace period. 3514 * 3515 * Yes, this function does not take counter wrap into account. But 3516 * counter wrap is harmless. If the counter wraps, we have waited for 3517 * more than 2 billion grace periods (and way more on a 64-bit system!), 3518 * so waiting for one additional grace period should be just fine. 3519 */ 3520 void cond_synchronize_sched(unsigned long oldstate) 3521 { 3522 unsigned long newstate; 3523 3524 /* 3525 * Ensure that this load happens before any RCU-destructive 3526 * actions the caller might carry out after we return. 3527 */ 3528 newstate = smp_load_acquire(&rcu_sched_state.completed); 3529 if (ULONG_CMP_GE(oldstate, newstate)) 3530 synchronize_sched(); 3531 } 3532 EXPORT_SYMBOL_GPL(cond_synchronize_sched); 3533 3534 /* Adjust sequence number for start of update-side operation. */ 3535 static void rcu_seq_start(unsigned long *sp) 3536 { 3537 WRITE_ONCE(*sp, *sp + 1); 3538 smp_mb(); /* Ensure update-side operation after counter increment. */ 3539 WARN_ON_ONCE(!(*sp & 0x1)); 3540 } 3541 3542 /* Adjust sequence number for end of update-side operation. */ 3543 static void rcu_seq_end(unsigned long *sp) 3544 { 3545 smp_mb(); /* Ensure update-side operation before counter increment. */ 3546 WRITE_ONCE(*sp, *sp + 1); 3547 WARN_ON_ONCE(*sp & 0x1); 3548 } 3549 3550 /* Take a snapshot of the update side's sequence number. */ 3551 static unsigned long rcu_seq_snap(unsigned long *sp) 3552 { 3553 unsigned long s; 3554 3555 s = (READ_ONCE(*sp) + 3) & ~0x1; 3556 smp_mb(); /* Above access must not bleed into critical section. */ 3557 return s; 3558 } 3559 3560 /* 3561 * Given a snapshot from rcu_seq_snap(), determine whether or not a 3562 * full update-side operation has occurred. 3563 */ 3564 static bool rcu_seq_done(unsigned long *sp, unsigned long s) 3565 { 3566 return ULONG_CMP_GE(READ_ONCE(*sp), s); 3567 } 3568 3569 /* 3570 * Check to see if there is any immediate RCU-related work to be done 3571 * by the current CPU, for the specified type of RCU, returning 1 if so. 3572 * The checks are in order of increasing expense: checks that can be 3573 * carried out against CPU-local state are performed first. 
However, 3574 * we must check for CPU stalls first, else we might not get a chance. 3575 */ 3576 static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) 3577 { 3578 struct rcu_node *rnp = rdp->mynode; 3579 3580 rdp->n_rcu_pending++; 3581 3582 /* Check for CPU stalls, if enabled. */ 3583 check_cpu_stall(rsp, rdp); 3584 3585 /* Is this CPU a NO_HZ_FULL CPU that should ignore RCU? */ 3586 if (rcu_nohz_full_cpu(rsp)) 3587 return 0; 3588 3589 /* Is the RCU core waiting for a quiescent state from this CPU? */ 3590 if (rcu_scheduler_fully_active && 3591 rdp->core_needs_qs && rdp->cpu_no_qs.b.norm && 3592 rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) { 3593 rdp->n_rp_core_needs_qs++; 3594 } else if (rdp->core_needs_qs && !rdp->cpu_no_qs.b.norm) { 3595 rdp->n_rp_report_qs++; 3596 return 1; 3597 } 3598 3599 /* Does this CPU have callbacks ready to invoke? */ 3600 if (cpu_has_callbacks_ready_to_invoke(rdp)) { 3601 rdp->n_rp_cb_ready++; 3602 return 1; 3603 } 3604 3605 /* Has RCU gone idle with this CPU needing another grace period? */ 3606 if (cpu_needs_another_gp(rsp, rdp)) { 3607 rdp->n_rp_cpu_needs_gp++; 3608 return 1; 3609 } 3610 3611 /* Has another RCU grace period completed? */ 3612 if (READ_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ 3613 rdp->n_rp_gp_completed++; 3614 return 1; 3615 } 3616 3617 /* Has a new RCU grace period started? */ 3618 if (READ_ONCE(rnp->gpnum) != rdp->gpnum || 3619 unlikely(READ_ONCE(rdp->gpwrap))) { /* outside lock */ 3620 rdp->n_rp_gp_started++; 3621 return 1; 3622 } 3623 3624 /* Does this CPU need a deferred NOCB wakeup? */ 3625 if (rcu_nocb_need_deferred_wakeup(rdp)) { 3626 rdp->n_rp_nocb_defer_wakeup++; 3627 return 1; 3628 } 3629 3630 /* nothing to do */ 3631 rdp->n_rp_need_nothing++; 3632 return 0; 3633 } 3634 3635 /* 3636 * Check to see if there is any immediate RCU-related work to be done 3637 * by the current CPU, returning 1 if so. This function is part of the 3638 * RCU implementation; it is -not- an exported member of the RCU API. 3639 */ 3640 static int rcu_pending(void) 3641 { 3642 struct rcu_state *rsp; 3643 3644 for_each_rcu_flavor(rsp) 3645 if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda))) 3646 return 1; 3647 return 0; 3648 } 3649 3650 /* 3651 * Return true if the specified CPU has any callback. If all_lazy is 3652 * non-NULL, store an indication of whether all callbacks are lazy. 3653 * (If there are no callbacks, all of them are deemed to be lazy.) 3654 */ 3655 static bool __maybe_unused rcu_cpu_has_callbacks(bool *all_lazy) 3656 { 3657 bool al = true; 3658 bool hc = false; 3659 struct rcu_data *rdp; 3660 struct rcu_state *rsp; 3661 3662 for_each_rcu_flavor(rsp) { 3663 rdp = this_cpu_ptr(rsp->rda); 3664 if (!rdp->nxtlist) 3665 continue; 3666 hc = true; 3667 if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { 3668 al = false; 3669 break; 3670 } 3671 } 3672 if (all_lazy) 3673 *all_lazy = al; 3674 return hc; 3675 } 3676 3677 /* 3678 * Helper function for _rcu_barrier() tracing. If tracing is disabled, 3679 * the compiler is expected to optimize this away. 3680 */ 3681 static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, 3682 int cpu, unsigned long done) 3683 { 3684 trace_rcu_barrier(rsp->name, s, cpu, 3685 atomic_read(&rsp->barrier_cpu_count), done); 3686 } 3687 3688 /* 3689 * RCU callback function for _rcu_barrier(). If we are last, wake 3690 * up the task executing _rcu_barrier(). 
3691 */ 3692 static void rcu_barrier_callback(struct rcu_head *rhp) 3693 { 3694 struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); 3695 struct rcu_state *rsp = rdp->rsp; 3696 3697 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3698 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); 3699 complete(&rsp->barrier_completion); 3700 } else { 3701 _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); 3702 } 3703 } 3704 3705 /* 3706 * Called with preemption disabled, and from cross-cpu IRQ context. 3707 */ 3708 static void rcu_barrier_func(void *type) 3709 { 3710 struct rcu_state *rsp = type; 3711 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3712 3713 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); 3714 atomic_inc(&rsp->barrier_cpu_count); 3715 rsp->call(&rdp->barrier_head, rcu_barrier_callback); 3716 } 3717 3718 /* 3719 * Orchestrate the specified type of RCU barrier, waiting for all 3720 * RCU callbacks of the specified type to complete. 3721 */ 3722 static void _rcu_barrier(struct rcu_state *rsp) 3723 { 3724 int cpu; 3725 struct rcu_data *rdp; 3726 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); 3727 3728 _rcu_barrier_trace(rsp, "Begin", -1, s); 3729 3730 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 3731 mutex_lock(&rsp->barrier_mutex); 3732 3733 /* Did someone else do our work for us? */ 3734 if (rcu_seq_done(&rsp->barrier_sequence, s)) { 3735 _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); 3736 smp_mb(); /* caller's subsequent code after above check. */ 3737 mutex_unlock(&rsp->barrier_mutex); 3738 return; 3739 } 3740 3741 /* Mark the start of the barrier operation. */ 3742 rcu_seq_start(&rsp->barrier_sequence); 3743 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); 3744 3745 /* 3746 * Initialize the count to one rather than to zero in order to 3747 * avoid a too-soon return to zero in case of a short grace period 3748 * (or preemption of this task). Exclude CPU-hotplug operations 3749 * to ensure that no offline CPU has callbacks queued. 3750 */ 3751 init_completion(&rsp->barrier_completion); 3752 atomic_set(&rsp->barrier_cpu_count, 1); 3753 get_online_cpus(); 3754 3755 /* 3756 * Force each CPU with callbacks to register a new callback. 3757 * When that callback is invoked, we will know that all of the 3758 * corresponding CPU's preceding callbacks have been invoked. 3759 */ 3760 for_each_possible_cpu(cpu) { 3761 if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) 3762 continue; 3763 rdp = per_cpu_ptr(rsp->rda, cpu); 3764 if (rcu_is_nocb_cpu(cpu)) { 3765 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { 3766 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, 3767 rsp->barrier_sequence); 3768 } else { 3769 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3770 rsp->barrier_sequence); 3771 smp_mb__before_atomic(); 3772 atomic_inc(&rsp->barrier_cpu_count); 3773 __call_rcu(&rdp->barrier_head, 3774 rcu_barrier_callback, rsp, cpu, 0); 3775 } 3776 } else if (READ_ONCE(rdp->qlen)) { 3777 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3778 rsp->barrier_sequence); 3779 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3780 } else { 3781 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 3782 rsp->barrier_sequence); 3783 } 3784 } 3785 put_online_cpus(); 3786 3787 /* 3788 * Now that we have an rcu_barrier_callback() callback on each 3789 * CPU, and thus each counted, remove the initial count. 
3790 */ 3791 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) 3792 complete(&rsp->barrier_completion); 3793 3794 /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ 3795 wait_for_completion(&rsp->barrier_completion); 3796 3797 /* Mark the end of the barrier operation. */ 3798 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); 3799 rcu_seq_end(&rsp->barrier_sequence); 3800 3801 /* Other rcu_barrier() invocations can now safely proceed. */ 3802 mutex_unlock(&rsp->barrier_mutex); 3803 } 3804 3805 /** 3806 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. 3807 */ 3808 void rcu_barrier_bh(void) 3809 { 3810 _rcu_barrier(&rcu_bh_state); 3811 } 3812 EXPORT_SYMBOL_GPL(rcu_barrier_bh); 3813 3814 /** 3815 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. 3816 */ 3817 void rcu_barrier_sched(void) 3818 { 3819 _rcu_barrier(&rcu_sched_state); 3820 } 3821 EXPORT_SYMBOL_GPL(rcu_barrier_sched); 3822 3823 /* 3824 * Propagate ->qsinitmask bits up the rcu_node tree to account for the 3825 * first CPU in a given leaf rcu_node structure coming online. The caller 3826 * must hold the corresponding leaf rcu_node ->lock with interrrupts 3827 * disabled. 3828 */ 3829 static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) 3830 { 3831 long mask; 3832 struct rcu_node *rnp = rnp_leaf; 3833 3834 for (;;) { 3835 mask = rnp->grpmask; 3836 rnp = rnp->parent; 3837 if (rnp == NULL) 3838 return; 3839 raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ 3840 rnp->qsmaskinit |= mask; 3841 raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ 3842 } 3843 } 3844 3845 /* 3846 * Do boot-time initialization of a CPU's per-CPU RCU data. 3847 */ 3848 static void __init 3849 rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) 3850 { 3851 unsigned long flags; 3852 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3853 struct rcu_node *rnp = rcu_get_root(rsp); 3854 3855 /* Set up local state, ensuring consistent view of global state. */ 3856 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3857 rdp->grpmask = leaf_node_cpu_bit(rdp->mynode, cpu); 3858 rdp->dynticks = &per_cpu(rcu_dynticks, cpu); 3859 WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); 3860 WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(rdp->dynticks))); 3861 rdp->cpu = cpu; 3862 rdp->rsp = rsp; 3863 rcu_boot_init_nocb_percpu_data(rdp); 3864 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3865 } 3866 3867 /* 3868 * Initialize a CPU's per-CPU RCU data. Note that only one online or 3869 * offline event can be happening at a given time. Note also that we 3870 * can accept some slop in the rsp->completed access due to the fact 3871 * that this CPU cannot possibly have any RCU callbacks in flight yet. 3872 */ 3873 static void 3874 rcu_init_percpu_data(int cpu, struct rcu_state *rsp) 3875 { 3876 unsigned long flags; 3877 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3878 struct rcu_node *rnp = rcu_get_root(rsp); 3879 3880 /* Set up local state, ensuring consistent view of global state. */ 3881 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3882 rdp->qlen_last_fqs_check = 0; 3883 rdp->n_force_qs_snap = rsp->n_force_qs; 3884 rdp->blimit = blimit; 3885 if (!rdp->nxtlist) 3886 init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ 3887 rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 3888 rcu_sysidle_init_percpu_data(rdp->dynticks); 3889 rcu_dynticks_eqs_online(); 3890 raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. 
*/ 3891 3892 /* 3893 * Add CPU to leaf rcu_node pending-online bitmask. Any needed 3894 * propagation up the rcu_node tree will happen at the beginning 3895 * of the next grace period. 3896 */ 3897 rnp = rdp->mynode; 3898 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 3899 if (!rdp->beenonline) 3900 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1); 3901 rdp->beenonline = true; /* We have now been online. */ 3902 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 3903 rdp->completed = rnp->completed; 3904 rdp->cpu_no_qs.b.norm = true; 3905 rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); 3906 rdp->core_needs_qs = false; 3907 trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); 3908 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3909 } 3910 3911 int rcutree_prepare_cpu(unsigned int cpu) 3912 { 3913 struct rcu_state *rsp; 3914 3915 for_each_rcu_flavor(rsp) 3916 rcu_init_percpu_data(cpu, rsp); 3917 3918 rcu_prepare_kthreads(cpu); 3919 rcu_spawn_all_nocb_kthreads(cpu); 3920 3921 return 0; 3922 } 3923 3924 static void rcutree_affinity_setting(unsigned int cpu, int outgoing) 3925 { 3926 struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu); 3927 3928 rcu_boost_kthread_setaffinity(rdp->mynode, outgoing); 3929 } 3930 3931 int rcutree_online_cpu(unsigned int cpu) 3932 { 3933 sync_sched_exp_online_cleanup(cpu); 3934 rcutree_affinity_setting(cpu, -1); 3935 return 0; 3936 } 3937 3938 int rcutree_offline_cpu(unsigned int cpu) 3939 { 3940 rcutree_affinity_setting(cpu, cpu); 3941 return 0; 3942 } 3943 3944 3945 int rcutree_dying_cpu(unsigned int cpu) 3946 { 3947 struct rcu_state *rsp; 3948 3949 for_each_rcu_flavor(rsp) 3950 rcu_cleanup_dying_cpu(rsp); 3951 return 0; 3952 } 3953 3954 int rcutree_dead_cpu(unsigned int cpu) 3955 { 3956 struct rcu_state *rsp; 3957 3958 for_each_rcu_flavor(rsp) { 3959 rcu_cleanup_dead_cpu(cpu, rsp); 3960 do_nocb_deferred_wakeup(per_cpu_ptr(rsp->rda, cpu)); 3961 } 3962 return 0; 3963 } 3964 3965 /* 3966 * Mark the specified CPU as being online so that subsequent grace periods 3967 * (both expedited and normal) will wait on it. Note that this means that 3968 * incoming CPUs are not allowed to use RCU read-side critical sections 3969 * until this function is called. Failing to observe this restriction 3970 * will result in lockdep splats. 3971 */ 3972 void rcu_cpu_starting(unsigned int cpu) 3973 { 3974 unsigned long flags; 3975 unsigned long mask; 3976 struct rcu_data *rdp; 3977 struct rcu_node *rnp; 3978 struct rcu_state *rsp; 3979 3980 for_each_rcu_flavor(rsp) { 3981 rdp = per_cpu_ptr(rsp->rda, cpu); 3982 rnp = rdp->mynode; 3983 mask = rdp->grpmask; 3984 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3985 rnp->qsmaskinitnext |= mask; 3986 rnp->expmaskinitnext |= mask; 3987 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3988 } 3989 } 3990 3991 #ifdef CONFIG_HOTPLUG_CPU 3992 /* 3993 * The CPU is exiting the idle loop into the arch_cpu_idle_dead() 3994 * function. We now remove it from the rcu_node tree's ->qsmaskinit 3995 * bit masks. 3999 */ 4000 static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) 4001 { 4002 unsigned long flags; 4003 unsigned long mask; 4004 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 4005 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 4006 4007 /* Remove outgoing CPU from mask in the leaf rcu_node structure.
*/ 4008 mask = rdp->grpmask; 4009 raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ 4010 rnp->qsmaskinitnext &= ~mask; 4011 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 4012 } 4013 4014 void rcu_report_dead(unsigned int cpu) 4015 { 4016 struct rcu_state *rsp; 4017 4018 /* QS for any half-done expedited RCU-sched GP. */ 4019 preempt_disable(); 4020 rcu_report_exp_rdp(&rcu_sched_state, 4021 this_cpu_ptr(rcu_sched_state.rda), true); 4022 preempt_enable(); 4023 for_each_rcu_flavor(rsp) 4024 rcu_cleanup_dying_idle_cpu(cpu, rsp); 4025 } 4026 #endif 4027 4028 static int rcu_pm_notify(struct notifier_block *self, 4029 unsigned long action, void *hcpu) 4030 { 4031 switch (action) { 4032 case PM_HIBERNATION_PREPARE: 4033 case PM_SUSPEND_PREPARE: 4034 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 4035 rcu_expedite_gp(); 4036 break; 4037 case PM_POST_HIBERNATION: 4038 case PM_POST_SUSPEND: 4039 if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ 4040 rcu_unexpedite_gp(); 4041 break; 4042 default: 4043 break; 4044 } 4045 return NOTIFY_OK; 4046 } 4047 4048 /* 4049 * Spawn the kthreads that handle each RCU flavor's grace periods. 4050 */ 4051 static int __init rcu_spawn_gp_kthread(void) 4052 { 4053 unsigned long flags; 4054 int kthread_prio_in = kthread_prio; 4055 struct rcu_node *rnp; 4056 struct rcu_state *rsp; 4057 struct sched_param sp; 4058 struct task_struct *t; 4059 4060 /* Force priority into range. */ 4061 if (IS_ENABLED(CONFIG_RCU_BOOST) && kthread_prio < 1) 4062 kthread_prio = 1; 4063 else if (kthread_prio < 0) 4064 kthread_prio = 0; 4065 else if (kthread_prio > 99) 4066 kthread_prio = 99; 4067 if (kthread_prio != kthread_prio_in) 4068 pr_alert("rcu_spawn_gp_kthread(): Limited prio to %d from %d\n", 4069 kthread_prio, kthread_prio_in); 4070 4071 rcu_scheduler_fully_active = 1; 4072 for_each_rcu_flavor(rsp) { 4073 t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); 4074 BUG_ON(IS_ERR(t)); 4075 rnp = rcu_get_root(rsp); 4076 raw_spin_lock_irqsave_rcu_node(rnp, flags); 4077 rsp->gp_kthread = t; 4078 if (kthread_prio) { 4079 sp.sched_priority = kthread_prio; 4080 sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); 4081 } 4082 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 4083 wake_up_process(t); 4084 } 4085 rcu_spawn_nocb_kthreads(); 4086 rcu_spawn_boost_kthreads(); 4087 return 0; 4088 } 4089 early_initcall(rcu_spawn_gp_kthread); 4090 4091 /* 4092 * This function is invoked towards the end of the scheduler's 4093 * initialization process. Before this is called, the idle task might 4094 * contain synchronous grace-period primitives (during which time, this idle 4095 * task is booting the system, and such primitives are no-ops). After this 4096 * function is called, any synchronous grace-period primitives are run as 4097 * expedited, with the requesting task driving the grace period forward. 4098 * A later core_initcall() rcu_exp_runtime_mode() will switch to full 4099 * runtime RCU functionality. 4100 */ 4101 void rcu_scheduler_starting(void) 4102 { 4103 WARN_ON(num_online_cpus() != 1); 4104 WARN_ON(nr_context_switches() > 0); 4105 rcu_test_sync_prims(); 4106 rcu_scheduler_active = RCU_SCHEDULER_INIT; 4107 rcu_test_sync_prims(); 4108 } 4109 4110 /* 4111 * Compute the per-level fanout, either using the exact fanout specified 4112 * or balancing the tree, depending on the rcu_fanout_exact boot parameter. 
4113 */ 4114 static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) 4115 { 4116 int i; 4117 4118 if (rcu_fanout_exact) { 4119 levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf; 4120 for (i = rcu_num_lvls - 2; i >= 0; i--) 4121 levelspread[i] = RCU_FANOUT; 4122 } else { 4123 int ccur; 4124 int cprv; 4125 4126 cprv = nr_cpu_ids; 4127 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4128 ccur = levelcnt[i]; 4129 levelspread[i] = (cprv + ccur - 1) / ccur; 4130 cprv = ccur; 4131 } 4132 } 4133 } 4134 4135 /* 4136 * Helper function for rcu_init() that initializes one rcu_state structure. 4137 */ 4138 static void __init rcu_init_one(struct rcu_state *rsp) 4139 { 4140 static const char * const buf[] = RCU_NODE_NAME_INIT; 4141 static const char * const fqs[] = RCU_FQS_NAME_INIT; 4142 static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; 4143 static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; 4144 static u8 fl_mask = 0x1; 4145 4146 int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ 4147 int levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ 4148 int cpustride = 1; 4149 int i; 4150 int j; 4151 struct rcu_node *rnp; 4152 4153 BUILD_BUG_ON(RCU_NUM_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ 4154 4155 /* Silence gcc 4.8 false positive about array index out of range. */ 4156 if (rcu_num_lvls <= 0 || rcu_num_lvls > RCU_NUM_LVLS) 4157 panic("rcu_init_one: rcu_num_lvls out of range"); 4158 4159 /* Initialize the level-tracking arrays. */ 4160 4161 for (i = 0; i < rcu_num_lvls; i++) 4162 levelcnt[i] = num_rcu_lvl[i]; 4163 for (i = 1; i < rcu_num_lvls; i++) 4164 rsp->level[i] = rsp->level[i - 1] + levelcnt[i - 1]; 4165 rcu_init_levelspread(levelspread, levelcnt); 4166 rsp->flavor_mask = fl_mask; 4167 fl_mask <<= 1; 4168 4169 /* Initialize the elements themselves, starting from the leaves. */ 4170 4171 for (i = rcu_num_lvls - 1; i >= 0; i--) { 4172 cpustride *= levelspread[i]; 4173 rnp = rsp->level[i]; 4174 for (j = 0; j < levelcnt[i]; j++, rnp++) { 4175 raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); 4176 lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), 4177 &rcu_node_class[i], buf[i]); 4178 raw_spin_lock_init(&rnp->fqslock); 4179 lockdep_set_class_and_name(&rnp->fqslock, 4180 &rcu_fqs_class[i], fqs[i]); 4181 rnp->gpnum = rsp->gpnum; 4182 rnp->completed = rsp->completed; 4183 rnp->qsmask = 0; 4184 rnp->qsmaskinit = 0; 4185 rnp->grplo = j * cpustride; 4186 rnp->grphi = (j + 1) * cpustride - 1; 4187 if (rnp->grphi >= nr_cpu_ids) 4188 rnp->grphi = nr_cpu_ids - 1; 4189 if (i == 0) { 4190 rnp->grpnum = 0; 4191 rnp->grpmask = 0; 4192 rnp->parent = NULL; 4193 } else { 4194 rnp->grpnum = j % levelspread[i - 1]; 4195 rnp->grpmask = 1UL << rnp->grpnum; 4196 rnp->parent = rsp->level[i - 1] + 4197 j / levelspread[i - 1]; 4198 } 4199 rnp->level = i; 4200 INIT_LIST_HEAD(&rnp->blkd_tasks); 4201 rcu_init_one_nocb(rnp); 4202 init_waitqueue_head(&rnp->exp_wq[0]); 4203 init_waitqueue_head(&rnp->exp_wq[1]); 4204 init_waitqueue_head(&rnp->exp_wq[2]); 4205 init_waitqueue_head(&rnp->exp_wq[3]); 4206 spin_lock_init(&rnp->exp_lock); 4207 } 4208 } 4209 4210 init_swait_queue_head(&rsp->gp_wq); 4211 init_swait_queue_head(&rsp->expedited_wq); 4212 rnp = rsp->level[rcu_num_lvls - 1]; 4213 for_each_possible_cpu(i) { 4214 while (i > rnp->grphi) 4215 rnp++; 4216 per_cpu_ptr(rsp->rda, i)->mynode = rnp; 4217 rcu_boot_init_percpu_data(i, rsp); 4218 } 4219 list_add(&rsp->flavors, &rcu_struct_flavors); 4220 } 4221 4222 /* 4223 * Compute the rcu_node tree geometry from kernel parameters. 
This cannot 4224 * replace the definitions in tree.h because those are needed to size 4225 * the ->node array in the rcu_state structure. 4226 */ 4227 static void __init rcu_init_geometry(void) 4228 { 4229 ulong d; 4230 int i; 4231 int rcu_capacity[RCU_NUM_LVLS]; 4232 4233 /* 4234 * Initialize any unspecified boot parameters. 4235 * The default values of jiffies_till_first_fqs and 4236 * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS 4237 * value, which is a function of HZ, then adding one for each 4238 * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. 4239 */ 4240 d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; 4241 if (jiffies_till_first_fqs == ULONG_MAX) 4242 jiffies_till_first_fqs = d; 4243 if (jiffies_till_next_fqs == ULONG_MAX) 4244 jiffies_till_next_fqs = d; 4245 4246 /* If the compile-time values are accurate, just leave. */ 4247 if (rcu_fanout_leaf == RCU_FANOUT_LEAF && 4248 nr_cpu_ids == NR_CPUS) 4249 return; 4250 pr_info("RCU: Adjusting geometry for rcu_fanout_leaf=%d, nr_cpu_ids=%d\n", 4251 rcu_fanout_leaf, nr_cpu_ids); 4252 4253 /* 4254 * The boot-time rcu_fanout_leaf parameter must be at least two 4255 * and cannot exceed the number of bits in the rcu_node masks. 4256 * Complain and fall back to the compile-time values if this 4257 * limit is exceeded. 4258 */ 4259 if (rcu_fanout_leaf < 2 || 4260 rcu_fanout_leaf > sizeof(unsigned long) * 8) { 4261 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4262 WARN_ON(1); 4263 return; 4264 } 4265 4266 /* 4267 * Compute number of nodes that can be handled an rcu_node tree 4268 * with the given number of levels. 4269 */ 4270 rcu_capacity[0] = rcu_fanout_leaf; 4271 for (i = 1; i < RCU_NUM_LVLS; i++) 4272 rcu_capacity[i] = rcu_capacity[i - 1] * RCU_FANOUT; 4273 4274 /* 4275 * The tree must be able to accommodate the configured number of CPUs. 4276 * If this limit is exceeded, fall back to the compile-time values. 4277 */ 4278 if (nr_cpu_ids > rcu_capacity[RCU_NUM_LVLS - 1]) { 4279 rcu_fanout_leaf = RCU_FANOUT_LEAF; 4280 WARN_ON(1); 4281 return; 4282 } 4283 4284 /* Calculate the number of levels in the tree. */ 4285 for (i = 0; nr_cpu_ids > rcu_capacity[i]; i++) { 4286 } 4287 rcu_num_lvls = i + 1; 4288 4289 /* Calculate the number of rcu_nodes at each level of the tree. */ 4290 for (i = 0; i < rcu_num_lvls; i++) { 4291 int cap = rcu_capacity[(rcu_num_lvls - 1) - i]; 4292 num_rcu_lvl[i] = DIV_ROUND_UP(nr_cpu_ids, cap); 4293 } 4294 4295 /* Calculate the total number of rcu_node structures. */ 4296 rcu_num_nodes = 0; 4297 for (i = 0; i < rcu_num_lvls; i++) 4298 rcu_num_nodes += num_rcu_lvl[i]; 4299 } 4300 4301 /* 4302 * Dump out the structure of the rcu_node combining tree associated 4303 * with the rcu_state structure referenced by rsp. 
4304 */ 4305 static void __init rcu_dump_rcu_node_tree(struct rcu_state *rsp) 4306 { 4307 int level = 0; 4308 struct rcu_node *rnp; 4309 4310 pr_info("rcu_node tree layout dump\n"); 4311 pr_info(" "); 4312 rcu_for_each_node_breadth_first(rsp, rnp) { 4313 if (rnp->level != level) { 4314 pr_cont("\n"); 4315 pr_info(" "); 4316 level = rnp->level; 4317 } 4318 pr_cont("%d:%d ^%d ", rnp->grplo, rnp->grphi, rnp->grpnum); 4319 } 4320 pr_cont("\n"); 4321 } 4322 4323 void __init rcu_init(void) 4324 { 4325 int cpu; 4326 4327 rcu_early_boot_tests(); 4328 4329 rcu_bootup_announce(); 4330 rcu_init_geometry(); 4331 rcu_init_one(&rcu_bh_state); 4332 rcu_init_one(&rcu_sched_state); 4333 if (dump_tree) 4334 rcu_dump_rcu_node_tree(&rcu_sched_state); 4335 __rcu_init_preempt(); 4336 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); 4337 4338 /* 4339 * We don't need protection against CPU-hotplug here because 4340 * this is called early in boot, before either interrupts 4341 * or the scheduler are operational. 4342 */ 4343 pm_notifier(rcu_pm_notify, 0); 4344 for_each_online_cpu(cpu) { 4345 rcutree_prepare_cpu(cpu); 4346 rcu_cpu_starting(cpu); 4347 } 4348 } 4349 4350 #include "tree_exp.h" 4351 #include "tree_plugin.h" 4352
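/*
 * Editorial usage sketch, not part of the original tree.c: how a client
 * of the update-side primitives implemented above typically queues a
 * callback with call_rcu_sched() (the __call_rcu() path), reads under
 * rcu_read_lock_sched(), and flushes pending callbacks with
 * rcu_barrier_sched() (the _rcu_barrier() path).  The struct foo and all
 * foo_* names are hypothetical, and the block is guarded by "#if 0" so
 * that it is never compiled.
 */
#if 0	/* illustration only */
struct foo {
	int value;
	struct rcu_head rh;
};

static struct foo __rcu *foo_ptr;
static DEFINE_SPINLOCK(foo_lock);

/* Invoked by RCU after all pre-existing RCU-sched readers have finished. */
static void foo_reclaim(struct rcu_head *rhp)
{
	kfree(container_of(rhp, struct foo, rh));
}

/* Update side: publish a new version, defer freeing the old one. */
static int foo_update(int value)
{
	struct foo *newp, *oldp;

	newp = kmalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return -ENOMEM;
	newp->value = value;
	spin_lock(&foo_lock);
	oldp = rcu_dereference_protected(foo_ptr, lockdep_is_held(&foo_lock));
	rcu_assign_pointer(foo_ptr, newp);
	spin_unlock(&foo_lock);
	if (oldp)
		call_rcu_sched(&oldp->rh, foo_reclaim);
	return 0;
}

/* Read side: disabled preemption delimits an RCU-sched critical section. */
static int foo_read(void)
{
	struct foo *p;
	int val = -1;

	rcu_read_lock_sched();
	p = rcu_dereference_sched(foo_ptr);
	if (p)
		val = p->value;
	rcu_read_unlock_sched();
	return val;
}

/* Teardown: wait for every queued foo_reclaim() to have been invoked. */
static void foo_exit(void)
{
	rcu_barrier_sched();
}
#endif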
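/*
 * Editorial sketch, not part of the original tree.c: the cookie protocol
 * provided by get_state_synchronize_sched() and cond_synchronize_sched()
 * above.  The caller snapshots the grace-period state, does unrelated
 * (ideally time-consuming) work, and then waits only if no full grace
 * period has elapsed in the meantime.  foo_do_other_work() is a
 * hypothetical placeholder; the block is never compiled.
 */
#if 0	/* illustration only */
extern void foo_do_other_work(void);

static void foo_deferred_cleanup(void)
{
	unsigned long gp_cookie;

	gp_cookie = get_state_synchronize_sched();	/* Snapshot ->gpnum. */
	foo_do_other_work();				/* Unrelated slow work. */
	cond_synchronize_sched(gp_cookie);	/* No-op if a GP already elapsed. */
	/* All RCU-sched readers that predate the snapshot have finished here. */
}
#endif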
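/*
 * Editorial sketch, not part of the original tree.c: the rcu_seq_start()/
 * rcu_seq_end()/rcu_seq_snap()/rcu_seq_done() protocol as _rcu_barrier()
 * uses it above.  The counter is even while idle and odd while an
 * operation is in flight; snapshotting before taking the mutex lets a
 * caller detect that some other task already completed a full operation
 * on its behalf.  foo_seq, foo_mutex and foo_do_operation() are
 * hypothetical stand-ins for ->barrier_sequence, ->barrier_mutex and the
 * body of _rcu_barrier(); the block is never compiled.
 */
#if 0	/* illustration only */
static unsigned long foo_seq;		/* Even: idle, odd: operation in flight. */
static DEFINE_MUTEX(foo_mutex);

extern void foo_do_operation(void);

static void foo_flush(void)
{
	unsigned long s = rcu_seq_snap(&foo_seq);

	mutex_lock(&foo_mutex);
	if (rcu_seq_done(&foo_seq, s)) {
		/* Someone else did our work for us, as in _rcu_barrier(). */
		mutex_unlock(&foo_mutex);
		return;
	}
	rcu_seq_start(&foo_seq);		/* Counter becomes odd. */
	foo_do_operation();
	rcu_seq_end(&foo_seq);			/* Counter becomes even again. */
	mutex_unlock(&foo_mutex);
}
#endif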
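/*
 * Editorial worked example, assuming the common 64-bit defaults of
 * RCU_FANOUT = 64 and RCU_FANOUT_LEAF = 16, of the geometry computed by
 * rcu_init_geometry() and rcu_init_levelspread() above for
 * nr_cpu_ids = 96:
 *
 *	rcu_capacity[] = { 16, 1024 }, so 96 CPUs need rcu_num_lvls = 2.
 *	num_rcu_lvl[]  = { DIV_ROUND_UP(96, 1024), DIV_ROUND_UP(96, 16) }
 *	               = { 1, 6 }, giving rcu_num_nodes = 7.
 *	With rcu_fanout_exact clear, rcu_init_levelspread() balances the
 *	tree: levelspread[1] = (96 + 6 - 1) / 6 = 16 CPUs per leaf and
 *	levelspread[0] = (6 + 1 - 1) / 1 = 6 leaves under the root.
 */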