/*-
 * Copyright (c) 2016 Matthew Macy (mmacy@mattmacy.io)
 * Copyright (c) 2017-2021 Hans Petter Selasky (hselasky@freebsd.org)
 * All rights reserved.
 * Copyright (c) 2024 The FreeBSD Foundation
 *
 * Portions of this software were developed by Björn Zeeb
 * under sponsorship from the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/kdb.h>

#include <ck_epoch.h>

#include <linux/rcupdate.h>
#include <linux/sched.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/compat.h>
#include <linux/llist.h>
#include <linux/irq_work.h>

/*
 * By defining CONFIG_NO_RCU_SKIP LinuxKPI RCU locks and asserts will
 * not be skipped during panic().
 */
#ifdef CONFIG_NO_RCU_SKIP
#define	RCU_SKIP(void) 0
#else
#define	RCU_SKIP(void) unlikely(SCHEDULER_STOPPED() || kdb_active)
#endif

struct callback_head {
	union {
		STAILQ_ENTRY(callback_head) entry;
		struct llist_node node;
	};
	rcu_callback_t func;
};

struct linux_epoch_head {
	struct llist_head cb_head;
	struct task task;
} __aligned(CACHE_LINE_SIZE);

struct linux_epoch_record {
	ck_epoch_record_t epoch_record;
	TAILQ_HEAD(, task_struct) ts_head;
	int cpuid;
	int type;
} __aligned(CACHE_LINE_SIZE);

/*
 * Verify that "struct rcu_head" is big enough to hold "struct
 * callback_head". This has been done to avoid having to add special
 * compile flags for including ck_epoch.h to all clients of the
 * LinuxKPI.
 */
CTASSERT(sizeof(struct rcu_head) == sizeof(struct callback_head));

/*
 * Verify that "rcu_section[0]" has the same size as
 * "ck_epoch_section_t". This has been done to avoid having to add
 * special compile flags for including ck_epoch.h to all clients of
 * the LinuxKPI.
 */
CTASSERT(sizeof(((struct task_struct *)0)->rcu_section[0]) ==
    sizeof(ck_epoch_section_t));

/*
 * Verify that "epoch_record" is at beginning of "struct
 * linux_epoch_record":
 */
CTASSERT(offsetof(struct linux_epoch_record, epoch_record) == 0);

CTASSERT(TS_RCU_TYPE_MAX == RCU_TYPE_MAX);

static ck_epoch_t linux_epoch[RCU_TYPE_MAX];
static struct linux_epoch_head linux_epoch_head[RCU_TYPE_MAX];
DPCPU_DEFINE_STATIC(struct linux_epoch_record, linux_epoch_record[RCU_TYPE_MAX]);

static void linux_rcu_cleaner_func(void *, int);

static void
linux_rcu_runtime_init(void *arg __unused)
{
	struct linux_epoch_head *head;
	int i;
	int j;

	for (j = 0; j != RCU_TYPE_MAX; j++) {
		ck_epoch_init(&linux_epoch[j]);

		head = &linux_epoch_head[j];

		TASK_INIT(&head->task, 0, linux_rcu_cleaner_func, head);
		init_llist_head(&head->cb_head);

		CPU_FOREACH(i) {
			struct linux_epoch_record *record;

			record = &DPCPU_ID_GET(i, linux_epoch_record[j]);

			record->cpuid = i;
			record->type = j;
			ck_epoch_register(&linux_epoch[j],
			    &record->epoch_record, NULL);
			TAILQ_INIT(&record->ts_head);
		}
	}
}
SYSINIT(linux_rcu_runtime, SI_SUB_CPU, SI_ORDER_ANY, linux_rcu_runtime_init, NULL);

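/*
 * Deferred callback dispatcher, run from the linux_irq_work_tq
 * taskqueue. It grabs all callbacks posted by linux_call_rcu() from
 * the lock-free "cb_head" list, waits for a grace period and then
 * invokes them. Entries whose "func" value is below
 * LINUX_KFREE_RCU_OFFSET_MAX are not real function pointers: the
 * kfree_rcu() macro posts the offset of the rcu_head member within
 * the enclosing object instead, so those objects are simply released
 * with kfree().
 */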
static void
linux_rcu_cleaner_func(void *context, int pending __unused)
{
	struct linux_epoch_head *head = context;
	struct callback_head *rcu;
	STAILQ_HEAD(, callback_head) tmp_head;
	struct llist_node *node, *next;
	uintptr_t offset;

	/* move current callbacks into own queue */
	STAILQ_INIT(&tmp_head);
	llist_for_each_safe(node, next, llist_del_all(&head->cb_head)) {
		rcu = container_of(node, struct callback_head, node);
		/* re-reverse list to restore chronological order */
		STAILQ_INSERT_HEAD(&tmp_head, rcu, entry);
	}

	/* synchronize */
	linux_synchronize_rcu(head - linux_epoch_head);

	/* dispatch all callbacks, if any */
	while ((rcu = STAILQ_FIRST(&tmp_head)) != NULL) {
		STAILQ_REMOVE_HEAD(&tmp_head, entry);

		offset = (uintptr_t)rcu->func;

		if (offset < LINUX_KFREE_RCU_OFFSET_MAX)
			kfree((char *)rcu - offset);
		else
			rcu->func((struct rcu_head *)rcu);
	}
}

void
linux_rcu_read_lock(unsigned type)
{
	struct linux_epoch_record *record;
	struct task_struct *ts;

	MPASS(type < RCU_TYPE_MAX);

	if (RCU_SKIP())
		return;

	ts = current;

	/* assert valid refcount */
	MPASS(ts->rcu_recurse[type] != INT_MAX);

	if (++(ts->rcu_recurse[type]) != 1)
		return;

	/*
	 * Pin thread to current CPU so that the unlock code gets the
	 * same per-CPU epoch record:
	 */
	sched_pin();

	record = &DPCPU_GET(linux_epoch_record[type]);

	/*
	 * Use a critical section to prevent recursion inside
	 * ck_epoch_begin(). Else this function supports recursion.
	 */
	critical_enter();
	ck_epoch_begin(&record->epoch_record,
	    (ck_epoch_section_t *)&ts->rcu_section[type]);
	TAILQ_INSERT_TAIL(&record->ts_head, ts, rcu_entry[type]);
	critical_exit();
}

void
linux_rcu_read_unlock(unsigned type)
{
	struct linux_epoch_record *record;
	struct task_struct *ts;

	MPASS(type < RCU_TYPE_MAX);

	if (RCU_SKIP())
		return;

	ts = current;

	/* assert valid refcount */
	MPASS(ts->rcu_recurse[type] > 0);

	if (--(ts->rcu_recurse[type]) != 0)
		return;

	record = &DPCPU_GET(linux_epoch_record[type]);

	/*
	 * Use a critical section to prevent recursion inside
	 * ck_epoch_end(). Else this function supports recursion.
	 */
	critical_enter();
	ck_epoch_end(&record->epoch_record,
	    (ck_epoch_section_t *)&ts->rcu_section[type]);
	TAILQ_REMOVE(&record->ts_head, ts, rcu_entry[type]);
	critical_exit();

	sched_unpin();
}

bool
linux_rcu_read_lock_held(unsigned type)
{
#ifdef INVARIANTS
	struct linux_epoch_record *record __diagused;
	struct task_struct *ts;

	MPASS(type < RCU_TYPE_MAX);

	if (RCU_SKIP())
		return (false);

	if (__current_unallocated(curthread))
		return (false);

	ts = current;
	if (ts->rcu_recurse[type] == 0)
		return (false);

	MPASS(curthread->td_pinned != 0);
	MPASS((record = &DPCPU_GET(linux_epoch_record[type])) &&
	    record->epoch_record.active != 0);
#endif

	return (true);
}

static void
linux_synchronize_rcu_cb(ck_epoch_t *epoch __unused, ck_epoch_record_t *epoch_record, void *arg __unused)
{
	struct linux_epoch_record *record =
	    container_of(epoch_record, struct linux_epoch_record, epoch_record);
	struct thread *td = curthread;
	struct task_struct *ts;

	/* check if blocked on the current CPU */
	if (record->cpuid == PCPU_GET(cpuid)) {
		bool is_sleeping = 0;
		u_char prio = 0;

		/*
		 * Find the lowest priority or sleeping thread which
		 * is blocking synchronization on this CPU core. All
		 * the threads in the queue are CPU-pinned and cannot
		 * go anywhere while the current thread is locked.
		 */
		TAILQ_FOREACH(ts, &record->ts_head, rcu_entry[record->type]) {
			if (ts->task_thread->td_priority > prio)
				prio = ts->task_thread->td_priority;
			is_sleeping |= (ts->task_thread->td_inhibitors != 0);
		}

		if (is_sleeping) {
			thread_unlock(td);
			pause("W", 1);
			thread_lock(td);
		} else {
			/* set new thread priority */
			sched_prio(td, prio);
			/* task switch */
			mi_switch(SW_VOL | SWT_RELINQUISH);
			/*
			 * It is important that the thread lock is
			 * dropped while yielding to allow other
			 * threads to acquire the lock pointed to by
			 * TDQ_LOCKPTR(td). Currently mi_switch() will
			 * unlock the thread lock before returning.
			 * Else a deadlock-like situation might happen.
			 */
			thread_lock(td);
		}
	} else {
		/*
		 * To avoid spinning, move execution to the other CPU
		 * which is blocking synchronization. Set highest
		 * thread priority so that code gets run. The thread
		 * priority will be restored later.
		 */
		sched_prio(td, 0);
		sched_bind(td, record->cpuid);
	}
}

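/*
 * Wait for a full grace period: every RCU reader section of the given
 * type that was active on entry must have completed before this
 * function returns. The caller's CPU binding, pin count and priority
 * are saved, the thread is bound to its current CPU while
 * ck_epoch_synchronize_wait() makes progress (calling
 * linux_synchronize_rcu_cb() above whenever a per-CPU record is
 * blocking the epoch), and the original scheduling state is restored
 * before returning.
 */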
void
linux_synchronize_rcu(unsigned type)
{
	struct thread *td;
	int was_bound;
	int old_cpu;
	int old_pinned;
	u_char old_prio;

	MPASS(type < RCU_TYPE_MAX);

	if (RCU_SKIP())
		return;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "linux_synchronize_rcu() can sleep");

	td = curthread;
	DROP_GIANT();

	/*
	 * Synchronizing RCU might change the CPU core this function
	 * is running on. Save current values:
	 */
	thread_lock(td);

	old_cpu = PCPU_GET(cpuid);
	old_pinned = td->td_pinned;
	old_prio = td->td_priority;
	was_bound = sched_is_bound(td);
	sched_unbind(td);
	td->td_pinned = 0;
	sched_bind(td, old_cpu);

	ck_epoch_synchronize_wait(&linux_epoch[type],
	    &linux_synchronize_rcu_cb, NULL);

	/* restore CPU binding, if any */
	if (was_bound != 0) {
		sched_bind(td, old_cpu);
	} else {
		/* get thread back to initial CPU, if any */
		if (old_pinned != 0)
			sched_bind(td, old_cpu);
		sched_unbind(td);
	}
	/* restore pinned after bind */
	td->td_pinned = old_pinned;

	/* restore thread priority */
	sched_prio(td, old_prio);
	thread_unlock(td);

	PICKUP_GIANT();
}

void
linux_rcu_barrier(unsigned type)
{
	struct linux_epoch_head *head;

	MPASS(type < RCU_TYPE_MAX);

	/*
	 * This function is not obligated to wait for a grace period.
	 * It only waits for RCU callbacks that have already been posted.
	 * If there are no RCU callbacks posted, rcu_barrier() can return
	 * immediately.
	 */
	head = &linux_epoch_head[type];

	/* wait for callbacks to complete */
	taskqueue_drain(linux_irq_work_tq, &head->task);
}

void
linux_call_rcu(unsigned type, struct rcu_head *context, rcu_callback_t func)
{
	struct callback_head *rcu;
	struct linux_epoch_head *head;

	MPASS(type < RCU_TYPE_MAX);

	rcu = (struct callback_head *)context;
	head = &linux_epoch_head[type];

	rcu->func = func;
	llist_add(&rcu->node, &head->cb_head);
	taskqueue_enqueue(linux_irq_work_tq, &head->task);
}

int
init_srcu_struct(struct srcu_struct *srcu)
{
	return (0);
}

void
cleanup_srcu_struct(struct srcu_struct *srcu)
{
}

int
srcu_read_lock(struct srcu_struct *srcu)
{
	linux_rcu_read_lock(RCU_TYPE_SLEEPABLE);
	return (0);
}

void
srcu_read_unlock(struct srcu_struct *srcu, int key __unused)
{
	linux_rcu_read_unlock(RCU_TYPE_SLEEPABLE);
}

void
synchronize_srcu(struct srcu_struct *srcu)
{
	linux_synchronize_rcu(RCU_TYPE_SLEEPABLE);
}

void
srcu_barrier(struct srcu_struct *srcu)
{
	linux_rcu_barrier(RCU_TYPE_SLEEPABLE);
}
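
/*
 * Illustrative sketch (not compiled) of how a LinuxKPI consumer's use
 * of the Linux RCU API typically reaches the functions above. The
 * "struct foo" type, "foo_ptr" pointer and foo_free_cb() callback are
 * hypothetical names used only for this example; the exact macro
 * mappings are provided by <linux/rcupdate.h>.
 *
 *	struct foo {
 *		int data;
 *		struct rcu_head rcu;
 *	};
 *
 *	rcu_read_lock();		// linux_rcu_read_lock() with the
 *					// regular (non-sleepable) RCU type
 *	p = rcu_dereference(foo_ptr);	// read-side access inside the
 *					// per-CPU epoch section
 *	rcu_read_unlock();		// linux_rcu_read_unlock()
 *
 *	// Reclamation, either with an explicit callback ...
 *	call_rcu(&p->rcu, foo_free_cb);	// linux_call_rcu(); the callback
 *					// runs after a grace period
 *	// ... or by letting kfree_rcu() encode the rcu_head member
 *	// offset as the "callback" pointer, which
 *	// linux_rcu_cleaner_func() turns into a plain kfree():
 *	kfree_rcu(p, rcu);
 *
 *	synchronize_rcu();		// linux_synchronize_rcu()
 *	rcu_barrier();			// linux_rcu_barrier(); waits for
 *					// already-posted callbacks
 */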