/*-
 * Copyright (c) 2016 Matthew Macy (mmacy@mattmacy.io)
 * Copyright (c) 2017-2021 Hans Petter Selasky (hselasky@freebsd.org)
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice unmodified, this list of conditions, and the following
 *    disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/types.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/mutex.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/kdb.h>

#include <ck_epoch.h>

#include <linux/rcupdate.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/compat.h>
#include <linux/llist.h>
#include <linux/irq_work.h>

/*
 * By defining CONFIG_NO_RCU_SKIP, LinuxKPI RCU locks and asserts will
 * not be skipped during panic().
 */
#ifdef CONFIG_NO_RCU_SKIP
#define RCU_SKIP(void) 0
#else
#define RCU_SKIP(void) unlikely(SCHEDULER_STOPPED() || kdb_active)
#endif

struct callback_head {
        union {
                STAILQ_ENTRY(callback_head) entry;
                struct llist_node node;
        };
        rcu_callback_t func;
};

struct linux_epoch_head {
        struct llist_head cb_head;
        struct task task;
} __aligned(CACHE_LINE_SIZE);

struct linux_epoch_record {
        ck_epoch_record_t epoch_record;
        TAILQ_HEAD(, task_struct) ts_head;
        int cpuid;
        int type;
} __aligned(CACHE_LINE_SIZE);

/*
 * Verify that "struct rcu_head" is big enough to hold "struct
 * callback_head". This has been done to avoid having to add special
 * compile flags for including ck_epoch.h to all clients of the
 * LinuxKPI.
 */
CTASSERT(sizeof(struct rcu_head) == sizeof(struct callback_head));

/*
 * Verify that "rcu_section[0]" has the same size as
 * "ck_epoch_section_t". This has been done to avoid having to add
 * special compile flags for including ck_epoch.h to all clients of
 * the LinuxKPI.
 */
CTASSERT(sizeof(((struct task_struct *)0)->rcu_section[0]) ==
    sizeof(ck_epoch_section_t));

/*
 * Verify that "epoch_record" is at beginning of "struct
 * linux_epoch_record":
 */
CTASSERT(offsetof(struct linux_epoch_record, epoch_record) == 0);

CTASSERT(TS_RCU_TYPE_MAX == RCU_TYPE_MAX);

static ck_epoch_t linux_epoch[RCU_TYPE_MAX];
static struct linux_epoch_head linux_epoch_head[RCU_TYPE_MAX];
DPCPU_DEFINE_STATIC(struct linux_epoch_record, linux_epoch_record[RCU_TYPE_MAX]);

static void linux_rcu_cleaner_func(void *, int);

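/*
 * Initialise one ck_epoch instance per RCU type and register a
 * per-CPU epoch record for each of them. The task attached to each
 * type runs linux_rcu_cleaner_func() to dispatch callbacks deferred
 * by linux_call_rcu().
 */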
static void
linux_rcu_runtime_init(void *arg __unused)
{
        struct linux_epoch_head *head;
        int i;
        int j;

        for (j = 0; j != RCU_TYPE_MAX; j++) {
                ck_epoch_init(&linux_epoch[j]);

                head = &linux_epoch_head[j];

                TASK_INIT(&head->task, 0, linux_rcu_cleaner_func, head);
                init_llist_head(&head->cb_head);

                CPU_FOREACH(i) {
                        struct linux_epoch_record *record;

                        record = &DPCPU_ID_GET(i, linux_epoch_record[j]);

                        record->cpuid = i;
                        record->type = j;
                        ck_epoch_register(&linux_epoch[j],
                            &record->epoch_record, NULL);
                        TAILQ_INIT(&record->ts_head);
                }
        }
}
SYSINIT(linux_rcu_runtime, SI_SUB_CPU, SI_ORDER_ANY, linux_rcu_runtime_init, NULL);

static void
linux_rcu_cleaner_func(void *context, int pending __unused)
{
        struct linux_epoch_head *head = context;
        struct callback_head *rcu;
        STAILQ_HEAD(, callback_head) tmp_head;
        struct llist_node *node, *next;
        uintptr_t offset;

        /* move current callbacks into own queue */
        STAILQ_INIT(&tmp_head);
        llist_for_each_safe(node, next, llist_del_all(&head->cb_head)) {
                rcu = container_of(node, struct callback_head, node);
                /* re-reverse list to restore chronological order */
                STAILQ_INSERT_HEAD(&tmp_head, rcu, entry);
        }

        /* synchronize */
        linux_synchronize_rcu(head - linux_epoch_head);

        /* dispatch all callbacks, if any */
        while ((rcu = STAILQ_FIRST(&tmp_head)) != NULL) {
                STAILQ_REMOVE_HEAD(&tmp_head, entry);

                offset = (uintptr_t)rcu->func;

                if (offset < LINUX_KFREE_RCU_OFFSET_MAX)
                        kfree((char *)rcu - offset);
                else
                        rcu->func((struct rcu_head *)rcu);
        }
}

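/*
 * Enter an RCU read-side section of the given type. The calling
 * thread is pinned to its current CPU so that linux_rcu_read_unlock()
 * operates on the same per-CPU epoch record. Sections may be nested;
 * only the outermost call enters the epoch.
 */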
void
linux_rcu_read_lock(unsigned type)
{
        struct linux_epoch_record *record;
        struct task_struct *ts;

        MPASS(type < RCU_TYPE_MAX);

        if (RCU_SKIP())
                return;

        ts = current;

        /* assert valid refcount */
        MPASS(ts->rcu_recurse[type] != INT_MAX);

        if (++(ts->rcu_recurse[type]) != 1)
                return;

        /*
         * Pin thread to current CPU so that the unlock code gets the
         * same per-CPU epoch record:
         */
        sched_pin();

        record = &DPCPU_GET(linux_epoch_record[type]);

        /*
         * Use a critical section to prevent recursion inside
         * ck_epoch_begin(). Else this function supports recursion.
         */
        critical_enter();
        ck_epoch_begin(&record->epoch_record,
            (ck_epoch_section_t *)&ts->rcu_section[type]);
        TAILQ_INSERT_TAIL(&record->ts_head, ts, rcu_entry[type]);
        critical_exit();
}

void
linux_rcu_read_unlock(unsigned type)
{
        struct linux_epoch_record *record;
        struct task_struct *ts;

        MPASS(type < RCU_TYPE_MAX);

        if (RCU_SKIP())
                return;

        ts = current;

        /* assert valid refcount */
        MPASS(ts->rcu_recurse[type] > 0);

        if (--(ts->rcu_recurse[type]) != 0)
                return;

        record = &DPCPU_GET(linux_epoch_record[type]);

        /*
         * Use a critical section to prevent recursion inside
         * ck_epoch_end(). Else this function supports recursion.
         */
        critical_enter();
        ck_epoch_end(&record->epoch_record,
            (ck_epoch_section_t *)&ts->rcu_section[type]);
        TAILQ_REMOVE(&record->ts_head, ts, rcu_entry[type]);
        critical_exit();

        sched_unpin();
}

static void
linux_synchronize_rcu_cb(ck_epoch_t *epoch __unused, ck_epoch_record_t *epoch_record, void *arg __unused)
{
        struct linux_epoch_record *record =
            container_of(epoch_record, struct linux_epoch_record, epoch_record);
        struct thread *td = curthread;
        struct task_struct *ts;

        /* check if blocked on the current CPU */
        if (record->cpuid == PCPU_GET(cpuid)) {
                bool is_sleeping = 0;
                u_char prio = 0;

                /*
                 * Find the lowest priority or sleeping thread which
                 * is blocking synchronization on this CPU core. All
                 * the threads in the queue are CPU-pinned and cannot
                 * go anywhere while the current thread is locked.
                 */
                TAILQ_FOREACH(ts, &record->ts_head, rcu_entry[record->type]) {
                        if (ts->task_thread->td_priority > prio)
                                prio = ts->task_thread->td_priority;
                        is_sleeping |= (ts->task_thread->td_inhibitors != 0);
                }

                if (is_sleeping) {
                        thread_unlock(td);
                        pause("W", 1);
                        thread_lock(td);
                } else {
                        /* set new thread priority */
                        sched_prio(td, prio);
                        /* task switch */
                        mi_switch(SW_VOL | SWT_RELINQUISH);
                        /*
                         * It is important that the thread lock is
                         * dropped while yielding, to allow other
                         * threads to acquire the lock pointed to by
                         * TDQ_LOCKPTR(td). Currently mi_switch() will
                         * unlock the thread lock before returning.
                         * Else a deadlock-like situation might
                         * happen.
                         */
                        thread_lock(td);
                }
        } else {
                /*
                 * To avoid spinning move execution to the other CPU
                 * which is blocking synchronization. Set highest
                 * thread priority so that code gets run. The thread
                 * priority will be restored later.
                 */
                sched_prio(td, 0);
                sched_bind(td, record->cpuid);
        }
}

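/*
 * Wait for all readers of the given RCU type to leave their read-side
 * sections. The calling thread may be temporarily bound to each CPU
 * that still blocks the grace period (see linux_synchronize_rcu_cb())
 * and may sleep; its CPU binding and priority are restored before
 * returning.
 */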
void
linux_synchronize_rcu(unsigned type)
{
        struct thread *td;
        int was_bound;
        int old_cpu;
        int old_pinned;
        u_char old_prio;

        MPASS(type < RCU_TYPE_MAX);

        if (RCU_SKIP())
                return;

        WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
            "linux_synchronize_rcu() can sleep");

        td = curthread;
        DROP_GIANT();

        /*
         * Synchronizing RCU might change the CPU core this function
         * is running on. Save current values:
         */
        thread_lock(td);

        old_cpu = PCPU_GET(cpuid);
        old_pinned = td->td_pinned;
        old_prio = td->td_priority;
        was_bound = sched_is_bound(td);
        sched_unbind(td);
        td->td_pinned = 0;
        sched_bind(td, old_cpu);

        ck_epoch_synchronize_wait(&linux_epoch[type],
            &linux_synchronize_rcu_cb, NULL);

        /* restore CPU binding, if any */
        if (was_bound != 0) {
                sched_bind(td, old_cpu);
        } else {
                /* get thread back to initial CPU, if any */
                if (old_pinned != 0)
                        sched_bind(td, old_cpu);
                sched_unbind(td);
        }
        /* restore pinned after bind */
        td->td_pinned = old_pinned;

        /* restore thread priority */
        sched_prio(td, old_prio);
        thread_unlock(td);

        PICKUP_GIANT();
}

void
linux_rcu_barrier(unsigned type)
{
        struct linux_epoch_head *head;

        MPASS(type < RCU_TYPE_MAX);

        /*
         * This function is not obligated to wait for a grace period.
         * It only waits for RCU callbacks that have already been posted.
         * If there are no RCU callbacks posted, rcu_barrier() can return
         * immediately.
         */
        head = &linux_epoch_head[type];

        /* wait for callbacks to complete */
        taskqueue_drain(linux_irq_work_tq, &head->task);
}

void
linux_call_rcu(unsigned type, struct rcu_head *context, rcu_callback_t func)
{
        struct callback_head *rcu;
        struct linux_epoch_head *head;

        MPASS(type < RCU_TYPE_MAX);

        rcu = (struct callback_head *)context;
        head = &linux_epoch_head[type];

        rcu->func = func;
        llist_add(&rcu->node, &head->cb_head);
        taskqueue_enqueue(linux_irq_work_tq, &head->task);
}

int
init_srcu_struct(struct srcu_struct *srcu)
{
        return (0);
}

void
cleanup_srcu_struct(struct srcu_struct *srcu)
{
}

int
srcu_read_lock(struct srcu_struct *srcu)
{
        linux_rcu_read_lock(RCU_TYPE_SLEEPABLE);
        return (0);
}

void
srcu_read_unlock(struct srcu_struct *srcu, int key __unused)
{
        linux_rcu_read_unlock(RCU_TYPE_SLEEPABLE);
}

void
synchronize_srcu(struct srcu_struct *srcu)
{
        linux_synchronize_rcu(RCU_TYPE_SLEEPABLE);
}

void
srcu_barrier(struct srcu_struct *srcu)
{
        linux_rcu_barrier(RCU_TYPE_SLEEPABLE);
}