/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/epoch.h>
#include <sys/gtaskqueue.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/turnstile.h>
#ifdef EPOCH_TRACE
#include <machine/stdarg.h>
#include <sys/stack.h>
#include <sys/tree.h>
#endif
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#include <ck_epoch.h>

static MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation");

#ifdef __amd64__
#define EPOCH_ALIGN CACHE_LINE_SIZE*2
#else
#define EPOCH_ALIGN CACHE_LINE_SIZE
#endif

TAILQ_HEAD (epoch_tdlist, epoch_tracker);
typedef struct epoch_record {
	ck_epoch_record_t er_record;
	struct epoch_context er_drain_ctx;
	struct epoch *er_parent;
	volatile struct epoch_tdlist er_tdlist;
	volatile uint32_t er_gen;
	uint32_t er_cpuid;
} __aligned(EPOCH_ALIGN) *epoch_record_t;

struct epoch {
	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
	epoch_record_t e_pcpu_record;
	int e_idx;
	int e_flags;
	struct sx e_drain_sx;
	struct mtx e_drain_mtx;
	volatile int e_drain_count;
	const char *e_name;
};

/* arbitrary --- needs benchmarking */
#define MAX_ADAPTIVE_SPIN 100
#define MAX_EPOCHS 64

CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information");
SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats");

/* Stats. */
static counter_u64_t block_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
    &block_count, "# of times a thread was in an epoch when epoch_wait was called");
static counter_u64_t migrate_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
    &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
static counter_u64_t turnstile_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
    &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
static counter_u64_t switch_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
    &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
static counter_u64_t epoch_call_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
    &epoch_call_count, "# of times a callback was deferred");
static counter_u64_t epoch_call_task_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
    &epoch_call_task_count, "# of times a callback task was run");

TAILQ_HEAD (threadlist, thread);

CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
    ck_epoch_entry_container)

epoch_t allepochs[MAX_EPOCHS];

DPCPU_DEFINE(struct grouptask, epoch_cb_task);
DPCPU_DEFINE(int, epoch_cb_count);

static __read_mostly int inited;
static __read_mostly int epoch_count;
__read_mostly epoch_t global_epoch;
__read_mostly epoch_t global_epoch_preempt;

static void epoch_call_task(void *context __unused);
static uma_zone_t pcpu_zone_record;

#ifdef EPOCH_TRACE
struct stackentry {
	RB_ENTRY(stackentry) se_node;
	struct stack se_stack;
};

static int
stackentry_compare(struct stackentry *a, struct stackentry *b)
{

	if (a->se_stack.depth > b->se_stack.depth)
		return (1);
	if (a->se_stack.depth < b->se_stack.depth)
		return (-1);
	for (int i = 0; i < a->se_stack.depth; i++) {
		if (a->se_stack.pcs[i] > b->se_stack.pcs[i])
			return (1);
		if (a->se_stack.pcs[i] < b->se_stack.pcs[i])
			return (-1);
	}

	return (0);
}

RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks);
RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare);

static struct mtx epoch_stacks_lock;
MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF);

static bool epoch_trace_stack_print = true;
SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN,
    &epoch_trace_stack_print, 0, "Print stack traces on epoch reports");

static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2);
static inline void
epoch_trace_report(const char *fmt, ...)
{
	va_list ap;
	struct stackentry se, *new;

	stack_zero(&se.se_stack);	/* XXX: is it really needed? */
	stack_save(&se.se_stack);

	/* Tree is never reduced - go lockless. */
	if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL)
		return;

	new = malloc(sizeof(*new), M_STACK, M_NOWAIT);
	if (new != NULL) {
		bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack));

		mtx_lock(&epoch_stacks_lock);
		new = RB_INSERT(stacktree, &epoch_stacks, new);
		mtx_unlock(&epoch_stacks_lock);
		if (new != NULL)
			free(new, M_STACK);
	}

	va_start(ap, fmt);
	(void)vprintf(fmt, ap);
	va_end(ap);
	if (epoch_trace_stack_print)
		stack_print_ddb(&se.se_stack);
}

static inline void
epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et,
    const char *file, int line)
{
	epoch_tracker_t iet;

	SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
		if (iet->et_epoch == epoch)
			epoch_trace_report("Recursively entering epoch %s "
			    "at %s:%d, previously entered at %s:%d\n",
			    epoch->e_name, file, line,
			    iet->et_file, iet->et_line);
	et->et_epoch = epoch;
	et->et_file = file;
	et->et_line = line;
	SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink);
}

static inline void
epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et,
    const char *file, int line)
{

	if (SLIST_FIRST(&td->td_epochs) != et) {
		epoch_trace_report("Exiting epoch %s out of nesting order "
		    "at %s:%d. Most recently entered %s at %s:%d\n",
		    epoch->e_name,
		    file, line,
		    SLIST_FIRST(&td->td_epochs)->et_epoch->e_name,
		    SLIST_FIRST(&td->td_epochs)->et_file,
		    SLIST_FIRST(&td->td_epochs)->et_line);
		/* This will panic if et is not anywhere on td_epochs. */
		SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink);
	} else
		SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink);
}
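
/*
 * A minimal sketch (illustrative only, not compiled here) of the misnesting
 * that epoch_trace_exit() reports: trackers are expected to be released in
 * LIFO order.  The epoch pointers are hypothetical and the
 * epoch_enter_preempt()/epoch_exit_preempt() wrappers from sys/epoch.h are
 * assumed.
 *
 *	struct epoch_tracker et1, et2;
 *
 *	epoch_enter_preempt(epoch_a, &et1);
 *	epoch_enter_preempt(epoch_b, &et2);
 *	epoch_exit_preempt(epoch_a, &et1);	<- reported; et2 entered last
 *	epoch_exit_preempt(epoch_b, &et2);
 */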

/* Used by assertions that check thread state before going to sleep. */
void
epoch_trace_list(struct thread *td)
{
	epoch_tracker_t iet;

	SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
		printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name,
		    iet->et_file, iet->et_line);
}
#endif /* EPOCH_TRACE */

static void
epoch_init(void *arg __unused)
{
	int cpu;

	block_count = counter_u64_alloc(M_WAITOK);
	migrate_count = counter_u64_alloc(M_WAITOK);
	turnstile_count = counter_u64_alloc(M_WAITOK);
	switch_count = counter_u64_alloc(M_WAITOK);
	epoch_call_count = counter_u64_alloc(M_WAITOK);
	epoch_call_task_count = counter_u64_alloc(M_WAITOK);

	pcpu_zone_record = uma_zcreate("epoch_record pcpu",
	    sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_PCPU);
	CPU_FOREACH(cpu) {
		GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
		    epoch_call_task, NULL);
		taskqgroup_attach_cpu(qgroup_softirq,
		    DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL,
		    "epoch call task");
	}
#ifdef EPOCH_TRACE
	SLIST_INIT(&thread0.td_epochs);
#endif
	inited = 1;
	global_epoch = epoch_alloc("Global", 0);
	global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
}
SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);

#if !defined(EARLY_AP_STARTUP)
static void
epoch_init_smp(void *dummy __unused)
{
	inited = 2;
}
SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
#endif

static void
epoch_ctor(epoch_t epoch)
{
	epoch_record_t er;
	int cpu;

	epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
	CPU_FOREACH(cpu) {
		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
		bzero(er, sizeof(*er));
		ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
		TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
		er->er_cpuid = cpu;
		er->er_parent = epoch;
	}
}

static void
epoch_adjust_prio(struct thread *td, u_char prio)
{

	thread_lock(td);
	sched_prio(td, prio);
	thread_unlock(td);
}

epoch_t
epoch_alloc(const char *name, int flags)
{
	epoch_t epoch;

	if (__predict_false(!inited))
		panic("%s called too early in boot", __func__);
	epoch = malloc(sizeof(struct epoch), M_EPOCH, M_ZERO | M_WAITOK);
	ck_epoch_init(&epoch->e_epoch);
	epoch_ctor(epoch);
	MPASS(epoch_count < MAX_EPOCHS - 2);
	epoch->e_flags = flags;
	epoch->e_idx = epoch_count;
	epoch->e_name = name;
	sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
	mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);
	allepochs[epoch_count++] = epoch;
	return (epoch);
}

void
epoch_free(epoch_t epoch)
{

	epoch_drain_callbacks(epoch);
	allepochs[epoch->e_idx] = NULL;
	epoch_wait(global_epoch);
	uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
	mtx_destroy(&epoch->e_drain_mtx);
	sx_destroy(&epoch->e_drain_sx);
	free(epoch, M_EPOCH);
}

static epoch_record_t
epoch_currecord(epoch_t epoch)
{

	return (zpcpu_get_cpu(epoch->e_pcpu_record, curcpu));
}

#define INIT_CHECK(epoch)					\
	do {							\
		if (__predict_false((epoch) == NULL))		\
			return;					\
	} while (0)
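
/*
 * Read-side usage sketch for the preemptible entry points below, assuming
 * the epoch_enter_preempt()/epoch_exit_preempt() wrappers from sys/epoch.h.
 * "foo_epoch", "foo_list", f_link and foo_handle() are hypothetical.
 *
 *	struct epoch_tracker et;
 *	struct foo *foo;
 *
 *	epoch_enter_preempt(foo_epoch, &et);
 *	CK_LIST_FOREACH(foo, &foo_list, f_link)
 *		foo_handle(foo);
 *	epoch_exit_preempt(foo_epoch, &et);
 *
 * The tracker must live on the caller's kernel stack; the MPASS() in
 * _epoch_enter_preempt() checks exactly that.
 */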

void
_epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
	struct epoch_record *er;
	struct thread *td;

	MPASS(cold || epoch != NULL);
	MPASS(epoch->e_flags & EPOCH_PREEMPT);
	td = curthread;
	MPASS((vm_offset_t)et >= td->td_kstack &&
	    (vm_offset_t)et + sizeof(struct epoch_tracker) <=
	    td->td_kstack + td->td_kstack_pages * PAGE_SIZE);

	INIT_CHECK(epoch);
#ifdef EPOCH_TRACE
	epoch_trace_enter(td, epoch, et, file, line);
#endif
	et->et_td = td;
	THREAD_NO_SLEEPING();
	critical_enter();
	sched_pin();
	td->td_pre_epoch_prio = td->td_priority;
	er = epoch_currecord(epoch);
	TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
	ck_epoch_begin(&er->er_record, &et->et_section);
	critical_exit();
}

void
epoch_enter(epoch_t epoch)
{
	epoch_record_t er;

	MPASS(cold || epoch != NULL);
	INIT_CHECK(epoch);
	critical_enter();
	er = epoch_currecord(epoch);
	ck_epoch_begin(&er->er_record, NULL);
}

void
_epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
	struct epoch_record *er;
	struct thread *td;

	INIT_CHECK(epoch);
	td = curthread;
	critical_enter();
	sched_unpin();
	THREAD_SLEEPING_OK();
	er = epoch_currecord(epoch);
	MPASS(epoch->e_flags & EPOCH_PREEMPT);
	MPASS(et != NULL);
	MPASS(et->et_td == td);
#ifdef INVARIANTS
	et->et_td = (void*)0xDEADBEEF;
#endif
	ck_epoch_end(&er->er_record, &et->et_section);
	TAILQ_REMOVE(&er->er_tdlist, et, et_link);
	er->er_gen++;
	if (__predict_false(td->td_pre_epoch_prio != td->td_priority))
		epoch_adjust_prio(td, td->td_pre_epoch_prio);
	critical_exit();
#ifdef EPOCH_TRACE
	epoch_trace_exit(td, epoch, et, file, line);
#endif
}

void
epoch_exit(epoch_t epoch)
{
	epoch_record_t er;

	INIT_CHECK(epoch);
	er = epoch_currecord(epoch);
	ck_epoch_end(&er->er_record, NULL);
	critical_exit();
}
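
/*
 * Non-preemptible read-side sketch for epoch_enter()/epoch_exit() above:
 * no tracker is used, but the section runs inside a critical section, so
 * the body must not sleep or block on locks.  "foo_epoch", "foo_list",
 * f_link and foo_handle() are hypothetical.
 *
 *	struct foo *foo;
 *
 *	epoch_enter(foo_epoch);
 *	CK_LIST_FOREACH(foo, &foo_list, f_link)
 *		foo_handle(foo);
 *	epoch_exit(foo_epoch);
 */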

/*
 * epoch_block_handler_preempt() is a callback from the CK code when another
 * thread is currently in an epoch section.
 */
static void
epoch_block_handler_preempt(struct ck_epoch *global __unused,
    ck_epoch_record_t *cr, void *arg __unused)
{
	epoch_record_t record;
	struct thread *td, *owner, *curwaittd;
	struct epoch_tracker *tdwait;
	struct turnstile *ts;
	struct lock_object *lock;
	int spincount, gen;
	int locksheld __unused;

	record = __containerof(cr, struct epoch_record, er_record);
	td = curthread;
	locksheld = td->td_locks;
	spincount = 0;
	counter_u64_add(block_count, 1);
	/*
	 * We lost a race and there are no longer any threads
	 * on the CPU in an epoch section.
	 */
	if (TAILQ_EMPTY(&record->er_tdlist))
		return;

	if (record->er_cpuid != curcpu) {
		/*
		 * If the head of the list is running, we can wait for it
		 * to remove itself from the list and thus save us the
		 * overhead of a migration.
		 */
		gen = record->er_gen;
		thread_unlock(td);
		/*
		 * We can't actually check if the waiting thread is running
		 * so we simply poll for it to exit before giving up and
		 * migrating.
		 */
		do {
			cpu_spinwait();
		} while (!TAILQ_EMPTY(&record->er_tdlist) &&
		    gen == record->er_gen &&
		    spincount++ < MAX_ADAPTIVE_SPIN);
		thread_lock(td);
		/*
		 * If the generation has changed we can poll again,
		 * otherwise we need to migrate.
		 */
		if (gen != record->er_gen)
			return;
		/*
		 * Being on the same CPU as that of the record on which
		 * we need to wait allows us access to the thread
		 * list associated with that CPU. We can then examine the
		 * oldest thread in the queue and wait on its turnstile
		 * until it resumes and so on until a grace period
		 * elapses.
		 */
		counter_u64_add(migrate_count, 1);
		sched_bind(td, record->er_cpuid);
		/*
		 * At this point we need to return to the ck code
		 * to scan to see if a grace period has elapsed.
		 * We can't move on to check the thread list, because
		 * in the meantime new threads may have arrived that
		 * in fact belong to a different epoch.
		 */
		return;
	}
	/*
	 * Try to find a thread in an epoch section on this CPU
	 * waiting on a turnstile. Otherwise find the lowest
	 * priority thread (highest prio value) and drop our priority
	 * to match to allow it to run.
	 */
	TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
		/*
		 * Propagate our priority to any other waiters to prevent us
		 * from starving them. They will have their original priority
		 * restored on exit from epoch_wait().
		 */
		curwaittd = tdwait->et_td;
		if (!TD_IS_INHIBITED(curwaittd) &&
		    curwaittd->td_priority > td->td_priority) {
			critical_enter();
			thread_unlock(td);
			thread_lock(curwaittd);
			sched_prio(curwaittd, td->td_priority);
			thread_unlock(curwaittd);
			thread_lock(td);
			critical_exit();
		}
		if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
		    ((ts = curwaittd->td_blocked) != NULL)) {
			/*
			 * We unlock td to allow turnstile_wait to reacquire
			 * the thread lock. Before unlocking it we enter a
			 * critical section to prevent preemption after we
			 * reenable interrupts by dropping the thread lock in
			 * order to prevent curwaittd from getting to run.
			 */
			critical_enter();
			thread_unlock(td);

			if (turnstile_lock(ts, &lock, &owner)) {
				if (ts == curwaittd->td_blocked) {
					MPASS(TD_IS_INHIBITED(curwaittd) &&
					    TD_ON_LOCK(curwaittd));
					critical_exit();
					turnstile_wait(ts, owner,
					    curwaittd->td_tsqueue);
					counter_u64_add(turnstile_count, 1);
					thread_lock(td);
					return;
				}
				turnstile_unlock(ts, lock);
			}
			thread_lock(td);
			critical_exit();
			KASSERT(td->td_locks == locksheld,
			    ("%d extra locks held", td->td_locks - locksheld));
		}
	}
	/*
	 * We didn't find any threads actually blocked on a lock
	 * so we have nothing to do except context switch away.
	 */
	counter_u64_add(switch_count, 1);
	mi_switch(SW_VOL | SWT_RELINQUISH, NULL);

	/*
	 * Release the thread lock while yielding to
	 * allow other threads to acquire the lock
	 * pointed to by TDQ_LOCKPTR(td). Else a
	 * deadlock-like situation might happen. (HPS)
	 */
	thread_unlock(td);
	thread_lock(td);
}
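
/*
 * Writer-side sketch of how epoch_wait_preempt() below is typically used:
 * unlink the object under its lock so new readers cannot find it, wait for
 * a grace period, then free.  "foo", "foo_list", "foo_lock", "foo_epoch"
 * and M_FOO are hypothetical.
 *
 *	mtx_lock(&foo_lock);
 *	CK_LIST_REMOVE(foo, f_link);
 *	mtx_unlock(&foo_lock);
 *	epoch_wait_preempt(foo_epoch);
 *	free(foo, M_FOO);
 */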

void
epoch_wait_preempt(epoch_t epoch)
{
	struct thread *td;
	int was_bound;
	int old_cpu;
	int old_pinned;
	u_char old_prio;
	int locks __unused;

	MPASS(cold || epoch != NULL);
	INIT_CHECK(epoch);
	td = curthread;
#ifdef INVARIANTS
	locks = curthread->td_locks;
	MPASS(epoch->e_flags & EPOCH_PREEMPT);
	if ((epoch->e_flags & EPOCH_LOCKED) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
		    "epoch_wait() can be long running");
	KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
	    "of an epoch section of the same epoch"));
#endif
	thread_lock(td);
	DROP_GIANT();

	old_cpu = PCPU_GET(cpuid);
	old_pinned = td->td_pinned;
	old_prio = td->td_priority;
	was_bound = sched_is_bound(td);
	sched_unbind(td);
	td->td_pinned = 0;
	sched_bind(td, old_cpu);

	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
	    NULL);

	/* restore CPU binding, if any */
	if (was_bound != 0) {
		sched_bind(td, old_cpu);
	} else {
		/* get thread back to initial CPU, if any */
		if (old_pinned != 0)
			sched_bind(td, old_cpu);
		sched_unbind(td);
	}
	/* restore pinned after bind */
	td->td_pinned = old_pinned;

	/* restore thread priority */
	sched_prio(td, old_prio);
	thread_unlock(td);
	PICKUP_GIANT();
	KASSERT(td->td_locks == locks,
	    ("%d residual locks held", td->td_locks - locks));
}

static void
epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
    void *arg __unused)
{
	cpu_spinwait();
}

void
epoch_wait(epoch_t epoch)
{

	MPASS(cold || epoch != NULL);
	INIT_CHECK(epoch);
	MPASS(epoch->e_flags == 0);
	critical_enter();
	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
	critical_exit();
}

void
epoch_call(epoch_t epoch, epoch_context_t ctx, void (*callback) (epoch_context_t))
{
	epoch_record_t er;
	ck_epoch_entry_t *cb;

	cb = (void *)ctx;

	MPASS(callback);
	/* too early in boot to have epoch set up */
	if (__predict_false(epoch == NULL))
		goto boottime;
#if !defined(EARLY_AP_STARTUP)
	if (__predict_false(inited < 2))
		goto boottime;
#endif

	critical_enter();
	*DPCPU_PTR(epoch_cb_count) += 1;
	er = epoch_currecord(epoch);
	ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
	critical_exit();
	return;
boottime:
	callback(ctx);
}
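
/*
 * Deferred-reclamation sketch for epoch_call() above: the object embeds a
 * struct epoch_context and the callback recovers the object with
 * __containerof() once a grace period has elapsed.  "struct foo",
 * "foo_epoch" and M_FOO are hypothetical.
 *
 *	struct foo {
 *		CK_LIST_ENTRY(foo) f_link;
 *		struct epoch_context f_ctx;
 *	};
 *
 *	static void
 *	foo_destroy(epoch_context_t ctx)
 *	{
 *		struct foo *foo;
 *
 *		foo = __containerof(ctx, struct foo, f_ctx);
 *		free(foo, M_FOO);
 *	}
 *
 *	CK_LIST_REMOVE(foo, f_link);
 *	epoch_call(foo_epoch, &foo->f_ctx, foo_destroy);
 */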

static void
epoch_call_task(void *arg __unused)
{
	ck_stack_entry_t *cursor, *head, *next;
	ck_epoch_record_t *record;
	epoch_record_t er;
	epoch_t epoch;
	ck_stack_t cb_stack;
	int i, npending, total;

	ck_stack_init(&cb_stack);
	critical_enter();
	epoch_enter(global_epoch);
	for (total = i = 0; i < epoch_count; i++) {
		if (__predict_false((epoch = allepochs[i]) == NULL))
			continue;
		er = epoch_currecord(epoch);
		record = &er->er_record;
		if ((npending = record->n_pending) == 0)
			continue;
		ck_epoch_poll_deferred(record, &cb_stack);
		total += npending - record->n_pending;
	}
	epoch_exit(global_epoch);
	*DPCPU_PTR(epoch_cb_count) -= total;
	critical_exit();

	counter_u64_add(epoch_call_count, total);
	counter_u64_add(epoch_call_task_count, 1);

	head = ck_stack_batch_pop_npsc(&cb_stack);
	for (cursor = head; cursor != NULL; cursor = next) {
		struct ck_epoch_entry *entry =
		    ck_epoch_entry_container(cursor);

		next = CK_STACK_NEXT(cursor);
		entry->function(entry);
	}
}

int
in_epoch_verbose(epoch_t epoch, int dump_onfail)
{
	struct epoch_tracker *tdwait;
	struct thread *td;
	epoch_record_t er;

	td = curthread;
	if (THREAD_CAN_SLEEP())
		return (0);
	if (__predict_false((epoch) == NULL))
		return (0);
	critical_enter();
	er = epoch_currecord(epoch);
	TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
		if (tdwait->et_td == td) {
			critical_exit();
			return (1);
		}
#ifdef INVARIANTS
	if (dump_onfail) {
		MPASS(td->td_pinned);
		printf("cpu: %d id: %d\n", curcpu, td->td_tid);
		TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
			printf("td_tid: %d ", tdwait->et_td->td_tid);
		printf("\n");
	}
#endif
	critical_exit();
	return (0);
}

int
in_epoch(epoch_t epoch)
{
	return (in_epoch_verbose(epoch, 0));
}

static void
epoch_drain_cb(struct epoch_context *ctx)
{
	struct epoch *epoch =
	    __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;

	if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
		mtx_lock(&epoch->e_drain_mtx);
		wakeup(epoch);
		mtx_unlock(&epoch->e_drain_mtx);
	}
}

void
epoch_drain_callbacks(epoch_t epoch)
{
	epoch_record_t er;
	struct thread *td;
	int was_bound;
	int old_pinned;
	int old_cpu;
	int cpu;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "epoch_drain_callbacks() may sleep!");

	/* too early in boot to have epoch set up */
	if (__predict_false(epoch == NULL))
		return;
#if !defined(EARLY_AP_STARTUP)
	if (__predict_false(inited < 2))
		return;
#endif
	DROP_GIANT();

	sx_xlock(&epoch->e_drain_sx);
	mtx_lock(&epoch->e_drain_mtx);

	td = curthread;
	thread_lock(td);
	old_cpu = PCPU_GET(cpuid);
	old_pinned = td->td_pinned;
	was_bound = sched_is_bound(td);
	sched_unbind(td);
	td->td_pinned = 0;

	CPU_FOREACH(cpu)
		epoch->e_drain_count++;
	CPU_FOREACH(cpu) {
		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
		sched_bind(td, cpu);
		epoch_call(epoch, &er->er_drain_ctx, &epoch_drain_cb);
	}

	/* restore CPU binding, if any */
	if (was_bound != 0) {
		sched_bind(td, old_cpu);
	} else {
		/* get thread back to initial CPU, if any */
		if (old_pinned != 0)
			sched_bind(td, old_cpu);
		sched_unbind(td);
	}
	/* restore pinned after bind */
	td->td_pinned = old_pinned;

	thread_unlock(td);

	while (epoch->e_drain_count != 0)
		msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);

	mtx_unlock(&epoch->e_drain_mtx);
	sx_xunlock(&epoch->e_drain_sx);

	PICKUP_GIANT();
}
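
/*
 * Teardown sketch: a consumer that allocated its own epoch typically just
 * calls epoch_free() at unload time, which drains outstanding callbacks via
 * epoch_drain_callbacks() above and waits for a grace period before
 * releasing the per-CPU records.  "foo_epoch" and foo_modevent() are
 * hypothetical.
 *
 *	static int
 *	foo_modevent(module_t mod, int type, void *data)
 *	{
 *		switch (type) {
 *		case MOD_LOAD:
 *			foo_epoch = epoch_alloc("foo", EPOCH_PREEMPT);
 *			break;
 *		case MOD_UNLOAD:
 *			epoch_free(foo_epoch);
 *			break;
 *		}
 *		return (0);
 *	}
 */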