1 /*- 2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD 3 * 4 * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org> 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 * 27 */ 28 29 #include <sys/cdefs.h> 30 __FBSDID("$FreeBSD$"); 31 32 #include <sys/param.h> 33 #include <sys/systm.h> 34 #include <sys/counter.h> 35 #include <sys/epoch.h> 36 #include <sys/gtaskqueue.h> 37 #include <sys/kernel.h> 38 #include <sys/limits.h> 39 #include <sys/lock.h> 40 #include <sys/malloc.h> 41 #include <sys/mutex.h> 42 #include <sys/pcpu.h> 43 #include <sys/proc.h> 44 #include <sys/sched.h> 45 #include <sys/sx.h> 46 #include <sys/smp.h> 47 #include <sys/sysctl.h> 48 #include <sys/turnstile.h> 49 #ifdef EPOCH_TRACE 50 #include <machine/stdarg.h> 51 #include <sys/stack.h> 52 #include <sys/tree.h> 53 #endif 54 #include <vm/vm.h> 55 #include <vm/vm_extern.h> 56 #include <vm/vm_kern.h> 57 #include <vm/uma.h> 58 59 #include <ck_epoch.h> 60 61 static MALLOC_DEFINE(M_EPOCH, "epoch", "epoch based reclamation"); 62 63 #ifdef __amd64__ 64 #define EPOCH_ALIGN CACHE_LINE_SIZE*2 65 #else 66 #define EPOCH_ALIGN CACHE_LINE_SIZE 67 #endif 68 69 TAILQ_HEAD (epoch_tdlist, epoch_tracker); 70 typedef struct epoch_record { 71 ck_epoch_record_t er_record; 72 struct epoch_context er_drain_ctx; 73 struct epoch *er_parent; 74 volatile struct epoch_tdlist er_tdlist; 75 volatile uint32_t er_gen; 76 uint32_t er_cpuid; 77 } __aligned(EPOCH_ALIGN) *epoch_record_t; 78 79 struct epoch { 80 struct ck_epoch e_epoch __aligned(EPOCH_ALIGN); 81 epoch_record_t e_pcpu_record; 82 int e_idx; 83 int e_flags; 84 struct sx e_drain_sx; 85 struct mtx e_drain_mtx; 86 volatile int e_drain_count; 87 const char *e_name; 88 }; 89 90 /* arbitrary --- needs benchmarking */ 91 #define MAX_ADAPTIVE_SPIN 100 92 #define MAX_EPOCHS 64 93 94 CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context)); 95 SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW, 0, "epoch information"); 96 SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW, 0, "epoch stats"); 97 98 /* Stats. */ 99 static counter_u64_t block_count; 100 101 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW, 102 &block_count, "# of times a thread was in an epoch when epoch_wait was called"); 103 static counter_u64_t migrate_count; 104 105 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW, 106 &migrate_count, "# of times thread was migrated to another CPU in epoch_wait"); 107 static counter_u64_t turnstile_count; 108 109 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW, 110 &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait"); 111 static counter_u64_t switch_count; 112 113 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW, 114 &switch_count, "# of times a thread voluntarily context switched in epoch_wait"); 115 static counter_u64_t epoch_call_count; 116 117 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW, 118 &epoch_call_count, "# of times a callback was deferred"); 119 static counter_u64_t epoch_call_task_count; 120 121 SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW, 122 &epoch_call_task_count, "# of times a callback task was run"); 123 124 TAILQ_HEAD (threadlist, thread); 125 126 CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry, 127 ck_epoch_entry_container) 128 129 epoch_t allepochs[MAX_EPOCHS]; 130 131 DPCPU_DEFINE(struct grouptask, epoch_cb_task); 132 DPCPU_DEFINE(int, epoch_cb_count); 133 134 static __read_mostly int inited; 135 static __read_mostly int epoch_count; 136 __read_mostly epoch_t global_epoch; 137 __read_mostly epoch_t global_epoch_preempt; 138 139 static void epoch_call_task(void *context __unused); 140 static uma_zone_t pcpu_zone_record; 141 142 #ifdef EPOCH_TRACE 143 struct stackentry { 144 RB_ENTRY(stackentry) se_node; 145 struct stack se_stack; 146 }; 147 148 static int 149 stackentry_compare(struct stackentry *a, struct stackentry *b) 150 { 151 152 if (a->se_stack.depth > b->se_stack.depth) 153 return (1); 154 if (a->se_stack.depth < b->se_stack.depth) 155 return (-1); 156 for (int i = 0; i < a->se_stack.depth; i++) { 157 if (a->se_stack.pcs[i] > b->se_stack.pcs[i]) 158 return (1); 159 if (a->se_stack.pcs[i] < b->se_stack.pcs[i]) 160 return (-1); 161 } 162 163 return (0); 164 } 165 166 RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks); 167 RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare); 168 169 static struct mtx epoch_stacks_lock; 170 MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF); 171 172 static bool epoch_trace_stack_print = true; 173 SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN, 174 &epoch_trace_stack_print, 0, "Print stack traces on epoch reports"); 175 176 static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2); 177 static inline void 178 epoch_trace_report(const char *fmt, ...) 179 { 180 va_list ap; 181 struct stackentry se, *new; 182 183 stack_zero(&se.se_stack); /* XXX: is it really needed? */ 184 stack_save(&se.se_stack); 185 186 /* Tree is never reduced - go lockless. */ 187 if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL) 188 return; 189 190 new = malloc(sizeof(*new), M_STACK, M_NOWAIT); 191 if (new != NULL) { 192 bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack)); 193 194 mtx_lock(&epoch_stacks_lock); 195 new = RB_INSERT(stacktree, &epoch_stacks, new); 196 mtx_unlock(&epoch_stacks_lock); 197 if (new != NULL) 198 free(new, M_STACK); 199 } 200 201 va_start(ap, fmt); 202 (void)vprintf(fmt, ap); 203 va_end(ap); 204 if (epoch_trace_stack_print) 205 stack_print_ddb(&se.se_stack); 206 } 207 208 static inline void 209 epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et, 210 const char *file, int line) 211 { 212 epoch_tracker_t iet; 213 214 SLIST_FOREACH(iet, &td->td_epochs, et_tlink) 215 if (iet->et_epoch == epoch) 216 epoch_trace_report("Recursively entering epoch %s " 217 "at %s:%d, previously entered at %s:%d\n", 218 epoch->e_name, file, line, 219 iet->et_file, iet->et_line); 220 et->et_epoch = epoch; 221 et->et_file = file; 222 et->et_line = line; 223 SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink); 224 } 225 226 static inline void 227 epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et, 228 const char *file, int line) 229 { 230 231 if (SLIST_FIRST(&td->td_epochs) != et) { 232 epoch_trace_report("Exiting epoch %s in a not nested order " 233 "at %s:%d. Most recently entered %s at %s:%d\n", 234 epoch->e_name, 235 file, line, 236 SLIST_FIRST(&td->td_epochs)->et_epoch->e_name, 237 SLIST_FIRST(&td->td_epochs)->et_file, 238 SLIST_FIRST(&td->td_epochs)->et_line); 239 /* This will panic if et is not anywhere on td_epochs. */ 240 SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink); 241 } else 242 SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink); 243 } 244 245 /* Used by assertions that check thread state before going to sleep. */ 246 void 247 epoch_trace_list(struct thread *td) 248 { 249 epoch_tracker_t iet; 250 251 SLIST_FOREACH(iet, &td->td_epochs, et_tlink) 252 printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name, 253 iet->et_file, iet->et_line); 254 } 255 #endif /* EPOCH_TRACE */ 256 257 static void 258 epoch_init(void *arg __unused) 259 { 260 int cpu; 261 262 block_count = counter_u64_alloc(M_WAITOK); 263 migrate_count = counter_u64_alloc(M_WAITOK); 264 turnstile_count = counter_u64_alloc(M_WAITOK); 265 switch_count = counter_u64_alloc(M_WAITOK); 266 epoch_call_count = counter_u64_alloc(M_WAITOK); 267 epoch_call_task_count = counter_u64_alloc(M_WAITOK); 268 269 pcpu_zone_record = uma_zcreate("epoch_record pcpu", 270 sizeof(struct epoch_record), NULL, NULL, NULL, NULL, 271 UMA_ALIGN_PTR, UMA_ZONE_PCPU); 272 CPU_FOREACH(cpu) { 273 GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0, 274 epoch_call_task, NULL); 275 taskqgroup_attach_cpu(qgroup_softirq, 276 DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL, 277 "epoch call task"); 278 } 279 #ifdef EPOCH_TRACE 280 SLIST_INIT(&thread0.td_epochs); 281 #endif 282 inited = 1; 283 global_epoch = epoch_alloc("Global", 0); 284 global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT); 285 } 286 SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL); 287 288 #if !defined(EARLY_AP_STARTUP) 289 static void 290 epoch_init_smp(void *dummy __unused) 291 { 292 inited = 2; 293 } 294 SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL); 295 #endif 296 297 static void 298 epoch_ctor(epoch_t epoch) 299 { 300 epoch_record_t er; 301 int cpu; 302 303 epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK); 304 CPU_FOREACH(cpu) { 305 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); 306 bzero(er, sizeof(*er)); 307 ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL); 308 TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist); 309 er->er_cpuid = cpu; 310 er->er_parent = epoch; 311 } 312 } 313 314 static void 315 epoch_adjust_prio(struct thread *td, u_char prio) 316 { 317 318 thread_lock(td); 319 sched_prio(td, prio); 320 thread_unlock(td); 321 } 322 323 epoch_t 324 epoch_alloc(const char *name, int flags) 325 { 326 epoch_t epoch; 327 328 if (__predict_false(!inited)) 329 panic("%s called too early in boot", __func__); 330 epoch = malloc(sizeof(struct epoch), M_EPOCH, M_ZERO | M_WAITOK); 331 ck_epoch_init(&epoch->e_epoch); 332 epoch_ctor(epoch); 333 MPASS(epoch_count < MAX_EPOCHS - 2); 334 epoch->e_flags = flags; 335 epoch->e_idx = epoch_count; 336 epoch->e_name = name; 337 sx_init(&epoch->e_drain_sx, "epoch-drain-sx"); 338 mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF); 339 allepochs[epoch_count++] = epoch; 340 return (epoch); 341 } 342 343 void 344 epoch_free(epoch_t epoch) 345 { 346 347 epoch_drain_callbacks(epoch); 348 allepochs[epoch->e_idx] = NULL; 349 epoch_wait(global_epoch); 350 uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record); 351 mtx_destroy(&epoch->e_drain_mtx); 352 sx_destroy(&epoch->e_drain_sx); 353 free(epoch, M_EPOCH); 354 } 355 356 static epoch_record_t 357 epoch_currecord(epoch_t epoch) 358 { 359 360 return (zpcpu_get_cpu(epoch->e_pcpu_record, curcpu)); 361 } 362 363 #define INIT_CHECK(epoch) \ 364 do { \ 365 if (__predict_false((epoch) == NULL)) \ 366 return; \ 367 } while (0) 368 369 void 370 _epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) 371 { 372 struct epoch_record *er; 373 struct thread *td; 374 375 MPASS(cold || epoch != NULL); 376 MPASS(epoch->e_flags & EPOCH_PREEMPT); 377 td = curthread; 378 MPASS((vm_offset_t)et >= td->td_kstack && 379 (vm_offset_t)et + sizeof(struct epoch_tracker) <= 380 td->td_kstack + td->td_kstack_pages * PAGE_SIZE); 381 382 INIT_CHECK(epoch); 383 #ifdef EPOCH_TRACE 384 epoch_trace_enter(td, epoch, et, file, line); 385 #endif 386 et->et_td = td; 387 THREAD_NO_SLEEPING(); 388 critical_enter(); 389 sched_pin(); 390 td->td_pre_epoch_prio = td->td_priority; 391 er = epoch_currecord(epoch); 392 TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link); 393 ck_epoch_begin(&er->er_record, &et->et_section); 394 critical_exit(); 395 } 396 397 void 398 epoch_enter(epoch_t epoch) 399 { 400 epoch_record_t er; 401 402 MPASS(cold || epoch != NULL); 403 INIT_CHECK(epoch); 404 critical_enter(); 405 er = epoch_currecord(epoch); 406 ck_epoch_begin(&er->er_record, NULL); 407 } 408 409 void 410 _epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE) 411 { 412 struct epoch_record *er; 413 struct thread *td; 414 415 INIT_CHECK(epoch); 416 td = curthread; 417 critical_enter(); 418 sched_unpin(); 419 THREAD_SLEEPING_OK(); 420 er = epoch_currecord(epoch); 421 MPASS(epoch->e_flags & EPOCH_PREEMPT); 422 MPASS(et != NULL); 423 MPASS(et->et_td == td); 424 #ifdef INVARIANTS 425 et->et_td = (void*)0xDEADBEEF; 426 #endif 427 ck_epoch_end(&er->er_record, &et->et_section); 428 TAILQ_REMOVE(&er->er_tdlist, et, et_link); 429 er->er_gen++; 430 if (__predict_false(td->td_pre_epoch_prio != td->td_priority)) 431 epoch_adjust_prio(td, td->td_pre_epoch_prio); 432 critical_exit(); 433 #ifdef EPOCH_TRACE 434 epoch_trace_exit(td, epoch, et, file, line); 435 #endif 436 } 437 438 void 439 epoch_exit(epoch_t epoch) 440 { 441 epoch_record_t er; 442 443 INIT_CHECK(epoch); 444 er = epoch_currecord(epoch); 445 ck_epoch_end(&er->er_record, NULL); 446 critical_exit(); 447 } 448 449 /* 450 * epoch_block_handler_preempt() is a callback from the CK code when another 451 * thread is currently in an epoch section. 452 */ 453 static void 454 epoch_block_handler_preempt(struct ck_epoch *global __unused, 455 ck_epoch_record_t *cr, void *arg __unused) 456 { 457 epoch_record_t record; 458 struct thread *td, *owner, *curwaittd; 459 struct epoch_tracker *tdwait; 460 struct turnstile *ts; 461 struct lock_object *lock; 462 int spincount, gen; 463 int locksheld __unused; 464 465 record = __containerof(cr, struct epoch_record, er_record); 466 td = curthread; 467 locksheld = td->td_locks; 468 spincount = 0; 469 counter_u64_add(block_count, 1); 470 /* 471 * We lost a race and there's no longer any threads 472 * on the CPU in an epoch section. 473 */ 474 if (TAILQ_EMPTY(&record->er_tdlist)) 475 return; 476 477 if (record->er_cpuid != curcpu) { 478 /* 479 * If the head of the list is running, we can wait for it 480 * to remove itself from the list and thus save us the 481 * overhead of a migration 482 */ 483 gen = record->er_gen; 484 thread_unlock(td); 485 /* 486 * We can't actually check if the waiting thread is running 487 * so we simply poll for it to exit before giving up and 488 * migrating. 489 */ 490 do { 491 cpu_spinwait(); 492 } while (!TAILQ_EMPTY(&record->er_tdlist) && 493 gen == record->er_gen && 494 spincount++ < MAX_ADAPTIVE_SPIN); 495 thread_lock(td); 496 /* 497 * If the generation has changed we can poll again 498 * otherwise we need to migrate. 499 */ 500 if (gen != record->er_gen) 501 return; 502 /* 503 * Being on the same CPU as that of the record on which 504 * we need to wait allows us access to the thread 505 * list associated with that CPU. We can then examine the 506 * oldest thread in the queue and wait on its turnstile 507 * until it resumes and so on until a grace period 508 * elapses. 509 * 510 */ 511 counter_u64_add(migrate_count, 1); 512 sched_bind(td, record->er_cpuid); 513 /* 514 * At this point we need to return to the ck code 515 * to scan to see if a grace period has elapsed. 516 * We can't move on to check the thread list, because 517 * in the meantime new threads may have arrived that 518 * in fact belong to a different epoch. 519 */ 520 return; 521 } 522 /* 523 * Try to find a thread in an epoch section on this CPU 524 * waiting on a turnstile. Otherwise find the lowest 525 * priority thread (highest prio value) and drop our priority 526 * to match to allow it to run. 527 */ 528 TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) { 529 /* 530 * Propagate our priority to any other waiters to prevent us 531 * from starving them. They will have their original priority 532 * restore on exit from epoch_wait(). 533 */ 534 curwaittd = tdwait->et_td; 535 if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) { 536 critical_enter(); 537 thread_unlock(td); 538 thread_lock(curwaittd); 539 sched_prio(curwaittd, td->td_priority); 540 thread_unlock(curwaittd); 541 thread_lock(td); 542 critical_exit(); 543 } 544 if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) && 545 ((ts = curwaittd->td_blocked) != NULL)) { 546 /* 547 * We unlock td to allow turnstile_wait to reacquire 548 * the thread lock. Before unlocking it we enter a 549 * critical section to prevent preemption after we 550 * reenable interrupts by dropping the thread lock in 551 * order to prevent curwaittd from getting to run. 552 */ 553 critical_enter(); 554 thread_unlock(td); 555 556 if (turnstile_lock(ts, &lock, &owner)) { 557 if (ts == curwaittd->td_blocked) { 558 MPASS(TD_IS_INHIBITED(curwaittd) && 559 TD_ON_LOCK(curwaittd)); 560 critical_exit(); 561 turnstile_wait(ts, owner, 562 curwaittd->td_tsqueue); 563 counter_u64_add(turnstile_count, 1); 564 thread_lock(td); 565 return; 566 } 567 turnstile_unlock(ts, lock); 568 } 569 thread_lock(td); 570 critical_exit(); 571 KASSERT(td->td_locks == locksheld, 572 ("%d extra locks held", td->td_locks - locksheld)); 573 } 574 } 575 /* 576 * We didn't find any threads actually blocked on a lock 577 * so we have nothing to do except context switch away. 578 */ 579 counter_u64_add(switch_count, 1); 580 mi_switch(SW_VOL | SWT_RELINQUISH); 581 /* 582 * It is important the thread lock is dropped while yielding 583 * to allow other threads to acquire the lock pointed to by 584 * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the 585 * thread lock before returning. Else a deadlock like 586 * situation might happen. 587 */ 588 thread_lock(td); 589 } 590 591 void 592 epoch_wait_preempt(epoch_t epoch) 593 { 594 struct thread *td; 595 int was_bound; 596 int old_cpu; 597 int old_pinned; 598 u_char old_prio; 599 int locks __unused; 600 601 MPASS(cold || epoch != NULL); 602 INIT_CHECK(epoch); 603 td = curthread; 604 #ifdef INVARIANTS 605 locks = curthread->td_locks; 606 MPASS(epoch->e_flags & EPOCH_PREEMPT); 607 if ((epoch->e_flags & EPOCH_LOCKED) == 0) 608 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 609 "epoch_wait() can be long running"); 610 KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle " 611 "of an epoch section of the same epoch")); 612 #endif 613 DROP_GIANT(); 614 thread_lock(td); 615 616 old_cpu = PCPU_GET(cpuid); 617 old_pinned = td->td_pinned; 618 old_prio = td->td_priority; 619 was_bound = sched_is_bound(td); 620 sched_unbind(td); 621 td->td_pinned = 0; 622 sched_bind(td, old_cpu); 623 624 ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt, 625 NULL); 626 627 /* restore CPU binding, if any */ 628 if (was_bound != 0) { 629 sched_bind(td, old_cpu); 630 } else { 631 /* get thread back to initial CPU, if any */ 632 if (old_pinned != 0) 633 sched_bind(td, old_cpu); 634 sched_unbind(td); 635 } 636 /* restore pinned after bind */ 637 td->td_pinned = old_pinned; 638 639 /* restore thread priority */ 640 sched_prio(td, old_prio); 641 thread_unlock(td); 642 PICKUP_GIANT(); 643 KASSERT(td->td_locks == locks, 644 ("%d residual locks held", td->td_locks - locks)); 645 } 646 647 static void 648 epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused, 649 void *arg __unused) 650 { 651 cpu_spinwait(); 652 } 653 654 void 655 epoch_wait(epoch_t epoch) 656 { 657 658 MPASS(cold || epoch != NULL); 659 INIT_CHECK(epoch); 660 MPASS(epoch->e_flags == 0); 661 critical_enter(); 662 ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL); 663 critical_exit(); 664 } 665 666 void 667 epoch_call(epoch_t epoch, epoch_callback_t callback, epoch_context_t ctx) 668 { 669 epoch_record_t er; 670 ck_epoch_entry_t *cb; 671 672 cb = (void *)ctx; 673 674 MPASS(callback); 675 /* too early in boot to have epoch set up */ 676 if (__predict_false(epoch == NULL)) 677 goto boottime; 678 #if !defined(EARLY_AP_STARTUP) 679 if (__predict_false(inited < 2)) 680 goto boottime; 681 #endif 682 683 critical_enter(); 684 *DPCPU_PTR(epoch_cb_count) += 1; 685 er = epoch_currecord(epoch); 686 ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback); 687 critical_exit(); 688 return; 689 boottime: 690 callback(ctx); 691 } 692 693 static void 694 epoch_call_task(void *arg __unused) 695 { 696 ck_stack_entry_t *cursor, *head, *next; 697 ck_epoch_record_t *record; 698 epoch_record_t er; 699 epoch_t epoch; 700 ck_stack_t cb_stack; 701 int i, npending, total; 702 703 ck_stack_init(&cb_stack); 704 critical_enter(); 705 epoch_enter(global_epoch); 706 for (total = i = 0; i < epoch_count; i++) { 707 if (__predict_false((epoch = allepochs[i]) == NULL)) 708 continue; 709 er = epoch_currecord(epoch); 710 record = &er->er_record; 711 if ((npending = record->n_pending) == 0) 712 continue; 713 ck_epoch_poll_deferred(record, &cb_stack); 714 total += npending - record->n_pending; 715 } 716 epoch_exit(global_epoch); 717 *DPCPU_PTR(epoch_cb_count) -= total; 718 critical_exit(); 719 720 counter_u64_add(epoch_call_count, total); 721 counter_u64_add(epoch_call_task_count, 1); 722 723 head = ck_stack_batch_pop_npsc(&cb_stack); 724 for (cursor = head; cursor != NULL; cursor = next) { 725 struct ck_epoch_entry *entry = 726 ck_epoch_entry_container(cursor); 727 728 next = CK_STACK_NEXT(cursor); 729 entry->function(entry); 730 } 731 } 732 733 int 734 in_epoch_verbose(epoch_t epoch, int dump_onfail) 735 { 736 struct epoch_tracker *tdwait; 737 struct thread *td; 738 epoch_record_t er; 739 740 td = curthread; 741 if (THREAD_CAN_SLEEP()) 742 return (0); 743 if (__predict_false((epoch) == NULL)) 744 return (0); 745 critical_enter(); 746 er = epoch_currecord(epoch); 747 TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) 748 if (tdwait->et_td == td) { 749 critical_exit(); 750 return (1); 751 } 752 #ifdef INVARIANTS 753 if (dump_onfail) { 754 MPASS(td->td_pinned); 755 printf("cpu: %d id: %d\n", curcpu, td->td_tid); 756 TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link) 757 printf("td_tid: %d ", tdwait->et_td->td_tid); 758 printf("\n"); 759 } 760 #endif 761 critical_exit(); 762 return (0); 763 } 764 765 int 766 in_epoch(epoch_t epoch) 767 { 768 return (in_epoch_verbose(epoch, 0)); 769 } 770 771 static void 772 epoch_drain_cb(struct epoch_context *ctx) 773 { 774 struct epoch *epoch = 775 __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent; 776 777 if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) { 778 mtx_lock(&epoch->e_drain_mtx); 779 wakeup(epoch); 780 mtx_unlock(&epoch->e_drain_mtx); 781 } 782 } 783 784 void 785 epoch_drain_callbacks(epoch_t epoch) 786 { 787 epoch_record_t er; 788 struct thread *td; 789 int was_bound; 790 int old_pinned; 791 int old_cpu; 792 int cpu; 793 794 WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL, 795 "epoch_drain_callbacks() may sleep!"); 796 797 /* too early in boot to have epoch set up */ 798 if (__predict_false(epoch == NULL)) 799 return; 800 #if !defined(EARLY_AP_STARTUP) 801 if (__predict_false(inited < 2)) 802 return; 803 #endif 804 DROP_GIANT(); 805 806 sx_xlock(&epoch->e_drain_sx); 807 mtx_lock(&epoch->e_drain_mtx); 808 809 td = curthread; 810 thread_lock(td); 811 old_cpu = PCPU_GET(cpuid); 812 old_pinned = td->td_pinned; 813 was_bound = sched_is_bound(td); 814 sched_unbind(td); 815 td->td_pinned = 0; 816 817 CPU_FOREACH(cpu) 818 epoch->e_drain_count++; 819 CPU_FOREACH(cpu) { 820 er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu); 821 sched_bind(td, cpu); 822 epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx); 823 } 824 825 /* restore CPU binding, if any */ 826 if (was_bound != 0) { 827 sched_bind(td, old_cpu); 828 } else { 829 /* get thread back to initial CPU, if any */ 830 if (old_pinned != 0) 831 sched_bind(td, old_cpu); 832 sched_unbind(td); 833 } 834 /* restore pinned after bind */ 835 td->td_pinned = old_pinned; 836 837 thread_unlock(td); 838 839 while (epoch->e_drain_count != 0) 840 msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0); 841 842 mtx_unlock(&epoch->e_drain_mtx); 843 sx_xunlock(&epoch->e_drain_sx); 844 845 PICKUP_GIANT(); 846 } 847