/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2018, Matthew Macy <mmacy@freebsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/counter.h>
#include <sys/epoch.h>
#include <sys/gtaskqueue.h>
#include <sys/kernel.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/sched.h>
#include <sys/sx.h>
#include <sys/smp.h>
#include <sys/sysctl.h>
#include <sys/turnstile.h>
#ifdef EPOCH_TRACE
#include <machine/stdarg.h>
#include <sys/stack.h>
#include <sys/tree.h>
#endif
#include <vm/vm.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/uma.h>

#include <ck_epoch.h>

#ifdef __amd64__
#define EPOCH_ALIGN CACHE_LINE_SIZE*2
#else
#define EPOCH_ALIGN CACHE_LINE_SIZE
#endif

TAILQ_HEAD (epoch_tdlist, epoch_tracker);
typedef struct epoch_record {
	ck_epoch_record_t er_record;
	struct epoch_context er_drain_ctx;
	struct epoch *er_parent;
	volatile struct epoch_tdlist er_tdlist;
	volatile uint32_t er_gen;
	uint32_t er_cpuid;
#ifdef INVARIANTS
	/* Used to verify record ownership for non-preemptible epochs. */
	struct thread *er_td;
#endif
} __aligned(EPOCH_ALIGN) *epoch_record_t;

struct epoch {
	struct ck_epoch e_epoch __aligned(EPOCH_ALIGN);
	epoch_record_t e_pcpu_record;
	int e_in_use;
	int e_flags;
	struct sx e_drain_sx;
	struct mtx e_drain_mtx;
	volatile int e_drain_count;
	const char *e_name;
};
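
/*
 * Illustrative usage sketch (not part of this file; the "foo" names are
 * hypothetical).  A preemptible epoch protects read-mostly data:
 *
 *	epoch_t foo_epoch = epoch_alloc("foo", EPOCH_PREEMPT);
 *
 *	struct epoch_tracker et;
 *	epoch_enter_preempt(foo_epoch, &et);
 *	... read epoch-protected data ...
 *	epoch_exit_preempt(foo_epoch, &et);
 *
 * A writer first unlinks an object so that no new reader can find it, and
 * then either waits for the current readers with epoch_wait_preempt()
 * before freeing it, or defers the free with epoch_call().
 */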

/* arbitrary --- needs benchmarking */
#define MAX_ADAPTIVE_SPIN 100
#define MAX_EPOCHS 64

CTASSERT(sizeof(ck_epoch_entry_t) == sizeof(struct epoch_context));
SYSCTL_NODE(_kern, OID_AUTO, epoch, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "epoch information");
SYSCTL_NODE(_kern_epoch, OID_AUTO, stats, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "epoch stats");

/* Stats. */
static counter_u64_t block_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, nblocked, CTLFLAG_RW,
    &block_count, "# of times a thread was in an epoch when epoch_wait was called");
static counter_u64_t migrate_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, migrations, CTLFLAG_RW,
    &migrate_count, "# of times thread was migrated to another CPU in epoch_wait");
static counter_u64_t turnstile_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, ncontended, CTLFLAG_RW,
    &turnstile_count, "# of times a thread was blocked on a lock in an epoch during an epoch_wait");
static counter_u64_t switch_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, switches, CTLFLAG_RW,
    &switch_count, "# of times a thread voluntarily context switched in epoch_wait");
static counter_u64_t epoch_call_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_calls, CTLFLAG_RW,
    &epoch_call_count, "# of times a callback was deferred");
static counter_u64_t epoch_call_task_count;

SYSCTL_COUNTER_U64(_kern_epoch_stats, OID_AUTO, epoch_call_tasks, CTLFLAG_RW,
    &epoch_call_task_count, "# of times a callback task was run");
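
/*
 * The statistics above are exported under the kern.epoch.stats sysctl
 * tree and can be inspected at run time, e.g. (illustrative):
 *
 *	# sysctl kern.epoch.stats
 */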

TAILQ_HEAD (threadlist, thread);

CK_STACK_CONTAINER(struct ck_epoch_entry, stack_entry,
    ck_epoch_entry_container)

static struct epoch epoch_array[MAX_EPOCHS];

DPCPU_DEFINE(struct grouptask, epoch_cb_task);
DPCPU_DEFINE(int, epoch_cb_count);

static __read_mostly int inited;
__read_mostly epoch_t global_epoch;
__read_mostly epoch_t global_epoch_preempt;

static void epoch_call_task(void *context __unused);
static uma_zone_t pcpu_zone_record;

static struct sx epoch_sx;

#define EPOCH_LOCK() sx_xlock(&epoch_sx)
#define EPOCH_UNLOCK() sx_xunlock(&epoch_sx)

#ifdef EPOCH_TRACE
struct stackentry {
	RB_ENTRY(stackentry) se_node;
	struct stack se_stack;
};

static int
stackentry_compare(struct stackentry *a, struct stackentry *b)
{

	if (a->se_stack.depth > b->se_stack.depth)
		return (1);
	if (a->se_stack.depth < b->se_stack.depth)
		return (-1);
	for (int i = 0; i < a->se_stack.depth; i++) {
		if (a->se_stack.pcs[i] > b->se_stack.pcs[i])
			return (1);
		if (a->se_stack.pcs[i] < b->se_stack.pcs[i])
			return (-1);
	}

	return (0);
}

RB_HEAD(stacktree, stackentry) epoch_stacks = RB_INITIALIZER(&epoch_stacks);
RB_GENERATE_STATIC(stacktree, stackentry, se_node, stackentry_compare);

static struct mtx epoch_stacks_lock;
MTX_SYSINIT(epochstacks, &epoch_stacks_lock, "epoch_stacks", MTX_DEF);

static bool epoch_trace_stack_print = true;
SYSCTL_BOOL(_kern_epoch, OID_AUTO, trace_stack_print, CTLFLAG_RWTUN,
    &epoch_trace_stack_print, 0, "Print stack traces on epoch reports");

static void epoch_trace_report(const char *fmt, ...) __printflike(1, 2);
static inline void
epoch_trace_report(const char *fmt, ...)
{
	va_list ap;
	struct stackentry se, *new;

	stack_zero(&se.se_stack);	/* XXX: is it really needed? */
	stack_save(&se.se_stack);

	/* Tree is never reduced - go lockless. */
	if (RB_FIND(stacktree, &epoch_stacks, &se) != NULL)
		return;

	new = malloc(sizeof(*new), M_STACK, M_NOWAIT);
	if (new != NULL) {
		bcopy(&se.se_stack, &new->se_stack, sizeof(struct stack));

		mtx_lock(&epoch_stacks_lock);
		new = RB_INSERT(stacktree, &epoch_stacks, new);
		mtx_unlock(&epoch_stacks_lock);
		if (new != NULL)
			free(new, M_STACK);
	}

	va_start(ap, fmt);
	(void)vprintf(fmt, ap);
	va_end(ap);
	if (epoch_trace_stack_print)
		stack_print_ddb(&se.se_stack);
}

static inline void
epoch_trace_enter(struct thread *td, epoch_t epoch, epoch_tracker_t et,
    const char *file, int line)
{
	epoch_tracker_t iet;

	SLIST_FOREACH(iet, &td->td_epochs, et_tlink) {
		if (iet->et_epoch != epoch)
			continue;
		epoch_trace_report("Recursively entering epoch %s "
		    "at %s:%d, previously entered at %s:%d\n",
		    epoch->e_name, file, line,
		    iet->et_file, iet->et_line);
	}
	et->et_epoch = epoch;
	et->et_file = file;
	et->et_line = line;
	SLIST_INSERT_HEAD(&td->td_epochs, et, et_tlink);
}

static inline void
epoch_trace_exit(struct thread *td, epoch_t epoch, epoch_tracker_t et,
    const char *file, int line)
{

	if (SLIST_FIRST(&td->td_epochs) != et) {
		epoch_trace_report("Exiting epoch %s in a not nested order "
		    "at %s:%d. Most recently entered %s at %s:%d\n",
		    epoch->e_name,
		    file, line,
		    SLIST_FIRST(&td->td_epochs)->et_epoch->e_name,
		    SLIST_FIRST(&td->td_epochs)->et_file,
		    SLIST_FIRST(&td->td_epochs)->et_line);
		/* This will panic if et is not anywhere on td_epochs. */
		SLIST_REMOVE(&td->td_epochs, et, epoch_tracker, et_tlink);
	} else
		SLIST_REMOVE_HEAD(&td->td_epochs, et_tlink);
}
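
/*
 * Illustrative sequence of the misuse that epoch_trace_exit() reports
 * (not part of this file; e1/e2 and the trackers are hypothetical):
 *
 *	epoch_enter_preempt(e1, &et1);
 *	epoch_enter_preempt(e2, &et2);
 *	epoch_exit_preempt(e1, &et1);	<- exits in non-LIFO order, reported
 *	epoch_exit_preempt(e2, &et2);
 */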

/* Used by assertions that check thread state before going to sleep. */
void
epoch_trace_list(struct thread *td)
{
	epoch_tracker_t iet;

	SLIST_FOREACH(iet, &td->td_epochs, et_tlink)
		printf("Epoch %s entered at %s:%d\n", iet->et_epoch->e_name,
		    iet->et_file, iet->et_line);
}
#endif /* EPOCH_TRACE */

static void
epoch_init(void *arg __unused)
{
	int cpu;

	block_count = counter_u64_alloc(M_WAITOK);
	migrate_count = counter_u64_alloc(M_WAITOK);
	turnstile_count = counter_u64_alloc(M_WAITOK);
	switch_count = counter_u64_alloc(M_WAITOK);
	epoch_call_count = counter_u64_alloc(M_WAITOK);
	epoch_call_task_count = counter_u64_alloc(M_WAITOK);

	pcpu_zone_record = uma_zcreate("epoch_record pcpu",
	    sizeof(struct epoch_record), NULL, NULL, NULL, NULL,
	    UMA_ALIGN_PTR, UMA_ZONE_PCPU);
	CPU_FOREACH(cpu) {
		GROUPTASK_INIT(DPCPU_ID_PTR(cpu, epoch_cb_task), 0,
		    epoch_call_task, NULL);
		taskqgroup_attach_cpu(qgroup_softirq,
		    DPCPU_ID_PTR(cpu, epoch_cb_task), NULL, cpu, NULL, NULL,
		    "epoch call task");
	}
#ifdef EPOCH_TRACE
	SLIST_INIT(&thread0.td_epochs);
#endif
	sx_init(&epoch_sx, "epoch-sx");
	inited = 1;
	global_epoch = epoch_alloc("Global", 0);
	global_epoch_preempt = epoch_alloc("Global preemptible", EPOCH_PREEMPT);
}
SYSINIT(epoch, SI_SUB_EPOCH, SI_ORDER_FIRST, epoch_init, NULL);

#if !defined(EARLY_AP_STARTUP)
static void
epoch_init_smp(void *dummy __unused)
{
	inited = 2;
}
SYSINIT(epoch_smp, SI_SUB_SMP + 1, SI_ORDER_FIRST, epoch_init_smp, NULL);
#endif

static void
epoch_ctor(epoch_t epoch)
{
	epoch_record_t er;
	int cpu;

	epoch->e_pcpu_record = uma_zalloc_pcpu(pcpu_zone_record, M_WAITOK);
	CPU_FOREACH(cpu) {
		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
		bzero(er, sizeof(*er));
		ck_epoch_register(&epoch->e_epoch, &er->er_record, NULL);
		TAILQ_INIT((struct threadlist *)(uintptr_t)&er->er_tdlist);
		er->er_cpuid = cpu;
		er->er_parent = epoch;
	}
}

static void
epoch_adjust_prio(struct thread *td, u_char prio)
{

	thread_lock(td);
	sched_prio(td, prio);
	thread_unlock(td);
}

epoch_t
epoch_alloc(const char *name, int flags)
{
	epoch_t epoch;
	int i;

	MPASS(name != NULL);

	if (__predict_false(!inited))
		panic("%s called too early in boot", __func__);

	EPOCH_LOCK();

	/*
	 * Find a free index in the epoch array. If no free index is
	 * found, try to use the index after the last one.
	 */
	for (i = 0;; i++) {
		/*
		 * If too many epochs are currently allocated,
		 * return NULL.
		 */
		if (i == MAX_EPOCHS) {
			epoch = NULL;
			goto done;
		}
		if (epoch_array[i].e_in_use == 0)
			break;
	}

	epoch = epoch_array + i;
	ck_epoch_init(&epoch->e_epoch);
	epoch_ctor(epoch);
	epoch->e_flags = flags;
	epoch->e_name = name;
	sx_init(&epoch->e_drain_sx, "epoch-drain-sx");
	mtx_init(&epoch->e_drain_mtx, "epoch-drain-mtx", NULL, MTX_DEF);

	/*
	 * Set e_in_use last, because when this field is set the
	 * epoch_call_task() function will start scanning this epoch
	 * structure.
	 */
	atomic_store_rel_int(&epoch->e_in_use, 1);
done:
	EPOCH_UNLOCK();
	return (epoch);
}

void
epoch_free(epoch_t epoch)
{
#ifdef INVARIANTS
	int cpu;
#endif

	EPOCH_LOCK();

	MPASS(epoch->e_in_use != 0);

	epoch_drain_callbacks(epoch);

	atomic_store_rel_int(&epoch->e_in_use, 0);
	/*
	 * Make sure the epoch_call_task() function sees e_in_use equal
	 * to zero, by calling epoch_wait() on the global_epoch:
	 */
	epoch_wait(global_epoch);
#ifdef INVARIANTS
	CPU_FOREACH(cpu) {
		epoch_record_t er;

		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);

		/*
		 * Sanity check: none of the records should be in use anymore.
		 * We drained callbacks above and freeing the pcpu records is
		 * imminent.
		 */
		MPASS(er->er_td == NULL);
		MPASS(TAILQ_EMPTY(&er->er_tdlist));
	}
#endif
	uma_zfree_pcpu(pcpu_zone_record, epoch->e_pcpu_record);
	mtx_destroy(&epoch->e_drain_mtx);
	sx_destroy(&epoch->e_drain_sx);
	memset(epoch, 0, sizeof(*epoch));

	EPOCH_UNLOCK();
}

static epoch_record_t
epoch_currecord(epoch_t epoch)
{

	return (zpcpu_get(epoch->e_pcpu_record));
}

#define INIT_CHECK(epoch)					\
	do {							\
		if (__predict_false((epoch) == NULL))		\
			return;					\
	} while (0)

void
_epoch_enter_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
	struct epoch_record *er;
	struct thread *td;

	MPASS(cold || epoch != NULL);
	td = curthread;
	MPASS((vm_offset_t)et >= td->td_kstack &&
	    (vm_offset_t)et + sizeof(struct epoch_tracker) <=
	    td->td_kstack + td->td_kstack_pages * PAGE_SIZE);

	INIT_CHECK(epoch);
	MPASS(epoch->e_flags & EPOCH_PREEMPT);

#ifdef EPOCH_TRACE
	epoch_trace_enter(td, epoch, et, file, line);
#endif
	et->et_td = td;
	THREAD_NO_SLEEPING();
	critical_enter();
	sched_pin();
	et->et_old_priority = td->td_priority;
	er = epoch_currecord(epoch);
	/* Record-level tracking is reserved for non-preemptible epochs. */
	MPASS(er->er_td == NULL);
	TAILQ_INSERT_TAIL(&er->er_tdlist, et, et_link);
	ck_epoch_begin(&er->er_record, &et->et_section);
	critical_exit();
}
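
/*
 * Illustrative use of the non-preemptible variant (not part of this file;
 * "bar" is a hypothetical consumer):
 *
 *	epoch_enter(bar_epoch);
 *	... lookup; no sleeping, the section runs in a critical section ...
 *	epoch_exit(bar_epoch);
 *
 * Because the section cannot be preempted, epoch_wait() on the writer
 * side only needs to spin until every CPU has passed through a quiescent
 * state.
 */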
void
epoch_enter(epoch_t epoch)
{
	epoch_record_t er;

	MPASS(cold || epoch != NULL);
	INIT_CHECK(epoch);
	critical_enter();
	er = epoch_currecord(epoch);
#ifdef INVARIANTS
	if (er->er_record.active == 0) {
		MPASS(er->er_td == NULL);
		er->er_td = curthread;
	} else {
		/* We've recursed, just make sure our accounting isn't wrong. */
		MPASS(er->er_td == curthread);
	}
#endif
	ck_epoch_begin(&er->er_record, NULL);
}

void
_epoch_exit_preempt(epoch_t epoch, epoch_tracker_t et EPOCH_FILE_LINE)
{
	struct epoch_record *er;
	struct thread *td;

	INIT_CHECK(epoch);
	td = curthread;
	critical_enter();
	sched_unpin();
	THREAD_SLEEPING_OK();
	er = epoch_currecord(epoch);
	MPASS(epoch->e_flags & EPOCH_PREEMPT);
	MPASS(et != NULL);
	MPASS(et->et_td == td);
#ifdef INVARIANTS
	et->et_td = (void*)0xDEADBEEF;
	/* Record-level tracking is reserved for non-preemptible epochs. */
	MPASS(er->er_td == NULL);
#endif
	ck_epoch_end(&er->er_record, &et->et_section);
	TAILQ_REMOVE(&er->er_tdlist, et, et_link);
	er->er_gen++;
	if (__predict_false(et->et_old_priority != td->td_priority))
		epoch_adjust_prio(td, et->et_old_priority);
	critical_exit();
#ifdef EPOCH_TRACE
	epoch_trace_exit(td, epoch, et, file, line);
#endif
}

void
epoch_exit(epoch_t epoch)
{
	epoch_record_t er;

	INIT_CHECK(epoch);
	er = epoch_currecord(epoch);
	ck_epoch_end(&er->er_record, NULL);
#ifdef INVARIANTS
	MPASS(er->er_td == curthread);
	if (er->er_record.active == 0)
		er->er_td = NULL;
#endif
	critical_exit();
}

/*
 * epoch_block_handler_preempt() is a callback from the CK code when another
 * thread is currently in an epoch section.
 */
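/*
 * Strategy (summary): if the record belongs to another CPU, spin briefly
 * in the hope that the blocking section exits, otherwise bind to that CPU
 * and let the CK code rescan; once on the record's CPU, lend our priority
 * to any lower-priority section holders and, if one of them is blocked on
 * a lock, wait on its turnstile; failing that, yield the CPU.
 */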
static void
epoch_block_handler_preempt(struct ck_epoch *global __unused,
    ck_epoch_record_t *cr, void *arg __unused)
{
	epoch_record_t record;
	struct thread *td, *owner, *curwaittd;
	struct epoch_tracker *tdwait;
	struct turnstile *ts;
	struct lock_object *lock;
	int spincount, gen;
	int locksheld __unused;

	record = __containerof(cr, struct epoch_record, er_record);
	td = curthread;
	locksheld = td->td_locks;
	spincount = 0;
	counter_u64_add(block_count, 1);
	/*
	 * We lost a race and there are no longer any threads
	 * on the CPU in an epoch section.
	 */
	if (TAILQ_EMPTY(&record->er_tdlist))
		return;

	if (record->er_cpuid != curcpu) {
		/*
		 * If the head of the list is running, we can wait for it
		 * to remove itself from the list and thus save us the
		 * overhead of a migration.
		 */
		gen = record->er_gen;
		thread_unlock(td);
		/*
		 * We can't actually check if the waiting thread is running
		 * so we simply poll for it to exit before giving up and
		 * migrating.
		 */
		do {
			cpu_spinwait();
		} while (!TAILQ_EMPTY(&record->er_tdlist) &&
		    gen == record->er_gen &&
		    spincount++ < MAX_ADAPTIVE_SPIN);
		thread_lock(td);
		/*
		 * If the generation has changed we can poll again;
		 * otherwise we need to migrate.
		 */
		if (gen != record->er_gen)
			return;
		/*
		 * Being on the same CPU as that of the record on which
		 * we need to wait allows us access to the thread
		 * list associated with that CPU. We can then examine the
		 * oldest thread in the queue and wait on its turnstile
		 * until it resumes and so on until a grace period
		 * elapses.
		 */
		counter_u64_add(migrate_count, 1);
		sched_bind(td, record->er_cpuid);
		/*
		 * At this point we need to return to the ck code
		 * to scan to see if a grace period has elapsed.
		 * We can't move on to check the thread list, because
		 * in the meantime new threads may have arrived that
		 * in fact belong to a different epoch.
		 */
		return;
	}
	/*
	 * Try to find a thread in an epoch section on this CPU
	 * waiting on a turnstile. Otherwise find the lowest
	 * priority thread (highest prio value) and drop our priority
	 * to match to allow it to run.
	 */
	TAILQ_FOREACH(tdwait, &record->er_tdlist, et_link) {
		/*
		 * Propagate our priority to any other waiters to prevent us
		 * from starving them. They will have their original priority
		 * restored on exit from epoch_wait().
		 */
		curwaittd = tdwait->et_td;
		if (!TD_IS_INHIBITED(curwaittd) && curwaittd->td_priority > td->td_priority) {
			critical_enter();
			thread_unlock(td);
			thread_lock(curwaittd);
			sched_prio(curwaittd, td->td_priority);
			thread_unlock(curwaittd);
			thread_lock(td);
			critical_exit();
		}
		if (TD_IS_INHIBITED(curwaittd) && TD_ON_LOCK(curwaittd) &&
		    ((ts = curwaittd->td_blocked) != NULL)) {
			/*
			 * We unlock td to allow turnstile_wait to reacquire
			 * the thread lock. Before unlocking it we enter a
			 * critical section to prevent preemption after we
			 * reenable interrupts by dropping the thread lock in
			 * order to prevent curwaittd from getting to run.
			 */
			critical_enter();
			thread_unlock(td);

			if (turnstile_lock(ts, &lock, &owner)) {
				if (ts == curwaittd->td_blocked) {
					MPASS(TD_IS_INHIBITED(curwaittd) &&
					    TD_ON_LOCK(curwaittd));
					critical_exit();
					turnstile_wait(ts, owner,
					    curwaittd->td_tsqueue);
					counter_u64_add(turnstile_count, 1);
					thread_lock(td);
					return;
				}
				turnstile_unlock(ts, lock);
			}
			thread_lock(td);
			critical_exit();
			KASSERT(td->td_locks == locksheld,
			    ("%d extra locks held", td->td_locks - locksheld));
		}
	}
	/*
	 * We didn't find any threads actually blocked on a lock
	 * so we have nothing to do except context switch away.
	 */
	counter_u64_add(switch_count, 1);
	mi_switch(SW_VOL | SWT_RELINQUISH);
	/*
	 * It is important that the thread lock is dropped while yielding
	 * to allow other threads to acquire the lock pointed to by
	 * TDQ_LOCKPTR(td). Currently mi_switch() will unlock the
	 * thread lock before returning. Otherwise a deadlock-like
	 * situation might happen.
	 */
	thread_lock(td);
}
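
/*
 * epoch_wait_preempt() returns only once every thread that was in a
 * section of this preemptible epoch at the time of the call has exited
 * that section.  Illustrative writer-side pattern (not part of this file;
 * the "foo" names are hypothetical):
 *
 *	TAILQ_REMOVE(&foo_list, f, f_link);	<- unlink, no new readers
 *	epoch_wait_preempt(foo_epoch);		<- wait out current readers
 *	free(f, M_FOO);				<- now safe to reclaim
 */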
void
epoch_wait_preempt(epoch_t epoch)
{
	struct thread *td;
	int was_bound;
	int old_cpu;
	int old_pinned;
	u_char old_prio;
	int locks __unused;

	MPASS(cold || epoch != NULL);
	INIT_CHECK(epoch);
	td = curthread;
#ifdef INVARIANTS
	locks = curthread->td_locks;
	MPASS(epoch->e_flags & EPOCH_PREEMPT);
	if ((epoch->e_flags & EPOCH_LOCKED) == 0)
		WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
		    "epoch_wait() can be long running");
	KASSERT(!in_epoch(epoch), ("epoch_wait_preempt() called in the middle "
	    "of an epoch section of the same epoch"));
#endif
	DROP_GIANT();
	thread_lock(td);

	old_cpu = PCPU_GET(cpuid);
	old_pinned = td->td_pinned;
	old_prio = td->td_priority;
	was_bound = sched_is_bound(td);
	sched_unbind(td);
	td->td_pinned = 0;
	sched_bind(td, old_cpu);

	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler_preempt,
	    NULL);

	/* restore CPU binding, if any */
	if (was_bound != 0) {
		sched_bind(td, old_cpu);
	} else {
		/* get thread back to initial CPU, if any */
		if (old_pinned != 0)
			sched_bind(td, old_cpu);
		sched_unbind(td);
	}
	/* restore pinned after bind */
	td->td_pinned = old_pinned;

	/* restore thread priority */
	sched_prio(td, old_prio);
	thread_unlock(td);
	PICKUP_GIANT();
	KASSERT(td->td_locks == locks,
	    ("%d residual locks held", td->td_locks - locks));
}

static void
epoch_block_handler(struct ck_epoch *g __unused, ck_epoch_record_t *c __unused,
    void *arg __unused)
{
	cpu_spinwait();
}

void
epoch_wait(epoch_t epoch)
{

	MPASS(cold || epoch != NULL);
	INIT_CHECK(epoch);
	MPASS(epoch->e_flags == 0);
	critical_enter();
	ck_epoch_synchronize_wait(&epoch->e_epoch, epoch_block_handler, NULL);
	critical_exit();
}
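
/*
 * Illustrative deferred-free callback for epoch_call() (not part of this
 * file; "struct foo", M_FOO and the context member are hypothetical):
 *
 *	static void
 *	foo_free_cb(epoch_context_t ctx)
 *	{
 *		struct foo *f;
 *
 *		f = __containerof(ctx, struct foo, f_epoch_ctx);
 *		free(f, M_FOO);
 *	}
 *
 *	epoch_call(foo_epoch, foo_free_cb, &f->f_epoch_ctx);
 *
 * The callback runs from a per-CPU task once a grace period has elapsed,
 * so it must not assume the caller's context.
 */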
void
epoch_call(epoch_t epoch, epoch_callback_t callback, epoch_context_t ctx)
{
	epoch_record_t er;
	ck_epoch_entry_t *cb;

	cb = (void *)ctx;

	MPASS(callback);
	/* too early in boot to have epoch set up */
	if (__predict_false(epoch == NULL))
		goto boottime;
#if !defined(EARLY_AP_STARTUP)
	if (__predict_false(inited < 2))
		goto boottime;
#endif

	critical_enter();
	*DPCPU_PTR(epoch_cb_count) += 1;
	er = epoch_currecord(epoch);
	ck_epoch_call(&er->er_record, cb, (ck_epoch_cb_t *)callback);
	critical_exit();
	return;
boottime:
	callback(ctx);
}

static void
epoch_call_task(void *arg __unused)
{
	ck_stack_entry_t *cursor, *head, *next;
	ck_epoch_record_t *record;
	epoch_record_t er;
	epoch_t epoch;
	ck_stack_t cb_stack;
	int i, npending, total;

	ck_stack_init(&cb_stack);
	critical_enter();
	epoch_enter(global_epoch);
	for (total = i = 0; i != MAX_EPOCHS; i++) {
		epoch = epoch_array + i;
		if (__predict_false(
		    atomic_load_acq_int(&epoch->e_in_use) == 0))
			continue;
		er = epoch_currecord(epoch);
		record = &er->er_record;
		if ((npending = record->n_pending) == 0)
			continue;
		ck_epoch_poll_deferred(record, &cb_stack);
		total += npending - record->n_pending;
	}
	epoch_exit(global_epoch);
	*DPCPU_PTR(epoch_cb_count) -= total;
	critical_exit();

	counter_u64_add(epoch_call_count, total);
	counter_u64_add(epoch_call_task_count, 1);

	head = ck_stack_batch_pop_npsc(&cb_stack);
	for (cursor = head; cursor != NULL; cursor = next) {
		struct ck_epoch_entry *entry =
		    ck_epoch_entry_container(cursor);

		next = CK_STACK_NEXT(cursor);
		entry->function(entry);
	}
}

static int
in_epoch_verbose_preempt(epoch_t epoch, int dump_onfail)
{
	epoch_record_t er;
	struct epoch_tracker *tdwait;
	struct thread *td;

	MPASS(epoch != NULL);
	MPASS((epoch->e_flags & EPOCH_PREEMPT) != 0);
	td = curthread;
	if (THREAD_CAN_SLEEP())
		return (0);
	critical_enter();
	er = epoch_currecord(epoch);
	TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
		if (tdwait->et_td == td) {
			critical_exit();
			return (1);
		}
#ifdef INVARIANTS
	if (dump_onfail) {
		MPASS(td->td_pinned);
		printf("cpu: %d id: %d\n", curcpu, td->td_tid);
		TAILQ_FOREACH(tdwait, &er->er_tdlist, et_link)
			printf("td_tid: %d ", tdwait->et_td->td_tid);
		printf("\n");
	}
#endif
	critical_exit();
	return (0);
}

#ifdef INVARIANTS
static void
epoch_assert_nocpu(epoch_t epoch, struct thread *td)
{
	epoch_record_t er;
	int cpu;
	bool crit;

	crit = td->td_critnest > 0;

	/* Check for a critical section mishap. */
	CPU_FOREACH(cpu) {
		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
		KASSERT(er->er_td != td,
		    ("%s critical section in epoch '%s', from cpu %d",
		    (crit ? "exited" : "re-entered"), epoch->e_name, cpu));
	}
}
#else
#define epoch_assert_nocpu(e, td) do {} while (0)
#endif

int
in_epoch_verbose(epoch_t epoch, int dump_onfail)
{
	epoch_record_t er;
	struct thread *td;

	if (__predict_false((epoch) == NULL))
		return (0);
	if ((epoch->e_flags & EPOCH_PREEMPT) != 0)
		return (in_epoch_verbose_preempt(epoch, dump_onfail));

	/*
	 * Being in a critical section is a necessary condition for being
	 * correctly inside a non-preemptible epoch, so if the thread is not
	 * in one, it is definitely not in this epoch.
	 */
	td = curthread;
	if (td->td_critnest == 0) {
		epoch_assert_nocpu(epoch, td);
		return (0);
	}

	/*
	 * The current cpu is in a critical section, so the epoch record will be
	 * stable for the rest of this function. Knowing that the record is not
	 * active is sufficient for knowing whether we're in this epoch or not,
	 * since it's a pcpu record.
	 */
	er = epoch_currecord(epoch);
	if (er->er_record.active == 0) {
		epoch_assert_nocpu(epoch, td);
		return (0);
	}

	MPASS(er->er_td == td);
	return (1);
}

int
in_epoch(epoch_t epoch)
{
	return (in_epoch_verbose(epoch, 0));
}

static void
epoch_drain_cb(struct epoch_context *ctx)
{
	struct epoch *epoch =
	    __containerof(ctx, struct epoch_record, er_drain_ctx)->er_parent;

	if (atomic_fetchadd_int(&epoch->e_drain_count, -1) == 1) {
		mtx_lock(&epoch->e_drain_mtx);
		wakeup(epoch);
		mtx_unlock(&epoch->e_drain_mtx);
	}
}

void
epoch_drain_callbacks(epoch_t epoch)
{
	epoch_record_t er;
	struct thread *td;
	int was_bound;
	int old_pinned;
	int old_cpu;
	int cpu;

	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, NULL,
	    "epoch_drain_callbacks() may sleep!");

	/* too early in boot to have epoch set up */
	if (__predict_false(epoch == NULL))
		return;
#if !defined(EARLY_AP_STARTUP)
	if (__predict_false(inited < 2))
		return;
#endif
	DROP_GIANT();

	sx_xlock(&epoch->e_drain_sx);
	mtx_lock(&epoch->e_drain_mtx);

	td = curthread;
	thread_lock(td);
	old_cpu = PCPU_GET(cpuid);
	old_pinned = td->td_pinned;
	was_bound = sched_is_bound(td);
	sched_unbind(td);
	td->td_pinned = 0;

	CPU_FOREACH(cpu)
		epoch->e_drain_count++;
	CPU_FOREACH(cpu) {
		er = zpcpu_get_cpu(epoch->e_pcpu_record, cpu);
		sched_bind(td, cpu);
		epoch_call(epoch, &epoch_drain_cb, &er->er_drain_ctx);
	}

	/* restore CPU binding, if any */
	if (was_bound != 0) {
		sched_bind(td, old_cpu);
	} else {
		/* get thread back to initial CPU, if any */
		if (old_pinned != 0)
			sched_bind(td, old_cpu);
		sched_unbind(td);
	}
	/* restore pinned after bind */
	td->td_pinned = old_pinned;

	thread_unlock(td);

	while (epoch->e_drain_count != 0)
		msleep(epoch, &epoch->e_drain_mtx, PZERO, "EDRAIN", 0);

	mtx_unlock(&epoch->e_drain_mtx);
	sx_xunlock(&epoch->e_drain_sx);

	PICKUP_GIANT();
}