/*-
 * SPDX-License-Identifier: BSD-2-Clause
 *
 * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module holds the global variables and machine independent functions
 * used for the kernel SMP support.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>
#include <machine/pcb.h>
#include <machine/smp.h>

#include "opt_sched.h"

#ifdef SMP
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");

volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
volatile cpuset_t suspended_cpus;
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;

void (*cpustop_restartfunc)(void);
#endif

static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);

/* This is used in modules that need to work in both SMP and UP. */
cpuset_t all_cpus;

int mp_ncpus;
/* export this for libkvm consumers. */
int mp_maxcpus = MAXCPU;

volatile int smp_started;
u_int mp_maxid;

/* Array of CPU contexts saved during a panic. */
struct pcb *stoppcbs;

static SYSCTL_NODE(_kern, OID_AUTO, smp,
    CTLFLAG_RD | CTLFLAG_CAPRD | CTLFLAG_MPSAFE, NULL,
    "Kernel SMP");

SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
    "Max CPU ID.");

SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
    0, "Max number of CPUs that the system was compiled for.");

SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kern_smp_active, "I",
    "Indicates system is running in SMP mode");

int smp_disabled = 0;	/* has smp been disabled? */
SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
    &smp_disabled, 0, "SMP has been disabled from the loader");

int smp_cpus = 1;	/* how many cpus running */
SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
    "Number of CPUs online");

int smp_threads_per_core = 1;	/* how many SMT threads are running per core */
SYSCTL_INT(_kern_smp, OID_AUTO, threads_per_core, CTLFLAG_RD|CTLFLAG_CAPRD,
    &smp_threads_per_core, 0, "Number of SMT threads online per core");

int mp_ncores = -1;	/* how many physical cores running */
SYSCTL_INT(_kern_smp, OID_AUTO, cores, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_ncores, 0,
    "Number of physical cores online");

int smp_topology = 0;	/* Which topology we're using. */
SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
    "Topology override setting; 0 is default provided by hardware.");

#ifdef SMP
/* Variables needed for SMP rendezvous. */
static volatile int smp_rv_ncpus;
static void (*volatile smp_rv_setup_func)(void *arg);
static void (*volatile smp_rv_action_func)(void *arg);
static void (*volatile smp_rv_teardown_func)(void *arg);
static void *volatile smp_rv_func_arg;
static volatile int smp_rv_waiters[4];

/*
 * Shared mutex to restrict busywaits between smp_rendezvous() and
 * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 * functions trigger at once and cause multiple CPUs to busywait with
 * interrupts disabled.
 */
struct mtx smp_ipi_mtx;

/*
 * Let the MD SMP code initialize mp_maxid very early if it can.
 */
static void
mp_setmaxid(void *dummy)
{

	cpu_mp_setmaxid();

	KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
	KASSERT(mp_ncpus > 1 || mp_maxid == 0,
	    ("%s: one CPU but mp_maxid is not zero", __func__));
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__,
	    mp_maxid, mp_ncpus));

	cpusetsizemin = howmany(mp_maxid + 1, NBBY);
}
SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);

/*
 * Call the MD SMP initialization code.
 */
static void
mp_start(void *dummy)
{

	mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);

	/* Probe for MP hardware. */
	if (smp_disabled != 0 || cpu_mp_probe() == 0) {
		mp_ncores = 1;
		mp_ncpus = 1;
		CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
		return;
	}

	cpu_mp_start();
	printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
	    mp_ncpus);

	/* Provide a default for most architectures that don't have SMT/HTT. */
	if (mp_ncores < 0)
		mp_ncores = mp_ncpus;

	stoppcbs = mallocarray(mp_maxid + 1, sizeof(struct pcb), M_DEVBUF,
	    M_WAITOK | M_ZERO);

	cpu_mp_announce();
}
SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);

void
forward_signal(struct thread *td)
{
	int id;

	/*
	 * signotify() has already set TDA_AST and TDA_SIG on td_ast for
	 * this thread, so all we need to do is poke it if it is currently
	 * executing so that it executes ast().
	 */
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("forward_signal: thread is not TDS_RUNNING"));

	CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);

	if (!smp_started || cold || KERNEL_PANICKED())
		return;

	/* No need to IPI ourself. */
	if (td == curthread)
		return;

	id = td->td_oncpu;
	if (id == NOCPU)
		return;
	ipi_cpu(id, IPI_AST);
}

/*
 * When called the executing CPU will send an IPI to all other CPUs
 * requesting that they halt execution.
 *
 * Usually (but not necessarily) called with 'other_cpus' as its arg.
 *
 *  - Signals all CPUs in map to stop.
 *  - Waits for each to stop.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 *
 */
#if defined(__amd64__) || defined(__i386__)
#define	X86	1
#else
#define	X86	0
#endif
static int
generic_stop_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
	char cpusetbuf[CPUSETBUFSIZ];
#endif
	static volatile u_int stopping_cpu = NOCPU;
	int i;
	volatile cpuset_t *cpus;

	KASSERT(
	    type == IPI_STOP || type == IPI_STOP_HARD
#if X86
	    || type == IPI_SUSPEND
#endif
	    , ("%s: invalid stop type", __func__));

	if (!smp_started)
		return (0);

	CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
	    cpusetobj_strprint(cpusetbuf, &map), type);

#if X86
	/*
	 * When suspending, ensure there are no IPIs in progress.
	 * IPIs that have been issued, but not yet delivered (e.g.
	 * not pending on a vCPU when running under virtualization)
	 * will be lost, violating FreeBSD's assumption of reliable
	 * IPI delivery.
	 */
	if (type == IPI_SUSPEND)
		mtx_lock_spin(&smp_ipi_mtx);
#endif

#if X86
	if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
#endif
	if (stopping_cpu != PCPU_GET(cpuid))
		while (atomic_cmpset_int(&stopping_cpu, NOCPU,
		    PCPU_GET(cpuid)) == 0)
			while (stopping_cpu != NOCPU)
				cpu_spinwait(); /* spin */

	/* send the stop IPI to all CPUs in map */
	ipi_selected(map, type);
#if X86
	}
#endif

#if X86
	if (type == IPI_SUSPEND)
		cpus = &suspended_cpus;
	else
#endif
		cpus = &stopped_cpus;

	i = 0;
	while (!CPU_SUBSET(cpus, &map)) {
		/* spin */
		cpu_spinwait();
		i++;
		if (i == 100000000) {
			printf("timeout stopping cpus\n");
			break;
		}
	}

#if X86
	if (type == IPI_SUSPEND)
		mtx_unlock_spin(&smp_ipi_mtx);
#endif

	stopping_cpu = NOCPU;
	return (1);
}

int
stop_cpus(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_STOP));
}

int
stop_cpus_hard(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_STOP_HARD));
}

#if X86
int
suspend_cpus(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_SUSPEND));
}
#endif
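
/*
 * Illustrative usage sketch (not part of this file; the update being
 * protected is hypothetical): a caller that must briefly keep every
 * other CPU from running, e.g. while mutating state that cannot be
 * protected by a lock, brackets the work with these helpers.  The
 * current CPU id should be sampled with the thread pinned or inside a
 * critical section so that it cannot change:
 *
 *	cpuset_t other_cpus;
 *
 *	other_cpus = all_cpus;
 *	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 *	stop_cpus(other_cpus);
 *	... perform the update while the other CPUs spin ...
 *	restart_cpus(stopped_cpus);
 */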

/*
 * Called by a CPU to restart stopped CPUs.
 *
 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 *
 *  - Signals all CPUs in map to restart.
 *  - Waits for each to restart.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 */
static int
generic_restart_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
	char cpusetbuf[CPUSETBUFSIZ];
#endif
	volatile cpuset_t *cpus;

#if X86
	KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
	    || type == IPI_SUSPEND, ("%s: invalid stop type", __func__));

	if (!smp_started)
		return (0);

	CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));

	if (type == IPI_SUSPEND)
		cpus = &resuming_cpus;
	else
		cpus = &stopped_cpus;

	/* signal other cpus to restart */
	if (type == IPI_SUSPEND)
		CPU_COPY_STORE_REL(&map, &toresume_cpus);
	else
		CPU_COPY_STORE_REL(&map, &started_cpus);

	/*
	 * Wake up any CPUs stopped with MWAIT.  From MI code we can't tell if
	 * MONITOR/MWAIT is enabled, but the potentially redundant writes are
	 * relatively inexpensive.
	 */
	if (type == IPI_STOP) {
		struct monitorbuf *mb;
		u_int id;

		CPU_FOREACH(id) {
			if (!CPU_ISSET(id, &map))
				continue;

			mb = &pcpu_find(id)->pc_monitorbuf;
			atomic_store_int(&mb->stop_state,
			    MONITOR_STOPSTATE_RUNNING);
		}
	}

	if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
		/* wait for each to clear its bit */
		while (CPU_OVERLAP(cpus, &map))
			cpu_spinwait();
	}
#else /* !X86 */
	KASSERT(type == IPI_STOP || type == IPI_STOP_HARD,
	    ("%s: invalid stop type", __func__));

	if (!smp_started)
		return (0);

	CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));

	cpus = &stopped_cpus;

	/* signal other cpus to restart */
	CPU_COPY_STORE_REL(&map, &started_cpus);

	/* wait for each to clear its bit */
	while (CPU_OVERLAP(cpus, &map))
		cpu_spinwait();
#endif
	return (1);
}

int
restart_cpus(cpuset_t map)
{

	return (generic_restart_cpus(map, IPI_STOP));
}

#if X86
int
resume_cpus(cpuset_t map)
{

	return (generic_restart_cpus(map, IPI_SUSPEND));
}
#endif
#undef X86

/*
 * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 * (if specified), rendezvous, execute the action function (if specified),
 * rendezvous again, execute the teardown function (if specified), and then
 * resume.
 *
 * Note that the supplied external functions _must_ be reentrant and aware
 * that they are running in parallel and in an unknown lock context.
 */
void
smp_rendezvous_action(void)
{
	struct thread *td;
	void *local_func_arg;
	void (*local_setup_func)(void*);
	void (*local_action_func)(void*);
	void (*local_teardown_func)(void*);
#ifdef INVARIANTS
	int owepreempt;
#endif

	/* Ensure we have up-to-date values. */
	atomic_add_acq_int(&smp_rv_waiters[0], 1);
	while (smp_rv_waiters[0] < smp_rv_ncpus)
		cpu_spinwait();

	/* Fetch rendezvous parameters after acquire barrier. */
	local_func_arg = smp_rv_func_arg;
	local_setup_func = smp_rv_setup_func;
	local_action_func = smp_rv_action_func;
	local_teardown_func = smp_rv_teardown_func;

	/*
	 * Use a nested critical section to prevent any preemptions
	 * from occurring during a rendezvous action routine.
	 * Specifically, if a rendezvous handler is invoked via an IPI
	 * and the interrupted thread was in the critical_exit()
	 * function after setting td_critnest to 0 but before
	 * performing a deferred preemption, this routine can be
	 * invoked with td_critnest set to 0 and td_owepreempt true.
	 * In that case, a critical_exit() during the rendezvous
	 * action would trigger a preemption which is not permitted in
	 * a rendezvous action.  To fix this, wrap all of the
	 * rendezvous action handlers in a critical section.  We
	 * cannot use a regular critical section however as having
	 * critical_exit() preempt from this routine would also be
	 * problematic (the preemption must not occur before the IPI
	 * has been acknowledged via an EOI).  Instead, we
	 * intentionally ignore td_owepreempt when leaving the
	 * critical section.  This should be harmless because we do
	 * not permit rendezvous action routines to schedule threads,
	 * and thus td_owepreempt should never transition from 0 to 1
	 * during this routine.
	 */
	td = curthread;
	td->td_critnest++;
#ifdef INVARIANTS
	owepreempt = td->td_owepreempt;
#endif

	/*
	 * If requested, run a setup function before the main action
	 * function.  Ensure all CPUs have completed the setup
	 * function before moving on to the action function.
	 */
	if (local_setup_func != smp_no_rendezvous_barrier) {
		if (local_setup_func != NULL)
			local_setup_func(local_func_arg);
		atomic_add_int(&smp_rv_waiters[1], 1);
		while (smp_rv_waiters[1] < smp_rv_ncpus)
			cpu_spinwait();
	}

	if (local_action_func != NULL)
		local_action_func(local_func_arg);

	if (local_teardown_func != smp_no_rendezvous_barrier) {
		/*
		 * Signal that the main action has been completed.  If a
		 * full exit rendezvous is requested, then all CPUs will
		 * wait here until all CPUs have finished the main action.
		 */
		atomic_add_int(&smp_rv_waiters[2], 1);
		while (smp_rv_waiters[2] < smp_rv_ncpus)
			cpu_spinwait();

		if (local_teardown_func != NULL)
			local_teardown_func(local_func_arg);
	}

	/*
	 * Signal that the rendezvous is fully completed by this CPU.
	 * This means that no member of smp_rv_* pseudo-structure will be
	 * accessed by this target CPU after this point; in particular,
	 * memory pointed by smp_rv_func_arg.
	 *
	 * The release semantic ensures that all accesses performed by
	 * the current CPU are visible when smp_rendezvous_cpus()
	 * returns, by synchronizing with the
	 * atomic_load_acq_int(&smp_rv_waiters[3]).
	 */
	atomic_add_rel_int(&smp_rv_waiters[3], 1);

	td->td_critnest--;
	KASSERT(owepreempt == td->td_owepreempt,
	    ("rendezvous action changed td_owepreempt"));
}

void
smp_rendezvous_cpus(cpuset_t map,
	void (* setup_func)(void *),
	void (* action_func)(void *),
	void (* teardown_func)(void *),
	void *arg)
{
	int curcpumap, i, ncpus = 0;

	/* See comments in the !SMP case. */
	if (!smp_started) {
		spinlock_enter();
		if (setup_func != NULL)
			setup_func(arg);
		if (action_func != NULL)
			action_func(arg);
		if (teardown_func != NULL)
			teardown_func(arg);
		spinlock_exit();
		return;
	}

	/*
	 * Make sure we come here with interrupts enabled.  Otherwise we
	 * livelock if smp_ipi_mtx is owned by a thread which sent us an IPI.
	 */
	MPASS(curthread->td_md.md_spinlock_count == 0);

	CPU_FOREACH(i) {
		if (CPU_ISSET(i, &map))
			ncpus++;
	}
	if (ncpus == 0)
		panic("ncpus is 0 with non-zero map");

	mtx_lock_spin(&smp_ipi_mtx);

	/* Pass rendezvous parameters via global variables. */
	smp_rv_ncpus = ncpus;
	smp_rv_setup_func = setup_func;
	smp_rv_action_func = action_func;
	smp_rv_teardown_func = teardown_func;
	smp_rv_func_arg = arg;
	smp_rv_waiters[1] = 0;
	smp_rv_waiters[2] = 0;
	smp_rv_waiters[3] = 0;
	atomic_store_rel_int(&smp_rv_waiters[0], 0);

	/*
	 * Signal other processors, which will enter the IPI with
	 * interrupts off.
	 */
	curcpumap = CPU_ISSET(curcpu, &map);
	CPU_CLR(curcpu, &map);
	ipi_selected(map, IPI_RENDEZVOUS);

	/* Check if the current CPU is in the map */
	if (curcpumap != 0)
		smp_rendezvous_action();

	/*
	 * Ensure that the master CPU waits for all the other
	 * CPUs to finish the rendezvous, so that smp_rv_*
	 * pseudo-structure and the arg are guaranteed to not
	 * be in use.
	 *
	 * Load acquire synchronizes with the release add in
	 * smp_rendezvous_action(), which ensures that our caller sees
	 * all memory actions done by the called functions on other
	 * CPUs.
	 */
	while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
		cpu_spinwait();

	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_rendezvous(void (* setup_func)(void *),
	void (* action_func)(void *),
	void (* teardown_func)(void *),
	void *arg)
{
	smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
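
/*
 * Illustrative usage sketch (the example_* names are hypothetical): a
 * subsystem that needs a short function to run on every CPU, with
 * interrupts disabled, passes smp_no_rendezvous_barrier for the phases
 * it does not need:
 *
 *	static void
 *	example_reset_counter(void *arg)
 *	{
 *		struct example_softc *sc = arg;
 *
 *		sc->counters[curcpu] = 0;
 *	}
 *
 *	smp_rendezvous(smp_no_rendezvous_barrier, example_reset_counter,
 *	    smp_no_rendezvous_barrier, sc);
 *
 * The action runs in IPI context on the remote CPUs, so it must not
 * sleep or acquire anything other than spin locks.
 */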

static void
smp_topo_fill(struct cpu_group *cg)
{
	int c;

	for (c = 0; c < cg->cg_children; c++)
		smp_topo_fill(&cg->cg_child[c]);
	cg->cg_first = CPU_FFS(&cg->cg_mask) - 1;
	cg->cg_last = CPU_FLS(&cg->cg_mask) - 1;
}

struct cpu_group *
smp_topo(void)
{
	char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
	static struct cpu_group *top = NULL;

	/*
	 * The first call to smp_topo() is guaranteed to occur
	 * during the kernel boot while we are still single-threaded.
	 */
	if (top != NULL)
		return (top);

	/*
	 * Check for a fake topology request for debugging purposes.
	 */
	switch (smp_topology) {
	case 1:
		/* Dual core with no sharing. */
		top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
		break;
	case 2:
		/* No topology, all cpus are equal. */
		top = smp_topo_none();
		break;
	case 3:
		/* Dual core with shared L2. */
		top = smp_topo_1level(CG_SHARE_L2, 2, 0);
		break;
	case 4:
		/* quad core, shared l3 among each package, private l2. */
		top = smp_topo_1level(CG_SHARE_L3, 4, 0);
		break;
	case 5:
		/* quad core, 2 dualcore parts on each package share l2. */
		top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
		break;
	case 6:
		/* Single-core 2xHTT */
		top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
		break;
	case 7:
		/* quad core with a shared l3, 8 threads sharing L2. */
		top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
		    CG_FLAG_SMT);
		break;
	default:
		/* Default, ask the system what it wants. */
		top = cpu_topo();
		break;
	}
	/*
	 * Verify the returned topology.
	 */
	if (top->cg_count != mp_ncpus)
		panic("Built bad topology at %p.  CPU count %d != %d",
		    top, top->cg_count, mp_ncpus);
	if (CPU_CMP(&top->cg_mask, &all_cpus))
		panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
		    top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
		    cpusetobj_strprint(cpusetbuf2, &all_cpus));

	/*
	 * Collapse nonsense levels that may be created out of convenience by
	 * the MD layers.  They cause extra work in the search functions.
	 */
	while (top->cg_children == 1) {
		top = &top->cg_child[0];
		top->cg_parent = NULL;
	}
	smp_topo_fill(top);
	return (top);
}
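
/*
 * The fake topologies above are selected with the kern.smp.topology
 * tunable (CTLFLAG_RDTUN, so it must be set from the loader).  For
 * example, a line like the following in /boot/loader.conf makes the
 * scheduler see pairs of cores sharing an L2 cache, regardless of what
 * the hardware reports:
 *
 *	kern.smp.topology=3
 *
 * This is a debugging aid only; the default of 0 uses the MD cpu_topo()
 * hook.
 */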

struct cpu_group *
smp_topo_alloc(u_int count)
{
	static struct cpu_group *group = NULL;
	static u_int index;
	u_int curr;

	if (group == NULL) {
		group = mallocarray((mp_maxid + 1) * MAX_CACHE_LEVELS + 1,
		    sizeof(*group), M_DEVBUF, M_WAITOK | M_ZERO);
	}
	curr = index;
	index += count;
	return (&group[curr]);
}

struct cpu_group *
smp_topo_none(void)
{
	struct cpu_group *top;

	top = smp_topo_alloc(1);
	top->cg_parent = NULL;
	top->cg_child = NULL;
	top->cg_mask = all_cpus;
	top->cg_count = mp_ncpus;
	top->cg_children = 0;
	top->cg_level = CG_SHARE_NONE;
	top->cg_flags = 0;

	return (top);
}

static int
smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
    int count, int flags, int start)
{
	char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
	cpuset_t mask;
	int i;

	CPU_ZERO(&mask);
	for (i = 0; i < count; i++, start++)
		CPU_SET(start, &mask);
	child->cg_parent = parent;
	child->cg_child = NULL;
	child->cg_children = 0;
	child->cg_level = share;
	child->cg_count = count;
	child->cg_flags = flags;
	child->cg_mask = mask;
	parent->cg_children++;
	for (; parent != NULL; parent = parent->cg_parent) {
		if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
			panic("Duplicate children in %p.  mask (%s) child (%s)",
			    parent,
			    cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
			    cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
		CPU_OR(&parent->cg_mask, &parent->cg_mask, &child->cg_mask);
		parent->cg_count += child->cg_count;
	}

	return (start);
}

struct cpu_group *
smp_topo_1level(int share, int count, int flags)
{
	struct cpu_group *child;
	struct cpu_group *top;
	int packages;
	int cpu;
	int i;

	cpu = 0;
	packages = mp_ncpus / count;
	top = smp_topo_alloc(1 + packages);
	top->cg_child = child = top + 1;
	top->cg_level = CG_SHARE_NONE;
	for (i = 0; i < packages; i++, child++)
		cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
	return (top);
}

struct cpu_group *
smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
    int l1flags)
{
	struct cpu_group *top;
	struct cpu_group *l1g;
	struct cpu_group *l2g;
	int cpu;
	int i;
	int j;

	cpu = 0;
	top = smp_topo_alloc(1 + mp_ncpus / (l2count * l1count) +
	    mp_ncpus / l1count);
	l2g = top + 1;
	top->cg_child = l2g;
	top->cg_level = CG_SHARE_NONE;
	top->cg_children = mp_ncpus / (l2count * l1count);
	l1g = l2g + top->cg_children;
	for (i = 0; i < top->cg_children; i++, l2g++) {
		l2g->cg_parent = top;
		l2g->cg_child = l1g;
		l2g->cg_level = l2share;
		for (j = 0; j < l2count; j++, l1g++)
			cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
			    l1flags, cpu);
	}
	return (top);
}
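
/*
 * A concrete (illustrative) reading of the constructors above: on a
 * 16-thread machine, an MD cpu_topo() implementation could describe two
 * packages, each holding four two-way SMT cores that share an L3 cache,
 * with
 *
 *	return (smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L1, 2,
 *	    CG_FLAG_SMT));
 *
 * which yields a root group, 16 / (4 * 2) = 2 shared-L3 groups of eight
 * CPUs each, and 16 / 2 = 8 leaf groups of two SMT threads each.
 */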

struct cpu_group *
smp_topo_find(struct cpu_group *top, int cpu)
{
	struct cpu_group *cg;
	cpuset_t mask;
	int children;
	int i;

	CPU_SETOF(cpu, &mask);
	cg = top;
	for (;;) {
		if (!CPU_OVERLAP(&cg->cg_mask, &mask))
			return (NULL);
		if (cg->cg_children == 0)
			return (cg);
		children = cg->cg_children;
		for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
			if (CPU_OVERLAP(&cg->cg_mask, &mask))
				break;
	}
	return (NULL);
}
#else /* !SMP */

void
smp_rendezvous_cpus(cpuset_t map,
	void (*setup_func)(void *),
	void (*action_func)(void *),
	void (*teardown_func)(void *),
	void *arg)
{
	/*
	 * In the !SMP case we just need to ensure the same initial conditions
	 * as the SMP case.
	 */
	spinlock_enter();
	if (setup_func != NULL)
		setup_func(arg);
	if (action_func != NULL)
		action_func(arg);
	if (teardown_func != NULL)
		teardown_func(arg);
	spinlock_exit();
}

void
smp_rendezvous(void (*setup_func)(void *),
	void (*action_func)(void *),
	void (*teardown_func)(void *),
	void *arg)
{

	smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
	    arg);
}

/*
 * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 * APIs will still work using this dummy support.
 */
static void
mp_setvariables_for_up(void *dummy)
{
	mp_ncpus = 1;
	mp_ncores = 1;
	mp_maxid = PCPU_GET(cpuid);
	CPU_SETOF(mp_maxid, &all_cpus);
	KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
}
SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
    mp_setvariables_for_up, NULL);
#endif /* SMP */

void
smp_no_rendezvous_barrier(void *dummy)
{
#ifdef SMP
	KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
#endif
}

void
smp_rendezvous_cpus_retry(cpuset_t map,
	void (* setup_func)(void *),
	void (* action_func)(void *),
	void (* teardown_func)(void *),
	void (* wait_func)(void *, int),
	struct smp_rendezvous_cpus_retry_arg *arg)
{
	int cpu;

	CPU_COPY(&map, &arg->cpus);

	/*
	 * Only one CPU to execute on.
	 */
	if (!smp_started) {
		spinlock_enter();
		if (setup_func != NULL)
			setup_func(arg);
		if (action_func != NULL)
			action_func(arg);
		if (teardown_func != NULL)
			teardown_func(arg);
		spinlock_exit();
		return;
	}

	/*
	 * Execute an action on all specified CPUs while retrying until they
	 * all acknowledge completion.
	 */
	for (;;) {
		smp_rendezvous_cpus(
		    arg->cpus,
		    setup_func,
		    action_func,
		    teardown_func,
		    arg);

		if (CPU_EMPTY(&arg->cpus))
			break;

		CPU_FOREACH(cpu) {
			if (!CPU_ISSET(cpu, &arg->cpus))
				continue;
			wait_func(arg, cpu);
		}
	}
}

void
smp_rendezvous_cpus_done(struct smp_rendezvous_cpus_retry_arg *arg)
{

	CPU_CLR_ATOMIC(curcpu, &arg->cpus);
}

/*
 * If (prio & PDROP) == 0:
 * Wait for specified idle threads to switch once.  This ensures that even
 * preempted threads have cycled through the switch function once,
 * exiting their codepaths.  This allows us to change global pointers
 * with no other synchronization.
 * If (prio & PDROP) != 0:
 * Force the specified CPUs to switch context at least once.
 */
int
quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
{
	struct pcpu *pcpu;
	u_int *gen;
	int error;
	int cpu;

	error = 0;
	if ((prio & PDROP) == 0) {
		gen = mallocarray(sizeof(u_int), mp_maxid + 1, M_TEMP,
		    M_WAITOK);
		for (cpu = 0; cpu <= mp_maxid; cpu++) {
			if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
				continue;
			pcpu = pcpu_find(cpu);
			gen[cpu] = pcpu->pc_idlethread->td_generation;
		}
	}
	for (cpu = 0; cpu <= mp_maxid; cpu++) {
		if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
			continue;
		pcpu = pcpu_find(cpu);
		thread_lock(curthread);
		sched_bind(curthread, cpu);
		thread_unlock(curthread);
		if ((prio & PDROP) != 0)
			continue;
		while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
			error = tsleep(quiesce_cpus, prio & ~PDROP, wmesg, 1);
			if (error != EWOULDBLOCK)
				goto out;
			error = 0;
		}
	}
out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);
	if ((prio & PDROP) == 0)
		free(gen, M_TEMP);

	return (error);
}

int
quiesce_all_cpus(const char *wmesg, int prio)
{

	return quiesce_cpus(all_cpus, wmesg, prio);
}
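
/*
 * Illustrative sketch of the pattern described above quiesce_cpus()
 * (example_table, the "extblq" wmesg and M_EXAMPLE are hypothetical):
 * publish a new version of a global pointer, then wait for every CPU's
 * idle thread to switch once so that any thread which was preempted
 * while still using the old value has run again and left that code path
 * before the old object is freed:
 *
 *	old = example_table;
 *	example_table = new;
 *	quiesce_all_cpus("extblq", 0);
 *	free(old, M_EXAMPLE);
 */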

/*
 * Observe all CPUs not executing in critical section.
 * We are not in one so the check for us is safe.  If the found
 * thread changes to something else we know the section was
 * exited as well.
 */
void
quiesce_all_critical(void)
{
	struct thread *td, *newtd;
	struct pcpu *pcpu;
	int cpu;

	MPASS(curthread->td_critnest == 0);

	CPU_FOREACH(cpu) {
		pcpu = cpuid_to_pcpu[cpu];
		td = pcpu->pc_curthread;
		for (;;) {
			if (td->td_critnest == 0)
				break;
			cpu_spinwait();
			newtd = (struct thread *)
			    atomic_load_acq_ptr((void *)&pcpu->pc_curthread);
			if (td != newtd)
				break;
		}
	}
}

static void
cpus_fence_seq_cst_issue(void *arg __unused)
{

	atomic_thread_fence_seq_cst();
}

/*
 * Send an IPI forcing a sequentially consistent fence.
 *
 * Allows replacement of an explicit fence with a compiler barrier.
 * Trades speed up during normal execution for a significant slowdown when
 * the barrier is needed.
 */
void
cpus_fence_seq_cst(void)
{

#ifdef SMP
	smp_rendezvous(
	    smp_no_rendezvous_barrier,
	    cpus_fence_seq_cst_issue,
	    smp_no_rendezvous_barrier,
	    NULL
	);
#else
	cpus_fence_seq_cst_issue(NULL);
#endif
}
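
/*
 * Illustrative sketch of the asymmetric pattern this enables (the
 * example_busy array is hypothetical): the frequently run side publishes
 * its state with ordinary stores and no hardware fence, while the rarely
 * run side issues cpus_fence_seq_cst() before inspecting the other CPUs'
 * state, paying the IPI cost only on the slow path.
 *
 *	fast path, runs on every CPU, often:
 *		example_busy[curcpu] = 1;
 *		... short window of work ...
 *		example_busy[curcpu] = 0;
 *
 *	slow path, runs rarely:
 *		cpus_fence_seq_cst();
 *		... scan example_busy[] and act on the result ...
 */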

/* Extra care is taken with this sysctl because the data type is volatile */
static int
sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
{
	int error, active;

	active = smp_started;
	error = SYSCTL_OUT(req, &active, sizeof(active));
	return (error);
}

#ifdef SMP
void
topo_init_node(struct topo_node *node)
{

	bzero(node, sizeof(*node));
	TAILQ_INIT(&node->children);
}

void
topo_init_root(struct topo_node *root)
{

	topo_init_node(root);
	root->type = TOPO_TYPE_SYSTEM;
}

/*
 * Add a child node with the given ID under the given parent.
 * If there is already a child with that ID, return it instead.
 */
struct topo_node *
topo_add_node_by_hwid(struct topo_node *parent, int hwid,
    topo_node_type type, uintptr_t subtype)
{
	struct topo_node *node;

	TAILQ_FOREACH_REVERSE(node, &parent->children,
	    topo_children, siblings) {
		if (node->hwid == hwid
		    && node->type == type && node->subtype == subtype) {
			return (node);
		}
	}

	node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
	topo_init_node(node);
	node->parent = parent;
	node->hwid = hwid;
	node->type = type;
	node->subtype = subtype;
	TAILQ_INSERT_TAIL(&parent->children, node, siblings);
	parent->nchildren++;

	return (node);
}

/*
 * Find a child node with the given ID under the given parent.
 */
struct topo_node *
topo_find_node_by_hwid(struct topo_node *parent, int hwid,
    topo_node_type type, uintptr_t subtype)
{

	struct topo_node *node;

	TAILQ_FOREACH(node, &parent->children, siblings) {
		if (node->hwid == hwid
		    && node->type == type && node->subtype == subtype) {
			return (node);
		}
	}

	return (NULL);
}

/*
 * Given a node, change the order of its parent's child nodes such
 * that the node becomes the first child while preserving the cyclic
 * order of the children.  In other words, the given node is promoted
 * by rotation.
 */
void
topo_promote_child(struct topo_node *child)
{
	struct topo_node *next;
	struct topo_node *node;
	struct topo_node *parent;

	parent = child->parent;
	next = TAILQ_NEXT(child, siblings);
	TAILQ_REMOVE(&parent->children, child, siblings);
	TAILQ_INSERT_HEAD(&parent->children, child, siblings);

	while (next != NULL) {
		node = next;
		next = TAILQ_NEXT(node, siblings);
		TAILQ_REMOVE(&parent->children, node, siblings);
		TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
		child = node;
	}
}

/*
 * Iterate to the next node in the depth-first search (traversal) of
 * the topology tree.
 */
struct topo_node *
topo_next_node(struct topo_node *top, struct topo_node *node)
{
	struct topo_node *next;

	if ((next = TAILQ_FIRST(&node->children)) != NULL)
		return (next);

	if ((next = TAILQ_NEXT(node, siblings)) != NULL)
		return (next);

	while (node != top && (node = node->parent) != top)
		if ((next = TAILQ_NEXT(node, siblings)) != NULL)
			return (next);

	return (NULL);
}

/*
 * Iterate to the next node in the depth-first search of the topology tree,
 * but without descending below the current node.
 */
struct topo_node *
topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
{
	struct topo_node *next;

	if ((next = TAILQ_NEXT(node, siblings)) != NULL)
		return (next);

	while (node != top && (node = node->parent) != top)
		if ((next = TAILQ_NEXT(node, siblings)) != NULL)
			return (next);

	return (NULL);
}
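
/*
 * Illustrative traversal sketch ('ncores' is a hypothetical counter):
 * callers walk the entire tree with topo_next_node(), and may prune a
 * subtree by continuing from topo_next_nonchild_node() instead, as
 * topo_analyze_table() below does:
 *
 *	struct topo_node *node;
 *
 *	for (node = topo_next_node(root, root); node != NULL;
 *	    node = topo_next_node(root, node)) {
 *		if (node->type == TOPO_TYPE_CORE)
 *			ncores++;
 *	}
 */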

/*
 * Assign the given ID to the given topology node that represents a logical
 * processor.
 */
void
topo_set_pu_id(struct topo_node *node, cpuid_t id)
{

	KASSERT(node->type == TOPO_TYPE_PU,
	    ("topo_set_pu_id: wrong node type: %u", node->type));
	KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
	    ("topo_set_pu_id: cpuset already not empty"));
	node->id = id;
	CPU_SET(id, &node->cpuset);
	node->cpu_count = 1;
	node->subtype = 1;

	while ((node = node->parent) != NULL) {
		KASSERT(!CPU_ISSET(id, &node->cpuset),
		    ("logical ID %u is already set in node %p", id, node));
		CPU_SET(id, &node->cpuset);
		node->cpu_count++;
	}
}

static struct topology_spec {
	topo_node_type	type;
	bool		match_subtype;
	uintptr_t	subtype;
} topology_level_table[TOPO_LEVEL_COUNT] = {
	[TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
	[TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
	[TOPO_LEVEL_CACHEGROUP] = {
		.type = TOPO_TYPE_CACHE,
		.match_subtype = true,
		.subtype = CG_SHARE_L3,
	},
	[TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
	[TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
};

static bool
topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
    struct topo_analysis *results)
{
	struct topology_spec *spec;
	struct topo_node *node;
	int count;

	if (level >= TOPO_LEVEL_COUNT)
		return (true);

	spec = &topology_level_table[level];
	count = 0;
	node = topo_next_node(root, root);

	while (node != NULL) {
		if (node->type != spec->type ||
		    (spec->match_subtype && node->subtype != spec->subtype)) {
			node = topo_next_node(root, node);
			continue;
		}
		if (!all && CPU_EMPTY(&node->cpuset)) {
			node = topo_next_nonchild_node(root, node);
			continue;
		}

		count++;

		if (!topo_analyze_table(node, all, level + 1, results))
			return (false);

		node = topo_next_nonchild_node(root, node);
	}

	/* No explicit subgroups is essentially one subgroup. */
	if (count == 0) {
		count = 1;

		if (!topo_analyze_table(root, all, level + 1, results))
			return (false);
	}

	if (results->entities[level] == -1)
		results->entities[level] = count;
	else if (results->entities[level] != count)
		return (false);

	return (true);
}

/*
 * Check if the topology is uniform, that is, each package has the same number
 * of cores in it and each core has the same number of threads (logical
 * processors) in it.  If so, calculate the number of packages, the number of
 * groups per package, the number of cachegroups per group, and the number of
 * logical processors per cachegroup.  The 'all' parameter tells whether to
 * include administratively disabled logical processors in the analysis.
 */
int
topo_analyze(struct topo_node *topo_root, int all,
    struct topo_analysis *results)
{

	results->entities[TOPO_LEVEL_PKG] = -1;
	results->entities[TOPO_LEVEL_CORE] = -1;
	results->entities[TOPO_LEVEL_THREAD] = -1;
	results->entities[TOPO_LEVEL_GROUP] = -1;
	results->entities[TOPO_LEVEL_CACHEGROUP] = -1;

	if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
		return (0);

	KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
	    ("bug in topology or analysis"));

	return (1);
}

#endif /* SMP */