/*-
 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
 *
 * Copyright (c) 2001, John Baldwin <jhb@FreeBSD.org>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * This module holds the global variables and machine independent functions
 * used for the kernel SMP support.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/proc.h>
#include <sys/bus.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/sysctl.h>

#include <machine/cpu.h>
#include <machine/smp.h>

#include "opt_sched.h"

#ifdef SMP
MALLOC_DEFINE(M_TOPO, "toponodes", "SMP topology data");

volatile cpuset_t stopped_cpus;
volatile cpuset_t started_cpus;
volatile cpuset_t suspended_cpus;
cpuset_t hlt_cpus_mask;
cpuset_t logical_cpus_mask;

void (*cpustop_restartfunc)(void);
#endif

static int sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS);

/* This is used in modules that need to work in both SMP and UP. */
cpuset_t all_cpus;

int mp_ncpus;
/* export this for libkvm consumers. */
int mp_maxcpus = MAXCPU;

volatile int smp_started;
u_int mp_maxid;

static SYSCTL_NODE(_kern, OID_AUTO, smp, CTLFLAG_RD|CTLFLAG_CAPRD, NULL,
    "Kernel SMP");

SYSCTL_INT(_kern_smp, OID_AUTO, maxid, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxid, 0,
    "Max CPU ID.");

SYSCTL_INT(_kern_smp, OID_AUTO, maxcpus, CTLFLAG_RD|CTLFLAG_CAPRD, &mp_maxcpus,
    0, "Max number of CPUs that the system was compiled for.");

SYSCTL_PROC(_kern_smp, OID_AUTO, active, CTLFLAG_RD|CTLTYPE_INT|CTLFLAG_MPSAFE,
    NULL, 0, sysctl_kern_smp_active, "I",
    "Indicates system is running in SMP mode");

int smp_disabled = 0;	/* has smp been disabled? */
SYSCTL_INT(_kern_smp, OID_AUTO, disabled, CTLFLAG_RDTUN|CTLFLAG_CAPRD,
    &smp_disabled, 0, "SMP has been disabled from the loader");

int smp_cpus = 1;	/* how many cpu's running */
SYSCTL_INT(_kern_smp, OID_AUTO, cpus, CTLFLAG_RD|CTLFLAG_CAPRD, &smp_cpus, 0,
    "Number of CPUs online");

int smp_topology = 0;	/* Which topology we're using. */
SYSCTL_INT(_kern_smp, OID_AUTO, topology, CTLFLAG_RDTUN, &smp_topology, 0,
    "Topology override setting; 0 is default provided by hardware.");
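
/*
 * Illustrative sketch (added commentary, not compiled): code that must
 * work on both SMP and UP kernels can size per-CPU state from mp_maxid
 * and walk the present CPUs with CPU_FOREACH(), which skips absent CPU
 * IDs.  The per-CPU counter array here is hypothetical.
 *
 *	uint64_t *counters;
 *	int cpu;
 *
 *	counters = malloc(sizeof(*counters) * (mp_maxid + 1), M_TEMP,
 *	    M_WAITOK | M_ZERO);
 *	CPU_FOREACH(cpu)
 *		counters[cpu] = 0;
 */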

#ifdef SMP
/* Enable forwarding of a signal to a process running on a different CPU */
static int forward_signal_enabled = 1;
SYSCTL_INT(_kern_smp, OID_AUTO, forward_signal_enabled, CTLFLAG_RW,
    &forward_signal_enabled, 0,
    "Forwarding of a signal to a process on a different CPU");

/* Variables needed for SMP rendezvous. */
static volatile int smp_rv_ncpus;
static void (*volatile smp_rv_setup_func)(void *arg);
static void (*volatile smp_rv_action_func)(void *arg);
static void (*volatile smp_rv_teardown_func)(void *arg);
static void *volatile smp_rv_func_arg;
static volatile int smp_rv_waiters[4];

/*
 * Shared mutex to restrict busywaits between smp_rendezvous() and
 * smp(_targeted)_tlb_shootdown().  A deadlock occurs if both of these
 * functions trigger at once and cause multiple CPUs to busywait with
 * interrupts disabled.
 */
struct mtx smp_ipi_mtx;

/*
 * Let the MD SMP code initialize mp_maxid very early if it can.
 */
static void
mp_setmaxid(void *dummy)
{

	cpu_mp_setmaxid();

	KASSERT(mp_ncpus >= 1, ("%s: CPU count < 1", __func__));
	KASSERT(mp_ncpus > 1 || mp_maxid == 0,
	    ("%s: one CPU but mp_maxid is not zero", __func__));
	KASSERT(mp_maxid >= mp_ncpus - 1,
	    ("%s: counters out of sync: max %d, count %d", __func__,
	    mp_maxid, mp_ncpus));
}
SYSINIT(cpu_mp_setmaxid, SI_SUB_TUNABLES, SI_ORDER_FIRST, mp_setmaxid, NULL);

/*
 * Call the MD SMP initialization code.
 */
static void
mp_start(void *dummy)
{

	mtx_init(&smp_ipi_mtx, "smp rendezvous", NULL, MTX_SPIN);

	/* Probe for MP hardware. */
	if (smp_disabled != 0 || cpu_mp_probe() == 0) {
		mp_ncpus = 1;
		CPU_SETOF(PCPU_GET(cpuid), &all_cpus);
		return;
	}

	cpu_mp_start();
	printf("FreeBSD/SMP: Multiprocessor System Detected: %d CPUs\n",
	    mp_ncpus);
	cpu_mp_announce();
}
SYSINIT(cpu_mp, SI_SUB_CPU, SI_ORDER_THIRD, mp_start, NULL);

void
forward_signal(struct thread *td)
{
	int id;

	/*
	 * signotify() has already set TDF_ASTPENDING and TDF_NEEDSIGCHECK on
	 * this thread, so all we need to do is poke it if it is currently
	 * executing so that it executes ast().
	 */
	THREAD_LOCK_ASSERT(td, MA_OWNED);
	KASSERT(TD_IS_RUNNING(td),
	    ("forward_signal: thread is not TDS_RUNNING"));

	CTR1(KTR_SMP, "forward_signal(%p)", td->td_proc);

	if (!smp_started || cold || panicstr)
		return;
	if (!forward_signal_enabled)
		return;

	/* No need to IPI ourself. */
	if (td == curthread)
		return;

	id = td->td_oncpu;
	if (id == NOCPU)
		return;
	ipi_cpu(id, IPI_AST);
}

/*
 * When called, the executing CPU will send an IPI to all other CPUs
 * requesting that they halt execution.
 *
 * Usually (but not necessarily) called with 'other_cpus' as its arg.
 *
 *  - Signals all CPUs in map to stop.
 *  - Waits for each to stop.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 *
 */
#if defined(__amd64__) || defined(__i386__)
#define	X86	1
#else
#define	X86	0
#endif
static int
generic_stop_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
	char cpusetbuf[CPUSETBUFSIZ];
#endif
	static volatile u_int stopping_cpu = NOCPU;
	int i;
	volatile cpuset_t *cpus;

	KASSERT(
	    type == IPI_STOP || type == IPI_STOP_HARD
#if X86
	    || type == IPI_SUSPEND
#endif
	    , ("%s: invalid stop type", __func__));

	if (!smp_started)
		return (0);

	CTR2(KTR_SMP, "stop_cpus(%s) with %u type",
	    cpusetobj_strprint(cpusetbuf, &map), type);

#if X86
	/*
	 * When suspending, ensure there are no IPIs in progress.
	 * IPIs that have been issued, but not yet delivered (e.g.
	 * not pending on a vCPU when running under virtualization)
	 * will be lost, violating FreeBSD's assumption of reliable
	 * IPI delivery.
	 */
	if (type == IPI_SUSPEND)
		mtx_lock_spin(&smp_ipi_mtx);
#endif

#if X86
	if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
#endif
	if (stopping_cpu != PCPU_GET(cpuid))
		while (atomic_cmpset_int(&stopping_cpu, NOCPU,
		    PCPU_GET(cpuid)) == 0)
			while (stopping_cpu != NOCPU)
				cpu_spinwait(); /* spin */

	/* send the stop IPI to all CPUs in map */
	ipi_selected(map, type);
#if X86
	}
#endif

#if X86
	if (type == IPI_SUSPEND)
		cpus = &suspended_cpus;
	else
#endif
		cpus = &stopped_cpus;

	i = 0;
	while (!CPU_SUBSET(cpus, &map)) {
		/* spin */
		cpu_spinwait();
		i++;
		if (i == 100000000) {
			printf("timeout stopping cpus\n");
			break;
		}
	}

#if X86
	if (type == IPI_SUSPEND)
		mtx_unlock_spin(&smp_ipi_mtx);
#endif

	stopping_cpu = NOCPU;
	return (1);
}

int
stop_cpus(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_STOP));
}

int
stop_cpus_hard(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_STOP_HARD));
}

#if X86
int
suspend_cpus(cpuset_t map)
{

	return (generic_stop_cpus(map, IPI_SUSPEND));
}
#endif

/*
 * Called by a CPU to restart stopped CPUs.
 *
 * Usually (but not necessarily) called with 'stopped_cpus' as its arg.
 *
 *  - Signals all CPUs in map to restart.
 *  - Waits for each to restart.
 *
 * Returns:
 *  -1: error
 *   0: NA
 *   1: ok
 */
static int
generic_restart_cpus(cpuset_t map, u_int type)
{
#ifdef KTR
	char cpusetbuf[CPUSETBUFSIZ];
#endif
	volatile cpuset_t *cpus;

	KASSERT(type == IPI_STOP || type == IPI_STOP_HARD
#if X86
	    || type == IPI_SUSPEND
#endif
	    , ("%s: invalid stop type", __func__));

	if (!smp_started)
		return (0);

	CTR1(KTR_SMP, "restart_cpus(%s)", cpusetobj_strprint(cpusetbuf, &map));

#if X86
	if (type == IPI_SUSPEND)
		cpus = &suspended_cpus;
	else
#endif
		cpus = &stopped_cpus;

	/* signal other cpus to restart */
	CPU_COPY_STORE_REL(&map, &started_cpus);

#if X86
	if (!nmi_is_broadcast || nmi_kdb_lock == 0) {
#endif
	/* wait for each to clear its bit */
	while (CPU_OVERLAP(cpus, &map))
		cpu_spinwait();
#if X86
	}
#endif

	return (1);
}

int
restart_cpus(cpuset_t map)
{

	return (generic_restart_cpus(map, IPI_STOP));
}

#if X86
int
resume_cpus(cpuset_t map)
{

	return (generic_restart_cpus(map, IPI_SUSPEND));
}
#endif
#undef X86
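
/*
 * Illustrative sketch (added commentary, not compiled): a caller that
 * needs all other CPUs quiescent, for example while patching code or
 * collecting a consistent dump, typically brackets its work with the
 * stop and restart primitives above.  "do_single_threaded_work" is a
 * hypothetical placeholder for the caller's own routine.
 *
 *	cpuset_t other_cpus;
 *
 *	other_cpus = all_cpus;
 *	CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 *	stop_cpus_hard(other_cpus);
 *	do_single_threaded_work();
 *	restart_cpus(stopped_cpus);
 */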

/*
 * All-CPU rendezvous.  CPUs are signalled, all execute the setup function
 * (if specified), rendezvous, execute the action function (if specified),
 * rendezvous again, execute the teardown function (if specified), and then
 * resume.
 *
 * Note that the supplied external functions _must_ be reentrant and aware
 * that they are running in parallel and in an unknown lock context.
 */
void
smp_rendezvous_action(void)
{
	struct thread *td;
	void *local_func_arg;
	void (*local_setup_func)(void*);
	void (*local_action_func)(void*);
	void (*local_teardown_func)(void*);
#ifdef INVARIANTS
	int owepreempt;
#endif

	/* Ensure we have up-to-date values. */
	atomic_add_acq_int(&smp_rv_waiters[0], 1);
	while (smp_rv_waiters[0] < smp_rv_ncpus)
		cpu_spinwait();

	/* Fetch rendezvous parameters after acquire barrier. */
	local_func_arg = smp_rv_func_arg;
	local_setup_func = smp_rv_setup_func;
	local_action_func = smp_rv_action_func;
	local_teardown_func = smp_rv_teardown_func;

	/*
	 * Use a nested critical section to prevent any preemptions
	 * from occurring during a rendezvous action routine.
	 * Specifically, if a rendezvous handler is invoked via an IPI
	 * and the interrupted thread was in the critical_exit()
	 * function after setting td_critnest to 0 but before
	 * performing a deferred preemption, this routine can be
	 * invoked with td_critnest set to 0 and td_owepreempt true.
	 * In that case, a critical_exit() during the rendezvous
	 * action would trigger a preemption which is not permitted in
	 * a rendezvous action.  To fix this, wrap all of the
	 * rendezvous action handlers in a critical section.  We
	 * cannot use a regular critical section however as having
	 * critical_exit() preempt from this routine would also be
	 * problematic (the preemption must not occur before the IPI
	 * has been acknowledged via an EOI).  Instead, we
	 * intentionally ignore td_owepreempt when leaving the
	 * critical section.  This should be harmless because we do
	 * not permit rendezvous action routines to schedule threads,
	 * and thus td_owepreempt should never transition from 0 to 1
	 * during this routine.
	 */
	td = curthread;
	td->td_critnest++;
#ifdef INVARIANTS
	owepreempt = td->td_owepreempt;
#endif

	/*
	 * If requested, run a setup function before the main action
	 * function.  Ensure all CPUs have completed the setup
	 * function before moving on to the action function.
	 */
	if (local_setup_func != smp_no_rendezvous_barrier) {
		if (smp_rv_setup_func != NULL)
			smp_rv_setup_func(smp_rv_func_arg);
		atomic_add_int(&smp_rv_waiters[1], 1);
		while (smp_rv_waiters[1] < smp_rv_ncpus)
			cpu_spinwait();
	}

	if (local_action_func != NULL)
		local_action_func(local_func_arg);

	if (local_teardown_func != smp_no_rendezvous_barrier) {
		/*
		 * Signal that the main action has been completed.  If a
		 * full exit rendezvous is requested, then all CPUs will
		 * wait here until all CPUs have finished the main action.
		 */
		atomic_add_int(&smp_rv_waiters[2], 1);
		while (smp_rv_waiters[2] < smp_rv_ncpus)
			cpu_spinwait();

		if (local_teardown_func != NULL)
			local_teardown_func(local_func_arg);
	}

	/*
	 * Signal that the rendezvous is fully completed by this CPU.
	 * This means that no member of smp_rv_* pseudo-structure will be
	 * accessed by this target CPU after this point; in particular,
	 * memory pointed by smp_rv_func_arg.
	 *
	 * The release semantic ensures that all accesses performed by
	 * the current CPU are visible when smp_rendezvous_cpus()
	 * returns, by synchronizing with the
	 * atomic_load_acq_int(&smp_rv_waiters[3]).
	 */
	atomic_add_rel_int(&smp_rv_waiters[3], 1);

	td->td_critnest--;
	KASSERT(owepreempt == td->td_owepreempt,
	    ("rendezvous action changed td_owepreempt"));
}

void
smp_rendezvous_cpus(cpuset_t map,
	void (* setup_func)(void *),
	void (* action_func)(void *),
	void (* teardown_func)(void *),
	void *arg)
{
	int curcpumap, i, ncpus = 0;

	/* See the comments in the !SMP case. */
	if (!smp_started) {
		spinlock_enter();
		if (setup_func != NULL)
			setup_func(arg);
		if (action_func != NULL)
			action_func(arg);
		if (teardown_func != NULL)
			teardown_func(arg);
		spinlock_exit();
		return;
	}

	CPU_FOREACH(i) {
		if (CPU_ISSET(i, &map))
			ncpus++;
	}
	if (ncpus == 0)
		panic("ncpus is 0 with non-zero map");

	mtx_lock_spin(&smp_ipi_mtx);

	/* Pass rendezvous parameters via global variables. */
	smp_rv_ncpus = ncpus;
	smp_rv_setup_func = setup_func;
	smp_rv_action_func = action_func;
	smp_rv_teardown_func = teardown_func;
	smp_rv_func_arg = arg;
	smp_rv_waiters[1] = 0;
	smp_rv_waiters[2] = 0;
	smp_rv_waiters[3] = 0;
	atomic_store_rel_int(&smp_rv_waiters[0], 0);

	/*
	 * Signal other processors, which will enter the IPI with
	 * interrupts off.
	 */
	curcpumap = CPU_ISSET(curcpu, &map);
	CPU_CLR(curcpu, &map);
	ipi_selected(map, IPI_RENDEZVOUS);

	/* Check if the current CPU is in the map */
	if (curcpumap != 0)
		smp_rendezvous_action();

	/*
	 * Ensure that the master CPU waits for all the other
	 * CPUs to finish the rendezvous, so that smp_rv_*
	 * pseudo-structure and the arg are guaranteed to not
	 * be in use.
	 *
	 * Load acquire synchronizes with the release add in
	 * smp_rendezvous_action(), which ensures that our caller sees
	 * all memory actions done by the called functions on other
	 * CPUs.
	 */
	while (atomic_load_acq_int(&smp_rv_waiters[3]) < ncpus)
		cpu_spinwait();

	mtx_unlock_spin(&smp_ipi_mtx);
}

void
smp_rendezvous(void (* setup_func)(void *),
	void (* action_func)(void *),
	void (* teardown_func)(void *),
	void *arg)
{
	smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func, arg);
}
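
/*
 * Illustrative sketch (added commentary, not compiled): a typical
 * caller that must run a short handler on every CPU, for instance to
 * invalidate per-CPU cached state, passes smp_no_rendezvous_barrier
 * for the phases it does not need.  The "flush_local_state" action and
 * its "ctx" argument are hypothetical.
 *
 *	static void
 *	flush_local_state(void *arg)
 *	{
 *		do_percpu_work(arg);
 *	}
 *
 *	smp_rendezvous(smp_no_rendezvous_barrier, flush_local_state,
 *	    smp_no_rendezvous_barrier, &ctx);
 *
 * Passing smp_no_rendezvous_barrier skips the corresponding
 * synchronization barrier, while passing NULL still performs the
 * barrier but runs no function for that phase.
 */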

static struct cpu_group group[MAXCPU * MAX_CACHE_LEVELS + 1];

struct cpu_group *
smp_topo(void)
{
	char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
	struct cpu_group *top;

	/*
	 * Check for a fake topology request for debugging purposes.
	 */
	switch (smp_topology) {
	case 1:
		/* Dual core with no sharing. */
		top = smp_topo_1level(CG_SHARE_NONE, 2, 0);
		break;
	case 2:
		/* No topology, all cpus are equal. */
		top = smp_topo_none();
		break;
	case 3:
		/* Dual core with shared L2. */
		top = smp_topo_1level(CG_SHARE_L2, 2, 0);
		break;
	case 4:
		/* quad core, shared l3 among each package, private l2. */
		top = smp_topo_1level(CG_SHARE_L3, 4, 0);
		break;
	case 5:
		/* quad core, 2 dualcore parts on each package share l2. */
		top = smp_topo_2level(CG_SHARE_NONE, 2, CG_SHARE_L2, 2, 0);
		break;
	case 6:
		/* Single-core 2xHTT */
		top = smp_topo_1level(CG_SHARE_L1, 2, CG_FLAG_HTT);
		break;
	case 7:
		/* quad core with a shared l3, 8 threads sharing L2. */
		top = smp_topo_2level(CG_SHARE_L3, 4, CG_SHARE_L2, 8,
		    CG_FLAG_SMT);
		break;
	default:
		/* Default, ask the system what it wants. */
		top = cpu_topo();
		break;
	}
	/*
	 * Verify the returned topology.
	 */
	if (top->cg_count != mp_ncpus)
		panic("Built bad topology at %p.  CPU count %d != %d",
		    top, top->cg_count, mp_ncpus);
	if (CPU_CMP(&top->cg_mask, &all_cpus))
		panic("Built bad topology at %p.  CPU mask (%s) != (%s)",
		    top, cpusetobj_strprint(cpusetbuf, &top->cg_mask),
		    cpusetobj_strprint(cpusetbuf2, &all_cpus));

	/*
	 * Collapse nonsense levels that may be created out of convenience by
	 * the MD layers.  They cause extra work in the search functions.
	 */
	while (top->cg_children == 1) {
		top = &top->cg_child[0];
		top->cg_parent = NULL;
	}
	return (top);
}
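
/*
 * Illustrative note (added commentary): the fake topologies above are
 * selected with the kern.smp.topology tunable, e.g. in
 * /boot/loader.conf on a test machine:
 *
 *	kern.smp.topology="5"
 *
 * which makes smp_topo() report quad-core packages built from two
 * dual-core L2 groups, regardless of what cpu_topo() would detect.
 * The sysctl is CTLFLAG_RDTUN, so the override must be set before
 * boot and is read-only at runtime.
 */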

struct cpu_group *
smp_topo_alloc(u_int count)
{
	static u_int index;
	u_int curr;

	curr = index;
	index += count;
	return (&group[curr]);
}

struct cpu_group *
smp_topo_none(void)
{
	struct cpu_group *top;

	top = &group[0];
	top->cg_parent = NULL;
	top->cg_child = NULL;
	top->cg_mask = all_cpus;
	top->cg_count = mp_ncpus;
	top->cg_children = 0;
	top->cg_level = CG_SHARE_NONE;
	top->cg_flags = 0;

	return (top);
}

static int
smp_topo_addleaf(struct cpu_group *parent, struct cpu_group *child, int share,
    int count, int flags, int start)
{
	char cpusetbuf[CPUSETBUFSIZ], cpusetbuf2[CPUSETBUFSIZ];
	cpuset_t mask;
	int i;

	CPU_ZERO(&mask);
	for (i = 0; i < count; i++, start++)
		CPU_SET(start, &mask);
	child->cg_parent = parent;
	child->cg_child = NULL;
	child->cg_children = 0;
	child->cg_level = share;
	child->cg_count = count;
	child->cg_flags = flags;
	child->cg_mask = mask;
	parent->cg_children++;
	for (; parent != NULL; parent = parent->cg_parent) {
		if (CPU_OVERLAP(&parent->cg_mask, &child->cg_mask))
			panic("Duplicate children in %p.  mask (%s) child (%s)",
			    parent,
			    cpusetobj_strprint(cpusetbuf, &parent->cg_mask),
			    cpusetobj_strprint(cpusetbuf2, &child->cg_mask));
		CPU_OR(&parent->cg_mask, &child->cg_mask);
		parent->cg_count += child->cg_count;
	}

	return (start);
}

struct cpu_group *
smp_topo_1level(int share, int count, int flags)
{
	struct cpu_group *child;
	struct cpu_group *top;
	int packages;
	int cpu;
	int i;

	cpu = 0;
	top = &group[0];
	packages = mp_ncpus / count;
	top->cg_child = child = &group[1];
	top->cg_level = CG_SHARE_NONE;
	for (i = 0; i < packages; i++, child++)
		cpu = smp_topo_addleaf(top, child, share, count, flags, cpu);
	return (top);
}

struct cpu_group *
smp_topo_2level(int l2share, int l2count, int l1share, int l1count,
    int l1flags)
{
	struct cpu_group *top;
	struct cpu_group *l1g;
	struct cpu_group *l2g;
	int cpu;
	int i;
	int j;

	cpu = 0;
	top = &group[0];
	l2g = &group[1];
	top->cg_child = l2g;
	top->cg_level = CG_SHARE_NONE;
	top->cg_children = mp_ncpus / (l2count * l1count);
	l1g = l2g + top->cg_children;
	for (i = 0; i < top->cg_children; i++, l2g++) {
		l2g->cg_parent = top;
		l2g->cg_child = l1g;
		l2g->cg_level = l2share;
		for (j = 0; j < l2count; j++, l1g++)
			cpu = smp_topo_addleaf(l2g, l1g, l1share, l1count,
			    l1flags, cpu);
	}
	return (top);
}

struct cpu_group *
smp_topo_find(struct cpu_group *top, int cpu)
{
	struct cpu_group *cg;
	cpuset_t mask;
	int children;
	int i;

	CPU_SETOF(cpu, &mask);
	cg = top;
	for (;;) {
		if (!CPU_OVERLAP(&cg->cg_mask, &mask))
			return (NULL);
		if (cg->cg_children == 0)
			return (cg);
		children = cg->cg_children;
		for (i = 0, cg = cg->cg_child; i < children; cg++, i++)
			if (CPU_OVERLAP(&cg->cg_mask, &mask))
				break;
	}
	return (NULL);
}
#else /* !SMP */

void
smp_rendezvous_cpus(cpuset_t map,
	void (*setup_func)(void *),
	void (*action_func)(void *),
	void (*teardown_func)(void *),
	void *arg)
{
	/*
	 * In the !SMP case we just need to ensure the same initial conditions
	 * as the SMP case.
	 */
	spinlock_enter();
	if (setup_func != NULL)
		setup_func(arg);
	if (action_func != NULL)
		action_func(arg);
	if (teardown_func != NULL)
		teardown_func(arg);
	spinlock_exit();
}

void
smp_rendezvous(void (*setup_func)(void *),
	void (*action_func)(void *),
	void (*teardown_func)(void *),
	void *arg)
{

	smp_rendezvous_cpus(all_cpus, setup_func, action_func, teardown_func,
	    arg);
}

/*
 * Provide dummy SMP support for UP kernels.  Modules that need to use SMP
 * APIs will still work using this dummy support.
 */
static void
mp_setvariables_for_up(void *dummy)
{
	mp_ncpus = 1;
	mp_maxid = PCPU_GET(cpuid);
	CPU_SETOF(mp_maxid, &all_cpus);
	KASSERT(PCPU_GET(cpuid) == 0, ("UP must have a CPU ID of zero"));
}
SYSINIT(cpu_mp_setvariables, SI_SUB_TUNABLES, SI_ORDER_FIRST,
    mp_setvariables_for_up, NULL);
#endif /* SMP */

void
smp_no_rendezvous_barrier(void *dummy)
{
#ifdef SMP
	KASSERT((!smp_started),("smp_no_rendezvous called and smp is started"));
#endif
}

/*
 * Wait for specified idle threads to switch once.  This ensures that even
 * preempted threads have cycled through the switch function once,
 * exiting their codepaths.  This allows us to change global pointers
 * with no other synchronization.
 */
int
quiesce_cpus(cpuset_t map, const char *wmesg, int prio)
{
	struct pcpu *pcpu;
	u_int gen[MAXCPU];
	int error;
	int cpu;

	error = 0;
	for (cpu = 0; cpu <= mp_maxid; cpu++) {
		if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
			continue;
		pcpu = pcpu_find(cpu);
		gen[cpu] = pcpu->pc_idlethread->td_generation;
	}
	for (cpu = 0; cpu <= mp_maxid; cpu++) {
		if (!CPU_ISSET(cpu, &map) || CPU_ABSENT(cpu))
			continue;
		pcpu = pcpu_find(cpu);
		thread_lock(curthread);
		sched_bind(curthread, cpu);
		thread_unlock(curthread);
		while (gen[cpu] == pcpu->pc_idlethread->td_generation) {
			error = tsleep(quiesce_cpus, prio, wmesg, 1);
			if (error != EWOULDBLOCK)
				goto out;
			error = 0;
		}
	}
out:
	thread_lock(curthread);
	sched_unbind(curthread);
	thread_unlock(curthread);

	return (error);
}

int
quiesce_all_cpus(const char *wmesg, int prio)
{

	return quiesce_cpus(all_cpus, wmesg, prio);
}
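
/*
 * Illustrative sketch (added commentary, not compiled): the classic use
 * of quiesce_all_cpus() is to retire the old copy of a global data
 * structure after publishing a new pointer, without requiring readers
 * to take a lock.  The "active_table" and "new_table" globals and
 * their type are hypothetical.
 *
 *	struct table *old;
 *
 *	old = active_table;
 *	active_table = new_table;
 *	quiesce_all_cpus("tblfree", 0);
 *	free(old, M_TEMP);
 *
 * Once every CPU's idle thread has switched at least once, no CPU can
 * still be running a code path that dereferences the old pointer.
 */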

/* Extra care is taken with this sysctl because the data type is volatile */
static int
sysctl_kern_smp_active(SYSCTL_HANDLER_ARGS)
{
	int error, active;

	active = smp_started;
	error = SYSCTL_OUT(req, &active, sizeof(active));
	return (error);
}

#ifdef SMP
void
topo_init_node(struct topo_node *node)
{

	bzero(node, sizeof(*node));
	TAILQ_INIT(&node->children);
}

void
topo_init_root(struct topo_node *root)
{

	topo_init_node(root);
	root->type = TOPO_TYPE_SYSTEM;
}

/*
 * Add a child node with the given ID under the given parent.
 * Do nothing if there is already a child with that ID.
 */
struct topo_node *
topo_add_node_by_hwid(struct topo_node *parent, int hwid,
    topo_node_type type, uintptr_t subtype)
{
	struct topo_node *node;

	TAILQ_FOREACH_REVERSE(node, &parent->children,
	    topo_children, siblings) {
		if (node->hwid == hwid
		    && node->type == type && node->subtype == subtype) {
			return (node);
		}
	}

	node = malloc(sizeof(*node), M_TOPO, M_WAITOK);
	topo_init_node(node);
	node->parent = parent;
	node->hwid = hwid;
	node->type = type;
	node->subtype = subtype;
	TAILQ_INSERT_TAIL(&parent->children, node, siblings);
	parent->nchildren++;

	return (node);
}

/*
 * Find a child node with the given ID under the given parent.
 */
struct topo_node *
topo_find_node_by_hwid(struct topo_node *parent, int hwid,
    topo_node_type type, uintptr_t subtype)
{

	struct topo_node *node;

	TAILQ_FOREACH(node, &parent->children, siblings) {
		if (node->hwid == hwid
		    && node->type == type && node->subtype == subtype) {
			return (node);
		}
	}

	return (NULL);
}
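
/*
 * Illustrative sketch (added commentary, not compiled): an MD CPU
 * enumerator typically builds the tree top-down with
 * topo_add_node_by_hwid(), which is idempotent for a given
 * (hwid, type, subtype), and then tags each leaf with
 * topo_set_pu_id().  The "topo_root" node and the zero hardware IDs
 * are hypothetical.
 *
 *	static struct topo_node topo_root;
 *	struct topo_node *pkg, *core, *pu;
 *
 *	topo_init_root(&topo_root);
 *	pkg = topo_add_node_by_hwid(&topo_root, 0, TOPO_TYPE_PKG, 0);
 *	core = topo_add_node_by_hwid(pkg, 0, TOPO_TYPE_CORE, 0);
 *	pu = topo_add_node_by_hwid(core, 0, TOPO_TYPE_PU, 0);
 *	topo_set_pu_id(pu, 0);
 */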

/*
 * Given a node, change the order of its parent's child nodes such
 * that the node becomes the first child while preserving the cyclic
 * order of the children.  In other words, the given node is promoted
 * by rotation.
 */
void
topo_promote_child(struct topo_node *child)
{
	struct topo_node *next;
	struct topo_node *node;
	struct topo_node *parent;

	parent = child->parent;
	next = TAILQ_NEXT(child, siblings);
	TAILQ_REMOVE(&parent->children, child, siblings);
	TAILQ_INSERT_HEAD(&parent->children, child, siblings);

	while (next != NULL) {
		node = next;
		next = TAILQ_NEXT(node, siblings);
		TAILQ_REMOVE(&parent->children, node, siblings);
		TAILQ_INSERT_AFTER(&parent->children, child, node, siblings);
		child = node;
	}
}

/*
 * Iterate to the next node in the depth-first search (traversal) of
 * the topology tree.
 */
struct topo_node *
topo_next_node(struct topo_node *top, struct topo_node *node)
{
	struct topo_node *next;

	if ((next = TAILQ_FIRST(&node->children)) != NULL)
		return (next);

	if ((next = TAILQ_NEXT(node, siblings)) != NULL)
		return (next);

	while (node != top && (node = node->parent) != top)
		if ((next = TAILQ_NEXT(node, siblings)) != NULL)
			return (next);

	return (NULL);
}

/*
 * Iterate to the next node in the depth-first search of the topology tree,
 * but without descending below the current node.
 */
struct topo_node *
topo_next_nonchild_node(struct topo_node *top, struct topo_node *node)
{
	struct topo_node *next;

	if ((next = TAILQ_NEXT(node, siblings)) != NULL)
		return (next);

	while (node != top && (node = node->parent) != top)
		if ((next = TAILQ_NEXT(node, siblings)) != NULL)
			return (next);

	return (NULL);
}
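
/*
 * Illustrative sketch (added commentary, not compiled): visiting every
 * node of a subtree with topo_next_node() is a simple loop; starting
 * at the root visits the whole topology.  "root" stands for any
 * previously built node.
 *
 *	struct topo_node *node;
 *
 *	node = topo_next_node(root, root);
 *	while (node != NULL) {
 *		if (node->type == TOPO_TYPE_PU)
 *			printf("PU %d\n", (int)node->id);
 *		node = topo_next_node(root, node);
 *	}
 */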

/*
 * Assign the given ID to the given topology node that represents a logical
 * processor.
 */
void
topo_set_pu_id(struct topo_node *node, cpuid_t id)
{

	KASSERT(node->type == TOPO_TYPE_PU,
	    ("topo_set_pu_id: wrong node type: %u", node->type));
	KASSERT(CPU_EMPTY(&node->cpuset) && node->cpu_count == 0,
	    ("topo_set_pu_id: cpuset already not empty"));
	node->id = id;
	CPU_SET(id, &node->cpuset);
	node->cpu_count = 1;
	node->subtype = 1;

	while ((node = node->parent) != NULL) {
		KASSERT(!CPU_ISSET(id, &node->cpuset),
		    ("logical ID %u is already set in node %p", id, node));
		CPU_SET(id, &node->cpuset);
		node->cpu_count++;
	}
}

static struct topology_spec {
	topo_node_type	type;
	bool		match_subtype;
	uintptr_t	subtype;
} topology_level_table[TOPO_LEVEL_COUNT] = {
	[TOPO_LEVEL_PKG] = { .type = TOPO_TYPE_PKG, },
	[TOPO_LEVEL_GROUP] = { .type = TOPO_TYPE_GROUP, },
	[TOPO_LEVEL_CACHEGROUP] = {
		.type = TOPO_TYPE_CACHE,
		.match_subtype = true,
		.subtype = CG_SHARE_L3,
	},
	[TOPO_LEVEL_CORE] = { .type = TOPO_TYPE_CORE, },
	[TOPO_LEVEL_THREAD] = { .type = TOPO_TYPE_PU, },
};

static bool
topo_analyze_table(struct topo_node *root, int all, enum topo_level level,
    struct topo_analysis *results)
{
	struct topology_spec *spec;
	struct topo_node *node;
	int count;

	if (level >= TOPO_LEVEL_COUNT)
		return (true);

	spec = &topology_level_table[level];
	count = 0;
	node = topo_next_node(root, root);

	while (node != NULL) {
		if (node->type != spec->type ||
		    (spec->match_subtype && node->subtype != spec->subtype)) {
			node = topo_next_node(root, node);
			continue;
		}
		if (!all && CPU_EMPTY(&node->cpuset)) {
			node = topo_next_nonchild_node(root, node);
			continue;
		}

		count++;

		if (!topo_analyze_table(node, all, level + 1, results))
			return (false);

		node = topo_next_nonchild_node(root, node);
	}

	/* No explicit subgroups is essentially one subgroup. */
	if (count == 0) {
		count = 1;

		if (!topo_analyze_table(root, all, level + 1, results))
			return (false);
	}

	if (results->entities[level] == -1)
		results->entities[level] = count;
	else if (results->entities[level] != count)
		return (false);

	return (true);
}

/*
 * Check if the topology is uniform, that is, each package has the same number
 * of cores in it and each core has the same number of threads (logical
 * processors) in it.  If so, calculate the number of packages, the number of
 * groups per package, the number of cachegroups per group, and the number of
 * logical processors per cachegroup.  The 'all' parameter tells whether to
 * include administratively disabled logical processors in the analysis.
 */
int
topo_analyze(struct topo_node *topo_root, int all,
    struct topo_analysis *results)
{

	results->entities[TOPO_LEVEL_PKG] = -1;
	results->entities[TOPO_LEVEL_CORE] = -1;
	results->entities[TOPO_LEVEL_THREAD] = -1;
	results->entities[TOPO_LEVEL_GROUP] = -1;
	results->entities[TOPO_LEVEL_CACHEGROUP] = -1;

	if (!topo_analyze_table(topo_root, all, TOPO_LEVEL_PKG, results))
		return (0);

	KASSERT(results->entities[TOPO_LEVEL_PKG] > 0,
	    ("bug in topology or analysis"));

	return (1);
}

#endif /* SMP */
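
/*
 * Illustrative sketch (added commentary, not compiled): a consumer such
 * as a scheduler topology builder can ask whether the detected topology
 * is uniform and, if so, read the per-level counts.  "root" stands for
 * the tree built by the MD code; each entry is the count of entities at
 * that level within one entity of the level above it, as described in
 * the comment above topo_analyze().
 *
 *	struct topo_analysis ta;
 *
 *	if (topo_analyze(root, 1, &ta))
 *		printf("pkg=%d group=%d cache=%d core=%d thread=%d\n",
 *		    ta.entities[TOPO_LEVEL_PKG],
 *		    ta.entities[TOPO_LEVEL_GROUP],
 *		    ta.entities[TOPO_LEVEL_CACHEGROUP],
 *		    ta.entities[TOPO_LEVEL_CORE],
 *		    ta.entities[TOPO_LEVEL_THREAD]);
 */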