/*-
 * Copyright (c) 2015 Nathan Whitehorn
 * Copyright (c) 2017-2018 Semihalf
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/bus.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/pmap.h>

#include <machine/bus.h>
#include <machine/cpu.h>
#include <machine/hid.h>
#include <machine/platformvar.h>
#include <machine/pmap.h>
#include <machine/rtas.h>
#include <machine/smp.h>
#include <machine/spr.h>
#include <machine/trap.h>

#include <dev/ofw/openfirm.h>
#include <dev/ofw/ofw_bus.h>
#include <dev/ofw/ofw_bus_subr.h>
#include <machine/ofw_machdep.h>
#include <powerpc/aim/mmu_oea64.h>

#include "platform_if.h"
#include "opal.h"

#ifdef SMP
extern void *ap_pcpu;
#endif

void (*powernv_smp_ap_extra_init)(void);

static int powernv_probe(platform_t);
static int powernv_attach(platform_t);
void powernv_mem_regions(platform_t, struct mem_region *phys, int *physsz,
    struct mem_region *avail, int *availsz);
static void powernv_numa_mem_regions(platform_t plat,
    struct numa_mem_region *phys, int *physsz);
static u_long powernv_timebase_freq(platform_t, struct cpuref *cpuref);
static int powernv_smp_first_cpu(platform_t, struct cpuref *cpuref);
static int powernv_smp_next_cpu(platform_t, struct cpuref *cpuref);
static int powernv_smp_get_bsp(platform_t, struct cpuref *cpuref);
static void powernv_smp_ap_init(platform_t);
#ifdef SMP
static int powernv_smp_start_cpu(platform_t, struct pcpu *cpu);
static void powernv_smp_probe_threads(platform_t);
static struct cpu_group *powernv_smp_topo(platform_t plat);
#endif
static void powernv_reset(platform_t);
static void powernv_cpu_idle(sbintime_t sbt);
static int powernv_cpuref_init(void);
static int powernv_node_numa_domain(platform_t platform, phandle_t node);

static platform_method_t powernv_methods[] = {
	PLATFORMMETHOD(platform_probe,		powernv_probe),
	PLATFORMMETHOD(platform_attach,		powernv_attach),
	PLATFORMMETHOD(platform_mem_regions,	powernv_mem_regions),
	PLATFORMMETHOD(platform_numa_mem_regions, powernv_numa_mem_regions),
	PLATFORMMETHOD(platform_timebase_freq,	powernv_timebase_freq),

	PLATFORMMETHOD(platform_smp_ap_init,	powernv_smp_ap_init),
	PLATFORMMETHOD(platform_smp_first_cpu,	powernv_smp_first_cpu),
	PLATFORMMETHOD(platform_smp_next_cpu,	powernv_smp_next_cpu),
	PLATFORMMETHOD(platform_smp_get_bsp,	powernv_smp_get_bsp),
#ifdef SMP
	PLATFORMMETHOD(platform_smp_start_cpu,	powernv_smp_start_cpu),
	PLATFORMMETHOD(platform_smp_probe_threads, powernv_smp_probe_threads),
	PLATFORMMETHOD(platform_smp_topo,	powernv_smp_topo),
#endif
	PLATFORMMETHOD(platform_node_numa_domain, powernv_node_numa_domain),

	PLATFORMMETHOD(platform_reset,		powernv_reset),
	{ 0, 0 }
};

static platform_def_t powernv_platform = {
	"powernv",
	powernv_methods,
	0
};

static struct cpuref platform_cpuref[MAXCPU];
static int platform_cpuref_cnt;
static int platform_cpuref_valid;
static int platform_associativity;

PLATFORM_DEF(powernv_platform);

static uint64_t powernv_boot_pir;

/*
 * Base LPCR image programmed in powernv_attach(). The original text used
 * 'lpcr' without any visible definition, so this declaration is supplied
 * here (an assumption, seeded with LPCR_LPES so external interrupts are
 * delivered via SRR rather than HSRR) to make the file self-contained.
 */
static uint64_t lpcr = LPCR_LPES;

static int
powernv_probe(platform_t plat)
{
	if (opal_check() == 0)
		return (BUS_PROBE_SPECIFIC);

	return (ENXIO);
}
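
/*
 * Note on the OPAL_REINIT_CPUS call below: it asks firmware to bring all
 * CPUs to a known state with the HILE bit matching the kernel's byte
 * order. The magic arguments 1 and 2 correspond to skiboot's
 * OPAL_REINIT_CPUS_HILE_BE and OPAL_REINIT_CPUS_HILE_LE flags (names per
 * the skiboot OPAL API; only the numeric values are used here).
 */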

static int
powernv_attach(platform_t plat)
{
	uint32_t nptlp, shift = 0, slb_encoding = 0;
	int32_t lp_size, lp_encoding;
	char buf[255];
	pcell_t refpoints[3];
	pcell_t prop;
	phandle_t cpu;
	phandle_t opal;
	int res, len, idx;
	register_t msr;
	bool has_lp;

	/* Ping OPAL again just to make sure */
	opal_check();

#if BYTE_ORDER == LITTLE_ENDIAN
	opal_call(OPAL_REINIT_CPUS, 2 /* Little endian */);
#else
	opal_call(OPAL_REINIT_CPUS, 1 /* Big endian */);
#endif
	opal = OF_finddevice("/ibm,opal");

	platform_associativity = 4;	/* Skiboot default. */
	if (OF_getencprop(opal, "ibm,associativity-reference-points",
	    refpoints, sizeof(refpoints)) > 0) {
		platform_associativity = refpoints[0];
	}

	if (cpu_idle_hook == NULL)
		cpu_idle_hook = powernv_cpu_idle;

	powernv_boot_pir = mfspr(SPR_PIR);

	/* LPID must not be altered when PSL_DR or PSL_IR is set */
	msr = mfmsr();
	mtmsr(msr & ~(PSL_DR | PSL_IR));

	/* Direct interrupts to SRR instead of HSRR and reset LPCR otherwise */
	mtspr(SPR_LPID, 0);
	isync();

	if (cpu_features2 & PPC_FEATURE2_ARCH_3_00)
		lpcr |= LPCR_HVICE;

#if BYTE_ORDER == LITTLE_ENDIAN
	lpcr |= LPCR_ILE;
#endif

	mtspr(SPR_LPCR, lpcr);
	isync();

	mtmsr(msr);

	powernv_cpuref_init();

	/* Set SLB count from device tree */
	cpu = OF_peer(0);
	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	cpu = OF_child(cpu);
	while (cpu != 0) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
		cpu = OF_peer(cpu);
	}
	if (cpu == 0)
		goto out;

	res = OF_getencprop(cpu, "ibm,slb-size", &prop, sizeof(prop));
	if (res > 0)
		n_slbs = prop;

	/*
	 * Scan the large page size property for PAPR compatible machines.
	 * See PAPR D.5 Changes to Section 5.1.4, 'CPU Node Properties'
	 * for the encoding of the property.
	 */
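
	/*
	 * Illustrative layout (typical POWER values, not read from this
	 * machine): the property is a flat cell array of records
	 *
	 *	{ base-shift, slb-encoding, nptlp,
	 *	    { lp-shift, lp-encoding } * nptlp }
	 *
	 * so a system offering 4K and 16M base pages might report
	 *
	 *	0x0c 0x000 0x2  0x0c 0x0000  0x18 0x0038   (4K base page)
	 *	0x18 0x100 0x1  0x18 0x0000                (16M base page)
	 *
	 * The parser below looks for a standard large page (SLBV_L set,
	 * PTE[LP] = 0) and for the 16M-in-4K-segment encoding (LP_4K_16M,
	 * the 0x38 above) that the moea64 code can use.
	 */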

	len = OF_getproplen(cpu, "ibm,segment-page-sizes");
	if (len > 0) {
		/*
		 * We have to use a variable length array on the stack
		 * since we have very limited stack space.
		 */
		pcell_t arr[len/sizeof(cell_t)];
		res = OF_getencprop(cpu, "ibm,segment-page-sizes", arr,
		    sizeof(arr));
		len /= 4;
		idx = 0;
		has_lp = false;
		while (len > 0) {
			shift = arr[idx];
			slb_encoding = arr[idx + 1];
			nptlp = arr[idx + 2];
			idx += 3;
			len -= 3;
			while (len > 0 && nptlp) {
				lp_size = arr[idx];
				lp_encoding = arr[idx+1];
				if (slb_encoding == SLBV_L && lp_encoding == 0)
					has_lp = true;

				if (slb_encoding == SLB_PGSZ_4K_4K &&
				    lp_encoding == LP_4K_16M)
					moea64_has_lp_4k_16m = true;

				idx += 2;
				len -= 2;
				nptlp--;
			}
			if (has_lp && moea64_has_lp_4k_16m)
				break;
		}

		if (!has_lp)
			panic("Standard large pages (SLB[L] = 1, PTE[LP] = 0) "
			    "not supported by this system.");

		moea64_large_page_shift = shift;
		moea64_large_page_size = 1ULL << lp_size;
	}

out:
	return (0);
}

void
powernv_mem_regions(platform_t plat, struct mem_region *phys, int *physsz,
    struct mem_region *avail, int *availsz)
{

	ofw_mem_regions(phys, physsz, avail, availsz);
}

static void
powernv_numa_mem_regions(platform_t plat, struct numa_mem_region *phys,
    int *physsz)
{

	ofw_numa_mem_regions(phys, physsz);
}

static u_long
powernv_timebase_freq(platform_t plat, struct cpuref *cpuref)
{
	char buf[8];
	phandle_t cpu, dev, root;
	int res;
	int32_t ticks = -1;

	root = OF_peer(0);
	dev = OF_child(root);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0)
			break;
	}
	if (cpu == 0)
		return (512000000);

	OF_getencprop(cpu, "timebase-frequency", &ticks, sizeof(ticks));

	if (ticks <= 0)
		panic("Unable to determine timebase frequency!");

	return (ticks);
}
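
/*
 * Walk the /cpus nodes and record one cpuref per entry of each core's
 * "ibm,ppc-interrupt-server#s" property (one entry per SMT thread), then
 * rotate the table so the boot CPU is always CPUID 0. Illustrative
 * example (hwref values invented): if the threads are gathered as hwrefs
 * { 8, 10, 12, 14 } and the boot PIR matched at index 2, the final table
 * order is { 12, 14, 8, 10 }.
 */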

static int
powernv_cpuref_init(void)
{
	phandle_t cpu, dev;
	char buf[32];
	int a, res, tmp_cpuref_cnt;
	static struct cpuref tmp_cpuref[MAXCPU];
	cell_t interrupt_servers[32];
	uint64_t bsp;

	if (platform_cpuref_valid)
		return (0);

	dev = OF_peer(0);
	dev = OF_child(dev);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	bsp = 0;
	tmp_cpuref_cnt = 0;
	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpu") == 0) {
			if (!ofw_bus_node_status_okay(cpu))
				continue;
			res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");
			if (res > 0) {
				OF_getencprop(cpu,
				    "ibm,ppc-interrupt-server#s",
				    interrupt_servers, res);

				for (a = 0; a < res/sizeof(cell_t); a++) {
					tmp_cpuref[tmp_cpuref_cnt].cr_hwref =
					    interrupt_servers[a];
					tmp_cpuref[tmp_cpuref_cnt].cr_cpuid =
					    tmp_cpuref_cnt;
					tmp_cpuref[tmp_cpuref_cnt].cr_domain =
					    powernv_node_numa_domain(NULL,
					    cpu);
					if (interrupt_servers[a] ==
					    (uint32_t)powernv_boot_pir)
						bsp = tmp_cpuref_cnt;

					tmp_cpuref_cnt++;
				}
			}
		}
	}

	/* Map IDs, so BSP has CPUID 0 regardless of hwref */
	for (a = bsp; a < tmp_cpuref_cnt; a++) {
		platform_cpuref[platform_cpuref_cnt].cr_hwref =
		    tmp_cpuref[a].cr_hwref;
		platform_cpuref[platform_cpuref_cnt].cr_cpuid =
		    platform_cpuref_cnt;
		platform_cpuref[platform_cpuref_cnt].cr_domain =
		    tmp_cpuref[a].cr_domain;
		platform_cpuref_cnt++;
	}
	for (a = 0; a < bsp; a++) {
		platform_cpuref[platform_cpuref_cnt].cr_hwref =
		    tmp_cpuref[a].cr_hwref;
		platform_cpuref[platform_cpuref_cnt].cr_cpuid =
		    platform_cpuref_cnt;
		platform_cpuref[platform_cpuref_cnt].cr_domain =
		    tmp_cpuref[a].cr_domain;
		platform_cpuref_cnt++;
	}

	platform_cpuref_valid = 1;

	return (0);
}

static int
powernv_smp_first_cpu(platform_t plat, struct cpuref *cpuref)
{
	if (platform_cpuref_valid == 0)
		return (EINVAL);

	cpuref->cr_cpuid = 0;
	cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
	cpuref->cr_domain = platform_cpuref[0].cr_domain;

	return (0);
}

static int
powernv_smp_next_cpu(platform_t plat, struct cpuref *cpuref)
{
	int id;

	if (platform_cpuref_valid == 0)
		return (EINVAL);

	id = cpuref->cr_cpuid + 1;
	if (id >= platform_cpuref_cnt)
		return (ENOENT);

	cpuref->cr_cpuid = platform_cpuref[id].cr_cpuid;
	cpuref->cr_hwref = platform_cpuref[id].cr_hwref;
	cpuref->cr_domain = platform_cpuref[id].cr_domain;

	return (0);
}

static int
powernv_smp_get_bsp(platform_t plat, struct cpuref *cpuref)
{

	cpuref->cr_cpuid = platform_cpuref[0].cr_cpuid;
	cpuref->cr_hwref = platform_cpuref[0].cr_hwref;
	cpuref->cr_domain = platform_cpuref[0].cr_domain;
	return (0);
}

#ifdef SMP
static int
powernv_smp_start_cpu(platform_t plat, struct pcpu *pc)
{
	int result;

	ap_pcpu = pc;
	powerpc_sync();

	result = opal_call(OPAL_START_CPU, pc->pc_hwref, EXC_RST);
	if (result != OPAL_SUCCESS) {
		printf("OPAL error (%d): unable to start AP %d\n",
		    result, (int)pc->pc_hwref);
		return (ENXIO);
	}

	return (0);
}

static void
powernv_smp_probe_threads(platform_t plat)
{
	char buf[8];
	phandle_t cpu, dev, root;
	int res, nthreads;

	root = OF_peer(0);

	dev = OF_child(root);
	while (dev != 0) {
		res = OF_getprop(dev, "name", buf, sizeof(buf));
		if (res > 0 && strcmp(buf, "cpus") == 0)
			break;
		dev = OF_peer(dev);
	}

	nthreads = 1;
	for (cpu = OF_child(dev); cpu != 0; cpu = OF_peer(cpu)) {
		res = OF_getprop(cpu, "device_type", buf, sizeof(buf));
		if (res <= 0 || strcmp(buf, "cpu") != 0)
			continue;

		res = OF_getproplen(cpu, "ibm,ppc-interrupt-server#s");

		/*
		 * Require a non-empty property ('res > 0' rather than the
		 * original 'res >= 0') so an empty property cannot yield a
		 * zero thread count and a division by zero below.
		 */
		if (res > 0)
			nthreads = res / sizeof(cell_t);
		else
			nthreads = 1;
		break;
	}

	smp_threads_per_core = nthreads;
	if (mp_ncpus % nthreads == 0)
		mp_ncores = mp_ncpus / nthreads;
}
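
/*
 * Helper for powernv_smp_topo() below: fill in one scheduler topology
 * level and hand back the child array for the caller to populate. The
 * resulting tree is
 *
 *	root (CG_SHARE_NONE)
 *	  -> one group per NUMA domain (CG_SHARE_L3)
 *	       -> one leaf group per core (CG_SHARE_L1, CG_FLAG_SMT)
 *	            holding that core's SMT threads
 *
 * on the assumption, noted in powernv_smp_topo(), that consecutive CPU
 * IDs are sibling threads of one core.
 */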

static struct cpu_group *
cpu_group_init(struct cpu_group *group, struct cpu_group *parent,
    const cpuset_t *cpus, int children, int level, int flags)
{
	struct cpu_group *child;

	child = children != 0 ? smp_topo_alloc(children) : NULL;

	group->cg_parent = parent;
	group->cg_child = child;
	CPU_COPY(cpus, &group->cg_mask);
	group->cg_count = CPU_COUNT(cpus);
	group->cg_children = children;
	group->cg_level = level;
	group->cg_flags = flags;

	return (child);
}

static struct cpu_group *
powernv_smp_topo(platform_t plat)
{
	struct cpu_group *core, *dom, *root;
	cpuset_t corecpus, domcpus;
	int cpuid, i, j, k, ncores;

	if (mp_ncpus % smp_threads_per_core != 0) {
		printf("%s: irregular SMP topology (%d threads, %d per core)\n",
		    __func__, mp_ncpus, smp_threads_per_core);
		return (smp_topo_none());
	}

	root = smp_topo_alloc(1);
	dom = cpu_group_init(root, NULL, &all_cpus, vm_ndomains, CG_SHARE_NONE,
	    0);

	/*
	 * Redundant layers will be collapsed by the caller so we don't need a
	 * special case for a single domain.
	 */
	for (i = 0; i < vm_ndomains; i++, dom++) {
		CPU_COPY(&cpuset_domain[i], &domcpus);
		ncores = CPU_COUNT(&domcpus) / smp_threads_per_core;
		KASSERT(CPU_COUNT(&domcpus) % smp_threads_per_core == 0,
		    ("%s: domain %d core count not divisible by thread count",
		    __func__, i));

		core = cpu_group_init(dom, root, &domcpus, ncores, CG_SHARE_L3,
		    0);
		for (j = 0; j < ncores; j++, core++) {
			/*
			 * Assume that consecutive CPU IDs correspond to
			 * sibling threads.
			 */
			CPU_ZERO(&corecpus);
			for (k = 0; k < smp_threads_per_core; k++) {
				cpuid = CPU_FFS(&domcpus) - 1;
				CPU_CLR(cpuid, &domcpus);
				CPU_SET(cpuid, &corecpus);
			}
			(void)cpu_group_init(core, dom, &corecpus, 0,
			    CG_SHARE_L1, CG_FLAG_SMT);
		}
	}

	return (root);
}

#endif

static void
powernv_reset(platform_t platform)
{

	opal_call(OPAL_CEC_REBOOT);
}

static void
powernv_smp_ap_init(platform_t platform)
{

	if (powernv_smp_ap_extra_init != NULL)
		powernv_smp_ap_extra_init();
}

static void
powernv_cpu_idle(sbintime_t sbt)
{
}
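
/*
 * Map a device tree node to a VM domain via its "ibm,associativity"
 * property, walking up to the parent node if the property is missing or
 * too short. Firmware domain numbers are not required to be small or
 * dense, so they are compressed into sequential indices in the order
 * they are first seen; e.g. (illustrative values) associativity IDs 0
 * and 8 would become domains 0 and 1.
 */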

static int
powernv_node_numa_domain(platform_t platform, phandle_t node)
{
	/* XXX: Is locking necessary in here? */
	static int numa_domains[MAXMEMDOM];
	static int numa_max_domain;
	cell_t associativity[5];
	int i, res;

#ifndef NUMA
	return (0);
#endif
	i = 0;
	TUNABLE_INT_FETCH("vm.numa.disabled", &i);
	if (i)
		return (0);

	res = OF_getencprop(node, "ibm,associativity",
	    associativity, sizeof(associativity));

	/*
	 * If this node doesn't have associativity, or if there are not
	 * enough elements in it, check its parent.
	 */
	if (res < (int)(sizeof(cell_t) * (platform_associativity + 1))) {
		node = OF_parent(node);
		/* If already at the root, use default domain. */
		if (node == 0)
			return (0);
		return (powernv_node_numa_domain(platform, node));
	}

	for (i = 0; i < numa_max_domain; i++) {
		if (numa_domains[i] == associativity[platform_associativity])
			return (i);
	}
	if (i < MAXMEMDOM)
		numa_domains[numa_max_domain++] =
		    associativity[platform_associativity];
	else
		i = 0;

	return (i);
}

/* Set up the Nest MMU on POWER9 relatively early, but after pmap is setup. */
static void
powernv_setup_nmmu(void *unused)
{
	if (opal_check() != 0)
		return;
	opal_call(OPAL_NMMU_SET_PTCR, -1, mfspr(SPR_PTCR));
}

SYSINIT(powernv_setup_nmmu, SI_SUB_CPU, SI_ORDER_ANY, powernv_setup_nmmu, NULL);