1 /*- 2 * Copyright (c) 2010 Hudson River Trading LLC 3 * Written by: John H. Baldwin <jhb@FreeBSD.org> 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions 8 * are met: 9 * 1. Redistributions of source code must retain the above copyright 10 * notice, this list of conditions and the following disclaimer. 11 * 2. Redistributions in binary form must reproduce the above copyright 12 * notice, this list of conditions and the following disclaimer in the 13 * documentation and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 19 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 21 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 23 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 24 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 25 * SUCH DAMAGE. 26 */ 27 28 #include <sys/cdefs.h> 29 __FBSDID("$FreeBSD$"); 30 31 #include "opt_vm.h" 32 33 #include <sys/param.h> 34 #include <sys/systm.h> 35 #include <sys/bus.h> 36 #include <sys/kernel.h> 37 #include <sys/lock.h> 38 #include <sys/mutex.h> 39 #include <sys/smp.h> 40 #include <sys/vmmeter.h> 41 #include <vm/vm.h> 42 #include <vm/pmap.h> 43 #include <vm/vm_param.h> 44 #include <vm/vm_page.h> 45 #include <vm/vm_phys.h> 46 47 #include <contrib/dev/acpica/include/acpi.h> 48 #include <contrib/dev/acpica/include/aclocal.h> 49 #include <contrib/dev/acpica/include/actables.h> 50 51 #include <machine/intr_machdep.h> 52 #include <machine/md_var.h> 53 #include <x86/apicvar.h> 54 55 #include <dev/acpica/acpivar.h> 56 57 #if MAXMEMDOM > 1 58 static struct cpu_info { 59 int enabled:1; 60 int has_memory:1; 61 int domain; 62 } *cpus; 63 64 struct mem_affinity mem_info[VM_PHYSSEG_MAX + 1]; 65 int num_mem; 66 67 static ACPI_TABLE_SRAT *srat; 68 static vm_paddr_t srat_physaddr; 69 70 static int domain_pxm[MAXMEMDOM]; 71 static int ndomain; 72 73 static ACPI_TABLE_SLIT *slit; 74 static vm_paddr_t slit_physaddr; 75 static int vm_locality_table[MAXMEMDOM * MAXMEMDOM]; 76 77 static void srat_walk_table(acpi_subtable_handler *handler, void *arg); 78 79 /* 80 * SLIT parsing. 81 */ 82 83 static void 84 slit_parse_table(ACPI_TABLE_SLIT *s) 85 { 86 int i, j; 87 int i_domain, j_domain; 88 int offset = 0; 89 uint8_t e; 90 91 /* 92 * This maps the SLIT data into the VM-domain centric view. 93 * There may be sparse entries in the PXM namespace, so 94 * remap them to a VM-domain ID and if it doesn't exist, 95 * skip it. 96 * 97 * It should result in a packed 2d array of VM-domain 98 * locality information entries. 99 */ 100 101 if (bootverbose) 102 printf("SLIT.Localities: %d\n", (int) s->LocalityCount); 103 for (i = 0; i < s->LocalityCount; i++) { 104 i_domain = acpi_map_pxm_to_vm_domainid(i); 105 if (i_domain < 0) 106 continue; 107 108 if (bootverbose) 109 printf("%d: ", i); 110 for (j = 0; j < s->LocalityCount; j++) { 111 j_domain = acpi_map_pxm_to_vm_domainid(j); 112 if (j_domain < 0) 113 continue; 114 e = s->Entry[i * s->LocalityCount + j]; 115 if (bootverbose) 116 printf("%d ", (int) e); 117 /* 255 == "no locality information" */ 118 if (e == 255) 119 vm_locality_table[offset] = -1; 120 else 121 vm_locality_table[offset] = e; 122 offset++; 123 } 124 if (bootverbose) 125 printf("\n"); 126 } 127 } 128 129 /* 130 * Look for an ACPI System Locality Distance Information Table ("SLIT") 131 */ 132 static int 133 parse_slit(void) 134 { 135 136 if (resource_disabled("slit", 0)) { 137 return (-1); 138 } 139 140 slit_physaddr = acpi_find_table(ACPI_SIG_SLIT); 141 if (slit_physaddr == 0) { 142 return (-1); 143 } 144 145 /* 146 * Make a pass over the table to populate the cpus[] and 147 * mem_info[] tables. 148 */ 149 slit = acpi_map_table(slit_physaddr, ACPI_SIG_SLIT); 150 slit_parse_table(slit); 151 acpi_unmap_table(slit); 152 slit = NULL; 153 154 #ifdef VM_NUMA_ALLOC 155 /* Tell the VM about it! */ 156 mem_locality = vm_locality_table; 157 #endif 158 return (0); 159 } 160 161 /* 162 * SRAT parsing. 163 */ 164 165 /* 166 * Returns true if a memory range overlaps with at least one range in 167 * phys_avail[]. 168 */ 169 static int 170 overlaps_phys_avail(vm_paddr_t start, vm_paddr_t end) 171 { 172 int i; 173 174 for (i = 0; phys_avail[i] != 0 && phys_avail[i + 1] != 0; i += 2) { 175 if (phys_avail[i + 1] <= start) 176 continue; 177 if (phys_avail[i] < end) 178 return (1); 179 break; 180 } 181 return (0); 182 183 } 184 185 static void 186 srat_parse_entry(ACPI_SUBTABLE_HEADER *entry, void *arg) 187 { 188 ACPI_SRAT_CPU_AFFINITY *cpu; 189 ACPI_SRAT_X2APIC_CPU_AFFINITY *x2apic; 190 ACPI_SRAT_MEM_AFFINITY *mem; 191 int domain, i, slot; 192 193 switch (entry->Type) { 194 case ACPI_SRAT_TYPE_CPU_AFFINITY: 195 cpu = (ACPI_SRAT_CPU_AFFINITY *)entry; 196 domain = cpu->ProximityDomainLo | 197 cpu->ProximityDomainHi[0] << 8 | 198 cpu->ProximityDomainHi[1] << 16 | 199 cpu->ProximityDomainHi[2] << 24; 200 if (bootverbose) 201 printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", 202 cpu->ApicId, domain, 203 (cpu->Flags & ACPI_SRAT_CPU_ENABLED) ? 204 "enabled" : "disabled"); 205 if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) 206 break; 207 if (cpu->ApicId > max_apic_id) { 208 printf("SRAT: Ignoring local APIC ID %u (too high)\n", 209 cpu->ApicId); 210 break; 211 } 212 213 if (cpus[cpu->ApicId].enabled) { 214 printf("SRAT: Duplicate local APIC ID %u\n", 215 cpu->ApicId); 216 *(int *)arg = ENXIO; 217 break; 218 } 219 cpus[cpu->ApicId].domain = domain; 220 cpus[cpu->ApicId].enabled = 1; 221 break; 222 case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: 223 x2apic = (ACPI_SRAT_X2APIC_CPU_AFFINITY *)entry; 224 if (bootverbose) 225 printf("SRAT: Found CPU APIC ID %u domain %d: %s\n", 226 x2apic->ApicId, x2apic->ProximityDomain, 227 (x2apic->Flags & ACPI_SRAT_CPU_ENABLED) ? 228 "enabled" : "disabled"); 229 if (!(x2apic->Flags & ACPI_SRAT_CPU_ENABLED)) 230 break; 231 if (x2apic->ApicId > max_apic_id) { 232 printf("SRAT: Ignoring local APIC ID %u (too high)\n", 233 x2apic->ApicId); 234 break; 235 } 236 237 KASSERT(!cpus[x2apic->ApicId].enabled, 238 ("Duplicate local APIC ID %u", x2apic->ApicId)); 239 cpus[x2apic->ApicId].domain = x2apic->ProximityDomain; 240 cpus[x2apic->ApicId].enabled = 1; 241 break; 242 case ACPI_SRAT_TYPE_MEMORY_AFFINITY: 243 mem = (ACPI_SRAT_MEM_AFFINITY *)entry; 244 if (bootverbose) 245 printf( 246 "SRAT: Found memory domain %d addr 0x%jx len 0x%jx: %s\n", 247 mem->ProximityDomain, (uintmax_t)mem->BaseAddress, 248 (uintmax_t)mem->Length, 249 (mem->Flags & ACPI_SRAT_MEM_ENABLED) ? 250 "enabled" : "disabled"); 251 if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) 252 break; 253 if (!overlaps_phys_avail(mem->BaseAddress, 254 mem->BaseAddress + mem->Length)) { 255 printf("SRAT: Ignoring memory at addr 0x%jx\n", 256 (uintmax_t)mem->BaseAddress); 257 break; 258 } 259 if (num_mem == VM_PHYSSEG_MAX) { 260 printf("SRAT: Too many memory regions\n"); 261 *(int *)arg = ENXIO; 262 break; 263 } 264 slot = num_mem; 265 for (i = 0; i < num_mem; i++) { 266 if (mem_info[i].end <= mem->BaseAddress) 267 continue; 268 if (mem_info[i].start < 269 (mem->BaseAddress + mem->Length)) { 270 printf("SRAT: Overlapping memory entries\n"); 271 *(int *)arg = ENXIO; 272 return; 273 } 274 slot = i; 275 } 276 for (i = num_mem; i > slot; i--) 277 mem_info[i] = mem_info[i - 1]; 278 mem_info[slot].start = mem->BaseAddress; 279 mem_info[slot].end = mem->BaseAddress + mem->Length; 280 mem_info[slot].domain = mem->ProximityDomain; 281 num_mem++; 282 break; 283 } 284 } 285 286 /* 287 * Ensure each memory domain has at least one CPU and that each CPU 288 * has at least one memory domain. 289 */ 290 static int 291 check_domains(void) 292 { 293 int found, i, j; 294 295 for (i = 0; i < num_mem; i++) { 296 found = 0; 297 for (j = 0; j <= max_apic_id; j++) 298 if (cpus[j].enabled && 299 cpus[j].domain == mem_info[i].domain) { 300 cpus[j].has_memory = 1; 301 found++; 302 } 303 if (!found) { 304 printf("SRAT: No CPU found for memory domain %d\n", 305 mem_info[i].domain); 306 return (ENXIO); 307 } 308 } 309 for (i = 0; i <= max_apic_id; i++) 310 if (cpus[i].enabled && !cpus[i].has_memory) { 311 printf("SRAT: No memory found for CPU %d\n", i); 312 return (ENXIO); 313 } 314 return (0); 315 } 316 317 /* 318 * Check that the SRAT memory regions cover all of the regions in 319 * phys_avail[]. 320 */ 321 static int 322 check_phys_avail(void) 323 { 324 vm_paddr_t address; 325 int i, j; 326 327 /* j is the current offset into phys_avail[]. */ 328 address = phys_avail[0]; 329 j = 0; 330 for (i = 0; i < num_mem; i++) { 331 /* 332 * Consume as many phys_avail[] entries as fit in this 333 * region. 334 */ 335 while (address >= mem_info[i].start && 336 address <= mem_info[i].end) { 337 /* 338 * If we cover the rest of this phys_avail[] entry, 339 * advance to the next entry. 340 */ 341 if (phys_avail[j + 1] <= mem_info[i].end) { 342 j += 2; 343 if (phys_avail[j] == 0 && 344 phys_avail[j + 1] == 0) { 345 return (0); 346 } 347 address = phys_avail[j]; 348 } else 349 address = mem_info[i].end + 1; 350 } 351 } 352 printf("SRAT: No memory region found for 0x%jx - 0x%jx\n", 353 (uintmax_t)phys_avail[j], (uintmax_t)phys_avail[j + 1]); 354 return (ENXIO); 355 } 356 357 /* 358 * Renumber the memory domains to be compact and zero-based if not 359 * already. Returns an error if there are too many domains. 360 */ 361 static int 362 renumber_domains(void) 363 { 364 int i, j, slot; 365 366 /* Enumerate all the domains. */ 367 ndomain = 0; 368 for (i = 0; i < num_mem; i++) { 369 /* See if this domain is already known. */ 370 for (j = 0; j < ndomain; j++) { 371 if (domain_pxm[j] >= mem_info[i].domain) 372 break; 373 } 374 if (j < ndomain && domain_pxm[j] == mem_info[i].domain) 375 continue; 376 377 if (ndomain >= MAXMEMDOM) { 378 ndomain = 1; 379 printf("SRAT: Too many memory domains\n"); 380 return (EFBIG); 381 } 382 383 /* Insert the new domain at slot 'j'. */ 384 slot = j; 385 for (j = ndomain; j > slot; j--) 386 domain_pxm[j] = domain_pxm[j - 1]; 387 domain_pxm[slot] = mem_info[i].domain; 388 ndomain++; 389 } 390 391 /* Renumber each domain to its index in the sorted 'domain_pxm' list. */ 392 for (i = 0; i < ndomain; i++) { 393 /* 394 * If the domain is already the right value, no need 395 * to renumber. 396 */ 397 if (domain_pxm[i] == i) 398 continue; 399 400 /* Walk the cpu[] and mem_info[] arrays to renumber. */ 401 for (j = 0; j < num_mem; j++) 402 if (mem_info[j].domain == domain_pxm[i]) 403 mem_info[j].domain = i; 404 for (j = 0; j <= max_apic_id; j++) 405 if (cpus[j].enabled && cpus[j].domain == domain_pxm[i]) 406 cpus[j].domain = i; 407 } 408 409 return (0); 410 } 411 412 /* 413 * Look for an ACPI System Resource Affinity Table ("SRAT") 414 */ 415 static int 416 parse_srat(void) 417 { 418 unsigned int idx, size; 419 vm_paddr_t addr; 420 int error; 421 422 if (resource_disabled("srat", 0)) 423 return (-1); 424 425 srat_physaddr = acpi_find_table(ACPI_SIG_SRAT); 426 if (srat_physaddr == 0) 427 return (-1); 428 429 /* 430 * Allocate data structure: 431 * 432 * Find the last physical memory region and steal some memory from 433 * it. This is done because at this point in the boot process 434 * malloc is still not usable. 435 */ 436 for (idx = 0; phys_avail[idx + 1] != 0; idx += 2); 437 KASSERT(idx != 0, ("phys_avail is empty!")); 438 idx -= 2; 439 440 size = sizeof(*cpus) * (max_apic_id + 1); 441 addr = trunc_page(phys_avail[idx + 1] - size); 442 KASSERT(addr >= phys_avail[idx], 443 ("Not enough memory for SRAT table items")); 444 phys_avail[idx + 1] = addr - 1; 445 446 /* 447 * We cannot rely on PHYS_TO_DMAP because this code is also used in 448 * i386, so use pmap_mapbios to map the memory, this will end up using 449 * the default memory attribute (WB), and the DMAP when available. 450 */ 451 cpus = (struct cpu_info *)pmap_mapbios(addr, size); 452 bzero(cpus, size); 453 454 /* 455 * Make a pass over the table to populate the cpus[] and 456 * mem_info[] tables. 457 */ 458 srat = acpi_map_table(srat_physaddr, ACPI_SIG_SRAT); 459 error = 0; 460 srat_walk_table(srat_parse_entry, &error); 461 acpi_unmap_table(srat); 462 srat = NULL; 463 if (error || check_domains() != 0 || check_phys_avail() != 0 || 464 renumber_domains() != 0) { 465 srat_physaddr = 0; 466 return (-1); 467 } 468 469 #ifdef VM_NUMA_ALLOC 470 /* Point vm_phys at our memory affinity table. */ 471 vm_ndomains = ndomain; 472 mem_affinity = mem_info; 473 #endif 474 475 return (0); 476 } 477 478 static void 479 init_mem_locality(void) 480 { 481 int i; 482 483 /* 484 * For now, assume -1 == "no locality information for 485 * this pairing. 486 */ 487 for (i = 0; i < MAXMEMDOM * MAXMEMDOM; i++) 488 vm_locality_table[i] = -1; 489 } 490 491 static void 492 parse_acpi_tables(void *dummy) 493 { 494 495 if (parse_srat() < 0) 496 return; 497 init_mem_locality(); 498 (void) parse_slit(); 499 } 500 SYSINIT(parse_acpi_tables, SI_SUB_VM - 1, SI_ORDER_FIRST, parse_acpi_tables, 501 NULL); 502 503 static void 504 srat_walk_table(acpi_subtable_handler *handler, void *arg) 505 { 506 507 acpi_walk_subtables(srat + 1, (char *)srat + srat->Header.Length, 508 handler, arg); 509 } 510 511 /* 512 * Setup per-CPU domain IDs. 513 */ 514 static void 515 srat_set_cpus(void *dummy) 516 { 517 struct cpu_info *cpu; 518 struct pcpu *pc; 519 u_int i; 520 521 if (srat_physaddr == 0) 522 return; 523 for (i = 0; i < MAXCPU; i++) { 524 if (CPU_ABSENT(i)) 525 continue; 526 pc = pcpu_find(i); 527 KASSERT(pc != NULL, ("no pcpu data for CPU %u", i)); 528 cpu = &cpus[pc->pc_apic_id]; 529 if (!cpu->enabled) 530 panic("SRAT: CPU with APIC ID %u is not known", 531 pc->pc_apic_id); 532 pc->pc_domain = cpu->domain; 533 CPU_SET(i, &cpuset_domain[cpu->domain]); 534 if (bootverbose) 535 printf("SRAT: CPU %u has memory domain %d\n", i, 536 cpu->domain); 537 } 538 539 /* Last usage of the cpus array, unmap it. */ 540 pmap_unmapbios((vm_offset_t)cpus, sizeof(*cpus) * (max_apic_id + 1)); 541 cpus = NULL; 542 } 543 SYSINIT(srat_set_cpus, SI_SUB_CPU, SI_ORDER_ANY, srat_set_cpus, NULL); 544 545 /* 546 * Map a _PXM value to a VM domain ID. 547 * 548 * Returns the domain ID, or -1 if no domain ID was found. 549 */ 550 int 551 acpi_map_pxm_to_vm_domainid(int pxm) 552 { 553 int i; 554 555 for (i = 0; i < ndomain; i++) { 556 if (domain_pxm[i] == pxm) 557 return (i); 558 } 559 560 return (-1); 561 } 562 563 #else /* MAXMEMDOM == 1 */ 564 565 int 566 acpi_map_pxm_to_vm_domainid(int pxm) 567 { 568 569 return (-1); 570 } 571 572 #endif /* MAXMEMDOM > 1 */ 573